Parent Directory | Revision Log
Revision 1909 - (view) (download)
1 : | edgomez | 1382 | ;/**************************************************************************** |
2 : | Isibaar | 262 | ; * |
3 : | edgomez | 1382 | ; * XVID MPEG-4 VIDEO CODEC |
4 : | ; * - SSE2 optimized SAD operators - | ||
5 : | Isibaar | 262 | ; * |
6 : | Isibaar | 1909 | ; * Copyright(C) 2003-2010 Pascal Massimino <skal@planet-d.net> |
7 : | ; * 2008-2010 Michael Militzer <michael@xvid.org> | ||
8 : | Isibaar | 262 | ; * |
9 : | ; * | ||
10 : | edgomez | 1382 | ; * This program is free software; you can redistribute it and/or modify it |
11 : | ; * under the terms of the GNU General Public License as published by | ||
12 : | ; * the Free Software Foundation; either version 2 of the License, or | ||
13 : | ; * (at your option) any later version. | ||
14 : | Isibaar | 262 | ; * |
15 : | edgomez | 1382 | ; * This program is distributed in the hope that it will be useful, |
16 : | ; * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 : | ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 : | ; * GNU General Public License for more details. | ||
19 : | Isibaar | 262 | ; * |
20 : | edgomez | 1382 | ; * You should have received a copy of the GNU General Public License |
21 : | ; * along with this program; if not, write to the Free Software | ||
22 : | ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
23 : | Isibaar | 262 | ; * |
24 : | Isibaar | 1909 | ; * $Id: sad_sse2.asm,v 1.21 2010-11-28 15:18:21 Isibaar Exp $ |
25 : | edgomez | 652 | ; * |
26 : | edgomez | 1382 | ; ***************************************************************************/ |
27 : | Isibaar | 262 | |
28 : | Isibaar | 1795 | %include "nasm.inc" |
29 : | Isibaar | 262 | |
30 : | edgomez | 1382 | ;============================================================================= |
31 : | ; Read only data | ||
32 : | ;============================================================================= | ||
33 : | Isibaar | 262 | |
DATA

; Small 16-byte vector constants.

ALIGN SECTION_ALIGN
zero:
    times 4 dd 0                ; four dwords of 0 (not referenced by the routines visible in this file)

ALIGN SECTION_ALIGN
ones:
    times 8 dw 1                ; eight words of 1; pmaddwd operand used by blocksum8_sse2 to add adjacent words

ALIGN SECTION_ALIGN
round32:
    times 4 dd 32               ; per-dword bias added in SSEH_SSE2 before the arithmetic shift right by 7
44 : | |||
45 : | edgomez | 1382 | ;============================================================================= |
46 : | Isibaar | 1909 | ; Coeffs for MSE_H calculation |
47 : | ;============================================================================= | ||
48 : | |||
; 8x8 table of 16-bit weights, one per DCT coefficient position, stored row
; after row with no padding (8 words = 16 bytes per row).
; DCT_ENERGY_SSE2 multiplies the coefficients (shifted left by 4) by these
; weights with pmulhw and addresses the table with the same byte offset it
; uses for the coefficient block.
; The rows carry no trailing comma: each dw statement lists exactly 8 operands.
ALIGN SECTION_ALIGN
iMask_Coeff:
    dw      0, 29788, 32767, 20479, 13653,  8192,  6425,  5372
    dw  27306, 27306, 23405, 17246, 12603,  5650,  5461,  5958
    dw  23405, 25205, 20479, 13653,  8192,  5749,  4749,  5851
    dw  23405, 19275, 14894, 11299,  6425,  3766,  4096,  5285
    dw  18204, 14894,  8856,  5851,  4819,  3006,  3181,  4255
    dw  13653,  9362,  5958,  5120,  4045,  3151,  2900,  3562
    dw   6687,  5120,  4201,  3766,  3181,  2708,  2730,  3244
    dw   4551,  3562,  3449,  3344,  2926,  3277,  3181,  3310
59 : | |||
; 8x8 table of 32-bit entries, one per DCT coefficient position (8 dwords =
; 32 bytes per row, no padding).
; SSEH_SSE2 feeds these to pmaddwd against the replicated mask, addressing the
; table at twice the byte offset of the 16-bit coefficient rows. Every entry is
; below 65536, so the upper word of each dword is zero and only the low word
; takes part in that multiply-add.
; The rows carry no trailing comma: each dd statement lists exactly 8 operands.
ALIGN SECTION_ALIGN
Inv_iMask_Coeff:
    dd      0,   155,   128,   328,   737,  2048,  3329,  4763
    dd    184,   184,   251,   462,   865,  4306,  4608,  3872
    dd    251,   216,   328,   737,  2048,  4159,  6094,  4014
    dd    251,   370,   620,  1076,  3329,  9688,  8192,  4920
    dd    415,   620,  1752,  4014,  5919, 15207, 13579,  7589
    dd    737,  1568,  3872,  5243,  8398, 13844, 16345, 10834
    dd   3073,  5243,  7787,  9688, 13579, 18741, 18433, 13057
    dd   6636, 10834, 11552, 12294, 16056, 12800, 13579, 12545
70 : | |||
; 8x8 table of 16-bit weights, one per DCT coefficient position (8 words =
; 16 bytes per row, no padding).
; SSEH_SSE2 applies them with pmulhuw (unsigned high multiply); several entries
; exceed 32767, so they are meaningful only as unsigned words.
; The rows carry no trailing comma: each dw statement lists exactly 8 operands.
ALIGN SECTION_ALIGN
iCSF_Coeff:
    dw  26353, 38331, 42164, 26353, 17568, 10541,  8268,  6912
    dw  35137, 35137, 30117, 22192, 16217,  7270,  7027,  7666
    dw  30117, 32434, 26353, 17568, 10541,  7397,  6111,  7529
    dw  30117, 24803, 19166, 14539,  8268,  4846,  5271,  6801
    dw  23425, 19166, 11396,  7529,  6201,  3868,  4094,  5476
    dw  17568, 12047,  7666,  6588,  5205,  4054,  3731,  4583
    dw   8605,  6588,  5406,  4846,  4094,  3485,  3514,  4175
    dw   5856,  4583,  4438,  4302,  3765,  4216,  4094,  4259
81 : | |||
; 8x8 table of 16-bit offsets, one per DCT coefficient position (8 words =
; 16 bytes per row, no padding).
; SSEH_SSE2 adds them (paddusw) to the thresholded differences right before the
; pmulhuw by iCSF_Coeff.
; The rows carry no trailing comma: each dw statement lists exactly 8 operands.
ALIGN SECTION_ALIGN
iCSF_Round:
    dw  1, 1, 1, 1, 2, 3, 4, 5
    dw  1, 1, 1, 1, 2, 5, 5, 4
    dw  1, 1, 1, 2, 3, 4, 5, 4
    dw  1, 1, 2, 2, 4, 7, 6, 5
    dw  1, 2, 3, 4, 5, 8, 8, 6
    dw  2, 3, 4, 5, 6, 8, 9, 7
    dw  4, 5, 6, 7, 8, 9, 9, 8
    dw  6, 7, 7, 8, 9, 8, 8, 8
92 : | |||
93 : | |||
94 : | ;============================================================================= | ||
95 : | edgomez | 1382 | ; Code |
96 : | ;============================================================================= | ||
97 : | Isibaar | 262 | |
TEXT

; Entry points exported from this file.

; 16x16 sum of absolute differences / deviation from the mean
cglobal sad16_sse2
cglobal dev16_sse2

; same routines, using lddqu for the unaligned loads
cglobal sad16_sse3
cglobal dev16_sse3

; routines operating on 8x8 blocks
cglobal sseh8_16bit_sse2
cglobal coeff8_energy_sse2
cglobal blocksum8_sse2
109 : | |||
110 : | edgomez | 1382 | ;----------------------------------------------------------------------------- |
111 : | ; uint32_t sad16_sse2 (const uint8_t * const cur, <- assumed aligned! | ||
112 : | ; const uint8_t * const ref, | ||
113 : | ; const uint32_t stride, | ||
114 : | ; const uint32_t /*ignored*/); | ||
115 : | ;----------------------------------------------------------------------------- | ||
116 : | Isibaar | 262 | |
117 : | |||
; SAD_16x16_SSE2 <load>
;   Despite the name, one expansion handles TWO rows of 16 bytes.
;   %1    : load instruction used for the ref rows (movdqu or lddqu)
;   In    : _EAX = cur row (read with movdqa, so it must be 16-byte aligned),
;           TMP1 = ref row, TMP0 = stride, xmm4 = running accumulator
;   Out   : xmm4 += psadbw(ref, cur) for both rows;
;           _EAX and TMP1 advanced by 2*stride
;   Clobb : xmm0-xmm3
%macro SAD_16x16_SSE2 1
    movdqa  xmm2, [_EAX]            ; cur, row n
    movdqa  xmm3, [_EAX+TMP0]       ; cur, row n+1
    %1      xmm0, [TMP1]            ; ref, row n
    %1      xmm1, [TMP1+TMP0]       ; ref, row n+1

    lea     _EAX, [_EAX+2*TMP0]     ; step both pointers two rows down
    lea     TMP1, [TMP1+2*TMP0]

    psadbw  xmm0, xmm2              ; one partial sum per 8-byte half
    psadbw  xmm1, xmm3

    paddusw xmm4, xmm0
    paddusw xmm4, xmm1
%endmacro
130 : | |||
; SAD16_SSE2_SSE3 <load>
;   Complete body (including ret) of the 16x16 SAD routines.
;   %1  : load instruction for the ref block (movdqu or lddqu)
;   prm1 = cur (read with movdqa), prm2 = ref, prm3 = stride
;   Returns in eax the sum of |cur - ref| over 16 rows of 16 bytes.
%macro SAD16_SSE2_SSE3 1
    mov     _EAX, prm1              ; cur (assumed aligned)
    mov     TMP1, prm2              ; ref
    mov     TMP0, prm3              ; stride

    pxor    xmm4, xmm4              ; accumulator = 0

  %rep 8                            ; 8 x 2 rows = 16 rows
    SAD_16x16_SSE2 %1
  %endrep

    ; xmm4 holds one partial sum per 64-bit half: fold the upper one onto the lower
    pshufd  xmm5, xmm4, 0x02        ; dword 2 -> dword 0
    paddusw xmm4, xmm5
    pextrw  eax, xmm4, 0            ; result is word 0

    ret
%endmacro
153 : | |||
; sad16_sse2: 16x16 SAD, ref rows fetched with movdqu.
ALIGN SECTION_ALIGN
sad16_sse2:
    SAD16_SSE2_SSE3 movdqu
ENDFUNC
158 : | Isibaar | 262 | |
159 : | |||
; sad16_sse3: 16x16 SAD, ref rows fetched with lddqu.
ALIGN SECTION_ALIGN
sad16_sse3:
    SAD16_SSE2_SSE3 lddqu
ENDFUNC
164 : | Isibaar | 1764 | |
165 : | |||
166 : | edgomez | 1382 | ;----------------------------------------------------------------------------- |
167 : | ; uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride); | ||
168 : | ;----------------------------------------------------------------------------- | ||
169 : | Isibaar | 262 | |
; MEAN_16x16_SSE2 <load>
;   Despite the name, one expansion handles TWO rows of 16 bytes.
;   %1    : load instruction for the source rows (movdqu or lddqu)
;   In    : _EAX = src row, TMP0 = stride,
;           xmm5 = value compared against (all zero, or the replicated mean),
;           xmm4 = running accumulator
;   Out   : xmm4 += psadbw(src, xmm5) for both rows; _EAX advanced by 2*stride
;   Clobb : xmm0, xmm1
%macro MEAN_16x16_SSE2 1
    %1      xmm0, [_EAX]            ; row n
    psadbw  xmm0, xmm5
    %1      xmm1, [_EAX+TMP0]       ; row n+1
    psadbw  xmm1, xmm5

    lea     _EAX, [_EAX+2*TMP0]     ; + 2*stride

    paddusw xmm4, xmm0
    paddusw xmm4, xmm1
%endmacro
179 : | |||
180 : | |||
; MEAN16_SSE2_SSE3 <load>
;   Complete body (including ret) of the 16x16 deviation routines.
;   %1  : load instruction for the source block (movdqu or lddqu)
;   prm1 = src, prm2 = stride
;   Pass 1 sums all 256 bytes and derives mean = sum >> 8.
;   Pass 2 sums |byte - mean| over the same block; that sum is returned in eax.
%macro MEAN16_SSE2_SSE3 1
    mov     _EAX, prm1              ; src
    mov     TMP0, prm2              ; stride

    pxor    xmm4, xmm4              ; accumulator = 0
    pxor    xmm5, xmm5              ; compare against 0 => plain byte sum

    ; ---- pass 1: sum of all bytes ----
  %rep 8
    MEAN_16x16_SSE2 %1
  %endrep

    mov     _EAX, prm1              ; rewind to the top of the block

    pshufd  xmm5, xmm4, 0x02        ; upper-half partial sum -> dword 0
    paddusw xmm5, xmm4              ; word 0 = total sum
    pxor    xmm4, xmm4              ; reset accumulator for pass 2
    psrlw   xmm5, 8                 ; word 0 = sum / 256 = mean
    pshuflw xmm5, xmm5, 0           ; mean in the 4 low words
    packuswb xmm5, xmm5             ; ...narrowed to bytes
    pshufd  xmm5, xmm5, 0x00        ; mean replicated into all 16 bytes

    ; ---- pass 2: sum of |byte - mean| ----
  %rep 8
    MEAN_16x16_SSE2 %1
  %endrep

    pshufd  xmm5, xmm4, 0x02        ; fold the two 64-bit partial sums
    paddusw xmm5, xmm4
    pextrw  eax, xmm5, 0

    ret
%endmacro
224 : | |||
; dev16_sse2: 16x16 deviation from the block mean, rows fetched with movdqu.
ALIGN SECTION_ALIGN
dev16_sse2:
    MEAN16_SSE2_SSE3 movdqu
ENDFUNC
229 : | edgomez | 1540 | |
; dev16_sse3: 16x16 deviation from the block mean, rows fetched with lddqu.
ALIGN SECTION_ALIGN
dev16_sse3:
    MEAN16_SSE2_SSE3 lddqu
ENDFUNC
234 : | Isibaar | 1790 | |
235 : | Isibaar | 1909 | ;----------------------------------------------------------------------------- |
236 : | ; uint32_t coeff8_energy_sse2(const int16_t * dct); | ||
237 : | ;----------------------------------------------------------------------------- | ||
238 : | |||
; DCT_ENERGY_SSE2 <acc>, <tmp>, <base>, <offset>
;   Weighted energy of two consecutive rows (16 int16 coefficients).
;   %1 : xmm register receiving the result (4 dword partial sums)
;   %2 : xmm scratch register, must differ from %1
;   %3 : register holding the coefficient block address
;   %4 : constant byte offset of the first row; also indexes iMask_Coeff
;   Both rows are read with movdqa, so %3 + %4 must be 16-byte aligned.
;   Per coefficient c: t = high16((c << 4) * weight), signed. The squares of
;   t are summed per dword lane and each lane is then shifted right by 3.
%macro DCT_ENERGY_SSE2 4
    ; first row
    movdqa  %1, [%3 + %4]
    psllw   %1, 4
    pmulhw  %1, [iMask_Coeff + %4]
    pmaddwd %1, %1                  ; t*t, adjacent pairs added

    ; second row
    movdqa  %2, [%3 + %4 + 16]
    psllw   %2, 4
    pmulhw  %2, [iMask_Coeff + %4 + 16]
    pmaddwd %2, %2

    paddd   %1, %2
    psrld   %1, 3
%endmacro
257 : | |||
; coeff8_energy_sse2
;   prm1 = pointer to 64 int16 coefficients (128 bytes, read with movdqa)
;   Returns in eax the sum of the four DCT_ENERGY_SSE2 results, i.e. the
;   weighted energy of the whole 8x8 block (low 32 bits).
;   Uses xmm0-xmm4 only.
ALIGN SECTION_ALIGN
coeff8_energy_sse2:

    mov     TMP0, prm1              ; coefficient block

    DCT_ENERGY_SSE2 xmm0, xmm1, TMP0, 0     ; rows 0-1
    DCT_ENERGY_SSE2 xmm1, xmm2, TMP0, 32    ; rows 2-3
    paddd   xmm0, xmm1

    DCT_ENERGY_SSE2 xmm2, xmm3, TMP0, 64    ; rows 4-5
    DCT_ENERGY_SSE2 xmm3, xmm4, TMP0, 96    ; rows 6-7
    paddd   xmm2, xmm3

    paddd   xmm0, xmm2              ; four dword partial sums

    ; horizontal add of the four dwords
    pshufd  xmm1, xmm0, 0xEE        ; upper qword -> lower qword
    paddd   xmm0, xmm1

    pshufd  xmm2, xmm0, 0x55        ; dword 1 -> every lane
    paddd   xmm0, xmm2

    movd    eax, xmm0

    ret
ENDFUNC
285 : | |||
;-----------------------------------------------------------------------------------
; uint32_t sseh8_16bit_sse2(const int16_t * cur, const int16_t * ref, uint16_t mask)
;-----------------------------------------------------------------------------------
289 : | |||
; SSEH_SSE2 <cur>, <ref>, <offset>, <mask>
;   Masked, weighted squared error of two consecutive rows (16 int16 values).
;   %1 : register holding the address of the first block
;   %2 : register holding the address of the second block
;   %3 : constant byte offset of the first row; also indexes iCSF_Round and
;        iCSF_Coeff, and (doubled) Inv_iMask_Coeff
;   %4 : accepted but never expanded; the mask is always read from xmm7
;   In    : xmm7 = mask replicated into every dword
;   Out   : xmm0 = 4 dword partial sums
;   Clobb : xmm1-xmm5
;   All memory operands are accessed aligned (movdqa / SSE memory operands).
;
;   Per coefficient:
;     d      = |cur - ref| << 4                 (difference taken with psubsw)
;     thresh = (mask * Inv_iMask_Coeff + 32) >> 7, packed to a signed word
;     d      = d - thresh, clamped at 0         (psubusw)
;     d      = high16((d + iCSF_Round) * iCSF_Coeff), unsigned
;   and the squares of d are summed per dword lane.
%macro SSEH_SSE2 4
    ; ---------------- first row: coefficients 0..7 ----------------
    movdqa  xmm0, [%1 + %3]
    movdqa  xmm1, [%2 + %3]
    psubsw  xmm0, xmm1              ; A - B

    pxor    xmm1, xmm1
    pcmpgtw xmm1, xmm0              ; all ones where (A - B) < 0
    pxor    xmm0, xmm1              ; flip bits of negatives...
    psubw   xmm0, xmm1              ; ...and add 1 => ABS(A - B)
    psllw   xmm0, 4

    movdqa  xmm4, xmm7              ; MASK
    movdqa  xmm5, xmm7
    pmaddwd xmm4, [Inv_iMask_Coeff + 2*(%3)]
    pmaddwd xmm5, [Inv_iMask_Coeff + 2*(%3) + 16]
    paddd   xmm4, [round32]
    paddd   xmm5, [round32]
    psrad   xmm4, 7
    psrad   xmm5, 7
    packssdw xmm4, xmm5             ; Thresh, 8 words

    psubusw xmm0, xmm4              ; decimate by masking effect
    paddusw xmm0, [iCSF_Round + %3]
    pmulhuw xmm0, [iCSF_Coeff + %3]
    pmaddwd xmm0, xmm0

    ; ---------------- second row: coefficients 8..15 ----------------
    movdqa  xmm2, [%1 + %3 + 16]
    movdqa  xmm3, [%2 + %3 + 16]
    psubsw  xmm2, xmm3              ; A - B

    pxor    xmm3, xmm3
    pcmpgtw xmm3, xmm2
    pxor    xmm2, xmm3
    psubw   xmm2, xmm3              ; ABS(A - B)
    psllw   xmm2, 4

    movdqa  xmm1, xmm7              ; MASK
    movdqa  xmm3, xmm7
    pmaddwd xmm1, [Inv_iMask_Coeff + 2*(%3) + 32]
    pmaddwd xmm3, [Inv_iMask_Coeff + 2*(%3) + 48]
    paddd   xmm1, [round32]
    paddd   xmm3, [round32]
    psrad   xmm1, 7
    psrad   xmm3, 7
    packssdw xmm1, xmm3             ; Thresh, 8 words

    psubusw xmm2, xmm1              ; decimate by masking effect
    paddusw xmm2, [iCSF_Round + %3 + 16]
    pmulhuw xmm2, [iCSF_Coeff + %3 + 16]
    pmaddwd xmm2, xmm2

    ; ---------------- combine both rows ----------------
    paddd   xmm0, xmm2
%endmacro
361 : | |||
362 : | |||
; sseh8_16bit_sse2
;   prm1 = first block of 64 int16 values, prm2 = second block, prm3 = mask
;   Runs SSEH_SSE2 on the four row pairs of the 8x8 block and returns the
;   sum of all partial results in eax (low 32 bits).
;   xmm6 (running total) and xmm7 (mask) are bracketed by
;   PUSH_XMM6_XMM7 / POP_XMM6_XMM7.
ALIGN SECTION_ALIGN
sseh8_16bit_sse2:

    PUSH_XMM6_XMM7

    mov     TMP0, prm1              ; DCT_A
    mov     TMP1, prm2              ; DCT_B
    mov     _EAX, prm3              ; MASK

    movd    xmm7, eax
    pshufd  xmm7, xmm7, 0x00        ; same dword in all four lanes

    SSEH_SSE2 TMP0, TMP1, 0, xmm7   ; rows 0-1
    movdqa  xmm6, xmm0              ; running total starts here

    SSEH_SSE2 TMP0, TMP1, 32, xmm7  ; rows 2-3
    paddd   xmm6, xmm0

    SSEH_SSE2 TMP0, TMP1, 64, xmm7  ; rows 4-5
    paddd   xmm6, xmm0

    SSEH_SSE2 TMP0, TMP1, 96, xmm7  ; rows 6-7
    paddd   xmm6, xmm0

    ; horizontal add of the four dwords
    pshufd  xmm1, xmm6, 0xEE        ; upper qword -> lower qword
    paddd   xmm6, xmm1

    pshufd  xmm2, xmm6, 0x55        ; dword 1 -> every lane
    paddd   xmm6, xmm2

    movd    eax, xmm6

    POP_XMM6_XMM7
    ret
ENDFUNC
397 : | |||
;--------------------------------------------------------------------------------------------
; uint32_t blocksum8_sse2(const int8_t * cur, int stride, uint16_t sums[4], uint32_t squares[4])
;--------------------------------------------------------------------------------------------
401 : | |||
; BLOCKSUM_SSE2 <ptr>, <stride>, <stride3>
;   Sums and sums of squares of the four 4x4 quadrants of an 8x8 byte block.
;   Quadrants: A = rows 0-3 / bytes 0-3, B = rows 0-3 / bytes 4-7,
;              C = rows 4-7 / bytes 0-3, D = rows 4-7 / bytes 4-7.
;   Bytes are zero-extended (psadbw against zero, punpck?bw with zero).
;   %1 : register with the block address; advanced by 4*stride on exit
;   %2 : register with the stride
;   %3 : register with 3*stride
;   In    : xmm7 = 0
;   Out   : xmm0 = sums as words     [A, B, C, D, 0, 0, 0, 0]
;           xmm2 = squares as dwords [A, B, C, D]
;   Clobb : xmm1, xmm3-xmm6
%macro BLOCKSUM_SSE2 3
    ; ---- load rows 0..3; low qword = A bytes, high qword = B bytes ----
    movq        xmm0, [%1]
    movq        xmm2, [%1 + %2]
    punpckldq   xmm0, xmm2          ; rows 0,1
    movq        xmm1, [%1 + 2*%2]
    movq        xmm3, [%1 + %3]
    punpckldq   xmm1, xmm3          ; rows 2,3

    lea         %1, [%1 + 4*%2]     ; move on to rows 4..7

    ; keep copies of the raw bytes for the squares
    movdqa      xmm2, xmm0
    movdqa      xmm4, xmm0
    movdqa      xmm3, xmm1
    movdqa      xmm5, xmm1

    ; ---- partial sums of A and B (completed further down) ----
    psadbw      xmm0, xmm7          ; rows 0,1
    psadbw      xmm1, xmm7          ; rows 2,3

    ; ---- squares of A and B ----
    punpcklbw   xmm2, xmm7          ; A bytes -> words
    punpckhbw   xmm4, xmm7          ; B bytes -> words
    punpcklbw   xmm3, xmm7
    punpckhbw   xmm5, xmm7

    pmaddwd     xmm2, xmm2          ; a*a + a*a per dword
    pmaddwd     xmm3, xmm3
    pmaddwd     xmm4, xmm4          ; b*b + b*b per dword
    pmaddwd     xmm5, xmm5

    paddd       xmm2, xmm3          ; A, 4 partial dwords
    paddd       xmm4, xmm5          ; B, 4 partial dwords

    movdqa      xmm3, xmm2
    punpckldq   xmm2, xmm4          ; B A B A
    punpckhdq   xmm3, xmm4          ; B A B A
    paddd       xmm2, xmm3

    movdqa      xmm4, xmm2
    punpckhqdq  xmm4, xmm7          ; upper pair moved down, rest zero
    paddd       xmm2, xmm4          ; dword 0 = A squares, dword 1 = B squares

    ; ---- load rows 4..7; low qword = C bytes, high qword = D bytes ----
    movq        xmm3, [%1]
    movq        xmm5, [%1 + %2]
    punpckldq   xmm3, xmm5          ; rows 4,5
    movq        xmm4, [%1 + 2*%2]
    movq        xmm6, [%1 + %3]
    punpckldq   xmm4, xmm6          ; rows 6,7

    movdqa      xmm5, xmm3          ; raw byte copies for the squares
    movdqa      xmm6, xmm4

    ; ---- finish the four sums ----
    psadbw      xmm3, xmm7          ; rows 4,5
    psadbw      xmm4, xmm7          ; rows 6,7

    packssdw    xmm0, xmm3          ; 0d0c0b0a (rows 0,1 + 4,5)
    packssdw    xmm1, xmm4          ; 0d0c0b0a (rows 2,3 + 6,7)

    paddusw     xmm0, xmm1
    packssdw    xmm0, xmm7          ; 0000dcba

    ; ---- squares of C and D ----
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6

    punpcklbw   xmm3, xmm7          ; C bytes -> words
    punpcklbw   xmm4, xmm7
    punpckhbw   xmm5, xmm7          ; D bytes -> words
    punpckhbw   xmm6, xmm7

    pmaddwd     xmm3, xmm3          ; c*c + c*c per dword
    pmaddwd     xmm4, xmm4
    pmaddwd     xmm5, xmm5          ; d*d + d*d per dword
    pmaddwd     xmm6, xmm6

    paddd       xmm3, xmm4          ; C, 4 partial dwords
    paddd       xmm5, xmm6          ; D, 4 partial dwords

    movdqa      xmm1, xmm3
    punpckldq   xmm3, xmm5          ; D C D C
    punpckhdq   xmm1, xmm5          ; D C D C
    paddd       xmm3, xmm1

    movdqa      xmm4, xmm3
    punpckhqdq  xmm4, xmm7
    paddd       xmm3, xmm4          ; dword 0 = C squares, dword 1 = D squares

    punpcklqdq  xmm2, xmm3          ; squares: D C B A
%endmacro
500 : | |||
501 : | |||
; blocksum8_sse2
;   prm1 = 8x8 byte block, prm2 = stride, prm3 = sums, prm4 = squares
;   Stores the four quadrant sums as 16-bit words (8 bytes, movq) at prm3 and
;   the four quadrant sums of squares as dwords at prm4. The latter is written
;   with movdqa, so prm4 must be 16-byte aligned.
;   Returns in eax the total of the four quadrant sums.
;   _EBP is borrowed for 3*stride and restored before prm4 is read.
ALIGN SECTION_ALIGN
blocksum8_sse2:

    PUSH_XMM6_XMM7

    mov     TMP0, prm1              ; cur
    mov     TMP1, prm2              ; stride
    mov     _EAX, prm3              ; sums

    pxor    xmm7, xmm7              ; zero, required by BLOCKSUM_SSE2

    push    _EBP
    lea     _EBP, [TMP1 + 2*TMP1]   ; 3*stride

    BLOCKSUM_SSE2 TMP0, TMP1, _EBP

    pop     _EBP
    mov     TMP0, prm4              ; squares

    movq    [_EAX], xmm0            ; sums of the 4x4 sub-blocks
    movdqa  [TMP0], xmm2            ; squares of the 4x4 sub-blocks

    ; add the four word sums together
    pmaddwd xmm0, [ones]            ; dwords: a+b, c+d
    packssdw xmm0, xmm7             ; back to words
    pmaddwd xmm0, [ones]            ; dword 0 = a+b+c+d
    movd    eax, xmm0

    POP_XMM6_XMM7
    ret
ENDFUNC
533 : | |||
; NOTE(review): macro supplied by nasm.inc; presumably marks the object file as
; not needing an executable stack -- confirm against nasm.inc.
NON_EXEC_STACK
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |