; * XVID MPEG-4 VIDEO CODEC
; *  - SSE2 optimized SAD operators -
; *
; *  Copyright(C) 2003-2010 Pascal Massimino <skal@planet-d.net>
; *               2008-2010 Michael Militzer <michael@xvid.org>
; *
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: sad_sse2.asm,v 1.21 2010-11-28 15:18:21 Isibaar Exp $
; *
; ***************************************************************************/

ALIGN SECTION_ALIGN
zero    times 4 dd 0

ALIGN SECTION_ALIGN
ones    times 8 dw 1

ALIGN SECTION_ALIGN
round32 times 4 dd 32

;=============================================================================
; Coeffs for MSE_H calculation
;=============================================================================

ALIGN SECTION_ALIGN
iMask_Coeff:
dw     0, 29788, 32767, 20479, 13653,  8192,  6425,  5372,
dw 27306, 27306, 23405, 17246, 12603,  5650,  5461,  5958,
dw 23405, 25205, 20479, 13653,  8192,  5749,  4749,  5851,
dw 23405, 19275, 14894, 11299,  6425,  3766,  4096,  5285,
dw 18204, 14894,  8856,  5851,  4819,  3006,  3181,  4255,
dw 13653,  9362,  5958,  5120,  4045,  3151,  2900,  3562,
dw  6687,  5120,  4201,  3766,  3181,  2708,  2730,  3244,
dw  4551,  3562,  3449,  3344,  2926,  3277,  3181,  3310

ALIGN SECTION_ALIGN
Inv_iMask_Coeff:
dd    0,   155,   128,   328,   737,  2048,  3329,  4763,
dd  184,   184,   251,   462,   865,  4306,  4608,  3872,
dd  251,   216,   328,   737,  2048,  4159,  6094,  4014,
dd  251,   370,   620,  1076,  3329,  9688,  8192,  4920,
dd  415,   620,  1752,  4014,  5919, 15207, 13579,  7589,
dd  737,  1568,  3872,  5243,  8398, 13844, 16345, 10834,
dd 3073,  5243,  7787,  9688, 13579, 18741, 18433, 13057,
dd 6636, 10834, 11552, 12294, 16056, 12800, 13579, 12545

ALIGN SECTION_ALIGN
iCSF_Coeff:
dw 26353, 38331, 42164, 26353, 17568, 10541,  8268,  6912,
dw 35137, 35137, 30117, 22192, 16217,  7270,  7027,  7666,
dw 30117, 32434, 26353, 17568, 10541,  7397,  6111,  7529,
dw 30117, 24803, 19166, 14539,  8268,  4846,  5271,  6801,
dw 23425, 19166, 11396,  7529,  6201,  3868,  4094,  5476,
dw 17568, 12047,  7666,  6588,  5205,  4054,  3731,  4583,
dw  8605,  6588,  5406,  4846,  4094,  3485,  3514,  4175,
dw  5856,  4583,  4438,  4302,  3765,  4216,  4094,  4259

; Per-coefficient rounders for the pmulhuw by iCSF_Coeff: each entry is
; roughly 32768/iCSF_Coeff[i], i.e. one half in the coefficient's 16.16
; scale, so adding it before the multiply rounds to nearest.
ALIGN SECTION_ALIGN
iCSF_Round:
dw 1, 1, 1, 1, 2, 3, 4, 5,
dw 1, 1, 1, 1, 2, 5, 5, 4,
dw 1, 1, 1, 2, 3, 4, 5, 4,
dw 1, 1, 2, 2, 4, 7, 6, 5,
dw 1, 2, 3, 4, 5, 8, 8, 6,
dw 2, 3, 4, 5, 6, 8, 9, 7,
dw 4, 5, 6, 7, 8, 9, 9, 8,
dw 6, 7, 7, 8, 9, 8, 8, 8

;=============================================================================
; Code
;=============================================================================

cglobal sad16_sse3
cglobal dev16_sse3

cglobal sseh8_16bit_sse2
cglobal coeff8_energy_sse2
cglobal blocksum8_sse2

;-----------------------------------------------------------------------------
; uint32_t sad16_sse2 (const uint8_t * const cur, <- assumed aligned!
;                      const uint8_t * const ref,

MEAN16_SSE2_SSE3 lddqu
ENDFUNC
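
; For reference, a minimal C sketch of the 16x16 SAD that sad16_* computes.
; Only the first two parameters survive in the prototype excerpt above, so
; the stride argument and the exact signature here are assumptions:
;
;   #include <stdint.h>
;
;   uint32_t sad16_ref(const uint8_t *cur, const uint8_t *ref,
;                      uint32_t stride)
;   {
;     uint32_t sad = 0;
;     for (int y = 0; y < 16; y++, cur += stride, ref += stride)
;       for (int x = 0; x < 16; x++)
;         sad += (cur[x] > ref[x]) ? (cur[x] - ref[x]) : (ref[x] - cur[x]);
;     return sad;
;   }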

;-----------------------------------------------------------------------------
; uint32_t coeff8_energy_sse2(const int16_t * dct);
;-----------------------------------------------------------------------------

; Sum of squares of 16 DCT coefficients, each pre-weighted by iMask_Coeff.
; %1/%2 are scratch registers, %3 the dct pointer, %4 the byte offset.
%macro DCT_ENERGY_SSE2 4
  movdqa %1, [%3 + %4]
  movdqa %2, [%3 + %4 + 16]

  psllw %1, 4
  psllw %2, 4

  pmulhw %1, [iMask_Coeff + %4]
  pmulhw %2, [iMask_Coeff + %4 + 16]

  pmaddwd %1, %1
  pmaddwd %2, %2

  paddd %1, %2
  psrld %1, 3
%endmacro

ALIGN SECTION_ALIGN
coeff8_energy_sse2:

  mov TMP0, prm1 ; DCT_A

  DCT_ENERGY_SSE2 xmm0, xmm1, TMP0, 0
  DCT_ENERGY_SSE2 xmm1, xmm2, TMP0, 32

  DCT_ENERGY_SSE2 xmm2, xmm3, TMP0, 64
  DCT_ENERGY_SSE2 xmm3, xmm4, TMP0, 96

  paddd xmm0, xmm1
  paddd xmm2, xmm3

  paddd xmm0, xmm2 ; A B C D

  ; convolute: horizontal sum of the four dwords
  pshufd xmm1, xmm0, 238 ; 0xEE = (2,3,2,3): fold the high qword onto the low
  paddd xmm0, xmm1

  pshufd xmm2, xmm0, 85  ; 0x55 = (1,1,1,1): fold dword 1 onto dword 0
  paddd xmm0, xmm2

  movd eax, xmm0

  ret
ENDFUNC
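
; A rough C equivalent of coeff8_energy_sse2, for reference. It assumes the
; table above is visible as const int16_t iMask_Coeff[64]. The asm applies
; the >>3 per packed group of four squared terms (psrld), so the low bits
; of this single final shift can differ slightly:
;
;   #include <stdint.h>
;
;   uint32_t coeff8_energy_ref(const int16_t *dct)
;   {
;     uint32_t sum = 0;
;     for (int i = 0; i < 64; i++) {
;       int16_t v = (int16_t)(dct[i] << 4);               /* psllw   */
;       int32_t t = ((int32_t)v * iMask_Coeff[i]) >> 16;  /* pmulhw  */
;       sum += (uint32_t)(t * t);                         /* pmaddwd */
;     }
;     return sum >> 3;                                    /* psrld   */
;   }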
|
|
;-----------------------------------------------------------------------------------
; uint32_t sseh8_16bit_sse2(const int16_t * cur, const int16_t * ref, uint16_t mask)
;-----------------------------------------------------------------------------------

; Masked, CSF-weighted sum of squared errors over 16 coefficients.
; %1/%2 are the cur/ref pointers, %3 the byte offset; the mask is read
; from xmm7 (passed as %4 but referenced directly).
%macro SSEH_SSE2 4
  movdqa xmm0, [%1 + %3]
  movdqa xmm1, [%2 + %3]

  movdqa xmm2, [%1 + %3 + 16]
  movdqa xmm3, [%2 + %3 + 16]

  movdqa xmm4, xmm7 ; MASK
  movdqa xmm5, xmm7

  psubsw xmm0, xmm1 ; A - B
  psubsw xmm2, xmm3

  ; ABS
  pxor xmm1, xmm1
  pxor xmm3, xmm3

  pcmpgtw xmm1, xmm0
  pcmpgtw xmm3, xmm2

  pxor xmm0, xmm1 ; change sign if negative
  pxor xmm2, xmm3

  psubw xmm0, xmm1 ; ABS (A - B)
  psubw xmm2, xmm3 ; ABS (A - B)

  movdqa xmm1, xmm7 ; MASK
  movdqa xmm3, xmm7

  pmaddwd xmm4, [Inv_iMask_Coeff + 2*(%3)]
  pmaddwd xmm5, [Inv_iMask_Coeff + 2*(%3) + 16]

  pmaddwd xmm1, [Inv_iMask_Coeff + 2*(%3) + 32]
  pmaddwd xmm3, [Inv_iMask_Coeff + 2*(%3) + 48]

  psllw xmm0, 4
  psllw xmm2, 4

  paddd xmm4, [round32]
  paddd xmm5, [round32]

  paddd xmm1, [round32]
  paddd xmm3, [round32]

  psrad xmm4, 7
  psrad xmm5, 7

  psrad xmm1, 7
  psrad xmm3, 7

  packssdw xmm4, xmm5 ; Thresh
  packssdw xmm1, xmm3 ; Thresh

  psubusw xmm0, xmm4 ; Decimate by masking effect
  psubusw xmm2, xmm1

  paddusw xmm0, [iCSF_Round + %3]
  paddusw xmm2, [iCSF_Round + %3 + 16]

  pmulhuw xmm0, [iCSF_Coeff + %3]
  pmulhuw xmm2, [iCSF_Coeff + %3 + 16]

  pmaddwd xmm0, xmm0
  pmaddwd xmm2, xmm2

  paddd xmm0, xmm2
%endmacro

ALIGN SECTION_ALIGN
sseh8_16bit_sse2:

  PUSH_XMM6_XMM7

  mov TMP0, prm1 ; DCT_A
  mov TMP1, prm2 ; DCT_B
  mov _EAX, prm3 ; MASK

  movd xmm7, eax
  pshufd xmm7, xmm7, 0 ; broadcast the mask dword to all four lanes

  SSEH_SSE2 TMP0, TMP1, 0, xmm7
  movdqa xmm6, xmm0
  SSEH_SSE2 TMP0, TMP1, 32, xmm7
  paddd xmm6, xmm0
  SSEH_SSE2 TMP0, TMP1, 64, xmm7
  paddd xmm6, xmm0
  SSEH_SSE2 TMP0, TMP1, 96, xmm7
  paddd xmm6, xmm0

  ; convolute: horizontal sum of the four dwords
  pshufd xmm1, xmm6, 238
  paddd xmm6, xmm1

  pshufd xmm2, xmm6, 85
  paddd xmm6, xmm2

  movd eax, xmm6

  POP_XMM6_XMM7
  ret
ENDFUNC
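
; A rough C equivalent of sseh8_16bit_sse2, for reference. It assumes the
; tables above are visible as const uint32_t Inv_iMask_Coeff[64] and
; const uint16_t iCSF_Round[64], iCSF_Coeff[64]. The asm's 16-bit
; saturations and the signed pmaddwd of the weighted values are glossed
; over here:
;
;   #include <stdint.h>
;
;   uint32_t sseh8_16bit_ref(const int16_t *cur, const int16_t *ref,
;                            uint16_t mask)
;   {
;     uint32_t sum = 0;
;     for (int i = 0; i < 64; i++) {
;       int32_t  diff   = cur[i] - ref[i];                  /* psubsw  */
;       uint16_t d      = (uint16_t)((diff < 0 ? -diff : diff) << 4);
;       uint32_t thresh = (mask * Inv_iMask_Coeff[i] + 32) >> 7;
;       d = (d > thresh) ? (uint16_t)(d - thresh) : 0;      /* psubusw */
;       uint32_t w = (((uint32_t)d + iCSF_Round[i])
;                     * iCSF_Coeff[i]) >> 16;               /* pmulhuw */
;       sum += w * w;                              /* pmaddwd + paddd  */
;     }
;     return sum;
;   }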
|
|

;--------------------------------------------------------------------------------------------
; uint32_t blocksum8_sse2(const int8_t * cur, int stride, uint16_t sums[4], uint32_t squares[4])
;--------------------------------------------------------------------------------------------

; Per-4x4-sub-block pixel sums and sums of squares for an 8x8 block.
; %1 = cur, %2 = stride, %3 = 3*stride; xmm7 must be zero on entry.
%macro BLOCKSUM_SSE2 3
  movq xmm0, [%1]      ; 0 0 B A
  movq xmm2, [%1 + %2] ; 0 0 B A
  movq xmm1, [%1 + 2*%2]
  movq xmm3, [%1 + %3]

  punpckldq xmm0, xmm2 ; B B A A
  punpckldq xmm1, xmm3 ; B B A A

  movdqa xmm2, xmm0
  movdqa xmm3, xmm1

  psadbw xmm0, xmm7 ; 000b000a
  psadbw xmm1, xmm7

  movdqa xmm4, xmm2
  movdqa xmm5, xmm3

  punpcklbw xmm2, xmm7 ; aaaaaaaa
  punpcklbw xmm3, xmm7

  punpckhbw xmm4, xmm7 ; bbbbbbbb
  punpckhbw xmm5, xmm7

  pmaddwd xmm2, xmm2 ; a*a+a*a a*a+a*a a*a+a*a a*a+a*a
  pmaddwd xmm3, xmm3

  pmaddwd xmm4, xmm4 ; b*b+b*b b*b+b*b b*b+b*b b*b+b*b
  pmaddwd xmm5, xmm5

  paddd xmm2, xmm3
  paddd xmm4, xmm5

  movdqa xmm3, xmm2
  punpckldq xmm2, xmm4 ; BABA
  punpckhdq xmm3, xmm4 ; BABA

  paddd xmm2, xmm3

  lea %1, [%1 + 4*%2] ; advance to the lower half of the block

  movdqa xmm4, xmm2
  punpckhqdq xmm4, xmm7 ; bring the high qword down, top cleared

  paddd xmm2, xmm4

  ; same again for the lower half (sub-blocks C and D)
  movq xmm3, [%1]      ; 0 0 D C
  movq xmm5, [%1 + %2] ; 0 0 D C
  movq xmm4, [%1 + 2*%2]
  movq xmm6, [%1 + %3]

  punpckldq xmm3, xmm5 ; D D C C
  punpckldq xmm4, xmm6 ; D D C C

  movdqa xmm5, xmm3
  movdqa xmm6, xmm4

  psadbw xmm3, xmm7 ; 000d000c
  psadbw xmm4, xmm7

  packssdw xmm0, xmm3 ; 0d0c0b0a
  packssdw xmm1, xmm4

  paddusw xmm0, xmm1
  packssdw xmm0, xmm7 ; 0000dcba

  movdqa xmm3, xmm5
  movdqa xmm4, xmm6

  punpcklbw xmm3, xmm7
  punpcklbw xmm4, xmm7

  punpckhbw xmm5, xmm7
  punpckhbw xmm6, xmm7

  pmaddwd xmm3, xmm3 ; C*C+C*C
  pmaddwd xmm4, xmm4

  pmaddwd xmm5, xmm5 ; D*D+D*D
  pmaddwd xmm6, xmm6

  paddd xmm3, xmm4
  paddd xmm5, xmm6

  movdqa xmm1, xmm3
  punpckldq xmm3, xmm5 ; DCDC
  punpckhdq xmm1, xmm5 ; DCDC

  paddd xmm3, xmm1

  movdqa xmm4, xmm3
  punpckhqdq xmm4, xmm7 ; bring the high qword down, top cleared

  paddd xmm3, xmm4
  punpcklqdq xmm2, xmm3 ; squares of the four sub-blocks: D C B A
%endmacro

ALIGN SECTION_ALIGN
blocksum8_sse2:

  PUSH_XMM6_XMM7

  mov TMP0, prm1 ; cur
  mov TMP1, prm2 ; stride
  mov _EAX, prm3 ; sums

  push _EBP
  lea _EBP, [TMP1 + 2*TMP1] ; 3*stride

  pxor xmm7, xmm7

  BLOCKSUM_SSE2 TMP0, TMP1, _EBP

  pop _EBP
  mov TMP0, prm4 ; squares

  movq [_EAX], xmm0   ; sums of the 4x4 sub-blocks
  movdqa [TMP0], xmm2 ; squares of the 4x4 sub-blocks

  ; reduce the four 16-bit sub-block sums to the block total a+b+c+d
  pmaddwd xmm0, [ones]
  packssdw xmm0, xmm7

  pmaddwd xmm0, [ones]
  movd eax, xmm0

  POP_XMM6_XMM7
  ret
ENDFUNC
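
; A rough C equivalent of blocksum8_sse2, for reference. psadbw/pmaddwd
; treat the pixels as unsigned bytes, so the sketch uses uint8_t even
; though the prototype above declares int8_t. Sub-block order is
; top-left, top-right, bottom-left, bottom-right:
;
;   #include <stdint.h>
;
;   uint32_t blocksum8_ref(const uint8_t *cur, int stride,
;                          uint16_t sums[4], uint32_t squares[4])
;   {
;     uint32_t total = 0;
;     for (int b = 0; b < 4; b++) {
;       const uint8_t *p = cur + (b >> 1) * 4 * stride + (b & 1) * 4;
;       uint32_t s = 0, sq = 0;
;       for (int y = 0; y < 4; y++)
;         for (int x = 0; x < 4; x++) {
;           uint32_t v = p[y * stride + x];
;           s  += v;
;           sq += v * v;
;         }
;       sums[b]    = (uint16_t)s;
;       squares[b] = sq;
;       total     += s;
;     }
;     return total;
;   }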
|
|
NON_EXEC_STACK