20 |
; * along with this program; if not, write to the Free Software |
; * along with this program; if not, write to the Free Software |
21 |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
22 |
; * |
; * |
23 |
; * $Id: sad_mmx.asm,v 1.12 2004-03-22 22:36:24 edgomez Exp $ |
; * $Id: sad_mmx.asm,v 1.19 2008-11-11 20:46:24 Isibaar Exp $ |
24 |
; * |
; * |
25 |
; ***************************************************************************/ |
; ***************************************************************************/ |
26 |
|
|
28 |
|
|
29 |
;-----------------------------------------------------------------------------
; cglobal <name>
;
; Declares <name> as a global symbol and defines ENDFUNC, which is placed
; after a function body.
;
;   PREFIX      when defined, the exported symbol is _<name>, and <name> is
;               %define'd to _<name> so that later uses of the bare name
;               (including the label definition) pick up the underscore.
;   MARK_FUNCS  when defined, the symbol is declared with the ":function"
;               type plus a size expression (<sym>.endfunc - <sym>), and
;               ENDFUNC expands to the local label .endfunc. Otherwise
;               ENDFUNC expands to nothing.
;-----------------------------------------------------------------------------
%macro cglobal 1
    %ifdef PREFIX
        %ifdef MARK_FUNCS
            ; The size expression has to name the prefixed symbol: the
            ; function label becomes _<name> through the %define below, so
            ; its local end label is _<name>.endfunc.
            global _%1:function _%1.endfunc-_%1
            ; Map the bare name to the prefixed symbol only. The type/size
            ; clause belongs to the global directive, not to every use.
            %define %1 _%1
            %define ENDFUNC .endfunc
        %else
            global _%1
            %define %1 _%1
            %define ENDFUNC
        %endif
    %else
        %ifdef MARK_FUNCS
            global %1:function %1.endfunc-%1
            %define ENDFUNC .endfunc
        %else
            global %1
            %define ENDFUNC
        %endif
    %endif
%endmacro
50 |
|
|
53 |
;============================================================================= |
;============================================================================= |
54 |
|
|
55 |
; Select the read-only data section. When FORMAT_COFF is defined the plain
; section name is used; otherwise the section is also given 16-byte alignment.
%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif
60 |
|
|
61 |
ALIGN 16 |
ALIGN 16 |
78 |
lea eax, [eax+ecx] |
lea eax, [eax+ecx] |
79 |
movq mm5, mm2 |
movq mm5, mm2 |
80 |
psubusb mm2, mm3 |
psubusb mm2, mm3 |
|
lea edx, [edx+ecx] |
|
81 |
|
|
82 |
psubusb mm1, mm4 |
psubusb mm1, mm4 |
|
por mm0, mm1 |
|
83 |
psubusb mm3, mm5 |
psubusb mm3, mm5 |
84 |
|
por mm0, mm1 |
85 |
por mm2, mm3 |
por mm2, mm3 |
86 |
|
|
87 |
movq mm1, mm0 |
movq mm1, mm0 |
|
movq mm3, mm2 |
|
|
|
|
88 |
punpcklbw mm0,mm7 |
punpcklbw mm0,mm7 |
89 |
|
movq mm3, mm2 |
90 |
punpckhbw mm1,mm7 |
punpckhbw mm1,mm7 |
91 |
|
lea edx, [edx+ecx] |
92 |
punpcklbw mm2,mm7 |
punpcklbw mm2,mm7 |
|
punpckhbw mm3,mm7 |
|
|
|
|
93 |
paddusw mm0, mm1 |
paddusw mm0, mm1 |
94 |
|
punpckhbw mm3,mm7 |
95 |
paddusw mm6, mm0 |
paddusw mm6, mm0 |
96 |
paddusw mm2, mm3 |
paddusw mm2, mm3 |
97 |
paddusw mm6, mm2 |
paddusw mm6, mm2 |
98 |
|
|
99 |
%endmacro |
%endmacro |
100 |
|
|
101 |
%macro SAD_8x8_MMX 0 |
%macro SAD_8x8_MMX 0 |
114 |
psubusb mm2, mm3 |
psubusb mm2, mm3 |
115 |
|
|
116 |
psubusb mm1, mm4 |
psubusb mm1, mm4 |
|
por mm0, mm1 |
|
117 |
psubusb mm3, mm5 |
psubusb mm3, mm5 |
118 |
|
por mm0, mm1 |
119 |
por mm2, mm3 |
por mm2, mm3 |
120 |
|
|
121 |
movq mm1,mm0 |
movq mm1,mm0 |
|
movq mm3,mm2 |
|
|
|
|
122 |
punpcklbw mm0,mm7 |
punpcklbw mm0,mm7 |
123 |
|
movq mm3,mm2 |
124 |
punpckhbw mm1,mm7 |
punpckhbw mm1,mm7 |
125 |
punpcklbw mm2,mm7 |
punpcklbw mm2,mm7 |
|
punpckhbw mm3,mm7 |
|
|
|
|
126 |
paddusw mm0,mm1 |
paddusw mm0,mm1 |
127 |
|
punpckhbw mm3,mm7 |
128 |
paddusw mm6,mm0 |
paddusw mm6,mm0 |
129 |
paddusw mm2,mm3 |
paddusw mm2,mm3 |
130 |
paddusw mm6,mm2 |
paddusw mm6,mm2 |
131 |
%endmacro |
%endmacro |
132 |
|
|
133 |
|
|
134 |
;-----------------------------------------------------------------------------
; SADV_16x16_MMX
;
; Processes one 16-byte row:
;   - loads 16 bytes from [eax] and 16 bytes from [edx]
;   - forms per-byte absolute differences (two saturating subtractions,
;     one in each direction, OR-ed together)
;   - widens them to words by interleaving with mm7 and adds the halves
;   - adds the word sums for bytes 0..7 to mm5 and for bytes 8..15 to mm6
;     (saturating word adds)
;   - advances eax and edx by ecx
;
; Expects mm7 == 0 (set up by the caller; not visible in this macro).
; Clobbers mm0-mm4.
;-----------------------------------------------------------------------------
%macro SADV_16x16_MMX 0
    movq mm0, [eax]             ; first operand, bytes 0..7
    movq mm1, [edx]             ; second operand, bytes 0..7

    movq mm2, [eax+8]           ; first operand, bytes 8..15
    movq mm4, mm0               ; keep a copy for the reverse subtraction
    movq mm3, [edx+8]           ; second operand, bytes 8..15
    psubusb mm0, mm1            ; max(a-b, 0)

    psubusb mm1, mm4            ; max(b-a, 0)
    lea eax, [eax+ecx]          ; next row of the first operand
    por mm0, mm1                ; |a-b| for bytes 0..7

    movq mm4, mm2
    psubusb mm2, mm3

    psubusb mm3, mm4
    por mm2, mm3                ; |a-b| for bytes 8..15

    movq mm1, mm0
    punpcklbw mm0, mm7          ; low 4 differences -> words
    movq mm3, mm2
    punpckhbw mm1, mm7          ; high 4 differences -> words
    punpcklbw mm2, mm7
    paddusw mm0, mm1
    punpckhbw mm3, mm7
    paddusw mm5, mm0            ; accumulate bytes 0..7
    paddusw mm2, mm3
    lea edx, [edx+ecx]          ; next row of the second operand
    paddusw mm6, mm2            ; accumulate bytes 8..15
%endmacro
165 |
|
|
218 |
movq mm2, [eax+8] |
movq mm2, [eax+8] |
219 |
lea eax, [eax+ecx] |
lea eax, [eax+ecx] |
220 |
movq mm1, mm0 |
movq mm1, mm0 |
|
movq mm3, mm2 |
|
221 |
punpcklbw mm0, mm7 |
punpcklbw mm0, mm7 |
222 |
punpcklbw mm2, mm7 |
movq mm3, mm2 |
223 |
punpckhbw mm1, mm7 |
punpckhbw mm1, mm7 |
|
punpckhbw mm3, mm7 |
|
224 |
paddw mm5, mm0 |
paddw mm5, mm0 |
225 |
|
punpcklbw mm2, mm7 |
226 |
paddw mm6, mm1 |
paddw mm6, mm1 |
227 |
|
punpckhbw mm3, mm7 |
228 |
paddw mm5, mm2 |
paddw mm5, mm2 |
229 |
paddw mm6, mm3 |
paddw mm6, mm3 |
230 |
%endmacro |
%endmacro |
277 |
cglobal sad8bi_mmx |
cglobal sad8bi_mmx |
278 |
cglobal dev16_mmx |
cglobal dev16_mmx |
279 |
cglobal sse8_16bit_mmx |
cglobal sse8_16bit_mmx |
280 |
|
cglobal sse8_8bit_mmx |
281 |
|
|
282 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
283 |
; |
; |
326 |
movd eax, mm6 |
movd eax, mm6 |
327 |
|
|
328 |
ret |
ret |
329 |
|
ENDFUNC |
330 |
|
|
331 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
332 |
; |
; |
359 |
movd eax, mm6 |
movd eax, mm6 |
360 |
|
|
361 |
ret |
ret |
362 |
|
ENDFUNC |
363 |
|
|
364 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
365 |
; |
; |
450 |
pop ebx |
pop ebx |
451 |
|
|
452 |
ret |
ret |
453 |
|
ENDFUNC |
454 |
|
|
455 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
456 |
; |
; |
471 |
|
|
472 |
pxor mm6, mm6 ; accum2 |
pxor mm6, mm6 ; accum2 |
473 |
pxor mm7, mm7 |
pxor mm7, mm7 |
474 |
.Loop |
.Loop: |
475 |
SADBI_16x16_MMX 0, 0 |
SADBI_16x16_MMX 0, 0 |
476 |
SADBI_16x16_MMX 8, 1 |
SADBI_16x16_MMX 8, 1 |
477 |
SADBI_16x16_MMX 0, 0 |
SADBI_16x16_MMX 0, 0 |
515 |
pop ebx |
pop ebx |
516 |
|
|
517 |
ret |
ret |
518 |
|
ENDFUNC |
519 |
|
|
520 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
521 |
; |
; |
536 |
|
|
537 |
pxor mm6, mm6 ; accum2 |
pxor mm6, mm6 ; accum2 |
538 |
pxor mm7, mm7 |
pxor mm7, mm7 |
539 |
.Loop |
.Loop: |
540 |
SADBI_16x16_MMX 0, 1 |
SADBI_16x16_MMX 0, 1 |
541 |
SADBI_16x16_MMX 0, 1 |
SADBI_16x16_MMX 0, 1 |
542 |
SADBI_16x16_MMX 0, 1 |
SADBI_16x16_MMX 0, 1 |
554 |
movd eax, mm6 |
movd eax, mm6 |
555 |
pop ebx |
pop ebx |
556 |
ret |
ret |
557 |
|
ENDFUNC |
558 |
|
|
559 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
560 |
; |
; |
634 |
movd eax, mm6 |
movd eax, mm6 |
635 |
|
|
636 |
ret |
ret |
637 |
|
ENDFUNC |
638 |
|
|
639 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
640 |
; |
; |
644 |
; |
; |
645 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
646 |
|
|
647 |
%macro ROW_SSE_MMX 2 |
%macro ROW_SSE_16bit_MMX 2 |
648 |
movq mm0, [%1] |
movq mm0, [%1] |
649 |
movq mm1, [%1+8] |
movq mm1, [%1+8] |
650 |
psubw mm0, [%2] |
psubw mm0, [%2] |
668 |
pxor mm2, mm2 |
pxor mm2, mm2 |
669 |
|
|
670 |
;; Let's go |
;; Let's go |
671 |
ROW_SSE_MMX esi, edi |
%rep 8 |
672 |
lea esi, [esi+edx] |
ROW_SSE_16bit_MMX esi, edi |
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
|
lea esi, [esi+edx] |
|
|
lea edi, [edi+edx] |
|
|
ROW_SSE_MMX esi, edi |
|
673 |
lea esi, [esi+edx] |
lea esi, [esi+edx] |
674 |
lea edi, [edi+edx] |
lea edi, [edi+edx] |
675 |
|
%endrep |
676 |
|
|
677 |
;; Finish adding each dword of the accumulator |
;; Finish adding each dword of the accumulator |
678 |
movq mm3, mm2 |
movq mm3, mm2 |
684 |
pop edi |
pop edi |
685 |
pop esi |
pop esi |
686 |
ret |
ret |
687 |
|
ENDFUNC |
688 |
|
|
689 |
|
;-----------------------------------------------------------------------------
;
; uint32_t sse8_8bit_mmx(const int8_t *b1,
;                        const int8_t *b2,
;                        const uint32_t stride);
;
;-----------------------------------------------------------------------------
696 |
|
|
697 |
|
;-----------------------------------------------------------------------------
; ROW_SSE_8bit_MMX <addr1>, <addr2>
;
; Adds the squared differences of one 8-byte row to the accumulator.
;
;   %1, %2  address expressions of the two rows (8 bytes are read from each)
;   mm7     must be zero; it supplies the high byte when widening to words
;   mm6     accumulator, updated as two packed dword partial sums
;
; Clobbers mm0-mm3. The pointers are not advanced here.
;-----------------------------------------------------------------------------
%macro ROW_SSE_8bit_MMX 2
    movq mm0, [%1]              ; load a row
    movq mm2, [%2]              ; load a row

    movq mm1, mm0               ; copy row
    movq mm3, mm2               ; copy row

    punpcklbw mm0, mm7          ; zero-extend the 4 low bytes to 16 bit
    punpckhbw mm1, mm7          ; zero-extend the 4 high bytes to 16 bit

    punpcklbw mm2, mm7          ; zero-extend the 4 low bytes to 16 bit
    punpckhbw mm3, mm7          ; zero-extend the 4 high bytes to 16 bit

    psubw mm0, mm2              ; low part of src-dst
    psubw mm1, mm3              ; high part of src-dst

    pmaddwd mm0, mm0            ; square each word, add adjacent pairs
    pmaddwd mm1, mm1            ; square each word, add adjacent pairs

    paddd mm6, mm0              ; add to the accumulator
    paddd mm6, mm1              ; add to the accumulator
%endmacro
719 |
|
|
720 |
|
;-----------------------------------------------------------------------------
; sse8_8bit_mmx
;
; Sum of squared differences over 8 rows of 8 bytes each.
;
; In (stack, read after two pushes):
;   [esp+8+4]   b1      pointer to the first block
;   [esp+8+8]   b2      pointer to the second block
;   [esp+8+12]  stride  byte distance between rows, applied to both pointers
; Out:
;   eax         the sum
; Saves/restores esi and edi; uses edx and mm0-mm3, mm6, mm7.
;
; NOTE(review): every byte is zero-extended before the subtraction, so the
; samples are treated as unsigned; the file's prototype comment says
; int8_t -- confirm against the C declaration.
; NOTE(review): no emms is executed here; presumably the caller is
; responsible for it -- confirm.
;-----------------------------------------------------------------------------
sse8_8bit_mmx:
    push esi
    push edi

    ;; Load the function params (8 bytes of saved registers + return address)
    mov esi, [esp+8+4]          ; b1
    mov edi, [esp+8+8]          ; b2
    mov edx, [esp+8+12]         ; stride

    ;; Reset the sse accumulator
    pxor mm6, mm6

    ;; Used to interleave 8bit data with 0x00 values
    pxor mm7, mm7

    ;; Let's go: one row per repetition, then step both pointers
%rep 8
    ROW_SSE_8bit_MMX esi, edi
    lea esi, [esi+edx]
    lea edi, [edi+edx]
%endrep

    ;; Finish adding each dword of the accumulator
    movq mm7, mm6
    psrlq mm6, 32               ; bring the high dword down
    paddd mm6, mm7              ; low dword = low + high partial sums
    movd eax, mm6

    ;; All done
    pop edi
    pop esi
    ret
ENDFUNC
753 |
|
|
754 |
|
|
755 |
|
; For ELF output only: emit an empty .note.GNU-stack section (not allocated,
; not executable, not writable), the conventional marker that this object
; does not need an executable stack.
%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif
758 |
|
|