51 |
paddw %1, %2 |
paddw %1, %2 |
52 |
%endmacro |
%endmacro |
53 |
|
|
|
;load a dq from mem to a xmm reg |
|
|
%macro LOAD_XMM 2 |
|
|
movdqu %1,[%2] |
|
|
;movhps %1,[%2+8] |
|
|
%endmacro |
|
|
|
|
|
%macro WRITE_XMM 2 |
|
|
;movlps [%1],%2 |
|
|
;movhps [%1+8],%2 |
|
|
movdqu [%1],%2 |
|
|
%endmacro |
|
|
|
|
54 |
%macro CONSIM_1x8_SSE2 0 |
%macro CONSIM_1x8_SSE2 0 |
55 |
LOAD_XMM xmm0,ecx |
movdqu xmm0,[ecx] |
56 |
LOAD_XMM xmm1,edx |
movdqu xmm1,[edx] |
|
pxor xmm2,xmm2 |
|
57 |
|
|
58 |
;unpack to words |
;unpack to words |
59 |
punpcklbw xmm0,xmm2 |
punpcklbw xmm0,xmm2 |
60 |
punpcklbw xmm1,xmm2 |
punpcklbw xmm1,xmm2 |
61 |
|
|
62 |
;devo |
movaps xmm3,xmm0 |
63 |
psubw xmm0,xmm6 |
movaps xmm4,xmm1 |
|
movaps xmm2,xmm0 |
|
|
pmaddwd xmm2,xmm0 |
|
|
paddd xmm3,xmm2 |
|
|
|
|
|
;devc |
|
|
psubw xmm1,xmm7 |
|
|
movaps xmm2,xmm1 |
|
|
pmaddwd xmm2,xmm1 |
|
|
paddd xmm4,xmm2 |
|
|
|
|
|
;corr |
|
|
pmaddwd xmm1,xmm0 |
|
|
paddd xmm5,xmm1 |
|
|
%endmacro |
|
64 |
|
|
65 |
|
pmaddwd xmm0,xmm0;orig |
66 |
|
pmaddwd xmm1,xmm1;comp |
67 |
|
pmaddwd xmm3,xmm4;corr |
68 |
|
|
69 |
|
paddd xmm5,xmm0 |
70 |
|
paddd xmm6,xmm1 |
71 |
|
paddd xmm7,xmm3 |
72 |
|
%endmacro |
73 |
|
|
74 |
%macro CONSIM_1x8_MMX 0 |
%macro CONSIM_1x8_MMX 0 |
75 |
movq mm0,[ecx];orig |
movq mm0,[ecx];orig |
76 |
movq mm1,[edx];comp |
movq mm1,[edx];comp |
|
pxor mm2,mm2;null vector |
|
77 |
|
|
78 |
;unpack low half of qw to words |
;unpack low half of qw to words |
79 |
punpcklbw mm0,mm2 |
punpcklbw mm0,mm2 |
80 |
punpcklbw mm1,mm2 |
punpcklbw mm1,mm2 |
81 |
|
|
82 |
;devo |
movq mm3,mm0 |
83 |
psubw mm0,mm6 |
pmaddwd mm3,mm0 |
84 |
movq mm2,mm0 |
paddd mm5,mm3; |
85 |
pmaddwd mm2,mm0 |
|
86 |
paddd mm3,mm2; |
movq mm4,mm1 |
87 |
|
pmaddwd mm4,mm1 |
88 |
;devc |
paddd mm6,mm4; |
|
psubw mm1,mm7 |
|
|
movq mm2,mm1 |
|
|
pmaddwd mm2,mm1 |
|
|
paddd mm4,mm2 |
|
89 |
|
|
|
;corr |
|
90 |
pmaddwd mm1,mm0 |
pmaddwd mm1,mm0 |
91 |
paddd mm5,mm1 |
paddd mm7,mm1 |
92 |
|
|
93 |
movq mm0,[ecx] |
movq mm0,[ecx];orig |
94 |
movq mm1,[edx] |
movq mm1,[edx];comp |
|
pxor mm2,mm2;null vector |
|
95 |
|
|
96 |
;unpack high half of qw to words |
;unpack high half of qw to words |
97 |
punpckhbw mm0,mm2 |
punpckhbw mm0,mm2 |
98 |
punpckhbw mm1,mm2 |
punpckhbw mm1,mm2 |
99 |
|
|
100 |
;devo |
movq mm3,mm0 |
101 |
psubw mm0,mm6 |
pmaddwd mm3,mm0 |
102 |
movq mm2,mm0 |
paddd mm5,mm3; |
103 |
pmaddwd mm2,mm0 |
|
104 |
paddd mm3,mm2; |
movq mm4,mm1 |
105 |
|
pmaddwd mm4,mm1 |
106 |
;devc |
paddd mm6,mm4; |
|
psubw mm1,mm7 |
|
|
movq mm2,mm1 |
|
|
pmaddwd mm2,mm1 |
|
|
paddd mm4,mm2 |
|
107 |
|
|
|
;corr |
|
108 |
pmaddwd mm1,mm0 |
pmaddwd mm1,mm0 |
109 |
paddd mm5,mm1 |
paddd mm7,mm1 |
110 |
%endmacro |
%endmacro |
111 |
|
|
112 |
|
%macro CONSIM_WRITEOUT 3 |
113 |
|
mov eax,[esp + 16];lumo |
114 |
|
mul eax; lumo^2 |
115 |
|
add eax, 32 |
116 |
|
shr eax,6; 64*lum0^2 |
117 |
|
movd ecx,%1 |
118 |
|
sub ecx,eax |
119 |
|
|
120 |
|
mov edx,[esp + 24]; pdevo |
121 |
|
mov [edx],ecx |
122 |
|
|
123 |
|
mov eax,[esp + 20];lumc |
124 |
|
mul eax; lumc^2 |
125 |
|
add eax, 32 |
126 |
|
shr eax,6; 64*lumc^2 |
127 |
|
movd ecx,%2 |
128 |
|
sub ecx,eax |
129 |
|
|
130 |
|
mov edx,[esp + 28]; pdevc |
131 |
|
mov [edx],ecx |
132 |
|
|
133 |
|
mov eax,[esp + 16];lumo |
134 |
|
mul dword [esp + 20]; lumo*lumc, should fit in eax |
135 |
|
add eax, 32 |
136 |
|
shr eax,6; 64*lumo*lumc |
137 |
|
movd ecx,%3 |
138 |
|
sub ecx,eax |
139 |
|
|
140 |
|
mov edx,[esp + 32]; pcorr |
141 |
|
mov [edx],ecx |
142 |
|
%endmacro |
143 |
|
|
144 |
|
|
145 |
SECTION .text |
SECTION .text |
174 |
.endfunc |
.endfunc |
175 |
|
|
176 |
ALIGN 16 |
ALIGN 16 |
177 |
consim_mmx: |
consim_sse2: |
178 |
mov ecx,[esp+4] ;ptro |
mov ecx,[esp+4] ;ptro |
|
pxor mm6,mm6; |
|
|
|
|
179 |
mov edx,[esp+8] ;ptrc |
mov edx,[esp+8] ;ptrc |
|
pxor mm3,mm3;devo |
|
|
pxor mm4,mm4;devc |
|
|
movd mm6,[esp + 16];lumo |
|
|
pxor mm7,mm7 |
|
180 |
mov eax,[esp+12];stride |
mov eax,[esp+12];stride |
|
movd mm7,[esp + 20];lumc |
|
|
pshufw mm6,mm6,00000000b ; TODO: remove later! not MMX, but SSE |
|
|
pxor mm5,mm5;corr |
|
|
pshufw mm7,mm7,00000000b |
|
181 |
|
|
182 |
CONSIM_1x8_MMX |
pxor xmm2,xmm2;null vektor |
183 |
|
pxor xmm5,xmm5;devo |
184 |
|
pxor xmm6,xmm6;devc |
185 |
|
pxor xmm7,xmm7;corr |
186 |
|
|
187 |
|
;broadcast lumo/c |
188 |
|
punpcklbw xmm6,xmm6 |
189 |
|
punpcklwd xmm6,xmm6 |
190 |
|
pshufd xmm6,xmm6,00000000b;or shufps |
191 |
|
punpcklbw xmm7,xmm7 |
192 |
|
punpcklwd xmm7,xmm7 |
193 |
|
pshufd xmm7,xmm7,00000000b |
194 |
|
|
195 |
|
CONSIM_1x8_SSE2 |
196 |
add ecx,eax |
add ecx,eax |
197 |
add edx,eax |
add edx,eax |
198 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
199 |
add ecx,eax |
add ecx,eax |
200 |
add edx,eax |
add edx,eax |
201 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
202 |
add ecx,eax |
add ecx,eax |
203 |
add edx,eax |
add edx,eax |
204 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
205 |
add ecx,eax |
add ecx,eax |
206 |
add edx,eax |
add edx,eax |
207 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
208 |
add ecx,eax |
add ecx,eax |
209 |
add edx,eax |
add edx,eax |
210 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
211 |
add ecx,eax |
add ecx,eax |
212 |
add edx,eax |
add edx,eax |
213 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
214 |
add ecx,eax |
add ecx,eax |
215 |
add edx,eax |
add edx,eax |
216 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
217 |
|
|
218 |
pshufw mm0,mm3,01001110b |
;accumulate xmm5-7 |
219 |
paddd mm3,mm0 |
pshufd xmm0, xmm5, 0EH |
220 |
pshufw mm1,mm4,01001110b |
paddd xmm5, xmm0 |
221 |
paddd mm4,mm1 |
pshufd xmm0, xmm5, 01H |
222 |
pshufw mm2,mm5,01001110b |
paddd xmm5, xmm0 |
223 |
paddd mm5,mm2 |
|
224 |
|
pshufd xmm1, xmm6, 0EH |
225 |
|
paddd xmm6, xmm1 |
226 |
|
pshufd xmm1, xmm6, 01H |
227 |
|
paddd xmm6, xmm1 |
228 |
|
|
229 |
|
pshufd xmm2, xmm7, 0EH |
230 |
|
paddd xmm7, xmm2 |
231 |
|
pshufd xmm2, xmm7, 01H |
232 |
|
paddd xmm7, xmm2 |
233 |
|
|
234 |
;load target pointer |
CONSIM_WRITEOUT xmm5,xmm6,xmm7 |
|
mov ecx,[esp + 24]; pdevo |
|
|
movd [ecx],mm3 |
|
|
mov edx,[esp + 28]; pdevc |
|
|
movd [edx],mm4 |
|
|
mov eax,[esp + 32]; corr |
|
|
movd [eax],mm5 |
|
|
emms |
|
235 |
ret |
ret |
236 |
.endfunc |
.endfunc |
237 |
|
|
238 |
consim_sse2: |
|
239 |
|
|
240 |
|
|
241 |
|
|
242 |
|
ALIGN 16 |
243 |
|
consim_mmx: |
244 |
mov ecx,[esp+4] ;ptro |
mov ecx,[esp+4] ;ptro |
|
pxor xmm6,xmm6; |
|
245 |
mov edx,[esp+8] ;ptrc |
mov edx,[esp+8] ;ptrc |
|
pxor xmm3,xmm3;devo |
|
|
pxor xmm4,xmm4;devc |
|
|
movd xmm6,[esp + 16];lumo |
|
|
pxor xmm7,xmm7 |
|
246 |
mov eax,[esp+12];stride |
mov eax,[esp+12];stride |
247 |
movd xmm7,[esp + 20];lumc |
pxor mm2,mm2;null |
248 |
pxor xmm5,xmm5;corr |
pxor mm5,mm5;devo |
249 |
|
pxor mm6,mm6;devc |
250 |
|
pxor mm7,mm7;corr |
251 |
|
|
252 |
;broadcast lumo/c |
CONSIM_1x8_MMX |
|
;punpcklbw xmm6,xmm6 |
|
|
punpcklwd xmm6,xmm6 |
|
|
pshufd xmm6,xmm6,00000000b;or shufps |
|
|
;punpcklbw xmm7,xmm7 |
|
|
punpcklwd xmm7,xmm7 |
|
|
pshufd xmm7,xmm7,00000000b |
|
|
|
|
|
CONSIM_1x8_SSE2 |
|
253 |
add ecx,eax |
add ecx,eax |
254 |
add edx,eax |
add edx,eax |
255 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
256 |
add ecx,eax |
add ecx,eax |
257 |
add edx,eax |
add edx,eax |
258 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
259 |
add ecx,eax |
add ecx,eax |
260 |
add edx,eax |
add edx,eax |
261 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
262 |
add ecx,eax |
add ecx,eax |
263 |
add edx,eax |
add edx,eax |
264 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
265 |
add ecx,eax |
add ecx,eax |
266 |
add edx,eax |
add edx,eax |
267 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
268 |
add ecx,eax |
add ecx,eax |
269 |
add edx,eax |
add edx,eax |
270 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
271 |
add ecx,eax |
add ecx,eax |
272 |
add edx,eax |
add edx,eax |
273 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
274 |
|
|
275 |
;accumulate xmm3-5 |
movq mm0,mm5 |
276 |
pshufd xmm0, xmm3, 0EH ; Get bit 64-127 from xmm1 (or use movhlps) |
psrlq mm0,32 |
277 |
paddd xmm3, xmm0 ; Sums are in 2 dwords |
paddd mm5,mm0 |
278 |
pshufd xmm0, xmm3, 01H ; Get bit 32-63 from xmm0 |
movq mm1,mm6 |
279 |
paddd xmm3, xmm0 ; Sum is in one dword |
psrlq mm1,32 |
280 |
|
paddd mm6,mm1 |
281 |
pshufd xmm1, xmm4, 0EH ; Get bit 64-127 from xmm1 (or use movhlps) |
movq mm2,mm7 |
282 |
paddd xmm4, xmm1 ; Sums are in 2 dwords |
psrlq mm2,32 |
283 |
pshufd xmm1, xmm4, 01H ; Get bit 32-63 from xmm0 |
paddd mm7,mm2 |
284 |
paddd xmm4, xmm1 ; Sum is in one dword |
|
285 |
|
CONSIM_WRITEOUT mm5,mm6,mm7 |
|
pshufd xmm2, xmm5, 0EH ; Get bit 64-127 from xmm1 (or use movhlps) |
|
|
paddd xmm5, xmm2 ; Sums are in 2 dwords |
|
|
pshufd xmm2, xmm5, 01H ; Get bit 32-63 from xmm0 |
|
|
paddd xmm5, xmm2 ; Sum is in one dword |
|
|
|
|
|
|
|
|
;load target pointer |
|
|
mov ecx,[esp + 24]; pdevo |
|
|
movd [ecx],xmm3 |
|
|
mov edx,[esp + 28]; pdevc |
|
|
movd [edx],xmm4 |
|
|
mov eax,[esp + 32]; corr |
|
|
movd [eax],xmm5 |
|
286 |
ret |
ret |
287 |
.endfunc |
.endfunc |