Annotation of /trunk/xvidcore/src/motion/x86_asm/sad_sse2.asm

Revision 262 - (view) (download)

1 :	Isibaar	262	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * sse2 sum of absolute difference
5 :			; *
6 :			; * This program is free software; you can redistribute it and/or modify
7 :			; * it under the terms of the GNU General Public License as published by
8 :			; * the Free Software Foundation; either version 2 of the License, or
9 :			; * (at your option) any later version.
10 :			; *
11 :			; * This program is distributed in the hope that it will be useful,
12 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 :			; * GNU General Public License for more details.
15 :			; *
16 :			; * You should have received a copy of the GNU General Public License
17 :			; * along with this program; if not, write to the Free Software
18 :			; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 :			; *
20 :			; *************************************************************************/
21 :
22 :			;/**************************************************************************
23 :			; *
24 :			; * History:
25 :			; *
26 :			; * 24.05.2002 initial version; (c)2002 Dmitry Rozhdestvensky
27 :			; *
28 :			; *************************************************************************/
29 :
30 :			bits 32
31 :
; cglobal <symbol>
;   Declares <symbol> as a global.  When PREFIX is defined the global is
;   emitted with a leading underscore and <symbol> is redefined to expand
;   to the underscored name, so the rest of the file can keep using the
;   plain name.
%macro cglobal 1
    %ifndef PREFIX
        global  %1
    %else
        global  _%1
        %define %1 _%1
    %endif
%endmacro
40 :
; Build-time switches.
;   sad_debug              branch used by sad16_sse2:
;                            0 = autodetect, 1 = unaligned,
;                            2 = ref unaligned, 3 = aligned
;   dev_debug              branch used by dev16_sse2:
;                            0 = autodetect, 1 = unaligned, 2 = aligned
;   test_stride_alignment  when set, autodetect also tests the stride
;   early_return           1 = compare partial sums against best_sad in sad16
%define sad_debug             0
%define dev_debug             2
%define test_stride_alignment 0
%define early_return          0
45 :
section .data

; buffer : 8 x 16 bytes of scratch space; sad_mean16x8_ul stores the rows
;          it loads here with movdqa, hence the alignment.
; zero   : 16 zero bytes, used as a psadbw memory operand.
align 64
buffer: times 8*4 dd 0
zero:   times 4 dd 0

section .text

; Entry points exported by this file.
cglobal sad16_sse2
cglobal dev16_sse2
56 :
57 :			;===========================================================================
58 :			; General macros for SSE2 code
59 :			;===========================================================================
60 :
; load_stride <stride>
;   Out: ecx = stride*3, edx = stride*4
;   <stride> is read twice while ecx/edx are being built, so it must not
;   be ecx or edx itself.
%macro load_stride 1
        mov     edx, %1
        add     edx, edx                ; edx = stride*2
        mov     ecx, edx
        add     ecx, %1                 ; ecx = stride*3
        add     edx, edx                ; edx = stride*4
%endmacro
68 :
; sad8lines <ptr>
;   psadbw of xmm0..xmm7 against eight consecutive rows starting at <ptr>
;   (xmmN is paired with row N).  Each register ends up holding the two
;   partial sums psadbw produces for its row.
;   In : ebx = stride, ecx = stride*3, edx = stride*4
;   Out: <ptr> advanced by 8 rows
;   The rows are used directly as psadbw memory operands, so every row
;   address has to be 16-byte aligned.
%macro sad8lines 1
        ; rows 0..3
        psadbw  xmm0, [%1]
        psadbw  xmm1, [%1 + ebx]
        psadbw  xmm2, [%1 + ebx*2]
        psadbw  xmm3, [%1 + ecx]
        add     %1, edx

        ; rows 4..7
        psadbw  xmm4, [%1]
        psadbw  xmm5, [%1 + ebx]
        psadbw  xmm6, [%1 + ebx*2]
        psadbw  xmm7, [%1 + ecx]
        add     %1, edx
%endmacro
85 :
; after_sad <dst32>
;   Sums word 0 and word 4 (the two psadbw partial sums) of xmm0..xmm7
;   and writes the total to <dst32>.
;   Clobbers xmm0, xmm2, xmm4, xmm5, xmm6.
%macro after_sad 1
        ; fold xmm0..xmm3 into xmm0
        paddusw xmm0, xmm1
        paddusw xmm2, xmm3
        paddusw xmm0, xmm2

        ; fold xmm4..xmm7 into xmm4
        paddusw xmm4, xmm5
        paddusw xmm6, xmm7
        paddusw xmm4, xmm6

        paddusw xmm4, xmm0              ; xmm4 = per-half totals

        pshufd  xmm5, xmm4, 11111110b   ; bring the upper half's sum down to word 0
        paddusw xmm5, xmm4

        pextrw  %1, xmm5, 0             ; less latency than movd
%endmacro
102 :
; restore <had_ebp>
;   Pops the registers saved on entry: edi, esi, ebx, preceded by ebp
;   when <had_ebp> is 1.
%macro restore 1
    %if %1 == 1
        pop     ebp
    %endif
        pop     edi
        pop     esi
        pop     ebx
%endmacro
112 :
113 :			;===========================================================================
114 :			;
115 :			; uint32_t sad16_sse2 (const uint8_t * const cur,
116 :			; const uint8_t * const ref,
117 :			; const uint32_t stride,
118 :			; const uint32_t best_sad);
119 :			;
120 :			;
121 :			;===========================================================================
122 :
;---------------------------------------------------------------------------
; sad16_sse2 entry point / branch dispatcher
;
; Stack arguments (after the three pushes below):
;   [esp + 3*4 +  4] cur, [esp + 3*4 +  8] ref,
;   [esp + 3*4 + 12] stride, [esp + 3*4 + 16] best_sad
;
; On exit to a branch: ebx = stride, esi/edi = the two block pointers.
; SAD is symmetric, so the pointers are swapped when that lets the
; aligned one end up in edi (the branches use edi as a psadbw memory
; operand, esi is loaded explicitly).
;
; Changes relative to the previous version:
;   * both pointers are loaded before any branch is taken; the stride
;     test used to jump to the unaligned branch with esi/edi unset
;   * the stray "cglobal sad16_sse2" inside the body is gone (the symbol
;     is already exported at the top of the file)
;   * conditional jumps to other branches are explicitly "near"
;---------------------------------------------------------------------------
align 16
sad16_sse2:
        push    ebx
        push    esi
        push    edi

        mov     ebx, [esp + 3*4 + 12]   ; stride
        mov     edi, [esp + 3*4 + 4]    ; cur (most likely aligned)
        mov     esi, [esp + 3*4 + 8]    ; ref

%if sad_debug = 1
        jmp     sad16_sse2_ul
%elif sad_debug = 2
        jmp     sad16_sse2_semial
%elif sad_debug = 3
        jmp     sad16_sse2_al
%else
  %if test_stride_alignment <> 0
        test    ebx, 15
        jnz     near sad16_sse2_ul
  %endif

        test    edi, 15
        jz      .cur_aligned

        ; cur is unaligned: make it the explicitly loaded pointer
        xchg    esi, edi                ; esi = cur, edi = ref (flags untouched)
        test    edi, 15
        jnz     near sad16_sse2_ul      ; neither pointer aligned
        jmp     sad16_sse2_semial

.cur_aligned:
        test    esi, 15
        jnz     near sad16_sse2_semial  ; only ref unaligned
        jmp     sad16_sse2_al
%endif
166 :
167 :			;===========================================================================
168 :			; Branch requires 16-byte alignment of esi and edi and stride
169 :			;===========================================================================
170 :
; sad16x8_al <dst32>
;   SAD of 8 rows x 16 bytes between the blocks at esi and edi.
;   In : ebx = stride, ecx = stride*3, edx = stride*4
;   Out: <dst32> = sum; esi and edi advanced by 8 rows
;   Clobbers xmm0..xmm7.
;   esi rows are loaded with movdqa and edi rows are psadbw memory
;   operands, so both pointers and the stride must be 16-byte aligned.
%macro sad16x8_al 1
        ; rows 0..3 from esi
        movdqa  xmm0, [esi]
        movdqa  xmm1, [esi + ebx]
        movdqa  xmm2, [esi + ebx*2]
        movdqa  xmm3, [esi + ecx]
        add     esi, edx

        ; rows 4..7 from esi
        movdqa  xmm4, [esi]
        movdqa  xmm5, [esi + ebx]
        movdqa  xmm6, [esi + ebx*2]
        movdqa  xmm7, [esi + ecx]
        add     esi, edx

        sad8lines edi                   ; against the matching rows at edi
        after_sad %1                    ; reduce to a single value
%endmacro
192 :
;---------------------------------------------------------------------------
; sad16_sse2_al: both pointers and the stride are 16-byte aligned.
;   In : esi, edi = block pointers, ebx = stride
;        (ebx, esi, edi already pushed by sad16_sse2)
;   Out: eax = SAD of the 16x16 block, or with early_return=1 the SAD of
;        the first 8 rows when that already exceeds best_sad.
;
; best_sad is a uint32_t, so the early-return test uses the unsigned
; condition (ja); the previous signed jg mis-ordered values >= 2^31.
;---------------------------------------------------------------------------
align 16
sad16_sse2_al:
        load_stride ebx

        sad16x8_al eax                  ; rows 0..7

%if early_return = 1
        cmp     eax, [esp + 3*4 + 16]   ; best_sad
        ja      near continue_al
%endif

        sad16x8_al ebx                  ; rows 8..15; stride in ebx is dead after the loads
        add     eax, ebx

continue_al:
        restore 0
        ret
212 :
213 :			;===========================================================================
214 :			; Branch requires 16-byte alignment of the edi and stride only
215 :			;===========================================================================
216 :
; sad16x8_semial <dst32>
;   Same as sad16x8_al, except that the esi rows are loaded with movdqu,
;   so only edi and the stride need 16-byte alignment.
;   In : ebx = stride, ecx = stride*3, edx = stride*4
;   Out: <dst32> = sum; esi and edi advanced by 8 rows
;   Clobbers xmm0..xmm7.
%macro sad16x8_semial 1
        ; rows 0..3 from esi (unaligned loads)
        movdqu  xmm0, [esi]
        movdqu  xmm1, [esi + ebx]
        movdqu  xmm2, [esi + ebx*2]
        movdqu  xmm3, [esi + ecx]
        add     esi, edx

        ; rows 4..7 from esi
        movdqu  xmm4, [esi]
        movdqu  xmm5, [esi + ebx]
        movdqu  xmm6, [esi + ebx*2]
        movdqu  xmm7, [esi + ecx]
        add     esi, edx

        sad8lines edi                   ; edi rows are aligned memory operands
        after_sad %1
%endmacro
238 :
;---------------------------------------------------------------------------
; sad16_sse2_semial: edi and the stride are 16-byte aligned, esi may be
; unaligned.
;   In : esi, edi = block pointers, ebx = stride
;        (ebx, esi, edi already pushed by sad16_sse2)
;   Out: eax = SAD of the 16x16 block, or with early_return=1 the SAD of
;        the first 8 rows when that already exceeds best_sad.
;
; best_sad is a uint32_t, so the early-return test uses the unsigned
; condition (ja) instead of the previous signed jg.
;---------------------------------------------------------------------------
align 16
sad16_sse2_semial:
        load_stride ebx

        sad16x8_semial eax              ; rows 0..7

%if early_return = 1
        cmp     eax, [esp + 3*4 + 16]   ; best_sad
        ja      near cont_semial
%endif

        sad16x8_semial ebx              ; rows 8..15; stride in ebx is dead after the loads
        add     eax, ebx

cont_semial:
        restore 0
        ret
258 :
259 :
260 :			;===========================================================================
261 :			; Branch does not require alignment, even stride
262 :			;===========================================================================
263 :
; sad16x4_ul <dst32>
;   SAD of 4 rows x 16 bytes between the blocks at esi and edi; every
;   row is fetched with movdqu, so nothing needs to be aligned.
;   In : ebx = stride, ecx = stride*3, edx = stride*4
;   Out: <dst32> = sum; esi and edi advanced by 4 rows
;   Clobbers xmm0..xmm7.
%macro sad16x4_ul 1
        ; four rows from esi
        movdqu  xmm0, [esi]
        movdqu  xmm1, [esi + ebx]
        movdqu  xmm2, [esi + ebx*2]
        movdqu  xmm3, [esi + ecx]
        add     esi, edx

        ; four rows from edi
        movdqu  xmm4, [edi]
        movdqu  xmm5, [edi + ebx]
        movdqu  xmm6, [edi + ebx*2]
        movdqu  xmm7, [edi + ecx]
        add     edi, edx

        ; per-row SAD, results left in xmm4..xmm7
        psadbw  xmm4, xmm0
        psadbw  xmm5, xmm1
        psadbw  xmm6, xmm2
        psadbw  xmm7, xmm3

        ; reduce the four rows to one value
        paddusw xmm4, xmm5
        paddusw xmm6, xmm7
        paddusw xmm4, xmm6

        pshufd  xmm7, xmm4, 11111110b   ; bring the upper half's sum down to word 0
        paddusw xmm7, xmm4

        pextrw  %1, xmm7, 0
%endmacro
294 :
295 :
;---------------------------------------------------------------------------
; sad16_sse2_ul: no alignment requirement on pointers or stride.
;   In : esi, edi = block pointers, ebx = stride
;        (ebx, esi, edi already pushed by sad16_sse2; ebp is pushed here,
;        which is why best_sad sits at [esp + 4*4 + 16])
;   Out: eax = SAD of the 16x16 block, or with early_return=1 the running
;        sum at the first 4-row boundary where it exceeds best_sad.
;
; best_sad is a uint32_t, so the early-return tests use the unsigned
; condition (ja) instead of the previous signed jg.  The jumps are
; "near" because the first one skips three macro expansions.
;---------------------------------------------------------------------------
align 16
sad16_sse2_ul:
        load_stride ebx

        push    ebp                     ; scratch for the per-group sums

        sad16x4_ul eax                  ; rows 0..3

%if early_return = 1
        cmp     eax, [esp + 4*4 + 16]   ; best_sad
        ja      near continue_ul
%endif

        sad16x4_ul ebp                  ; rows 4..7
        add     eax, ebp

%if early_return = 1
        cmp     eax, [esp + 4*4 + 16]   ; best_sad
        ja      near continue_ul
%endif

        sad16x4_ul ebp                  ; rows 8..11
        add     eax, ebp

%if early_return = 1
        cmp     eax, [esp + 4*4 + 16]   ; best_sad
        ja      near continue_ul
%endif

        sad16x4_ul ebp                  ; rows 12..15
        add     eax, ebp

continue_ul:
        restore 1
        ret
332 :
333 :			;===========================================================================
334 :			;
335 :			; uint32_t dev16_sse2(const uint8_t * const cur,
336 :			; const uint32_t stride);
337 :			;
338 :			; experimental!
339 :			;
340 :			;===========================================================================
341 :
;---------------------------------------------------------------------------
; dev16_sse2 entry point / branch dispatcher
;
; Stack arguments (after the four pushes below):
;   [esp + 4*4 + 4] cur, [esp + 4*4 + 8] stride
;
; On exit to a branch: esi = cur, ebx = stride and
;   dev16_sse2_al : edi = cur    (second pass re-reads the block)
;   dev16_sse2_ul : edi = buffer
;
; Changes relative to the previous version:
;   * with dev_debug=2 the aligned branch was entered with edi still
;     pointing at buffer, so its second pass measured the scratch buffer
;     instead of cur; edi is now set to cur on every path into _al
;   * the stray "cglobal dev16_sse2" inside the body is gone (the symbol
;     is already exported at the top of the file)
;   * conditional jumps to other branches are explicitly "near"
;---------------------------------------------------------------------------
align 16
dev16_sse2:
        push    ebx
        push    esi
        push    edi
        push    ebp

        mov     esi, [esp + 4*4 + 4]    ; cur
        mov     ebx, [esp + 4*4 + 8]    ; stride

%if dev_debug = 1
        mov     edi, buffer
        jmp     dev16_sse2_ul
%elif dev_debug = 2
        mov     edi, esi                ; aligned branch reads cur twice
        jmp     dev16_sse2_al
%else
        mov     edi, buffer

        test    esi, 15
        jnz     near dev16_sse2_ul

  %if test_stride_alignment = 1
        test    ebx, 15
        jnz     near dev16_sse2_ul
  %endif

        mov     edi, esi
        jmp     dev16_sse2_al
%endif
373 :
374 :			;===========================================================================
375 :			; Branch requires alignment of both the cur and stride
376 :			;===========================================================================
377 :
; make_mean
;   In : eax, ebp = the two partial sums of the block
;   Out: eax = byte 1 of (eax + ebp), i.e. sum/256, replicated into all
;        four bytes; ebp clobbered
;   Relies on the sum fitting in 16 bits (upper half of eax zero); for a
;   16x16 block of bytes the sum is at most 65280.
%macro make_mean 0
        add     eax, ebp                ; eax = 16-bit total
        mov     al, ah                  ; eax = {0, 0, sum/256, sum/256}
        mov     ebp, eax
        shl     ebp, 16                 ; ebp = {sum/256, sum/256, 0, 0}
        or      eax, ebp                ; all four bytes = sum/256
%endmacro
385 :
; sad_mean16x8_al <dst32>, <use_mean>, <ptr>
;   <use_mean> = 0 : sums the bytes of 8 rows at <ptr> (SAD against zero)
;   <use_mean> != 0: SAD of 8 rows at <ptr> against the dword in eax
;                    broadcast to all 16 bytes
;   In : ebx = stride, ecx = stride*3, edx = stride*4
;   Out: <dst32> = result; <ptr> advanced by 8 rows
;   Clobbers xmm0..xmm7.  Rows are psadbw memory operands, so <ptr> and
;   the stride must be 16-byte aligned.
%macro sad_mean16x8_al 3
    %if %2 <> 0
        movd    xmm0, eax
        pshufd  xmm0, xmm0, 0           ; replicate eax into every dword
    %else
        pxor    xmm0, xmm0
    %endif

        ; same reference value for all eight rows
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm0
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm0
        movdqa  xmm6, xmm0
        movdqa  xmm7, xmm0

        sad8lines %3
        after_sad %1
%endmacro
407 :
;---------------------------------------------------------------------------
; dev16_sse2_al: cur and stride are 16-byte aligned.
;   In : esi = block pointer for the first pass (sum),
;        edi = block pointer for the second pass (deviation),
;        ebx = stride; ebx, esi, edi, ebp already pushed by dev16_sse2
;   Out: eax = sum over 16 rows at edi of |byte - mean|, where mean is
;        sum/256 of the 16 rows at esi
;---------------------------------------------------------------------------
align 16
dev16_sse2_al:
        load_stride ebx

        ; pass 1: byte sum of rows 0..7 and 8..15
        sad_mean16x8_al eax, 0, esi
        sad_mean16x8_al ebp, 0, esi

        make_mean                       ; eax = mean in every byte

        ; pass 2: absolute deviation from the mean
        sad_mean16x8_al ebp, 1, edi     ; rows 0..7
        sad_mean16x8_al eax, 1, edi     ; rows 8..15 (eax is read before it is overwritten)

        add     eax, ebp

        restore 1
        ret
426 :
427 :			;===========================================================================
428 :			; Branch does not require alignment
429 :			;===========================================================================
430 :
; sad_mean16x8_ul <ptr>, <dst32>
;   Sums the bytes of 8 rows x 16 bytes at <ptr> (psadbw against zero)
;   using unaligned loads, and copies the eight rows, packed 16 bytes
;   apart, into the static buffer.
;   In : ebx = stride, ecx = stride*3, edx = stride*4
;   Out: <dst32> = byte sum; <ptr> advanced by 8 rows;
;        buffer[0..127] = the 8 rows
;   Clobbers xmm0..xmm7.  Every call overwrites the same 128 bytes.
%macro sad_mean16x8_ul 2
        pxor    xmm7, xmm7              ; zero reference for rows 0..6

        ; rows 0..3
        movdqu  xmm0, [%1]
        movdqu  xmm1, [%1 + ebx]
        movdqu  xmm2, [%1 + ebx*2]
        movdqu  xmm3, [%1 + ecx]
        add     %1, edx

        movdqa  [buffer + 16*0], xmm0
        movdqa  [buffer + 16*1], xmm1
        movdqa  [buffer + 16*2], xmm2
        movdqa  [buffer + 16*3], xmm3

        ; rows 4..6
        movdqu  xmm4, [%1]
        movdqu  xmm5, [%1 + ebx]
        movdqu  xmm6, [%1 + ebx*2]
        movdqa  [buffer + 16*4], xmm4
        movdqa  [buffer + 16*5], xmm5
        movdqa  [buffer + 16*6], xmm6

        psadbw  xmm0, xmm7
        psadbw  xmm1, xmm7
        psadbw  xmm2, xmm7
        psadbw  xmm3, xmm7
        psadbw  xmm4, xmm7
        psadbw  xmm5, xmm7
        psadbw  xmm6, xmm7

        ; row 7 reuses xmm7, so its zero reference comes from memory
        movdqu  xmm7, [%1 + ecx]
        movdqa  [buffer + 16*7], xmm7
        psadbw  xmm7, [zero]

        add     %1, edx

        after_sad %2
%endmacro
470 :
;---------------------------------------------------------------------------
; dev16_sse2_ul: no alignment requirement on cur or stride.
;   In : esi = cur, ebx = stride
;        (ebx, esi, edi, ebp already pushed by dev16_sse2; the incoming
;        edi value is not used)
;   Out: eax = sum over the 16x16 block of |byte - mean|, mean = sum/256
;
; The previous second pass ran sad_mean16x8_al over the static buffer
; using the image stride.  The buffer only holds the last 8 rows, packed
; 16 bytes apart, so that pass read outside it and returned garbage.
; The second pass now re-reads the block itself with unaligned loads.
;---------------------------------------------------------------------------
align 16
dev16_sse2_ul:
        load_stride ebx

        mov     edi, esi                ; keep the block start for pass 2

        ; pass 1: byte sum of rows 0..7 and 8..15 (advances esi)
        sad_mean16x8_ul esi, eax
        sad_mean16x8_ul esi, ebp

        make_mean                       ; eax = mean in every byte

        ; pass 2: absolute deviation from the mean, one row per iteration
        movd    xmm7, eax
        pshufd  xmm7, xmm7, 0           ; xmm7 = 16 copies of the mean
        pxor    xmm6, xmm6              ; running per-half sums
        mov     ecx, 16                 ; rows left (stride*3 no longer needed)
.row:
        movdqu  xmm0, [edi]
        psadbw  xmm0, xmm7
        paddd   xmm6, xmm0
        add     edi, ebx
        dec     ecx
        jnz     .row

        pshufd  xmm5, xmm6, 11111110b   ; bring the upper half's sum down to dword 0
        paddd   xmm5, xmm6
        movd    eax, xmm5               ; total is at most 65280

        restore 1
        ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4