Annotation of /trunk/xvidcore/src/motion/x86_asm/sad_sse2.asm

Revision 652 - (view) (download)

1 :	chl	430	;/*****************************************************************************
2 :	Isibaar	262	; *
3 :	chl	430	; * XVID MPEG-4 VIDEO CODEC
4 :			; * sse2 sum of absolute difference
5 :	Isibaar	262	; *
6 :	chl	430	; * Copyright(C) 2002 Dmitry Rozhdestvensky
7 :	Isibaar	262	; *
8 :	edgomez	652	; * This file is part of XviD, a free MPEG-4 video encoder/decoder
9 :	Isibaar	262	; *
10 :	edgomez	652	; * XviD is free software; you can redistribute it and/or modify it
11 :			; * under the terms of the GNU General Public License as published by
12 :	chl	430	; * the Free Software Foundation; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :	Isibaar	262	; *
15 :	chl	430	; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :	Isibaar	262	; *
20 :	chl	430	; * You should have received a copy of the GNU General Public License
21 :			; * along with this program; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :	Isibaar	262	; *
24 :	edgomez	652	; * Under section 8 of the GNU General Public License, the copyright
25 :			; * holders of XVID explicitly forbid distribution in the following
26 :			; * countries:
27 :			; *
28 :			; * - Japan
29 :			; * - United States of America
30 :			; *
31 :			; * Linking XviD statically or dynamically with other modules is making a
32 :			; * combined work based on XviD. Thus, the terms and conditions of the
33 :			; * GNU General Public License cover the whole combination.
34 :			; *
35 :			; * As a special exception, the copyright holders of XviD give you
36 :			; * permission to link XviD with independent modules that communicate with
37 :			; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
38 :			; * license terms of these independent modules, and to copy and distribute
39 :			; * the resulting combined work under terms of your choice, provided that
40 :			; * every copy of the combined work is accompanied by a complete copy of
41 :			; * the source code of XviD (the version of XviD used to produce the
42 :			; * combined work), being distributed under the terms of the GNU General
43 :			; * Public License plus this exception. An independent module is a module
44 :			; * which is not derived from or based on XviD.
45 :			; *
46 :			; * Note that people who make modified versions of XviD are not obligated
47 :			; * to grant this special exception for their modified versions; it is
48 :			; * their choice whether to do so. The GNU General Public License gives
49 :			; * permission to release a modified version without this exception; this
50 :			; * exception also makes it possible to release a modified version which
51 :			; * carries forward this exception.
52 :			; *
53 :			; * $Id: sad_sse2.asm,v 1.7 2002-11-17 00:32:06 edgomez Exp $
54 :			; *
55 :	chl	431	; ****************************************************************************/
56 :	Isibaar	262
57 :			bits 32
58 :
59 :			%macro cglobal 1 ; declare %1 as a global symbol, underscore-prefixed when PREFIX is defined
60 :			%ifdef PREFIX ; PREFIX defined: export the symbol as _%1
61 :			global _%1
62 :			%define %1 _%1 ; later uses of the plain name in this file expand to _%1
63 :			%else
64 :			global %1 ; no PREFIX: export the name unchanged
65 :			%endif
66 :			%endmacro
67 :
68 :			%define sad_debug 0 ;sad16 branch: 1=force unaligned 2=force ref unaligned 3=force aligned 0=autodetect
69 :			%define dev_debug 2 ;dev16 branch: 1=force unaligned 2=force aligned 0=autodetect
70 :			%define test_stride_alignment 0 ;1=also test stride for 16-byte alignment while autodetecting
71 :			%define early_return 0 ;1=sad16 returns the partial sum as soon as it exceeds best_sad
72 :
73 :			section .data
74 :
75 :			align 64
76 :			buffer times 4*8 dd 0 ;8 128-bit words (128 bytes); scratch rows stored by sad_mean16x8_ul
77 :			zero times 4 dd 0 ;one all-zero 128-bit word, used as a psadbw memory operand
78 :
79 :			section .text
80 :			; exported entry points (see the cglobal macro for the PREFIX handling)
81 :			cglobal sad16_sse2
82 :			cglobal dev16_sse2
83 :
84 :			;===========================================================================
85 :			; General macros for SSE2 code
86 :			;===========================================================================
87 :
88 :			%macro load_stride 1 ; in: %1 = stride; out: ecx = stride*3, edx = stride*4
89 :			mov ecx,%1
90 :			add ecx,ecx ;stride*2
91 :			mov edx,ecx
92 :			add ecx,%1 ;stride*3
93 :			add edx,edx ;stride*4
94 :			%endmacro
95 :
96 :			%macro sad8lines 1 ; %1 = row pointer register; it is advanced by 8 rows (2*edx)
97 :			; xmm0..xmm7 <- psadbw of xmm0..xmm7 against rows 0..7 at %1; expects ebx=stride, ecx=stride*3, edx=stride*4
98 :			psadbw xmm0,[%1] ;memory operand: every row address must be 16-byte aligned
99 :			psadbw xmm1,[%1+ebx]
100 :			psadbw xmm2,[%1+ebx*2]
101 :			psadbw xmm3,[%1+ecx]
102 :
103 :			add %1,edx ;next group of 4 rows
104 :
105 :			psadbw xmm4,[%1]
106 :			psadbw xmm5,[%1+ebx]
107 :			psadbw xmm6,[%1+ebx*2]
108 :			psadbw xmm7,[%1+ecx]
109 :
110 :			add %1,edx ;leave %1 pointing past the 8 processed rows
111 :			%endmacro
112 :
113 :			%macro after_sad 1 ; Sums the 0th and 4th words of all xmm registers into %1
114 :			; in: xmm0..xmm7 = psadbw results; clobbers xmm0, xmm2, xmm4, xmm5, xmm6
115 :			paddusw xmm0,xmm1
116 :			paddusw xmm2,xmm3
117 :			paddusw xmm4,xmm5
118 :			paddusw xmm6,xmm7
119 :
120 :			paddusw xmm0,xmm2
121 :			paddusw xmm4,xmm6
122 :
123 :			paddusw xmm4,xmm0 ;xmm4 = per-half totals of all eight registers
124 :			pshufd xmm5,xmm4,11111110b ;dword 0 of xmm5 = dword 2 of xmm4 (the upper-half total)
125 :			paddusw xmm5,xmm4 ;word 0 = lower-half total + upper-half total
126 :
127 :			pextrw %1,xmm5,0 ;less latency than movd
128 :			%endmacro
129 :
130 :			%macro restore 1 ;restores used registers; %1=1 when ebp was pushed too
131 :			; pops in the reverse order of the pushes made by the callers (ebx, esi, edi, then optionally ebp)
132 :			%if %1=1
133 :			pop ebp
134 :			%endif
135 :			pop edi
136 :			pop esi
137 :			pop ebx
138 :			%endmacro
139 :
140 :			;===========================================================================
141 :			;
142 :			; uint32_t sad16_sse2 (const uint8_t * const cur,
143 :			; const uint8_t * const ref,
144 :			; const uint32_t stride,
145 :			; const uint32_t best_sad);
146 :			;
147 :			;
148 :			;===========================================================================
149 :
150 :			align 16
151 :			sad16_sse2: ; entry: save registers, load arguments, dispatch on pointer alignment
152 :			push ebx
153 :			push esi
154 :			push edi
155 :			; stack arguments now start at esp + 3*4 + 4 (three pushes plus the return address)
156 :			mov ebx,[esp + 3*4 + 12] ;stride
157 :
158 :			%if (sad_debug<>0) || (test_stride_alignment<>0) ;both options jump away before autodetect loads the pointers
159 :			mov edi,[esp + 3*4 + 4] ;cur
160 :			mov esi,[esp + 3*4 + 8] ;ref
161 :			%endif
162 :
163 :			%if sad_debug=1
164 :			jmp sad16_sse2_ul
165 :			%endif
166 :			%if sad_debug=2
167 :			jmp sad16_sse2_semial
168 :			%endif
169 :			%if sad_debug=3
170 :			jmp sad16_sse2_al
171 :			%endif
172 :
173 :			%if test_stride_alignment<>0
174 :			test ebx,15
175 :			jnz near sad16_sse2_ul ;unaligned stride: edi/esi were loaded above
176 :			%endif
177 :			mov edi,[esp + 3*4 + 4] ;cur (most likely aligned)
178 :
179 :			test edi,15 ;ZF=1 when cur is 16-byte aligned; cmov below leaves the flags intact
180 :			cmovz esi,[esp + 3*4 + 8] ;load esi if edi is aligned
181 :			cmovnz esi,edi ;move to esi and load edi
182 :			cmovnz edi,[esp + 3*4 + 8] ;if not (roles swapped: esi=cur, edi=ref)
183 :			jnz esi_unaligned
184 :			; here edi=cur is aligned and esi=ref
185 :			test esi,15
186 :			jnz near sad16_sse2_semial
187 :			jmp sad16_sse2_al
188 :			; here esi=cur is unaligned and edi=ref
189 :			esi_unaligned: test edi,15
190 :			jnz near sad16_sse2_ul
191 :			jmp sad16_sse2_semial
192 :
193 :			;===========================================================================
194 :			; Branch requires 16-byte alignment of esi and edi and stride
195 :			;===========================================================================
196 :
197 :			%macro sad16x8_al 1 ; %1 = register receiving the SAD of 8 rows of 16 bytes
198 :			; loads 8 rows from esi with aligned moves, SADs them against 8 rows at edi; advances esi and edi by 8 rows
199 :			movdqa xmm0,[esi]
200 :			movdqa xmm1,[esi+ebx]
201 :			movdqa xmm2,[esi+ebx*2]
202 :			movdqa xmm3,[esi+ecx]
203 :
204 :			add esi,edx ;next group of 4 rows
205 :
206 :			movdqa xmm4,[esi]
207 :			movdqa xmm5,[esi+ebx]
208 :			movdqa xmm6,[esi+ebx*2]
209 :			movdqa xmm7,[esi+ecx]
210 :
211 :			add esi,edx
212 :
213 :			sad8lines edi
214 :
215 :			after_sad %1
216 :
217 :			%endmacro
218 :
219 :			align 16
220 :			sad16_sse2_al: ; in: esi, edi = 16-byte aligned block pointers, ebx = stride; out: eax = SAD
221 :
222 :			load_stride ebx
223 :			; rows 0..7
224 :			sad16x8_al eax
225 :
226 :			%if early_return=1
227 :			cmp eax,[esp + 3*4 + 16] ;best_sad
228 :			ja near continue_al ;unsigned compare: best_sad is uint32_t
229 :			%endif
230 :			; rows 8..15; ebx (stride) is only overwritten by the final extract of the macro
231 :			sad16x8_al ebx
232 :
233 :			add eax,ebx
234 :
235 :			continue_al: restore 0
236 :
237 :			ret
238 :
239 :			;===========================================================================
240 :			; Branch requires 16-byte alignment of edi and of the stride only
241 :			;===========================================================================
242 :
243 :			%macro sad16x8_semial 1 ; %1 = register receiving the SAD of 8 rows of 16 bytes
244 :			; like sad16x8_al, but the rows at esi are loaded with unaligned moves; edi rows are still psadbw memory operands
245 :			movdqu xmm0,[esi]
246 :			movdqu xmm1,[esi+ebx]
247 :			movdqu xmm2,[esi+ebx*2]
248 :			movdqu xmm3,[esi+ecx]
249 :
250 :			add esi,edx ;next group of 4 rows
251 :
252 :			movdqu xmm4,[esi]
253 :			movdqu xmm5,[esi+ebx]
254 :			movdqu xmm6,[esi+ebx*2]
255 :			movdqu xmm7,[esi+ecx]
256 :
257 :			add esi,edx
258 :
259 :			sad8lines edi
260 :
261 :			after_sad %1
262 :
263 :			%endmacro
264 :
265 :			align 16
266 :			sad16_sse2_semial: ; in: edi = 16-byte aligned pointer, esi = any pointer, ebx = stride; out: eax = SAD
267 :
268 :			load_stride ebx
269 :			; rows 0..7
270 :			sad16x8_semial eax
271 :
272 :			%if early_return=1
273 :			cmp eax,[esp + 3*4 + 16] ;best_sad
274 :			ja near cont_semial ;unsigned compare: best_sad is uint32_t
275 :			%endif
276 :			; rows 8..15; ebx (stride) is only overwritten by the final extract of the macro
277 :			sad16x8_semial ebx
278 :
279 :			add eax,ebx
280 :
281 :			cont_semial: restore 0
282 :
283 :			ret
284 :
285 :
286 :			;===========================================================================
287 :			; Branch does not require any alignment, not even of the stride
288 :			;===========================================================================
289 :
290 :			%macro sad16x4_ul 1 ; %1 = register receiving the SAD of 4 rows of 16 bytes
291 :			; loads 4 rows from esi and 4 rows from edi with unaligned moves; advances both pointers by 4 rows
292 :			movdqu xmm0,[esi]
293 :			movdqu xmm1,[esi+ebx]
294 :			movdqu xmm2,[esi+ebx*2]
295 :			movdqu xmm3,[esi+ecx]
296 :
297 :			add esi,edx
298 :
299 :			movdqu xmm4,[edi]
300 :			movdqu xmm5,[edi+ebx]
301 :			movdqu xmm6,[edi+ebx*2]
302 :			movdqu xmm7,[edi+ecx]
303 :
304 :			add edi,edx
305 :			; register-register psadbw, so no alignment requirement
306 :			psadbw xmm4,xmm0
307 :			psadbw xmm5,xmm1
308 :			psadbw xmm6,xmm2
309 :			psadbw xmm7,xmm3
310 :
311 :			paddusw xmm4,xmm5
312 :			paddusw xmm6,xmm7
313 :
314 :			paddusw xmm4,xmm6 ;xmm4 = per-half totals of the four rows
315 :			pshufd xmm7,xmm4,11111110b ;dword 0 of xmm7 = dword 2 of xmm4 (the upper-half total)
316 :			paddusw xmm7,xmm4 ;word 0 = lower-half total + upper-half total
317 :
318 :			pextrw %1,xmm7,0
319 :			%endmacro
320 :
321 :
322 :			align 16
323 :			sad16_sse2_ul: ; in: esi, edi = block pointers (any alignment), ebx = stride; out: eax = SAD
324 :
325 :			load_stride ebx
326 :
327 :			push ebp ;ebp accumulates the partial sums; stack arguments move to esp + 4*4 + ...
328 :
329 :			sad16x4_ul eax
330 :
331 :			%if early_return=1
332 :			cmp eax,[esp + 4*4 + 16] ;best_sad
333 :			ja near continue_ul ;unsigned compare: best_sad is uint32_t
334 :			%endif
335 :
336 :			sad16x4_ul ebp
337 :			add eax,ebp
338 :
339 :			%if early_return=1
340 :			cmp eax,[esp + 4*4 + 16] ;best_sad
341 :			ja near continue_ul
342 :			%endif
343 :
344 :			sad16x4_ul ebp
345 :			add eax,ebp
346 :
347 :			%if early_return=1
348 :			cmp eax,[esp + 4*4 + 16] ;best_sad
349 :			ja near continue_ul
350 :			%endif
351 :
352 :			sad16x4_ul ebp
353 :			add eax,ebp
354 :
355 :			continue_ul: restore 1
356 :
357 :			ret
358 :
359 :			;===========================================================================
360 :			;
361 :			; uint32_t dev16_sse2(const uint8_t * const cur,
362 :			; const uint32_t stride);
363 :			;
364 :			; experimental!
365 :			;
366 :			;===========================================================================
367 :
368 :			align 16
369 :			dev16_sse2
370 :			; entry: save registers, load arguments, dispatch on dev_debug or on pointer alignment
371 :			push ebx
372 :			push esi
373 :			push edi
374 :			push ebp
375 :			; stack arguments now start at esp + 4*4 + 4 (four pushes plus the return address)
376 :			mov esi, [esp + 4*4 + 4] ; cur
377 :			mov ebx, [esp + 4*4 + 8] ; stride
378 :			mov edi, buffer ; edi = scratch buffer in .data
379 :
380 :			%if dev_debug=1
381 :			jmp dev16_sse2_ul
382 :			%endif
383 :
384 :			%if dev_debug=2
385 :			jmp dev16_sse2_al ; NOTE(review): edi still holds buffer at this jump, not cur
386 :			%endif
387 :
388 :			test esi,15 ; autodetect: unaligned cur takes the unaligned branch
389 :			jnz near dev16_sse2_ul
390 :
391 :			%if test_stride_alignment=1
392 :			test ebx,15
393 :			jnz dev16_sse2_ul
394 :			%endif
395 :
396 :			mov edi,esi ; aligned case: edi keeps the block start for the second pass
397 :			jmp dev16_sse2_al
398 :
399 :			;===========================================================================
400 :			; Branch requires alignment of both the cur and stride
401 :			;===========================================================================
402 :
403 :			%macro make_mean 0 ; in: eax, ebp = partial pixel sums; out: eax = mean byte in all 4 bytes; clobbers ebp
404 :			add eax,ebp ;total sum, 16-bit (assumes it fits: 256 pixels * 255 max)
405 :			mov al,ah ;eax= {0 0 sum/256 sum/256}, i.e. the mean of 256 pixels in both low bytes
406 :			mov ebp,eax
407 :			shl ebp,16
408 :			or eax,ebp ;eax= {mean mean mean mean}
409 :			%endmacro
410 :
411 :			%macro sad_mean16x8_al 3 ;destination,0=zero,1=mean from eax,source
412 :			; %1 <- SAD of 8 rows at %3 against all-zero bytes (%2=0) or against the low dword of eax broadcast (%2=1)
413 :			%if %2=0
414 :			pxor xmm0,xmm0 ;SAD against zero = plain sum of the bytes
415 :			%else
416 :			movd xmm0,eax
417 :			pshufd xmm0,xmm0,0 ;replicate dword 0 into all four dwords
418 :			%endif
419 :			movdqa xmm1,xmm0
420 :			movdqa xmm2,xmm0
421 :			movdqa xmm3,xmm0
422 :			movdqa xmm4,xmm0
423 :			movdqa xmm5,xmm0
424 :			movdqa xmm6,xmm0
425 :			movdqa xmm7,xmm0
426 :			; %3 is advanced by 8 rows inside sad8lines
427 :			sad8lines %3
428 :
429 :			after_sad %1
430 :
431 :			%endmacro
432 :
433 :			align 16
434 :			dev16_sse2_al: ; in: esi = 16-byte aligned cur, ebx = stride; out: eax = sum of |pixel - mean|
435 :			mov edi,esi ;second pass must re-read the block from its start; the dev_debug=2 jump arrives with edi=buffer
436 :			load_stride ebx
437 :			; pass 1: sum of all 256 pixels (SAD against zero), 8 rows at a time; advances esi
438 :			sad_mean16x8_al eax,0,esi
439 :			sad_mean16x8_al ebp,0,esi
440 :
441 :			make_mean
442 :			; pass 2: SAD of the same 16 rows against the replicated mean; advances edi
443 :			sad_mean16x8_al ebp,1,edi
444 :			sad_mean16x8_al eax,1,edi
445 :
446 :			add eax,ebp
447 :
448 :			restore 1
449 :
450 :			ret
451 :
452 :			;===========================================================================
453 :			; Branch does not require alignment
454 :			;===========================================================================
455 :
456 :			%macro sad_mean16x8_ul 2 ; %1 = source pointer register (advanced by 8 rows), %2 = register receiving the byte sum
457 :			; sums 8 unaligned rows of 16 bytes (SAD against zero) and stores a copy of the rows at buffer+16*0..7
458 :			pxor xmm7,xmm7 ;zero operand for the first seven rows
459 :
460 :			movdqu xmm0,[%1]
461 :			movdqu xmm1,[%1+ebx]
462 :			movdqu xmm2,[%1+ebx*2]
463 :			movdqu xmm3,[%1+ecx]
464 :
465 :			add %1,edx
466 :
467 :			movdqa [buffer+16*0],xmm0
468 :			movdqa [buffer+16*1],xmm1
469 :			movdqa [buffer+16*2],xmm2
470 :			movdqa [buffer+16*3],xmm3
471 :
472 :			movdqu xmm4,[%1]
473 :			movdqu xmm5,[%1+ebx]
474 :			movdqu xmm6,[%1+ebx*2]
475 :			movdqa [buffer+16*4],xmm4
476 :			movdqa [buffer+16*5],xmm5
477 :			movdqa [buffer+16*6],xmm6
478 :
479 :			psadbw xmm0,xmm7
480 :			psadbw xmm1,xmm7
481 :			psadbw xmm2,xmm7
482 :			psadbw xmm3,xmm7
483 :			psadbw xmm4,xmm7
484 :			psadbw xmm5,xmm7
485 :			psadbw xmm6,xmm7
486 :
487 :			movdqu xmm7,[%1+ecx] ;eighth row reuses xmm7, so its zero operand comes from memory
488 :			movdqa [buffer+16*7],xmm7
489 :			psadbw xmm7,[zero]
490 :
491 :			add %1,edx
492 :
493 :			after_sad %2
494 :			%endmacro
495 :
496 :			align 16
497 :			dev16_sse2_ul
498 :			; in: esi = cur (any alignment), edi = buffer, ebx = stride; out: eax
499 :			load_stride ebx
500 :			; pass 1: pixel sums of rows 0..7 and 8..15; each call also stores its 8 rows at buffer+16*0..7
501 :			sad_mean16x8_ul esi,eax
502 :			sad_mean16x8_ul esi,ebp
503 :			; NOTE(review): the second call overwrites the rows stored by the first, so buffer holds rows 8..15 only
504 :			make_mean
505 :			; NOTE(review): pass 2 steps edi=buffer by the image stride, but the rows were stored 16 bytes apart - confirm
506 :			sad_mean16x8_al ebp,1,edi
507 :			sad_mean16x8_al eax,1,edi
508 :
509 :			add eax,ebp
510 :
511 :			restore 1
512 :
513 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4