Annotation of /branches/dev-api-4/xvidcore/src/motion/x86_asm/sad_sse2.asm

Revision 605 - (view) (download)
Original Path: trunk/xvidcore/src/motion/x86_asm/sad_sse2.asm

1 :	chl	430	;/*****************************************************************************
2 :	Isibaar	262	; *
3 :	chl	430	; * XVID MPEG-4 VIDEO CODEC
4 :			; * sse2 sum of absolute difference
5 :	Isibaar	262	; *
6 :	chl	430	; * Copyright(C) 2002 Dmitry Rozhdestvensky
7 :	Isibaar	262	; *
8 :	chl	430	; * This program is an implementation of a part of one or more MPEG-4
9 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
10 :			; * to use this software module in hardware or software products are
11 :			; * advised that its use may infringe existing patents or copyrights, and
12 :			; * any such use would be at such party's own risk. The original
13 :			; * developer of this software module and his/her company, and subsequent
14 :			; * editors and their companies, will have no liability for use of this
15 :			; * software or modifications or derivatives thereof.
16 :	Isibaar	262	; *
17 :	chl	430	; * This program is free software; you can redistribute it and/or modify
18 :			; * it under the terms of the GNU General Public License as published by
19 :			; * the Free Software Foundation; either version 2 of the License, or
20 :			; * (at your option) any later version.
21 :	Isibaar	262	; *
22 :	chl	430	; * This program is distributed in the hope that it will be useful,
23 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 :			; * GNU General Public License for more details.
26 :	Isibaar	262	; *
27 :	chl	430	; * You should have received a copy of the GNU General Public License
28 :			; * along with this program; if not, write to the Free Software
29 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 :	Isibaar	262	; *
31 :	chl	431	; ****************************************************************************/
32 :	Isibaar	262
33 :			bits 32 ; 32-bit code: the functions below read their arguments from the stack through esp
34 :
35 :			%macro cglobal 1 ; cglobal name: export a symbol, with a leading underscore when PREFIX is defined
36 :			%ifdef PREFIX
37 :			global _%1 ; PREFIX defined: the exported symbol is _name ...
38 :			%define %1 _%1 ; ... and later uses of name expand to _name
39 :			%else
40 :			global %1 ; no PREFIX: export the name unchanged
41 :			%endif
42 :			%endmacro
43 :
44 :			%define sad_debug 0 ;1=unaligned 2=ref unaligned 3=aligned 0=autodetect
			; sad_debug: a non-zero value makes sad16_sse2 jump straight to its
			; _ul (1), _semial (2) or _al (3) branch without testing the pointers.
45 :			%define dev_debug 2 ;1=unaligned 2=aligned 0=autodetect
			; dev_debug: same selection for dev16_sse2; the value 2 set here forces dev16_sse2_al.
46 :			%define test_stride_alignment 0 ;test stride for alignment while autodetect
47 :			%define early_return 0 ;use early return in sad
			; early_return: 1 assembles the compares of the partial SAD against best_sad in the sad16 branches.
48 :
49 :			section .data
50 :
51 :			align 64
52 :			buffer times 4*8 dd 0 ;8 128-bit words
			; buffer: 32 dwords = 128 bytes; sad_mean16x8_ul stores eight 16-byte lines here with movdqa.
53 :			zero times 4 dd 0
			; zero: 16 zero bytes placed right after buffer; used as a psadbw memory operand by sad_mean16x8_ul.
54 :
55 :			section .text
56 :
57 :			cglobal sad16_sse2
58 :			cglobal dev16_sse2
59 :
60 :			;===========================================================================
61 :			; General macros for SSE2 code
62 :			;===========================================================================
63 :
64 :			%macro load_stride 1 ; %1 = register holding the stride; result: ecx = 3*stride, edx = 4*stride
65 :			mov ecx,%1
66 :			add ecx,ecx ; ecx = stride*2
67 :			mov edx,ecx ; edx = stride*2
68 :			add ecx,%1 ;stride*3
69 :			add edx,edx ;stride*4
70 :			%endmacro
71 :
72 :			%macro sad8lines 1
			; sad8lines ptr: psadbw xmm0..xmm7 against 8 consecutive lines read through %1.
			; Expects ebx = stride, ecx = 3*stride, edx = 4*stride (as produced by load_stride ebx).
			; %1 is advanced by 8 lines. Each xmm register must already hold the value to
			; compare against; it is replaced by the psadbw result for its line.
			; The lines are used directly as psadbw memory operands, which is why the
			; branches built on this macro require a 16-byte aligned pointer and stride.
73 :
74 :			psadbw xmm0,[%1] ; line 0
75 :			psadbw xmm1,[%1+ebx] ; line 1
76 :			psadbw xmm2,[%1+ebx*2] ; line 2
77 :			psadbw xmm3,[%1+ecx] ; line 3 (ecx = 3*stride)
78 :
79 :			add %1,edx ; skip 4 lines (edx = 4*stride)
80 :
81 :			psadbw xmm4,[%1] ; line 4
82 :			psadbw xmm5,[%1+ebx] ; line 5
83 :			psadbw xmm6,[%1+ebx*2] ; line 6
84 :			psadbw xmm7,[%1+ecx] ; line 7
85 :
86 :			add %1,edx ; leave %1 pointing at line 8
87 :			%endmacro
88 :
89 :			%macro after_sad 1 ; Summarizes 0th and 4th words of all xmm registers
			; %1 = general-purpose register that receives the total.
			; Input: xmm0..xmm7 hold psadbw results (one partial sum per 64-bit half).
			; Overwrites xmm0, xmm2, xmm4, xmm5 and xmm6 while reducing.
90 :
91 :			paddusw xmm0,xmm1
92 :			paddusw xmm2,xmm3
93 :			paddusw xmm4,xmm5
94 :			paddusw xmm6,xmm7
95 :
96 :			paddusw xmm0,xmm2
97 :			paddusw xmm4,xmm6
98 :
99 :			paddusw xmm4,xmm0 ; xmm4 = sum of all eight registers
100 :			pshufd xmm5,xmm4,11111110b ; dword 0 of xmm5 = dword 2 of xmm4 (the upper-half sum)
101 :			paddusw xmm5,xmm4 ; word 0 = lower-half sum + upper-half sum
102 :
103 :			pextrw %1,xmm5,0 ;less latency than movd
104 :			%endmacro
105 :
106 :			%macro restore 1 ;restores used registers
			; %1 = 1 when ebp was pushed last and has to be popped first; any other value skips it.
			; The pops mirror the push order ebx, esi, edi (, ebp) used by the entry points.
107 :
108 :			%if %1=1
109 :			pop ebp
110 :			%endif
111 :			pop edi
112 :			pop esi
113 :			pop ebx
114 :			%endmacro
115 :
116 :			;===========================================================================
117 :			;
118 :			; uint32_t sad16_sse2 (const uint8_t * const cur,
119 :			; const uint8_t * const ref,
120 :			; const uint32_t stride,
121 :			; const uint32_t best_sad);
122 :			;
123 :			;
124 :			;===========================================================================
125 :
126 :			align 16
127 :			sad16_sse2 ; entry point: loads the arguments and dispatches to one of three branches
128 :			push ebx
129 :			push esi
130 :			push edi ; three registers saved -> first argument is at esp + 3*4 + 4
			; The branches pop these registers again through the restore macro.
131 :
132 :			mov ebx,[esp + 3*4 + 12] ;stride
133 :
134 :			%if sad_debug<>0
135 :			mov edi,[esp + 3*4 + 4] ; forced branch: edi = cur
136 :			mov esi,[esp + 3*4 + 8] ; forced branch: esi = ref
137 :			%endif
138 :
139 :			%if sad_debug=1
140 :			jmp sad16_sse2_ul
141 :			%endif
142 :			%if sad_debug=2
143 :			jmp sad16_sse2_semial
144 :			%endif
145 :			%if sad_debug=3
146 :			jmp sad16_sse2_al
147 :			%endif
148 :
149 :			%if test_stride_alignment<>0
150 :			test ebx,15
151 :			jnz sad16_sse2_ul
			; NOTE(review): the other conditional jumps to far labels in this function
			; are written "jnz near"; confirm this one assembles with the NASM
			; version/optimisation level in use when test_stride_alignment is enabled.
152 :			%endif
153 :			mov edi,[esp + 3*4 + 4] ;cur (most likely aligned)
154 :
			; Autodetect: arrange the pointers so that edi is the aligned one whenever
			; at least one of cur/ref is 16-byte aligned. The branches read esi with
			; explicit loads and use edi as the psadbw memory operand. The cmov
			; instructions do not change the flags, so the jnz below still acts on
			; the result of "test edi,15".
155 :			test edi,15
156 :			cmovz esi,[esp + 3*4 + 8] ;load esi if edi is aligned
157 :			cmovnz esi,edi ;move to esi and load edi
158 :			cmovnz edi,[esp + 3*4 + 8] ;if not
159 :			jnz esi_unaligned
160 :
			; cur is aligned (edi = cur, esi = ref): the alignment of ref decides.
161 :			test esi,15
162 :			jnz near sad16_sse2_semial
163 :			jmp sad16_sse2_al
164 :
			; cur is unaligned (esi = cur, edi = ref): the alignment of ref decides.
165 :			esi_unaligned: test edi,15
166 :			jnz near sad16_sse2_ul
167 :			jmp sad16_sse2_semial
168 :
169 :			;===========================================================================
170 :			; Branch requires 16-byte alignment of esi and edi and stride
171 :			;===========================================================================
172 :
173 :			%macro sad16x8_al 1
			; sad16x8_al dst: SAD over 8 lines of 16 bytes between the blocks at esi and edi.
			; %1 = general-purpose register receiving the sum.
			; Loads 8 lines from esi with movdqa, then psadbw's them against the
			; lines at edi (sad8lines) and reduces with after_sad.
			; Expects ebx/ecx/edx = stride, 3*stride, 4*stride. Advances esi and edi
			; by 8 lines and overwrites xmm0-xmm7.
174 :
175 :			movdqa xmm0,[esi]
176 :			movdqa xmm1,[esi+ebx]
177 :			movdqa xmm2,[esi+ebx*2]
178 :			movdqa xmm3,[esi+ecx]
179 :
180 :			add esi,edx ; next group of 4 lines
181 :
182 :			movdqa xmm4,[esi]
183 :			movdqa xmm5,[esi+ebx]
184 :			movdqa xmm6,[esi+ebx*2]
185 :			movdqa xmm7,[esi+ecx]
186 :
187 :			add esi,edx
188 :
189 :			sad8lines edi
190 :
191 :			after_sad %1
192 :
193 :			%endmacro
194 :
195 :			align 16
196 :			sad16_sse2_al ; 16x16 SAD, both pointers and the stride 16-byte aligned
			; In: esi, edi = the two blocks, ebx = stride; ebx/esi/edi saved on the stack.
			; Out: eax = SAD (or the SAD of the first 8 lines on early return).
197 :
198 :			load_stride ebx
199 :
200 :			sad16x8_al eax ; lines 0-7
201 :
202 :			%if early_return=1
203 :			cmp eax,[esp + 3*4 + 16] ;best_sad
			; best_sad is a uint32_t, so the comparison has to be unsigned.
204 :			ja continue_al
205 :			%endif
206 :
207 :			sad16x8_al ebx ; lines 8-15; ebx (the stride) is only overwritten by the final pextrw
208 :
209 :			add eax,ebx
210 :
211 :			continue_al: restore 0
212 :
213 :			ret
214 :
215 :			;===========================================================================
216 :			; Branch requires 16-byte alignment of the edi and stride only
217 :			;===========================================================================
218 :
219 :			%macro sad16x8_semial 1
			; sad16x8_semial dst: same as sad16x8_al, but the 8 lines at esi are loaded
			; with movdqu. The lines at edi are still psadbw memory operands (sad8lines).
			; %1 = general-purpose register receiving the sum.
			; Expects ebx/ecx/edx = stride, 3*stride, 4*stride. Advances esi and edi
			; by 8 lines and overwrites xmm0-xmm7.
220 :
221 :			movdqu xmm0,[esi]
222 :			movdqu xmm1,[esi+ebx]
223 :			movdqu xmm2,[esi+ebx*2]
224 :			movdqu xmm3,[esi+ecx]
225 :
226 :			add esi,edx ; next group of 4 lines
227 :
228 :			movdqu xmm4,[esi]
229 :			movdqu xmm5,[esi+ebx]
230 :			movdqu xmm6,[esi+ebx*2]
231 :			movdqu xmm7,[esi+ecx]
232 :
233 :			add esi,edx
234 :
235 :			sad8lines edi
236 :
237 :			after_sad %1
238 :
239 :			%endmacro
240 :
241 :			align 16
242 :			sad16_sse2_semial ; 16x16 SAD, esi may be unaligned; edi and the stride are 16-byte aligned
			; In: esi, edi = the two blocks, ebx = stride; ebx/esi/edi saved on the stack.
			; Out: eax = SAD (or the SAD of the first 8 lines on early return).
243 :
244 :			load_stride ebx
245 :
246 :			sad16x8_semial eax ; lines 0-7
247 :
248 :			%if early_return=1
249 :			cmp eax,[esp + 3*4 + 16] ;best_sad
			; best_sad is a uint32_t, so the comparison has to be unsigned.
250 :			ja cont_semial
251 :			%endif
252 :
253 :			sad16x8_semial ebx ; lines 8-15; ebx (the stride) is only overwritten by the final pextrw
254 :
255 :			add eax,ebx
256 :
257 :			cont_semial: restore 0
258 :
259 :			ret
260 :
261 :
262 :			;===========================================================================
263 :			; Branch does not require alignment, even stride
264 :			;===========================================================================
265 :
266 :			%macro sad16x4_ul 1
			; sad16x4_ul dst: SAD over 4 lines of 16 bytes between the blocks at esi and edi,
			; with every line loaded through movdqu.
			; %1 = general-purpose register receiving the sum.
			; Expects ebx/ecx/edx = stride, 3*stride, 4*stride. Advances esi and edi
			; by 4 lines and overwrites xmm0-xmm7.
267 :
268 :			movdqu xmm0,[esi]
269 :			movdqu xmm1,[esi+ebx]
270 :			movdqu xmm2,[esi+ebx*2]
271 :			movdqu xmm3,[esi+ecx]
272 :
273 :			add esi,edx
274 :
275 :			movdqu xmm4,[edi]
276 :			movdqu xmm5,[edi+ebx]
277 :			movdqu xmm6,[edi+ebx*2]
278 :			movdqu xmm7,[edi+ecx]
279 :
280 :			add edi,edx
281 :
282 :			psadbw xmm4,xmm0
283 :			psadbw xmm5,xmm1
284 :			psadbw xmm6,xmm2
285 :			psadbw xmm7,xmm3
286 :
287 :			paddusw xmm4,xmm5
288 :			paddusw xmm6,xmm7
289 :
290 :			paddusw xmm4,xmm6 ; xmm4 = sum of the four line results
291 :			pshufd xmm7,xmm4,11111110b ; dword 0 of xmm7 = dword 2 of xmm4 (the upper-half sum)
292 :			paddusw xmm7,xmm4 ; word 0 = lower-half sum + upper-half sum
293 :
294 :			pextrw %1,xmm7,0
295 :			%endmacro
296 :
297 :
298 :			align 16
299 :			sad16_sse2_ul ; 16x16 SAD without any alignment requirement
			; In: esi, edi = the two blocks, ebx = stride; ebx/esi/edi saved on the stack.
			; Out: eax = SAD (or the partial SAD accumulated so far on early return).
300 :
301 :			load_stride ebx
302 :
303 :			push ebp ; ebp accumulates the partial sums; arguments are now at esp + 4*4 + 4
304 :
305 :			sad16x4_ul eax ; lines 0-3
306 :
307 :			%if early_return=1
308 :			cmp eax,[esp + 4*4 + 16] ;best_sad
			; best_sad is a uint32_t, so the comparisons have to be unsigned.
309 :			ja continue_ul
310 :			%endif
311 :
312 :			sad16x4_ul ebp ; lines 4-7
313 :			add eax,ebp
314 :
315 :			%if early_return=1
316 :			cmp eax,[esp + 4*4 + 16] ;best_sad
317 :			ja continue_ul
318 :			%endif
319 :
320 :			sad16x4_ul ebp ; lines 8-11
321 :			add eax,ebp
322 :
323 :			%if early_return=1
324 :			cmp eax,[esp + 4*4 + 16] ;best_sad
325 :			ja continue_ul
326 :			%endif
327 :
328 :			sad16x4_ul ebp ; lines 12-15
329 :			add eax,ebp
330 :
331 :			continue_ul: restore 1
332 :
333 :			ret
334 :
335 :			;===========================================================================
336 :			;
337 :			; uint32_t dev16_sse2(const uint8_t * const cur,
338 :			; const uint32_t stride);
339 :			;
340 :			; experimental!
341 :			;
342 :			;===========================================================================
343 :
344 :			align 16
345 :			dev16_sse2 ; entry point: loads the arguments and selects the aligned or unaligned branch
346 :
347 :			push ebx
348 :			push esi
349 :			push edi
350 :			push ebp ; four registers saved -> first argument is at esp + 4*4 + 4
351 :
352 :			mov esi, [esp + 4*4 + 4] ; cur
353 :			mov ebx, [esp + 4*4 + 8] ; stride
354 :			mov edi, buffer ; edi = static scratch buffer; replaced by cur before entering the aligned branch
355 :
356 :			%if dev_debug=1
357 :			jmp dev16_sse2_ul
358 :			%endif
359 :
360 :			%if dev_debug=2
			; The aligned branch re-reads the block through edi in its second pass,
			; so edi has to point at cur here exactly as on the autodetect path below.
			mov edi,esi
361 :			jmp dev16_sse2_al
362 :			%endif
363 :
364 :			test esi,15
365 :			jnz near dev16_sse2_ul
366 :
367 :			%if test_stride_alignment<>0
368 :			test ebx,15
369 :			jnz near dev16_sse2_ul
370 :			%endif
371 :
372 :			mov edi,esi ; aligned branch: second pass reads cur again through edi
373 :			jmp dev16_sse2_al
374 :
375 :			;===========================================================================
376 :			; Branch requires alignment of both the cur and stride
377 :			;===========================================================================
378 :
379 :			%macro make_mean 0
			; In: eax and ebp = the two partial byte sums. ebp is overwritten.
			; Out: eax = bits 15..8 of (eax + ebp) copied into all four bytes,
			; provided the sum fits in 16 bits (the callers pass two pextrw results).
380 :			add eax,ebp ;mean 16-bit
381 :			mov al,ah ;eax= {0 0 mean/256 mean/256}
382 :			mov ebp,eax
383 :			shl ebp,16 ; ebp = the two low bytes moved into the two high bytes
384 :			or eax,ebp ; all four bytes of eax now hold the same value
385 :			%endmacro
386 :
387 :			%macro sad_mean16x8_al 3 ;destination,0=zero,1=mean from eax,source
			; %1 = general-purpose register receiving the sum.
			; %2 = 0: compare against zero, i.e. sum the bytes of the 8 lines;
			;      otherwise: compare against eax broadcast to every dword of the register.
			; %3 = pointer register; read through sad8lines (psadbw memory operands)
			;      and advanced by 8 lines.
			; Expects ebx/ecx/edx = stride, 3*stride, 4*stride. Overwrites xmm0-xmm7.
388 :
389 :			%if %2=0
390 :			pxor xmm0,xmm0
391 :			%else
392 :			movd xmm0,eax
393 :			pshufd xmm0,xmm0,0 ; copy dword 0 into all four dwords
394 :			%endif
395 :			movdqa xmm1,xmm0
396 :			movdqa xmm2,xmm0
397 :			movdqa xmm3,xmm0
398 :			movdqa xmm4,xmm0
399 :			movdqa xmm5,xmm0
400 :			movdqa xmm6,xmm0
401 :			movdqa xmm7,xmm0
402 :
403 :			sad8lines %3
404 :
405 :			after_sad %1
406 :
407 :			%endmacro
408 :
409 :			align 16
410 :			dev16_sse2_al ; two passes over 16 lines: byte sum first, then SAD against the mean byte
			; In: esi = source of the first pass, edi = source of the second pass,
			;     ebx = stride; ebx/esi/edi/ebp saved on the stack.
			; edi has to point at the same block as esi for the result to be the
			; deviation of that block.
			; Out: eax = sum of the two second-pass results.
411 :
412 :			load_stride ebx
413 :
414 :			sad_mean16x8_al eax,0,esi ; byte sum of lines 0-7
415 :			sad_mean16x8_al ebp,0,esi ; byte sum of lines 8-15
416 :
417 :			make_mean ; eax = (sum >> 8) in every byte
418 :
419 :			sad_mean16x8_al ebp,1,edi ; lines 0-7 against the mean; eax is still needed by the next call
420 :			sad_mean16x8_al eax,1,edi ; lines 8-15 against the mean; eax is overwritten only at the end
421 :
422 :			add eax,ebp
423 :
424 :			restore 1
425 :
426 :			ret
427 :
428 :			;===========================================================================
429 :			; Branch does not require alignment
430 :			;===========================================================================
431 :
432 :			%macro sad_mean16x8_ul 2
			; sad_mean16x8_ul src,dst: sum the bytes of 8 lines of 16 bytes read with
			; movdqu through %1, and copy those lines into the static buffer.
			; %1 = pointer register, advanced by 8 lines.
			; %2 = general-purpose register receiving the byte sum.
			; Expects ebx/ecx/edx = stride, 3*stride, 4*stride. Overwrites xmm0-xmm7.
			; Every invocation writes the same slots buffer+16*0 .. buffer+16*7, so
			; only the 8 lines of the most recent invocation remain in the buffer.
			; NOTE(review): buffer is a single static object in .data; presumably
			; this makes the macro unsafe for concurrent callers - confirm.
433 :
434 :			pxor xmm7,xmm7 ; xmm7 = 0, the psadbw comparand for lines 0-6
435 :
436 :			movdqu xmm0,[%1]
437 :			movdqu xmm1,[%1+ebx]
438 :			movdqu xmm2,[%1+ebx*2]
439 :			movdqu xmm3,[%1+ecx]
440 :
441 :			add %1,edx
442 :
443 :			movdqa [buffer+16*0],xmm0
444 :			movdqa [buffer+16*1],xmm1
445 :			movdqa [buffer+16*2],xmm2
446 :			movdqa [buffer+16*3],xmm3
447 :
448 :			movdqu xmm4,[%1]
449 :			movdqu xmm5,[%1+ebx]
450 :			movdqu xmm6,[%1+ebx*2]
451 :			movdqa [buffer+16*4],xmm4
452 :			movdqa [buffer+16*5],xmm5
453 :			movdqa [buffer+16*6],xmm6
454 :
455 :			psadbw xmm0,xmm7
456 :			psadbw xmm1,xmm7
457 :			psadbw xmm2,xmm7
458 :			psadbw xmm3,xmm7
459 :			psadbw xmm4,xmm7
460 :			psadbw xmm5,xmm7
461 :			psadbw xmm6,xmm7
462 :
			; xmm7 is no longer needed as zero: reuse it for the 8th line and take
			; the zero comparand from memory instead.
463 :			movdqu xmm7,[%1+ecx]
464 :			movdqa [buffer+16*7],xmm7
465 :			psadbw xmm7,[zero]
466 :
467 :			add %1,edx
468 :
469 :			after_sad %2
470 :			%endmacro
471 :
			; dev16x8_ul dst,sel,src: unaligned counterpart of sad_mean16x8_al.
			; %1 = general-purpose register receiving the sum.
			; %2 = 0: compare against zero (byte sum of the 8 lines);
			;      otherwise: compare against eax broadcast to every dword.
			; %3 = pointer register, read with movdqu and advanced by 8 lines.
			; Expects ebx/ecx/edx = stride, 3*stride, 4*stride. Overwrites xmm0-xmm7.
			; Nothing is written to memory.
			%macro dev16x8_ul 3

			%if %2=0
			pxor xmm7,xmm7
			%else
			movd xmm7,eax
			pshufd xmm7,xmm7,0
			%endif

			movdqu xmm0,[%3]
			movdqu xmm1,[%3+ebx]
			movdqu xmm2,[%3+ebx*2]
			movdqu xmm3,[%3+ecx]

			add %3,edx

			movdqu xmm4,[%3]
			movdqu xmm5,[%3+ebx]
			movdqu xmm6,[%3+ebx*2]

			psadbw xmm0,xmm7
			psadbw xmm1,xmm7
			psadbw xmm2,xmm7
			psadbw xmm3,xmm7
			psadbw xmm4,xmm7
			psadbw xmm5,xmm7
			psadbw xmm6,xmm7

			paddusw xmm0,xmm1 ; lines 0+1; xmm1 is free again
			movdqu xmm1,[%3+ecx] ; 8th line goes through xmm1 because xmm7 holds the comparand
			psadbw xmm1,xmm7

			add %3,edx

			paddusw xmm2,xmm3
			paddusw xmm4,xmm5
			paddusw xmm6,xmm1

			paddusw xmm0,xmm2
			paddusw xmm4,xmm6

			paddusw xmm4,xmm0 ; xmm4 = sum of all eight line results
			pshufd xmm5,xmm4,11111110b ; dword 0 of xmm5 = dword 2 of xmm4 (the upper-half sum)
			paddusw xmm5,xmm4 ; word 0 = lower-half sum + upper-half sum

			pextrw %1,xmm5,0
			%endmacro

472 :			align 16
473 :			dev16_sse2_ul ; two passes over 16 lines without any alignment requirement
			; In: esi = cur, ebx = stride; ebx/esi/edi/ebp saved on the stack.
			; Out: eax = sum over the block of |byte - mean byte|.
			; The previous version took the second pass from the static buffer, which
			; holds only the last 8 lines at a 16-byte pitch, while stepping through
			; it with the image stride. Both passes now read the image itself.
474 :
475 :			load_stride ebx
			mov edi,esi ; keep the start of the block for the second pass
476 :
477 :			dev16x8_ul eax,0,esi ; byte sum of lines 0-7
478 :			dev16x8_ul ebp,0,esi ; byte sum of lines 8-15
479 :
480 :			make_mean ; eax = (sum >> 8) in every byte
481 :
482 :			dev16x8_ul ebp,1,edi ; lines 0-7 against the mean; eax is still needed by the next call
483 :			dev16x8_ul eax,1,edi ; lines 8-15 against the mean; eax is overwritten only at the end
484 :
485 :			add eax,ebp
486 :
487 :			restore 1
488 :
489 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4