Annotation of /trunk/xvidcore/src/image/x86_asm/interpolate8x8_3dn.asm

Revision 1793 - (view) (download)

1 :	edgomez	1382	;/*****************************************************************************
2 :	Isibaar	262	; *
3 :	edgomez	1382	; * XVID MPEG-4 VIDEO CODEC
4 :			; * - 3dnow 8x8 block-based halfpel interpolation -
5 :	Isibaar	262	; *
6 :	edgomez	1382	; * Copyright(C) 2001 Peter Ross <pross@xvid.org>
7 :			; * 2002 Michael Militzer <isibaar@xvid.org>
8 :			; * 2002 Pascal Massimino <skal@planet-d.net>
9 :	Isibaar	262	; *
10 :	edgomez	1382	; * This program is free software ; you can redistribute it and/or modify
11 :			; * it under the terms of the GNU General Public License as published by
12 :			; * the Free Software Foundation ; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :	Isibaar	262	; *
15 :	edgomez	1382	; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :	Isibaar	262	; *
20 :	edgomez	1382	; * You should have received a copy of the GNU General Public License
21 :			; * along with this program ; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :	Isibaar	262	; *
24 :	edgomez	1382	; ****************************************************************************/
25 :	Isibaar	262
BITS 32

;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
;   PREFIX     : the exported symbol is "_<name>", and <name> is %define'd to
;                "_<name>" so that labels and references written with the
;                plain name resolve to the prefixed symbol.
;   MARK_FUNCS : the symbol is exported with the ELF ":function" type and a
;                size of (<sym>.endfunc - <sym>); ENDFUNC then expands to the
;                local label ".endfunc", which must be placed right after the
;                function body. Without MARK_FUNCS, ENDFUNC expands to nothing.
;
; Note: in the PREFIX+MARK_FUNCS case <name> must alias only the symbol name
; ("_<name>"). Aliasing it to the whole "sym:function size" text would make
; every later "<name>:" label un-assemblable. The size expression likewise
; has to reference the prefixed symbols, since those are the ones defined.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function _%1.endfunc-_%1
      %define %1 _%1
      %define ENDFUNC .endfunc
    %else
      global _%1
      %define %1 _%1
      %define ENDFUNC
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
      %define ENDFUNC .endfunc
    %else
      global %1
      %define ENDFUNC
    %endif
  %endif
%endmacro
49 :
50 :	edgomez	1382	;=============================================================================
51 :			; Read Only data
52 :			;=============================================================================
53 :	Isibaar	262
; COFF builds get a plain .rodata section; every other output format asks
; for a 16-byte aligned section explicitly.
%ifndef FORMAT_COFF
SECTION .rodata align=16
%else
SECTION .rodata
%endif

ALIGN 16
mmx_one:                        ; eight bytes of 0x01: per-byte lsb mask
  db 1, 1, 1, 1, 1, 1, 1, 1
63 :	Isibaar	262
64 :	edgomez	1382	;=============================================================================
65 :			; Code
66 :			;=============================================================================
67 :	Isibaar	262
SECTION .text

; 8x8 block entry points
cglobal interpolate8x8_halfpel_h_3dn
cglobal interpolate8x8_halfpel_v_3dn
cglobal interpolate8x8_halfpel_hv_3dn

; 8x4 block entry points
cglobal interpolate8x4_halfpel_h_3dn
cglobal interpolate8x4_halfpel_v_3dn
cglobal interpolate8x4_halfpel_hv_3dn
77 :
78 :	edgomez	1382	;-----------------------------------------------------------------------------
79 :	Isibaar	262	;
80 :			; void interpolate8x8_halfpel_h_3dn(uint8_t * const dst,
81 :	edgomez	1382	; const uint8_t * const src,
82 :			; const uint32_t stride,
83 :			; const uint32_t rounding);
84 :	Isibaar	262	;
85 :	edgomez	1382	;-----------------------------------------------------------------------------
86 :	Isibaar	262
; Horizontal half-pel, rounding == 0, two rows per expansion.
;   in : eax = src row, ecx = dst row, edx = stride
;   out: eax advanced by 2*stride; ecx is NOT advanced (caller does it)
;   uses mm0, mm1
%macro COPY_H_3DN_RND0 0
  movq    mm0, [eax]                ; row 0, bytes x .. x+7
  pavgusb mm0, [eax+1]              ; rounded-up average with bytes x+1 .. x+8
  movq    mm1, [eax+edx]            ; row 1
  pavgusb mm1, [eax+edx+1]
  lea     eax, [eax+2*edx]          ; src += 2*stride
  movq    [ecx], mm0                ; dst row 0
  movq    [ecx+edx], mm1            ; dst row 1
%endmacro
96 :
; Horizontal half-pel, rounding != 0, two rows per expansion.
; pavgusb rounds up, so the lsb of (a^b) is subtracted afterwards:
;   (a+b)/2 = (a+b+1)/2 - ((a^b)&1)
;   in : eax = src row, ecx = dst row, edx = stride, mm7 = 0x01 in every byte
;   out: eax advanced by 2*stride; ecx is NOT advanced (caller does it)
;   uses mm0-mm5
%macro COPY_H_3DN_RND1 0
  movq    mm0, [eax]                ; a0 = row 0, bytes x .. x+7
  movq    mm1, [eax+edx]            ; a1 = row 1, bytes x .. x+7
  movq    mm4, mm0                  ; keep a0 for the xor
  movq    mm5, mm1                  ; keep a1 for the xor
  movq    mm2, [eax+1]              ; b0 = row 0, bytes x+1 .. x+8
  movq    mm3, [eax+edx+1]          ; b1 = row 1, bytes x+1 .. x+8
  pavgusb mm0, mm2                  ; (a0+b0+1)/2
  pxor    mm2, mm4                  ; a0^b0
  pavgusb mm1, mm3                  ; (a1+b1+1)/2
  lea     eax, [eax+2*edx]          ; src += 2*stride
  pxor    mm3, mm5                  ; a1^b1
  pand    mm2, mm7                  ; keep only the lsb of each byte
  pand    mm3, mm7
  psubb   mm0, mm2                  ; undo the round-up where a0+b0 was odd
  movq    [ecx], mm0                ; dst row 0
  psubb   mm1, mm3
  movq    [ecx+edx], mm1            ; dst row 1
%endmacro
116 :
ALIGN 16
interpolate8x8_halfpel_h_3dn:
  ; Arguments are read from the stack; registers used afterwards:
  ;   ecx = dst, eax = src, edx = stride
  ; eax holds 'rounding' only long enough to set ZF; the following mov
  ; instructions leave the flags untouched, so jnz still sees that result.
  mov     eax, [esp+16]             ; rounding
  mov     ecx, [esp+ 4]             ; dst
  test    eax, eax
  mov     eax, [esp+ 8]             ; src
  mov     edx, [esp+12]             ; stride

  jnz     near .rounding1

  ; rounding == 0: 4 x 2 rows, dst stepped between the row pairs
%rep 3
  COPY_H_3DN_RND0
  lea     ecx, [ecx+2*edx]
%endrep
  COPY_H_3DN_RND0
  ret

.rounding1:
  ; rounding != 0: (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
  movq    mm7, [mmx_one]            ; per-byte lsb mask used by the macro
%rep 3
  COPY_H_3DN_RND1
  lea     ecx, [ecx+2*edx]
%endrep
  COPY_H_3DN_RND1
  ret
ENDFUNC
149 :	Isibaar	262
150 :
151 :	edgomez	1382	;-----------------------------------------------------------------------------
152 :	Isibaar	262	;
153 :			; void interpolate8x8_halfpel_v_3dn(uint8_t * const dst,
154 :	edgomez	1382	; const uint8_t * const src,
155 :			; const uint32_t stride,
156 :			; const uint32_t rounding);
157 :	Isibaar	262	;
158 :	edgomez	1382	;-----------------------------------------------------------------------------
159 :	Isibaar	262
; Vertical half-pel, rounding == 0, two rows per expansion.
;   in : eax = src row, ecx = dst row, edx = stride
;   out: eax advanced by 2*stride; ecx is NOT advanced (caller does it)
;   uses mm0, mm1
%macro COPY_V_3DN_RND0 0
  movq    mm0, [eax]                ; row 0
  movq    mm1, [eax+edx]            ; row 1
  pavgusb mm0, mm1                  ; rounded-up average of rows 0 and 1
  pavgusb mm1, [eax+2*edx]          ; rounded-up average of rows 1 and 2
  lea     eax, [eax+2*edx]          ; src += 2*stride
  movq    [ecx], mm0                ; dst row 0
  movq    [ecx+edx], mm1            ; dst row 1
%endmacro
169 :
; Vertical half-pel, rounding != 0, two rows per expansion.
;   in : mm2 = source row directly above the row eax points to,
;        eax = src row, ecx = dst row, edx = stride,
;        mm7 = 0x01 in every byte
;   out: eax advanced by 2*stride, mm2 = last row loaded (input for the next
;        expansion); ecx is NOT advanced (caller does it)
;   uses mm0, mm1, mm2, mm4, mm5
%macro COPY_V_3DN_RND1 0
  movq    mm0, mm2                  ; r0 = row carried over from before
  movq    mm1, [eax]                ; r1
  movq    mm2, [eax+edx]            ; r2 (carried into the next expansion)
  lea     eax, [eax+2*edx]          ; src += 2*stride
  movq    mm4, mm0
  movq    mm5, mm1
  pavgusb mm0, mm1                  ; (r0+r1+1)/2
  pxor    mm4, mm1                  ; r0^r1
  pavgusb mm1, mm2                  ; (r1+r2+1)/2
  pxor    mm5, mm2                  ; r1^r2
  pand    mm4, mm7                  ; lsb's of (i^j)...
  pand    mm5, mm7                  ; lsb's of (i^j)...
  psubb   mm0, mm4                  ; ...are subtracted from the pavgusb result
  movq    [ecx], mm0                ; dst row 0
  psubb   mm1, mm5                  ; ...are subtracted from the pavgusb result
  movq    [ecx+edx], mm1            ; dst row 1
%endmacro
188 :
ALIGN 16
interpolate8x8_halfpel_v_3dn:
  ; Arguments are read from the stack; registers used afterwards:
  ;   ecx = dst, eax = src, edx = stride
  ; ZF from 'test' survives the two mov instructions that follow it.
  mov     eax, [esp+16]             ; rounding
  mov     ecx, [esp+ 4]             ; dst
  test    eax, eax
  mov     eax, [esp+ 8]             ; src
  mov     edx, [esp+12]             ; stride

  ; two lines are processed per macro expansion

  jnz     near .rounding1

  ; rounding == 0
%rep 3
  COPY_V_3DN_RND0
  lea     ecx, [ecx+2*edx]
%endrep
  COPY_V_3DN_RND0
  ret

.rounding1:
  ; rounding != 0: (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
  movq    mm7, [mmx_one]            ; per-byte lsb mask
  movq    mm2, [eax]                ; first source row, carried in mm2
  add     eax, edx                  ; eax -> second source row

%rep 3
  COPY_V_3DN_RND1
  lea     ecx, [ecx+2*edx]
%endrep
  COPY_V_3DN_RND1
  ret
ENDFUNC
226 :	Isibaar	262
227 :
228 :	edgomez	1382	;-----------------------------------------------------------------------------
229 :	Isibaar	262	;
230 :			; void interpolate8x8_halfpel_hv_3dn(uint8_t * const dst,
231 :	edgomez	1382	; const uint8_t * const src,
232 :			; const uint32_t stride,
233 :			; const uint32_t rounding);
234 :	Isibaar	262	;
235 :			;
236 :	edgomez	1382	;-----------------------------------------------------------------------------
237 :	Isibaar	262
238 :			; The trick is to correct the result of 'pavgusb' with some combination of the
239 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgusb' (s and t).
240 :			; The boolean relations are:
241 :	edgomez	1382	; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
242 :	Isibaar	262	; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
243 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
244 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
245 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
246 :
247 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
248 :
; Diagonal (h+v) half-pel, rounding == 0, two output rows per expansion.
; Implements (i+j+k+l+2)/4 = (s+t+1)/2 - ((ij|kl)&st)&1.
;   in : eax = src row whose horizontal pair is already reduced into
;        mm2 = rounded-up average and mm3 = xor,
;        ecx = dst row, edx = stride, mm7 = 0x01 in every byte
;   out: eax advanced by 2*stride, ecx advanced by ONE stride,
;        mm2/mm3 = average/xor of the last row loaded (for the next expansion)
;   uses mm0-mm3, mm6
%macro COPY_HV_3DN_RND0 0
  lea     eax, [eax+edx]            ; next source row

  movq    mm0, [eax]
  movq    mm1, [eax+1]

  movq    mm6, mm0
  pavgusb mm0, mm1                  ; mm0 = (j+k+1)/2, preserved for next step
  lea     eax, [eax+edx]
  pxor    mm1, mm6                  ; mm1 = (j^k), preserved for next step

  por     mm3, mm1                  ; ij |= jk
  movq    mm6, mm2
  pxor    mm6, mm0                  ; mm6 = s^t
  pand    mm3, mm6                  ; (ij|jk) &= st
  pavgusb mm2, mm0                  ; mm2 = (s+t+1)/2
  pand    mm3, mm7                  ; mask lsb
  psubb   mm2, mm3                  ; apply the correction

  movq    [ecx], mm2                ; first output row

  movq    mm2, [eax]
  movq    mm3, [eax+1]
  movq    mm6, mm2
  pavgusb mm2, mm3                  ; preserved for next iteration
  lea     ecx, [ecx+edx]            ; dst += stride
  pxor    mm3, mm6                  ; preserved for next iteration

  por     mm1, mm3
  movq    mm6, mm0
  pxor    mm6, mm2
  pand    mm1, mm6
  pavgusb mm0, mm2

  pand    mm1, mm7
  psubb   mm0, mm1

  movq    [ecx], mm0                ; second output row
%endmacro
288 :
; Diagonal (h+v) half-pel, rounding != 0, two output rows per expansion.
; Implements (i+j+k+l+1)/4 = (s+t+1)/2 - ((ij&kl)|st)&1.
;   in : eax = src row whose horizontal pair is already reduced into
;        mm2 = rounded-up average and mm3 = xor,
;        ecx = dst row, edx = stride, mm7 = 0x01 in every byte
;   out: eax advanced by 2*stride, ecx advanced by ONE stride,
;        mm2/mm3 = average/xor of the last row loaded (for the next expansion)
;   uses mm0-mm3, mm6
%macro COPY_HV_3DN_RND1 0
  lea     eax, [eax+edx]            ; next source row

  movq    mm0, [eax]
  movq    mm1, [eax+1]

  movq    mm6, mm0
  pavgusb mm0, mm1                  ; mm0 = (j+k+1)/2, preserved for next step
  lea     eax, [eax+edx]
  pxor    mm1, mm6                  ; mm1 = (j^k), preserved for next step

  pand    mm3, mm1                  ; ij &= jk
  movq    mm6, mm2
  pxor    mm6, mm0                  ; mm6 = s^t
  por     mm3, mm6                  ; (ij&jk) |= st
  pavgusb mm2, mm0                  ; mm2 = (s+t+1)/2
  pand    mm3, mm7                  ; mask lsb
  psubb   mm2, mm3                  ; apply the correction

  movq    [ecx], mm2                ; first output row

  movq    mm2, [eax]
  movq    mm3, [eax+1]
  movq    mm6, mm2
  pavgusb mm2, mm3                  ; preserved for next iteration
  lea     ecx, [ecx+edx]            ; dst += stride
  pxor    mm3, mm6                  ; preserved for next iteration

  pand    mm1, mm3
  movq    mm6, mm0
  pxor    mm6, mm2
  por     mm1, mm6
  pavgusb mm0, mm2
  pand    mm1, mm7
  psubb   mm0, mm1

  movq    [ecx], mm0                ; second output row
%endmacro
327 :
ALIGN 16
interpolate8x8_halfpel_hv_3dn:
  ; Arguments are read from the stack; registers used afterwards:
  ;   ecx = dst, eax = src, edx = stride
  ; The jnz further down consumes the flags set by 'test' here; only mov and
  ; MMX/3DNow! data instructions sit in between.
  mov     eax, [esp+16]             ; rounding
  mov     ecx, [esp+ 4]             ; dst
  test    eax, eax
  mov     eax, [esp+ 8]             ; src
  mov     edx, [esp+12]             ; stride

  movq    mm7, [mmx_one]            ; per-byte lsb mask

  ; prime the carried state from the first row:
  ;   mm2 = (i+j+1)/2, mm3 = i^j
  movq    mm2, [eax]
  movq    mm3, [eax+1]
  movq    mm6, mm2
  pavgusb mm2, mm3
  pxor    mm3, mm6                  ; mm2/mm3 ready

  jnz     near .rounding1

  ; rounding == 0: each macro emits 2 rows and steps dst once itself,
  ; the add supplies the second step
%rep 3
  COPY_HV_3DN_RND0
  add     ecx, edx
%endrep
  COPY_HV_3DN_RND0
  ret

.rounding1:
  ; rounding != 0
%rep 3
  COPY_HV_3DN_RND1
  add     ecx, edx
%endrep
  COPY_HV_3DN_RND1
  ret
ENDFUNC
366 :	edgomez	1540
367 :	suxen_drol	1632	;-----------------------------------------------------------------------------
368 :			;
369 :			; void interpolate8x4_halfpel_h_3dn(uint8_t * const dst,
370 :			; const uint8_t * const src,
371 :			; const uint32_t stride,
372 :			; const uint32_t rounding);
373 :			;
374 :			;-----------------------------------------------------------------------------
375 :
ALIGN 16
interpolate8x4_halfpel_h_3dn:
  ; Arguments are read from the stack; registers used afterwards:
  ;   ecx = dst, eax = src, edx = stride
  ; ZF from 'test' survives the two mov instructions that follow it.
  mov     eax, [esp+16]             ; rounding
  mov     ecx, [esp+ 4]             ; dst
  test    eax, eax
  mov     eax, [esp+ 8]             ; src
  mov     edx, [esp+12]             ; stride

  jnz     near .rounding1

  ; rounding == 0: 2 x 2 rows
  COPY_H_3DN_RND0
  lea     ecx, [ecx+2*edx]          ; dst += 2*stride
  COPY_H_3DN_RND0
  ret

.rounding1:
  ; rounding != 0: (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
  movq    mm7, [mmx_one]            ; per-byte lsb mask used by the macro
  COPY_H_3DN_RND1
  lea     ecx, [ecx+2*edx]          ; dst += 2*stride
  COPY_H_3DN_RND1
  ret
ENDFUNC
400 :	suxen_drol	1632
401 :
402 :			;-----------------------------------------------------------------------------
403 :			;
404 :			; void interpolate8x4_halfpel_v_3dn(uint8_t * const dst,
405 :			; const uint8_t * const src,
406 :			; const uint32_t stride,
407 :			; const uint32_t rounding);
408 :			;
409 :			;-----------------------------------------------------------------------------
410 :
ALIGN 16
interpolate8x4_halfpel_v_3dn:
  ; Arguments are read from the stack; registers used afterwards:
  ;   ecx = dst, eax = src, edx = stride
  ; ZF from 'test' survives the two mov instructions that follow it.
  mov     eax, [esp+16]             ; rounding
  mov     ecx, [esp+ 4]             ; dst
  test    eax, eax
  mov     eax, [esp+ 8]             ; src
  mov     edx, [esp+12]             ; stride

  ; two lines are processed per macro expansion

  jnz     near .rounding1

  ; rounding == 0: 2 x 2 rows
  COPY_V_3DN_RND0
  lea     ecx, [ecx+2*edx]          ; dst += 2*stride
  COPY_V_3DN_RND0
  ret

.rounding1:
  ; rounding != 0: (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
  movq    mm7, [mmx_one]            ; per-byte lsb mask
  movq    mm2, [eax]                ; first source row, carried in mm2
  add     eax, edx                  ; eax -> second source row

  COPY_V_3DN_RND1
  lea     ecx, [ecx+2*edx]          ; dst += 2*stride
  COPY_V_3DN_RND1
  ret
ENDFUNC
440 :	suxen_drol	1632
441 :
442 :			;-----------------------------------------------------------------------------
443 :			;
444 :			; void interpolate8x4_halfpel_hv_3dn(uint8_t * const dst,
445 :			; const uint8_t * const src,
446 :			; const uint32_t stride,
447 :			; const uint32_t rounding);
448 :			;
449 :			;
450 :			;-----------------------------------------------------------------------------
451 :
452 :			; The trick is to correct the result of 'pavgusb' with some combination of the
453 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgusb' (s and t).
454 :			; The boolean relations are:
455 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
456 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
457 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
458 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
459 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
460 :
ALIGN 16
interpolate8x4_halfpel_hv_3dn:
  ; Arguments are read from the stack; registers used afterwards:
  ;   ecx = dst, eax = src, edx = stride
  ; The jnz further down consumes the flags set by 'test' here; only mov and
  ; MMX/3DNow! data instructions sit in between.
  mov     eax, [esp+16]             ; rounding
  mov     ecx, [esp+ 4]             ; dst
  test    eax, eax
  mov     eax, [esp+ 8]             ; src
  mov     edx, [esp+12]             ; stride

  movq    mm7, [mmx_one]            ; per-byte lsb mask

  ; prime the carried state from the first row:
  ;   mm2 = (i+j+1)/2, mm3 = i^j
  movq    mm2, [eax]
  movq    mm3, [eax+1]
  movq    mm6, mm2
  pavgusb mm2, mm3
  pxor    mm3, mm6                  ; mm2/mm3 ready

  jnz     near .rounding1

  ; rounding == 0: 2 x 2 rows; the macro steps dst once, the add once more
  COPY_HV_3DN_RND0
  add     ecx, edx
  COPY_HV_3DN_RND0
  ret

.rounding1:
  ; rounding != 0
  COPY_HV_3DN_RND1
  add     ecx, edx
  COPY_HV_3DN_RND1
  ret
ENDFUNC
491 :	suxen_drol	1632
492 :	Isibaar	1790
; Mark the object as not needing an executable stack on ELF targets.
; __OUTPUT_FORMAT__ carries the output format name, so a build that selects
; "elf32" must be matched as well as one that selects the "elf" alias;
; otherwise the note is silently dropped. Only one of the two can match in a
; given build.
%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif
496 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4