Annotation of /trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm

Revision 1632 - (view) (download)

1 :	edgomez	1382	;/*****************************************************************************
2 :	Isibaar	262	; *
3 :	edgomez	1382	; * XVID MPEG-4 VIDEO CODEC
4 :			; * - mmx 8x8 block-based halfpel interpolation -
5 :	Isibaar	262	; *
6 :	edgomez	1382	; * Copyright(C) 2002 Michael Militzer <isibaar@xvid.org>
7 :			; * 2002 Pascal Massimino <skal@planet-d.net>
8 :	Isibaar	262	; *
9 :	edgomez	1382	; * This program is free software ; you can redistribute it and/or modify
10 :			; * it under the terms of the GNU General Public License as published by
11 :			; * the Free Software Foundation ; either version 2 of the License, or
12 :			; * (at your option) any later version.
13 :	Isibaar	262	; *
14 :	edgomez	1382	; * This program is distributed in the hope that it will be useful,
15 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
16 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 :			; * GNU General Public License for more details.
18 :	Isibaar	262	; *
19 :	edgomez	1382	; * You should have received a copy of the GNU General Public License
20 :			; * along with this program ; if not, write to the Free Software
21 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 :	Isibaar	262	; *
23 :	edgomez	1382	; ****************************************************************************/
24 :	Isibaar	262
25 :	edgomez	1382	BITS 32
26 :	edgomez	851
27 :	edgomez	1382	%macro cglobal 1
			; cglobal <name>: declare <name> as a global (exported) symbol.
			;  * PREFIX defined    : the exported symbol is _<name>, and <name> is
			;                        %define'd so later uses of <name> in this file
			;                        expand to the underscored form.
			;  * MARK_FUNCS defined: the global directive additionally carries
			;                        ":function <name>.endfunc-<name>", an expression
			;                        built from the function label and its .endfunc
			;                        local label; every exported function must
			;                        therefore end with a .endfunc label.
28 :	Isibaar	262	%ifdef PREFIX
29 :	edgomez	1535	%ifdef MARK_FUNCS
30 :	edgomez	1540	global _%1:function %1.endfunc-%1
31 :			%define %1 _%1:function %1.endfunc-%1
32 :	edgomez	1535	%else
33 :			global _%1
34 :			%define %1 _%1
35 :			%endif
36 :	Isibaar	262	%else
37 :	edgomez	1535	%ifdef MARK_FUNCS
38 :	edgomez	1540	global %1:function %1.endfunc-%1
39 :	edgomez	1535	%else
40 :			global %1
41 :			%endif
42 :	Isibaar	262	%endif
43 :			%endmacro
44 :
45 :	edgomez	1382	;=============================================================================
46 :			; Read only data
47 :			;=============================================================================
48 :	Isibaar	262
49 :	edgomez	1382	%ifdef FORMAT_COFF
			; When FORMAT_COFF is defined the section is opened without the
			; align=16 qualifier; otherwise 16-byte section alignment is requested.
50 :	edgomez	1519	SECTION .rodata
51 :	edgomez	1382	%else
52 :	edgomez	1519	SECTION .rodata align=16
53 :	edgomez	1382	%endif
54 :	Isibaar	262
55 :	edgomez	1382	ALIGN 16
			; mmx_one: eight bytes, each 0x01. Used as a pand mask to keep only the
			; least significant bit of every byte lane in the rounding corrections.
56 :			mmx_one:
57 :			times 8 db 1
58 :	Isibaar	262
59 :	edgomez	1382	SECTION .text
60 :	Isibaar	262
			; Exported entry points. All of them are declared through cglobal.
			; 8x8 halfpel interpolation (h / v / hv):
61 :			cglobal interpolate8x8_halfpel_h_xmm
62 :			cglobal interpolate8x8_halfpel_v_xmm
63 :			cglobal interpolate8x8_halfpel_hv_xmm
64 :
			; 8x4 variants (same macros, half as many rows):
65 :	suxen_drol	1632	cglobal interpolate8x4_halfpel_h_xmm
66 :			cglobal interpolate8x4_halfpel_v_xmm
67 :			cglobal interpolate8x4_halfpel_hv_xmm
68 :
			; 8x8 interpolation averaged into the existing destination ("add"):
69 :	edgomez	1530	cglobal interpolate8x8_halfpel_add_xmm
70 :			cglobal interpolate8x8_halfpel_h_add_xmm
71 :			cglobal interpolate8x8_halfpel_v_add_xmm
72 :			cglobal interpolate8x8_halfpel_hv_add_xmm
73 :
74 :	Isibaar	262	;===========================================================================
75 :			;
76 :			; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
77 :			; const uint8_t * const src,
78 :			; const uint32_t stride,
79 :			; const uint32_t rounding);
80 :			;
81 :			;===========================================================================
82 :
83 :			%macro COPY_H_SSE_RND0 0
			; Horizontal halfpel, two rows, pavgb rounding (no correction).
			; In : eax = source row, ecx = destination row, edx = stride.
			; Out: [ecx]     = pavgb([eax],     [eax+1])
			;      [ecx+edx] = pavgb([eax+edx], [eax+edx+1])
			;      eax is advanced by 2*edx; ecx is NOT advanced (callers do it).
			; Clobbers: mm0, mm1.
84 :			movq mm0, [eax]
85 :			pavgb mm0, [eax+1]
86 :			movq mm1, [eax+edx]
87 :			pavgb mm1, [eax+edx+1]
88 :			lea eax,[eax+2*edx]
89 :			movq [ecx],mm0
90 :			movq [ecx+edx],mm1
91 :			%endmacro
92 :
93 :			%macro COPY_H_SSE_RND1 0
			; Horizontal halfpel, two rows, with the pavgb result corrected by
			; subtracting the lsb of (a^b) from each byte.
			; In : eax = source row, ecx = destination row, edx = stride,
			;      mm7 = lsb mask (callers load it from mmx_one).
			; Out: two rows stored at [ecx] and [ecx+edx]; eax advanced by 2*edx;
			;      ecx is NOT advanced.
			; Clobbers: mm0-mm5.
94 :			movq mm0, [eax]
95 :			movq mm1, [eax+edx]
96 :			movq mm4, mm0 ; keep unaveraged copies for the xor below
97 :			movq mm5, mm1
98 :	edgomez	1382	movq mm2, [eax+1]
99 :	Isibaar	262	movq mm3, [eax+edx+1]
100 :			pavgb mm0, mm2
101 :			pxor mm2, mm4 ; mm2 = a^b (row 0)
102 :			pavgb mm1, mm3
103 :	edgomez	1382	lea eax, [eax+2*edx]
104 :	Isibaar	262	pxor mm3, mm5 ; mm3 = a^b (row 1)
105 :			pand mm2, mm7 ; keep only the lsb of each byte
106 :			pand mm3, mm7
107 :			psubb mm0, mm2 ; pavgb result minus (a^b)&1
108 :			movq [ecx], mm0
109 :			psubb mm1, mm3
110 :	edgomez	1382	movq [ecx+edx], mm1
111 :	Isibaar	262	%endmacro
112 :
113 :	edgomez	1382	ALIGN 16
114 :	Isibaar	262	interpolate8x8_halfpel_h_xmm:
			; Horizontal halfpel interpolation of an 8x8 block.
			; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride,
			;                  [esp+16] rounding.
			; rounding == 0 selects the plain pavgb path, any non-zero value the
			; lsb-corrected path. Four macro expansions of two rows each = 8 rows.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here
			; (NOTE(review): presumably left to the caller - confirm).
115 :
116 :	edgomez	1382	mov eax, [esp+16] ; rounding
117 :			mov ecx, [esp+ 4] ; Dst
118 :	Isibaar	262	test eax,eax ; flags for the jnz below; the movs in between leave them intact
119 :	edgomez	1382	mov eax, [esp+ 8] ; Src
120 :			mov edx, [esp+12] ; stride
121 :	Isibaar	262
122 :			jnz near .rounding1
123 :
124 :			COPY_H_SSE_RND0
125 :			lea ecx,[ecx+2*edx]
126 :			COPY_H_SSE_RND0
127 :			lea ecx,[ecx+2*edx]
128 :			COPY_H_SSE_RND0
129 :			lea ecx,[ecx+2*edx]
130 :			COPY_H_SSE_RND0
131 :			ret
132 :
133 :			.rounding1:
134 :	edgomez	1382	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
135 :	Isibaar	262	movq mm7, [mmx_one]
136 :			COPY_H_SSE_RND1
137 :			lea ecx, [ecx+2*edx]
138 :			COPY_H_SSE_RND1
139 :			lea ecx,[ecx+2*edx]
140 :			COPY_H_SSE_RND1
141 :			lea ecx,[ecx+2*edx]
142 :			COPY_H_SSE_RND1
143 :			ret
144 :	edgomez	1540	.endfunc:
145 :	Isibaar	262
146 :			;===========================================================================
147 :			;
148 :			; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
149 :	edgomez	1382	; const uint8_t * const src,
150 :			; const uint32_t stride,
151 :			; const uint32_t rounding);
152 :	Isibaar	262	;
153 :			;===========================================================================
154 :
155 :			%macro COPY_V_SSE_RND0 0
			; Vertical halfpel, two rows, pavgb rounding (no correction).
			; In : eax = source row, ecx = destination row, edx = stride.
			; Out: [ecx]     = pavgb([eax],     [eax+edx])
			;      [ecx+edx] = pavgb([eax+edx], [eax+2*edx])
			;      eax advanced by 2*edx; ecx is NOT advanced.
			; Clobbers: mm0, mm1.
156 :	edgomez	1382	movq mm0, [eax]
157 :			movq mm1, [eax+edx]
158 :	Isibaar	262	pavgb mm0, mm1
159 :			pavgb mm1, [eax+2*edx]
160 :	edgomez	1382	lea eax, [eax+2*edx]
161 :			movq [ecx], mm0
162 :	Isibaar	262	movq [ecx+edx],mm1
163 :			%endmacro
164 :
165 :			%macro COPY_V_SSE_RND1 0
			; Vertical halfpel, two rows, with the pavgb result corrected by
			; subtracting the lsb of (i^j) from each byte.
			; In : mm2 = source row above the one at [eax] (carried between
			;      expansions), eax = next source row, ecx = destination row,
			;      edx = stride, mm7 = lsb mask.
			; Out: two rows stored at [ecx] and [ecx+edx]; eax advanced by 2*edx;
			;      mm2 = last source row loaded, ready for the next expansion.
			; Clobbers: mm0, mm1, mm4, mm5.
166 :			movq mm0, mm2 ; row carried over from the previous step
167 :			movq mm1, [eax]
168 :			movq mm2, [eax+edx]
169 :			lea eax,[eax+2*edx]
170 :			movq mm4, mm0
171 :			movq mm5, mm1
172 :			pavgb mm0, mm1
173 :	edgomez	1382	pxor mm4, mm1
174 :	Isibaar	262	pavgb mm1, mm2
175 :			pxor mm5, mm2
176 :	edgomez	1382	pand mm4, mm7 ; lsb's of (i^j)...
177 :			pand mm5, mm7 ; lsb's of (i^j)...
178 :			psubb mm0, mm4 ; ...are subtracted from result of pavgb
179 :	Isibaar	262	movq [ecx], mm0
180 :	edgomez	1382	psubb mm1, mm5 ; ...are subtracted from result of pavgb
181 :	Isibaar	262	movq [ecx+edx], mm1
182 :			%endmacro
183 :
184 :	edgomez	1382	ALIGN 16
185 :	Isibaar	262	interpolate8x8_halfpel_v_xmm:
			; Vertical halfpel interpolation of an 8x8 block.
			; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride,
			;                  [esp+16] rounding.
			; rounding == 0 selects the plain pavgb path, any non-zero value the
			; lsb-corrected path. Four macro expansions of two rows each = 8 rows.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here
			; (NOTE(review): presumably left to the caller - confirm).
186 :
187 :			mov eax, [esp+16]; rounding
188 :	edgomez	1382	mov ecx, [esp+ 4] ; Dst
189 :	Isibaar	262	test eax,eax ; flags for the jnz below; the movs in between leave them intact
190 :	edgomez	1382	mov eax, [esp+ 8] ; Src
191 :			mov edx, [esp+12] ; stride
192 :	Isibaar	262
193 :	edgomez	1382	; we process 2 lines at a time
194 :	Isibaar	262	jnz near .rounding1
195 :
196 :			COPY_V_SSE_RND0
197 :			lea ecx, [ecx+2*edx]
198 :			COPY_V_SSE_RND0
199 :			lea ecx, [ecx+2*edx]
200 :			COPY_V_SSE_RND0
201 :			lea ecx, [ecx+2*edx]
202 :			COPY_V_SSE_RND0
203 :			ret
204 :
205 :			.rounding1:
206 :	edgomez	1382	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
207 :	Isibaar	262	movq mm7, [mmx_one]
208 :	edgomez	1382	movq mm2, [eax] ; first source row, carried in mm2 by COPY_V_SSE_RND1
209 :	Isibaar	262	add eax, edx
210 :
211 :			COPY_V_SSE_RND1
212 :			lea ecx,[ecx+2*edx]
213 :			COPY_V_SSE_RND1
214 :			lea ecx,[ecx+2*edx]
215 :			COPY_V_SSE_RND1
216 :			lea ecx,[ecx+2*edx]
217 :			COPY_V_SSE_RND1
218 :			ret
219 :	edgomez	1540	.endfunc:
220 :	Isibaar	262
221 :			;===========================================================================
222 :			;
223 :			; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
224 :	edgomez	1382	; const uint8_t * const src,
225 :			; const uint32_t stride,
226 :			; const uint32_t rounding);
227 :	Isibaar	262	;
228 :			;
229 :			;===========================================================================
230 :
231 :			; The trick is to correct the result of 'pavgb' with some combination of the
232 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
233 :			; The boolean relations are:
234 :	edgomez	1382	; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
235 :	Isibaar	262	; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
236 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
237 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
238 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
239 :
240 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
241 :
242 :			%macro COPY_HV_SSE_RND0 0
			; Diagonal (h+v) halfpel, two output rows, using the correction
			; term (ij|kl)&st on top of pavgb(s,t).
			; In : eax = source row r, ecx = destination row, edx = stride,
			;      mm2 = pavgb([r],[r+1]), mm3 = [r]^[r+1] (from the previous
			;      step or the function prologue), mm7 = lsb mask.
			; Out: rows stored at [ecx] and [ecx+edx] (relative to entry ecx);
			;      eax advanced by 2*edx, ecx advanced by edx only;
			;      mm2/mm3 hold the same quantities for the new row at eax.
			; Clobbers: mm0, mm1, mm6.
243 :	edgomez	1382	lea eax, [eax+edx]
244 :	Isibaar	262
245 :	edgomez	1382	movq mm0, [eax]
246 :			movq mm1, [eax+1]
247 :	Isibaar	262
248 :	edgomez	1382	movq mm6, mm0
249 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
250 :			lea eax, [eax+edx]
251 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
252 :	Isibaar	262
253 :	edgomez	1382	por mm3, mm1 ; ij |= jk
254 :			movq mm6, mm2
255 :			pxor mm6, mm0 ; mm6 = s^t
256 :			pand mm3, mm6 ; (ij|jk) &= st
257 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
258 :			pand mm3, mm7 ; mask lsb
259 :			psubb mm2, mm3 ; apply.
260 :	Isibaar	262
261 :	edgomez	1382	movq [ecx], mm2
262 :	Isibaar	262
263 :	edgomez	1382	movq mm2, [eax]
264 :			movq mm3, [eax+1]
265 :			movq mm6, mm2
266 :			pavgb mm2, mm3 ; preserved for next iteration
267 :			lea ecx,[ecx+edx]
268 :			pxor mm3, mm6 ; preserved for next iteration
269 :	Isibaar	262
			; second output row: same correction, with mm0/mm1 as the upper pair
270 :	edgomez	1382	por mm1, mm3
271 :			movq mm6, mm0
272 :			pxor mm6, mm2
273 :			pand mm1, mm6
274 :			pavgb mm0, mm2
275 :	Isibaar	262
276 :	edgomez	1382	pand mm1, mm7
277 :			psubb mm0, mm1
278 :	Isibaar	262
279 :	edgomez	1382	movq [ecx], mm0
280 :	Isibaar	262	%endmacro
281 :
282 :			%macro COPY_HV_SSE_RND1 0
			; Diagonal (h+v) halfpel, two output rows, using the correction
			; term (ij&kl)|st on top of pavgb(s,t).
			; In : eax = source row r, ecx = destination row, edx = stride,
			;      mm2 = pavgb([r],[r+1]), mm3 = [r]^[r+1] (from the previous
			;      step or the function prologue), mm7 = lsb mask.
			; Out: rows stored at [ecx] and [ecx+edx] (relative to entry ecx);
			;      eax advanced by 2*edx, ecx advanced by edx only;
			;      mm2/mm3 hold the same quantities for the new row at eax.
			; Clobbers: mm0, mm1, mm6.
283 :	edgomez	1382	lea eax, [eax+edx]
284 :	Isibaar	262
285 :	edgomez	1382	movq mm0, [eax]
286 :			movq mm1, [eax+1]
287 :	Isibaar	262
288 :	edgomez	1382	movq mm6, mm0
289 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
290 :			lea eax, [eax+edx]
291 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
292 :	Isibaar	262
293 :	edgomez	1382	pand mm3, mm1 ; ij &= jk
294 :			movq mm6, mm2
295 :			pxor mm6, mm0 ; mm6 = s^t
296 :			por mm3, mm6 ; (ij&jk) |= st
297 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
298 :			pand mm3, mm7 ; mask lsb
299 :			psubb mm2, mm3 ; apply.
300 :	Isibaar	262
301 :	edgomez	1382	movq [ecx], mm2
302 :	Isibaar	262
303 :	edgomez	1382	movq mm2, [eax]
304 :			movq mm3, [eax+1]
305 :			movq mm6, mm2
306 :			pavgb mm2, mm3 ; preserved for next iteration
307 :			lea ecx,[ecx+edx]
308 :			pxor mm3, mm6 ; preserved for next iteration
309 :	Isibaar	262
			; second output row: same correction, with mm0/mm1 as the upper pair
310 :	edgomez	1382	pand mm1, mm3
311 :			movq mm6, mm0
312 :			pxor mm6, mm2
313 :			por mm1, mm6
314 :			pavgb mm0, mm2
315 :			pand mm1, mm7
316 :			psubb mm0, mm1
317 :	Isibaar	262
318 :	edgomez	1382	movq [ecx], mm0
319 :	Isibaar	262	%endmacro
320 :
321 :	edgomez	1382	ALIGN 16
322 :	Isibaar	262	interpolate8x8_halfpel_hv_xmm:
			; Diagonal (h+v) halfpel interpolation of an 8x8 block.
			; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride,
			;                  [esp+16] rounding.
			; rounding == 0 uses COPY_HV_SSE_RND0, non-zero COPY_HV_SSE_RND1.
			; Each macro expansion emits two rows and advances ecx by one stride,
			; so one extra "add ecx, edx" follows every expansion but the last.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here
			; (NOTE(review): presumably left to the caller - confirm).
323 :	edgomez	1382	mov eax, [esp+16] ; rounding
324 :			mov ecx, [esp+ 4] ; Dst
325 :			test eax, eax ; flags consumed by the jnz after the prologue
326 :			mov eax, [esp+ 8] ; Src
327 :			mov edx, [esp+12] ; stride
328 :	Isibaar	262
329 :			movq mm7, [mmx_one]
330 :
331 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
332 :			movq mm2, [eax]
333 :			movq mm3, [eax+1]
334 :			movq mm6, mm2
335 :			pavgb mm2, mm3
336 :	edgomez	1382	pxor mm3, mm6 ; mm2/mm3 ready
337 :	Isibaar	262
338 :			jnz near .rounding1
339 :
340 :			COPY_HV_SSE_RND0
341 :			add ecx, edx
342 :			COPY_HV_SSE_RND0
343 :			add ecx, edx
344 :			COPY_HV_SSE_RND0
345 :			add ecx, edx
346 :			COPY_HV_SSE_RND0
347 :			ret
348 :
349 :			.rounding1:
350 :			COPY_HV_SSE_RND1
351 :			add ecx, edx
352 :			COPY_HV_SSE_RND1
353 :			add ecx, edx
354 :			COPY_HV_SSE_RND1
355 :			add ecx, edx
356 :			COPY_HV_SSE_RND1
357 :	edgomez	1382	ret
358 :	edgomez	1540	.endfunc:
359 :	edgomez	1530
360 :			;===========================================================================
361 :			;
362 :	suxen_drol	1632	; void interpolate8x4_halfpel_h_xmm(uint8_t * const dst,
363 :			; const uint8_t * const src,
364 :			; const uint32_t stride,
365 :			; const uint32_t rounding);
366 :			;
367 :			;===========================================================================
368 :
369 :			ALIGN 16
370 :			interpolate8x4_halfpel_h_xmm:
			; Horizontal halfpel interpolation of an 8x4 block.
			; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride,
			;                  [esp+16] rounding.
			; rounding == 0 selects the plain pavgb path, any non-zero value the
			; lsb-corrected path. Two macro expansions of two rows each = 4 rows.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here
			; (NOTE(review): presumably left to the caller - confirm).
371 :
372 :			mov eax, [esp+16] ; rounding
373 :			mov ecx, [esp+ 4] ; Dst
374 :			test eax,eax ; flags for the jnz below; the movs in between leave them intact
375 :			mov eax, [esp+ 8] ; Src
376 :			mov edx, [esp+12] ; stride
377 :
378 :			jnz near .rounding1
379 :
380 :			COPY_H_SSE_RND0
381 :			lea ecx,[ecx+2*edx]
382 :			COPY_H_SSE_RND0
383 :			ret
384 :
385 :			.rounding1:
386 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
387 :			movq mm7, [mmx_one]
388 :			COPY_H_SSE_RND1
389 :			lea ecx, [ecx+2*edx]
390 :			COPY_H_SSE_RND1
391 :			ret
392 :			.endfunc:
393 :
394 :			;===========================================================================
395 :			;
396 :			; void interpolate8x4_halfpel_v_xmm(uint8_t * const dst,
397 :			; const uint8_t * const src,
398 :			; const uint32_t stride,
399 :			; const uint32_t rounding);
400 :			;
401 :			;===========================================================================
402 :
403 :			ALIGN 16
404 :			interpolate8x4_halfpel_v_xmm:
			; Vertical halfpel interpolation of an 8x4 block.
			; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride,
			;                  [esp+16] rounding.
			; rounding == 0 selects the plain pavgb path, any non-zero value the
			; lsb-corrected path. Two macro expansions of two rows each = 4 rows.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here
			; (NOTE(review): presumably left to the caller - confirm).
405 :
406 :			mov eax, [esp+16]; rounding
407 :			mov ecx, [esp+ 4] ; Dst
408 :			test eax,eax ; flags for the jnz below; the movs in between leave them intact
409 :			mov eax, [esp+ 8] ; Src
410 :			mov edx, [esp+12] ; stride
411 :
412 :			; we process 2 lines at a time
413 :			jnz near .rounding1
414 :
415 :			COPY_V_SSE_RND0
416 :			lea ecx, [ecx+2*edx]
417 :			COPY_V_SSE_RND0
418 :			ret
419 :
420 :			.rounding1:
421 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
422 :			movq mm7, [mmx_one]
423 :			movq mm2, [eax] ; first source row, carried in mm2 by COPY_V_SSE_RND1
424 :			add eax, edx
425 :
426 :			COPY_V_SSE_RND1
427 :			lea ecx,[ecx+2*edx]
428 :			COPY_V_SSE_RND1
429 :			ret
430 :			.endfunc:
431 :
432 :			;===========================================================================
433 :			;
434 :			; void interpolate8x4_halfpel_hv_xmm(uint8_t * const dst,
435 :			; const uint8_t * const src,
436 :			; const uint32_t stride,
437 :			; const uint32_t rounding);
438 :			;
439 :			;
440 :			;===========================================================================
441 :
442 :			; The trick is to correct the result of 'pavgb' with some combination of the
443 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
444 :			; The boolean relations are:
445 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
446 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
447 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
448 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
449 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
450 :
451 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
452 :
453 :			ALIGN 16
454 :			interpolate8x4_halfpel_hv_xmm:
			; Diagonal (h+v) halfpel interpolation of an 8x4 block.
			; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride,
			;                  [esp+16] rounding.
			; rounding == 0 uses COPY_HV_SSE_RND0, non-zero COPY_HV_SSE_RND1.
			; Each macro expansion emits two rows and advances ecx by one stride,
			; hence the single "add ecx, edx" between the two expansions.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here
			; (NOTE(review): presumably left to the caller - confirm).
455 :			mov eax, [esp+16] ; rounding
456 :			mov ecx, [esp+ 4] ; Dst
457 :			test eax, eax ; flags consumed by the jnz after the prologue
458 :			mov eax, [esp+ 8] ; Src
459 :			mov edx, [esp+12] ; stride
460 :
461 :			movq mm7, [mmx_one]
462 :
463 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
464 :			movq mm2, [eax]
465 :			movq mm3, [eax+1]
466 :			movq mm6, mm2
467 :			pavgb mm2, mm3
468 :			pxor mm3, mm6 ; mm2/mm3 ready
469 :
470 :			jnz near .rounding1
471 :
472 :			COPY_HV_SSE_RND0
473 :			add ecx, edx
474 :			COPY_HV_SSE_RND0
475 :			ret
476 :
477 :			.rounding1:
478 :			COPY_HV_SSE_RND1
479 :			add ecx, edx
480 :			COPY_HV_SSE_RND1
481 :			ret
482 :			.endfunc:
483 :
484 :			;===========================================================================
485 :			;
486 :	edgomez	1530	; The next functions combine both source halfpel interpolation step and the
487 :			; averaging (with rounding) step to avoid wasting memory bandwidth computing
488 :			; intermediate halfpel images and then averaging them.
489 :			;
490 :			;===========================================================================
491 :
492 :			%macro PROLOG0 0
			; Load the three pointer/stride stack arguments:
			; ecx = dst, eax = src, edx = stride (bytes per row).
493 :			mov ecx, [esp+ 4] ; Dst
494 :			mov eax, [esp+ 8] ; Src
495 :			mov edx, [esp+12] ; BpS
496 :			%endmacro
497 :			%macro PROLOG1 0
			; PROLOG0 plus a test of bit 0 of the rounding argument; the resulting
			; ZF is meant to be consumed by a following jnz (taken when the bit is set).
498 :			PROLOG0
499 :			test dword [esp+16], 1; Rounding?
500 :			%endmacro
501 :			%macro EPILOG 0
			; Function exit: a plain near return (no stack cleanup, no emms).
502 :			ret
503 :			%endmacro
504 :
505 :			;===========================================================================
506 :			;
507 :			; void interpolate8x8_halfpel_add_xmm(uint8_t * const dst,
508 :			; const uint8_t * const src,
509 :			; const uint32_t stride,
510 :			; const uint32_t rounding);
511 :			;
512 :			;
513 :			;===========================================================================
514 :
515 :			%macro ADD_FF 2
			; Average two source rows into the destination in place:
			;   [ecx+%1] = pavgb([eax+%1], [ecx+%1])
			;   [ecx+%2] = pavgb([eax+%2], [ecx+%2])
			; %1 / %2 are row offsets (callers pass 0 and edx).
			; Clobbers: mm0, mm1. The ";;" lines are disabled code kept as-is.
516 :			movq mm0, [eax+%1]
517 :			movq mm1, [eax+%2]
518 :			;;---
519 :			;; movq mm2, mm0
520 :			;; movq mm3, mm1
521 :			;;---
522 :			pavgb mm0, [ecx+%1]
523 :			pavgb mm1, [ecx+%2]
524 :			;;--
525 :			;; por mm2, [ecx+%1]
526 :			;; por mm3, [ecx+%2]
527 :			;; pand mm2, [mmx_one]
528 :			;; pand mm3, [mmx_one]
529 :			;; psubsb mm0, mm2
530 :			;; psubsb mm1, mm3
531 :			;;--
532 :			movq [ecx+%1], mm0
533 :			movq [ecx+%2], mm1
534 :			%endmacro
535 :
536 :			ALIGN 16
537 :			interpolate8x8_halfpel_add_xmm: ; 23c
			; dst = pavgb(src, dst) over an 8x8 block (no halfpel shift).
			; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride,
			;                  [esp+16] rounding (not consulted by this routine).
			; PROLOG0 is used because no conditional branch follows, so the
			; rounding test done by PROLOG1 would be dead work.
			; Four ADD_FF expansions of two rows each = 8 rows.
			; Uses eax, ecx, edx, mm0, mm1; no emms is executed here
			; (NOTE(review): presumably left to the caller - confirm).
538 :			PROLOG0
539 :			ADD_FF 0, edx
540 :			lea eax,[eax+2*edx]
541 :			lea ecx,[ecx+2*edx]
542 :			ADD_FF 0, edx
543 :			lea eax,[eax+2*edx]
544 :			lea ecx,[ecx+2*edx]
545 :			ADD_FF 0, edx
546 :			lea eax,[eax+2*edx]
547 :			lea ecx,[ecx+2*edx]
548 :			ADD_FF 0, edx
549 :			EPILOG
550 :	edgomez	1540	.endfunc:
551 :	edgomez	1530
552 :			;===========================================================================
553 :			;
554 :			; void interpolate8x8_halfpel_h_add_xmm(uint8_t * const dst,
555 :			; const uint8_t * const src,
556 :			; const uint32_t stride,
557 :			; const uint32_t rounding);
558 :			;
559 :			;
560 :			;===========================================================================
561 :
562 :
563 :			%macro ADD_FH_RND0 2
			; Horizontal halfpel (plain pavgb) of two source rows, then averaged
			; into the destination in place:
			;   [ecx+%n] = pavgb(pavgb([eax+%n], [eax+%n+1]), [ecx+%n])
			; %1 / %2 are row offsets (callers pass 0 and edx).
			; Clobbers: mm0, mm1. Pointers are not advanced.
564 :			movq mm0, [eax+%1]
565 :			movq mm1, [eax+%2]
566 :			pavgb mm0, [eax+%1+1]
567 :			pavgb mm1, [eax+%2+1]
568 :			pavgb mm0, [ecx+%1]
569 :			pavgb mm1, [ecx+%2]
570 :			movq [ecx+%1],mm0
571 :			movq [ecx+%2],mm1
572 :			%endmacro
573 :
574 :			%macro ADD_FH_RND1 2
			; Horizontal halfpel of two source rows with the pavgb result
			; corrected by subtracting (a^b)&1, then averaged into the
			; destination in place with a final pavgb.
			; %1 / %2 are row offsets (callers pass 0 and edx).
			; The lsb mask is read directly from [mmx_one]; mm7 is not used.
			; Clobbers: mm0-mm5. Pointers are not advanced.
575 :			movq mm0, [eax+%1]
576 :			movq mm1, [eax+%2]
577 :			movq mm4, mm0 ; keep unaveraged copies for the xor below
578 :			movq mm5, mm1
579 :			movq mm2, [eax+%1+1]
580 :			movq mm3, [eax+%2+1]
581 :			pavgb mm0, mm2
582 :			; lea ??
583 :			pxor mm2, mm4 ; mm2 = a^b (row %1)
584 :			pavgb mm1, mm3
585 :			pxor mm3, mm5 ; mm3 = a^b (row %2)
586 :			pand mm2, [mmx_one]
587 :			pand mm3, [mmx_one]
588 :			psubb mm0, mm2 ; pavgb result minus (a^b)&1
589 :			psubb mm1, mm3
590 :			pavgb mm0, [ecx+%1] ; blend with existing destination
591 :			pavgb mm1, [ecx+%2]
592 :			movq [ecx+%1],mm0
593 :			movq [ecx+%2],mm1
594 :			%endmacro
595 :
596 :			ALIGN 16
597 :			interpolate8x8_halfpel_h_add_xmm: ; 32c
			; Horizontal halfpel interpolation of an 8x8 block, averaged into
			; the existing destination.
			; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride,
			;                  [esp+16] rounding (only bit 0 is tested).
			; Bit 0 clear -> ADD_FH_RND0 path, bit 0 set -> ADD_FH_RND1 path.
			; Four macro expansions of two rows each = 8 rows.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here
			; (NOTE(review): presumably left to the caller - confirm).
598 :			PROLOG1
599 :			jnz near .Loop1
600 :			ADD_FH_RND0 0, edx
601 :			lea eax,[eax+2*edx]
602 :			lea ecx,[ecx+2*edx]
603 :			ADD_FH_RND0 0, edx
604 :			lea eax,[eax+2*edx]
605 :			lea ecx,[ecx+2*edx]
606 :			ADD_FH_RND0 0, edx
607 :			lea eax,[eax+2*edx]
608 :			lea ecx,[ecx+2*edx]
609 :			ADD_FH_RND0 0, edx
610 :			EPILOG
611 :
612 :			.Loop1:
613 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
614 :			; movq mm7, [mmx_one]
615 :			ADD_FH_RND1 0, edx
616 :			lea eax,[eax+2*edx]
617 :			lea ecx,[ecx+2*edx]
618 :			ADD_FH_RND1 0, edx
619 :			lea eax,[eax+2*edx]
620 :			lea ecx,[ecx+2*edx]
621 :			ADD_FH_RND1 0, edx
622 :			lea eax,[eax+2*edx]
623 :			lea ecx,[ecx+2*edx]
624 :			ADD_FH_RND1 0, edx
625 :			EPILOG
626 :	edgomez	1540	.endfunc:
627 :	edgomez	1530
628 :
629 :			;===========================================================================
630 :			;
631 :			; void interpolate8x8_halfpel_v_add_xmm(uint8_t * const dst,
632 :			; const uint8_t * const src,
633 :			; const uint32_t stride,
634 :			; const uint32_t rounding);
635 :			;
636 :			;
637 :			;===========================================================================
638 :
639 :			%macro ADD_8_HF_RND0 0
			; Vertical halfpel (plain pavgb) of two rows, then averaged into the
			; destination in place:
			;   [ecx]     = pavgb(pavgb([eax],     [eax+edx]),   [ecx])
			;   [ecx+edx] = pavgb(pavgb([eax+edx], [eax+2*edx]), [ecx+edx])
			; eax is advanced by 2*edx; ecx is NOT advanced.
			; Clobbers: mm0, mm1.
640 :			movq mm0, [eax]
641 :			movq mm1, [eax+edx]
642 :			pavgb mm0, mm1
643 :			pavgb mm1, [eax+2*edx]
644 :			lea eax,[eax+2*edx]
645 :			pavgb mm0, [ecx]
646 :			pavgb mm1, [ecx+edx]
647 :			movq [ecx],mm0
648 :			movq [ecx+edx],mm1
649 :			%endmacro
650 :
651 :			%macro ADD_8_HF_RND1 0
			; Vertical halfpel of two rows with the pavgb result corrected by
			; subtracting (i^j)&1, then averaged into the destination in place.
			; In : mm0 = source row at [eax] (loaded by the caller; callers copy
			;      mm2 into mm0 between expansions), eax = that row's address,
			;      ecx = destination row, edx = stride, mm7 = lsb mask.
			; Out: rows stored at [ecx] and [ecx+edx]; eax advanced by 2*edx;
			;      mm2 = source row at the new eax. ecx is NOT advanced.
			; Clobbers: mm0, mm1, mm2, mm4, mm5.
652 :			movq mm1, [eax+edx]
653 :			movq mm2, [eax+2*edx]
654 :			lea eax,[eax+2*edx]
655 :			movq mm4, mm0
656 :			movq mm5, mm1
657 :			pavgb mm0, mm1
658 :			pxor mm4, mm1
659 :			pavgb mm1, mm2
660 :			pxor mm5, mm2
661 :			pand mm4, mm7 ; lsb's of (i^j)...
662 :			pand mm5, mm7 ; lsb's of (i^j)...
663 :			psubb mm0, mm4 ; ...are subtracted from result of pavgb
664 :			pavgb mm0, [ecx]
665 :			movq [ecx], mm0
666 :			psubb mm1, mm5 ; ...are subtracted from result of pavgb
667 :			pavgb mm1, [ecx+edx]
668 :			movq [ecx+edx], mm1
669 :			%endmacro
670 :
671 :			ALIGN 16
672 :			interpolate8x8_halfpel_v_add_xmm:
			; Vertical halfpel interpolation of an 8x8 block, averaged into the
			; existing destination.
			; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride,
			;                  [esp+16] rounding (only bit 0 is tested).
			; Bit 0 clear -> ADD_8_HF_RND0 path, bit 0 set -> ADD_8_HF_RND1 path.
			; Four macro expansions of two rows each = 8 rows.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here
			; (NOTE(review): presumably left to the caller - confirm).
673 :			PROLOG1
674 :
675 :			jnz near .Loop1
676 :			pxor mm7, mm7 ; this is a NOP (mm7 is not read by ADD_8_HF_RND0)
677 :
678 :			ADD_8_HF_RND0
679 :			lea ecx,[ecx+2*edx]
680 :			ADD_8_HF_RND0
681 :			lea ecx,[ecx+2*edx]
682 :			ADD_8_HF_RND0
683 :			lea ecx,[ecx+2*edx]
684 :			ADD_8_HF_RND0
685 :			EPILOG
686 :
687 :			.Loop1:
688 :			movq mm0, [eax] ; first source row; later rows are carried over from mm2
689 :			movq mm7, [mmx_one]
690 :
691 :			ADD_8_HF_RND1
692 :			movq mm0, mm2
693 :			lea ecx,[ecx+2*edx]
694 :			ADD_8_HF_RND1
695 :			movq mm0, mm2
696 :			lea ecx,[ecx+2*edx]
697 :			ADD_8_HF_RND1
698 :			movq mm0, mm2
699 :			lea ecx,[ecx+2*edx]
700 :			ADD_8_HF_RND1
701 :			EPILOG
702 :	edgomez	1540	.endfunc:
703 :	edgomez	1530
704 :			; The trick is to correct the result of 'pavgb' with some combination of the
705 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
706 :			; The boolean relations are:
707 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
708 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
709 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
710 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
711 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
712 :
713 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
714 :
715 :			;===========================================================================
716 :			;
717 :			; void interpolate8x8_halfpel_hv_add_xmm(uint8_t * const dst,
718 :			; const uint8_t * const src,
719 :			; const uint32_t stride,
720 :			; const uint32_t rounding);
721 :			;
722 :			;
723 :			;===========================================================================
724 :
725 :			%macro ADD_HH_RND0 0
			; Diagonal (h+v) halfpel, two output rows, correction term
			; (ij|kl)&st, each result then averaged into the destination
			; with pavgb.
			; In : eax = source row r, ecx = destination row, edx = stride,
			;      mm2 = pavgb([r],[r+1]), mm3 = [r]^[r+1], mm7 = lsb mask.
			; Out: rows stored at [ecx] and [ecx+edx] (relative to entry ecx);
			;      eax advanced by 2*edx, ecx advanced by edx only;
			;      mm2/mm3 hold the same quantities for the new row at eax.
			; Clobbers: mm0, mm1, mm6.
726 :			lea eax,[eax+edx]
727 :
728 :			movq mm0, [eax]
729 :			movq mm1, [eax+1]
730 :
731 :			movq mm6, mm0
732 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
733 :			lea eax,[eax+edx]
734 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
735 :
736 :			por mm3, mm1 ; ij |= jk
737 :			movq mm6, mm2
738 :			pxor mm6, mm0 ; mm6 = s^t
739 :			pand mm3, mm6 ; (ij|jk) &= st
740 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
741 :			pand mm3, mm7 ; mask lsb
742 :			psubb mm2, mm3 ; apply.
743 :
744 :			pavgb mm2, [ecx] ; blend with existing destination
745 :			movq [ecx], mm2
746 :
747 :			movq mm2, [eax]
748 :			movq mm3, [eax+1]
749 :			movq mm6, mm2
750 :			pavgb mm2, mm3 ; preserved for next iteration
751 :			lea ecx,[ecx+edx]
752 :			pxor mm3, mm6 ; preserved for next iteration
753 :
			; second output row: same correction, with mm0/mm1 as the upper pair
754 :			por mm1, mm3
755 :			movq mm6, mm0
756 :			pxor mm6, mm2
757 :			pand mm1, mm6
758 :			pavgb mm0, mm2
759 :
760 :			pand mm1, mm7
761 :			psubb mm0, mm1
762 :
763 :			pavgb mm0, [ecx] ; blend with existing destination
764 :			movq [ecx], mm0
765 :			%endmacro
766 :
767 :			%macro ADD_HH_RND1 0
			; Diagonal (h+v) halfpel, two output rows, correction term
			; (ij&kl)|st, each result then averaged into the destination
			; with pavgb.
			; In : eax = source row r, ecx = destination row, edx = stride,
			;      mm2 = pavgb([r],[r+1]), mm3 = [r]^[r+1], mm7 = lsb mask.
			; Out: rows stored at [ecx] and [ecx+edx] (relative to entry ecx);
			;      eax advanced by 2*edx, ecx advanced by edx only;
			;      mm2/mm3 hold the same quantities for the new row at eax.
			; Clobbers: mm0, mm1, mm6.
768 :			lea eax,[eax+edx]
769 :
770 :			movq mm0, [eax]
771 :			movq mm1, [eax+1]
772 :
773 :			movq mm6, mm0
774 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
775 :			lea eax,[eax+edx]
776 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
777 :
778 :			pand mm3, mm1 ; ij &= jk
779 :			movq mm6, mm2
780 :			pxor mm6, mm0 ; mm6 = s^t
781 :			por mm3, mm6 ; (ij&jk) |= st
782 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
783 :			pand mm3, mm7 ; mask lsb
784 :			psubb mm2, mm3 ; apply.
785 :
786 :			pavgb mm2, [ecx] ; blend with existing destination
787 :			movq [ecx], mm2
788 :
789 :			movq mm2, [eax]
790 :			movq mm3, [eax+1]
791 :			movq mm6, mm2
792 :			pavgb mm2, mm3 ; preserved for next iteration
793 :			lea ecx,[ecx+edx]
794 :			pxor mm3, mm6 ; preserved for next iteration
795 :
			; second output row: same correction, with mm0/mm1 as the upper pair
796 :			pand mm1, mm3
797 :			movq mm6, mm0
798 :			pxor mm6, mm2
799 :			por mm1, mm6
800 :			pavgb mm0, mm2
801 :			pand mm1, mm7
802 :			psubb mm0, mm1
803 :
804 :			pavgb mm0, [ecx] ; blend with existing destination
805 :			movq [ecx], mm0
806 :			%endmacro
807 :
808 :			ALIGN 16
809 :			interpolate8x8_halfpel_hv_add_xmm:
			; Diagonal (h+v) halfpel interpolation of an 8x8 block, averaged
			; into the existing destination.
			; Stack arguments: [esp+4] dst, [esp+8] src, [esp+12] stride,
			;                  [esp+16] rounding (only bit 0 is tested).
			; Bit 0 clear -> ADD_HH_RND0 path, bit 0 set -> ADD_HH_RND1 path.
			; Each macro expansion emits two rows and advances ecx by one stride,
			; so one extra "add ecx, edx" follows every expansion but the last.
			; Uses eax, ecx, edx and MMX registers; no emms is executed here
			; (NOTE(review): presumably left to the caller - confirm).
810 :			PROLOG1
811 :
812 :			movq mm7, [mmx_one]
813 :
814 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
815 :			movq mm2, [eax]
816 :			movq mm3, [eax+1]
817 :			movq mm6, mm2
818 :			pavgb mm2, mm3
819 :			pxor mm3, mm6 ; mm2/mm3 ready
820 :
821 :			jnz near .Loop1 ; consumes the flags set by PROLOG1
822 :
823 :			ADD_HH_RND0
824 :			add ecx, edx
825 :			ADD_HH_RND0
826 :			add ecx, edx
827 :			ADD_HH_RND0
828 :			add ecx, edx
829 :			ADD_HH_RND0
830 :			EPILOG
831 :
832 :			.Loop1:
833 :			ADD_HH_RND1
834 :			add ecx, edx
835 :			ADD_HH_RND1
836 :			add ecx, edx
837 :			ADD_HH_RND1
838 :			add ecx, edx
839 :			ADD_HH_RND1
840 :
841 :			EPILOG
842 :	edgomez	1540	.endfunc:
843 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4