Annotation of /trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm

Revision 1540 - (view) (download)

1 :	edgomez	1382	;/*****************************************************************************
2 :	Isibaar	262	; *
3 :	edgomez	1382	; * XVID MPEG-4 VIDEO CODEC
4 :			; * - mmx 8x8 block-based halfpel interpolation -
5 :	Isibaar	262	; *
6 :	edgomez	1382	; * Copyright(C) 2002 Michael Militzer <isibaar@xvid.org>
7 :			; * 2002 Pascal Massimino <skal@planet-d.net>
8 :	Isibaar	262	; *
9 :	edgomez	1382	; * This program is free software ; you can redistribute it and/or modify
10 :			; * it under the terms of the GNU General Public License as published by
11 :			; * the Free Software Foundation ; either version 2 of the License, or
12 :			; * (at your option) any later version.
13 :	Isibaar	262	; *
14 :	edgomez	1382	; * This program is distributed in the hope that it will be useful,
15 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
16 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 :			; * GNU General Public License for more details.
18 :	Isibaar	262	; *
19 :	edgomez	1382	; * You should have received a copy of the GNU General Public License
20 :			; * along with this program ; if not, write to the Free Software
21 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 :	Isibaar	262	; *
23 :	edgomez	1382	; ****************************************************************************/
24 :	Isibaar	262
25 :	edgomez	1382	BITS 32
26 :	edgomez	851
27 :	edgomez	1382	%macro cglobal 1 ; cglobal <name>: export <name>; PREFIX adds a leading underscore, MARK_FUNCS adds ELF type/size info
28 :	Isibaar	262	%ifdef PREFIX
29 :	edgomez	1535	%ifdef MARK_FUNCS
30 :	edgomez	1540	global _%1:function _%1.endfunc-_%1 ; size = distance from the prefixed entry label to its .endfunc local label
31 :			%define %1 _%1 ; alias the bare name only, so a later "<name>:" label line becomes "_<name>:"
32 :	edgomez	1535	%else
33 :			global _%1
34 :			%define %1 _%1
35 :			%endif
36 :	Isibaar	262	%else
37 :	edgomez	1535	%ifdef MARK_FUNCS
38 :	edgomez	1540	global %1:function %1.endfunc-%1 ; every exported function must therefore end with a .endfunc local label
39 :	edgomez	1535	%else
40 :			global %1
41 :			%endif
42 :	Isibaar	262	%endif
43 :			%endmacro
44 :
45 :	edgomez	1382	;=============================================================================
46 :			; Read only data
47 :			;=============================================================================
48 :	Isibaar	262
49 :	edgomez	1382	%ifdef FORMAT_COFF ; with FORMAT_COFF the section is declared without the align= qualifier
50 :	edgomez	1519	SECTION .rodata
51 :	edgomez	1382	%else
52 :	edgomez	1519	SECTION .rodata align=16
53 :	edgomez	1382	%endif
54 :	Isibaar	262
55 :	edgomez	1382	ALIGN 16
56 :			mmx_one: ; eight bytes of 0x01: per-byte LSB mask used by the rounding-correction code
57 :			times 8 db 1
58 :	Isibaar	262
59 :	edgomez	1382	SECTION .text
60 :	Isibaar	262
61 :			cglobal interpolate8x8_halfpel_h_xmm ; plain halfpel interpolation: dst is overwritten
62 :			cglobal interpolate8x8_halfpel_v_xmm
63 :			cglobal interpolate8x8_halfpel_hv_xmm
64 :
65 :	edgomez	1530	cglobal interpolate8x8_halfpel_add_xmm ; "_add" variants: the result is averaged into the existing dst
66 :			cglobal interpolate8x8_halfpel_h_add_xmm
67 :			cglobal interpolate8x8_halfpel_v_add_xmm
68 :			cglobal interpolate8x8_halfpel_hv_add_xmm
69 :
70 :	Isibaar	262	;===========================================================================
71 :			;
72 :			; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
73 :			; const uint8_t * const src,
74 :			; const uint32_t stride,
75 :			; const uint32_t rounding);
76 :			;
77 :			;===========================================================================
78 :
79 :			%macro COPY_H_SSE_RND0 0 ; two rows of horizontal halfpel (round-up average). In: eax=src, ecx=dst, edx=stride. Clobbers mm0-mm1.
80 :			movq mm0, [eax] ; row 0, bytes x..x+7
81 :			pavgb mm0, [eax+1] ; mm0 = (p[x] + p[x+1] + 1) >> 1 per byte
82 :			movq mm1, [eax+edx] ; row 1
83 :			pavgb mm1, [eax+edx+1]
84 :			lea eax,[eax+2*edx] ; src += 2 rows; ecx is left for the caller to advance
85 :			movq [ecx],mm0
86 :			movq [ecx+edx],mm1
87 :			%endmacro
88 :
89 :			%macro COPY_H_SSE_RND1 0 ; two rows of horizontal halfpel, rounded down. In: eax=src, ecx=dst, edx=stride, mm7=mmx_one. Clobbers mm0-mm5.
90 :			movq mm0, [eax] ; row 0, bytes x..x+7
91 :			movq mm1, [eax+edx] ; row 1, bytes x..x+7
92 :			movq mm4, mm0 ; keep copies of the left pixels for the xor below
93 :			movq mm5, mm1
94 :	edgomez	1382	movq mm2, [eax+1] ; row 0, bytes x+1..x+8
95 :	Isibaar	262	movq mm3, [eax+edx+1] ; row 1, bytes x+1..x+8
96 :			pavgb mm0, mm2 ; round-up average, corrected below
97 :			pxor mm2, mm4 ; mm2 = i^j for row 0
98 :			pavgb mm1, mm3
99 :	edgomez	1382	lea eax, [eax+2*edx] ; src += 2 rows; ecx is left for the caller to advance
100 :	Isibaar	262	pxor mm3, mm5 ; mm3 = i^j for row 1
101 :			pand mm2, mm7 ; keep only the LSB of each byte of i^j
102 :			pand mm3, mm7
103 :			psubb mm0, mm2 ; (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
104 :			movq [ecx], mm0
105 :			psubb mm1, mm3
106 :	edgomez	1382	movq [ecx+edx], mm1
107 :	Isibaar	262	%endmacro
108 :
109 :	edgomez	1382	ALIGN 16
110 :	Isibaar	262	interpolate8x8_halfpel_h_xmm: ; stack args: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding; uses eax, ecx, edx and MMX regs
111 :
112 :	edgomez	1382	mov eax, [esp+16] ; rounding
113 :			mov ecx, [esp+ 4] ; Dst
114 :	Isibaar	262	test eax,eax ; ZF = (rounding == 0); the mov instructions below leave the flags untouched
115 :	edgomez	1382	mov eax, [esp+ 8] ; Src
116 :			mov edx, [esp+12] ; stride
117 :	Isibaar	262
118 :			jnz near .rounding1 ; any non-zero rounding value selects the round-down path
119 :
120 :			COPY_H_SSE_RND0 ; rows 0-1 (the macro advances eax, not ecx)
121 :			lea ecx,[ecx+2*edx]
122 :			COPY_H_SSE_RND0 ; rows 2-3
123 :			lea ecx,[ecx+2*edx]
124 :			COPY_H_SSE_RND0 ; rows 4-5
125 :			lea ecx,[ecx+2*edx]
126 :			COPY_H_SSE_RND0 ; rows 6-7
127 :			ret
128 :
129 :			.rounding1:
130 :	edgomez	1382	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
131 :	Isibaar	262	movq mm7, [mmx_one] ; LSB mask expected by COPY_H_SSE_RND1
132 :			COPY_H_SSE_RND1 ; rows 0-1
133 :			lea ecx, [ecx+2*edx]
134 :			COPY_H_SSE_RND1 ; rows 2-3
135 :			lea ecx,[ecx+2*edx]
136 :			COPY_H_SSE_RND1 ; rows 4-5
137 :			lea ecx,[ecx+2*edx]
138 :			COPY_H_SSE_RND1 ; rows 6-7
139 :			ret
140 :	edgomez	1540	.endfunc: ; end marker referenced by cglobal when MARK_FUNCS is defined
141 :	Isibaar	262
142 :			;===========================================================================
143 :			;
144 :			; void interpolate8x8_halfpel_v_xmm(uint8_t * const dst,
145 :	edgomez	1382	; const uint8_t * const src,
146 :			; const uint32_t stride,
147 :			; const uint32_t rounding);
148 :	Isibaar	262	;
149 :			;===========================================================================
150 :
151 :			%macro COPY_V_SSE_RND0 0 ; two rows of vertical halfpel (round-up average). In: eax=src, ecx=dst, edx=stride. Clobbers mm0-mm1.
152 :	edgomez	1382	movq mm0, [eax] ; row n
153 :			movq mm1, [eax+edx] ; row n+1
154 :	Isibaar	262	pavgb mm0, mm1 ; out row 0 = avg(row n, row n+1)
155 :			pavgb mm1, [eax+2*edx] ; out row 1 = avg(row n+1, row n+2)
156 :	edgomez	1382	lea eax, [eax+2*edx] ; src += 2 rows; ecx is left for the caller to advance
157 :			movq [ecx], mm0
158 :	Isibaar	262	movq [ecx+edx],mm1
159 :			%endmacro
160 :
161 :			%macro COPY_V_SSE_RND1 0 ; two rows of vertical halfpel, rounded down. In: mm2=previous source row, eax=next source row, ecx=dst, edx=stride, mm7=mmx_one.
162 :			movq mm0, mm2 ; row n (carried over in mm2)
163 :			movq mm1, [eax] ; row n+1
164 :			movq mm2, [eax+edx] ; row n+2, also carried to the next expansion
165 :			lea eax,[eax+2*edx] ; src += 2 rows; ecx is left for the caller to advance
166 :			movq mm4, mm0
167 :			movq mm5, mm1
168 :			pavgb mm0, mm1 ; round-up average of rows n and n+1
169 :	edgomez	1382	pxor mm4, mm1
170 :	Isibaar	262	pavgb mm1, mm2 ; round-up average of rows n+1 and n+2
171 :			pxor mm5, mm2
172 :	edgomez	1382	pand mm4, mm7 ; lsb's of (i^j)...
173 :			pand mm5, mm7 ; lsb's of (i^j)...
174 :			psubb mm0, mm4 ; ...are subtracted from result of pavgb
175 :	Isibaar	262	movq [ecx], mm0
176 :	edgomez	1382	psubb mm1, mm5 ; ...are subtracted from result of pavgb
177 :	Isibaar	262	movq [ecx+edx], mm1
178 :			%endmacro
179 :
180 :	edgomez	1382	ALIGN 16
181 :	Isibaar	262	interpolate8x8_halfpel_v_xmm: ; stack args: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding; uses eax, ecx, edx and MMX regs
182 :
183 :			mov eax, [esp+16]; rounding
184 :	edgomez	1382	mov ecx, [esp+ 4] ; Dst
185 :	Isibaar	262	test eax,eax ; ZF = (rounding == 0); the mov instructions below leave the flags untouched
186 :	edgomez	1382	mov eax, [esp+ 8] ; Src
187 :			mov edx, [esp+12] ; stride
188 :	Isibaar	262
189 :	edgomez	1382	; we process 2 lines at a time
190 :	Isibaar	262	jnz near .rounding1 ; any non-zero rounding value selects the round-down path
191 :
192 :			COPY_V_SSE_RND0 ; rows 0-1 (the macro advances eax, not ecx)
193 :			lea ecx, [ecx+2*edx]
194 :			COPY_V_SSE_RND0 ; rows 2-3
195 :			lea ecx, [ecx+2*edx]
196 :			COPY_V_SSE_RND0 ; rows 4-5
197 :			lea ecx, [ecx+2*edx]
198 :			COPY_V_SSE_RND0 ; rows 6-7
199 :			ret
200 :
201 :			.rounding1:
202 :	edgomez	1382	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
203 :	Isibaar	262	movq mm7, [mmx_one] ; LSB mask expected by COPY_V_SSE_RND1
204 :	edgomez	1382	movq mm2, [eax] ; loop invariant
205 :	Isibaar	262	add eax, edx ; the macro expects eax on the row after the one held in mm2
206 :
207 :			COPY_V_SSE_RND1 ; rows 0-1
208 :			lea ecx,[ecx+2*edx]
209 :			COPY_V_SSE_RND1 ; rows 2-3
210 :			lea ecx,[ecx+2*edx]
211 :			COPY_V_SSE_RND1 ; rows 4-5
212 :			lea ecx,[ecx+2*edx]
213 :			COPY_V_SSE_RND1 ; rows 6-7
214 :			ret
215 :	edgomez	1540	.endfunc: ; end marker referenced by cglobal when MARK_FUNCS is defined
216 :	Isibaar	262
217 :			;===========================================================================
218 :			;
219 :			; void interpolate8x8_halfpel_hv_xmm(uint8_t * const dst,
220 :	edgomez	1382	; const uint8_t * const src,
221 :			; const uint32_t stride,
222 :			; const uint32_t rounding);
223 :	Isibaar	262	;
224 :			;
225 :			;===========================================================================
226 :
227 :			; The trick is to correct the result of 'pavgb' with some combination of the
228 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
229 :			; The boolean relations are:
230 :	edgomez	1382	; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
231 :	Isibaar	262	; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij\|kl)&st
232 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)\|st
233 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij\|kl)\|st
234 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
235 :
236 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
237 :
238 :			%macro COPY_HV_SSE_RND0 0 ; two rows of 2-D halfpel using the (ij|kl)&st correction. In: eax=src row, ecx=dst, edx=stride, mm2/mm3=avg/xor of the current row, mm7=mmx_one.
239 :	edgomez	1382	lea eax, [eax+edx] ; step to the next source row
240 :	Isibaar	262
241 :	edgomez	1382	movq mm0, [eax]
242 :			movq mm1, [eax+1]
243 :	Isibaar	262
244 :	edgomez	1382	movq mm6, mm0
245 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
246 :			lea eax, [eax+edx] ; step to the row after that
247 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
248 :	Isibaar	262
249 :	edgomez	1382	por mm3, mm1 ; ij \|= jk
250 :			movq mm6, mm2
251 :			pxor mm6, mm0 ; mm6 = s^t
252 :			pand mm3, mm6 ; (ij\|jk) &= st
253 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
254 :			pand mm3, mm7 ; mask lsb
255 :			psubb mm2, mm3 ; apply.
256 :	Isibaar	262
257 :	edgomez	1382	movq [ecx], mm2 ; first output row
258 :	Isibaar	262
259 :	edgomez	1382	movq mm2, [eax]
260 :			movq mm3, [eax+1]
261 :			movq mm6, mm2
262 :			pavgb mm2, mm3 ; preserved for next iteration
263 :			lea ecx,[ecx+edx] ; dst += 1 row; the caller adds the second row step
264 :			pxor mm3, mm6 ; preserved for next iteration
265 :	Isibaar	262
266 :	edgomez	1382	por mm1, mm3 ; same correction, with mm0/mm1 as the upper row and mm2/mm3 as the lower row
267 :			movq mm6, mm0
268 :			pxor mm6, mm2
269 :			pand mm1, mm6
270 :			pavgb mm0, mm2
271 :	Isibaar	262
272 :	edgomez	1382	pand mm1, mm7
273 :			psubb mm0, mm1
274 :	Isibaar	262
275 :	edgomez	1382	movq [ecx], mm0 ; second output row
276 :	Isibaar	262	%endmacro
277 :
278 :			%macro COPY_HV_SSE_RND1 0 ; two rows of 2-D halfpel using the (ij&kl)|st correction. In: eax=src row, ecx=dst, edx=stride, mm2/mm3=avg/xor of the current row, mm7=mmx_one.
279 :	edgomez	1382	lea eax, [eax+edx] ; step to the next source row
280 :	Isibaar	262
281 :	edgomez	1382	movq mm0, [eax]
282 :			movq mm1, [eax+1]
283 :	Isibaar	262
284 :	edgomez	1382	movq mm6, mm0
285 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
286 :			lea eax, [eax+edx] ; step to the row after that
287 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
288 :	Isibaar	262
289 :	edgomez	1382	pand mm3, mm1 ; ij &= kl
290 :			movq mm6, mm2
291 :			pxor mm6, mm0 ; mm6 = s^t
292 :			por mm3, mm6 ; (ij&kl) |= st
293 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
294 :			pand mm3, mm7 ; mask lsb
295 :			psubb mm2, mm3 ; apply the correction
296 :	Isibaar	262
297 :	edgomez	1382	movq [ecx], mm2 ; first output row
298 :	Isibaar	262
299 :	edgomez	1382	movq mm2, [eax]
300 :			movq mm3, [eax+1]
301 :			movq mm6, mm2
302 :			pavgb mm2, mm3 ; preserved for next iteration
303 :			lea ecx,[ecx+edx] ; dst += 1 row; the caller adds the second row step
304 :			pxor mm3, mm6 ; preserved for next iteration
305 :	Isibaar	262
306 :	edgomez	1382	pand mm1, mm3 ; same correction, with mm0/mm1 as the upper row and mm2/mm3 as the lower row
307 :			movq mm6, mm0
308 :			pxor mm6, mm2
309 :			por mm1, mm6
310 :			pavgb mm0, mm2
311 :			pand mm1, mm7
312 :			psubb mm0, mm1
313 :	Isibaar	262
314 :	edgomez	1382	movq [ecx], mm0 ; second output row
315 :	Isibaar	262	%endmacro
316 :
317 :	edgomez	1382	ALIGN 16
318 :	Isibaar	262	interpolate8x8_halfpel_hv_xmm: ; stack args: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding; uses eax, ecx, edx and MMX regs
319 :	edgomez	1382	mov eax, [esp+16] ; rounding
320 :			mov ecx, [esp+ 4] ; Dst
321 :			test eax, eax ; ZF = (rounding == 0); consumed by the jnz after the setup below
322 :			mov eax, [esp+ 8] ; Src
323 :			mov edx, [esp+12] ; stride
324 :	Isibaar	262
325 :			movq mm7, [mmx_one] ; LSB mask used by both rounding variants
326 :
327 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
328 :			movq mm2, [eax]
329 :			movq mm3, [eax+1]
330 :			movq mm6, mm2
331 :			pavgb mm2, mm3
332 :	edgomez	1382	pxor mm3, mm6 ; mm2/mm3 ready
333 :	Isibaar	262
334 :			jnz near .rounding1 ; flags still come from the test above: mov and MMX instructions do not modify them
335 :
336 :			COPY_HV_SSE_RND0 ; rows 0-1 (the macro advances ecx by one row only)
337 :			add ecx, edx
338 :			COPY_HV_SSE_RND0 ; rows 2-3
339 :			add ecx, edx
340 :			COPY_HV_SSE_RND0 ; rows 4-5
341 :			add ecx, edx
342 :			COPY_HV_SSE_RND0 ; rows 6-7
343 :			ret
344 :
345 :			.rounding1:
346 :			COPY_HV_SSE_RND1 ; rows 0-1
347 :			add ecx, edx
348 :			COPY_HV_SSE_RND1 ; rows 2-3
349 :			add ecx, edx
350 :			COPY_HV_SSE_RND1 ; rows 4-5
351 :			add ecx, edx
352 :			COPY_HV_SSE_RND1 ; rows 6-7
353 :	edgomez	1382	ret
354 :	edgomez	1540	.endfunc: ; end marker referenced by cglobal when MARK_FUNCS is defined
355 :	edgomez	1530
356 :			;===========================================================================
357 :			;
358 :			; The next functions combine both source halfpel interpolation step and the
359 :			; averaging (with rounding) step to avoid wasting memory bandwidth computing
360 :			; intermediate halfpel images and then averaging them.
361 :			;
362 :			;===========================================================================
363 :
364 :			%macro PROLOG0 0 ; load the three pointer/stride stack arguments; must run before anything changes esp
365 :			mov ecx, [esp+ 4] ; Dst
366 :			mov eax, [esp+ 8] ; Src
367 :			mov edx, [esp+12] ; BpS
368 :			%endmacro
369 :			%macro PROLOG1 0 ; PROLOG0, then set ZF from bit 0 of the rounding argument
370 :			PROLOG0
371 :			test dword [esp+16], 1; Rounding?
372 :			%endmacro
373 :			%macro EPILOG 0 ; plain near return; no stack frame is torn down here
374 :			ret
375 :			%endmacro
376 :
377 :			;===========================================================================
378 :			;
379 :			; void interpolate8x8_halfpel_add_xmm(uint8_t * const dst,
380 :			; const uint8_t * const src,
381 :			; const uint32_t stride,
382 :			; const uint32_t rounding);
383 :			;
384 :			;
385 :			;===========================================================================
386 :
387 :			%macro ADD_FF 2 ; average two source rows into dst. %1, %2 = row offsets; in: eax=src, ecx=dst. Clobbers mm0-mm1.
388 :			movq mm0, [eax+%1]
389 :			movq mm1, [eax+%2]
390 :			;;---
391 :			;; movq mm2, mm0
392 :			;; movq mm3, mm1
393 :			;;---
394 :			pavgb mm0, [ecx+%1] ; dst = (src + dst + 1) >> 1 per byte
395 :			pavgb mm1, [ecx+%2]
396 :			;;--
397 :			;; por mm2, [ecx+%1]
398 :			;; por mm3, [ecx+%2]
399 :			;; pand mm2, [mmx_one]
400 :			;; pand mm3, [mmx_one]
401 :			;; psubsb mm0, mm2
402 :			;; psubsb mm1, mm3
403 :			;;--
404 :			movq [ecx+%1], mm0
405 :			movq [ecx+%2], mm1
406 :			%endmacro
407 :
408 :			ALIGN 16
409 :			interpolate8x8_halfpel_add_xmm: ; 23c
410 :			PROLOG0 ; ecx=dst, eax=src, edx=stride; the rounding argument is not consulted on any path here
411 :			ADD_FF 0, edx ; rows 0-1: dst = pavgb(src, dst)
412 :			lea eax,[eax+2*edx]
413 :			lea ecx,[ecx+2*edx]
414 :			ADD_FF 0, edx ; rows 2-3
415 :			lea eax,[eax+2*edx]
416 :			lea ecx,[ecx+2*edx]
417 :			ADD_FF 0, edx ; rows 4-5
418 :			lea eax,[eax+2*edx]
419 :			lea ecx,[ecx+2*edx]
420 :			ADD_FF 0, edx ; rows 6-7
421 :			EPILOG
422 :	edgomez	1540	.endfunc: ; end marker referenced by cglobal when MARK_FUNCS is defined
423 :	edgomez	1530
424 :			;===========================================================================
425 :			;
426 :			; void interpolate8x8_halfpel_h_add_xmm(uint8_t * const dst,
427 :			; const uint8_t * const src,
428 :			; const uint32_t stride,
429 :			; const uint32_t rounding);
430 :			;
431 :			;
432 :			;===========================================================================
433 :
434 :
435 :			%macro ADD_FH_RND0 2 ; horizontal halfpel (round-up) of two rows, averaged into dst. %1, %2 = row offsets; in: eax=src, ecx=dst.
436 :			movq mm0, [eax+%1]
437 :			movq mm1, [eax+%2]
438 :			pavgb mm0, [eax+%1+1] ; (p[x] + p[x+1] + 1) >> 1
439 :			pavgb mm1, [eax+%2+1]
440 :			pavgb mm0, [ecx+%1] ; then round-up average with the existing dst bytes
441 :			pavgb mm1, [ecx+%2]
442 :			movq [ecx+%1],mm0
443 :			movq [ecx+%2],mm1
444 :			%endmacro
445 :
446 :			%macro ADD_FH_RND1 2 ; horizontal halfpel (rounded down) of two rows, averaged into dst. %1, %2 = row offsets; in: eax=src, ecx=dst. Clobbers mm0-mm5.
447 :			movq mm0, [eax+%1]
448 :			movq mm1, [eax+%2]
449 :			movq mm4, mm0 ; keep copies of the left pixels for the xor below
450 :			movq mm5, mm1
451 :			movq mm2, [eax+%1+1]
452 :			movq mm3, [eax+%2+1]
453 :			pavgb mm0, mm2
454 :			; lea ??
455 :			pxor mm2, mm4 ; i^j
456 :			pavgb mm1, mm3
457 :			pxor mm3, mm5
458 :			pand mm2, [mmx_one] ; LSB mask is read from memory here rather than from mm7
459 :			pand mm3, [mmx_one]
460 :			psubb mm0, mm2 ; (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
461 :			psubb mm1, mm3
462 :			pavgb mm0, [ecx+%1] ; the final merge with dst is a round-up average
463 :			pavgb mm1, [ecx+%2]
464 :			movq [ecx+%1],mm0
465 :			movq [ecx+%2],mm1
466 :			%endmacro
467 :
468 :			ALIGN 16
469 :			interpolate8x8_halfpel_h_add_xmm: ; 32c
470 :			PROLOG1 ; ecx=dst, eax=src, edx=stride; ZF = (bit 0 of rounding == 0)
471 :			jnz near .Loop1 ; rounding bit set: take the round-down path
472 :			ADD_FH_RND0 0, edx ; rows 0-1
473 :			lea eax,[eax+2*edx]
474 :			lea ecx,[ecx+2*edx]
475 :			ADD_FH_RND0 0, edx ; rows 2-3
476 :			lea eax,[eax+2*edx]
477 :			lea ecx,[ecx+2*edx]
478 :			ADD_FH_RND0 0, edx ; rows 4-5
479 :			lea eax,[eax+2*edx]
480 :			lea ecx,[ecx+2*edx]
481 :			ADD_FH_RND0 0, edx ; rows 6-7
482 :			EPILOG
483 :
484 :			.Loop1:
485 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
486 :			; movq mm7, [mmx_one]
487 :			ADD_FH_RND1 0, edx ; rows 0-1
488 :			lea eax,[eax+2*edx]
489 :			lea ecx,[ecx+2*edx]
490 :			ADD_FH_RND1 0, edx ; rows 2-3
491 :			lea eax,[eax+2*edx]
492 :			lea ecx,[ecx+2*edx]
493 :			ADD_FH_RND1 0, edx ; rows 4-5
494 :			lea eax,[eax+2*edx]
495 :			lea ecx,[ecx+2*edx]
496 :			ADD_FH_RND1 0, edx ; rows 6-7
497 :			EPILOG
498 :	edgomez	1540	.endfunc: ; end marker referenced by cglobal when MARK_FUNCS is defined
499 :	edgomez	1530
500 :
501 :			;===========================================================================
502 :			;
503 :			; void interpolate8x8_halfpel_v_add_xmm(uint8_t * const dst,
504 :			; const uint8_t * const src,
505 :			; const uint32_t stride,
506 :			; const uint32_t rounding);
507 :			;
508 :			;
509 :			;===========================================================================
510 :
511 :			%macro ADD_8_HF_RND0 0 ; vertical halfpel (round-up) of two rows, averaged into dst. In: eax=src, ecx=dst, edx=stride. Clobbers mm0-mm1.
512 :			movq mm0, [eax] ; row n
513 :			movq mm1, [eax+edx] ; row n+1
514 :			pavgb mm0, mm1 ; avg(row n, row n+1)
515 :			pavgb mm1, [eax+2*edx] ; avg(row n+1, row n+2)
516 :			lea eax,[eax+2*edx] ; src += 2 rows; ecx is left for the caller to advance
517 :			pavgb mm0, [ecx] ; merge with the existing dst bytes
518 :			pavgb mm1, [ecx+edx]
519 :			movq [ecx],mm0
520 :			movq [ecx+edx],mm1
521 :			%endmacro
522 :
523 :			%macro ADD_8_HF_RND1 0 ; vertical halfpel (rounded down) of two rows, averaged into dst. In: mm0=row n (preloaded by the caller), eax=src, ecx=dst, edx=stride, mm7=mmx_one.
524 :			movq mm1, [eax+edx] ; row n+1
525 :			movq mm2, [eax+2*edx] ; row n+2; left in mm2 for the caller to carry over
526 :			lea eax,[eax+2*edx] ; src += 2 rows; ecx is left for the caller to advance
527 :			movq mm4, mm0
528 :			movq mm5, mm1
529 :			pavgb mm0, mm1
530 :			pxor mm4, mm1
531 :			pavgb mm1, mm2
532 :			pxor mm5, mm2
533 :			pand mm4, mm7 ; lsb's of (i^j)...
534 :			pand mm5, mm7 ; lsb's of (i^j)...
535 :			psubb mm0, mm4 ; ...are subtracted from result of pavgb
536 :			pavgb mm0, [ecx] ; merge with the existing dst bytes (round-up average)
537 :			movq [ecx], mm0
538 :			psubb mm1, mm5 ; ...are subtracted from result of pavgb
539 :			pavgb mm1, [ecx+edx]
540 :			movq [ecx+edx], mm1
541 :			%endmacro
542 :
543 :			ALIGN 16
544 :			interpolate8x8_halfpel_v_add_xmm:
545 :			PROLOG1 ; ecx=dst, eax=src, edx=stride; ZF = (bit 0 of rounding == 0)
546 :
547 :			jnz near .Loop1 ; rounding bit set: take the round-down path
548 :			pxor mm7, mm7 ; this is a NOP (ADD_8_HF_RND0 does not read mm7)
549 :
550 :			ADD_8_HF_RND0 ; rows 0-1 (the macro advances eax, not ecx)
551 :			lea ecx,[ecx+2*edx]
552 :			ADD_8_HF_RND0 ; rows 2-3
553 :			lea ecx,[ecx+2*edx]
554 :			ADD_8_HF_RND0 ; rows 4-5
555 :			lea ecx,[ecx+2*edx]
556 :			ADD_8_HF_RND0 ; rows 6-7
557 :			EPILOG
558 :
559 :			.Loop1:
560 :			movq mm0, [eax] ; loop invariant
561 :			movq mm7, [mmx_one] ; LSB mask expected by ADD_8_HF_RND1
562 :
563 :			ADD_8_HF_RND1 ; rows 0-1
564 :			movq mm0, mm2 ; carry the last loaded source row into the next expansion
565 :			lea ecx,[ecx+2*edx]
566 :			ADD_8_HF_RND1 ; rows 2-3
567 :			movq mm0, mm2
568 :			lea ecx,[ecx+2*edx]
569 :			ADD_8_HF_RND1 ; rows 4-5
570 :			movq mm0, mm2
571 :			lea ecx,[ecx+2*edx]
572 :			ADD_8_HF_RND1 ; rows 6-7
573 :			EPILOG
574 :	edgomez	1540	.endfunc: ; end marker referenced by cglobal when MARK_FUNCS is defined
575 :	edgomez	1530
576 :			; The trick is to correct the result of 'pavgb' with some combination of the
577 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
578 :			; The boolean relations are:
579 :			; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
580 :			; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij\|kl)&st
581 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)\|st
582 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij\|kl)\|st
583 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
584 :
585 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
586 :
587 :			;===========================================================================
588 :			;
589 :			; void interpolate8x8_halfpel_hv_add_xmm(uint8_t * const dst,
590 :			; const uint8_t * const src,
591 :			; const uint32_t stride,
592 :			; const uint32_t rounding);
593 :			;
594 :			;
595 :			;===========================================================================
596 :
597 :			%macro ADD_HH_RND0 0 ; two rows of 2-D halfpel using the (ij|kl)&st correction, averaged into dst. In: eax=src row, ecx=dst, edx=stride, mm2/mm3=avg/xor of the current row, mm7=mmx_one.
598 :			lea eax,[eax+edx] ; step to the next source row
599 :
600 :			movq mm0, [eax]
601 :			movq mm1, [eax+1]
602 :
603 :			movq mm6, mm0
604 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
605 :			lea eax,[eax+edx] ; step to the row after that
606 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
607 :
608 :			por mm3, mm1 ; ij \|= jk
609 :			movq mm6, mm2
610 :			pxor mm6, mm0 ; mm6 = s^t
611 :			pand mm3, mm6 ; (ij\|jk) &= st
612 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
613 :			pand mm3, mm7 ; mask lsb
614 :			psubb mm2, mm3 ; apply.
615 :
616 :			pavgb mm2, [ecx] ; merge with the existing dst bytes (round-up average)
617 :			movq [ecx], mm2 ; first output row
618 :
619 :			movq mm2, [eax]
620 :			movq mm3, [eax+1]
621 :			movq mm6, mm2
622 :			pavgb mm2, mm3 ; preserved for next iteration
623 :			lea ecx,[ecx+edx] ; dst += 1 row; the caller adds the second row step
624 :			pxor mm3, mm6 ; preserved for next iteration
625 :
626 :			por mm1, mm3 ; same correction, with mm0/mm1 as the upper row and mm2/mm3 as the lower row
627 :			movq mm6, mm0
628 :			pxor mm6, mm2
629 :			pand mm1, mm6
630 :			pavgb mm0, mm2
631 :
632 :			pand mm1, mm7
633 :			psubb mm0, mm1
634 :
635 :			pavgb mm0, [ecx] ; merge with the existing dst bytes
636 :			movq [ecx], mm0 ; second output row
637 :			%endmacro
638 :
639 :			%macro ADD_HH_RND1 0 ; two rows of 2-D halfpel using the (ij&kl)|st correction, averaged into dst. In: eax=src row, ecx=dst, edx=stride, mm2/mm3=avg/xor of the current row, mm7=mmx_one.
640 :			lea eax,[eax+edx] ; step to the next source row
641 :
642 :			movq mm0, [eax]
643 :			movq mm1, [eax+1]
644 :
645 :			movq mm6, mm0
646 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
647 :			lea eax,[eax+edx] ; step to the row after that
648 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
649 :
650 :			pand mm3, mm1 ; ij &= kl
651 :			movq mm6, mm2
652 :			pxor mm6, mm0 ; mm6 = s^t
653 :			por mm3, mm6 ; (ij&kl) |= st
654 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
655 :			pand mm3, mm7 ; mask lsb
656 :			psubb mm2, mm3 ; apply the correction
657 :
658 :			pavgb mm2, [ecx] ; merge with the existing dst bytes (round-up average)
659 :			movq [ecx], mm2 ; first output row
660 :
661 :			movq mm2, [eax]
662 :			movq mm3, [eax+1]
663 :			movq mm6, mm2
664 :			pavgb mm2, mm3 ; preserved for next iteration
665 :			lea ecx,[ecx+edx] ; dst += 1 row; the caller adds the second row step
666 :			pxor mm3, mm6 ; preserved for next iteration
667 :
668 :			pand mm1, mm3 ; same correction, with mm0/mm1 as the upper row and mm2/mm3 as the lower row
669 :			movq mm6, mm0
670 :			pxor mm6, mm2
671 :			por mm1, mm6
672 :			pavgb mm0, mm2
673 :			pand mm1, mm7
674 :			psubb mm0, mm1
675 :
676 :			pavgb mm0, [ecx] ; merge with the existing dst bytes
677 :			movq [ecx], mm0 ; second output row
678 :			%endmacro
679 :
680 :			ALIGN 16
681 :			interpolate8x8_halfpel_hv_add_xmm:
682 :			PROLOG1 ; ecx=dst, eax=src, edx=stride; ZF = (bit 0 of rounding == 0)
683 :
684 :			movq mm7, [mmx_one] ; LSB mask used by both rounding variants
685 :
686 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
687 :			movq mm2, [eax]
688 :			movq mm3, [eax+1]
689 :			movq mm6, mm2
690 :			pavgb mm2, mm3
691 :			pxor mm3, mm6 ; mm2/mm3 ready
692 :
693 :			jnz near .Loop1 ; flags still come from PROLOG1: the MMX instructions above do not modify them
694 :
695 :			ADD_HH_RND0 ; rows 0-1 (the macro advances ecx by one row only)
696 :			add ecx, edx
697 :			ADD_HH_RND0 ; rows 2-3
698 :			add ecx, edx
699 :			ADD_HH_RND0 ; rows 4-5
700 :			add ecx, edx
701 :			ADD_HH_RND0 ; rows 6-7
702 :			EPILOG
703 :
704 :			.Loop1:
705 :			ADD_HH_RND1 ; rows 0-1
706 :			add ecx, edx
707 :			ADD_HH_RND1 ; rows 2-3
708 :			add ecx, edx
709 :			ADD_HH_RND1 ; rows 4-5
710 :			add ecx, edx
711 :			ADD_HH_RND1 ; rows 6-7
712 :
713 :			EPILOG
714 :	edgomez	1540	.endfunc: ; end marker referenced by cglobal when MARK_FUNCS is defined
715 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4