Annotation of /trunk/xvidcore/src/image/x86_asm/interpolate8x8_3dne.asm

Revision 1632 - (view) (download)

1 :	edgomez	1382	;/*****************************************************************************
2 :	edgomez	851	; *
3 :	edgomez	1382	; * XVID MPEG-4 VIDEO CODEC
4 :			; * - 3dne pipeline optimized 8x8 block-based halfpel interpolation -
5 :	edgomez	851	; *
6 :	edgomez	1382	; * Copyright(C) 2002 Jaan Kalda
7 :	edgomez	851	; *
8 :	edgomez	1382	; * This program is free software ; you can redistribute it and/or modify
9 :			; * it under the terms of the GNU General Public License as published by
10 :			; * the Free Software Foundation ; either version 2 of the License, or
11 :			; * (at your option) any later version.
12 :	edgomez	851	; *
13 :	edgomez	1382	; * This program is distributed in the hope that it will be useful,
14 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
15 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 :			; * GNU General Public License for more details.
17 :	edgomez	851	; *
18 :	edgomez	1382	; * You should have received a copy of the GNU General Public License
19 :			; * along with this program ; if not, write to the Free Software
20 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 :	edgomez	851	; *
22 :	edgomez	1382	; ****************************************************************************/
23 :	edgomez	851
24 :	edgomez	1382	; these 3dne functions are compatible with iSSE, but are optimized specifically
25 :			; for K7 pipelines
26 :	edgomez	851
27 :	edgomez	1382	BITS 32
28 :	edgomez	851
29 :	edgomez	1382	%macro cglobal 1	; cglobal <name>: declare <name> as a global (exported) symbol
30 :	edgomez	851	%ifdef PREFIX	; PREFIX defined: the exported symbol gets a leading underscore
31 :	edgomez	1535	%ifdef MARK_FUNCS	; MARK_FUNCS defined: also tag the symbol ':function' with a size expression
32 :	edgomez	1540	global _%1:function %1.endfunc-%1	; size = distance from the entry label to its local .endfunc label
33 :			%define %1 _%1:function %1.endfunc-%1
34 :	edgomez	1535	%else	; NOTE(review): the %define above makes every later use of <name> expand to the full '_<name>:function ...' text - confirm the PREFIX+MARK_FUNCS combination is actually built
35 :			global _%1	; export _<name>; the %define below aliases <name> to it for the rest of the file
36 :			%define %1 _%1
37 :			%endif
38 :	edgomez	851	%else	; no PREFIX: the symbol is exported under its plain name
39 :	edgomez	1535	%ifdef MARK_FUNCS
40 :	edgomez	1540	global %1:function %1.endfunc-%1	; typed export; each function must end with a local .endfunc label
41 :	edgomez	1535	%else
42 :			global %1
43 :			%endif
44 :	edgomez	851	%endif
45 :			%endmacro
46 :	edgomez	1382
47 :			;=============================================================================
48 :			; Read only data
49 :			;=============================================================================
50 :
51 :	edgomez	851	%ifdef FORMAT_COFF	; FORMAT_COFF builds declare the section without the align= attribute
52 :	edgomez	1519	SECTION .rodata
53 :	edgomez	851	%else
54 :	edgomez	1519	SECTION .rodata align=16
55 :	edgomez	851	%endif
56 :
57 :	edgomez	1382	ALIGN 16
58 :			mmx_one:	; 8 bytes of 0x01: per-byte LSB mask loaded into mm7 by the rounding-correction code
59 :			times 8 db 1
60 :	edgomez	851
61 :	edgomez	1382	ALIGN 8
62 :	edgomez	851	mm_minusone:	; 64 bits of all ones (0xFF in every byte); minuend for the "255 - x" steps in the vertical rounding path
63 :	edgomez	1382	dd -1,-1
64 :	edgomez	851
65 :	edgomez	1382	;=============================================================================
66 :			; Macros
67 :			;=============================================================================
68 :	edgomez	851
69 :	edgomez	1382	%macro nop4 0	; emit a 4-byte filler instruction as raw opcode bytes (not referenced in the visible part of this file)
70 :			DB 08Dh,074h,026h,0	; 8D 74 26 00 = lea esi,[esi+0]: writes esi with its own value, flags untouched
71 :			%endmacro
72 :
73 :			;=============================================================================
74 :			; Code
75 :			;=============================================================================
76 :
77 :			SECTION .text
78 :			; exported entry points operating on 8x8 blocks (h / v / hv halfpel)
79 :	edgomez	851	cglobal interpolate8x8_halfpel_h_3dne
80 :			cglobal interpolate8x8_halfpel_v_3dne
81 :			cglobal interpolate8x8_halfpel_hv_3dne
82 :			; exported entry points operating on 8x4 blocks (same three directions, half as many rows)
83 :	suxen_drol	1632	cglobal interpolate8x4_halfpel_h_3dne
84 :			cglobal interpolate8x4_halfpel_v_3dne
85 :			cglobal interpolate8x4_halfpel_hv_3dne
86 :
87 :	edgomez	1382	;-----------------------------------------------------------------------------
88 :	edgomez	851	;
89 :			; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst,
90 :	edgomez	1382	; const uint8_t * const src,
91 :			; const uint32_t stride,
92 :			; const uint32_t rounding);
93 :	edgomez	851	;
94 :	edgomez	1382	;-----------------------------------------------------------------------------
95 :	edgomez	851
96 :			%macro COPY_H_SSE_RND0 1	; two rows of horizontal halfpel, round-up average. in: eax=src, ecx=dst, edx=stride; %1 selects the encoding of the first load
97 :			%if (%1)
98 :	edgomez	1382	movq mm0, [eax]	; row 0, pixels x..x+7 (short encoding)
99 :	edgomez	851	%else
100 :	edgomez	1382	movq mm0, [dword eax]	; same load with a forced 32-bit displacement; presumably code-size padding for alignment - confirm
101 :	edgomez	851	%endif
102 :			pavgb mm0, [eax+1]	; mm0 = (p[x] + p[x+1] + 1) >> 1 per byte
103 :	edgomez	1382	movq mm1, [eax+edx]	; row 1, pixels x..x+7
104 :	edgomez	851	pavgb mm1, [eax+edx+1]	; same average for row 1
105 :	edgomez	1382	lea eax, [eax+2*edx]	; src advances two rows; ecx is NOT advanced here, the caller does it
106 :			movq [ecx], mm0	; store dst row 0
107 :			movq [ecx+edx], mm1	; store dst row 1
108 :	edgomez	851	%endmacro
109 :
110 :			%macro COPY_H_SSE_RND1 0	; two rows of horizontal halfpel, round-down average. in: eax=src, ecx=dst, edx=stride, mm7=0x01 in every byte
111 :			movq mm0, [eax]	; row 0, pixels x
112 :			movq mm1, [eax+edx]	; row 1, pixels x
113 :			movq mm4, mm0	; keep unaveraged copies for the xor below
114 :			movq mm5, mm1
115 :	edgomez	1382	movq mm2, [eax+1]	; row 0, pixels x+1
116 :	edgomez	851	movq mm3, [eax+edx+1]	; row 1, pixels x+1
117 :			pavgb mm0, mm2	; mm0 = (a + b + 1) >> 1 for row 0
118 :			pxor mm2, mm4	; mm2 = a ^ b for row 0
119 :			pavgb mm1, mm3	; mm1 = (a + b + 1) >> 1 for row 1
120 :	edgomez	1382	lea eax, [eax+2*edx]	; src advances two rows; ecx is left for the caller to advance
121 :	edgomez	851	pxor mm3, mm5	; mm3 = a ^ b for row 1
122 :			pand mm2, mm7	; keep only the LSB of each xor: 1 where a+b is odd
123 :			pand mm3, mm7
124 :			psubb mm0, mm2	; (a+b+1)/2 - ((a^b)&1) = (a+b)/2, i.e. undo the round-up
125 :			movq [ecx], mm0	; store dst row 0
126 :			psubb mm1, mm3
127 :	edgomez	1382	movq [ecx+edx], mm1	; store dst row 1
128 :	edgomez	851	%endmacro
129 :
130 :	edgomez	1382	ALIGN 16
131 :	edgomez	851	interpolate8x8_halfpel_h_3dne:
132 :			; stack arguments: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding; no emms is issued here
133 :			mov eax, [esp+ 8] ; Src
134 :			mov edx, [esp+12] ; stride
135 :			dec dword [esp+16]; rounding: ZF is set only when rounding == 1 (the argument slot itself is decremented)
136 :
137 :			jz .rounding1
138 :			mov ecx, [esp+ 4] ; Dst
139 :			; rounding != 1: round-up averages, 4 macro expansions x 2 rows = 8 rows
140 :			COPY_H_SSE_RND0 0
141 :			lea ecx,[ecx+2*edx] ; the macros advance src only; dst is stepped here
142 :			COPY_H_SSE_RND0 1
143 :			lea ecx,[ecx+2*edx]
144 :			COPY_H_SSE_RND0 1
145 :			lea ecx,[ecx+2*edx]
146 :			COPY_H_SSE_RND0 1
147 :			ret
148 :
149 :			.rounding1:
150 :	edgomez	1382	; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
151 :	edgomez	851	mov ecx, [esp+ 4] ; Dst
152 :			movq mm7, [mmx_one] ; mm7 = 0x01 in every byte, the LSB mask COPY_H_SSE_RND1 expects
153 :			COPY_H_SSE_RND1
154 :			lea ecx, [ecx+2*edx]
155 :			COPY_H_SSE_RND1
156 :			lea ecx,[ecx+2*edx]
157 :			COPY_H_SSE_RND1
158 :			lea ecx,[ecx+2*edx]
159 :			COPY_H_SSE_RND1
160 :			ret
161 :	edgomez	1540	.endfunc:
162 :	edgomez	851
163 :	edgomez	1382	;-----------------------------------------------------------------------------
164 :	edgomez	851	;
165 :			; void interpolate8x8_halfpel_v_3dne(uint8_t * const dst,
166 :	edgomez	1382	; const uint8_t * const src,
167 :			; const uint32_t stride,
168 :			; const uint32_t rounding);
169 :	edgomez	851	;
170 :	edgomez	1382	;-----------------------------------------------------------------------------
171 :	edgomez	851
172 :	edgomez	1382	ALIGN 16
173 :	edgomez	851	interpolate8x8_halfpel_v_3dne:
174 :			; stack arguments: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding; reads 9 source rows, writes 8
175 :			mov eax, [esp+ 8] ; Src
176 :			mov edx, [esp+12] ; stride
177 :			dec dword [esp+16]; rounding: ZF is set only when rounding == 1
178 :
179 :			; we process 2 lines at a time
180 :
181 :			jz .rounding1
182 :			pxor mm2,mm2 ; zero + por below acts as a load of row 2
183 :	edgomez	1382	movq mm0, [eax] ; row 0
184 :			movq mm1, [eax+edx] ; row 1
185 :			por mm2, [eax+2*edx] ; mm2 = row 2
186 :	edgomez	851	mov ecx, [esp+ 4] ; Dst
187 :	edgomez	1382	lea eax, [eax+2*edx] ; eax = row 2
188 :			pxor mm4, mm4
189 :	edgomez	851	pavgb mm0, mm1 ; dst row 0 = avg(row 0, row 1), rounded up
190 :	edgomez	1382	pavgb mm1, mm2 ; dst row 1 = avg(row 1, row 2)
191 :			movq [byte ecx], mm0
192 :			movq [ecx+edx], mm1
193 :			pxor mm6, mm6
194 :			add eax, edx ; eax = row 3
195 :			lea ecx, [ecx+2*edx]
196 :			movq mm3, [byte eax] ; row 3
197 :			por mm4, [eax+edx] ; mm4 = row 4
198 :			lea eax, [eax+2*edx] ; eax = row 5
199 :	edgomez	851	pavgb mm2, mm3 ; dst row 2
200 :			pavgb mm3, mm4 ; dst row 3
201 :	edgomez	1382	movq [ecx], mm2
202 :			movq [ecx+edx], mm3
203 :			lea ecx, [byte ecx+2*edx]
204 :			movq mm5, [byte eax] ; row 5
205 :			por mm6, [eax+edx] ; mm6 = row 6
206 :			lea eax, [eax+2*edx] ; eax = row 7
207 :	edgomez	851	pavgb mm4, mm5 ; dst row 4
208 :			pavgb mm5, mm6 ; dst row 5
209 :	edgomez	1382	movq [ecx], mm4
210 :			movq [ecx+edx], mm5
211 :			lea ecx, [ecx+2*edx]
212 :			movq mm7, [eax] ; row 7
213 :			movq mm0, [eax+edx] ; row 8
214 :	edgomez	851	pavgb mm6, mm7 ; dst row 6
215 :			pavgb mm7, mm0 ; dst row 7
216 :	edgomez	1382	movq [ecx], mm6
217 :			movq [ecx+edx], mm7
218 :	edgomez	851	ret
219 :
220 :	edgomez	1382	ALIGN 8
221 :	edgomez	851	.rounding1:
222 :	edgomez	1382	pcmpeqb mm0, mm0 ; all ones; round-down average is built as 255 - pavgb(255-a, 255-b)
223 :			psubusb mm0, [eax] ; mm0 = 255 - row 0
224 :			add eax, edx ; eax = row 1
225 :	edgomez	851	mov ecx, [esp+ 4] ; Dst (read before the push below, so the offset is still +4)
226 :			push esi ; esi is used as a pointer to the all-ones constant and restored before returning
227 :	edgomez	1382	pcmpeqb mm1, mm1
228 :			pcmpeqb mm2, mm2
229 :			mov esi, mm_minusone
230 :			psubusb mm1, [byte eax] ; 255 - row 1
231 :			psubusb mm2, [eax+edx] ; 255 - row 2
232 :			lea eax, [eax+2*edx] ; eax = row 3
233 :	edgomez	851	movq mm6, [esi]
234 :			movq mm7, [esi]
235 :			pavgb mm0, mm1
236 :			pavgb mm1, mm2
237 :	edgomez	1382	psubusb mm6, mm0 ; complement back: (row 0 + row 1) >> 1
238 :			psubusb mm7, mm1
239 :	edgomez	851	movq [ecx], mm6 ; dst row 0
240 :			movq [ecx+edx], mm7 ; dst row 1
241 :	edgomez	1382	lea ecx, [ecx+2*edx]
242 :			pcmpeqb mm3, mm3
243 :			pcmpeqb mm4, mm4
244 :			psubusb mm3, [eax] ; 255 - row 3
245 :			psubusb mm4, [eax+edx] ; 255 - row 4
246 :			lea eax, [eax+2*edx] ; eax = row 5
247 :	edgomez	851	pavgb mm2, mm3
248 :			pavgb mm3, mm4
249 :			movq mm0, [esi]
250 :			movq mm1, [esi]
251 :	edgomez	1382	psubusb mm0, mm2
252 :			psubusb mm1, mm3
253 :	edgomez	851	movq [ecx], mm0 ; dst row 2
254 :			movq [ecx+edx], mm1 ; dst row 3
255 :			lea ecx,[ecx+2*edx]
256 :
257 :	edgomez	1382	pcmpeqb mm5, mm5
258 :			pcmpeqb mm6, mm6
259 :			psubusb mm5, [eax] ; 255 - row 5
260 :			psubusb mm6, [eax+edx] ; 255 - row 6
261 :			lea eax, [eax+2*edx] ; eax = row 7
262 :	edgomez	851	pavgb mm4, mm5
263 :			pavgb mm5, mm6
264 :			movq mm2, [esi]
265 :			movq mm3, [esi]
266 :	edgomez	1382	psubusb mm2, mm4
267 :			psubusb mm3, mm5
268 :	edgomez	851	movq [ecx], mm2 ; dst row 4
269 :			movq [ecx+edx], mm3 ; dst row 5
270 :	edgomez	1382	lea ecx, [ecx+2*edx]
271 :			pcmpeqb mm7, mm7
272 :			pcmpeqb mm0, mm0
273 :			psubusb mm7, [eax] ; 255 - row 7
274 :			psubusb mm0, [eax+edx] ; 255 - row 8
275 :	edgomez	851	pavgb mm6, mm7
276 :			pavgb mm7, mm0
277 :			movq mm4, [esi]
278 :			movq mm5, [esi] ; last use of esi
279 :	edgomez	1382	psubusb mm4, mm6
280 :	edgomez	851	pop esi
281 :	edgomez	1382	psubusb mm5, mm7
282 :	edgomez	851	movq [ecx], mm4 ; dst row 6
283 :			movq [ecx+edx], mm5 ; dst row 7
284 :			ret
285 :	edgomez	1540	.endfunc:
286 :	edgomez	1382
287 :			;-----------------------------------------------------------------------------
288 :	edgomez	851	;
289 :			; void interpolate8x8_halfpel_hv_3dne(uint8_t * const dst,
290 :	edgomez	1382	; const uint8_t * const src,
291 :			; const uint32_t stride,
292 :			; const uint32_t rounding);
293 :	edgomez	851	;
294 :			;
295 :	edgomez	1382	;-----------------------------------------------------------------------------
296 :	edgomez	851
297 :			; The trick is to correct the result of 'pavgb' with some combination of the
298 :			; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
299 :			; The boolean relations are:
300 :	edgomez	1382	; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
301 :	edgomez	851	; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
302 :			; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
303 :			; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
304 :			; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
305 :
306 :			; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
307 :
308 :			%macro COPY_HV_SSE_RND0 0	; two output rows of hv halfpel for the rounding != 1 path: avg(s,t) - (((ij|kl)&st)&1)
309 :			; in: eax=src row, ecx=dst row, edx=stride, mm2=pavgb(p[x],p[x+1]) of the current row, mm3=p[x]^p[x+1] of that row, mm7=0x01 bytes
310 :	edgomez	1382	movq mm0, [eax+edx]	; next row, pixels x
311 :			movq mm1, [eax+edx+1]	; next row, pixels x+1
312 :	edgomez	851
313 :	edgomez	1382	movq mm6, mm0
314 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
315 :			lea eax, [eax+2*edx]	; src advances two rows; ecx is left for the caller to advance
316 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
317 :	edgomez	851
318 :	edgomez	1382	por mm3, mm1 ; ij |= jk
319 :			movq mm6, mm2
320 :			pxor mm6, mm0 ; mm6 = s^t
321 :			pand mm3, mm6 ; (ij|jk) &= st
322 :			pavgb mm2, mm0 ; mm2 = (s+t+1)/2
323 :			movq mm6, [eax]	; pixels x of the row two below the starting one, kept for the xor further down
324 :			pand mm3, mm7 ; mask lsb
325 :			psubb mm2, mm3 ; apply.
326 :	edgomez	851
327 :	edgomez	1382	movq [ecx], mm2	; store first output row
328 :	edgomez	851
329 :	edgomez	1382	movq mm2, [eax]
330 :			movq mm3, [eax+1]
331 :			pavgb mm2, mm3 ; preserved for next iteration
332 :			pxor mm3, mm6 ; preserved for next iteration
333 :	edgomez	851
334 :	edgomez	1382	por mm1, mm3	; same correction as above, now between the middle row (mm0/mm1) and the new row (mm2/mm3)
335 :			movq mm6, mm0
336 :			pxor mm6, mm2
337 :			pand mm1, mm6
338 :			pavgb mm0, mm2
339 :	edgomez	851
340 :	edgomez	1382	pand mm1, mm7
341 :			psubb mm0, mm1
342 :	edgomez	851
343 :	edgomez	1382	movq [ecx+edx], mm0	; store second output row; mm2/mm3 now hold the invariant for the next expansion
344 :	edgomez	851	%endmacro
345 :
346 :			%macro COPY_HV_SSE_RND1 0	; two output rows of hv halfpel for the rounding == 1 path: avg(s,t) - (((ij&kl)|st)&1). in: eax=src, ecx=dst, edx=stride, mm2/mm3=avg/xor of current row, mm7=0x01 bytes
347 :	edgomez	1382	movq mm0, [eax+edx]	; next row, pixels x
348 :			movq mm1, [eax+edx+1]	; next row, pixels x+1
349 :	edgomez	851
350 :	edgomez	1382	movq mm6, mm0
351 :			pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
352 :			lea eax,[eax+2*edx]	; src advances two rows; ecx is left for the caller to advance
353 :			pxor mm1, mm6 ; mm1=(j^k). preserved for next step
354 :	edgomez	851
355 :	edgomez	1382	pand mm3, mm1	; ij & kl
356 :			movq mm6, mm2
357 :			pxor mm6, mm0	; mm6 = s^t
358 :			por mm3, mm6	; (ij & kl) | st
359 :			pavgb mm2, mm0	; mm2 = (s+t+1)/2
360 :			movq mm6, [eax]	; pixels x of the row two below the starting one, kept for the xor further down
361 :			pand mm3, mm7	; mask lsb
362 :			psubb mm2, mm3	; apply the correction
363 :	edgomez	851
364 :	edgomez	1382	movq [ecx], mm2	; store first output row
365 :	edgomez	851
366 :	edgomez	1382	movq mm2, [eax]
367 :			movq mm3, [eax+1]
368 :			pavgb mm2, mm3 ; preserved for next iteration
369 :			pxor mm3, mm6 ; preserved for next iteration
370 :	edgomez	851
371 :	edgomez	1382	pand mm1, mm3	; same correction, now between the middle row (mm0/mm1) and the new row (mm2/mm3)
372 :			movq mm6, mm0
373 :			pxor mm6, mm2
374 :			por mm1, mm6
375 :			pavgb mm0, mm2
376 :			pand mm1, mm7
377 :			psubb mm0, mm1
378 :			movq [ecx+edx], mm0	; store second output row; mm2/mm3 now hold the invariant for the next expansion
379 :	edgomez	851	%endmacro
380 :
381 :	edgomez	1382	ALIGN 16
382 :	edgomez	851	interpolate8x8_halfpel_hv_3dne:
383 :	edgomez	1382	mov eax, [esp+ 8] ; Src
384 :			mov edx, [esp+12] ; stride
385 :			dec dword [esp+16] ; rounding: ZF is set only when rounding == 1; nothing below touches EFLAGS before the jz
386 :	edgomez	851
387 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
388 :			movq mm2, [eax] ; row 0, pixels x
389 :			movq mm3, [eax+1] ; row 0, pixels x+1
390 :			movq mm6, mm2
391 :			pavgb mm2, mm3
392 :	edgomez	1382	pxor mm3, mm6 ; mm2/mm3 ready
393 :			mov ecx, [esp+ 4] ; Dst
394 :	edgomez	851	movq mm7, [mmx_one] ; mm7 = 0x01 in every byte, the LSB mask used by both COPY_HV macros
395 :
396 :			jz near .rounding1
397 :	edgomez	1382	lea ebp,[byte ebp] ; 3-byte filler: rewrites ebp with its own value (presumably alignment padding)
398 :	edgomez	851	COPY_HV_SSE_RND0
399 :	edgomez	1382	lea ecx,[ecx+2*edx] ; the macros advance src only; dst is stepped here
400 :	edgomez	851	COPY_HV_SSE_RND0
401 :	edgomez	1382	lea ecx,[ecx+2*edx]
402 :	edgomez	851	COPY_HV_SSE_RND0
403 :	edgomez	1382	lea ecx,[ecx+2*edx]
404 :	edgomez	851	COPY_HV_SSE_RND0
405 :			ret
406 :
407 :	edgomez	1382	ALIGN 16
408 :	edgomez	851	.rounding1:
409 :			COPY_HV_SSE_RND1
410 :	edgomez	1382	lea ecx,[ecx+2*edx]
411 :	edgomez	851	COPY_HV_SSE_RND1
412 :	edgomez	1382	lea ecx,[ecx+2*edx]
413 :	edgomez	851	COPY_HV_SSE_RND1
414 :	edgomez	1382	lea ecx,[ecx+2*edx]
415 :	edgomez	851	COPY_HV_SSE_RND1
416 :	edgomez	1382	ret
417 :	edgomez	1540	.endfunc:
418 :
419 :	suxen_drol	1632	;-----------------------------------------------------------------------------
420 :			;
421 :			; void interpolate8x4_halfpel_h_3dne(uint8_t * const dst,
422 :			; const uint8_t * const src,
423 :			; const uint32_t stride,
424 :			; const uint32_t rounding);
425 :			;
426 :			;-----------------------------------------------------------------------------
427 :
428 :			ALIGN 16
429 :			interpolate8x4_halfpel_h_3dne:
430 :			; stack arguments: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding; 4 rows are produced
431 :			mov eax, [esp+ 8] ; Src
432 :			mov edx, [esp+12] ; stride
433 :			dec dword [esp+16]; rounding: ZF is set only when rounding == 1 (the argument slot itself is decremented)
434 :
435 :			jz .rounding1
436 :			mov ecx, [esp+ 4] ; Dst
437 :			; rounding != 1: round-up averages, 2 macro expansions x 2 rows
438 :			COPY_H_SSE_RND0 0
439 :			lea ecx,[ecx+2*edx] ; the macros advance src only; dst is stepped here
440 :			COPY_H_SSE_RND0 1
441 :			ret
442 :
443 :			.rounding1:
444 :			; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
445 :			mov ecx, [esp+ 4] ; Dst
446 :			movq mm7, [mmx_one] ; mm7 = 0x01 in every byte, the LSB mask COPY_H_SSE_RND1 expects
447 :			COPY_H_SSE_RND1
448 :			lea ecx, [ecx+2*edx]
449 :			COPY_H_SSE_RND1
450 :			ret
451 :			.endfunc:
452 :
453 :			;-----------------------------------------------------------------------------
454 :			;
455 :			; void interpolate8x4_halfpel_v_3dne(uint8_t * const dst,
456 :			; const uint8_t * const src,
457 :			; const uint32_t stride,
458 :			; const uint32_t rounding);
459 :			;
460 :			;-----------------------------------------------------------------------------
461 :
462 :			ALIGN 16
463 :			interpolate8x4_halfpel_v_3dne:
464 :			; stack arguments: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding; reads 5 source rows, writes 4
465 :			mov eax, [esp+ 8] ; Src
466 :			mov edx, [esp+12] ; stride
467 :			dec dword [esp+16]; rounding: ZF is set only when rounding == 1
468 :
469 :			; we process 2 lines at a time
470 :
471 :			jz .rounding1
472 :			pxor mm2,mm2
473 :			movq mm0, [eax] ; line0
474 :			movq mm1, [eax+edx] ; line1
475 :			por mm2, [eax+2*edx] ; Something like preload (pipelining); mm2 = line2
476 :			mov ecx, [esp+ 4] ; Dst
477 :			lea eax, [eax+2*edx] ; eax==line2
478 :			pxor mm4, mm4
479 :			pavgb mm0, mm1 ; dst line0 = avg(line0, line1), rounded up
480 :			pavgb mm1, mm2 ; dst line1 = avg(line1, line2)
481 :			movq [byte ecx], mm0
482 :			movq [ecx+edx], mm1
483 :
485 :			add eax, edx ; eax==line3
486 :			lea ecx, [ecx+2*edx]
487 :			movq mm3, [byte eax] ; line3
488 :			por mm4, [eax+edx] ; mm4 = line4, the last source row needed
490 :			pavgb mm2, mm3 ; dst line2
491 :			pavgb mm3, mm4 ; dst line3
492 :			movq [ecx], mm2
493 :			movq [ecx+edx], mm3
494 :
495 :			ret
496 :
497 :			ALIGN 8
498 :			.rounding1:
499 :			pcmpeqb mm0, mm0 ; all ones; round-down average is built as 255 - pavgb(255-a, 255-b)
500 :			psubusb mm0, [eax] ; eax==line0
501 :			add eax, edx ; eax==line1
502 :			mov ecx, [esp+ 4] ; Dst (read before the push below, so the offset is still +4)
503 :
504 :			push esi ; esi is used as a pointer to the all-ones constant and restored before returning
505 :
506 :			pcmpeqb mm1, mm1
507 :			pcmpeqb mm2, mm2
508 :			mov esi, mm_minusone
509 :			psubusb mm1, [byte eax] ; line1
510 :			psubusb mm2, [eax+edx] ; line2
511 :			lea eax, [eax+2*edx] ; eax==line3
512 :			movq mm6, [esi]
513 :			movq mm7, [esi]
514 :			pavgb mm0, mm1
515 :			pavgb mm1, mm2
516 :			psubusb mm6, mm0 ; complement back: (line0 + line1) >> 1
517 :			psubusb mm7, mm1
518 :			movq [ecx], mm6 ; store line0
519 :			movq [ecx+edx], mm7 ; store line1
520 :
521 :			lea ecx, [ecx+2*edx]
522 :			pcmpeqb mm3, mm3
523 :			pcmpeqb mm4, mm4
524 :			psubusb mm3, [eax] ; line3
525 :			psubusb mm4, [eax+edx] ; line4, the last source row needed
527 :			pavgb mm2, mm3
528 :			pavgb mm3, mm4
529 :			movq mm0, [esi]
530 :			movq mm1, [esi]
531 :			psubusb mm0, mm2
532 :			psubusb mm1, mm3
533 :			movq [ecx], mm0 ; store line2
534 :			movq [ecx+edx], mm1 ; store line3
535 :
536 :			pop esi
537 :
538 :			ret
539 :
540 :			.endfunc:
541 :
542 :			;-----------------------------------------------------------------------------
543 :			;
544 :			; void interpolate8x4_halfpel_hv_3dne(uint8_t * const dst,
545 :			; const uint8_t * const src,
546 :			; const uint32_t stride,
547 :			; const uint32_t rounding);
548 :			;
549 :			;
550 :			;-----------------------------------------------------------------------------
551 :
552 :			ALIGN 16
553 :			interpolate8x4_halfpel_hv_3dne:
554 :			mov eax, [esp+ 8] ; Src
555 :			mov edx, [esp+12] ; stride
556 :			dec dword [esp+16] ; rounding: ZF is set only when rounding == 1; nothing below touches EFLAGS before the jz
557 :
558 :			; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
559 :			movq mm2, [eax] ; row 0, pixels x
560 :			movq mm3, [eax+1] ; row 0, pixels x+1
561 :			movq mm6, mm2
562 :			pavgb mm2, mm3
563 :			pxor mm3, mm6 ; mm2/mm3 ready
564 :			mov ecx, [esp+ 4] ; Dst
565 :			movq mm7, [mmx_one] ; mm7 = 0x01 in every byte, the LSB mask used by both COPY_HV macros
566 :
567 :			jz near .rounding1
568 :			lea ebp,[byte ebp] ; 3-byte filler: rewrites ebp with its own value (presumably alignment padding)
569 :			COPY_HV_SSE_RND0
570 :			lea ecx,[ecx+2*edx] ; the macros advance src only; dst is stepped here
571 :			COPY_HV_SSE_RND0
572 :			ret
573 :
574 :			ALIGN 16
575 :			.rounding1:
576 :			COPY_HV_SSE_RND1
577 :			lea ecx,[ecx+2*edx]
578 :			COPY_HV_SSE_RND1
579 :			ret
580 :			.endfunc:
581 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4