Annotation of /trunk/xvidcore/src/quant/x86_asm/quantize_mpeg_mmx.asm

Revision 1719 - (view) (download)

1 :	edgomez	1382	;/**************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * - MMX MPEG Quantization/Dequantization -
5 :			; *
6 :			; * Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
7 :			; * 2002-2003 Michael Militzer <isibaar@xvid.org>
8 :			; * 2002-2003 Pascal Massimino <skal@planet-d.net>
9 :			; *
10 :			; * This program is free software ; you can redistribute it and/or modify
11 :			; * it under the terms of the GNU General Public License as published by
12 :			; * the Free Software Foundation ; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :			; *
15 :			; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :			; *
20 :			; * You should have received a copy of the GNU General Public License
21 :			; * along with this program ; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :			; *
24 :	chl	1719	; * $Id: quantize_mpeg_mmx.asm,v 1.7 2006-07-11 10:01:27 chl Exp $
25 :	edgomez	1382	; *
26 :			; *************************************************************************/
27 :
28 :			%define SATURATE
29 :
30 :			BITS 32
31 :
;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
;   PREFIX     defined: the exported symbol is _<name>, and <name> is
;                       redefined so later uses in this file expand to the
;                       prefixed form.
;   MARK_FUNCS defined: the symbol is tagged ":function" with a size of
;                       <name>.endfunc - <name>, so every exported routine
;                       must end with a local ".endfunc" label.
;-----------------------------------------------------------------------------
%macro cglobal 1
        %ifdef MARK_FUNCS
                %ifdef PREFIX
                        global _%1:function %1.endfunc-%1
                        %define %1 _%1:function %1.endfunc-%1
                %else
                        global %1:function %1.endfunc-%1
                %endif
        %else
                %ifdef PREFIX
                        global _%1
                        %define %1 _%1
                %else
                        global %1
                %endif
        %endif
%endmacro
49 :
;-----------------------------------------------------------------------------
; cextern <name>
;
; Imports <name> from another object file. When PREFIX is defined the
; external symbol is _<name>, and <name> is redefined to expand to it.
;-----------------------------------------------------------------------------
%macro cextern 1
        %ifndef PREFIX
                extern %1
        %else
                extern _%1
                %define %1 _%1
        %endif
%endmacro
58 :
59 :			;=============================================================================
60 :			; Local data (Read Only)
61 :			;=============================================================================
62 :
%ifndef FORMAT_COFF
SECTION .rodata align=16
%else
SECTION .rodata
%endif

; Emits one 16-bit value four times, i.e. one 64-bit MMX constant whose four
; word lanes all hold the same value.
%macro DW_X4 1
        dw %1, %1, %1, %1
%endmacro

; Four word lanes of 1. quant_mpeg_inter_mmx uses it with pmaddwd to add
; neighbouring word lanes together.
mmx_one:
        DW_X4 1

;-----------------------------------------------------------------------------
; mmx_div: fixed-point reciprocal of 2*quant, one 8-byte entry per quant.
; Entry for quant q (1..31) lives at mmx_div + 8*q - 8.
;-----------------------------------------------------------------------------

ALIGN 16
mmx_div:
        DW_X4 65535             ; quant=1: the formula below would overflow a
                                ; word, but quant=1 is handled by a separate
                                ; piece of code that never reads this table.
%assign quant 2
%rep 30
        DW_X4 (1<<17) / (quant*2) + 1
%assign quant quant+1
%endrep

%define VM18P 3
%define VM18Q 4


;-----------------------------------------------------------------------------
; quantd: (VM18P*quant)/VM18Q rounded to nearest, one entry per quant 1..31.
; Not referenced by any code visible in this file.
;-----------------------------------------------------------------------------

quantd:
%assign quant 1
%rep 31
        DW_X4 ((VM18P*quant) + (VM18Q/2)) / VM18Q
%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
; mmx_mul_quant: plain quant replicated in four word lanes, quant 1..31.
; The dequantizers scale the loaded value themselves (x4 for intra, x2 for
; inter).
;-----------------------------------------------------------------------------

mmx_mul_quant:
%assign quant 1
%rep 31
        DW_X4 quant
%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
; Saturation limits. The *_minus_* pairs are the offsets used by the
; saturating add/sub sequence that clamps the DC term in
; dequant_mpeg_intra_mmx.
;-----------------------------------------------------------------------------

ALIGN 16

mmx_32767_minus_2047:
        DW_X4 (32767-2047)
mmx_32768_minus_2048:
        DW_X4 (32768-2048)
mmx_2047:
        DW_X4 2047
mmx_minus_2048:
        DW_X4 (-2048)
zero:
        DW_X4 0

;=============================================================================
; Rounding term: 1<<13 in each of the two dword lanes. QUANT_MMX adds it
; before its arithmetic shift right by 14.
;=============================================================================

mmx_rounding:
        dw (1<<13), 0, (1<<13), 0

;=============================================================================
; Code
;=============================================================================
140 :
141 :			;=============================================================================
142 :	edgomez	1382	; Code
143 :			;=============================================================================
144 :
SECTION .text

; Public entry points of this file, grouped as intra pair then inter pair.
cglobal quant_mpeg_intra_mmx
cglobal dequant_mpeg_intra_mmx
cglobal quant_mpeg_inter_mmx
cglobal dequant_mpeg_inter_mmx
151 :
152 :	syskin	1713
;-----------------------------------------------------------------------------
; QUANT_MMX <row>
;
; Quantizes one row of eight 16-bit coefficients:
;       out[i] = saturate16( (data[i] * mult[i] + rounding) >> 14 )
;
; In:   eax = data, ecx = base of the matrices block (multipliers are read
;       from byte offset 128, the table the original author calls
;       intra_matrix_rec), edi = output, mm7 = rounding term per dword lane.
; Uses: mm0-mm2 for words 0..3, mm4-mm6 for words 4..7.
; Both input halves are loaded before anything is stored.
;-----------------------------------------------------------------------------
%macro QUANT_MMX 1
        ; fetch the eight coefficients and their eight multipliers
        movq      mm0, [eax + 16*(%1)]              ; data, words 0..3
        movq      mm2, [ecx + 16*(%1) + 128]        ; multipliers, words 0..3
        movq      mm4, [eax + 16*(%1) + 8]          ; data, words 4..7
        movq      mm6, [ecx + 16*(%1) + 128 + 8]    ; multipliers, words 4..7

        ; ---- words 0..3 ----
        movq      mm1, mm0
        pmullw    mm0, mm2                          ; low 16 bits of products
        pmulhw    mm1, mm2                          ; high 16 bits of products
        movq      mm2, mm0
        punpckhwd mm0, mm1                          ; 32-bit products 2,3
        punpcklwd mm2, mm1                          ; 32-bit products 0,1
        paddd     mm2, mm7                          ; + rounding
        paddd     mm0, mm7
        psrad     mm2, 14
        psrad     mm0, 14
        packssdw  mm2, mm0                          ; back to 4 signed words

        ; ---- words 4..7 ----
        movq      mm5, mm4
        pmullw    mm4, mm6                          ; low 16 bits of products
        pmulhw    mm5, mm6                          ; high 16 bits of products
        movq      mm6, mm4
        punpckhwd mm4, mm5                          ; 32-bit products 6,7
        punpcklwd mm6, mm5                          ; 32-bit products 4,5
        paddd     mm6, mm7                          ; + rounding
        paddd     mm4, mm7
        psrad     mm6, 14
        psrad     mm4, 14
        packssdw  mm6, mm4                          ; back to 4 signed words

        ; write the row out
        movq      [edi + 16*(%1)], mm2
        movq      [edi + 16*(%1)+8], mm6
%endmacro
191 :
192 :	edgomez	1382	;-----------------------------------------------------------------------------
193 :			;
194 :			; uint32_t quant_mpeg_intra_mmx(int16_t * coeff,
195 :			; const int16_t const * data,
196 :			; const uint32_t quant,
197 :			; const uint32_t dcscalar,
198 :			; const uint16_t *mpeg_matrices);
199 :			;
200 :			;-----------------------------------------------------------------------------
201 :
ALIGN 16
quant_mpeg_intra_mmx:
        ; Stack arguments (32-bit, offsets taken after the push below):
        ;   [esp+4+4]  coeff      output block
        ;   [esp+4+8]  data       input block
        ;   [esp+4+12] quant      not read by this routine
        ;   [esp+4+16] dcscalar
        ;   [esp+4+20] mpeg_quant_matrices
        ; Returns 0 in eax. Uses eax, ecx, edx, mm0-mm7; edi is saved and
        ; restored.

        push      edi

        mov       edi, [esp + 4 + 4]            ; coeff
        mov       eax, [esp + 4 + 8]            ; data
        mov       ecx, [esp + 4 + 20]           ; mpeg_quant_matrices
        movq      mm7, [mmx_rounding]           ; rounding term for QUANT_MMX

        ; Quantize all eight rows. Row 0 also produces a value for coeff[0],
        ; which the DC code below overwrites.
%assign QROW 0
%rep 8
        QUANT_MMX(QROW)
%assign QROW QROW+1
%endrep

        ; DC term: coeff[0] = (data[0] +/- dcscalar/2) / dcscalar, where the
        ; half-divisor is added for positive data[0] and subtracted otherwise.
        movsx     eax, word [eax]               ; eax = data[0]
        mov       ecx, [esp + 4 + 16]           ; ecx = dcscalar
        mov       edx, ecx
        shr       edx, 1                        ; edx =  dcscalar/2
        mov       edi, edx
        neg       edi                           ; edi = -dcscalar/2

        test      eax, eax
        cmovg     edx, edi                      ; data[0] > 0: use the negative half
        sub       eax, edx                      ; eax = data[0] -/+ half

        mov       edi, [esp + 4 + 4]            ; edi was scratch: reload coeff

        cdq                                     ; sign-extend eax into edx:eax
        idiv      ecx                           ; eax = edx:eax / dcscalar

        mov       [edi], ax                     ; coeff[0]

        pop       edi

        xor       eax, eax                      ; return 0
        ret
.endfunc:
245 :	edgomez	1382
246 :
247 :			;-----------------------------------------------------------------------------
248 :			;
249 :			; uint32_t quant_mpeg_inter_mmx(int16_t * coeff,
250 :			; const int16_t const * data,
251 :			; const uint32_t quant,
252 :			; const uint16_t *mpeg_matrices);
253 :			;
254 :			;-----------------------------------------------------------------------------
255 :
;-----------------------------------------------------------------------------
; Helpers for quant_mpeg_inter_mmx. The three quantizer loops below differ
; only in their final divide-by-2*quant step; everything before and after
; that step is shared.
;
; Register roles inside the loops:
;   esi = data, edi = coeff, ebx = mpeg_quant_matrices,
;   ecx = index in 8-byte units (0, 2, ... 14), mm5 = running sum,
;   mm7 = divider (generic loop only).
;-----------------------------------------------------------------------------

; Loads 8 coefficients, splits them into magnitude (mm0, mm3) and sign mask
; (mm1, mm4), then computes (level<<4 + inter_matrix[i]>>1) / inter_matrix[i].
; The "division" is a pmulhw by the table at +768, presumably fixed-point
; reciprocals of the inter matrix at +512 (confirm against the C code that
; fills the matrices block).
%macro QUANT_INTER_SCALE 0
        movq      mm0, [esi + 8*ecx]            ; words 0..3
        movq      mm3, [esi + 8*ecx + 8]        ; words 4..7
        pxor      mm1, mm1
        pxor      mm4, mm4
        pcmpgtw   mm1, mm0                      ; mm1 = (0 > mm0), sign mask
        pcmpgtw   mm4, mm3
        pxor      mm0, mm1                      ; mm0 = |mm0| ...
        pxor      mm3, mm4
        psubw     mm0, mm1                      ; ... two's complement step
        psubw     mm3, mm4
        psllw     mm0, 4                        ; level << 4
        psllw     mm3, 4
        movq      mm2, [ebx + 512 + 8*ecx]
        psrlw     mm2, 1                        ; inter_matrix[i] >> 1
        paddw     mm0, mm2
        movq      mm2, [ebx + 768 + ecx*8]
        pmulhw    mm0, mm2                      ; / inter_matrix[i]
        movq      mm2, [ebx + 512 + 8*ecx + 8]
        psrlw     mm2, 1
        paddw     mm3, mm2
        movq      mm2, [ebx + 768 + ecx*8 + 8]
        pmulhw    mm3, mm2
%endmacro

; Adds the magnitudes to the running sum, restores the signs, stores the 8
; results and advances the index. Leaves the flags of "cmp ecx, 16" for the
; caller's conditional jump.
%macro QUANT_INTER_STORE_NEXT 0
        paddw     mm5, mm0                      ; sum += magnitude
        pxor      mm0, mm1                      ; re-apply sign ...
        paddw     mm5, mm3
        pxor      mm3, mm4
        psubw     mm0, mm1                      ; ... two's complement step
        psubw     mm3, mm4
        movq      [edi + 8*ecx], mm0
        movq      [edi + 8*ecx + 8], mm3

        add       ecx, 2
        cmp       ecx, 16
%endmacro

ALIGN 16
quant_mpeg_inter_mmx:
        ; Stack arguments (32-bit, offsets taken after the four pushes):
        ;   [esp+16+4]  coeff      output block
        ;   [esp+16+8]  data       input block
        ;   [esp+16+12] quant      indexes mmx_div; 1 and 2 take the
        ;                          dedicated loops below
        ;   [esp+16+16] mpeg_quant_matrices
        ; Returns in eax the sum of the magnitudes of the quantized levels.

        push      ecx
        push      esi
        push      edi
        push      ebx

        mov       edi, [esp + 16 + 4]           ; coeff
        mov       esi, [esp + 16 + 8]           ; data
        mov       eax, [esp + 16 + 12]          ; quant
        mov       ebx, [esp + 16 + 16]          ; mpeg_quant_matrices

        xor       ecx, ecx                      ; index = 0

        pxor      mm5, mm5                      ; sum = 0

        cmp       al, 1
        jz near   .q1loop

        cmp       al, 2
        jz near   .q2loop

        movq      mm7, [mmx_div + eax * 8 - 8]  ; divider for this quant

ALIGN 16
.loop:
        QUANT_INTER_SCALE
        pmulhw    mm0, mm7                      ; (mm0 / 2Q) >> 16
        pmulhw    mm3, mm7
        psrlw     mm0, 1                        ; one more bit: 16 + 1 = 17
        psrlw     mm3, 1
        QUANT_INTER_STORE_NEXT
        jnz near  .loop

.done:
        pmaddwd   mm5, [mmx_one]                ; add word lanes pairwise
        movq      mm0, mm5
        psrlq     mm5, 32
        paddd     mm0, mm5                      ; add the two dword lanes
        movd      eax, mm0                      ; return sum

        pop       ebx
        pop       edi
        pop       esi
        pop       ecx

        ret

ALIGN 16
.q1loop:
        ; quant == 1: dividing by 2*quant is a shift right by 1
        QUANT_INTER_SCALE
        psrlw     mm0, 1                        ; mm0 >>= 1 (/2)
        psrlw     mm3, 1
        QUANT_INTER_STORE_NEXT
        jnz near  .q1loop

        jmp       .done

ALIGN 16
.q2loop:
        ; quant == 2: dividing by 2*quant is a shift right by 2
        QUANT_INTER_SCALE
        psrlw     mm0, 2                        ; mm0 >>= 2 (/4)
        psrlw     mm3, 2
        QUANT_INTER_STORE_NEXT
        jnz near  .q2loop

        jmp       .done
.endfunc:
418 :	edgomez	1382
419 :
420 :			;-----------------------------------------------------------------------------
421 :			;
422 :			; uint32_t dequant_mpeg_intra_mmx(int16_t *data,
423 :			; const int16_t const *coeff,
424 :			; const uint32_t quant,
425 :			; const uint32_t dcscalar,
426 :			; const uint16_t *mpeg_matrices);
427 :			;
428 :			;-----------------------------------------------------------------------------
429 :
; Note: in order to saturate 'easily', we pre-multiply the quantizer by 4
; (a left shift by 2). Then the high word of (coeff[i]*matrix[i]*quant) is
; used to build a saturating mask. It is non-zero only when an overflow
; occurred. We thus avoid packing/unpacking toward double-word.
; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not
; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],
; and quant in [1..31].
;
; The original loop is:
;
%if 0
        movq      mm0, [ecx + 8*eax + 8*16]     ; mm0 = coeff[i]
        pxor      mm1, mm1
        pcmpgtw   mm1, mm0
        pxor      mm0, mm1                      ; change sign if negative
        psubw     mm0, mm1                      ; -> mm0 = abs(coeff[i]), mm1 = sign of coeff[i]

        movq      mm2, mm7                      ; mm2 = quant
        pmullw    mm2, [ebx + 8*eax + 8*16]     ; matrix[i]*quant.

        movq      mm6, mm2
        pmulhw    mm2, mm0                      ; high of coeff*(matrix*quant) (should be 0 if no overflow)
        pmullw    mm0, mm6                      ; low of coeff*(matrix*quant)

        pxor      mm5, mm5
        pcmpgtw   mm2, mm5                      ; overflow?
        psrlw     mm2, 5                        ; =0 if no clamp, 2047 otherwise
        psrlw     mm0, 5
        paddw     mm0, mm1                      ; start restoring sign
        por       mm0, mm2                      ; saturate to 2047 if needed
        pxor      mm0, mm1                      ; finish negating back

        movq      [edx + 8*eax + 8*16], mm0     ; data[i]
        add       eax, 1
%endif

;********************************************************************

ALIGN 16
dequant_mpeg_intra_mmx:
        ; Stack arguments (32-bit, offsets taken after the push below):
        ;   [esp+4+4]  data       output block
        ;   [esp+4+8]  coeff      input block
        ;   [esp+4+12] quant      indexes mmx_mul_quant
        ;   [esp+4+16] dcscalar
        ;   [esp+4+20] mpeg_quant_matrices (matrix read from offset 0)
        ; Returns 0 in eax. ebx is saved and restored.

        push      ebx

        mov       edx, [esp + 4 + 4]            ; data
        mov       ecx, [esp + 4 + 8]            ; coeff
        mov       eax, [esp + 4 + 12]           ; quant
        mov       ebx, [esp + 4 + 20]           ; mpeg_quant_matrices

        movq      mm7, [mmx_mul_quant + eax*8 - 8]
        mov       eax, -16                      ; to keep ALIGNed, we regularly process coeff[0]
        psllw     mm7, 2                        ; << 2. See comment.
        pxor      mm6, mm6                      ; this is a NOP

        ; eax runs from -16 to 0 in steps of 2; each step covers 16 bytes
        ; (8 coefficients), so 8 iterations cover the whole 64-entry block.
ALIGN 16
.loop:
        movq      mm0, [ecx + 8*eax + 8*16]     ; mm0 = c  = coeff[i]
        movq      mm3, [ecx + 8*eax + 8*16 + 8] ; mm3 = c' = coeff[i+1]
        pxor      mm1, mm1
        pxor      mm4, mm4
        pcmpgtw   mm1, mm0                      ; mm1 = sgn(c)
        movq      mm2, mm7                      ; mm2 = quant

        pcmpgtw   mm4, mm3                      ; mm4 = sgn(c')
        pmullw    mm2, [ebx + 8*eax + 8*16]     ; matrix[i]*quant

        pxor      mm0, mm1                      ; negate if negative
        pxor      mm3, mm4                      ; negate if negative

        psubw     mm0, mm1
        psubw     mm3, mm4

        ; we're short on register, here. Poor pairing...

        movq      mm5, mm2
        pmullw    mm2, mm0                      ; low  of coeff*(matrix*quant)

        pmulhw    mm0, mm5                      ; high of coeff*(matrix*quant)
        movq      mm5, mm7                      ; mm5 = quant

        pmullw    mm5, [ebx + 8*eax + 8*16 + 8] ; matrix[i+1]*quant

        movq      mm6, mm5
        add       eax, 2                        ; z-flag will be tested later
                                                ; (no instruction below touches
                                                ; the flags before the jnz)

        pmullw    mm6, mm3                      ; low  of coeff*(matrix*quant)
        pmulhw    mm3, mm5                      ; high of coeff*(matrix*quant)

        pcmpgtw   mm0, [zero]                   ; overflow mask (high word > 0)
        paddusw   mm2, mm0                      ; overflow -> 0xFFFF
        psrlw     mm2, 5                        ; overflow -> 2047

        pcmpgtw   mm3, [zero]
        paddusw   mm6, mm3
        psrlw     mm6, 5

        pxor      mm2, mm1                      ; start negating back
        pxor      mm6, mm4                      ; start negating back

        psubusw   mm1, mm0                      ; drop the +1 step where saturated,
        psubusw   mm4, mm3                      ; so negatives clamp to -2048

        psubw     mm2, mm1                      ; finish negating back
        psubw     mm6, mm4                      ; finish negating back

        ; eax was already advanced by 2, hence the -2*8 correction
        movq      [edx + 8*eax + 8*16 - 2*8], mm2       ; data[i]
        movq      [edx + 8*eax + 8*16 - 2*8 + 8], mm6   ; data[i+1]

        jnz near  .loop

        ; deal with DC: data[0] = clamp(coeff[0] * dcscalar, -2048, 2047)
        movd      mm0, [ecx]
        pmullw    mm0, [esp + 4 + 16]           ; dcscalar
        movq      mm2, [mmx_32767_minus_2047]
        paddsw    mm0, mm2                      ; saturating add/sub pair
        psubsw    mm0, mm2                      ; clamps the upper bound
        movq      mm2, [mmx_32768_minus_2048]
        psubsw    mm0, mm2                      ; saturating sub/add pair
        paddsw    mm0, mm2                      ; clamps the lower bound
        movd      eax, mm0
        mov       [edx], ax

        xor       eax, eax                      ; return 0

        pop       ebx

        ret
.endfunc:
558 :	edgomez	1382
559 :			;-----------------------------------------------------------------------------
560 :			;
561 :			; uint32_t dequant_mpeg_inter_mmx(int16_t * data,
562 :			; const int16_t * const coeff,
563 :			; const uint32_t quant,
564 :			; const uint16_t *mpeg_matrices);
565 :			;
566 :			;-----------------------------------------------------------------------------
567 :
; Note: We use (2*c + sgn(c) - sgn(-c)) as multiplier
; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
; sgn(x) is the result of 'pcmpgtw 0,x': 0 if x>=0, -1 if x<0.
; It's mixed with the extraction of the absolute value.

ALIGN 16
dequant_mpeg_inter_mmx:
        ; Stack arguments (32-bit, offsets taken after the push below):
        ;   [esp+4+4]  data       output block
        ;   [esp+4+8]  coeff      input block
        ;   [esp+4+12] quant      indexes mmx_mul_quant
        ;   [esp+4+16] mpeg_quant_matrices (matrix read from offset 512)
        ; Returns 0 in eax. ebx is saved and restored.

        push      ebx

        mov       edx, [esp + 4 + 4]            ; data
        mov       ecx, [esp + 4 + 8]            ; coeff
        mov       eax, [esp + 4 + 12]           ; quant
        mov       ebx, [esp + 4 + 16]           ; mpeg_quant_matrices

        movq      mm7, [mmx_mul_quant + eax*8 - 8]
        mov       eax, -16
        paddw     mm7, mm7                      ; << 1
        pxor      mm6, mm6                      ; mismatch sum

        ; eax runs from -16 to 0 in steps of 2; each step covers 16 bytes
        ; (8 coefficients). It is advanced right after the loads, so every
        ; later address in the body carries a -2*8 correction.
ALIGN 16
.loop:
        movq      mm0, [ecx + 8*eax + 8*16]     ; mm0 = coeff[i]
        movq      mm2, [ecx + 8*eax + 8*16 + 8] ; mm2 = coeff[i+1]
        add       eax, 2                        ; z-flag is tested by the jnz at
                                                ; the bottom; nothing in between
                                                ; touches the flags

        pxor      mm1, mm1
        pxor      mm3, mm3
        pcmpgtw   mm1, mm0                      ; mm1 = sgn(c)    (preserved)
        pcmpgtw   mm3, mm2                      ; mm3 = sgn(c')   (preserved)
        paddsw    mm0, mm1                      ; c  += sgn(c)
        paddsw    mm2, mm3                      ; c' += sgn(c')
        paddw     mm0, mm0                      ; c  *= 2
        paddw     mm2, mm2                      ; c' *= 2

        pxor      mm4, mm4
        pxor      mm5, mm5
        psubw     mm4, mm0                      ; -c
        psubw     mm5, mm2                      ; -c'
        psraw     mm4, 16                       ; mm4 = sgn(-c)
        psraw     mm5, 16                       ; mm5 = sgn(-c')
        psubsw    mm0, mm4                      ; c  -= sgn(-c)
        psubsw    mm2, mm5                      ; c' -= sgn(-c')
        pxor      mm0, mm1                      ; finish changing sign if needed
        pxor      mm2, mm3                      ; finish changing sign if needed

        ; we're short on register, here. Poor pairing...

        movq      mm4, mm7                      ; (matrix*quant)
        pmullw    mm4, [ebx + 512 + 8*eax + 8*16 - 2*8]
        movq      mm5, mm4
        pmulhw    mm5, mm0                      ; high of c*(matrix*quant)
        pmullw    mm0, mm4                      ; low  of c*(matrix*quant)

        movq      mm4, mm7                      ; (matrix*quant)
        pmullw    mm4, [ebx + 512 + 8*eax + 8*16 - 2*8 + 8]

        pcmpgtw   mm5, [zero]                   ; overflow mask (high word > 0)
        paddusw   mm0, mm5                      ; overflow -> 0xFFFF
        psrlw     mm0, 5                        ; overflow -> 2047
        pxor      mm0, mm1                      ; start restoring sign
        psubusw   mm1, mm5                      ; drop the +1 step where saturated

        movq      mm5, mm4
        pmulhw    mm5, mm2                      ; high of c'*(matrix*quant)
        pmullw    mm2, mm4                      ; low  of c'*(matrix*quant)
        psubw     mm0, mm1                      ; finish restoring sign

        pcmpgtw   mm5, [zero]
        paddusw   mm2, mm5
        psrlw     mm2, 5
        pxor      mm2, mm3                      ; start restoring sign
        psubusw   mm3, mm5
        psubw     mm2, mm3                      ; finish restoring sign

        pxor      mm6, mm0                      ; mismatch control
        movq      [edx + 8*eax + 8*16 - 2*8], mm0       ; data[i]
        pxor      mm6, mm2                      ; mismatch control
        movq      [edx + 8*eax + 8*16 - 2*8 + 8], mm2   ; data[i+1]

        jnz near  .loop

        ; mismatch control: fold the four word lanes of the running XOR into
        ; one word. Its bit 0 is the parity of the sum of all outputs; when
        ; that parity is even, the lowest bit of data[63] is toggled.

        movq      mm0, mm6
        psrlq     mm0, 48
        movq      mm1, mm6
        movq      mm2, mm6
        psrlq     mm1, 32
        pxor      mm6, mm0
        psrlq     mm2, 16
        pxor      mm6, mm1
        pxor      mm6, mm2
        movd      eax, mm6
        and       eax, 1                        ; parity bit
        xor       eax, 1                        ; 1 when the sum is even
        xor       word [edx + 2*63], ax

        xor       eax, eax                      ; return 0

        pop       ebx

        ret
.endfunc:
672 :

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4