--- trunk/xvidcore/src/quant/x86_asm/quantize4_mmx.asm	2002/03/08 02:46:11	3
+++ trunk/xvidcore/src/quant/x86_asm/quantize4_mmx.asm	2002/09/10 21:16:45	463
@@ -1,50 +1,36 @@
-;/******************************************************************************
-; *                                                                            *
-; *  This file is part of XviD, a free MPEG-4 video encoder/decoder            *
-; *                                                                            *
-; *  XviD is an implementation of a part of one or more MPEG-4 Video tools     *
-; *  as specified in ISO/IEC 14496-2 standard. Those intending to use this     *
-; *  software module in hardware or software products are advised that its     *
-; *  use may infringe existing patents or copyrights, and any such use         *
-; *  would be at such party's own risk. The original developer of this         *
-; *  software module and his/her company, and subsequent editors and their     *
-; *  companies, will have no liability for use of this software or             *
-; *  modifications or derivatives thereof.                                     *
-; *                                                                            *
-; *  XviD is free software; you can redistribute it and/or modify it           *
-; *  under the terms of the GNU General Public License as published by         *
-; *  the Free Software Foundation; either version 2 of the License, or         *
-; *  (at your option) any later version.                                       *
-; *                                                                            *
-; *  XviD is distributed in the hope that it will be useful, but               *
-; *  WITHOUT ANY WARRANTY; without even the implied warranty of                *
-; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the              *
-; *  GNU General Public License for more details.                              *
-; *                                                                            *
-; *  You should have received a copy of the GNU General Public License         *
-; *  along with this program; if not, write to the Free Software               *
-; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA   *
-; *                                                                            *
-; ******************************************************************************/
-;
-;/******************************************************************************
-; *                                                                            *
-; *  quantize4.asm, MMX optimized MPEG quantization/dequantization             *
-; *                                                                            *
-; *  Copyright (C) 2002 - Peter Ross                                           *
-; *  Copyright (C) 2002 - Michael Militzer                                     *
-; *                                                                            *
-; *  For more information visit the XviD homepage: http://www.xvid.org         *
-; *                                                                            *
-; ******************************************************************************/
-;
-;/******************************************************************************
-; *                                                                            *
-; *  Revision history:                                                         *
-; *                                                                            *
-; *  22.01.2002 initial version                                                *
-; *                                                                            *
-; ******************************************************************************/
+;/*****************************************************************************
+; *
+; *  XVID MPEG-4 VIDEO CODEC
+; *  mmx optimized MPEG quantization/dequantization
+; *
+; *  Copyright(C) 2002 Peter Ross
+; *  Copyright(C) 2002 Michael Militzer
+; *  Copyright(C) 2002 Pascal Massimino
+; *
+; *  This program is an implementation of a part of one or more MPEG-4
+; *  Video tools as specified in ISO/IEC 14496-2 standard. Those intending
+; *  to use this software module in hardware or software products are
+; *  advised that its use may infringe existing patents or copyrights, and
+; *  any such use would be at such party's own risk. The original
+; *  developer of this software module and his/her company, and subsequent
+; *  editors and their companies, will have no liability for use of this
+; *  software or modifications or derivatives thereof.
+; *
+; *  This program is free software; you can redistribute it and/or modify
+; *  it under the terms of the GNU General Public License as published by
+; *  the Free Software Foundation; either version 2 of the License, or
+; *  (at your option) any later version.
+; *
+; *  This program is distributed in the hope that it will be useful,
+; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; *  GNU General Public License for more details.
+; *
+; *  You should have received a copy of the GNU General Public License
+; *  along with this program; if not, write to the Free Software
+; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+; *
+; *************************************************************************/
 
 ; data/text alignment
 %define ALIGN 8
 
@@ -64,6 +50,15 @@
 	%endif
 %endmacro
 
+%macro cextern 1
+	%ifdef PREFIX
+		extern _%1
+		%define %1 _%1
+	%else
+		extern %1
+	%endif
+%endmacro
+
 mmx_one times 4 dw 1
 
@@ -113,97 +108,27 @@
 ;===========================================================================
 ;
-; default intra matrix
+; intra matrix
 ;
 ;===========================================================================
 
-mmx_intra_matrix
-	dw 8, 17, 18, 19
-	dw 21, 23, 25, 27
-	dw 17, 18, 19, 21
-	dw 23, 25, 27, 28
-	dw 20, 21, 22, 23
-	dw 24, 26, 28, 30
-	dw 21, 22, 23, 24
-	dw 26, 28, 30, 32
-	dw 22, 23, 24, 26
-	dw 28, 30, 32, 35
-	dw 23, 24, 26, 28
-	dw 30, 32, 35, 38
-	dw 25, 26, 28, 30
-	dw 32, 35, 38, 41
-	dw 27, 28, 30, 32
-	dw 35, 38, 41, 45
-
-%macro MMX_FIX 4
-dw (1 << 16) / (%1) + 1, (1 << 16) / (%2) + 1, (1 << 16) / (%3) + 1, (1 << 16) / (%4) + 1
-%endmacro
-
-mmx_intra_matrix_fix
-	MMX_FIX 8, 17, 18, 19
-	MMX_FIX 21, 23, 25, 27
-	MMX_FIX 17, 18, 19, 21
-	MMX_FIX 23, 25, 27, 28
-	MMX_FIX 20, 21, 22, 23
-	MMX_FIX 24, 26, 28, 30
-	MMX_FIX 21, 22, 23, 24
-	MMX_FIX 26, 28, 30, 32
-	MMX_FIX 22, 23, 24, 26
-	MMX_FIX 28, 30, 32, 35
-	MMX_FIX 23, 24, 26, 28
-	MMX_FIX 30, 32, 35, 38
-	MMX_FIX 25, 26, 28, 30
-	MMX_FIX 32, 35, 38, 41
-	MMX_FIX 27, 28, 30, 32
-	MMX_FIX 35, 38, 41, 45
-
+cextern intra_matrix
+cextern intra_matrix_fix
 
 ;===========================================================================
 ;
-; default inter matrix
+; inter matrix
 ;
 ;===========================================================================
 
-mmx_inter_matrix
-	dw 16,17,18,19
-	dw 20,21,22,23
-	dw 17,18,19,20
-	dw 21,22,23,24
-	dw 18,19,20,21
-	dw 22,23,24,25
-	dw 19,20,21,22
-	dw 23,24,26,27
-	dw 20,21,22,23
-	dw 25,26,27,28
-	dw 21,22,23,24
-	dw 26,27,28,30
-	dw 22,23,24,26
-	dw 27,28,30,31
-	dw 23,24,25,27
-	dw 28,30,31,33
-
-
-mmx_inter_matrix_fix
-	MMX_FIX 16,17,18,19
-	MMX_FIX 20,21,22,23
-	MMX_FIX 17,18,19,20
-	MMX_FIX 21,22,23,24
-	MMX_FIX 18,19,20,21
-	MMX_FIX 22,23,24,25
-	MMX_FIX 19,20,21,22
-	MMX_FIX 23,24,26,27
-	MMX_FIX 20,21,22,23
-	MMX_FIX 25,26,27,28
-	MMX_FIX 21,22,23,24
-	MMX_FIX 26,27,28,30
-	MMX_FIX 22,23,24,26
-	MMX_FIX 27,28,30,31
-	MMX_FIX 23,24,25,27
-	MMX_FIX 28,30,31,33
+cextern inter_matrix
+cextern inter_matrix_fix
+
 %define VM18P 3
 %define VM18Q 4
+
 
 ;===========================================================================
 ;
 ; quantd table
 ;
 ;===========================================================================
 
@@ -250,57 +175,6 @@
 ;===========================================================================
 ;
-; multiple by matrix table
-;
-;===========================================================================
-
-%macro MMX_MUL 4
-dw %1
-dw %2
-dw %3
-dw %4
-%endmacro
-
-default_inter_matrix_mul
-	MMX_MUL 16,17,18,19
-	MMX_MUL 20,21,22,23
-	MMX_MUL 17,18,19,20
-	MMX_MUL 21,22,23,24
-	MMX_MUL 18,19,20,21
-	MMX_MUL 22,23,24,25
-	MMX_MUL 19,20,21,22
-	MMX_MUL 23,24,26,27
-	MMX_MUL 20,21,22,23
-	MMX_MUL 25,26,27,28
-	MMX_MUL 21,22,23,24
-	MMX_MUL 26,27,28,30
-	MMX_MUL 22,23,24,26
-	MMX_MUL 27,28,30,31
-	MMX_MUL 23,24,25,27
-	MMX_MUL 28,30,31,33
-
-
-default_intra_matrix_mul
-	MMX_MUL 8,17,18,19
-	MMX_MUL 21,23,25,27
-	MMX_MUL 17,18,19,21
-	MMX_MUL 23,25,27,28
-	MMX_MUL 20,21,22,23
-	MMX_MUL 24,26,28,30
-	MMX_MUL 21,22,23,24
-	MMX_MUL 26,28,30,32
-	MMX_MUL 22,23,24,26
-	MMX_MUL 28,30,32,35
-	MMX_MUL 23,24,26,28
-	MMX_MUL 30,32,35,38
-	MMX_MUL 25,26,28,30
-	MMX_MUL 32,35,38,41
-	MMX_MUL 27,28,30,32
-	MMX_MUL 35,38,41,45
-
-
-;===========================================================================
-;
 ; multiply by 2Q table
 ;
 ;===========================================================================
 
@@ -349,8 +223,12 @@
 ;===========================================================================
 
 align 16
-mmx_32768_minus_2048	times 4 dw (32768-2048)
-mmx_32767_minus_2047	times 4 dw (32767-2047)
+
+mmx_32767_minus_2047	times 4 dw (32767-2047)
+mmx_32768_minus_2048	times 4 dw (32768-2048)
+mmx_2047		times 4 dw 2047
+mmx_minus_2048		times 4 dw (-2048)
+zero			times 4 dw 0
 
 section .text
 
@@ -405,18 +283,18 @@
 	psllw	mm0, 4			; level << 4
 	psllw	mm3, 4			;
 
-	movq	mm2, [mmx_intra_matrix + 8*ecx]
+	movq	mm2, [intra_matrix + 8*ecx]
 	psrlw	mm2, 1			; intra_matrix[i]>>1
 	paddw	mm0, mm2
 
-	movq	mm2, [mmx_intra_matrix_fix + ecx*8]
+	movq	mm2, [intra_matrix_fix + ecx*8]
 	pmulhw	mm0, mm2		; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
 
-	movq	mm2, [mmx_intra_matrix + 8*ecx + 8]
+	movq	mm2, [intra_matrix + 8*ecx + 8]
 	psrlw	mm2, 1
 	paddw	mm3, mm2
 
-	movq	mm2, [mmx_intra_matrix_fix + ecx*8 + 8]
+	movq	mm2, [intra_matrix_fix + ecx*8 + 8]
 	pmulhw	mm3, mm2
 
 	paddw	mm0, mm5		; + quantd
 
@@ -484,18 +362,18 @@
 	psllw	mm0, 4
 	psllw	mm3, 4
 
-	movq	mm2, [mmx_intra_matrix + 8*ecx]
+	movq	mm2, [intra_matrix + 8*ecx]
 	psrlw	mm2, 1
 	paddw	mm0, mm2
 
-	movq	mm2, [mmx_intra_matrix_fix + ecx*8]
+	movq	mm2, [intra_matrix_fix + ecx*8]
 	pmulhw	mm0, mm2		; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
 
-	movq	mm2, [mmx_intra_matrix + 8*ecx + 8]
+	movq	mm2, [intra_matrix + 8*ecx + 8]
 	psrlw	mm2, 1
 	paddw	mm3, mm2
 
-	movq	mm2, [mmx_intra_matrix_fix + ecx*8 + 8]
+	movq	mm2, [intra_matrix_fix + ecx*8 + 8]
 	pmulhw	mm3, mm2
 
 	paddw	mm0, mm5
 
@@ -537,18 +415,18 @@
 	psllw	mm0, 4
 	psllw	mm3, 4
 
-	movq	mm2, [mmx_intra_matrix + 8*ecx]
+	movq	mm2, [intra_matrix + 8*ecx]
 	psrlw	mm2, 1
 	paddw	mm0, mm2
 
-	movq	mm2, [mmx_intra_matrix_fix + ecx*8]
+	movq	mm2, [intra_matrix_fix + ecx*8]
 	pmulhw	mm0, mm2		; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
 
-	movq	mm2, [mmx_intra_matrix + 8*ecx + 8]
+	movq	mm2, [intra_matrix + 8*ecx + 8]
 	psrlw	mm2, 1
 	paddw	mm3, mm2
 
-	movq	mm2, [mmx_intra_matrix_fix + ecx*8 + 8]
+	movq	mm2, [intra_matrix_fix + ecx*8 + 8]
 	pmulhw	mm3, mm2
 
 	paddw	mm0, mm5
 
@@ -619,18 +497,18 @@
 	psllw	mm0, 4
 	psllw	mm3, 4
 
-	movq	mm2, [mmx_inter_matrix + 8*ecx]
+	movq	mm2, [inter_matrix + 8*ecx]
 	psrlw	mm2, 1
 	paddw	mm0, mm2
 
-	movq	mm2, [mmx_inter_matrix_fix + ecx*8]
+	movq	mm2, [inter_matrix_fix + ecx*8]
 	pmulhw	mm0, mm2		; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
 
-	movq	mm2, [mmx_inter_matrix + 8*ecx + 8]
+	movq	mm2, [inter_matrix + 8*ecx + 8]
 	psrlw	mm2, 1
 	paddw	mm3, mm2
 
-	movq	mm2, [mmx_inter_matrix_fix + ecx*8 + 8]
+	movq	mm2, [inter_matrix_fix + ecx*8 + 8]
 	pmulhw	mm3, mm2
 
 	pmulhw	mm0, mm7		; mm0 = (mm0 / 2Q) >> 16
 
@@ -683,18 +561,18 @@
 	psllw	mm0, 4
 	psllw	mm3, 4
 
-	movq	mm2, [mmx_inter_matrix + 8*ecx]
+	movq	mm2, [inter_matrix + 8*ecx]
 	psrlw	mm2, 1
 	paddw	mm0, mm2
 
-	movq	mm2, [mmx_inter_matrix_fix + ecx*8]
+	movq	mm2, [inter_matrix_fix + ecx*8]
 	pmulhw	mm0, mm2		; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
-	movq	mm2, [mmx_inter_matrix + 8*ecx + 8]
+	movq	mm2, [inter_matrix + 8*ecx + 8]
 	psrlw	mm2, 1
 	paddw	mm3, mm2
 
-	movq	mm2, [mmx_inter_matrix_fix + ecx*8 + 8]
+	movq	mm2, [inter_matrix_fix + ecx*8 + 8]
 	pmulhw	mm3, mm2
 
 	psrlw	mm0, 1			; mm0 >>= 1  (/2)
 
@@ -736,18 +614,18 @@
 	psllw	mm0, 4
 	psllw	mm3, 4
 
-	movq	mm2, [mmx_inter_matrix + 8*ecx]
+	movq	mm2, [inter_matrix + 8*ecx]
 	psrlw	mm2, 1
 	paddw	mm0, mm2
 
-	movq	mm2, [mmx_inter_matrix_fix + ecx*8]
+	movq	mm2, [inter_matrix_fix + ecx*8]
 	pmulhw	mm0, mm2		; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
 
-	movq	mm2, [mmx_inter_matrix + 8*ecx + 8]
+	movq	mm2, [inter_matrix + 8*ecx + 8]
 	psrlw	mm2, 1
 	paddw	mm3, mm2
 
-	movq	mm2, [mmx_inter_matrix_fix + ecx*8 + 8]
+	movq	mm2, [inter_matrix_fix + ecx*8 + 8]
 	pmulhw	mm3, mm2
 
 	psrlw	mm0, 2			; mm0 >>= 2  (/4)
 
@@ -779,99 +657,128 @@
 ;
 ;===========================================================================
 
-align 16
-cglobal dequant4_intra_mmx
-dequant4_intra_mmx
-
-	push	esi
-	push	edi
-
-	mov	edi, [esp + 8 + 4]	; data
-	mov	esi, [esp + 8 + 8]	; coeff
-	mov	eax, [esp + 8 + 12]	; quant
-
-	movq	mm7, [mmx_mul_quant + eax*8 - 8]
-
-	xor	eax, eax
-
-align 16
-.loop
-	movq	mm0, [esi + 8*eax]	; mm0 = [coeff]
-
-	pxor	mm1, mm1		; mm1 = 0
-	pcmpeqw	mm1, mm0		; mm1 = (0 == mm0)
-
-	pxor	mm2, mm2		; mm2 = 0
-	pcmpgtw	mm2, mm0		; mm2 = (0 > mm0)
-	pxor	mm0, mm2		; mm0 = |mm0|
-	psubw	mm0, mm2		; displace
-
-	pmullw	mm0, mm7		; mm0 *= quant
-
-	movq	mm3, [default_intra_matrix_mul + 8*eax]
-
-	movq	mm4, mm0		;
-	pmullw	mm0, mm3		; mm0 = low(mm0 * mm3)
-	pmulhw	mm3, mm4		; mm3 = high(mm0 * mm3)
-
-	movq	mm4, mm0		; mm0,mm4 = unpack(mm3, mm0)
-	punpcklwd	mm0, mm3	;
-	punpckhwd	mm4, mm3	;
-	psrld	mm0, 3			; mm0,mm4 /= 8
-	psrld	mm4, 3			;
-	packssdw	mm0, mm4	; mm0 = pack(mm4, mm0)
-
-	pxor	mm0, mm2		; mm0 *= sign(mm0)
-	psubw	mm0, mm2		; undisplace
-	pandn	mm1, mm0		; mm1 = ~(iszero) & mm0
-
-%ifdef SATURATE
-	movq	mm2, [mmx_32767_minus_2047]
-	movq	mm6, [mmx_32768_minus_2048]
-	paddsw	mm1, mm2
-	psubsw	mm1, mm2
-	psubsw	mm1, mm6
-	paddsw	mm1, mm6
-%endif
-
-	movq	[edi + 8*eax], mm1	; [data] = mm0
-
-	add	eax, 1
-	cmp	eax, 16
-	jnz	near .loop
-
-	mov	ax, [esi]		; ax = data[0]
-	imul	ax, [esp + 8 + 16]	; eax = data[0] * dcscalar
-	mov	[edi], ax		; data[0] = ax
-
-%ifdef SATURATE
-	cmp	ax, -2048
-	jl	.set_n2048
-	cmp	ax, 2047
-	jg	.set_2047
-%endif
-
-	pop	edi
-	pop	esi
-	ret
-
-%ifdef SATURATE
-.set_n2048
-	mov	word [edi], -2048
-	pop	edi
-	pop	esi
-	ret
-
-.set_2047
-	mov	word [edi], 2047
-	pop	edi
-	pop	esi
-	ret
-%endif
+	; Note: in order to saturate 'easily', we pre-multiply the quantizer
+	; by 4 (a left shift by 2). The high word of (coeff[]*matrix[i]*quant)
+	; is then used to build a saturating mask: it is non-zero only when
+	; an overflow occurred. We thus avoid packing/unpacking toward
+	; double-words.
+	; Moreover, we perform the mult (matrix[i]*quant) first, instead of,
+	; e.g., (coeff[i]*matrix[i]). This is less prone to overflow when
+	; coeff[] is not range-checked. Input ranges are: coeff in [-127,127],
+	; inter_matrix in [1..255], and quant in [1..31].
+	; (A scalar C model of this scheme is sketched after the diff.)
+	;
+	; The original loop is:
+	;
+%if 0
+	movq	mm0, [ecx+8*eax + 8*16]	; mm0 = coeff[i]
+	pxor	mm1, mm1
+	pcmpgtw	mm1, mm0
+	pxor	mm0, mm1		; change sign if negative
+	psubw	mm0, mm1		; -> mm0 = abs(coeff[i]), mm1 = sign of coeff[i]
+
+	movq	mm2, mm7		; mm2 = quant
+	pmullw	mm2, [intra_matrix + 8*eax + 8*16]	; matrix[i]*quant
+
+	movq	mm6, mm2
+	pmulhw	mm2, mm0		; high of coeff*(matrix*quant) (should be 0 if no overflow)
+	pmullw	mm0, mm6		; low of coeff*(matrix*quant)
+
+	pxor	mm5, mm5
+	pcmpgtw	mm2, mm5		; overflow?
+	psrlw	mm2, 5			; =0 if no clamp, 2047 otherwise
+	psrlw	mm0, 5
+	paddw	mm0, mm1		; start restoring sign
+	por	mm0, mm2		; saturate to 2047 if needed
+	pxor	mm0, mm1		; finish negating back
+
+	movq	[edx + 8*eax + 8*16], mm0	; data[i]
+	add	eax, 1
+%endif
+
+	;********************************************************************
+
+align 16
+cglobal dequant4_intra_mmx
+dequant4_intra_mmx:
+
+	mov	edx, [esp+4]		; data
+	mov	ecx, [esp+8]		; coeff
+	mov	eax, [esp+12]		; quant
+
+	movq	mm7, [mmx_mul_quant + eax*8 - 8]
+	mov	eax, -16	; to keep aligned, we regularly process coeff[0]
+	psllw	mm7, 2		; << 2. See comment.
+	pxor	mm6, mm6	; effectively a NOP: mm6 is overwritten in the loop
+
+align 16
+.loop
+	movq	mm0, [ecx+8*eax + 8*16]		; mm0 = c  = coeff[i]
+	movq	mm3, [ecx+8*eax + 8*16 +8]	; mm3 = c' = coeff[i+1]
+	pxor	mm1, mm1
+	pxor	mm4, mm4
+	pcmpgtw	mm1, mm0	; mm1 = sgn(c)
+	movq	mm2, mm7	; mm2 = quant
+
+	pcmpgtw	mm4, mm3	; mm4 = sgn(c')
+	pmullw	mm2, [intra_matrix + 8*eax + 8*16]	; matrix[i]*quant
+
+	pxor	mm0, mm1	; negate if negative
+	pxor	mm3, mm4	; negate if negative
+
+	psubw	mm0, mm1
+	psubw	mm3, mm4
+
+	; we're short on registers here; poor pairing...
+
+	movq	mm5, mm2
+	pmullw	mm2, mm0	; low of coeff*(matrix*quant)
+
+	pmulhw	mm0, mm5	; high of coeff*(matrix*quant)
+	movq	mm5, mm7	; mm5 = quant
+
+	pmullw	mm5, [intra_matrix + 8*eax + 8*16 +8]	; matrix[i+1]*quant
+
+	movq	mm6, mm5
+	add	eax, 2		; z-flag will be tested later
+
+	pmullw	mm6, mm3	; low of coeff*(matrix*quant)
+	pmulhw	mm3, mm5	; high of coeff*(matrix*quant)
+
+	pcmpgtw	mm0, [zero]
+	paddusw	mm2, mm0
+	psrlw	mm2, 5
+
+	pcmpgtw	mm3, [zero]
+	paddusw	mm6, mm3
+	psrlw	mm6, 5
+
+	pxor	mm2, mm1	; start negating back
+	pxor	mm6, mm4	; start negating back
+
+	psubusw	mm1, mm0
+	psubusw	mm4, mm3
+
+	psubw	mm2, mm1	; finish negating back
+	psubw	mm6, mm4	; finish negating back
+
+	movq	[edx + 8*eax + 8*16 -2*8   ], mm2	; data[i]
+	movq	[edx + 8*eax + 8*16 -2*8 +8], mm6	; data[i+1]
+
+	jnz	near .loop
+
+	; deal with DC
+
+	movd	mm0, [ecx]
+	pmullw	mm0, [esp+16]	; dcscalar
+	movq	mm2, [mmx_32767_minus_2047]
+	paddsw	mm0, mm2
+	psubsw	mm0, mm2
+	movq	mm2, [mmx_32768_minus_2048]
+	psubsw	mm0, mm2
+	paddsw	mm0, mm2
+	movd	eax, mm0
+	mov	[edx], ax
+
+	ret
 
 ;===========================================================================
 ;
@@ -881,94 +788,100 @@
 ;
 ;===========================================================================
 
+	; Note: we use (2*c + sgn(c) - sgn(-c)) as the multiplier
+	; so we handle the 3 cases c<0, c==0, and c>0 in one shot.
+	; sgn(x) is the result of 'pcmpgtw 0,x':  0 if x>=0, -1 if x<0.
+	; It's mixed with the extraction of the absolute value.
+	; (A scalar C model of this trick follows the diff.)
+
 align 16
 cglobal dequant4_inter_mmx
-dequant4_inter_mmx
-
-	push	esi
-	push	edi
-
-	mov	edi, [esp + 8 + 4]	; data
-	mov	esi, [esp + 8 + 8]	; coeff
-	mov	eax, [esp + 8 + 12]	; quant
-	movq	mm7, [mmx_mul_quant + eax*8 - 8]
-	movq	mm6, [mmx_one]
-	xor	eax, eax
-	pxor	mm5, mm5		; mismatch sum
-
-align 16
-.loop
-	movq	mm0, [esi + 8*eax]	; mm0 = [coeff]
-
-	pxor	mm1, mm1		; mm1 = 0
-	pcmpeqw	mm1, mm0		; mm1 = (0 == mm0)
-
-	pxor	mm2, mm2		; mm2 = 0
-	pcmpgtw	mm2, mm0		; mm2 = (0 > mm0)
-	pxor	mm0, mm2		; mm0 = |mm0|
-	psubw	mm0, mm2		; displace
-
-	psllw	mm0, 1			;
-	paddsw	mm0, mm6		; mm0 = 2*mm0 + 1
-	pmullw	mm0, mm7		; mm0 *= quant
-
-	movq	mm3, [default_inter_matrix_mul + 8*eax]
-
-	movq	mm4, mm0
-	pmullw	mm0, mm3		; mm0 = low(mm0 * mm3)
-	pmulhw	mm3, mm4		; mm3 = high(mm0 * mm3)
-
-	movq	mm4, mm0		; mm0,mm4 = unpack(mm3, mm0)
-	punpcklwd	mm0, mm3	;
-	punpckhwd	mm4, mm3	;
-
-	psrad	mm0, 4			; mm0,mm4 /= 16
-	psrad	mm4, 4			;
-	packssdw	mm0, mm4	; mm0 = pack(mm4, mm0)
-
-	pxor	mm0, mm2		; mm0 *= sign(mm0)
-	psubw	mm0, mm2		; undisplace
-	pandn	mm1, mm0		; mm1 = ~(iszero) & mm0
-
-;%ifdef SATURATE
-	movq	mm2, [mmx_32767_minus_2047]
-	movq	mm4, [mmx_32768_minus_2048]
-	paddsw	mm1, mm2
-	psubsw	mm1, mm2
-	psubsw	mm1, mm4
-	paddsw	mm1, mm4
-;%endif
-
-	pxor	mm5, mm1		; mismatch
-
-	movq	[edi + 8*eax], mm1	; [data] = mm0
-
-	add	eax, 1
-	cmp	eax, 16
-	jnz	near .loop
-
-	; mismatch control
-
-	movq	mm0, mm5
-	movq	mm1, mm5
-	movq	mm2, mm5
-	psrlq	mm0, 48
-	psrlq	mm1, 32
-	psrlq	mm2, 16
-	pxor	mm5, mm0
-	pxor	mm5, mm1
-	pxor	mm5, mm2
-
-	movd	eax, mm5
-	test	eax, 0x1
-	jnz	.done
-
-	xor	word [edi + 2*63], 1
-
-.done
-	pop	edi
-	pop	esi
-
-	ret
+dequant4_inter_mmx:
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm7, [mmx_mul_quant + eax*8 - 8]
+	mov	eax, -16
+	paddw	mm7, mm7	; << 1
+	pxor	mm6, mm6	; mismatch sum
+
+align 16
+.loop
+	movq	mm0, [ecx+8*eax + 8*16   ]	; mm0 = coeff[i]
+	movq	mm2, [ecx+8*eax + 8*16 +8]	; mm2 = coeff[i+1]
+	add	eax, 2
+
+	pxor	mm1, mm1
+	pxor	mm3, mm3
+	pcmpgtw	mm1, mm0	; mm1 = sgn(c)  (preserved)
+	pcmpgtw	mm3, mm2	; mm3 = sgn(c') (preserved)
+	paddsw	mm0, mm1	; c  += sgn(c)
+	paddsw	mm2, mm3	; c' += sgn(c')
+	paddw	mm0, mm0	; c  *= 2
+	paddw	mm2, mm2	; c' *= 2
+
+	pxor	mm4, mm4
+	pxor	mm5, mm5
+	psubw	mm4, mm0	; -c
+	psubw	mm5, mm2	; -c'
+	psraw	mm4, 16		; mm4 = sgn(-c)
+	psraw	mm5, 16		; mm5 = sgn(-c')
+	psubsw	mm0, mm4	; c  -= sgn(-c)
+	psubsw	mm2, mm5	; c' -= sgn(-c')
+	pxor	mm0, mm1	; finish changing sign if needed
+	pxor	mm2, mm3	; finish changing sign if needed
+
+	; we're short on registers here; poor pairing...
+
+	movq	mm4, mm7	; (matrix*quant)
+	pmullw	mm4, [inter_matrix + 8*eax + 8*16 -2*8]
+	movq	mm5, mm4
+	pmulhw	mm5, mm0	; high of c*(matrix*quant)
+	pmullw	mm0, mm4	; low of c*(matrix*quant)
+
+	movq	mm4, mm7	; (matrix*quant)
+	pmullw	mm4, [inter_matrix + 8*eax + 8*16 -2*8 + 8]
+
+	pcmpgtw	mm5, [zero]
+	paddusw	mm0, mm5
+	psrlw	mm0, 5
+	pxor	mm0, mm1	; start restoring sign
+	psubusw	mm1, mm5
+
+	movq	mm5, mm4
+	pmulhw	mm5, mm2	; high of c*(matrix*quant)
+	pmullw	mm2, mm4	; low of c*(matrix*quant)
+	psubw	mm0, mm1	; finish restoring sign
+
+	pcmpgtw	mm5, [zero]
+	paddusw	mm2, mm5
+	psrlw	mm2, 5
+	pxor	mm2, mm3	; start restoring sign
+	psubusw	mm3, mm5
+	psubw	mm2, mm3	; finish restoring sign
+
+	pxor	mm6, mm0	; mismatch control
+	movq	[edx + 8*eax + 8*16 -2*8   ], mm0	; data[i]
+	pxor	mm6, mm2	; mismatch control
+	movq	[edx + 8*eax + 8*16 -2*8 +8], mm2	; data[i+1]
+
+	jnz	near .loop
+
+	; mismatch control
+
+	movq	mm0, mm6
+	psrlq	mm0, 48
+	movq	mm1, mm6
+	movq	mm2, mm6
+	psrlq	mm1, 32
+	pxor	mm6, mm0
+	psrlq	mm2, 16
+	pxor	mm6, mm1
+	pxor	mm6, mm2
+	movd	eax, mm6
+	and	eax, 1
+	xor	eax, 1
+	xor	word [edx + 2*63], ax
+
+	ret
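
The scalar C sketches below model the tricks used in the hunks above; all helper names and C types are illustrative assumptions, not part of the patch.

The quant4_* hunks all lean on one reciprocal trick: the division in "(level<<4 + intra_matrix[i]>>1) / intra_matrix[i]" is done with pmulhw against the *_matrix_fix tables, which the removed MMX_FIX macro built as (1 << 16) / m + 1. Assuming the now-external intra_matrix_fix/inter_matrix_fix tables keep that same construction, one lane looks roughly like:

    #include <stdint.h>

    /* Hypothetical model of one quant4 lane: approximates
     * (level*16 + m/2) / m via a high-word multiply, with
     * m_fix = (1 << 16) / m + 1 (the old MMX_FIX construction).
     * Assumes m >= 2 so m_fix fits in 16 bits, and abs_level <= 2047
     * so num stays below 32768 (pmulhw is a signed multiply). */
    static int16_t quant_step(uint16_t abs_level, uint16_t m, uint16_t m_fix)
    {
        uint16_t num = (uint16_t)((abs_level << 4) + (m >> 1)); /* level<<4 + m>>1 */
        return (int16_t)(((uint32_t)num * m_fix) >> 16);        /* pmulhw: high 16 bits */
    }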
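
The note ahead of dequant4_intra_mmx compresses several steps. Under its stated ranges, and assuming mmx_mul_quant simply broadcasts the quant value, one 16-bit lane of the new loop behaves like this sketch (helper name assumed):

    #include <stdint.h>

    /* Hypothetical model of one dequant4_intra_mmx lane. quant is
     * pre-multiplied by 4 (the psllw mm7, 2), so coeff*matrix*quant/8
     * becomes a >> 5 of the low 16 bits of the product; a non-zero high
     * word means the result would pass 2047 and drives the saturating
     * mask (the paddusw). */
    static int16_t dequant_intra_step(int16_t coeff, uint16_t matrix,
                                      uint16_t quant)
    {
        uint16_t sgn = (coeff < 0) ? 0xFFFFu : 0;               /* pcmpgtw 0, c */
        uint16_t a = (uint16_t)(((uint16_t)coeff ^ sgn) - sgn); /* |coeff| */

        uint32_t prod = (uint32_t)a * (uint16_t)(matrix * (quant << 2));
        uint16_t ovf = (prod >> 16) ? 0xFFFFu : 0;    /* high word != 0 ? */
        uint16_t lo = ovf ? 0xFFFFu : (uint16_t)prod; /* paddusw saturates */
        lo >>= 5;                                     /* *4 / 32  ==  /8 */

        /* pxor/psubw sign restore; the psubusw zeroes the correction on
         * overflow, so a saturated negative lane comes out as -2048 */
        return (int16_t)((uint16_t)(lo ^ sgn) - (ovf ? 0 : sgn));
    }

For example, coeff = 100, matrix = 16, quant = 31 gives 100*16*124 = 198400, which overflows the low word, so the lane clamps to 2047 instead of the unclamped 6200.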
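
The multiplier identity stated before dequant4_inter_mmx is easy to check in scalar form (hypothetical helper, with sgn defined exactly as in the note):

    #include <stdint.h>

    /* Hypothetical model of the branchless inter multiplier:
     * 2*c + sgn(c) - sgn(-c) equals 2*|c| + 1 with the sign of c, and
     * stays 0 for c == 0, so all three cases are handled without a
     * branch. sgn(x) mirrors 'pcmpgtw 0,x': 0 if x >= 0, -1 if x < 0. */
    static int16_t inter_multiplier(int16_t c)
    {
        int16_t sgn_c     = (int16_t)-(c < 0);  /* pcmpgtw 0, c  */
        int16_t sgn_neg_c = (int16_t)-(c > 0);  /* pcmpgtw 0, -c */
        return (int16_t)(2 * c + sgn_c - sgn_neg_c);
    }

The loop then multiplies this value by matrix[i] * 2*quant and shifts right by 5, i.e. (2*|c| + 1) * matrix[i] * quant / 16, the MPEG-4 inter dequant, reusing the same high-word saturation as the intra path.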
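
Finally, the tail of dequant4_inter_mmx folds mm6, the running XOR of every stored word, down to a single parity bit. A scalar sketch of that mismatch-control rule (function name assumed):

    #include <stdint.h>

    /* MPEG-4 mismatch control as modeled from the tail of
     * dequant4_inter_mmx: when the XOR of all 64 output words has an
     * even LSB, flip the LSB of the last coefficient so the block's
     * parity becomes odd. */
    static void mismatch_control(int16_t data[64])
    {
        uint16_t acc = 0;
        for (int i = 0; i < 64; i++)
            acc ^= (uint16_t)data[i]; /* pxor mm6, ... while storing */
        if (!(acc & 1))               /* and eax, 1 ; xor eax, 1 */
            data[63] ^= 1;            /* xor word [edx + 2*63], ax */
    }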