--- branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize_mpeg_xmm.asm 2003/10/07 13:02:35 1174 +++ branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize_mpeg_xmm.asm 2003/10/09 18:50:22 1176 @@ -1,9 +1,10 @@ ;/************************************************************************** ; * ; * XVID MPEG-4 VIDEO CODEC -; * - mmx quantization/dequantization - +; * - 3dne Quantization/Dequantization - ; * -; * Copyright(C) 2001-2003 XviD Team +; * Copyright (C) 2002-2003 Peter Ross +; * 2002 Jaan Kalda ; * ; * This program is free software ; you can redistribute it and/or modify ; * it under the terms of the GNU General Public License as published by @@ -19,26 +20,12 @@ ; * along with this program ; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; * $Id: quantize_mpeg_xmm.asm,v 1.1.2.1 2003-10-07 13:02:35 edgomez Exp $ +; * $Id: quantize_mpeg_xmm.asm,v 1.1.2.2 2003-10-09 18:50:22 edgomez Exp $ ; * ; *************************************************************************/ -;/************************************************************************** -; * quant4 bugs have been fixed: (a) overflow bug for matrix elements -; * equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw (iSSE) -; * and using multiplier 0ffffh instead of 10001h (for matrix element = 1; -; * in that case, 1 is added before multiplying, that additional 1 comes -; * from intra_matrix1; (b) rounding error for large coefficients and matrix -; * elements is fixed by two-step approach: first approximation (rounded -; * down) is found as usual; the result is multiplied by the matrix element -; * and mismatch is used to calculate the correction. -; *************************************************************************/ + ; _3dne functions are compatible with iSSE, but are optimized specifically ; for K7 pipelines -; -;--------------------------------------------------------------------------- -; 09.12.2002 Athlon optimizations contributed by Jaan Kalda -;--------------------------------------------------------------------------- - ; data/text alignment %define ALIGN 8 @@ -46,12 +33,6 @@ bits 32 -%ifdef FORMAT_COFF -SECTION .data data -%else -SECTION .data data align=8 -%endif - %macro cglobal 1 %ifdef PREFIX global _%1 @@ -69,10 +50,22 @@ extern %1 %endif %endmacro -align 8 -mmzero dd 0,0 -mmx_one times 4 dw 1 +;*************************************************************************** +; Local data +;*************************************************************************** + +%ifdef FORMAT_COFF +SECTION .data data +%else +SECTION .data data align=8 +%endif + +align 8 +mmzero: + dd 0,0 +mmx_one: + times 4 dw 1 ;=========================================================================== ; @@ -81,7 +74,7 @@ ;=========================================================================== align ALIGN -mmx_divs ;i>2 +mmx_divs: ;i>2 %assign i 1 %rep 31 times 4 dw ((1 << 15) / i + 1) @@ -89,7 +82,7 @@ %endrep align ALIGN -mmx_div ;i>2 +mmx_div: ;i>2 %assign i 1 %rep 31 times 4 dw ((1 << 16) / i + 1) @@ -124,13 +117,11 @@ cextern inter_matrix_fixl -%define VM18P 3 -%define VM18Q 4 -%define nop4 db 08Dh,074h,026h,0 -%define nop3 add esp,byte 0 -%define nop2 mov esp,esp -%define nop7 db 08dh,02ch,02dh,0,0,0,0 -%define nop6 add ebp,dword 0 +%define nop4 db 08Dh, 074h, 026h,0 +%define nop3 add esp, byte 0 +%define nop2 mov esp, esp +%define nop7 db 08dh, 02ch, 02dh,0,0,0,0 +%define nop6 add ebp, dword 0 ;=========================================================================== ; @@ -138,8 +129,11 @@ ; ;=========================================================================== +%define VM18P 3 +%define VM18Q 4 -quantd +align 16 +quantd: %assign i 1 %rep 31 times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q) @@ -152,8 +146,8 @@ ; ;=========================================================================== - -mmx_mul_quant +align 16 +mmx_mul_quant: %assign i 1 %rep 31 times 4 dw i @@ -167,14 +161,18 @@ ;=========================================================================== align 16 +mmx_32767_minus_2047: + times 4 dw (32767-2047) +mmx_32768_minus_2048: + times 4 dw (32768-2048) +mmx_2047: + times 4 dw 2047 +mmx_minus_2048: + times 4 dw (-2048) +zero: + times 4 dw 0 -mmx_32767_minus_2047 times 4 dw (32767-2047) -mmx_32768_minus_2048 times 4 dw (32768-2048) -mmx_2047 times 4 dw 2047 -mmx_minus_2048 times 4 dw (-2048) -zero times 4 dw 0 - -int_div +int_div: dd 0 %assign i 1 %rep 255 @@ -182,37 +180,40 @@ %assign i i+1 %endrep +;*************************************************************************** +; Code +;*************************************************************************** + section .text ;=========================================================================== ; -; void quant4_intra_xmm(int16_t * coeff, -; const int16_t const * data, -; const uint32_t quant, -; const uint32_t dcscalar); +; uint32_t quant_mpeg_intra_xmm(int16_t * coeff, +; const int16_t const * data, +; const uint32_t quant, +; const uint32_t dcscalar); ; ;=========================================================================== align ALIGN cglobal quant_mpeg_intra_xmm quant_mpeg_intra_xmm: - mov eax, [esp + 8] ; data - mov ecx, [esp + 12] ; quant - mov edx, [esp + 4] ; coeff + mov eax, [esp + 8] ; data + mov ecx, [esp + 12] ; quant + mov edx, [esp + 4] ; coeff push esi push edi push ebx nop - mov edi,mmzero - mov esi,-14 - pxor mm0,mm0 - pxor mm3,mm3 - cmp ecx,byte 1 - je near .q1loop - cmp ecx,byte 19 - jg near .lloop + mov edi, mmzero + mov esi, -14 + pxor mm0, mm0 + pxor mm3, mm3 + cmp ecx, byte 1 + je near .q1loop + cmp ecx, byte 19 + jg near .lloop nop6 - align ALIGN .loop @@ -233,7 +234,7 @@ movq mm7, [intra_matrix_fixl + 8*esi+120] pmulhuw mm5, mm0 pmulhuw mm7, mm3 - mov esp, esp + mov esp, esp movq mm2, [intra_matrix + 8*esi+112] movq mm6, [intra_matrix + 8*esi+120] pmullw mm2, mm5 @@ -245,7 +246,7 @@ movq mm6, [mmx_divs + ecx * 8 - 8] paddw mm5, mm2 paddw mm7, mm2 - mov esp, esp + mov esp, esp pmulhuw mm0, [intra_matrix_fix + 8*esi+112] pmulhuw mm3, [intra_matrix_fix + 8*esi+120] paddw mm5, mm0 @@ -260,34 +261,35 @@ psubw mm7, mm4 ; movq [edx + 8*esi+112], mm5 movq [edx + 8*esi +120], mm7 - add esi, byte 2 - jng near .loop + add esi, byte 2 + jng near .loop .done ; calculate data[0] // (int32_t)dcscalar) - mov esi, [esp + 12 + 16] ; dcscalar + mov esi, [esp + 12 + 16] ; dcscalar movsx ecx, word [eax] - mov edi, ecx - mov edx, [esp + 12 + 16] - shr edx, 1 ; ebx = dcscalar /2 - sar edi, 31 ; cdq is vectorpath - xor edx, edi ; ebx = eax V -eax -1 - sub ecx, edi - add ecx, edx - mov edx, [dword esp + 12 + 4] - mov esi, [int_div+4*esi] + mov edi, ecx + mov edx, [esp + 12 + 16] + shr edx, 1 ; ebx = dcscalar /2 + sar edi, 31 ; cdq is vectorpath + xor edx, edi ; ebx = eax V -eax -1 + sub ecx, edi + add ecx, edx + mov edx, [dword esp + 12 + 4] + mov esi, [int_div+4*esi] imul ecx, esi - sar ecx, 17 - lea ebx, [byte ecx + 1] + sar ecx, 17 + lea ebx, [byte ecx + 1] cmovs ecx, ebx ; idiv cx ; ecx = edi:ecx / dcscalar - mov ebx, [esp] - mov edi, [esp+4] - mov esi, [esp+8] - add esp, byte 12 - mov [edx], cx ; coeff[0] = ax + mov ebx, [esp] + mov edi, [esp+4] + mov esi, [esp+8] + add esp, byte 12 + mov [edx], cx ; coeff[0] = ax + xor eax, eax ret align ALIGN @@ -309,7 +311,7 @@ movq mm7, [intra_matrix_fixl + 8*esi+120] pmulhuw mm5, mm0 pmulhuw mm7, mm3 ;mm7: first approx of division - mov esp, esp + mov esp, esp movq mm2, [intra_matrix + 8*esi+112] movq mm6, [intra_matrix + 8*esi+120] ; divs for q<=16 pmullw mm2, mm5 ;test value <= original @@ -320,14 +322,14 @@ movq mm2, [quantd + ecx * 8 - 8] paddw mm5, mm2 ;first approx with quantd paddw mm7, mm2 - mov esp, esp + mov esp, esp pmulhuw mm0, [intra_matrix_fix + 8*esi+112] ;correction pmulhuw mm3, [intra_matrix_fix + 8*esi+120] paddw mm5, mm0 ;final result with quantd paddw mm7, mm3 movq mm0, [edi] movq mm3, [edi] - mov esp, esp + mov esp, esp psrlw mm5, 1 ; (level + quantd) /2 (quant = 1) psrlw mm7, 1 pxor mm5, mm1 ; mm0 *= sign(mm0) @@ -336,9 +338,9 @@ psubw mm7, mm4 ; movq [edx + 8*esi+112], mm5 movq [edx + 8*esi +120], mm7 - add esi, byte 2 - jng near .q1loop - jmp near .done + add esi, byte 2 + jng near .q1loop + jmp near .done align 8 .lloop @@ -359,7 +361,7 @@ movq mm7, [intra_matrix_fixl + 8*esi+120] pmulhuw mm5, mm0 pmulhuw mm7, mm3 ;mm7: first approx of division - mov esp, esp + mov esp, esp movq mm2, [intra_matrix + 8*esi+112] movq mm6, [intra_matrix + 8*esi+120] pmullw mm2, mm5 ;test value <= original @@ -371,14 +373,14 @@ movq mm6, [mmx_div + ecx * 8 - 8] ; divs for q<=16 paddw mm5, mm2 ;first approx with quantd paddw mm7, mm2 - mov esp, esp + mov esp, esp pmulhuw mm0, [intra_matrix_fix + 8*esi+112] ;correction pmulhuw mm3, [intra_matrix_fix + 8*esi+120] paddw mm5, mm0 ;final result with quantd paddw mm7, mm3 movq mm0, [edi] movq mm3, [edi] - mov esp, esp + mov esp, esp pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 pmulhuw mm7, mm6 ; (level + quantd) / quant (0> 16 pmulhuw mm7, mm6 ; (level ) / quant (0> 16 pmulhuw mm7, mm6 ; (level ) / quant (0