--- branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize4_xmm.asm 2003/07/14 12:40:16 1088 +++ branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize4_xmm.asm 2003/07/16 23:00:08 1089 @@ -1,49 +1,43 @@ -;/****************************************************************************** -; * * -; * This file is part of XviD, a free MPEG-4 video encoder/decoder * -; * * -; * XviD is an implementation of a part of one or more MPEG-4 Video tools * -; * as specified in ISO/IEC 14496-2 standard. Those intending to use this * -; * software module in hardware or software products are advised that its * -; * use may infringe existing patents or copyrights, and any such use * -; * would be at such party's own risk. The original developer of this * -; * software module and his/her company, and subsequent editors and their * -; * companies, will have no liability for use of this software or * -; * modifications or derivatives thereof. * -; * * -; * XviD is free software; you can redistribute it and/or modify it * -; * under the terms of the GNU General Public License as published by * -; * the Free Software Foundation; either version 2 of the License, or * -; * (at your option) any later version. * -; * * -; * XviD is distributed in the hope that it will be useful, but * -; * WITHOUT ANY WARRANTY; without even the implied warranty of * -; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * -; * GNU General Public License for more details. 
* -; * * -; * You should have received a copy of the GNU General Public License * -; * along with this program; if not, write to the Free Software * -; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * -; * * -; ******************************************************************************/ -; -;/****************************************************************************** -; * quant4 bugs have been fixed: (a) overflow bug for matrix elements * -; * equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw (iSSE) * -; * and using multiplier 0ffffh instead of 10001h (for matrix element = 1; * -; * in that case, 1 is added before multiplying, that additional 1 comes * -; * from intra_matrix1; (b) rounding error for large coefficients and matrix * -; * elements is fixed by two-step approach: first approximation (rounded * -; * down) is found as usual; the result is multiplied by the matrix element * -; * and mismatch is used to calculate the correction. * -; ******************************************************************************/ +;/************************************************************************** +; * +; * XVID MPEG-4 VIDEO CODEC +; * - mmx quantization/dequantization - +; * +; * Copyright(C) 2001-2003 XviD Team +; * +; * This program is free software ; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation ; either version 2 of the License, or +; * (at your option) any later version. +; * +; * This program is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY ; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. 
+; * +; * You should have received a copy of the GNU General Public License +; * along with this program ; if not, write to the Free Software +; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +; * +; * $Id: quantize4_xmm.asm,v 1.2.2.1 2003-07-16 22:59:15 edgomez Exp $ +; * +; *************************************************************************/ +;/************************************************************************** +; * quant4 bugs have been fixed: (a) overflow bug for matrix elements +; * equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw (iSSE) +; * and using multiplier 0ffffh instead of 10001h (for matrix element = 1; +; * in that case, 1 is added before multiplying, that additional 1 comes +; * from intra_matrix1); (b) rounding error for large coefficients and matrix +; * elements is fixed by a two-step approach: first approximation (rounded +; * down) is found as usual; the result is multiplied by the matrix element +; * and the mismatch is used to calculate the correction. 
+; *************************************************************************/ +; _3dne functions are compatible with iSSE, but are optimized specifically +; for K7 pipelines ; -; _3dne functions are compatible with iSSE, but are optimized specifically for -; K7 pipelines -; -;------------------------------------------------------------------------------ +;--------------------------------------------------------------------------- ; 09.12.2002 Athlon optimizations contributed by Jaan Kalda -;------------------------------------------------------------------------------ +;--------------------------------------------------------------------------- ; data/text alignment @@ -130,13 +124,14 @@ cextern inter_matrix_fixl -%define VM18P 3 -%define VM18Q 4 -%define nop4 DB 08Dh,074h,026h,0 -%define nop3 add esp,byte 0 -%define nop2 mov esp,esp -%define nop7 db 08dh,02ch,02dh,0,0,0,0 -%define nop6 add ebp,dword 0 +%define VM18P 3 +%define VM18Q 4 +%define nop4 db 08Dh,074h,026h,0 +%define nop3 add esp,byte 0 +%define nop2 mov esp,esp +%define nop7 db 08dh,02ch,02dh,0,0,0,0 +%define nop6 add ebp,dword 0 + ;=========================================================================== ; ; quantd table @@ -173,11 +168,11 @@ align 16 -mmx_32767_minus_2047 times 4 dw (32767-2047) -mmx_32768_minus_2048 times 4 dw (32768-2048) -mmx_2047 times 4 dw 2047 -mmx_minus_2048 times 4 dw (-2048) -zero times 4 dw 0 +mmx_32767_minus_2047 times 4 dw (32767-2047) +mmx_32768_minus_2048 times 4 dw (32768-2048) +mmx_2047 times 4 dw 2047 +mmx_minus_2048 times 4 dw (-2048) +zero times 4 dw 0 int_div dd 0 @@ -191,7 +186,7 @@ ;=========================================================================== ; -; void quant_intra4_xmm(int16_t * coeff, +; void quant4_intra_xmm(int16_t * coeff, ; const int16_t const * data, ; const uint32_t quant, ; const uint32_t dcscalar); @@ -201,202 +196,202 @@ align ALIGN cglobal quant4_intra_xmm quant4_intra_xmm - - mov eax, [esp + 8] ; data - mov ecx, [esp + 12] ; quant - 
mov edx, [esp + 4] ; coeff - push esi - push edi - push ebx - nop - mov edi,mmzero - mov esi,-14 - pxor mm0,mm0 - pxor mm3,mm3 - cmp ecx,byte 1 - je near .q1loop - cmp ecx,byte 19 - jg near .lloop - nop6 + mov eax, [esp + 8] ; data + mov ecx, [esp + 12] ; quant + mov edx, [esp + 4] ; coeff + push esi + push edi + push ebx + nop + mov edi,mmzero + mov esi,-14 + pxor mm0,mm0 + pxor mm3,mm3 + cmp ecx,byte 1 + je near .q1loop + cmp ecx,byte 19 + jg near .lloop + nop6 align ALIGN .loop - movq mm1, [eax + 8*esi+112] ; mm0 = [1st] - psubw mm0,mm1 ;-mm1 - movq mm4, [eax + 8*esi + 120] ; - psubw mm3,mm4 ;-mm4 - pmaxsw mm0,mm1 ;|src| - pmaxsw mm3,mm4 - nop2 - psraw mm1,15 ;sign src - psraw mm4,15 - psllw mm0, 4 ; level << 4 - psllw mm3, 4 ; - paddw mm0, [intra_matrix1 + 8*esi+112] - paddw mm3, [intra_matrix1 + 8*esi+120] - movq mm5,[intra_matrix_fixl + 8*esi+112] - movq mm7,[intra_matrix_fixl + 8*esi+120] - pmulhuw mm5,mm0 - pmulhuw mm7,mm3 - mov esp,esp - movq mm2,[intra_matrix + 8*esi+112] - movq mm6,[intra_matrix + 8*esi+120] - pmullw mm2,mm5 - pmullw mm6,mm7 - psubw mm0,mm2 - psubw mm3,mm6 - nop4 - movq mm2,[quantd + ecx * 8 - 8] - movq mm6,[mmx_divs + ecx * 8 - 8] - paddw mm5,mm2 - paddw mm7,mm2 - mov esp,esp - pmulhuw mm0,[intra_matrix_fix + 8*esi+112] - pmulhuw mm3,[intra_matrix_fix + 8*esi+120] - paddw mm5,mm0 - paddw mm7,mm3 - movq mm0,[edi] - movq mm3,[edi] - pmulhuw mm5, mm6 ; mm0 = (mm0 / 2Q) >> 16 - pmulhuw mm7, mm6 ; (level + quantd) / quant (0> 16 + pmulhuw mm7, mm6 ; (level + quantd) / quant (0> 16 - pmulhuw mm7, mm6 ; (level + quantd) / quant (0> 16 + pmulhuw mm7, mm6 ; (level + quantd) / quant (0> 16 - pmulhuw mm7, mm6 ; (level ) / quant (0> 16 + pmulhuw mm7, mm6 ; (level ) / quant (0> 16 - pmulhuw mm7, mm6 ; (level ) / quant (0> 16 + pmulhuw mm7, mm6 ; (level ) / quant (0