--- trunk/xvidcore/src/bitstream/x86_asm/cbp_3dne.asm	2004/03/18 16:11:28	1381
+++ trunk/xvidcore/src/bitstream/x86_asm/cbp_3dne.asm	2004/03/22 22:36:25	1382
@@ -1,44 +1,37 @@
-;/**************************************************************************
+;/****************************************************************************
 ; *
-; *	XVID MPEG-4 VIDEO CODEC
-; *	mmx cbp calc
+; *  XVID MPEG-4 VIDEO CODEC
+; *  - 3dne CBP computation -
 ; *
-; * This program is an implementation of a part of one or more MPEG-4
-; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
-; * to use this software module in hardware or software products are
-; * advised that its use may infringe existing patents or copyrights, and
-; * any such use would be at such party's own risk. The original
-; * developer of this software module and his/her company, and subsequent
-; * editors and their companies, will have no liability for use of this
-; * software or modifications or derivatives thereof.
-; *
-; * This program is free software; you can redistribute it and/or modify
-; * it under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
-; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
-; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-; *
-; *************************************************************************/
-
-
-; these 3dne functions are compatible with iSSE, but are optimized specifically for
-; K7 pipelines
-;
-;------------------------------------------------------------------------------
-; 09.12.2002 Athlon optimizations contributed by Jaan Kalda
-;------------------------------------------------------------------------------
+; * Copyright (C) 2002 Jaan Kalda
+; *
+; * This program is free software ; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation ; either version 2 of the License, or
+; * (at your option) any later version.
+; *
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program ; if not, write to the Free Software
+; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+; *
+; * $Id: cbp_3dne.asm,v 1.3 2004-03-22 22:36:23 edgomez Exp $
+; *
+; ***************************************************************************/
+
+; these 3dne functions are compatible with iSSE, but are optimized
+; specifically for K7 pipelines
 
-bits 32
+BITS 32
+
+;=============================================================================
+; Macros
+;=============================================================================
 
-section .data
 %macro cglobal 1
 	%ifdef PREFIX
 		global _%1
@@ -48,72 +41,75 @@
 	%endif
 %endmacro
 
-section .text
-
-cglobal calc_cbp_3dne
-;===========================================================================
-;
-; uint32_t calc_cbp_3dne(const int16_t coeff[6][64]);
-;
-;===========================================================================
 %macro calc_cbp 1
-	pshufw mm0, [eax], 229 ; =11100101
-	movq mm1, [eax+8]
-	por mm0, [eax+64]
-	por mm1, [eax+72]
-	movq mm2, [eax+16]
-	movq mm3, [eax+24]
-	por mm2, [eax+80]
-	por mm3, [eax+88]
-	movq mm4, [eax+32]
-	movq mm5, [eax+40]
-	por mm4, [eax+96]
-	por mm5, [eax+104]
-	movq mm6, [eax+48]
-	movq mm7, [eax+56]
-	por mm6, [eax+112]
-	por mm7, [eax+120]
-	por mm1, mm0
-%if %1
-	sub eax,byte -128 ;ecx ;+= 128; needed 3 bytes for alignment
+  pshufw mm0, [eax], 229    ; 229 = 11100101b selects words 1,1,2,3: word 0 (first coefficient) is left out
+  movq mm1, [eax+8]         ; mm1 = words 4..7
+  por mm0, [eax+64]         ; every register also ORs in the 8-byte chunk 64 bytes further on
+  por mm1, [eax+72]
+  movq mm2, [eax+16]
+  movq mm3, [eax+24]
+  por mm2, [eax+80]
+  por mm3, [eax+88]
+  movq mm4, [eax+32]
+  movq mm5, [eax+40]
+  por mm4, [eax+96]
+  por mm5, [eax+104]
+  movq mm6, [eax+48]
+  movq mm7, [eax+56]
+  por mm6, [eax+112]
+  por mm7, [eax+120]        ; bytes 0..127 at eax (64 words) are now spread over mm0..mm7
+  por mm1, mm0              ; start OR-reducing the eight registers into mm7
+%if %1
+  sub eax, byte -128        ; eax += 128 (next 64-word block); -128 fits a signed imm8, +128 would not (orig. note: needed 3 bytes for alignment)
 %else
-	xor eax,eax
-	xor edx,edx
-%endif
-	por mm3, mm2
-	por mm5, mm4
-	por mm7, mm6
-	por mm3, mm1
-	por mm7, mm5
-	por mm7, mm3
-	packsswb mm7,mm7
-	movd [esp+%1*4],mm7
+  xor eax, eax              ; argument 0 (last invocation): eax becomes the zeroed result accumulator
+  xor edx, edx              ; edx = 0, the comparand for the cmp/adc chain in calc_cbp_3dne
+%endif
+  por mm3, mm2
+  por mm5, mm4
+  por mm7, mm6
+  por mm3, mm1
+  por mm7, mm5
+  por mm7, mm3              ; mm7 = OR of everything loaded above
+  packsswb mm7, mm7         ; signed-saturating pack to bytes: a non-zero word stays a non-zero byte
+  movd [esp+%1*4], mm7      ; store the low 32 bits as this block's flag in its dword stack slot
 %endmacro
 
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+cglobal calc_cbp_3dne
+;-----------------------------------------------------------------------------
+; uint32_t calc_cbp_3dne(const int16_t coeff[6*64]);
+;-----------------------------------------------------------------------------
+;AMD K7, in cache: ca 80 clk
 
-align 16 ;AMD K7, in cache: ca 80 clk
+ALIGN 16
 calc_cbp_3dne:
-	mov eax, [esp+ 4] ; coeff
-	lea esp,[esp-24]
+  mov eax, [esp+ 4]         ; eax = coeff (first stack argument)
+  lea esp, [esp-24]         ; reserve 6 dword scratch slots, one per calc_cbp invocation
 
 	calc_cbp 5 ;bit 5
-	calc_cbp 4 ;b4
+  calc_cbp 4          ;b4
 	calc_cbp 3 ;b3
 	calc_cbp 2 ;b2
 	calc_cbp 1 ;b1
 	calc_cbp 0 ;b0
 
-	cmp eax,[esp+5*4]
-	adc eax,eax
-	cmp edx,[esp+4*4]
-	adc eax,eax
-	cmp edx,[esp+3*4]
-	adc eax,eax
-	cmp edx,[esp+2*4]
-	adc eax,eax
-	cmp edx,[esp+1*4]
-	adc eax,eax
-	cmp edx,[esp+0*4]
-	adc eax,eax
-	add esp,byte 24
-	ret
\ No newline at end of file
+  cmp eax, [esp+5*4]        ; eax = edx = 0 here, so CF is set iff slot 5 is non-zero
+  adc eax, eax              ; eax = 2*eax + CF: shift the flag in from the right
+  cmp edx, [esp+4*4]
+  adc eax, eax
+  cmp edx, [esp+3*4]
+  adc eax, eax
+  cmp edx, [esp+2*4]
+  adc eax, eax
+  cmp edx, [esp+1*4]
+  adc eax, eax
+  cmp edx, [esp+0*4]
+  adc eax, eax              ; slot 5 (first block read) ends up in bit 5, slot 0 in bit 0
+  add esp, byte 24          ; release the scratch slots
+  ret                       ; result in eax; NOTE(review): no emms here - presumably left to the caller, confirm