;/************************************************************************** ; * ; * XVID MPEG-4 VIDEO CODEC ; * mmx cbp calc ; * ; * This program is an implementation of a part of one or more MPEG-4 ; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending ; * to use this software module in hardware or software products are ; * advised that its use may infringe existing patents or copyrights, and ; * any such use would be at such party's own risk. The original ; * developer of this software module and his/her company, and subsequent ; * editors and their companies, will have no liability for use of this ; * software or modifications or derivatives thereof. ; * ; * This program is free software; you can redistribute it and/or modify ; * it under the terms of the GNU General Public License as published by ; * the Free Software Foundation; either version 2 of the License, or ; * (at your option) any later version. ; * ; * This program is distributed in the hope that it will be useful, ; * but WITHOUT ANY WARRANTY; without even the implied warranty of ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ; * GNU General Public License for more details. ; * ; * You should have received a copy of the GNU General Public License ; * along with this program; if not, write to the Free Software ; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
; *
; *************************************************************************/

; These 3dne functions are compatible with iSSE, but are optimized
; specifically for K7 pipelines.
;
;------------------------------------------------------------------------------
; 09.12.2002  Athlon optimizations contributed by Jaan Kalda
;------------------------------------------------------------------------------

bits 32

section .data

;---------------------------------------------------------------------------
; cglobal <name>
;   Exports <name> with a leading underscore and aliases the undecorated
;   name to the decorated one for the remainder of this file.
;---------------------------------------------------------------------------
%macro cglobal 1
    %if 1
        global _%1
        %define %1 _%1
    %else
        global %1
    %endif
%endmacro

section .text

cglobal calc_cbp_3dne

;===========================================================================
;
; uint32_t calc_cbp_3dne(const int16_t coeff[6][64]);
;
; In:    [esp+4] = coeff, six consecutive blocks of 64 int16 values
; Out:   eax     = 6-bit mask; block 0 maps to bit 5, block 5 to bit 0.
;                  A bit is set when its block holds a non-zero value in
;                  any of elements 1..63 (element 0 is not examined).
; Clobb: edx, mm0-mm7, flags
; Stack: 24 bytes of scratch below the return address (six dword flags)
; Note:  no emms is issued here; the MMX state is left as-is on return.
;
;===========================================================================

%define CBP_BLOCK_BYTES   128       ; 64 coefficients * 2 bytes
%define CBP_HALF_BYTES     64       ; distance between the two ORed halves
%define CBP_SCRATCH_BYTES  24       ; six dword flag slots
%define CBP_SKIP_WORD0    229       ; =11100101: words 1,1,2,3 (drops word 0)

;---------------------------------------------------------------------------
; cbp_or_pair <mmA>, <mmB>, <offset>
;   mmA = qword[eax+offset]   | qword[eax+offset+64]
;   mmB = qword[eax+offset+8] | qword[eax+offset+72]
;---------------------------------------------------------------------------
%macro cbp_or_pair 3
        movq        %1, [eax+%3]
        movq        %2, [eax+%3+8]
        por         %1, [eax+%3+CBP_HALF_BYTES]
        por         %2, [eax+%3+8+CBP_HALF_BYTES]
%endmacro

;---------------------------------------------------------------------------
; calc_cbp <slot>
;   ORs together the 128 bytes at [eax] (minus the very first word) and
;   stores a dword at [esp+slot*4] that is non-zero iff any of them was.
;   For slot != 0, eax is then advanced to the next block; for slot 0,
;   eax and edx are cleared instead, ready for the bit-gathering step.
;---------------------------------------------------------------------------
%macro calc_cbp 1
        ; first qword goes through pshufw so that word 0 is replaced by a
        ; copy of word 1 and therefore cannot influence the result
        pshufw      mm0, [eax], CBP_SKIP_WORD0
        movq        mm1, [eax+8]
        por         mm0, [eax+CBP_HALF_BYTES]
        por         mm1, [eax+8+CBP_HALF_BYTES]

        cbp_or_pair mm2, mm3, 16
        cbp_or_pair mm4, mm5, 32
        cbp_or_pair mm6, mm7, 48

        por         mm1, mm0
%if %1
        ; eax += 128, written as a subtraction of -128 so the immediate
        ; fits in one byte (3-byte instruction, needed for alignment)
        sub         eax, byte -CBP_BLOCK_BYTES
%else
        ; last block: all loads are done, so zero the gather registers
        xor         eax, eax
        xor         edx, edx
%endif
        ; reduce the eight partial ORs down to mm7
        por         mm3, mm2
        por         mm5, mm4
        por         mm7, mm6
        por         mm3, mm1
        por         mm7, mm5
        por         mm7, mm3

        ; saturating pack keeps every non-zero word non-zero as a byte,
        ; so the low dword alone tells whether anything was set
        packsswb    mm7, mm7
        movd        [esp+%1*4], mm7
%endmacro

;---------------------------------------------------------------------------
; cbp_shift_in <zero_reg>, <slot>
;   eax = (eax << 1) | (dword[esp+slot*4] != 0)
;   <zero_reg> must hold 0: comparing 0 against the slot sets CF exactly
;   when the slot is non-zero, and adc then shifts that carry into eax.
;---------------------------------------------------------------------------
%macro cbp_shift_in 2
        cmp         %1, [esp+%2*4]
        adc         eax, eax
%endmacro

align 16
; AMD K7, in cache: ca 80 clk
calc_cbp_3dne:
        mov         eax, [esp+4]                    ; eax = coeff
        lea         esp, [esp-CBP_SCRATCH_BYTES]    ; reserve the flag slots

        calc_cbp    5                               ; block 0 -> bit 5
        calc_cbp    4                               ; block 1 -> bit 4
        calc_cbp    3                               ; block 2 -> bit 3
        calc_cbp    2                               ; block 3 -> bit 2
        calc_cbp    1                               ; block 4 -> bit 1
        calc_cbp    0                               ; block 5 -> bit 0; eax = edx = 0

        ; eax is still zero for the first compare only; edx stays zero
        ; throughout and serves as the comparand afterwards
        cbp_shift_in eax, 5
        cbp_shift_in edx, 4
        cbp_shift_in edx, 3
        cbp_shift_in edx, 2
        cbp_shift_in edx, 1
        cbp_shift_in edx, 0

        add         esp, byte CBP_SCRATCH_BYTES     ; release the flag slots
        ret