;/************************************************************************** ; * ; * XVID MPEG-4 VIDEO CODEC ; * sse2 cbp calc ; * ; * This program is an implementation of a part of one or more MPEG-4 ; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending ; * to use this software module in hardware or software products are ; * advised that its use may infringe existing patents or copyrights, and ; * any such use would be at such party's own risk. The original ; * developer of this software module and his/her company, and subsequent ; * editors and their companies, will have no liability for use of this ; * software or modifications or derivatives thereof. ; * ; * This program is free software; you can redistribute it and/or modify ; * it under the terms of the GNU General Public License as published by ; * the Free Software Foundation; either version 2 of the License, or ; * (at your option) any later version. ; * ; * This program is distributed in the hope that it will be useful, ; * but WITHOUT ANY WARRANTY; without even the implied warranty of ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ; * GNU General Public License for more details. ; * ; * You should have received a copy of the GNU General Public License ; * along with this program; if not, write to the Free Software ; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ; * ; *************************************************************************/ ;/************************************************************************** ; * ; * History: ; * ; * 14.06.2002 cleanup -Skal- ; * 24.04.2002 had to use sse2's movdqu instead of movdqa (???) ; * 17.04.2002 initial version (c) 2002 Daniel Smith ; * ; *************************************************************************/ bits 32 section .data %macro cglobal 1 %ifdef PREFIX global _%1 %define %1 _%1 %else global %1 %endif %endmacro align 16 ignore_dc dw 0, -1, -1, -1, -1, -1, -1, -1 section .text cglobal calc_cbp_sse2 ;=========================================================================== ; ; uint32_t calc_cbp_sse2(const int16_t coeff[6][64]); ; ; not enabled - slower than mmx? ; ;=========================================================================== %macro LOOP_SSE2 1 movdqa xmm0, [edx+(%1)*128] pand xmm0, xmm7 movdqa xmm1, [edx+(%1)*128+16] por xmm0, [edx+(%1)*128+32] por xmm1, [edx+(%1)*128+48] por xmm0, [edx+(%1)*128+64] por xmm1, [edx+(%1)*128+80] por xmm0, [edx+(%1)*128+96] por xmm1, [edx+(%1)*128+112] por xmm0, xmm1 ; xmm0 = xmm1 = 128 bits worth of info psadbw xmm0, xmm6 ; contains 2 dwords with sums movhlps xmm1, xmm0 ; move high dword from xmm0 to low xmm1 por xmm0, xmm1 ; combine movd ecx, xmm0 ; if ecx set, values were found test ecx, ecx %endmacro align 16 calc_cbp_sse2: mov edx, [esp+4] ; coeff[] xor eax, eax ; cbp = 0 movdqu xmm7, [ignore_dc] ; mask to ignore dc value pxor xmm6, xmm6 ; zero LOOP_SSE2 0 test ecx, ecx jz .blk2 or eax, (1<<5) .blk2 LOOP_SSE2 1 test ecx, ecx jz .blk3 or eax, (1<<4) .blk3 LOOP_SSE2 2 test ecx, ecx jz .blk4 or eax, (1<<3) .blk4 LOOP_SSE2 3 test ecx, ecx jz .blk5 or eax, (1<<2) .blk5 LOOP_SSE2 4 test ecx, ecx jz .blk6 or eax, (1<<1) .blk6 LOOP_SSE2 5 test ecx, ecx jz .finished or eax, (1<<0) .finished ret