--- trunk/xvidcore/src/bitstream/x86_asm/cbp_sse2.asm 2004/03/18 16:11:28 1381 +++ trunk/xvidcore/src/bitstream/x86_asm/cbp_sse2.asm 2004/03/22 22:36:25 1382 @@ -1,47 +1,34 @@ -;/************************************************************************** +;/**************************************************************************** ; * -; * XVID MPEG-4 VIDEO CODEC -; * sse2 cbp calc +; * XVID MPEG-4 VIDEO CODEC +; * - SSE2 CBP computation - ; * -; * This program is an implementation of a part of one or more MPEG-4 -; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending -; * to use this software module in hardware or software products are -; * advised that its use may infringe existing patents or copyrights, and -; * any such use would be at such party's own risk. The original -; * developer of this software module and his/her company, and subsequent -; * editors and their companies, will have no liability for use of this -; * software or modifications or derivatives thereof. +; * Copyright (C) 2002 Daniel Smith +; * 2002 Pascal Massimino ; * -; * This program is free software; you can redistribute it and/or modify -; * it under the terms of the GNU General Public License as published by -; * the Free Software Foundation; either version 2 of the License, or -; * (at your option) any later version. +; * This program is free software ; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation ; either version 2 of the License, or +; * (at your option) any later version. ; * -; * This program is distributed in the hope that it will be useful, -; * but WITHOUT ANY WARRANTY; without even the implied warranty of -; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; * GNU General Public License for more details. 
+; * This program is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY ; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. ; * -; * You should have received a copy of the GNU General Public License -; * along with this program; if not, write to the Free Software -; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +; * You should have received a copy of the GNU General Public License +; * along with this program ; if not, write to the Free Software +; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; *************************************************************************/ - -;/************************************************************************** -; * -; * History: -; * -; * 14.06.2002 cleanup -Skal- -; * 24.04.2002 had to use sse2's movdqu instead of movdqa (???) -; * 17.04.2002 initial version (c) 2002 Daniel Smith +; * $Id: cbp_sse2.asm,v 1.4 2004-03-22 22:36:23 edgomez Exp $ ; * -; *************************************************************************/ +; ***************************************************************************/ +BITS 32 -bits 32 - -section .data +;============================================================================= +; Macros +;============================================================================= %macro cglobal 1 %ifdef PREFIX @@ -52,80 +39,93 @@ %endif %endmacro -align 16 - -ignore_dc dw 0, -1, -1, -1, -1, -1, -1, -1 - -section .text - -cglobal calc_cbp_sse2 - -;=========================================================================== -; -; uint32_t calc_cbp_sse2(const int16_t coeff[6][64]); -; -; not enabled - slower than mmx? 
;-----------------------------------------------------------------------------
; LOOP_SSE2 <block index>
;
; Tests whether block <block index> of the coefficient array holds any
; non-zero value other than its first (DC) coefficient.
;
; In:    edx  = base address of the coefficient array; loaded with movdqa,
;               so every 16-byte row read here must be 16-byte aligned
;        xmm6 = all zeroes
;        xmm7 = ignore_dc mask (word 0 cleared, words 1..7 set)
; Out:   ecx  = non-zero if a non-DC coefficient of the block is non-zero
;        ZF   = 1 if ecx == 0 (left by the trailing "test ecx, ecx", so
;               callers can branch right after the expansion)
; Clobb: ecx, xmm0, xmm1, flags
;-----------------------------------------------------------------------------

%define COEFF_BLOCK_BYTES 128       ; one block = 64 int16_t coefficients

%macro LOOP_SSE2 1
  movdqa xmm0, [edx+(%1)*COEFF_BLOCK_BYTES]
  pand xmm0, xmm7                   ; drop the DC coefficient (word 0)
  movdqa xmm1, [edx+(%1)*COEFF_BLOCK_BYTES+16]

  ; OR the six remaining 16-byte rows into two independent accumulators
  por xmm0, [edx+(%1)*COEFF_BLOCK_BYTES+32]
  por xmm1, [edx+(%1)*COEFF_BLOCK_BYTES+48]
  por xmm0, [edx+(%1)*COEFF_BLOCK_BYTES+64]
  por xmm1, [edx+(%1)*COEFF_BLOCK_BYTES+80]
  por xmm0, [edx+(%1)*COEFF_BLOCK_BYTES+96]
  por xmm1, [edx+(%1)*COEFF_BLOCK_BYTES+112]

  por xmm0, xmm1                    ; xmm0 = OR of all eight rows
  psadbw xmm0, xmm6                 ; byte sums vs. zero: one sum per qword
  movhlps xmm1, xmm0                ; high qword of xmm0 -> low qword of xmm1
  por xmm0, xmm1                    ; low dword = OR of both sums
  movd ecx, xmm0                    ; ecx != 0 <=> some AC coefficient != 0
  test ecx, ecx
%endmacro

;=============================================================================
; Data (Read Only)
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif

ALIGN 16
ignore_dc:
  dw 0, -1, -1, -1, -1, -1, -1, -1  ; clears word 0, keeps words 1..7

;=============================================================================
; Code
;=============================================================================

SECTION .text

;-----------------------------------------------------------------------------
; uint32_t calc_cbp_sse2(const int16_t coeff[6*64]);
;
; Builds the coded block pattern for six consecutive 64-coefficient blocks.
; Block k (k = 0..5) sets bit (5 - k) of the result when any of its
; coefficients other than the first one is non-zero.
;
; In:    [esp+4] = coeff (32-bit code, argument passed on the stack);
;                  must be 16-byte aligned because LOOP_SSE2 uses movdqa
; Out:   eax     = cbp, bits 5..0
; Clobb: ecx, edx, xmm0, xmm1, xmm6, xmm7, flags
;-----------------------------------------------------------------------------

ALIGN 16
cglobal calc_cbp_sse2
calc_cbp_sse2:
  mov edx, [esp+4]                  ; edx = coeff
  xor eax, eax                      ; cbp = 0

  ; Unaligned load kept on purpose: the FORMAT_COFF section declaration
  ; carries no align=16, so 16-byte alignment of ignore_dc is not
  ; guaranteed by the section attributes alone.
  movdqu xmm7, [ignore_dc]          ; mask to ignore dc value
  pxor xmm6, xmm6                   ; zero, reference operand for psadbw

  ; Each LOOP_SSE2 expansion ends with "test ecx, ecx", so ZF is already
  ; valid for the jz that follows it.
  LOOP_SSE2 0
  jz .blk2
  or eax, (1<<5)

.blk2:
  LOOP_SSE2 1
  jz .blk3
  or eax, (1<<4)

.blk3:
  LOOP_SSE2 2
  jz .blk4
  or eax, (1<<3)

.blk4:
  LOOP_SSE2 3
  jz .blk5
  or eax, (1<<2)

.blk5:
  LOOP_SSE2 4
  jz .blk6
  or eax, (1<<1)

.blk6:
  LOOP_SSE2 5
  jz .finished
  or eax, (1<<0)

.finished:
  ret