--- trunk/xvidcore/src/motion/x86_asm/sad_xmm.asm	2003/02/15 15:22:19	851
+++ branches/release-1_2-branch/xvidcore/src/motion/x86_asm/sad_xmm.asm	2008/11/28 16:54:45	1820
@@ -1,442 +1,443 @@
-;/**************************************************************************
+;/****************************************************************************
 ; *
-; * XVID MPEG-4 VIDEO CODEC
-; * xmm sum of absolute difference
+; * XVID MPEG-4 VIDEO CODEC
+; * - K7 optimized SAD operators -
 ; *
-; * This program is an implementation of a part of one or more MPEG-4
-; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
-; * to use this software module in hardware or software products are
-; * advised that its use may infringe existing patents or copyrights, and
-; * any such use would be at such party's own risk. The original
-; * developer of this software module and his/her company, and subsequent
-; * editors and their companies, will have no liability for use of this
-; * software or modifications or derivatives thereof.
+; * Copyright(C) 2001 Peter Ross
+; *              2001-2008 Michael Militzer
+; *              2002 Pascal Massimino
 ; *
-; * This program is free software; you can redistribute it and/or modify
-; * it under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
+; * This program is free software; you can redistribute it and/or modify it
+; * under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 2 of the License, or
+; * (at your option) any later version.
 ; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
 ; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; *************************************************************************/
-
-;/**************************************************************************
-; *
-; * History:
+; * $Id: sad_xmm.asm,v 1.13 2008-11-26 01:04:34 Isibaar Exp $
 ; *
-; * 23.07.2002  sad8bi_xmm;
-; * 04.06.2002  rewrote some funcs (XMM mainly)  -Skal-
-; * 17.11.2001  bugfix and small improvement for dev16_xmm,
-; *             removed terminate early in sad16_xmm (Isibaar)
-; * 12.11.2001  inital version; (c)2001 peter ross
-; *
-; *************************************************************************/
+; ***************************************************************************/
+
+%include "nasm.inc"
+
+;=============================================================================
+; Read only data
+;=============================================================================
+
+DATA
+
+ALIGN SECTION_ALIGN
+mmx_one: times 4 dw 1

-bits 32
+;=============================================================================
+; Helper macros
+;=============================================================================

-%macro cglobal 1
-  %ifdef PREFIX
-    global _%1
-    %define %1 _%1
-  %else
-    global %1
-  %endif
+%macro SAD_16x16_SSE 0
+  movq mm0, [_EAX]
+  psadbw mm0, [TMP1]
+  movq mm1, [_EAX+8]
+  add _EAX, TMP0
+  psadbw mm1, [TMP1+8]
+  paddusw mm5, mm0
+  add TMP1, TMP0
+  paddusw mm6, mm1
 %endmacro

-section .data
+%macro SAD_8x8_SSE 0
+  movq mm0, [_EAX]
+  movq mm1, [_EAX+TMP0]
+  psadbw mm0, [TMP1]
+  psadbw mm1, [TMP1+TMP0]
+  add _EAX, _EBX
+  add TMP1, _EBX
+  paddusw mm5, mm0
+  paddusw mm6, mm1
+%endmacro

-align 16
-mmx_one times 4 dw 1
+%macro SADBI_16x16_SSE 0
+  movq mm0, [_EAX]
+  movq mm1, [_EAX+8]
+  movq mm2, [TMP1]
+  movq mm3, [TMP1+8]
+  pavgb mm2, [_EBX]
+  add TMP1, TMP0
+  pavgb mm3, [_EBX+8]
+  add _EBX, TMP0
+  psadbw mm0, mm2
+  add _EAX, TMP0
+  psadbw mm1, mm3
+  paddusw mm5, mm0
+  paddusw mm6, mm1
+%endmacro

-section .text
+%macro SADBI_8x8_XMM 0
+  movq mm0, [_EAX]
+  movq mm1, [_EAX+TMP0]
+  movq mm2, [TMP1]
+  movq mm3, [TMP1+TMP0]
+  pavgb mm2, [_EBX]
+  lea TMP1, [TMP1+2*TMP0]
+  pavgb mm3, [_EBX+TMP0]
+  lea _EBX, [_EBX+2*TMP0]
+  psadbw mm0, mm2
+  lea _EAX, [_EAX+2*TMP0]
+  psadbw mm1, mm3
+  paddusw mm5, mm0
+  paddusw mm6, mm1
+%endmacro

-cglobal sad16_xmm
-cglobal sad8_xmm
-cglobal sad16bi_xmm
-cglobal sad8bi_xmm
-cglobal dev16_xmm
+%macro MEAN_16x16_SSE 0
+  movq mm0, [_EAX]
+  movq mm1, [_EAX+8]
+  psadbw mm0, mm7
+  psadbw mm1, mm7
+  add _EAX, TMP0
+  paddw mm5, mm0
+  paddw mm6, mm1
+%endmacro
+
+%macro ABS_16x16_SSE 0
+  movq mm0, [_EAX]
+  movq mm1, [_EAX+8]
+  psadbw mm0, mm4
+  psadbw mm1, mm4
+  lea _EAX, [_EAX+TMP0]
+  paddw mm5, mm0
+  paddw mm6, mm1
+%endmacro
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .rotext align=SECTION_ALIGN
+
+cglobal sad16_xmm
+cglobal sad8_xmm
+cglobal sad16bi_xmm
+cglobal sad8bi_xmm
+cglobal dev16_xmm
+cglobal sad16v_xmm

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; uint32_t sad16_xmm(const uint8_t * const cur,
 ;                    const uint8_t * const ref,
 ;                    const uint32_t stride,
 ;                    const uint32_t best_sad);
 ;
-;===========================================================================
-
-%macro SAD_16x16_SSE 0
-  movq mm0, [eax]
-  psadbw mm0, [edx]
-  movq mm1, [eax+8]
-  add eax, ecx
-  psadbw mm1, [edx+8]
-  paddusw mm5,mm0
-  add edx, ecx
-  paddusw mm6,mm1
-%endmacro
+;-----------------------------------------------------------------------------

-align 16
+ALIGN SECTION_ALIGN
 sad16_xmm:
-  mov eax, [esp+ 4] ; Src1
-  mov edx, [esp+ 8] ; Src2
-  mov ecx, [esp+12] ; Stride
-
-  pxor mm5, mm5 ; accum1
-  pxor mm6, mm6 ; accum2
-
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-
-  paddusw mm6,mm5
-  movd eax, mm6
-  ret
+  mov _EAX, prm1 ; Src1
+  mov TMP1, prm2 ; Src2
+  mov TMP0, prm3 ; Stride
+
+  pxor mm5, mm5 ; accum1
+  pxor mm6, mm6 ; accum2
+
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+
+  paddusw mm6,mm5
+  movd eax, mm6
+  ret
+ENDFUNC

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; uint32_t sad8_xmm(const uint8_t * const cur,
 ;                   const uint8_t * const ref,
 ;                   const uint32_t stride);
 ;
-;===========================================================================
-
-%macro SAD_8x8_SSE 0
-  movq mm0, [eax]
-  movq mm1, [eax+ecx]
+;-----------------------------------------------------------------------------

-  psadbw mm0, [edx]
-  psadbw mm1, [edx+ecx]
-  add eax, ebx
-  add edx, ebx
-
-  paddusw mm5,mm0
-  paddusw mm6,mm1
-%endmacro
-
-align 16
+ALIGN SECTION_ALIGN
 sad8_xmm:
-  mov eax, [esp+ 4] ; Src1
-  mov edx, [esp+ 8] ; Src2
-  mov ecx, [esp+12] ; Stride
-  push ebx
-  lea ebx, [ecx+ecx]
-
-  pxor mm5, mm5 ; accum1
-  pxor mm6, mm6 ; accum2
+  mov _EAX, prm1 ; Src1
+  mov TMP1, prm2 ; Src2
+  mov TMP0, prm3 ; Stride
+  push _EBX
+  lea _EBX, [TMP0+TMP0]

-  SAD_8x8_SSE
-  SAD_8x8_SSE
-  SAD_8x8_SSE
+  pxor mm5, mm5 ; accum1
+  pxor mm6, mm6 ; accum2

-  movq mm0, [eax]
-  movq mm1, [eax+ecx]
-  psadbw mm0, [edx]
-  psadbw mm1, [edx+ecx]
+  SAD_8x8_SSE
+  SAD_8x8_SSE
+  SAD_8x8_SSE

-  pop ebx
+  movq mm0, [_EAX]
+  movq mm1, [_EAX+TMP0]
+  psadbw mm0, [TMP1]
+  psadbw mm1, [TMP1+TMP0]

-  paddusw mm5,mm0
-  paddusw mm6,mm1
+  pop _EBX

-  paddusw mm6,mm5
-  movd eax, mm6
+  paddusw mm5,mm0
+  paddusw mm6,mm1

-  ret
+  paddusw mm6,mm5
+  movd eax, mm6

+  ret
+ENDFUNC

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; uint32_t sad16bi_xmm(const uint8_t * const cur,
 ;                      const uint8_t * const ref1,
 ;                      const uint8_t * const ref2,
 ;                      const uint32_t stride);
 ;
-;===========================================================================
-
-%macro SADBI_16x16_SSE 0
-  movq mm0, [eax]
-  movq mm1, [eax+8]
+;-----------------------------------------------------------------------------

-  movq mm2, [edx]
-  movq mm3, [edx+8]
-
-  pavgb mm2, [ebx]
-  add edx, ecx
-
-  pavgb mm3, [ebx+8]
-  add ebx, ecx
-
-  psadbw mm0, mm2
-  add eax, ecx
-
-  psadbw mm1, mm3
-  paddusw mm5,mm0
+ALIGN SECTION_ALIGN
+sad16bi_xmm:
+  mov _EAX, prm1 ; Src
+  mov TMP1, prm2 ; Ref1
+  mov TMP0, prm4 ; Stride
+
+  push _EBX
+%ifdef ARCH_IS_X86_64
+  mov _EBX, prm3
+%else
+  mov _EBX, [_ESP+4+12] ; Ref2
+%endif
+  pxor mm5, mm5 ; accum1
+  pxor mm6, mm6 ; accum2
+
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+  SADBI_16x16_SSE
+
+  paddusw mm6,mm5
+  movd eax, mm6
+  pop _EBX
+  ret
+ENDFUNC

-  paddusw mm6,mm1
-%endmacro
+;-----------------------------------------------------------------------------
+;
+; uint32_t sad8bi_xmm(const uint8_t * const cur,
+;                     const uint8_t * const ref1,
+;                     const uint8_t * const ref2,
+;                     const uint32_t stride);
+;
+;-----------------------------------------------------------------------------

-align 16
-sad16bi_xmm:
-  push ebx
-  mov eax, [esp+4+ 4] ; Src
-  mov edx, [esp+4+ 8] ; Ref1
-  mov ebx, [esp+4+12] ; Ref2
-  mov ecx, [esp+4+16] ; Stride
-
-  pxor mm5, mm5 ; accum1
-  pxor mm6, mm6 ; accum2
-
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-  SADBI_16x16_SSE
-
-  paddusw mm6,mm5
-  movd eax, mm6
-  pop ebx
-  ret
-
-;===========================================================================
-;
-; uint32_t sad8bi_xmm(const uint8_t * const cur,
-;                     const uint8_t * const ref1,
-;                     const uint8_t * const ref2,
-;                     const uint32_t stride);
-;
-;===========================================================================
-
-%macro SADBI_8x8_XMM 0
-  movq mm0, [eax]
-  movq mm1, [eax+ecx]
-
-  movq mm2, [edx]
-  movq mm3, [edx+ecx]
-
-  pavgb mm2, [ebx]
-  lea edx, [edx+2*ecx]
-
-  pavgb mm3, [ebx+ecx]
-  lea ebx, [ebx+2*ecx]
-
-  psadbw mm0, mm2
-  lea eax, [eax+2*ecx]
-
-  psadbw mm1, mm3
-  paddusw mm5,mm0
-
-  paddusw mm6,mm1
-%endmacro
-
-align 16
-sad8bi_xmm:
-  push ebx
-  mov eax, [esp+4+ 4] ; Src
-  mov edx, [esp+4+ 8] ; Ref1
-  mov ebx, [esp+4+12] ; Ref2
-  mov ecx, [esp+4+16] ; Stride
-
-  pxor mm5, mm5 ; accum1
-  pxor mm6, mm6 ; accum2
-.Loop
-  SADBI_8x8_XMM
-  SADBI_8x8_XMM
-  SADBI_8x8_XMM
-  SADBI_8x8_XMM
-
-  paddusw mm6,mm5
-  movd eax, mm6
-  pop ebx
-  ret
+ALIGN SECTION_ALIGN
+sad8bi_xmm:
+  mov _EAX, prm1 ; Src
+  mov TMP1, prm2 ; Ref1
+  mov TMP0, prm4 ; Stride
+
+  push _EBX
+%ifdef ARCH_IS_X86_64
+  mov _EBX, prm3
+%else
+  mov _EBX, [_ESP+4+12] ; Ref2
+%endif
+
+  pxor mm5, mm5 ; accum1
+  pxor mm6, mm6 ; accum2
+.Loop:
+  SADBI_8x8_XMM
+  SADBI_8x8_XMM
+  SADBI_8x8_XMM
+  SADBI_8x8_XMM
+
+  paddusw mm6,mm5
+  movd eax, mm6
+  pop _EBX
+  ret
+ENDFUNC

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; uint32_t dev16_xmm(const uint8_t * const cur,
 ;                    const uint32_t stride);
 ;
-;===========================================================================
-
-%macro MEAN_16x16_SSE 0
-  movq mm0, [eax]
-  movq mm1, [eax+8]
-  psadbw mm0, mm7
-  psadbw mm1, mm7
-  add eax, ecx
-  paddw mm5, mm0
-  paddw mm6, mm1
-%endmacro
-
-%macro ABS_16x16_SSE 0
-  movq mm0, [eax]
-  movq mm1, [eax+8]
-  psadbw mm0, mm4
-  psadbw mm1, mm4
-  lea eax,[eax+ecx]
-  paddw mm5, mm0
-  paddw mm6, mm1
-%endmacro
+;-----------------------------------------------------------------------------

-align 16
+ALIGN SECTION_ALIGN
 dev16_xmm:
-  mov eax, [esp+ 4] ; Src
-  mov ecx, [esp+ 8] ; Stride
-
-  pxor mm7, mm7 ; zero
-  pxor mm5, mm5 ; mean accums
-  pxor mm6, mm6
-
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-  MEAN_16x16_SSE
-
-  paddusw mm6, mm5
-
-  movq mm4, mm6
-  psllq mm4, 32
-  paddd mm4, mm6
-  psrld mm4, 8 ; /= (16*16)
-
-  packssdw mm4, mm4
-  packuswb mm4, mm4
-
-  ; mm4 contains the mean
-
-  mov eax, [esp+ 4] ; Src
-
-
-  pxor mm5, mm5 ; sums
-  pxor mm6, mm6
-
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-  ABS_16x16_SSE
-
-  paddusw mm6, mm5
-  movq mm7, mm6
-  psllq mm7, 32
-  paddd mm6, mm7
-
-  movd eax, mm6
-  ret
+  mov _EAX, prm1 ; Src
+  mov TMP0, prm2 ; Stride

-cglobal sad16v_xmm
+  pxor mm7, mm7 ; zero
+  pxor mm5, mm5 ; mean accums
+  pxor mm6, mm6
+
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+  MEAN_16x16_SSE
+
+  paddusw mm6, mm5
+
+  movq mm4, mm6
+  psllq mm4, 32
+  paddd mm4, mm6
+  psrld mm4, 8 ; /= (16*16)
+
+  packssdw mm4, mm4
+  packuswb mm4, mm4
+
+  ; mm4 contains the mean
+
+  mov _EAX, prm1 ; Src
+
+
+  pxor mm5, mm5 ; sums
+  pxor mm6, mm6
+
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+  ABS_16x16_SSE
+
+  paddusw mm6, mm5
+  movq mm7, mm6
+  psllq mm7, 32
+  paddd mm6, mm7
+
+  movd eax, mm6
+  ret
+ENDFUNC

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;int sad16v_xmm(const uint8_t * const cur,
 ;               const uint8_t * const ref,
 ;               const uint32_t stride,
 ;               int* sad8);
-;===========================================================================
-align 16
-sad16v_xmm:
-  push ebx
-  mov eax, [esp+4+ 4] ; Src1
-  mov edx, [esp+4+ 8] ; Src2
-  mov ecx, [esp+4+12] ; Stride
-  mov ebx, [esp+4+16] ; sad ptr
-
-  pxor mm5, mm5 ; accum1
-  pxor mm6, mm6 ; accum2
-  pxor mm7, mm7 ; total
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  paddusw mm7, mm5
-  paddusw mm7, mm6
-  movd [ebx], mm5
-  movd [ebx+4], mm6
-
-  pxor mm5, mm5 ; accum1
-  pxor mm6, mm6 ; accum2
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  SAD_16x16_SSE
-  paddusw mm7, mm5
-  paddusw mm7, mm6
-  movd [ebx+8], mm5
-  movd [ebx+12], mm6
-
-  movd eax, mm7
-  pop ebx
-  ret
-;--------
+;-----------------------------------------------------------------------------
+ALIGN SECTION_ALIGN
+sad16v_xmm:
+  mov _EAX, prm1 ; Src1
+  mov TMP1, prm2 ; Src2
+  mov TMP0, prm3 ; Stride
+
+  push _EBX
+%ifdef ARCH_IS_X86_64
+  mov _EBX, prm4
+%else
+  mov _EBX, [_ESP+4+16] ; sad ptr
+%endif
+
+  pxor mm5, mm5 ; accum1
+  pxor mm6, mm6 ; accum2
+  pxor mm7, mm7 ; total
+
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+
+  paddusw mm7, mm5
+  paddusw mm7, mm6
+  movd [_EBX], mm5
+  movd [_EBX+4], mm6
+
+  pxor mm5, mm5 ; accum1
+  pxor mm6, mm6 ; accum2
+
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+  SAD_16x16_SSE
+
+  paddusw mm7, mm5
+  paddusw mm7, mm6
+  movd [_EBX+8], mm5
+  movd [_EBX+12], mm6
+
+  movd eax, mm7
+  pop _EBX
+  ret
+ENDFUNC
+
+
+%ifidn __OUTPUT_FORMAT__,elf
+section ".note.GNU-stack" noalloc noexec nowrite progbits
+%endif
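
For reference, every kernel touched by this patch is a plain sum of absolute differences: psadbw accumulates |a - b| over eight unsigned bytes, pavgb supplies the rounded average (a + b + 1) >> 1 used by the bi-directional variants, and dev16 measures deviation from the block mean, which the assembly derives as the 16x16 pixel sum shifted right by 8 (the psrld-by-8 step). The scalar C sketch below only restates those semantics; the *_ref names are illustrative stand-ins for the assembly entry points and the code is not part of the patch.

#include <stdint.h>
#include <stdlib.h>

/* 16x16 SAD: what sad16_xmm accumulates with psadbw, row by row. */
uint32_t sad16_ref(const uint8_t *cur, const uint8_t *ref, uint32_t stride)
{
    uint32_t sad = 0;
    for (int y = 0; y < 16; y++, cur += stride, ref += stride)
        for (int x = 0; x < 16; x++)
            sad += abs(cur[x] - ref[x]);
    return sad;
}

/* Bi-directional SAD: compare cur against the rounded average of two
 * references, matching pavgb's (a + b + 1) >> 1 rounding. */
uint32_t sad16bi_ref(const uint8_t *cur, const uint8_t *ref1,
                     const uint8_t *ref2, uint32_t stride)
{
    uint32_t sad = 0;
    for (int y = 0; y < 16; y++, cur += stride, ref1 += stride, ref2 += stride)
        for (int x = 0; x < 16; x++)
            sad += abs(cur[x] - ((ref1[x] + ref2[x] + 1) >> 1));
    return sad;
}

/* Mean absolute deviation of a 16x16 block: mean = (sum of 256 pixels) >> 8,
 * then the sum of |pixel - mean| over the block. */
uint32_t dev16_ref(const uint8_t *cur, uint32_t stride)
{
    uint32_t sum = 0, dev = 0;
    const uint8_t *p = cur;

    for (int y = 0; y < 16; y++, p += stride)
        for (int x = 0; x < 16; x++)
            sum += p[x];

    uint8_t mean = (uint8_t)(sum >> 8);   /* /= 16*16 */

    for (int y = 0; y < 16; y++, cur += stride)
        for (int x = 0; x < 16; x++)
            dev += abs(cur[x] - mean);

    return dev;
}

sad8 is the same loop over an 8x8 block, and sad16v additionally writes the four 8x8 partial sums (top-left, top-right, bottom-left, bottom-right) through the sad8 pointer before returning their total.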