--- trunk/xvidcore/src/motion/x86_asm/sad_3dn.asm 2002/09/06 16:59:47 430 +++ trunk/xvidcore/src/motion/x86_asm/sad_3dn.asm 2008/11/26 01:04:34 1795 @@ -1,21 +1,12 @@ -;/***************************************************************************** +;/**************************************************************************** ; * ; * XVID MPEG-4 VIDEO CODEC -; * 3dnow (*but without xmm*) sum of absolute difference +; * - 3DNow sad operators w/o XMM instructions - ; * -; * Copyright(C) 2002 Peter Ross +; * Copyright(C) 2002 Peter ross ; * -; * This program is an implementation of a part of one or more MPEG-4 -; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending -; * to use this software module in hardware or software products are -; * advised that its use may infringe existing patents or copyrights, and -; * any such use would be at such party's own risk. The original -; * developer of this software module and his/her company, and subsequent -; * editors and their companies, will have no liability for use of this -; * software or modifications or derivatives thereof. -; * -; * This program is free software; you can redistribute it and/or modify -; * it under the terms of the GNU General Public License as published by +; * This program is free software; you can redistribute it and/or modify it +; * under the terms of the GNU General Public License as published by ; * the Free Software Foundation; either version 2 of the License, or ; * (at your option) any later version. ; * @@ -28,184 +19,205 @@ ; * along with this program; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -, ****************************************************************************/ +; * $Id: sad_3dn.asm,v 1.12 2008-11-26 01:04:34 Isibaar Exp $ +; * +; ***************************************************************************/ -bits 32 +%include "nasm.inc" -%macro cglobal 1 - %ifdef PREFIX - global _%1 - %define %1 _%1 - %else - global %1 - %endif +;============================================================================= +; Read only data +;============================================================================= + +DATA + +ALIGN SECTION_ALIGN +mmx_one: + times 4 dw 1 + +;============================================================================= +; Helper macros +;============================================================================= +%macro SADBI_16x16_3DN 0 + movq mm0, [_EAX] ; src + movq mm2, [_EAX+8] + + movq mm1, [TMP1] ; ref1 + movq mm3, [TMP1+8] + pavgusb mm1, [_EBX] ; ref2 + lea TMP1, [TMP1+TMP0] + pavgusb mm3, [_EBX+8] + lea _EBX, [_EBX+TMP0] + + movq mm4, mm0 + lea _EAX, [_EAX+TMP0] + psubusb mm0, mm1 + movq mm5, mm2 + psubusb mm2, mm3 + + psubusb mm1, mm4 + por mm0, mm1 + psubusb mm3, mm5 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0,mm7 + punpckhbw mm1,mm7 + punpcklbw mm2,mm7 + punpckhbw mm3,mm7 + + paddusw mm0,mm1 + paddusw mm2,mm3 + paddusw mm6,mm0 + paddusw mm6,mm2 %endmacro -section .data +%macro SADBI_8x8_3DN 0 + movq mm0, [_EAX] ; src + movq mm2, [_EAX+TMP0] + + movq mm1, [TMP1] ; ref1 + movq mm3, [TMP1+TMP0] + pavgusb mm1, [_EBX] ; ref2 + lea TMP1, [TMP1+2*TMP0] + pavgusb mm3, [_EBX+TMP0] + lea _EBX, [_EBX+2*TMP0] + + movq mm4, mm0 + lea _EAX, [_EAX+2*TMP0] + psubusb mm0, mm1 + movq mm5, mm2 + psubusb mm2, mm3 + + psubusb mm1, mm4 + por mm0, mm1 + psubusb mm3, mm5 + por mm2, mm3 + + movq mm1, mm0 + movq mm3, mm2 + + punpcklbw mm0,mm7 + punpckhbw mm1,mm7 + punpcklbw mm2,mm7 + punpckhbw mm3,mm7 + + paddusw mm0,mm1 + paddusw mm2,mm3 + paddusw mm6,mm0 + paddusw mm6,mm2 +%endmacro -align 16 -mmx_one times 4 dw 1 +;============================================================================= +; Code +;============================================================================= -section .text +SECTION .rotext align=SECTION_ALIGN cglobal sad16bi_3dn cglobal sad8bi_3dn -;=========================================================================== -; -; uint32_t sad16bi_3dn(const uint8_t * const cur, -; const uint8_t * const ref1, -; const uint8_t * const ref2, -; const uint32_t stride); -; -;=========================================================================== - -%macro SADBI_16x16_3DN 0 - movq mm0, [eax] ; src - movq mm2, [eax+8] - - movq mm1, [edx] ; ref1 - movq mm3, [edx+8] - pavgusb mm1, [ebx] ; ref2 - lea edx,[edx+ecx] - pavgusb mm3, [ebx+8] - lea ebx,[ebx+ecx] - - movq mm4, mm0 - lea eax,[eax+ecx] - psubusb mm0, mm1 - movq mm5, mm2 - psubusb mm2, mm3 - - psubusb mm1, mm4 - por mm0, mm1 - psubusb mm3, mm5 - por mm2, mm3 - - movq mm1,mm0 - movq mm3,mm2 - - punpcklbw mm0,mm7 - punpckhbw mm1,mm7 - punpcklbw mm2,mm7 - punpckhbw mm3,mm7 - - paddusw mm0,mm1 - paddusw mm2,mm3 - paddusw mm6,mm0 - paddusw mm6,mm2 -%endmacro - -align 16 -sad16bi_3dn: - push ebx - mov eax, [esp+4+ 4] ; Src - mov edx, [esp+4+ 8] ; Ref1 - mov ebx, [esp+4+12] ; Ref2 - mov ecx, [esp+4+16] ; Stride - - pxor mm6, mm6 ; accum2 -pxor mm7, mm7 -.Loop - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - SADBI_16x16_3DN - - pmaddwd mm6, [mmx_one] ; collapse - movq mm7, mm6 - psrlq mm7, 32 - paddd mm6, mm7 - - movd eax, mm6 - - pop ebx - - ret - - - -;=========================================================================== -; -; uint32_t sad8bi_3dn(const uint8_t * const cur, -; const uint8_t * const ref1, -; const uint8_t * const ref2, -; const uint32_t stride); -; -;=========================================================================== - -%macro SADBI_8x8_3DN 0 - movq mm0, [eax] ; src - movq mm2, [eax+ecx] - - movq mm1, [edx] ; ref1 - movq mm3, [edx+ecx] - pavgusb mm1, [ebx] ; ref2 - lea edx,[edx+2*ecx] - pavgusb mm3, [ebx+ecx] - lea ebx,[ebx+2*ecx] - - movq mm4, mm0 - lea eax,[eax+2*ecx] - psubusb mm0, mm1 - movq mm5, mm2 - psubusb mm2, mm3 - - psubusb mm1, mm4 - por mm0, mm1 - psubusb mm3, mm5 - por mm2, mm3 - - movq mm1,mm0 - movq mm3,mm2 - - punpcklbw mm0,mm7 - punpckhbw mm1,mm7 - punpcklbw mm2,mm7 - punpckhbw mm3,mm7 - - paddusw mm0,mm1 - paddusw mm2,mm3 - paddusw mm6,mm0 - paddusw mm6,mm2 -%endmacro - -align 16 -sad8bi_3dn: - push ebx - mov eax, [esp+4+ 4] ; Src - mov edx, [esp+4+ 8] ; Ref1 - mov ebx, [esp+4+12] ; Ref2 - mov ecx, [esp+4+16] ; Stride - - pxor mm6, mm6 ; accum2 -pxor mm7, mm7 -.Loop - SADBI_8x8_3DN - SADBI_8x8_3DN - SADBI_8x8_3DN - SADBI_8x8_3DN - - pmaddwd mm6, [mmx_one] ; collapse - movq mm7, mm6 - psrlq mm7, 32 - paddd mm6, mm7 - - movd eax, mm6 - - pop ebx +;----------------------------------------------------------------------------- +; +; uint32_t sad16bi_3dn(const uint8_t * const cur, +; const uint8_t * const ref1, +; const uint8_t * const ref2, +; const uint32_t stride); +; +;----------------------------------------------------------------------------- + +ALIGN SECTION_ALIGN +sad16bi_3dn: + mov _EAX, prm1 ; Src + mov TMP1, prm2 ; Ref1 + mov TMP0, prm4 ; Stride + + push _EBX +%ifdef ARCH_IS_X86_64 + mov _EBX, prm3 +%else + mov _EBX, [_ESP+4+12] ; Ref2 +%endif + + pxor mm6, mm6 ; accum2 + pxor mm7, mm7 +.Loop: + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + SADBI_16x16_3DN + + pmaddwd mm6, [mmx_one] ; collapse + movq mm7, mm6 + psrlq mm7, 32 + paddd mm6, mm7 + + movd eax, mm6 + + pop _EBX + + ret +ENDFUNC + +;----------------------------------------------------------------------------- +; +; uint32_t sad8bi_3dn(const uint8_t * const cur, +; const uint8_t * const ref1, +; const uint8_t * const ref2, +; const uint32_t stride); +; +;----------------------------------------------------------------------------- + +ALIGN SECTION_ALIGN +sad8bi_3dn: + mov _EAX, prm1 ; Src + mov TMP1, prm2 ; Ref1 + mov TMP0, prm4 ; Stride + + push _EBX +%ifdef ARCH_IS_X86_64 + mov _EBX, prm3 +%else + mov _EBX, [_ESP+4+12] ; Ref2 +%endif + + pxor mm6, mm6 ; accum2 + pxor mm7, mm7 +.Loop: + SADBI_8x8_3DN + SADBI_8x8_3DN + SADBI_8x8_3DN + SADBI_8x8_3DN + + pmaddwd mm6, [mmx_one] ; collapse + movq mm7, mm6 + psrlq mm7, 32 + paddd mm6, mm7 + + movd eax, mm6 + + pop _EBX + + ret +ENDFUNC + + +%ifidn __OUTPUT_FORMAT__,elf +section ".note.GNU-stack" noalloc noexec nowrite progbits +%endif - ret