--- branches/dev-api-4/xvidcore/src/utils/x86_asm/mem_transfer_3dne.asm	2003/10/27 01:03:43	1191
+++ branches/dev-api-4/xvidcore/src/utils/x86_asm/mem_transfer_3dne.asm	2003/10/28 22:23:03	1192
@@ -1,67 +1,64 @@
-;/**************************************************************************
+;/****************************************************************************
 ; *
-; * XVID MPEG-4 VIDEO CODEC
-; * mmx 8bit<->16bit transfers
+; * XVID MPEG-4 VIDEO CODEC
+; * - 8<->16 bit transfer functions -
 ; *
-; * This program is an implementation of a part of one or more MPEG-4
-; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
-; * to use this software module in hardware or software products are
-; * advised that its use may infringe existing patents or copyrights, and
-; * any such use would be at such party's own risk. The original
-; * developer of this software module and his/her company, and subsequent
-; * editors and their companies, will have no liability for use of this
-; * software or modifications or derivatives thereof.
+; * Copyright (C) 2002 Jaan Kalda
 ; *
-; * This program is free software; you can redistribute it and/or modify
-; * it under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
+; * This program is free software ; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation ; either version 2 of the License, or
+; * (at your option) any later version.
 ; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
 ; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+; * You should have received a copy of the GNU General Public License
+; * along with this program ; if not, write to the Free Software
+; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; *************************************************************************/
-
-; these 3dne functions are compatible with iSSE, but are optimized specifically for
-; K7 pipelines
-;
-;------------------------------------------------------------------------------
-; 09.12.2002 Athlon optimizations contributed by Jaan Kalda
-;------------------------------------------------------------------------------
-
-
-bits 32
-%ifdef FORMAT_COFF
-section .data data
-%else
-section .data data align=16
-%endif
+; * $Id: mem_transfer_3dne.asm,v 1.2.2.1 2003-10-28 22:23:03 edgomez Exp $
+; *
+; ***************************************************************************/
+; these 3dne functions are compatible with iSSE, but are optimized specifically
+; for K7 pipelines

-align 8
-mm_zero:
-dd 0,0
+BITS 32

-
-%macro cglobal 1
+%macro cglobal 1
 %ifdef PREFIX
-	global _%1
+	global _%1
 	%define %1 _%1
 %else
 	global %1
 %endif
 %endmacro
+
+;=============================================================================
+; Read only data
+;=============================================================================
+
+SECTION .rodata
+
+ALIGN 8
+mm_zero:
+	dd 0,0
+;=============================================================================
+; Macros
+;=============================================================================
+
 %macro nop4 0
-DB 08Dh,074h,026h,0
+	db 08Dh, 074h, 026h, 0
 %endmacro

-section .text
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text

 cglobal transfer_8to16copy_3dne
 cglobal transfer_16to8copy_3dne
@@ -71,15 +68,15 @@
 cglobal transfer_16to8add_3dne
 cglobal transfer8x8_copy_3dne

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer_8to16copy_3dne(int16_t * const dst,
 ;                              const uint8_t * const src,
 ;                              uint32_t stride);
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------

-align 16
+ALIGN 16
 transfer_8to16copy_3dne:

   mov eax, [esp+ 8] ; Src
@@ -87,69 +84,69 @@
   mov ecx, [esp+ 4] ; Dst
   punpcklbw mm0, [byte eax]
   punpcklbw mm1, [eax+4]
-  movq mm2,[eax+edx]
-  movq mm3,[eax+edx]
-  pxor mm7,mm7
-  lea eax,[eax+2*edx]
-  punpcklbw mm2,mm7
-  punpckhbw mm3,mm7
-  psrlw mm0,8
-  psrlw mm1,8
+  movq mm2, [eax+edx]
+  movq mm3, [eax+edx]
+  pxor mm7, mm7
+  lea eax, [eax+2*edx]
+  punpcklbw mm2, mm7
+  punpckhbw mm3, mm7
+  psrlw mm0, 8
+  psrlw mm1, 8
   punpcklbw mm4, [eax]
   punpcklbw mm5, [eax+edx+4]
   movq [byte ecx+0*64], mm0
   movq [ecx+0*64+8], mm1
   punpcklbw mm6, [eax+edx]
   punpcklbw mm7, [eax+4]
-  lea eax,[byte eax+2*edx]
-  psrlw mm4,8
-  psrlw mm5,8
+  lea eax, [byte eax+2*edx]
+  psrlw mm4, 8
+  psrlw mm5, 8
   punpcklbw mm0, [eax]
   punpcklbw mm1, [eax+edx+4]
   movq [ecx+0*64+16], mm2
   movq [ecx+0*64+24], mm3
-  psrlw mm6,8
-  psrlw mm7,8
+  psrlw mm6, 8
+  psrlw mm7, 8
   punpcklbw mm2, [eax+edx]
   punpcklbw mm3, [eax+4]
-  lea eax,[byte eax+2*edx]
+  lea eax, [byte eax+2*edx]
   movq [byte ecx+0*64+32], mm4
   movq [ecx+0*64+56], mm5
-  psrlw mm0,8
-  psrlw mm1,8
+  psrlw mm0, 8
+  psrlw mm1, 8
   punpcklbw mm4, [eax]
   punpcklbw mm5, [eax+edx+4]
   movq [byte ecx+0*64+48], mm6
   movq [ecx+0*64+40], mm7
-  psrlw mm2,8
-  psrlw mm3,8
+  psrlw mm2, 8
+  psrlw mm3, 8
   punpcklbw mm6, [eax+edx]
   punpcklbw mm7, [eax+4]
   movq [byte ecx+1*64], mm0
   movq [ecx+1*64+24], mm1
-  psrlw mm4,8
-  psrlw mm5,8
+  psrlw mm4, 8
+  psrlw mm5, 8
   movq [ecx+1*64+16], mm2
   movq [ecx+1*64+8], mm3
-  psrlw mm6,8
-  psrlw mm7,8
+  psrlw mm6, 8
+  psrlw mm7, 8
   movq [byte ecx+1*64+32], mm4
-  movq [ecx+1*64+56], mm5
+  movq [ecx+1*64+56], mm5
   movq [byte ecx+1*64+48], mm6
   movq [ecx+1*64+40], mm7
-ret
-
-
+  ret


-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer_16to8copy_3dne(uint8_t * const dst,
 ;                              const int16_t * const src,
 ;                              uint32_t stride);
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------

-align 16
+ALIGN 16
 transfer_16to8copy_3dne:

   mov eax, [esp+ 8] ; Src
@@ -157,53 +154,42 @@
   mov edx, [esp+12] ; Stride

   movq mm0, [byte eax+0*32]
-  packuswb mm0,[eax+0*32+8]
+  packuswb mm0, [eax+0*32+8]
   movq mm1, [eax+0*32+16]
-  packuswb mm1,[eax+0*32+24]
+  packuswb mm1, [eax+0*32+24]
   movq mm5, [eax+2*32+16]
   movq mm2, [eax+1*32]
-  packuswb mm2, [eax+1*32+8]
+  packuswb mm2, [eax+1*32+8]
   movq mm3, [eax+1*32+16]
   packuswb mm3, [eax+1*32+24]
-  movq mm6, [eax+3*32]
+  movq mm6, [eax+3*32]
   movq mm4, [eax+2*32]
-  packuswb mm4, [eax+2*32+8]
+  packuswb mm4, [eax+2*32+8]
   packuswb mm5, [eax+2*32+24]
   movq mm7, [eax+3*32+16]
   packuswb mm7, [eax+3*32+24]
-  packuswb mm6, [eax+3*32+8]
+  packuswb mm6, [eax+3*32+8]
   movq [ecx], mm0
-  lea eax,[3*edx]
-  add eax,ecx
+  lea eax, [3*edx]
+  add eax, ecx
   movq [ecx+edx], mm1
   movq [ecx+2*edx], mm2
   movq [byte eax], mm3
   movq [ecx+4*edx], mm4
-  lea ecx,[byte ecx+4*edx]
+  lea ecx, [byte ecx+4*edx]
   movq [eax+2*edx], mm5
   movq [eax+4*edx], mm7
   movq [ecx+2*edx], mm6
   ret

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer_8to16sub_3dne(int16_t * const dct,
 ;                             uint8_t * const cur,
 ;                             const uint8_t * const ref,
 ;                             const uint32_t stride);
 ;
-;===========================================================================
-;/**************************************************************************
-; *
-; * History:
-; *
-; * 27.12.2001 renamed from 'compensate' to 'transfer_8to16sub'
-; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar)
-; * 30.11.2001 16 pixels are processed per iteration (Isibaar)
-; * 30.11.2001 .text missing
-; * 06.11.2001 inital version; (c)2001 peter ross
-; *
-; *************************************************************************/
+;-----------------------------------------------------------------------------

 ; when second argument == 1, reference (ebx) block is to current (eax)
 %macro COPY_8_TO_16_SUB 2
@@ -217,7 +203,7 @@
   punpckhbw mm1, mm7
   punpckhbw mm6, mm7
   punpcklbw mm4, mm7
-align 8
+ALIGN 8
   movq mm2, [byte eax+edx]
   punpcklbw mm0, mm7
   movq mm3, [byte eax+edx]
@@ -231,7 +217,7 @@
   movq mm6, mm5
   psubsw mm0, mm4

-%if (%1 < 3)
+%if (%1 < 3)
   lea eax,[eax+2*edx]
   lea ecx,[ecx+2*edx]
 %else
@@ -248,7 +234,7 @@
   movq [edi+%1*32+24], mm3
 %endmacro

-align 16
+ALIGN 16
 transfer_8to16sub_3dne:
   mov eax, [esp + 8] ; Cur
   mov ecx, [esp +12] ; Ref
@@ -257,15 +243,15 @@
   mov edi, [esp+4+ 4] ; Dst
   pxor mm7, mm7
   nop
-align 4
+ALIGN 4
   COPY_8_TO_16_SUB 0, 1
   COPY_8_TO_16_SUB 1, 1
   COPY_8_TO_16_SUB 2, 1
   COPY_8_TO_16_SUB 3, 1
-  mov edi,ecx
+  mov edi, ecx
   ret
-align 16
+ALIGN 16
 transfer_8to16subro_3dne:
   mov eax, [esp + 8] ; Cur
   mov ecx, [esp +12] ; Ref
@@ -274,16 +260,16 @@
   mov edi, [esp+4+ 4] ; Dst
   pxor mm7, mm7
   nop
-align 4
+ALIGN 4
   COPY_8_TO_16_SUB 0, 0
   COPY_8_TO_16_SUB 1, 0
   COPY_8_TO_16_SUB 2, 0
   COPY_8_TO_16_SUB 3, 0
-  mov edi,ecx
+  mov edi, ecx
   ret


-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer_8to16sub2_3dne(int16_t * const dct,
 ;                              uint8_t * const cur,
@@ -291,10 +277,10 @@
 ;                              const uint8_t * ref2,
 ;                              const uint32_t stride)
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------

 %macro COPY_8_TO_16_SUB2_SSE 1
-  db 0Fh, 6Fh, 44h, 20h, 00 ;movq mm0, [byte eax] ; cur
+  db 0Fh, 6Fh, 44h, 20h, 00 ;movq mm0, [byte eax] ; cur
   punpcklbw mm0, mm7
   movq mm2, [byte eax+edx]
   punpcklbw mm2, mm7
@@ -302,7 +288,7 @@
   punpckhbw mm1, mm7
   movq mm3, [byte eax+edx]
   punpckhbw mm3, mm7
-
+
   movq mm4, [byte ebx] ; ref1
   pavgb mm4, [byte esi] ; ref2
   movq mm5, [ebx+edx] ; ref
@@ -318,7 +304,7 @@
   mov esi,[esp]
   mov ebx,[esp+4]
   add esp,byte 8
-%endif
+  %endif
   psubsw mm0, mm4
   psubsw mm1, mm6
   movq mm6, mm5
@@ -332,7 +318,7 @@
   movq [ecx+%1*32+24], mm3
 %endmacro

-align 16
+ALIGN 16
 transfer_8to16sub2_3dne:
   mov edx, [esp +20] ; Stride
   mov ecx, [esp + 4] ; Dst
@@ -343,7 +329,7 @@
   push esi
   pxor mm7, mm7
   mov esi, [esp+8+16] ; Ref2
-  nop4
+  nop4
   COPY_8_TO_16_SUB2_SSE 0
   COPY_8_TO_16_SUB2_SSE 1
   COPY_8_TO_16_SUB2_SSE 2
@@ -352,13 +338,13 @@
   ret


-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer_16to8add_3dne(uint8_t * const dst,
 ;                             const int16_t * const src,
 ;                             uint32_t stride);
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------

 %macro COPY_16_TO_8_ADD 1
   db 0Fh, 6Fh, 44h, 21h, 00 ;movq mm0, [byte ecx]
@@ -375,13 +361,13 @@
   paddsw mm3, [eax+%1*32+24]
   packuswb mm0, mm1
   packuswb mm2, mm3
-  mov esp,esp
+  mov esp,esp
   movq [byte ecx], mm0
   movq [ecx+edx], mm2
 %endmacro


-align 16
+ALIGN 16
 transfer_16to8add_3dne:
   mov ecx, [esp+ 4] ; Dst
   mov edx, [esp+12] ; Stride
@@ -398,14 +384,14 @@
   COPY_16_TO_8_ADD 3
   ret

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer8x8_copy_3dne(uint8_t * const dst,
 ;                            const uint8_t * const src,
 ;                            const uint32_t stride);
 ;
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------

 %macro COPY_8_TO_8 0
   movq mm0, [byte eax]
   movq mm1, [eax+edx]
@@ -415,7 +401,7 @@
   movq [ecx+edx], mm1
 %endmacro

-align 16
+ALIGN 16
 transfer8x8_copy_3dne:
   mov eax, [esp+ 8] ; Src
   mov edx, [esp+12] ; Stride
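The C prototypes quoted in the comment banners above are the whole contract of this file: each routine moves one 8x8 block between 8-bit pixels (addressed with a byte stride) and a flat 64-entry int16_t coefficient buffer. As a reading aid only, here is a minimal plain-C sketch of two of these transfers. It is not part of the patch, the names ending in _ref are made up for illustration, and it assumes the usual XviD conventions (8x8 block, contiguous 16-bit destination, saturation when going back to 8 bits).

#include <stdint.h>

/* transfer_8to16copy: widen an 8x8 pixel block into 16-bit coefficients */
static void
transfer_8to16copy_ref(int16_t *dst, const uint8_t *src, uint32_t stride)
{
  int x, y;
  for (y = 0; y < 8; y++, src += stride)
    for (x = 0; x < 8; x++)
      dst[y * 8 + x] = src[x];              /* zero-extend each pixel */
}

/* transfer_16to8add: add the 16-bit residual to the 8-bit prediction,
 * clamping to 0..255 (the job done by packuswb in the MMX code above) */
static void
transfer_16to8add_ref(uint8_t *dst, const int16_t *src, uint32_t stride)
{
  int x, y;
  for (y = 0; y < 8; y++, dst += stride)
    for (x = 0; x < 8; x++) {
      int32_t pel = dst[x] + src[y * 8 + x];
      dst[x] = (uint8_t)(pel < 0 ? 0 : (pel > 255 ? 255 : pel));
    }
}

The sub and sub2 variants follow the same pattern but store cur minus ref (or cur minus the average of two references) into the coefficient buffer, which is why their inner loops in the assembly interleave loads from two or three source pointers.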