--- branches/dev-api-4/xvidcore/src/utils/x86_asm/mem_transfer_3dne.asm	2003/10/27 01:03:43	1191
+++ branches/dev-api-4/xvidcore/src/utils/x86_asm/mem_transfer_3dne.asm	2003/10/28 22:23:03	1192
@@ -1,67 +1,64 @@
-;/**************************************************************************
+;/****************************************************************************
 ; *
-; * XVID MPEG-4 VIDEO CODEC
-; * mmx 8bit<->16bit transfers
+; * XVID MPEG-4 VIDEO CODEC
+; * - 8<->16 bit transfer functions -
 ; *
-; * This program is an implementation of a part of one or more MPEG-4
-; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
-; * to use this software module in hardware or software products are
-; * advised that its use may infringe existing patents or copyrights, and
-; * any such use would be at such party's own risk. The original
-; * developer of this software module and his/her company, and subsequent
-; * editors and their companies, will have no liability for use of this
-; * software or modifications or derivatives thereof.
+; * Copyright (C) 2002 Jaan Kalda
 ; *
-; * This program is free software; you can redistribute it and/or modify
-; * it under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
+; * This program is free software ; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation ; either version 2 of the License, or
+; * (at your option) any later version.
 ; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
 ; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+; * You should have received a copy of the GNU General Public License
+; * along with this program ; if not, write to the Free Software
+; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; *************************************************************************/
-
-; these 3dne functions are compatible with iSSE, but are optimized specifically for
-; K7 pipelines
-;
-;------------------------------------------------------------------------------
-; 09.12.2002 Athlon optimizations contributed by Jaan Kalda
-;------------------------------------------------------------------------------
-
-
-bits 32
-%ifdef FORMAT_COFF
-section .data data
-%else
-section .data data align=16
-%endif
+; * $Id: mem_transfer_3dne.asm,v 1.2.2.1 2003-10-28 22:23:03 edgomez Exp $
+; *
+; ***************************************************************************/
+; these 3dne functions are compatible with iSSE, but are optimized specifically
+; for K7 pipelines

-align 8
-mm_zero:
-dd 0,0
+BITS 32

-
-%macro cglobal 1
+%macro cglobal 1
 %ifdef PREFIX
-	global _%1
+	global _%1
 	%define %1 _%1
 %else
 	global %1
 %endif
 %endmacro
+
+;=============================================================================
+; Read only data
+;=============================================================================
+
+SECTION .rodata
+
+ALIGN 8
+mm_zero:
+	dd 0,0
+;=============================================================================
+; Macros
+;=============================================================================
+
 %macro nop4 0
-DB 08Dh,074h,026h,0
+	db 08Dh, 074h, 026h, 0
 %endmacro

-section .text
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text

 cglobal transfer_8to16copy_3dne
 cglobal transfer_16to8copy_3dne
@@ -71,15 +68,15 @@
 cglobal transfer_16to8add_3dne
 cglobal transfer8x8_copy_3dne

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer_8to16copy_3dne(int16_t * const dst,
 ;                              const uint8_t * const src,
 ;                              uint32_t stride);
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------

-align 16
+ALIGN 16
 transfer_8to16copy_3dne:

   mov eax, [esp+ 8] ; Src
@@ -87,69 +84,69 @@
   mov ecx, [esp+ 4] ; Dst
   punpcklbw mm0, [byte eax]
   punpcklbw mm1, [eax+4]
-  movq mm2,[eax+edx]
-  movq mm3,[eax+edx]
-  pxor mm7,mm7
-  lea eax,[eax+2*edx]
-  punpcklbw mm2,mm7
-  punpckhbw mm3,mm7
-  psrlw mm0,8
-  psrlw mm1,8
+  movq mm2, [eax+edx]
+  movq mm3, [eax+edx]
+  pxor mm7, mm7
+  lea eax, [eax+2*edx]
+  punpcklbw mm2, mm7
+  punpckhbw mm3, mm7
+  psrlw mm0, 8
+  psrlw mm1, 8
   punpcklbw mm4, [eax]
   punpcklbw mm5, [eax+edx+4]
   movq [byte ecx+0*64], mm0
   movq [ecx+0*64+8], mm1
   punpcklbw mm6, [eax+edx]
   punpcklbw mm7, [eax+4]
-  lea eax,[byte eax+2*edx]
-  psrlw mm4,8
-  psrlw mm5,8
+  lea eax, [byte eax+2*edx]
+  psrlw mm4, 8
+  psrlw mm5, 8
   punpcklbw mm0, [eax]
   punpcklbw mm1, [eax+edx+4]
   movq [ecx+0*64+16], mm2
   movq [ecx+0*64+24], mm3
-  psrlw mm6,8
-  psrlw mm7,8
+  psrlw mm6, 8
+  psrlw mm7, 8
   punpcklbw mm2, [eax+edx]
   punpcklbw mm3, [eax+4]
-  lea eax,[byte eax+2*edx]
+  lea eax, [byte eax+2*edx]
   movq [byte ecx+0*64+32], mm4
   movq [ecx+0*64+56], mm5
-  psrlw mm0,8
-  psrlw mm1,8
+  psrlw mm0, 8
+  psrlw mm1, 8
   punpcklbw mm4, [eax]
   punpcklbw mm5, [eax+edx+4]
   movq [byte ecx+0*64+48], mm6
   movq [ecx+0*64+40], mm7
-  psrlw mm2,8
-  psrlw mm3,8
+  psrlw mm2, 8
+  psrlw mm3, 8
   punpcklbw mm6, [eax+edx]
   punpcklbw mm7, [eax+4]
   movq [byte ecx+1*64], mm0
   movq [ecx+1*64+24], mm1
-  psrlw mm4,8
-  psrlw mm5,8
+  psrlw mm4, 8
+  psrlw mm5, 8
   movq [ecx+1*64+16], mm2
   movq [ecx+1*64+8], mm3
-  psrlw mm6,8
-  psrlw mm7,8
+  psrlw mm6, 8
+  psrlw mm7, 8
   movq [byte ecx+1*64+32], mm4
-  movq [ecx+1*64+56], mm5
+  movq [ecx+1*64+56], mm5
   movq [byte ecx+1*64+48], mm6
   movq [ecx+1*64+40], mm7
-ret
-
-
+  ret


-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer_16to8copy_3dne(uint8_t * const dst,
 ;                              const int16_t * const src,
 ;                              uint32_t stride);
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------

-align 16
+ALIGN 16
 transfer_16to8copy_3dne:

   mov eax, [esp+ 8] ; Src
@@ -157,53 +154,42 @@
   mov edx, [esp+12] ; Stride

   movq mm0, [byte eax+0*32]
-  packuswb mm0,[eax+0*32+8]
+  packuswb mm0, [eax+0*32+8]
   movq mm1, [eax+0*32+16]
-  packuswb mm1,[eax+0*32+24]
+  packuswb mm1, [eax+0*32+24]
   movq mm5, [eax+2*32+16]
   movq mm2, [eax+1*32]
-  packuswb mm2, [eax+1*32+8]
+  packuswb mm2, [eax+1*32+8]
   movq mm3, [eax+1*32+16]
   packuswb mm3, [eax+1*32+24]
-  movq mm6, [eax+3*32]
+  movq mm6, [eax+3*32]
   movq mm4, [eax+2*32]
-  packuswb mm4, [eax+2*32+8]
+  packuswb mm4, [eax+2*32+8]
   packuswb mm5, [eax+2*32+24]
   movq mm7, [eax+3*32+16]
   packuswb mm7, [eax+3*32+24]
-  packuswb mm6, [eax+3*32+8]
+  packuswb mm6, [eax+3*32+8]
   movq [ecx], mm0
-  lea eax,[3*edx]
-  add eax,ecx
+  lea eax, [3*edx]
+  add eax, ecx
   movq [ecx+edx], mm1
   movq [ecx+2*edx], mm2
   movq [byte eax], mm3
   movq [ecx+4*edx], mm4
-  lea ecx,[byte ecx+4*edx]
+  lea ecx, [byte ecx+4*edx]
   movq [eax+2*edx], mm5
   movq [eax+4*edx], mm7
   movq [ecx+2*edx], mm6
   ret

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer_8to16sub_3dne(int16_t * const dct,
 ;                             uint8_t * const cur,
 ;                             const uint8_t * const ref,
 ;                             const uint32_t stride);
 ;
-;===========================================================================
-;/**************************************************************************
-; *
-; * History:
-; *
-; * 27.12.2001 renamed from 'compensate' to 'transfer_8to16sub'
-; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar)
-; * 30.11.2001 16 pixels are processed per iteration (Isibaar)
-; * 30.11.2001 .text missing
-; * 06.11.2001 inital version; (c)2001 peter ross
-; *
-; *************************************************************************/
+;-----------------------------------------------------------------------------

 ; when second argument == 1, reference (ebx) block is to current (eax)
 %macro COPY_8_TO_16_SUB 2
@@ -217,7 +203,7 @@
   punpckhbw mm1, mm7
   punpckhbw mm6, mm7
   punpcklbw mm4, mm7
-align 8
+ALIGN 8
   movq mm2, [byte eax+edx]
   punpcklbw mm0, mm7
   movq mm3, [byte eax+edx]
@@ -231,7 +217,7 @@
   movq mm6, mm5
   psubsw mm0, mm4

-%if (%1 < 3)
+%if (%1 < 3)
   lea eax,[eax+2*edx]
   lea ecx,[ecx+2*edx]
 %else
@@ -248,7 +234,7 @@
   movq [edi+%1*32+24], mm3
 %endmacro

-align 16
+ALIGN 16
 transfer_8to16sub_3dne:
   mov eax, [esp + 8] ; Cur
   mov ecx, [esp +12] ; Ref
@@ -257,15 +243,15 @@
   mov edi, [esp+4+ 4] ; Dst
   pxor mm7, mm7
   nop
-align 4
+ALIGN 4
   COPY_8_TO_16_SUB 0, 1
   COPY_8_TO_16_SUB 1, 1
   COPY_8_TO_16_SUB 2, 1
   COPY_8_TO_16_SUB 3, 1
-  mov edi,ecx
+  mov edi, ecx
   ret
-align 16
+ALIGN 16
 transfer_8to16subro_3dne:
   mov eax, [esp + 8] ; Cur
   mov ecx, [esp +12] ; Ref
@@ -274,16 +260,16 @@
   mov edi, [esp+4+ 4] ; Dst
   pxor mm7, mm7
   nop
-align 4
+ALIGN 4
   COPY_8_TO_16_SUB 0, 0
   COPY_8_TO_16_SUB 1, 0
   COPY_8_TO_16_SUB 2, 0
   COPY_8_TO_16_SUB 3, 0
-  mov edi,ecx
+  mov edi, ecx
   ret


-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer_8to16sub2_3dne(int16_t * const dct,
 ;                              uint8_t * const cur,
@@ -291,10 +277,10 @@
 ;                              const uint8_t * ref2,
 ;                              const uint32_t stride)
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------

 %macro COPY_8_TO_16_SUB2_SSE 1
-  db 0Fh, 6Fh, 44h, 20h, 00 ;movq mm0, [byte eax] ; cur
+  db 0Fh, 6Fh, 44h, 20h, 00 ;movq mm0, [byte eax] ; cur
   punpcklbw mm0, mm7
   movq mm2, [byte eax+edx]
   punpcklbw mm2, mm7
@@ -302,7 +288,7 @@
   punpckhbw mm1, mm7
   movq mm3, [byte eax+edx]
   punpckhbw mm3, mm7
-
+
   movq mm4, [byte ebx] ; ref1
   pavgb mm4, [byte esi] ; ref2
   movq mm5, [ebx+edx] ; ref
@@ -318,7 +304,7 @@
   mov esi,[esp]
   mov ebx,[esp+4]
   add esp,byte 8
-%endif
+  %endif
   psubsw mm0, mm4
   psubsw mm1, mm6
   movq mm6, mm5
@@ -332,7 +318,7 @@
   movq [ecx+%1*32+24], mm3
 %endmacro

-align 16
+ALIGN 16
 transfer_8to16sub2_3dne:
   mov edx, [esp +20] ; Stride
   mov ecx, [esp + 4] ; Dst
@@ -343,7 +329,7 @@
   push esi
   pxor mm7, mm7
   mov esi, [esp+8+16] ; Ref2
-  nop4
+  nop4
   COPY_8_TO_16_SUB2_SSE 0
   COPY_8_TO_16_SUB2_SSE 1
   COPY_8_TO_16_SUB2_SSE 2
@@ -352,13 +338,13 @@
   ret


-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer_16to8add_3dne(uint8_t * const dst,
 ;                             const int16_t * const src,
 ;                             uint32_t stride);
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------

 %macro COPY_16_TO_8_ADD 1
   db 0Fh, 6Fh, 44h, 21h, 00 ;movq mm0, [byte ecx]
@@ -375,13 +361,13 @@
   paddsw mm3, [eax+%1*32+24]
   packuswb mm0, mm1
   packuswb mm2, mm3
-  mov esp,esp
+  mov esp,esp
   movq [byte ecx], mm0
   movq [ecx+edx], mm2
 %endmacro


-align 16
+ALIGN 16
 transfer_16to8add_3dne:
   mov ecx, [esp+ 4] ; Dst
   mov edx, [esp+12] ; Stride
@@ -398,14 +384,14 @@
   COPY_16_TO_8_ADD 3
   ret

-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; void transfer8x8_copy_3dne(uint8_t * const dst,
 ;                            const uint8_t * const src,
 ;                            const uint32_t stride);
 ;
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------

 %macro COPY_8_TO_8 0
   movq mm0, [byte eax]
   movq mm1, [eax+edx]
@@ -415,7 +401,7 @@
   movq [ecx+edx], mm1
 %endmacro

-align 16
+ALIGN 16
 transfer8x8_copy_3dne:
   mov eax, [esp+ 8] ; Src
   mov edx, [esp+12] ; Stride
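The C prototypes quoted in the comment banners above are the whole contract of this file: each routine moves one 8x8 block between 8-bit pixels (addressed with a byte stride) and a flat 64-entry int16_t coefficient buffer. As a reading aid only, here is a minimal plain-C sketch of two of these transfers. It is not part of the patch, the names ending in _ref are made up for illustration, and it assumes the usual XviD conventions (8x8 block, contiguous 16-bit destination, saturation when going back to 8 bits).

#include <stdint.h>

/* transfer_8to16copy: widen an 8x8 pixel block into 16-bit coefficients */
static void
transfer_8to16copy_ref(int16_t *dst, const uint8_t *src, uint32_t stride)
{
  int x, y;
  for (y = 0; y < 8; y++, src += stride)
    for (x = 0; x < 8; x++)
      dst[y * 8 + x] = src[x];              /* zero-extend each pixel */
}

/* transfer_16to8add: add the 16-bit residual to the 8-bit prediction,
 * clamping to 0..255 (the job done by packuswb in the MMX code above) */
static void
transfer_16to8add_ref(uint8_t *dst, const int16_t *src, uint32_t stride)
{
  int x, y;
  for (y = 0; y < 8; y++, dst += stride)
    for (x = 0; x < 8; x++) {
      int32_t pel = dst[x] + src[y * 8 + x];
      dst[x] = (uint8_t)(pel < 0 ? 0 : (pel > 255 ? 255 : pel));
    }
}

The sub and sub2 variants follow the same pattern but store cur minus ref (or cur minus the average of two references) into the coefficient buffer, which is why their inner loops in the assembly interleave loads from two or three source pointers.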