--- trunk/xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm	2002/11/17 00:51:11	654
+++ trunk/xvidcore/src/utils/x86_asm/mem_transfer_mmx.asm	2003/02/15 15:22:19	851
@@ -3,52 +3,28 @@
 ; * XVID MPEG-4 VIDEO CODEC
 ; * mmx 8bit<->16bit transfers
 ; *
-; * This file is part of XviD, a free MPEG-4 video encoder/decoder
-; *
-; * XviD is free software; you can redistribute it and/or modify it
-; * under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
-; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
-; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-; *
-; * Under section 8 of the GNU General Public License, the copyright
-; * holders of XVID explicitly forbid distribution in the following
-; * countries:
-; *
-; * - Japan
-; * - United States of America
-; *
-; * Linking XviD statically or dynamically with other modules is making a
-; * combined work based on XviD. Thus, the terms and conditions of the
-; * GNU General Public License cover the whole combination.
-; *
-; * As a special exception, the copyright holders of XviD give you
-; * permission to link XviD with independent modules that communicate with
-; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
-; * license terms of these independent modules, and to copy and distribute
-; * the resulting combined work under terms of your choice, provided that
-; * every copy of the combined work is accompanied by a complete copy of
-; * the source code of XviD (the version of XviD used to produce the
-; * combined work), being distributed under the terms of the GNU General
-; * Public License plus this exception. An independent module is a module
-; * which is not derived from or based on XviD.
-; *
-; * Note that people who make modified versions of XviD are not obligated
-; * to grant this special exception for their modified versions; it is
-; * their choice whether to do so. The GNU General Public License gives
-; * permission to release a modified version without this exception; this
-; * exception also makes it possible to release a modified version which
-; * carries forward this exception.
-; *
-; * $Id: mem_transfer_mmx.asm,v 1.8 2002-11-17 00:51:11 edgomez Exp $
+; * This program is an implementation of a part of one or more MPEG-4
+; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
+; * to use this software module in hardware or software products are
+; * advised that its use may infringe existing patents or copyrights, and
+; * any such use would be at such party's own risk. The original
+; * developer of this software module and his/her company, and subsequent
+; * editors and their companies, will have no liability for use of this
+; * software or modifications or derivatives thereof.
+; *
+; * This program is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 2 of the License, or
+; * (at your option) any later version.
+; *
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 ; *
 ; *************************************************************************/
@@ -59,7 +35,7 @@
 ; * 04.06.2002 speed enhancement (unroll+overlap). -Skal-
 ; *             + added transfer_8to16sub2_mmx/xmm
 ; * 07.01.2002 merge functions from compensate_mmx; rename functions
-; * 07.11.2001 initial version; (c)2001 peter ross 
+; * 07.11.2001 initial version; (c)2001 peter ross
 ; *
 ; *************************************************************************/
@@ -81,6 +57,7 @@
 cglobal transfer_8to16copy_mmx
 cglobal transfer_16to8copy_mmx
 cglobal transfer_8to16sub_mmx
+cglobal transfer_8to16subro_mmx
 cglobal transfer_8to16sub2_mmx
 cglobal transfer_8to16sub2_xmm
 cglobal transfer_16to8add_mmx
@@ -175,11 +152,12 @@
 ; * 02.12.2001 loop unrolled, code runs 10% faster now (Isibaar)
 ; * 30.11.2001 16 pixels are processed per iteration (Isibaar)
 ; * 30.11.2001 .text missing
-; * 06.11.2001 initial version; (c)2001 peter ross 
+; * 06.11.2001 initial version; (c)2001 peter ross
 ; *
 ; *************************************************************************/
 
-%macro COPY_8_TO_16_SUB 1
+; when second argument == 1, reference (ebx) block is copied to current (eax)
+%macro COPY_8_TO_16_SUB 2
   movq mm0, [eax]      ; cur
   movq mm2, [eax+edx]
   movq mm1, mm0
@@ -188,13 +166,15 @@
   punpcklbw mm0, mm7
   punpcklbw mm2, mm7
   movq mm4, [ebx]      ; ref
-  punpckhbw mm1, mm7
-  punpckhbw mm3, mm7
+  punpckhbw mm1, mm7
+  punpckhbw mm3, mm7
   movq mm5, [ebx+edx]  ; ref
 
   movq mm6, mm4
+%if %2 == 1
   movq [eax], mm4
   movq [eax+edx], mm5
+%endif
   punpcklbw mm4, mm7
   punpckhbw mm6, mm7
   psubsw mm0, mm4
@@ -208,9 +188,9 @@
   lea ebx, [ebx+2*edx]
 
   movq [ecx+%1*32+ 0], mm0 ; dst
-  movq [ecx+%1*32+ 8], mm1
-  movq [ecx+%1*32+16], mm2
-  movq [ecx+%1*32+24], mm3
+  movq [ecx+%1*32+ 8], mm1
+  movq [ecx+%1*32+16], mm2
+  movq [ecx+%1*32+24], mm3
 %endmacro
 
 align 16
@@ -222,14 +202,33 @@
   mov edx, [esp+4+16] ; Stride
   pxor mm7, mm7
 
-  COPY_8_TO_16_SUB 0
-  COPY_8_TO_16_SUB 1
-  COPY_8_TO_16_SUB 2
-  COPY_8_TO_16_SUB 3
+  COPY_8_TO_16_SUB 0, 1
+  COPY_8_TO_16_SUB 1, 1
+  COPY_8_TO_16_SUB 2, 1
+  COPY_8_TO_16_SUB 3, 1
+
+  pop ebx
+  ret
+
+
+align 16
+transfer_8to16subro_mmx:
+  mov ecx, [esp + 4]  ; Dst
+  mov eax, [esp + 8]  ; Cur
+  push ebx
+  mov ebx, [esp+4+12] ; Ref
+  mov edx, [esp+4+16] ; Stride
+  pxor mm7, mm7
+
+  COPY_8_TO_16_SUB 0, 0
+  COPY_8_TO_16_SUB 1, 0
+  COPY_8_TO_16_SUB 2, 0
+  COPY_8_TO_16_SUB 3, 0
 
   pop ebx
   ret
 
+
 ;===========================================================================
 ;
 ; void transfer_8to16sub2_mmx(int16_t * const dct,
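
What the patch does, in short: COPY_8_TO_16_SUB gains a second argument, transfer_8to16sub_mmx keeps its old behaviour by passing 1, and the new transfer_8to16subro_mmx ("ro" = read-only) passes 0 so the two movq stores into [eax] are skipped. A plain-C sketch of the two routines follows. It is an illustration only: the prototypes are an assumption modelled on the "void transfer_8to16sub2_mmx(int16_t * const dct, ..." comment visible in the last hunk (8x8 block, dct stored row-major as 64 int16_t), and the _c-suffixed names are hypothetical, not XviD's actual C reference code.

#include <stdint.h>

/* dct = cur - ref for an 8x8 block, and ref is copied over cur:
 * the %if %2 == 1 path of COPY_8_TO_16_SUB. */
static void transfer_8to16sub_c(int16_t *const dct, uint8_t *const cur,
                                const uint8_t *ref, const uint32_t stride)
{
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++) {
      const uint8_t r = ref[i * stride + j];
      dct[i * 8 + j] = (int16_t)(cur[i * stride + j] - r); /* difference block */
      cur[i * stride + j] = r;   /* write the reference back into cur */
    }
}

/* Read-only variant added by this patch: same difference block,
 * cur is left untouched (second macro argument == 0). */
static void transfer_8to16subro_c(int16_t *const dct, const uint8_t *cur,
                                  const uint8_t *ref, const uint32_t stride)
{
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++)
      dct[i * 8 + j] = (int16_t)(cur[i * stride + j] - ref[i * stride + j]);
}

The asm takes the difference with psubsw (saturating 16-bit subtract), but since both operands are zero-extended bytes the result always fits in 16 bits, so the plain subtraction above is equivalent. The read-only variant is presumably for callers that still need the unmodified current block after extracting the residue, whereas transfer_8to16sub doubles as the compensation step by leaving the prediction in cur.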