--- trunk/xvidcore/src/motion/x86_asm/sad_xmm.asm	2002/07/07 09:45:40	262
+++ trunk/xvidcore/src/motion/x86_asm/sad_xmm.asm	2002/09/06 16:59:47	430
@@ -1,37 +1,39 @@
-;/**************************************************************************
+;/*****************************************************************************
 ; *
-; *	XVID MPEG-4 VIDEO CODEC
-; *	xmm sum of absolute difference
+; *  XVID MPEG-4 VIDEO CODEC
+; *  xmm (extended mmx) sum of absolute difference
 ; *
-; *	This program is free software; you can redistribute it and/or modify
-; *	it under the terms of the GNU General Public License as published by
-; *	the Free Software Foundation; either version 2 of the License, or
-; *	(at your option) any later version.
+; *  Copyright(C) 2002 Peter Ross <pross@xvid.org>
+; *  Copyright(C) 2002 Michael Militzer <michael@xvid.org>
+; *  Copyright(C) 2002 -Skal-
 ; *
-; *	This program is distributed in the hope that it will be useful,
-; *	but WITHOUT ANY WARRANTY; without even the implied warranty of
-; *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-; *	GNU General Public License for more details.
+; *  This program is an implementation of a part of one or more MPEG-4
+; *  Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
+; *  to use this software module in hardware or software products are
+; *  advised that its use may infringe existing patents or copyrights, and
+; *  any such use would be at such party's own risk.  The original
+; *  developer of this software module and his/her company, and subsequent
+; *  editors and their companies, will have no liability for use of this
+; *  software or modifications or derivatives thereof.
 ; *
-; *	You should have received a copy of the GNU General Public License
-; *	along with this program; if not, write to the Free Software
-; *	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+; *  This program is free software; you can redistribute it and/or modify
+; *  it under the terms of the GNU General Public License as published by
+; *  the Free Software Foundation; either version 2 of the License, or
+; *  (at your option) any later version.
 ; *
-; *************************************************************************/
-
-;/**************************************************************************
+; *  This program is distributed in the hope that it will be useful,
+; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+; *  GNU General Public License for more details.
 ; *
-; *	History:
-; *
-; * 23.07.2002	sad8bi_xmm; <pross@xvid.org>
-; * 04.06.2002  rewrote some funcs (XMM mainly)     -Skal-
-; * 17.11.2001  bugfix and small improvement for dev16_xmm,
-; *             removed terminate early in sad16_xmm (Isibaar)
-; *	12.11.2001	inital version; (c)2001 peter ross <pross@cs.rmit.edu.au>
+; *  You should have received a copy of the GNU General Public License
+; *  along with this program; if not, write to the Free Software
+; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 ; *
 ; *************************************************************************/
 
 bits 32
+
 %macro cglobal 1 
 	%ifdef PREFIX
 		global _%1 
@@ -58,8 +60,9 @@
 ;
 ; uint32_t sad16_xmm(const uint8_t * const cur,
 ;					const uint8_t * const ref,
+;					const uint32_t stride,
 ;					const uint32_t best_sad);
-cglobal  sad8_xmm
+;
 ;===========================================================================
 
 %macro SAD_16x16_SSE 0
@@ -114,6 +117,60 @@
 ;
 ;===========================================================================
 
+%macro SAD_8x8_SSE 0
+    movq mm0, [eax]            ; one step handles two 8-byte rows: the row at eax ...
+    movq mm1, [eax+ecx]        ; ... and the row ecx bytes further on
+    ; psadbw sums the absolute differences of the 8 byte pairs into the low word
+    psadbw mm0, [edx]          ; mm0 = SAD of [eax] row vs [edx] row
+    psadbw mm1, [edx+ecx]      ; mm1 = SAD of [eax+ecx] row vs [edx+ecx] row
+    add eax, ebx               ; step both pointers by ebx (sad8_xmm sets ebx = 2*ecx)
+    add edx, ebx
+
+    paddusw mm5,mm0            ; the two rows accumulate separately, in mm5 and mm6
+    paddusw mm6,mm1
+%endmacro
+
+align 16
+sad8_xmm:
+    ; Returns in eax the sum of absolute byte differences over 8 rows of 8 bytes
+    mov eax, [esp+ 4] ; Src1
+    mov edx, [esp+ 8] ; Src2
+    mov ecx, [esp+12] ; Stride
+    push ebx          ; saved after the argument loads, so the offsets above need no +4
+    lea ebx, [ecx+ecx] ; ebx = 2*Stride, the per-step pointer advance used by SAD_8x8_SSE
+    
+    pxor mm5, mm5 ; accum1
+    pxor mm6, mm6 ; accum2
+    ; rows 0-5: three two-row macro steps
+    SAD_8x8_SSE
+    SAD_8x8_SSE
+    SAD_8x8_SSE
+    ; rows 6-7: same work inlined, without the now-unneeded pointer advance
+    movq mm0, [eax]
+    movq mm1, [eax+ecx]
+    psadbw mm0, [edx]
+    psadbw mm1, [edx+ecx]
+
+    pop ebx           ; ebx is no longer needed; restore the caller's value
+
+    paddusw mm5,mm0
+    paddusw mm6,mm1
+
+    paddusw mm6,mm5   ; total; at most 64*255 = 16320, so the saturating add never clips
+    movd eax, mm6     ; return value = low 32 bits of the combined accumulator
+
+    ret               ; NOTE(review): no emms is issued here - presumably the caller does it; confirm
+
+
+;===========================================================================
+;
+; uint32_t sad16bi_xmm(const uint8_t * const cur,
+;					const uint8_t * const ref1,
+;					const uint8_t * const ref2,
+;					const uint32_t stride);
+;
+;===========================================================================
+
 %macro SADBI_16x16_SSE 0
     movq mm0, [eax]
     movq mm1, [eax+8]
@@ -179,57 +236,58 @@
 ; 
 ;=========================================================================== 
 
+%macro SADBI_8x8_XMM 0 
+   movq mm0, [eax]             ; one step handles two 8-byte rows of the block at eax
+   movq mm1, [eax+ecx] 
+
+   movq mm2, [edx]             ; the matching two rows of the block at edx
+   movq mm3, [edx+ecx] 
+
+   pavgb mm2, [ebx]            ; mm2 = rounded byte-wise average of the [edx] and [ebx] rows
+   lea edx, [edx+2*ecx]        ; pointer advances (2*ecx each) are interleaved with the MMX ops
+
+   pavgb mm3, [ebx+ecx]        ; same average for the second row
+   lea ebx, [ebx+2*ecx] 
+
+   psadbw mm0, mm2             ; SAD of the eax row against the averaged row
+   lea eax, [eax+2*ecx] 
+
+   psadbw mm1, mm3 
+   paddusw mm5,mm0             ; the two rows accumulate separately, in mm5 and mm6
+
+   paddusw mm6,mm1 
+%endmacro 
+
+align 16 
+sad8bi_xmm: 
+   push ebx                    ; ebx carries Ref2 below; preserve the caller's value
+   mov eax, [esp+4+ 4] ; Src    (the extra +4 skips the ebx just pushed)
+   mov edx, [esp+4+ 8] ; Ref1 
+   mov ebx, [esp+4+12] ; Ref2 
+   mov ecx, [esp+4+16] ; Stride 
+   ; Returns in eax the 8x8 SAD of Src against the byte-wise average of Ref1 and Ref2
+   pxor mm5, mm5 ; accum1 
+   pxor mm6, mm6 ; accum2 
+.Loop:                         ; not a jump target in this routine; the 8 rows are fully unrolled
+   SADBI_8x8_XMM 
+   SADBI_8x8_XMM 
+   SADBI_8x8_XMM 
+   SADBI_8x8_XMM 
+   ; four two-row steps done: fold the two partial sums together
+   paddusw mm6,mm5             ; total; at most 64*255 = 16320, so the saturating add never clips
+   movd eax, mm6               ; return value = low 32 bits of the combined accumulator
+   pop ebx 
+   ret                         ; NOTE(review): no emms is issued here - presumably the caller does it; confirm
+
+
 ;===========================================================================
 ;
-; uint32_t sad8_xmm(const uint8_t * const cur,
-;					const uint8_t * const ref,
+; uint32_t dev16_xmm(const uint8_t * const cur,
 ;					const uint32_t stride);
 ;
 ;===========================================================================
 
-%macro SAD_8x8_SSE 0
-    movq mm0, [eax]
-    movq mm1, [eax+ecx]
 %macro MEAN_16x16_SSE 0
-    psadbw mm0, [edx]
-    psadbw mm1, [edx+ecx]
-    add eax, ebx
-    add edx, ebx
-
-    paddusw mm5,mm0
-    paddusw mm6,mm1
-%endmacro
-
-align 16
-sad8_xmm:
-
-    mov eax, [esp+ 4] ; Src1
-    mov edx, [esp+ 8] ; Src2
-    mov ecx, [esp+12] ; Stride
-    push ebx
-    lea ebx, [ecx+ecx]
-    
-    pxor mm5, mm5 ; accum1
-    pxor mm6, mm6 ; accum2
-
-    SAD_8x8_SSE
-    SAD_8x8_SSE
-    SAD_8x8_SSE
-
-    movq mm0, [eax]
-    movq mm1, [eax+ecx]
-    psadbw mm0, [edx]
-    psadbw mm1, [edx+ecx]
-
-    pop ebx
-
-    paddusw mm5,mm0
-    paddusw mm6,mm1
-
-    paddusw mm6,mm5
-    movd eax, mm6
-
-    ret
     movq mm0, [eax]
     movq mm1, [eax+8]
     psadbw mm0, mm7
@@ -318,4 +376,4 @@
 	paddd mm6, mm7
 
     movd eax, mm6
-    ret
\ No newline at end of file
+    ret