--- trunk/xvidcore/src/motion/x86_asm/sad_xmm.asm	2002/07/07 09:45:40	262
+++ branches/dev-api-4/xvidcore/src/motion/x86_asm/sad_xmm.asm	2003/02/21 14:49:29	886
@@ -3,6 +3,15 @@
 ; *	XVID MPEG-4 VIDEO CODEC
 ; *	xmm sum of absolute difference
 ; *
+; *	This program is an implementation of a part of one or more MPEG-4
+; *	Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
+; *	to use this software module in hardware or software products are
+; *	advised that its use may infringe existing patents or copyrights, and
+; *	any such use would be at such party's own risk.  The original
+; *	developer of this software module and his/her company, and subsequent
+; *	editors and their companies, will have no liability for use of this
+; *	software or modifications or derivatives thereof.
+; *
 ; *	This program is free software; you can redistribute it and/or modify
 ; *	it under the terms of the GNU General Public License as published by
 ; *	the Free Software Foundation; either version 2 of the License, or
@@ -32,6 +41,7 @@
 ; *************************************************************************/
 
 bits 32
+
 %macro cglobal 1 
 	%ifdef PREFIX
 		global _%1 
@@ -58,8 +68,9 @@
 ;
 ; uint32_t sad16_xmm(const uint8_t * const cur,
 ;					const uint8_t * const ref,
+;					const uint32_t stride,
 ;					const uint32_t best_sad);
-cglobal  sad8_xmm
+;
 ;===========================================================================
 
 %macro SAD_16x16_SSE 0
@@ -114,6 +125,60 @@
 ;
 ;===========================================================================
 
+%macro SAD_8x8_SSE 0
+    movq mm0, [eax]
+    movq mm1, [eax+ecx]
+    ; One step over two rows of 8 bytes: psadbw sums |a-b| across each row.
+    psadbw mm0, [edx]
+    psadbw mm1, [edx+ecx]
+    add eax, ebx ; both pointers step by ebx (sad8_xmm loads it with 2*ecx)
+    add edx, ebx
+    ; Two separate running sums (mm5, mm6), added with unsigned word saturation.
+    paddusw mm5,mm0
+    paddusw mm6,mm1
+%endmacro
+
+align 16
+sad8_xmm:
+    ; Arguments are fetched before "push ebx", so offsets are from the entry esp.
+    mov eax, [esp+ 4] ; Src1
+    mov edx, [esp+ 8] ; Src2
+    mov ecx, [esp+12] ; Stride
+    push ebx ; ebx is overwritten next; restored by the pop further down
+    lea ebx, [ecx+ecx] ; ebx = 2*Stride, the pointer step used by SAD_8x8_SSE
+    ; Zero both running sums.
+    pxor mm5, mm5 ; accum1
+    pxor mm6, mm6 ; accum2
+
+    SAD_8x8_SSE
+    SAD_8x8_SSE
+    SAD_8x8_SSE
+    ; Fourth row pair is written out inline, without the pointer advance.
+    movq mm0, [eax]
+    movq mm1, [eax+ecx]
+    psadbw mm0, [edx]
+    psadbw mm1, [edx+ecx]
+
+    pop ebx
+
+    paddusw mm5,mm0
+    paddusw mm6,mm1
+    ; Merge the two sums; the low dword of mm6 is returned in eax.
+    paddusw mm6,mm5
+    movd eax, mm6
+    ; NOTE(review): no emms before ret; presumably the caller clears MMX state - confirm.
+    ret
+
+
+;===========================================================================
+;
+; uint32_t sad16bi_xmm(const uint8_t * const cur,
+;					const uint8_t * const ref1,
+;					const uint8_t * const ref2,
+;					const uint32_t stride);
+;
+;===========================================================================
+
 %macro SADBI_16x16_SSE 0
     movq mm0, [eax]
     movq mm1, [eax+8]
@@ -179,57 +244,58 @@
 ; 
 ;=========================================================================== 
 
+%macro SADBI_8x8_XMM 0 
+   movq mm0, [eax] 
+   movq mm1, [eax+ecx] 
+   ; Load the matching two rows from edx, then average them with the rows at ebx.
+   movq mm2, [edx] 
+   movq mm3, [edx+ecx] 
+
+   pavgb mm2, [ebx] ; rounded unsigned byte average of the edx row and the ebx row
+   lea edx, [edx+2*ecx] ; pointer advances (2*ecx each) are interleaved with the math
+
+   pavgb mm3, [ebx+ecx] 
+   lea ebx, [ebx+2*ecx] 
+   ; Sum of absolute differences of the eax rows against the averaged rows.
+   psadbw mm0, mm2 
+   lea eax, [eax+2*ecx] 
+
+   psadbw mm1, mm3 
+   paddusw mm5,mm0 ; running sums: first row of each pair into mm5, second into mm6
+
+   paddusw mm6,mm1 
+%endmacro 
+
+align 16 
+sad8bi_xmm: 
+   push ebx ; ebx is loaded with Ref2 below; restored before ret
+   mov eax, [esp+4+ 4] ; Src 
+   mov edx, [esp+4+ 8] ; Ref1 
+   mov ebx, [esp+4+12] ; Ref2 
+   mov ecx, [esp+4+16] ; Stride 
+   ; The extra +4 in each offset accounts for the ebx pushed above.
+   pxor mm5, mm5 ; accum1 
+   pxor mm6, mm6 ; accum2 
+.Loop: ; nothing in this function branches here; colon added so NASM does not flag an orphan label
+   SADBI_8x8_XMM 
+   SADBI_8x8_XMM 
+   SADBI_8x8_XMM 
+   SADBI_8x8_XMM 
+   ; Four expansions of two rows each cover 8 rows. Merge the sums; return the low dword.
+   paddusw mm6,mm5 
+   movd eax, mm6 
+   pop ebx 
+   ret 
+
+
 ;===========================================================================
 ;
-; uint32_t sad8_xmm(const uint8_t * const cur,
-;					const uint8_t * const ref,
+; uint32_t dev16_xmm(const uint8_t * const cur,
 ;					const uint32_t stride);
 ;
 ;===========================================================================
 
-%macro SAD_8x8_SSE 0
-    movq mm0, [eax]
-    movq mm1, [eax+ecx]
 %macro MEAN_16x16_SSE 0
-    psadbw mm0, [edx]
-    psadbw mm1, [edx+ecx]
-    add eax, ebx
-    add edx, ebx
-
-    paddusw mm5,mm0
-    paddusw mm6,mm1
-%endmacro
-
-align 16
-sad8_xmm:
-
-    mov eax, [esp+ 4] ; Src1
-    mov edx, [esp+ 8] ; Src2
-    mov ecx, [esp+12] ; Stride
-    push ebx
-    lea ebx, [ecx+ecx]
-    
-    pxor mm5, mm5 ; accum1
-    pxor mm6, mm6 ; accum2
-
-    SAD_8x8_SSE
-    SAD_8x8_SSE
-    SAD_8x8_SSE
-
-    movq mm0, [eax]
-    movq mm1, [eax+ecx]
-    psadbw mm0, [edx]
-    psadbw mm1, [edx+ecx]
-
-    pop ebx
-
-    paddusw mm5,mm0
-    paddusw mm6,mm1
-
-    paddusw mm6,mm5
-    movd eax, mm6
-
-    ret
     movq mm0, [eax]
     movq mm1, [eax+8]
     psadbw mm0, mm7
@@ -291,6 +357,7 @@
 
     mov eax, [esp+ 4] ; Src
 
+
     pxor mm5, mm5 ; sums
     pxor mm6, mm6
 
@@ -318,4 +385,58 @@
 	paddd mm6, mm7
 
     movd eax, mm6
-    ret
\ No newline at end of file
+    ret
+
+cglobal sad16v_xmm
+
+;===========================================================================
+;int sad16v_xmm(const uint8_t * const cur,
+;               const uint8_t * const ref,
+;               const uint32_t stride,
+;               int* sad8);
+;===========================================================================
+align 16
+sad16v_xmm:
+    push ebx ; ebx carries the sad8 output pointer; restored before ret
+    mov eax, [esp+4+ 4] ; Src1
+    mov edx, [esp+4+ 8] ; Src2
+    mov ecx, [esp+4+12] ; Stride
+    mov ebx, [esp+4+16] ; sad ptr
+    ; NOTE(review): SAD_16x16_SSE is defined outside this view; presumably one 16-byte row per expansion, summed into mm5/mm6 - confirm.
+    pxor mm5, mm5 ; accum1
+    pxor mm6, mm6 ; accum2
+    pxor mm7, mm7 ; total
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    paddusw mm7, mm5 ; fold this group's two sums into the running total
+    paddusw mm7, mm6
+    movd [ebx], mm5 ; sad8[0]
+    movd [ebx+4], mm6 ; sad8[1]
+    ; Second group of eight expansions; the two accumulators restart from zero.
+    pxor mm5, mm5 ; accum1
+    pxor mm6, mm6 ; accum2
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    SAD_16x16_SSE
+    paddusw mm7, mm5
+    paddusw mm7, mm6
+    movd [ebx+8], mm5 ; sad8[2]
+    movd [ebx+12], mm6 ; sad8[3]
+    ; Return value: low dword of the total accumulated in mm7.
+    movd eax, mm7
+    pop ebx
+    ret
+;--------
+
+