--- trunk/xvidcore/src/motion/x86_asm/sad_mmx.asm	2002/07/07 09:45:40	262
+++ trunk/xvidcore/src/motion/x86_asm/sad_mmx.asm	2002/09/06 16:59:47	430
@@ -1,37 +1,39 @@
-;/**************************************************************************
+;/*****************************************************************************
 ; *
-; * XVID MPEG-4 VIDEO CODEC
-; * mmx sum of absolute difference
+; * XVID MPEG-4 VIDEO CODEC
+; * mmx sum of absolute difference
 ; *
-; * This program is free software; you can redistribute it and/or modify
-; * it under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
+; * Copyright(C) 2002 Peter Ross
 ; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
+; * This program is an implementation of a part of one or more MPEG-4
+; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
+; * to use this software module in hardware or software products are
+; * advised that its use may infringe existing patents or copyrights, and
+; * any such use would be at such party's own risk. The original
+; * developer of this software module and his/her company, and subsequent
+; * editors and their companies, will have no liability for use of this
+; * software or modifications or derivatives thereof.
 ; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+; * This program is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 2 of the License, or
+; * (at your option) any later version.
 ; *
-; *************************************************************************/
-
-;/**************************************************************************
-; *
-; * History:
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
 ; *
-; * 23.07.2002	sad[16,8]bi_xmm;
-; * 04.06.2002	cleanup -Skal-
-; * 12.11.2001	inital version; (c)2001 peter ross
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; *************************************************************************/
+; ****************************************************************************/
 
 bits 32
 %macro cglobal 1
+
 	%ifdef PREFIX
 		global _%1
 		%define %1 _%1
 	%else
@@ -57,6 +59,8 @@
 ; uint32_t sad16_mmx(const uint8_t * const cur,
 ;                    const uint8_t * const ref,
 ;                    const uint32_t stride,
+;                    const uint32_t best_sad);
+;
 ; (early termination ignore; slows this down)
 ;
 ;===========================================================================
@@ -206,6 +210,162 @@
 ;
 ; uint32_t sad16bi_mmx(const uint8_t * const cur,
 ;                      const uint8_t * const ref1,
+;                      const uint8_t * const ref2,
+;                      const uint32_t stride);
+;
+;===========================================================================
+%macro SADBI_16x16_MMX 2	; SADBI_16x16_MMX( int_ptr_offset, bool_increment_ptr );
+
+	movq mm0, [edx+%1]
+	movq mm2, [ebx+%1]
+	movq mm1, mm0
+	movq mm3, mm2
+
+%if %2 != 0
+	add edx, ecx
+%endif
+
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+punpcklbw mm2, mm7
+punpckhbw mm3, mm7
+
+%if %2 != 0
+	add ebx, ecx
+%endif
+
+paddusw mm0, mm2	; mm01 = ref1 + ref2
+paddusw mm1, mm3
+paddusw mm0, [mmx_one]	; mm01 += 1
+paddusw mm1, [mmx_one]
+psrlw mm0, 1	; mm01 >>= 1
+psrlw mm1, 1
+
+	movq mm2, [eax+%1]
+	movq mm3, mm2
+	punpcklbw mm2, mm7	; mm23 = src
+	punpckhbw mm3, mm7
+
+%if %2 != 0
+	add eax, ecx
+%endif
+
+	movq mm4, mm0
+	movq mm5, mm1
+	psubusw mm0, mm2
+	psubusw mm1, mm3
+	psubusw mm2, mm4
+	psubusw mm3, mm5
+	por mm0, mm2	; mm01 = ABS(mm01 - mm23)
+	por mm1, mm3
+
+	paddusw mm6,mm0	; mm6 += mm01
+	paddusw mm6,mm1
+
+%endmacro
+
+align 16
+sad16bi_mmx:
+	push ebx
+	mov eax, [esp+4+ 4]	; Src
+	mov edx, [esp+4+ 8]	; Ref1
+	mov ebx, [esp+4+12]	; Ref2
+	mov ecx, [esp+4+16]	; Stride
+
+	pxor mm6, mm6	; accum2
+pxor mm7, mm7
+.Loop
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+	SADBI_16x16_MMX 0, 0
+	SADBI_16x16_MMX 8, 1
+
+	pmaddwd mm6, [mmx_one]	; collapse
+	movq mm7, mm6
+	psrlq mm7, 32
+	paddd mm6, mm7
+
+	movd eax, mm6
+	pop ebx
+	ret
+
+;===========================================================================
+;
+; uint32_t sad8bi_mmx(const uint8_t * const cur,
+;                     const uint8_t * const ref1,
+;                     const uint8_t * const ref2,
+;                     const uint32_t stride);
+;
+;===========================================================================
+align 16
+sad8bi_mmx:
+	push ebx
+	mov eax, [esp+4+ 4]	; Src
+	mov edx, [esp+4+ 8]	; Ref1
+	mov ebx, [esp+4+12]	; Ref2
+	mov ecx, [esp+4+16]	; Stride
+
+	pxor mm6, mm6	; accum2
+pxor mm7, mm7
+.Loop
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+	SADBI_16x16_MMX 0, 1
+
+	pmaddwd mm6, [mmx_one]	; collapse
+	movq mm7, mm6
+	psrlq mm7, 32
+	paddd mm6, mm7
+
+	movd eax, mm6
+	pop ebx
+	ret
+
+
+
+
+;===========================================================================
+;
+; uint32_t dev16_mmx(const uint8_t * const cur,
+;                    const uint32_t stride);
+;
+;===========================================================================
+
+%macro MEAN_16x16_MMX 0
 	movq mm0, [eax]
 	movq mm2, [eax+8]
 	lea eax,[eax+ecx]
@@ -325,4 +485,5 @@
 	paddd mm6, mm5
 
 	movd eax, mm6
-	ret
\ No newline at end of file
+	ret
+
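For reference, the arithmetic implemented by SADBI_16x16_MMX and unrolled in sad16bi_mmx corresponds to the plain-C sketch below: each pair of reference pixels is averaged with upward rounding, (ref1 + ref2 + 1) >> 1, and the absolute difference against the current-block pixel is accumulated. The MMX code forms the absolute value with two unsigned-saturating subtractions (psubusw) OR'ed together, which is equivalent for the unpacked 16-bit values, and the final pmaddwd/psrlq/paddd sequence collapses the four word accumulators into one dword. The function name sad16bi_ref and the test driver are illustrative only, not part of the xvid C sources; sad8bi_mmx performs the same computation over an 8x8 block.

/* Plain-C model of sad16bi_mmx (names are illustrative, not xvid API). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t sad16bi_ref(const uint8_t *cur, const uint8_t *ref1,
                            const uint8_t *ref2, uint32_t stride)
{
	uint32_t sad = 0;

	for (int y = 0; y < 16; y++) {
		for (int x = 0; x < 16; x++) {
			int avg = (ref1[x] + ref2[x] + 1) >> 1;  /* rounded average of the two references */
			int d   = cur[x] - avg;
			sad    += (uint32_t)(d < 0 ? -d : d);    /* |cur - avg| */
		}
		cur  += stride;
		ref1 += stride;
		ref2 += stride;
	}
	return sad;
}

int main(void)
{
	uint8_t cur[16 * 16], ref1[16 * 16], ref2[16 * 16];

	for (int i = 0; i < 16 * 16; i++) {              /* arbitrary test data */
		cur[i]  = (uint8_t)(rand() & 0xff);
		ref1[i] = (uint8_t)(rand() & 0xff);
		ref2[i] = (uint8_t)(rand() & 0xff);
	}
	printf("sad16bi = %u\n", sad16bi_ref(cur, ref1, ref2, 16));
	return 0;
}

Since each 16-bit lane of mm6 accumulates at most 64 differences of at most 255, the saturating adds never clip, so the MMX routine should match this reference exactly.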