--- branches/dev-api-4/xvidcore/src/motion/x86_asm/sad_sse2.asm	2003/10/27 01:03:43	1191
+++ branches/dev-api-4/xvidcore/src/motion/x86_asm/sad_sse2.asm	2003/10/28 22:23:03	1192
@@ -1,486 +1,156 @@
-;/**************************************************************************
+;/****************************************************************************
 ; *
-; * XVID MPEG-4 VIDEO CODEC
-; * sse2 sum of absolute difference
+; * XVID MPEG-4 VIDEO CODEC
+; * - SSE2 optimized SAD operators -
 ; *
-; * This program is free software; you can redistribute it and/or modify
-; * it under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
+; * Copyright(C) 2003 Pascal Massimino
 ; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
 ; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+; * This program is free software; you can redistribute it and/or modify it
+; * under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 2 of the License, or
+; * (at your option) any later version.
 ; *
-; *************************************************************************/
-
-;/**************************************************************************
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
 ; *
-; * History:
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; * 24.05.2002 inital version; (c)2002 Dmitry Rozhdestvensky
+; * $Id: sad_sse2.asm,v 1.8.2.1 2003-10-28 22:23:03 edgomez Exp $
 ; *
-; *************************************************************************/
+; ***************************************************************************/
-bits 32
+BITS 32
-%macro cglobal 1
+%macro cglobal 1
 %ifdef PREFIX
-  global _%1
+  global _%1
 %define %1 _%1
 %else
   global %1
 %endif
 %endmacro
-%define sad_debug 0 ;1=unaligned 2=ref unaligned 3=aligned 0=autodetect
-%define dev_debug 2 ;1=unaligned 2=aligned 0=autodetect
-%define test_stride_alignment 0 ;test stride for alignment while autodetect
-%define early_return 0 ;use early return in sad
+;=============================================================================
+; Read only data
+;=============================================================================
-section .data
+SECTION .rodata
-align 64
-buffer times 4*8 dd 0 ;8 128-bit words
+ALIGN 64
 zero times 4 dd 0
-section .text
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
 cglobal sad16_sse2
 cglobal dev16_sse2
-;===========================================================================
-; General macros for SSE2 code
-;===========================================================================
-
-%macro load_stride 1
-  mov ecx,%1
-  add ecx,ecx
-  mov edx,ecx
-  add ecx,%1 ;stride*3
-  add edx,edx ;stride*4
-%endmacro
-
-%macro sad8lines 1
-
-  psadbw xmm0,[%1]
-  psadbw xmm1,[%1+ebx]
-  psadbw xmm2,[%1+ebx*2]
-  psadbw xmm3,[%1+ecx]
-
-  add %1,edx
-
-  psadbw xmm4,[%1]
-  psadbw xmm5,[%1+ebx]
-  psadbw xmm6,[%1+ebx*2]
-  psadbw xmm7,[%1+ecx]
-
-  add %1,edx
-%endmacro
-
-%macro after_sad 1 ; Summarizes 0th and 4th words of all xmm registers
-
-  paddusw xmm0,xmm1
-  paddusw xmm2,xmm3
-  paddusw xmm4,xmm5
-  paddusw xmm6,xmm7
-
-  paddusw xmm0,xmm2
-  paddusw xmm4,xmm6
-
-  paddusw xmm4,xmm0
-  pshufd xmm5,xmm4,11111110b
-  paddusw xmm5,xmm4
-
-  pextrw %1,xmm5,0 ;less latency then movd
-%endmacro
-
-%macro restore 1 ;restores used registers
-
-%if %1=1
-  pop ebp
-%endif
-  pop edi
-  pop esi
-  pop ebx
-%endmacro
-
-;===========================================================================
-;
-; uint32_t sad16_sse2 (const uint8_t * const cur,
-;                      const uint8_t * const ref,
-;                      const uint32_t stride,
-;                      const uint32_t best_sad);
-;
-;
-;===========================================================================
-
-align 16
-sad16_sse2
-  push ebx
-  push esi
-  push edi
-
-  mov ebx,[esp + 3*4 + 12] ;stride
-
-%if sad_debug<>0
-  mov edi,[esp + 3*4 + 4]
-  mov esi,[esp + 3*4 + 8]
-%endif
-
-%if sad_debug=1
-  jmp sad16_sse2_ul
-%endif
-%if sad_debug=2
-  jmp sad16_sse2_semial
-%endif
-%if sad_debug=3
-  jmp sad16_sse2_al
-%endif
-
-%if test_stride_alignment<>0
-  test ebx,15
-  jnz sad16_sse2_ul
-%endif
-  mov edi,[esp + 3*4 + 4] ;cur (most likely aligned)
-
-  test edi,15
-  cmovz esi,[esp + 3*4 + 8] ;load esi if edi is aligned
-  cmovnz esi,edi ;move to esi and load edi
-  cmovnz edi,[esp + 3*4 + 8] ;if not
-  jnz esi_unaligned
-
-  test esi,15
-  jnz near sad16_sse2_semial
-  jmp sad16_sse2_al
-
-esi_unaligned: test edi,15
-  jnz near sad16_sse2_ul
-  jmp sad16_sse2_semial
-
-;===========================================================================
-; Branch requires 16-byte alignment of esi and edi and stride
-;===========================================================================
-
-%macro sad16x8_al 1
-
-  movdqa xmm0,[esi]
-  movdqa xmm1,[esi+ebx]
-  movdqa xmm2,[esi+ebx*2]
-  movdqa xmm3,[esi+ecx]
-
-  add esi,edx
-
-  movdqa xmm4,[esi]
-  movdqa xmm5,[esi+ebx]
-  movdqa xmm6,[esi+ebx*2]
-  movdqa xmm7,[esi+ecx]
-
-  add esi,edx
-
-  sad8lines edi
-
-  after_sad %1
-
-%endmacro
-
-align 16
-sad16_sse2_al
-
-  load_stride ebx
-
-  sad16x8_al eax
-
-%if early_return=1
-  cmp eax,[esp + 3*4 + 16] ;best_sad
-  jg continue_al
-%endif
-
-  sad16x8_al ebx
-
-  add eax,ebx
-
-continue_al: restore 0
-
-  ret
-
-;===========================================================================
-; Branch requires 16-byte alignment of the edi and stride only
-;===========================================================================
-
-%macro sad16x8_semial 1
-
-  movdqu xmm0,[esi]
-  movdqu xmm1,[esi+ebx]
-  movdqu xmm2,[esi+ebx*2]
-  movdqu xmm3,[esi+ecx]
-
-  add esi,edx
-
-  movdqu xmm4,[esi]
-  movdqu xmm5,[esi+ebx]
-  movdqu xmm6,[esi+ebx*2]
-  movdqu xmm7,[esi+ecx]
-
-  add esi,edx
-
-  sad8lines edi
-
-  after_sad %1
-
-%endmacro
-
-align 16
-sad16_sse2_semial
-
-  load_stride ebx
-
-  sad16x8_semial eax
-
-%if early_return=1
-  cmp eax,[esp + 3*4 + 16] ;best_sad
-  jg cont_semial
-%endif
-
-  sad16x8_semial ebx
-
-  add eax,ebx
-
-cont_semial: restore 0
-
-  ret
-
-
-;===========================================================================
-; Branch does not require alignment, even stride
-;===========================================================================
-
-%macro sad16x4_ul 1
-
-  movdqu xmm0,[esi]
-  movdqu xmm1,[esi+ebx]
-  movdqu xmm2,[esi+ebx*2]
-  movdqu xmm3,[esi+ecx]
-
-  add esi,edx
-
-  movdqu xmm4,[edi]
-  movdqu xmm5,[edi+ebx]
-  movdqu xmm6,[edi+ebx*2]
-  movdqu xmm7,[edi+ecx]
-
-  add edi,edx
-
-  psadbw xmm4,xmm0
-  psadbw xmm5,xmm1
-  psadbw xmm6,xmm2
-  psadbw xmm7,xmm3
-
-  paddusw xmm4,xmm5
-  paddusw xmm6,xmm7
-
-  paddusw xmm4,xmm6
-  pshufd xmm7,xmm4,11111110b
-  paddusw xmm7,xmm4
-
-  pextrw %1,xmm7,0
-%endmacro
-
-align 16
-sad16_sse2_ul
-
-  load_stride ebx
-
-  push ebp
-
-  sad16x4_ul eax
-
-%if early_return=1
-  cmp eax,[esp + 4*4 + 16] ;best_sad
-  jg continue_ul
-%endif
-
-  sad16x4_ul ebp
-  add eax,ebp
-
-%if early_return=1
-  cmp eax,[esp + 4*4 + 16] ;best_sad
-  jg continue_ul
-%endif
-
-  sad16x4_ul ebp
-  add eax,ebp
-
-%if early_return=1
-  cmp eax,[esp + 4*4 + 16] ;best_sad
-  jg continue_ul
-%endif
-
-  sad16x4_ul ebp
-  add eax,ebp
-
-continue_ul: restore 1
-
-  ret
-
-;===========================================================================
-;
-; uint32_t dev16_sse2(const uint8_t * const cur,
-;                     const uint32_t stride);
-;
-; experimental!
-;
-;===========================================================================
-
-align 16
-dev16_sse2
-
-  push ebx
-  push esi
-  push edi
-  push ebp
-
-  mov esi, [esp + 4*4 + 4] ; cur
-  mov ebx, [esp + 4*4 + 8] ; stride
-  mov edi, buffer
-
-%if dev_debug=1
-  jmp dev16_sse2_ul
-%endif
-
-%if dev_debug=2
-  jmp dev16_sse2_al
-%endif
-
-  test esi,15
-  jnz near dev16_sse2_ul
-
-%if test_stride_alignment=1
-  test ebx,15
-  jnz dev16_sse2_ul
-%endif
-
-  mov edi,esi
-  jmp dev16_sse2_al
-
-;===========================================================================
-; Branch requires alignment of both the cur and stride
-;===========================================================================
-
-%macro make_mean 0
-  add eax,ebp ;mean 16-bit
-  mov al,ah ;eax= {0 0 mean/256 mean/256}
-  mov ebp,eax
-  shl ebp,16
-  or eax,ebp
-%endmacro
-
-%macro sad_mean16x8_al 3 ;destination,0=zero,1=mean from eax,source
-
-%if %2=0
-  pxor xmm0,xmm0
-%else
-  movd xmm0,eax
-  pshufd xmm0,xmm0,0
-%endif
-  movdqa xmm1,xmm0
-  movdqa xmm2,xmm0
-  movdqa xmm3,xmm0
-  movdqa xmm4,xmm0
-  movdqa xmm5,xmm0
-  movdqa xmm6,xmm0
-  movdqa xmm7,xmm0
-
-  sad8lines %3
-
-  after_sad %1
-
-%endmacro
-
-align 16
-dev16_sse2_al
-
-  load_stride ebx
-
-  sad_mean16x8_al eax,0,esi
-  sad_mean16x8_al ebp,0,esi
-
-  make_mean
-
-  sad_mean16x8_al ebp,1,edi
-  sad_mean16x8_al eax,1,edi
-
-  add eax,ebp
-
-  restore 1
-
-  ret
-
-;===========================================================================
-; Branch does not require alignment
-;===========================================================================
-
-%macro sad_mean16x8_ul 2
-
-  pxor xmm7,xmm7
-
-  movdqu xmm0,[%1]
-  movdqu xmm1,[%1+ebx]
-  movdqu xmm2,[%1+ebx*2]
-  movdqu xmm3,[%1+ecx]
-
-  add %1,edx
-
-  movdqa [buffer+16*0],xmm0
-  movdqa [buffer+16*1],xmm1
-  movdqa [buffer+16*2],xmm2
-  movdqa [buffer+16*3],xmm3
-
-  movdqu xmm4,[%1]
-  movdqu xmm5,[%1+ebx]
-  movdqu xmm6,[%1+ebx*2]
-  movdqa [buffer+16*4],xmm4
-  movdqa [buffer+16*5],xmm5
-  movdqa [buffer+16*6],xmm6
-
-  psadbw xmm0,xmm7
-  psadbw xmm1,xmm7
-  psadbw xmm2,xmm7
-  psadbw xmm3,xmm7
-  psadbw xmm4,xmm7
-  psadbw xmm5,xmm7
-  psadbw xmm6,xmm7
-
-  movdqu xmm7,[%1+ecx]
-  movdqa [buffer+16*7],xmm7
-  psadbw xmm7,[zero]
-
-  add %1,edx
-
-  after_sad %2
-%endmacro
-
-align 16
-dev16_sse2_ul
-
-  load_stride ebx
-
-  sad_mean16x8_ul esi,eax
-  sad_mean16x8_ul esi,ebp
-
-  make_mean
-
-  sad_mean16x8_al ebp,1,edi
-  sad_mean16x8_al eax,1,edi
-
-  add eax,ebp
-
-  restore 1
+;-----------------------------------------------------------------------------
+; uint32_t sad16_sse2 (const uint8_t * const cur, <- assumed aligned!
+;                      const uint8_t * const ref,
+;                      const uint32_t stride,
+;                      const uint32_t /*ignored*/);
+;-----------------------------------------------------------------------------
+
+
+%macro SAD_16x16_SSE2 0
+  movdqu xmm0, [edx]
+  movdqu xmm1, [edx+ecx]
+  lea edx,[edx+2*ecx]
+  movdqa xmm2, [eax]
+  movdqa xmm3, [eax+ecx]
+  lea eax,[eax+2*ecx]
+  psadbw xmm0, xmm2
+  paddusw xmm6,xmm0
+  psadbw xmm1, xmm3
+  paddusw xmm6,xmm1
+%endmacro
+
+align 16
+sad16_sse2:
+  mov eax, [esp+ 4]   ; cur (assumed aligned)
+  mov edx, [esp+ 8]   ; ref
+  mov ecx, [esp+12]   ; stride
+
+  pxor xmm6, xmm6     ; accum
+
+  SAD_16x16_SSE2
+  SAD_16x16_SSE2
+  SAD_16x16_SSE2
+  SAD_16x16_SSE2
+  SAD_16x16_SSE2
+  SAD_16x16_SSE2
+  SAD_16x16_SSE2
+  SAD_16x16_SSE2
+
+  pshufd xmm5, xmm6, 00000010b
+  paddusw xmm6, xmm5
+  pextrw eax, xmm6, 0
+  ret
+
+
+;-----------------------------------------------------------------------------
+; uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride);
+;-----------------------------------------------------------------------------
+
+%macro MEAN_16x16_SSE2 0 ; eax: src, ecx:stride, mm7: zero or mean => mm6: result
+  movdqu xmm0, [eax]
+  movdqu xmm1, [eax+ecx]
+  lea eax, [eax+2*ecx]  ; + 2*stride
+  psadbw xmm0, xmm7
+  paddusw xmm6, xmm0
+  psadbw xmm1, xmm7
+  paddusw xmm6, xmm1
+%endmacro
+
+
+align 16
+dev16_sse2:
+  mov eax, [esp+ 4]   ; src
+  mov ecx, [esp+ 8]   ; stride
+
+  pxor xmm6, xmm6     ; accum
+  pxor xmm7, xmm7     ; zero
+
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+
+  mov eax, [esp+ 4]   ; src again
+
+  pshufd xmm7, xmm6, 0010b
+  paddusw xmm7, xmm6
+  pxor xmm6, xmm6     ; zero accum
+  psrlw xmm7, 8       ; => Mean
+  pshuflw xmm7, xmm7, 0 ; replicate Mean
+  packuswb xmm7,xmm7
+
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+  MEAN_16x16_SSE2
+
+  pshufd xmm5, xmm6, 0010b
+  paddusw xmm6, xmm5
+  pextrw eax, xmm6, 0
-  ret
+  ret
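Editorial note (not part of the patch): as a reading aid, here is a plain-C sketch of what the two exported routines compute. The function names sad16_c and dev16_c are hypothetical; the 16x16 geometry, the ignored fourth argument of sad16_sse2, and the mean-as-sum>>8 convention are taken from the prototypes and code above. psadbw yields one 16-bit sum of absolute differences per 64-bit half of an xmm register, which is why the routines end with a pshufd/paddusw/pextrw sequence that folds the upper half's partial sum into the lower word.

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for sad16_sse2(): sum of absolute differences between
 * two 16x16 blocks of 8-bit pixels laid out with the given row stride.
 * The SSE2 version ignores its fourth parameter (no early exit). */
uint32_t sad16_c(const uint8_t *cur, const uint8_t *ref, uint32_t stride)
{
    uint32_t sad = 0;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            sad += (uint32_t)abs((int)cur[y * stride + x] - (int)ref[y * stride + x]);
    return sad;
}

/* Scalar reference for dev16_sse2(): sum of absolute deviations of a 16x16
 * block from its mean.  Like the SSE2 code (psrlw xmm7, 8), the mean is the
 * pixel sum shifted right by 8, i.e. sum/256 truncated (256 pixels per block). */
uint32_t dev16_c(const uint8_t *cur, uint32_t stride)
{
    uint32_t sum = 0, dev = 0;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            sum += cur[y * stride + x];

    const uint8_t mean = (uint8_t)(sum >> 8);

    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            dev += (uint32_t)abs((int)cur[y * stride + x] - (int)mean);
    return dev;
}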