Diff of /branches/release-1_2-branch/xvidcore/src/motion/x86_asm/sad_sse2.asm

-trunk/xvidcore/src/motion/x86_asm/sad_sse2.asm
revision 605, Sat Oct 19 12:20:33 2002 UTC
+branches/release-1_2-branch/xvidcore/src/motion/x86_asm/sad_sse2.asm
revision 1820, Fri Nov 28 16:54:45 2008 UTC
 Line 1
- ;/*****************************************************************************
+ ;/****************************************************************************
  ; *
  ; *  XVID MPEG-4 VIDEO CODEC
- ; *  sse2 sum of absolute difference
+ ; *  - SSE2 optimized SAD operators -
  ; *
- ; *  Copyright(C) 2002 Dmitry Rozhdestvensky
+ ; *  Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
  ; *
- ; *  This program is an implementation of a part of one or more MPEG-4
- ; *  Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
- ; *  to use this software module in hardware or software products are
- ; *  advised that its use may infringe existing patents or copyrights, and
- ; *  any such use would be at such party's own risk.  The original
- ; *  developer of this software module and his/her company, and subsequent
- ; *  editors and their companies, will have no liability for use of this
- ; *  software or modifications or derivatives thereof.
  ; *
- ; *  This program is free software; you can redistribute it and/or modify
+ ; *  This program is free software; you can redistribute it and/or modify it
- ; *  it under the terms of the GNU General Public License as published by
+ ; *  under the terms of the GNU General Public License as published by
  ; *  the Free Software Foundation; either version 2 of the License, or
  ; *  (at your option) any later version.
  ; *
-Line 28
+Line 20
  ; *  along with this program; if not, write to the Free Software
  ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  ; *
- ; ****************************************************************************/
+ ; * $Id: sad_sse2.asm,v 1.16 2008-11-26 01:04:34 Isibaar Exp $
+ ; *
+ ; ***************************************************************************/
- bits 32
+ %include "nasm.inc"
- %macro cglobal 1
+ ;=============================================================================
-         %ifdef PREFIX
+ ; Read only data
-                 global _%1
+ ;=============================================================================
-                 %define %1 _%1
-         %else
-                 global %1
-         %endif
- %endmacro
- %define sad_debug 0 ;1=unaligned 2=ref unaligned 3=aligned 0=autodetect
+ DATA
- %define dev_debug 2 ;1=unaligned 2=aligned 0=autodetect
- %define test_stride_alignment 0 ;test stride for alignment while autodetect
- %define early_return 0 ;use early return in sad
- section .data
+ ALIGN SECTION_ALIGN
- align 64
- buffer  times 4*8 dd 0   ;8 128-bit words
  zero    times 4   dd 0
- section .text
+ ;=============================================================================
+ ; Code
+ ;=============================================================================
+ SECTION .rotext align=SECTION_ALIGN
  cglobal  sad16_sse2
  cglobal  dev16_sse2
- ;===========================================================================
+ cglobal  sad16_sse3
- ;               General macros for SSE2 code
+ cglobal  dev16_sse3
- ;===========================================================================
- %macro load_stride 1
-                 mov     ecx,%1
-                 add     ecx,ecx
-                 mov     edx,ecx
-                 add     ecx,%1          ;stride*3
-                 add     edx,edx         ;stride*4
- %endmacro
- %macro sad8lines 1
-                 psadbw  xmm0,[%1]
-                 psadbw  xmm1,[%1+ebx]
-                 psadbw  xmm2,[%1+ebx*2]
-                 psadbw  xmm3,[%1+ecx]
-                 add     %1,edx
-                 psadbw  xmm4,[%1]
-                 psadbw  xmm5,[%1+ebx]
-                 psadbw  xmm6,[%1+ebx*2]
-                 psadbw  xmm7,[%1+ecx]
-                 add     %1,edx
- %endmacro
- %macro after_sad 1 ; Summarizes 0th and 4th words of all xmm registers
-                 paddusw xmm0,xmm1
-                 paddusw xmm2,xmm3
-                 paddusw xmm4,xmm5
-                 paddusw xmm6,xmm7
-                 paddusw xmm0,xmm2
-                 paddusw xmm4,xmm6
-                 paddusw xmm4,xmm0
-                 pshufd  xmm5,xmm4,11111110b
-                 paddusw xmm5,xmm4
-                 pextrw  %1,xmm5,0       ;less latency then movd
- %endmacro
- %macro restore 1  ;restores used registers
- %if %1=1
-                 pop ebp
- %endif
-                 pop edi
-                 pop esi
-                 pop ebx
- %endmacro
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- ;
+ ; uint32_t sad16_sse2 (const uint8_t * const cur, <- assumed aligned!
- ; uint32_t sad16_sse2 (const uint8_t * const cur,
  ;                                       const uint8_t * const ref,
  ;                                       const uint32_t stride,
- ;                                       const uint32_t best_sad);
+ ;                      const uint32_t /*ignored*/);
- ;
+ ;-----------------------------------------------------------------------------
- ;
- ;===========================================================================
- align 16
- sad16_sse2
-                 push    ebx
-                 push    esi
-                 push    edi
-                 mov     ebx,[esp + 3*4 + 12]    ;stride
- %if sad_debug<>0
-                 mov     edi,[esp + 3*4 + 4]
-                 mov     esi,[esp + 3*4 + 8]
- %endif
- %if sad_debug=1
-                 jmp     sad16_sse2_ul
- %endif
- %if sad_debug=2
-                 jmp     sad16_sse2_semial
- %endif
- %if sad_debug=3
-                 jmp     sad16_sse2_al
- %endif
- %if test_stride_alignment<>0
-                 test    ebx,15
-                 jnz     sad16_sse2_ul
- %endif
-                 mov     edi,[esp + 3*4 + 4]     ;cur (most likely aligned)
-                 test    edi,15
-                 cmovz   esi,[esp + 3*4 + 8]     ;load esi if edi is aligned
-                 cmovnz  esi,edi                 ;move to esi and load edi
-                 cmovnz  edi,[esp + 3*4 + 8]     ;if not
-                 jnz     esi_unaligned
-                 test    esi,15
-                 jnz     near sad16_sse2_semial
-                 jmp     sad16_sse2_al
- esi_unaligned:  test    edi,15
-                 jnz     near sad16_sse2_ul
-                 jmp     sad16_sse2_semial
- ;===========================================================================
- ;       Branch requires 16-byte alignment of esi and edi and stride
- ;===========================================================================
- %macro sad16x8_al 1
-                 movdqa  xmm0,[esi]
-                 movdqa  xmm1,[esi+ebx]
-                 movdqa  xmm2,[esi+ebx*2]
-                 movdqa  xmm3,[esi+ecx]
-                 add     esi,edx
-                 movdqa  xmm4,[esi]
-                 movdqa  xmm5,[esi+ebx]
-                 movdqa  xmm6,[esi+ebx*2]
-                 movdqa  xmm7,[esi+ecx]
-                 add     esi,edx
-                 sad8lines edi
-                 after_sad %1
- %endmacro
- align 16
- sad16_sse2_al
-                 load_stride ebx
-                 sad16x8_al eax
- %if early_return=1
-                 cmp     eax,[esp + 3*4 + 16]    ;best_sad
-                 jg      continue_al
- %endif
-                 sad16x8_al ebx
-                 add     eax,ebx
- continue_al:    restore 0
+ ;-----------------------------------------------------------------------------
+ ; SAD_16x16_SSE2 <load-op>
+ ;   Adds the SAD of two 16-byte rows (cur vs. ref) to the running sum.
+ ;   %1    : instruction used to load the ref rows (movdqu or lddqu)
+ ;   In    : _EAX = cur (read with movdqa, so it must be 16-byte aligned)
+ ;           TMP1 = ref, TMP0 = stride, xmm4 = running sum
+ ;   Out   : xmm4 updated, _EAX and TMP1 advanced by 2*stride
+ ;   Clobb : xmm0-xmm3
+ ; Only xmm0-xmm5 are used by the SAD code: xmm6-xmm15 are callee-saved in
+ ; the Microsoft x64 calling convention and are neither saved nor restored
+ ; here, so they must not be touched.
+ ;-----------------------------------------------------------------------------
+ %macro SAD_16x16_SSE2 1
+   %1  xmm0, [TMP1]              ; ref row 0 (may be unaligned)
+   %1  xmm1, [TMP1+TMP0]         ; ref row 1
+   lea TMP1,[TMP1+2*TMP0]        ; ref += 2*stride
+   movdqa  xmm2, [_EAX]          ; cur row 0 (aligned)
+   movdqa  xmm3, [_EAX+TMP0]     ; cur row 1
+   lea _EAX,[_EAX+2*TMP0]        ; cur += 2*stride
+   psadbw  xmm0, xmm2            ; two partial sums: word 0 and word 4
+   paddusw xmm4,xmm0
+   psadbw  xmm1, xmm3
+   paddusw xmm4,xmm1
+ %endmacro
+ ;-----------------------------------------------------------------------------
+ ; SAD16_SSE2_SSE3 <load-op>
+ ;   Complete function body: SAD of a 16x16 block, returned in eax.
+ ;   prm1 = cur (assumed aligned), prm2 = ref, prm3 = stride.
+ ;   Each word lane accumulates at most 16 rows * 8 bytes * 255 = 32640 and
+ ;   the final total is at most 65280, so paddusw never saturates.
+ ;   Clobb : _EAX, TMP0, TMP1, xmm0-xmm5
+ ;-----------------------------------------------------------------------------
+ %macro SAD16_SSE2_SSE3 1
+   mov _EAX, prm1 ; cur (assumed aligned)
+   mov TMP1, prm2 ; ref
+   mov TMP0, prm3 ; stride
+   pxor xmm4, xmm4 ; accum
+   SAD_16x16_SSE2 %1
+   SAD_16x16_SSE2 %1
+   SAD_16x16_SSE2 %1
+   SAD_16x16_SSE2 %1
+   SAD_16x16_SSE2 %1
+   SAD_16x16_SSE2 %1
+   SAD_16x16_SSE2 %1
+   SAD_16x16_SSE2 %1
+   pshufd  xmm5, xmm4, 00000010b   ; dword 2 (high-half sum) -> dword 0
+   paddusw xmm4, xmm5              ; word 0 = low-half sum + high-half sum
+   pextrw  eax, xmm4, 0            ; return value
                  ret
- ;===========================================================================
- ;       Branch requires 16-byte alignment of the edi and stride only
- ;===========================================================================
- %macro sad16x8_semial 1
-                 movdqu  xmm0,[esi]
-                 movdqu  xmm1,[esi+ebx]
-                 movdqu  xmm2,[esi+ebx*2]
-                 movdqu  xmm3,[esi+ecx]
-                 add     esi,edx
-                 movdqu  xmm4,[esi]
-                 movdqu  xmm5,[esi+ebx]
-                 movdqu  xmm6,[esi+ebx*2]
-                 movdqu  xmm7,[esi+ecx]
-                 add     esi,edx
-                 sad8lines edi
-                 after_sad %1
  %endmacro
- align 16
+ ALIGN SECTION_ALIGN
- sad16_sse2_semial
+ sad16_sse2:                     ; 16x16 SAD; ref rows are loaded with movdqu
+   SAD16_SSE2_SSE3 movdqu
-                 load_stride ebx
+ ENDFUNC
-                 sad16x8_semial eax
+ ALIGN SECTION_ALIGN
- %if early_return=1
+ sad16_sse3:                     ; same body as sad16_sse2, but ref rows are loaded with lddqu
-                 cmp     eax,[esp + 3*4 + 16]    ;best_sad
+   SAD16_SSE2_SSE3 lddqu
-                 jg      cont_semial
+ ENDFUNC
- %endif
-                 sad16x8_semial ebx
+ ;-----------------------------------------------------------------------------
+ ; uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride);
-                 add     eax,ebx
+ ;-----------------------------------------------------------------------------
- cont_semial:    restore 0
+ ; MEAN_16x16_SSE2 <load-op>
+ ;   Adds, for two 16-byte rows, the sum of |pixel - xmm3 byte| to xmm2.
+ ;   With xmm3 = 0 this is the plain pixel sum; with xmm3 = replicated mean
+ ;   it is the absolute deviation from the mean.
+ ;   %1    : instruction used to load the rows (movdqu or lddqu)
+ ;   Out   : xmm2 updated, _EAX advanced by 2*stride
+ ;   Clobb : xmm0, xmm1
+ ; Only xmm0-xmm3 are used by the deviation code: xmm6-xmm15 are callee-saved
+ ; in the Microsoft x64 calling convention and are neither saved nor restored
+ ; here, so they must not be touched.
+ %macro MEAN_16x16_SSE2 1  ; _EAX: src, TMP0: stride, xmm3: zero or mean => xmm2: accumulated result
+   %1 xmm0, [_EAX]
-                 ret
+   %1 xmm1, [_EAX+TMP0]
+   lea _EAX, [_EAX+2*TMP0]    ; + 2*stride
- ;===========================================================================
- ;               Branch does not require alignment, even stride
- ;===========================================================================
- %macro sad16x4_ul 1
-                 movdqu  xmm0,[esi]
-                 movdqu  xmm1,[esi+ebx]
-                 movdqu  xmm2,[esi+ebx*2]
-                 movdqu  xmm3,[esi+ecx]
-                 add     esi,edx
-                 movdqu  xmm4,[edi]
-                 movdqu  xmm5,[edi+ebx]
-                 movdqu  xmm6,[edi+ebx*2]
-                 movdqu  xmm7,[edi+ecx]
-                 add     edi,edx
-                 psadbw  xmm4,xmm0
-                 psadbw  xmm5,xmm1
-                 psadbw  xmm6,xmm2
-                 psadbw  xmm7,xmm3
-                 paddusw xmm4,xmm5
-                 paddusw xmm6,xmm7
-                 paddusw xmm4,xmm6
-                 pshufd  xmm7,xmm4,11111110b
-                 paddusw xmm7,xmm4
-                 pextrw  %1,xmm7,0
- %endmacro
- align 16
- sad16_sse2_ul
-                 load_stride ebx
-                 push ebp
-                 sad16x4_ul eax
- %if early_return=1
-                 cmp     eax,[esp + 4*4 + 16]    ;best_sad
-                 jg      continue_ul
- %endif
-                 sad16x4_ul ebp
-                 add     eax,ebp
- %if early_return=1
-                 cmp     eax,[esp + 4*4 + 16]    ;best_sad
-                 jg      continue_ul
- %endif
-                 sad16x4_ul ebp
-                 add     eax,ebp
- %if early_return=1
-                 cmp     eax,[esp + 4*4 + 16]    ;best_sad
-                 jg      continue_ul
- %endif
-                 sad16x4_ul ebp
-                 add     eax,ebp
- continue_ul:    restore 1
-                 ret
- ;===========================================================================
- ;
- ; uint32_t dev16_sse2(const uint8_t * const cur,
- ;                                       const uint32_t stride);
- ;
- ; experimental!
- ;
- ;===========================================================================
- align 16
- dev16_sse2
-                 push    ebx
-                 push    esi
-                 push    edi
-                 push    ebp
-                 mov     esi, [esp + 4*4 + 4]      ; cur
-                 mov     ebx, [esp + 4*4 + 8]      ; stride
-                 mov     edi, buffer
- %if dev_debug=1
-                 jmp     dev16_sse2_ul
- %endif
- %if dev_debug=2
-                 jmp     dev16_sse2_al
- %endif
-                 test    esi,15
-                 jnz     near dev16_sse2_ul
- %if test_stride_alignment=1
-                 test    ebx,15
-                 jnz     dev16_sse2_ul
- %endif
-                 mov     edi,esi
-                 jmp     dev16_sse2_al
- ;===========================================================================
- ;               Branch requires alignment of both the cur and stride
- ;===========================================================================
- %macro make_mean 0
-                 add     eax,ebp         ;mean 16-bit
-                 mov     al,ah           ;eax= {0 0 mean/256 mean/256}
-                 mov     ebp,eax
-                 shl     ebp,16
-                 or      eax,ebp
- %endmacro
- %macro sad_mean16x8_al 3        ;destination,0=zero,1=mean from eax,source
- %if %2=0
-                 pxor    xmm0,xmm0
- %else
-                 movd    xmm0,eax
-                 pshufd  xmm0,xmm0,0
- %endif
-                 movdqa  xmm1,xmm0
-                 movdqa  xmm2,xmm0
-                 movdqa  xmm3,xmm0
-                 movdqa  xmm4,xmm0
-                 movdqa  xmm5,xmm0
-                 movdqa  xmm6,xmm0
-                 movdqa  xmm7,xmm0
-                 sad8lines %3
-                 after_sad %1
- %endmacro
- align 16
- dev16_sse2_al
-                 load_stride ebx
-                 sad_mean16x8_al eax,0,esi
-                 sad_mean16x8_al ebp,0,esi
-                 make_mean
-                 sad_mean16x8_al ebp,1,edi
-                 sad_mean16x8_al eax,1,edi
-                 add eax,ebp
-                 restore 1
-                 ret
- ;===========================================================================
- ;               Branch does not require alignment
- ;===========================================================================
- %macro sad_mean16x8_ul 2
-                 pxor    xmm7,xmm7
-                 movdqu  xmm0,[%1]
-                 movdqu  xmm1,[%1+ebx]
-                 movdqu  xmm2,[%1+ebx*2]
-                 movdqu  xmm3,[%1+ecx]
-                 add     %1,edx
-                 movdqa  [buffer+16*0],xmm0
-                 movdqa  [buffer+16*1],xmm1
-                 movdqa  [buffer+16*2],xmm2
-                 movdqa  [buffer+16*3],xmm3
-                 movdqu  xmm4,[%1]
-                 movdqu  xmm5,[%1+ebx]
-                 movdqu  xmm6,[%1+ebx*2]
-                 movdqa  [buffer+16*4],xmm4
-                 movdqa  [buffer+16*5],xmm5
-                 movdqa  [buffer+16*6],xmm6
                  psadbw  xmm0,xmm3
+   paddusw xmm2, xmm0
                  psadbw  xmm1,xmm3
-                 psadbw  xmm2,xmm7
+   paddusw xmm2, xmm1
-                 psadbw  xmm3,xmm7
-                 psadbw  xmm4,xmm7
-                 psadbw  xmm5,xmm7
-                 psadbw  xmm6,xmm7
-                 movdqu  xmm7,[%1+ecx]
-                 movdqa  [buffer+16*7],xmm7
-                 psadbw  xmm7,[zero]
-                 add     %1,edx
-                 after_sad %2
  %endmacro
- align 16
- dev16_sse2_ul
-                 load_stride ebx
-                 sad_mean16x8_ul esi,eax
-                 sad_mean16x8_ul esi,ebp
-                 make_mean
+ ; MEAN16_SSE2_SSE3 <load-op>
+ ;   Complete function body: sum over a 16x16 block of |pixel - mean|,
+ ;   returned in eax.  prm1 = src, prm2 = stride.
+ ;   Pass 1 sums the 256 pixels (xmm3 = 0); mean = sum >> 8.
+ ;   Pass 2 re-reads the block with the mean replicated in every byte of xmm3.
+ ;   Clobb : _EAX, TMP0, xmm0-xmm3
+ %macro MEAN16_SSE2_SSE3 1
+   mov _EAX, prm1   ; src
+   mov TMP0, prm2   ; stride
+   pxor xmm2, xmm2     ; accum
+   pxor xmm3, xmm3     ; zero
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   mov _EAX, prm1       ; src again
+   pshufd   xmm3, xmm2, 10b    ; dword 2 (high-half sum) -> dword 0
+   paddusw  xmm3, xmm2         ; word 0 = sum of all 256 pixels
+   pxor     xmm2, xmm2     ; zero accum
+   psrlw    xmm3, 8        ; => Mean
+   pshuflw  xmm3, xmm3, 0  ; replicate Mean
+   packuswb xmm3, xmm3         ; words -> bytes
+   pshufd   xmm3, xmm3, 00000000b  ; Mean in all 16 bytes
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   MEAN_16x16_SSE2 %1
+   pshufd   xmm3, xmm2, 10b    ; fold the two partial sums as above
+   paddusw  xmm3, xmm2
+   pextrw eax, xmm3, 0         ; return value
+   ret
+ %endmacro
-                 sad_mean16x8_al ebp,1,edi
+ ALIGN SECTION_ALIGN
-                 sad_mean16x8_al eax,1,edi
+ dev16_sse2:                     ; 16x16 deviation from the block mean; rows are loaded with movdqu
+   MEAN16_SSE2_SSE3 movdqu
+ ENDFUNC
+ ALIGN SECTION_ALIGN
+ dev16_sse3:                     ; same body as dev16_sse2, but rows are loaded with lddqu
+   MEAN16_SSE2_SSE3 lddqu
+ ENDFUNC
-                 add     eax,ebp
-                 restore 1
+ %ifidn __OUTPUT_FORMAT__,elf
+ section ".note.GNU-stack" noalloc noexec nowrite progbits
+ %endif
-                 ret

 Legend:



Removed from v.605
 


changed lines


 
Added in v.1820
 Legend:



Removed from v.605
 


changed lines


 
Added in v.1820
-Removed from v.605
+Added in v.1820

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4