;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - K7 optimized SAD operators -
; *
; *  Copyright(C) 2001 Peter Ross <pross@xvid.org>
; *               2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: sad_mmx.asm,v 1.15 2004-08-22 11:46:10 edgomez Exp $
; *
; ***************************************************************************/

BITS 32

%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function
      %define %1 _%1:function
    %else
      global _%1
      %define %1 _%1
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function
    %else
      global %1
    %endif
  %endif
%endmacro

;=============================================================================
; Read only data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

ALIGN 16
mmx_one:
  times 4 dw 1

;=============================================================================
; Helper macros
;=============================================================================
|
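; A note on the SAD helpers below: plain MMX has no packed absolute-difference
; instruction, so |cur - ref| is built from two unsigned saturating byte
; subtractions OR-ed together (the pattern is visible in SADV_16x16_MMX), the
; bytes are widened to words against the zero register mm7, and the per-row
; sums are accumulated as packed words in mm6 (mm5 and mm6 for sad16v).  At
; the end of each SAD routine, a pmaddwd against [mmx_one] multiplies the four
; word sums by 1 and adds them pairwise, and the following psrlq/paddd pair
; folds the two resulting dwords into the single 32-bit return value.
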
%macro SAD_16x16_MMX 0
  movq mm0, [eax]
  ; ...
  paddusw mm6, mm2
%endmacro

%macro SAD_8x8_MMX 0
  movq mm0, [eax]
  movq mm1, [edx]
  ; ...
  paddusw mm6, mm2
%endmacro

%macro SADV_16x16_MMX 0
  movq mm0, [eax]
  movq mm1, [edx]

  movq mm2, [eax+8]
  movq mm3, [edx+8]

  movq mm4, mm0
  psubusb mm0, mm1

  psubusb mm1, mm4
  por mm0, mm1
  lea eax, [eax+ecx]

  movq mm4, mm2
  psubusb mm2, mm3

  psubusb mm3, mm4
  por mm2, mm3
  lea edx, [edx+ecx]

  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpckhbw mm1, mm7
  punpcklbw mm2, mm7
  punpckhbw mm3, mm7

  paddusw mm0, mm1
  paddusw mm2, mm3

  paddusw mm5, mm0
  paddusw mm6, mm2
%endmacro

%macro SADBI_16x16_MMX 2 ; SADBI_16x16_MMX( int_ptr_offset, bool_increment_ptr );

  movq mm0, [edx+%1]
  ; ...

%endmacro

%macro MEAN_16x16_MMX 0
  movq mm0, [eax]
  movq mm2, [eax+8]
  lea eax, [eax+ecx]
  movq mm1, mm0
  movq mm3, mm2
  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  paddw mm5, mm0
  paddw mm6, mm1
  paddw mm5, mm2
  paddw mm6, mm3
%endmacro
|
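; MEAN_16x16_MMX (above) adds one 16-pixel row (at eax, advanced by ecx) into
; the packed-word accumulators mm5/mm6, using mm7 (zero) to widen the bytes.
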
%macro ABS_16x16_MMX 0
  movq mm0, [eax]
  movq mm2, [eax+8]
  lea eax, [eax+ecx]
  movq mm1, mm0
  movq mm3, mm2
  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm4, mm6
  psubusw mm4, mm0

  psubusw mm0, mm6
  por mm0, mm4
  movq mm4, mm6
  psubusw mm4, mm1
  psubusw mm1, mm6
  por mm1, mm4

  movq mm4, mm6
  psubusw mm4, mm2
  psubusw mm2, mm6
  por mm2, mm4
  movq mm4, mm6
  psubusw mm4, mm3
  psubusw mm3, mm6
  por mm3, mm4

  paddw mm0, mm1
  paddw mm2, mm3
  paddw mm5, mm0
  paddw mm5, mm2
%endmacro
|
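; ABS_16x16_MMX (above) accumulates |pixel - mm6| for one 16-pixel row into
; mm5, where mm6 holds the reference value (the block mean in dev16_mmx)
; replicated in all four word lanes; |a - b| is again formed from the two
; saturating subtractions OR-ed together, here on words (psubusw/por).
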
;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal sad16_mmx
cglobal sad16v_mmx
cglobal sad8_mmx
cglobal sad16bi_mmx
cglobal sad8bi_mmx
cglobal dev16_mmx
cglobal sse8_16bit_mmx
cglobal sse8_8bit_mmx

;-----------------------------------------------------------------------------
;
; uint32_t sad16_mmx(const uint8_t * const cur,
;                    const uint8_t * const ref,
;                    const uint32_t stride,
;                    const uint32_t best_sad);
;
; (the best_sad early-termination argument is ignored; checking it would only
;  slow this routine down)
;
;-----------------------------------------------------------------------------
|
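; Register use: eax = cur, edx = ref, ecx = stride; each SAD_16x16_MMX below
; consumes one row.  A plain-C sketch of the value this routine returns:
;
;   uint32_t sad = 0;
;   for (j = 0; j < 16; j++, cur += stride, ref += stride)
;     for (i = 0; i < 16; i++)
;       sad += abs(cur[i] - ref[i]);
;   return sad;
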
ALIGN 16
sad16_mmx:

  mov eax, [esp+ 4] ; Src1
  mov edx, [esp+ 8] ; Src2
  mov ecx, [esp+12] ; Stride

  pxor mm6, mm6 ; accum
  pxor mm7, mm7 ; zero

  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX

  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX
  SAD_16x16_MMX

  pmaddwd mm6, [mmx_one] ; collapse
  movq mm7, mm6
  psrlq mm7, 32
  paddd mm6, mm7

  movd eax, mm6

  ret

;-----------------------------------------------------------------------------
;
; uint32_t sad8_mmx(const uint8_t * const cur,
;                   const uint8_t * const ref,
;                   const uint32_t stride);
;
;-----------------------------------------------------------------------------
|
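; Same computation as sad16_mmx, but over a single 8x8 block: the four
; SAD_8x8_MMX invocations below cover the eight 8-byte rows.
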
ALIGN 16
sad8_mmx:

  mov eax, [esp+ 4] ; Src1
  mov edx, [esp+ 8] ; Src2
  mov ecx, [esp+12] ; Stride

  pxor mm6, mm6 ; accum
  pxor mm7, mm7 ; zero

  SAD_8x8_MMX
  SAD_8x8_MMX
  SAD_8x8_MMX
  SAD_8x8_MMX

  pmaddwd mm6, [mmx_one] ; collapse
  movq mm7, mm6
  psrlq mm7, 32
  paddd mm6, mm7

  movd eax, mm6

  ret

;-----------------------------------------------------------------------------
;
; uint32_t sad16v_mmx(const uint8_t * const cur,
;                     const uint8_t * const ref,
;                     const uint32_t stride,
;                     int32_t *sad);
;
;-----------------------------------------------------------------------------
|
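; Computes four independent 8x8 SADs over the 16x16 block.  During the first
; eight rows mm5 accumulates the left half and mm6 the right half, giving
; sad[0] and sad[1]; the last eight rows fill sad[2] and sad[3] the same way.
; The return value is the sum of the four partial SADs.
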
ALIGN 16
sad16v_mmx:

  push ebx
  push edi

  mov eax, [esp + 8 + 4]  ; Src1
  mov edx, [esp + 8 + 8]  ; Src2
  mov ecx, [esp + 8 + 12] ; Stride
  mov ebx, [esp + 8 + 16] ; sad ptr

  pxor mm5, mm5 ; accum
  pxor mm6, mm6 ; accum
  pxor mm7, mm7 ; zero

  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX

  pmaddwd mm5, [mmx_one] ; collapse
  pmaddwd mm6, [mmx_one] ; collapse

  movq mm2, mm5
  movq mm3, mm6

  psrlq mm2, 32
  psrlq mm3, 32

  paddd mm5, mm2
  paddd mm6, mm3

  movd [ebx], mm5
  movd [ebx + 4], mm6

  paddd mm5, mm6

  movd edi, mm5

  pxor mm5, mm5
  pxor mm6, mm6

  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX
  SADV_16x16_MMX

  pmaddwd mm5, [mmx_one] ; collapse
  pmaddwd mm6, [mmx_one] ; collapse

  movq mm2, mm5
  movq mm3, mm6

  psrlq mm2, 32
  psrlq mm3, 32

  paddd mm5, mm2
  paddd mm6, mm3

  movd [ebx + 8], mm5
  movd [ebx + 12], mm6

  paddd mm5, mm6

  movd eax, mm5

  add eax, edi

  pop edi
  pop ebx

  ret

;-----------------------------------------------------------------------------
;
; uint32_t sad16bi_mmx(const uint8_t * const cur,
;                      const uint8_t * const ref1,
;                      const uint8_t * const ref2,
;                      const uint32_t stride);
;
;-----------------------------------------------------------------------------
|
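; Bi-directional SAD: cur is compared against the byte-wise average of the
; two references ref1 and ref2.  Per pixel, assuming the average is rounded
; as (a + b + 1) >> 1, this computes:
;
;   sad += abs(cur[i] - ((ref1[i] + ref2[i] + 1) >> 1));
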
ALIGN 16
sad16bi_mmx:
  push ebx
  mov eax, [esp+4+ 4] ; Src
  ; ...

  movd eax, mm6
  pop ebx

  ret

;-----------------------------------------------------------------------------
;
; uint32_t sad8bi_mmx(const uint8_t * const cur,
;                     const uint8_t * const ref1,
;                     const uint8_t * const ref2,
;                     const uint32_t stride);
;
;-----------------------------------------------------------------------------
|
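; 8x8 variant of sad16bi_mmx: the same interpolated SAD over a single block.
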
ALIGN 16
sad8bi_mmx:
  push ebx
  mov eax, [esp+4+ 4] ; Src
  ; ...
  pop ebx
  ret

;-----------------------------------------------------------------------------
;
; uint32_t dev16_mmx(const uint8_t * const cur,
;                    const uint32_t stride);
;
;-----------------------------------------------------------------------------
|
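; Sum of absolute deviations from the block mean: MEAN_16x16_MMX first
; accumulates the pixel sum of the 16x16 block; the part of the routine not
; shown here presumably derives mean = sum / 256 and broadcasts it into mm6,
; which ABS_16x16_MMX then uses to accumulate sum(|pixel - mean|) in mm5.
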
ALIGN 16
dev16_mmx:
  mov eax, [esp+ 4] ; Src
  mov ecx, [esp+ 8] ; Stride
  ; ...
  paddd mm6, mm5

  movd eax, mm6

  ret

;-----------------------------------------------------------------------------
;
; uint32_t sse8_16bit_mmx(const int16_t *b1,
;                         const int16_t *b2,
;                         const uint32_t stride);
;
;-----------------------------------------------------------------------------
|
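; Sum of squared errors between two 8x8 blocks of 16-bit coefficients.
; ROW_SSE_16bit_MMX handles one row: subtract the eight words, square and
; pairwise-add with pmaddwd, and accumulate the dwords in mm2.  Note that
; stride is applied as a byte offset to the int16_t pointers.
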
%macro ROW_SSE_16bit_MMX 2
  movq mm0, [%1]
  movq mm1, [%1+8]
  psubw mm0, [%2]
  psubw mm1, [%2+8]
  pmaddwd mm0, mm0
  pmaddwd mm1, mm1
  paddd mm2, mm0
  paddd mm2, mm1
%endmacro

sse8_16bit_mmx:
  push esi
  push edi

  ;; Load the function params
  mov esi, [esp+8+4]
  mov edi, [esp+8+8]
  mov edx, [esp+8+12]

  ;; Reset the sse accumulator
  pxor mm2, mm2

  ;; Let's go
%rep 8
  ROW_SSE_16bit_MMX esi, edi
  lea esi, [esi+edx]
  lea edi, [edi+edx]
%endrep

  ;; Finish adding each dword of the accumulator
  movq mm3, mm2
  psrlq mm2, 32
  paddd mm2, mm3
  movd eax, mm2

  ;; All done
  pop edi
  pop esi
  ret

;-----------------------------------------------------------------------------
;
; uint32_t sse8_8bit_mmx(const int8_t *b1,
;                        const int8_t *b2,
;                        const uint32_t stride);
;
;-----------------------------------------------------------------------------
|
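; Same as sse8_16bit_mmx but for 8-bit sources: each row is first widened to
; 16-bit words against mm7 (zero), then subtracted, squared with pmaddwd and
; accumulated in mm6.
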
%macro ROW_SSE_8bit_MMX 2
  movq mm0, [%1]  ; load a row
  movq mm2, [%2]  ; load a row

  movq mm1, mm0   ; copy row
  movq mm3, mm2   ; copy row

  punpcklbw mm0, mm7 ; turn the 4 low elements into 16bit
  punpckhbw mm1, mm7 ; turn the 4 high elements into 16bit

  punpcklbw mm2, mm7 ; turn the 4 low elements into 16bit
  punpckhbw mm3, mm7 ; turn the 4 high elements into 16bit

  psubw mm0, mm2  ; low part of src-dst
  psubw mm1, mm3  ; high part of src-dst

  pmaddwd mm0, mm0 ; compute the square sum
  pmaddwd mm1, mm1 ; compute the square sum

  paddd mm6, mm0  ; add to the accumulator
  paddd mm6, mm1  ; add to the accumulator
%endmacro

sse8_8bit_mmx:
  push esi
  push edi

  ;; Load the function params
  mov esi, [esp+8+4]
  mov edi, [esp+8+8]
  mov edx, [esp+8+12]

  ;; Reset the sse accumulator
  pxor mm6, mm6

  ;; Used to interleave 8bit data with 0x00 values
  pxor mm7, mm7

  ;; Let's go
%rep 8
  ROW_SSE_8bit_MMX esi, edi
  lea esi, [esi+edx]
  lea edi, [edi+edx]
%endrep

  ;; Finish adding each dword of the accumulator
  movq mm7, mm6
  psrlq mm6, 32
  paddd mm6, mm7
  movd eax, mm6

  ;; All done
  pop edi
  pop esi
  ret