Diff of /branches/dev-api-4/xvidcore/src/motion/x86_asm/sad_mmx.asm

-revision 886, Fri Feb 21 14:49:29 2003 UTC
+revision 1192, Tue Oct 28 22:23:03 2003 UTC
 Line 1
- ;/**************************************************************************
+ ;/****************************************************************************
  ; *
  ; *     XVID MPEG-4 VIDEO CODEC
- ; *     mmx sum of absolute difference
+ ; *  - MMX optimized SAD operators -
  ; *
- ; *     This program is free software; you can redistribute it and/or modify
+ ; *  Copyright(C) 2001 Peter Ross <pross@xvid.org>
- ; *     it under the terms of the GNU General Public License as published by
+ ; *               2002 Pascal Massimino <skal@planet-d.net>
+ ; *
+ ; *  This program is free software; you can redistribute it and/or modify it
+ ; *  under the terms of the GNU General Public License as published by
  ; *     the Free Software Foundation; either version 2 of the License, or
  ; *     (at your option) any later version.
  ; *
-Line 15
+Line 18
  ; *
  ; *     You should have received a copy of the GNU General Public License
  ; *     along with this program; if not, write to the Free Software
- ; *     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- ; *
- ; *************************************************************************/
- ;/**************************************************************************
- ; *
- ; *     History:
  ; *
- ; * 23.07.2002  sad[16,8]bi_xmm; <pross@xvid.org>
+ ; * $Id: sad_mmx.asm,v 1.11.2.1 2003-10-28 22:23:03 edgomez Exp $
- ; * 04.06.2002  cleanup -Skal-
- ; *     12.11.2001      inital version; (c)2001 peter ross <pross@cs.rmit.edu.au>
  ; *
- ; *************************************************************************/
+ ; ***************************************************************************/
- bits 32
+ BITS 32
  %macro cglobal 1
          %ifdef PREFIX
-Line 40
+Line 35
          %endif
  %endmacro
- section .data
+ ;=============================================================================
+ ; Read only data
- align 16
+ ;=============================================================================
- mmx_one times 4 dw 1
+ SECTION .rodata
- section .text
+ ALIGN 16
- cglobal  sad16_mmx
+ mmx_one:
- cglobal  sad16v_mmx
+         times 4 dw 1
- cglobal  sad8_mmx
- cglobal  sad16bi_mmx
+ ;=============================================================================
- cglobal  sad8bi_mmx
+ ; Helper macros
- cglobal  dev16_mmx
+ ;=============================================================================
- ;===========================================================================
- ;
- ; uint32_t sad16_mmx(const uint8_t * const cur,
- ;                                        const uint8_t * const ref,
- ;                                        const uint32_t stride,
- ;                                        const uint32_t best_sad);
- ;
- ; (early termination ignore; slows this down)
- ;
- ;===========================================================================
  %macro SAD_16x16_MMX 0
      movq mm0, [eax]
-Line 98
+Line 82
      paddusw mm6,mm2
  %endmacro
- align 16
+ %macro SAD_8x8_MMX      0   ; one expansion handles two 8-byte rows: mm6 += per-word sums of |[eax] - [edx]|
+   movq mm0, [eax]             ; mm0 = 8 bytes at eax (first row)
+   movq mm1, [edx]             ; mm1 = 8 bytes at edx (first row)
+   movq mm2, [eax+ecx]         ; mm2 = 8 bytes at eax+ecx (second row; ecx is presumably the stride - confirm in callers)
+   movq mm3, [edx+ecx]         ; mm3 = 8 bytes at edx+ecx (second row)
+   lea eax,[eax+2*ecx]         ; advance eax by two rows (lea leaves flags untouched)
+   lea edx,[edx+2*ecx]         ; advance edx by two rows
+   movq mm4, mm0               ; keep a copy of the first-row eax bytes
+   psubusb mm0, mm1            ; mm0 = saturate(mm0 - mm1), zero where mm1 is larger
+   movq mm5, mm2               ; keep a copy of the second-row eax bytes
+   psubusb mm2, mm3            ; mm2 = saturate(mm2 - mm3)
+   psubusb mm1, mm4            ; mm1 = saturate(mm1 - original mm0)
+   por mm0, mm1                ; mm0 = per-byte absolute difference, first row
+   psubusb mm3, mm5            ; mm3 = saturate(mm3 - original mm2)
+   por mm2, mm3                ; mm2 = per-byte absolute difference, second row
+   movq mm1,mm0                ; duplicate so low and high halves can be unpacked separately
+   movq mm3,mm2
+   punpcklbw mm0,mm7           ; low 4 bytes -> words, interleaved with mm7 (assumes caller zeroed mm7 - TODO confirm)
+   punpckhbw mm1,mm7           ; high 4 bytes -> words
+   punpcklbw mm2,mm7
+   punpckhbw mm3,mm7
+   paddusw mm0,mm1             ; first row: low words + high words (saturating)
+   paddusw mm6,mm0             ; accumulate first row into mm6
+   paddusw mm2,mm3             ; second row: low words + high words (saturating)
+   paddusw mm6,mm2             ; accumulate second row into mm6; mm0-mm5 are clobbered
+ %endmacro
+ %macro SADV_16x16_MMX 0     ; one 16-byte row: mm5 += |diff| of bytes 0-7, mm6 += |diff| of bytes 8-15 (word sums)
+   movq mm0, [eax]             ; mm0 = bytes 0-7 at eax
+   movq mm1, [edx]             ; mm1 = bytes 0-7 at edx
+   movq mm2, [eax+8]           ; mm2 = bytes 8-15 at eax
+   movq mm3, [edx+8]           ; mm3 = bytes 8-15 at edx
+   movq mm4, mm0               ; keep a copy of mm0 for the reverse subtraction
+   psubusb mm0, mm1            ; mm0 = saturate(mm0 - mm1)
+   psubusb mm1, mm4            ; mm1 = saturate(mm1 - original mm0)
+   por mm0, mm1                ; mm0 = per-byte absolute difference, bytes 0-7
+   lea eax,[eax+ecx]           ; advance eax by ecx (presumably the stride - confirm in caller)
+   movq mm4, mm2               ; keep a copy of mm2 for the reverse subtraction
+   psubusb mm2, mm3            ; mm2 = saturate(mm2 - mm3)
+   psubusb mm3, mm4            ; mm3 = saturate(mm3 - original mm2)
+   por mm2, mm3                ; mm2 = per-byte absolute difference, bytes 8-15
+   lea edx,[edx+ecx]           ; advance edx by ecx
+   movq mm1,mm0                ; duplicate so low and high halves can be unpacked separately
+   movq mm3,mm2
+   punpcklbw mm0,mm7           ; bytes -> words, interleaved with mm7 (assumes caller zeroed mm7 - TODO confirm)
+   punpckhbw mm1,mm7
+   punpcklbw mm2,mm7
+   punpckhbw mm3,mm7
+   paddusw mm0,mm1             ; bytes 0-7: low words + high words (saturating)
+   paddusw mm2,mm3             ; bytes 8-15: low words + high words (saturating)
+   paddusw mm5, mm0            ; mm5 accumulates the bytes 0-7 half
+   paddusw mm6, mm2            ; mm6 accumulates the bytes 8-15 half; mm0-mm4 are clobbered
+ %endmacro
+ %macro SADBI_16x16_MMX 2    ; SADBI_16x16_MMX( int_ptr_offset, bool_increment_ptr ); 8 bytes at offset %1: mm6 += |(([edx]+[ebx]+1)>>1) - [eax]|
+   movq mm0, [edx+%1]          ; mm0 = 8 bytes of the first reference at edx+%1
+   movq mm2, [ebx+%1]          ; mm2 = 8 bytes of the second reference at ebx+%1
+   movq mm1, mm0               ; duplicate so low and high halves can be unpacked separately
+   movq mm3, mm2
+ %if %2 != 0
+   add edx, ecx                ; advance edx by ecx when %2 is non-zero (ecx presumably the stride - confirm in caller)
+ %endif
+   punpcklbw mm0, mm7          ; bytes -> words, interleaved with mm7 (assumes caller zeroed mm7 - TODO confirm)
+   punpckhbw mm1, mm7
+   punpcklbw mm2, mm7
+   punpckhbw mm3, mm7
+ %if %2 != 0
+   add ebx, ecx                ; advance ebx by ecx when %2 is non-zero
+ %endif
+   paddusw mm0, mm2              ; mm01 = ref1 + ref2
+   paddusw mm1, mm3
+   paddusw mm0, [mmx_one]        ; mm01 += 1
+   paddusw mm1, [mmx_one]
+   psrlw mm0, 1                  ; mm01 >>= 1
+   psrlw mm1, 1
+   movq mm2, [eax+%1]          ; mm2 = 8 bytes at eax+%1
+   movq mm3, mm2
+   punpcklbw mm2, mm7            ; mm23 = src
+   punpckhbw mm3, mm7
+ %if %2 != 0
+   add eax, ecx                ; advance eax by ecx when %2 is non-zero
+ %endif
+   movq mm4, mm0               ; keep copies of the averaged words for the reverse subtraction
+   movq mm5, mm1
+   psubusw mm0, mm2            ; mm01 = saturate(avg - src)
+   psubusw mm1, mm3
+   psubusw mm2, mm4            ; mm23 = saturate(src - avg)
+   psubusw mm3, mm5
+   por mm0, mm2                  ; mm01 = ABS(mm01 - mm23)
+   por mm1, mm3
+   paddusw mm6, mm0              ; mm6 += mm01
+   paddusw mm6, mm1            ; mm0-mm5 are clobbered
+ %endmacro
+ %macro MEAN_16x16_MMX 0     ; one 16-byte row: adds the row's bytes, widened to words, into the mm5/mm6 running sums
+   movq mm0, [eax]             ; mm0 = bytes 0-7 at eax
+   movq mm2, [eax+8]           ; mm2 = bytes 8-15 at eax
+   lea eax, [eax+ecx]          ; advance eax by ecx
+   movq mm1, mm0               ; duplicate so low and high halves can be unpacked separately
+   movq mm3, mm2
+   punpcklbw mm0, mm7          ; bytes -> words, interleaved with mm7 (assumes caller zeroed mm7 - TODO confirm)
+   punpcklbw mm2, mm7
+   punpckhbw mm1, mm7
+   punpckhbw mm3, mm7
+   paddw mm5, mm0              ; mm5 += low-half words of bytes 0-7 (wrapping add, not saturating)
+   paddw mm6, mm1              ; mm6 += high-half words of bytes 0-7
+   paddw mm5, mm2              ; mm5 += low-half words of bytes 8-15
+   paddw mm6, mm3              ; mm6 += high-half words of bytes 8-15; mm0-mm3 are clobbered
+ %endmacro
+ %macro ABS_16x16_MMX 0      ; one 16-byte row: mm5 += |pixel - mm6| per word (mm6 presumably holds the block mean in each word - confirm in caller)
+   movq mm0, [eax]             ; mm0 = bytes 0-7 at eax
+   movq mm2, [eax+8]           ; mm2 = bytes 8-15 at eax
+   lea eax, [eax+ecx]          ; advance eax by ecx
+   movq mm1, mm0               ; duplicate so low and high halves can be unpacked separately
+   movq mm3, mm2
+   punpcklbw mm0, mm7          ; bytes -> words, interleaved with mm7 (assumes caller zeroed mm7 - TODO confirm)
+   punpcklbw mm2, mm7
+   punpckhbw mm1, mm7
+   punpckhbw mm3, mm7
+   movq mm4, mm6               ; mm4 = copy of the reference words in mm6
+   psubusw mm4, mm0            ; mm4 = saturate(mm6 - mm0)
+   psubusw mm0, mm6            ; mm0 = saturate(mm0 - mm6)
+   por mm0, mm4                ; mm0 = |mm0 - mm6| (one of the two terms is always zero)
+   movq mm4, mm6               ; same absolute-difference pattern for mm1
+   psubusw mm4, mm1
+   psubusw mm1, mm6
+   por mm1, mm4                ; mm1 = |mm1 - mm6|
+   movq mm4, mm6               ; same pattern for mm2
+   psubusw mm4, mm2
+   psubusw mm2, mm6
+   por mm2, mm4                ; mm2 = |mm2 - mm6|
+   movq mm4, mm6               ; same pattern for mm3
+   psubusw mm4, mm3
+   psubusw mm3, mm6
+   por mm3, mm4                ; mm3 = |mm3 - mm6|
+   paddw mm0, mm1              ; combine the two halves of bytes 0-7 (wrapping add)
+   paddw mm2, mm3              ; combine the two halves of bytes 8-15
+   paddw mm5, mm0              ; accumulate into mm5
+   paddw mm5, mm2              ; mm0-mm4 are clobbered; mm6 is left unchanged
+ %endmacro
+ ;=============================================================================
+ ; Code
+ ;=============================================================================
+ SECTION .text
+ cglobal  sad16_mmx
+ cglobal  sad16v_mmx
+ cglobal  sad8_mmx
+ cglobal  sad16bi_mmx
+ cglobal  sad8bi_mmx
+ cglobal  dev16_mmx
+ ;-----------------------------------------------------------------------------
+ ;
+ ; uint32_t sad16_mmx(const uint8_t * const cur,
+ ;                                        const uint8_t * const ref,
+ ;                                        const uint32_t stride,
+ ;                                        const uint32_t best_sad);
+ ;
+ ; (early termination is ignored; it slows this down)
+ ;
+ ;-----------------------------------------------------------------------------
+ ALIGN 16
  sad16_mmx:
      mov eax, [esp+ 4] ; Src1
-Line 135
+Line 314
      ret
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
  ;
  ; uint32_t sad8_mmx(const uint8_t * const cur,
  ;                                       const uint8_t * const ref,
  ;                                       const uint32_t stride);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- %macro SAD_8x8_MMX  0
-     movq mm0, [eax]
-     movq mm1, [edx]
-     movq mm2, [eax+ecx]
-     movq mm3, [edx+ecx]
-     lea eax,[eax+2*ecx]
-     lea edx,[edx+2*ecx]
-     movq mm4, mm0
-     psubusb mm0, mm1
-     movq mm5, mm2
-     psubusb mm2, mm3
-     psubusb mm1, mm4
-     por mm0, mm1
-     psubusb mm3, mm5
-     por mm2, mm3
-     movq mm1,mm0
-     movq mm3,mm2
-     punpcklbw mm0,mm7
-     punpckhbw mm1,mm7
-     punpcklbw mm2,mm7
-     punpckhbw mm3,mm7
-     paddusw mm0,mm1
-     paddusw mm6,mm0
-     paddusw mm2,mm3
-     paddusw mm6,mm2
- %endmacro
- align 16
+ ALIGN 16
  sad8_mmx:
      mov eax, [esp+ 4] ; Src1
-Line 202
+Line 346
      ret
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
  ;
  ; uint32_t sad16v_mmx(const uint8_t * const cur,
  ;                                     const uint8_t * const ref,
  ;                                         const uint32_t stride,
  ;                                         int32_t *sad);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- %macro SADV_16x16_MMX 0
+ ALIGN 16
-     movq mm0, [eax]
-     movq mm1, [edx]
-     movq mm2, [eax+8]
-     movq mm3, [edx+8]
-     movq mm4, mm0
-     psubusb mm0, mm1
-     psubusb mm1, mm4
-     por mm0, mm1
-     lea eax,[eax+ecx]
-     movq mm4, mm2
-     psubusb mm2, mm3
-     psubusb mm3, mm4
-     por mm2, mm3
-     lea edx,[edx+ecx]
-     movq mm1,mm0
-     movq mm3,mm2
-     punpcklbw mm0,mm7
-     punpckhbw mm1,mm7
-     punpcklbw mm2,mm7
-     punpckhbw mm3,mm7
-     paddusw mm0,mm1
-     paddusw mm2,mm3
-         paddusw mm5, mm0
-         paddusw mm6, mm2
- %endmacro
- align 16
  sad16v_mmx:
          push ebx
-Line 326
+Line 433
          pop edi
      pop ebx
-         ret
+   ret
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;
  ; uint32_t sad16bi_mmx(const uint8_t * const cur,
  ; const uint8_t * const ref1,
  ; const uint8_t * const ref2,
  ; const uint32_t stride);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- %macro SADBI_16x16_MMX 2    ; SADBI_16x16_MMX( int_ptr_offset, bool_increment_ptr );
-    movq mm0, [edx+%1]
-    movq mm2, [ebx+%1]
-    movq mm1, mm0
-    movq mm3, mm2
- %if %2 != 0
-    add edx, ecx
- %endif
-    punpcklbw mm0, mm7
-    punpckhbw mm1, mm7
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- %if %2 != 0
+ ALIGN 16
-    add ebx, ecx
- %endif
- paddusw mm0, mm2    ; mm01 = ref1 + ref2
- paddusw mm1, mm3
- paddusw mm0, [mmx_one] ; mm01 += 1
- paddusw mm1, [mmx_one]
- psrlw mm0, 1     ; mm01 >>= 1
- psrlw mm1, 1
-    movq mm2, [eax+%1]
-    movq mm3, mm2
-    punpcklbw mm2, mm7          ; mm23 = src
-    punpckhbw mm3, mm7
- %if %2 != 0
-    add eax, ecx
- %endif
-    movq mm4, mm0
-    movq mm5, mm1
-    psubusw mm0, mm2
-    psubusw mm1, mm3
-    psubusw mm2, mm4
-    psubusw mm3, mm5
-    por mm0, mm2                ; mm01 = ABS(mm01 - mm23)
-    por mm1, mm3
-    paddusw mm6,mm0             ; mm6 += mm01
-    paddusw mm6,mm1
- %endmacro
- align 16
  sad16bi_mmx:
     push ebx
     mov eax, [esp+4+ 4] ; Src
-Line 440
+Line 497
     movd eax, mm6
     pop ebx
     ret
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
  ;
  ; uint32_t sad8bi_mmx(const uint8_t * const cur,
  ; const uint8_t * const ref1,
  ; const uint8_t * const ref2,
  ; const uint32_t stride);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- align 16
+ ALIGN 16
  sad8bi_mmx:
     push ebx
     mov eax, [esp+4+ 4] ; Src
-Line 479
+Line 538
     pop ebx
     ret
+ ;-----------------------------------------------------------------------------
- ;===========================================================================
  ;
  ; uint32_t dev16_mmx(const uint8_t * const cur,
  ;                                       const uint32_t stride);
  ;
- ;===========================================================================
+ ;-----------------------------------------------------------------------------
- %macro MEAN_16x16_MMX 0
-     movq mm0, [eax]
-     movq mm2, [eax+8]
-     lea eax,[eax+ecx]
-     movq mm1, mm0
-     movq mm3, mm2
-     punpcklbw mm0,mm7
-     punpcklbw mm2,mm7
-     punpckhbw mm1,mm7
-     punpckhbw mm3,mm7
-     paddw mm5, mm0
-     paddw mm6, mm1
-     paddw mm5, mm2
-     paddw mm6, mm3
- %endmacro
- %macro ABS_16x16_MMX 0
+ ALIGN 16
-     movq mm0, [eax]
-     movq mm2, [eax+8]
-     lea eax,[eax+ecx]
-     movq mm1, mm0
-     movq mm3, mm2
-     punpcklbw mm0, mm7
-     punpcklbw mm2, mm7
-     punpckhbw mm1, mm7
-     punpckhbw mm3, mm7
-     movq mm4, mm6
-         psubusw mm4, mm0
-         psubusw mm0, mm6
-         por mm0, mm4
-         movq mm4, mm6
-         psubusw mm4, mm1
-         psubusw mm1, mm6
-         por mm1, mm4
-     movq mm4, mm6
-         psubusw mm4, mm2
-         psubusw mm2, mm6
-         por mm2, mm4
-         movq mm4, mm6
-         psubusw mm4, mm3
-         psubusw mm3, mm6
-         por mm3, mm4
-         paddw mm0, mm1
-         paddw mm2, mm3
-         paddw mm5, mm0
-         paddw mm5, mm2
- %endmacro
- align 16
  dev16_mmx:
      mov eax, [esp+ 4] ; Src
      mov ecx, [esp+ 8] ; Stride
-Line 609
+Line 614
      paddd mm6, mm5
      movd eax, mm6
      ret

 Legend:



Removed from v.886
 


changed lines


 
Added in v.1192
 Legend:



Removed from v.886
 


changed lines


 
Added in v.1192
-Removed from v.886
+Added in v.1192

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4