Diff of /trunk/xvidcore/src/image/x86_asm/interpolate8x8_mmx.asm

-revision 1519, Sat Jul 24 11:46:08 2004 UTC
+revision 1793, Tue Nov 11 20:46:24 2008 UTC
 Line 26
  %macro cglobal 1 ; export symbol %1; also defines ENDFUNC for use after each function body
          %ifdef PREFIX ; PREFIX: exported names get a leading underscore
+                 %ifdef MARK_FUNCS ; MARK_FUNCS: tag the symbol as a function sized (%1.endfunc - %1)
+                         global _%1:function %1.endfunc-%1
+                         %define %1 _%1:function %1.endfunc-%1
+                         %define ENDFUNC .endfunc
+                 %else
                  global _%1 ; plain export of the underscore-prefixed name
                  %define %1 _%1
+                         %define ENDFUNC
+                 %endif
+         %else
+                 %ifdef MARK_FUNCS ; same size/type tagging, without the underscore prefix
+                         global %1:function %1.endfunc-%1
+                         %define ENDFUNC .endfunc
          %else
                  global %1 ; plain export; ENDFUNC expands to nothing in this case
+                         %define ENDFUNC
+                 %endif
          %endif
  %endmacro
-Line 95
+Line 108
  cglobal interpolate8x8_halfpel_h_mmx
  cglobal interpolate8x8_halfpel_v_mmx
  cglobal interpolate8x8_halfpel_hv_mmx
+ cglobal interpolate8x4_halfpel_h_mmx
+ cglobal interpolate8x4_halfpel_v_mmx
+ cglobal interpolate8x4_halfpel_hv_mmx
  cglobal interpolate8x8_avg4_mmx
  cglobal interpolate8x8_avg2_mmx
  cglobal interpolate8x8_6tap_lowpass_h_mmx
  cglobal interpolate8x8_6tap_lowpass_v_mmx
+ cglobal interpolate8x8_halfpel_add_mmx
+ cglobal interpolate8x8_halfpel_h_add_mmx
+ cglobal interpolate8x8_halfpel_v_add_mmx
+ cglobal interpolate8x8_halfpel_hv_add_mmx
  %macro  CALC_AVG 6
    punpcklbw %3, %6
    punpckhbw %4, %6
-Line 169
+Line 193
    pop esi
    ret
+ ENDFUNC
  ;-----------------------------------------------------------------------------
-Line 228
+Line 253
    pop esi
    ret
+ ENDFUNC
  ;-----------------------------------------------------------------------------
-Line 319
+Line 345
    pop esi
    ret
+ ENDFUNC
+ ;-----------------------------------------------------------------------------
+ ; Expands COPY_H_MMX four times (macro defined elsewhere in the file).
+ ; void interpolate8x4_halfpel_h_mmx(uint8_t * const dst,
+ ;                       const uint8_t * const src,
+ ;                       const uint32_t stride,
+ ;                       const uint32_t rounding);
+ ; Register setup: edi = dst, esi = src, edx = stride, mm6 = 0, mm7 = rounder.
+ ;-----------------------------------------------------------------------------
+ ALIGN 16
+ interpolate8x4_halfpel_h_mmx:
+   push esi                      ; esi/edi are saved here and restored before ret
+   push edi
+   mov eax, [esp + 8 + 16]       ; rounding (the +8 skips the two registers pushed above)
+   movq mm7, [rounding1_mmx + eax * 8] ; mm7 = 8-byte rounding1_mmx entry selected by rounding
+   mov edi, [esp + 8 + 4]        ; dst
+   mov esi, [esp + 8 + 8]        ; src
+   mov edx, [esp + 8 + 12]       ; stride
+   pxor mm6, mm6                 ; zero
+   COPY_H_MMX                    ; presumably one row per expansion (4 rows) - TODO confirm against the macro
+   COPY_H_MMX
+   COPY_H_MMX
+   COPY_H_MMX
+   pop edi                       ; restore in reverse push order
+   pop esi
+   ret
+ ENDFUNC
+ ;-----------------------------------------------------------------------------
+ ; Expands COPY_V_MMX four times (macro defined elsewhere in the file).
+ ; void interpolate8x4_halfpel_v_mmx(uint8_t * const dst,
+ ;                       const uint8_t * const src,
+ ;                       const uint32_t stride,
+ ;                       const uint32_t rounding);
+ ; Register setup: edi = dst, esi = src, edx = stride, mm6 = 0, mm7 = rounder.
+ ;-----------------------------------------------------------------------------
+ ALIGN 16
+ interpolate8x4_halfpel_v_mmx:
+   push esi                      ; esi/edi are saved here and restored before ret
+   push edi
+   mov eax, [esp + 8 + 16]       ; rounding (the +8 skips the two registers pushed above)
+   movq mm7, [rounding1_mmx + eax * 8] ; mm7 = 8-byte rounding1_mmx entry selected by rounding
+   mov edi, [esp + 8 + 4]        ; dst
+   mov esi, [esp + 8 + 8]        ; src
+   mov edx, [esp + 8 + 12]       ; stride
+   pxor mm6, mm6                 ; zero
+   COPY_V_MMX                    ; presumably one row per expansion (4 rows) - TODO confirm against the macro
+   COPY_V_MMX
+   COPY_V_MMX
+   COPY_V_MMX
+   pop edi                       ; restore in reverse push order
+   pop esi
+   ret
+ ENDFUNC
+ ;-----------------------------------------------------------------------------
+ ; Expands COPY_HV_MMX four times (macro defined elsewhere in the file).
+ ; void interpolate8x4_halfpel_hv_mmx(uint8_t * const dst,
+ ;                       const uint8_t * const src,
+ ;                       const uint32_t stride,
+ ;                       const uint32_t rounding);
+ ; Register setup: edi = dst, esi = src, edx = stride, mm6 = 0,
+ ; mm7 = rounder taken from rounding2_mmx (not rounding1_mmx), eax = 8.
+ ;-----------------------------------------------------------------------------
+ ALIGN 16
+ interpolate8x4_halfpel_hv_mmx:
+   push esi                  ; esi/edi are saved here and restored before ret
+   push edi
+   mov eax, [esp + 8 + 16]   ; rounding (the +8 skips the two registers pushed above)
+   movq mm7, [rounding2_mmx + eax * 8] ; mm7 = 8-byte rounding2_mmx entry selected by rounding
+   mov edi, [esp + 8 + 4]    ; dst
+   mov esi, [esp + 8 + 8]    ; src
+   mov eax, 8                ; NOTE(review): not read by any instruction visible here - check whether COPY_HV_MMX uses eax
+   pxor mm6, mm6             ; zero
+   mov edx, [esp + 8 + 12]   ; stride
+   COPY_HV_MMX               ; presumably one row per expansion (4 rows) - TODO confirm against the macro
+   COPY_HV_MMX
+   COPY_HV_MMX
+   COPY_HV_MMX
+   pop edi                   ; restore in reverse push order
+   pop esi
+   ret
+ ENDFUNC
  ;-----------------------------------------------------------------------------
  ;
-Line 451
+Line 592
    AVG2_MMX_RND0
    lea ecx, [ecx+2*edx]
- .start0
+ .start0:
    AVG2_MMX_RND0
    lea ecx, [ecx+2*edx]
-Line 464
+Line 605
    pop ebx
    ret
- .rounding1
+ .rounding1:
    mov eax, [esp + 4 + 24]       ; height -> eax
    sub eax, 8
    test eax, eax
-Line 481
+Line 622
    AVG2_MMX_RND1
    lea ecx, [ecx+2*edx]
- .start1
+ .start1:
    AVG2_MMX_RND1
    lea ecx, [ecx+2*edx]
-Line 493
+Line 634
    pop ebx
    ret
+ ENDFUNC
  ;-----------------------------------------------------------------------------
-Line 660
+Line 802
    pop ebx
    ret
- .rounding1
+ .rounding1:
    AVG4_MMX_RND1
    lea ecx, [ecx+edx]
    AVG4_MMX_RND1
-Line 681
+Line 823
    pop edi
    pop ebx
    ret
+ ENDFUNC
  ;-----------------------------------------------------------------------------
-Line 791
+Line 934
    LOWPASS_6TAP_H_MMX
    ret
+ ENDFUNC
  ;-----------------------------------------------------------------------------
  ;
-Line 908
+Line 1052
    pop ebx
    ret
+ ENDFUNC
+ ;===========================================================================
+ ;
+ ; The following functions combine the source halfpel interpolation step with
+ ; the averaging (with rounding) step, to avoid wasting memory bandwidth on
+ ; computing intermediate halfpel images and then averaging them.
+ ;
+ ;===========================================================================
+ %macro PROLOG0 0 ; load the three pointer/stride arguments; offsets assume nothing has been pushed yet
+   mov ecx, [esp+ 4] ; ecx = Dst (first stack argument)
+   mov eax, [esp+ 8] ; eax = Src (second stack argument)
+   mov edx, [esp+12] ; edx = BpS, the stride used to step between rows
+ %endmacro
+ ; Common entry sequence of the halfpel *_add functions.
+ ; Must be expanded at function entry, before anything is pushed: the stack
+ ; arguments are addressed relative to the unmodified esp.
+ ; On exit: mm6 = 0, mm7 = rounder entry selected by the 'rounding' argument,
+ ; mm5 = first entry of rounding1_mmx (only when the second parameter is
+ ; non-zero), ecx = Dst, eax = Src, edx = BpS. Clobbers the flags.
+ %macro PROLOG 2   ; %1: Rounder table, %2: non-zero to also load the Dst-Rounder into mm5
+   mov eax, [esp+16] ; Rounding (4th argument); read before PROLOG0 reloads eax with Src
+   cmp eax, 1        ; CF = 1 only when Rounding == 0
+   sbb eax, eax      ; eax = -1 when Rounding == 0, otherwise 0
+   inc eax           ; eax = 0 when Rounding == 0, otherwise 1: always a valid table index
+   pxor mm6, mm6     ; mm6 = 0, used to unpack bytes into words
+   movq mm7, [%1 + eax * 8] ; same rounding-indexed lookup as the non-add halfpel functions
+ %if %2
+   movq mm5, [rounding1_mmx] ; rounder for the final Src/Dst average
+ %endif
+   PROLOG0
+ %endmacro
+   ; performs: mm0 = lo(mm0)+lo(mm2), mm1 = hi(mm1)+hi(mm3), as four 16-bit words each
+ %macro MIX 0 ; callers copy mm0 into mm1 and mm2 into mm3 first
+   punpcklbw mm0, mm6 ; low 4 bytes -> words (interleaved with mm6, which PROLOG cleared)
+   punpcklbw mm2, mm6
+   punpckhbw mm1, mm6 ; high 4 bytes -> words
+   punpckhbw mm3, mm6
+   paddusw mm0, mm2   ; word-wise sums, unsigned saturating add
+   paddusw mm1, mm3
+ %endmacro
+ %macro MIX_DST 0 ; in: mm0/mm1 = Src word sums, mm2 = 8 Dst bytes; out: mm0 = 8 packed result bytes
+   movq mm3, mm2     ; second copy of Dst for the high half
+   paddusw mm0, mm7  ; rounder
+   paddusw mm1, mm7  ; rounder
+   punpcklbw mm2, mm6 ; Dst low 4 bytes -> words (mm6 is the zero register)
+   punpckhbw mm3, mm6 ; Dst high 4 bytes -> words
+   psrlw mm0, 1      ; (sum + rounder) >> 1 = interpolated Src
+   psrlw mm1, 1
+   paddusw mm0, mm2  ; mix Src(mm0/mm1) with Dst(mm2/mm3)
+   paddusw mm1, mm3
+   paddusw mm0, mm5  ; add the second rounder held in mm5
+   paddusw mm1, mm5
+   psrlw mm0, 1      ; (Src + Dst + mm5) >> 1
+   psrlw mm1, 1
+   packuswb mm0, mm1 ; words -> 8 bytes with unsigned saturation
+ %endmacro
+ %macro MIX2 0 ; mm0 = packed bytes of (lo(mm0)+lo(mm2)+mm7)>>1 and (hi(mm1)+hi(mm3)+mm7)>>1
+   punpcklbw mm0, mm6 ; low 4 bytes -> words (interleaved with mm6)
+   punpcklbw mm2, mm6
+   paddusw mm0, mm2   ; low-half sum
+   paddusw mm0, mm7   ; add the rounder held in mm7
+   punpckhbw mm1, mm6 ; high 4 bytes -> words
+   punpckhbw mm3, mm6
+   paddusw mm1, mm7   ; rounder first, then the second operand
+   paddusw mm1, mm3
+   psrlw mm0, 1       ; halve both sums
+   psrlw mm1, 1
+   packuswb mm0, mm1  ; words -> 8 bytes with unsigned saturation
+ %endmacro
+ ;===========================================================================
+ ;
+ ; void interpolate8x8_halfpel_add_mmx(uint8_t * const dst,
+ ;                       const uint8_t * const src,
+ ;                       const uint32_t stride,
+ ;                       const uint32_t rounding);
+ ;
+ ;
+ ;===========================================================================
+ %macro ADD_FF_MMX 1 ; average 8 bytes at [eax] into [ecx]; the parameter is the row step (0 = do not advance)
+   movq mm0, [eax]   ; 8 Src bytes
+   movq mm2, [ecx]   ; 8 Dst bytes
+   movq mm1, mm0     ; second copies, used by MIX for the high halves
+   movq mm3, mm2
+ %if (%1!=0)
+   lea eax,[eax+%1*edx] ; advance Src by the requested number of rows
+ %endif
+   MIX
+   paddusw mm0, mm5  ; rounder
+   paddusw mm1, mm5  ; rounder
+   psrlw mm0, 1      ; (Src + Dst + rounder) >> 1
+   psrlw mm1, 1
+   packuswb mm0, mm1 ; words -> 8 bytes
+   movq [ecx], mm0   ; write the result back over Dst
+ %if (%1!=0)
+   lea ecx,[ecx+%1*edx] ; advance Dst by the same step
+ %endif
+ %endmacro
+ ALIGN 16
+ interpolate8x8_halfpel_add_mmx:
+   PROLOG rounding1_mmx, 1 ; ecx = Dst, eax = Src, edx = BpS, mm6 = 0, mm5/mm7 = rounders
+   ADD_FF_MMX 1            ; rows 0..6: process one row, then step both pointers by one row
+   ADD_FF_MMX 1
+   ADD_FF_MMX 1
+   ADD_FF_MMX 1
+   ADD_FF_MMX 1
+   ADD_FF_MMX 1
+   ADD_FF_MMX 1
+   ADD_FF_MMX 0            ; row 7: last row, no pointer advance needed
+   ret
+ ENDFUNC
+ ;===========================================================================
+ ;
+ ; void interpolate8x8_halfpel_h_add_mmx(uint8_t * const dst,
+ ;                       const uint8_t * const src,
+ ;                       const uint32_t stride,
+ ;                       const uint32_t rounding);
+ ;
+ ;
+ ;===========================================================================
+ %macro ADD_FH_MMX 0 ; one row: mix Src[x]/Src[x+1] and average with Dst; ecx is advanced by the caller
+   movq mm0, [eax]   ; 8 Src bytes
+   movq mm2, [eax+1] ; the same row, one byte to the right
+   movq mm1, mm0     ; second copies, used by MIX for the high halves
+   movq mm3, mm2
+   lea eax,[eax+edx] ; advance Src to the next row
+   MIX
+   movq mm2, [ecx]   ; prepare mix with Dst[0]
+   MIX_DST
+   movq [ecx], mm0   ; store the 8 result bytes over Dst
+ %endmacro
+ ALIGN 16
+ interpolate8x8_halfpel_h_add_mmx:
+   PROLOG rounding1_mmx, 1 ; ecx = Dst, eax = Src, edx = BpS, mm6 = 0, mm5/mm7 = rounders
+   ADD_FH_MMX              ; row 0 (the macro advances Src itself)
+   lea ecx,[ecx+edx]       ; advance Dst to the next row
+   ADD_FH_MMX
+   lea ecx,[ecx+edx]
+   ADD_FH_MMX
+   lea ecx,[ecx+edx]
+   ADD_FH_MMX
+   lea ecx,[ecx+edx]
+   ADD_FH_MMX
+   lea ecx,[ecx+edx]
+   ADD_FH_MMX
+   lea ecx,[ecx+edx]
+   ADD_FH_MMX
+   lea ecx,[ecx+edx]
+   ADD_FH_MMX              ; row 7: no Dst advance after the last row
+   ret
+ ENDFUNC
+ ;===========================================================================
+ ;
+ ; void interpolate8x8_halfpel_v_add_mmx(uint8_t * const dst,
+ ;                       const uint8_t * const src,
+ ;                       const uint32_t stride,
+ ;                       const uint32_t rounding);
+ ;
+ ;
+ ;===========================================================================
+ %macro ADD_HF_MMX 0 ; one row: mix Src row y with row y+1 and average with Dst; ecx is advanced by the caller
+   movq mm0, [eax]     ; 8 Src bytes of the current row
+   movq mm2, [eax+edx] ; 8 Src bytes of the row below
+   movq mm1, mm0       ; second copies, used by MIX for the high halves
+   movq mm3, mm2
+   lea eax,[eax+edx]   ; advance Src to the next row
+   MIX
+   movq mm2, [ecx]   ; prepare mix with Dst[0]
+   MIX_DST
+   movq [ecx], mm0     ; store the 8 result bytes over Dst
+ %endmacro
+ ALIGN 16
+ interpolate8x8_halfpel_v_add_mmx:
+   PROLOG rounding1_mmx, 1 ; ecx = Dst, eax = Src, edx = BpS, mm6 = 0, mm5/mm7 = rounders
+   ADD_HF_MMX              ; row 0 (the macro advances Src itself)
+   lea ecx,[ecx+edx]       ; advance Dst to the next row
+   ADD_HF_MMX
+   lea ecx,[ecx+edx]
+   ADD_HF_MMX
+   lea ecx,[ecx+edx]
+   ADD_HF_MMX
+   lea ecx,[ecx+edx]
+   ADD_HF_MMX
+   lea ecx,[ecx+edx]
+   ADD_HF_MMX
+   lea ecx,[ecx+edx]
+   ADD_HF_MMX
+   lea ecx,[ecx+edx]
+   ADD_HF_MMX              ; row 7: no Dst advance after the last row
+   ret
+ ENDFUNC
+ ; The trick is to correct the result of 'pavgb' with some combination of the
+ ; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
+ ; The boolean relations are:
+ ;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
+ ;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
+ ;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
+ ;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
+ ; with  s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
+ ; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
+ ;===========================================================================
+ ;
+ ; void interpolate8x8_halfpel_hv_add_mmx(uint8_t * const dst,
+ ;                       const uint8_t * const src,
+ ;                       const uint32_t stride,
+ ;                       const uint32_t rounding);
+ ;
+ ;
+ ;===========================================================================
+ %macro ADD_HH_MMX 0 ; one row of the 4-sample mix; in: mm2/mm3 = horizontal word sums of the previous Src row
+   lea eax,[eax+edx] ; advance Src to the next row
+     ; transfer the previous line's sums to mm0/mm1
+   movq mm0, mm2
+   movq mm1, mm3
+     ; load new line in mm2/mm3
+   movq mm2, [eax]
+   movq mm4, [eax+1] ; the same row, one byte to the right
+   movq mm3, mm2
+   movq mm5, mm4
+   punpcklbw mm2, mm6 ; low halves -> words (mm6 is the zero register)
+   punpcklbw mm4, mm6
+   paddusw mm2, mm4   ; mm2 = Src[x] + Src[x+1], low half
+   punpckhbw mm3, mm6 ; high halves -> words
+   punpckhbw mm5, mm6
+   paddusw mm3, mm5   ; mm3 = Src[x] + Src[x+1], high half
+     ; mix current line (mm2/mm3) with previous (mm0,mm1);
+     ; we'll preserve mm2/mm3 for next line...
+   paddusw mm0, mm2
+   paddusw mm1, mm3
+   movq mm4, [ecx]   ; prepare mix with Dst[0]
+   movq mm5, mm4
+   paddusw mm0, mm7  ; finish mixing current line
+   paddusw mm1, mm7
+   punpcklbw mm4, mm6 ; Dst low/high halves -> words
+   punpckhbw mm5, mm6
+   psrlw mm0, 2      ; (four-sample sum + rounder) >> 2
+   psrlw mm1, 2
+   paddusw mm0, mm4  ; mix Src(mm0/mm1) with Dst(mm4/mm5)
+   paddusw mm1, mm5
+   paddusw mm0, [rounding1_mmx] ; rounder read from memory because mm5 holds Dst here
+   paddusw mm1, [rounding1_mmx]
+   psrlw mm0, 1      ; (Src + Dst + rounder) >> 1
+   psrlw mm1, 1
+   packuswb mm0, mm1 ; words -> 8 bytes
+   movq [ecx], mm0   ; store the result over Dst; ecx is advanced by the caller
+ %endmacro
+ ALIGN 16
+ interpolate8x8_halfpel_hv_add_mmx:
+   PROLOG rounding2_mmx, 0    ; mm5 is busy. Don't load dst-rounder
+     ; preprocess first line
+   movq mm0, [eax]     ; 8 Src bytes of row 0
+   movq mm2, [eax+1]   ; row 0, one byte to the right
+   movq mm1, mm0
+   movq mm3, mm2
+   punpcklbw mm0, mm6  ; bytes -> words (mm6 = 0 after PROLOG)
+   punpcklbw mm2, mm6
+   punpckhbw mm1, mm6
+   punpckhbw mm3, mm6
+   paddusw mm2, mm0    ; mm2/mm3 = Src[x] + Src[x+1] of row 0, as words
+   paddusw mm3, mm1
+    ; Input: mm2/mm3 contains the value (Src[0]+Src[1]) of previous line
+   ADD_HH_MMX          ; output row 0 (the macro advances Src itself)
+   lea ecx,[ecx+edx]   ; advance Dst to the next row
+   ADD_HH_MMX
+   lea ecx,[ecx+edx]
+   ADD_HH_MMX
+   lea ecx,[ecx+edx]
+   ADD_HH_MMX
+   lea ecx,[ecx+edx]
+   ADD_HH_MMX
+   lea ecx,[ecx+edx]
+   ADD_HH_MMX
+   lea ecx,[ecx+edx]
+   ADD_HH_MMX
+   lea ecx,[ecx+edx]
+   ADD_HH_MMX          ; output row 7: no Dst advance after the last row
+   ret
+ ENDFUNC
+ %ifidn __OUTPUT_FORMAT__,elf
+ section ".note.GNU-stack" noalloc noexec nowrite progbits ; ELF only: empty marker section read by GNU linkers to keep the stack non-executable
+ %endif

 Legend:



Removed from v.1519
 


changed lines


 
Added in v.1793
 Legend:



Removed from v.1519
 


changed lines


 
Added in v.1793
-Removed from v.1519
+Added in v.1793

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4