Diff of /branches/release-1_2-branch/xvidcore/src/image/x86_asm/interpolate8x8_3dne.asm

-revision 1382, Mon Mar 22 22:36:25 2004 UTC
+revision 1793, Tue Nov 11 20:46:24 2008 UTC
 Line 28
  %macro cglobal 1 ; cglobal NAME: export NAME as a global symbol and define ENDFUNC to match
          %ifdef PREFIX ; PREFIX defined: the symbol is exported with a leading underscore
+                 %ifdef MARK_FUNCS ; MARK_FUNCS defined: the export also carries ":function <size-expr>"
+                         global _%1:function %1.endfunc-%1 ; size-expr is (NAME.endfunc - NAME)
+                         %define %1 _%1:function %1.endfunc-%1
+                         %define ENDFUNC .endfunc
+                 %else ; PREFIX without MARK_FUNCS: plain underscore-prefixed export
                  global _%1
                  %define %1 _%1
+                         %define ENDFUNC
+                 %endif ; MARK_FUNCS (PREFIX branch)
+         %else ; no PREFIX: the symbol is exported under its own name
+                 %ifdef MARK_FUNCS ; same ":function <size-expr>" annotation, unprefixed; ENDFUNC becomes .endfunc
+                         global %1:function %1.endfunc-%1
+                         %define ENDFUNC .endfunc
          %else ; neither PREFIX nor MARK_FUNCS: plain export, ENDFUNC expands to nothing
                  global %1
+                         %define ENDFUNC
+                 %endif ; MARK_FUNCS (no-PREFIX branch)
          %endif ; PREFIX
  %endmacro
-Line 40
+Line 53
  ;=============================================================================
  %ifdef FORMAT_COFF
- SECTION .rodata data
+ SECTION .rodata
  %else
- SECTION .rodata data align=16
+ SECTION .rodata align=16
  %endif
  ALIGN 16
-Line 71
+Line 84
  cglobal interpolate8x8_halfpel_v_3dne
  cglobal interpolate8x8_halfpel_hv_3dne
+ cglobal interpolate8x4_halfpel_h_3dne
+ cglobal interpolate8x4_halfpel_v_3dne
+ cglobal interpolate8x4_halfpel_hv_3dne
  ;-----------------------------------------------------------------------------
  ;
  ; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst,
-Line 84
+Line 101
  %if (%1)
    movq mm0, [eax]
  %else
-   movq mm0, [dword eax]
+   movq mm0, [eax+0]
+   ; ---
+   ; nasm >0.99.x rejects the original statement:
+   ;   movq mm0, [dword eax]
+   ; as it is ambiguous. for this statement nasm <0.99.x would
+   ; generate "movq mm0,[eax+0]"
+   ; ---
  %endif
    pavgb mm0, [eax+1]
    movq mm1, [eax+edx]
-Line 133
+Line 156
    COPY_H_SSE_RND0 1
    ret
- .rounding1
+ .rounding1:
   ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
    mov ecx, [esp+ 4] ; Dst
    movq mm7, [mmx_one]
-Line 145
+Line 168
    lea ecx,[ecx+2*edx]
    COPY_H_SSE_RND1
    ret
+ ENDFUNC
  ;-----------------------------------------------------------------------------
  ;
-Line 204
+Line 228
    ret
  ALIGN 8
- .rounding1
+ .rounding1:
    pcmpeqb mm0, mm0
    psubusb mm0, [eax]
    add eax, edx
-Line 268
+Line 292
    movq [ecx], mm4
    movq [ecx+edx], mm5
    ret
+ ENDFUNC
  ;-----------------------------------------------------------------------------
  ;
-Line 390
+Line 415
    ret
  ALIGN 16
- .rounding1
+ .rounding1:
    COPY_HV_SSE_RND1
    lea ecx,[ecx+2*edx]
    COPY_HV_SSE_RND1
-Line 399
+Line 424
    lea ecx,[ecx+2*edx]
    COPY_HV_SSE_RND1
    ret
+ ENDFUNC
+ ;-----------------------------------------------------------------------------
+ ;
+ ; void interpolate8x4_halfpel_h_3dne(uint8_t * const dst,
+ ;                       const uint8_t * const src,
+ ;                       const uint32_t stride,
+ ;                       const uint32_t rounding);
+ ; Stack args: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding (decremented in place).
+ ;-----------------------------------------------------------------------------
+ ALIGN 16
+ interpolate8x4_halfpel_h_3dne:
+   mov eax, [esp+ 8] ; eax = Src
+   mov edx, [esp+12] ; edx = stride
+   dec dword [esp+16]; rounding - 1, written back to the stack slot; ZF set when rounding was 1
+   jz .rounding1     ; rounding == 1 -> truncating variant below
+   mov ecx, [esp+ 4] ; ecx = Dst
+   COPY_H_SSE_RND0 0
+   lea ecx,[ecx+2*edx] ; move Dst down two rows before the second macro pass
+   COPY_H_SSE_RND0 1
+   ret
+ .rounding1:
+  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+   mov ecx, [esp+ 4] ; ecx = Dst
+   movq mm7, [mmx_one] ; constant defined outside this view; presumably the per-byte 1 used for the (i^j)&1 term
+   COPY_H_SSE_RND1
+   lea ecx, [ecx+2*edx] ; move Dst down two rows before the second macro pass
+   COPY_H_SSE_RND1
+   ret
+ ENDFUNC
+ ;-----------------------------------------------------------------------------
+ ;
+ ; void interpolate8x4_halfpel_v_3dne(uint8_t * const dst,
+ ;                       const uint8_t * const src,
+ ;                       const uint32_t stride,
+ ;                       const uint32_t rounding);
+ ; Reads source rows 0..4 (8 bytes each) and writes 4 rows, each the byte-wise average of two adjacent rows.
+ ;-----------------------------------------------------------------------------
+ ALIGN 16
+ interpolate8x4_halfpel_v_3dne:
+   mov eax, [esp+ 8] ; eax = Src
+   mov edx, [esp+12] ; edx = stride
+   dec dword [esp+16]; rounding - 1, written back to the stack slot; ZF set when rounding was 1
+     ; we process 2 lines at a time
+   jz .rounding1     ; rounding == 1 -> truncating variant below
+   pxor mm2,mm2              ; zero mm2 so the por below acts as a load
+   movq mm0, [eax]           ; mm0 = src row 0
+   movq mm1, [eax+edx]       ; mm1 = src row 1
+   por mm2, [eax+2*edx]      ; mm2 = src row 2. Something like preload (pipelining)
+   mov ecx, [esp+ 4] ; ecx = Dst
+   lea eax, [eax+2*edx]      ; eax -> src row 2
+   pxor mm4, mm4             ; zero mm4 for the por-as-load of row 4
+   pavgb mm0, mm1            ; mm0 = avg(row0, row1), pavgb rounds up
+   pavgb mm1, mm2            ; mm1 = avg(row1, row2)
+   movq [byte ecx], mm0      ; dst row 0 ("byte" forces a disp8 encoding; presumably code-size padding)
+   movq [ecx+edx], mm1       ; dst row 1
+   pxor mm6, mm6             ; NOTE(review): mm6 is not read again on this path
+   add eax, edx              ; eax -> src row 3
+   lea ecx, [ecx+2*edx]      ; ecx -> dst row 2
+   movq mm3, [byte eax]      ; mm3 = src row 3
+   por mm4, [eax+edx]        ; mm4 = src row 4
+   lea eax, [eax+2*edx]      ; eax -> src row 5; NOTE(review): not read before ret
+   pavgb mm2, mm3            ; mm2 = avg(row2, row3)
+   pavgb mm3, mm4            ; mm3 = avg(row3, row4)
+   movq [ecx], mm2           ; dst row 2
+   movq [ecx+edx], mm3       ; dst row 3
+   ret
+ ALIGN 8
+ .rounding1:
+   pcmpeqb mm0, mm0              ; mm0 = all-ones bytes
+   psubusb mm0, [eax]            ; mm0 = 255 - row0 per byte; eax==line0
+   add eax, edx                  ; eax==line1
+   mov ecx, [esp+ 4] ; ecx = Dst (loaded before the push below shifts esp)
+   push esi                      ; esi is used as a scratch pointer; restored before ret
+   pcmpeqb mm1, mm1              ; mm1 = all-ones bytes
+   pcmpeqb mm2, mm2              ; mm2 = all-ones bytes
+   mov esi, mm_minusone          ; address of a constant defined outside this view; presumably all 0xFF bytes
+   psubusb mm1, [byte eax]       ; mm1 = 255 - line1
+   psubusb mm2, [eax+edx]        ; mm2 = 255 - line2
+   lea eax, [eax+2*edx]          ; eax==line3
+   movq mm6, [esi]               ; mm6 = [mm_minusone]
+   movq mm7, [esi]               ; mm7 = [mm_minusone]
+   pavgb mm0, mm1                ; rounded-up average of the complemented rows 0 and 1
+   pavgb mm1, mm2                ; rounded-up average of the complemented rows 1 and 2
+   psubusb mm6, mm0              ; subtract from [mm_minusone]; if that is 0xFF this gives (row0+row1)>>1
+   psubusb mm7, mm1              ; same for rows 1 and 2
+   movq [ecx], mm6               ; store line0
+   movq [ecx+edx], mm7           ; store line1
+   lea ecx, [ecx+2*edx]          ; ecx -> dst line2
+   pcmpeqb mm3, mm3              ; mm3 = all-ones bytes
+   pcmpeqb mm4, mm4              ; mm4 = all-ones bytes
+   psubusb mm3, [eax]            ; mm3 = 255 - line3
+   psubusb mm4, [eax+edx]        ; mm4 = 255 - line4
+   lea eax, [eax+2*edx]          ; eax==line 5; NOTE(review): not read before ret
+   pavgb mm2, mm3                ; rounded-up average of the complemented rows 2 and 3
+   pavgb mm3, mm4                ; rounded-up average of the complemented rows 3 and 4
+   movq mm0, [esi]               ; mm0 = [mm_minusone]
+   movq mm1, [esi]               ; mm1 = [mm_minusone]
+   psubusb mm0, mm2              ; undo the complement for dst line2
+   psubusb mm1, mm3              ; undo the complement for dst line3
+   movq [ecx], mm0               ; store line2
+   movq [ecx+edx], mm1           ; store line3
+   pop esi                       ; restore the caller's esi
+   ret
+ ENDFUNC
+ ;-----------------------------------------------------------------------------
+ ;
+ ; void interpolate8x4_halfpel_hv_3dne(uint8_t * const dst,
+ ;                       const uint8_t * const src,
+ ;                       const uint32_t stride,
+ ;                       const uint32_t rounding);
+ ; Stack args: [esp+4]=dst, [esp+8]=src, [esp+12]=stride, [esp+16]=rounding (decremented in place).
+ ; Primes mm2/mm3/mm6/mm7 from the first source row, then runs COPY_HV_SSE_RND0 or _RND1 twice.
+ ;-----------------------------------------------------------------------------
+ ALIGN 16
+ interpolate8x4_halfpel_hv_3dne:
+   mov eax, [esp+ 8]     ; eax = Src
+   mov edx, [esp+12]     ; edx = stride
+   dec dword [esp+16]    ; rounding - 1, written back to the stack slot; ZF is tested by the jz below
+     ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
+   movq mm2, [eax]       ; mm2 = i: 8 bytes of the first source row
+   movq mm3, [eax+1]     ; mm3 = j: the same row shifted by one byte
+   movq mm6, mm2         ; keep a copy of i for the xor below
+   pavgb mm2, mm3        ; mm2 = (i+j+1)/2
+   pxor mm3, mm6         ; mm3 = i^j; mm2/mm3 ready
+   mov ecx, [esp+ 4]     ; ecx = Dst
+   movq mm7, [mmx_one]   ; constant defined outside this view; presumably per-byte 1
+   jz near .rounding1    ; ZF still comes from the dec above (mov/MMX instructions in between do not write flags)
+   lea ebp,[byte ebp]    ; leaves ebp unchanged; presumably padding for code alignment
+   COPY_HV_SSE_RND0
+   lea ecx,[ecx+2*edx]   ; move Dst down two rows before the second macro pass
+   COPY_HV_SSE_RND0
+   ret
+ ALIGN 16
+ .rounding1:
+   COPY_HV_SSE_RND1
+   lea ecx,[ecx+2*edx]   ; move Dst down two rows before the second macro pass
+   COPY_HV_SSE_RND1
+   ret
+ ENDFUNC
+ %ifidn __OUTPUT_FORMAT__,elf
+ section ".note.GNU-stack" noalloc noexec nowrite progbits ; empty marker section; by GNU toolchain convention it signals that this object does not need an executable stack
+ %endif ; NOTE(review): the test only matches the literal format name "elf" - confirm whether elf32/elf64 builds also need the marker

 Legend:
 -  Removed from v.1382
    changed lines (shown as a `-` line followed by its `+` replacement)
 +  Added in v.1793

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4