--- trunk/xvidcore/src/image/x86_asm/interpolate8x8_3dne.asm	2004/08/29 10:02:38	1540
+++ trunk/xvidcore/src/image/x86_asm/interpolate8x8_3dne.asm	2005/09/13 12:12:15	1632
@@ -80,6 +80,10 @@
 cglobal interpolate8x8_halfpel_v_3dne
 cglobal interpolate8x8_halfpel_hv_3dne
 
+cglobal interpolate8x4_halfpel_h_3dne
+cglobal interpolate8x4_halfpel_v_3dne
+cglobal interpolate8x4_halfpel_hv_3dne
+
 ;-----------------------------------------------------------------------------
 ;
 ; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst,
@@ -412,3 +416,166 @@
   ret
 .endfunc
 
+;-----------------------------------------------------------------------------
+;
+; void interpolate8x4_halfpel_h_3dne(uint8_t * const dst,
+;                       const uint8_t * const src,
+;                       const uint32_t stride,
+;                       const uint32_t rounding);
+;
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+interpolate8x4_halfpel_h_3dne:
+
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+  dec dword [esp+16]; rounding
+
+  jz .rounding1
+  mov ecx, [esp+ 4] ; Dst
+
+  COPY_H_SSE_RND0 0
+  lea ecx,[ecx+2*edx]
+  COPY_H_SSE_RND0 1
+  ret
+
+.rounding1
+ ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+  mov ecx, [esp+ 4] ; Dst
+  movq mm7, [mmx_one]
+  COPY_H_SSE_RND1
+  lea ecx, [ecx+2*edx]
+  COPY_H_SSE_RND1
+  ret
+.endfunc
+
+;-----------------------------------------------------------------------------
+;
+; void interpolate8x4_halfpel_v_3dne(uint8_t * const dst,
+;                       const uint8_t * const src,
+;                       const uint32_t stride,
+;                       const uint32_t rounding);
+;
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+interpolate8x4_halfpel_v_3dne:
+
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+  dec dword [esp+16]; rounding
+
+    ; we process 2 line at a time
+
+  jz .rounding1
+  pxor mm2,mm2
+  movq mm0, [eax]
+  movq mm1, [eax+edx]
+  por mm2, [eax+2*edx]      ; Something like preload (pipelining)
+  mov ecx, [esp+ 4] ; Dst
+  lea eax, [eax+2*edx]
+  pxor mm4, mm4
+  pavgb mm0, mm1
+  pavgb mm1, mm2
+  movq [byte ecx], mm0
+  movq [ecx+edx], mm1
+  
+  pxor mm6, mm6
+  add eax, edx
+  lea ecx, [ecx+2*edx]
+  movq mm3, [byte eax]
+  por mm4, [eax+edx]
+  lea eax, [eax+2*edx]
+  pavgb mm2, mm3
+  pavgb mm3, mm4
+  movq [ecx], mm2
+  movq [ecx+edx], mm3
+  
+  ret
+
+ALIGN 8
+.rounding1
+  pcmpeqb mm0, mm0
+  psubusb mm0, [eax]            ; eax==line0
+  add eax, edx                  ; eax==line1
+  mov ecx, [esp+ 4] ; Dst
+
+  push esi
+
+  pcmpeqb mm1, mm1
+  pcmpeqb mm2, mm2
+  mov esi, mm_minusone
+  psubusb mm1, [byte eax]       ; line1
+  psubusb mm2, [eax+edx]        ; line2
+  lea eax, [eax+2*edx]          ; eax==line3
+  movq mm6, [esi]
+  movq mm7, [esi]
+  pavgb mm0, mm1
+  pavgb mm1, mm2
+  psubusb mm6, mm0
+  psubusb mm7, mm1
+  movq [ecx], mm6               ; store line0
+  movq [ecx+edx], mm7           ; store line1
+  
+  lea ecx, [ecx+2*edx]
+  pcmpeqb mm3, mm3
+  pcmpeqb mm4, mm4
+  psubusb mm3, [eax]            ; line3
+  psubusb mm4, [eax+edx]        ; line4
+  lea eax, [eax+2*edx]          ; eax==line 5
+  pavgb mm2, mm3
+  pavgb mm3, mm4
+  movq mm0, [esi]
+  movq mm1, [esi]
+  psubusb mm0, mm2
+  psubusb mm1, mm3
+  movq [ecx], mm0
+  movq [ecx+edx], mm1
+
+  pop esi
+
+  ret
+
+.endfunc
+
+;-----------------------------------------------------------------------------
+;
+; void interpolate8x4_halfpel_hv_3dne(uint8_t * const dst,
+;                       const uint8_t * const src,
+;                       const uint32_t stride,
+;                       const uint32_t rounding);
+;
+;
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+interpolate8x4_halfpel_hv_3dne:
+  mov eax, [esp+ 8]     ; Src
+  mov edx, [esp+12]     ; stride
+  dec dword [esp+16]    ; rounding
+
+    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
+  movq mm2, [eax]
+  movq mm3, [eax+1]
+  movq mm6, mm2
+  pavgb mm2, mm3
+  pxor mm3, mm6         ; mm2/mm3 ready
+  mov ecx, [esp+ 4]     ; Dst
+  movq mm7, [mmx_one]
+
+  jz near .rounding1
+  lea ebp,[byte ebp]
+  COPY_HV_SSE_RND0
+  lea ecx,[ecx+2*edx]
+  COPY_HV_SSE_RND0
+  ret
+
+ALIGN 16
+.rounding1
+  COPY_HV_SSE_RND1
+  lea ecx,[ecx+2*edx]
+  COPY_HV_SSE_RND1
+  ret
+.endfunc
+