--- trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm	2005/09/09 12:18:10	1631
+++ trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm	2005/09/13 12:12:15	1632
@@ -62,6 +62,10 @@
 cglobal interpolate8x8_halfpel_v_xmm
 cglobal interpolate8x8_halfpel_hv_xmm
 
+cglobal interpolate8x4_halfpel_h_xmm
+cglobal interpolate8x4_halfpel_v_xmm
+cglobal interpolate8x4_halfpel_hv_xmm
+
 cglobal interpolate8x8_halfpel_add_xmm
 cglobal interpolate8x8_halfpel_h_add_xmm
 cglobal interpolate8x8_halfpel_v_add_xmm
@@ -355,6 +359,130 @@
 
 ;===========================================================================
 ;
+; void interpolate8x4_halfpel_h_xmm(uint8_t * const dst,
+;                                   const uint8_t * const src,
+;                                   const uint32_t stride,
+;                                   const uint32_t rounding);
+;
+;===========================================================================
+
+ALIGN 16
+interpolate8x4_halfpel_h_xmm:
+
+  mov eax, [esp+16] ; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax,eax
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  jnz near .rounding1
+
+  COPY_H_SSE_RND0
+  lea ecx,[ecx+2*edx]
+  COPY_H_SSE_RND0
+  ret
+
+.rounding1
+ ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+  movq mm7, [mmx_one]
+  COPY_H_SSE_RND1
+  lea ecx, [ecx+2*edx]
+  COPY_H_SSE_RND1
+  ret
+.endfunc
+
+;===========================================================================
+;
+; void interpolate8x4_halfpel_v_xmm(uint8_t * const dst,
+;                                   const uint8_t * const src,
+;                                   const uint32_t stride,
+;                                   const uint32_t rounding);
+;
+;===========================================================================
+
+ALIGN 16
+interpolate8x4_halfpel_v_xmm:
+
+  mov eax, [esp+16]; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax,eax
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  ; we process 2 lines at a time
+  jnz near .rounding1
+
+  COPY_V_SSE_RND0
+  lea ecx, [ecx+2*edx]
+  COPY_V_SSE_RND0
+  ret
+
+.rounding1
+ ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+  movq mm7, [mmx_one]
+  movq mm2, [eax] ; loop invariant
+  add eax, edx
+
+  COPY_V_SSE_RND1
+  lea ecx,[ecx+2*edx]
+  COPY_V_SSE_RND1
+  ret
+.endfunc
+
+;===========================================================================
+;
+; void interpolate8x4_halfpel_hv_xmm(uint8_t * const dst,
+;                                    const uint8_t * const src,
+;                                    const uint32_t stride,
+;                                    const uint32_t rounding);
+;
+;
+;===========================================================================
+
+; The trick is to correct the result of 'pavgb' with some combination of the
+; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
+; The boolean relations are:
+;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
+;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
+;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
+;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
+; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
+
+; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
+
+ALIGN 16
+interpolate8x4_halfpel_hv_xmm:
+  mov eax, [esp+16] ; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax, eax
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  movq mm7, [mmx_one]
+
+  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
+  movq mm2, [eax]
+  movq mm3, [eax+1]
+  movq mm6, mm2
+  pavgb mm2, mm3
+  pxor mm3, mm6 ; mm2/mm3 ready
+
+  jnz near .rounding1
+
+  COPY_HV_SSE_RND0
+  add ecx, edx
+  COPY_HV_SSE_RND0
+  ret
+
+.rounding1
+  COPY_HV_SSE_RND1
+  add ecx, edx
+  COPY_HV_SSE_RND1
+  ret
+.endfunc
+
+;===========================================================================
+;
 ; The next functions combine both source halfpel interpolation step and the
 ; averaging (with rouding) step to avoid wasting memory bandwidth computing
 ; intermediate halfpel images and then averaging them.