--- trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm	2004/08/22 11:46:10	1535
+++ trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm	2008/08/19 09:06:48	1790
@@ -27,15 +27,15 @@
 %macro cglobal 1
 	%ifdef PREFIX
 		%ifdef MARK_FUNCS
-			global _%1:function
-			%define %1 _%1:function
+			global _%1:function %1.endfunc-%1
+			%define %1 _%1:function %1.endfunc-%1
 		%else
 			global _%1
 			%define %1 _%1
 		%endif
 	%else
 		%ifdef MARK_FUNCS
-			global %1:function
+			global %1:function %1.endfunc-%1
 		%else
 			global %1
 		%endif
@@ -62,6 +62,10 @@
 cglobal interpolate8x8_halfpel_v_xmm
 cglobal interpolate8x8_halfpel_hv_xmm
 
+cglobal interpolate8x4_halfpel_h_xmm
+cglobal interpolate8x4_halfpel_v_xmm
+cglobal interpolate8x4_halfpel_hv_xmm
+
 cglobal interpolate8x8_halfpel_add_xmm
 cglobal interpolate8x8_halfpel_h_add_xmm
 cglobal interpolate8x8_halfpel_v_add_xmm
@@ -137,6 +141,7 @@
   lea ecx,[ecx+2*edx]
   COPY_H_SSE_RND1
   ret
+.endfunc
 
 ;===========================================================================
 ;
@@ -211,6 +216,7 @@
   lea ecx,[ecx+2*edx]
   COPY_V_SSE_RND1
   ret
+.endfunc
 
 ;===========================================================================
 ;
@@ -349,6 +355,131 @@
   add ecx, edx
   COPY_HV_SSE_RND1
   ret
+.endfunc
+
+;===========================================================================
+; 8x4 variant: two COPY_H_SSE_RND* invocations, dst advanced by 2*stride between them.
+; void interpolate8x4_halfpel_h_xmm(uint8_t * const dst,
+;						const uint8_t * const src,
+;						const uint32_t stride,
+;						const uint32_t rounding);
+; Args are read from the stack ([esp+4]..[esp+16]); rounding != 0 takes .rounding1.
+;===========================================================================
+
+ALIGN 16
+interpolate8x4_halfpel_h_xmm:
+
+  mov eax, [esp+16]     ; rounding
+  mov ecx, [esp+ 4]     ; Dst
+  test eax,eax          ; ZF = (rounding == 0); consumed by the jnz below
+  mov eax, [esp+ 8]     ; Src (eax is reused; mov leaves the flags intact)
+  mov edx, [esp+12]     ; stride
+
+  jnz near .rounding1   ; rounding != 0 -> RND1 path
+
+  COPY_H_SSE_RND0
+  lea ecx,[ecx+2*edx]   ; dst += 2*stride (macro body not visible here; presumably it handles 2 rows and steps src itself)
+  COPY_H_SSE_RND0
+  ret
+
+.rounding1
+ ; we use: (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
+  movq mm7, [mmx_one]   ; mm7 = mmx_one constant (defined outside this hunk); presumably the &1 mask used by the RND1 macro
+  COPY_H_SSE_RND1
+  lea ecx, [ecx+2*edx]  ; dst += 2*stride
+  COPY_H_SSE_RND1
+  ret
+.endfunc                ; end label for the symbol size (NAME.endfunc-NAME) that cglobal emits under MARK_FUNCS
+
+;===========================================================================
+; 8x4 variant: two COPY_V_SSE_RND* invocations, dst advanced by 2*stride between them.
+; void interpolate8x4_halfpel_v_xmm(uint8_t * const dst,
+;                       const uint8_t * const src,
+;                       const uint32_t stride,
+;                       const uint32_t rounding);
+; Args are read from the stack ([esp+4]..[esp+16]); rounding != 0 takes .rounding1.
+;===========================================================================
+
+ALIGN 16
+interpolate8x4_halfpel_v_xmm:
+
+  mov eax, [esp+16]     ; rounding
+  mov ecx, [esp+ 4]     ; Dst
+  test eax,eax          ; ZF = (rounding == 0); consumed by the jnz below
+  mov eax, [esp+ 8]     ; Src (eax is reused; mov leaves the flags intact)
+  mov edx, [esp+12]     ; stride
+
+  ; we process 2 lines at a time
+  jnz near .rounding1   ; rounding != 0 -> RND1 path
+
+  COPY_V_SSE_RND0
+  lea ecx, [ecx+2*edx]  ; dst += 2*stride
+  COPY_V_SSE_RND0
+  ret
+
+.rounding1
+ ; we use: (i+j)/2 = (i+j+1)/2 - ((i^j)&1)
+  movq mm7, [mmx_one]   ; mm7 = mmx_one constant (defined outside this hunk); presumably the &1 mask used by the RND1 macro
+  movq mm2, [eax]       ; loop invariant: mm2 = 8 bytes of the first source row
+  add eax, edx          ; src += stride
+
+  COPY_V_SSE_RND1
+  lea ecx,[ecx+2*edx]   ; dst += 2*stride
+  COPY_V_SSE_RND1
+  ret
+.endfunc                ; end label for the symbol size (NAME.endfunc-NAME) that cglobal emits under MARK_FUNCS
+
+;===========================================================================
+; 8x4 variant: two COPY_HV_SSE_RND* invocations, dst advanced by stride between them.
+; void interpolate8x4_halfpel_hv_xmm(uint8_t * const dst,
+;                       const uint8_t * const src,
+;                       const uint32_t stride,
+;                       const uint32_t rounding);
+;
+; Args are read from the stack ([esp+4]..[esp+16]); rounding != 0 takes .rounding1.
+;===========================================================================
+
+; The trick is to correct the result of 'pavgb' with some combination of the
+; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
+; The boolean relations are:
+;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
+;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
+;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
+;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
+; with  s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
+
+; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
+
+ALIGN 16
+interpolate8x4_halfpel_hv_xmm:
+  mov eax, [esp+16]  ; rounding
+  mov ecx, [esp+ 4]  ; Dst
+  test eax, eax      ; ZF = (rounding == 0); consumed by the jnz further down
+  mov eax, [esp+ 8]  ; Src (eax is reused; mov leaves the flags intact)
+  mov edx, [esp+12]  ; stride
+
+  movq mm7, [mmx_one] ; mm7 = mmx_one constant (defined outside this hunk); presumably the lsb mask for the correction terms
+
+    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j  (i = row at src, j = row at src+1)
+  movq mm2, [eax]     ; mm2 = i (8 bytes at src)
+  movq mm3, [eax+1]   ; mm3 = j (8 bytes at src+1)
+  movq mm6, mm2       ; mm6 = copy of i, needed after pavgb overwrites mm2
+  pavgb mm2, mm3      ; mm2 = (i+j+1)/2 per byte
+  pxor mm3, mm6       ; mm3 = i^j; mm2/mm3 ready
+
+  jnz near .rounding1 ; still the flags from 'test eax, eax': the mov/MMX ops in between do not write EFLAGS
+
+  COPY_HV_SSE_RND0
+  add ecx, edx        ; dst += stride
+  COPY_HV_SSE_RND0
+  ret
+
+.rounding1
+  COPY_HV_SSE_RND1
+  add ecx, edx        ; dst += stride
+  COPY_HV_SSE_RND1
+  ret
+.endfunc              ; end label for the symbol size (NAME.endfunc-NAME) that cglobal emits under MARK_FUNCS
 
 ;===========================================================================
 ;
@@ -416,6 +547,7 @@
   lea ecx,[ecx+2*edx]
   ADD_FF 0, edx
   EPILOG
+.endfunc
 
 ;===========================================================================
 ;
@@ -491,6 +623,7 @@
   lea ecx,[ecx+2*edx]
   ADD_FH_RND1 0, edx
   EPILOG
+.endfunc
 
 
 ;===========================================================================
@@ -566,6 +699,7 @@
   lea ecx,[ecx+2*edx]
   ADD_8_HF_RND1 
   EPILOG
+.endfunc
 
 ; The trick is to correct the result of 'pavgb' with some combination of the
 ; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
@@ -705,3 +839,10 @@
   ADD_HH_RND1
 
   EPILOG
+.endfunc
+
+
+%ifidn __OUTPUT_FORMAT__,elf
+section ".note.GNU-stack" noalloc noexec nowrite progbits   ; empty non-allocated, non-executable note section; GNU toolchains conventionally read it as "no executable stack required"
+%endif   ; NOTE(review): %ifidn matches only the exact text 'elf'; confirm whether other ELF format names (e.g. elf32) need the same section with the NASM version in use
+