--- trunk/xvidcore/src/image/x86_asm/interpolate8x8_3dn.asm 2005/09/09 12:18:10 1631
+++ trunk/xvidcore/src/image/x86_asm/interpolate8x8_3dn.asm 2005/09/13 12:12:15 1632
@@ -67,6 +67,10 @@
 cglobal interpolate8x8_halfpel_v_3dn
 cglobal interpolate8x8_halfpel_hv_3dn
 
+cglobal interpolate8x4_halfpel_h_3dn
+cglobal interpolate8x4_halfpel_v_3dn
+cglobal interpolate8x4_halfpel_hv_3dn
+
 ;-----------------------------------------------------------------------------
 ;
 ; void interpolate8x8_halfpel_h_3dn(uint8_t * const dst,
@@ -356,3 +360,128 @@
   ret
 .endfunc
 
+;-----------------------------------------------------------------------------
+; Horizontal half-pel filter over an 8x4 block (per the name); stack arguments.
+; void interpolate8x4_halfpel_h_3dn(uint8_t * const dst,
+;                                   const uint8_t * const src,
+;                                   const uint32_t stride,
+;                                   const uint32_t rounding);
+; In: ecx = dst, eax = src, edx = stride. Any non-zero 'rounding' takes the RND1 path.
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+interpolate8x4_halfpel_h_3dn:
+  ; eax first carries 'rounding' for the test, then is reloaded with the src pointer
+  mov eax, [esp+16] ; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax, eax     ; ZF = (rounding == 0); the mov's below leave EFLAGS untouched
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  jnz near .rounding1
+
+  COPY_H_3DN_RND0   ; macro defined elsewhere in this file; presumably 2 rows - confirm
+  lea ecx, [ecx+2*edx] ; dst += 2*stride (src is presumably advanced by the macro)
+  COPY_H_3DN_RND0
+  ret
+
+.rounding1:
+  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+  movq mm7, [mmx_one] ; presumably the per-byte 1 mask for the '&1' above - confirm
+  COPY_H_3DN_RND1
+  lea ecx, [ecx+2*edx] ; dst += 2*stride
+  COPY_H_3DN_RND1
+  ret
+.endfunc
+
+
+;-----------------------------------------------------------------------------
+; Vertical half-pel filter over an 8x4 block (per the name); stack arguments.
+; void interpolate8x4_halfpel_v_3dn(uint8_t * const dst,
+;                                   const uint8_t * const src,
+;                                   const uint32_t stride,
+;                                   const uint32_t rounding);
+; In: ecx = dst, eax = src, edx = stride. Any non-zero 'rounding' takes the RND1 path.
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+interpolate8x4_halfpel_v_3dn:
+  ; eax first carries 'rounding' for the test, then is reloaded with the src pointer
+  mov eax, [esp+16] ; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax,eax      ; ZF = (rounding == 0); the mov's below leave EFLAGS untouched
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  ; we process 2 lines at a time
+
+  jnz near .rounding1
+
+  COPY_V_3DN_RND0   ; macro defined elsewhere in this file
+  lea ecx, [ecx+2*edx] ; dst += 2*stride (src is presumably advanced by the macro)
+  COPY_V_3DN_RND0
+  ret
+
+.rounding1:
+  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+  movq mm7, [mmx_one] ; presumably the per-byte 1 mask for the '&1' above - confirm
+  movq mm2, [eax]   ; loop invariant: mm2 = the 8 bytes at src (first source row)
+  add eax, edx      ; src += stride
+
+  COPY_V_3DN_RND1
+  lea ecx, [ecx+2*edx] ; dst += 2*stride
+  COPY_V_3DN_RND1
+  ret
+.endfunc
+
+
+;-----------------------------------------------------------------------------
+; Combined h+v half-pel filter over an 8x4 block (per the name); stack arguments.
+; void interpolate8x4_halfpel_hv_3dn(uint8_t * const dst,
+;                                    const uint8_t * const src,
+;                                    const uint32_t stride,
+;                                    const uint32_t rounding);
+; In: ecx = dst, eax = src, edx = stride, mm7 = [mmx_one]; mm2/mm3 are pre-loaded
+; from the first source row. Any non-zero 'rounding' takes the RND1 path.
+;-----------------------------------------------------------------------------
+
+; The trick is to correct the result of 'pavgusb' with some combination of the
+; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgusb' (s and t).
+; The boolean relations are:
+;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
+;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
+;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
+;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
+; with  s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
+
+ALIGN 16
+interpolate8x4_halfpel_hv_3dn:
+  mov eax, [esp+16] ; rounding
+  mov ecx, [esp+ 4] ; Dst
+  test eax, eax     ; ZF = (rounding == 0); consumed by the jnz after the setup below
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; stride
+
+  movq mm7, [mmx_one] ; constant defined elsewhere; presumably the lsb mask - confirm
+
+  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
+  movq mm2, [eax]   ; i = src[0..7]
+  movq mm3, [eax+1] ; j = src[1..8]
+  movq mm6, mm2     ; keep a copy of i for the xor
+  pavgusb mm2, mm3  ; mm2 = (i+j+1)/2
+  pxor mm3, mm6     ; mm2/mm3 ready
+
+  jnz near .rounding1 ; flags still from 'test': mov/MMX/3DNow! ops do not write EFLAGS
+
+  COPY_HV_3DN_RND0  ; macro defined elsewhere in this file
+  add ecx, edx      ; dst += stride (macro presumably steps dst once itself - confirm)
+  COPY_HV_3DN_RND0
+  ret
+
+.rounding1:
+  COPY_HV_3DN_RND1
+  add ecx, edx      ; dst += stride
+  COPY_HV_3DN_RND1
+  ret
+.endfunc
+