--- trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm	2004/08/01 08:45:15	1529
+++ trunk/xvidcore/src/image/x86_asm/interpolate8x8_xmm.asm	2004/08/10 21:58:55	1530
@@ -53,6 +53,11 @@
 cglobal interpolate8x8_halfpel_v_xmm
 cglobal interpolate8x8_halfpel_hv_xmm
 
+cglobal interpolate8x8_halfpel_add_xmm
+cglobal interpolate8x8_halfpel_h_add_xmm
+cglobal interpolate8x8_halfpel_v_add_xmm
+cglobal interpolate8x8_halfpel_hv_add_xmm
+
 ;===========================================================================
 ;
 ; void interpolate8x8_halfpel_h_xmm(uint8_t * const dst,
@@ -335,3 +340,359 @@
   add ecx, edx
   COPY_HV_SSE_RND1
   ret
+
+;===========================================================================
+;
+; The next functions combine both the source halfpel interpolation step and
+; the averaging (with rounding) step, to avoid wasting memory bandwidth
+; computing intermediate halfpel images and then averaging them.
+;
+;===========================================================================
+
+%macro PROLOG0 0
+  mov ecx, [esp+ 4] ; Dst
+  mov eax, [esp+ 8] ; Src
+  mov edx, [esp+12] ; BpS
+%endmacro
+%macro PROLOG1 0
+  PROLOG0
+  test dword [esp+16], 1  ; Rounding?
+%endmacro
+%macro EPILOG 0
+  ret
+%endmacro
+
+;===========================================================================
+;
+; void interpolate8x8_halfpel_add_xmm(uint8_t * const dst,
+;                                     const uint8_t * const src,
+;                                     const uint32_t stride,
+;                                     const uint32_t rounding);
+;
+;===========================================================================
+
+%macro ADD_FF 2
+  movq mm0, [eax+%1]
+  movq mm1, [eax+%2]
+;;---
+;;  movq mm2, mm0
+;;  movq mm3, mm1
+;;---
+  pavgb mm0, [ecx+%1]
+  pavgb mm1, [ecx+%2]
+;;--
+;;  por mm2, [ecx+%1]
+;;  por mm3, [ecx+%2]
+;;  pand mm2, [mmx_one]
+;;  pand mm3, [mmx_one]
+;;  psubsb mm0, mm2
+;;  psubsb mm1, mm3
+;;--
+  movq [ecx+%1], mm0
+  movq [ecx+%2], mm1
+%endmacro
+
+ALIGN 16
+interpolate8x8_halfpel_add_xmm:  ; 23c
+  PROLOG1
+  ADD_FF 0, edx
+  lea eax,[eax+2*edx]
+  lea ecx,[ecx+2*edx]
+  ADD_FF 0, edx
+  lea eax,[eax+2*edx]
+  lea ecx,[ecx+2*edx]
+  ADD_FF 0, edx
+  lea eax,[eax+2*edx]
+  lea ecx,[ecx+2*edx]
+  ADD_FF 0, edx
+  EPILOG
+
+;===========================================================================
+;
+; void interpolate8x8_halfpel_h_add_xmm(uint8_t * const dst,
+;                                       const uint8_t * const src,
+;                                       const uint32_t stride,
+;                                       const uint32_t rounding);
+;
+;===========================================================================
+
+%macro ADD_FH_RND0 2
+  movq mm0, [eax+%1]
+  movq mm1, [eax+%2]
+  pavgb mm0, [eax+%1+1]
+  pavgb mm1, [eax+%2+1]
+  pavgb mm0, [ecx+%1]
+  pavgb mm1, [ecx+%2]
+  movq [ecx+%1], mm0
+  movq [ecx+%2], mm1
+%endmacro
+
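+; ADD_FH_RND1 below needs the round-*down* horizontal average, which 'pavgb'
+; (a round-up average) cannot produce directly, so it corrects the pavgb
+; result with the lsb of i^j. A minimal C sketch of that identity, for
+; illustration only (not part of the build; helper names are made up):
+;
+;   #include <stdint.h>
+;
+;   /* what pavgb computes: (i+j+1)/2, i.e. round-up average */
+;   static uint8_t avg_up(uint8_t i, uint8_t j) {
+;     return (uint8_t)((i + j + 1) >> 1);
+;   }
+;
+;   /* round-down average: (i+j)/2 == (i+j+1)/2 - ((i^j)&1) */
+;   static uint8_t avg_down(uint8_t i, uint8_t j) {
+;     return (uint8_t)(avg_up(i, j) - ((i ^ j) & 1));
+;   }
+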
+%macro ADD_FH_RND1 2
+  movq mm0, [eax+%1]
+  movq mm1, [eax+%2]
+  movq mm4, mm0
+  movq mm5, mm1
+  movq mm2, [eax+%1+1]
+  movq mm3, [eax+%2+1]
+  pavgb mm0, mm2
+  pxor mm2, mm4
+  pavgb mm1, mm3
+  pxor mm3, mm5
+  pand mm2, [mmx_one]
+  pand mm3, [mmx_one]
+  psubb mm0, mm2
+  psubb mm1, mm3
+  pavgb mm0, [ecx+%1]
+  pavgb mm1, [ecx+%2]
+  movq [ecx+%1], mm0
+  movq [ecx+%2], mm1
+%endmacro
+
+ALIGN 16
+interpolate8x8_halfpel_h_add_xmm:  ; 32c
+  PROLOG1
+  jnz near .Loop1
+  ADD_FH_RND0 0, edx
+  lea eax,[eax+2*edx]
+  lea ecx,[ecx+2*edx]
+  ADD_FH_RND0 0, edx
+  lea eax,[eax+2*edx]
+  lea ecx,[ecx+2*edx]
+  ADD_FH_RND0 0, edx
+  lea eax,[eax+2*edx]
+  lea ecx,[ecx+2*edx]
+  ADD_FH_RND0 0, edx
+  EPILOG
+
+.Loop1
+  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
+  ; movq mm7, [mmx_one]
+  ADD_FH_RND1 0, edx
+  lea eax,[eax+2*edx]
+  lea ecx,[ecx+2*edx]
+  ADD_FH_RND1 0, edx
+  lea eax,[eax+2*edx]
+  lea ecx,[ecx+2*edx]
+  ADD_FH_RND1 0, edx
+  lea eax,[eax+2*edx]
+  lea ecx,[ecx+2*edx]
+  ADD_FH_RND1 0, edx
+  EPILOG
+
+;===========================================================================
+;
+; void interpolate8x8_halfpel_v_add_xmm(uint8_t * const dst,
+;                                       const uint8_t * const src,
+;                                       const uint32_t stride,
+;                                       const uint32_t rounding);
+;
+;===========================================================================
+
+%macro ADD_8_HF_RND0 0
+  movq mm0, [eax]
+  movq mm1, [eax+edx]
+  pavgb mm0, mm1
+  pavgb mm1, [eax+2*edx]
+  lea eax,[eax+2*edx]
+  pavgb mm0, [ecx]
+  pavgb mm1, [ecx+edx]
+  movq [ecx], mm0
+  movq [ecx+edx], mm1
+%endmacro
+
+%macro ADD_8_HF_RND1 0
+  movq mm1, [eax+edx]
+  movq mm2, [eax+2*edx]
+  lea eax,[eax+2*edx]
+  movq mm4, mm0
+  movq mm5, mm1
+  pavgb mm0, mm1
+  pxor mm4, mm1
+  pavgb mm1, mm2
+  pxor mm5, mm2
+  pand mm4, mm7    ; lsb's of (i^j)...
+  pand mm5, mm7    ; lsb's of (i^j)...
+  psubb mm0, mm4   ; ...are subtracted from the result of pavgb
+  pavgb mm0, [ecx]
+  movq [ecx], mm0
+  psubb mm1, mm5   ; ...are subtracted from the result of pavgb
+  pavgb mm1, [ecx+edx]
+  movq [ecx+edx], mm1
+%endmacro
+
+ALIGN 16
+interpolate8x8_halfpel_v_add_xmm:
+  PROLOG1
+
+  jnz near .Loop1
+  pxor mm7, mm7    ; this is a NOP
+
+  ADD_8_HF_RND0
+  lea ecx,[ecx+2*edx]
+  ADD_8_HF_RND0
+  lea ecx,[ecx+2*edx]
+  ADD_8_HF_RND0
+  lea ecx,[ecx+2*edx]
+  ADD_8_HF_RND0
+  EPILOG
+
+.Loop1
+  movq mm0, [eax]  ; loop invariant
+  movq mm7, [mmx_one]
+
+  ADD_8_HF_RND1
+  movq mm0, mm2
+  lea ecx,[ecx+2*edx]
+  ADD_8_HF_RND1
+  movq mm0, mm2
+  lea ecx,[ecx+2*edx]
+  ADD_8_HF_RND1
+  movq mm0, mm2
+  lea ecx,[ecx+2*edx]
+  ADD_8_HF_RND1
+  EPILOG
+
+; The trick is to correct the result of 'pavgb' with some combination of the
+; lsb's of the 4 input values i, j, k, l and of their intermediate 'pavgb'
+; results (s and t). The boolean relations are:
+;   (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
+;   (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
+;   (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
+;   (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
+; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
+
+; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
+
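+; As a concrete illustration of the rounding==0 relation above (the '+2' line),
+; here is a C sketch, for illustration only (not part of the build; the helper
+; name is made up):
+;
+;   #include <stdint.h>
+;
+;   /* (i+j+k+l+2)/4, built from two round-up averages plus a correction */
+;   static uint8_t avg4_rnd0(uint8_t i, uint8_t j, uint8_t k, uint8_t l) {
+;     int s  = (i + j + 1) >> 1;              /* first  pavgb */
+;     int t  = (k + l + 1) >> 1;              /* second pavgb */
+;     int ij = i ^ j, kl = k ^ l, st = s ^ t;
+;     return (uint8_t)(((s + t + 1) >> 1) - (((ij | kl) & st) & 1));
+;   }
+;
+;   /* the rounding==1 path uses (ij & kl) | st instead, giving (i+j+k+l+1)/4 */
+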
+;===========================================================================
+;
+; void interpolate8x8_halfpel_hv_add_xmm(uint8_t * const dst,
+;                                        const uint8_t * const src,
+;                                        const uint32_t stride,
+;                                        const uint32_t rounding);
+;
+;===========================================================================
+
+%macro ADD_HH_RND0 0
+  lea eax,[eax+edx]
+
+  movq mm0, [eax]
+  movq mm1, [eax+1]
+
+  movq mm6, mm0
+  pavgb mm0, mm1   ; mm0=(j+k+1)/2. preserved for next step
+  lea eax,[eax+edx]
+  pxor mm1, mm6    ; mm1=(j^k). preserved for next step
+
+  por mm3, mm1     ; ij |= jk
+  movq mm6, mm2
+  pxor mm6, mm0    ; mm6 = s^t
+  pand mm3, mm6    ; (ij|jk) &= st
+  pavgb mm2, mm0   ; mm2 = (s+t+1)/2
+  pand mm3, mm7    ; mask lsb
+  psubb mm2, mm3   ; apply.
+
+  pavgb mm2, [ecx]
+  movq [ecx], mm2
+
+  movq mm2, [eax]
+  movq mm3, [eax+1]
+  movq mm6, mm2
+  pavgb mm2, mm3   ; preserved for next iteration
+  lea ecx,[ecx+edx]
+  pxor mm3, mm6    ; preserved for next iteration
+
+  por mm1, mm3
+  movq mm6, mm0
+  pxor mm6, mm2
+  pand mm1, mm6
+  pavgb mm0, mm2
+
+  pand mm1, mm7
+  psubb mm0, mm1
+
+  pavgb mm0, [ecx]
+  movq [ecx], mm0
+%endmacro
+
+%macro ADD_HH_RND1 0
+  lea eax,[eax+edx]
+
+  movq mm0, [eax]
+  movq mm1, [eax+1]
+
+  movq mm6, mm0
+  pavgb mm0, mm1   ; mm0=(j+k+1)/2. preserved for next step
+  lea eax,[eax+edx]
+  pxor mm1, mm6    ; mm1=(j^k). preserved for next step
+
+  pand mm3, mm1
+  movq mm6, mm2
+  pxor mm6, mm0
+  por mm3, mm6
+  pavgb mm2, mm0
+  pand mm3, mm7
+  psubb mm2, mm3
+
+  pavgb mm2, [ecx]
+  movq [ecx], mm2
+
+  movq mm2, [eax]
+  movq mm3, [eax+1]
+  movq mm6, mm2
+  pavgb mm2, mm3   ; preserved for next iteration
+  lea ecx,[ecx+edx]
+  pxor mm3, mm6    ; preserved for next iteration
+
+  pand mm1, mm3
+  movq mm6, mm0
+  pxor mm6, mm2
+  por mm1, mm6
+  pavgb mm0, mm2
+  pand mm1, mm7
+  psubb mm0, mm1
+
+  pavgb mm0, [ecx]
+  movq [ecx], mm0
+%endmacro
+
+ALIGN 16
+interpolate8x8_halfpel_hv_add_xmm:
+  PROLOG1
+
+  movq mm7, [mmx_one]
+
+  ; loop invariants: mm2=(i+j+1)/2 and mm3=i^j
+  movq mm2, [eax]
+  movq mm3, [eax+1]
+  movq mm6, mm2
+  pavgb mm2, mm3
+  pxor mm3, mm6    ; mm2/mm3 ready
+
+  jnz near .Loop1
+
+  ADD_HH_RND0
+  add ecx, edx
+  ADD_HH_RND0
+  add ecx, edx
+  ADD_HH_RND0
+  add ecx, edx
+  ADD_HH_RND0
+  EPILOG
+
+.Loop1
+  ADD_HH_RND1
+  add ecx, edx
+  ADD_HH_RND1
+  add ecx, edx
+  ADD_HH_RND1
+  add ecx, edx
+  ADD_HH_RND1
+
+  EPILOG
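+
+; For reference, a plain-C sketch of the operation the routine above performs,
+; as described by the comments: interpolate the 2D halfpel sample (rounded
+; according to 'rounding'), then average it into the existing prediction.
+; Illustration only, not part of the build; the function name is made up:
+;
+;   #include <stdint.h>
+;
+;   static void halfpel_hv_add_ref(uint8_t *dst, const uint8_t *src,
+;                                  uint32_t stride, uint32_t rounding)
+;   {
+;     uint32_t x, y;
+;     for (y = 0; y < 8; y++)
+;       for (x = 0; x < 8; x++) {
+;         /* 2D halfpel sample: (i+j+k+l+2-rounding)/4 */
+;         int hv = (src[y*stride+x]     + src[y*stride+x+1] +
+;                   src[(y+1)*stride+x] + src[(y+1)*stride+x+1] +
+;                   2 - rounding) >> 2;
+;         /* averaged into dst, always rounding up (pavgb) */
+;         dst[y*stride+x] = (uint8_t)((dst[y*stride+x] + hv + 1) >> 1);
+;       }
+;   }
+;
+; The other three *_add routines follow the same pattern, with horizontal,
+; vertical, or no interpolation respectively before the final average with dst.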