--- trunk/xvidcore/src/image/x86_asm/colorspace_yuv_mmx.asm 2004/08/29 10:02:38 1540 +++ trunk/xvidcore/src/image/x86_asm/colorspace_yuv_mmx.asm 2006/10/30 10:52:00 1742 @@ -19,7 +19,7 @@ ; * along with this program; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; * $Id: colorspace_yuv_mmx.asm,v 1.5 2004-08-29 10:02:38 edgomez Exp $ +; * $Id: colorspace_yuv_mmx.asm,v 1.6 2006-10-30 10:52:00 Skal Exp $ ; * ; ***************************************************************************/ @@ -48,24 +48,24 @@ ;============================================================================= ;------------------------------------------------------------------------------ -; PLANE_COPY ( DST, DST_DIF, SRC, SRC_DIF, WIDTH, HEIGHT, OPT ) +; PLANE_COPY ( DST, DST_STRIDE, SRC, SRC_STRIDE, WIDTH, HEIGHT, OPT ) ; DST dst buffer -; DST_DIF dst stride difference (e.g. stride - width) +; DST_STRIDE dst stride ; SRC src destination buffer -; SRC_DIF src stride difference (e.g. stride - width) +; SRC_STRIDE src stride ; WIDTH width ; HEIGHT height ; OPT 0=plain mmx, 1=xmm ;------------------------------------------------------------------------------ %macro PLANE_COPY 7 -%define DST %1 -%define DST_DIF %2 -%define SRC %3 -%define SRC_DIF %4 +%define DST %1 +%define DST_STRIDE %2 +%define SRC %3 +%define SRC_STRIDE %4 %define WIDTH %5 %define HEIGHT %6 -%define OPT %7 +%define OPT %7 mov eax, WIDTH mov ebp, HEIGHT ; $ebp$ = height @@ -79,17 +79,20 @@ shr ebx, 4 ; $ebx$ = remainder / 16 and edx, 15 ; $edx$ = remainder % 16 -%%loop64_start - or eax, eax - jz %%loop16_start - mov ecx, eax ; width64 -%%loop64: +%%loop64_start_pc: + push edi + push esi + mov ecx, eax ; width64 + test eax, eax + jz %%loop16_start_pc + +%%loop64_pc: %if OPT == 1 ; xmm prefetchnta [esi + 64] ; non temporal prefetch prefetchnta [esi + 96] %endif - movq mm1, [esi] ; read from src - movq mm2, [esi + 8] + movq mm1, [esi ] ; read from src + movq mm2, [esi + 8] movq mm3, [esi + 16] movq mm4, [esi + 24] movq mm5, [esi + 32] @@ -98,8 +101,8 @@ movq mm0, [esi + 56] %if OPT == 0 ; plain mmx - movq [edi], mm1 ; write to y_out - movq [edi + 8], mm2 + movq [edi ], mm1 ; write to y_out + movq [edi + 8], mm2 movq [edi + 16], mm3 movq [edi + 24], mm4 movq [edi + 32], mm5 @@ -107,8 +110,8 @@ movq [edi + 48], mm7 movq [edi + 56], mm0 %else - movntq [edi], mm1 ; write to y_out - movntq [edi + 8], mm2 + movntq [edi ], mm1 ; write to y_out + movntq [edi + 8], mm2 movntq [edi + 16], mm3 movntq [edi + 24], mm4 movntq [edi + 32], mm5 @@ -119,15 +122,15 @@ add esi, 64 add edi, 64 - dec ecx - jnz %%loop64 + loop %%loop64_pc + +%%loop16_start_pc: + mov ecx, ebx ; width16 + test ebx, ebx + jz %%loop1_start_pc -%%loop16_start - or ebx, ebx - jz %%loop1_start - mov ecx, ebx ; width16 -%%loop16: +%%loop16_pc: movq mm1, [esi] movq mm2, [esi + 8] %if OPT == 0 ; plain mmx @@ -140,18 +143,107 @@ add esi, 16 add edi, 16 - dec ecx - jnz %%loop16 + loop %%loop16_pc -%%loop1_start +%%loop1_start_pc: mov ecx, edx rep movsb - add esi, SRC_DIF - add edi, DST_DIF + pop esi + pop edi + add esi, SRC_STRIDE + add edi, DST_STRIDE dec ebp - jnz near %%loop64_start + jg near %%loop64_start_pc +%endmacro + +;------------------------------------------------------------------------------ +; PLANE_FILL ( DST, DST_STRIDE, WIDTH, HEIGHT, OPT ) +; DST dst buffer +; DST_STRIDE dst stride +; WIDTH width +; HEIGHT height +; OPT 0=plain mmx, 1=xmm +;------------------------------------------------------------------------------ + 
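The new PLANE_FILL defined just below follows the same 64/16/1-byte split as PLANE_COPY above, but stores the constant 0x80 instead of reading a source plane. For readability, the per-row behaviour of both macros corresponds roughly to the following C sketch; the names are illustrative (not part of the source), memcpy/memset stand in for the MMX/movntq inner loops, and the strides may be negative after the vflip setup further down.

#include <stdint.h>
#include <string.h>

/* Rough C equivalent of PLANE_COPY: copy a width x height plane row by
 * row.  The asm splits each row into 64-byte and 16-byte MMX chunks plus
 * a "rep movsb" tail, then restores the saved row pointers before adding
 * the full strides; memcpy stands in for all of that here. */
void plane_copy_sketch(uint8_t *dst, int dst_stride,
                       const uint8_t *src, int src_stride,
                       int width, int height)
{
    int y;
    for (y = 0; y < height; y++) {
        memcpy(dst, src, (size_t)width);
        src += src_stride;   /* full stride, no longer "stride - width" */
        dst += dst_stride;
    }
}

/* Rough C equivalent of the new PLANE_FILL: write the neutral chroma
 * value 0x80 over a width x height plane. */
void plane_fill_sketch(uint8_t *dst, int dst_stride, int width, int height)
{
    int y;
    for (y = 0; y < height; y++) {
        memset(dst, 0x80, (size_t)width);
        dst += dst_stride;
    }
}

The push/pop of edi/esi around each row in the asm is what allows the macros to take full strides instead of the old "stride - width" differences: the row pointers are restored before the stride is added, so the caller no longer has to pre-subtract the copied width.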
+%macro PLANE_FILL 5 +%define DST %1 +%define DST_STRIDE %2 +%define WIDTH %3 +%define HEIGHT %4 +%define OPT %5 + + mov esi, WIDTH + mov ebp, HEIGHT ; $ebp$ = height + mov edi, DST + + mov eax, 0x80808080 + mov ebx, esi + shr esi, 6 ; $esi$ = width / 64 + and ebx, 63 ; ebx = remainder = width % 64 + movd mm0, eax + mov edx, ebx + shr ebx, 4 ; $ebx$ = remainder / 16 + and edx, 15 ; $edx$ = remainder % 16 + punpckldq mm0, mm0 + +%%loop64_start_pf: + push edi + mov ecx, esi ; width64 + test esi, esi + jz %%loop16_start_pf + +%%loop64_pf: + +%if OPT == 0 ; plain mmx + movq [edi ], mm0 ; write to y_out + movq [edi + 8], mm0 + movq [edi + 16], mm0 + movq [edi + 24], mm0 + movq [edi + 32], mm0 + movq [edi + 40], mm0 + movq [edi + 48], mm0 + movq [edi + 56], mm0 +%else + movntq [edi ], mm0 ; write to y_out + movntq [edi + 8], mm0 + movntq [edi + 16], mm0 + movntq [edi + 24], mm0 + movntq [edi + 32], mm0 + movntq [edi + 40], mm0 + movntq [edi + 48], mm0 + movntq [edi + 56], mm0 +%endif + + add edi, 64 + loop %%loop64_pf + +%%loop16_start_pf: + mov ecx, ebx ; width16 + test ebx, ebx + jz %%loop1_start_pf + +%%loop16_pf: +%if OPT == 0 ; plain mmx + movq [edi ], mm0 + movq [edi + 8], mm0 +%else + movntq [edi ], mm0 + movntq [edi + 8], mm0 +%endif + + add edi, 16 + loop %%loop16_pf + +%%loop1_start_pf: + mov ecx, edx + rep stosb + + pop edi + add edi, DST_STRIDE + dec ebp + jg near %%loop64_start_pf %endmacro ;------------------------------------------------------------------------------ @@ -167,27 +259,27 @@ ;------------------------------------------------------------------------------ %macro MAKE_YV12_TO_YV12 2 %define NAME %1 -%define OPT %2 +%define OPT %2 ALIGN 16 cglobal NAME NAME: %define pushsize 16 -%define localsize 24 +%define localsize 12 -%define vflip esp + localsize + pushsize + 52 -%define height esp + localsize + pushsize + 48 +%define vflip esp + localsize + pushsize + 52 +%define height esp + localsize + pushsize + 48 %define width esp + localsize + pushsize + 44 %define uv_src_stride esp + localsize + pushsize + 40 %define y_src_stride esp + localsize + pushsize + 36 -%define v_src esp + localsize + pushsize + 32 -%define u_src esp + localsize + pushsize + 28 -%define y_src esp + localsize + pushsize + 24 +%define v_src esp + localsize + pushsize + 32 +%define u_src esp + localsize + pushsize + 28 +%define y_src esp + localsize + pushsize + 24 %define uv_dst_stride esp + localsize + pushsize + 20 %define y_dst_stride esp + localsize + pushsize + 16 -%define v_dst esp + localsize + pushsize + 12 -%define u_dst esp + localsize + pushsize + 8 -%define y_dst esp + localsize + pushsize + 4 -%define _ip esp + localsize + pushsize + 0 +%define v_dst esp + localsize + pushsize + 12 +%define u_dst esp + localsize + pushsize + 8 +%define y_dst esp + localsize + pushsize + 4 +%define _ip esp + localsize + pushsize + 0 push ebx ; esp + localsize + 16 push esi ; esp + localsize + 8 @@ -196,10 +288,6 @@ %define width2 esp + localsize - 4 %define height2 esp + localsize - 8 -%define y_src_dif esp + localsize - 12 -%define y_dst_dif esp + localsize - 16 -%define uv_src_dif esp + localsize - 20 -%define uv_dst_dif esp + localsize - 24 sub esp, localsize @@ -210,63 +298,60 @@ mov [width2], eax mov [height2], ebx - mov ebp, [vflip] - or ebp, ebp - jz near .dont_flip + mov eax, [vflip] + test eax, eax + jz near .go -; flipping support + ; flipping support mov eax, [height] mov esi, [y_src] - mov edx, [y_src_stride] - push edx - mul edx - pop edx + mov ecx, [y_src_stride] + sub eax, 1 + imul eax, ecx 
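+  ; vertical flip setup: eax = (height-1)*y_src_stride, computed with a
+  ; single imul rather than the old push/mul/pop sequence; it is added to
+  ; y_src below and the stride is negated so the Y copy walks the source
+  ; bottom-up.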
add esi, eax ; y_src += (height-1) * y_src_stride - neg edx + neg ecx mov [y_src], esi - mov [y_src_stride], edx ; y_src_stride = -y_src_stride + mov [y_src_stride], ecx ; y_src_stride = -y_src_stride mov eax, [height2] mov esi, [u_src] mov edi, [v_src] - mov edx, [uv_src_stride] - sub eax, 1 ; ebp = height2 - 1 - push edx - mul edx - pop edx + mov ecx, [uv_src_stride] + test esi, esi + jz .go + test edi, edi + jz .go + sub eax, 1 ; eax = height2 - 1 + imul eax, ecx add esi, eax ; u_src += (height2-1) * uv_src_stride add edi, eax ; v_src += (height2-1) * uv_src_stride - neg edx + neg ecx mov [u_src], esi mov [v_src], edi - mov [uv_src_stride], edx ; uv_src_stride = -uv_src_stride + mov [uv_src_stride], ecx ; uv_src_stride = -uv_src_stride -.dont_flip +.go: - mov eax, [y_src_stride] - mov ebx, [y_dst_stride] - mov ecx, [uv_src_stride] - mov edx, [uv_dst_stride] - sub eax, [width] - sub ebx, [width] - sub ecx, [width2] - sub edx, [width2] - mov [y_src_dif], eax ; y_src_dif = y_src_stride - width - mov [y_dst_dif], ebx ; y_dst_dif = y_dst_stride - width - mov [uv_src_dif], ecx ; uv_src_dif = uv_src_stride - width2 - mov [uv_dst_dif], edx ; uv_dst_dif = uv_dst_stride - width2 - - PLANE_COPY [y_dst], [y_dst_dif], [y_src], [y_src_dif], [width], [height], OPT - PLANE_COPY [u_dst], [uv_dst_dif], [u_src], [uv_src_dif], [width2], [height2], OPT - PLANE_COPY [v_dst], [uv_dst_dif], [v_src], [uv_src_dif], [width2], [height2], OPT + PLANE_COPY [y_dst], [y_dst_stride], [y_src], [y_src_stride], [width], [height], OPT + mov eax, [u_src] + or eax, [v_src] + jz near .UVFill_0x80 + PLANE_COPY [u_dst], [uv_dst_stride], [u_src], [uv_src_stride], [width2], [height2], OPT + PLANE_COPY [v_dst], [uv_dst_stride], [v_src], [uv_src_stride], [width2], [height2], OPT + +.Done_UVPlane: add esp, localsize pop ebp pop edi pop esi pop ebx - ret + +.UVFill_0x80: + PLANE_FILL [u_dst], [uv_dst_stride], [width2], [height2], OPT + PLANE_FILL [v_dst], [uv_dst_stride], [width2], [height2], OPT + jmp near .Done_UVPlane .endfunc %endmacro
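Putting the pieces together, MAKE_YV12_TO_YV12 now drives the two helpers as follows: flip the source pointers and negate the strides when vflip is set, copy the Y plane, then either copy the chroma planes or, when both u_src and v_src are NULL, fill them with the neutral value 0x80 (the new .UVFill_0x80 path). A hedged C sketch of that control flow, with illustrative names and reusing the helpers from the sketch above:

#include <stdint.h>

void plane_copy_sketch(uint8_t *dst, int dst_stride,
                       const uint8_t *src, int src_stride,
                       int width, int height);
void plane_fill_sketch(uint8_t *dst, int dst_stride, int width, int height);

/* Rough C equivalent of the flow the MAKE_YV12_TO_YV12 macro generates;
 * the real code is the MMX/XMM instantiation above. */
void yv12_to_yv12_sketch(uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
                         int y_dst_stride, int uv_dst_stride,
                         const uint8_t *y_src, const uint8_t *u_src,
                         const uint8_t *v_src,
                         int y_src_stride, int uv_src_stride,
                         int width, int height, int vflip)
{
    int width2  = width  / 2;
    int height2 = height / 2;

    if (vflip) {
        /* point the sources at their last row and negate the strides so
         * the per-row copy walks them bottom-up */
        y_src += (height - 1) * y_src_stride;
        y_src_stride = -y_src_stride;
        if (u_src && v_src) {
            u_src += (height2 - 1) * uv_src_stride;
            v_src += (height2 - 1) * uv_src_stride;
            uv_src_stride = -uv_src_stride;
        }
    }

    plane_copy_sketch(y_dst, y_dst_stride, y_src, y_src_stride, width, height);

    if (!u_src && !v_src) {
        /* no chroma input: fill U/V with 0x80 (neutral chroma) */
        plane_fill_sketch(u_dst, uv_dst_stride, width2, height2);
        plane_fill_sketch(v_dst, uv_dst_stride, width2, height2);
    } else {
        plane_copy_sketch(u_dst, uv_dst_stride, u_src, uv_src_stride,
                          width2, height2);
        plane_copy_sketch(v_dst, uv_dst_stride, v_src, uv_src_stride,
                          width2, height2);
    }
}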