--- trunk/xvidcore/src/image/x86_asm/colorspace_yuv_mmx.asm 2004/08/29 10:02:38 1540 +++ trunk/xvidcore/src/image/x86_asm/colorspace_yuv_mmx.asm 2008/12/01 15:22:37 1836 @@ -3,7 +3,7 @@ ; * XVID MPEG-4 VIDEO CODEC ; * - MMX and XMM YV12->YV12 conversion - ; * -; * Copyright(C) 2001 Michael Militzer +; * Copyright(C) 2001-2008 Michael Militzer ; * ; * This program is free software; you can redistribute it and/or modify it ; * under the terms of the GNU General Public License as published by @@ -19,139 +19,215 @@ ; * along with this program; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; * $Id: colorspace_yuv_mmx.asm,v 1.5 2004-08-29 10:02:38 edgomez Exp $ +; * $Id: colorspace_yuv_mmx.asm,v 1.12 2008-12-01 15:22:37 Isibaar Exp $ ; * ; ***************************************************************************/ -BITS 32 - -%macro cglobal 1 - %ifdef PREFIX - %ifdef MARK_FUNCS - global _%1:function %1.endfunc-%1 - %define %1 _%1:function %1.endfunc-%1 - %else - global _%1 - %define %1 _%1 - %endif - %else - %ifdef MARK_FUNCS - global %1:function %1.endfunc-%1 - %else - global %1 - %endif - %endif -%endmacro +%include "nasm.inc" ;============================================================================= ; Helper macros ;============================================================================= +%macro _MOVQ 3 +%if %1 == 1 + movntq %2, %3 ; xmm +%else + movq %2, %3 ; plain mmx +%endif +%endmacro + ;------------------------------------------------------------------------------ -; PLANE_COPY ( DST, DST_DIF, SRC, SRC_DIF, WIDTH, HEIGHT, OPT ) +; PLANE_COPY ( DST, DST_STRIDE, SRC, SRC_STRIDE, WIDTH, HEIGHT, OPT ) ; DST dst buffer -; DST_DIF dst stride difference (e.g. stride - width) +; DST_STRIDE dst stride ; SRC src destination buffer -; SRC_DIF src stride difference (e.g. 
stride - width) +; SRC_STRIDE src stride ; WIDTH width ; HEIGHT height ; OPT 0=plain mmx, 1=xmm +; +; +; Trashes: DST, SRC, WIDTH, HEIGHT, _EBX, _ECX, _EDX ;------------------------------------------------------------------------------ %macro PLANE_COPY 7 -%define DST %1 -%define DST_DIF %2 -%define SRC %3 -%define SRC_DIF %4 -%define WIDTH %5 -%define HEIGHT %6 -%define OPT %7 - - mov eax, WIDTH - mov ebp, HEIGHT ; $ebp$ = height - mov esi, SRC - mov edi, DST - - mov ebx, eax - shr eax, 6 ; $eax$ = width / 64 - and ebx, 63 ; remainder = width % 64 - mov edx, ebx - shr ebx, 4 ; $ebx$ = remainder / 16 - and edx, 15 ; $edx$ = remainder % 16 - -%%loop64_start - or eax, eax - jz %%loop16_start - mov ecx, eax ; width64 -%%loop64: +%define DST %1 +%define DST_STRIDE %2 +%define SRC %3 +%define SRC_STRIDE %4 +%define WIDTH %5 +%define HEIGHT %6 +%define OPT %7 + + mov _EBX, WIDTH + shr WIDTH, 6 ; $_EAX$ = width / 64 + and _EBX, 63 ; remainder = width % 64 + mov _EDX, _EBX + shr _EBX, 4 ; $_EBX$ = remainder / 16 + and _EDX, 15 ; $_EDX$ = remainder % 16 + +%%loop64_start_pc: + push DST + push SRC + + mov _ECX, WIDTH ; width64 + test WIDTH, WIDTH + jz %%loop16_start_pc + +%%loop64_pc: %if OPT == 1 ; xmm - prefetchnta [esi + 64] ; non temporal prefetch - prefetchnta [esi + 96] -%endif - movq mm1, [esi] ; read from src - movq mm2, [esi + 8] - movq mm3, [esi + 16] - movq mm4, [esi + 24] - movq mm5, [esi + 32] - movq mm6, [esi + 40] - movq mm7, [esi + 48] - movq mm0, [esi + 56] - -%if OPT == 0 ; plain mmx - movq [edi], mm1 ; write to y_out - movq [edi + 8], mm2 - movq [edi + 16], mm3 - movq [edi + 24], mm4 - movq [edi + 32], mm5 - movq [edi + 40], mm6 - movq [edi + 48], mm7 - movq [edi + 56], mm0 -%else - movntq [edi], mm1 ; write to y_out - movntq [edi + 8], mm2 - movntq [edi + 16], mm3 - movntq [edi + 24], mm4 - movntq [edi + 32], mm5 - movntq [edi + 40], mm6 - movntq [edi + 48], mm7 - movntq [edi + 56], mm0 + prefetchnta [SRC + 64] ; non temporal prefetch + prefetchnta [SRC + 96] %endif + movq mm1, [SRC ] ; read from src + movq mm2, [SRC + 8] + movq mm3, [SRC + 16] + movq mm4, [SRC + 24] + movq mm5, [SRC + 32] + movq mm6, [SRC + 40] + movq mm7, [SRC + 48] + movq mm0, [SRC + 56] + + _MOVQ OPT, [DST ], mm1 ; write to y_out + _MOVQ OPT, [DST + 8], mm2 + _MOVQ OPT, [DST + 16], mm3 + _MOVQ OPT, [DST + 24], mm4 + _MOVQ OPT, [DST + 32], mm5 + _MOVQ OPT, [DST + 40], mm6 + _MOVQ OPT, [DST + 48], mm7 + _MOVQ OPT, [DST + 56], mm0 + + add SRC, 64 + add DST, 64 + loop %%loop64_pc + +%%loop16_start_pc: + mov _ECX, _EBX ; width16 + test _EBX, _EBX + jz %%loop1_start_pc + +%%loop16_pc: + movq mm1, [SRC] + movq mm2, [SRC + 8] + + _MOVQ OPT, [DST], mm1 + _MOVQ OPT, [DST + 8], mm2 + + add SRC, 16 + add DST, 16 + loop %%loop16_pc + +%%loop1_start_pc: + mov _ECX, _EDX + rep movsb - add esi, 64 - add edi, 64 - dec ecx - jnz %%loop64 - - -%%loop16_start - or ebx, ebx - jz %%loop1_start - mov ecx, ebx ; width16 -%%loop16: - movq mm1, [esi] - movq mm2, [esi + 8] -%if OPT == 0 ; plain mmx - movq [edi], mm1 - movq [edi + 8], mm2 + pop SRC + pop DST + +%ifdef ARCH_IS_X86_64 + movsx _ECX, SRC_STRIDE + add SRC, _ECX + mov ecx, DST_STRIDE + add DST, _ECX %else - movntq [edi], mm1 - movntq [edi + 8], mm2 + add SRC, SRC_STRIDE + add DST, DST_STRIDE %endif - add esi, 16 - add edi, 16 - dec ecx - jnz %%loop16 + dec HEIGHT + jg near %%loop64_start_pc +%undef DST +%undef DST_STRIDE +%undef SRC +%undef SRC_STRIDE +%undef WIDTH +%undef HEIGHT +%undef OPT +%endmacro -%%loop1_start - mov ecx, edx - rep movsb 
+;------------------------------------------------------------------------------ +; PLANE_FILL ( DST, DST_STRIDE, WIDTH, HEIGHT, OPT ) +; DST dst buffer +; DST_STRIDE dst stride +; WIDTH width +; HEIGHT height +; OPT 0=plain mmx, 1=xmm +; +; Trashes: DST, WIDTH, HEIGHT, _EBX, _ECX, _EDX, _EAX +;------------------------------------------------------------------------------ - add esi, SRC_DIF - add edi, DST_DIF - dec ebp - jnz near %%loop64_start +%macro PLANE_FILL 5 +%define DST %1 +%define DST_STRIDE %2 +%define WIDTH %3 +%define HEIGHT %4 +%define OPT %5 + + mov _EAX, 0x80808080 + mov _EBX, WIDTH + shr WIDTH, 6 ; $_ESI$ = width / 64 + and _EBX, 63 ; _EBX = remainder = width % 64 + movd mm0, eax + mov _EDX, _EBX + shr _EBX, 4 ; $_EBX$ = remainder / 16 + and _EDX, 15 ; $_EDX$ = remainder % 16 + punpckldq mm0, mm0 + +%%loop64_start_pf: + push DST + mov _ECX, WIDTH ; width64 + test WIDTH, WIDTH + jz %%loop16_start_pf + +%%loop64_pf: + + _MOVQ OPT, [DST ], mm0 ; write to y_out + _MOVQ OPT, [DST + 8], mm0 + _MOVQ OPT, [DST + 16], mm0 + _MOVQ OPT, [DST + 24], mm0 + _MOVQ OPT, [DST + 32], mm0 + _MOVQ OPT, [DST + 40], mm0 + _MOVQ OPT, [DST + 48], mm0 + _MOVQ OPT, [DST + 56], mm0 + + add DST, 64 + loop %%loop64_pf + +%%loop16_start_pf: + mov _ECX, _EBX ; width16 + test _EBX, _EBX + jz %%loop1_start_pf + +%%loop16_pf: + _MOVQ OPT, [DST ], mm0 + _MOVQ OPT, [DST + 8], mm0 + + add DST, 16 + loop %%loop16_pf + +%%loop1_start_pf: + mov _ECX, _EDX + rep stosb + + pop DST + +%ifdef ARCH_IS_X86_64 + mov ecx, DST_STRIDE + add DST, _ECX +%else + add DST, DST_STRIDE +%endif + + dec HEIGHT + jg near %%loop64_start_pf + +%undef DST +%undef DST_STRIDE +%undef WIDTH +%undef HEIGHT +%undef OPT %endmacro ;------------------------------------------------------------------------------ @@ -167,115 +243,215 @@ ;------------------------------------------------------------------------------ %macro MAKE_YV12_TO_YV12 2 %define NAME %1 -%define OPT %2 -ALIGN 16 +%define XMM_OPT %2 +ALIGN SECTION_ALIGN cglobal NAME NAME: -%define pushsize 16 -%define localsize 24 -%define vflip esp + localsize + pushsize + 52 -%define height esp + localsize + pushsize + 48 -%define width esp + localsize + pushsize + 44 -%define uv_src_stride esp + localsize + pushsize + 40 -%define y_src_stride esp + localsize + pushsize + 36 -%define v_src esp + localsize + pushsize + 32 -%define u_src esp + localsize + pushsize + 28 -%define y_src esp + localsize + pushsize + 24 -%define uv_dst_stride esp + localsize + pushsize + 20 -%define y_dst_stride esp + localsize + pushsize + 16 -%define v_dst esp + localsize + pushsize + 12 -%define u_dst esp + localsize + pushsize + 8 -%define y_dst esp + localsize + pushsize + 4 -%define _ip esp + localsize + pushsize + 0 - - push ebx ; esp + localsize + 16 - push esi ; esp + localsize + 8 - push edi ; esp + localsize + 4 - push ebp ; esp + localsize + 0 - -%define width2 esp + localsize - 4 -%define height2 esp + localsize - 8 -%define y_src_dif esp + localsize - 12 -%define y_dst_dif esp + localsize - 16 -%define uv_src_dif esp + localsize - 20 -%define uv_dst_dif esp + localsize - 24 + push _EBX ; _ESP + localsize + 3*PTR_SIZE + +%define localsize 2*4 + +%ifdef ARCH_IS_X86_64 + +%ifndef WINDOWS +%define pushsize 2*PTR_SIZE +%define shadow 0 +%else +%define pushsize 4*PTR_SIZE +%define shadow 32 + 2*PTR_SIZE +%endif + +%define prm_vflip dword [_ESP + localsize + pushsize + shadow + 7*PTR_SIZE] +%define prm_height dword [_ESP + localsize + pushsize + shadow + 6*PTR_SIZE] +%define prm_width dword [_ESP + localsize 
+ pushsize + shadow + 5*PTR_SIZE] +%define prm_uv_src_stride dword [_ESP + localsize + pushsize + shadow + 4*PTR_SIZE] +%define prm_y_src_stride dword [_ESP + localsize + pushsize + shadow + 3*PTR_SIZE] +%define prm_v_src [_ESP + localsize + pushsize + shadow + 2*PTR_SIZE] +%define prm_u_src [_ESP + localsize + pushsize + shadow + 1*PTR_SIZE] + +%ifdef WINDOWS + push _ESI ; _ESP + localsize + 2*PTR_SIZE + push _EDI ; _ESP + localsize + 1*PTR_SIZE + push _EBP ; _ESP + localsize + 0*PTR_SIZE + + sub _ESP, localsize + +%define prm_y_src _ESI +%define prm_uv_dst_stride TMP0d +%define prm_y_dst_stride prm4d +%define prm_v_dst prm3 +%define prm_u_dst TMP1 +%define prm_y_dst _EDI + + mov _EDI, prm1 + mov TMP1, prm2 + + mov _ESI, [_ESP + localsize + pushsize + shadow + 0*PTR_SIZE] + mov TMP0d, dword [_ESP + localsize + pushsize + shadow - 1*PTR_SIZE] + +%else + push _EBP ; _ESP + localsize + 0*PTR_SIZE + + sub _ESP, localsize + +%define prm_y_src _ESI +%define prm_uv_dst_stride prm5d +%define prm_y_dst_stride TMP1d +%define prm_v_dst prm6 +%define prm_u_dst TMP0 +%define prm_y_dst _EDI + + mov TMP0, prm2 + mov _ESI, prm6 + + mov prm6, prm3 + mov TMP1d, prm4d +%endif + +%define _ip _ESP + localsize + pushsize + 0 + +%else + +%define pushsize 4*PTR_SIZE + +%define prm_vflip [_ESP + localsize + pushsize + 13*PTR_SIZE] +%define prm_height [_ESP + localsize + pushsize + 12*PTR_SIZE] +%define prm_width [_ESP + localsize + pushsize + 11*PTR_SIZE] +%define prm_uv_src_stride [_ESP + localsize + pushsize + 10*PTR_SIZE] +%define prm_y_src_stride [_ESP + localsize + pushsize + 9*PTR_SIZE] +%define prm_v_src [_ESP + localsize + pushsize + 8*PTR_SIZE] +%define prm_u_src [_ESP + localsize + pushsize + 7*PTR_SIZE] + +%define prm_y_src _ESI +%define prm_uv_dst_stride [_ESP + localsize + pushsize + 5*PTR_SIZE] +%define prm_y_dst_stride [_ESP + localsize + pushsize + 4*PTR_SIZE] +%define prm_v_dst [_ESP + localsize + pushsize + 3*PTR_SIZE] +%define prm_u_dst [_ESP + localsize + pushsize + 2*PTR_SIZE] +%define prm_y_dst _EDI + +%define _ip _ESP + localsize + pushsize + 0 + + push _ESI ; _ESP + localsize + 2*PTR_SIZE + push _EDI ; _ESP + localsize + 1*PTR_SIZE + push _EBP ; _ESP + localsize + 0*PTR_SIZE + + sub _ESP, localsize - sub esp, localsize + mov _ESI, [_ESP + localsize + pushsize + 6*PTR_SIZE] + mov _EDI, [_ESP + localsize + pushsize + 1*PTR_SIZE] - mov eax, [width] - mov ebx, [height] +%endif + +%define width2 dword [_ESP + localsize - 1*4] +%define height2 dword [_ESP + localsize - 2*4] + + mov eax, prm_width + mov ebx, prm_height shr eax, 1 ; calculate widht/2, heigh/2 shr ebx, 1 - mov [width2], eax - mov [height2], ebx + mov width2, eax + mov height2, ebx - mov ebp, [vflip] - or ebp, ebp - jz near .dont_flip - -; flipping support - mov eax, [height] - mov esi, [y_src] - mov edx, [y_src_stride] - push edx - mul edx - pop edx - add esi, eax ; y_src += (height-1) * y_src_stride - neg edx - mov [y_src], esi - mov [y_src_stride], edx ; y_src_stride = -y_src_stride - - mov eax, [height2] - mov esi, [u_src] - mov edi, [v_src] - mov edx, [uv_src_stride] - sub eax, 1 ; ebp = height2 - 1 - push edx - mul edx - pop edx - add esi, eax ; u_src += (height2-1) * uv_src_stride - add edi, eax ; v_src += (height2-1) * uv_src_stride - neg edx - mov [u_src], esi - mov [v_src], edi - mov [uv_src_stride], edx ; uv_src_stride = -uv_src_stride - -.dont_flip - - mov eax, [y_src_stride] - mov ebx, [y_dst_stride] - mov ecx, [uv_src_stride] - mov edx, [uv_dst_stride] - sub eax, [width] - sub ebx, [width] - sub ecx, [width2] - sub 
edx, [width2] - mov [y_src_dif], eax ; y_src_dif = y_src_stride - width - mov [y_dst_dif], ebx ; y_dst_dif = y_dst_stride - width - mov [uv_src_dif], ecx ; uv_src_dif = uv_src_stride - width2 - mov [uv_dst_dif], edx ; uv_dst_dif = uv_dst_stride - width2 - - PLANE_COPY [y_dst], [y_dst_dif], [y_src], [y_src_dif], [width], [height], OPT - PLANE_COPY [u_dst], [uv_dst_dif], [u_src], [uv_src_dif], [width2], [height2], OPT - PLANE_COPY [v_dst], [uv_dst_dif], [v_src], [uv_src_dif], [width2], [height2], OPT - - add esp, localsize - pop ebp - pop edi - pop esi - pop ebx + mov eax, prm_vflip + test eax, eax + jz near .go + + ; flipping support + mov eax, prm_height + mov ecx, prm_y_src_stride + sub eax, 1 + imul eax, ecx + add _ESI, _EAX ; y_src += (height-1) * y_src_stride + neg ecx + mov prm_y_src_stride, ecx ; y_src_stride = -y_src_stride + + mov eax, height2 + mov _EDX, prm_u_src + mov _EBP, prm_v_src + mov ecx, prm_uv_src_stride + test _EDX, _EDX + jz .go + test _EBP, _EBP + jz .go + sub eax, 1 ; _EAX = height2 - 1 + imul eax, ecx + add _EDX, _EAX ; u_src += (height2-1) * uv_src_stride + add _EBP, _EAX ; v_src += (height2-1) * uv_src_stride + neg ecx + mov prm_u_src, _EDX + mov prm_v_src, _EBP + mov prm_uv_src_stride, ecx ; uv_src_stride = -uv_src_stride + +.go: + mov eax, prm_width + mov ebp, prm_height + PLANE_COPY _EDI, prm_y_dst_stride, _ESI, prm_y_src_stride, _EAX, _EBP, XMM_OPT + + mov _EAX, prm_u_src + or _EAX, prm_v_src + jz near .UVFill_0x80 + + mov eax, width2 + mov ebp, height2 + mov _ESI, prm_u_src + mov _EDI, prm_u_dst + PLANE_COPY _EDI, prm_uv_dst_stride, _ESI, prm_uv_src_stride, _EAX, _EBP, XMM_OPT + + mov eax, width2 + mov ebp, height2 + mov _ESI, prm_v_src + mov _EDI, prm_v_dst + PLANE_COPY _EDI, prm_uv_dst_stride, _ESI, prm_uv_src_stride, _EAX, _EBP, XMM_OPT + +.Done_UVPlane: + add _ESP, localsize + + pop _EBP +%ifndef ARCH_IS_X86_64 + pop _EDI + pop _ESI +%else +%ifdef WINDOWS + pop _EDI + pop _ESI +%endif +%endif + pop _EBX ret -.endfunc + +.UVFill_0x80: + + mov esi, width2 + mov ebp, height2 + mov _EDI, prm_u_dst + PLANE_FILL _EDI, prm_uv_dst_stride, _ESI, _EBP, XMM_OPT + + mov esi, width2 + mov ebp, height2 + mov _EDI, prm_v_dst + PLANE_FILL _EDI, prm_uv_dst_stride, _ESI, _EBP, XMM_OPT + + jmp near .Done_UVPlane + +ENDFUNC + +%undef NAME +%undef XMM_OPT %endmacro ;============================================================================= ; Code ;============================================================================= -SECTION .text +SECTION .rotext align=SECTION_ALIGN MAKE_YV12_TO_YV12 yv12_to_yv12_mmx, 0 MAKE_YV12_TO_YV12 yv12_to_yv12_xmm, 1 + +%ifidn __OUTPUT_FORMAT__,elf +section ".note.GNU-stack" noalloc noexec nowrite progbits +%endif +
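
Note for readers less familiar with the NASM macros above: the following is a rough, unofficial C sketch of the behaviour the patched yv12_to_yv12_mmx/yv12_to_yv12_xmm routines implement after this revision, namely per-plane copies driven by full strides (PLANE_COPY now takes strides rather than stride-minus-width differences), optional vertical flipping through negative source strides, and the newly added .UVFill_0x80 path that fills both chroma planes with 0x80 when u_src and v_src are NULL. The helper names used here (plane_copy, plane_fill, yv12_to_yv12_ref) are purely illustrative and do not appear in the xvidcore sources.

/*
 * Unofficial C sketch of the semantics of the patched yv12_to_yv12_* code:
 * strided plane copy, optional vertical flip, and 0x80 chroma fill when
 * both chroma source pointers are NULL. Names are illustrative only.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void plane_copy(uint8_t *dst, int dst_stride,
                       const uint8_t *src, int src_stride,
                       int width, int height)
{
    int y;
    for (y = 0; y < height; y++) {
        memcpy(dst, src, (size_t)width);  /* asm: 64-byte, 16-byte and 1-byte loops */
        dst += dst_stride;
        src += src_stride;                /* negative when the image is flipped */
    }
}

static void plane_fill(uint8_t *dst, int dst_stride,
                       int width, int height, uint8_t val)
{
    int y;
    for (y = 0; y < height; y++) {
        memset(dst, val, (size_t)width);  /* asm: PLANE_FILL broadcasts 0x80808080 via mm0 */
        dst += dst_stride;
    }
}

static void yv12_to_yv12_ref(uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
                             int y_dst_stride, int uv_dst_stride,
                             const uint8_t *y_src, const uint8_t *u_src,
                             const uint8_t *v_src,
                             int y_src_stride, int uv_src_stride,
                             int width, int height, int vflip)
{
    int width2  = width  / 2;
    int height2 = height / 2;

    if (vflip) {
        /* start at the last source row and walk upwards via negated strides */
        y_src += (height - 1) * y_src_stride;
        y_src_stride = -y_src_stride;
        if (u_src && v_src) {
            u_src += (height2 - 1) * uv_src_stride;
            v_src += (height2 - 1) * uv_src_stride;
            uv_src_stride = -uv_src_stride;
        }
    }

    plane_copy(y_dst, y_dst_stride, y_src, y_src_stride, width, height);

    if (u_src || v_src) {
        plane_copy(u_dst, uv_dst_stride, u_src, uv_src_stride, width2, height2);
        plane_copy(v_dst, uv_dst_stride, v_src, uv_src_stride, width2, height2);
    } else {
        /* .UVFill_0x80 path: absent chroma becomes neutral grey */
        plane_fill(u_dst, uv_dst_stride, width2, height2, 0x80);
        plane_fill(v_dst, uv_dst_stride, width2, height2, 0x80);
    }
}

The xmm variant differs from the mmx one only in using prefetchnta and non-temporal movntq stores inside the copy and fill loops (OPT/XMM_OPT = 1 selects movntq through the _MOVQ helper); this scalar sketch does not model that distinction.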