--- trunk/xvidcore/src/image/x86_asm/colorspace_yuv_mmx.asm	2003/02/15 15:22:19	851
+++ trunk/xvidcore/src/image/x86_asm/colorspace_yuv_mmx.asm	2008/11/26 01:04:34	1795
@@ -1,282 +1,479 @@
+;/****************************************************************************
+; *
+; *  XVID MPEG-4 VIDEO CODEC
+; *  - MMX and XMM YV12->YV12 conversion -
+; *
+; *  Copyright(C) 2001-2008 Michael Militzer
+; *
+; *  This program is free software; you can redistribute it and/or modify it
+; *  under the terms of the GNU General Public License as published by
+; *  the Free Software Foundation; either version 2 of the License, or
+; *  (at your option) any later version.
+; *
+; *  This program is distributed in the hope that it will be useful,
+; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; *  GNU General Public License for more details.
+; *
+; *  You should have received a copy of the GNU General Public License
+; *  along with this program; if not, write to the Free Software
+; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+; *
+; *  $Id: colorspace_yuv_mmx.asm,v 1.9 2008-11-26 01:04:34 Isibaar Exp $
+; *
+; ***************************************************************************/
+
+%include "nasm.inc"
+
+;=============================================================================
+; Helper macros
+;=============================================================================
+
 ;------------------------------------------------------------------------------
+; PLANE_COPY ( DST, DST_STRIDE, SRC, SRC_STRIDE, WIDTH, HEIGHT, OPT )
+; DST         dst buffer
+; DST_STRIDE  dst stride
+; SRC         src buffer
+; SRC_STRIDE  src stride
+; WIDTH       width
+; HEIGHT      height
+; OPT         0=plain mmx, 1=xmm
 ;
-; This file is part of XviD, a free MPEG-4 video encoder/decoder
-;
-; This program is free software; you can redistribute it and/or modify it
-; under the terms of the GNU General Public License as published by
-; the Free Software Foundation; either version 2 of the License, or
-; (at your option) any later version.
-;
-; This program is distributed in the hope that it will be useful, but
-; WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; GNU General Public License for more details.
-;
-; You should have received a copy of the GNU General Public License
-; along with this program; if not, write to the Free Software
-; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-;
-;------------------------------------------------------------------------------
-;------------------------------------------------------------------------------
-;
-; yuv_to_yuv.asm, MMX optimized color conversion
-;
-; Copyright (C) 2001 - Michael Militzer
-;
-; For more information visit the XviD homepage: http://www.xvid.org
 ;
+; Trashes: DST, SRC, WIDTH, HEIGHT, _EBX, _ECX, _EDX
 ;------------------------------------------------------------------------------
-;------------------------------------------------------------------------------
-;
-; Revision history:
-;
-; 24.11.2001  initial version  (Isibaar)
-; 23.07.2002  thread safe  (edgomez)
-;
-; $Id: colorspace_yuv_mmx.asm,v 1.2 2003-02-15 15:22:18 edgomez Exp $
-;
-;------------------------------------------------------------------------------
-BITS 32
+%macro PLANE_COPY 7
+%define DST         %1
+%define DST_STRIDE  %2
+%define SRC         %3
+%define SRC_STRIDE  %4
+%define WIDTH       %5
+%define HEIGHT      %6
+%define OPT         %7
+
+  mov _EBX, WIDTH
+  shr WIDTH, 6               ; $_EAX$ = width / 64
+  and _EBX, 63               ; remainder = width % 64
+  mov _EDX, _EBX
+  shr _EBX, 4                ; $_EBX$ = remainder / 16
+  and _EDX, 15               ; $_EDX$ = remainder % 16
+
+%%loop64_start_pc:
+  push DST
+  push SRC
+
+  mov _ECX, WIDTH            ; width64
+  test WIDTH, WIDTH
+  jz %%loop16_start_pc
+
+%%loop64_pc:
+%if OPT == 1                 ; xmm
+  prefetchnta [SRC + 64]     ; non temporal prefetch
+  prefetchnta [SRC + 96]
+%endif
+  movq mm1, [SRC     ]       ; read from src
+  movq mm2, [SRC +  8]
+  movq mm3, [SRC + 16]
+  movq mm4, [SRC + 24]
+  movq mm5, [SRC + 32]
+  movq mm6, [SRC + 40]
+  movq mm7, [SRC + 48]
+  movq mm0, [SRC + 56]
+
+%if OPT == 0                 ; plain mmx
+  movq [DST     ], mm1       ; write to dst
+  movq [DST +  8], mm2
+  movq [DST + 16], mm3
+  movq [DST + 24], mm4
+  movq [DST + 32], mm5
+  movq [DST + 40], mm6
+  movq [DST + 48], mm7
+  movq [DST + 56], mm0
+%else
+  movntq [DST     ], mm1     ; write to dst
+  movntq [DST +  8], mm2
+  movntq [DST + 16], mm3
+  movntq [DST + 24], mm4
+  movntq [DST + 32], mm5
+  movntq [DST + 40], mm6
+  movntq [DST + 48], mm7
+  movntq [DST + 56], mm0
%endif
-%macro cglobal 1
-%ifdef PREFIX
-  global _%1
-  %define %1 _%1
-  %else
-  global %1
-  %endif
-%endmacro
+  add SRC, 64
+  add DST, 64
+  loop %%loop64_pc
+
+%%loop16_start_pc:
+  mov _ECX, _EBX             ; width16
+  test _EBX, _EBX
+  jz %%loop1_start_pc
+
+%%loop16_pc:
+  movq mm1, [SRC]
+  movq mm2, [SRC + 8]
+%if OPT == 0                 ; plain mmx
+  movq [DST], mm1
+  movq [DST + 8], mm2
+%else
+  movntq [DST], mm1
+  movntq [DST + 8], mm2
+%endif
-SECTION .text
+  add SRC, 16
+  add DST, 16
+  loop %%loop16_pc
+
+%%loop1_start_pc:
+  mov _ECX, _EDX
+  rep movsb
+
+  pop SRC
+  pop DST
+
+%ifdef ARCH_IS_X86_64
+  movsx _ECX, SRC_STRIDE
+  add SRC, _ECX
+  mov ecx, DST_STRIDE
+  add DST, _ECX
+%else
+  add SRC, SRC_STRIDE
+  add DST, DST_STRIDE
+%endif
+  dec HEIGHT
+  jg near %%loop64_start_pc
-ALIGN 64
+
+%undef DST
+%undef DST_STRIDE
+%undef SRC
+%undef SRC_STRIDE
+%undef WIDTH
+%undef HEIGHT
+%undef OPT
+%endmacro
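Stripped of the MMX details, the new PLANE_COPY is a strided row copy: each row is moved in 64-byte blocks through eight MMX registers, then a 16-byte loop, then a `rep movsb` byte tail, and the OPT=1 (xmm) variant uses non-temporal `movntq` stores. Below is a scalar C sketch of the same behaviour; `plane_copy_c` is an illustrative name, not a function from the XviD tree:

    #include <stdint.h>
    #include <string.h>

    /* Rough scalar equivalent of PLANE_COPY: copy a width x height
     * plane between two strided buffers, one row at a time. */
    static void plane_copy_c(uint8_t *dst, int dst_stride,
                             const uint8_t *src, int src_stride,
                             int width, int height)
    {
        int y;
        for (y = 0; y < height; y++) {
            /* asm: width/64 MMX blocks, then 16-byte chunks, then
             * a rep movsb tail for the last width%16 bytes */
            memcpy(dst, src, (size_t)width);
            dst += dst_stride;
            src += src_stride;
        }
    }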
 ;------------------------------------------------------------------------------
-; PLANE_COPY ( DST, DST_DIF, SRC, SRC_DIF, WIDTH, HEIGHT, OPT )
+; PLANE_FILL ( DST, DST_STRIDE, WIDTH, HEIGHT, OPT )
 ; DST         dst buffer
-; DST_DIF     dst stride difference (e.g. stride - width)
-; SRC         src destination buffer
-; SRC_DIF     src stride difference (e.g. stride - width)
+; DST_STRIDE  dst stride
 ; WIDTH       width
 ; HEIGHT      height
 ; OPT         0=plain mmx, 1=xmm
+;
+; Trashes: DST, WIDTH, HEIGHT, _EBX, _ECX, _EDX, _EAX
 ;------------------------------------------------------------------------------
-%macro PLANE_COPY 7
-%define DST %1
-%define DST_DIF %2
-%define SRC %3
-%define SRC_DIF %4
-%define WIDTH %5
-%define HEIGHT %6
-%define OPT %7
-
-  mov eax, WIDTH
-  mov ebp, HEIGHT            ; $ebp$ = height
-  mov esi, SRC
-  mov edi, DST
-
-  mov ebx, eax
-  shr eax, 6                 ; $eax$ = width / 64
-  and ebx, 63                ; remainder = width % 64
-  mov edx, ebx
-  shr ebx, 4                 ; $ebx$ = reaminder / 16
-  and edx, 15                ; $edx$ = remainder % 16
-
-%%loop64_start
-  or eax, eax
-  jz %%loop16_start
-  mov ecx, eax               ; width64
-%%loop64:
-%if OPT == 1                 ; xmm
-  prefetchnta [esi + 64]     ; non temporal prefetch
-  prefetchnta [esi + 96]
-%endif
-  movq mm1, [esi]            ; read from src
-  movq mm2, [esi + 8]
-  movq mm3, [esi + 16]
-  movq mm4, [esi + 24]
-  movq mm5, [esi + 32]
-  movq mm6, [esi + 40]
-  movq mm7, [esi + 48]
-  movq mm0, [esi + 56]
-
-%if OPT == 0                 ; plain mmx
-  movq [edi], mm1            ; write to y_out
-  movq [edi + 8], mm2
-  movq [edi + 16], mm3
-  movq [edi + 24], mm4
-  movq [edi + 32], mm5
-  movq [edi + 40], mm6
-  movq [edi + 48], mm7
-  movq [edi + 56], mm0
-%else
-  movntq [edi], mm1          ; write to y_out
-  movntq [edi + 8], mm2
-  movntq [edi + 16], mm3
-  movntq [edi + 24], mm4
-  movntq [edi + 32], mm5
-  movntq [edi + 40], mm6
-  movntq [edi + 48], mm7
-  movntq [edi + 56], mm0
-%endif
-
-  add esi, 64
-  add edi, 64
-  dec ecx
-  jnz %%loop64
-
-
-%%loop16_start
-  or ebx, ebx
-  jz %%loop1_start
-  mov ecx, ebx               ; width16
-%%loop16:
-  movq mm1, [esi]
-  movq mm2, [esi + 8]
-%if OPT == 0                 ; plain mmx
-  movq [edi], mm1
-  movq [edi + 8], mm2
+
+%macro PLANE_FILL 5
+%define DST         %1
+%define DST_STRIDE  %2
+%define WIDTH       %3
+%define HEIGHT      %4
+%define OPT         %5
+
+  mov _EAX, 0x80808080
+  mov _EBX, WIDTH
+  shr WIDTH, 6               ; $_ESI$ = width / 64
+  and _EBX, 63               ; _EBX = remainder = width % 64
+  movd mm0, eax
+  mov _EDX, _EBX
+  shr _EBX, 4                ; $_EBX$ = remainder / 16
+  and _EDX, 15               ; $_EDX$ = remainder % 16
+  punpckldq mm0, mm0
+
+%%loop64_start_pf:
+  push DST
+  mov _ECX, WIDTH            ; width64
+  test WIDTH, WIDTH
+  jz %%loop16_start_pf
+
+%%loop64_pf:
+
+%if OPT == 0                 ; plain mmx
+  movq [DST     ], mm0       ; fill dst with 0x80
+  movq [DST +  8], mm0
+  movq [DST + 16], mm0
+  movq [DST + 24], mm0
+  movq [DST + 32], mm0
+  movq [DST + 40], mm0
+  movq [DST + 48], mm0
+  movq [DST + 56], mm0
 %else
-  movntq [edi], mm1
-  movntq [edi + 8], mm2
+  movntq [DST     ], mm0     ; fill dst with 0x80
+  movntq [DST +  8], mm0
+  movntq [DST + 16], mm0
+  movntq [DST + 24], mm0
+  movntq [DST + 32], mm0
+  movntq [DST + 40], mm0
+  movntq [DST + 48], mm0
+  movntq [DST + 56], mm0
 %endif
-  add esi, 16
-  add edi, 16
-  dec ecx
-  jnz %%loop16
+  add DST, 64
+  loop %%loop64_pf
+
+%%loop16_start_pf:
+  mov _ECX, _EBX             ; width16
+  test _EBX, _EBX
+  jz %%loop1_start_pf
+
+%%loop16_pf:
+%if OPT == 0                 ; plain mmx
+  movq [DST     ], mm0
+  movq [DST + 8], mm0
+%else
+  movntq [DST     ], mm0
+  movntq [DST + 8], mm0
+%endif
-%%loop1_start
-  mov ecx, edx
-  rep movsb
+  add DST, 16
+  loop %%loop16_pf
-  add esi, SRC_DIF
-  add edi, DST_DIF
-  dec ebp
-  jnz near %%loop64_start
-%endmacro
-;------------------------------------------------------------------------------
+
+%%loop1_start_pf:
+  mov _ECX, _EDX
+  rep stosb
+
+  pop DST
+
+%ifdef ARCH_IS_X86_64
+  mov ecx, DST_STRIDE
+  add DST, _ECX
+%else
+  add DST, DST_STRIDE
+%endif
+  dec HEIGHT
+  jg near %%loop64_start_pf
+
+%undef DST
+%undef DST_STRIDE
+%undef WIDTH
+%undef HEIGHT
+%undef OPT
+%endmacro
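PLANE_FILL is the matching fill routine: it broadcasts 0x80808080 into mm0 (movd + punpckldq) and stores it across each row in 64- and 16-byte chunks, finishing with `rep stosb`. In scalar C terms — again a hypothetical helper, assuming the same stride convention:

    /* Rough scalar equivalent of PLANE_FILL: set every byte of a
     * strided plane to 0x80, the neutral chroma value. */
    static void plane_fill_c(uint8_t *dst, int dst_stride,
                             int width, int height)
    {
        int y;
        for (y = 0; y < height; y++) {
            memset(dst, 0x80, (size_t)width);
            dst += dst_stride;
        }
    }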
 ;------------------------------------------------------------------------------
 ; MAKE_YV12_TO_YV12( NAME, OPT )
 ; NAME    function name
 ; OPT     0=plain mmx, 1=xmm
 ;
-; yv12_to_yv12_mmx(uint8_t * y_dst, uint8_t * u_dst, uint8_t * v_dst,
+; yv12_to_yv12_mmx(uint8_t * y_dst, uint8_t * u_dst, uint8_t * v_dst,
 ;                  int y_dst_stride, int uv_dst_stride,
-;                  uint8_t * y_src, uint8_t * u_src, uint8_t * v_src,
+;                  uint8_t * y_src, uint8_t * u_src, uint8_t * v_src,
 ;                  int y_src_stride, int uv_src_stride,
 ;                  int width, int height, int vflip)
 ;------------------------------------------------------------------------------

 %macro MAKE_YV12_TO_YV12 2
 %define NAME %1
-%define OPT  %2
+%define XMM_OPT %2

-align 16
+ALIGN SECTION_ALIGN
 cglobal NAME
-NAME
-%define pushsize 16
-%define localsize 24
-
-%define vflip           esp + localsize + pushsize + 52
-%define height          esp + localsize + pushsize + 48
-%define width           esp + localsize + pushsize + 44
-%define uv_src_stride   esp + localsize + pushsize + 40
-%define y_src_stride    esp + localsize + pushsize + 36
-%define v_src           esp + localsize + pushsize + 32
-%define u_src           esp + localsize + pushsize + 28
-%define y_src           esp + localsize + pushsize + 24
-%define uv_dst_stride   esp + localsize + pushsize + 20
-%define y_dst_stride    esp + localsize + pushsize + 16
-%define v_dst           esp + localsize + pushsize + 12
-%define u_dst           esp + localsize + pushsize + 8
-%define y_dst           esp + localsize + pushsize + 4
-%define _ip             esp + localsize + pushsize + 0
-
-  push ebx                   ; esp + localsize + 16
-  push esi                   ; esp + localsize + 8
-  push edi                   ; esp + localsize + 4
-  push ebp                   ; esp + localsize + 0
-
-%define width2      esp + localsize - 4
-%define height2     esp + localsize - 8
-%define y_src_dif   esp + localsize - 12
-%define y_dst_dif   esp + localsize - 16
-%define uv_src_dif  esp + localsize - 20
-%define uv_dst_dif  esp + localsize - 24
-
-  sub esp, localsize
-
-  mov eax, [width]
-  mov ebx, [height]
-  shr eax, 1                 ; calculate widht/2, heigh/2
-  shr ebx, 1
-  mov [width2], eax
-  mov [height2], ebx
-
-  mov ebp, [vflip]
-  or ebp, ebp
-  jz near .dont_flip
-
-; flipping support
-  mov eax, [height]
-  mov esi, [y_src]
-  mov edx, [y_src_stride]
-  push edx
-  mul edx
-  pop edx
-  add esi, eax               ; y_src += (height-1) * y_src_stride
-  neg edx
-  mov [y_src], esi
-  mov [y_src_stride], edx    ; y_src_stride = -y_src_stride
-
-  mov eax, [height2]
-  mov esi, [u_src]
-  mov edi, [v_src]
-  mov edx, [uv_src_stride]
-  sub eax, 1                 ; ebp = height2 - 1
-  push edx
-  mul edx
-  pop edx
-  add esi, eax               ; u_src += (height2-1) * uv_src_stride
-  add edi, eax               ; v_src += (height2-1) * uv_src_stride
-  neg edx
-  mov [u_src], esi
-  mov [v_src], edi
-  mov [uv_src_stride], edx   ; uv_src_stride = -uv_src_stride
-
-.dont_flip
-
-  mov eax, [y_src_stride]
-  mov ebx, [y_dst_stride]
-  mov ecx, [uv_src_stride]
-  mov edx, [uv_dst_stride]
-  sub eax, [width]
-  sub ebx, [width]
-  sub ecx, [width2]
-  sub edx, [width2]
-  mov [y_src_dif], eax       ; y_src_dif = y_src_stride - width
-  mov [y_dst_dif], ebx       ; y_dst_dif = y_dst_stride - width
-  mov [uv_src_dif], ecx      ; uv_src_dif = uv_src_stride - width2
-  mov [uv_dst_dif], edx      ; uv_dst_dif = uv_dst_stride - width2
-
-  PLANE_COPY [y_dst], [y_dst_dif], [y_src], [y_src_dif], [width], [height], OPT
-  PLANE_COPY [u_dst], [uv_dst_dif], [u_src], [uv_src_dif], [width2], [height2], OPT
-  PLANE_COPY [v_dst], [uv_dst_dif], [v_src], [uv_src_dif], [width2], [height2], OPT
-
-  add esp, localsize
-  pop ebp
-  pop edi
-  pop esi
-  pop ebx
+NAME:
+
+  push _EBX                  ; _ESP + localsize + 3*PTR_SIZE
-  ret
+%define localsize 2*4
+
+%ifdef ARCH_IS_X86_64
+
+%ifndef WINDOWS
+%define pushsize 2*PTR_SIZE
+%define shadow 0
+%else
+%define pushsize 4*PTR_SIZE
+%define shadow 32 + 16
+%endif
+
+%define prm_vflip         dword [_ESP + localsize + pushsize + shadow + 7*PTR_SIZE]
+%define prm_height        dword [_ESP + localsize + pushsize + shadow + 6*PTR_SIZE]
+%define prm_width         dword [_ESP + localsize + pushsize + shadow + 5*PTR_SIZE]
+%define prm_uv_src_stride dword [_ESP + localsize + pushsize + shadow + 4*PTR_SIZE]
+%define prm_y_src_stride  dword [_ESP + localsize + pushsize + shadow + 3*PTR_SIZE]
+%define prm_v_src               [_ESP + localsize + pushsize + shadow + 2*PTR_SIZE]
+%define prm_u_src               [_ESP + localsize + pushsize + shadow + 1*PTR_SIZE]
+
+%ifdef WINDOWS
+  push _ESI                  ; _ESP + localsize + 2*PTR_SIZE
+  push _EDI                  ; _ESP + localsize + 1*PTR_SIZE
+  push _EBP                  ; _ESP + localsize + 0*PTR_SIZE
+
+  sub _ESP, localsize
+
+%define prm_y_src          _ESI
+%define prm_uv_dst_stride  TMP0d
+%define prm_y_dst_stride   prm4d
+%define prm_v_dst          prm3
+%define prm_u_dst          TMP1
+%define prm_y_dst          _EDI
+
+  mov _EDI, prm1
+  mov TMP1, prm2
+
+  mov _ESI, [_ESP + localsize + pushsize + shadow - 1*PTR_SIZE]
+  mov TMP0d, dword [_ESP + localsize + pushsize + shadow - 2*PTR_SIZE]
+
+%else
+  push _EBP                  ; _ESP + localsize + 0*PTR_SIZE
+
+  sub _ESP, localsize
+
+%define prm_y_src          _ESI
+%define prm_uv_dst_stride  prm5d
+%define prm_y_dst_stride   TMP1d
+%define prm_v_dst          prm6
+%define prm_u_dst          TMP0
+%define prm_y_dst          _EDI
+
+  mov TMP0, prm2
+  mov _ESI, prm6
+
+  mov prm6, prm3
+  mov TMP1d, prm4d
+%endif
+
+%define _ip _ESP + localsize + pushsize + 0
+
+%else
+
+%define pushsize 4*PTR_SIZE
+
+%define prm_vflip         [_ESP + localsize + pushsize + 13*PTR_SIZE]
+%define prm_height        [_ESP + localsize + pushsize + 12*PTR_SIZE]
+%define prm_width         [_ESP + localsize + pushsize + 11*PTR_SIZE]
+%define prm_uv_src_stride [_ESP + localsize + pushsize + 10*PTR_SIZE]
+%define prm_y_src_stride  [_ESP + localsize + pushsize +  9*PTR_SIZE]
+%define prm_v_src         [_ESP + localsize + pushsize +  8*PTR_SIZE]
+%define prm_u_src         [_ESP + localsize + pushsize +  7*PTR_SIZE]
+
+%define prm_y_src          _ESI
+%define prm_uv_dst_stride  [_ESP + localsize + pushsize + 5*PTR_SIZE]
+%define prm_y_dst_stride   [_ESP + localsize + pushsize + 4*PTR_SIZE]
+%define prm_v_dst          [_ESP + localsize + pushsize + 3*PTR_SIZE]
+%define prm_u_dst          [_ESP + localsize + pushsize + 2*PTR_SIZE]
+%define prm_y_dst          _EDI
+
+%define _ip _ESP + localsize + pushsize + 0
+
+  push _ESI                  ; _ESP + localsize + 2*PTR_SIZE
+  push _EDI                  ; _ESP + localsize + 1*PTR_SIZE
+  push _EBP                  ; _ESP + localsize + 0*PTR_SIZE
+
+  sub _ESP, localsize
+
+  mov _ESI, [_ESP + localsize + pushsize + 6*PTR_SIZE]
+  mov _EDI, [_ESP + localsize + pushsize + 1*PTR_SIZE]
+
+%endif
+
+%define width2  dword [_ESP + localsize - 1*4]
+%define height2 dword [_ESP + localsize - 2*4]
+
+  mov eax, prm_width
+  mov ebx, prm_height
+  shr eax, 1                 ; calculate width/2, height/2
+  shr ebx, 1
+  mov width2, eax
+  mov height2, ebx
+
+  mov eax, prm_vflip
+  test eax, eax
+  jz near .go
+
+  ; flipping support
+  mov eax, prm_height
+  mov ecx, prm_y_src_stride
+  sub eax, 1
+  imul eax, ecx
+  add _ESI, _EAX             ; y_src += (height-1) * y_src_stride
+  neg ecx
+  mov prm_y_src_stride, ecx  ; y_src_stride = -y_src_stride
+
+  mov eax, height2
+  mov _EDX, prm_u_src
+  mov _EBP, prm_v_src
+  mov ecx, prm_uv_src_stride
+  test _EDX, _EDX
+  jz .go
+  test _EBP, _EBP
+  jz .go
+  sub eax, 1                 ; _EAX = height2 - 1
+  imul eax, ecx
+  add _EDX, _EAX             ; u_src += (height2-1) * uv_src_stride
+  add _EBP, _EAX             ; v_src += (height2-1) * uv_src_stride
+  neg ecx
+  mov prm_u_src, _EDX
+  mov prm_v_src, _EBP
+  mov prm_uv_src_stride, ecx ; uv_src_stride = -uv_src_stride
+
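The flipping block above is the usual negative-stride trick: each source plane is pointed at its last row and the stride is negated, so the subsequent copy walks the source bottom-up while the destination is written top-down. Sketched in C with the function's own parameter names (chroma is only adjusted when both pointers are non-NULL, mirroring the two `jz .go` guards):

    if (vflip) {
        y_src += (height - 1) * y_src_stride;
        y_src_stride = -y_src_stride;
        if (u_src != NULL && v_src != NULL) {
            u_src += (height / 2 - 1) * uv_src_stride;
            v_src += (height / 2 - 1) * uv_src_stride;
            uv_src_stride = -uv_src_stride;
        }
    }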
+.go:
+  mov eax, prm_width
+  mov ebp, prm_height
+  PLANE_COPY _EDI, prm_y_dst_stride, _ESI, prm_y_src_stride, _EAX, _EBP, XMM_OPT
+
+  mov _EAX, prm_u_src
+  or  _EAX, prm_v_src
+  jz near .UVFill_0x80
+
+  mov eax, width2
+  mov ebp, height2
+  mov _ESI, prm_u_src
+  mov _EDI, prm_u_dst
+  PLANE_COPY _EDI, prm_uv_dst_stride, _ESI, prm_uv_src_stride, _EAX, _EBP, XMM_OPT
+
+  mov eax, width2
+  mov ebp, height2
+  mov _ESI, prm_v_src
+  mov _EDI, prm_v_dst
+  PLANE_COPY _EDI, prm_uv_dst_stride, _ESI, prm_uv_src_stride, _EAX, _EBP, XMM_OPT
+
+.Done_UVPlane:
+  add _ESP, localsize
+
+  pop _EBP
+%ifndef ARCH_IS_X86_64
+  pop _EDI
+  pop _ESI
+%else
+%ifdef WINDOWS
+  pop _EDI
+  pop _ESI
+%endif
+%endif
+  pop _EBX
+
+  ret
+
+.UVFill_0x80:
+
+  mov esi, width2
+  mov ebp, height2
+  mov _EDI, prm_u_dst
+  PLANE_FILL _EDI, prm_uv_dst_stride, _ESI, _EBP, XMM_OPT
+
+  mov esi, width2
+  mov ebp, height2
+  mov _EDI, prm_v_dst
+  PLANE_FILL _EDI, prm_uv_dst_stride, _ESI, _EBP, XMM_OPT
+
+  jmp near .Done_UVPlane
+
+ENDFUNC
+
+%undef NAME
+%undef XMM_OPT
 %endmacro

-;------------------------------------------------------------------------------
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .rotext align=SECTION_ALIGN

 MAKE_YV12_TO_YV12 yv12_to_yv12_mmx, 0
+
 MAKE_YV12_TO_YV12 yv12_to_yv12_xmm, 1
+
+%ifidn __OUTPUT_FORMAT__,elf
+section ".note.GNU-stack" noalloc noexec nowrite progbits
+%endif
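Taken together, the generated yv12_to_yv12_mmx/yv12_to_yv12_xmm entry points behave as sketched below. This is a behavioural sketch composing the hypothetical plane_copy_c/plane_fill_c helpers from the earlier notes, not the exported implementation; note the fallback where both chroma planes are filled with neutral 0x80 when both source chroma pointers are NULL (the `or`/`jz near .UVFill_0x80` test):

    static void yv12_to_yv12_sketch(uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
                                    int y_dst_stride, int uv_dst_stride,
                                    uint8_t *y_src, uint8_t *u_src, uint8_t *v_src,
                                    int y_src_stride, int uv_src_stride,
                                    int width, int height, int vflip)
    {
        int width2 = width / 2, height2 = height / 2;

        if (vflip) {
            /* negative-stride setup, see the sketch after the
             * flipping block above */
        }

        plane_copy_c(y_dst, y_dst_stride, y_src, y_src_stride, width, height);

        if (u_src == NULL && v_src == NULL) {
            plane_fill_c(u_dst, uv_dst_stride, width2, height2);  /* grey chroma */
            plane_fill_c(v_dst, uv_dst_stride, width2, height2);
        } else {
            plane_copy_c(u_dst, uv_dst_stride, u_src, uv_src_stride, width2, height2);
            plane_copy_c(v_dst, uv_dst_stride, v_src, uv_src_stride, width2, height2);
        }
    }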