--- trunk/xvidcore/src/image/x86_asm/colorspace_yuv_mmx.asm	2004/03/18 16:11:28	1381
+++ trunk/xvidcore/src/image/x86_asm/colorspace_yuv_mmx.asm	2004/03/22 22:36:25	1382
@@ -1,56 +1,42 @@
-;------------------------------------------------------------------------------
-;
-; This file is part of XviD, a free MPEG-4 video encoder/decoder
-;
-; This program is free software; you can redistribute it and/or modify it
-; under the terms of the GNU General Public License as published by
-; the Free Software Foundation; either version 2 of the License, or
-; (at your option) any later version.
-;
-; This program is distributed in the hope that it will be useful, but
-; WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; GNU General Public License for more details.
-;
-; You should have received a copy of the GNU General Public License
-; along with this program; if not, write to the Free Software
-; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-;
-;------------------------------------------------------------------------------
-;------------------------------------------------------------------------------
-;
-; yuv_to_yuv.asm, MMX optimized color conversion
-;
-; Copyright (C) 2001 - Michael Militzer
-;
-; For more information visit the XviD homepage: http://www.xvid.org
-;
-;------------------------------------------------------------------------------
-;------------------------------------------------------------------------------
-;
-; Revision history:
-;
-; 24.11.2001  initial version (Isibaar)
-; 23.07.2002  thread safe (edgomez)
-;
-; $Id: colorspace_yuv_mmx.asm,v 1.2 2003-02-15 15:22:18 edgomez Exp $
-;
-;------------------------------------------------------------------------------
+;/****************************************************************************
+; *
+; *  XVID MPEG-4 VIDEO CODEC
+; *  - MMX and XMM YV12->YV12 conversion -
+; *
+; *  Copyright(C) 2001 Michael Militzer
+; *
+; *  This program is free software; you can redistribute it and/or modify it
+; *  under the terms of the GNU General Public License as published by
+; *  the Free Software Foundation; either version 2 of the License, or
+; *  (at your option) any later version.
+; *
+; *  This program is distributed in the hope that it will be useful,
+; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; *  GNU General Public License for more details.
+; *
+; *  You should have received a copy of the GNU General Public License
+; *  along with this program; if not, write to the Free Software
+; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+; *
+; * $Id: colorspace_yuv_mmx.asm,v 1.3 2004-03-22 22:36:24 edgomez Exp $
+; *
+; ***************************************************************************/

 BITS 32

-%macro cglobal 1 
+%macro cglobal 1
 %ifdef PREFIX
-  global _%1 
+  global _%1
   %define %1 _%1
 %else
   global %1
 %endif
 %endmacro

-SECTION .text
-
-ALIGN 64
+;=============================================================================
+; Helper macros
+;=============================================================================

 ;------------------------------------------------------------------------------
 ; PLANE_COPY ( DST, DST_DIF, SRC, SRC_DIF, WIDTH, HEIGHT, OPT )
@@ -62,6 +48,7 @@
 ; HEIGHT    height
 ; OPT       0=plain mmx, 1=xmm
 ;------------------------------------------------------------------------------
+
 %macro PLANE_COPY 7
 %define DST      %1
 %define DST_DIF  %2
@@ -71,113 +58,110 @@
 %define HEIGHT   %6
 %define OPT      %7

-  mov eax, WIDTH
-  mov ebp, HEIGHT       ; ebp = height
-  mov esi, SRC
-  mov edi, DST
-
-  mov ebx, eax
-  shr eax, 6            ; eax = width / 64
-  and ebx, 63           ; remainder = width % 64
-  mov edx, ebx
-  shr ebx, 4            ; ebx = reaminder / 16
-  and edx, 15           ; edx = remainder % 16
+  mov eax, WIDTH
+  mov ebp, HEIGHT       ; ebp = height
+  mov esi, SRC
+  mov edi, DST
+
+  mov ebx, eax
+  shr eax, 6            ; eax = width / 64
+  and ebx, 63           ; remainder = width % 64
+  mov edx, ebx
+  shr ebx, 4            ; ebx = remainder / 16
+  and edx, 15           ; edx = remainder % 16

 %%loop64_start
-  or eax, eax
-  jz %%loop16_start
-  mov ecx, eax          ; width64
+  or eax, eax
+  jz %%loop16_start
+  mov ecx, eax          ; width64
 %%loop64:
-%if OPT == 1            ; xmm
-  prefetchnta [esi + 64]        ; non temporal prefetch
-  prefetchnta [esi + 96]
+%if OPT == 1            ; xmm
+  prefetchnta [esi + 64]        ; non temporal prefetch
+  prefetchnta [esi + 96]
 %endif
-  movq mm1, [esi]       ; read from src
-  movq mm2, [esi + 8]
-  movq mm3, [esi + 16]
-  movq mm4, [esi + 24]
-  movq mm5, [esi + 32]
-  movq mm6, [esi + 40]
-  movq mm7, [esi + 48]
-  movq mm0, [esi + 56]
-
-%if OPT == 0            ; plain mmx
-  movq [edi], mm1       ; write to y_out
-  movq [edi + 8], mm2
-  movq [edi + 16], mm3
-  movq [edi + 24], mm4
-  movq [edi + 32], mm5
-  movq [edi + 40], mm6
-  movq [edi + 48], mm7
-  movq [edi + 56], mm0
+  movq mm1, [esi]       ; read from src
+  movq mm2, [esi + 8]
+  movq mm3, [esi + 16]
+  movq mm4, [esi + 24]
+  movq mm5, [esi + 32]
+  movq mm6, [esi + 40]
+  movq mm7, [esi + 48]
+  movq mm0, [esi + 56]
+
+%if OPT == 0            ; plain mmx
+  movq [edi], mm1       ; write to y_out
+  movq [edi + 8], mm2
+  movq [edi + 16], mm3
+  movq [edi + 24], mm4
+  movq [edi + 32], mm5
+  movq [edi + 40], mm6
+  movq [edi + 48], mm7
+  movq [edi + 56], mm0
 %else
-  movntq [edi], mm1     ; write to y_out
-  movntq [edi + 8], mm2
-  movntq [edi + 16], mm3
-  movntq [edi + 24], mm4
-  movntq [edi + 32], mm5
-  movntq [edi + 40], mm6
-  movntq [edi + 48], mm7
-  movntq [edi + 56], mm0
+  movntq [edi], mm1     ; write to y_out
+  movntq [edi + 8], mm2
+  movntq [edi + 16], mm3
+  movntq [edi + 24], mm4
+  movntq [edi + 32], mm5
+  movntq [edi + 40], mm6
+  movntq [edi + 48], mm7
+  movntq [edi + 56], mm0
 %endif

-  add esi, 64
-  add edi, 64
-  dec ecx
-  jnz %%loop64
+  add esi, 64
+  add edi, 64
+  dec ecx
+  jnz %%loop64

 %%loop16_start
-  or ebx, ebx
-  jz %%loop1_start
-  mov ecx, ebx          ; width16
+  or ebx, ebx
+  jz %%loop1_start
+  mov ecx, ebx          ; width16
 %%loop16:
-  movq mm1, [esi]
-  movq mm2, [esi + 8]
-%if OPT == 0            ; plain mmx
-  movq [edi], mm1
-  movq [edi + 8], mm2
+  movq mm1, [esi]
+  movq mm2, [esi + 8]
+%if OPT == 0            ; plain mmx
+  movq [edi], mm1
+  movq [edi + 8], mm2
 %else
-  movntq [edi], mm1
-  movntq [edi + 8], mm2
+  movntq [edi], mm1
+  movntq [edi + 8], mm2
 %endif

-  add esi, 16
-  add edi, 16
-  dec ecx
-  jnz %%loop16
+  add esi, 16
+  add edi, 16
+  dec ecx
+  jnz %%loop16

 %%loop1_start
-  mov ecx, edx
-  rep movsb
+  mov ecx, edx
+  rep movsb

-  add esi, SRC_DIF
-  add edi, DST_DIF
-  dec ebp
-  jnz near %%loop64_start
+  add esi, SRC_DIF
+  add edi, DST_DIF
+  dec ebp
+  jnz near %%loop64_start

 %endmacro

-;------------------------------------------------------------------------------
-
-
 ;------------------------------------------------------------------------------
 ; MAKE_YV12_TO_YV12( NAME, OPT )
 ; NAME      function name
 ; OPT       0=plain mmx, 1=xmm
 ;
-; yv12_to_yv12_mmx(uint8_t * y_dst, uint8_t * u_dst, uint8_t * v_dst, 
+; yv12_to_yv12_mmx(uint8_t * y_dst, uint8_t * u_dst, uint8_t * v_dst,
 ;                  int y_dst_stride, int uv_dst_stride,
-;                  uint8_t * y_src, uint8_t * u_src, uint8_t * v_src, 
+;                  uint8_t * y_src, uint8_t * u_src, uint8_t * v_src,
 ;                  int y_src_stride, int uv_src_stride,
 ;                  int width, int height, int vflip)
 ;------------------------------------------------------------------------------
 %macro MAKE_YV12_TO_YV12 2
 %define NAME %1
 %define OPT  %2
-align 16
+ALIGN 16
 cglobal NAME
-NAME
+NAME:
 %define pushsize  16
 %define localsize 24
@@ -196,10 +180,10 @@
 %define y_dst esp + localsize + pushsize + 4
 %define _ip   esp + localsize + pushsize + 0

-  push ebx              ; esp + localsize + 16
-  push esi              ; esp + localsize + 8
-  push edi              ; esp + localsize + 4
-  push ebp              ; esp + localsize + 0
+  push ebx              ; esp + localsize + 16
+  push esi              ; esp + localsize + 8
+  push edi              ; esp + localsize + 4
+  push ebp              ; esp + localsize + 0

 %define width2     esp + localsize - 4
 %define height2    esp + localsize - 8
@@ -208,75 +192,80 @@
 %define uv_src_dif esp + localsize - 20
 %define uv_dst_dif esp + localsize - 24

-  sub esp, localsize
+  sub esp, localsize

-  mov eax, [width]
-  mov ebx, [height]
-  shr eax, 1            ; calculate widht/2, heigh/2
-  shr ebx, 1
-  mov [width2], eax
-  mov [height2], ebx
-
-  mov ebp, [vflip]
-  or ebp, ebp
-  jz near .dont_flip
+  mov eax, [width]
+  mov ebx, [height]
+  shr eax, 1            ; calculate width/2, height/2
+  shr ebx, 1
+  mov [width2], eax
+  mov [height2], ebx
+
+  mov ebp, [vflip]
+  or ebp, ebp
+  jz near .dont_flip

 ; flipping support
-  mov eax, [height]
-  mov esi, [y_src]
-  mov edx, [y_src_stride]
-  push edx
-  mul edx
-  pop edx
-  add esi, eax          ; y_src += (height-1) * y_src_stride
-  neg edx
-  mov [y_src], esi
-  mov [y_src_stride], edx       ; y_src_stride = -y_src_stride
-
-  mov eax, [height2]
-  mov esi, [u_src]
-  mov edi, [v_src]
-  mov edx, [uv_src_stride]
-  sub eax, 1            ; ebp = height2 - 1
-  push edx
-  mul edx
-  pop edx
-  add esi, eax          ; u_src += (height2-1) * uv_src_stride
-  add edi, eax          ; v_src += (height2-1) * uv_src_stride
-  neg edx
-  mov [u_src], esi
-  mov [v_src], edi
-  mov [uv_src_stride], edx      ; uv_src_stride = -uv_src_stride
+  mov eax, [height]
+  mov esi, [y_src]
+  mov edx, [y_src_stride]
+  push edx
+  mul edx
+  pop edx
+  add esi, eax          ; y_src += (height-1) * y_src_stride
+  neg edx
+  mov [y_src], esi
+  mov [y_src_stride], edx       ; y_src_stride = -y_src_stride
+
+  mov eax, [height2]
+  mov esi, [u_src]
+  mov edi, [v_src]
+  mov edx, [uv_src_stride]
+  sub eax, 1            ; eax = height2 - 1
+  push edx
+  mul edx
+  pop edx
+  add esi, eax          ; u_src += (height2-1) * uv_src_stride
+  add edi, eax          ; v_src += (height2-1) * uv_src_stride
+  neg edx
+  mov [u_src], esi
+  mov [v_src], edi
+  mov [uv_src_stride], edx      ; uv_src_stride = -uv_src_stride

 .dont_flip

-  mov eax, [y_src_stride]
-  mov ebx, [y_dst_stride]
-  mov ecx, [uv_src_stride]
-  mov edx, [uv_dst_stride]
-  sub eax, [width]
-  sub ebx, [width]
-  sub ecx, [width2]
-  sub edx, [width2]
-  mov [y_src_dif], eax          ; y_src_dif = y_src_stride - width
-  mov [y_dst_dif], ebx          ; y_dst_dif = y_dst_stride - width
-  mov [uv_src_dif], ecx         ; uv_src_dif = uv_src_stride - width2
-  mov [uv_dst_dif], edx         ; uv_dst_dif = uv_dst_stride - width2
-
-  PLANE_COPY [y_dst], [y_dst_dif],  [y_src], [y_src_dif],  [width],  [height],  OPT
-  PLANE_COPY [u_dst], [uv_dst_dif], [u_src], [uv_src_dif], [width2], [height2], OPT
-  PLANE_COPY [v_dst], [uv_dst_dif], [v_src], [uv_src_dif], [width2], [height2], OPT
-
-  add esp, localsize
-  pop ebp
-  pop edi
-  pop esi
-  pop ebx
+  mov eax, [y_src_stride]
+  mov ebx, [y_dst_stride]
+  mov ecx, [uv_src_stride]
+  mov edx, [uv_dst_stride]
+  sub eax, [width]
+  sub ebx, [width]
+  sub ecx, [width2]
+  sub edx, [width2]
+  mov [y_src_dif], eax          ; y_src_dif = y_src_stride - width
+  mov [y_dst_dif], ebx          ; y_dst_dif = y_dst_stride - width
+  mov [uv_src_dif], ecx         ; uv_src_dif = uv_src_stride - width2
+  mov [uv_dst_dif], edx         ; uv_dst_dif = uv_dst_stride - width2
+
+  PLANE_COPY [y_dst], [y_dst_dif],  [y_src], [y_src_dif],  [width],  [height],  OPT
+  PLANE_COPY [u_dst], [uv_dst_dif], [u_src], [uv_src_dif], [width2], [height2], OPT
+  PLANE_COPY [v_dst], [uv_dst_dif], [v_src], [uv_src_dif], [width2], [height2], OPT
+
+  add esp, localsize
+  pop ebp
+  pop edi
+  pop esi
+  pop ebx

-  ret
+  ret
 %endmacro

-;------------------------------------------------------------------------------
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text

 MAKE_YV12_TO_YV12 yv12_to_yv12_mmx, 0
+
 MAKE_YV12_TO_YV12 yv12_to_yv12_xmm, 1
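For reference, a minimal C sketch of what the generated yv12_to_yv12_mmx/yv12_to_yv12_xmm
functions compute. The prototype is taken from the MAKE_YV12_TO_YV12 comment above; the
names plane_copy_c and yv12_to_yv12_c are illustrative only, and the vertical-flip offset
follows the "(height-1) * stride" intent stated in the flip-block comments rather than the
literal instruction sequence:

#include <stdint.h>
#include <string.h>

/* Copy one plane row by row; src and dst may have different strides. */
static void plane_copy_c(uint8_t *dst, int dst_stride,
                         const uint8_t *src, int src_stride,
                         int width, int height)
{
    int y;
    for (y = 0; y < height; y++) {
        memcpy(dst, src, (size_t)width);
        dst += dst_stride;
        src += src_stride;
    }
}

void yv12_to_yv12_c(uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
                    int y_dst_stride, int uv_dst_stride,
                    const uint8_t *y_src, const uint8_t *u_src, const uint8_t *v_src,
                    int y_src_stride, int uv_src_stride,
                    int width, int height, int vflip)
{
    int width2  = width / 2;    /* YV12 chroma planes are half size */
    int height2 = height / 2;

    if (vflip) {
        /* Same trick as the assembly: point at the last row of each
         * source plane and negate the source strides so the plane
         * copy walks the image bottom-up. */
        y_src += (height - 1) * y_src_stride;
        u_src += (height2 - 1) * uv_src_stride;
        v_src += (height2 - 1) * uv_src_stride;
        y_src_stride  = -y_src_stride;
        uv_src_stride = -uv_src_stride;
    }

    plane_copy_c(y_dst, y_dst_stride,  y_src, y_src_stride,  width,  height);
    plane_copy_c(u_dst, uv_dst_stride, u_src, uv_src_stride, width2, height2);
    plane_copy_c(v_dst, uv_dst_stride, v_src, uv_src_stride, width2, height2);
}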