--- trunk/xvidcore/src/image/x86_asm/reduced_mmx.asm	2003/02/15 15:22:19	851
+++ trunk/xvidcore/src/image/x86_asm/reduced_mmx.asm	2004/08/29 10:02:38	1540
@@ -1,12 +1,10 @@
 ;/*****************************************************************************
 ; *
 ; *  XVID MPEG-4 VIDEO CODEC
-; *  Reduced-Resolution utilities
+; *  - Reduced-Resolution utilities -
 ; *
 ; *  Copyright(C) 2002 Pascal Massimino
 ; *
-; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
-; *
 ; *  XviD is free software; you can redistribute it and/or modify it
 ; *  under the terms of the GNU General Public License as published by
 ; *  the Free Software Foundation; either version 2 of the License, or
@@ -21,53 +19,37 @@
 ; *  along with this program; if not, write to the Free Software
 ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; *  Under section 8 of the GNU General Public License, the copyright
-; *  holders of XVID explicitly forbid distribution in the following
-; *  countries:
-; *
-; *    - Japan
-; *    - United States of America
-; *
-; *  Linking XviD statically or dynamically with other modules is making a
-; *  combined work based on XviD. Thus, the terms and conditions of the
-; *  GNU General Public License cover the whole combination.
-; *
-; *  As a special exception, the copyright holders of XviD give you
-; *  permission to link XviD with independent modules that communicate with
-; *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
-; *  license terms of these independent modules, and to copy and distribute
-; *  the resulting combined work under terms of your choice, provided that
-; *  every copy of the combined work is accompanied by a complete copy of
-; *  the source code of XviD (the version of XviD used to produce the
-; *  combined work), being distributed under the terms of the GNU General
-; *  Public License plus this exception. An independent module is a module
-; *  which is not derived from or based on XviD.
-; *
-; *  Note that people who make modified versions of XviD are not obligated
-; *  to grant this special exception for their modified versions; it is
-; *  their choice whether to do so. The GNU General Public License gives
-; *  permission to release a modified version without this exception; this
-; *  exception also makes it possible to release a modified version which
-; *  carries forward this exception.
-; *
-; * $Id: reduced_mmx.asm,v 1.2 2003-02-15 15:22:18 edgomez Exp $
+; * $Id: reduced_mmx.asm,v 1.6 2004-08-29 10:02:38 edgomez Exp $
 ; *
 ; *************************************************************************/
 
-bits 32
+BITS 32
 
-%macro cglobal 1 
+%macro cglobal 1
 	%ifdef PREFIX
-		global _%1
-		%define %1 _%1
+		%ifdef MARK_FUNCS
+			global _%1:function %1.endfunc-%1
+			%define %1 _%1:function %1.endfunc-%1
+		%else
+			global _%1
+			%define %1 _%1
+		%endif
 	%else
-		global %1
+		%ifdef MARK_FUNCS
+			global %1:function %1.endfunc-%1
+		%else
+			global %1
+		%endif
 	%endif
 %endmacro
 
 ;===========================================================================
 
-section .data
+%ifdef FORMAT_COFF
+SECTION .rodata
+%else
+SECTION .rodata align=16
+%endif
 
 align 16
 Up31 dw 3, 1, 3, 1
 
@@ -85,7 +67,7 @@
 
 ;===========================================================================
 
-section .text
+SECTION .text
 
 cglobal xvid_Copy_Upsampled_8x8_16To8_mmx
 cglobal xvid_Add_Upsampled_8x8_16To8_mmx
 
@@ -158,18 +140,18 @@
 
 ;===========================================================================
 ;
-; void xvid_Copy_Upsampled_8x8_16To8_mmx(uint8_t *Dst, 
+; void xvid_Copy_Upsampled_8x8_16To8_mmx(uint8_t *Dst,
 ;                                        const int16_t *Src, const int BpS);
 ;
 ;===========================================================================
 
   ; Note: we can use ">>2" instead of "/4" here, since we 
+  ; Note: we can use ">>2" instead of "/4" here, since we
   ; are (supposed to be) averaging positive values
 
 %macro STORE_1 2
   psraw %1, 2
   psraw %2, 2
-  packuswb %1,%2 
+  packuswb %1,%2
   movq [ecx], %1
 %endmacro
 
@@ -289,16 +271,17 @@
   STORE_1 mm2, mm3
 
   ret
+.endfunc
 
 ;===========================================================================
 ;
-; void xvid_Add_Upsampled_8x8_16To8_mmx(uint8_t *Dst, 
+; void xvid_Add_Upsampled_8x8_16To8_mmx(uint8_t *Dst,
 ;                                       const int16_t *Src, const int BpS);
 ;
 ;===========================================================================
 
   ; Note: grrr... the 'pcmpgtw' stuff are the "/4" and "/16" operators
-  ; implemented with ">>2" and ">>4" using: 
+  ; implemented with ">>2" and ">>4" using:
   ;   x/4  = ( (x-(x<0))>>2 ) + (x<0)
   ;   x/16 = ( (x-(x<0))>>4 ) + (x<0)
 
@@ -391,7 +374,7 @@
   mov eax, [esp+12]  ; BpS
 
   COL03       mm0, mm1, 0
-  MUL_PACK mm0,mm1, [Up13], [Up31] 
+  MUL_PACK mm0,mm1, [Up13], [Up31]
   movq mm4, mm0
   movq mm5, mm1
   STORE_ADD_1 mm4, mm5
 
@@ -439,9 +422,9 @@
   add ecx, 8
 
   COL47       mm0, mm1, 0
-  MUL_PACK mm0,mm1, [Up13], [Up31] 
+  MUL_PACK mm0,mm1, [Up13], [Up31]
   movq mm4, mm0
-  movq mm5, mm1 
+  movq mm5, mm1
   STORE_ADD_1 mm4, mm5
 
   add ecx, eax
 
@@ -483,10 +466,11 @@
   STORE_ADD_1 mm2, mm3
 
   ret
+.endfunc
 
 ;===========================================================================
 ;
-; void xvid_Copy_Upsampled_8x8_16To8_xmm(uint8_t *Dst, 
+; void xvid_Copy_Upsampled_8x8_16To8_xmm(uint8_t *Dst,
 ;                                        const int16_t *Src, const int BpS);
 ;
 ;===========================================================================
 
@@ -614,10 +598,11 @@
   STORE_1 mm2, mm3
 
   ret
+.endfunc
 
 ;===========================================================================
 ;
-; void xvid_Add_Upsampled_8x8_16To8_xmm(uint8_t *Dst, 
+; void xvid_Add_Upsampled_8x8_16To8_xmm(uint8_t *Dst,
 ;                                       const int16_t *Src, const int BpS);
 ;
 ;===========================================================================
 
@@ -630,7 +615,7 @@
   mov eax, [esp+12]  ; BpS
 
   COL03_SSE   mm0, mm1, 0
-  MUL_PACK mm0,mm1, [Up13], [Up31] 
+  MUL_PACK mm0,mm1, [Up13], [Up31]
   movq mm4, mm0
   movq mm5, mm1
   STORE_ADD_1 mm4, mm5
 
@@ -678,9 +663,9 @@
   add ecx, 8
 
   COL47_SSE   mm0, mm1, 0
-  MUL_PACK mm0,mm1, [Up13], [Up31] 
+  MUL_PACK mm0,mm1, [Up13], [Up31]
   movq mm4, mm0
-  movq mm5, mm1 
+  movq mm5, mm1
   STORE_ADD_1 mm4, mm5
 
   add ecx, eax
 
@@ -722,6 +707,7 @@
   STORE_ADD_1 mm2, mm3
 
   ret
+.endfunc
 
 ;===========================================================================
 
@@ -755,9 +741,9 @@
   neg eax
 
 .Loop:   ;12c
-  movd mm0, [esi+eax*4] 
+  movd mm0, [esi+eax*4]
   movd mm1, [edi+eax*4]
-  movq mm2, mm5 
+  movq mm2, mm5
   punpcklbw mm0, mm7
   punpcklbw mm1, mm7
   paddsw mm2, mm0
 
@@ -778,6 +764,7 @@
   pop edi
   pop esi
   ret
+.endfunc
 
 ; mmx is of no use here. Better use plain ASM. Moreover,
 ; this is for the fun of ASM coding, coz' every modern compiler can
 
@@ -817,6 +804,7 @@
   pop edi
   pop esi
   ret
+.endfunc
 
 ; this one's just a little faster than gcc's code. Very little.
 
@@ -853,6 +841,7 @@
   pop edi
   pop esi
   ret
+.endfunc
 
 ;//////////////////////////////////////////////////////////////////////
 ;// 16b downsampling 16x16 -> 8x8
 
@@ -875,7 +864,7 @@
 
 %macro VFILTER_1331 4   ; %1-4: regs  %1-%2: trashed
   paddsw %1, [Cst32]
-  paddsw %2, %3 
+  paddsw %2, %3
   pmullw %2, mm7
   paddsw %1,%4
   paddsw %1, %2
 
@@ -942,6 +931,7 @@
   COPY_TWO_LINES_1331 ecx + 6*16 +8
 
   ret
+.endfunc
 
 ;===========================================================================
 ;
 
@@ -1006,6 +996,7 @@
   DIFF_TWO_LINES_1331 ecx + 6*16 +8
 
   ret
+.endfunc
 
 ;//////////////////////////////////////////////////////////////////////
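
On the reworked `cglobal` macro in the first hunk: the `:function %1.endfunc-%1` suffix uses NASM's ELF extension to `global`, which lets a symbol be exported as a function with an explicit size, computed as the distance from the entry label to a `.endfunc` local label. That is why every `ret` in this patch gains a trailing `.endfunc`. A minimal sketch of what a MARK_FUNCS build (without PREFIX) effectively declares, using a hypothetical routine name:

    global xvid_example:function xvid_example.endfunc-xvid_example

    xvid_example:
      ret                   ; routine body would go here
    .endfunc                ; NASM local label; its offset from xvid_example
                            ; becomes the symbol size in the ELF symbol table

With sizes recorded, tools such as nm, objdump, or profilers can attribute an address to the right routine instead of lumping everything after the last sized symbol together.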
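
The "/4" and "/16" note touched in the xvid_Add_Upsampled_8x8_16To8_mmx hunk is worth unpacking: `psraw` is an arithmetic shift, i.e. floor division, so on negative words it rounds the wrong way for a truncating "/4"; the identity x/4 = ((x-(x<0))>>2) + (x<0) corrects that, and `pcmpgtw` against zero produces exactly the -(x<0) mask the correction needs. (In the copy path, STORE_1 can use a bare `psraw ..., 2` because, as its note says, the averaged values are non-negative.) A minimal MMX sketch of the ">>2" case, assuming the four signed words sit in mm0 and mm7 is free scratch (register choice is illustrative, not taken from the patch):

    pxor    mm7, mm7        ; mm7 = 0
    pcmpgtw mm7, mm0        ; per word: 0xFFFF (i.e. -1) where x < 0, else 0
    paddsw  mm0, mm7        ; x - (x<0): adds -1 exactly where x is negative
    psraw   mm0, 2          ; (x - (x<0)) >> 2
    psubsw  mm0, mm7        ; + (x<0): subtracting -1 adds the 1 back

For x = -5 this gives ((-5-1)>>2)+1 = -2+1 = -1, matching truncating division, where a bare `psraw` alone would give -2; the "/16" case is the same sequence with a shift count of 4.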