--- trunk/xvidcore/src/image/x86_asm/qpel_mmx.asm	2004/07/24 11:46:08	1519
+++ trunk/xvidcore/src/image/x86_asm/qpel_mmx.asm	2008/11/26 23:35:50	1803
@@ -6,7 +6,7 @@
 ; *
 ; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
 ; *
-; *  XviD is free software; you can redistribute it and/or modify it
+; *  XviD is free software; you can rDST_PTRstribute it and/or modify it
 ; *  under the terms of the GNU General Public License as published by
 ; *  the Free Software Foundation; either version 2 of the License, or
 ; *  (at your option) any later version.
@@ -20,7 +20,7 @@
 ; *  along with this program; if not, write to the Free Software
 ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; * $Id: qpel_mmx.asm,v 1.3 2004-07-24 11:46:08 edgomez Exp $
+; * $Id: qpel_mmx.asm,v 1.9 2008-11-26 23:35:50 Isibaar Exp $
 ; *
 ; *************************************************************************/
@@ -38,25 +38,7 @@
 ; instead of xvid_Expand_mmx...
 
-bits 32
-
-%macro cglobal 1
-	%ifdef PREFIX
-		global _%1
-		%define %1 _%1
-	%else
-		global %1
-	%endif
-%endmacro
-%macro cextern 1
-	%ifdef PREFIX
-		extern _%1
-		%define %1 _%1
-	%else
-		extern %1
-	%endif
-%endmacro
-
+%include "nasm.inc"
 
 ;//////////////////////////////////////////////////////////////////////
 ;// Declarations
@@ -91,57 +73,124 @@
 cglobal xvid_V_Pass_Avrg_8_Add_mmx
 cglobal xvid_V_Pass_Avrg_Up_8_Add_mmx
 
-cextern xvid_Expand_mmx
-
-%ifdef USE_TABLES
-
-cextern xvid_FIR_1_0_0_0
-cextern xvid_FIR_3_1_0_0
-cextern xvid_FIR_6_3_1_0
-cextern xvid_FIR_14_3_2_1
-cextern xvid_FIR_20_6_3_1
-cextern xvid_FIR_20_20_6_3
-cextern xvid_FIR_23_19_6_3
-cextern xvid_FIR_7_20_20_6
-cextern xvid_FIR_6_20_20_6
-cextern xvid_FIR_6_20_20_7
-cextern xvid_FIR_3_6_20_20
-cextern xvid_FIR_3_6_19_23
-cextern xvid_FIR_1_3_6_20
-cextern xvid_FIR_1_2_3_14
-cextern xvid_FIR_0_1_3_6
-cextern xvid_FIR_0_0_1_3
-cextern xvid_FIR_0_0_0_1
+cglobal xvid_Expand_mmx
 
-%endif
+cglobal xvid_FIR_1_0_0_0
+cglobal xvid_FIR_3_1_0_0
+cglobal xvid_FIR_6_3_1_0
+cglobal xvid_FIR_14_3_2_1
+cglobal xvid_FIR_20_6_3_1
+cglobal xvid_FIR_20_20_6_3
+cglobal xvid_FIR_23_19_6_3
+cglobal xvid_FIR_7_20_20_6
+cglobal xvid_FIR_6_20_20_6
+cglobal xvid_FIR_6_20_20_7
+cglobal xvid_FIR_3_6_20_20
+cglobal xvid_FIR_3_6_19_23
+cglobal xvid_FIR_1_3_6_20
+cglobal xvid_FIR_1_2_3_14
+cglobal xvid_FIR_0_1_3_6
+cglobal xvid_FIR_0_0_1_3
+cglobal xvid_FIR_0_0_0_1
+
+SECTION .data align=SECTION_ALIGN
+
+align SECTION_ALIGN
+xvid_Expand_mmx:
+times 256*4 dw 0 ; uint16_t xvid_Expand_mmx[256][4]
+ENDFUNC
+
+xvid_FIR_1_0_0_0:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_3_1_0_0:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_6_3_1_0:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_14_3_2_1:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_20_6_3_1:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_20_20_6_3:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_23_19_6_3:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_7_20_20_6:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_6_20_20_6:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_6_20_20_7:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_3_6_20_20:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_3_6_19_23:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_1_3_6_20:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_1_2_3_14:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_0_1_3_6:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_0_0_1_3:
+times 256*4 dw 0
+ENDFUNC
+
+xvid_FIR_0_0_0_1:
+times 256*4 dw 0
+ENDFUNC
 
 ;//////////////////////////////////////////////////////////////////////
 
-%ifdef FORMAT_COFF
-SECTION .rodata
-%else
-SECTION .rodata align=16
-%endif
+DATA
 
-align 16
+align SECTION_ALIGN
 Rounder1_MMX:
 times 4 dw 1
 Rounder0_MMX:
 times 4 dw 0
-align 16
-Rounder_QP_MMX
+align SECTION_ALIGN
+Rounder_QP_MMX:
 times 4 dw 16
 times 4 dw 15
 
 %ifndef USE_TABLES
 
-align 16
+align SECTION_ALIGN
 ; H-Pass table shared by 16x? and 8x? filters
 
 FIR_R0:  dw 14, -3, 2, -1
-align 16
+align SECTION_ALIGN
 FIR_R1:  dw 23, 19, -6, 3,   -1, 0, 0, 0
 
 FIR_R2:  dw -7, 20, 20, -6,   3, -1, 0, 0
@@ -151,19 +200,19 @@
 FIR_R4:  dw -1, 3, -6, 20,   20, -6, 3, -1
 
 FIR_R5:  dw 0, -1, 3, -6,   20, 20, -6, 3,   -1, 0, 0, 0
-align 16
+align SECTION_ALIGN
 FIR_R6:  dw 0, 0, -1, 3,   -6, 20, 20, -6,   3, -1, 0, 0
-align 16
+align SECTION_ALIGN
 FIR_R7:  dw 0, 0, 0, -1,   3, -6, 20, 20,   -6, 3, -1, 0
-align 16
+align SECTION_ALIGN
 FIR_R8:  dw -1, 3, -6, 20,   20, -6, 3, -1
 
 FIR_R9:  dw 0, -1, 3, -6,   20, 20, -6, 3,   -1, 0, 0, 0
-align 16
+align SECTION_ALIGN
 FIR_R10: dw 0, 0, -1, 3,   -6, 20, 20, -6,   3, -1, 0, 0
-align 16
+align SECTION_ALIGN
 FIR_R11: dw 0, 0, 0, -1,   3, -6, 20, 20,   -6, 3, -1, 0
-align 16
+align SECTION_ALIGN
 FIR_R12: dw -1, 3, -6, 20,   20, -6, 3, -1
 
 FIR_R13: dw 0, -1, 3, -6,   20, 20, -6, 3
@@ -178,7 +227,7 @@
 
 ; V-Pass taps
 
-align 16
+align SECTION_ALIGN
 FIR_Cm7: times 4 dw -7
 FIR_Cm6: times 4 dw -6
 FIR_Cm3: times 4 dw -3
@@ -190,7 +239,7 @@
 FIR_C20: times 4 dw 20
 FIR_C23: times 4 dw 23
 
-SECTION .text
+SECTION .rotext align=SECTION_ALIGN
 
 ;//////////////////////////////////////////////////////////////////////
 ;// Here we go with the Q-Pel mess.
@@ -198,46 +247,91 @@
 ;// For vertical ones, we process 4 *input* pixel in parallel.
 ;//////////////////////////////////////////////////////////////////////
 
+%ifdef ARCH_IS_X86_64
+%macro XVID_MOVQ 3
+  lea r9, [%2]
+  movq %1, [r9 + %3]
+%endmacro
+%macro XVID_PADDW 3
+  lea r9, [%2]
+  paddw %1, [r9 + %3]
+%endmacro
+%define SRC_PTR prm2
+%define DST_PTR prm1
+%else
+%macro XVID_MOVQ 3
+  movq %1, [%2 + %3]
+%endmacro
+%macro XVID_PADDW 3
+  paddw %1, [%2 + %3]
+%endmacro
+%define SRC_PTR _ESI
+%define DST_PTR _EDI
+%endif
+
 %macro PROLOG_NO_AVRG 0
-  push esi
-  push edi
-  push ebp
-  mov edi, [esp+16 + 0*4] ; Dst
-  mov esi, [esp+16 + 1*4] ; Src
-  mov ecx, [esp+16 + 2*4] ; Size
-  mov ebp, [esp+16 + 3*4] ; BpS
-  mov eax, [esp+16 + 4*4] ; Rnd
-  and eax, 1
-  movq mm7, [Rounder_QP_MMX+eax*8]  ; rounder
+  mov TMP0, prm3   ; Size
+  mov TMP1, prm4   ; BpS
+  mov eax, prm5d   ; Rnd
+
+%ifndef ARCH_IS_X86_64
+  push SRC_PTR
+  push DST_PTR
+%endif
+  push _EBP
+  mov _EBP, TMP1
+
+%ifndef ARCH_IS_X86_64
+  mov DST_PTR, [_ESP+16 + 0*4] ; Dst
+  mov SRC_PTR, [_ESP+16 + 1*4] ; Src
+%endif
+
+  and _EAX, 1
+  lea TMP1, [Rounder_QP_MMX]
+  movq mm7, [TMP1+_EAX*8]  ; rounder
 %endmacro
 
 %macro EPILOG_NO_AVRG 0
-  pop ebp
-  pop edi
-  pop esi
+  pop _EBP
+%ifndef ARCH_IS_X86_64
+  pop DST_PTR
+  pop SRC_PTR
+%endif
   ret
 %endmacro
 
 %macro PROLOG_AVRG 0
-  push ebx
-  push esi
-  push edi
-  push ebp
-  mov edi, [esp+20 + 0*4] ; Dst
-  mov esi, [esp+20 + 1*4] ; Src
-  mov ecx, [esp+20 + 2*4] ; Size
-  mov ebp, [esp+20 + 3*4] ; BpS
-  mov eax, [esp+20 + 4*4] ; Rnd
-  and eax, 1
-  movq mm7, [Rounder_QP_MMX+eax*8]  ; rounder
-  lea ebx, [Rounder1_MMX+eax*8]     ; *Rounder2
+  mov TMP0, prm3   ; Size
+  mov TMP1, prm4   ; BpS
+  mov eax, prm5d   ; Rnd
+
+  push _EBX
+  push _EBP
+%ifndef ARCH_IS_X86_64
+  push SRC_PTR
+  push DST_PTR
+%endif
+  mov _EBP, TMP1
+
+%ifndef ARCH_IS_X86_64
+  mov DST_PTR, [_ESP+20 + 0*4] ; Dst
+  mov SRC_PTR, [_ESP+20 + 1*4] ; Src
+%endif
+
+  and _EAX, 1
+  lea TMP1, [Rounder_QP_MMX]
+  movq mm7, [TMP1+_EAX*8]  ; rounder
+  lea TMP1, [Rounder1_MMX]
+  lea _EBX, [TMP1+_EAX*8]  ; *Rounder2
 %endmacro
 
 %macro EPILOG_AVRG 0
-  pop ebp
-  pop edi
-  pop esi
-  pop ebx
+%ifndef ARCH_IS_X86_64
+  pop DST_PTR
+  pop SRC_PTR
+%endif
+  pop _EBP
+  pop _EBX
   ret
 %endmacro
 
@@ -250,25 +344,25 @@
 ; macros for USE_TABLES
 
 %macro TLOAD 2     ; %1,%2: src pixels
-  movzx eax, byte [esi+%1]
-  movzx edx, byte [esi+%2]
-  movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ]
-  movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ]
+  movzx _EAX, byte [SRC_PTR+%1]
+  movzx TMP1, byte [SRC_PTR+%2]
+  XVID_MOVQ mm0, xvid_FIR_14_3_2_1, _EAX*8
+  XVID_MOVQ mm3, xvid_FIR_1_2_3_14, TMP1*8
   paddw mm0, mm7
   paddw mm3, mm7
 %endmacro
 
 %macro TACCUM2 5   ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs
-  movzx eax, byte [esi+%1]
-  paddw %4, [%2 + eax*8]
-  paddw %5, [%3 + eax*8]
+  movzx _EAX, byte [SRC_PTR+%1]
+  XVID_PADDW %4, %2, _EAX*8
+  XVID_PADDW %5, %3, _EAX*8
 %endmacro
 
 %macro TACCUM3 7   ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs
-  movzx eax, byte [esi+%1]
-  paddw %5, [%2 + eax*8]
-  paddw %6, [%3 + eax*8]
-  paddw %7, [%4 + eax*8]
+  movzx _EAX, byte [SRC_PTR+%1]
+  XVID_PADDW %5, %2, _EAX*8
+  XVID_PADDW %6, %3, _EAX*8
+  XVID_PADDW %7, %4, _EAX*8
 %endmacro
 
 ;//////////////////////////////////////////////////////////////////////
@@ -276,10 +370,10 @@
 ; macros without USE_TABLES
 
 %macro LOAD 2     ; %1,%2: src pixels
-  movzx eax, byte [esi+%1]
-  movzx edx, byte [esi+%2]
-  movq mm0, [xvid_Expand_mmx + eax*8]
-  movq mm3, [xvid_Expand_mmx + edx*8]
+  movzx _EAX, byte [SRC_PTR+%1]
+  movzx TMP1, byte [SRC_PTR+%2]
+  XVID_MOVQ mm0, xvid_Expand_mmx, _EAX*8
+  XVID_MOVQ mm3, xvid_Expand_mmx, TMP1*8
   pmullw mm0, [FIR_R0 ]
   pmullw mm3, [FIR_R16]
   paddw mm0, mm7
@@ -287,8 +381,8 @@
 %endmacro
 
 %macro ACCUM2 4   ;src pixel/Taps/dst regs #1-#2
-  movzx eax, byte [esi+%1]
-  movq mm4, [xvid_Expand_mmx + eax*8]
+  movzx _EAX, byte [SRC_PTR+%1]
+  XVID_MOVQ mm4, xvid_Expand_mmx, _EAX*8
   movq mm5, mm4
   pmullw mm4, [%2]
   pmullw mm5, [%2+8]
@@ -297,8 +391,8 @@
 %endmacro
 
 %macro ACCUM3 5   ;src pixel/Taps/dst regs #1-#2-#3
-  movzx eax, byte [esi+%1]
-  movq mm4, [xvid_Expand_mmx + eax*8]
+  movzx _EAX, byte [SRC_PTR+%1]
+  XVID_MOVQ mm4, xvid_Expand_mmx, _EAX*8
   movq mm5, mm4
   movq mm6, mm5
   pmullw mm4, [%2 ]
@@ -340,7 +434,7 @@
 	PROLOG_AVRG
 %endif
 
-.Loop
+.Loop:
 
   ; mm0..mm3 serves as a 4x4 delay line
@@ -404,30 +498,30 @@
   packuswb mm2, mm3
 
 %if (%1==1)
-  MIX mm0, esi, ebx
+  MIX mm0, SRC_PTR, _EBX
 %elif (%1==2)
-  MIX mm0, esi+1, ebx
+  MIX mm0, SRC_PTR+1, _EBX
 %endif
 %if (%2==1)
-  MIX mm0, edi, Rounder1_MMX
+  MIX mm0, DST_PTR, Rounder1_MMX
 %endif
 
 %if (%1==1)
-  MIX mm2, esi+8, ebx
+  MIX mm2, SRC_PTR+8, _EBX
 %elif (%1==2)
-  MIX mm2, esi+9, ebx
+  MIX mm2, SRC_PTR+9, _EBX
 %endif
 %if (%2==1)
-  MIX mm2, edi+8, Rounder1_MMX
+  MIX mm2, DST_PTR+8, Rounder1_MMX
 %endif
 
-  lea esi, [esi+ebp]
+  lea SRC_PTR, [SRC_PTR+_EBP]
 
-  movq [edi+0], mm0
-  movq [edi+8], mm2
+  movq [DST_PTR+0], mm0
+  movq [DST_PTR+8], mm2
 
-  add edi, ebp
-  dec ecx
+  add DST_PTR, _EBP
+  dec TMP0
   jg .Loop
 
 %if (%2==0) && (%1==0)
@@ -449,7 +543,7 @@
 	PROLOG_AVRG
 %endif
 
-.Loop
+.Loop:
 
   ; mm0..mm3 serves as a 4x4 delay line
 %ifndef USE_TABLES
@@ -479,40 +573,40 @@
 %else
 
   ; test with unrolling (little faster, but not much)
 
-  movzx eax, byte [esi]
-  movzx edx, byte [esi+8]
-  movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ]
-  movzx eax, byte [esi+1]
-  movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ]
+  movzx _EAX, byte [SRC_PTR]
+  movzx TMP1, byte [SRC_PTR+8]
+  XVID_MOVQ mm0, xvid_FIR_14_3_2_1, _EAX*8
+  movzx _EAX, byte [SRC_PTR+1]
+  XVID_MOVQ mm3, xvid_FIR_1_2_3_14, TMP1*8
   paddw mm0, mm7
   paddw mm3, mm7
 
-  movzx edx, byte [esi+2]
-  paddw mm0, [xvid_FIR_23_19_6_3 + eax*8]
-  paddw mm3, [xvid_FIR_1_0_0_0 + eax*8]
-
-  movzx eax, byte [esi+3]
-  paddw mm0, [xvid_FIR_7_20_20_6 + edx*8]
-  paddw mm3, [xvid_FIR_3_1_0_0 + edx*8]
-
-  movzx edx, byte [esi+4]
-  paddw mm0, [xvid_FIR_3_6_20_20 + eax*8]
-  paddw mm3, [xvid_FIR_6_3_1_0 + eax*8]
-
-  movzx eax, byte [esi+5]
-  paddw mm0, [xvid_FIR_1_3_6_20 + edx*8]
-  paddw mm3, [xvid_FIR_20_6_3_1 + edx*8]
-
-  movzx edx, byte [esi+6]
-  paddw mm0, [xvid_FIR_0_1_3_6 + eax*8]
-  paddw mm3, [xvid_FIR_20_20_6_3 + eax*8]
-
-  movzx eax, byte [esi+7]
-  paddw mm0, [xvid_FIR_0_0_1_3 + edx*8]
-  paddw mm3, [xvid_FIR_6_20_20_7 + edx*8]
+  movzx TMP1, byte [SRC_PTR+2]
+  XVID_PADDW mm0, xvid_FIR_23_19_6_3, _EAX*8
+  XVID_PADDW mm3, xvid_FIR_1_0_0_0, _EAX*8
+
+  movzx _EAX, byte [SRC_PTR+3]
+  XVID_PADDW mm0, xvid_FIR_7_20_20_6, TMP1*8
+  XVID_PADDW mm3, xvid_FIR_3_1_0_0, TMP1*8
+
+  movzx TMP1, byte [SRC_PTR+4]
+  XVID_PADDW mm0, xvid_FIR_3_6_20_20, _EAX*8
+  XVID_PADDW mm3, xvid_FIR_6_3_1_0, _EAX*8
+
+  movzx _EAX, byte [SRC_PTR+5]
+  XVID_PADDW mm0, xvid_FIR_1_3_6_20, TMP1*8
+  XVID_PADDW mm3, xvid_FIR_20_6_3_1, TMP1*8
+
+  movzx TMP1, byte [SRC_PTR+6]
+  XVID_PADDW mm0, xvid_FIR_0_1_3_6, _EAX*8
+  XVID_PADDW mm3, xvid_FIR_20_20_6_3, _EAX*8
+
+  movzx _EAX, byte [SRC_PTR+7]
+  XVID_PADDW mm0, xvid_FIR_0_0_1_3, TMP1*8
+  XVID_PADDW mm3, xvid_FIR_6_20_20_7, TMP1*8
 
-  paddw mm0, [xvid_FIR_0_0_0_1 + eax*8]
-  paddw mm3, [xvid_FIR_3_6_19_23 + eax*8]
+  XVID_PADDW mm0, xvid_FIR_0_0_0_1, _EAX*8
+  XVID_PADDW mm3, xvid_FIR_3_6_19_23, _EAX*8
 
 %endif
@@ -523,19 +617,19 @@
   packuswb mm0, mm3
 
 %if (%1==1)
-  MIX mm0, esi, ebx
+  MIX mm0, SRC_PTR, _EBX
 %elif (%1==2)
-  MIX mm0, esi+1, ebx
+  MIX mm0, SRC_PTR+1, _EBX
 %endif
 %if (%2==1)
-  MIX mm0, edi, Rounder1_MMX
+  MIX mm0, DST_PTR, Rounder1_MMX
 %endif
 
-  movq [edi], mm0
+  movq [DST_PTR], mm0
 
-  add edi, ebp
-  add esi, ebp
-  dec ecx
+  add DST_PTR, _EBP
+  add SRC_PTR, _EBP
+  dec TMP0
   jg .Loop
 
 %if (%2==0) && (%1==0)
@@ -551,40 +645,52 @@
 
 xvid_H_Pass_16_mmx:
   H_PASS_16 0, 0
+ENDFUNC
 
 xvid_H_Pass_Avrg_16_mmx:
   H_PASS_16 1, 0
+ENDFUNC
 
 xvid_H_Pass_Avrg_Up_16_mmx:
   H_PASS_16 2, 0
+ENDFUNC
 
 ;//////////////////////////////////////////////////////////////////////
 ;// 8x? copy Functions
 
 xvid_H_Pass_8_mmx:
   H_PASS_8 0, 0
+ENDFUNC
 
 xvid_H_Pass_Avrg_8_mmx:
   H_PASS_8 1, 0
+ENDFUNC
 
 xvid_H_Pass_Avrg_Up_8_mmx:
   H_PASS_8 2, 0
+ENDFUNC
 
 ;//////////////////////////////////////////////////////////////////////
 ;// 16x? avrg Functions
 
 xvid_H_Pass_Add_16_mmx:
   H_PASS_16 0, 1
+ENDFUNC
 
 xvid_H_Pass_Avrg_Add_16_mmx:
   H_PASS_16 1, 1
+ENDFUNC
 
 xvid_H_Pass_Avrg_Up_Add_16_mmx:
   H_PASS_16 2, 1
+ENDFUNC
 
 ;//////////////////////////////////////////////////////////////////////
 ;// 8x? avrg Functions
 
 xvid_H_Pass_8_Add_mmx:
   H_PASS_8 0, 1
+ENDFUNC
 
 xvid_H_Pass_Avrg_8_Add_mmx:
   H_PASS_8 1, 1
+ENDFUNC
 
 xvid_H_Pass_Avrg_Up_8_Add_mmx:
   H_PASS_8 2, 1
+ENDFUNC
 
@@ -596,10 +702,10 @@
 
 %macro V_LOAD 1  ; %1=Last?
-  movd mm4, [edx]
+  movd mm4, dword [TMP1]
   pxor mm6, mm6
 %if (%1==0)
-  add edx, ebp
+  add TMP1, _EBP
 %endif
   punpcklbw mm4, mm6
 
@@ -650,21 +756,21 @@
   packuswb %3, %3
 
 %if (%1==1)
-  V_MIX %3, esi, ebx
-  add esi, ebp
+  V_MIX %3, SRC_PTR, _EBX
+  add SRC_PTR, _EBP
 %elif (%1==2)
-  add esi, ebp
-  V_MIX %3, esi, ebx
+  add SRC_PTR, _EBP
+  V_MIX %3, SRC_PTR, _EBX
 %endif
 %if (%2==1)
-  V_MIX %3, edi, Rounder1_MMX
+  V_MIX %3, DST_PTR, Rounder1_MMX
 %endif
 
   movd eax, %3
-  mov [edi], eax
+  mov dword [DST_PTR], eax
 
 %if (%4==0)
-  add edi, ebp
+  add DST_PTR, _EBP
 %endif
 %endmacro
 
@@ -683,11 +789,11 @@
 ; the size (3rd argument) is meant to be a multiple of 4
 ; mm0..mm3 serves as a 4x4 delay line
 
-.Loop
+.Loop:
 
-  push edi
-  push esi      ; esi is preserved for src-mixing
-  mov edx, esi
+  push DST_PTR
+  push SRC_PTR  ; SRC_PTR is preserved for src-mixing
+  mov TMP1, SRC_PTR
 
   ; ouput rows [0..3], from input rows [0..8]
 
@@ -723,11 +829,11 @@
 
   ; ouput rows [4..7], from input rows [1..11] (!!)
 
-  mov esi, [esp]
-  lea edx, [esi+ebp]
+  mov SRC_PTR, [_ESP]
+  lea TMP1, [SRC_PTR+_EBP]
 
-  lea esi, [esi+4*ebp]  ; for src-mixing
-  push esi              ; this will be the new value for next round
+  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
+  push SRC_PTR                   ; this will be the new value for next round
 
   movq mm0, mm7
   movq mm1, mm7
@@ -771,11 +877,11 @@
 
   ; ouput rows [8..11], from input rows [5..15]
 
-  pop esi
-  lea edx, [esi+ebp]
+  pop SRC_PTR
+  lea TMP1, [SRC_PTR+_EBP]
 
-  lea esi, [esi+4*ebp]  ; for src-mixing
-  push esi              ; this will be the new value for next round
+  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
+  push SRC_PTR                   ; this will be the new value for next round
 
   movq mm0, mm7
   movq mm1, mm7
@@ -821,11 +927,11 @@
 
   ; ouput rows [12..15], from input rows [9.16]
 
-  pop esi
-  lea edx, [esi+ebp]
+  pop SRC_PTR
+  lea TMP1, [SRC_PTR+_EBP]
 
 %if (%1!=0)
-  lea esi, [esi+4*ebp]  ; for src-mixing
+  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
 %endif
 
   movq mm0, mm7
@@ -861,11 +967,11 @@
 
   ; ... next 4 columns
 
-  pop esi
-  pop edi
-  add esi, 4
-  add edi, 4
-  sub ecx, 4
+  pop SRC_PTR
+  pop DST_PTR
+  add SRC_PTR, 4
+  add DST_PTR, 4
+  sub TMP0, 4
   jg .Loop
 
 %if (%2==0) && (%1==0)
@@ -889,11 +995,11 @@
 ; we process one stripe of 4x8 pixel each time
 ; the size (3rd argument) is meant to be a multiple of 4
 ; mm0..mm3 serves as a 4x4 delay line
 
-.Loop
+.Loop:
 
-  push edi
-  push esi      ; esi is preserved for src-mixing
-  mov edx, esi
+  push DST_PTR
+  push SRC_PTR  ; SRC_PTR is preserved for src-mixing
+  mov TMP1, SRC_PTR
 
   ; ouput rows [0..3], from input rows [0..8]
 
@@ -930,11 +1036,11 @@
 
   ; ouput rows [4..7], from input rows [1..9]
 
-  mov esi, [esp]
-  lea edx, [esi+ebp]
+  mov SRC_PTR, [_ESP]
+  lea TMP1, [SRC_PTR+_EBP]
 
 %if (%1!=0)
-  lea esi, [esi+4*ebp]  ; for src-mixing
+  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
 %endif
 
   movq mm0, mm7
@@ -970,11 +1076,11 @@
 
   ; ... next 4 columns
 
-  pop esi
-  pop edi
-  add esi, 4
-  add edi, 4
-  sub ecx, 4
+  pop SRC_PTR
+  pop DST_PTR
+  add SRC_PTR, 4
+  add DST_PTR, 4
+  sub TMP0, 4
   jg .Loop
 
 %if (%2==0) && (%1==0)
@@ -991,39 +1097,59 @@
 
 xvid_V_Pass_16_mmx:
   V_PASS_16 0, 0
+ENDFUNC
 
 xvid_V_Pass_Avrg_16_mmx:
   V_PASS_16 1, 0
+ENDFUNC
 
 xvid_V_Pass_Avrg_Up_16_mmx:
   V_PASS_16 2, 0
+ENDFUNC
 
 ;//////////////////////////////////////////////////////////////////////
 ;// 8x? copy Functions
 
 xvid_V_Pass_8_mmx:
   V_PASS_8 0, 0
+ENDFUNC
 
 xvid_V_Pass_Avrg_8_mmx:
   V_PASS_8 1, 0
+ENDFUNC
 
 xvid_V_Pass_Avrg_Up_8_mmx:
   V_PASS_8 2, 0
+ENDFUNC
 
 ;//////////////////////////////////////////////////////////////////////
 ;// 16x? avrg Functions
 
 xvid_V_Pass_Add_16_mmx:
   V_PASS_16 0, 1
+ENDFUNC
 
 xvid_V_Pass_Avrg_Add_16_mmx:
   V_PASS_16 1, 1
+ENDFUNC
 
 xvid_V_Pass_Avrg_Up_Add_16_mmx:
   V_PASS_16 2, 1
+ENDFUNC
 
 ;//////////////////////////////////////////////////////////////////////
 ;// 8x? avrg Functions
 
 xvid_V_Pass_8_Add_mmx:
   V_PASS_8 0, 1
+ENDFUNC
 
 xvid_V_Pass_Avrg_8_Add_mmx:
   V_PASS_8 1, 1
+ENDFUNC
 
 xvid_V_Pass_Avrg_Up_8_Add_mmx:
   V_PASS_8 2, 1
+ENDFUNC
 
 ;//////////////////////////////////////////////////////////////////////
+
+%undef SRC_PTR
+%undef DST_PTR
+
+%ifidn __OUTPUT_FORMAT__,elf
+section ".note.GNU-stack" noalloc noexec nowrite progbits
+%endif
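
Editor's note: for readers unfamiliar with the MPEG-4 quarter-pel filter these MMX routines implement, here is a minimal scalar sketch in C. The taps are taken from FIR_R4 above and the rounders from Rounder_QP_MMX (16 when Rnd=0, 15 when Rnd=1); the final >>5 normalization is assumed from the MPEG-4 definition (the taps sum to 32), since the shift itself sits in unchanged code this diff does not show. All names below are illustrative, not xvidcore API.

    #include <stdint.h>

    /* Clamp an intermediate sum to the 8-bit pixel range (what packuswb does). */
    static uint8_t clamp_u8(int v) {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Interior pixels of one horizontal pass: 8-tap FIR (cf. FIR_R4..FIR_R12).
     * rnd selects the rounder, matching "and eax, 1" in the prologs. */
    static void h_pass_row_interior(uint8_t *dst, const uint8_t *src,
                                    int width, int rnd) {
        static const int taps[8] = { -1, 3, -6, 20, 20, -6, 3, -1 };
        const int rounder = rnd ? 15 : 16;   /* Rounder_QP_MMX */
        for (int x = 3; x + 4 < width; x++) {
            int acc = rounder;
            for (int k = 0; k < 8; k++)
                acc += taps[k] * src[x - 3 + k];
            dst[x] = clamp_u8(acc >> 5);     /* taps sum to 32 */
        }
    }

Near the row edges the assembly switches to the shortened tap rows (FIR_R0, FIR_R1, ..., FIR_R16) instead of reading outside the row, which is why seventeen per-position tap tables exist; the vertical pass applies the same kernel down columns, four input pixels at a time.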
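The *_Avrg_* and *_Avrg_Up_* variants additionally average the filtered result with the source pixel at offset 0 or +1 (the MIX calls against SRC_PTR and SRC_PTR+1), and the *_Add_* variants average with the existing destination using a fixed rounder of 1. The MIX macro body lies outside the changed hunks, so the following is an inferred sketch, not quoted code: "lea _EBX, [TMP1+_EAX*8]" over Rounder1_MMX (which is followed by Rounder0_MMX in the data section) selects 1 when Rnd=0 and 0 when Rnd=1, giving the rounding average below.

    #include <stdint.h>

    /* Inferred scalar equivalent of the MIX step: average two pixels,
     * rounding up when Rnd == 0 (rnd2 = 1) and down when Rnd == 1 (rnd2 = 0). */
    static uint8_t mix(uint8_t a, uint8_t b, int rnd2) {
        return (uint8_t)((a + b + rnd2) >> 1);
    }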