--- trunk/xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm 2004/08/22 11:46:10 1535 +++ trunk/xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm 2008/11/26 01:04:34 1795 @@ -5,7 +5,7 @@ ; * ; * Copyright(C) 2002-2003 Jaan Kalda ; * -; * This program is free software ; you can redistribute it and/or modify +; * This program is free software ; you can r_EDIstribute it and/or modify ; * it under the terms of the GNU General Public License as published by ; * the Free Software Foundation ; either version 2 of the License, or ; * (at your option) any later version. @@ -19,7 +19,7 @@ ; * along with this program ; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; * $Id: quantize_h263_3dne.asm,v 1.4 2004-08-22 11:46:10 edgomez Exp $ +; * $Id: quantize_h263_3dne.asm,v 1.9 2008-11-26 01:04:34 Isibaar Exp $ ; * ; *************************************************************************/ ; @@ -29,37 +29,15 @@ ; enable dequant saturate [-2048,2047], test purposes only. %define SATURATE -BITS 32 - -%macro cglobal 1 - %ifdef PREFIX - %ifdef MARK_FUNCS - global _%1:function - %define %1 _%1:function - %else - global _%1 - %define %1 _%1 - %endif - %else - %ifdef MARK_FUNCS - global %1:function - %else - global %1 - %endif - %endif -%endmacro +%include "nasm.inc" ;============================================================================= ; Local data ;============================================================================= -%ifdef FORMAT_COFF -SECTION .rodata -%else -SECTION .rodata align=16 -%endif +DATA -align 4 +align SECTION_ALIGN int_div: dd 0 %assign i 1 @@ -68,7 +46,7 @@ %assign i i+1 %endrep -ALIGN 16 +ALIGN SECTION_ALIGN plus_one: times 8 dw 1 @@ -76,7 +54,7 @@ ; subtract by Q/2 table ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN mmx_sub: %assign i 1 %rep 31 @@ -95,7 +73,7 @@ ; ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN mmx_div: %assign i 1 %rep 31 @@ -107,7 +85,7 @@ ; add by (odd(Q) ? 
Q : Q - 1) table ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN mmx_add: %assign i 1 %rep 31 @@ -123,7 +101,7 @@ ; multiple by 2Q table ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN mmx_mul: %assign i 1 %rep 31 @@ -135,17 +113,17 @@ ; saturation limits ;----------------------------------------------------------------------------- -ALIGN 8 +ALIGN SECTION_ALIGN mmx_32768_minus_2048: times 4 dw (32768-2048) mmx_32767_minus_2047: times 4 dw (32767-2047) -ALIGN 16 +ALIGN SECTION_ALIGN mmx_2047: times 4 dw 2047 -ALIGN 8 +ALIGN SECTION_ALIGN mmzero: dd 0, 0 int2047: @@ -157,7 +135,7 @@ ; Code ;============================================================================= -SECTION .text +SECTION .rotext align=SECTION_ALIGN ;----------------------------------------------------------------------------- @@ -179,24 +157,24 @@ psubw mm7, mm6 ;D8 %endif -ALIGN 8 - db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 +ALIGN SECTION_ALIGN + movq mm4, [_ECX + %1 * 32 +16] ;C1 pmaxsw mm1, mm0 ;A4 - db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 + movq mm6, [_ECX + %1 * 32 +24] ;D1 pmaxsw mm3, mm2 ;B4 psraw mm0, 15 ;A5 psraw mm2, 15 ;B5 %if (%1) - movq [edx + %1 * 32 + 16-32], mm5 ;C9 - movq [edx + %1 * 32 + 24-32], mm7 ;D9 + movq [_EDX + %1 * 32 + 16-32], mm5 ;C9 + movq [_EDX + %1 * 32 + 24-32], mm7 ;D9 %endif psrlw mm1, 1 ;A6 psrlw mm3, 1 ;B6 - movq mm5, [ebx] ;C2 - movq mm7, [ebx] ;D2 + movq mm5, [_EBX] ;C2 + movq mm7, [_EBX] ;D2 pxor mm1, mm0 ;A7 pxor mm3, mm2 ;B7 @@ -207,33 +185,38 @@ psubw mm3, mm2 ;B8 %if (%1 == 0) - push ebp - movq mm0, [ecx + %1 * 32 +32] + push _EBP + movq mm0, [_ECX + %1 * 32 +32] %elif (%1 < 3) - db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 + movq mm0, [_ECX + %1 * 32 +32] ;A1 %endif pmaxsw mm5, mm4 ;C4 %if (%1 < 3) - db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 + movq mm2, [_ECX + %1 * 32 +8+32] ;B1 %else - cmp esp, esp + cmp _ESP, _ESP %endif pmaxsw mm7, mm6 ;D4 psraw mm4, 15 ;C5 psraw mm6, 15 ;D5 - movq [byte edx + %1 * 32], mm1 ;A9 - movq [edx + %1 * 32+8], mm3 ;B9 + movq [byte _EDX + %1 * 32], mm1 ;A9 + movq [_EDX + %1 * 32+8], mm3 ;B9 psrlw mm5, 1 ;C6 psrlw mm7, 1 ;D6 %if (%1 < 3) - movq mm1, [ebx] ;A2 - movq mm3, [ebx] ;B2 + movq mm1, [_EBX] ;A2 + movq mm3, [_EBX] ;B2 %endif %if (%1 == 3) - imul eax, [int_div+4*edi] +%ifdef ARCH_IS_X86_64 + lea r9, [int_div] + imul eax, dword [r9+4*_EDI] +%else + imul _EAX, [int_div+4*_EDI] +%endif %endif pxor mm5, mm4 ;C7 pxor mm7, mm6 ;D7 @@ -253,24 +236,24 @@ psubw mm7, mm6 ;D8 %endif -ALIGN 8 - db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 +ALIGN SECTION_ALIGN + movq mm4, [_ECX + %1 * 32 +16] ;C1 pmaxsw mm1, mm0 ;A4 - db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 + movq mm6, [_ECX + %1 * 32 +24] ;D1 pmaxsw mm3, mm2 ;B4 psraw mm0, 15 ;A5 psraw mm2, 15 ;B5 %if (%1) - movq [edx + %1 * 32 + 16-32], mm5 ;C9 - movq [edx + %1 * 32 + 24-32], mm7 ;D9 + movq [_EDX + %1 * 32 + 16-32], mm5 ;C9 + movq [_EDX + %1 * 32 + 24-32], mm7 ;D9 %endif - pmulhw mm1, [esi] ;A6 - pmulhw mm3, [esi] ;B6 - movq mm5, [ebx] ;C2 - movq mm7, [ebx] ;D2 + pmulhw mm1, [_ESI] ;A6 + pmulhw mm3, [_ESI] ;B6 + movq mm5, [_EBX] ;C2 + movq mm7, [_EBX] ;D2 nop nop @@ -284,134 +267,180 @@ %if (%1 < 3) - db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 + movq mm0, [_ECX 
+ %1 * 32 +32] ;A1 %endif pmaxsw mm5, mm4 ;C4 %if (%1 < 3) - db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 + movq mm2, [_ECX + %1 * 32 +8+32] ;B1 %else - cmp esp, esp + cmp _ESP, _ESP %endif pmaxsw mm7,mm6 ;D4 psraw mm4, 15 ;C5 psraw mm6, 15 ;D5 - movq [byte edx + %1 * 32], mm1 ;A9 - movq [edx + %1 * 32+8], mm3 ;B9 + movq [byte _EDX + %1 * 32], mm1 ;A9 + movq [_EDX + %1 * 32+8], mm3 ;B9 - pmulhw mm5, [esi] ;C6 - pmulhw mm7, [esi] ;D6 + pmulhw mm5, [_ESI] ;C6 + pmulhw mm7, [_ESI] ;D6 %if (%1 < 3) - movq mm1, [ebx] ;A2 - movq mm3, [ebx] ;B2 + movq mm1, [_EBX] ;A2 + movq mm3, [_EBX] ;B2 %endif %if (%1 == 0) - push ebp + push _EBP %elif (%1 < 3) nop %endif nop %if (%1 == 3) - imul eax, [int_div+4*edi] +%ifdef ARCH_IS_X86_64 + lea r9, [int_div] + imul eax, dword [r9+4*_EDI] +%else + imul _EAX, [int_div+4*_EDI] +%endif %endif pxor mm5, mm4 ;C7 pxor mm7, mm6 ;D7 %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN cglobal quant_h263_intra_3dne quant_h263_intra_3dne: - mov eax, [esp + 12] ; quant - mov ecx, [esp + 8] ; data - mov edx, [esp + 4] ; coeff +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] + add _ESP, PTR_SIZE +%ifndef WINDOWS + push prm6 + push prm5 +%endif + push prm4 + push prm3 + push prm2 + push prm1 + sub _ESP, PTR_SIZE + mov [_ESP], TMP0 +%endif + + mov _EAX, [_ESP + 3*PTR_SIZE] ; quant + mov _ECX, [_ESP + 2*PTR_SIZE] ; data + mov _EDX, [_ESP + 1*PTR_SIZE] ; coeff cmp al, 1 pxor mm1, mm1 pxor mm3, mm3 - movq mm0, [ecx] ; mm0 = [1st] - movq mm2, [ecx + 8] - push esi - lea esi, [mmx_div + eax*8 - 8] - - push ebx - mov ebx, mmzero - push edi + movq mm0, [_ECX] ; mm0 = [1st] + movq mm2, [_ECX + 8] + push _ESI +%ifdef ARCH_IS_X86_64 + lea _ESI, [mmx_div] + lea _ESI, [_ESI + _EAX*8 - 8] +%else + lea _ESI, [mmx_div + _EAX*8 - 8] +%endif + + push _EBX + mov _EBX, mmzero + push _EDI jz near .q1loop quant_intra 0 - mov ebp, [esp + 16 + 16] ; dcscalar - ; NB -- there are 3 pushes in the function preambule and one more - ; in "quant_intra 0", thus an added offset of 16 bytes - movsx eax, word [byte ecx] ; DC + mov _EBP, [_ESP + (4+4)*PTR_SIZE] ; dcscalar + ; NB -- there are 3 pushes in the function preambule and one more + ; in "quant_intra 0", thus an added offset of 16 bytes + movsx _EAX, word [byte _ECX] ; DC quant_intra 1 - mov edi, eax - sar edi, 31 ; sign(DC) - shr ebp, byte 1 ; ebp = dcscalar/2 + mov _EDI, _EAX + sar _EDI, 31 ; sign(DC) + shr _EBP, byte 1 ; _EBP = dcscalar/2 quant_intra 2 - sub eax, edi ; DC (+1) - xor ebp, edi ; sign(DC) dcscalar /2 (-1) - mov edi, [esp + 16 + 16] ; dscalar - lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2 - mov ebp, [byte esp] + sub _EAX, _EDI ; DC (+1) + xor _EBP, _EDI ; sign(DC) dcscalar /2 (-1) + mov _EDI, [_ESP + (4+4)*PTR_SIZE] ; dscalar + lea _EAX, [byte _EAX + _EBP] ; DC + sign(DC) dcscalar/2 + mov _EBP, [byte _ESP] quant_intra 3 psubw mm5, mm4 ;C8 - mov esi, [esp + 12] ; pop back the register value - mov edi, [esp + 4] ; pop back the register value - sar eax, 16 - lea ebx, [byte eax + 1] ; workaround for eax < 0 - cmovs eax, ebx ; conditionnaly move the corrected value - mov [edx], ax ; coeff[0] = ax - mov ebx, [esp + 8] ; pop back the register value - add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 + mov _ESI, [_ESP + 3*PTR_SIZE] ; pop back the register value + mov _EDI, [_ESP + 1*PTR_SIZE] ; pop back the register value + sar _EAX, 16 + lea _EBX, [byte _EAX + 1] ; workaround for _EAX < 0 + cmovs _EAX, _EBX ; conditionnaly move the corrected value + mov 
[_EDX], ax ; coeff[0] = ax + mov _EBX, [_ESP + 2*PTR_SIZE] ; pop back the register value + add _ESP, byte 4*PTR_SIZE ; "quant_intra 0" pushed _EBP, but we don't restore that one, just correct the stack offset by 16 psubw mm7, mm6 ;D8 - movq [edx + 3 * 32 + 16], mm5 ;C9 - movq [edx + 3 * 32 + 24], mm7 ;D9 + movq [_EDX + 3 * 32 + 16], mm5 ;C9 + movq [_EDX + 3 * 32 + 24], mm7 ;D9 + + xor _EAX, _EAX + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif - xor eax, eax ret -ALIGN 16 +ALIGN SECTION_ALIGN -.q1loop +.q1loop: quant_intra1 0 - mov ebp, [esp + 16 + 16] ; dcscalar - movsx eax, word [byte ecx] ; DC + mov _EBP, [_ESP + (4+4)*PTR_SIZE] ; dcscalar + movsx _EAX, word [byte _ECX] ; DC quant_intra1 1 - mov edi, eax - sar edi, 31 ; sign(DC) - shr ebp, byte 1 ; ebp = dcscalar /2 + mov _EDI, _EAX + sar _EDI, 31 ; sign(DC) + shr _EBP, byte 1 ; _EBP = dcscalar /2 quant_intra1 2 - sub eax, edi ; DC (+1) - xor ebp, edi ; sign(DC) dcscalar /2 (-1) - mov edi, [esp + 16 + 16] ; dcscalar - lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2 - mov ebp, [byte esp] + sub _EAX, _EDI ; DC (+1) + xor _EBP, _EDI ; sign(DC) dcscalar /2 (-1) + mov _EDI, [_ESP + (4+4)*PTR_SIZE] ; dcscalar + lea _EAX, [byte _EAX + _EBP] ; DC + sign(DC) dcscalar /2 + mov _EBP, [byte _ESP] quant_intra1 3 psubw mm5, mm4 ;C8 - mov esi, [dword esp + 12] ; pop back the register value - mov edi, [esp + 4] ; pop back the register value - sar eax, 16 - lea ebx, [byte eax + 1] ; workaround for eax < 0 - cmovs eax, ebx ; conditionnaly move the corrected value - mov [edx], ax ; coeff[0] = ax - mov ebx, [esp + 8] ; pop back the register value - add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 + mov _ESI, [_ESP + 3*PTR_SIZE] ; pop back the register value + mov _EDI, [_ESP + 1*PTR_SIZE] ; pop back the register value + sar _EAX, 16 + lea _EBX, [byte _EAX + 1] ; workaround for _EAX < 0 + cmovs _EAX, _EBX ; conditionnaly move the corrected value + mov [_EDX], ax ; coeff[0] = ax + mov _EBX, [_ESP + 2*PTR_SIZE] ; pop back the register value + add _ESP, byte 4*PTR_SIZE ; "quant_intra 0" pushed _EBP, but we don't restore that one, just correct the stack offset by 16 psubw mm7, mm6 ;D8 - movq [edx + 3 * 32 + 16], mm5 ;C9 - movq [edx + 3 * 32 + 24], mm7 ;D9 + movq [_EDX + 3 * 32 + 16], mm5 ;C9 + movq [_EDX + 3 * 32 + 24], mm7 ;D9 - xor eax, eax - ret + xor _EAX, _EAX +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif + ret +ENDFUNC ;----------------------------------------------------------------------------- @@ -427,54 +456,54 @@ %macro quantinter 1 - movq mm1, [eax] ;A2 + movq mm1, [_EAX] ;A2 psraw mm3, 15 ;B6 %if (%1) psubw mm2, mm6 ;C10 %endif psubw mm1, mm0 ;A3 pmulhw mm4, mm7 ;B7 - movq mm6, [ecx + %1*24+16] ;C1 + movq mm6, [_ECX + %1*24+16] ;C1 pmaxsw mm1, mm0 ;A4 paddw mm5, mm4 ;B8 %if (%1) - movq [edx + %1*24+16-24], mm2 ;C11 + movq [_EDX + %1*24+16-24], mm2 ;C11 %endif - psubusw mm1, [ebx] ;A5 mm0 -= sub (unsigned, dont go < 0) + psubusw mm1, [_EBX] ;A5 mm0 -= sub (unsigned, dont go < 0) pxor mm4, mm3 ;B9 - movq mm2, [eax] ;C2 + movq mm2, [_EAX] ;C2 psraw mm0, 15 ;A6 psubw mm4, mm3 ;B10 psubw mm2, mm6 ;C3 pmulhw mm1, mm7 ;A7 mm0 = (mm0 / 2Q) >> 24 - movq mm3, [ecx + %1*24+8] ;B1 + movq mm3, [_ECX + %1*24+8] ;B1 pmaxsw mm2, mm6 ;C4 paddw mm5, mm1 ;A8 sum += mm0 %if (%1) - movq [edx + 
%1*24+8-24], mm4 ;B11 + movq [_EDX + %1*24+8-24], mm4 ;B11 %else - movq [edx + 120], mm4 ;B11 + movq [_EDX + 120], mm4 ;B11 %endif - psubusw mm2, [ebx] ;C5 + psubusw mm2, [_EBX] ;C5 pxor mm1, mm0 ;A9 mm0 *= sign(mm0) - movq mm4, [eax] ;B2 + movq mm4, [_EAX] ;B2 psraw mm6, 15 ;C6 psubw mm1, mm0 ;A10 undisplace psubw mm4, mm3 ;B3 pmulhw mm2, mm7 ;C7 - movq mm0, [ecx + %1*24+24] ;A1 mm0 = [1st] + movq mm0, [_ECX + %1*24+24] ;A1 mm0 = [1st] pmaxsw mm4, mm3 ;B4 paddw mm5, mm2 ;C8 - movq [byte edx + %1*24], mm1 ;A11 - psubusw mm4, [ebx] ;B5 + movq [byte _EDX + %1*24], mm1 ;A11 + psubusw mm4, [_EBX] ;B5 pxor mm2, mm6 ;C9 %endmacro %macro quantinter1 1 - movq mm0, [byte ecx + %1*16] ;mm0 = [1st] - movq mm3, [ecx + %1*16+8] ; - movq mm1, [eax] - movq mm4, [eax] + movq mm0, [byte _ECX + %1*16] ;mm0 = [1st] + movq mm3, [_ECX + %1*16+8] ; + movq mm1, [_EAX] + movq mm4, [_EAX] psubw mm1, mm0 psubw mm4, mm3 pmaxsw mm1, mm0 @@ -491,35 +520,58 @@ pxor mm4, mm3 ; psubw mm1, mm0 ; undisplace psubw mm4, mm3 - cmp esp, esp - movq [byte edx + %1*16], mm1 - movq [edx + %1*16+8], mm4 + cmp _ESP, _ESP + movq [byte _EDX + %1*16], mm1 + movq [_EDX + %1*16+8], mm4 %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN cglobal quant_h263_inter_3dne quant_h263_inter_3dne: - mov edx, [esp + 4] ; coeff - mov ecx, [esp + 8] ; data - mov eax, [esp + 12] ; quant - push ebx + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] + add _ESP, PTR_SIZE +%ifndef WINDOWS + push prm6 + push prm5 +%endif + push prm4 + push prm3 + push prm2 + push prm1 + sub _ESP, PTR_SIZE + mov [_ESP], TMP0 +%endif + + mov _EDX, [_ESP + 1*PTR_SIZE] ; coeff + mov _ECX, [_ESP + 2*PTR_SIZE] ; data + mov _EAX, [_ESP + 3*PTR_SIZE] ; quant + push _EBX pxor mm5, mm5 ; sum nop - lea ebx,[mmx_sub + eax * 8 - 8] ; sub - movq mm7, [mmx_div + eax * 8 - 8] ; divider +%ifdef ARCH_IS_X86_64 + lea _EBX, [mmx_div] + movq mm7, [_EBX + _EAX * 8 - 8] + lea _EBX, [mmx_sub] + lea _EBX, [_EBX + _EAX * 8 - 8] +%else + lea _EBX,[mmx_sub + _EAX * 8 - 8] ; sub + movq mm7, [mmx_div + _EAX * 8 - 8] ; divider +%endif cmp al, 1 - lea eax, [mmzero] + lea _EAX, [mmzero] jz near .q1loop - cmp esp, esp -ALIGN 8 - movq mm3, [ecx + 120] ;B1 + cmp _ESP, _ESP +ALIGN SECTION_ALIGN + movq mm3, [_ECX + 120] ;B1 pxor mm4, mm4 ;B2 psubw mm4, mm3 ;B3 - movq mm0, [ecx] ;A1 mm0 = [1st] + movq mm0, [_ECX] ;A1 mm0 = [1st] pmaxsw mm4, mm3 ;B4 - psubusw mm4, [ebx] ;B5 + psubusw mm4, [_EBX] ;B5 quantinter 0 quantinter 1 @@ -533,20 +585,30 @@ paddw mm5, mm4 ;B8 pxor mm4, mm3 ;B9 psubw mm4, mm3 ;B10 - movq [edx + 4*24+16], mm2 ;C11 - pop ebx - movq [edx + 4*24+8], mm4 ;B11 + movq [_EDX + 4*24+16], mm2 ;C11 + pop _EBX + movq [_EDX + 4*24+8], mm4 ;B11 pmaddwd mm5, [plus_one] movq mm0, mm5 punpckhdq mm5, mm5 paddd mm0, mm5 movd eax, mm0 ; return sum +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif + ret -ALIGN 16 -.q1loop - movq mm6, [byte ebx] +ALIGN SECTION_ALIGN +.q1loop: + movq mm6, [byte _EBX] quantinter1 0 quantinter1 1 @@ -563,9 +625,21 @@ paddd mm0, mm5 movd eax, mm0 ; return sum - pop ebx + pop _EBX + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif ret +ENDFUNC + ;----------------------------------------------------------------------------- ; @@ -583,140 +657,179 @@ ;This is Athlon-optimized code (ca 106 clk per call) %macro dequant 1 - movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2 + movq mm1, [_ECX+%1*24] ; c = coeff[i] ;A2 psubw 
mm0, mm1 ;-c ;A3 (1st dep) %if (%1) paddw mm4, mm6 ;C11 mm6 free (4th+) %endif pmaxsw mm0, mm1 ;|c| ;A4 (2nd) %if (%1) - mov ebp, ebp - pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later + mov _EBP, _EBP + pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) later %endif - movq mm6, [esi] ;0 ;A5 mm6 in use - pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) + movq mm6, [_ESI] ;0 ;A5 mm6 in use + pandn mm7, [_EAX] ;B9 offset = isZero ? 0 : quant_add (2nd) %if (%1) pxor mm5, mm4 ;C13 (6th+) 1later %endif - movq mm4, [esi] ;C1 ;0 - mov esp, esp - pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st) -ALIGN 4 + movq mm4, [_ESI] ;C1 ;0 + mov _ESP, _ESP + pcmpeqw mm6, [_ECX+%1*24] ;A6 (c ==0) ? -1 : 0 (1st) +ALIGN SECTION_ALIGN psraw mm1, 15 ; sign(c) ;A7 (2nd) %if (%1) - movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later + movq [_EDX+%1*24+16-24], mm5 ; C14 (7th) 2later %endif paddw mm7, mm3 ;B10 offset +negate back (3rd) - pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+) + pmullw mm0, [_EDI] ;*= 2Q ;A8 (3rd+) paddw mm2, mm7 ;B11 mm7 free (4th+) - lea ebp, [byte ebp] - movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i] + lea _EBP, [byte _EBP] + movq mm5, [_ECX+%1*24+16] ;C2 ; c = coeff[i] psubw mm4, mm5 ;-c ;C3 (1st dep) - pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd) - pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) + pandn mm6, [_EAX] ;A9 offset = isZero ? 0 : quant_add (2nd) + pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) pxor mm3, mm2 ;B13 (6th+) - movq mm2, [byte esi] ;B1 ;0 + movq mm2, [byte _ESI] ;B1 ;0 %if (%1) - movq [edx+%1*24+8-24], mm3 ;B14 (7th) + movq [_EDX+%1*24+8-24], mm3 ;B14 (7th) %else - movq [edx+120], mm3 + movq [_EDX+120], mm3 %endif pmaxsw mm4, mm5 ;|c| ;C4 (2nd) paddw mm6, mm1 ;A10 offset +negate back (3rd) - movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i] + movq mm3, [_ECX+%1*24 + 8] ;B2 ; c = coeff[i] psubw mm2, mm3 ;-c ;B3 (1st dep) paddw mm0, mm6 ;A11 mm6 free (4th+) - movq mm6, [byte esi] ;0 ;C5 mm6 in use - pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st) - pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+) + movq mm6, [byte _ESI] ;0 ;C5 mm6 in use + pcmpeqw mm6, [_ECX+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st) + pminsw mm0, [_EBX] ;A12 saturates to +2047 (5th+) pmaxsw mm2, mm3 ;|c| ;B4 (2nd) pxor mm1, mm0 ;A13 (6th+) - pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+) + pmullw mm4, [_EDI] ;*= 2Q ;C8 (3rd+) psraw mm5, 15 ; sign(c) ;C7 (2nd) - movq mm7, [byte esi] ;0 ;B5 mm7 in use - pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st) + movq mm7, [byte _ESI] ;0 ;B5 mm7 in use + pcmpeqw mm7, [_ECX+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st) %if (%1 < 4) - movq mm0, [byte esi] ;A1 ;0 + movq mm0, [byte _ESI] ;A1 ;0 %endif - pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd) + pandn mm6, [byte _EAX] ;C9 offset = isZero ? 
0 : quant_add (2nd) psraw mm3, 15 ;sign(c) ;B7 (2nd) - movq [byte edx+%1*24], mm1 ;A14 (7th) + movq [byte _EDX+%1*24], mm1 ;A14 (7th) paddw mm6, mm5 ;C10 offset +negate back (3rd) - pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) - mov esp, esp + pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) + mov _ESP, _ESP %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN cglobal dequant_h263_intra_3dne dequant_h263_intra_3dne: - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] + add _ESP, PTR_SIZE +%ifndef WINDOWS + push prm6 + push prm5 +%endif + push prm4 + push prm3 + push prm2 + push prm1 + sub _ESP, PTR_SIZE + mov [_ESP], TMP0 +%endif + + mov _ECX, [_ESP+ 2*PTR_SIZE] ; coeff + mov _EAX, [_ESP+ 3*PTR_SIZE] ; quant pxor mm0, mm0 pxor mm2, mm2 - push edi - push ebx - lea edi, [mmx_mul + eax*8 - 8] ; 2*quant - push ebp - mov ebx, mmx_2047 - movsx ebp, word [ecx] - lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 - push esi - mov esi, mmzero + push _EDI + push _EBX +%ifdef ARCH_IS_X86_64 + lea _EDI, [mmx_mul] + lea _EDI, [_EDI + _EAX*8 - 8] ; 2*quant +%else + lea _EDI, [mmx_mul + _EAX*8 - 8] ; 2*quant +%endif + push _EBP + mov _EBX, mmx_2047 + movsx _EBP, word [_ECX] +%ifdef ARCH_IS_X86_64 + lea r9, [mmx_add] + lea _EAX, [r9 + _EAX*8 - 8] ; quant or quant-1 +%else + lea _EAX, [mmx_add + _EAX*8 - 8] ; quant or quant-1 +%endif + push _ESI + mov _ESI, mmzero pxor mm7, mm7 - movq mm3, [ecx+120] ;B2 ; c = coeff[i] - pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) + movq mm3, [_ECX+120] ;B2 ; c = coeff[i] + pcmpeqw mm7, [_ECX+120] ;B6 (c ==0) ? -1 : 0 (1st) - imul ebp, [esp+16+16] ; dcscalar + imul _EBP, [_ESP+(4+4)*PTR_SIZE] ; dcscalar psubw mm2, mm3 ;-c ;B3 (1st dep) pmaxsw mm2, mm3 ;|c| ;B4 (2nd) - pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) + pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) psraw mm3, 15 ; sign(c) ;B7 (2nd) - mov edx, [esp+ 4+16] ; data + mov _EDX, [_ESP+ (1+4)*PTR_SIZE] ; data -ALIGN 8 +ALIGN SECTION_ALIGN dequant 0 - cmp ebp, -2048 - mov esp, esp + cmp _EBP, -2048 + mov _ESP, _ESP dequant 1 - cmovl ebp, [int_2048] + cmovl _EBP, [int_2048] nop dequant 2 - cmp ebp, 2047 - mov esp, esp + cmp _EBP, 2047 + mov _ESP, _ESP dequant 3 - cmovg ebp, [int2047] + cmovg _EBP, [int2047] nop dequant 4 paddw mm4, mm6 ;C11 mm6 free (4th+) - pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) - pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) - mov eax, ebp - mov esi, [esp] - mov ebp, [esp+4] + pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) + pandn mm7, [_EAX] ;B9 offset = isZero ? 
0 : quant_add (2nd) + mov _EAX, _EBP + mov _ESI, [_ESP] + mov _EBP, [_ESP+PTR_SIZE] pxor mm5, mm4 ;C13 (6th+) paddw mm7, mm3 ;B10 offset +negate back (3rd) - movq [edx+4*24+16], mm5 ;C14 (7th) + movq [_EDX+4*24+16], mm5 ;C14 (7th) paddw mm2, mm7 ;B11 mm7 free (4th+) - pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) - mov ebx, [esp+8] - mov edi, [esp+12] - add esp, byte 16 + pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) + mov _EBX, [_ESP+2*PTR_SIZE] + mov _EDI, [_ESP+3*PTR_SIZE] + add _ESP, byte 4*PTR_SIZE pxor mm3, mm2 ;B13 (6th+) - movq [edx+4*24+8], mm3 ;B14 (7th) - mov [edx], ax + movq [_EDX+4*24+8], mm3 ;B14 (7th) + mov [_EDX], ax + + xor _EAX, _EAX + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif - xor eax, eax ret +ENDFUNC + ;----------------------------------------------------------------------------- ; @@ -731,30 +844,56 @@ ; except that we're saturating using 'pminsw' (saves 2 cycles/loop) ; This is Athlon-optimized code (ca 100 clk per call) -ALIGN 16 +ALIGN SECTION_ALIGN cglobal dequant_h263_inter_3dne dequant_h263_inter_3dne: - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] + add _ESP, PTR_SIZE +%ifndef WINDOWS + push prm6 + push prm5 +%endif + push prm4 + push prm3 + push prm2 + push prm1 + sub _ESP, PTR_SIZE + mov [_ESP], TMP0 +%endif + + mov _ECX, [_ESP+ 2*PTR_SIZE] ; coeff + mov _EAX, [_ESP+ 3*PTR_SIZE] ; quant pxor mm0, mm0 pxor mm2, mm2 - push edi - push ebx - push esi - lea edi, [mmx_mul + eax*8 - 8] ; 2*quant - mov ebx, mmx_2047 + push _EDI + push _EBX + push _ESI +%ifdef ARCH_IS_X86_64 + lea _EDI, [mmx_mul] + lea _EDI, [_EDI + _EAX*8 - 8] ; 2*quant +%else + lea _EDI, [mmx_mul + _EAX*8 - 8] ; 2*quant +%endif + mov _EBX, mmx_2047 pxor mm7, mm7 - movq mm3, [ecx+120] ;B2 ; c = coeff[i] - pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) - lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 + movq mm3, [_ECX+120] ;B2 ; c = coeff[i] + pcmpeqw mm7, [_ECX+120] ;B6 (c ==0) ? -1 : 0 (1st) +%ifdef ARCH_IS_X86_64 + lea r9, [mmx_add] + lea _EAX, [r9 + _EAX*8 - 8] ; quant or quant-1 +%else + lea _EAX, [mmx_add + _EAX*8 - 8] ; quant or quant-1 +%endif psubw mm2, mm3 ;-c ;B3 (1st dep) - mov esi, mmzero + mov _ESI, mmzero pmaxsw mm2, mm3 ;|c| ;B4 (2nd) - pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) + pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) psraw mm3, 15 ; sign(c) ;B7 (2nd) - mov edx, [dword esp+ 4+12] ; data + mov _EDX, [_ESP+ (1+3)*PTR_SIZE] ; data -ALIGN 8 +ALIGN SECTION_ALIGN dequant 0 dequant 1 @@ -763,19 +902,36 @@ dequant 4 paddw mm4, mm6 ;C11 mm6 free (4th+) - pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) - pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) - mov esi, [esp] + pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) + pandn mm7, [_EAX] ;B9 offset = isZero ? 
0 : quant_add (2nd) + mov _ESI, [_ESP] pxor mm5, mm4 ;C13 (6th+) paddw mm7, mm3 ;B10 offset +negate back (3rd) - movq [edx+4*24+16], mm5 ;C14 (7th) + movq [_EDX+4*24+16], mm5 ;C14 (7th) paddw mm2, mm7 ;B11 mm7 free (4th+) - pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) - mov ebx, [esp+4] - mov edi, [esp+8] - add esp, byte 12 + pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) + mov _EBX, [_ESP+PTR_SIZE] + mov _EDI, [_ESP+2*PTR_SIZE] + add _ESP, byte 3*PTR_SIZE pxor mm3, mm2 ;B13 (6th+) - movq [edx+4*24+8], mm3 ;B14 (7th) + movq [_EDX+4*24+8], mm3 ;B14 (7th) + + xor _EAX, _EAX + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif - xor eax, eax ret +ENDFUNC + +%ifidn __OUTPUT_FORMAT__,elf +section ".note.GNU-stack" noalloc noexec nowrite progbits +%endif +
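
For reference, the arithmetic that the tables above encode (mmx_sub = Q/2, mmx_div = reciprocals of 2Q, mmx_add = odd(Q) ? Q : Q-1, mmx_mul = 2Q, mmx_2047 / int_2048 / int2047 = saturation limits) is, in plain C, roughly the following sketch. It is a minimal illustration modelled on the generic xvidcore reference quantisers, not part of this patch: the *_ref names, the div_round/clamp2047 helpers and the exact rounding of the DC path are assumptions.

/* Plain-C sketch of the H.263 (de)quantisation the 3DNE routines implement.
 * Names and rounding helpers are assumptions, not part of this patch. */
#include <stdint.h>

/* assumed DIV_DIV-style rounding: to nearest, away from zero */
static int32_t div_round(int32_t a, int32_t b)
{
    return (a > 0) ? (a + (b >> 1)) / b : (a - (b >> 1)) / b;
}

static int16_t clamp2047(int32_t v)   /* mmx_2047 / int_2048 saturation */
{
    return (int16_t)(v < -2048 ? -2048 : (v > 2047 ? 2047 : v));
}

/* intra quant: DC scaled by dcscalar, AC level = |data| / (2*quant), sign kept */
void quant_h263_intra_ref(int16_t *coeff, const int16_t *data,
                          uint32_t quant, uint32_t dcscalar)
{
    coeff[0] = (int16_t)div_round(data[0], (int32_t)dcscalar);
    for (int i = 1; i < 64; i++) {
        int32_t level = data[i];
        int32_t sign  = (level < 0) ? -1 : 1;
        coeff[i] = (int16_t)(sign * ((sign * level) / (int32_t)(quant << 1)));
    }
}

/* inter quant: level = (|data| - quant/2) / (2*quant); returns sum of magnitudes */
uint32_t quant_h263_inter_ref(int16_t *coeff, const int16_t *data, uint32_t quant)
{
    uint32_t sum = 0;
    for (int i = 0; i < 64; i++) {
        int32_t level = data[i];
        int32_t sign  = (level < 0) ? -1 : 1;
        int32_t mag   = sign * level - (int32_t)(quant >> 1);  /* psubusw [mmx_sub] */
        mag = (mag < 0) ? 0 : mag / (int32_t)(quant << 1);     /* pmulhw [mmx_div] */
        sum += (uint32_t)mag;                                  /* pmaddwd plus_one  */
        coeff[i] = (int16_t)(sign * mag);
    }
    return sum;
}

/* inter dequant: data = sign(c) * (2*quant*|c| + (odd(quant) ? quant : quant-1)),
 * zero stays zero, result saturated to [-2048, 2047] */
void dequant_h263_inter_ref(int16_t *data, const int16_t *coeff, uint32_t quant)
{
    int32_t add = (quant & 1) ? (int32_t)quant : (int32_t)quant - 1; /* mmx_add */
    for (int i = 0; i < 64; i++) {
        int32_t c = coeff[i];
        if (c == 0) { data[i] = 0; continue; }
        int32_t sign = (c < 0) ? -1 : 1;
        data[i] = clamp2047(sign * (2 * (int32_t)quant * (sign * c) + add)); /* mmx_mul */
    }
}

/* intra dequant: AC as above, DC = coeff[0] * dcscalar, saturated like the
 * int_2048 / int2047 cmov pair in dequant_h263_intra_3dne */
void dequant_h263_intra_ref(int16_t *data, const int16_t *coeff,
                            uint32_t quant, uint32_t dcscalar)
{
    dequant_h263_inter_ref(data, coeff, quant);
    data[0] = clamp2047(coeff[0] * (int32_t)dcscalar);
}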