;/**************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 3dne Quantization/Dequantization -
; *
; *  Copyright(C) 2002-2003 Jaan Kalda
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: quantize_h263_3dne.asm,v 1.10 2008-12-04 14:41:50 Isibaar Exp $
; *
; *************************************************************************/
;
; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines

; enable dequant saturate [-2048,2047], test purposes only.
%define SATURATE

%include "nasm.inc"

;=============================================================================
; Local data
;=============================================================================

DATA

align SECTION_ALIGN
int_div:
	dd 0
%assign i 1
%rep 255
	dd  (1 << 16) / (i) + 1
%assign i i+1
%endrep

ALIGN SECTION_ALIGN
plus_one:
	times 8 dw 1

;-----------------------------------------------------------------------------
; subtract by Q/2 table
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
mmx_sub:
%assign i 1
%rep 31
	times 4 dw i / 2
%assign i i+1
%endrep

;-----------------------------------------------------------------------------
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated separately
; (3dnow2 provides _pmulhuw_ which won't cause overflow)
;
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
mmx_div:
%assign i 1
%rep 31
	times 4 dw  (1 << 16) / (i * 2) + 1
%assign i i+1
%endrep
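;-----------------------------------------------------------------------------
; NOTE (added): rough C model of the reciprocal tables above -- illustrative
; only, not part of the build.  The function and parameter names below are
; hypothetical; the real consumers are the quant routines further down.
;
;   /* mmx_div[q-1] holds (1<<16)/(2*q)+1; pmulhw keeps the high 16 bits of
;      the signed product, which approximates a division by 2*q for q > 1: */
;
;   static uint16_t div2q_sketch(uint16_t c_abs, uint32_t q)   /* q > 1 */
;   {
;       uint16_t rcp = (uint16_t)((1 << 16) / (2 * q) + 1);    /* mmx_div entry */
;       return (uint16_t)(((uint32_t)c_abs * rcp) >> 16);      /* ~ c_abs/(2*q) */
;   }
;
; pmulhw is a *signed* multiply, so the q==1 entry (0x8001) would be read as a
; negative factor; the .q1loop paths therefore divide by 2 with psrlw instead.
; int_div[] plays the same role for the intra DC term (division by dcscalar).
;-----------------------------------------------------------------------------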
;-----------------------------------------------------------------------------
; add by (odd(Q) ? Q : Q - 1) table
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
mmx_add:
%assign i 1
%rep 31
	%if i % 2 != 0
	times 4 dw i
	%else
	times 4 dw i - 1
	%endif
%assign i i+1
%endrep

;-----------------------------------------------------------------------------
; multiply by 2Q table
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
mmx_mul:
%assign i 1
%rep 31
	times 4 dw i * 2
%assign i i+1
%endrep

;-----------------------------------------------------------------------------
; saturation limits
;-----------------------------------------------------------------------------

ALIGN SECTION_ALIGN
mmx_32768_minus_2048:
	times 4 dw (32768-2048)
mmx_32767_minus_2047:
	times 4 dw (32767-2047)

ALIGN SECTION_ALIGN
mmx_2047:
	times 4 dw 2047

ALIGN SECTION_ALIGN
mmzero:
	dd 0, 0
int2047:
	dd 2047
int_2048:
	dd -2048

;=============================================================================
; Code
;=============================================================================

TEXT

;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_3dne(int16_t * coeff,
;                                const int16_t * const data,
;                                const uint32_t quant,
;                                const uint32_t dcscalar,
;                                const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

;This is Athlon-optimized code (ca 70 clk per call)

%macro quant_intra1  1
  psubw mm1, mm0    ;A3
  psubw mm3, mm2    ;B3
%if (%1)
  psubw mm5, mm4    ;C8
  psubw mm7, mm6    ;D8
%endif

ALIGN SECTION_ALIGN
  movq mm4, [_ECX + %1 * 32 +16]  ;C1
  pmaxsw mm1, mm0                 ;A4
  movq mm6, [_ECX + %1 * 32 +24]  ;D1
  pmaxsw mm3, mm2                 ;B4

  psraw mm0, 15     ;A5
  psraw mm2, 15     ;B5
%if (%1)
  movq [_EDX + %1 * 32 + 16-32], mm5  ;C9
  movq [_EDX + %1 * 32 + 24-32], mm7  ;D9
%endif

  psrlw mm1, 1      ;A6
  psrlw mm3, 1      ;B6
  movq mm5, [_EBX]  ;C2
  movq mm7, [_EBX]  ;D2

  pxor mm1, mm0     ;A7
  pxor mm3, mm2     ;B7

  psubw mm5, mm4    ;C3
  psubw mm7, mm6    ;D3
  psubw mm1, mm0    ;A8
  psubw mm3, mm2    ;B8

%if (%1 == 0)
  push _EBP
  movq mm0, [_ECX + %1 * 32 +32]
%elif (%1 < 3)
  movq mm0, [_ECX + %1 * 32 +32]    ;A1
%endif
  pmaxsw mm5, mm4   ;C4
%if (%1 < 3)
  movq mm2, [_ECX + %1 * 32 +8+32]  ;B1
%else
  cmp _ESP, _ESP
%endif
  pmaxsw mm7, mm6   ;D4

  psraw mm4, 15     ;C5
  psraw mm6, 15     ;D5
  movq [byte _EDX + %1 * 32], mm1   ;A9
  movq [_EDX + %1 * 32+8], mm3      ;B9

  psrlw mm5, 1      ;C6
  psrlw mm7, 1      ;D6
%if (%1 < 3)
  movq mm1, [_EBX]  ;A2
  movq mm3, [_EBX]  ;B2
%endif
%if (%1 == 3)
%ifdef ARCH_IS_X86_64
  lea r9, [int_div]
  imul eax, dword [r9+4*_EDI]
%else
  imul _EAX, [int_div+4*_EDI]
%endif
%endif
  pxor mm5, mm4     ;C7
  pxor mm7, mm6     ;D7
%endm
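;-----------------------------------------------------------------------------
; NOTE (added): rough C model of one quant_intra step for an AC coefficient --
; illustrative only, not part of the build; names are hypothetical.  |c| is
; formed with psubw+pmaxsw against mmzero, scaled with pmulhw and the mmx_div
; reciprocal, and the sign is restored with the psraw/pxor/psubw pattern.
; quant_intra1 above is the q==1 variant, which replaces pmulhw by psrlw 1.
;
;   static int16_t quant_intra_ac_sketch(int16_t c, uint32_t q)   /* q > 1 */
;   {
;       uint16_t c_abs = (uint16_t)((c < 0) ? -c : c);     /* psubw + pmaxsw */
;       uint16_t rcp   = (uint16_t)((1 << 16) / (2 * q) + 1);
;       uint16_t level = (uint16_t)(((uint32_t)c_abs * rcp) >> 16);
;       return (int16_t)((c < 0) ? -level : level);        /* pxor/psubw     */
;   }
;
; The DC term takes a scalar path in the function body: roughly
; (DC + sign(DC)*dcscalar/2) divided by dcscalar via the int_div[] reciprocal.
;-----------------------------------------------------------------------------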
%macro quant_intra  1
 ; Rules for athlon:
 ; 1) schedule latencies
 ; 2) add/mul and load/store in 2:1 proportion
 ; 3) avoid splitting >3 byte instructions over 8 byte boundaries

  psubw mm1, mm0    ;A3
  psubw mm3, mm2    ;B3
%if (%1)
  psubw mm5, mm4    ;C8
  psubw mm7, mm6    ;D8
%endif

ALIGN SECTION_ALIGN
  movq mm4, [_ECX + %1 * 32 +16]  ;C1
  pmaxsw mm1, mm0                 ;A4
  movq mm6, [_ECX + %1 * 32 +24]  ;D1
  pmaxsw mm3, mm2                 ;B4

  psraw mm0, 15     ;A5
  psraw mm2, 15     ;B5
%if (%1)
  movq [_EDX + %1 * 32 + 16-32], mm5  ;C9
  movq [_EDX + %1 * 32 + 24-32], mm7  ;D9
%endif

  pmulhw mm1, [_ESI]  ;A6
  pmulhw mm3, [_ESI]  ;B6
  movq mm5, [_EBX]    ;C2
  movq mm7, [_EBX]    ;D2

  nop
  nop
  pxor mm1, mm0     ;A7
  pxor mm3, mm2     ;B7

  psubw mm5, mm4    ;C3
  psubw mm7, mm6    ;D3
  psubw mm1, mm0    ;A8
  psubw mm3, mm2    ;B8

%if (%1 < 3)
  movq mm0, [_ECX + %1 * 32 +32]    ;A1
%endif
  pmaxsw mm5, mm4   ;C4
%if (%1 < 3)
  movq mm2, [_ECX + %1 * 32 +8+32]  ;B1
%else
  cmp _ESP, _ESP
%endif
  pmaxsw mm7, mm6   ;D4

  psraw mm4, 15     ;C5
  psraw mm6, 15     ;D5
  movq [byte _EDX + %1 * 32], mm1   ;A9
  movq [_EDX + %1 * 32+8], mm3      ;B9

  pmulhw mm5, [_ESI]  ;C6
  pmulhw mm7, [_ESI]  ;D6
%if (%1 < 3)
  movq mm1, [_EBX]    ;A2
  movq mm3, [_EBX]    ;B2
%endif
%if (%1 == 0)
  push _EBP
%elif (%1 < 3)
  nop
%endif
  nop
%if (%1 == 3)
%ifdef ARCH_IS_X86_64
  lea r9, [int_div]
  imul eax, dword [r9+4*_EDI]
%else
  imul _EAX, [int_div+4*_EDI]
%endif
%endif
  pxor mm5, mm4     ;C7
  pxor mm7, mm6     ;D7
%endmacro

ALIGN SECTION_ALIGN
cglobal quant_h263_intra_3dne
quant_h263_intra_3dne:

%ifdef ARCH_IS_X86_64
  mov TMP0, [_ESP]
  add _ESP, PTR_SIZE
%ifndef WINDOWS
  push prm6
  push prm5
%endif
  push prm4
  push prm3
  push prm2
  push prm1
  sub _ESP, PTR_SIZE
  mov [_ESP], TMP0
%endif

  mov _EAX, [_ESP + 3*PTR_SIZE]     ; quant
  mov _ECX, [_ESP + 2*PTR_SIZE]     ; data
  mov _EDX, [_ESP + 1*PTR_SIZE]     ; coeff
  cmp al, 1
  pxor mm1, mm1
  pxor mm3, mm3
  movq mm0, [_ECX]                  ; mm0 = [1st]
  movq mm2, [_ECX + 8]
  push _ESI
%ifdef ARCH_IS_X86_64
  lea _ESI, [mmx_div]
  lea _ESI, [_ESI + _EAX*8 - 8]
%else
  lea _ESI, [mmx_div + _EAX*8 - 8]
%endif

  push _EBX
  mov _EBX, mmzero
  push _EDI
  jz near .q1loop

  quant_intra 0
  mov _EBP, [_ESP + (4+4)*PTR_SIZE] ; dcscalar
    ; NB -- there are 3 pushes in the function preamble and one more
    ; in "quant_intra 0", thus an added offset of 4*PTR_SIZE
  XVID_MOVSX _EAX, word [byte _ECX] ; DC

  quant_intra 1
  mov _EDI, _EAX
  sar _EDI, 31                      ; sign(DC)
  shr _EBP, byte 1                  ; _EBP = dcscalar/2

  quant_intra 2
  sub _EAX, _EDI                    ; DC (+1)
  xor _EBP, _EDI                    ; sign(DC) dcscalar /2  (-1)
  mov _EDI, [_ESP + (4+4)*PTR_SIZE] ; dcscalar
  lea _EAX, [byte _EAX + _EBP]      ; DC + sign(DC) dcscalar/2
  mov _EBP, [byte _ESP]

  quant_intra 3
  psubw mm5, mm4                    ;C8
  mov _ESI, [_ESP + 3*PTR_SIZE]     ; pop back the register value
  mov _EDI, [_ESP + 1*PTR_SIZE]     ; pop back the register value
  sar _EAX, 16
  lea _EBX, [byte _EAX + 1]         ; workaround for _EAX < 0
  cmovs _EAX, _EBX                  ; conditionally move the corrected value
  mov [_EDX], ax                    ; coeff[0] = ax
  mov _EBX, [_ESP + 2*PTR_SIZE]     ; pop back the register value
  add _ESP, byte 4*PTR_SIZE         ; "quant_intra 0" pushed _EBP, but we don't restore
                                    ; that one, just correct the stack offset by 4*PTR_SIZE
  psubw mm7, mm6                    ;D8
  movq [_EDX + 3 * 32 + 16], mm5    ;C9
  movq [_EDX + 3 * 32 + 24], mm7    ;D9

  xor _EAX, _EAX

%ifdef ARCH_IS_X86_64
  mov TMP0, [_ESP]
%ifndef WINDOWS
  add _ESP, 6*PTR_SIZE
%else
  add _ESP, 4*PTR_SIZE
%endif
  mov [_ESP], TMP0
%endif

  ret

ALIGN SECTION_ALIGN

.q1loop:
  quant_intra1 0
  mov _EBP, [_ESP + (4+4)*PTR_SIZE] ; dcscalar
  XVID_MOVSX _EAX, word [byte _ECX] ; DC

  quant_intra1 1
  mov _EDI, _EAX
  sar _EDI, 31                      ; sign(DC)
  shr _EBP, byte 1                  ; _EBP = dcscalar /2

  quant_intra1 2
  sub _EAX, _EDI                    ; DC (+1)
  xor _EBP, _EDI                    ; sign(DC) dcscalar /2  (-1)
  mov _EDI, [_ESP + (4+4)*PTR_SIZE] ; dcscalar
  lea _EAX, [byte _EAX + _EBP]      ; DC + sign(DC) dcscalar /2
  mov _EBP, [byte _ESP]

  quant_intra1 3
  psubw mm5, mm4                    ;C8
  mov _ESI, [_ESP + 3*PTR_SIZE]     ; pop back the register value
  mov _EDI, [_ESP + 1*PTR_SIZE]     ; pop back the register value
  sar _EAX, 16
  lea _EBX, [byte _EAX + 1]         ; workaround for _EAX < 0
  cmovs _EAX, _EBX                  ; conditionally move the corrected value
  mov [_EDX], ax                    ; coeff[0] = ax
  mov _EBX, [_ESP + 2*PTR_SIZE]     ; pop back the register value
  add _ESP, byte 4*PTR_SIZE         ; "quant_intra1 0" pushed _EBP, but we don't restore
                                    ; that one, just correct the stack offset by 4*PTR_SIZE
  psubw mm7, mm6                    ;D8
  movq [_EDX + 3 * 32 + 16], mm5    ;C9
  movq [_EDX + 3 * 32 + 24], mm7    ;D9

  xor _EAX, _EAX

%ifdef ARCH_IS_X86_64
  mov TMP0, [_ESP]
%ifndef WINDOWS
  add _ESP, 6*PTR_SIZE
%else
  add _ESP, 4*PTR_SIZE
%endif
  mov [_ESP], TMP0
%endif

  ret
ENDFUNC
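;-----------------------------------------------------------------------------
; NOTE (added): rough C model of one inter-quant step implemented by the
; quantinter macro below -- illustrative only, not part of the build; names
; are hypothetical.  The routine's return value is the sum of the absolute
; quantized levels accumulated in mm5 (summed at the end with pmaddwd).
;
;   static uint16_t quant_inter_ac_sketch(int16_t c, uint32_t q, uint32_t *sum)
;   {
;       uint16_t c_abs = (uint16_t)((c < 0) ? -c : c);
;       uint16_t dead  = (uint16_t)(q / 2);                       /* mmx_sub  */
;       uint16_t mag   = (c_abs > dead) ? (uint16_t)(c_abs - dead) : 0; /* psubusw */
;       uint16_t rcp   = (uint16_t)((1 << 16) / (2 * q) + 1);     /* mmx_div  */
;       uint16_t level = (uint16_t)(((uint32_t)mag * rcp) >> 16); /* pmulhw   */
;       *sum += level;                                            /* paddw mm5 */
;       return level;                 /* sign is restored afterwards (pxor/psubw) */
;   }
;-----------------------------------------------------------------------------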
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_3dne(int16_t * coeff,
;                                const int16_t * const data,
;                                const uint32_t quant,
;                                const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

;This is Athlon-optimized code (ca 90 clk per call)
;Optimized by Jaan, 30 Nov 2002

%macro quantinter 1
  movq mm1, [_EAX]              ;A2
  psraw mm3, 15                 ;B6
%if (%1)
  psubw mm2, mm6                ;C10
%endif
  psubw mm1, mm0                ;A3
  pmulhw mm4, mm7               ;B7
  movq mm6, [_ECX + %1*24+16]   ;C1
  pmaxsw mm1, mm0               ;A4
  paddw mm5, mm4                ;B8
%if (%1)
  movq [_EDX + %1*24+16-24], mm2    ;C11
%endif
  psubusw mm1, [_EBX]           ;A5 mm0 -= sub (unsigned, don't go < 0)
  pxor mm4, mm3                 ;B9
  movq mm2, [_EAX]              ;C2
  psraw mm0, 15                 ;A6
  psubw mm4, mm3                ;B10
  psubw mm2, mm6                ;C3
  pmulhw mm1, mm7               ;A7 mm0 = (mm0 / 2Q) >> 16
  movq mm3, [_ECX + %1*24+8]    ;B1
  pmaxsw mm2, mm6               ;C4
  paddw mm5, mm1                ;A8 sum += mm0
%if (%1)
  movq [_EDX + %1*24+8-24], mm4 ;B11
%else
  movq [_EDX + 120], mm4        ;B11
%endif
  psubusw mm2, [_EBX]           ;C5
  pxor mm1, mm0                 ;A9 mm0 *= sign(mm0)
  movq mm4, [_EAX]              ;B2
  psraw mm6, 15                 ;C6
  psubw mm1, mm0                ;A10 undisplace
  psubw mm4, mm3                ;B3
  pmulhw mm2, mm7               ;C7
  movq mm0, [_ECX + %1*24+24]   ;A1 mm0 = [1st]
  pmaxsw mm4, mm3               ;B4
  paddw mm5, mm2                ;C8
  movq [byte _EDX + %1*24], mm1 ;A11
  psubusw mm4, [_EBX]           ;B5
  pxor mm2, mm6                 ;C9
%endmacro

%macro quantinter1 1
  movq mm0, [byte _ECX + %1*16] ;mm0 = [1st]
  movq mm3, [_ECX + %1*16+8]    ;
  movq mm1, [_EAX]
  movq mm4, [_EAX]
  psubw mm1, mm0
  psubw mm4, mm3
  pmaxsw mm1, mm0
  pmaxsw mm4, mm3
  psubusw mm1, mm6              ; mm0 -= sub (unsigned, don't go < 0)
  psubusw mm4, mm6              ;
  psraw mm0, 15
  psraw mm3, 15
  psrlw mm1, 1                  ; mm0 = (mm0 / 2Q) >> 16
  psrlw mm4, 1                  ;
  paddw mm5, mm1                ; sum += mm0
  pxor mm1, mm0                 ; mm0 *= sign(mm0)
  paddw mm5, mm4
  pxor mm4, mm3                 ;
  psubw mm1, mm0                ; undisplace
  psubw mm4, mm3
  cmp _ESP, _ESP
  movq [byte _EDX + %1*16], mm1
  movq [_EDX + %1*16+8], mm4
%endmacro

ALIGN SECTION_ALIGN
cglobal quant_h263_inter_3dne
quant_h263_inter_3dne:

%ifdef ARCH_IS_X86_64
  mov TMP0, [_ESP]
  add _ESP, PTR_SIZE
%ifndef WINDOWS
  push prm6
  push prm5
%endif
  push prm4
  push prm3
  push prm2
  push prm1
  sub _ESP, PTR_SIZE
  mov [_ESP], TMP0
%endif

  mov _EDX, [_ESP + 1*PTR_SIZE] ; coeff
  mov _ECX, [_ESP + 2*PTR_SIZE] ; data
  mov _EAX, [_ESP + 3*PTR_SIZE] ; quant
  push _EBX

  pxor mm5, mm5                 ; sum
  nop
%ifdef ARCH_IS_X86_64
  lea _EBX, [mmx_div]
  movq mm7, [_EBX + _EAX * 8 - 8]
  lea _EBX, [mmx_sub]
  lea _EBX, [_EBX + _EAX * 8 - 8]
%else
  lea _EBX, [mmx_sub + _EAX * 8 - 8]  ; sub
  movq mm7, [mmx_div + _EAX * 8 - 8]  ; divider
%endif

  cmp al, 1
  lea _EAX, [mmzero]
  jz near .q1loop
  cmp _ESP, _ESP

ALIGN SECTION_ALIGN
  movq mm3, [_ECX + 120]        ;B1
  pxor mm4, mm4                 ;B2
  psubw mm4, mm3                ;B3
  movq mm0, [_ECX]              ;A1 mm0 = [1st]
  pmaxsw mm4, mm3               ;B4
  psubusw mm4, [_EBX]           ;B5

  quantinter 0
  quantinter 1
  quantinter 2
  quantinter 3
  quantinter 4

  psraw mm3, 15                 ;B6
  psubw mm2, mm6                ;C10
  pmulhw mm4, mm7               ;B7
  paddw mm5, mm4                ;B8
  pxor mm4, mm3                 ;B9
  psubw mm4, mm3                ;B10
  movq [_EDX + 4*24+16], mm2    ;C11
  pop _EBX
  movq [_EDX + 4*24+8], mm4     ;B11
  pmaddwd mm5, [plus_one]
  movq mm0, mm5
  punpckhdq mm5, mm5
  paddd mm0, mm5
  movd eax, mm0                 ; return sum

%ifdef ARCH_IS_X86_64
  mov TMP0, [_ESP]
%ifndef WINDOWS
  add _ESP, 6*PTR_SIZE
%else
  add _ESP, 4*PTR_SIZE
%endif
  mov [_ESP], TMP0
%endif

  ret

ALIGN SECTION_ALIGN
.q1loop:
  movq mm6, [byte _EBX]

  quantinter1 0
  quantinter1 1
  quantinter1 2
  quantinter1 3
  quantinter1 4
  quantinter1 5
  quantinter1 6
  quantinter1 7

  pmaddwd mm5, [plus_one]
  movq mm0, mm5
  psrlq mm5, 32
  paddd mm0, mm5
  movd eax, mm0                 ; return sum
  pop _EBX

%ifdef ARCH_IS_X86_64
  mov TMP0, [_ESP]
%ifndef WINDOWS
  add _ESP, 6*PTR_SIZE
%else
  add _ESP, 4*PTR_SIZE
%endif
  mov [_ESP], TMP0
%endif

  ret
ENDFUNC
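;-----------------------------------------------------------------------------
; NOTE (added): rough C model of the H.263 AC dequantization performed by the
; two routines below -- illustrative only, not part of the build; names are
; hypothetical.  mmx_mul supplies 2*quant, mmx_add the odd/even offset, and
; the result is saturated (SATURATE) to [-2048, 2047] via pminsw / mmx_2047.
;
;   static int16_t dequant_ac_sketch(int16_t c, uint32_t quant)
;   {
;       if (c == 0)
;           return 0;                                  /* pcmpeqw / pandn     */
;       int32_t add = (quant & 1) ? (int32_t)quant : (int32_t)quant - 1;
;       int32_t v   = 2 * (int32_t)quant * ((c < 0) ? -c : c) + add;
;       v = (c < 0) ? -v : v;
;       if (v < -2048) v = -2048;                      /* SATURATE range      */
;       if (v >  2047) v =  2047;
;       return (int16_t)v;
;   }
;-----------------------------------------------------------------------------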
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_3dne(int16_t *data,
;                                  const int16_t * const coeff,
;                                  const uint32_t quant,
;                                  const uint32_t dcscalar,
;                                  const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

  ; this is the same as dequant_inter_3dne, except that we're
  ; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)

;This is Athlon-optimized code (ca 106 clk per call)

%macro dequant 1
  movq mm1, [_ECX+%1*24]        ; c  = coeff[i]   ;A2
  psubw mm0, mm1                ;-c          ;A3 (1st dep)
%if (%1)
  paddw mm4, mm6                ;C11 mm6 free (4th+)
%endif
  pmaxsw mm0, mm1               ;|c|         ;A4 (2nd)
%if (%1)
  mov _EBP, _EBP
  pminsw mm4, [_EBX]            ;C12 saturates to +2047 (5th+) later
%endif
  movq mm6, [_ESI]              ;0           ;A5  mm6 in use
  pandn mm7, [_EAX]             ;B9 offset = isZero ? 0 : quant_add (2nd)
%if (%1)
  pxor mm5, mm4                 ;C13 (6th+) 1later
%endif
  movq mm4, [_ESI]              ;C1 ;0
  mov _ESP, _ESP
  pcmpeqw mm6, [_ECX+%1*24]     ;A6 (c ==0) ? -1 : 0 (1st)
ALIGN SECTION_ALIGN
  psraw mm1, 15                 ; sign(c)    ;A7 (2nd)
%if (%1)
  movq [_EDX+%1*24+16-24], mm5  ;C14 (7th) 2later
%endif
  paddw mm7, mm3                ;B10  offset +negate back (3rd)
  pmullw mm0, [_EDI]            ;*= 2Q       ;A8 (3rd+)
  paddw mm2, mm7                ;B11 mm7 free (4th+)
  lea _EBP, [byte _EBP]
  movq mm5, [_ECX+%1*24+16]     ;C2 ; c = coeff[i]
  psubw mm4, mm5                ;-c          ;C3 (1st dep)
  pandn mm6, [_EAX]             ;A9 offset = isZero ? 0 : quant_add (2nd)
  pminsw mm2, [_EBX]            ;B12 saturates to +2047 (5th+)
  pxor mm3, mm2                 ;B13 (6th+)
  movq mm2, [byte _ESI]         ;B1 ;0
%if (%1)
  movq [_EDX+%1*24+8-24], mm3   ;B14 (7th)
%else
  movq [_EDX+120], mm3
%endif
  pmaxsw mm4, mm5               ;|c|         ;C4 (2nd)
  paddw mm6, mm1                ;A10  offset +negate back (3rd)
  movq mm3, [_ECX+%1*24 + 8]    ;B2 ; c = coeff[i]
  psubw mm2, mm3                ;-c          ;B3 (1st dep)
  paddw mm0, mm6                ;A11 mm6 free (4th+)
  movq mm6, [byte _ESI]         ;0           ;C5  mm6 in use
  pcmpeqw mm6, [_ECX+%1*24+16]  ;C6 (c ==0) ? -1 : 0 (1st)
  pminsw mm0, [_EBX]            ;A12 saturates to +2047 (5th+)
  pmaxsw mm2, mm3               ;|c|         ;B4 (2nd)
  pxor mm1, mm0                 ;A13 (6th+)
  pmullw mm4, [_EDI]            ;*= 2Q       ;C8 (3rd+)
  psraw mm5, 15                 ; sign(c)    ;C7 (2nd)
  movq mm7, [byte _ESI]         ;0           ;B5 mm7 in use
  pcmpeqw mm7, [_ECX+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st)
%if (%1 < 4)
  movq mm0, [byte _ESI]         ;A1 ;0
%endif
  pandn mm6, [byte _EAX]        ;C9 offset = isZero ? 0 : quant_add (2nd)
  psraw mm3, 15                 ;sign(c)     ;B7 (2nd)
  movq [byte _EDX+%1*24], mm1   ;A14 (7th)
  paddw mm6, mm5                ;C10  offset +negate back (3rd)
  pmullw mm2, [_EDI]            ;*= 2Q       ;B8 (3rd+)
  mov _ESP, _ESP
%endmacro
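;-----------------------------------------------------------------------------
; NOTE (added): in the intra routine below the DC coefficient takes a scalar
; path interleaved with the MMX work; roughly the following, with the clamps
; done by cmovl/cmovg against int_2048/int2047 (illustrative only, names are
; hypothetical):
;
;   static int16_t dequant_dc_sketch(int16_t dc, int32_t dcscalar)
;   {
;       int32_t v = (int32_t)dc * dcscalar;            /* imul _EBP, dcscalar */
;       if (v < -2048) v = -2048;                      /* cmovl vs int_2048   */
;       if (v >  2047) v =  2047;                      /* cmovg vs int2047    */
;       return (int16_t)v;
;   }
;-----------------------------------------------------------------------------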
ALIGN SECTION_ALIGN
cglobal dequant_h263_intra_3dne
dequant_h263_intra_3dne:

%ifdef ARCH_IS_X86_64
  mov TMP0, [_ESP]
  add _ESP, PTR_SIZE
%ifndef WINDOWS
  push prm6
  push prm5
%endif
  push prm4
  push prm3
  push prm2
  push prm1
  sub _ESP, PTR_SIZE
  mov [_ESP], TMP0
%endif

  mov _ECX, [_ESP+ 2*PTR_SIZE]      ; coeff
  mov _EAX, [_ESP+ 3*PTR_SIZE]      ; quant
  pxor mm0, mm0
  pxor mm2, mm2
  push _EDI
  push _EBX
%ifdef ARCH_IS_X86_64
  lea _EDI, [mmx_mul]
  lea _EDI, [_EDI + _EAX*8 - 8]     ; 2*quant
%else
  lea _EDI, [mmx_mul + _EAX*8 - 8]  ; 2*quant
%endif
  push _EBP
  mov _EBX, mmx_2047
  XVID_MOVSX _EBP, word [_ECX]
%ifdef ARCH_IS_X86_64
  lea r9, [mmx_add]
  lea _EAX, [r9 + _EAX*8 - 8]       ; quant or quant-1
%else
  lea _EAX, [mmx_add + _EAX*8 - 8]  ; quant or quant-1
%endif
  push _ESI
  mov _ESI, mmzero
  pxor mm7, mm7
  movq mm3, [_ECX+120]              ;B2 ; c  = coeff[i]
  pcmpeqw mm7, [_ECX+120]           ;B6 (c ==0) ? -1 : 0 (1st)

  imul _EBP, [_ESP+(4+4)*PTR_SIZE]  ; dcscalar
  psubw mm2, mm3                    ;-c         ;B3 (1st dep)
  pmaxsw mm2, mm3                   ;|c|        ;B4 (2nd)
  pmullw mm2, [_EDI]                ;*= 2Q      ;B8 (3rd+)
  psraw mm3, 15                     ; sign(c)   ;B7 (2nd)
  mov _EDX, [_ESP+ (1+4)*PTR_SIZE]  ; data

ALIGN SECTION_ALIGN
  dequant 0

  cmp _EBP, -2048
  mov _ESP, _ESP

  dequant 1

  cmovl _EBP, [int_2048]
  nop

  dequant 2

  cmp _EBP, 2047
  mov _ESP, _ESP

  dequant 3

  cmovg _EBP, [int2047]
  nop

  dequant 4

  paddw mm4, mm6                ;C11 mm6 free (4th+)
  pminsw mm4, [_EBX]            ;C12 saturates to +2047 (5th+)
  pandn mm7, [_EAX]             ;B9 offset = isZero ? 0 : quant_add (2nd)
  mov _EAX, _EBP
  mov _ESI, [_ESP]
  mov _EBP, [_ESP+PTR_SIZE]
  pxor mm5, mm4                 ;C13 (6th+)
  paddw mm7, mm3                ;B10  offset +negate back (3rd)
  movq [_EDX+4*24+16], mm5      ;C14 (7th)
  paddw mm2, mm7                ;B11 mm7 free (4th+)
  pminsw mm2, [_EBX]            ;B12 saturates to +2047 (5th+)
  mov _EBX, [_ESP+2*PTR_SIZE]
  mov _EDI, [_ESP+3*PTR_SIZE]
  add _ESP, byte 4*PTR_SIZE
  pxor mm3, mm2                 ;B13 (6th+)
  movq [_EDX+4*24+8], mm3       ;B14 (7th)
  mov [_EDX], ax

  xor _EAX, _EAX

%ifdef ARCH_IS_X86_64
  mov TMP0, [_ESP]
%ifndef WINDOWS
  add _ESP, 6*PTR_SIZE
%else
  add _ESP, 4*PTR_SIZE
%endif
  mov [_ESP], TMP0
%endif

  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_3dne(int16_t * data,
;                                  const int16_t * const coeff,
;                                  const uint32_t quant,
;                                  const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

; this is the same as dequant_inter_3dne,
; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
; This is Athlon-optimized code (ca 100 clk per call)

ALIGN SECTION_ALIGN
cglobal dequant_h263_inter_3dne
dequant_h263_inter_3dne:

%ifdef ARCH_IS_X86_64
  mov TMP0, [_ESP]
  add _ESP, PTR_SIZE
%ifndef WINDOWS
  push prm6
  push prm5
%endif
  push prm4
  push prm3
  push prm2
  push prm1
  sub _ESP, PTR_SIZE
  mov [_ESP], TMP0
%endif

  mov _ECX, [_ESP+ 2*PTR_SIZE]      ; coeff
  mov _EAX, [_ESP+ 3*PTR_SIZE]      ; quant
  pxor mm0, mm0
  pxor mm2, mm2
  push _EDI
  push _EBX
  push _ESI
%ifdef ARCH_IS_X86_64
  lea _EDI, [mmx_mul]
  lea _EDI, [_EDI + _EAX*8 - 8]     ; 2*quant
%else
  lea _EDI, [mmx_mul + _EAX*8 - 8]  ; 2*quant
%endif
  mov _EBX, mmx_2047
  pxor mm7, mm7
  movq mm3, [_ECX+120]              ;B2 ; c = coeff[i]
  pcmpeqw mm7, [_ECX+120]           ;B6 (c ==0) ? -1 : 0 (1st)
%ifdef ARCH_IS_X86_64
  lea r9, [mmx_add]
  lea _EAX, [r9 + _EAX*8 - 8]       ; quant or quant-1
%else
  lea _EAX, [mmx_add + _EAX*8 - 8]  ; quant or quant-1
%endif
  psubw mm2, mm3                    ;-c         ;B3 (1st dep)
  mov _ESI, mmzero
  pmaxsw mm2, mm3                   ;|c|        ;B4 (2nd)
  pmullw mm2, [_EDI]                ;*= 2Q      ;B8 (3rd+)
  psraw mm3, 15                     ; sign(c)   ;B7 (2nd)
  mov _EDX, [_ESP+ (1+3)*PTR_SIZE]  ; data

ALIGN SECTION_ALIGN
  dequant 0
  dequant 1
  dequant 2
  dequant 3
  dequant 4

  paddw mm4, mm6                ;C11 mm6 free (4th+)
  pminsw mm4, [_EBX]            ;C12 saturates to +2047 (5th+)
  pandn mm7, [_EAX]             ;B9 offset = isZero ? 0 : quant_add (2nd)
  mov _ESI, [_ESP]
  pxor mm5, mm4                 ;C13 (6th+)
  paddw mm7, mm3                ;B10  offset +negate back (3rd)
  movq [_EDX+4*24+16], mm5      ;C14 (7th)
  paddw mm2, mm7                ;B11 mm7 free (4th+)
  pminsw mm2, [_EBX]            ;B12 saturates to +2047 (5th+)
  mov _EBX, [_ESP+PTR_SIZE]
  mov _EDI, [_ESP+2*PTR_SIZE]
  add _ESP, byte 3*PTR_SIZE
  pxor mm3, mm2                 ;B13 (6th+)
  movq [_EDX+4*24+8], mm3       ;B14 (7th)

  xor _EAX, _EAX

%ifdef ARCH_IS_X86_64
  mov TMP0, [_ESP]
%ifndef WINDOWS
  add _ESP, 6*PTR_SIZE
%else
  add _ESP, 4*PTR_SIZE
%endif
  mov [_ESP], TMP0
%endif

  ret
ENDFUNC

%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif