;/**************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - mmx quantization/dequantization -
; *
; *  Copyright(C) 2001-2003 XviD Team
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: quantize_3dne.asm,v 1.2.2.1 2003-07-16 22:59:20 edgomez Exp $
; *
; *************************************************************************/

; these 3dne functions are compatible with iSSE, but are optimized
; specifically for K7 pipelines
;
;------------------------------------------------------------------------------
; 09.12.2002  Athlon optimizations contributed by Jaan Kalda
;------------------------------------------------------------------------------

; enable dequant saturate [-2048,2047], test purposes only.
%define SATURATE

; data/text alignment
%define ALIGN 16

bits 32

%ifdef FORMAT_COFF
section .data data
%else
section .data data align=16
%endif

%macro cglobal 1
  %ifdef PREFIX
    global _%1
    %define %1 _%1
  %else
    global %1
  %endif
%endmacro

align 4
int_div dd 0
%assign i 1
%rep 255
  dd (1 << 16) / (i) + 1
%assign i i+1
%endrep

align 16
plus_one times 8 dw 1

;===========================================================================
;
; subtract by Q/2 table
;
;===========================================================================

%macro MMX_SUB 1
  times 4 dw %1 / 2
%endmacro

align 16
mmx_sub
%assign i 1
%rep 31
  times 4 dw i / 2
%assign i i+1
%endrep

;===========================================================================
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated separately
; (3dnow2 provides _pmulhuw_ which won't cause overflow)
;
;===========================================================================

align 16
mmx_div
%assign i 1
%rep 31
  times 4 dw (1 << 16) / (i * 2) + 1
%assign i i+1
%endrep
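
;---------------------------------------------------------------------------
; Reference note (comment only, never assembled): mmx_div stores fixed-point
; reciprocals so that pmulhw (multiply, keep the high 16 bits) can stand in
; for a division by 2*quant.  A minimal C sketch of the identity, assuming
; 12-bit-range inputs; div_by_2q is an illustrative name, not part of XviD:
;
;   #include <stdint.h>
;
;   static unsigned div_by_2q(unsigned x, unsigned q)  /* x <= 2048, q >= 2 */
;   {
;       uint32_t mult = (1u << 16) / (2 * q) + 1;   /* value in mmx_div[q-1] */
;       return (x * mult) >> 16;                    /* what pmulhw keeps     */
;   }
;
; The result never undershoots x/(2*q) and can exceed it by at most 1 for a
; few inputs near the top of the range, so the MMX quantisers may differ from
; an exact integer division by +/-1 on rare coefficients.  q == 1 is handled
; by the separate .q1loop paths (a plain shift), because (1<<16)/2 + 1 does
; not fit in a signed 16-bit pmulhw operand.
;---------------------------------------------------------------------------
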
;===========================================================================
;
; add by (odd(Q) ? Q : Q - 1) table
;
;===========================================================================

%macro MMX_ADD 1
  %if %1 % 2 != 0
    times 4 dw %1
  %else
    times 4 dw %1 - 1
  %endif
%endmacro

align 16
mmx_add
%assign i 1
%rep 31
  MMX_ADD i
%assign i i+1
%endrep

;===========================================================================
;
; multiply by 2Q table
;
;===========================================================================

%macro MMX_MUL 1
  times 4 dw %1 * 2
%endmacro

align 16
mmx_mul
%assign i 1
%rep 31
  times 4 dw i * 2
%assign i i+1
%endrep

;===========================================================================
;
; saturation limits
;
;===========================================================================

align 8
mmx_32768_minus_2048 times 4 dw (32768-2048)
mmx_32767_minus_2047 times 4 dw (32767-2047)

align 16
mmx_2047 times 4 dw 2047

align 8
mmzero dd 0, 0
int2047 dd 2047
int_2048 dd -2048

section .text

;===========================================================================
;
; void quant_intra_3dne(int16_t * coeff,
;                       const int16_t * const data,
;                       const uint32_t quant,
;                       const uint32_t dcscalar);
;
;===========================================================================

;This is Athlon-optimized code (ca 70 clk per call)
;Optimized by Jaan, 30 Nov 2002

%macro quant_intra1 1
  psubw mm1, mm0    ;A3
  psubw mm3, mm2    ;B3
%if (%1)
  psubw mm5, mm4    ;C8
  psubw mm7, mm6    ;D8
%endif
align 8
  db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16)  ;movq mm4, [ecx + %1 * 32 +16+32] ;C1
  pmaxsw mm1, mm0   ;A4
  db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24)  ;movq mm6, [ecx + %1 * 32 +24+32] ;D1
  pmaxsw mm3, mm2   ;B4
  psraw mm0, 15     ;A5
  psraw mm2, 15     ;B5
%if (%1)
  movq [edx + %1 * 32 + 16-32], mm5     ;C9
  movq [edx + %1 * 32 + 24-32], mm7     ;D9
%endif
  psrlw mm1, 1      ;A6
  psrlw mm3, 1      ;B6
  movq mm5, [ebx]   ;C2
  movq mm7, [ebx]   ;D2
  pxor mm1, mm0     ;A7
  pxor mm3, mm2     ;B7
  psubw mm5, mm4    ;C3
  psubw mm7, mm6    ;D3
  psubw mm1, mm0    ;A8
  psubw mm3, mm2    ;B8
%if (%1 == 0)
  push ebp
  movq mm0, [ecx + %1 * 32 +32]
%elif (%1 < 3)
  db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32)  ;movq mm0, [ecx + %1 * 32 +32] ;A1
%endif
  pmaxsw mm5, mm4   ;C4
%if (%1 < 3)
  db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
%else
  cmp esp, esp
%endif
  pmaxsw mm7, mm6   ;D4
  psraw mm4, 15     ;C5
  psraw mm6, 15     ;D5
  movq [byte edx + %1 * 32], mm1  ;A9
  movq [edx + %1 * 32+8], mm3     ;B9
  psrlw mm5, 1      ;C6
  psrlw mm7, 1      ;D6
%if (%1 < 3)
  movq mm1, [ebx]   ;A2
  movq mm3, [ebx]   ;B2
%endif
%if (%1 == 3)
  imul eax, [int_div+4*edi]
%endif
  pxor mm5, mm4     ;C7
  pxor mm7, mm6     ;D7
%endm

%macro quant_intra 1
; Rules for athlon:
; 1) schedule latencies
; 2) add/mul and load/store in 2:1 proportion
; 3) avoid splitting >3byte instructions over 8byte boundaries

  psubw mm1, mm0    ;A3
  psubw mm3, mm2    ;B3
%if (%1)
  psubw mm5, mm4    ;C8
  psubw mm7, mm6    ;D8
%endif
align 8
  db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16)  ;movq mm4, [ecx + %1 * 32 +16+32] ;C1
  pmaxsw mm1, mm0   ;A4
  db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24)  ;movq mm6, [ecx + %1 * 32 +24+32] ;D1
  pmaxsw mm3, mm2   ;B4
  psraw mm0, 15     ;A5
  psraw mm2, 15     ;B5
%if (%1)
  movq [edx + %1 * 32 + 16-32], mm5     ;C9
  movq [edx + %1 * 32 + 24-32], mm7     ;D9
%endif
  pmulhw mm1, [esi] ;A6
  pmulhw mm3, [esi] ;B6
  movq mm5, [ebx]   ;C2
  movq mm7, [ebx]   ;D2
  nop
  nop
  pxor mm1, mm0     ;A7
  pxor mm3, mm2     ;B7
  psubw mm5, mm4    ;C3
  psubw mm7, mm6    ;D3
  psubw mm1, mm0    ;A8
  psubw mm3, mm2    ;B8
%if (%1 < 3)
  db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32)  ;movq mm0, [ecx + %1 * 32 +32] ;A1
%endif
  pmaxsw mm5, mm4   ;C4
%if (%1 < 3)
  db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
%else
  cmp esp, esp
%endif
  pmaxsw mm7, mm6   ;D4
  psraw mm4, 15     ;C5
  psraw mm6, 15     ;D5
  movq [byte edx + %1 * 32], mm1  ;A9
  movq [edx + %1 * 32+8], mm3     ;B9
  pmulhw mm5, [esi] ;C6
  pmulhw mm7, [esi] ;D6
%if (%1 < 3)
  movq mm1, [ebx]   ;A2
  movq mm3, [ebx]   ;B2
%endif
%if (%1 == 0)
  push ebp
%elif (%1 < 3)
  nop
%endif
  nop
%if (%1 == 3)
  imul eax, [int_div+4*edi]
%endif
  pxor mm5, mm4     ;C7
  pxor mm7, mm6     ;D7
%endmacro
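
;---------------------------------------------------------------------------
; Reference sketch (comment only, never assembled): a rough scalar equivalent
; of quant_intra_3dne, assuming the plain H.263 quantiser.  quant_intra_c and
; all names below are illustrative, not taken from this file:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static void quant_intra_c(int16_t *coeff, const int16_t *data,
;                             uint32_t quant, uint32_t dcscalar)
;   {
;       int half_dc = (int) (dcscalar / 2);
;       int i;
;
;       /* DC: divide by dcscalar, rounded to nearest (imul with int_div) */
;       coeff[0] = (int16_t) ((data[0] + (data[0] >= 0 ? half_dc : -half_dc))
;                             / (int) dcscalar);
;
;       /* AC: |data| / (2*quant), sign restored (pmulhw with mmx_div,
;          or psrlw 1 in the quant == 1 loop) */
;       for (i = 1; i < 64; i++) {
;           int level = abs(data[i]) / (int) (2 * quant);
;           coeff[i] = (int16_t) (data[i] < 0 ? -level : level);
;       }
;   }
;
; Because of the reciprocal-multiply rounding described above, the MMX code
; may differ from this by +/-1 on rare coefficients.
;---------------------------------------------------------------------------
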
align ALIGN
cglobal quant_intra_3dne
quant_intra_3dne:
  mov eax, [esp + 12]             ; quant
  mov ecx, [esp + 8]              ; data
  mov edx, [esp + 4]              ; coeff
  cmp al, 1
  pxor mm1, mm1
  pxor mm3, mm3
  movq mm0, [ecx]                 ; mm0 = [1st]
  movq mm2, [ecx + 8]
  push esi
  lea esi, [mmx_div + eax*8 - 8]
  push ebx
  mov ebx, mmzero
  push edi
  jz near .q1loop

  quant_intra 0

  mov ebp, [esp + 16 + 16]        ; dcscalar
  ; NB -- there are 3 pushes in the function preamble and one more
  ; in "quant_intra 0", thus an added offset of 16 bytes
  movsx eax, word [byte ecx]      ; DC

  quant_intra 1

  mov edi, eax
  sar edi, 31                     ; sign(DC)
  shr ebp, byte 1                 ; ebp = dcscalar/2

  quant_intra 2

  sub eax, edi                    ; DC (+1)
  xor ebp, edi                    ; sign(DC) dcscalar /2  (-1)
  mov edi, [esp + 16 + 16]        ; dcscalar
  lea eax, [byte eax + ebp]       ; DC + sign(DC) dcscalar/2
  mov ebp, [byte esp]

  quant_intra 3

  psubw mm5, mm4                  ;C8
  mov esi, [esp + 12]             ; pop back the register value
  mov edi, [esp + 4]              ; pop back the register value
  sar eax, 16
  lea ebx, [byte eax + 1]         ; workaround for eax < 0
  cmovs eax, ebx                  ; conditionally move the corrected value
  mov [edx], ax                   ; coeff[0] = ax
  mov ebx, [esp + 8]              ; pop back the register value
  add esp, byte 16                ; "quant_intra 0" pushed ebp, but we don't
                                  ; restore that one, just correct the stack
                                  ; offset by 16
  psubw mm7, mm6                  ;D8
  movq [edx + 3 * 32 + 16], mm5   ;C9
  movq [edx + 3 * 32 + 24], mm7   ;D9

  ret

align 16
.q1loop

  quant_intra1 0

  mov ebp, [esp + 16 + 16]        ; dcscalar
  movsx eax, word [byte ecx]      ; DC

  quant_intra1 1

  mov edi, eax
  sar edi, 31                     ; sign(DC)
  shr ebp, byte 1                 ; ebp = dcscalar /2

  quant_intra1 2

  sub eax, edi                    ; DC (+1)
  xor ebp, edi                    ; sign(DC) dcscalar /2  (-1)
  mov edi, [esp + 16 + 16]        ; dcscalar
  lea eax, [byte eax + ebp]       ; DC + sign(DC) dcscalar /2
  mov ebp, [byte esp]

  quant_intra1 3

  psubw mm5, mm4                  ;C8
  mov esi, [dword esp + 12]       ; pop back the register value
  mov edi, [esp + 4]              ; pop back the register value
  sar eax, 16
  lea ebx, [byte eax + 1]         ; workaround for eax < 0
  cmovs eax, ebx                  ; conditionally move the corrected value
  mov [edx], ax                   ; coeff[0] = ax
  mov ebx, [esp + 8]              ; pop back the register value
  add esp, byte 16                ; "quant_intra1 0" pushed ebp, but we don't
                                  ; restore that one, just correct the stack
                                  ; offset by 16
  psubw mm7, mm6                  ;D8
  movq [edx + 3 * 32 + 16], mm5   ;C9
  movq [edx + 3 * 32 + 24], mm7   ;D9

  ret

;===========================================================================
;
; uint32_t quant_inter_3dne(int16_t * coeff,
;                           const int16_t * const data,
;                           const uint32_t quant);
;
;===========================================================================

;This is Athlon-optimized code (ca 90 clk per call)
;Optimized by Jaan, 30 Nov 2002
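
;---------------------------------------------------------------------------
; Reference sketch (comment only, never assembled): a rough scalar equivalent
; of quant_inter_3dne.  quant_inter_c and all names below are illustrative,
; not taken from this file:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static uint32_t quant_inter_c(int16_t *coeff, const int16_t *data,
;                                 uint32_t quant)
;   {
;       uint32_t sum = 0;
;       int i;
;
;       for (i = 0; i < 64; i++) {
;           /* dead-zone: subtract quant/2 (mmx_sub, psubusw clamps at 0),
;              then divide by 2*quant (pmulhw with mmx_div, or psrlw 1
;              when quant == 1) */
;           int level = (abs(data[i]) - (int) (quant / 2)) / (int) (2 * quant);
;           if (level < 0)
;               level = 0;
;           sum += (uint32_t) level;      /* accumulated in mm5 via plus_one */
;           coeff[i] = (int16_t) (data[i] < 0 ? -level : level);
;       }
;       return sum;                       /* returned in eax */
;   }
;---------------------------------------------------------------------------
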
%macro quantinter 1
  movq mm1, [eax]                 ;A2
  psraw mm3, 15                   ;B6
%if (%1)
  psubw mm2, mm6                  ;C10
%endif
  psubw mm1, mm0                  ;A3
  pmulhw mm4, mm7                 ;B7
  movq mm6, [ecx + %1*24+16]      ;C1
  pmaxsw mm1, mm0                 ;A4
  paddw mm5, mm4                  ;B8
%if (%1)
  movq [edx + %1*24+16-24], mm2   ;C11
%endif
  psubusw mm1, [ebx]              ;A5 mm0 -= sub (unsigned, don't go < 0)
  pxor mm4, mm3                   ;B9
  movq mm2, [eax]                 ;C2
  psraw mm0, 15                   ;A6
  psubw mm4, mm3                  ;B10
  psubw mm2, mm6                  ;C3
  pmulhw mm1, mm7                 ;A7 mm0 = (mm0 / 2Q) >> 16
  movq mm3, [ecx + %1*24+8]       ;B1
  pmaxsw mm2, mm6                 ;C4
  paddw mm5, mm1                  ;A8 sum += mm0
%if (%1)
  movq [edx + %1*24+8-24], mm4    ;B11
%else
  movq [edx + 120], mm4           ;B11
%endif
  psubusw mm2, [ebx]              ;C5
  pxor mm1, mm0                   ;A9 mm0 *= sign(mm0)
  movq mm4, [eax]                 ;B2
  psraw mm6, 15                   ;C6
  psubw mm1, mm0                  ;A10 undisplace
  psubw mm4, mm3                  ;B3
  pmulhw mm2, mm7                 ;C7
  movq mm0, [ecx + %1*24+24]      ;A1 mm0 = [1st]
  pmaxsw mm4, mm3                 ;B4
  paddw mm5, mm2                  ;C8
  movq [byte edx + %1*24], mm1    ;A11
  psubusw mm4, [ebx]              ;B5
  pxor mm2, mm6                   ;C9
%endmacro

%macro quantinter1 1
  movq mm0, [byte ecx + %1*16]    ;mm0 = [1st]
  movq mm3, [ecx + %1*16+8]       ;
  movq mm1, [eax]
  movq mm4, [eax]
  psubw mm1, mm0
  psubw mm4, mm3
  pmaxsw mm1, mm0
  pmaxsw mm4, mm3
  psubusw mm1, mm6                ; mm0 -= sub (unsigned, don't go < 0)
  psubusw mm4, mm6                ;
  psraw mm0, 15
  psraw mm3, 15
  psrlw mm1, 1                    ; mm0 = (mm0 / 2Q) >> 16
  psrlw mm4, 1                    ;
  paddw mm5, mm1                  ; sum += mm0
  pxor mm1, mm0                   ; mm0 *= sign(mm0)
  paddw mm5, mm4
  pxor mm4, mm3                   ;
  psubw mm1, mm0                  ; undisplace
  psubw mm4, mm3
  cmp esp, esp
  movq [byte edx + %1*16], mm1
  movq [edx + %1*16+8], mm4
%endmacro

align ALIGN
cglobal quant_inter_3dne
quant_inter_3dne:
  mov edx, [esp + 4]              ; coeff
  mov ecx, [esp + 8]              ; data
  mov eax, [esp + 12]             ; quant
  push ebx
  pxor mm5, mm5                   ; sum
  nop
  lea ebx, [mmx_sub + eax * 8 - 8]  ; sub
  movq mm7, [mmx_div + eax * 8 - 8] ; divider
  cmp al, 1
  lea eax, [mmzero]
  jz near .q1loop
  cmp esp, esp
align 8
  movq mm3, [ecx + 120]           ;B1
  pxor mm4, mm4                   ;B2
  psubw mm4, mm3                  ;B3
  movq mm0, [ecx]                 ;A1 mm0 = [1st]
  pmaxsw mm4, mm3                 ;B4
  psubusw mm4, [ebx]              ;B5

  quantinter 0
  quantinter 1
  quantinter 2
  quantinter 3
  quantinter 4

  psraw mm3, 15                   ;B6
  psubw mm2, mm6                  ;C10
  pmulhw mm4, mm7                 ;B7
  paddw mm5, mm4                  ;B8
  pxor mm4, mm3                   ;B9
  psubw mm4, mm3                  ;B10
  movq [edx + 4*24+16], mm2       ;C11
  pop ebx
  movq [edx + 4*24+8], mm4        ;B11
  pmaddwd mm5, [plus_one]
  movq mm0, mm5
  punpckhdq mm5, mm5
  paddd mm0, mm5
  movd eax, mm0                   ; return sum

  ret

align ALIGN
.q1loop
  movq mm6, [byte ebx]

  quantinter1 0
  quantinter1 1
  quantinter1 2
  quantinter1 3
  quantinter1 4
  quantinter1 5
  quantinter1 6
  quantinter1 7

  pmaddwd mm5, [plus_one]
  movq mm0, mm5
  psrlq mm5, 32
  paddd mm0, mm5
  movd eax, mm0                   ; return sum
  pop ebx

  ret

;===========================================================================
;
; void dequant_intra_3dne(int16_t *data,
;                         const int16_t * const coeff,
;                         const uint32_t quant,
;                         const uint32_t dcscalar);
;
;===========================================================================

; this is the same as dequant_inter_3dne, plus the DC coefficient / dcscalar
; handling; we're saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)

;This is Athlon-optimized code (ca 106 clk per call)
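
;---------------------------------------------------------------------------
; Reference sketch (comment only, never assembled): a rough scalar equivalent
; of dequant_intra_3dne, assuming coefficient magnitudes stay in the usual
; quantiser range (pmullw wraps at 16 bits, the C below does not).
; dequant_intra_c and all names below are illustrative, not from this file:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static void dequant_intra_c(int16_t *data, const int16_t *coeff,
;                               uint32_t quant, uint32_t dcscalar)
;   {
;       /* quant_add == mmx_add[quant-1], 2*quant == mmx_mul[quant-1] */
;       int quant_add = (quant & 1) ? (int) quant : (int) quant - 1;
;       int dc = coeff[0] * (int) dcscalar;
;       int i;
;
;       if (dc < -2048) dc = -2048;          /* cmovl with int_2048 */
;       if (dc >  2047) dc =  2047;          /* cmovg with int2047  */
;       data[0] = (int16_t) dc;
;
;       for (i = 1; i < 64; i++) {
;           int v = abs(coeff[i]);
;           if (v == 0) {
;               data[i] = 0;                 /* pcmpeqw/pandn drop the offset */
;           } else {
;               v = v * (int) (2 * quant) + quant_add;   /* pmullw + paddw   */
;               if (coeff[i] < 0)
;                   v = -v;
;               if (v < -2048) v = -2048;    /* pminsw before the sign is    */
;               if (v >  2047) v =  2047;    /* restored gives [-2048,2047]  */
;               data[i] = (int16_t) v;
;           }
;       }
;   }
;---------------------------------------------------------------------------
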
%macro dequant 1
  movq mm1, [ecx+%1*24]           ; c = coeff[i]  ;A2
  psubw mm0, mm1                  ;-c             ;A3 (1st dep)
%if (%1)
  paddw mm4, mm6                  ;C11 mm6 free (4th+)
%endif
  pmaxsw mm0, mm1                 ;|c|            ;A4 (2nd)
%if (%1)
  mov ebp, ebp
  pminsw mm4, [ebx]               ;C12 saturates to +2047 (5th+) later
%endif
  movq mm6, [esi]                 ;0              ;A5 mm6 in use
  pandn mm7, [eax]                ;B9 offset = isZero ? 0 : quant_add (2nd)
%if (%1)
  pxor mm5, mm4                   ;C13 (6th+) 1later
%endif
  movq mm4, [esi]                 ;C1 ;0
  mov esp, esp
  pcmpeqw mm6, [ecx+%1*24]        ;A6 (c == 0) ? -1 : 0 (1st)
align 4
  psraw mm1, 15                   ; sign(c)       ;A7 (2nd)
%if (%1)
  movq [edx+%1*24+16-24], mm5     ;C14 (7th) 2later
%endif
  paddw mm7, mm3                  ;B10 offset +negate back (3rd)
  pmullw mm0, [edi]               ;*= 2Q          ;A8 (3rd+)
  paddw mm2, mm7                  ;B11 mm7 free (4th+)
  lea ebp, [byte ebp]
  movq mm5, [ecx+%1*24+16]        ;C2 ; c = coeff[i]
  psubw mm4, mm5                  ;-c             ;C3 (1st dep)
  pandn mm6, [eax]                ;A9 offset = isZero ? 0 : quant_add (2nd)
  pminsw mm2, [ebx]               ;B12 saturates to +2047 (5th+)
  pxor mm3, mm2                   ;B13 (6th+)
  movq mm2, [byte esi]            ;B1 ;0
%if (%1)
  movq [edx+%1*24+8-24], mm3      ;B14 (7th)
%else
  movq [edx+120], mm3
%endif
  pmaxsw mm4, mm5                 ;|c|            ;C4 (2nd)
  paddw mm6, mm1                  ;A10 offset +negate back (3rd)
  movq mm3, [ecx+%1*24 + 8]       ;B2 ; c = coeff[i]
  psubw mm2, mm3                  ;-c             ;B3 (1st dep)
  paddw mm0, mm6                  ;A11 mm6 free (4th+)
  movq mm6, [byte esi]            ;0              ;C5 mm6 in use
  pcmpeqw mm6, [ecx+%1*24+16]     ;C6 (c == 0) ? -1 : 0 (1st)
  pminsw mm0, [ebx]               ;A12 saturates to +2047 (5th+)
  pmaxsw mm2, mm3                 ;|c|            ;B4 (2nd)
  pxor mm1, mm0                   ;A13 (6th+)
  pmullw mm4, [edi]               ;*= 2Q          ;C8 (3rd+)
  psraw mm5, 15                   ; sign(c)       ;C7 (2nd)
  movq mm7, [byte esi]            ;0              ;B5 mm7 in use
  pcmpeqw mm7, [ecx+%1*24 + 8]    ;B6 (c == 0) ? -1 : 0 (1st)
%if (%1 < 4)
  movq mm0, [byte esi]            ;A1 ;0
%endif
  pandn mm6, [byte eax]           ;C9 offset = isZero ? 0 : quant_add (2nd)
  psraw mm3, 15                   ;sign(c)        ;B7 (2nd)
  movq [byte edx+%1*24], mm1      ;A14 (7th)
  paddw mm6, mm5                  ;C10 offset +negate back (3rd)
  pmullw mm2, [edi]               ;*= 2Q          ;B8 (3rd+)
  mov esp, esp
%endmacro

align ALIGN
cglobal dequant_intra_3dne
dequant_intra_3dne:
  mov ecx, [esp+ 8]               ; coeff
  mov eax, [esp+12]               ; quant
  pxor mm0, mm0
  pxor mm2, mm2
  push edi
  push ebx
  lea edi, [mmx_mul + eax*8 - 8]  ; 2*quant
  push ebp
  mov ebx, mmx_2047
  movsx ebp, word [ecx]
  lea eax, [mmx_add + eax*8 - 8]  ; quant or quant-1
  push esi
  mov esi, mmzero
  pxor mm7, mm7
  movq mm3, [ecx+120]             ;B2 ; c = coeff[i]
  pcmpeqw mm7, [ecx+120]          ;B6 (c == 0) ? -1 : 0 (1st)
  imul ebp, [esp+16+16]           ; dcscalar
  psubw mm2, mm3                  ;-c             ;B3 (1st dep)
  pmaxsw mm2, mm3                 ;|c|            ;B4 (2nd)
  pmullw mm2, [edi]               ;*= 2Q          ;B8 (3rd+)
  psraw mm3, 15                   ; sign(c)       ;B7 (2nd)
  mov edx, [esp+ 4+16]            ; data

align 8
  dequant 0

  cmp ebp, -2048
  mov esp, esp

  dequant 1

  cmovl ebp, [int_2048]
  nop

  dequant 2

  cmp ebp, 2047
  mov esp, esp

  dequant 3

  cmovg ebp, [int2047]
  nop

  dequant 4

  paddw mm4, mm6                  ;C11 mm6 free (4th+)
  pminsw mm4, [ebx]               ;C12 saturates to +2047 (5th+)
  pandn mm7, [eax]                ;B9 offset = isZero ? 0 : quant_add (2nd)
  mov eax, ebp
  mov esi, [esp]
  mov ebp, [esp+4]
  pxor mm5, mm4                   ;C13 (6th+)
  paddw mm7, mm3                  ;B10 offset +negate back (3rd)
  movq [edx+4*24+16], mm5         ;C14 (7th)
  paddw mm2, mm7                  ;B11 mm7 free (4th+)
  pminsw mm2, [ebx]               ;B12 saturates to +2047 (5th+)
  mov ebx, [esp+8]
  mov edi, [esp+12]
  add esp, byte 16
  pxor mm3, mm2                   ;B13 (6th+)
  movq [edx+4*24+8], mm3          ;B14 (7th)
  mov [edx], ax

  ret

;===========================================================================
;
; void dequant_inter_3dne(int16_t * data,
;                         const int16_t * const coeff,
;                         const uint32_t quant);
;
;===========================================================================

; this is the same as dequant_intra_3dne, but without the DC coefficient /
; dcscalar handling; we're saturating using 'pminsw' (saves 2 cycles/loop)

;This is Athlon-optimized code (ca 100 clk per call)
;Optimized by Jaan, 30 Nov 2002
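
;---------------------------------------------------------------------------
; Both dequantisers are built from the 'dequant' macro above; per coefficient
; it computes the following, rendered here in plain C for readability (the
; MMX code does it without branches).  Comment only, never assembled;
; dequant_one is an illustrative name, not from this file:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static int16_t dequant_one(int16_t c, int quant_mul /* 2*quant */,
;                              int quant_add /* quant or quant-1 */)
;   {
;       int sign   = c >> 15;                        /* psraw mm, 15       */
;       int mag    = abs(c);                         /* psubw + pmaxsw     */
;       int offset = (c == 0) ? 0 : quant_add;       /* pcmpeqw + pandn    */
;       int r      = mag * quant_mul + offset + sign; /* pmullw, paddw x2  */
;       if (r > 2047)
;           r = 2047;                                /* pminsw saturation  */
;       return (int16_t) (r ^ sign);                 /* pxor restores sign */
;   }
;---------------------------------------------------------------------------
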
align ALIGN
cglobal dequant_inter_3dne
dequant_inter_3dne:
  mov ecx, [esp+ 8]               ; coeff
  mov eax, [esp+12]               ; quant
  pxor mm0, mm0
  pxor mm2, mm2
  push edi
  push ebx
  push esi
  lea edi, [mmx_mul + eax*8 - 8]  ; 2*quant
  mov ebx, mmx_2047
  pxor mm7, mm7
  movq mm3, [ecx+120]             ;B2 ; c = coeff[i]
  pcmpeqw mm7, [ecx+120]          ;B6 (c == 0) ? -1 : 0 (1st)
  lea eax, [mmx_add + eax*8 - 8]  ; quant or quant-1
  psubw mm2, mm3                  ;-c             ;B3 (1st dep)
  mov esi, mmzero
  pmaxsw mm2, mm3                 ;|c|            ;B4 (2nd)
  pmullw mm2, [edi]               ;*= 2Q          ;B8 (3rd+)
  psraw mm3, 15                   ; sign(c)       ;B7 (2nd)
  mov edx, [dword esp+ 4+12]      ; data

align 8
  dequant 0
  dequant 1
  dequant 2
  dequant 3
  dequant 4

  paddw mm4, mm6                  ;C11 mm6 free (4th+)
  pminsw mm4, [ebx]               ;C12 saturates to +2047 (5th+)
  pandn mm7, [eax]                ;B9 offset = isZero ? 0 : quant_add (2nd)
  mov esi, [esp]
  pxor mm5, mm4                   ;C13 (6th+)
  paddw mm7, mm3                  ;B10 offset +negate back (3rd)
  movq [edx+4*24+16], mm5         ;C14 (7th)
  paddw mm2, mm7                  ;B11 mm7 free (4th+)
  pminsw mm2, [ebx]               ;B12 saturates to +2047 (5th+)
  mov ebx, [esp+4]
  mov edi, [esp+8]
  add esp, byte 12
  pxor mm3, mm2                   ;B13 (6th+)
  movq [edx+4*24+8], mm3          ;B14 (7th)

  ret