;/**************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * mmx quantization/dequantization
; *
; * This program is an implementation of a part of one or more MPEG-4
; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
; * to use this software module in hardware or software products are
; * advised that its use may infringe existing patents or copyrights, and
; * any such use would be at such party's own risk. The original
; * developer of this software module and his/her company, and subsequent
; * editors and their companies, will have no liability for use of this
; * software or modifications or derivatives thereof.
; *
; * This program is free software; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; *************************************************************************/

; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines
;
;------------------------------------------------------------------------------
; 09.12.2002  Athlon optimizations contributed by Jaan Kalda
;------------------------------------------------------------------------------

; enable dequant saturate [-2048,2047], test purposes only.
%define SATURATE

; data/text alignment
%define ALIGN 16

bits 32

%ifdef FORMAT_COFF
section .data data
%else
section .data data align=16
%endif

%macro cglobal 1
  %ifdef PREFIX
    global _%1
    %define %1 _%1
  %else
    global %1
  %endif
%endmacro

align 4
int_div
dd 0
%assign i 1
%rep 255
  dd (1 << 16) / (i) + 1
%assign i i+1
%endrep

align 16
plus_one times 8 dw 1

;===========================================================================
;
; subtract by Q/2 table
;
;===========================================================================

%macro MMX_SUB 1
  times 4 dw %1 / 2
%endmacro

align 16
mmx_sub
%assign i 1
%rep 31
  times 4 dw i / 2
%assign i i+1
%endrep

;===========================================================================
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated separately
; (3dnow2 provides _pmulhuw_ which won't cause overflow)
;
;===========================================================================

align 16
mmx_div
%assign i 1
%rep 31
  times 4 dw (1 << 16) / (i * 2) + 1
%assign i i+1
%endrep
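; Worked example (illustrative only, not assembled into the table): for quant=5
; the mmx_div entry is (1 << 16) / 10 + 1 = 6554, so pmulhw(x, 6554) computes
; (x * 6554) >> 16, e.g. (1000 * 6554) >> 16 = 100 = 1000 / (2*5). For quant=1
; the entry would be (1 << 16) / 2 + 1 = 32769, which does not fit in a signed
; 16-bit word; that is why the q==1 code paths below divide with a 1-bit
; "psrlw" instead of using this table.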
;===========================================================================
;
; add by (odd(Q) ? Q : Q - 1) table
;
;===========================================================================

%macro MMX_ADD 1
  %if %1 % 2 != 0
  times 4 dw %1
  %else
  times 4 dw %1 - 1
  %endif
%endmacro

align 16
mmx_add
%assign i 1
%rep 31
  MMX_ADD i
%assign i i+1
%endrep

;===========================================================================
;
; multiply by 2Q table
;
;===========================================================================

%macro MMX_MUL 1
  times 4 dw %1 * 2
%endmacro

align 16
mmx_mul
%assign i 1
%rep 31
  times 4 dw i * 2
%assign i i+1
%endrep

;===========================================================================
;
; saturation limits
;
;===========================================================================

align 8
mmx_32768_minus_2048 times 4 dw (32768-2048)
mmx_32767_minus_2047 times 4 dw (32767-2047)

align 16
mmx_2047 times 4 dw 2047

align 8
mmzero dd 0, 0
int2047 dd 2047
int_2048 dd -2048

section .text

;===========================================================================
;
; void quant_intra_3dne(int16_t * coeff,
;                       const int16_t * const data,
;                       const uint32_t quant,
;                       const uint32_t dcscalar);
;
;===========================================================================

;This is Athlon-optimized code (ca 70 clk per call)
;Optimized by Jaan, 30 Nov 2002
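; Reference sketch of what this routine computes (plain C, illustrative only;
; names follow the prototype above, sign()/abs() are the usual helpers, and the
; divisions are implemented with the reciprocal tables above):
;
;   coeff[0] = (data[0] + sign(data[0]) * (dcscalar / 2)) / dcscalar;   /* rounded DC */
;   for (i = 1; i < 64; i++)
;       coeff[i] = sign(data[i]) * (abs(data[i]) / (2 * quant));
;
; The quant==1 case is handled by the quant_intra1 macro / .q1loop below, which
; divides with a 1-bit right shift instead of the pmulhw reciprocal table.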
%macro quant_intra1 1
  psubw mm1,mm0   ;A3
  psubw mm3,mm2   ;B3
%if (%1)
  psubw mm5, mm4  ;C8
  psubw mm7, mm6  ;D8
%endif
align 8
  db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16)  ;movq mm4, [ecx + %1 * 32 +16+32] ;C1
  pmaxsw mm1,mm0  ;A4
  db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24)  ;movq mm6, [ecx + %1 * 32 +24+32] ;D1
  pmaxsw mm3,mm2  ;B4
  psraw mm0,15    ;A5
  psraw mm2,15    ;B5
%if (%1)
  movq [edx + %1 * 32 + 16-32], mm5  ;C9
  movq [edx + %1 * 32 + 24-32], mm7  ;D9
%endif
  psrlw mm1, 1    ;A6
  psrlw mm3, 1    ;B6
  movq mm5, [ebx] ;C2
  movq mm7, [ebx] ;D2
  pxor mm1, mm0   ;A7
  pxor mm3, mm2   ;B7
  psubw mm5,mm4   ;C3
  psubw mm7,mm6   ;D3
  psubw mm1, mm0  ;A8
  psubw mm3, mm2  ;B8
%if (%1 == 0)
  push ebp
  movq mm0, [ecx + %1 * 32 +32]
%elif (%1 < 3)
  db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32)  ;movq mm0, [ecx + %1 * 32 +32] ;A1
%endif
  pmaxsw mm5,mm4  ;C4
%if (%1 < 3)
  db 0Fh, 6Fh, 54h, 21h, (%1 * 32 +8+32)  ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
%else
  cmp esp,esp
%endif
  pmaxsw mm7,mm6  ;D4
  psraw mm4,15    ;C5
  psraw mm6,15    ;D5
  movq [byte edx + %1 * 32], mm1  ;A9
  movq [edx + %1 * 32+8], mm3     ;B9
  psrlw mm5, 1    ;C6
  psrlw mm7, 1    ;D6
%if (%1 < 3)
  movq mm1, [ebx] ;A2
  movq mm3, [ebx] ;B2
%endif
%if (%1 == 3)
  imul eax,[int_div+4*edi]
%endif
  pxor mm5, mm4   ;C7
  pxor mm7, mm6   ;D7
%endmacro

%macro quant_intra 1
; rules for athlon: 1) schedule latencies, 2) add/mul and load/store in 2:1 proportion,
; 3) avoid splitting >3byte instructions over 8byte boundaries
  psubw mm1,mm0   ;A3
  psubw mm3,mm2   ;B3
%if (%1)
  psubw mm5, mm4  ;C8
  psubw mm7, mm6  ;D8
%endif
align 8
  db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16)  ;movq mm4, [ecx + %1 * 32 +16+32] ;C1
  pmaxsw mm1,mm0  ;A4
  db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24)  ;movq mm6, [ecx + %1 * 32 +24+32] ;D1
  pmaxsw mm3,mm2  ;B4
  psraw mm0,15    ;A5
  psraw mm2,15    ;B5
%if (%1)
  movq [edx + %1 * 32 + 16-32], mm5  ;C9
  movq [edx + %1 * 32 + 24-32], mm7  ;D9
%endif
  pmulhw mm1, [esi]  ;A6
  pmulhw mm3, [esi]  ;B6
  movq mm5, [ebx] ;C2
  movq mm7, [ebx] ;D2
  nop
  nop
  pxor mm1, mm0   ;A7
  pxor mm3, mm2   ;B7
  psubw mm5,mm4   ;C3
  psubw mm7,mm6   ;D3
  psubw mm1, mm0  ;A8
  psubw mm3, mm2  ;B8
%if (%1 < 3)
  db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32)  ;movq mm0, [ecx + %1 * 32 +32] ;A1
%endif
  pmaxsw mm5,mm4  ;C4
%if (%1 < 3)
  db 0Fh, 6Fh, 54h, 21h, (%1 * 32 +8+32)  ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
%else
  cmp esp,esp
%endif
  pmaxsw mm7,mm6  ;D4
  psraw mm4,15    ;C5
  psraw mm6,15    ;D5
  movq [byte edx + %1 * 32], mm1  ;A9
  movq [edx + %1 * 32+8], mm3     ;B9
  pmulhw mm5, [esi]  ;C6
  pmulhw mm7, [esi]  ;D6
%if (%1 < 3)
  movq mm1, [ebx] ;A2
  movq mm3, [ebx] ;B2
%endif
%if (%1 == 0)
  push ebp
%elif (%1 < 3)
  nop
%endif
  nop
%if (%1 == 3)
  imul eax,[int_div+4*edi]
%endif
  pxor mm5, mm4   ;C7
  pxor mm7, mm6   ;D7
%endmacro

align ALIGN
cglobal quant_intra_3dne
quant_intra_3dne:

  mov eax, [esp + 12]  ; quant
  mov ecx, [esp + 8]   ; data
  mov edx, [esp + 4]   ; coeff
  cmp al, 1
  pxor mm1,mm1
  pxor mm3,mm3
  movq mm0, [ecx]      ; mm0 = [1st]
  movq mm2, [ecx + 8]
  push esi
  lea esi, [mmx_div + eax * 8 - 8]
  push ebx
  mov ebx,mmzero
  push edi
  jz near .q1loop

  quant_intra 0
  mov ebp, [esp + 16 + 16]    ; dcscalar
  movsx eax, word [byte ecx]  ; x
  quant_intra 1
  mov edi,eax
  sar edi,31                  ; sign(x)
  shr ebp,byte 1              ; ebp = dcscalar /2
  quant_intra 2
  sub eax,edi                 ; x (+1)
  xor ebp,edi                 ; sign(x) dcscalar /2 (-1)
  mov edi,[esp + 16 + 16]
  lea eax,[byte eax+ebp]      ; x + sign(x) dcscalar /2
  mov ebp,[byte esp]
  quant_intra 3
  psubw mm5, mm4              ;C8
  mov esi,[esp+12]
  mov edi,[esp+4]
  mov ebx,[esp+8]
  add esp,byte 16
  sar eax,16
  mov [edx], ax               ; coeff[0] = ax
  psubw mm7, mm6              ;D8
  movq [edx + 3 * 32 + 16], mm5  ;C9
  movq [edx + 3 * 32 + 24], mm7  ;D9
  ret

align 16
.q1loop
  quant_intra1 0
  mov ebp, [esp + 16 + 16]    ; dcscalar
  movsx eax, word [byte ecx]  ; x
  quant_intra1 1
  mov edi,eax
  sar edi,31                  ; sign(x)
  shr ebp,byte 1              ; ebp = dcscalar /2
  quant_intra1 2
  sub eax,edi                 ; x (+1)
  xor ebp,edi                 ; sign(x) dcscalar /2 (-1)
  mov edi,[esp + 16 + 16]
  lea eax,[byte eax+ebp]      ; x + sign(x) dcscalar /2
  mov ebp,[byte esp]
  quant_intra1 3
  psubw mm5, mm4              ;C8
  mov esi,[dword esp+12]
  mov edi,[esp+4]
  mov ebx,[esp+8]
  add esp,byte 16
  sar eax,16
  mov [edx], ax               ; coeff[0] = ax
  psubw mm7, mm6              ;D8
  movq [edx + 3 * 32 + 16], mm5  ;C9
  movq [edx + 3 * 32 + 24], mm7  ;D9
  ret

;===========================================================================
;
; uint32_t quant_inter_3dne(int16_t * coeff,
;                           const int16_t * const data,
;                           const uint32_t quant);
;
;===========================================================================

;This is Athlon-optimized code (ca 90 clk per call)
;Optimized by Jaan, 30 Nov 2002
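; Reference sketch (plain C, illustrative only; level/mag/sum are our names,
; the rest follows the prototype above):
;
;   sum = 0;
;   for (i = 0; i < 64; i++) {
;       mag      = abs(data[i]) - quant / 2;   /* psubusw: clamped at 0 */
;       level    = mag / (2 * quant);
;       coeff[i] = sign(data[i]) * level;
;       sum     += level;
;   }
;   return sum;    /* sum of the quantized magnitudes */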
%macro quantinter 1
  movq mm1, [eax]   ;A2
  psraw mm3,15      ;B6
%if (%1)
  psubw mm2, mm6    ;C10
%endif
  psubw mm1,mm0     ;A3
  pmulhw mm4, mm7   ;B7
  movq mm6, [ecx + %1*24+16]  ;C1
  pmaxsw mm1,mm0    ;A4
  paddw mm5, mm4    ;B8
%if (%1)
  movq [edx + %1*24+16-24], mm2  ;C11
%endif
  psubusw mm1, [ebx]  ;A5 mm0 -= sub (unsigned, don't go < 0)
  pxor mm4, mm3     ;B9
  movq mm2, [eax]   ;C2
  psraw mm0,15      ;A6
  psubw mm4, mm3    ;B10
  psubw mm2,mm6     ;C3
  pmulhw mm1, mm7   ;A7 mm0 = (mm0 / 2Q) >> 16
  movq mm3, [ecx + %1*24+8]  ;B1
  pmaxsw mm2,mm6    ;C4
  paddw mm5, mm1    ;A8 sum += mm0
%if (%1)
  movq [edx + %1*24+8-24], mm4  ;B11
%else
  movq [edx + 120], mm4         ;B11
%endif
  psubusw mm2, [ebx]  ;C5
  pxor mm1, mm0     ;A9 mm0 *= sign(mm0)
  movq mm4, [eax]   ;B2
  psraw mm6,15      ;C6
  psubw mm1, mm0    ;A10 undisplace
  psubw mm4,mm3     ;B3
  pmulhw mm2, mm7   ;C7
  movq mm0, [ecx + %1*24+24]  ;A1 mm0 = [1st]
  pmaxsw mm4,mm3    ;B4
  paddw mm5, mm2    ;C8
  movq [byte edx + %1*24], mm1  ;A11
  psubusw mm4, [ebx]  ;B5
  pxor mm2, mm6     ;C9
%endmacro

%macro quantinter1 1
  movq mm0, [byte ecx + %1*16]  ; mm0 = [1st]
  movq mm3, [ecx + %1*16+8]     ;
  movq mm1, [eax]
  movq mm4, [eax]
  psubw mm1,mm0
  psubw mm4,mm3
  pmaxsw mm1,mm0
  pmaxsw mm4,mm3
  psubusw mm1, mm6  ; mm0 -= sub (unsigned, don't go < 0)
  psubusw mm4, mm6  ;
  psraw mm0,15
  psraw mm3,15
  psrlw mm1, 1      ; mm0 = (mm0 / 2Q) >> 16
  psrlw mm4, 1      ;
  paddw mm5, mm1    ; sum += mm0
  pxor mm1, mm0     ; mm0 *= sign(mm0)
  paddw mm5, mm4
  pxor mm4, mm3     ;
  psubw mm1, mm0    ; undisplace
  psubw mm4, mm3
  cmp esp,esp
  movq [byte edx + %1*16], mm1
  movq [edx + %1*16+8], mm4
%endmacro

align ALIGN
cglobal quant_inter_3dne
quant_inter_3dne:

  mov edx, [esp + 4]   ; coeff
  mov ecx, [esp + 8]   ; data
  mov eax, [esp + 12]  ; quant
  push ebx
  pxor mm5, mm5        ; sum
  nop
  lea ebx,[mmx_sub + eax * 8 - 8]    ; sub
  movq mm7, [mmx_div + eax * 8 - 8]  ; divider
  cmp al, 1
  lea eax,[mmzero]
  jz near .q1loop
  cmp esp,esp
align 8
  movq mm3, [ecx + 120]  ;B1
  pxor mm4,mm4           ;B2
  psubw mm4,mm3          ;B3
  movq mm0, [ecx]        ;A1 mm0 = [1st]
  pmaxsw mm4,mm3         ;B4
  psubusw mm4, [ebx]     ;B5

  quantinter 0
  quantinter 1
  quantinter 2
  quantinter 3
  quantinter 4

  psraw mm3,15           ;B6
  psubw mm2, mm6         ;C10
  pmulhw mm4, mm7        ;B7
  paddw mm5, mm4         ;B8
  pxor mm4, mm3          ;B9
  psubw mm4, mm3         ;B10
  movq [edx + 4*24+16], mm2  ;C11
  pop ebx
  movq [edx + 4*24+8], mm4   ;B11
  pmaddwd mm5, [plus_one]
  movq mm0, mm5
  punpckhdq mm5, mm5
  paddd mm0, mm5
  movd eax, mm0          ; return sum
  ret

align ALIGN
.q1loop
  movq mm6,[byte ebx]

  quantinter1 0
  quantinter1 1
  quantinter1 2
  quantinter1 3
  quantinter1 4
  quantinter1 5
  quantinter1 6
  quantinter1 7

  pmaddwd mm5, [plus_one]
  movq mm0, mm5
  psrlq mm5, 32
  paddd mm0, mm5
  movd eax, mm0          ; return sum
  pop ebx
  ret

;===========================================================================
;
; void dequant_intra_3dne(int16_t *data,
;                         const int16_t * const coeff,
;                         const uint32_t quant,
;                         const uint32_t dcscalar);
;
;===========================================================================

; this is the same as dequant_inter_3dne, except that the DC coefficient
; (coeff[0] * dcscalar) is dequantized and saturated separately;
; saturation uses 'pminsw' (saves 2 cycles/loop => ~5% faster)

;This is Athlon-optimized code (ca 106 clk per call)
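; Reference sketch of the dequant macro used by both dequant routines below
; (plain C, illustrative only; quant_add/mag are our names, MIN/CLAMP are the
; usual helper macros):
;
;   quant_add = (quant & 1) ? quant : quant - 1;        /* mmx_add table    */
;   for (i = 0; i < 64; i++) {
;       c       = coeff[i];
;       mag     = (c == 0) ? 0 : abs(c) * 2 * quant + quant_add;
;       data[i] = CLAMP(sign(c) * mag, -2048, 2047);    /* pminsw, mmx_2047 */
;   }
;
; dequant_intra_3dne then overwrites the DC coefficient:
;
;   data[0] = CLAMP(coeff[0] * dcscalar, -2048, 2047);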
%macro dequant 1
  movq mm1, [ecx+%1*24]  ;A2  ; c = coeff[i]
  psubw mm0,mm1          ;-c  ;A3 (1st dep)
%if (%1)
  paddw mm4,mm6          ; C11 mm6 free (4th+)
%endif
  pmaxsw mm0,mm1         ;|c| ;A4 (2nd)
%if (%1)
  mov ebp,ebp
  pminsw mm4,[ebx]       ; C12 saturates to +2047 (5th+) 1later
%endif
  movq mm6,[esi]         ;0   ;A5 mm6 in use
  pandn mm7,[eax]        ; B9 offset = isZero ? 0 : quant_add (2nd)
%if (%1)
  pxor mm5, mm4          ; C13 (6th+) 1later
%endif
  movq mm4,[esi]         ; C1 ;0
  mov esp,esp
  pcmpeqw mm6, [ecx+%1*24]  ;A6 (c == 0) ? -1 : 0 (1st)
align 4
  psraw mm1,15           ; sign(c) ;A7 (2nd)
%if (%1)
  movq [edx+%1*24+16-24], mm5  ; C14 (7th) 2later
%endif
  paddw mm7,mm3          ; B10 offset +negate back (3rd)
  pmullw mm0, [edi]      ;*= 2Q ;A8 (3rd+)
  paddw mm2,mm7          ; B11 mm7 free (4th+)
  lea ebp,[byte ebp]
  movq mm5, [ecx+%1*24+16]  ;C2 ; c = coeff[i]
  psubw mm4,mm5          ;-c  ;C3 (1st dep)
  pandn mm6,[eax]        ; A9 offset = isZero ? 0 : quant_add (2nd)
  pminsw mm2,[ebx]       ; B12 saturates to +2047 (5th+)
  pxor mm3, mm2          ; B13 (6th+)
  movq mm2,[byte esi]    ; B1 ;0
%if (%1)
  movq [edx+%1*24+8-24], mm3  ; B14 (7th)
%else
  movq [edx+120], mm3
%endif
  pmaxsw mm4,mm5         ;|c| ;C4 (2nd)
  paddw mm6,mm1          ; A10 offset +negate back (3rd)
  movq mm3, [ecx+%1*24 + 8]  ;B2 ; c = coeff[i]
  psubw mm2,mm3          ;-c  ;B3 (1st dep)
  paddw mm0,mm6          ; A11 mm6 free (4th+)
  movq mm6,[byte esi]    ;0   ;C5 mm6 in use
  pcmpeqw mm6, [ecx+%1*24+16]  ;C6 (c == 0) ? -1 : 0 (1st)
  pminsw mm0,[ebx]       ; A12 saturates to +2047 (5th+)
  pmaxsw mm2,mm3         ;|c| ;B4 (2nd)
  pxor mm1, mm0          ; A13 (6th+)
  pmullw mm4, [edi]      ;*= 2Q ;C8 (3rd+)
  psraw mm5,15           ; sign(c) ;C7 (2nd)
  movq mm7,[byte esi]    ;0   ;B5 mm7 in use
  pcmpeqw mm7, [ecx+%1*24 + 8]  ;B6 (c == 0) ? -1 : 0 (1st)
%if (%1 < 4)
  movq mm0,[byte esi]    ; A1 ;0
%endif
  pandn mm6,[byte eax]   ; C9 offset = isZero ? 0 : quant_add (2nd)
  psraw mm3,15           ; sign(c) ;B7 (2nd)
  movq [byte edx+%1*24], mm1  ; A14 (7th)
  paddw mm6,mm5          ; C10 offset +negate back (3rd)
  pmullw mm2, [edi]      ;*= 2Q ;B8 (3rd+)
  mov esp,esp
%endmacro

align ALIGN
cglobal dequant_intra_3dne
dequant_intra_3dne:

  mov ecx, [esp+ 8]  ; coeff
  mov eax, [esp+12]  ; quant
  pxor mm0,mm0
  pxor mm2,mm2
  push edi
  push ebx
  lea edi,[mmx_mul + eax*8 - 8]  ; 2*quant
  push ebp
  mov ebx,mmx_2047
  movsx ebp,word [ecx]
  lea eax,[mmx_add + eax*8 - 8]  ; quant or quant-1
  push esi
  mov esi,mmzero
  pxor mm7,mm7
  movq mm3, [ecx+120]       ;B2 ; c = coeff[i]
  pcmpeqw mm7, [ecx+120]    ;B6 (c == 0) ? -1 : 0 (1st)
  imul ebp,[esp+16+16]      ; dcscalar
  psubw mm2,mm3             ;-c  ;B3 (1st dep)
  pmaxsw mm2,mm3            ;|c| ;B4 (2nd)
  pmullw mm2, [edi]         ;*= 2Q ;B8 (3rd+)
  psraw mm3,15              ; sign(c) ;B7 (2nd)
  mov edx, [esp+ 4+16]      ; data

align 8
  dequant 0
  cmp ebp,-2048
  mov esp,esp
  dequant 1
  cmovl ebp,[int_2048]
  nop
  dequant 2
  cmp ebp,2047
  mov esp,esp
  dequant 3
  cmovg ebp,[int2047]
  nop
  dequant 4

  paddw mm4,mm6      ; C11 mm6 free (4th+)
  pminsw mm4,[ebx]   ; C12 saturates to +2047 (5th+)
  pandn mm7,[eax]    ; B9 offset = isZero ? 0 : quant_add (2nd)
  mov eax,ebp
  mov esi,[esp]
  mov ebp,[esp+4]
  pxor mm5, mm4      ; C13 (6th+)
  paddw mm7,mm3      ; B10 offset +negate back (3rd)
  movq [edx+4*24+16], mm5  ; C14 (7th)
  paddw mm2,mm7      ; B11 mm7 free (4th+)
  pminsw mm2,[ebx]   ; B12 saturates to +2047 (5th+)
  mov ebx,[esp+8]
  mov edi,[esp+12]
  add esp,byte 16
  pxor mm3, mm2      ; B13 (6th+)
  movq [edx+4*24+8], mm3   ; B14 (7th)
  mov [edx], ax      ; data[0] = saturated DC
  ret

;===========================================================================
;
; void dequant_inter_3dne(int16_t * data,
;                         const int16_t * const coeff,
;                         const uint32_t quant);
;
;===========================================================================

; this is the same as dequant_intra_3dne, except that there is no separately
; dequantized DC coefficient; saturation uses 'pminsw' (saves 2 cycles/loop)

;This is Athlon-optimized code (ca 100 clk per call)
;Optimized by Jaan, 30 Nov 2002

align ALIGN
cglobal dequant_inter_3dne
dequant_inter_3dne:

  mov ecx, [esp+ 8]  ; coeff
  mov eax, [esp+12]  ; quant
  pxor mm0,mm0
  pxor mm2,mm2
  push edi
  push ebx
  push esi
  lea edi,[mmx_mul + eax*8 - 8]  ; 2*quant
  mov ebx,mmx_2047
  pxor mm7,mm7
  movq mm3, [ecx+120]       ;B2 ; c = coeff[i]
  pcmpeqw mm7, [ecx+120]    ;B6 (c == 0) ? -1 : 0 (1st)
  lea eax,[mmx_add + eax*8 - 8]  ; quant or quant-1
  psubw mm2,mm3             ;-c  ;B3 (1st dep)
  mov esi,mmzero
  pmaxsw mm2,mm3            ;|c| ;B4 (2nd)
  pmullw mm2, [edi]         ;*= 2Q ;B8 (3rd+)
  psraw mm3,15              ; sign(c) ;B7 (2nd)
  mov edx, [dword esp+ 4+12]  ; data

align 8
  dequant 0
  dequant 1
  dequant 2
  dequant 3
  dequant 4

  paddw mm4,mm6      ; C11 mm6 free (4th+)
  pminsw mm4,[ebx]   ; C12 saturates to +2047 (5th+)
  pandn mm7,[eax]    ; B9 offset = isZero ? 0 : quant_add (2nd)
  mov esi,[esp]
  pxor mm5, mm4      ; C13 (6th+)
  paddw mm7,mm3      ; B10 offset +negate back (3rd)
  movq [edx+4*24+16], mm5  ; C14 (7th)
  paddw mm2,mm7      ; B11 mm7 free (4th+)
  pminsw mm2,[ebx]   ; B12 saturates to +2047 (5th+)
  mov ebx,[esp+4]
  mov edi,[esp+8]
  add esp,byte 12
  pxor mm3, mm2      ; B13 (6th+)
  movq [edx+4*24+8], mm3   ; B14 (7th)
  ret