--- branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize_3dne.asm 2003/02/21 14:49:29 886 +++ branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize_3dne.asm 2003/07/16 23:00:08 1089 @@ -1,30 +1,25 @@ ;/************************************************************************** ; * -; * XVID MPEG-4 VIDEO CODEC -; * mmx quantization/dequantization +; * XVID MPEG-4 VIDEO CODEC +; * - mmx quantization/dequantization - ; * -; * This program is an implementation of a part of one or more MPEG-4 -; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending -; * to use this software module in hardware or software products are -; * advised that its use may infringe existing patents or copyrights, and -; * any such use would be at such party's own risk. The original -; * developer of this software module and his/her company, and subsequent -; * editors and their companies, will have no liability for use of this -; * software or modifications or derivatives thereof. -; * -; * This program is free software; you can redistribute it and/or modify -; * it under the terms of the GNU General Public License as published by -; * the Free Software Foundation; either version 2 of the License, or -; * (at your option) any later version. +; * Copyright(C) 2001-2003 XviD Team ; * -; * This program is distributed in the hope that it will be useful, -; * but WITHOUT ANY WARRANTY; without even the implied warranty of -; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -; * GNU General Public License for more details. +; * This program is free software ; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation ; either version 2 of the License, or +; * (at your option) any later version. ; * -; * You should have received a copy of the GNU General Public License -; * along with this program; if not, write to the Free Software -; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +; * This program is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY ; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. 
+; * +; * You should have received a copy of the GNU General Public License +; * along with this program ; if not, write to the Free Software +; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +; * +; * $Id: quantize_3dne.asm,v 1.2.2.1 2003-07-16 22:59:20 edgomez Exp $ ; * ; *************************************************************************/ ; these 3dne functions are compatible with iSSE, but are optimized specifically for @@ -61,7 +56,7 @@ dd 0 %assign i 1 %rep 255 - dd (1 << 16) / ( i) + 1 + dd (1 << 16) / (i) + 1 %assign i i+1 %endrep @@ -182,148 +177,153 @@ ;This is Athlon-optimized code (ca 70 clk per call) ;Optimized by Jaan, 30 Nov 2002 - %macro quant_intra1 1 - psubw mm1,mm0 ;A3 - psubw mm3,mm2 ;B3 +%macro quant_intra1 1 + + psubw mm1, mm0 ;A3 + psubw mm3, mm2 ;B3 %if (%1) - psubw mm5, mm4 ;C8 - psubw mm7, mm6 ;D8 + psubw mm5, mm4 ;C8 + psubw mm7, mm6 ;D8 %endif align 8 - db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 - pmaxsw mm1,mm0 ;A4 - db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24);movq mm6, [ecx + %1 * 32 +24+32] ;D1 - pmaxsw mm3,mm2 ;B4 + db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 + pmaxsw mm1, mm0 ;A4 + db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 + pmaxsw mm3, mm2 ;B4 - psraw mm0,15 ;A5 - psraw mm2,15 ;B5 + psraw mm0, 15 ;A5 + psraw mm2, 15 ;B5 %if (%1) - movq [edx + %1 * 32 + 16-32], mm5 ;C9 - movq [edx + %1 * 32 + 24-32], mm7 ;D9 + movq [edx + %1 * 32 + 16-32], mm5 ;C9 + movq [edx + %1 * 32 + 24-32], mm7 ;D9 %endif - psrlw mm1, 1 ;A6 - psrlw mm3, 1 ;B6 - movq mm5, [ebx] ;C2 - movq mm7, [ebx] ;D2 - - pxor mm1, mm0 ;A7 - pxor mm3, mm2 ;B7 - - psubw mm5,mm4 ;C3 - psubw mm7,mm6 ;D3 - psubw mm1, mm0 ;A8 - psubw mm3, mm2 ;B8 + psrlw mm1, 1 ;A6 + psrlw mm3, 1 ;B6 + movq mm5, [ebx] ;C2 + movq mm7, [ebx] ;D2 + + pxor mm1, mm0 ;A7 + pxor mm3, mm2 ;B7 + + psubw mm5, mm4 ;C3 + psubw mm7, mm6 ;D3 + psubw mm1, mm0 ;A8 + psubw mm3, mm2 ;B8 %if (%1 == 0) - push ebp - movq mm0, [ecx + %1 * 32 +32] + push ebp + movq mm0, [ecx + %1 * 32 +32] %elif (%1 < 3) - db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 + db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 %endif - pmaxsw mm5,mm4 ;C4 + pmaxsw mm5, mm4 ;C4 %if (%1 < 3) - db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 + db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 %else - cmp esp,esp + cmp esp, esp %endif - pmaxsw mm7,mm6 ;D4 + pmaxsw mm7, mm6 ;D4 - psraw mm4,15 ;C5 - psraw mm6,15 ;D5 - movq [byte edx + %1 * 32], mm1 ;A9 - movq [edx + %1 * 32+8], mm3 ;B9 + psraw mm4, 15 ;C5 + psraw mm6, 15 ;D5 + movq [byte edx + %1 * 32], mm1 ;A9 + movq [edx + %1 * 32+8], mm3 ;B9 - psrlw mm5, 1 ;C6 - psrlw mm7, 1 ;D6 + psrlw mm5, 1 ;C6 + psrlw mm7, 1 ;D6 %if (%1 < 3) - movq mm1, [ebx] ;A2 - movq mm3, [ebx] ;B2 + movq mm1, [ebx] ;A2 + movq mm3, [ebx] ;B2 %endif %if (%1 == 3) - imul eax,[int_div+4*edi] + imul eax, [int_div+4*edi] %endif - pxor mm5, mm4 ;C7 - pxor mm7, mm6 ;D7 + pxor mm5, mm4 ;C7 + pxor mm7, mm6 ;D7 %endm -%macro quant_intra 1 ;rules for athlon: 1) schedule latencies, 2) add/mul and load/store in 2:1 proportion, - ; 3) avoid spliting >3byte instructions over 8byte boundaries - psubw mm1,mm0 ;A3 - psubw mm3,mm2 ;B3 +%macro quant_intra 1 + ; Rules for athlon: + ; 1) schedule latencies + ; 2) add/mul and load/store in 2:1 proportion + ; 3) avoid spliting >3byte instructions over 8byte boundaries + + psubw mm1, mm0 ;A3 + psubw 
mm3, mm2 ;B3 %if (%1) - psubw mm5, mm4 ;C8 - psubw mm7, mm6 ;D8 + psubw mm5, mm4 ;C8 + psubw mm7, mm6 ;D8 %endif align 8 - db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 - pmaxsw mm1,mm0 ;A4 - db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24);movq mm6, [ecx + %1 * 32 +24+32] ;D1 - pmaxsw mm3,mm2 ;B4 + db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 + pmaxsw mm1, mm0 ;A4 + db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 + pmaxsw mm3, mm2 ;B4 - psraw mm0,15 ;A5 - psraw mm2,15 ;B5 + psraw mm0, 15 ;A5 + psraw mm2, 15 ;B5 %if (%1) - movq [edx + %1 * 32 + 16-32], mm5 ;C9 - movq [edx + %1 * 32 + 24-32], mm7 ;D9 + movq [edx + %1 * 32 + 16-32], mm5 ;C9 + movq [edx + %1 * 32 + 24-32], mm7 ;D9 %endif - pmulhw mm1, [esi] ;A6 - pmulhw mm3, [esi] ;B6 - movq mm5, [ebx] ;C2 - movq mm7, [ebx] ;D2 - - nop - nop - pxor mm1, mm0 ;A7 - pxor mm3, mm2 ;B7 - - psubw mm5,mm4 ;C3 - psubw mm7,mm6 ;D3 - psubw mm1, mm0 ;A8 - psubw mm3, mm2 ;B8 + pmulhw mm1, [esi] ;A6 + pmulhw mm3, [esi] ;B6 + movq mm5, [ebx] ;C2 + movq mm7, [ebx] ;D2 + + nop + nop + pxor mm1, mm0 ;A7 + pxor mm3, mm2 ;B7 + + psubw mm5, mm4 ;C3 + psubw mm7, mm6 ;D3 + psubw mm1, mm0 ;A8 + psubw mm3, mm2 ;B8 %if (%1 < 3) - db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 + db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 %endif - pmaxsw mm5,mm4 ;C4 + pmaxsw mm5, mm4 ;C4 %if (%1 < 3) - db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 + db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 %else - cmp esp,esp + cmp esp, esp %endif - pmaxsw mm7,mm6 ;D4 + pmaxsw mm7,mm6 ;D4 - psraw mm4,15 ;C5 - psraw mm6,15 ;D5 - movq [byte edx + %1 * 32], mm1 ;A9 - movq [edx + %1 * 32+8], mm3 ;B9 + psraw mm4, 15 ;C5 + psraw mm6, 15 ;D5 + movq [byte edx + %1 * 32], mm1 ;A9 + movq [edx + %1 * 32+8], mm3 ;B9 - pmulhw mm5, [esi] ;C6 - pmulhw mm7, [esi] ;D6 + pmulhw mm5, [esi] ;C6 + pmulhw mm7, [esi] ;D6 %if (%1 < 3) - movq mm1, [ebx] ;A2 - movq mm3, [ebx] ;B2 + movq mm1, [ebx] ;A2 + movq mm3, [ebx] ;B2 %endif %if (%1 == 0) - push ebp + push ebp %elif (%1 < 3) - nop + nop %endif - nop + nop %if (%1 == 3) - imul eax,[int_div+4*edi] + imul eax, [int_div+4*edi] %endif - pxor mm5, mm4 ;C7 - pxor mm7, mm6 ;D7 + pxor mm5, mm4 ;C7 + pxor mm7, mm6 ;D7 %endmacro @@ -331,73 +331,90 @@ cglobal quant_intra_3dne quant_intra_3dne: - mov eax, [esp + 12] ; quant - mov ecx, [esp + 8] ; data - mov edx, [esp + 4] ; coeff - cmp al, 1 - pxor mm1,mm1 - pxor mm3,mm3 - movq mm0, [ecx ] ; mm0 = [1st] - movq mm2, [ecx +8] - push esi - lea esi, [mmx_div + eax * 8 - 8] - - push ebx - mov ebx,mmzero - push edi - jz near .q1loop + mov eax, [esp + 12] ; quant + mov ecx, [esp + 8] ; data + mov edx, [esp + 4] ; coeff + cmp al, 1 + pxor mm1, mm1 + pxor mm3, mm3 + movq mm0, [ecx] ; mm0 = [1st] + movq mm2, [ecx + 8] + push esi + lea esi, [mmx_div + eax*8 - 8] + + push ebx + mov ebx, mmzero + push edi + jz near .q1loop + quant_intra 0 -mov ebp, [esp + 16 + 16] ; dcscalar -movsx eax, word [byte ecx] ;x + mov ebp, [esp + 16 + 16] ; dcscalar + ; NB -- there are 3 pushes in the function preambule and one more + ; in "quant_intra 0", thus an added offset of 16 bytes + movsx eax, word [byte ecx] ; DC + quant_intra 1 -mov edi,eax -sar edi,31 ;sign(x) -shr ebp,byte 1 ; ebp = dcscalar /2 + mov edi, eax + sar edi, 31 ; sign(DC) + shr ebp, byte 1 ; ebp = dcscalar/2 + quant_intra 2 -sub eax,edi ; x (+1) -xor ebp,edi ;sign(x) dcscalar /2 (-1) -mov edi,[esp + 16 + 
16] -lea eax,[byte eax+ebp] ;x + sign(x) dcscalar /2 -mov ebp,[byte esp] + sub eax, edi ; DC (+1) + xor ebp, edi ; sign(DC) dcscalar /2 (-1) + mov edi, [esp + 16 + 16] ; dscalar + lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2 + mov ebp, [byte esp] + quant_intra 3 - psubw mm5, mm4 ;C8 - mov esi,[esp+12] - mov edi,[esp+4] - mov ebx,[esp+8] - add esp,byte 16 - sar eax,16 - mov [edx], ax ; coeff[0] = ax - psubw mm7, mm6 ;D8 - movq [edx + 3 * 32 + 16], mm5 ;C9 - movq [edx + 3 * 32 + 24], mm7 ;D9 - ret -align 16 + psubw mm5, mm4 ;C8 + mov esi, [esp + 12] ; pop back the register value + mov edi, [esp + 4] ; pop back the register value + sar eax, 16 + lea ebx, [byte eax + 1] ; workaround for eax < 0 + cmovs eax, ebx ; conditionnaly move the corrected value + mov [edx], ax ; coeff[0] = ax + mov ebx, [esp + 8] ; pop back the register value + add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 + psubw mm7, mm6 ;D8 + movq [edx + 3 * 32 + 16], mm5 ;C9 + movq [edx + 3 * 32 + 24], mm7 ;D9 + + ret + + align 16 + .q1loop quant_intra1 0 -mov ebp, [esp + 16 + 16] ; dcscalar -movsx eax, word [byte ecx] ;x + mov ebp, [esp + 16 + 16] ; dcscalar + movsx eax, word [byte ecx] ; DC + quant_intra1 1 -mov edi,eax -sar edi,31 ;sign(x) -shr ebp,byte 1 ; ebp = dcscalar /2 + mov edi, eax + sar edi, 31 ; sign(DC) + shr ebp, byte 1 ; ebp = dcscalar /2 + quant_intra1 2 -sub eax,edi ; x (+1) -xor ebp,edi ;sign(x) dcscalar /2 (-1) -mov edi,[esp + 16 + 16] -lea eax,[byte eax+ebp] ;x + sign(x) dcscalar /2 -mov ebp,[byte esp] + sub eax, edi ; DC (+1) + xor ebp, edi ; sign(DC) dcscalar /2 (-1) + mov edi, [esp + 16 + 16] ; dcscalar + lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2 + mov ebp, [byte esp] + quant_intra1 3 - psubw mm5, mm4 ;C8 - mov esi,[dword esp+12] - mov edi,[esp+4] - mov ebx,[esp+8] - add esp,byte 16 - sar eax,16 - mov [edx], ax ; coeff[0] = ax - psubw mm7, mm6 ;D8 - movq [edx + 3 * 32 + 16], mm5 ;C9 - movq [edx + 3 * 32 + 24], mm7 ;D9 - ret + psubw mm5, mm4 ;C8 + mov esi, [dword esp + 12] ; pop back the register value + mov edi, [esp + 4] ; pop back the register value + sar eax, 16 + lea ebx, [byte eax + 1] ; workaround for eax < 0 + cmovs eax, ebx ; conditionnaly move the corrected value + mov [edx], ax ; coeff[0] = ax + mov ebx, [esp + 8] ; pop back the register value + add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 + psubw mm7, mm6 ;D8 + movq [edx + 3 * 32 + 16], mm5 ;C9 + movq [edx + 3 * 32 + 24], mm7 ;D9 + + ret @@ -414,143 +431,145 @@ %macro quantinter 1 - movq mm1, [eax] ;A2 - psraw mm3,15 ;B6 + movq mm1, [eax] ;A2 + psraw mm3, 15 ;B6 %if (%1) - psubw mm2, mm6 ;C10 + psubw mm2, mm6 ;C10 %endif - psubw mm1,mm0 ;A3 - pmulhw mm4, mm7 ; B7 - movq mm6, [ecx + %1*24+16] ; C1 - pmaxsw mm1,mm0 ;A4 - paddw mm5, mm4 ;B8 + psubw mm1, mm0 ;A3 + pmulhw mm4, mm7 ;B7 + movq mm6, [ecx + %1*24+16] ;C1 + pmaxsw mm1, mm0 ;A4 + paddw mm5, mm4 ;B8 %if (%1) - movq [edx + %1*24+16-24], mm2 ;C11 + movq [edx + %1*24+16-24], mm2 ;C11 %endif - psubusw mm1, [ebx] ; A5 mm0 -= sub (unsigned, dont go < 0) - pxor mm4, mm3 ;B9 - movq mm2, [eax] ;C2 - psraw mm0,15 ;A6 - psubw mm4, mm3 ;B10 - psubw mm2,mm6 ;C3 - pmulhw mm1, mm7 ; A7 mm0 = (mm0 / 2Q) >> 24 - movq mm3, [ecx + %1*24+8] ; B1 - pmaxsw mm2,mm6 ;C4 - paddw mm5, mm1 ; A8 sum += mm0 + psubusw mm1, [ebx] ;A5 mm0 -= sub (unsigned, dont go < 0) + pxor mm4, mm3 ;B9 + movq mm2, [eax] ;C2 + psraw mm0, 15 ;A6 + psubw mm4, mm3 ;B10 + psubw mm2, mm6 
;C3 + pmulhw mm1, mm7 ;A7 mm0 = (mm0 / 2Q) >> 24 + movq mm3, [ecx + %1*24+8] ;B1 + pmaxsw mm2, mm6 ;C4 + paddw mm5, mm1 ;A8 sum += mm0 %if (%1) - movq [edx + %1*24+8-24], mm4 ;B11 + movq [edx + %1*24+8-24], mm4 ;B11 %else - movq [edx + 120], mm4 ;B11 + movq [edx + 120], mm4 ;B11 %endif - psubusw mm2, [ebx] ;C5 - pxor mm1, mm0 ; A9 mm0 *= sign(mm0) - movq mm4, [eax] ;B2 - psraw mm6,15 ;C6 - psubw mm1, mm0 ;A10 undisplace - psubw mm4,mm3 ;B3 - pmulhw mm2, mm7 ; C7 - movq mm0, [ecx + %1*24+24] ;A1 mm0 = [1st] - pmaxsw mm4,mm3 ;B4 - paddw mm5, mm2 ;C8 - movq [byte edx + %1*24], mm1 ;A11 - psubusw mm4, [ebx] ;B5 - pxor mm2, mm6 ;C9 + psubusw mm2, [ebx] ;C5 + pxor mm1, mm0 ;A9 mm0 *= sign(mm0) + movq mm4, [eax] ;B2 + psraw mm6, 15 ;C6 + psubw mm1, mm0 ;A10 undisplace + psubw mm4, mm3 ;B3 + pmulhw mm2, mm7 ;C7 + movq mm0, [ecx + %1*24+24] ;A1 mm0 = [1st] + pmaxsw mm4, mm3 ;B4 + paddw mm5, mm2 ;C8 + movq [byte edx + %1*24], mm1 ;A11 + psubusw mm4, [ebx] ;B5 + pxor mm2, mm6 ;C9 %endmacro %macro quantinter1 1 - movq mm0, [byte ecx + %1*16] ; mm0 = [1st] - movq mm3, [ecx + %1*16+8] ; - movq mm1, [eax] - movq mm4, [eax] - psubw mm1,mm0 - psubw mm4,mm3 - pmaxsw mm1,mm0 - pmaxsw mm4,mm3 - psubusw mm1, mm6 ; mm0 -= sub (unsigned, dont go < 0) - psubusw mm4, mm6 ; - psraw mm0,15 - psraw mm3,15 - psrlw mm1, 1 ; mm0 = (mm0 / 2Q) >> 16 - psrlw mm4, 1 ; - paddw mm5, mm1 ; sum += mm0 - pxor mm1, mm0 ; mm0 *= sign(mm0) - paddw mm5, mm4 - pxor mm4, mm3 ; - psubw mm1, mm0 ; undisplace - psubw mm4, mm3 - cmp esp,esp - movq [byte edx + %1*16], mm1 - movq [edx + %1*16+8], mm4 + movq mm0, [byte ecx + %1*16] ;mm0 = [1st] + movq mm3, [ecx + %1*16+8] ; + movq mm1, [eax] + movq mm4, [eax] + psubw mm1, mm0 + psubw mm4, mm3 + pmaxsw mm1, mm0 + pmaxsw mm4, mm3 + psubusw mm1, mm6 ; mm0 -= sub (unsigned, dont go < 0) + psubusw mm4, mm6 ; + psraw mm0, 15 + psraw mm3, 15 + psrlw mm1, 1 ; mm0 = (mm0 / 2Q) >> 16 + psrlw mm4, 1 ; + paddw mm5, mm1 ; sum += mm0 + pxor mm1, mm0 ; mm0 *= sign(mm0) + paddw mm5, mm4 + pxor mm4, mm3 ; + psubw mm1, mm0 ; undisplace + psubw mm4, mm3 + cmp esp, esp + movq [byte edx + %1*16], mm1 + movq [edx + %1*16+8], mm4 %endmacro align ALIGN cglobal quant_inter_3dne - quant_inter_3dne - - mov edx, [esp + 4] ; coeff - mov ecx, [esp + 8] ; data - mov eax, [esp + 12] ; quant - push ebx - - pxor mm5, mm5 ; sum - nop - lea ebx,[mmx_sub + eax * 8 - 8] ; sub - movq mm7, [mmx_div + eax * 8 - 8] ; divider - - cmp al, 1 - lea eax,[mmzero] - jz near .q1loop - cmp esp,esp +quant_inter_3dne + mov edx, [esp + 4] ; coeff + mov ecx, [esp + 8] ; data + mov eax, [esp + 12] ; quant + push ebx + + pxor mm5, mm5 ; sum + nop + lea ebx,[mmx_sub + eax * 8 - 8] ; sub + movq mm7, [mmx_div + eax * 8 - 8] ; divider + + cmp al, 1 + lea eax, [mmzero] + jz near .q1loop + cmp esp, esp align 8 - movq mm3, [ecx + 120] ; B1 - pxor mm4,mm4 ;B2 - psubw mm4,mm3 ;B3 - movq mm0, [ecx] ;A1 mm0 = [1st] - pmaxsw mm4,mm3 ;B4 - psubusw mm4, [ebx] ;B5 - - quantinter 0 - quantinter 1 - quantinter 2 - quantinter 3 - quantinter 4 - psraw mm3,15 ;B6 - psubw mm2, mm6 ;C10 - pmulhw mm4, mm7 ; B7 - paddw mm5, mm4 ;B8 - pxor mm4, mm3 ;B9 - psubw mm4, mm3 ;B10 - movq [edx + 4*24+16], mm2 ;C11 - pop ebx - movq [edx + 4*24+8], mm4 ;B11 - pmaddwd mm5, [plus_one] - movq mm0, mm5 - punpckhdq mm5, mm5 - paddd mm0, mm5 - movd eax, mm0 ; return sum - ret + movq mm3, [ecx + 120] ;B1 + pxor mm4, mm4 ;B2 + psubw mm4, mm3 ;B3 + movq mm0, [ecx] ;A1 mm0 = [1st] + pmaxsw mm4, mm3 ;B4 + psubusw mm4, [ebx] ;B5 + + quantinter 0 + quantinter 1 + quantinter 2 + quantinter 3 + 
quantinter 4 + + psraw mm3, 15 ;B6 + psubw mm2, mm6 ;C10 + pmulhw mm4, mm7 ;B7 + paddw mm5, mm4 ;B8 + pxor mm4, mm3 ;B9 + psubw mm4, mm3 ;B10 + movq [edx + 4*24+16], mm2 ;C11 + pop ebx + movq [edx + 4*24+8], mm4 ;B11 + pmaddwd mm5, [plus_one] + movq mm0, mm5 + punpckhdq mm5, mm5 + paddd mm0, mm5 + movd eax, mm0 ; return sum + + ret align ALIGN .q1loop - movq mm6,[byte ebx] - quantinter1 0 - quantinter1 1 - quantinter1 2 - quantinter1 3 - quantinter1 4 - quantinter1 5 - quantinter1 6 - quantinter1 7 - - pmaddwd mm5, [plus_one] - movq mm0, mm5 - psrlq mm5, 32 - paddd mm0, mm5 - movd eax, mm0 ; return sum + movq mm6, [byte ebx] + + quantinter1 0 + quantinter1 1 + quantinter1 2 + quantinter1 3 + quantinter1 4 + quantinter1 5 + quantinter1 6 + quantinter1 7 + + pmaddwd mm5, [plus_one] + movq mm0, mm5 + psrlq mm5, 32 + paddd mm0, mm5 + movd eax, mm0 ; return sum - pop ebx + pop ebx - ret + ret ;=========================================================================== ; @@ -567,129 +586,138 @@ ;This is Athlon-optimized code (ca 106 clk per call) %macro dequant 1 - movq mm1, [ecx+%1*24] ;A2 ; c = coeff[i] - psubw mm0,mm1 ;-c ;A3 (1st dep) + movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2 + psubw mm0, mm1 ;-c ;A3 (1st dep) %if (%1) - paddw mm4,mm6 ; C11 mm6 free (4th+) + paddw mm4, mm6 ;C11 mm6 free (4th+) %endif - pmaxsw mm0,mm1 ;|c| ;A4 (2nd) + pmaxsw mm0, mm1 ;|c| ;A4 (2nd) %if (%1) - mov ebp,ebp - pminsw mm4,[ebx] ; C12 saturates to +2047 (5th+) 1ater + mov ebp, ebp + pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later %endif - movq mm6,[esi] ;0 ;A5 mm6 in use - pandn mm7,[eax] ; B9 offset = isZero ? 0 : quant_add (2nd) + movq mm6, [esi] ;0 ;A5 mm6 in use + pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) %if (%1) - pxor mm5, mm4 ; C13 (6th+) 1later + pxor mm5, mm4 ;C13 (6th+) 1later %endif - movq mm4,[esi] ; C1 ;0 - mov esp,esp - pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st) + movq mm4, [esi] ;C1 ;0 + mov esp, esp + pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st) align 4 - psraw mm1,15 ; sign(c) ;A7 (2nd) + psraw mm1, 15 ; sign(c) ;A7 (2nd) %if (%1) - movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later + movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later %endif - paddw mm7,mm3 ; B10 offset +negate back (3rd) - pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+) - paddw mm2,mm7 ; B11 mm7 free (4th+) - lea ebp,[byte ebp] - movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i] - psubw mm4,mm5 ;-c ;C3 (1st dep) - pandn mm6,[eax] ; A9 offset = isZero ? 0 : quant_add (2nd) - pminsw mm2,[ebx] ; B12 saturates to +2047 (5th+) - pxor mm3, mm2 ; B13 (6th+) - movq mm2,[byte esi] ; B1 ;0 + paddw mm7, mm3 ;B10 offset +negate back (3rd) + pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+) + paddw mm2, mm7 ;B11 mm7 free (4th+) + lea ebp, [byte ebp] + movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i] + psubw mm4, mm5 ;-c ;C3 (1st dep) + pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd) + pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) + pxor mm3, mm2 ;B13 (6th+) + movq mm2, [byte esi] ;B1 ;0 %if (%1) - movq [edx+%1*24+8-24], mm3 ; B14 (7th) + movq [edx+%1*24+8-24], mm3 ;B14 (7th) %else - movq [edx+120], mm3 + movq [edx+120], mm3 %endif - pmaxsw mm4,mm5 ;|c| ;C4 (2nd) - paddw mm6,mm1 ; A10 offset +negate back (3rd) - movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i] - psubw mm2,mm3 ;-c ;B3 (1st dep) - paddw mm0,mm6 ; A11 mm6 free (4th+) - movq mm6,[byte esi] ;0 ;C5 mm6 in use - pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? 
-1 : 0 (1st) - pminsw mm0,[ebx] ; A12 saturates to +2047 (5th+) - pmaxsw mm2,mm3 ;|c| ;B4 (2nd) - pxor mm1, mm0 ; A13 (6th+) - pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+) - psraw mm5,15 ; sign(c) ;C7 (2nd) - movq mm7,[byte esi] ;0 ;B5 mm7 in use - pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st) + pmaxsw mm4, mm5 ;|c| ;C4 (2nd) + paddw mm6, mm1 ;A10 offset +negate back (3rd) + movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i] + psubw mm2, mm3 ;-c ;B3 (1st dep) + paddw mm0, mm6 ;A11 mm6 free (4th+) + movq mm6, [byte esi] ;0 ;C5 mm6 in use + pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st) + pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+) + pmaxsw mm2, mm3 ;|c| ;B4 (2nd) + pxor mm1, mm0 ;A13 (6th+) + pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+) + psraw mm5, 15 ; sign(c) ;C7 (2nd) + movq mm7, [byte esi] ;0 ;B5 mm7 in use + pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st) %if (%1 < 4) - movq mm0,[byte esi] ; A1 ;0 + movq mm0, [byte esi] ;A1 ;0 %endif - pandn mm6,[byte eax] ; C9 offset = isZero ? 0 : quant_add (2nd) - psraw mm3,15 ; sign(c) ;B7 (2nd) - movq [byte edx+%1*24], mm1 ; A14 (7th) - paddw mm6,mm5 ; C10 offset +negate back (3rd) - pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) - mov esp,esp + pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd) + psraw mm3, 15 ;sign(c) ;B7 (2nd) + movq [byte edx+%1*24], mm1 ;A14 (7th) + paddw mm6, mm5 ;C10 offset +negate back (3rd) + pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) + mov esp, esp %endmacro align ALIGN cglobal dequant_intra_3dne dequant_intra_3dne: - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant - pxor mm0,mm0 - pxor mm2,mm2 - push edi - push ebx - lea edi,[mmx_mul + eax*8 - 8] ; 2*quant - push ebp - mov ebx,mmx_2047 - movsx ebp,word [ecx] - lea eax,[mmx_add + eax*8 - 8] ; quant or quant-1 - push esi - mov esi,mmzero - pxor mm7,mm7 - movq mm3, [ecx+120] ;B2 ; c = coeff[i] - pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) + mov ecx, [esp+ 8] ; coeff + mov eax, [esp+12] ; quant + pxor mm0, mm0 + pxor mm2, mm2 + push edi + push ebx + lea edi, [mmx_mul + eax*8 - 8] ; 2*quant + push ebp + mov ebx, mmx_2047 + movsx ebp, word [ecx] + lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 + push esi + mov esi, mmzero + pxor mm7, mm7 + movq mm3, [ecx+120] ;B2 ; c = coeff[i] + pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) - imul ebp,[esp+16+16] ; dcscalar - psubw mm2,mm3 ;-c ;B3 (1st dep) - pmaxsw mm2,mm3 ;|c| ;B4 (2nd) - pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) - psraw mm3,15 ; sign(c) ;B7 (2nd) - mov edx, [esp+ 4+16] ; data + imul ebp, [esp+16+16] ; dcscalar + psubw mm2, mm3 ;-c ;B3 (1st dep) + pmaxsw mm2, mm3 ;|c| ;B4 (2nd) + pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) + psraw mm3, 15 ; sign(c) ;B7 (2nd) + mov edx, [esp+ 4+16] ; data + align 8 -dequant 0 - cmp ebp,-2048 - mov esp,esp -dequant 1 - cmovl ebp,[int_2048] - nop -dequant 2 - cmp ebp,2047 - mov esp,esp -dequant 3 - cmovg ebp,[int2047] - nop -dequant 4 - - paddw mm4,mm6 ; C11 mm6 free (4th+) - pminsw mm4,[ebx] ; C12 saturates to +2047 (5th+) - pandn mm7,[eax] ; B9 offset = isZero ? 
0 : quant_add (2nd) - mov eax,ebp - mov esi,[esp] - mov ebp,[esp+4] - pxor mm5, mm4 ; C13 (6th+) - paddw mm7,mm3 ; B10 offset +negate back (3rd) - movq [edx+4*24+16], mm5 ; C14 (7th) - paddw mm2,mm7 ; B11 mm7 free (4th+) - pminsw mm2,[ebx] ; B12 saturates to +2047 (5th+) - mov ebx,[esp+8] - mov edi,[esp+12] - add esp,byte 16 - pxor mm3, mm2 ; B13 (6th+) - movq [edx+4*24+8], mm3 ; B14 (7th) - mov [edx], ax - ret + dequant 0 + + cmp ebp, -2048 + mov esp, esp + + dequant 1 + + cmovl ebp, [int_2048] + nop + + dequant 2 + + cmp ebp, 2047 + mov esp, esp + + dequant 3 + + cmovg ebp, [int2047] + nop + + dequant 4 + + paddw mm4, mm6 ;C11 mm6 free (4th+) + pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) + pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) + mov eax, ebp + mov esi, [esp] + mov ebp, [esp+4] + pxor mm5, mm4 ;C13 (6th+) + paddw mm7, mm3 ;B10 offset +negate back (3rd) + movq [edx+4*24+16], mm5 ;C14 (7th) + paddw mm2, mm7 ;B11 mm7 free (4th+) + pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) + mov ebx, [esp+8] + mov edi, [esp+12] + add esp, byte 16 + pxor mm3, mm2 ;B13 (6th+) + movq [edx+4*24+8], mm3 ;B14 (7th) + mov [edx], ax + ret ;=========================================================================== ; @@ -699,53 +727,55 @@ ; ;=========================================================================== - ; this is the same as dequant_inter_3dne, - ; except that we're saturating using 'pminsw' (saves 2 cycles/loop) +; this is the same as dequant_inter_3dne, +; except that we're saturating using 'pminsw' (saves 2 cycles/loop) ;This is Athlon-optimized code (ca 100 clk per call) ;Optimized by Jaan, 30 Nov 2002 align ALIGN cglobal dequant_inter_3dne dequant_inter_3dne: + mov ecx, [esp+ 8] ; coeff + mov eax, [esp+12] ; quant + pxor mm0, mm0 + pxor mm2, mm2 + push edi + push ebx + push esi + lea edi, [mmx_mul + eax*8 - 8] ; 2*quant + mov ebx, mmx_2047 + pxor mm7, mm7 + movq mm3, [ecx+120] ;B2 ; c = coeff[i] + pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) + lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 + psubw mm2, mm3 ;-c ;B3 (1st dep) + mov esi, mmzero + pmaxsw mm2, mm3 ;|c| ;B4 (2nd) + pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) + psraw mm3, 15 ; sign(c) ;B7 (2nd) + mov edx, [dword esp+ 4+12] ; data - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant - pxor mm0,mm0 - pxor mm2,mm2 - push edi - push ebx - push esi - lea edi,[mmx_mul + eax*8 - 8] ; 2*quant - mov ebx,mmx_2047 - pxor mm7,mm7 - movq mm3, [ecx+120] ;B2 ; c = coeff[i] - pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) - lea eax,[mmx_add + eax*8 - 8] ; quant or quant-1 - psubw mm2,mm3 ;-c ;B3 (1st dep) - mov esi,mmzero - pmaxsw mm2,mm3 ;|c| ;B4 (2nd) - pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) - psraw mm3,15 ; sign(c) ;B7 (2nd) - mov edx, [dword esp+ 4+12] ; data align 8 -dequant 0 -dequant 1 -dequant 2 -dequant 3 -dequant 4 - - paddw mm4,mm6 ; C11 mm6 free (4th+) - pminsw mm4,[ebx] ; C12 saturates to +2047 (5th+) - pandn mm7,[eax] ; B9 offset = isZero ? 0 : quant_add (2nd) - mov esi,[esp] - pxor mm5, mm4 ; C13 (6th+) - paddw mm7,mm3 ; B10 offset +negate back (3rd) - movq [edx+4*24+16], mm5 ; C14 (7th) - paddw mm2,mm7 ; B11 mm7 free (4th+) - pminsw mm2,[ebx] ; B12 saturates to +2047 (5th+) - mov ebx,[esp+4] - mov edi,[esp+8] - add esp,byte 12 - pxor mm3, mm2 ; B13 (6th+) - movq [edx+4*24+8], mm3 ; B14 (7th) - ret + + dequant 0 + dequant 1 + dequant 2 + dequant 3 + dequant 4 + + paddw mm4, mm6 ;C11 mm6 free (4th+) + pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) + pandn mm7, [eax] ;B9 offset = isZero ? 
0 : quant_add (2nd) + mov esi, [esp] + pxor mm5, mm4 ;C13 (6th+) + paddw mm7, mm3 ;B10 offset +negate back (3rd) + movq [edx+4*24+16], mm5 ;C14 (7th) + paddw mm2, mm7 ;B11 mm7 free (4th+) + pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) + mov ebx, [esp+4] + mov edi, [esp+8] + add esp, byte 12 + pxor mm3, mm2 ;B13 (6th+) + movq [edx+4*24+8], mm3 ;B14 (7th) + + ret
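
Editor's notes (not part of the patch) -- reference sketches of the arithmetic the 3dne routines implement, for readers following the scheduling comments above.

The intra quantiser comments spell out the two formulas: the DC term is divided by dcscalar with "DC + sign(DC) dcscalar/2" pre-rounding (the imul against the int_div entry (1<<16)/dcscalar + 1 plus a 16-bit shift performs the division, and the new lea/cmovs pair is the "workaround for eax < 0" correcting negative results), while AC terms are divided by 2*quant via pmulhw with a reciprocal, or psrlw 1 in the quant==1 loop. A minimal C sketch of that arithmetic follows; the signature and helper name are illustrative, assuming the usual H.263-style intra quantiser rather than anything stated in the patch itself.

    #include <stdint.h>

    /* assumed C equivalent of quant_intra_3dne(coeff, data, quant, dcscalar) */
    static void quant_intra_ref(int16_t coeff[64], const int16_t data[64],
                                uint32_t quant, uint32_t dcscalar)
    {
        int i;

        /* DC: round-half-away-from-zero division by dcscalar,
         * i.e. (DC + sign(DC)*dcscalar/2) / dcscalar */
        {
            int32_t dc     = data[0];
            int32_t half   = (int32_t)(dcscalar / 2);
            int32_t biased = (dc >= 0) ? dc + half : dc - half;
            coeff[0] = (int16_t)(biased / (int32_t)dcscalar);
        }

        /* AC: truncating division by 2*quant, done in the asm with pmulhw
         * against a (1<<16)/(2*quant) reciprocal entry */
        for (i = 1; i < 64; i++) {
            int32_t c     = data[i];
            int32_t mag   = (c < 0) ? -c : c;
            int32_t level = mag / (int32_t)(2 * quant);
            coeff[i] = (int16_t)((c < 0) ? -level : level);
        }
    }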
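The inter quantiser comments ("mm0 -= sub (unsigned, dont go < 0)", "mm0 = (mm0 / 2Q)", "sum += mm0") describe a dead-zone quantiser that also accumulates the quantised magnitudes, which the encoder can use for block-skip decisions. A sketch under the assumption that the mmx_sub table holds the customary quant/2 dead-zone offset (the table contents are not shown in this hunk):

    #include <stdint.h>

    /* assumed C equivalent of quant_inter_3dne(coeff, data, quant) */
    static uint32_t quant_inter_ref(int16_t coeff[64], const int16_t data[64],
                                    uint32_t quant)
    {
        const int32_t dead = (int32_t)(quant / 2); /* assumed mmx_sub[quant] */
        uint32_t sum = 0;
        int i;

        for (i = 0; i < 64; i++) {
            int32_t c   = data[i];
            int32_t mag = (c < 0) ? -c : c;
            /* psubusw saturates at zero, so coefficients inside the
             * dead zone quantise to level 0 */
            int32_t level = (mag > dead) ? (mag - dead) / (int32_t)(2 * quant) : 0;
            sum += (uint32_t)level;                 /* running sum kept in mm5 */
            coeff[i] = (int16_t)((c < 0) ? -level : level);
        }
        return sum; /* reduced at the end with pmaddwd [plus_one] */
    }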
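The dequant macro comments give the reconstruction: a zero coefficient stays zero (the pcmpeqw/pandn pair masks the offset away), otherwise |c| is multiplied by 2*quant, the "quant or quant-1" offset is added, the magnitude is saturated to +2047 with pminsw, and the sign is restored; the intra DC term is instead multiplied by dcscalar and clamped to [-2048, 2047] via the int_2048/int2047 cmov pair. The sketch below merges both entry points behind an illustrative `intra` flag and assumes the offset follows the usual odd/even quant rule; neither detail is spelled out in the patch.

    #include <stdint.h>

    /* assumed C equivalent of dequant_intra_3dne / dequant_inter_3dne */
    static void dequant_ref(int16_t data[64], const int16_t coeff[64],
                            uint32_t quant, int32_t dcscalar, int intra)
    {
        /* "quant or quant-1": assumed odd -> quant, even -> quant-1 */
        const int32_t quant_add = (quant & 1) ? (int32_t)quant : (int32_t)quant - 1;
        int i;

        for (i = 0; i < 64; i++) {
            int32_t c = coeff[i];
            int32_t level;
            if (c == 0) {
                level = 0;                          /* offset masked by pandn */
            } else {
                int32_t mag = (c < 0) ? -c : c;
                level = mag * 2 * (int32_t)quant + quant_add; /* |c|*2Q + offset */
                if (level > 2047)
                    level = 2047;                   /* pminsw saturation */
                if (c < 0)
                    level = -level;
            }
            data[i] = (int16_t)level;
        }

        if (intra) {                                /* DC path of dequant_intra */
            int32_t dc = (int32_t)coeff[0] * dcscalar;  /* imul ebp, dcscalar  */
            if (dc < -2048) dc = -2048;             /* cmovl against int_2048  */
            if (dc >  2047) dc =  2047;             /* cmovg against int2047   */
            data[0] = (int16_t)dc;
        }
    }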