--- trunk/xvidcore/src/quant/x86_asm/quantize_mmx.asm	2002/04/24 12:21:43	135
+++ trunk/xvidcore/src/quant/x86_asm/quantize_mmx.asm	2002/11/17 00:41:20	653
@@ -1,42 +1,58 @@
-;/**************************************************************************
+;/*****************************************************************************
 ; *
-; * XVID MPEG-4 VIDEO CODEC
-; * mmx quantization/dequantization
+; * XVID MPEG-4 VIDEO CODEC
+; * mmx optimized quantization/dequantization
 ; *
-; * This program is an implementation of a part of one or more MPEG-4
-; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
-; * to use this software module in hardware or software products are
-; * advised that its use may infringe existing patents or copyrights, and
-; * any such use would be at such party's own risk. The original
-; * developer of this software module and his/her company, and subsequent
-; * editors and their companies, will have no liability for use of this
-; * software or modifications or derivatives thereof.
+; * Copyright(C) 2002 Peter Ross
+; * Copyright(C) 2002 Michael Militzer
+; * Copyright(C) 2002 Pascal Massimino
 ; *
-; * This program is free software; you can redistribute it and/or modify
-; * it under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
+; * This file is part of XviD, a free MPEG-4 video encoder/decoder
 ; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
+; * XviD is free software; you can redistribute it and/or modify it
+; * under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 2 of the License, or
+; * (at your option) any later version.
 ; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
 ; *
-; *************************************************************************/
-
-;/**************************************************************************
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+; *
+; * Under section 8 of the GNU General Public License, the copyright
+; * holders of XVID explicitly forbid distribution in the following
+; * countries:
+; *
+; * - Japan
+; * - United States of America
+; *
+; * Linking XviD statically or dynamically with other modules is making a
+; * combined work based on XviD. Thus, the terms and conditions of the
+; * GNU General Public License cover the whole combination.
 ; *
-; * History:
+; * As a special exception, the copyright holders of XviD give you
+; * permission to link XviD with independent modules that communicate with
+; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
+; * license terms of these independent modules, and to copy and distribute
+; * the resulting combined work under terms of your choice, provided that
+; * every copy of the combined work is accompanied by a complete copy of
+; * the source code of XviD (the version of XviD used to produce the
+; * combined work), being distributed under the terms of the GNU General
+; * Public License plus this exception. An independent module is a module
+; * which is not derived from or based on XviD.
 ; *
-; * 24.02.2002	sse2 quant_intra / dequant_intra (have to use movdqu ???)
-; * 17.04.2002	sse2 quant_inter / dequant_inter
-; * 26.12.2001	minor bug fixes, dequant saturate, further optimization
-; * 19.11.2001	quant_inter_mmx now returns sum of abs. coefficient values
-; * 04.11.2001	nasm version; (c)2001 peter ross
+; * Note that people who make modified versions of XviD are not obligated
+; * to grant this special exception for their modified versions; it is
+; * their choice whether to do so. The GNU General Public License gives
+; * permission to release a modified version without this exception; this
+; * exception also makes it possible to release a modified version which
+; * carries forward this exception.
+; *
+; * $Id: quantize_mmx.asm,v 1.7 2002-11-17 00:41:20 edgomez Exp $
 ; *
 ; *************************************************************************/
@@ -64,7 +80,6 @@
 plus_one times 8 dw 1
 
-
 ;===========================================================================
 ;
 ; subtract by Q/2 table
 ;
 ;===========================================================================
@@ -261,13 +276,15 @@
 ;
 ;===========================================================================
 
-align ALIGN
-mmx_32768_minus_2048	times 4 dw (32768-2048)
-mmx_32767_minus_2047	times 4 dw (32767-2047)
+align 16
+sse2_2047	times 8 dw 2047
 
 align 16
-sse2_pos_2047	times 8 dw 2047
-sse2_neg_2048	times 8 dw -2048
+mmx_2047	times 4 dw 2047
+
+align 8
+mmx_32768_minus_2048	times 4 dw (32768-2048)
+mmx_32767_minus_2047	times 4 dw (32767-2047)
 
 section .text
@@ -700,7 +717,6 @@
 	jmp	.qes2_done
 
-
 ;===========================================================================
 ;
 ; void dequant_intra_mmx(int16_t *data,
 ;                        const int16_t const *coeff,
 ;                        const uint32_t quant,
 ;                        const uint32_t dcscalar);
 ;
 ;===========================================================================
 
+	; note: we only saturate to +2047 *before* restoring the sign.
+	; Hence, final clamp really is [-2048,2047]
+
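For reference, this is the scalar operation the rewritten loop below implements. The sketch is editorial, not part of the patch; `quant_add` is an assumption standing in for the `mmx_add` table entry (quant when quant is odd, quant-1 otherwise, per the "quant or quant-1" comments), and it assumes products stay in 16-bit range, whereas pmullw keeps only the low word:

    #include <stdint.h>

    static void dequant_intra_ref(int16_t *data, const int16_t *coeff,
                                  uint32_t quant, uint32_t dcscalar)
    {
        /* assumption: mmx_add entry == (quant & 1) ? quant : quant - 1 */
        const int quant_add = (quant & 1) ? (int)quant : (int)quant - 1;
        int i, dc;

        /* the vector code runs the same formula over all 64 words
           and then patches data[0] afterwards */
        for (i = 0; i < 64; i++) {
            int level = coeff[i];
            if (level == 0) {
                data[i] = 0;                      /* pcmpeqw/pandn path */
            } else if (level < 0) {
                level = -level * 2 * (int)quant + quant_add;
                data[i] = (level >= 2048) ? -2048 : (int16_t)-level;
            } else {
                level = level * 2 * (int)quant + quant_add;
                data[i] = (level > 2047) ? 2047 : (int16_t)level;
            }
        }

        /* DC is scaled by dcscalar instead, then clamped to the same range */
        dc = coeff[0] * (int)dcscalar;
        if (dc < -2048) dc = -2048;
        if (dc >  2047) dc =  2047;
        data[0] = (int16_t)dc;
    }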
align ALIGN
cglobal dequant_intra_mmx
-dequant_intra_mmx
-
-	push	esi
-	push	edi
+dequant_intra_mmx:
 
-	mov	edi, [esp + 8 + 4]	; data
-	mov	esi, [esp + 8 + 8]	; coeff
-	mov	eax, [esp + 8 + 12]	; quant
-
-	movq	mm6, [mmx_add + eax * 8 - 8]
-	movq	mm7, [mmx_mul + eax * 8 - 8]
-	xor	eax, eax
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
+	movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
+	mov	eax, -16
 
align ALIGN
.loop
-	movq	mm0, [esi + 8*eax]	; mm0 = [coeff]
-	movq	mm3, [esi + 8*eax + 8]	;
-	pxor	mm1, mm1		; mm1 = 0
-	pxor	mm4, mm4		;
-	pcmpgtw	mm1, mm0		; mm1 = (0 > mm0)
-	pcmpgtw	mm4, mm3		;
-	pxor	mm2, mm2		; mm2 = 0
-	pxor	mm5, mm5		;
-	pcmpeqw	mm2, mm0		; mm2 = (0 == mm0)
-	pcmpeqw	mm5, mm3		;
-	pandn	mm2, mm6		; mm2 = (iszero ? 0 : add)
-	pandn	mm5, mm6		;
-	pxor	mm0, mm1		; mm0 = |mm0|
-	pxor	mm3, mm4		;
-	psubw	mm0, mm1		; displace
-	psubw	mm3, mm4		;
-	pmullw	mm0, mm7		; mm0 *= 2Q
-	pmullw	mm3, mm7		;
-	paddw	mm0, mm2		; mm0 += mm2 (add)
-	paddw	mm3, mm5		;
-	pxor	mm0, mm1		; mm0 *= sign(mm0)
-	pxor	mm3, mm4		;
-	psubw	mm0, mm1		; undisplace
-	psubw	mm3, mm4
+	movq	mm0, [ecx+8*eax+8*16]		; c  = coeff[i]
+	movq	mm3, [ecx+8*eax+8*16 + 8]	; c' = coeff[i+1]
+	pxor	mm1, mm1
+	pxor	mm4, mm4
+	pcmpgtw	mm1, mm0			; sign(c)
+	pcmpgtw	mm4, mm3			; sign(c')
+	pxor	mm2, mm2
+	pxor	mm5, mm5
+	pcmpeqw	mm2, mm0			; c is zero
+	pcmpeqw	mm5, mm3			; c' is zero
+	pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
+	pandn	mm5, mm6
+	pxor	mm0, mm1			; negate if negative
+	pxor	mm3, mm4			; negate if negative
+	psubw	mm0, mm1
+	psubw	mm3, mm4
+	pmullw	mm0, mm7			; *= 2Q
+	pmullw	mm3, mm7			; *= 2Q
+	paddw	mm0, mm2			; + offset
+	paddw	mm3, mm5			; + offset
+	paddw	mm0, mm1			; negate back
+	paddw	mm3, mm4			; negate back
+
+	; saturates to +2047
+	movq	mm2, [mmx_32767_minus_2047]
+	add	eax, 2
+	paddsw	mm0, mm2
+	paddsw	mm3, mm2
+	psubsw	mm0, mm2
+	psubsw	mm3, mm2
+
+	pxor	mm0, mm1
+	pxor	mm3, mm4
+	movq	[edx + 8*eax + 8*16   - 2*8], mm0
+	movq	[edx + 8*eax + 8*16+8 - 2*8], mm3
+	jnz	near .loop
+
+	; deal with DC
+
+	movd	mm0, [ecx]
+	pmullw	mm0, [esp+16]	; dcscalar
+	movq	mm2, [mmx_32767_minus_2047]
+	paddsw	mm0, mm2
+	psubsw	mm0, mm2
+	movq	mm3, [mmx_32768_minus_2048]
+	psubsw	mm0, mm3
+	paddsw	mm0, mm3
+	movd	eax, mm0
+	mov	[edx], ax
 
-%ifdef SATURATE
-	movq	mm2, [mmx_32767_minus_2047]
-	movq	mm4, [mmx_32768_minus_2048]
-	paddsw	mm0, mm2
-	paddsw	mm3, mm2
-	psubsw	mm0, mm2
-	psubsw	mm3, mm2
-	psubsw	mm0, mm4
-	psubsw	mm3, mm4
-	paddsw	mm0, mm4
-	paddsw	mm3, mm4
-%endif
+	ret
 
-	movq	[edi + 8*eax], mm0	; [data] = mm0
-	movq	[edi + 8*eax + 8], mm3
-
-	add	eax, 2
-	cmp	eax, 16
-	jnz	near .loop
-
-	mov	ax, [esi]		; ax = data[0]
-	imul	ax, [esp + 8 + 16]	; eax = data[0] * dcscalar
-
-%ifdef SATURATE
-	cmp	ax, -2048
-	jl	.set_n2048
-	cmp	ax, 2047
-	jg	.set_2047
-%endif
-	mov	[edi], ax
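Plain MMX has no pminsw, so the "saturates to +2047" block above uses a saturating round-trip: adding 32767-2047 drives anything greater than 2047 into the paddsw ceiling, and subtracting the same constant brings every other value back unchanged. A scalar model, illustrative only:

    #include <stdint.h>

    /* min(v, 2047) for v in [0, 32767], using only saturating add/sub;
       scalar model of the paddsw/psubsw pair above */
    static int16_t sat2047(int16_t v)
    {
        int32_t t = v + (32767 - 2047);
        if (t > 32767)
            t = 32767;          /* paddsw saturates here exactly when v > 2047 */
        return (int16_t)(t - (32767 - 2047));   /* psubsw */
    }

The xmm flavours further down replace this pair with a single pminsw, which is where the quoted 2-cycles-per-loop saving comes from.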
+;===========================================================================
+;
+; void dequant_intra_xmm(int16_t *data,
+;                        const int16_t const *coeff,
+;                        const uint32_t quant,
+;                        const uint32_t dcscalar);
+;
+;===========================================================================
 
-	pop	edi
-	pop	esi
-	ret
+	; this is the same as dequant_intra_mmx, except that we're
+	; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
 
-%ifdef SATURATE
align ALIGN
-.set_n2048
-	mov	word [edi], -2048
-	pop	edi
-	pop	esi
-	ret
-
+cglobal dequant_intra_xmm
+dequant_intra_xmm:
+
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
+	movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
+	mov	eax, -16
+
align ALIGN
-.set_2047
-	mov	word [edi], 2047
-	pop	edi
-	pop	esi
-	ret
-%endif
+.loop
+	movq	mm0, [ecx+8*eax+8*16]		; c  = coeff[i]
+	movq	mm3, [ecx+8*eax+8*16 + 8]	; c' = coeff[i+1]
+	pxor	mm1, mm1
+	pxor	mm4, mm4
+	pcmpgtw	mm1, mm0			; sign(c)
+	pcmpgtw	mm4, mm3			; sign(c')
+	pxor	mm2, mm2
+	pxor	mm5, mm5
+	pcmpeqw	mm2, mm0			; c is zero
+	pcmpeqw	mm5, mm3			; c' is zero
+	pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
+	pandn	mm5, mm6
+	pxor	mm0, mm1			; negate if negative
+	pxor	mm3, mm4			; negate if negative
+	psubw	mm0, mm1
+	psubw	mm3, mm4
+	pmullw	mm0, mm7			; *= 2Q
+	pmullw	mm3, mm7			; *= 2Q
+	paddw	mm0, mm2			; + offset
+	paddw	mm3, mm5			; + offset
+	paddw	mm0, mm1			; negate back
+	paddw	mm3, mm4			; negate back
+
+	; saturates to +2047
+	movq	mm2, [mmx_2047]
+	pminsw	mm0, mm2
+	add	eax, 2
+	pminsw	mm3, mm2
+
+	pxor	mm0, mm1
+	pxor	mm3, mm4
+	movq	[edx + 8*eax + 8*16   - 2*8], mm0
+	movq	[edx + 8*eax + 8*16+8 - 2*8], mm3
+	jnz	near .loop
+
+	; deal with DC
+
+	movd	mm0, [ecx]
+	pmullw	mm0, [esp+16]	; dcscalar
+	movq	mm2, [mmx_32767_minus_2047]
+	paddsw	mm0, mm2
+	psubsw	mm0, mm2
+	movq	mm2, [mmx_32768_minus_2048]
+	psubsw	mm0, mm2
+	paddsw	mm0, mm2
+	movd	eax, mm0
+	mov	[edx], ax
+	ret
 
 ;===========================================================================
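The "deal with DC" block is identical in the mmx and xmm flavours: the DC product is clamped to [-2048, 2047] with two saturating round-trips, one per bound (as it happens, 32767-2047 and 32768-2048 are both 30720, so the two tables hold the same value). A scalar model, illustrative only; the int16_t argument stands for the low word kept by pmullw:

    #include <stdint.h>

    /* clamp(dc, -2048, 2047) via saturating round-trips, as in the
       "deal with DC" blocks */
    static int16_t dc_clamp(int16_t dc)
    {
        int32_t t = dc;
        t += 32767 - 2047;  if (t >  32767) t =  32767;  /* paddsw */
        t -= 32767 - 2047;                 /* psubsw: top bound is now 2047 */
        t -= 32768 - 2048;  if (t < -32768) t = -32768;  /* psubsw */
        t += 32768 - 2048;                 /* paddsw: bottom bound is -2048 */
        return (int16_t)t;
    }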
@@ -813,100 +878,71 @@
 ;                        const uint32_t dcscalar);
 ;
 ;===========================================================================
-
-align 16
+align ALIGN
 cglobal dequant_intra_sse2
-dequant_intra_sse2
-
-	push	esi
-	push	edi
-
-	mov	edi, [esp + 8 + 4]	; data
-	mov	esi, [esp + 8 + 8]	; coeff
-	mov	eax, [esp + 8 + 12]	; quant
-
-	movq	mm6, [mmx_add + eax*8 - 8]
-	movq	mm7, [mmx_mul + eax*8 - 8]
-	movq2dq	xmm6, mm6
-	movq2dq	xmm7, mm7
-	movlhps	xmm6, xmm6
-	movlhps	xmm7, xmm7
-
-	xor	eax, eax
-
-align 16
-.das2_loop
-	movdqa	xmm0, [esi + eax*8]
-	movdqa	xmm3, [esi + eax*8 + 16]
-	pxor	xmm1, xmm1
-	pxor	xmm4, xmm4
-	pcmpgtw	xmm1, xmm0
-	pcmpgtw	xmm4, xmm3
-	pxor	xmm2, xmm2
-	pxor	xmm5, xmm5
-	pcmpeqw	xmm2, xmm0
-	pcmpeqw	xmm5, xmm3
-	pandn	xmm2, xmm6
-	pandn	xmm5, xmm6
-	pxor	xmm0, xmm1
-	pxor	xmm3, xmm4
-	psubw	xmm0, xmm1
-	psubw	xmm3, xmm4
-	pmullw	xmm0, xmm7
-	pmullw	xmm3, xmm7
-	paddw	xmm0, xmm2
-	paddw	xmm3, xmm5
-	pxor	xmm0, xmm1
-	pxor	xmm3, xmm4
-	psubw	xmm0, xmm1
-	psubw	xmm3, xmm4
-
-%ifdef SATURATE
-	movdqu	xmm2, [sse2_pos_2047]
-	movdqu	xmm4, [sse2_neg_2048]
-	pminsw	xmm0, xmm2
-	pminsw	xmm3, xmm2
-	pmaxsw	xmm0, xmm4
-	pmaxsw	xmm3, xmm4
-%endif
-
-	movdqa	[edi + eax*8], xmm0
-	movdqa	[edi + eax*8 + 16], xmm3
+dequant_intra_sse2:
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm6, [mmx_add + eax * 8 - 8]
+	movq	mm7, [mmx_mul + eax * 8 - 8]
+	movq2dq	xmm6, mm6
+	movq2dq	xmm7, mm7
+	movlhps	xmm6, xmm6
+	movlhps	xmm7, xmm7
+	mov	eax, -16
 
-	add	eax, 4
-	cmp	eax, 16
-	jnz	near .das2_loop
-
-	mov	ax, [esi]		; ax = data[0]
-	imul	ax, [esp + 8 + 16]	; eax = data[0] * dcscalar
-
-%ifdef SATURATE
-	cmp	ax, -2048
-	jl	.das2_set_n2048
-	cmp	ax, 2047
-	jg	.das2_set_2047
-%endif
-	mov	[edi], ax
+align ALIGN
+.loop
+	movdqa	xmm0, [ecx + 8*16 + 8*eax]	; c = coeff[i]
+	movdqa	xmm3, [ecx + 8*16 + 8*eax+ 16]
+	pxor	xmm1, xmm1
+	pxor	xmm4, xmm4
+	pcmpgtw	xmm1, xmm0	; sign(c)
+	pcmpgtw	xmm4, xmm3
+	pxor	xmm2, xmm2
+	pxor	xmm5, xmm5
+	pcmpeqw	xmm2, xmm0	; c is zero
+	pcmpeqw	xmm5, xmm3
+	pandn	xmm2, xmm6	; offset = isZero ? 0 : quant_add
+	pandn	xmm5, xmm6
+	pxor	xmm0, xmm1	; negate if negative
+	pxor	xmm3, xmm4
+	psubw	xmm0, xmm1
+	psubw	xmm3, xmm4
+	pmullw	xmm0, xmm7	; *= 2Q
+	pmullw	xmm3, xmm7
+	paddw	xmm0, xmm2	; + offset
+	paddw	xmm3, xmm5
+	paddw	xmm0, xmm1	; negate back
+	paddw	xmm3, xmm4
+
+	; saturates to +2047
+	movdqa	xmm2, [sse2_2047]
+	pminsw	xmm0, xmm2
+	add	eax, 4
+	pminsw	xmm3, xmm2
+
+	pxor	xmm0, xmm1
+	pxor	xmm3, xmm4
+	movdqa	[edx + 8*16 - 8*4 + 8*eax], xmm0
+	movdqa	[edx + 8*16 - 8*4 + 8*eax + 16], xmm3
+	jnz	near .loop
+
+	; deal with DC
+	movd	mm0, [ecx]
+	pmullw	mm0, [esp+16]	; dcscalar
+	movq	mm2, [mmx_32767_minus_2047]
+	paddsw	mm0, mm2
+	psubsw	mm0, mm2
+	movq	mm2, [mmx_32768_minus_2048]
+	psubsw	mm0, mm2
+	paddsw	mm0, mm2
+	movd	eax, mm0
+	mov	[edx], ax
 
-	pop	edi
-	pop	esi
-	ret
+	ret
 
-%ifdef SATURATE
-align 16
-.das2_set_n2048
-	mov	word [edi], -2048
-	pop	edi
-	pop	esi
-	ret
-
-align 16
-.das2_set_2047
-	mov	word [edi], 2047
-	pop	edi
-	pop	esi
-	ret
-%endif
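A small point on the SSE2 prologue just added: the per-quant constants still live in 64-bit tables, so they are moved into the low half of an XMM register with movq2dq and mirrored into the high half with movlhps. An intrinsics model of that broadcast, illustrative only (punpcklqdq has the same effect as movlhps here, just in the integer domain; __m64 is the MMX type on 32-bit x86):

    #include <emmintrin.h>      /* SSE2 intrinsics */

    /* broadcast one 64-bit table entry into both halves of an XMM
       register, as movq2dq + movlhps do above */
    static __m128i broadcast_qword(__m64 q)
    {
        __m128i x = _mm_movpi64_epi64(q);   /* movq2dq xmm, mm */
        return _mm_unpacklo_epi64(x, x);    /* duplicate low qword */
    }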
@@ -920,71 +956,116 @@
 
 align ALIGN
 cglobal dequant_inter_mmx
-dequant_inter_mmx
-
-	push	esi
-	push	edi
+dequant_inter_mmx:
 
-	mov	edi, [esp + 8 + 4]	; data
-	mov	esi, [esp + 8 + 8]	; coeff
-	mov	eax, [esp + 8 + 12]	; quant
-	movq	mm6, [mmx_add + eax * 8 - 8]
-	movq	mm7, [mmx_mul + eax * 8 - 8]
-
-	xor	eax, eax
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
+	movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
+	mov	eax, -16
 
 align ALIGN
 .loop
-	movq	mm0, [esi + 8*eax]	; mm0 = [coeff]
-	movq	mm3, [esi + 8*eax + 8]	;
-	pxor	mm1, mm1		; mm1 = 0
-	pxor	mm4, mm4		;
-	pcmpgtw	mm1, mm0		; mm1 = (0 > mm0)
-	pcmpgtw	mm4, mm3		;
-	pxor	mm2, mm2		; mm2 = 0
-	pxor	mm5, mm5		;
-	pcmpeqw	mm2, mm0		; mm2 = (0 == mm0)
-	pcmpeqw	mm5, mm3		;
-	pandn	mm2, mm6		; mm2 = (iszero ? 0 : add)
-	pandn	mm5, mm6		;
-	pxor	mm0, mm1		; mm0 = |mm0|
-	pxor	mm3, mm4		;
-	psubw	mm0, mm1		; displace
-	psubw	mm3, mm4		;
-	pmullw	mm0, mm7		; mm0 *= 2Q
-	pmullw	mm3, mm7		;
-	paddw	mm0, mm2		; mm0 += mm2 (add)
-	paddw	mm3, mm5		;
-	pxor	mm0, mm1		; mm0 *= sign(mm0)
-	pxor	mm3, mm4		;
-	psubw	mm0, mm1		; undisplace
-	psubw	mm3, mm4
+	movq	mm0, [ecx+8*eax+8*16]		; c  = coeff[i]
+	movq	mm3, [ecx+8*eax+8*16 + 8]	; c' = coeff[i+1]
+	pxor	mm1, mm1
+	pxor	mm4, mm4
+	pcmpgtw	mm1, mm0			; sign(c)
+	pcmpgtw	mm4, mm3			; sign(c')
+	pxor	mm2, mm2
+	pxor	mm5, mm5
+	pcmpeqw	mm2, mm0			; c is zero
+	pcmpeqw	mm5, mm3			; c' is zero
+	pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
+	pandn	mm5, mm6
+	pxor	mm0, mm1			; negate if negative
+	pxor	mm3, mm4			; negate if negative
+	psubw	mm0, mm1
+	psubw	mm3, mm4
+	pmullw	mm0, mm7			; *= 2Q
+	pmullw	mm3, mm7			; *= 2Q
+	paddw	mm0, mm2			; + offset
+	paddw	mm3, mm5			; + offset
+	paddw	mm0, mm1			; negate back
+	paddw	mm3, mm4			; negate back
+
+	; saturates to +2047
+	movq	mm2, [mmx_32767_minus_2047]
+	add	eax, 2
+	paddsw	mm0, mm2
+	paddsw	mm3, mm2
+	psubsw	mm0, mm2
+	psubsw	mm3, mm2
+
+	pxor	mm0, mm1
+	pxor	mm3, mm4
+	movq	[edx + 8*eax + 8*16   - 2*8], mm0
+	movq	[edx + 8*eax + 8*16+8 - 2*8], mm3
+	jnz	near .loop
 
-%ifdef SATURATE
-	movq	mm2, [mmx_32767_minus_2047]
-	movq	mm4, [mmx_32768_minus_2048]
-	paddsw	mm0, mm2
-	paddsw	mm3, mm2
-	psubsw	mm0, mm2
-	psubsw	mm3, mm2
-	psubsw	mm0, mm4
-	psubsw	mm3, mm4
-	paddsw	mm0, mm4
-	paddsw	mm3, mm4
-%endif
+	ret
 
-	movq	[edi + 8*eax], mm0
-	movq	[edi + 8*eax + 8], mm3
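For contrast with the intra path, here is a scalar sketch of what the inter routines compute (editorial, not part of the patch; same `quant_add` assumption as before). The per-coefficient formula is identical, but all 64 coefficients are treated alike: there is no DC/dcscalar special case:

    #include <stdint.h>

    static void dequant_inter_ref(int16_t *data, const int16_t *coeff,
                                  uint32_t quant)
    {
        const int quant_add = (quant & 1) ? (int)quant : (int)quant - 1;
        int i;

        for (i = 0; i < 64; i++) {
            int level = coeff[i];
            if (level == 0) {
                data[i] = 0;
            } else if (level < 0) {
                level = -level * 2 * (int)quant + quant_add;
                data[i] = (level >= 2048) ? -2048 : (int16_t)-level;
            } else {
                level = level * 2 * (int)quant + quant_add;
                data[i] = (level > 2047) ? 2047 : (int16_t)level;
            }
        }
    }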
+;===========================================================================
+;
+; void dequant_inter_xmm(int16_t * data,
+;                        const int16_t * const coeff,
+;                        const uint32_t quant);
+;
+;===========================================================================
 
-	add	eax, 2
-	cmp	eax, 16
-	jnz	near .loop
+	; this is the same as dequant_inter_mmx,
+	; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
 
-	pop	edi
-	pop	esi
+align ALIGN
+cglobal dequant_inter_xmm
+dequant_inter_xmm:
 
-	ret
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
+	movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
+	mov	eax, -16
+align ALIGN
+.loop
+	movq	mm0, [ecx+8*eax+8*16]		; c  = coeff[i]
+	movq	mm3, [ecx+8*eax+8*16 + 8]	; c' = coeff[i+1]
+	pxor	mm1, mm1
+	pxor	mm4, mm4
+	pcmpgtw	mm1, mm0			; sign(c)
+	pcmpgtw	mm4, mm3			; sign(c')
+	pxor	mm2, mm2
+	pxor	mm5, mm5
+	pcmpeqw	mm2, mm0			; c is zero
+	pcmpeqw	mm5, mm3			; c' is zero
+	pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
+	pandn	mm5, mm6
+	pxor	mm0, mm1			; negate if negative
+	pxor	mm3, mm4			; negate if negative
+	psubw	mm0, mm1
+	psubw	mm3, mm4
+	pmullw	mm0, mm7			; *= 2Q
+	pmullw	mm3, mm7			; *= 2Q
+	paddw	mm0, mm2			; + offset
+	paddw	mm3, mm5			; + offset
+	paddw	mm0, mm1			; start restoring sign
+	paddw	mm3, mm4			; start restoring sign
+
+	; saturates to +2047
+	movq	mm2, [mmx_2047]
+	pminsw	mm0, mm2
+	add	eax, 2
+	pminsw	mm3, mm2
+
+	pxor	mm0, mm1			; finish restoring sign
+	pxor	mm3, mm4			; finish restoring sign
+	movq	[edx + 8*eax + 8*16   - 2*8], mm0
+	movq	[edx + 8*eax + 8*16+8 - 2*8], mm3
+	jnz	near .loop
+
+	ret
 
 ;===========================================================================
 ;
@@ -993,71 +1074,57 @@
 ;                        const uint32_t quant);
 ;
 ;===========================================================================
-
-align 16
+align ALIGN
 cglobal dequant_inter_sse2
 dequant_inter_sse2
+	mov	edx, [esp + 4]	; data
+	mov	ecx, [esp + 8]	; coeff
+	mov	eax, [esp + 12]	; quant
+	movq	mm6, [mmx_add + eax * 8 - 8]
+	movq	mm7, [mmx_mul + eax * 8 - 8]
+	movq2dq	xmm6, mm6
+	movq2dq	xmm7, mm7
+	movlhps	xmm6, xmm6
+	movlhps	xmm7, xmm7
+	mov	eax, -16
 
-	push	esi
-	push	edi
-
-	mov	edi, [esp + 8 + 4]	; data
-	mov	esi, [esp + 8 + 8]	; coeff
-	mov	eax, [esp + 8 + 12]	; quant
-	movq	mm6, [mmx_add + eax * 8 - 8]
-	movq	mm7, [mmx_mul + eax * 8 - 8]
-
-	movq2dq	xmm6, mm6
-	movq2dq	xmm7, mm7
-	movlhps	xmm6, xmm6
-	movlhps	xmm7, xmm7
-
-	xor	eax, eax
-
-align 16
-.des2_loop
-	movdqa	xmm0, [esi + eax*8]	; xmm0 = [coeff]
-	movdqa	xmm3, [esi + eax*8 + 16]
-	pxor	xmm1, xmm1
-	pxor	xmm4, xmm4
-	pcmpgtw	xmm1, xmm0
-	pcmpgtw	xmm4, xmm3
-	pxor	xmm2, xmm2
-	pxor	xmm5, xmm5
-	pcmpeqw	xmm2, xmm0
-	pcmpeqw	xmm5, xmm3
-	pandn	xmm2, xmm6
-	pandn	xmm5, xmm6
-	pxor	xmm0, xmm1
-	pxor	xmm3, xmm4
-	psubw	xmm0, xmm1
-	psubw	xmm3, xmm4
-	pmullw	xmm0, xmm7
-	pmullw	xmm3, xmm7
-	paddw	xmm0, xmm2
-	paddw	xmm3, xmm5
-	pxor	xmm0, xmm1
-	pxor	xmm3, xmm4
-	psubw	xmm0, xmm1
-	psubw	xmm3, xmm4
-
-%ifdef SATURATE
-	movdqu	xmm2, [sse2_pos_2047]
-	movdqu	xmm4, [sse2_neg_2048]
-	pminsw	xmm0, xmm2
-	pminsw	xmm3, xmm2
-	pmaxsw	xmm0, xmm4
-	pmaxsw	xmm3, xmm4
-%endif
-
-	movdqa	[edi + eax*8], xmm0
-	movdqa	[edi + eax*8 + 16], xmm3
-
-	add	eax, 4
-	cmp	eax, 16
-	jnz	near .des2_loop
+align ALIGN
+.loop
+	movdqa	xmm0, [ecx + 8*16 + 8*eax]	; c = coeff[i]
+	movdqa	xmm3, [ecx + 8*16 + 8*eax + 16]
 
-	pop	edi
-	pop	esi
+	pxor	xmm1, xmm1
+	pxor	xmm4, xmm4
+	pcmpgtw	xmm1, xmm0	; sign(c)
+	pcmpgtw	xmm4, xmm3
+	pxor	xmm2, xmm2
+	pxor	xmm5, xmm5
+	pcmpeqw	xmm2, xmm0	; c is zero
+	pcmpeqw	xmm5, xmm3
+	pandn	xmm2, xmm6
+	pandn	xmm5, xmm6
+	pxor	xmm0, xmm1	; negate if negative
+	pxor	xmm3, xmm4
+	psubw	xmm0, xmm1
+	psubw	xmm3, xmm4
+	pmullw	xmm0, xmm7	; *= 2Q
+	pmullw	xmm3, xmm7
+	paddw	xmm0, xmm2	; + offset
+	paddw	xmm3, xmm5
+
+	paddw	xmm0, xmm1	; start restoring sign
+	paddw	xmm3, xmm4
+
+	; saturates to +2047
+	movdqa	xmm2, [sse2_2047]
+	pminsw	xmm0, xmm2
+	add	eax, 4
+	pminsw	xmm3, xmm2
+
+	pxor	xmm0, xmm1	; finish restoring sign
+	pxor	xmm3, xmm4
+	movdqa	[edx + 8*16 - 8*4 + 8*eax], xmm0
+	movdqa	[edx + 8*16 - 8*4 + 8*eax + 16], xmm3
+	jnz	near .loop
 
-	ret
+	ret
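One idiom recurs in every rewritten routine: the index register counts from -16 up to 0, so the add that advances it also sets the zero flag consumed by jnz, with no separate cmp, and the stores carry a - 2*8 (or - 8*4) displacement because the index has already been advanced by the time they run. A scalar equivalent of the traversal, for illustration:

    #include <stdint.h>

    /* models 'mov eax, -16' / 'add eax, 2' / 'jnz .loop' and the
       [ecx + 8*16 + 8*eax] addressing (8 bytes = 4 words per index step) */
    static int32_t walk_block(const int16_t *coeff)
    {
        int32_t sum = 0;
        int i, j;
        for (i = -16; i != 0; i += 2) {            /* two quadwords per pass */
            const int16_t *p = coeff + 4 * (16 + i);
            for (j = 0; j < 8; j++)
                sum += p[j];                       /* stand-in for the real work */
        }
        return sum;
    }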