--- trunk/xvidcore/src/quant/x86_asm/quantize_mmx.asm	2002/03/08 02:46:11	3
+++ trunk/xvidcore/src/quant/x86_asm/quantize_mmx.asm	2002/07/07 13:21:34	269
@@ -32,6 +32,9 @@
 ; *
 ; * History:
 ; *
+; * 14.06.2002  mmx+xmm dequant_* funcs revamped  -Skal-
+; * 24.02.2002  sse2 quant_intra / dequant_intra (have to use movdqu ???)
+; * 17.04.2002  sse2 quant_inter / dequant_inter
 ; * 26.12.2001  minor bug fixes, dequant saturate, further optimization
 ; * 19.11.2001  quant_inter_mmx now returns sum of abs. coefficient values
 ; * 04.11.2001  nasm version; (c)2001 peter ross
@@ -58,7 +61,9 @@
 %endif
 %endmacro
 
-plus_one times 4 dw 1
+align 16
+
+plus_one times 8 dw 1
 
 ;===========================================================================
 ;
@@ -70,7 +75,7 @@
     times 4 dw %1 / 2
 %endmacro
 
-align ALIGN
+align 16
 mmx_sub
     MMX_SUB 1
     MMX_SUB 2
@@ -120,7 +125,7 @@
     times 4 dw (1 << 16) / (%1 * 2) + 1
 %endmacro
 
-align ALIGN
+align 16
 mmx_div
     MMX_DIV 1
     MMX_DIV 2
@@ -170,7 +175,7 @@
 %endif
 %endmacro
 
-align ALIGN
+align 16
 mmx_add
     MMX_ADD 1
     MMX_ADD 2
@@ -215,7 +220,7 @@
     times 4 dw %1 * 2
 %endmacro
 
-align ALIGN
+align 16
 mmx_mul
     MMX_MUL 1
     MMX_MUL 2
@@ -256,10 +261,17 @@
 ;
 ;===========================================================================
 
-align ALIGN
+align 8
 mmx_32768_minus_2048 times 4 dw (32768-2048)
 mmx_32767_minus_2047 times 4 dw (32767-2047)
 
+align 16
+mmx_2047 times 4 dw 2047
+
+align 16
+sse2_pos_2047 times 8 dw 2047
+sse2_neg_2048 times 8 dw -2048
+
 section .text
 
@@ -371,6 +383,111 @@
 
 ;===========================================================================
 ;
+; void quant_intra_sse2(int16_t * coeff,
+;                       const int16_t const * data,
+;                       const uint32_t quant,
+;                       const uint32_t dcscalar);
+;
+;===========================================================================
+
+align ALIGN
+cglobal quant_intra_sse2
+quant_intra_sse2
+
+    push esi
+    push edi
+
+    mov edi, [esp + 8 + 4]      ; coeff
+    mov esi, [esp + 8 + 8]      ; data
+    mov eax, [esp + 8 + 12]     ; quant
+
+    xor ecx, ecx
+    cmp al, 1
+    jz near .qas2_q1loop
+
+.qas2_not1
+    movq mm7, [mmx_div + eax*8 - 8]
+    movq2dq xmm7, mm7
+    movlhps xmm7, xmm7
+
+align 16
+.qas2_loop
+    movdqa xmm0, [esi + ecx*8]          ; xmm0 = [1st]
+    movdqa xmm3, [esi + ecx*8 + 16]     ; xmm3 = [2nd]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    pmulhw xmm0, xmm7
+    pmulhw xmm3, xmm7
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    movdqa [edi + ecx*8], xmm0
+    movdqa [edi + ecx*8 + 16], xmm3
+
+    add ecx, 4
+    cmp ecx, 16
+    jnz .qas2_loop
+
+.qas2_done
+    mov ecx, [esp + 8 + 16]     ; dcscalar
+    mov edx, ecx
+    movsx eax, word [esi]
+    shr edx, 1
+    cmp eax, 0
+    jg .qas2_gtzero
+
+    sub eax, edx
+    jmp short .qas2_mul
+.qas2_gtzero
+    add eax, edx
+.qas2_mul
+    cdq
+    idiv ecx
+
+    mov [edi], ax
+
+    pop edi
+    pop esi
+
+    ret
+
+align 16
+.qas2_q1loop
+    movdqa xmm0, [esi + ecx*8]          ; xmm0 = [1st]
+    movdqa xmm3, [esi + ecx*8 + 16]     ; xmm3 = [2nd]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    psrlw xmm0, 1
+    psrlw xmm3, 1
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    movdqa [edi + ecx*8], xmm0
+    movdqa [edi + ecx*8 + 16], xmm3
+
+    add ecx, 4
+    cmp ecx, 16
+    jnz .qas2_q1loop
+    jmp near .qas2_done
+
+
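For reference, a scalar C sketch of what the quant_intra_sse2 loop computes (illustrative names, not from the patch): the AC path divides |data[i]| by 2*quant -- the asm does this sign-free with the pcmpgtw/pxor/psubw absolute-value trick and a pmulhw against the mmx_div reciprocal table (see the note after quant_inter_sse2 for its exact rounding) -- and the DC path is the rounded division performed by the idiv sequence at .qas2_done:

    #include <stdint.h>
    #include <stdlib.h>

    static void quant_intra_ref(int16_t *coeff, const int16_t *data,
                                uint32_t quant, uint32_t dcscalar)
    {
        for (int i = 0; i < 64; i++) {          /* AC; coeff[0] redone below */
            int level = abs(data[i]) / (int)(2 * quant);
            coeff[i] = (int16_t)(data[i] < 0 ? -level : level);
        }
        /* DC: bias by dcscalar/2 toward the sign, then truncate like idiv */
        int dc   = data[0];
        int half = (int)dcscalar >> 1;
        coeff[0] = (int16_t)((dc > 0 ? dc + half : dc - half) / (int)dcscalar);
    }

The quant == 1 special case (.qas2_q1loop) is the same computation: its reciprocal multiplier would not fit a signed 16-bit word, so the division by 2 becomes a psrlw by 1 on the absolute value.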
+;===========================================================================
+;
 ; uint32_t quant_inter_mmx(int16_t * coeff,
 ;                          const int16_t const * data,
 ;                          const uint32_t quant);
@@ -476,6 +593,118 @@
 
 ;===========================================================================
 ;
+; uint32_t quant_inter_sse2(int16_t * coeff,
+;                           const int16_t const * data,
+;                           const uint32_t quant);
+;
+;===========================================================================
+
+align 16
+cglobal quant_inter_sse2
+quant_inter_sse2
+
+    push esi
+    push edi
+
+    mov edi, [esp + 8 + 4]      ; coeff
+    mov esi, [esp + 8 + 8]      ; data
+    mov eax, [esp + 8 + 12]     ; quant
+
+    xor ecx, ecx
+
+    pxor xmm5, xmm5             ; sum
+
+    movq mm0, [mmx_sub + eax*8 - 8]     ; sub
+    movq2dq xmm6, mm0                   ; load into low 8 bytes
+    movlhps xmm6, xmm6                  ; duplicate into high 8 bytes
+
+    cmp al, 1
+    jz near .qes2_q1loop
+
+.qes2_not1
+    movq mm0, [mmx_div + eax*8 - 8]     ; divider
+    movq2dq xmm7, mm0
+    movlhps xmm7, xmm7
+
+align 16
+.qes2_loop
+    movdqa xmm0, [esi + ecx*8]          ; xmm0 = [1st]
+    movdqa xmm3, [esi + ecx*8 + 16]     ; xmm3 = [2nd]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    psubusw xmm0, xmm6
+    psubusw xmm3, xmm6
+    pmulhw xmm0, xmm7
+    pmulhw xmm3, xmm7
+    paddw xmm5, xmm0
+    pxor xmm0, xmm1
+    paddw xmm5, xmm3
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    movdqa [edi + ecx*8], xmm0
+    movdqa [edi + ecx*8 + 16], xmm3
+
+    add ecx, 4
+    cmp ecx, 16
+    jnz .qes2_loop
+
+.qes2_done
+    movdqu xmm6, [plus_one]
+    pmaddwd xmm5, xmm6
+    movhlps xmm6, xmm5
+    paddd xmm5, xmm6
+    movdq2q mm0, xmm5
+
+    movq mm5, mm0
+    psrlq mm5, 32
+    paddd mm0, mm5
+    movd eax, mm0               ; return sum
+
+    pop edi
+    pop esi
+
+    ret
+
+align 16
+.qes2_q1loop
+    movdqa xmm0, [esi + ecx*8]          ; xmm0 = [1st]
+    movdqa xmm3, [esi + ecx*8 + 16]     ; xmm3 = [2nd]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    psubusw xmm0, xmm6
+    psubusw xmm3, xmm6
+    psrlw xmm0, 1
+    psrlw xmm3, 1
+    paddw xmm5, xmm0
+    pxor xmm0, xmm1
+    paddw xmm5, xmm3
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    movdqa [edi + ecx*8], xmm0
+    movdqa [edi + ecx*8 + 16], xmm3
+
+    add ecx, 4
+    cmp ecx, 16
+    jnz .qes2_q1loop
+    jmp .qes2_done
+
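Both SSE2 quantizers lean on the mmx_div table: the entry for quant q is (1 << 16) / (2*q) + 1, so the signed 16-bit multiply-high (pmulhw) turns the per-coefficient division into level = (x * m) >> 16. Because of the "+ 1" the result is never below x / (2*q), but it can land one above it when x sits just under a multiple of 2*q; q == 1 is special-cased with a shift because its multiplier (32769) does not fit a signed word. A standalone C check of that behaviour (illustration only, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        for (uint32_t q = 2; q <= 31; q++) {         /* q == 1 uses psrlw */
            uint32_t m = (1u << 16) / (2 * q) + 1;   /* the mmx_div entry */
            for (uint32_t x = 0; x <= 2047; x++) {   /* |coeff| range */
                uint32_t approx = (x * m) >> 16;
                uint32_t exact  = x / (2 * q);
                if (approx != exact)
                    printf("q=%2u x=%4u: %u vs %u (rounds up near a step)\n",
                           q, x, approx, exact);
            }
        }
        return 0;
    }

The returned sum of absolute levels is a plain horizontal reduction: pmaddwd against plus_one (eight words of 1) folds word pairs into dwords, then movhlps and psrlq fold the four dwords down to one in eax.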
+;===========================================================================
+;
 ; void dequant_intra_mmx(int16_t *data,
 ;                        const int16_t const *coeff,
 ;                        const uint32_t quant,
@@ -483,96 +712,249 @@
 ;
 ;===========================================================================
 
+  ; note: we only saturate to +2047 *before* restoring the sign.
+  ; Hence, final clamp really is [-2048,2047]
+
 align ALIGN
 cglobal dequant_intra_mmx
-dequant_intra_mmx
+dequant_intra_mmx:
 
-    push esi
-    push edi
+    mov edx, [esp+ 4]           ; data
+    mov ecx, [esp+ 8]           ; coeff
+    mov eax, [esp+12]           ; quant
+    movq mm6, [mmx_add + eax*8 - 8]     ; quant or quant-1
+    movq mm7, [mmx_mul + eax*8 - 8]     ; 2*quant
+    mov eax, -16
 
-    mov edi, [esp + 8 + 4]      ; data
-    mov esi, [esp + 8 + 8]      ; coeff
-    mov eax, [esp + 8 + 12]     ; quant
+align ALIGN
+.loop
+    movq mm0, [ecx+8*eax+8*16]          ; c  = coeff[i]
+    movq mm3, [ecx+8*eax+8*16 + 8]      ; c' = coeff[i+1]
+    pxor mm1, mm1
+    pxor mm4, mm4
+    pcmpgtw mm1, mm0            ; sign(c)
+    pcmpgtw mm4, mm3            ; sign(c')
+    pxor mm2, mm2
+    pxor mm5, mm5
+    pcmpeqw mm2, mm0            ; c is zero
+    pcmpeqw mm5, mm3            ; c' is zero
+    pandn mm2, mm6              ; offset = isZero ? 0 : quant_add
+    pandn mm5, mm6
+    pxor mm0, mm1               ; negate if negative
+    pxor mm3, mm4               ; negate if negative
+    psubw mm0, mm1
+    psubw mm3, mm4
+    pmullw mm0, mm7             ; *= 2Q
+    pmullw mm3, mm7             ; *= 2Q
+    paddw mm0, mm2              ; + offset
+    paddw mm3, mm5              ; + offset
+    paddw mm0, mm1              ; negate back
+    paddw mm3, mm4              ; negate back
+
+    ; saturates to +2047
+    movq mm2, [mmx_32767_minus_2047]
+    add eax, 2
+    paddsw mm0, mm2
+    paddsw mm3, mm2
+    psubsw mm0, mm2
+    psubsw mm3, mm2
+
+    pxor mm0, mm1
+    pxor mm3, mm4
+    movq [edx + 8*eax + 8*16   - 2*8], mm0
+    movq [edx + 8*eax + 8*16+8 - 2*8], mm3
+    jnz near .loop
+
+    ; deal with DC
+
+    movd mm0, [ecx]
+    pmullw mm0, [esp+16]        ; dcscalar
+    movq mm2, [mmx_32767_minus_2047]
+    paddsw mm0, mm2
+    psubsw mm0, mm2
+    movq mm3, [mmx_32768_minus_2048]
+    psubsw mm0, mm3
+    paddsw mm0, mm3
+    movd eax, mm0
+    mov [edx], ax
 
-    movq mm6, [mmx_add + eax * 8 - 8]
-    movq mm7, [mmx_mul + eax * 8 - 8]
-    xor eax, eax
+    ret
+
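In scalar terms the revamped loop is the usual MPEG-4 dequantisation, with mm6 holding the rounding offset ("quant or quant-1": quant when odd, quant-1 when even) and the clamp applied before the sign comes back, as the note above says. A C sketch (illustrative names; it ignores the 16-bit wrap-around of pmullw on extreme products):

    #include <stdint.h>
    #include <stdlib.h>

    static void dequant_ref(int16_t *data, const int16_t *coeff, uint32_t quant)
    {
        int add = (quant & 1) ? (int)quant : (int)quant - 1;  /* mmx_add */
        for (int i = 0; i < 64; i++) {
            int c = coeff[i];
            if (c == 0) { data[i] = 0; continue; }  /* pandn kills the offset */
            int v = 2 * (int)quant * abs(c) + add;
            if (c > 0)      /* clamp before the sign is restored... */
                data[i] = (int16_t)(v > 2047 ?  2047 : v);
            else            /* ...which is why negatives reach -2048 */
                data[i] = (int16_t)(v > 2048 ? -2048 : -v);
        }
    }

The paddsw/psubsw pair against 32767-2047 is min(v, 2047) in two saturating steps: adding 30720 pushes anything above 2047 into saturation at 32767, and subtracting 30720 brings the survivors back unchanged.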
+;===========================================================================
+;
+; void dequant_intra_xmm(int16_t *data,
+;                        const int16_t const *coeff,
+;                        const uint32_t quant,
+;                        const uint32_t dcscalar);
+;
+;===========================================================================
+
+  ; this is the same as dequant_intra_mmx, except that we're
+  ; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
+
+align ALIGN
+cglobal dequant_intra_xmm
+dequant_intra_xmm:
+
+    mov edx, [esp+ 4]           ; data
+    mov ecx, [esp+ 8]           ; coeff
+    mov eax, [esp+12]           ; quant
+    movq mm6, [mmx_add + eax*8 - 8]     ; quant or quant-1
+    movq mm7, [mmx_mul + eax*8 - 8]     ; 2*quant
+    mov eax, -16
 
 align ALIGN
 .loop
-    movq mm0, [esi + 8*eax]     ; mm0 = [coeff]
-    movq mm3, [esi + 8*eax + 8] ;
-    pxor mm1, mm1               ; mm1 = 0
-    pxor mm4, mm4               ;
-    pcmpgtw mm1, mm0            ; mm1 = (0 > mm0)
-    pcmpgtw mm4, mm3            ;
-    pxor mm2, mm2               ; mm2 = 0
-    pxor mm5, mm5               ;
-    pcmpeqw mm2, mm0            ; mm2 = (0 == mm0)
-    pcmpeqw mm5, mm3            ;
-    pandn mm2, mm6              ; mm2 = (iszero ? 0 : add)
-    pandn mm5, mm6              ;
-    pxor mm0, mm1               ; mm0 = |mm0|
-    pxor mm3, mm4               ;
-    psubw mm0, mm1              ; displace
-    psubw mm3, mm4              ;
-    pmullw mm0, mm7             ; mm0 *= 2Q
-    pmullw mm3, mm7             ;
-    paddw mm0, mm2              ; mm0 += mm2 (add)
-    paddw mm3, mm5              ;
-    pxor mm0, mm1               ; mm0 *= sign(mm0)
-    pxor mm3, mm4               ;
-    psubw mm0, mm1              ; undisplace
-    psubw mm3, mm4
+    movq mm0, [ecx+8*eax+8*16]          ; c  = coeff[i]
+    movq mm3, [ecx+8*eax+8*16 + 8]      ; c' = coeff[i+1]
+    pxor mm1, mm1
+    pxor mm4, mm4
+    pcmpgtw mm1, mm0            ; sign(c)
+    pcmpgtw mm4, mm3            ; sign(c')
+    pxor mm2, mm2
+    pxor mm5, mm5
+    pcmpeqw mm2, mm0            ; c is zero
+    pcmpeqw mm5, mm3            ; c' is zero
+    pandn mm2, mm6              ; offset = isZero ? 0 : quant_add
+    pandn mm5, mm6
+    pxor mm0, mm1               ; negate if negative
+    pxor mm3, mm4               ; negate if negative
+    psubw mm0, mm1
+    psubw mm3, mm4
+    pmullw mm0, mm7             ; *= 2Q
+    pmullw mm3, mm7             ; *= 2Q
+    paddw mm0, mm2              ; + offset
+    paddw mm3, mm5              ; + offset
+    paddw mm0, mm1              ; negate back
+    paddw mm3, mm4              ; negate back
+
+    ; saturates to +2047
+    movq mm2, [mmx_2047]
+    pminsw mm0, mm2
+    add eax, 2
+    pminsw mm3, mm2
+
+    pxor mm0, mm1
+    pxor mm3, mm4
+    movq [edx + 8*eax + 8*16   - 2*8], mm0
+    movq [edx + 8*eax + 8*16+8 - 2*8], mm3
+    jnz near .loop
+
+    ; deal with DC
+
+    movd mm0, [ecx]
+    pmullw mm0, [esp+16]        ; dcscalar
+    movq mm2, [mmx_32767_minus_2047]
+    paddsw mm0, mm2
+    psubsw mm0, mm2
+    movq mm2, [mmx_32768_minus_2048]
+    psubsw mm0, mm2
+    paddsw mm0, mm2
+    movd eax, mm0
+    mov [edx], ax
+
+    ret
+
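The xmm variant's saving comes from pminsw, and it keeps the asymmetric clamp because the two's-complement negation is split around it: negating with the pcmpgtw mask m is (x + m) ^ m, so the paddw runs before the clamp and the pxor after it. Scalar equivalent of that tail (illustration only):

    #include <stdint.h>

    /* v = 2*quant*|c| + add, neg = (c < 0), as in the loop above */
    static int16_t clamp_restore_sign(int v, int neg)
    {
        int m = neg ? -1 : 0;       /* the pcmpgtw mask */
        int t = v + m;              /* paddw: first half of (v + m) ^ m */
        if (t > 2047) t = 2047;     /* pminsw against mmx_2047 */
        return (int16_t)(t ^ m);    /* pxor: result is in [-2048, 2047] */
    }

For example, v = 2050 with neg = 1 gives t = 2049 -> 2047 -> -2048, exactly the lower bound promised by the note before dequant_intra_mmx.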
+;===========================================================================
+;
+; void dequant_intra_sse2(int16_t *data,
+;                         const int16_t const *coeff,
+;                         const uint32_t quant,
+;                         const uint32_t dcscalar);
+;
+;===========================================================================
+
+align 16
+cglobal dequant_intra_sse2
+dequant_intra_sse2:
+
+    push esi
+    push edi
+
+    mov edi, [esp + 8 + 4]      ; data
+    mov esi, [esp + 8 + 8]      ; coeff
+    mov eax, [esp + 8 + 12]     ; quant
+
+    movq mm6, [mmx_add + eax*8 - 8]
+    movq mm7, [mmx_mul + eax*8 - 8]
+    movq2dq xmm6, mm6
+    movq2dq xmm7, mm7
+    movlhps xmm6, xmm6
+    movlhps xmm7, xmm7
+
+    xor eax, eax
+
+align 16
+.das2_loop
+    movdqa xmm0, [esi + eax*8]
+    movdqa xmm3, [esi + eax*8 + 16]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm2, xmm2
+    pxor xmm5, xmm5
+    pcmpeqw xmm2, xmm0
+    pcmpeqw xmm5, xmm3
+    pandn xmm2, xmm6
+    pandn xmm5, xmm6
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    pmullw xmm0, xmm7
+    pmullw xmm3, xmm7
+    paddw xmm0, xmm2
+    paddw xmm3, xmm5
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
 
 %ifdef SATURATE
-    movq mm2, [mmx_32767_minus_2047]
-    movq mm4, [mmx_32768_minus_2048]
-    paddsw mm0, mm2
-    paddsw mm3, mm2
-    psubsw mm0, mm2
-    psubsw mm3, mm2
-    psubsw mm0, mm4
-    psubsw mm3, mm4
-    paddsw mm0, mm4
-    paddsw mm3, mm4
+    movdqu xmm2, [sse2_pos_2047]
+    movdqu xmm4, [sse2_neg_2048]
+    pminsw xmm0, xmm2
+    pminsw xmm3, xmm2
+    pmaxsw xmm0, xmm4
+    pmaxsw xmm3, xmm4
 %endif
 
-    movq [edi + 8*eax], mm0     ; [data] = mm0
-    movq [edi + 8*eax + 8], mm3
+    movdqa [edi + eax*8], xmm0
+    movdqa [edi + eax*8 + 16], xmm3
 
-    add eax, 2
-    cmp eax, 16
-    jnz near .loop
+    add eax, 4
+    cmp eax, 16
+    jnz near .das2_loop
 
-    mov ax, [esi]               ; ax = data[0]
-    imul ax, [esp + 8 + 16]     ; eax = data[0] * dcscalar
+    mov ax, [esi]               ; ax = coeff[0]
+    imul ax, [esp + 8 + 16]     ; ax = coeff[0] * dcscalar
 
 %ifdef SATURATE
-    cmp ax, -2048
-    jl .set_n2048
-    cmp ax, 2047
-    jg .set_2047
+    cmp ax, -2048
+    jl .das2_set_n2048
+    cmp ax, 2047
+    jg .das2_set_2047
 %endif
-    mov [edi], ax
+    mov [edi], ax
 
-    pop edi
-    pop esi
+    pop edi
+    pop esi
 
     ret
 
 %ifdef SATURATE
-align ALIGN
-.set_n2048
-    mov word [edi], -2048
-    pop edi
-    pop esi
+align 16
+.das2_set_n2048
+    mov word [edi], -2048
+    pop edi
+    pop esi
     ret
 
-align ALIGN
-.set_2047
-    mov word [edi], 2047
-    pop edi
-    pop esi
+align 16
+.das2_set_2047
+    mov word [edi], 2047
+    pop edi
+    pop esi
     ret
 %endif
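The SSE2 versions avoid adding 8-word copies of the mmx_add/mmx_mul tables: they load the existing 4-word entry into the low quadword (movq2dq) and duplicate it into the high one (movlhps). An intrinsics rendering of that broadcast (sketch only; the patch naturally stays in asm):

    #include <emmintrin.h>      /* SSE2 */

    static __m128i splat_quadword(__m64 four_words)  /* e.g. a mmx_mul entry */
    {
        __m128i lo = _mm_movpi64_epi64(four_words);  /* movq2dq: low 8 bytes */
        return _mm_unpacklo_epi64(lo, lo);           /* same effect as movlhps */
    }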
@@ -586,7 +968,128 @@
 
 align ALIGN
 cglobal dequant_inter_mmx
-dequant_inter_mmx
+dequant_inter_mmx:
+
+    mov edx, [esp+ 4]           ; data
+    mov ecx, [esp+ 8]           ; coeff
+    mov eax, [esp+12]           ; quant
+    movq mm6, [mmx_add + eax*8 - 8]     ; quant or quant-1
+    movq mm7, [mmx_mul + eax*8 - 8]     ; 2*quant
+    mov eax, -16
+
+align ALIGN
+.loop
+    movq mm0, [ecx+8*eax+8*16]          ; c  = coeff[i]
+    movq mm3, [ecx+8*eax+8*16 + 8]      ; c' = coeff[i+1]
+    pxor mm1, mm1
+    pxor mm4, mm4
+    pcmpgtw mm1, mm0            ; sign(c)
+    pcmpgtw mm4, mm3            ; sign(c')
+    pxor mm2, mm2
+    pxor mm5, mm5
+    pcmpeqw mm2, mm0            ; c is zero
+    pcmpeqw mm5, mm3            ; c' is zero
+    pandn mm2, mm6              ; offset = isZero ? 0 : quant_add
+    pandn mm5, mm6
+    pxor mm0, mm1               ; negate if negative
+    pxor mm3, mm4               ; negate if negative
+    psubw mm0, mm1
+    psubw mm3, mm4
+    pmullw mm0, mm7             ; *= 2Q
+    pmullw mm3, mm7             ; *= 2Q
+    paddw mm0, mm2              ; + offset
+    paddw mm3, mm5              ; + offset
+    paddw mm0, mm1              ; negate back
+    paddw mm3, mm4              ; negate back
+
+    ; saturates to +2047
+    movq mm2, [mmx_32767_minus_2047]
+    add eax, 2
+    paddsw mm0, mm2
+    paddsw mm3, mm2
+    psubsw mm0, mm2
+    psubsw mm3, mm2
+
+    pxor mm0, mm1
+    pxor mm3, mm4
+    movq [edx + 8*eax + 8*16   - 2*8], mm0
+    movq [edx + 8*eax + 8*16+8 - 2*8], mm3
+    jnz near .loop
+
+    ret
+
+;===========================================================================
+;
+; void dequant_inter_xmm(int16_t * data,
+;                        const int16_t * const coeff,
+;                        const uint32_t quant);
+;
+;===========================================================================
+
+  ; this is the same as dequant_inter_mmx,
+  ; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
+
+align ALIGN
+cglobal dequant_inter_xmm
+dequant_inter_xmm:
+
+    mov edx, [esp+ 4]           ; data
+    mov ecx, [esp+ 8]           ; coeff
+    mov eax, [esp+12]           ; quant
+    movq mm6, [mmx_add + eax*8 - 8]     ; quant or quant-1
+    movq mm7, [mmx_mul + eax*8 - 8]     ; 2*quant
+    mov eax, -16
+
+align ALIGN
+.loop
+    movq mm0, [ecx+8*eax+8*16]          ; c  = coeff[i]
+    movq mm3, [ecx+8*eax+8*16 + 8]      ; c' = coeff[i+1]
+    pxor mm1, mm1
+    pxor mm4, mm4
+    pcmpgtw mm1, mm0            ; sign(c)
+    pcmpgtw mm4, mm3            ; sign(c')
+    pxor mm2, mm2
+    pxor mm5, mm5
+    pcmpeqw mm2, mm0            ; c is zero
+    pcmpeqw mm5, mm3            ; c' is zero
+    pandn mm2, mm6              ; offset = isZero ? 0 : quant_add
+    pandn mm5, mm6
+    pxor mm0, mm1               ; negate if negative
+    pxor mm3, mm4               ; negate if negative
+    psubw mm0, mm1
+    psubw mm3, mm4
+    pmullw mm0, mm7             ; *= 2Q
+    pmullw mm3, mm7             ; *= 2Q
+    paddw mm0, mm2              ; + offset
+    paddw mm3, mm5              ; + offset
+    paddw mm0, mm1              ; start restoring sign
+    paddw mm3, mm4              ; start restoring sign
+
+    ; saturates to +2047
+    movq mm2, [mmx_2047]
+    pminsw mm0, mm2
+    add eax, 2
+    pminsw mm3, mm2
+
+    pxor mm0, mm1               ; finish restoring sign
+    pxor mm3, mm4               ; finish restoring sign
+    movq [edx + 8*eax + 8*16   - 2*8], mm0
+    movq [edx + 8*eax + 8*16+8 - 2*8], mm3
+    jnz near .loop
+
+    ret
+
+;===========================================================================
+;
+; void dequant_inter_sse2(int16_t * data,
+;                         const int16_t * const coeff,
+;                         const uint32_t quant);
+;
+;===========================================================================
+
+align 16
+cglobal dequant_inter_sse2
+dequant_inter_sse2
 
     push esi
     push edi
@@ -596,57 +1099,58 @@
     mov edi, [esp + 8 + 4]      ; data
     mov esi, [esp + 8 + 8]      ; coeff
     mov eax, [esp + 8 + 12]     ; quant
     movq mm6, [mmx_add + eax * 8 - 8]
     movq mm7, [mmx_mul + eax * 8 - 8]
+
+    movq2dq xmm6, mm6
+    movq2dq xmm7, mm7
+    movlhps xmm6, xmm6
+    movlhps xmm7, xmm7
 
     xor eax, eax
 
-align ALIGN
-.loop
-    movq mm0, [esi + 8*eax]     ; mm0 = [coeff]
-    movq mm3, [esi + 8*eax + 8] ;
-    pxor mm1, mm1               ; mm1 = 0
-    pxor mm4, mm4               ;
-    pcmpgtw mm1, mm0            ; mm1 = (0 > mm0)
-    pcmpgtw mm4, mm3            ;
-    pxor mm2, mm2               ; mm2 = 0
-    pxor mm5, mm5               ;
-    pcmpeqw mm2, mm0            ; mm2 = (0 == mm0)
-    pcmpeqw mm5, mm3            ;
-    pandn mm2, mm6              ; mm2 = (iszero ? 0 : add)
-    pandn mm5, mm6              ;
-    pxor mm0, mm1               ; mm0 = |mm0|
-    pxor mm3, mm4               ;
-    psubw mm0, mm1              ; displace
-    psubw mm3, mm4              ;
-    pmullw mm0, mm7             ; mm0 *= 2Q
-    pmullw mm3, mm7             ;
-    paddw mm0, mm2              ; mm0 += mm2 (add)
-    paddw mm3, mm5              ;
-    pxor mm0, mm1               ; mm0 *= sign(mm0)
-    pxor mm3, mm4               ;
-    psubw mm0, mm1              ; undisplace
-    psubw mm3, mm4
+align 16
+.des2_loop
+    movdqa xmm0, [esi + eax*8]          ; xmm0 = [coeff]
+    movdqa xmm3, [esi + eax*8 + 16]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm2, xmm2
+    pxor xmm5, xmm5
+    pcmpeqw xmm2, xmm0
+    pcmpeqw xmm5, xmm3
+    pandn xmm2, xmm6
+    pandn xmm5, xmm6
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    pmullw xmm0, xmm7
+    pmullw xmm3, xmm7
+    paddw xmm0, xmm2
+    paddw xmm3, xmm5
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
 
 %ifdef SATURATE
-    movq mm2, [mmx_32767_minus_2047]
-    movq mm4, [mmx_32768_minus_2048]
-    paddsw mm0, mm2
-    paddsw mm3, mm2
-    psubsw mm0, mm2
-    psubsw mm3, mm2
-    psubsw mm0, mm4
-    psubsw mm3, mm4
-    paddsw mm0, mm4
-    paddsw mm3, mm4
+    movdqu xmm2, [sse2_pos_2047]
+    movdqu xmm4, [sse2_neg_2048]
+    pminsw xmm0, xmm2
+    pminsw xmm3, xmm2
+    pmaxsw xmm0, xmm4
+    pmaxsw xmm3, xmm4
 %endif
 
-    movq [edi + 8*eax], mm0
-    movq [edi + 8*eax + 8], mm3
+    movdqa [edi + eax*8], xmm0
+    movdqa [edi + eax*8 + 16], xmm3
 
-    add eax, 2
+    add eax, 4
     cmp eax, 16
-    jnz near .loop
+    jnz near .des2_loop
 
     pop edi
     pop esi
 
-    ret
\ No newline at end of file
+    ret
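Finally, a minimal harness for spot-checking one of the functions against a scalar reference like dequant_ref above (a sketch assuming 32-bit x86, cdecl, and that this file is assembled and linked in; the prototype comes from the banner, and the caller is responsible for emms since the MMX routines leave the FP/MMX state in use):

    #include <stdint.h>
    #include <stdio.h>
    #include <mmintrin.h>

    extern void dequant_inter_mmx(int16_t *data, const int16_t *coeff,
                                  uint32_t quant);

    int main(void)
    {
        int16_t coeff[64] = {0}, data[64];
        coeff[0] = 3; coeff[1] = -300; coeff[63] = 7;
        dequant_inter_mmx(data, coeff, 5);
        _mm_empty();    /* emms */
        /* the scalar model predicts 35 -2048 75 */
        printf("%d %d %d\n", data[0], data[1], data[63]);
        return 0;
    }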