--- trunk/xvidcore/src/quant/x86_asm/quantize_mmx.asm	2002/03/08 02:46:11	3
+++ trunk/xvidcore/src/quant/x86_asm/quantize_mmx.asm	2002/07/07 13:21:34	269
@@ -32,6 +32,9 @@
 ; *
 ; * History:
 ; *
+; * 14.06.2002  mmx+xmm dequant_* funcs revamped  -Skal-
+; * 24.02.2002  sse2 quant_intra / dequant_intra (have to use movdqu ???)
+; * 17.04.2002  sse2 quant_inter / dequant_inter
 ; * 26.12.2001  minor bug fixes, dequant saturate, further optimization
 ; * 19.11.2001  quant_inter_mmx now returns sum of abs. coefficient values
 ; * 04.11.2001  nasm version; (c)2001 peter ross
@@ -58,7 +61,9 @@
 %endif
 %endmacro
 
-plus_one times 4 dw 1
+align 16
+
+plus_one times 8 dw 1
 
 ;===========================================================================
 ;
@@ -70,7 +75,7 @@
     times 4 dw %1 / 2
 %endmacro
 
-align ALIGN
+align 16
 mmx_sub
     MMX_SUB 1
     MMX_SUB 2
@@ -120,7 +125,7 @@
     times 4 dw (1 << 16) / (%1 * 2) + 1
 %endmacro
 
-align ALIGN
+align 16
 mmx_div
     MMX_DIV 1
     MMX_DIV 2
@@ -170,7 +175,7 @@
 %endif
 %endmacro
 
-align ALIGN
+align 16
 mmx_add
     MMX_ADD 1
     MMX_ADD 2
@@ -215,7 +220,7 @@
     times 4 dw %1 * 2
 %endmacro
 
-align ALIGN
+align 16
 mmx_mul
     MMX_MUL 1
     MMX_MUL 2
@@ -256,10 +261,17 @@
 ;
 ;===========================================================================
 
-align ALIGN
+align 8
 mmx_32768_minus_2048 times 4 dw (32768-2048)
 mmx_32767_minus_2047 times 4 dw (32767-2047)
 
+align 16
+mmx_2047 times 4 dw 2047
+
+align 16
+sse2_pos_2047 times 8 dw 2047
+sse2_neg_2048 times 8 dw -2048
+
 section .text
 
@@ -371,6 +383,111 @@
 
 ;===========================================================================
 ;
+; void quant_intra_sse2(int16_t * coeff,
+;                       const int16_t const * data,
+;                       const uint32_t quant,
+;                       const uint32_t dcscalar);
+;
+;===========================================================================
+
+align ALIGN
+cglobal quant_intra_sse2
+quant_intra_sse2
+
+    push esi
+    push edi
+
+    mov edi, [esp + 8 + 4]      ; coeff
+    mov esi, [esp + 8 + 8]      ; data
+    mov eax, [esp + 8 + 12]     ; quant
+
+    xor ecx, ecx
+    cmp al, 1
+    jz near .qas2_q1loop
+
+.qas2_not1
+    movq mm7, [mmx_div + eax*8 - 8]
+    movq2dq xmm7, mm7
+    movlhps xmm7, xmm7
+
+align 16
+.qas2_loop
+    movdqa xmm0, [esi + ecx*8]          ; xmm0 = [1st]
+    movdqa xmm3, [esi + ecx*8 + 16]     ; xmm3 = [2nd]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    pmulhw xmm0, xmm7
+    pmulhw xmm3, xmm7
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    movdqa [edi + ecx*8], xmm0
+    movdqa [edi + ecx*8 + 16], xmm3
+
+    add ecx, 4
+    cmp ecx, 16
+    jnz .qas2_loop
+
+.qas2_done
+    mov ecx, [esp + 8 + 16]     ; dcscalar
+    mov edx, ecx
+    movsx eax, word [esi]
+    shr edx, 1
+    cmp eax, 0
+    jg .qas2_gtzero
+
+    sub eax, edx
+    jmp short .qas2_mul
+.qas2_gtzero
+    add eax, edx
+.qas2_mul
+    cdq
+    idiv ecx
+
+    mov [edi], ax
+
+    pop edi
+    pop esi
+
+    ret
+
+align 16
+.qas2_q1loop
+    movdqa xmm0, [esi + ecx*8]          ; xmm0 = [1st]
+    movdqa xmm3, [esi + ecx*8 + 16]     ; xmm3 = [2nd]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    psrlw xmm0, 1
+    psrlw xmm3, 1
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    movdqa [edi + ecx*8], xmm0
+    movdqa [edi + ecx*8 + 16], xmm3
+
+    add ecx, 4
+    cmp ecx, 16
+    jnz .qas2_q1loop
+    jmp near .qas2_done
+
+
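For reference, a scalar C sketch of what the quant_intra_sse2 loop computes (illustrative names, not from the patch): the AC path divides |data[i]| by 2*quant -- the asm does this sign-free with the pcmpgtw/pxor/psubw absolute-value trick and a pmulhw against the mmx_div reciprocal table (see the note after quant_inter_sse2 for its exact rounding) -- and the DC path is the rounded division performed by the idiv sequence at .qas2_done:

    #include <stdint.h>
    #include <stdlib.h>

    static void quant_intra_ref(int16_t *coeff, const int16_t *data,
                                uint32_t quant, uint32_t dcscalar)
    {
        for (int i = 0; i < 64; i++) {          /* AC; coeff[0] redone below */
            int level = abs(data[i]) / (int)(2 * quant);
            coeff[i] = (int16_t)(data[i] < 0 ? -level : level);
        }
        /* DC: bias by dcscalar/2 toward the sign, then truncate like idiv */
        int dc   = data[0];
        int half = (int)dcscalar >> 1;
        coeff[0] = (int16_t)((dc > 0 ? dc + half : dc - half) / (int)dcscalar);
    }

The quant == 1 special case (.qas2_q1loop) is the same computation: its reciprocal multiplier would not fit a signed 16-bit word, so the division by 2 becomes a psrlw by 1 on the absolute value.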
+;===========================================================================
+;
 ; uint32_t quant_inter_mmx(int16_t * coeff,
 ;                          const int16_t const * data,
 ;                          const uint32_t quant);
@@ -476,6 +593,118 @@
 
 ;===========================================================================
 ;
+; uint32_t quant_inter_sse2(int16_t * coeff,
+;                           const int16_t const * data,
+;                           const uint32_t quant);
+;
+;===========================================================================
+
+align 16
+cglobal quant_inter_sse2
+quant_inter_sse2
+
+    push esi
+    push edi
+
+    mov edi, [esp + 8 + 4]      ; coeff
+    mov esi, [esp + 8 + 8]      ; data
+    mov eax, [esp + 8 + 12]     ; quant
+
+    xor ecx, ecx
+
+    pxor xmm5, xmm5             ; sum
+
+    movq mm0, [mmx_sub + eax*8 - 8]     ; sub
+    movq2dq xmm6, mm0                   ; load into low 8 bytes
+    movlhps xmm6, xmm6                  ; duplicate into high 8 bytes
+
+    cmp al, 1
+    jz near .qes2_q1loop
+
+.qes2_not1
+    movq mm0, [mmx_div + eax*8 - 8]     ; divider
+    movq2dq xmm7, mm0
+    movlhps xmm7, xmm7
+
+align 16
+.qes2_loop
+    movdqa xmm0, [esi + ecx*8]          ; xmm0 = [1st]
+    movdqa xmm3, [esi + ecx*8 + 16]     ; xmm3 = [2nd]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    psubusw xmm0, xmm6
+    psubusw xmm3, xmm6
+    pmulhw xmm0, xmm7
+    pmulhw xmm3, xmm7
+    paddw xmm5, xmm0
+    pxor xmm0, xmm1
+    paddw xmm5, xmm3
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    movdqa [edi + ecx*8], xmm0
+    movdqa [edi + ecx*8 + 16], xmm3
+
+    add ecx, 4
+    cmp ecx, 16
+    jnz .qes2_loop
+
+.qes2_done
+    movdqu xmm6, [plus_one]
+    pmaddwd xmm5, xmm6
+    movhlps xmm6, xmm5
+    paddd xmm5, xmm6
+    movdq2q mm0, xmm5
+
+    movq mm5, mm0
+    psrlq mm5, 32
+    paddd mm0, mm5
+    movd eax, mm0               ; return sum
+
+    pop edi
+    pop esi
+
+    ret
+
+align 16
+.qes2_q1loop
+    movdqa xmm0, [esi + ecx*8]          ; xmm0 = [1st]
+    movdqa xmm3, [esi + ecx*8 + 16]     ; xmm3 = [2nd]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    psubusw xmm0, xmm6
+    psubusw xmm3, xmm6
+    psrlw xmm0, 1
+    psrlw xmm3, 1
+    paddw xmm5, xmm0
+    pxor xmm0, xmm1
+    paddw xmm5, xmm3
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    movdqa [edi + ecx*8], xmm0
+    movdqa [edi + ecx*8 + 16], xmm3
+
+    add ecx, 4
+    cmp ecx, 16
+    jnz .qes2_q1loop
+    jmp .qes2_done
+
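Both SSE2 quantizers lean on the mmx_div table: the entry for quant q is (1 << 16) / (2*q) + 1, so the signed 16-bit multiply-high (pmulhw) turns the per-coefficient division into level = (x * m) >> 16. Because of the "+ 1" the result is never below x / (2*q), but it can land one above it when x sits just under a multiple of 2*q; q == 1 is special-cased with a shift because its multiplier (32769) does not fit a signed word. A standalone C check of that behaviour (illustration only, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        for (uint32_t q = 2; q <= 31; q++) {         /* q == 1 uses psrlw */
            uint32_t m = (1u << 16) / (2 * q) + 1;   /* the mmx_div entry */
            for (uint32_t x = 0; x <= 2047; x++) {   /* |coeff| range */
                uint32_t approx = (x * m) >> 16;
                uint32_t exact  = x / (2 * q);
                if (approx != exact)
                    printf("q=%2u x=%4u: %u vs %u (rounds up near a step)\n",
                           q, x, approx, exact);
            }
        }
        return 0;
    }

The returned sum of absolute levels is a plain horizontal reduction: pmaddwd against plus_one (eight words of 1) folds word pairs into dwords, then movhlps and psrlq fold the four dwords down to one in eax.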
+;===========================================================================
+;
 ; void dequant_intra_mmx(int16_t *data,
 ;                        const int16_t const *coeff,
 ;                        const uint32_t quant,
@@ -483,96 +712,249 @@
 ;
 ;===========================================================================
 
+  ; note: we only saturate to +2047 *before* restoring the sign.
+  ; Hence, final clamp really is [-2048,2047]
+
 align ALIGN
 cglobal dequant_intra_mmx
-dequant_intra_mmx
+dequant_intra_mmx:
 
-    push esi
-    push edi
+    mov edx, [esp+ 4]           ; data
+    mov ecx, [esp+ 8]           ; coeff
+    mov eax, [esp+12]           ; quant
+    movq mm6, [mmx_add + eax*8 - 8]     ; quant or quant-1
+    movq mm7, [mmx_mul + eax*8 - 8]     ; 2*quant
+    mov eax, -16
 
-    mov edi, [esp + 8 + 4]      ; data
-    mov esi, [esp + 8 + 8]      ; coeff
-    mov eax, [esp + 8 + 12]     ; quant
+align ALIGN
+.loop
+    movq mm0, [ecx+8*eax+8*16]          ; c  = coeff[i]
+    movq mm3, [ecx+8*eax+8*16 + 8]      ; c' = coeff[i+1]
+    pxor mm1, mm1
+    pxor mm4, mm4
+    pcmpgtw mm1, mm0            ; sign(c)
+    pcmpgtw mm4, mm3            ; sign(c')
+    pxor mm2, mm2
+    pxor mm5, mm5
+    pcmpeqw mm2, mm0            ; c is zero
+    pcmpeqw mm5, mm3            ; c' is zero
+    pandn mm2, mm6              ; offset = isZero ? 0 : quant_add
+    pandn mm5, mm6
+    pxor mm0, mm1               ; negate if negative
+    pxor mm3, mm4               ; negate if negative
+    psubw mm0, mm1
+    psubw mm3, mm4
+    pmullw mm0, mm7             ; *= 2Q
+    pmullw mm3, mm7             ; *= 2Q
+    paddw mm0, mm2              ; + offset
+    paddw mm3, mm5              ; + offset
+    paddw mm0, mm1              ; negate back
+    paddw mm3, mm4              ; negate back
+
+    ; saturates to +2047
+    movq mm2, [mmx_32767_minus_2047]
+    add eax, 2
+    paddsw mm0, mm2
+    paddsw mm3, mm2
+    psubsw mm0, mm2
+    psubsw mm3, mm2
+
+    pxor mm0, mm1
+    pxor mm3, mm4
+    movq [edx + 8*eax + 8*16   - 2*8], mm0
+    movq [edx + 8*eax + 8*16+8 - 2*8], mm3
+    jnz near .loop
+
+    ; deal with DC
+
+    movd mm0, [ecx]
+    pmullw mm0, [esp+16]        ; dcscalar
+    movq mm2, [mmx_32767_minus_2047]
+    paddsw mm0, mm2
+    psubsw mm0, mm2
+    movq mm3, [mmx_32768_minus_2048]
+    psubsw mm0, mm3
+    paddsw mm0, mm3
+    movd eax, mm0
+    mov [edx], ax
 
-    movq mm6, [mmx_add + eax * 8 - 8]
-    movq mm7, [mmx_mul + eax * 8 - 8]
-    xor eax, eax
+    ret
+
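In scalar terms the revamped loop is the usual MPEG-4 dequantisation, with mm6 holding the rounding offset ("quant or quant-1": quant when odd, quant-1 when even) and the clamp applied before the sign comes back, as the note above says. A C sketch (illustrative names; it ignores the 16-bit wrap-around of pmullw on extreme products):

    #include <stdint.h>
    #include <stdlib.h>

    static void dequant_ref(int16_t *data, const int16_t *coeff, uint32_t quant)
    {
        int add = (quant & 1) ? (int)quant : (int)quant - 1;  /* mmx_add */
        for (int i = 0; i < 64; i++) {
            int c = coeff[i];
            if (c == 0) { data[i] = 0; continue; }  /* pandn kills the offset */
            int v = 2 * (int)quant * abs(c) + add;
            if (c > 0)      /* clamp before the sign is restored... */
                data[i] = (int16_t)(v > 2047 ?  2047 : v);
            else            /* ...which is why negatives reach -2048 */
                data[i] = (int16_t)(v > 2048 ? -2048 : -v);
        }
    }

The paddsw/psubsw pair against 32767-2047 is min(v, 2047) in two saturating steps: adding 30720 pushes anything above 2047 into saturation at 32767, and subtracting 30720 brings the survivors back unchanged.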
+;===========================================================================
+;
+; void dequant_intra_xmm(int16_t *data,
+;                        const int16_t const *coeff,
+;                        const uint32_t quant,
+;                        const uint32_t dcscalar);
+;
+;===========================================================================
+
+  ; this is the same as dequant_intra_mmx, except that we're
+  ; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
+
+align ALIGN
+cglobal dequant_intra_xmm
+dequant_intra_xmm:
+
+    mov edx, [esp+ 4]           ; data
+    mov ecx, [esp+ 8]           ; coeff
+    mov eax, [esp+12]           ; quant
+    movq mm6, [mmx_add + eax*8 - 8]     ; quant or quant-1
+    movq mm7, [mmx_mul + eax*8 - 8]     ; 2*quant
+    mov eax, -16
 
 align ALIGN
 .loop
-    movq mm0, [esi + 8*eax]     ; mm0 = [coeff]
-    movq mm3, [esi + 8*eax + 8] ;
-    pxor mm1, mm1               ; mm1 = 0
-    pxor mm4, mm4               ;
-    pcmpgtw mm1, mm0            ; mm1 = (0 > mm0)
-    pcmpgtw mm4, mm3            ;
-    pxor mm2, mm2               ; mm2 = 0
-    pxor mm5, mm5               ;
-    pcmpeqw mm2, mm0            ; mm2 = (0 == mm0)
-    pcmpeqw mm5, mm3            ;
-    pandn mm2, mm6              ; mm2 = (iszero ? 0 : add)
-    pandn mm5, mm6              ;
-    pxor mm0, mm1               ; mm0 = |mm0|
-    pxor mm3, mm4               ;
-    psubw mm0, mm1              ; displace
-    psubw mm3, mm4              ;
-    pmullw mm0, mm7             ; mm0 *= 2Q
-    pmullw mm3, mm7             ;
-    paddw mm0, mm2              ; mm0 += mm2 (add)
-    paddw mm3, mm5              ;
-    pxor mm0, mm1               ; mm0 *= sign(mm0)
-    pxor mm3, mm4               ;
-    psubw mm0, mm1              ; undisplace
-    psubw mm3, mm4
+    movq mm0, [ecx+8*eax+8*16]          ; c  = coeff[i]
+    movq mm3, [ecx+8*eax+8*16 + 8]      ; c' = coeff[i+1]
+    pxor mm1, mm1
+    pxor mm4, mm4
+    pcmpgtw mm1, mm0            ; sign(c)
+    pcmpgtw mm4, mm3            ; sign(c')
+    pxor mm2, mm2
+    pxor mm5, mm5
+    pcmpeqw mm2, mm0            ; c is zero
+    pcmpeqw mm5, mm3            ; c' is zero
+    pandn mm2, mm6              ; offset = isZero ? 0 : quant_add
+    pandn mm5, mm6
+    pxor mm0, mm1               ; negate if negative
+    pxor mm3, mm4               ; negate if negative
+    psubw mm0, mm1
+    psubw mm3, mm4
+    pmullw mm0, mm7             ; *= 2Q
+    pmullw mm3, mm7             ; *= 2Q
+    paddw mm0, mm2              ; + offset
+    paddw mm3, mm5              ; + offset
+    paddw mm0, mm1              ; negate back
+    paddw mm3, mm4              ; negate back
+
+    ; saturates to +2047
+    movq mm2, [mmx_2047]
+    pminsw mm0, mm2
+    add eax, 2
+    pminsw mm3, mm2
+
+    pxor mm0, mm1
+    pxor mm3, mm4
+    movq [edx + 8*eax + 8*16   - 2*8], mm0
+    movq [edx + 8*eax + 8*16+8 - 2*8], mm3
+    jnz near .loop
+
+    ; deal with DC
+
+    movd mm0, [ecx]
+    pmullw mm0, [esp+16]        ; dcscalar
+    movq mm2, [mmx_32767_minus_2047]
+    paddsw mm0, mm2
+    psubsw mm0, mm2
+    movq mm2, [mmx_32768_minus_2048]
+    psubsw mm0, mm2
+    paddsw mm0, mm2
+    movd eax, mm0
+    mov [edx], ax
+
+    ret
+
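The xmm variant's saving comes from pminsw, and it keeps the asymmetric clamp because the two's-complement negation is split around it: negating with the pcmpgtw mask m is (x + m) ^ m, so the paddw runs before the clamp and the pxor after it. Scalar equivalent of that tail (illustration only):

    #include <stdint.h>

    /* v = 2*quant*|c| + add, neg = (c < 0), as in the loop above */
    static int16_t clamp_restore_sign(int v, int neg)
    {
        int m = neg ? -1 : 0;       /* the pcmpgtw mask */
        int t = v + m;              /* paddw: first half of (v + m) ^ m */
        if (t > 2047) t = 2047;     /* pminsw against mmx_2047 */
        return (int16_t)(t ^ m);    /* pxor: result is in [-2048, 2047] */
    }

For example, v = 2050 with neg = 1 gives t = 2049 -> 2047 -> -2048, exactly the lower bound promised by the note before dequant_intra_mmx.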
+;===========================================================================
+;
+; void dequant_intra_sse2(int16_t *data,
+;                         const int16_t const *coeff,
+;                         const uint32_t quant,
+;                         const uint32_t dcscalar);
+;
+;===========================================================================
+
+align 16
+cglobal dequant_intra_sse2
+dequant_intra_sse2:
+
+    push esi
+    push edi
+
+    mov edi, [esp + 8 + 4]      ; data
+    mov esi, [esp + 8 + 8]      ; coeff
+    mov eax, [esp + 8 + 12]     ; quant
+
+    movq mm6, [mmx_add + eax*8 - 8]
+    movq mm7, [mmx_mul + eax*8 - 8]
+    movq2dq xmm6, mm6
+    movq2dq xmm7, mm7
+    movlhps xmm6, xmm6
+    movlhps xmm7, xmm7
+
+    xor eax, eax
+
+align 16
+.das2_loop
+    movdqa xmm0, [esi + eax*8]
+    movdqa xmm3, [esi + eax*8 + 16]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm2, xmm2
+    pxor xmm5, xmm5
+    pcmpeqw xmm2, xmm0
+    pcmpeqw xmm5, xmm3
+    pandn xmm2, xmm6
+    pandn xmm5, xmm6
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    pmullw xmm0, xmm7
+    pmullw xmm3, xmm7
+    paddw xmm0, xmm2
+    paddw xmm3, xmm5
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
 
 %ifdef SATURATE
-    movq mm2, [mmx_32767_minus_2047]
-    movq mm4, [mmx_32768_minus_2048]
-    paddsw mm0, mm2
-    paddsw mm3, mm2
-    psubsw mm0, mm2
-    psubsw mm3, mm2
-    psubsw mm0, mm4
-    psubsw mm3, mm4
-    paddsw mm0, mm4
-    paddsw mm3, mm4
+    movdqu xmm2, [sse2_pos_2047]
+    movdqu xmm4, [sse2_neg_2048]
+    pminsw xmm0, xmm2
+    pminsw xmm3, xmm2
+    pmaxsw xmm0, xmm4
+    pmaxsw xmm3, xmm4
 %endif
 
-    movq [edi + 8*eax], mm0     ; [data] = mm0
-    movq [edi + 8*eax + 8], mm3
+    movdqa [edi + eax*8], xmm0
+    movdqa [edi + eax*8 + 16], xmm3
 
-    add eax, 2
-    cmp eax, 16
-    jnz near .loop
+    add eax, 4
+    cmp eax, 16
+    jnz near .das2_loop
 
-    mov ax, [esi]               ; ax = data[0]
-    imul ax, [esp + 8 + 16]     ; eax = data[0] * dcscalar
+    mov ax, [esi]               ; ax = coeff[0]
+    imul ax, [esp + 8 + 16]     ; ax = coeff[0] * dcscalar
 
 %ifdef SATURATE
-    cmp ax, -2048
-    jl .set_n2048
-    cmp ax, 2047
-    jg .set_2047
+    cmp ax, -2048
+    jl .das2_set_n2048
+    cmp ax, 2047
+    jg .das2_set_2047
 %endif
-    mov [edi], ax
+    mov [edi], ax
 
-    pop edi
-    pop esi
+    pop edi
+    pop esi
 
     ret
 
 %ifdef SATURATE
-align ALIGN
-.set_n2048
-    mov word [edi], -2048
-    pop edi
-    pop esi
+align 16
+.das2_set_n2048
+    mov word [edi], -2048
+    pop edi
+    pop esi
     ret
 
-align ALIGN
-.set_2047
-    mov word [edi], 2047
-    pop edi
-    pop esi
+align 16
+.das2_set_2047
+    mov word [edi], 2047
+    pop edi
+    pop esi
     ret
 %endif
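The SSE2 versions avoid adding 8-word copies of the mmx_add/mmx_mul tables: they load the existing 4-word entry into the low quadword (movq2dq) and duplicate it into the high one (movlhps). An intrinsics rendering of that broadcast (sketch only; the patch naturally stays in asm):

    #include <emmintrin.h>      /* SSE2 */

    static __m128i splat_quadword(__m64 four_words)  /* e.g. a mmx_mul entry */
    {
        __m128i lo = _mm_movpi64_epi64(four_words);  /* movq2dq: low 8 bytes */
        return _mm_unpacklo_epi64(lo, lo);           /* same effect as movlhps */
    }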
@@ -586,7 +968,128 @@
 
 align ALIGN
 cglobal dequant_inter_mmx
-dequant_inter_mmx
+dequant_inter_mmx:
+
+    mov edx, [esp+ 4]           ; data
+    mov ecx, [esp+ 8]           ; coeff
+    mov eax, [esp+12]           ; quant
+    movq mm6, [mmx_add + eax*8 - 8]     ; quant or quant-1
+    movq mm7, [mmx_mul + eax*8 - 8]     ; 2*quant
+    mov eax, -16
+
+align ALIGN
+.loop
+    movq mm0, [ecx+8*eax+8*16]          ; c  = coeff[i]
+    movq mm3, [ecx+8*eax+8*16 + 8]      ; c' = coeff[i+1]
+    pxor mm1, mm1
+    pxor mm4, mm4
+    pcmpgtw mm1, mm0            ; sign(c)
+    pcmpgtw mm4, mm3            ; sign(c')
+    pxor mm2, mm2
+    pxor mm5, mm5
+    pcmpeqw mm2, mm0            ; c is zero
+    pcmpeqw mm5, mm3            ; c' is zero
+    pandn mm2, mm6              ; offset = isZero ? 0 : quant_add
+    pandn mm5, mm6
+    pxor mm0, mm1               ; negate if negative
+    pxor mm3, mm4               ; negate if negative
+    psubw mm0, mm1
+    psubw mm3, mm4
+    pmullw mm0, mm7             ; *= 2Q
+    pmullw mm3, mm7             ; *= 2Q
+    paddw mm0, mm2              ; + offset
+    paddw mm3, mm5              ; + offset
+    paddw mm0, mm1              ; negate back
+    paddw mm3, mm4              ; negate back
+
+    ; saturates to +2047
+    movq mm2, [mmx_32767_minus_2047]
+    add eax, 2
+    paddsw mm0, mm2
+    paddsw mm3, mm2
+    psubsw mm0, mm2
+    psubsw mm3, mm2
+
+    pxor mm0, mm1
+    pxor mm3, mm4
+    movq [edx + 8*eax + 8*16   - 2*8], mm0
+    movq [edx + 8*eax + 8*16+8 - 2*8], mm3
+    jnz near .loop
+
+    ret
+
+;===========================================================================
+;
+; void dequant_inter_xmm(int16_t * data,
+;                        const int16_t * const coeff,
+;                        const uint32_t quant);
+;
+;===========================================================================
+
+  ; this is the same as dequant_inter_mmx,
+  ; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
+
+align ALIGN
+cglobal dequant_inter_xmm
+dequant_inter_xmm:
+
+    mov edx, [esp+ 4]           ; data
+    mov ecx, [esp+ 8]           ; coeff
+    mov eax, [esp+12]           ; quant
+    movq mm6, [mmx_add + eax*8 - 8]     ; quant or quant-1
+    movq mm7, [mmx_mul + eax*8 - 8]     ; 2*quant
+    mov eax, -16
+
+align ALIGN
+.loop
+    movq mm0, [ecx+8*eax+8*16]          ; c  = coeff[i]
+    movq mm3, [ecx+8*eax+8*16 + 8]      ; c' = coeff[i+1]
+    pxor mm1, mm1
+    pxor mm4, mm4
+    pcmpgtw mm1, mm0            ; sign(c)
+    pcmpgtw mm4, mm3            ; sign(c')
+    pxor mm2, mm2
+    pxor mm5, mm5
+    pcmpeqw mm2, mm0            ; c is zero
+    pcmpeqw mm5, mm3            ; c' is zero
+    pandn mm2, mm6              ; offset = isZero ? 0 : quant_add
+    pandn mm5, mm6
+    pxor mm0, mm1               ; negate if negative
+    pxor mm3, mm4               ; negate if negative
+    psubw mm0, mm1
+    psubw mm3, mm4
+    pmullw mm0, mm7             ; *= 2Q
+    pmullw mm3, mm7             ; *= 2Q
+    paddw mm0, mm2              ; + offset
+    paddw mm3, mm5              ; + offset
+    paddw mm0, mm1              ; start restoring sign
+    paddw mm3, mm4              ; start restoring sign
+
+    ; saturates to +2047
+    movq mm2, [mmx_2047]
+    pminsw mm0, mm2
+    add eax, 2
+    pminsw mm3, mm2
+
+    pxor mm0, mm1               ; finish restoring sign
+    pxor mm3, mm4               ; finish restoring sign
+    movq [edx + 8*eax + 8*16   - 2*8], mm0
+    movq [edx + 8*eax + 8*16+8 - 2*8], mm3
+    jnz near .loop
+
+    ret
+
+;===========================================================================
+;
+; void dequant_inter_sse2(int16_t * data,
+;                         const int16_t * const coeff,
+;                         const uint32_t quant);
+;
+;===========================================================================
+
+align 16
+cglobal dequant_inter_sse2
+dequant_inter_sse2
 
     push esi
     push edi
@@ -596,57 +1099,58 @@
     mov edi, [esp + 8 + 4]      ; data
     mov esi, [esp + 8 + 8]      ; coeff
     mov eax, [esp + 8 + 12]     ; quant
     movq mm6, [mmx_add + eax * 8 - 8]
     movq mm7, [mmx_mul + eax * 8 - 8]
+
+    movq2dq xmm6, mm6
+    movq2dq xmm7, mm7
+    movlhps xmm6, xmm6
+    movlhps xmm7, xmm7
 
     xor eax, eax
 
-align ALIGN
-.loop
-    movq mm0, [esi + 8*eax]     ; mm0 = [coeff]
-    movq mm3, [esi + 8*eax + 8] ;
-    pxor mm1, mm1               ; mm1 = 0
-    pxor mm4, mm4               ;
-    pcmpgtw mm1, mm0            ; mm1 = (0 > mm0)
-    pcmpgtw mm4, mm3            ;
-    pxor mm2, mm2               ; mm2 = 0
-    pxor mm5, mm5               ;
-    pcmpeqw mm2, mm0            ; mm2 = (0 == mm0)
-    pcmpeqw mm5, mm3            ;
-    pandn mm2, mm6              ; mm2 = (iszero ? 0 : add)
-    pandn mm5, mm6              ;
-    pxor mm0, mm1               ; mm0 = |mm0|
-    pxor mm3, mm4               ;
-    psubw mm0, mm1              ; displace
-    psubw mm3, mm4              ;
-    pmullw mm0, mm7             ; mm0 *= 2Q
-    pmullw mm3, mm7             ;
-    paddw mm0, mm2              ; mm0 += mm2 (add)
-    paddw mm3, mm5              ;
-    pxor mm0, mm1               ; mm0 *= sign(mm0)
-    pxor mm3, mm4               ;
-    psubw mm0, mm1              ; undisplace
-    psubw mm3, mm4
+align 16
+.des2_loop
+    movdqa xmm0, [esi + eax*8]          ; xmm0 = [coeff]
+    movdqa xmm3, [esi + eax*8 + 16]
+    pxor xmm1, xmm1
+    pxor xmm4, xmm4
+    pcmpgtw xmm1, xmm0
+    pcmpgtw xmm4, xmm3
+    pxor xmm2, xmm2
+    pxor xmm5, xmm5
+    pcmpeqw xmm2, xmm0
+    pcmpeqw xmm5, xmm3
+    pandn xmm2, xmm6
+    pandn xmm5, xmm6
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
+    pmullw xmm0, xmm7
+    pmullw xmm3, xmm7
+    paddw xmm0, xmm2
+    paddw xmm3, xmm5
+    pxor xmm0, xmm1
+    pxor xmm3, xmm4
+    psubw xmm0, xmm1
+    psubw xmm3, xmm4
 
 %ifdef SATURATE
-    movq mm2, [mmx_32767_minus_2047]
-    movq mm4, [mmx_32768_minus_2048]
-    paddsw mm0, mm2
-    paddsw mm3, mm2
-    psubsw mm0, mm2
-    psubsw mm3, mm2
-    psubsw mm0, mm4
-    psubsw mm3, mm4
-    paddsw mm0, mm4
-    paddsw mm3, mm4
+    movdqu xmm2, [sse2_pos_2047]
+    movdqu xmm4, [sse2_neg_2048]
+    pminsw xmm0, xmm2
+    pminsw xmm3, xmm2
+    pmaxsw xmm0, xmm4
+    pmaxsw xmm3, xmm4
 %endif
 
-    movq [edi + 8*eax], mm0
-    movq [edi + 8*eax + 8], mm3
+    movdqa [edi + eax*8], xmm0
+    movdqa [edi + eax*8 + 16], xmm3
 
-    add eax, 2
+    add eax, 4
     cmp eax, 16
-    jnz near .loop
+    jnz near .des2_loop
 
     pop edi
     pop esi
 
-    ret
\ No newline at end of file
+    ret
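Finally, a minimal harness for spot-checking one of the functions against a scalar reference like dequant_ref above (a sketch assuming 32-bit x86, cdecl, and that this file is assembled and linked in; the prototype comes from the banner, and the caller is responsible for emms since the MMX routines leave the FP/MMX state in use):

    #include <stdint.h>
    #include <stdio.h>
    #include <mmintrin.h>

    extern void dequant_inter_mmx(int16_t *data, const int16_t *coeff,
                                  uint32_t quant);

    int main(void)
    {
        int16_t coeff[64] = {0}, data[64];
        coeff[0] = 3; coeff[1] = -300; coeff[63] = 7;
        dequant_inter_mmx(data, coeff, 5);
        _mm_empty();    /* emms */
        /* the scalar model predicts 35 -2048 75 */
        printf("%d %d %d\n", data[0], data[1], data[63]);
        return 0;
    }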