--- trunk/xvidcore/src/quant/x86_asm/quantize_mmx.asm	2002/04/24 12:21:43	135
+++ trunk/xvidcore/src/quant/x86_asm/quantize_mmx.asm	2002/11/17 00:41:20	653
@@ -1,42 +1,58 @@
-;/**************************************************************************
+;/*****************************************************************************
 ; *
-; * XVID MPEG-4 VIDEO CODEC
-; * mmx quantization/dequantization
+; * XVID MPEG-4 VIDEO CODEC
+; * mmx optimized quantization/dequantization
 ; *
-; * This program is an implementation of a part of one or more MPEG-4
-; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
-; * to use this software module in hardware or software products are
-; * advised that its use may infringe existing patents or copyrights, and
-; * any such use would be at such party's own risk. The original
-; * developer of this software module and his/her company, and subsequent
-; * editors and their companies, will have no liability for use of this
-; * software or modifications or derivatives thereof.
+; * Copyright(C) 2002 Peter Ross
+; * Copyright(C) 2002 Michael Militzer
+; * Copyright(C) 2002 Pascal Massimino
 ; *
-; * This program is free software; you can redistribute it and/or modify
-; * it under the terms of the GNU General Public License as published by
-; * the Free Software Foundation; either version 2 of the License, or
-; * (at your option) any later version.
+; * This file is part of XviD, a free MPEG-4 video encoder/decoder
 ; *
-; * This program is distributed in the hope that it will be useful,
-; * but WITHOUT ANY WARRANTY; without even the implied warranty of
-; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; * GNU General Public License for more details.
+; * XviD is free software; you can redistribute it and/or modify it
+; * under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 2 of the License, or
+; * (at your option) any later version.
 ; *
-; * You should have received a copy of the GNU General Public License
-; * along with this program; if not, write to the Free Software
-; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
 ; *
-; *************************************************************************/
-
-;/**************************************************************************
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+; *
+; * Under section 8 of the GNU General Public License, the copyright
+; * holders of XVID explicitly forbid distribution in the following
+; * countries:
+; *
+; * - Japan
+; * - United States of America
+; *
+; * Linking XviD statically or dynamically with other modules is making a
+; * combined work based on XviD. Thus, the terms and conditions of the
+; * GNU General Public License cover the whole combination.
 ; *
-; * History:
+; * As a special exception, the copyright holders of XviD give you
+; * permission to link XviD with independent modules that communicate with
+; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
+; * license terms of these independent modules, and to copy and distribute
+; * the resulting combined work under terms of your choice, provided that
+; * every copy of the combined work is accompanied by a complete copy of
+; * the source code of XviD (the version of XviD used to produce the
+; * combined work), being distributed under the terms of the GNU General
+; * Public License plus this exception. An independent module is a module
+; * which is not derived from or based on XviD.
 ; *
-; * 24.02.2002	sse2 quant_intra / dequant_intra (have to use movdqu ???)
-; * 17.04.2002	sse2 quant_inter / dequant_inter
-; * 26.12.2001	minor bug fixes, dequant saturate, further optimization
-; * 19.11.2001	quant_inter_mmx now returns sum of abs. coefficient values
-; * 04.11.2001	nasm version; (c)2001 peter ross
+; * Note that people who make modified versions of XviD are not obligated
+; * to grant this special exception for their modified versions; it is
+; * their choice whether to do so. The GNU General Public License gives
+; * permission to release a modified version without this exception; this
+; * exception also makes it possible to release a modified version which
+; * carries forward this exception.
+; *
+; * $Id: quantize_mmx.asm,v 1.7 2002-11-17 00:41:20 edgomez Exp $
 ; *
 ; *************************************************************************/
@@ -64,7 +80,6 @@
 plus_one times 8 dw 1
 
-
 ;===========================================================================
 ;
 ; subtract by Q/2 table
 ;
 ;===========================================================================
@@ -261,13 +276,15 @@
 ;
 ;===========================================================================
 
-align ALIGN
-mmx_32768_minus_2048	times 4 dw (32768-2048)
-mmx_32767_minus_2047	times 4 dw (32767-2047)
+align 16
+sse2_2047	times 8 dw 2047
 
 align 16
-sse2_pos_2047	times 8 dw 2047
-sse2_neg_2048	times 8 dw -2048
+mmx_2047	times 4 dw 2047
+
+align 8
+mmx_32768_minus_2048	times 4 dw (32768-2048)
+mmx_32767_minus_2047	times 4 dw (32767-2047)
 
 section .text
@@ -700,7 +717,6 @@
 	jmp	.qes2_done
 
-
 ;===========================================================================
 ;
 ; void dequant_intra_mmx(int16_t *data,
 ;                        const int16_t const *coeff,
 ;                        const uint32_t quant,
 ;                        const uint32_t dcscalar);
 ;
 ;===========================================================================
 
+	; note: we only saturate to +2047 *before* restoring the sign.
+	; Hence, final clamp really is [-2048,2047]
+
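For reference, this is the scalar operation the rewritten loop below implements. The sketch is editorial, not part of the patch; `quant_add` is an assumption standing in for the `mmx_add` table entry (quant when quant is odd, quant-1 otherwise, per the "quant or quant-1" comments), and it assumes products stay in 16-bit range, whereas pmullw keeps only the low word:

    #include <stdint.h>

    static void dequant_intra_ref(int16_t *data, const int16_t *coeff,
                                  uint32_t quant, uint32_t dcscalar)
    {
        /* assumption: mmx_add entry == (quant & 1) ? quant : quant - 1 */
        const int quant_add = (quant & 1) ? (int)quant : (int)quant - 1;
        int i, dc;

        /* the vector code runs the same formula over all 64 words
           and then patches data[0] afterwards */
        for (i = 0; i < 64; i++) {
            int level = coeff[i];
            if (level == 0) {
                data[i] = 0;                      /* pcmpeqw/pandn path */
            } else if (level < 0) {
                level = -level * 2 * (int)quant + quant_add;
                data[i] = (level >= 2048) ? -2048 : (int16_t)-level;
            } else {
                level = level * 2 * (int)quant + quant_add;
                data[i] = (level > 2047) ? 2047 : (int16_t)level;
            }
        }

        /* DC is scaled by dcscalar instead, then clamped to the same range */
        dc = coeff[0] * (int)dcscalar;
        if (dc < -2048) dc = -2048;
        if (dc >  2047) dc =  2047;
        data[0] = (int16_t)dc;
    }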
align ALIGN
cglobal dequant_intra_mmx
-dequant_intra_mmx
-
-	push	esi
-	push	edi
+dequant_intra_mmx:
 
-	mov	edi, [esp + 8 + 4]	; data
-	mov	esi, [esp + 8 + 8]	; coeff
-	mov	eax, [esp + 8 + 12]	; quant
-
-	movq	mm6, [mmx_add + eax * 8 - 8]
-	movq	mm7, [mmx_mul + eax * 8 - 8]
-	xor	eax, eax
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
+	movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
+	mov	eax, -16
 
align ALIGN
.loop
-	movq	mm0, [esi + 8*eax]	; mm0 = [coeff]
-	movq	mm3, [esi + 8*eax + 8]	;
-	pxor	mm1, mm1		; mm1 = 0
-	pxor	mm4, mm4		;
-	pcmpgtw	mm1, mm0		; mm1 = (0 > mm0)
-	pcmpgtw	mm4, mm3		;
-	pxor	mm2, mm2		; mm2 = 0
-	pxor	mm5, mm5		;
-	pcmpeqw	mm2, mm0		; mm2 = (0 == mm0)
-	pcmpeqw	mm5, mm3		;
-	pandn	mm2, mm6		; mm2 = (iszero ? 0 : add)
-	pandn	mm5, mm6		;
-	pxor	mm0, mm1		; mm0 = |mm0|
-	pxor	mm3, mm4		;
-	psubw	mm0, mm1		; displace
-	psubw	mm3, mm4		;
-	pmullw	mm0, mm7		; mm0 *= 2Q
-	pmullw	mm3, mm7		;
-	paddw	mm0, mm2		; mm0 += mm2 (add)
-	paddw	mm3, mm5		;
-	pxor	mm0, mm1		; mm0 *= sign(mm0)
-	pxor	mm3, mm4		;
-	psubw	mm0, mm1		; undisplace
-	psubw	mm3, mm4
+	movq	mm0, [ecx+8*eax+8*16]		; c  = coeff[i]
+	movq	mm3, [ecx+8*eax+8*16 + 8]	; c' = coeff[i+1]
+	pxor	mm1, mm1
+	pxor	mm4, mm4
+	pcmpgtw	mm1, mm0			; sign(c)
+	pcmpgtw	mm4, mm3			; sign(c')
+	pxor	mm2, mm2
+	pxor	mm5, mm5
+	pcmpeqw	mm2, mm0			; c is zero
+	pcmpeqw	mm5, mm3			; c' is zero
+	pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
+	pandn	mm5, mm6
+	pxor	mm0, mm1			; negate if negative
+	pxor	mm3, mm4			; negate if negative
+	psubw	mm0, mm1
+	psubw	mm3, mm4
+	pmullw	mm0, mm7			; *= 2Q
+	pmullw	mm3, mm7			; *= 2Q
+	paddw	mm0, mm2			; + offset
+	paddw	mm3, mm5			; + offset
+	paddw	mm0, mm1			; negate back
+	paddw	mm3, mm4			; negate back
+
+	; saturates to +2047
+	movq	mm2, [mmx_32767_minus_2047]
+	add	eax, 2
+	paddsw	mm0, mm2
+	paddsw	mm3, mm2
+	psubsw	mm0, mm2
+	psubsw	mm3, mm2
+
+	pxor	mm0, mm1
+	pxor	mm3, mm4
+	movq	[edx + 8*eax + 8*16   - 2*8], mm0
+	movq	[edx + 8*eax + 8*16+8 - 2*8], mm3
+	jnz	near .loop
+
+	; deal with DC
+
+	movd	mm0, [ecx]
+	pmullw	mm0, [esp+16]	; dcscalar
+	movq	mm2, [mmx_32767_minus_2047]
+	paddsw	mm0, mm2
+	psubsw	mm0, mm2
+	movq	mm3, [mmx_32768_minus_2048]
+	psubsw	mm0, mm3
+	paddsw	mm0, mm3
+	movd	eax, mm0
+	mov	[edx], ax
 
-%ifdef SATURATE
-	movq	mm2, [mmx_32767_minus_2047]
-	movq	mm4, [mmx_32768_minus_2048]
-	paddsw	mm0, mm2
-	paddsw	mm3, mm2
-	psubsw	mm0, mm2
-	psubsw	mm3, mm2
-	psubsw	mm0, mm4
-	psubsw	mm3, mm4
-	paddsw	mm0, mm4
-	paddsw	mm3, mm4
-%endif
+	ret
 
-	movq	[edi + 8*eax], mm0	; [data] = mm0
-	movq	[edi + 8*eax + 8], mm3
-
-	add	eax, 2
-	cmp	eax, 16
-	jnz	near .loop
-
-	mov	ax, [esi]		; ax = data[0]
-	imul	ax, [esp + 8 + 16]	; eax = data[0] * dcscalar
-
-%ifdef SATURATE
-	cmp	ax, -2048
-	jl	.set_n2048
-	cmp	ax, 2047
-	jg	.set_2047
-%endif
-	mov	[edi], ax
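Plain MMX has no pminsw, so the "saturates to +2047" block above uses a saturating round-trip: adding 32767-2047 drives anything greater than 2047 into the paddsw ceiling, and subtracting the same constant brings every other value back unchanged. A scalar model, illustrative only:

    #include <stdint.h>

    /* min(v, 2047) for v in [0, 32767], using only saturating add/sub;
       scalar model of the paddsw/psubsw pair above */
    static int16_t sat2047(int16_t v)
    {
        int32_t t = v + (32767 - 2047);
        if (t > 32767)
            t = 32767;          /* paddsw saturates here exactly when v > 2047 */
        return (int16_t)(t - (32767 - 2047));   /* psubsw */
    }

The xmm flavours further down replace this pair with a single pminsw, which is where the quoted 2-cycles-per-loop saving comes from.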
+;===========================================================================
+;
+; void dequant_intra_xmm(int16_t *data,
+;                        const int16_t const *coeff,
+;                        const uint32_t quant,
+;                        const uint32_t dcscalar);
+;
+;===========================================================================
 
-	pop	edi
-	pop	esi
-	ret
+	; this is the same as dequant_intra_mmx, except that we're
+	; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
 
-%ifdef SATURATE
align ALIGN
-.set_n2048
-	mov	word [edi], -2048
-	pop	edi
-	pop	esi
-	ret
-
+cglobal dequant_intra_xmm
+dequant_intra_xmm:
+
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
+	movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
+	mov	eax, -16
+
align ALIGN
-.set_2047
-	mov	word [edi], 2047
-	pop	edi
-	pop	esi
-	ret
-%endif
+.loop
+	movq	mm0, [ecx+8*eax+8*16]		; c  = coeff[i]
+	movq	mm3, [ecx+8*eax+8*16 + 8]	; c' = coeff[i+1]
+	pxor	mm1, mm1
+	pxor	mm4, mm4
+	pcmpgtw	mm1, mm0			; sign(c)
+	pcmpgtw	mm4, mm3			; sign(c')
+	pxor	mm2, mm2
+	pxor	mm5, mm5
+	pcmpeqw	mm2, mm0			; c is zero
+	pcmpeqw	mm5, mm3			; c' is zero
+	pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
+	pandn	mm5, mm6
+	pxor	mm0, mm1			; negate if negative
+	pxor	mm3, mm4			; negate if negative
+	psubw	mm0, mm1
+	psubw	mm3, mm4
+	pmullw	mm0, mm7			; *= 2Q
+	pmullw	mm3, mm7			; *= 2Q
+	paddw	mm0, mm2			; + offset
+	paddw	mm3, mm5			; + offset
+	paddw	mm0, mm1			; negate back
+	paddw	mm3, mm4			; negate back
+
+	; saturates to +2047
+	movq	mm2, [mmx_2047]
+	pminsw	mm0, mm2
+	add	eax, 2
+	pminsw	mm3, mm2
+
+	pxor	mm0, mm1
+	pxor	mm3, mm4
+	movq	[edx + 8*eax + 8*16   - 2*8], mm0
+	movq	[edx + 8*eax + 8*16+8 - 2*8], mm3
+	jnz	near .loop
+
+	; deal with DC
+
+	movd	mm0, [ecx]
+	pmullw	mm0, [esp+16]	; dcscalar
+	movq	mm2, [mmx_32767_minus_2047]
+	paddsw	mm0, mm2
+	psubsw	mm0, mm2
+	movq	mm2, [mmx_32768_minus_2048]
+	psubsw	mm0, mm2
+	paddsw	mm0, mm2
+	movd	eax, mm0
+	mov	[edx], ax
+	ret
 
 ;===========================================================================
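The "deal with DC" block is identical in the mmx and xmm flavours: the DC product is clamped to [-2048, 2047] with two saturating round-trips, one per bound (as it happens, 32767-2047 and 32768-2048 are both 30720, so the two tables hold the same value). A scalar model, illustrative only; the int16_t argument stands for the low word kept by pmullw:

    #include <stdint.h>

    /* clamp(dc, -2048, 2047) via saturating round-trips, as in the
       "deal with DC" blocks */
    static int16_t dc_clamp(int16_t dc)
    {
        int32_t t = dc;
        t += 32767 - 2047;  if (t >  32767) t =  32767;  /* paddsw */
        t -= 32767 - 2047;                 /* psubsw: top bound is now 2047 */
        t -= 32768 - 2048;  if (t < -32768) t = -32768;  /* psubsw */
        t += 32768 - 2048;                 /* paddsw: bottom bound is -2048 */
        return (int16_t)t;
    }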
@@ -813,100 +878,71 @@
 ;                        const uint32_t dcscalar);
 ;
 ;===========================================================================
-
-align 16
+align ALIGN
 cglobal dequant_intra_sse2
-dequant_intra_sse2
-
-	push	esi
-	push	edi
-
-	mov	edi, [esp + 8 + 4]	; data
-	mov	esi, [esp + 8 + 8]	; coeff
-	mov	eax, [esp + 8 + 12]	; quant
-
-	movq	mm6, [mmx_add + eax*8 - 8]
-	movq	mm7, [mmx_mul + eax*8 - 8]
-	movq2dq	xmm6, mm6
-	movq2dq	xmm7, mm7
-	movlhps	xmm6, xmm6
-	movlhps	xmm7, xmm7
-
-	xor	eax, eax
-
-align 16
-.das2_loop
-	movdqa	xmm0, [esi + eax*8]
-	movdqa	xmm3, [esi + eax*8 + 16]
-	pxor	xmm1, xmm1
-	pxor	xmm4, xmm4
-	pcmpgtw	xmm1, xmm0
-	pcmpgtw	xmm4, xmm3
-	pxor	xmm2, xmm2
-	pxor	xmm5, xmm5
-	pcmpeqw	xmm2, xmm0
-	pcmpeqw	xmm5, xmm3
-	pandn	xmm2, xmm6
-	pandn	xmm5, xmm6
-	pxor	xmm0, xmm1
-	pxor	xmm3, xmm4
-	psubw	xmm0, xmm1
-	psubw	xmm3, xmm4
-	pmullw	xmm0, xmm7
-	pmullw	xmm3, xmm7
-	paddw	xmm0, xmm2
-	paddw	xmm3, xmm5
-	pxor	xmm0, xmm1
-	pxor	xmm3, xmm4
-	psubw	xmm0, xmm1
-	psubw	xmm3, xmm4
-
-%ifdef SATURATE
-	movdqu	xmm2, [sse2_pos_2047]
-	movdqu	xmm4, [sse2_neg_2048]
-	pminsw	xmm0, xmm2
-	pminsw	xmm3, xmm2
-	pmaxsw	xmm0, xmm4
-	pmaxsw	xmm3, xmm4
-%endif
-
-	movdqa	[edi + eax*8], xmm0
-	movdqa	[edi + eax*8 + 16], xmm3
+dequant_intra_sse2:
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm6, [mmx_add + eax * 8 - 8]
+	movq	mm7, [mmx_mul + eax * 8 - 8]
+	movq2dq	xmm6, mm6
+	movq2dq	xmm7, mm7
+	movlhps	xmm6, xmm6
+	movlhps	xmm7, xmm7
+	mov	eax, -16
 
-	add	eax, 4
-	cmp	eax, 16
-	jnz	near .das2_loop
-
-	mov	ax, [esi]		; ax = data[0]
-	imul	ax, [esp + 8 + 16]	; eax = data[0] * dcscalar
-
-%ifdef SATURATE
-	cmp	ax, -2048
-	jl	.das2_set_n2048
-	cmp	ax, 2047
-	jg	.das2_set_2047
-%endif
-	mov	[edi], ax
+align ALIGN
+.loop
+	movdqa	xmm0, [ecx + 8*16 + 8*eax]	; c = coeff[i]
+	movdqa	xmm3, [ecx + 8*16 + 8*eax+ 16]
+	pxor	xmm1, xmm1
+	pxor	xmm4, xmm4
+	pcmpgtw	xmm1, xmm0	; sign(c)
+	pcmpgtw	xmm4, xmm3
+	pxor	xmm2, xmm2
+	pxor	xmm5, xmm5
+	pcmpeqw	xmm2, xmm0	; c is zero
+	pcmpeqw	xmm5, xmm3
+	pandn	xmm2, xmm6	; offset = isZero ? 0 : quant_add
+	pandn	xmm5, xmm6
+	pxor	xmm0, xmm1	; negate if negative
+	pxor	xmm3, xmm4
+	psubw	xmm0, xmm1
+	psubw	xmm3, xmm4
+	pmullw	xmm0, xmm7	; *= 2Q
+	pmullw	xmm3, xmm7
+	paddw	xmm0, xmm2	; + offset
+	paddw	xmm3, xmm5
+	paddw	xmm0, xmm1	; negate back
+	paddw	xmm3, xmm4
+
+	; saturates to +2047
+	movdqa	xmm2, [sse2_2047]
+	pminsw	xmm0, xmm2
+	add	eax, 4
+	pminsw	xmm3, xmm2
+
+	pxor	xmm0, xmm1
+	pxor	xmm3, xmm4
+	movdqa	[edx + 8*16 - 8*4 + 8*eax], xmm0
+	movdqa	[edx + 8*16 - 8*4 + 8*eax + 16], xmm3
+	jnz	near .loop
+
+	; deal with DC
+	movd	mm0, [ecx]
+	pmullw	mm0, [esp+16]	; dcscalar
+	movq	mm2, [mmx_32767_minus_2047]
+	paddsw	mm0, mm2
+	psubsw	mm0, mm2
+	movq	mm2, [mmx_32768_minus_2048]
+	psubsw	mm0, mm2
+	paddsw	mm0, mm2
+	movd	eax, mm0
+	mov	[edx], ax
 
-	pop	edi
-	pop	esi
-	ret
+	ret
 
-%ifdef SATURATE
-align 16
-.das2_set_n2048
-	mov	word [edi], -2048
-	pop	edi
-	pop	esi
-	ret
-
-align 16
-.das2_set_2047
-	mov	word [edi], 2047
-	pop	edi
-	pop	esi
-	ret
-%endif
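A small point on the SSE2 prologue just added: the per-quant constants still live in 64-bit tables, so they are moved into the low half of an XMM register with movq2dq and mirrored into the high half with movlhps. An intrinsics model of that broadcast, illustrative only (punpcklqdq has the same effect as movlhps here, just in the integer domain; __m64 is the MMX type on 32-bit x86):

    #include <emmintrin.h>      /* SSE2 intrinsics */

    /* broadcast one 64-bit table entry into both halves of an XMM
       register, as movq2dq + movlhps do above */
    static __m128i broadcast_qword(__m64 q)
    {
        __m128i x = _mm_movpi64_epi64(q);   /* movq2dq xmm, mm */
        return _mm_unpacklo_epi64(x, x);    /* duplicate low qword */
    }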
@@ -920,71 +956,116 @@
 
 align ALIGN
 cglobal dequant_inter_mmx
-dequant_inter_mmx
-
-	push	esi
-	push	edi
+dequant_inter_mmx:
 
-	mov	edi, [esp + 8 + 4]	; data
-	mov	esi, [esp + 8 + 8]	; coeff
-	mov	eax, [esp + 8 + 12]	; quant
-	movq	mm6, [mmx_add + eax * 8 - 8]
-	movq	mm7, [mmx_mul + eax * 8 - 8]
-
-	xor	eax, eax
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
+	movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
+	mov	eax, -16
 
 align ALIGN
 .loop
-	movq	mm0, [esi + 8*eax]	; mm0 = [coeff]
-	movq	mm3, [esi + 8*eax + 8]	;
-	pxor	mm1, mm1		; mm1 = 0
-	pxor	mm4, mm4		;
-	pcmpgtw	mm1, mm0		; mm1 = (0 > mm0)
-	pcmpgtw	mm4, mm3		;
-	pxor	mm2, mm2		; mm2 = 0
-	pxor	mm5, mm5		;
-	pcmpeqw	mm2, mm0		; mm2 = (0 == mm0)
-	pcmpeqw	mm5, mm3		;
-	pandn	mm2, mm6		; mm2 = (iszero ? 0 : add)
-	pandn	mm5, mm6		;
-	pxor	mm0, mm1		; mm0 = |mm0|
-	pxor	mm3, mm4		;
-	psubw	mm0, mm1		; displace
-	psubw	mm3, mm4		;
-	pmullw	mm0, mm7		; mm0 *= 2Q
-	pmullw	mm3, mm7		;
-	paddw	mm0, mm2		; mm0 += mm2 (add)
-	paddw	mm3, mm5		;
-	pxor	mm0, mm1		; mm0 *= sign(mm0)
-	pxor	mm3, mm4		;
-	psubw	mm0, mm1		; undisplace
-	psubw	mm3, mm4
+	movq	mm0, [ecx+8*eax+8*16]		; c  = coeff[i]
+	movq	mm3, [ecx+8*eax+8*16 + 8]	; c' = coeff[i+1]
+	pxor	mm1, mm1
+	pxor	mm4, mm4
+	pcmpgtw	mm1, mm0			; sign(c)
+	pcmpgtw	mm4, mm3			; sign(c')
+	pxor	mm2, mm2
+	pxor	mm5, mm5
+	pcmpeqw	mm2, mm0			; c is zero
+	pcmpeqw	mm5, mm3			; c' is zero
+	pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
+	pandn	mm5, mm6
+	pxor	mm0, mm1			; negate if negative
+	pxor	mm3, mm4			; negate if negative
+	psubw	mm0, mm1
+	psubw	mm3, mm4
+	pmullw	mm0, mm7			; *= 2Q
+	pmullw	mm3, mm7			; *= 2Q
+	paddw	mm0, mm2			; + offset
+	paddw	mm3, mm5			; + offset
+	paddw	mm0, mm1			; negate back
+	paddw	mm3, mm4			; negate back
+
+	; saturates to +2047
+	movq	mm2, [mmx_32767_minus_2047]
+	add	eax, 2
+	paddsw	mm0, mm2
+	paddsw	mm3, mm2
+	psubsw	mm0, mm2
+	psubsw	mm3, mm2
+
+	pxor	mm0, mm1
+	pxor	mm3, mm4
+	movq	[edx + 8*eax + 8*16   - 2*8], mm0
+	movq	[edx + 8*eax + 8*16+8 - 2*8], mm3
+	jnz	near .loop
 
-%ifdef SATURATE
-	movq	mm2, [mmx_32767_minus_2047]
-	movq	mm4, [mmx_32768_minus_2048]
-	paddsw	mm0, mm2
-	paddsw	mm3, mm2
-	psubsw	mm0, mm2
-	psubsw	mm3, mm2
-	psubsw	mm0, mm4
-	psubsw	mm3, mm4
-	paddsw	mm0, mm4
-	paddsw	mm3, mm4
-%endif
+	ret
 
-	movq	[edi + 8*eax], mm0
-	movq	[edi + 8*eax + 8], mm3
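For contrast with the intra path, here is a scalar sketch of what the inter routines compute (editorial, not part of the patch; same `quant_add` assumption as before). The per-coefficient formula is identical, but all 64 coefficients are treated alike: there is no DC/dcscalar special case:

    #include <stdint.h>

    static void dequant_inter_ref(int16_t *data, const int16_t *coeff,
                                  uint32_t quant)
    {
        const int quant_add = (quant & 1) ? (int)quant : (int)quant - 1;
        int i;

        for (i = 0; i < 64; i++) {
            int level = coeff[i];
            if (level == 0) {
                data[i] = 0;
            } else if (level < 0) {
                level = -level * 2 * (int)quant + quant_add;
                data[i] = (level >= 2048) ? -2048 : (int16_t)-level;
            } else {
                level = level * 2 * (int)quant + quant_add;
                data[i] = (level > 2047) ? 2047 : (int16_t)level;
            }
        }
    }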
+;===========================================================================
+;
+; void dequant_inter_xmm(int16_t * data,
+;                        const int16_t * const coeff,
+;                        const uint32_t quant);
+;
+;===========================================================================
 
-	add	eax, 2
-	cmp	eax, 16
-	jnz	near .loop
+	; this is the same as dequant_inter_mmx,
+	; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
 
-	pop	edi
-	pop	esi
+align ALIGN
+cglobal dequant_inter_xmm
+dequant_inter_xmm:
 
-	ret
+	mov	edx, [esp+ 4]	; data
+	mov	ecx, [esp+ 8]	; coeff
+	mov	eax, [esp+12]	; quant
+	movq	mm6, [mmx_add + eax*8 - 8]	; quant or quant-1
+	movq	mm7, [mmx_mul + eax*8 - 8]	; 2*quant
+	mov	eax, -16
+align ALIGN
+.loop
+	movq	mm0, [ecx+8*eax+8*16]		; c  = coeff[i]
+	movq	mm3, [ecx+8*eax+8*16 + 8]	; c' = coeff[i+1]
+	pxor	mm1, mm1
+	pxor	mm4, mm4
+	pcmpgtw	mm1, mm0			; sign(c)
+	pcmpgtw	mm4, mm3			; sign(c')
+	pxor	mm2, mm2
+	pxor	mm5, mm5
+	pcmpeqw	mm2, mm0			; c is zero
+	pcmpeqw	mm5, mm3			; c' is zero
+	pandn	mm2, mm6			; offset = isZero ? 0 : quant_add
+	pandn	mm5, mm6
+	pxor	mm0, mm1			; negate if negative
+	pxor	mm3, mm4			; negate if negative
+	psubw	mm0, mm1
+	psubw	mm3, mm4
+	pmullw	mm0, mm7			; *= 2Q
+	pmullw	mm3, mm7			; *= 2Q
+	paddw	mm0, mm2			; + offset
+	paddw	mm3, mm5			; + offset
+	paddw	mm0, mm1			; start restoring sign
+	paddw	mm3, mm4			; start restoring sign
+
+	; saturates to +2047
+	movq	mm2, [mmx_2047]
+	pminsw	mm0, mm2
+	add	eax, 2
+	pminsw	mm3, mm2
+
+	pxor	mm0, mm1			; finish restoring sign
+	pxor	mm3, mm4			; finish restoring sign
+	movq	[edx + 8*eax + 8*16   - 2*8], mm0
+	movq	[edx + 8*eax + 8*16+8 - 2*8], mm3
+	jnz	near .loop
+
+	ret
 
 ;===========================================================================
 ;
@@ -993,71 +1074,57 @@
 ;                        const uint32_t quant);
 ;
 ;===========================================================================
-
-align 16
+align ALIGN
 cglobal dequant_inter_sse2
 dequant_inter_sse2
+	mov	edx, [esp + 4]	; data
+	mov	ecx, [esp + 8]	; coeff
+	mov	eax, [esp + 12]	; quant
+	movq	mm6, [mmx_add + eax * 8 - 8]
+	movq	mm7, [mmx_mul + eax * 8 - 8]
+	movq2dq	xmm6, mm6
+	movq2dq	xmm7, mm7
+	movlhps	xmm6, xmm6
+	movlhps	xmm7, xmm7
+	mov	eax, -16
 
-	push	esi
-	push	edi
-
-	mov	edi, [esp + 8 + 4]	; data
-	mov	esi, [esp + 8 + 8]	; coeff
-	mov	eax, [esp + 8 + 12]	; quant
-	movq	mm6, [mmx_add + eax * 8 - 8]
-	movq	mm7, [mmx_mul + eax * 8 - 8]
-
-	movq2dq	xmm6, mm6
-	movq2dq	xmm7, mm7
-	movlhps	xmm6, xmm6
-	movlhps	xmm7, xmm7
-
-	xor	eax, eax
-
-align 16
-.des2_loop
-	movdqa	xmm0, [esi + eax*8]	; xmm0 = [coeff]
-	movdqa	xmm3, [esi + eax*8 + 16]
-	pxor	xmm1, xmm1
-	pxor	xmm4, xmm4
-	pcmpgtw	xmm1, xmm0
-	pcmpgtw	xmm4, xmm3
-	pxor	xmm2, xmm2
-	pxor	xmm5, xmm5
-	pcmpeqw	xmm2, xmm0
-	pcmpeqw	xmm5, xmm3
-	pandn	xmm2, xmm6
-	pandn	xmm5, xmm6
-	pxor	xmm0, xmm1
-	pxor	xmm3, xmm4
-	psubw	xmm0, xmm1
-	psubw	xmm3, xmm4
-	pmullw	xmm0, xmm7
-	pmullw	xmm3, xmm7
-	paddw	xmm0, xmm2
-	paddw	xmm3, xmm5
-	pxor	xmm0, xmm1
-	pxor	xmm3, xmm4
-	psubw	xmm0, xmm1
-	psubw	xmm3, xmm4
-
-%ifdef SATURATE
-	movdqu	xmm2, [sse2_pos_2047]
-	movdqu	xmm4, [sse2_neg_2048]
-	pminsw	xmm0, xmm2
-	pminsw	xmm3, xmm2
-	pmaxsw	xmm0, xmm4
-	pmaxsw	xmm3, xmm4
-%endif
-
-	movdqa	[edi + eax*8], xmm0
-	movdqa	[edi + eax*8 + 16], xmm3
-
-	add	eax, 4
-	cmp	eax, 16
-	jnz	near .des2_loop
+align ALIGN
+.loop
+	movdqa	xmm0, [ecx + 8*16 + 8*eax]	; c = coeff[i]
+	movdqa	xmm3, [ecx + 8*16 + 8*eax + 16]
 
-	pop	edi
-	pop	esi
+	pxor	xmm1, xmm1
+	pxor	xmm4, xmm4
+	pcmpgtw	xmm1, xmm0	; sign(c)
+	pcmpgtw	xmm4, xmm3
+	pxor	xmm2, xmm2
+	pxor	xmm5, xmm5
+	pcmpeqw	xmm2, xmm0	; c is zero
+	pcmpeqw	xmm5, xmm3
+	pandn	xmm2, xmm6
+	pandn	xmm5, xmm6
+	pxor	xmm0, xmm1	; negate if negative
+	pxor	xmm3, xmm4
+	psubw	xmm0, xmm1
+	psubw	xmm3, xmm4
+	pmullw	xmm0, xmm7	; *= 2Q
+	pmullw	xmm3, xmm7
+	paddw	xmm0, xmm2	; + offset
+	paddw	xmm3, xmm5
+
+	paddw	xmm0, xmm1	; start restoring sign
+	paddw	xmm3, xmm4
+
+	; saturates to +2047
+	movdqa	xmm2, [sse2_2047]
+	pminsw	xmm0, xmm2
+	add	eax, 4
+	pminsw	xmm3, xmm2
+
+	pxor	xmm0, xmm1	; finish restoring sign
+	pxor	xmm3, xmm4
+	movdqa	[edx + 8*16 - 8*4 + 8*eax], xmm0
+	movdqa	[edx + 8*16 - 8*4 + 8*eax + 16], xmm3
+	jnz	near .loop
 
-	ret
+	ret
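One idiom recurs in every rewritten routine: the index register counts from -16 up to 0, so the add that advances it also sets the zero flag consumed by jnz, with no separate cmp, and the stores carry a - 2*8 (or - 8*4) displacement because the index has already been advanced by the time they run. A scalar equivalent of the traversal, for illustration:

    #include <stdint.h>

    /* models 'mov eax, -16' / 'add eax, 2' / 'jnz .loop' and the
       [ecx + 8*16 + 8*eax] addressing (8 bytes = 4 words per index step) */
    static int32_t walk_block(const int16_t *coeff)
    {
        int32_t sum = 0;
        int i, j;
        for (i = -16; i != 0; i += 2) {            /* two quadwords per pass */
            const int16_t *p = coeff + 4 * (16 + i);
            for (j = 0; j < 8; j++)
                sum += p[j];                       /* stand-in for the real work */
        }
        return sum;
    }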