--- branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm	2003/10/27 01:03:43	1191
+++ branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm	2003/10/28 22:23:03	1192
@@ -19,96 +19,85 @@
 ; * along with this program ; if not, write to the Free Software
 ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; * $Id: quantize_h263_3dne.asm,v 1.1.2.2 2003-10-09 18:50:22 edgomez Exp $
+; * $Id: quantize_h263_3dne.asm,v 1.1.2.3 2003-10-28 22:23:03 edgomez Exp $
 ; *
 ; *************************************************************************/
 ;
-; these 3dne functions are compatible with iSSE, but are optimized specifically for 
+; these 3dne functions are compatible with iSSE, but are optimized specifically for
 ; K7 pipelines
 ;
 ; enable dequant saturate [-2048,2047], test purposes only.
 %define SATURATE
 
-; data/text alignment
-%define ALIGN 16
+BITS 32
 
-bits 32
-
-%macro cglobal 1 
+%macro cglobal 1
 	%ifdef PREFIX
-		global _%1 
+		global _%1
 		%define %1 _%1
 	%else
 		global %1
 	%endif
 %endmacro
 
-;***************************************************************************
+;=============================================================================
 ; Local data
-;***************************************************************************
+;=============================================================================
 
-%ifdef FORMAT_COFF
-section .data data
-%else
-section .data data align=16
-%endif
+SECTION .rodata align=4
 
 int_div:
-dd 0
+	dd 0
 %assign i 1
-%rep 255 
+%rep 255
 	dd (1 << 16) / (i) + 1
 	%assign i i+1
 %endrep
 
-align 16
+ALIGN 16
 plus_one:
 	times 8 dw 1
 
-;===========================================================================
-;
+;-----------------------------------------------------------------------------
 ; subtract by Q/2 table
-;
-;===========================================================================
+;-----------------------------------------------------------------------------
 
-align 16
+ALIGN 16
 mmx_sub:
 %assign i 1
-%rep 31 
+%rep 31
 	times 4 dw i / 2
 	%assign i i+1
 %endrep
 
-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
-; divide by 2Q table 
+; divide by 2Q table
 ;
 ; use a shift of 16 to take full advantage of _pmulhw_
 ; for q=1, _pmulhw_ will overflow so it is treated separately
 ; (3dnow2 provides _pmulhuw_ which won't cause overflow)
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------
 
-align 16
+ALIGN 16
 mmx_div:
 %assign i 1
-%rep 31 
+%rep 31
 	times 4 dw (1 << 16) / (i * 2) + 1
 	%assign i i+1
 %endrep
 
-;===========================================================================
-;
+;-----------------------------------------------------------------------------
 ; add by (odd(Q) ? Q : Q - 1) table
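A note on the table above: mmx_div holds fixed-point reciprocals so that pmulhw, which keeps the high 16 bits of a signed product, can stand in for a division by 2Q. A small C sketch of the idea — illustrative only, the name div_2q_approx is invented here, and the off-by-one tolerance reflects the +1 rounding in the table entries:

    #include <assert.h>
    #include <stdint.h>

    /* One mmx_div entry is (1 << 16) / (2q) + 1; pmulhw keeps the high
     * 16 bits of the product, so multiply-plus-shift replaces a divide. */
    static unsigned div_2q_approx(unsigned x, unsigned q)
    {
        uint32_t entry = (1UL << 16) / (2 * q) + 1;   /* mmx_div[q-1] */
        return (x * entry) >> 16;                     /* what pmulhw yields */
    }

    int main(void)
    {
        /* q = 1 is excluded: its entry (32769) does not fit pmulhw's signed
         * 16-bit operand, which is why the code keeps a psrlw-based q=1 path. */
        for (unsigned q = 2; q <= 31; q++)
            for (unsigned x = 0; x <= 2047; x++) {
                unsigned exact  = x / (2 * q);
                unsigned approx = div_2q_approx(x, q);
                /* the +1 bias makes the result exact or one too large at a few
                 * boundary points; for quantizer levels that is acceptable */
                assert(approx == exact || approx == exact + 1);
            }
        return 0;
    }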
-;
-;===========================================================================
+;-----------------------------------------------------------------------------
 
-align 16
+ALIGN 16
 mmx_add:
 %assign i 1
-%rep 31 
+%rep 31
 	%if i % 2 != 0
 	times 4 dw i
 	%else
@@ -117,37 +106,33 @@
 	%assign i i+1
 %endrep
 
-;===========================================================================
-;
+;-----------------------------------------------------------------------------
 ; multiply by 2Q table
-;
-;===========================================================================
+;-----------------------------------------------------------------------------
 
-align 16
-mmx_mul: 
+ALIGN 16
+mmx_mul:
 %assign i 1
-%rep 31 
+%rep 31
 	times 4 dw i * 2
 	%assign i i+1
 %endrep
 
-;===========================================================================
-;
-; saturation limits
-;
-;===========================================================================
+;-----------------------------------------------------------------------------
+; saturation limits
+;-----------------------------------------------------------------------------
 
-align 8
+ALIGN 8
 mmx_32768_minus_2048:
 	times 4 dw (32768-2048)
 mmx_32767_minus_2047:
 	times 4 dw (32767-2047)
 
-align 16
+ALIGN 16
 mmx_2047:
 	times 4 dw 2047
 
-align 8
+ALIGN 8
 mmzero:
 	dd 0, 0
 int2047:
@@ -155,427 +140,426 @@
 int_2048:
 	dd -2048
 
-;***************************************************************************
+;=============================================================================
 ; Code
-;***************************************************************************
+;=============================================================================
 
-section .text
+SECTION .text
 
-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
-; uint32_t quant_h263_intra_3dne(int16_t * coeff, 
+; uint32_t quant_h263_intra_3dne(int16_t * coeff,
 ;                                const int16_t * const data,
 ;                                const uint32_t quant,
 ;                                const uint32_t dcscalar);
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------
 
 ;This is Athlon-optimized code (ca 70 clk per call)
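Before the hand-scheduled macros, it helps to keep the scalar semantics in view. The reference below is my paraphrase of the H.263 intra rule these tables encode — quant_intra_ref is an invented name, not code from this file, and an arithmetic right shift is assumed as in the sar instruction it mirrors:

    #include <stdint.h>
    #include <stdlib.h>

    /* Invented reference: AC level = |c| / (2*quant), sign restored.
     * DC: divide by dcscalar, rounding half away from zero. */
    static void quant_intra_ref(int16_t *coeff, const int16_t *data,
                                uint32_t quant, uint32_t dcscalar)
    {
        int32_t dc   = data[0];
        int32_t sign = dc >> 31;            /* 0 or -1, as sar edi, 31 */
        int32_t bias = (int32_t)(dcscalar / 2);
        /* (bias ^ sign) - sign is +bias or -bias; the C division then
         * truncates, matching the imul/sar/cmovs sequence further down */
        coeff[0] = (int16_t)((dc + ((bias ^ sign) - sign)) / (int32_t)dcscalar);

        for (int i = 1; i < 64; i++) {
            int level = abs(data[i]) / (2 * (int)quant);  /* no dead zone */
            coeff[i]  = (int16_t)(data[i] < 0 ? -level : level);
        }
    }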
 %macro quant_intra1 1
+	psubw mm1, mm0 ;A3
+	psubw mm3, mm2 ;B3
+%if (%1)
+	psubw mm5, mm4 ;C8
+	psubw mm7, mm6 ;D8
+%endif
+
+ALIGN 8
+	db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1
+	pmaxsw mm1, mm0 ;A4
+	db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1
+	pmaxsw mm3, mm2 ;B4
+
-	psubw mm1, mm0 ;A3
-	psubw mm3, mm2 ;B3
-%if (%1)
-	psubw mm5, mm4 ;C8
-	psubw mm7, mm6 ;D8
-%endif
-
-align 8
-	db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1
-	pmaxsw mm1, mm0 ;A4
-	db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1
-	pmaxsw mm3, mm2 ;B4
-
-
-	psraw mm0, 15 ;A5
-	psraw mm2, 15 ;B5
-%if (%1)
-	movq [edx + %1 * 32 + 16-32], mm5 ;C9
-	movq [edx + %1 * 32 + 24-32], mm7 ;D9
-%endif
-
-	psrlw mm1, 1 ;A6
-	psrlw mm3, 1 ;B6
-	movq mm5, [ebx] ;C2
-	movq mm7, [ebx] ;D2
-
-	pxor mm1, mm0 ;A7
-	pxor mm3, mm2 ;B7
-
-	psubw mm5, mm4 ;C3
-	psubw mm7, mm6 ;D3
-	psubw mm1, mm0 ;A8
-	psubw mm3, mm2 ;B8
+	psraw mm0, 15 ;A5
+	psraw mm2, 15 ;B5
+%if (%1)
+	movq [edx + %1 * 32 + 16-32], mm5 ;C9
+	movq [edx + %1 * 32 + 24-32], mm7 ;D9
+%endif
+
+	psrlw mm1, 1 ;A6
+	psrlw mm3, 1 ;B6
+	movq mm5, [ebx] ;C2
+	movq mm7, [ebx] ;D2
+
+	pxor mm1, mm0 ;A7
+	pxor mm3, mm2 ;B7
+
+	psubw mm5, mm4 ;C3
+	psubw mm7, mm6 ;D3
+	psubw mm1, mm0 ;A8
+	psubw mm3, mm2 ;B8
 %if (%1 == 0)
-	push ebp
-	movq mm0, [ecx + %1 * 32 +32]
+	push ebp
+	movq mm0, [ecx + %1 * 32 +32]
 %elif (%1 < 3)
-	db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1
-%endif
-	pmaxsw mm5, mm4 ;C4
+	db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1
+%endif
+	pmaxsw mm5, mm4 ;C4
 %if (%1 < 3)
-	db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
-%else
-	cmp esp, esp
+	db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
+%else
+	cmp esp, esp
 %endif
-	pmaxsw mm7, mm6 ;D4
+	pmaxsw mm7, mm6 ;D4
 
-	psraw mm4, 15 ;C5
-	psraw mm6, 15 ;D5
-	movq [byte edx + %1 * 32], mm1 ;A9
-	movq [edx + %1 * 32+8], mm3 ;B9
+	psraw mm4, 15 ;C5
+	psraw mm6, 15 ;D5
+	movq [byte edx + %1 * 32], mm1 ;A9
+	movq [edx + %1 * 32+8], mm3 ;B9
 
-	psrlw mm5, 1 ;C6
-	psrlw mm7, 1 ;D6
+	psrlw mm5, 1 ;C6
+	psrlw mm7, 1 ;D6
 %if (%1 < 3)
-	movq mm1, [ebx] ;A2
-	movq mm3, [ebx] ;B2
+	movq mm1, [ebx] ;A2
+	movq mm3, [ebx] ;B2
 %endif
 %if (%1 == 3)
-	imul eax, [int_div+4*edi]
+	imul eax, [int_div+4*edi]
 %endif
-	pxor mm5, mm4 ;C7
-	pxor mm7, mm6 ;D7
+	pxor mm5, mm4 ;C7
+	pxor mm7, mm6 ;D7
 %endm
 
 %macro quant_intra 1
-	; Rules for Athlon:
-	; 1) schedule latencies
-	; 2) add/mul and load/store in 2:1 proportion
-	; 3) avoid splitting >3-byte instructions over 8-byte boundaries
-
-	psubw mm1, mm0 ;A3
-	psubw mm3, mm2 ;B3
-%if (%1)
-	psubw mm5, mm4 ;C8
-	psubw mm7, mm6 ;D8
-%endif
-
-align 8
-	db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1
-	pmaxsw mm1, mm0 ;A4
-	db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1
-	pmaxsw mm3, mm2 ;B4
-
-
-	psraw mm0, 15 ;A5
-	psraw mm2, 15 ;B5
-%if (%1)
-	movq [edx + %1 * 32 + 16-32], mm5 ;C9
-	movq [edx + %1 * 32 + 24-32], mm7 ;D9
-%endif
-
-	pmulhw mm1, [esi] ;A6
-	pmulhw mm3, [esi] ;B6
-	movq mm5, [ebx] ;C2
-	movq mm7, [ebx] ;D2
-
-	nop
-	nop
-	pxor mm1, mm0 ;A7
-	pxor mm3, mm2 ;B7
-
-	psubw mm5, mm4 ;C3
-	psubw mm7, mm6 ;D3
-	psubw mm1, mm0 ;A8
-	psubw mm3, mm2 ;B8
+	; Rules for Athlon:
+	; 1) schedule latencies
+	; 2) add/mul and load/store in 2:1 proportion
+	; 3) avoid splitting >3-byte instructions over 8-byte boundaries
+
+	psubw mm1, mm0 ;A3
+	psubw mm3, mm2 ;B3
+%if (%1)
+	psubw mm5, mm4 ;C8
+	psubw mm7, mm6 ;D8
+%endif
+
+ALIGN 8
+	db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1
+	pmaxsw mm1, mm0 ;A4
+	db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1
+	pmaxsw mm3, mm2 ;B4
+
+
+	psraw mm0, 15 ;A5
+	psraw mm2, 15 ;B5
+%if (%1)
+	movq [edx + %1 * 32 + 16-32], mm5 ;C9
+	movq [edx + %1 * 32 + 24-32], mm7 ;D9
+%endif
+
+	pmulhw mm1, [esi] ;A6
+	pmulhw mm3, [esi] ;B6
+	movq mm5, [ebx] ;C2
+	movq mm7, [ebx] ;D2
+
+	nop
+	nop
+	pxor mm1, mm0 ;A7
+	pxor mm3, mm2 ;B7
+
+	psubw mm5, mm4 ;C3
+	psubw mm7, mm6 ;D3
+	psubw mm1, mm0 ;A8
+	psubw mm3, mm2 ;B8
 %if (%1 < 3)
-	db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1
-%endif
-	pmaxsw mm5, mm4 ;C4
+	db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1
+%endif
+	pmaxsw mm5, mm4 ;C4
 %if (%1 < 3)
-	db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
-%else
-	cmp esp, esp
+	db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
+%else
+	cmp esp, esp
 %endif
-	pmaxsw mm7,mm6 ;D4
+	pmaxsw mm7,mm6 ;D4
 
-	psraw mm4, 15 ;C5
-	psraw mm6, 15 ;D5
-	movq [byte edx + %1 * 32], mm1 ;A9
-	movq [edx + %1 * 32+8], mm3 ;B9
+	psraw mm4, 15 ;C5
+	psraw mm6, 15 ;D5
+	movq [byte edx + %1 * 32], mm1 ;A9
+	movq [edx + %1 * 32+8], mm3 ;B9
 
-	pmulhw mm5, [esi] ;C6
-	pmulhw mm7, [esi] ;D6
+	pmulhw mm5, [esi] ;C6
+	pmulhw mm7, [esi] ;D6
 %if (%1 < 3)
-	movq mm1, [ebx] ;A2
-	movq mm3, [ebx] ;B2
+	movq mm1, [ebx] ;A2
+	movq mm3, [ebx] ;B2
 %endif
 %if (%1 == 0)
-	push ebp
+	push ebp
 %elif (%1 < 3)
-	nop
+	nop
 %endif
-	nop
+	nop
 %if (%1 == 3)
-	imul eax, [int_div+4*edi]
+	imul eax, [int_div+4*edi]
 %endif
-	pxor mm5, mm4 ;C7
-	pxor mm7, mm6 ;D7
+	pxor mm5, mm4 ;C7
+	pxor mm7, mm6 ;D7
 %endmacro
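Both macros lean on the same branchless sign-magnitude idiom: pmaxsw(x, 0-x) gives |x|, psraw by 15 gives the sign mask, and pxor/psubw re-apply the sign after the magnitude has been scaled. One lane of that idiom in C (illustration only; quant_lane is an invented name):

    #include <stdint.h>

    static int16_t quant_lane(int16_t x, unsigned q)
    {
        int16_t neg  = (int16_t)-x;
        int16_t mag  = x > neg ? x : neg;   /* pmaxsw: |x| (DCT range, no -32768) */
        int16_t sign = (int16_t)(x >> 15);  /* psraw 15: 0 for +, -1 for - */
        int16_t lvl  = (int16_t)(mag / (2 * (int)q)); /* the pmulhw/psrlw step */
        return (int16_t)((lvl ^ sign) - sign);        /* pxor + psubw: re-sign */
    }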
-align ALIGN
+ALIGN 16
 cglobal quant_h263_intra_3dne
 quant_h263_intra_3dne:
-	mov eax, [esp + 12] ; quant
-	mov ecx, [esp + 8] ; data
-	mov edx, [esp + 4] ; coeff
-	cmp al, 1
-	pxor mm1, mm1
-	pxor mm3, mm3
-	movq mm0, [ecx] ; mm0 = [1st]
-	movq mm2, [ecx + 8]
-	push esi
-	lea esi, [mmx_div + eax*8 - 8]
-
-	push ebx
-	mov ebx, mmzero
-	push edi
-	jz near .q1loop
-
-quant_intra 0
-	mov ebp, [esp + 16 + 16] ; dcscalar
-	; NB -- there are 3 pushes in the function preamble and one more
-	; in "quant_intra 0", thus an added offset of 16 bytes
-	movsx eax, word [byte ecx] ; DC
-
-quant_intra 1
-	mov edi, eax
-	sar edi, 31 ; sign(DC)
-	shr ebp, byte 1 ; ebp = dcscalar/2
-
-quant_intra 2
-	sub eax, edi ; DC (+1)
-	xor ebp, edi ; sign(DC) dcscalar /2 (-1)
-	mov edi, [esp + 16 + 16] ; dcscalar
-	lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2
-	mov ebp, [byte esp]
-
-quant_intra 3
-	psubw mm5, mm4 ;C8
-	mov esi, [esp + 12] ; pop back the register value
-	mov edi, [esp + 4] ; pop back the register value
-	sar eax, 16
-	lea ebx, [byte eax + 1] ; workaround for eax < 0
-	cmovs eax, ebx ; conditionally move the corrected value
-	mov [edx], ax ; coeff[0] = ax
-	mov ebx, [esp + 8] ; pop back the register value
-	add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
-	psubw mm7, mm6 ;D8
-	movq [edx + 3 * 32 + 16], mm5 ;C9
-	movq [edx + 3 * 32 + 24], mm7 ;D9
+	mov eax, [esp + 12] ; quant
+	mov ecx, [esp + 8] ; data
+	mov edx, [esp + 4] ; coeff
+	cmp al, 1
+	pxor mm1, mm1
+	pxor mm3, mm3
+	movq mm0, [ecx] ; mm0 = [1st]
+	movq mm2, [ecx + 8]
+	push esi
+	lea esi, [mmx_div + eax*8 - 8]
+
+	push ebx
+	mov ebx, mmzero
+	push edi
+	jz near .q1loop
+
+	quant_intra 0
+	mov ebp, [esp + 16 + 16] ; dcscalar
+	; NB -- there are 3 pushes in the function preamble and one more
+	; in "quant_intra 0", thus an added offset of 16 bytes
+	movsx eax, word [byte ecx] ; DC
+
+	quant_intra 1
+	mov edi, eax
+	sar edi, 31 ; sign(DC)
+	shr ebp, byte 1 ; ebp = dcscalar/2
+
+	quant_intra 2
+	sub eax, edi ; DC (+1)
+	xor ebp, edi ; sign(DC) dcscalar /2 (-1)
+	mov edi, [esp + 16 + 16] ; dcscalar
+	lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2
+	mov ebp, [byte esp]
+
+	quant_intra 3
+	psubw mm5, mm4 ;C8
+	mov esi, [esp + 12] ; pop back the register value
+	mov edi, [esp + 4] ; pop back the register value
+	sar eax, 16
+	lea ebx, [byte eax + 1] ; workaround for eax < 0
+	cmovs eax, ebx ; conditionally move the corrected value
+	mov [edx], ax ; coeff[0] = ax
+	mov ebx, [esp + 8] ; pop back the register value
+	add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
+	psubw mm7, mm6 ;D8
+	movq [edx + 3 * 32 + 16], mm5 ;C9
+	movq [edx + 3 * 32 + 24], mm7 ;D9
 
-	xor eax, eax
-	ret
+	xor eax, eax
+	ret
 
-	align 16
+ALIGN 16
 .q1loop
-quant_intra1 0
-	mov ebp, [esp + 16 + 16] ; dcscalar
-	movsx eax, word [byte ecx] ; DC
-
-quant_intra1 1
-	mov edi, eax
-	sar edi, 31 ; sign(DC)
-	shr ebp, byte 1 ; ebp = dcscalar /2
-
-quant_intra1 2
-	sub eax, edi ; DC (+1)
-	xor ebp, edi ; sign(DC) dcscalar /2 (-1)
-	mov edi, [esp + 16 + 16] ; dcscalar
-	lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2
-	mov ebp, [byte esp]
-
-quant_intra1 3
-	psubw mm5, mm4 ;C8
-	mov esi, [dword esp + 12] ; pop back the register value
-	mov edi, [esp + 4] ; pop back the register value
-	sar eax, 16
-	lea ebx, [byte eax + 1] ; workaround for eax < 0
-	cmovs eax, ebx ; conditionally move the corrected value
-	mov [edx], ax ; coeff[0] = ax
-	mov ebx, [esp + 8] ; pop back the register value
-	add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
-	psubw mm7, mm6 ;D8
-	movq [edx + 3 * 32 + 16], mm5 ;C9
-	movq [edx + 3 * 32 + 24], mm7 ;D9
+	quant_intra1 0
+	mov ebp, [esp + 16 + 16] ; dcscalar
+	movsx eax, word [byte ecx] ; DC
+
+	quant_intra1 1
+	mov edi, eax
+	sar edi, 31 ; sign(DC)
+	shr ebp, byte 1 ; ebp = dcscalar /2
+
+	quant_intra1 2
+	sub eax, edi ; DC (+1)
+	xor ebp, edi ; sign(DC) dcscalar /2 (-1)
+	mov edi, [esp + 16 + 16] ; dcscalar
+	lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2
+	mov ebp, [byte esp]
+
+	quant_intra1 3
+	psubw mm5, mm4 ;C8
+	mov esi, [dword esp + 12] ; pop back the register value
+	mov edi, [esp + 4] ; pop back the register value
+	sar eax, 16
+	lea ebx, [byte eax + 1] ; workaround for eax < 0
+	cmovs eax, ebx ; conditionally move the corrected value
+	mov [edx], ax ; coeff[0] = ax
+	mov ebx, [esp + 8] ; pop back the register value
+	add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
+	psubw mm7, mm6 ;D8
+	movq [edx + 3 * 32 + 16], mm5 ;C9
+	movq [edx + 3 * 32 + 24], mm7 ;D9
 
-	xor eax, eax
-	ret
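The tail of both paths divides the biased DC by dcscalar using the int_div reciprocal; sar 16 floors the result, so the lea/cmovs pair bumps negative quotients by one to get truncation toward zero. In C terms (dc_divide is an invented helper reproducing the same arithmetic, not patch code):

    #include <stdint.h>

    /* int_div[d] = (1 << 16) / d + 1; multiply-and-shift floors the quotient,
     * and negative results need +1 to truncate toward zero. */
    static int32_t dc_divide(int32_t biased_dc, uint32_t dcscalar)
    {
        int32_t recip = (int32_t)((1L << 16) / dcscalar + 1);     /* int_div entry */
        int32_t q = (int32_t)(((int64_t)biased_dc * recip) >> 16); /* sar eax, 16 */
        if (q < 0)          /* lea ebx, [eax + 1] ; cmovs eax, ebx */
            q += 1;
        return q;
    }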
 
-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; uint32_t quant_h263_inter_3dne(int16_t * coeff,
 ;                                const int16_t * const data,
 ;                                const uint32_t quant);
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------
 
 ;This is Athlon-optimized code (ca 90 clk per call)
 ;Optimized by Jaan, 30 Nov 2002
 
-%macro quantinter 1
-	movq mm1, [eax] ;A2
-	psraw mm3, 15 ;B6
-%if (%1)
-	psubw mm2, mm6 ;C10
-%endif
-	psubw mm1, mm0 ;A3
-	pmulhw mm4, mm7 ;B7
-	movq mm6, [ecx + %1*24+16] ;C1
-	pmaxsw mm1, mm0 ;A4
-	paddw mm5, mm4 ;B8
-%if (%1)
-	movq [edx + %1*24+16-24], mm2 ;C11
-%endif
-	psubusw mm1, [ebx] ;A5 mm0 -= sub (unsigned, don't go < 0)
-	pxor mm4, mm3 ;B9
-	movq mm2, [eax] ;C2
-	psraw mm0, 15 ;A6
-	psubw mm4, mm3 ;B10
-	psubw mm2, mm6 ;C3
-	pmulhw mm1, mm7 ;A7 mm0 = (mm0 / 2Q) >> 16
-	movq mm3, [ecx + %1*24+8] ;B1
-	pmaxsw mm2, mm6 ;C4
-	paddw mm5, mm1 ;A8 sum += mm0
-%if (%1)
-	movq [edx + %1*24+8-24], mm4 ;B11
-%else
-	movq [edx + 120], mm4 ;B11
-%endif
-	psubusw mm2, [ebx] ;C5
-	pxor mm1, mm0 ;A9 mm0 *= sign(mm0)
-	movq mm4, [eax] ;B2
-	psraw mm6, 15 ;C6
-	psubw mm1, mm0 ;A10 undisplace
-	psubw mm4, mm3 ;B3
-	pmulhw mm2, mm7 ;C7
-	movq mm0, [ecx + %1*24+24] ;A1 mm0 = [1st]
-	pmaxsw mm4, mm3 ;B4
-	paddw mm5, mm2 ;C8
-	movq [byte edx + %1*24], mm1 ;A11
-	psubusw mm4, [ebx] ;B5
-	pxor mm2, mm6 ;C9
+%macro quantinter 1
+	movq mm1, [eax] ;A2
+	psraw mm3, 15 ;B6
+%if (%1)
+	psubw mm2, mm6 ;C10
+%endif
+	psubw mm1, mm0 ;A3
+	pmulhw mm4, mm7 ;B7
+	movq mm6, [ecx + %1*24+16] ;C1
+	pmaxsw mm1, mm0 ;A4
+	paddw mm5, mm4 ;B8
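quantinter implements the inter rule, which differs from the intra path by a dead zone: quant/2 (from mmx_sub) is subtracted from |c| with unsigned saturation before the divide, and the quantized magnitudes are accumulated for the caller. A whole-block scalar sketch — my paraphrase under the invented name quant_inter_ref, not code from this file:

    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t quant_inter_ref(int16_t *coeff, const int16_t *data,
                                    uint32_t quant)
    {
        uint32_t sum = 0;
        for (int i = 0; i < 64; i++) {
            unsigned mag = (unsigned)abs(data[i]);
            /* psubusw: saturating dead-zone subtraction, clamps at 0 */
            mag = mag > quant / 2 ? mag - quant / 2 : 0;
            unsigned level = mag / (2 * quant);   /* pmulhw with mmx_div */
            sum += level;                         /* paddw mm5 accumulation */
            coeff[i] = (int16_t)(data[i] < 0 ? -(int)level : (int)level);
        }
        return sum;   /* horizontal sum of mm5, returned in eax */
    }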
+%if (%1)
+	movq [edx + %1*24+16-24], mm2 ;C11
+%endif
+	psubusw mm1, [ebx] ;A5 mm0 -= sub (unsigned, don't go < 0)
+	pxor mm4, mm3 ;B9
+	movq mm2, [eax] ;C2
+	psraw mm0, 15 ;A6
+	psubw mm4, mm3 ;B10
+	psubw mm2, mm6 ;C3
+	pmulhw mm1, mm7 ;A7 mm0 = (mm0 / 2Q) >> 16
+	movq mm3, [ecx + %1*24+8] ;B1
+	pmaxsw mm2, mm6 ;C4
+	paddw mm5, mm1 ;A8 sum += mm0
+%if (%1)
+	movq [edx + %1*24+8-24], mm4 ;B11
+%else
+	movq [edx + 120], mm4 ;B11
+%endif
+	psubusw mm2, [ebx] ;C5
+	pxor mm1, mm0 ;A9 mm0 *= sign(mm0)
+	movq mm4, [eax] ;B2
+	psraw mm6, 15 ;C6
+	psubw mm1, mm0 ;A10 undisplace
+	psubw mm4, mm3 ;B3
+	pmulhw mm2, mm7 ;C7
+	movq mm0, [ecx + %1*24+24] ;A1 mm0 = [1st]
+	pmaxsw mm4, mm3 ;B4
+	paddw mm5, mm2 ;C8
+	movq [byte edx + %1*24], mm1 ;A11
+	psubusw mm4, [ebx] ;B5
+	pxor mm2, mm6 ;C9
 %endmacro
 
 %macro quantinter1 1
-	movq mm0, [byte ecx + %1*16] ;mm0 = [1st]
-	movq mm3, [ecx + %1*16+8] ;
-	movq mm1, [eax]
-	movq mm4, [eax]
-	psubw mm1, mm0
-	psubw mm4, mm3
-	pmaxsw mm1, mm0
-	pmaxsw mm4, mm3
-	psubusw mm1, mm6 ; mm0 -= sub (unsigned, don't go < 0)
-	psubusw mm4, mm6 ;
-	psraw mm0, 15
-	psraw mm3, 15
-	psrlw mm1, 1 ; mm0 = (mm0 / 2Q) >> 16
-	psrlw mm4, 1 ;
-	paddw mm5, mm1 ; sum += mm0
-	pxor mm1, mm0 ; mm0 *= sign(mm0)
-	paddw mm5, mm4
-	pxor mm4, mm3 ;
-	psubw mm1, mm0 ; undisplace
-	psubw mm4, mm3
-	cmp esp, esp
-	movq [byte edx + %1*16], mm1
-	movq [edx + %1*16+8], mm4
+	movq mm0, [byte ecx + %1*16] ;mm0 = [1st]
+	movq mm3, [ecx + %1*16+8] ;
+	movq mm1, [eax]
+	movq mm4, [eax]
+	psubw mm1, mm0
+	psubw mm4, mm3
+	pmaxsw mm1, mm0
+	pmaxsw mm4, mm3
+	psubusw mm1, mm6 ; mm0 -= sub (unsigned, don't go < 0)
+	psubusw mm4, mm6 ;
+	psraw mm0, 15
+	psraw mm3, 15
+	psrlw mm1, 1 ; mm0 = (mm0 / 2Q) >> 16
+	psrlw mm4, 1 ;
+	paddw mm5, mm1 ; sum += mm0
+	pxor mm1, mm0 ; mm0 *= sign(mm0)
+	paddw mm5, mm4
+	pxor mm4, mm3 ;
+	psubw mm1, mm0 ; undisplace
+	psubw mm4, mm3
+	cmp esp, esp
+	movq [byte edx + %1*16], mm1
+	movq [edx + %1*16+8], mm4
 %endmacro
 
-align ALIGN
+ALIGN 16
 cglobal quant_h263_inter_3dne
 quant_h263_inter_3dne:
-	mov edx, [esp + 4] ; coeff
-	mov ecx, [esp + 8] ; data
-	mov eax, [esp + 12] ; quant
-	push ebx
-
-	pxor mm5, mm5 ; sum
-	nop
-	lea ebx,[mmx_sub + eax * 8 - 8] ; sub
-	movq mm7, [mmx_div + eax * 8 - 8] ; divider
-
-	cmp al, 1
-	lea eax, [mmzero]
-	jz near .q1loop
-	cmp esp, esp
-align 8
-	movq mm3, [ecx + 120] ;B1
-	pxor mm4, mm4 ;B2
-	psubw mm4, mm3 ;B3
-	movq mm0, [ecx] ;A1 mm0 = [1st]
-	pmaxsw mm4, mm3 ;B4
-	psubusw mm4, [ebx] ;B5
-
-	quantinter 0
-	quantinter 1
-	quantinter 2
-	quantinter 3
-	quantinter 4
-
-	psraw mm3, 15 ;B6
-	psubw mm2, mm6 ;C10
-	pmulhw mm4, mm7 ;B7
-	paddw mm5, mm4 ;B8
-	pxor mm4, mm3 ;B9
-	psubw mm4, mm3 ;B10
-	movq [edx + 4*24+16], mm2 ;C11
-	pop ebx
-	movq [edx + 4*24+8], mm4 ;B11
-	pmaddwd mm5, [plus_one]
-	movq mm0, mm5
-	punpckhdq mm5, mm5
-	paddd mm0, mm5
-	movd eax, mm0 ; return sum
+	mov edx, [esp + 4] ; coeff
+	mov ecx, [esp + 8] ; data
+	mov eax, [esp + 12] ; quant
+	push ebx
+
+	pxor mm5, mm5 ; sum
+	nop
+	lea ebx,[mmx_sub + eax * 8 - 8] ; sub
+	movq mm7, [mmx_div + eax * 8 - 8] ; divider
+
+	cmp al, 1
+	lea eax, [mmzero]
+	jz near .q1loop
+	cmp esp, esp
+ALIGN 8
+	movq mm3, [ecx + 120] ;B1
+	pxor mm4, mm4 ;B2
+	psubw mm4, mm3 ;B3
+	movq mm0, [ecx] ;A1 mm0 = [1st]
+	pmaxsw mm4, mm3 ;B4
+	psubusw mm4, [ebx] ;B5
+
+	quantinter 0
+	quantinter 1
+	quantinter 2
+	quantinter 3
+	quantinter 4
+
+	psraw mm3, 15 ;B6
+	psubw mm2, mm6 ;C10
+	pmulhw mm4, mm7 ;B7
+	paddw mm5, mm4 ;B8
+	pxor mm4, mm3 ;B9
+	psubw mm4, mm3 ;B10
+	movq [edx + 4*24+16], mm2 ;C11
+	pop ebx
+	movq [edx + 4*24+8], mm4 ;B11
+	pmaddwd mm5, [plus_one]
+	movq mm0, mm5
+	punpckhdq mm5, mm5
+	paddd mm0, mm5
+	movd eax, mm0 ; return sum
 
-	ret
+	ret
 
-align ALIGN
+ALIGN 16
 .q1loop
-	movq mm6, [byte ebx]
+	movq mm6, [byte ebx]
 
-	quantinter1 0
-	quantinter1 1
-	quantinter1 2
-	quantinter1 3
-	quantinter1 4
-	quantinter1 5
-	quantinter1 6
-	quantinter1 7
-
-	pmaddwd mm5, [plus_one]
-	movq mm0, mm5
-	psrlq mm5, 32
-	paddd mm0, mm5
-	movd eax, mm0 ; return sum
-
-	pop ebx
+	quantinter1 0
+	quantinter1 1
+	quantinter1 2
+	quantinter1 3
+	quantinter1 4
+	quantinter1 5
+	quantinter1 6
+	quantinter1 7
+
+	pmaddwd mm5, [plus_one]
+	movq mm0, mm5
+	psrlq mm5, 32
+	paddd mm0, mm5
+	movd eax, mm0 ; return sum
 
-	ret
+	pop ebx
 
+	ret
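The return value comes from a horizontal sum: pmaddwd against plus_one multiplies each word lane of mm5 by one and adds pairs, leaving two dword partial sums that punpckhdq/psrlq and paddd fold together. The same reduction in scalar form (illustration only):

    #include <stdint.h>

    /* What pmaddwd [plus_one] + punpckhdq (or psrlq) + paddd compute. */
    static uint32_t horizontal_sum4(const int16_t lanes[4])
    {
        int32_t lo = lanes[0] * 1 + lanes[1] * 1;  /* pmaddwd, low dword */
        int32_t hi = lanes[2] * 1 + lanes[3] * 1;  /* pmaddwd, high dword */
        return (uint32_t)(lo + hi);                /* paddd + movd eax, mm0 */
    }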
-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; uint32_t dequant_h263_intra_3dne(int16_t *data,
 ;                                  const int16_t * const coeff,
 ;                                  const uint32_t quant,
 ;                                  const uint32_t dcscalar);
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------
 
 ; this is the same as dequant_inter_3dne, except that we're
 ; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
@@ -583,198 +567,198 @@
 
 ;This is Athlon-optimized code (ca 106 clk per call)
 
 %macro dequant 1
-	movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2
-	psubw mm0, mm1 ;-c ;A3 (1st dep)
+	movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2
+	psubw mm0, mm1 ;-c ;A3 (1st dep)
 %if (%1)
-	paddw mm4, mm6 ;C11 mm6 free (4th+)
+	paddw mm4, mm6 ;C11 mm6 free (4th+)
 %endif
-	pmaxsw mm0, mm1 ;|c| ;A4 (2nd)
+	pmaxsw mm0, mm1 ;|c| ;A4 (2nd)
 %if (%1)
-	mov ebp, ebp
-	pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later
+	mov ebp, ebp
+	pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later
 %endif
-	movq mm6, [esi] ;0 ;A5 mm6 in use
-	pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
+	movq mm6, [esi] ;0 ;A5 mm6 in use
+	pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
 %if (%1)
-	pxor mm5, mm4 ;C13 (6th+) 1later
+	pxor mm5, mm4 ;C13 (6th+) 1later
 %endif
-	movq mm4, [esi] ;C1 ;0
-	mov esp, esp
-	pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st)
-align 4
-	psraw mm1, 15 ; sign(c) ;A7 (2nd)
+	movq mm4, [esi] ;C1 ;0
+	mov esp, esp
+	pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st)
+ALIGN 4
+	psraw mm1, 15 ; sign(c) ;A7 (2nd)
 %if (%1)
-	movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later
+	movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later
 %endif
-	paddw mm7, mm3 ;B10 offset +negate back (3rd)
-	pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+)
-	paddw mm2, mm7 ;B11 mm7 free (4th+)
-	lea ebp, [byte ebp]
-	movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i]
-	psubw mm4, mm5 ;-c ;C3 (1st dep)
-	pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd)
-	pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
-	pxor mm3, mm2 ;B13 (6th+)
-	movq mm2, [byte esi] ;B1 ;0
+	paddw mm7, mm3 ;B10 offset +negate back (3rd)
+	pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+)
+	paddw mm2, mm7 ;B11 mm7 free (4th+)
+	lea ebp, [byte ebp]
+	movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i]
+	psubw mm4, mm5 ;-c ;C3 (1st dep)
+	pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd)
+	pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
+	pxor mm3, mm2 ;B13 (6th+)
+	movq mm2, [byte esi] ;B1 ;0
 %if (%1)
-	movq [edx+%1*24+8-24], mm3 ;B14 (7th)
+	movq [edx+%1*24+8-24], mm3 ;B14 (7th)
 %else
-	movq [edx+120], mm3
+	movq [edx+120], mm3
 %endif
-	pmaxsw mm4, mm5 ;|c| ;C4 (2nd)
-	paddw mm6, mm1 ;A10 offset +negate back (3rd)
-	movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i]
-	psubw mm2, mm3 ;-c ;B3 (1st dep)
-	paddw mm0, mm6 ;A11 mm6 free (4th+)
-	movq mm6, [byte esi] ;0 ;C5 mm6 in use
-	pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st)
-	pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+)
-	pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
-	pxor mm1, mm0 ;A13 (6th+)
-	pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+)
-	psraw mm5, 15 ; sign(c) ;C7 (2nd)
-	movq mm7, [byte esi] ;0 ;B5 mm7 in use
-	pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st)
+	pmaxsw mm4, mm5 ;|c| ;C4 (2nd)
+	paddw mm6, mm1 ;A10 offset +negate back (3rd)
+	movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i]
+	psubw mm2, mm3 ;-c ;B3 (1st dep)
+	paddw mm0, mm6 ;A11 mm6 free (4th+)
+	movq mm6, [byte esi] ;0 ;C5 mm6 in use
+	pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st)
+	pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+)
+	pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
+	pxor mm1, mm0 ;A13 (6th+)
+	pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+)
+	psraw mm5, 15 ; sign(c) ;C7 (2nd)
+	movq mm7, [byte esi] ;0 ;B5 mm7 in use
+	pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st)
 %if (%1 < 4)
-	movq mm0, [byte esi] ;A1 ;0
+	movq mm0, [byte esi] ;A1 ;0
 %endif
-	pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd)
-	psraw mm3, 15 ;sign(c) ;B7 (2nd)
-	movq [byte edx+%1*24], mm1 ;A14 (7th)
-	paddw mm6, mm5 ;C10 offset +negate back (3rd)
-	pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
-	mov esp, esp
+	pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd)
+	psraw mm3, 15 ;sign(c) ;B7 (2nd)
+	movq [byte edx+%1*24], mm1 ;A14 (7th)
+	paddw mm6, mm5 ;C10 offset +negate back (3rd)
+	pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
+	mov esp, esp
 %endmacro
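The dequant macro vectorizes the H.263 inverse rule: zero coefficients stay zero (the pcmpeqw/pandn mask), otherwise |c| is scaled by 2Q (mmx_mul), offset by Q or Q-1 (mmx_add), saturated against mmx_2047, and re-signed. One coefficient in C — my paraphrase under the invented name dequant_ref:

    #include <stdint.h>
    #include <stdlib.h>

    static int16_t dequant_ref(int16_t c, uint32_t quant)
    {
        if (c == 0)                   /* pcmpeqw mask keeps zeros at zero */
            return 0;
        int32_t add = (quant & 1) ? (int32_t)quant : (int32_t)quant - 1;
        int32_t mag = abs(c) * (int32_t)(2 * quant) + add;  /* pmullw + paddw */
        if (c < 0)                    /* pxor trick: negatives clamp at -2048 */
            return (int16_t)(mag > 2048 ? -2048 : -mag);
        return (int16_t)(mag > 2047 ? 2047 : mag);          /* pminsw mmx_2047 */
    }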
-align ALIGN
+ALIGN 16
 cglobal dequant_h263_intra_3dne
 dequant_h263_intra_3dne:
-	mov ecx, [esp+ 8] ; coeff
-	mov eax, [esp+12] ; quant
-	pxor mm0, mm0
-	pxor mm2, mm2
-	push edi
-	push ebx
-	lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
-	push ebp
-	mov ebx, mmx_2047
-	movsx ebp, word [ecx]
-	lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
-	push esi
-	mov esi, mmzero
-	pxor mm7, mm7
-	movq mm3, [ecx+120] ;B2 ; c = coeff[i]
-	pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
-
-	imul ebp, [esp+16+16] ; dcscalar
-	psubw mm2, mm3 ;-c ;B3 (1st dep)
-	pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
-	pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
-	psraw mm3, 15 ; sign(c) ;B7 (2nd)
-	mov edx, [esp+ 4+16] ; data
-
-align 8
-	dequant 0
-
-	cmp ebp, -2048
-	mov esp, esp
-
-	dequant 1
-
-	cmovl ebp, [int_2048]
-	nop
-
-	dequant 2
-
-	cmp ebp, 2047
-	mov esp, esp
-
-	dequant 3
-
-	cmovg ebp, [int2047]
-	nop
-
-	dequant 4
-
-	paddw mm4, mm6 ;C11 mm6 free (4th+)
-	pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
-	pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
-	mov eax, ebp
-	mov esi, [esp]
-	mov ebp, [esp+4]
-	pxor mm5, mm4 ;C13 (6th+)
-	paddw mm7, mm3 ;B10 offset +negate back (3rd)
-	movq [edx+4*24+16], mm5 ;C14 (7th)
-	paddw mm2, mm7 ;B11 mm7 free (4th+)
-	pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
-	mov ebx, [esp+8]
-	mov edi, [esp+12]
-	add esp, byte 16
-	pxor mm3, mm2 ;B13 (6th+)
-	movq [edx+4*24+8], mm3 ;B14 (7th)
-	mov [edx], ax
+	mov ecx, [esp+ 8] ; coeff
+	mov eax, [esp+12] ; quant
+	pxor mm0, mm0
+	pxor mm2, mm2
+	push edi
+	push ebx
+	lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
+	push ebp
+	mov ebx, mmx_2047
+	movsx ebp, word [ecx]
+	lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
+	push esi
+	mov esi, mmzero
+	pxor mm7, mm7
+	movq mm3, [ecx+120] ;B2 ; c = coeff[i]
+	pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
+
+	imul ebp, [esp+16+16] ; dcscalar
+	psubw mm2, mm3 ;-c ;B3 (1st dep)
+	pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
+	pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
+	psraw mm3, 15 ; sign(c) ;B7 (2nd)
+	mov edx, [esp+ 4+16] ; data
+
+ALIGN 8
+	dequant 0
+
+	cmp ebp, -2048
+	mov esp, esp
+
+	dequant 1
+
+	cmovl ebp, [int_2048]
+	nop
+
+	dequant 2
+
+	cmp ebp, 2047
+	mov esp, esp
+
+	dequant 3
+
+	cmovg ebp, [int2047]
+	nop
+
+	dequant 4
+
+	paddw mm4, mm6 ;C11 mm6 free (4th+)
+	pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
+	pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
+	mov eax, ebp
+	mov esi, [esp]
+	mov ebp, [esp+4]
+	pxor mm5, mm4 ;C13 (6th+)
+	paddw mm7, mm3 ;B10 offset +negate back (3rd)
+	movq [edx+4*24+16], mm5 ;C14 (7th)
+	paddw mm2, mm7 ;B11 mm7 free (4th+)
+	pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
+	mov ebx, [esp+8]
+	mov edi, [esp+12]
+	add esp, byte 16
+	pxor mm3, mm2 ;B13 (6th+)
+	movq [edx+4*24+8], mm3 ;B14 (7th)
+	mov [edx], ax
 
-	xor eax, eax
-	ret
+	xor eax, eax
+	ret
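Interleaved with the five dequant calls above, the DC term is rebuilt as dc*dcscalar and pinned to [-2048, 2047] without branches: two compares feed cmovl/cmovg while the MMX pipes keep working. The scalar equivalent (dequant_dc is an invented name for illustration):

    #include <stdint.h>

    static int16_t dequant_dc(int16_t dc, uint32_t dcscalar)
    {
        int32_t v = (int32_t)dc * (int32_t)dcscalar;  /* imul ebp, dcscalar */
        if (v < -2048) v = -2048;                     /* cmovl ebp, [int_2048] */
        if (v > 2047)  v = 2047;                      /* cmovg ebp, [int2047] */
        return (int16_t)v;
    }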
-;===========================================================================
+;-----------------------------------------------------------------------------
 ;
 ; uint32_t dequant_h263_inter_3dne(int16_t * data,
 ;                                  const int16_t * const coeff,
 ;                                  const uint32_t quant);
 ;
-;===========================================================================
+;-----------------------------------------------------------------------------
 
 ; this is the same as dequant_inter_3dne,
 ; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
 ; This is Athlon-optimized code (ca 100 clk per call)
 
-align ALIGN
+ALIGN 16
 cglobal dequant_h263_inter_3dne
 dequant_h263_inter_3dne:
-	mov ecx, [esp+ 8] ; coeff
-	mov eax, [esp+12] ; quant
-	pxor mm0, mm0
-	pxor mm2, mm2
-	push edi
-	push ebx
-	push esi
-	lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
-	mov ebx, mmx_2047
-	pxor mm7, mm7
-	movq mm3, [ecx+120] ;B2 ; c = coeff[i]
-	pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
-	lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
-	psubw mm2, mm3 ;-c ;B3 (1st dep)
-	mov esi, mmzero
-	pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
-	pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
-	psraw mm3, 15 ; sign(c) ;B7 (2nd)
-	mov edx, [dword esp+ 4+12] ; data
-
-align 8
-
-	dequant 0
-	dequant 1
-	dequant 2
-	dequant 3
-	dequant 4
-
-	paddw mm4, mm6 ;C11 mm6 free (4th+)
-	pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
-	pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
-	mov esi, [esp]
-	pxor mm5, mm4 ;C13 (6th+)
-	paddw mm7, mm3 ;B10 offset +negate back (3rd)
-	movq [edx+4*24+16], mm5 ;C14 (7th)
-	paddw mm2, mm7 ;B11 mm7 free (4th+)
-	pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
-	mov ebx, [esp+4]
-	mov edi, [esp+8]
-	add esp, byte 12
-	pxor mm3, mm2 ;B13 (6th+)
-	movq [edx+4*24+8], mm3 ;B14 (7th)
+	mov ecx, [esp+ 8] ; coeff
+	mov eax, [esp+12] ; quant
+	pxor mm0, mm0
+	pxor mm2, mm2
+	push edi
+	push ebx
+	push esi
+	lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
+	mov ebx, mmx_2047
+	pxor mm7, mm7
+	movq mm3, [ecx+120] ;B2 ; c = coeff[i]
+	pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
+	lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
+	psubw mm2, mm3 ;-c ;B3 (1st dep)
+	mov esi, mmzero
+	pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
+	pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
+	psraw mm3, 15 ; sign(c) ;B7 (2nd)
+	mov edx, [dword esp+ 4+12] ; data
+
+ALIGN 8
+
+	dequant 0
+	dequant 1
+	dequant 2
+	dequant 3
+	dequant 4
+
+	paddw mm4, mm6 ;C11 mm6 free (4th+)
+	pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
+	pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
+	mov esi, [esp]
+	pxor mm5, mm4 ;C13 (6th+)
+	paddw mm7, mm3 ;B10 offset +negate back (3rd)
+	movq [edx+4*24+16], mm5 ;C14 (7th)
+	paddw mm2, mm7 ;B11 mm7 free (4th+)
+	pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
+	mov ebx, [esp+4]
+	mov edi, [esp+8]
+	add esp, byte 12
+	pxor mm3, mm2 ;B13 (6th+)
+	movq [edx+4*24+8], mm3 ;B14 (7th)
 
-	xor eax, eax
-	ret
+	xor eax, eax
+	ret