--- trunk/xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm 2004/08/22 11:46:10 1535 +++ trunk/xvidcore/src/quant/x86_asm/quantize_h263_3dne.asm 2008/11/26 01:04:34 1795 @@ -5,7 +5,7 @@ ; * ; * Copyright(C) 2002-2003 Jaan Kalda ; * -; * This program is free software ; you can redistribute it and/or modify +; * This program is free software ; you can r_EDIstribute it and/or modify ; * it under the terms of the GNU General Public License as published by ; * the Free Software Foundation ; either version 2 of the License, or ; * (at your option) any later version. @@ -19,7 +19,7 @@ ; * along with this program ; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; * $Id: quantize_h263_3dne.asm,v 1.4 2004-08-22 11:46:10 edgomez Exp $ +; * $Id: quantize_h263_3dne.asm,v 1.9 2008-11-26 01:04:34 Isibaar Exp $ ; * ; *************************************************************************/ ; @@ -29,37 +29,15 @@ ; enable dequant saturate [-2048,2047], test purposes only. %define SATURATE -BITS 32 - -%macro cglobal 1 - %ifdef PREFIX - %ifdef MARK_FUNCS - global _%1:function - %define %1 _%1:function - %else - global _%1 - %define %1 _%1 - %endif - %else - %ifdef MARK_FUNCS - global %1:function - %else - global %1 - %endif - %endif -%endmacro +%include "nasm.inc" ;============================================================================= ; Local data ;============================================================================= -%ifdef FORMAT_COFF -SECTION .rodata -%else -SECTION .rodata align=16 -%endif +DATA -align 4 +align SECTION_ALIGN int_div: dd 0 %assign i 1 @@ -68,7 +46,7 @@ %assign i i+1 %endrep -ALIGN 16 +ALIGN SECTION_ALIGN plus_one: times 8 dw 1 @@ -76,7 +54,7 @@ ; subtract by Q/2 table ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN mmx_sub: %assign i 1 %rep 31 @@ -95,7 +73,7 @@ ; ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN mmx_div: %assign i 1 %rep 31 @@ -107,7 +85,7 @@ ; add by (odd(Q) ? 
Q : Q - 1) table ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN mmx_add: %assign i 1 %rep 31 @@ -123,7 +101,7 @@ ; multiple by 2Q table ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN mmx_mul: %assign i 1 %rep 31 @@ -135,17 +113,17 @@ ; saturation limits ;----------------------------------------------------------------------------- -ALIGN 8 +ALIGN SECTION_ALIGN mmx_32768_minus_2048: times 4 dw (32768-2048) mmx_32767_minus_2047: times 4 dw (32767-2047) -ALIGN 16 +ALIGN SECTION_ALIGN mmx_2047: times 4 dw 2047 -ALIGN 8 +ALIGN SECTION_ALIGN mmzero: dd 0, 0 int2047: @@ -157,7 +135,7 @@ ; Code ;============================================================================= -SECTION .text +SECTION .rotext align=SECTION_ALIGN ;----------------------------------------------------------------------------- @@ -179,24 +157,24 @@ psubw mm7, mm6 ;D8 %endif -ALIGN 8 - db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 +ALIGN SECTION_ALIGN + movq mm4, [_ECX + %1 * 32 +16] ;C1 pmaxsw mm1, mm0 ;A4 - db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 + movq mm6, [_ECX + %1 * 32 +24] ;D1 pmaxsw mm3, mm2 ;B4 psraw mm0, 15 ;A5 psraw mm2, 15 ;B5 %if (%1) - movq [edx + %1 * 32 + 16-32], mm5 ;C9 - movq [edx + %1 * 32 + 24-32], mm7 ;D9 + movq [_EDX + %1 * 32 + 16-32], mm5 ;C9 + movq [_EDX + %1 * 32 + 24-32], mm7 ;D9 %endif psrlw mm1, 1 ;A6 psrlw mm3, 1 ;B6 - movq mm5, [ebx] ;C2 - movq mm7, [ebx] ;D2 + movq mm5, [_EBX] ;C2 + movq mm7, [_EBX] ;D2 pxor mm1, mm0 ;A7 pxor mm3, mm2 ;B7 @@ -207,33 +185,38 @@ psubw mm3, mm2 ;B8 %if (%1 == 0) - push ebp - movq mm0, [ecx + %1 * 32 +32] + push _EBP + movq mm0, [_ECX + %1 * 32 +32] %elif (%1 < 3) - db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 + movq mm0, [_ECX + %1 * 32 +32] ;A1 %endif pmaxsw mm5, mm4 ;C4 %if (%1 < 3) - db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 + movq mm2, [_ECX + %1 * 32 +8+32] ;B1 %else - cmp esp, esp + cmp _ESP, _ESP %endif pmaxsw mm7, mm6 ;D4 psraw mm4, 15 ;C5 psraw mm6, 15 ;D5 - movq [byte edx + %1 * 32], mm1 ;A9 - movq [edx + %1 * 32+8], mm3 ;B9 + movq [byte _EDX + %1 * 32], mm1 ;A9 + movq [_EDX + %1 * 32+8], mm3 ;B9 psrlw mm5, 1 ;C6 psrlw mm7, 1 ;D6 %if (%1 < 3) - movq mm1, [ebx] ;A2 - movq mm3, [ebx] ;B2 + movq mm1, [_EBX] ;A2 + movq mm3, [_EBX] ;B2 %endif %if (%1 == 3) - imul eax, [int_div+4*edi] +%ifdef ARCH_IS_X86_64 + lea r9, [int_div] + imul eax, dword [r9+4*_EDI] +%else + imul _EAX, [int_div+4*_EDI] +%endif %endif pxor mm5, mm4 ;C7 pxor mm7, mm6 ;D7 @@ -253,24 +236,24 @@ psubw mm7, mm6 ;D8 %endif -ALIGN 8 - db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 +ALIGN SECTION_ALIGN + movq mm4, [_ECX + %1 * 32 +16] ;C1 pmaxsw mm1, mm0 ;A4 - db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 + movq mm6, [_ECX + %1 * 32 +24] ;D1 pmaxsw mm3, mm2 ;B4 psraw mm0, 15 ;A5 psraw mm2, 15 ;B5 %if (%1) - movq [edx + %1 * 32 + 16-32], mm5 ;C9 - movq [edx + %1 * 32 + 24-32], mm7 ;D9 + movq [_EDX + %1 * 32 + 16-32], mm5 ;C9 + movq [_EDX + %1 * 32 + 24-32], mm7 ;D9 %endif - pmulhw mm1, [esi] ;A6 - pmulhw mm3, [esi] ;B6 - movq mm5, [ebx] ;C2 - movq mm7, [ebx] ;D2 + pmulhw mm1, [_ESI] ;A6 + pmulhw mm3, [_ESI] ;B6 + movq mm5, [_EBX] ;C2 + movq mm7, [_EBX] ;D2 nop nop @@ -284,134 +267,180 @@ %if (%1 < 3) - db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1 + movq mm0, [_ECX 
+ %1 * 32 +32] ;A1 %endif pmaxsw mm5, mm4 ;C4 %if (%1 < 3) - db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1 + movq mm2, [_ECX + %1 * 32 +8+32] ;B1 %else - cmp esp, esp + cmp _ESP, _ESP %endif pmaxsw mm7,mm6 ;D4 psraw mm4, 15 ;C5 psraw mm6, 15 ;D5 - movq [byte edx + %1 * 32], mm1 ;A9 - movq [edx + %1 * 32+8], mm3 ;B9 + movq [byte _EDX + %1 * 32], mm1 ;A9 + movq [_EDX + %1 * 32+8], mm3 ;B9 - pmulhw mm5, [esi] ;C6 - pmulhw mm7, [esi] ;D6 + pmulhw mm5, [_ESI] ;C6 + pmulhw mm7, [_ESI] ;D6 %if (%1 < 3) - movq mm1, [ebx] ;A2 - movq mm3, [ebx] ;B2 + movq mm1, [_EBX] ;A2 + movq mm3, [_EBX] ;B2 %endif %if (%1 == 0) - push ebp + push _EBP %elif (%1 < 3) nop %endif nop %if (%1 == 3) - imul eax, [int_div+4*edi] +%ifdef ARCH_IS_X86_64 + lea r9, [int_div] + imul eax, dword [r9+4*_EDI] +%else + imul _EAX, [int_div+4*_EDI] +%endif %endif pxor mm5, mm4 ;C7 pxor mm7, mm6 ;D7 %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN cglobal quant_h263_intra_3dne quant_h263_intra_3dne: - mov eax, [esp + 12] ; quant - mov ecx, [esp + 8] ; data - mov edx, [esp + 4] ; coeff +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] + add _ESP, PTR_SIZE +%ifndef WINDOWS + push prm6 + push prm5 +%endif + push prm4 + push prm3 + push prm2 + push prm1 + sub _ESP, PTR_SIZE + mov [_ESP], TMP0 +%endif + + mov _EAX, [_ESP + 3*PTR_SIZE] ; quant + mov _ECX, [_ESP + 2*PTR_SIZE] ; data + mov _EDX, [_ESP + 1*PTR_SIZE] ; coeff cmp al, 1 pxor mm1, mm1 pxor mm3, mm3 - movq mm0, [ecx] ; mm0 = [1st] - movq mm2, [ecx + 8] - push esi - lea esi, [mmx_div + eax*8 - 8] - - push ebx - mov ebx, mmzero - push edi + movq mm0, [_ECX] ; mm0 = [1st] + movq mm2, [_ECX + 8] + push _ESI +%ifdef ARCH_IS_X86_64 + lea _ESI, [mmx_div] + lea _ESI, [_ESI + _EAX*8 - 8] +%else + lea _ESI, [mmx_div + _EAX*8 - 8] +%endif + + push _EBX + mov _EBX, mmzero + push _EDI jz near .q1loop quant_intra 0 - mov ebp, [esp + 16 + 16] ; dcscalar - ; NB -- there are 3 pushes in the function preambule and one more - ; in "quant_intra 0", thus an added offset of 16 bytes - movsx eax, word [byte ecx] ; DC + mov _EBP, [_ESP + (4+4)*PTR_SIZE] ; dcscalar + ; NB -- there are 3 pushes in the function preambule and one more + ; in "quant_intra 0", thus an added offset of 16 bytes + movsx _EAX, word [byte _ECX] ; DC quant_intra 1 - mov edi, eax - sar edi, 31 ; sign(DC) - shr ebp, byte 1 ; ebp = dcscalar/2 + mov _EDI, _EAX + sar _EDI, 31 ; sign(DC) + shr _EBP, byte 1 ; _EBP = dcscalar/2 quant_intra 2 - sub eax, edi ; DC (+1) - xor ebp, edi ; sign(DC) dcscalar /2 (-1) - mov edi, [esp + 16 + 16] ; dscalar - lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2 - mov ebp, [byte esp] + sub _EAX, _EDI ; DC (+1) + xor _EBP, _EDI ; sign(DC) dcscalar /2 (-1) + mov _EDI, [_ESP + (4+4)*PTR_SIZE] ; dscalar + lea _EAX, [byte _EAX + _EBP] ; DC + sign(DC) dcscalar/2 + mov _EBP, [byte _ESP] quant_intra 3 psubw mm5, mm4 ;C8 - mov esi, [esp + 12] ; pop back the register value - mov edi, [esp + 4] ; pop back the register value - sar eax, 16 - lea ebx, [byte eax + 1] ; workaround for eax < 0 - cmovs eax, ebx ; conditionnaly move the corrected value - mov [edx], ax ; coeff[0] = ax - mov ebx, [esp + 8] ; pop back the register value - add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 + mov _ESI, [_ESP + 3*PTR_SIZE] ; pop back the register value + mov _EDI, [_ESP + 1*PTR_SIZE] ; pop back the register value + sar _EAX, 16 + lea _EBX, [byte _EAX + 1] ; workaround for _EAX < 0 + cmovs _EAX, _EBX ; conditionnaly move the corrected value + mov 
[_EDX], ax ; coeff[0] = ax + mov _EBX, [_ESP + 2*PTR_SIZE] ; pop back the register value + add _ESP, byte 4*PTR_SIZE ; "quant_intra 0" pushed _EBP, but we don't restore that one, just correct the stack offset by 16 psubw mm7, mm6 ;D8 - movq [edx + 3 * 32 + 16], mm5 ;C9 - movq [edx + 3 * 32 + 24], mm7 ;D9 + movq [_EDX + 3 * 32 + 16], mm5 ;C9 + movq [_EDX + 3 * 32 + 24], mm7 ;D9 + + xor _EAX, _EAX + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif - xor eax, eax ret -ALIGN 16 +ALIGN SECTION_ALIGN -.q1loop +.q1loop: quant_intra1 0 - mov ebp, [esp + 16 + 16] ; dcscalar - movsx eax, word [byte ecx] ; DC + mov _EBP, [_ESP + (4+4)*PTR_SIZE] ; dcscalar + movsx _EAX, word [byte _ECX] ; DC quant_intra1 1 - mov edi, eax - sar edi, 31 ; sign(DC) - shr ebp, byte 1 ; ebp = dcscalar /2 + mov _EDI, _EAX + sar _EDI, 31 ; sign(DC) + shr _EBP, byte 1 ; _EBP = dcscalar /2 quant_intra1 2 - sub eax, edi ; DC (+1) - xor ebp, edi ; sign(DC) dcscalar /2 (-1) - mov edi, [esp + 16 + 16] ; dcscalar - lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2 - mov ebp, [byte esp] + sub _EAX, _EDI ; DC (+1) + xor _EBP, _EDI ; sign(DC) dcscalar /2 (-1) + mov _EDI, [_ESP + (4+4)*PTR_SIZE] ; dcscalar + lea _EAX, [byte _EAX + _EBP] ; DC + sign(DC) dcscalar /2 + mov _EBP, [byte _ESP] quant_intra1 3 psubw mm5, mm4 ;C8 - mov esi, [dword esp + 12] ; pop back the register value - mov edi, [esp + 4] ; pop back the register value - sar eax, 16 - lea ebx, [byte eax + 1] ; workaround for eax < 0 - cmovs eax, ebx ; conditionnaly move the corrected value - mov [edx], ax ; coeff[0] = ax - mov ebx, [esp + 8] ; pop back the register value - add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 + mov _ESI, [_ESP + 3*PTR_SIZE] ; pop back the register value + mov _EDI, [_ESP + 1*PTR_SIZE] ; pop back the register value + sar _EAX, 16 + lea _EBX, [byte _EAX + 1] ; workaround for _EAX < 0 + cmovs _EAX, _EBX ; conditionnaly move the corrected value + mov [_EDX], ax ; coeff[0] = ax + mov _EBX, [_ESP + 2*PTR_SIZE] ; pop back the register value + add _ESP, byte 4*PTR_SIZE ; "quant_intra 0" pushed _EBP, but we don't restore that one, just correct the stack offset by 16 psubw mm7, mm6 ;D8 - movq [edx + 3 * 32 + 16], mm5 ;C9 - movq [edx + 3 * 32 + 24], mm7 ;D9 + movq [_EDX + 3 * 32 + 16], mm5 ;C9 + movq [_EDX + 3 * 32 + 24], mm7 ;D9 - xor eax, eax - ret + xor _EAX, _EAX +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif + ret +ENDFUNC ;----------------------------------------------------------------------------- @@ -427,54 +456,54 @@ %macro quantinter 1 - movq mm1, [eax] ;A2 + movq mm1, [_EAX] ;A2 psraw mm3, 15 ;B6 %if (%1) psubw mm2, mm6 ;C10 %endif psubw mm1, mm0 ;A3 pmulhw mm4, mm7 ;B7 - movq mm6, [ecx + %1*24+16] ;C1 + movq mm6, [_ECX + %1*24+16] ;C1 pmaxsw mm1, mm0 ;A4 paddw mm5, mm4 ;B8 %if (%1) - movq [edx + %1*24+16-24], mm2 ;C11 + movq [_EDX + %1*24+16-24], mm2 ;C11 %endif - psubusw mm1, [ebx] ;A5 mm0 -= sub (unsigned, dont go < 0) + psubusw mm1, [_EBX] ;A5 mm0 -= sub (unsigned, dont go < 0) pxor mm4, mm3 ;B9 - movq mm2, [eax] ;C2 + movq mm2, [_EAX] ;C2 psraw mm0, 15 ;A6 psubw mm4, mm3 ;B10 psubw mm2, mm6 ;C3 pmulhw mm1, mm7 ;A7 mm0 = (mm0 / 2Q) >> 24 - movq mm3, [ecx + %1*24+8] ;B1 + movq mm3, [_ECX + %1*24+8] ;B1 pmaxsw mm2, mm6 ;C4 paddw mm5, mm1 ;A8 sum += mm0 %if (%1) - movq [edx + 
%1*24+8-24], mm4 ;B11 + movq [_EDX + %1*24+8-24], mm4 ;B11 %else - movq [edx + 120], mm4 ;B11 + movq [_EDX + 120], mm4 ;B11 %endif - psubusw mm2, [ebx] ;C5 + psubusw mm2, [_EBX] ;C5 pxor mm1, mm0 ;A9 mm0 *= sign(mm0) - movq mm4, [eax] ;B2 + movq mm4, [_EAX] ;B2 psraw mm6, 15 ;C6 psubw mm1, mm0 ;A10 undisplace psubw mm4, mm3 ;B3 pmulhw mm2, mm7 ;C7 - movq mm0, [ecx + %1*24+24] ;A1 mm0 = [1st] + movq mm0, [_ECX + %1*24+24] ;A1 mm0 = [1st] pmaxsw mm4, mm3 ;B4 paddw mm5, mm2 ;C8 - movq [byte edx + %1*24], mm1 ;A11 - psubusw mm4, [ebx] ;B5 + movq [byte _EDX + %1*24], mm1 ;A11 + psubusw mm4, [_EBX] ;B5 pxor mm2, mm6 ;C9 %endmacro %macro quantinter1 1 - movq mm0, [byte ecx + %1*16] ;mm0 = [1st] - movq mm3, [ecx + %1*16+8] ; - movq mm1, [eax] - movq mm4, [eax] + movq mm0, [byte _ECX + %1*16] ;mm0 = [1st] + movq mm3, [_ECX + %1*16+8] ; + movq mm1, [_EAX] + movq mm4, [_EAX] psubw mm1, mm0 psubw mm4, mm3 pmaxsw mm1, mm0 @@ -491,35 +520,58 @@ pxor mm4, mm3 ; psubw mm1, mm0 ; undisplace psubw mm4, mm3 - cmp esp, esp - movq [byte edx + %1*16], mm1 - movq [edx + %1*16+8], mm4 + cmp _ESP, _ESP + movq [byte _EDX + %1*16], mm1 + movq [_EDX + %1*16+8], mm4 %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN cglobal quant_h263_inter_3dne quant_h263_inter_3dne: - mov edx, [esp + 4] ; coeff - mov ecx, [esp + 8] ; data - mov eax, [esp + 12] ; quant - push ebx + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] + add _ESP, PTR_SIZE +%ifndef WINDOWS + push prm6 + push prm5 +%endif + push prm4 + push prm3 + push prm2 + push prm1 + sub _ESP, PTR_SIZE + mov [_ESP], TMP0 +%endif + + mov _EDX, [_ESP + 1*PTR_SIZE] ; coeff + mov _ECX, [_ESP + 2*PTR_SIZE] ; data + mov _EAX, [_ESP + 3*PTR_SIZE] ; quant + push _EBX pxor mm5, mm5 ; sum nop - lea ebx,[mmx_sub + eax * 8 - 8] ; sub - movq mm7, [mmx_div + eax * 8 - 8] ; divider +%ifdef ARCH_IS_X86_64 + lea _EBX, [mmx_div] + movq mm7, [_EBX + _EAX * 8 - 8] + lea _EBX, [mmx_sub] + lea _EBX, [_EBX + _EAX * 8 - 8] +%else + lea _EBX,[mmx_sub + _EAX * 8 - 8] ; sub + movq mm7, [mmx_div + _EAX * 8 - 8] ; divider +%endif cmp al, 1 - lea eax, [mmzero] + lea _EAX, [mmzero] jz near .q1loop - cmp esp, esp -ALIGN 8 - movq mm3, [ecx + 120] ;B1 + cmp _ESP, _ESP +ALIGN SECTION_ALIGN + movq mm3, [_ECX + 120] ;B1 pxor mm4, mm4 ;B2 psubw mm4, mm3 ;B3 - movq mm0, [ecx] ;A1 mm0 = [1st] + movq mm0, [_ECX] ;A1 mm0 = [1st] pmaxsw mm4, mm3 ;B4 - psubusw mm4, [ebx] ;B5 + psubusw mm4, [_EBX] ;B5 quantinter 0 quantinter 1 @@ -533,20 +585,30 @@ paddw mm5, mm4 ;B8 pxor mm4, mm3 ;B9 psubw mm4, mm3 ;B10 - movq [edx + 4*24+16], mm2 ;C11 - pop ebx - movq [edx + 4*24+8], mm4 ;B11 + movq [_EDX + 4*24+16], mm2 ;C11 + pop _EBX + movq [_EDX + 4*24+8], mm4 ;B11 pmaddwd mm5, [plus_one] movq mm0, mm5 punpckhdq mm5, mm5 paddd mm0, mm5 movd eax, mm0 ; return sum +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif + ret -ALIGN 16 -.q1loop - movq mm6, [byte ebx] +ALIGN SECTION_ALIGN +.q1loop: + movq mm6, [byte _EBX] quantinter1 0 quantinter1 1 @@ -563,9 +625,21 @@ paddd mm0, mm5 movd eax, mm0 ; return sum - pop ebx + pop _EBX + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif ret +ENDFUNC + ;----------------------------------------------------------------------------- ; @@ -583,140 +657,179 @@ ;This is Athlon-optimized code (ca 106 clk per call) %macro dequant 1 - movq mm1, [ecx+%1*24] ; c = coeff[i] ;A2 + movq mm1, [_ECX+%1*24] ; c = coeff[i] ;A2 psubw 
mm0, mm1 ;-c ;A3 (1st dep) %if (%1) paddw mm4, mm6 ;C11 mm6 free (4th+) %endif pmaxsw mm0, mm1 ;|c| ;A4 (2nd) %if (%1) - mov ebp, ebp - pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) later + mov _EBP, _EBP + pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) later %endif - movq mm6, [esi] ;0 ;A5 mm6 in use - pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) + movq mm6, [_ESI] ;0 ;A5 mm6 in use + pandn mm7, [_EAX] ;B9 offset = isZero ? 0 : quant_add (2nd) %if (%1) pxor mm5, mm4 ;C13 (6th+) 1later %endif - movq mm4, [esi] ;C1 ;0 - mov esp, esp - pcmpeqw mm6, [ecx+%1*24] ;A6 (c ==0) ? -1 : 0 (1st) -ALIGN 4 + movq mm4, [_ESI] ;C1 ;0 + mov _ESP, _ESP + pcmpeqw mm6, [_ECX+%1*24] ;A6 (c ==0) ? -1 : 0 (1st) +ALIGN SECTION_ALIGN psraw mm1, 15 ; sign(c) ;A7 (2nd) %if (%1) - movq [edx+%1*24+16-24], mm5 ; C14 (7th) 2later + movq [_EDX+%1*24+16-24], mm5 ; C14 (7th) 2later %endif paddw mm7, mm3 ;B10 offset +negate back (3rd) - pmullw mm0, [edi] ;*= 2Q ;A8 (3rd+) + pmullw mm0, [_EDI] ;*= 2Q ;A8 (3rd+) paddw mm2, mm7 ;B11 mm7 free (4th+) - lea ebp, [byte ebp] - movq mm5, [ecx+%1*24+16] ;C2 ; c = coeff[i] + lea _EBP, [byte _EBP] + movq mm5, [_ECX+%1*24+16] ;C2 ; c = coeff[i] psubw mm4, mm5 ;-c ;C3 (1st dep) - pandn mm6, [eax] ;A9 offset = isZero ? 0 : quant_add (2nd) - pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) + pandn mm6, [_EAX] ;A9 offset = isZero ? 0 : quant_add (2nd) + pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) pxor mm3, mm2 ;B13 (6th+) - movq mm2, [byte esi] ;B1 ;0 + movq mm2, [byte _ESI] ;B1 ;0 %if (%1) - movq [edx+%1*24+8-24], mm3 ;B14 (7th) + movq [_EDX+%1*24+8-24], mm3 ;B14 (7th) %else - movq [edx+120], mm3 + movq [_EDX+120], mm3 %endif pmaxsw mm4, mm5 ;|c| ;C4 (2nd) paddw mm6, mm1 ;A10 offset +negate back (3rd) - movq mm3, [ecx+%1*24 + 8] ;B2 ; c = coeff[i] + movq mm3, [_ECX+%1*24 + 8] ;B2 ; c = coeff[i] psubw mm2, mm3 ;-c ;B3 (1st dep) paddw mm0, mm6 ;A11 mm6 free (4th+) - movq mm6, [byte esi] ;0 ;C5 mm6 in use - pcmpeqw mm6, [ecx+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st) - pminsw mm0, [ebx] ;A12 saturates to +2047 (5th+) + movq mm6, [byte _ESI] ;0 ;C5 mm6 in use + pcmpeqw mm6, [_ECX+%1*24+16] ;C6 (c ==0) ? -1 : 0 (1st) + pminsw mm0, [_EBX] ;A12 saturates to +2047 (5th+) pmaxsw mm2, mm3 ;|c| ;B4 (2nd) pxor mm1, mm0 ;A13 (6th+) - pmullw mm4, [edi] ;*= 2Q ;C8 (3rd+) + pmullw mm4, [_EDI] ;*= 2Q ;C8 (3rd+) psraw mm5, 15 ; sign(c) ;C7 (2nd) - movq mm7, [byte esi] ;0 ;B5 mm7 in use - pcmpeqw mm7, [ecx+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st) + movq mm7, [byte _ESI] ;0 ;B5 mm7 in use + pcmpeqw mm7, [_ECX+%1*24 + 8] ;B6 (c ==0) ? -1 : 0 (1st) %if (%1 < 4) - movq mm0, [byte esi] ;A1 ;0 + movq mm0, [byte _ESI] ;A1 ;0 %endif - pandn mm6, [byte eax] ;C9 offset = isZero ? 0 : quant_add (2nd) + pandn mm6, [byte _EAX] ;C9 offset = isZero ? 
0 : quant_add (2nd) psraw mm3, 15 ;sign(c) ;B7 (2nd) - movq [byte edx+%1*24], mm1 ;A14 (7th) + movq [byte _EDX+%1*24], mm1 ;A14 (7th) paddw mm6, mm5 ;C10 offset +negate back (3rd) - pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) - mov esp, esp + pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) + mov _ESP, _ESP %endmacro -ALIGN 16 +ALIGN SECTION_ALIGN cglobal dequant_h263_intra_3dne dequant_h263_intra_3dne: - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] + add _ESP, PTR_SIZE +%ifndef WINDOWS + push prm6 + push prm5 +%endif + push prm4 + push prm3 + push prm2 + push prm1 + sub _ESP, PTR_SIZE + mov [_ESP], TMP0 +%endif + + mov _ECX, [_ESP+ 2*PTR_SIZE] ; coeff + mov _EAX, [_ESP+ 3*PTR_SIZE] ; quant pxor mm0, mm0 pxor mm2, mm2 - push edi - push ebx - lea edi, [mmx_mul + eax*8 - 8] ; 2*quant - push ebp - mov ebx, mmx_2047 - movsx ebp, word [ecx] - lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 - push esi - mov esi, mmzero + push _EDI + push _EBX +%ifdef ARCH_IS_X86_64 + lea _EDI, [mmx_mul] + lea _EDI, [_EDI + _EAX*8 - 8] ; 2*quant +%else + lea _EDI, [mmx_mul + _EAX*8 - 8] ; 2*quant +%endif + push _EBP + mov _EBX, mmx_2047 + movsx _EBP, word [_ECX] +%ifdef ARCH_IS_X86_64 + lea r9, [mmx_add] + lea _EAX, [r9 + _EAX*8 - 8] ; quant or quant-1 +%else + lea _EAX, [mmx_add + _EAX*8 - 8] ; quant or quant-1 +%endif + push _ESI + mov _ESI, mmzero pxor mm7, mm7 - movq mm3, [ecx+120] ;B2 ; c = coeff[i] - pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) + movq mm3, [_ECX+120] ;B2 ; c = coeff[i] + pcmpeqw mm7, [_ECX+120] ;B6 (c ==0) ? -1 : 0 (1st) - imul ebp, [esp+16+16] ; dcscalar + imul _EBP, [_ESP+(4+4)*PTR_SIZE] ; dcscalar psubw mm2, mm3 ;-c ;B3 (1st dep) pmaxsw mm2, mm3 ;|c| ;B4 (2nd) - pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) + pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) psraw mm3, 15 ; sign(c) ;B7 (2nd) - mov edx, [esp+ 4+16] ; data + mov _EDX, [_ESP+ (1+4)*PTR_SIZE] ; data -ALIGN 8 +ALIGN SECTION_ALIGN dequant 0 - cmp ebp, -2048 - mov esp, esp + cmp _EBP, -2048 + mov _ESP, _ESP dequant 1 - cmovl ebp, [int_2048] + cmovl _EBP, [int_2048] nop dequant 2 - cmp ebp, 2047 - mov esp, esp + cmp _EBP, 2047 + mov _ESP, _ESP dequant 3 - cmovg ebp, [int2047] + cmovg _EBP, [int2047] nop dequant 4 paddw mm4, mm6 ;C11 mm6 free (4th+) - pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) - pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) - mov eax, ebp - mov esi, [esp] - mov ebp, [esp+4] + pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) + pandn mm7, [_EAX] ;B9 offset = isZero ? 
0 : quant_add (2nd) + mov _EAX, _EBP + mov _ESI, [_ESP] + mov _EBP, [_ESP+PTR_SIZE] pxor mm5, mm4 ;C13 (6th+) paddw mm7, mm3 ;B10 offset +negate back (3rd) - movq [edx+4*24+16], mm5 ;C14 (7th) + movq [_EDX+4*24+16], mm5 ;C14 (7th) paddw mm2, mm7 ;B11 mm7 free (4th+) - pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) - mov ebx, [esp+8] - mov edi, [esp+12] - add esp, byte 16 + pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) + mov _EBX, [_ESP+2*PTR_SIZE] + mov _EDI, [_ESP+3*PTR_SIZE] + add _ESP, byte 4*PTR_SIZE pxor mm3, mm2 ;B13 (6th+) - movq [edx+4*24+8], mm3 ;B14 (7th) - mov [edx], ax + movq [_EDX+4*24+8], mm3 ;B14 (7th) + mov [_EDX], ax + + xor _EAX, _EAX + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif - xor eax, eax ret +ENDFUNC + ;----------------------------------------------------------------------------- ; @@ -731,30 +844,56 @@ ; except that we're saturating using 'pminsw' (saves 2 cycles/loop) ; This is Athlon-optimized code (ca 100 clk per call) -ALIGN 16 +ALIGN SECTION_ALIGN cglobal dequant_h263_inter_3dne dequant_h263_inter_3dne: - mov ecx, [esp+ 8] ; coeff - mov eax, [esp+12] ; quant + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] + add _ESP, PTR_SIZE +%ifndef WINDOWS + push prm6 + push prm5 +%endif + push prm4 + push prm3 + push prm2 + push prm1 + sub _ESP, PTR_SIZE + mov [_ESP], TMP0 +%endif + + mov _ECX, [_ESP+ 2*PTR_SIZE] ; coeff + mov _EAX, [_ESP+ 3*PTR_SIZE] ; quant pxor mm0, mm0 pxor mm2, mm2 - push edi - push ebx - push esi - lea edi, [mmx_mul + eax*8 - 8] ; 2*quant - mov ebx, mmx_2047 + push _EDI + push _EBX + push _ESI +%ifdef ARCH_IS_X86_64 + lea _EDI, [mmx_mul] + lea _EDI, [_EDI + _EAX*8 - 8] ; 2*quant +%else + lea _EDI, [mmx_mul + _EAX*8 - 8] ; 2*quant +%endif + mov _EBX, mmx_2047 pxor mm7, mm7 - movq mm3, [ecx+120] ;B2 ; c = coeff[i] - pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) - lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 + movq mm3, [_ECX+120] ;B2 ; c = coeff[i] + pcmpeqw mm7, [_ECX+120] ;B6 (c ==0) ? -1 : 0 (1st) +%ifdef ARCH_IS_X86_64 + lea r9, [mmx_add] + lea _EAX, [r9 + _EAX*8 - 8] ; quant or quant-1 +%else + lea _EAX, [mmx_add + _EAX*8 - 8] ; quant or quant-1 +%endif psubw mm2, mm3 ;-c ;B3 (1st dep) - mov esi, mmzero + mov _ESI, mmzero pmaxsw mm2, mm3 ;|c| ;B4 (2nd) - pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) + pmullw mm2, [_EDI] ;*= 2Q ;B8 (3rd+) psraw mm3, 15 ; sign(c) ;B7 (2nd) - mov edx, [dword esp+ 4+12] ; data + mov _EDX, [_ESP+ (1+3)*PTR_SIZE] ; data -ALIGN 8 +ALIGN SECTION_ALIGN dequant 0 dequant 1 @@ -763,19 +902,36 @@ dequant 4 paddw mm4, mm6 ;C11 mm6 free (4th+) - pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) - pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd) - mov esi, [esp] + pminsw mm4, [_EBX] ;C12 saturates to +2047 (5th+) + pandn mm7, [_EAX] ;B9 offset = isZero ? 
0 : quant_add (2nd) + mov _ESI, [_ESP] pxor mm5, mm4 ;C13 (6th+) paddw mm7, mm3 ;B10 offset +negate back (3rd) - movq [edx+4*24+16], mm5 ;C14 (7th) + movq [_EDX+4*24+16], mm5 ;C14 (7th) paddw mm2, mm7 ;B11 mm7 free (4th+) - pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) - mov ebx, [esp+4] - mov edi, [esp+8] - add esp, byte 12 + pminsw mm2, [_EBX] ;B12 saturates to +2047 (5th+) + mov _EBX, [_ESP+PTR_SIZE] + mov _EDI, [_ESP+2*PTR_SIZE] + add _ESP, byte 3*PTR_SIZE pxor mm3, mm2 ;B13 (6th+) - movq [edx+4*24+8], mm3 ;B14 (7th) + movq [_EDX+4*24+8], mm3 ;B14 (7th) + + xor _EAX, _EAX + +%ifdef ARCH_IS_X86_64 + mov TMP0, [_ESP] +%ifndef WINDOWS + add _ESP, 6*PTR_SIZE +%else + add _ESP, 4*PTR_SIZE +%endif + mov [_ESP], TMP0 +%endif - xor eax, eax ret +ENDFUNC + +%ifidn __OUTPUT_FORMAT__,elf +section ".note.GNU-stack" noalloc noexec nowrite progbits +%endif +
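
For reference, the arithmetic that the tables above encode (mmx_sub = Q/2, mmx_div = reciprocals of 2Q, mmx_add = odd(Q) ? Q : Q-1, mmx_mul = 2Q, mmx_2047 / int_2048 / int2047 = saturation limits) is, in plain C, roughly the following sketch. It is a minimal illustration modelled on the generic xvidcore reference quantisers, not part of this patch: the *_ref names, the div_round/clamp2047 helpers and the exact rounding of the DC path are assumptions.

/* Plain-C sketch of the H.263 (de)quantisation the 3DNE routines implement.
 * Names and rounding helpers are assumptions, not part of this patch. */
#include <stdint.h>

/* assumed DIV_DIV-style rounding: to nearest, away from zero */
static int32_t div_round(int32_t a, int32_t b)
{
    return (a > 0) ? (a + (b >> 1)) / b : (a - (b >> 1)) / b;
}

static int16_t clamp2047(int32_t v)   /* mmx_2047 / int_2048 saturation */
{
    return (int16_t)(v < -2048 ? -2048 : (v > 2047 ? 2047 : v));
}

/* intra quant: DC scaled by dcscalar, AC level = |data| / (2*quant), sign kept */
void quant_h263_intra_ref(int16_t *coeff, const int16_t *data,
                          uint32_t quant, uint32_t dcscalar)
{
    coeff[0] = (int16_t)div_round(data[0], (int32_t)dcscalar);
    for (int i = 1; i < 64; i++) {
        int32_t level = data[i];
        int32_t sign  = (level < 0) ? -1 : 1;
        coeff[i] = (int16_t)(sign * ((sign * level) / (int32_t)(quant << 1)));
    }
}

/* inter quant: level = (|data| - quant/2) / (2*quant); returns sum of magnitudes */
uint32_t quant_h263_inter_ref(int16_t *coeff, const int16_t *data, uint32_t quant)
{
    uint32_t sum = 0;
    for (int i = 0; i < 64; i++) {
        int32_t level = data[i];
        int32_t sign  = (level < 0) ? -1 : 1;
        int32_t mag   = sign * level - (int32_t)(quant >> 1);  /* psubusw [mmx_sub] */
        mag = (mag < 0) ? 0 : mag / (int32_t)(quant << 1);     /* pmulhw [mmx_div] */
        sum += (uint32_t)mag;                                  /* pmaddwd plus_one  */
        coeff[i] = (int16_t)(sign * mag);
    }
    return sum;
}

/* inter dequant: data = sign(c) * (2*quant*|c| + (odd(quant) ? quant : quant-1)),
 * zero stays zero, result saturated to [-2048, 2047] */
void dequant_h263_inter_ref(int16_t *data, const int16_t *coeff, uint32_t quant)
{
    int32_t add = (quant & 1) ? (int32_t)quant : (int32_t)quant - 1; /* mmx_add */
    for (int i = 0; i < 64; i++) {
        int32_t c = coeff[i];
        if (c == 0) { data[i] = 0; continue; }
        int32_t sign = (c < 0) ? -1 : 1;
        data[i] = clamp2047(sign * (2 * (int32_t)quant * (sign * c) + add)); /* mmx_mul */
    }
}

/* intra dequant: AC as above, DC = coeff[0] * dcscalar, saturated like the
 * int_2048 / int2047 cmov pair in dequant_h263_intra_3dne */
void dequant_h263_intra_ref(int16_t *data, const int16_t *coeff,
                            uint32_t quant, uint32_t dcscalar)
{
    dequant_h263_inter_ref(data, coeff, quant);
    data[0] = clamp2047(coeff[0] * (int32_t)dcscalar);
}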