Diff of /branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize_3dne.asm

-revision 1088, Mon Jul 14 12:40:16 2003 UTC
+revision 1089, Wed Jul 16 23:00:08 2003 UTC
 Line 1
  ;/**************************************************************************
  ; *
  ; *     XVID MPEG-4 VIDEO CODEC
- ; *     mmx quantization/dequantization
+ ; *  - mmx quantization/dequantization -
  ; *
- ; *     This program is an implementation of a part of one or more MPEG-4
+ ; *  Copyright(C) 2001-2003 XviD Team <xvid-devel@xvid.org>
- ; *     Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
- ; *     to use this software module in hardware or software products are
- ; *     advised that its use may infringe existing patents or copyrights, and
- ; *     any such use would be at such party's own risk.  The original
- ; *     developer of this software module and his/her company, and subsequent
- ; *     editors and their companies, will have no liability for use of this
- ; *     software or modifications or derivatives thereof.
  ; *
  ; *     This program is free software; you can redistribute it and/or modify
  ; *     it under the terms of the GNU General Public License as published by
-Line 24
+Line 17
  ; *
  ; *     You should have received a copy of the GNU General Public License
  ; *     along with this program; if not, write to the Free Software
- ; *     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ ; *
+ ; * $Id: quantize_3dne.asm,v 1.2.2.1 2003-07-16 22:59:20 edgomez Exp $
  ; *
  ; *************************************************************************/
  ; these 3dne functions are compatible with iSSE, but are optimized specifically for
-Line 183
+Line 178
  ;Optimized by Jaan, 30 Nov 2002
   %macro quant_intra1  1
                  psubw   mm1,mm0   ;A3
                  psubw   mm3,mm2   ;B3
  %if (%1)
-Line 251
+Line 247
  %endm
- %macro quant_intra  1 ;rules for athlon: 1) schedule latencies, 2) add/mul and load/store in 2:1 proportion,
+ %macro quant_intra  1
+         ; Rules for athlon:
+                 ; 1) schedule latencies
+                 ; 2) add/mul and load/store in 2:1 proportion
                                                  ; 3) avoid splitting >3byte instructions over 8byte boundaries
                  psubw   mm1,mm0   ;A3
                  psubw   mm3,mm2   ;B3
  %if (%1)
 Line 346
                  mov     ebx,mmzero
                  push edi
                  jz      near .q1loop
  quant_intra 0
  mov     ebp, [esp + 16 + 16]    ; dcscalar
- movsx   eax, word [byte ecx] ;x
+                                         ; NB -- there are 3 pushes in the function preamble and one more
+                                         ; in "quant_intra 0", thus an added offset of 16 bytes
+         movsx   eax, word [byte ecx]    ; DC
  quant_intra 1
  mov             edi,eax
- sar             edi,31 ;sign(x)
+         sar     edi, 31         ; sign(DC)
  shr     ebp,byte 1                      ; ebp = dcscalar /2
  quant_intra 2
- sub             eax,edi ; x (+1)
+         sub     eax, edi                ; DC (+1)
- xor     ebp,edi ;sign(x) dcscalar /2  (-1)
+         xor     ebp, edi                ; sign(DC) dcscalar /2  (-1)
- mov             edi,[esp + 16 + 16]
+         mov     edi, [esp + 16 + 16]    ; dcscalar
- lea             eax,[byte eax+ebp]  ;x + sign(x) dcscalar /2
+         lea     eax, [byte eax + ebp]   ; DC + sign(DC) dcscalar/2
  mov             ebp,[byte esp]
  quant_intra 3
                  psubw   mm5, mm4        ;C8
-                 mov     esi,[esp+12]
+         mov     esi, [esp + 12]                 ; pop back the register value
-                 mov             edi,[esp+4]
+         mov     edi, [esp + 4]                  ; pop back the register value
-                 mov     ebx,[esp+8]
-                 add esp,byte 16
                  sar     eax,16
+         lea     ebx, [byte eax + 1]             ; workaround for eax < 0
+         cmovs   eax, ebx                        ; conditionally move the corrected value
                  mov     [edx], ax               ; coeff[0] = ax
+         mov     ebx, [esp + 8]                  ; pop back the register value
+         add     esp, byte 16                    ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
                  psubw   mm7, mm6        ;D8
                  movq    [edx + 3 * 32 + 16], mm5 ;C9
                  movq    [edx + 3 * 32 + 24], mm7 ;D9
                  ret
  align 16
  .q1loop
  quant_intra1 0
  mov     ebp, [esp + 16 + 16]    ; dcscalar
- movsx   eax, word [byte ecx] ;x
+         movsx   eax, word [byte ecx]    ; DC
  quant_intra1 1
  mov             edi,eax
- sar             edi,31 ;sign(x)
+         sar     edi, 31         ; sign(DC)
  shr     ebp,byte 1                      ; ebp = dcscalar /2
  quant_intra1 2
- sub             eax,edi ; x (+1)
+         sub     eax, edi                ; DC (+1)
- xor     ebp,edi ;sign(x) dcscalar /2  (-1)
+         xor     ebp, edi                ; sign(DC) dcscalar /2  (-1)
- mov             edi,[esp + 16 + 16]
+         mov     edi, [esp + 16 + 16]    ; dcscalar
- lea             eax,[byte eax+ebp]  ;x + sign(x) dcscalar /2
+         lea     eax, [byte eax + ebp]   ; DC + sign(DC) dcscalar /2
  mov             ebp,[byte esp]
  quant_intra1 3
                  psubw   mm5, mm4        ;C8
-                 mov     esi,[dword esp+12]
+         mov     esi, [dword esp + 12]           ; pop back the register value
-                 mov             edi,[esp+4]
+         mov     edi, [esp + 4]                  ; pop back the register value
-                 mov     ebx,[esp+8]
-                 add esp,byte 16
                  sar     eax,16
+         lea     ebx, [byte eax + 1]             ; workaround for eax < 0
+         cmovs   eax, ebx                        ; conditionally move the corrected value
                  mov     [edx], ax               ; coeff[0] = ax
+         mov     ebx, [esp + 8]                  ; pop back the register value
+         add     esp, byte 16                    ; "quant_intra1 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
                  psubw   mm7, mm6        ;D8
                  movq    [edx + 3 * 32 + 16], mm5 ;C9
                  movq    [edx + 3 * 32 + 24], mm7 ;D9
                  ret
-Line 486
+Line 503
  align ALIGN
  cglobal quant_inter_3dne
                  quant_inter_3dne
                  mov     edx, [esp  + 4]         ; coeff
                  mov     ecx, [esp  + 8]         ; data
                  mov     eax, [esp  + 12]        ; quant
-Line 514
+Line 530
                  quantinter 2
                  quantinter 3
                  quantinter 4
                  psraw   mm3,15                  ;B6
                  psubw   mm2, mm6                ;C10
                  pmulhw  mm4, mm7                ; B7
-Line 528
+Line 545
                  punpckhdq   mm5, mm5
                  paddd   mm0, mm5
                  movd    eax, mm0                ; return sum
                  ret
  align ALIGN
  .q1loop
                  movq mm6,[byte ebx]
                  quantinter1 0
                  quantinter1 1
                  quantinter1 2
-Line 567
+Line 586
  ;This is Athlon-optimized code (ca 106 clk per call)
  %macro dequant 1
-   movq mm1, [ecx+%1*24]     ;A2 ; c  = coeff[i]
+         movq    mm1, [ecx+%1*24]        ; c  = coeff[i] ;A2
    psubw mm0,mm1 ;-c             ;A3 (1st dep)
  %if (%1)
    paddw mm4,mm6 ;                       C11 mm6 free (4th+)
-Line 575
+Line 594
    pmaxsw mm0,mm1 ;|c|           ;A4 (2nd)
  %if (%1)
   mov ebp,ebp
-   pminsw mm4,[ebx] ;            C12 saturates to +2047 (5th+) 1ater
+         pminsw  mm4, [ebx]              ;C12 saturates to +2047 (5th+) later
  %endif
    movq  mm6,[esi] ;0            ;A5  mm6 in use
    pandn mm7,[eax] ;              B9 offset = isZero ? 0 : quant_add (2nd)
-Line 657
+Line 676
    pmullw mm2, [edi] ;*= 2Q  ;B8 (3rd+)
    psraw mm3,15 ; sign(c)        ;B7 (2nd)
    mov   edx, [esp+ 4+16]        ; data
  align 8
  dequant 0
    cmp   ebp,-2048
    mov esp,esp
  dequant 1
    cmovl ebp,[int_2048]
    nop
  dequant 2
    cmp   ebp,2047
    mov esp,esp
  dequant 3
    cmovg ebp,[int2047]
    nop
  dequant 4
    paddw mm4,mm6 ;                       C11 mm6 free (4th+)
-Line 707
+Line 735
  align ALIGN
  cglobal dequant_inter_3dne
  dequant_inter_3dne:
    mov    ecx, [esp+ 8]        ; coeff
    mov    eax, [esp+12]        ; quant
    pxor mm0,mm0
-Line 727
+Line 754
    pmullw mm2, [edi] ;*= 2Q  ;B8 (3rd+)
    psraw mm3,15 ; sign(c)        ;B7 (2nd)
    mov   edx, [dword esp+ 4+12]        ; data
  align 8
  dequant 0
  dequant 1
  dequant 2
-Line 748
+Line 777
          add esp,byte 12
    pxor mm3, mm2 ;                       B13 (6th+)
    movq [edx+4*24+8], mm3 ;      B14 (7th)
    ret

 Legend:



Removed from v.1088
 


changed lines


 
Added in v.1089
 Legend:



Removed from v.1088
 


changed lines


 
Added in v.1089
-Removed from v.1088
+Added in v.1089

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4