[svn] / trunk / xvidcore / src / dct / x86_asm / fdct_sse2_skal.asm Repository:
ViewVC logotype

Diff of /trunk/xvidcore/src/dct/x86_asm/fdct_sse2_skal.asm

Parent Directory | Revision Log | View Patch

revision 1617, Mon May 23 09:29:43 2005 UTC revision 1618, Mon May 23 12:06:02 2005 UTC
# Line 19  Line 19 
19  ; *  along with this program; if not, write to the Free Software  ; *  along with this program; if not, write to the Free Software
20  ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA  ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21  ; *  ; *
22  ; * $Id: fdct_sse2_skal.asm,v 1.5 2004-08-29 10:02:38 edgomez Exp $  ; * $Id: fdct_sse2_skal.asm,v 1.6 2005-05-23 12:06:02 Skal Exp $
23  ; *  ; *
24  ; ***************************************************************************/  ; ***************************************************************************/
25    
# Line 74  Line 74 
74  ;  ;
75  ;  * Some more details at: http://skal.planet-d.net/coding/dct.html  ;  * Some more details at: http://skal.planet-d.net/coding/dct.html
76  ;  ;
 ;  
 ;//////////////////////////////////////////////////////////////////////  
 ;  
 ;  == Mean square errors ==  
 ;   0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.000    [0.001]  
 ;   0.035 0.029 0.032 0.032 0.031 0.032 0.034 0.035    [0.032]  
 ;   0.026 0.028 0.027 0.027 0.025 0.028 0.028 0.025    [0.027]  
 ;   0.037 0.032 0.031 0.030 0.028 0.029 0.026 0.031    [0.030]  
 ;   0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.001    [0.001]  
 ;   0.025 0.024 0.022 0.022 0.022 0.022 0.023 0.023    [0.023]  
 ;   0.026 0.028 0.025 0.028 0.030 0.025 0.026 0.027    [0.027]  
 ;   0.021 0.020 0.020 0.022 0.020 0.022 0.017 0.019    [0.020]  
 ;  
 ;  == Abs Mean errors ==  
 ;   0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000    [0.000]  
 ;   0.020 0.001 0.003 0.003 0.000 0.004 0.002 0.003    [0.002]  
 ;   0.000 0.001 0.001 0.001 0.001 0.004 0.000 0.000    [0.000]  
 ;   0.027 0.001 0.000 0.002 0.002 0.002 0.001 0.000    [0.003]  
 ;   0.000 0.000 0.000 0.000 0.000 0.001 0.000 0.001    [-0.000]  
 ;   0.001 0.003 0.001 0.001 0.002 0.001 0.000 0.000    [-0.000]  
 ;   0.000 0.002 0.002 0.001 0.001 0.002 0.001 0.000    [-0.000]  
 ;   0.000 0.002 0.001 0.002 0.001 0.002 0.001 0.001    [-0.000]  
 ;  
 ;  =========================  
 ;  Peak error:   1.0000  
 ;  Peak MSE:     0.0365  
 ;  Overall MSE:  0.0201  
 ;  Peak ME:      0.0265  
 ;  Overall ME:   0.0006  
 ;  
77  ;-----------------------------------------------------------------------------  ;-----------------------------------------------------------------------------
78  ;  ;
79  ;                          -=IDCT=-  ;                          -=IDCT=-
# Line 112  Line 82 
82  ; descaling) require some unpairable shifting and packing, all on  ; descaling) require some unpairable shifting and packing, all on
83  ; the same CPU unit.  ; the same CPU unit.
84  ;  ;
 ;   THIS IDCT IS NOT IEEE-COMPLIANT: IT WILL FAIL THE [-300,300]  
 ;   INPUT RANGE TEST (because of overflow). But the [-256,255] one  
 ;   is OK, and I'm fine with it (for now;)  
 ;  
 ;  == Mean square errors ==  
 ;   0.007 0.006 0.005 0.007 0.006 0.007 0.005 0.007    [0.006]  
 ;   0.006 0.008 0.007 0.007 0.007 0.008 0.008 0.008    [0.007]  
 ;   0.008 0.008 0.008 0.008 0.007 0.009 0.010 0.007    [0.008]  
 ;   0.007 0.007 0.006 0.007 0.008 0.007 0.006 0.008    [0.007]  
 ;   0.007 0.006 0.006 0.006 0.006 0.005 0.006 0.006    [0.006]  
 ;   0.008 0.007 0.006 0.008 0.007 0.008 0.009 0.009    [0.008]  
 ;   0.008 0.006 0.010 0.008 0.008 0.008 0.007 0.007    [0.008]  
 ;   0.007 0.006 0.006 0.007 0.007 0.006 0.006 0.007    [0.006]  
 ;  
 ;  == Abs Mean errors ==  
 ;   0.001 0.000 0.000 0.001 0.001 0.000 0.000 0.000    [0.000]  
 ;   0.000 0.002 0.002 0.000 0.001 0.001 0.000 0.002    [0.000]  
 ;   0.001 0.002 0.001 0.001 0.001 0.001 0.000 0.001    [-0.001]  
 ;   0.000 0.002 0.000 0.000 0.001 0.000 0.000 0.001    [-0.000]  
 ;   0.000 0.001 0.001 0.001 0.000 0.001 0.000 0.001    [0.000]  
 ;   0.000 0.001 0.001 0.001 0.001 0.000 0.001 0.000    [0.000]  
 ;   0.001 0.001 0.002 0.001 0.001 0.002 0.001 0.001    [0.001]  
 ;   0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000    [0.000]  
 ;  
 ;  =========================  
 ;  
 ;  Peak error:   1.0000  
 ;  Peak MSE:     0.0096  
 ;  Overall MSE:  0.0070  
 ;  Peak ME:      0.0024  
 ;  Overall ME:   0.0001  
 ;  
85  ;-----------------------------------------------------------------------------  ;-----------------------------------------------------------------------------
86    
87  ;=============================================================================  ;=============================================================================
# Line 166  Line 104 
104  ; Inverse DCT tables  ; Inverse DCT tables
105  ;-----------------------------------------------------------------------------  ;-----------------------------------------------------------------------------
106    
107  ALIGN 16  align 16
108  iTab1:  iTab1:
109    dw 0x4000, 0x539f, 0x4000, 0x22a3    dw 0x4000, 0x539f, 0x4000, 0x22a3
110    dw 0x4000, 0xdd5d, 0x4000, 0xac61    dw 0x4000, 0xdd5d, 0x4000, 0xac61
# Line 207  Line 145 
145    dw 0x3b21, 0x14c3, 0x979e, 0xc4df    dw 0x3b21, 0x14c3, 0x979e, 0xc4df
146    dw 0x14c3, 0x587e, 0x587e, 0x979e    dw 0x14c3, 0x587e, 0x587e, 0x979e
147    
148    ; the original rounding trick is by  align 16
149    ; Michel Lespinasse (hi Walken!) <walken@zoy.org>  Walken_Idct_Rounders:
150      dd  65536, 65536, 65536, 65536
151  ALIGN 16    dd   3597,  3597,  3597,  3597
152  Idct_Rnd0: dd  65535, 65535, 65535, 65535    dd   2260,  2260,  2260,  2260
153  Idct_Rnd1: dd   3612,  3612,  3612,  3612    dd   1203,  1203,  1203,  1203
154  Idct_Rnd2: dd   2271,  2271,  2271,  2271    dd      0,     0,     0,     0
155  Idct_Rnd3: dd   1203,  1203,  1203,  1203    dd    120,   120,   120,   120
156  Idct_Rnd4: dd   1023,  1023,  1023,  1023    dd    512,   512,   512,   512
157  Idct_Rnd5: dd    102,   102,   102,   102    dd    512,   512,   512,   512
158  Idct_Rnd6: dd    398,   398,   398,   398  
159  Idct_Rnd7: dd    469,   469,   469,   469    times 8 dw  (65536>>11)
160      times 8 dw  ( 3597>>11)
161  Idct_Sparse_Rnd0: times 4 dw  (65535>>11)    times 8 dw  ( 2260>>11)
 Idct_Sparse_Rnd1: times 4 dw  ( 3612>>11)  
 Idct_Sparse_Rnd2: times 4 dw  ( 2271>>11)  
162    ; other rounders are zero...    ; other rounders are zero...
163    
164  ;-----------------------------------------------------------------------------  ;-----------------------------------------------------------------------------
# Line 284  Line 220 
220  SECTION .text  SECTION .text
221    
222  cglobal idct_sse2_skal  cglobal idct_sse2_skal
 cglobal idct_sse2_sparse_skal  
223  cglobal fdct_sse2_skal  cglobal fdct_sse2_skal
224    
225  ;-----------------------------------------------------------------------------  ;-----------------------------------------------------------------------------
# Line 295  Line 230 
230    
231    movdqa  xmm0, [ecx+%1*16]     ; xmm0 = [01234567]    movdqa  xmm0, [ecx+%1*16]     ; xmm0 = [01234567]
232    
233    pshuflw xmm0, xmm0, 11011000b ; [0213]    pshuflw xmm0, xmm0, 11011000b ; [02134567]  ; these two shufflings could be
234    pshufhw xmm0, xmm0, 11011000b ; [02134657]    pshufhw xmm0, xmm0, 11011000b ; [02134657]  ; integrated in zig-zag orders
235    
236    pshufd  xmm4, xmm0, 00000000b ; [02020202]    pshufd  xmm4, xmm0, 00000000b ; [02020202]
237    pshufd  xmm5, xmm0, 10101010b ; [46464646]    pshufd  xmm5, xmm0, 10101010b ; [46464646]
238    pshufd  xmm6, xmm0, 01010101b ; [13131313]    pshufd  xmm6, xmm0, 01010101b ; [13131313]
# Line 318  Line 254 
254    psrad   xmm4, %4        ; => out [7654]    psrad   xmm4, %4        ; => out [7654]
255    
256    packssdw xmm6, xmm4     ; [01237654]    packssdw xmm6, xmm4     ; [01237654]
257    
258    pshufhw xmm6, xmm6, 00011011b ; [01234567]    pshufhw xmm6, xmm6, 00011011b ; [01234567]
259    
260    movdqa  [ecx+%1*16], xmm6    movdqa  [ecx+%1*16], xmm6
# Line 390  Line 327 
327    movdqa xmm3, [%1+0*16] ; x0    movdqa xmm3, [%1+0*16] ; x0
328    movdqa xmm6, [%1+4*16] ; x4    movdqa xmm6, [%1+4*16] ; x4
329    
330      movdqa [%1   ], xmm2  ; we spill 1 reg to perform safe butterflies
331    
332      movdqa xmm2, xmm3
333    psubsw xmm3, xmm6   ; x0-x4 = tm04    psubsw xmm3, xmm6   ; x0-x4 = tm04
334    paddsw xmm6, xmm6   ; 2.x4    paddsw xmm6, xmm2   ; x0+x4 = tp04
   paddsw xmm6, xmm3   ; x0+x4 = tp04  
335    
336    psubsw xmm3, xmm5   ; tm04-tm26 = a2    movdqa xmm2, xmm6
337    psubsw xmm6, xmm7   ; tp04-tp26 = a3    psubsw xmm6, xmm7
338    paddsw xmm5, xmm5   ; 2.tm26    paddsw xmm7, xmm2
339    paddsw xmm7, xmm7   ; 2.tp26    movdqa xmm2, xmm3
340    paddsw xmm5, xmm3   ; tm04+tm26 = a1    psubsw xmm3, xmm5
341    paddsw xmm7, xmm6   ; tp04+tp26 = a0    paddsw xmm5, xmm2
342    
343    psubsw xmm5, xmm0   ; a1-b1    movdqa xmm2, xmm5
344    psubsw xmm3, xmm4   ; a2-b2    psubsw xmm5, xmm0
345    paddsw xmm0, xmm0   ; 2.b1    paddsw xmm0, xmm2
346    paddsw xmm4, xmm4   ; 2.b2    movdqa xmm2, xmm3
347    paddsw xmm0, xmm5   ; a1+b1    psubsw xmm3, xmm4
348    paddsw xmm4, xmm3   ; a2+b2    paddsw xmm4, xmm2
349    
350      movdqa xmm2, [%1]
351    
352    psraw  xmm5, 6     ; out6    psraw  xmm5, 6     ; out6
353    psraw  xmm3, 6     ; out5    psraw  xmm3, 6     ; out5
# Line 432  Line 373 
373    psraw  xmm2, 6     ; out3    psraw  xmm2, 6     ; out3
374    psraw  xmm6, 6     ; out4    psraw  xmm6, 6     ; out4
375    
376        ; store result
377    
378    movdqa [%1+0*16], xmm1    movdqa [%1+0*16], xmm1
379    movdqa [%1+3*16], xmm2    movdqa [%1+3*16], xmm2
380    movdqa [%1+4*16], xmm6    movdqa [%1+4*16], xmm6
381    movdqa [%1+7*16], xmm7    movdqa [%1+7*16], xmm7
 %endmacro  
382    
383  ;-----------------------------------------------------------------------------  %endmacro
 ; Function idct (the straight forward version)  
 ;-----------------------------------------------------------------------------  
   
 ALIGN 16  
 idct_sse2_skal:  
   mov ecx, [esp+4]  
   iMTX_MULT  0, iTab1, Idct_Rnd0, 11  
   iMTX_MULT  1, iTab2, Idct_Rnd1, 11  
   iMTX_MULT  2, iTab3, Idct_Rnd2, 11  
   iMTX_MULT  3, iTab4, Idct_Rnd3, 11  
   iMTX_MULT  4, iTab1, Idct_Rnd4, 11  
   iMTX_MULT  5, iTab4, Idct_Rnd5, 11  
   iMTX_MULT  6, iTab3, Idct_Rnd6, 11  
   iMTX_MULT  7, iTab2, Idct_Rnd7, 11  
   iLLM_PASS ecx+0  
   ret  
 .endfunc  
384    
385  ;-----------------------------------------------------------------------------  ;-----------------------------------------------------------------------------
386  ; Helper macro TEST_ROW (test a null row)  ; Helper macro TEST_ROW (test a null row)
# Line 473  Line 398 
398  ;-----------------------------------------------------------------------------  ;-----------------------------------------------------------------------------
399  ; Function idct (this one skips null rows)  ; Function idct (this one skips null rows)
400  ;-----------------------------------------------------------------------------  ;-----------------------------------------------------------------------------
401    ; IEEE1180 and Walken compatible version
402    
403  ALIGN 16  align 16
404  idct_sse2_sparse_skal:  idct_sse2_skal:
405    
406    mov ecx, [esp+ 4]  ; Src    mov ecx, [esp+ 4]  ; Src
407    
408    TEST_ROW ecx, .Row0_Round    TEST_ROW ecx, .Row0_Round
409    iMTX_MULT  0, iTab1, Idct_Rnd0, 11    iMTX_MULT  0, iTab1, Walken_Idct_Rounders + 16*0, 11
410    jmp .Row1    jmp .Row1
411  .Row0_Round  .Row0_Round
412    movq mm0, [Idct_Sparse_Rnd0]    movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0]
413    movq [ecx  ], mm0    movdqa [ecx  ], xmm0
   movq [ecx+8], mm0  
414    
415  .Row1  .Row1
416    TEST_ROW ecx+16, .Row1_Round    TEST_ROW ecx+16, .Row1_Round
417    iMTX_MULT  1, iTab2, Idct_Rnd1, 11    iMTX_MULT  1, iTab2, Walken_Idct_Rounders + 16*1, 11
418    jmp .Row2    jmp .Row2
419  .Row1_Round  .Row1_Round
420    movq mm0, [Idct_Sparse_Rnd1]    movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
421    movq [ecx+16  ], mm0    movdqa [ecx+16  ], xmm0
   movq [ecx+16+8], mm0  
422    
423  .Row2  .Row2
424    TEST_ROW ecx+32, .Row2_Round    TEST_ROW ecx+32, .Row2_Round
425    iMTX_MULT  2, iTab3, Idct_Rnd2, 11    iMTX_MULT  2, iTab3, Walken_Idct_Rounders + 16*2, 11
426    jmp .Row3    jmp .Row3
427  .Row2_Round  .Row2_Round
428    movq mm0, [Idct_Sparse_Rnd2]    movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
429    movq [ecx+32  ], mm0    movdqa [ecx+32  ], xmm0
   movq [ecx+32+8], mm0  
430    
431  .Row3  .Row3
432    TEST_ROW ecx+48, .Row4    TEST_ROW ecx+48, .Row4
433    iMTX_MULT  3, iTab4, Idct_Rnd3, 11    iMTX_MULT  3, iTab4, Walken_Idct_Rounders + 16*3, 11
   jmp .Row4  
434    
435  .Row4  .Row4
436    TEST_ROW ecx+64, .Row5    TEST_ROW ecx+64, .Row5
437    iMTX_MULT  4, iTab1, Idct_Rnd4, 11    iMTX_MULT  4, iTab1, Walken_Idct_Rounders + 16*4, 11
   jmp .Row5  
438    
439  .Row5  .Row5
440    TEST_ROW ecx+80, .Row6    TEST_ROW ecx+80, .Row6
441    iMTX_MULT  5, iTab4, Idct_Rnd5, 11    iMTX_MULT  5, iTab4, Walken_Idct_Rounders + 16*5, 11
442    
443  .Row6  .Row6
444    TEST_ROW ecx+96, .Row7    TEST_ROW ecx+96, .Row7
445    iMTX_MULT  6, iTab3, Idct_Rnd6, 11    iMTX_MULT  6, iTab3, Walken_Idct_Rounders + 16*6, 11
446    
447  .Row7  .Row7
448    TEST_ROW ecx+112, .End    TEST_ROW ecx+112, .End
449    iMTX_MULT  7, iTab2, Idct_Rnd7, 11    iMTX_MULT  7, iTab2, Walken_Idct_Rounders + 16*7, 11
450  .End  .End
451    
452    iLLM_PASS ecx+0    iLLM_PASS ecx
453    
454    ret    ret
455  .endfunc  .endfunc
456    

Legend:
Removed from v.1617  
Changed lines
  Added in v.1618

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4