--- trunk/xvidcore/src/dct/x86_asm/fdct_sse2_skal.asm 2005/05/23 09:29:43 1617 +++ trunk/xvidcore/src/dct/x86_asm/fdct_sse2_skal.asm 2005/05/23 12:06:02 1618 @@ -19,7 +19,7 @@ ; * along with this program; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; * $Id: fdct_sse2_skal.asm,v 1.5 2004-08-29 10:02:38 edgomez Exp $ +; * $Id: fdct_sse2_skal.asm,v 1.6 2005-05-23 12:06:02 Skal Exp $ ; * ; ***************************************************************************/ @@ -74,36 +74,6 @@ ; ; * Some more details at: http://skal.planet-d.net/coding/dct.html ; -; -;////////////////////////////////////////////////////////////////////// -; -; == Mean square errors == -; 0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.000 [0.001] -; 0.035 0.029 0.032 0.032 0.031 0.032 0.034 0.035 [0.032] -; 0.026 0.028 0.027 0.027 0.025 0.028 0.028 0.025 [0.027] -; 0.037 0.032 0.031 0.030 0.028 0.029 0.026 0.031 [0.030] -; 0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.001 [0.001] -; 0.025 0.024 0.022 0.022 0.022 0.022 0.023 0.023 [0.023] -; 0.026 0.028 0.025 0.028 0.030 0.025 0.026 0.027 [0.027] -; 0.021 0.020 0.020 0.022 0.020 0.022 0.017 0.019 [0.020] -; -; == Abs Mean errors == -; 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 [0.000] -; 0.020 0.001 0.003 0.003 0.000 0.004 0.002 0.003 [0.002] -; 0.000 0.001 0.001 0.001 0.001 0.004 0.000 0.000 [0.000] -; 0.027 0.001 0.000 0.002 0.002 0.002 0.001 0.000 [0.003] -; 0.000 0.000 0.000 0.000 0.000 0.001 0.000 0.001 [-0.000] -; 0.001 0.003 0.001 0.001 0.002 0.001 0.000 0.000 [-0.000] -; 0.000 0.002 0.002 0.001 0.001 0.002 0.001 0.000 [-0.000] -; 0.000 0.002 0.001 0.002 0.001 0.002 0.001 0.001 [-0.000] -; -; ========================= -; Peak error: 1.0000 -; Peak MSE: 0.0365 -; Overall MSE: 0.0201 -; Peak ME: 0.0265 -; Overall ME: 0.0006 -; ;----------------------------------------------------------------------------- ; ; -=IDCT=- @@ -112,38 +82,6 @@ ; descaling) require some unpairable shifting and packing, all on ; the same CPU unit. ; -; THIS IDCT IS NOT IEEE-COMPLIANT: IT WILL FAIL THE [-300,300] -; INPUT RANGE TEST (because of overflow). But the [-256,255] one -; is OK, and I'm fine with it (for now;) -; -; == Mean square errors == -; 0.007 0.006 0.005 0.007 0.006 0.007 0.005 0.007 [0.006] -; 0.006 0.008 0.007 0.007 0.007 0.008 0.008 0.008 [0.007] -; 0.008 0.008 0.008 0.008 0.007 0.009 0.010 0.007 [0.008] -; 0.007 0.007 0.006 0.007 0.008 0.007 0.006 0.008 [0.007] -; 0.007 0.006 0.006 0.006 0.006 0.005 0.006 0.006 [0.006] -; 0.008 0.007 0.006 0.008 0.007 0.008 0.009 0.009 [0.008] -; 0.008 0.006 0.010 0.008 0.008 0.008 0.007 0.007 [0.008] -; 0.007 0.006 0.006 0.007 0.007 0.006 0.006 0.007 [0.006] -; -; == Abs Mean errors == -; 0.001 0.000 0.000 0.001 0.001 0.000 0.000 0.000 [0.000] -; 0.000 0.002 0.002 0.000 0.001 0.001 0.000 0.002 [0.000] -; 0.001 0.002 0.001 0.001 0.001 0.001 0.000 0.001 [-0.001] -; 0.000 0.002 0.000 0.000 0.001 0.000 0.000 0.001 [-0.000] -; 0.000 0.001 0.001 0.001 0.000 0.001 0.000 0.001 [0.000] -; 0.000 0.001 0.001 0.001 0.001 0.000 0.001 0.000 [0.000] -; 0.001 0.001 0.002 0.001 0.001 0.002 0.001 0.001 [0.001] -; 0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000 [0.000] -; -; ========================= -; -; Peak error: 1.0000 -; Peak MSE: 0.0096 -; Overall MSE: 0.0070 -; Peak ME: 0.0024 -; Overall ME: 0.0001 -; ;----------------------------------------------------------------------------- ;============================================================================= @@ -166,7 +104,7 @@ ; Inverse DCT tables ;----------------------------------------------------------------------------- -ALIGN 16 +align 16 iTab1: dw 0x4000, 0x539f, 0x4000, 0x22a3 dw 0x4000, 0xdd5d, 0x4000, 0xac61 @@ -207,22 +145,20 @@ dw 0x3b21, 0x14c3, 0x979e, 0xc4df dw 0x14c3, 0x587e, 0x587e, 0x979e - ; the original rounding trick is by - ; Michel Lespinasse (hi Walken!) - -ALIGN 16 -Idct_Rnd0: dd 65535, 65535, 65535, 65535 -Idct_Rnd1: dd 3612, 3612, 3612, 3612 -Idct_Rnd2: dd 2271, 2271, 2271, 2271 -Idct_Rnd3: dd 1203, 1203, 1203, 1203 -Idct_Rnd4: dd 1023, 1023, 1023, 1023 -Idct_Rnd5: dd 102, 102, 102, 102 -Idct_Rnd6: dd 398, 398, 398, 398 -Idct_Rnd7: dd 469, 469, 469, 469 - -Idct_Sparse_Rnd0: times 4 dw (65535>>11) -Idct_Sparse_Rnd1: times 4 dw ( 3612>>11) -Idct_Sparse_Rnd2: times 4 dw ( 2271>>11) +align 16 +Walken_Idct_Rounders: + dd 65536, 65536, 65536, 65536 + dd 3597, 3597, 3597, 3597 + dd 2260, 2260, 2260, 2260 + dd 1203, 1203, 1203, 1203 + dd 0, 0, 0, 0 + dd 120, 120, 120, 120 + dd 512, 512, 512, 512 + dd 512, 512, 512, 512 + + times 8 dw (65536>>11) + times 8 dw ( 3597>>11) + times 8 dw ( 2260>>11) ; other rounders are zero... ;----------------------------------------------------------------------------- @@ -284,7 +220,6 @@ SECTION .text cglobal idct_sse2_skal -cglobal idct_sse2_sparse_skal cglobal fdct_sse2_skal ;----------------------------------------------------------------------------- @@ -295,8 +230,9 @@ movdqa xmm0, [ecx+%1*16] ; xmm0 = [01234567] - pshuflw xmm0, xmm0, 11011000b ; [0213] - pshufhw xmm0, xmm0, 11011000b ; [02134657] + pshuflw xmm0, xmm0, 11011000b ; [02134567] ; these two shufflings could be + pshufhw xmm0, xmm0, 11011000b ; [02134657] ; integrated in zig-zag orders + pshufd xmm4, xmm0, 00000000b ; [02020202] pshufd xmm5, xmm0, 10101010b ; [46464646] pshufd xmm6, xmm0, 01010101b ; [13131313] @@ -318,6 +254,7 @@ psrad xmm4, %4 ; => out [7654] packssdw xmm6, xmm4 ; [01237654] + pshufhw xmm6, xmm6, 00011011b ; [01234567] movdqa [ecx+%1*16], xmm6 @@ -330,14 +267,14 @@ %macro iLLM_PASS 1 ; %1: src/dst - movdqa xmm0, [tan3] ; t3-1 - movdqa xmm3, [%1+16*3] ; x3 + movdqa xmm0, [tan3] ; t3-1 + movdqa xmm3, [%1+16*3] ; x3 movdqa xmm1, xmm0 ; t3-1 - movdqa xmm5, [%1+16*5] ; x5 + movdqa xmm5, [%1+16*5] ; x5 - movdqa xmm4, [tan1] ; t1 - movdqa xmm6, [%1+16*1] ; x1 - movdqa xmm7, [%1+16*7] ; x7 + movdqa xmm4, [tan1] ; t1 + movdqa xmm6, [%1+16*1] ; x1 + movdqa xmm7, [%1+16*7] ; x7 movdqa xmm2, xmm4 ; t1 pmulhw xmm0, xmm3 ; x3*(t3-1) @@ -373,9 +310,9 @@ paddsw xmm0, xmm0 ; 2.(t1+t2) = b1 paddsw xmm4, xmm4 ; 2.(t1-t2) = b2 - movdqa xmm7, [tan2] ; t2 - movdqa xmm3, [%1+2*16] ; x2 - movdqa xmm6, [%1+6*16] ; x6 + movdqa xmm7, [tan2] ; t2 + movdqa xmm3, [%1+2*16] ; x2 + movdqa xmm6, [%1+6*16] ; x6 movdqa xmm5, xmm7 ; t2 pmulhw xmm7, xmm6 ; x6*t2 @@ -385,33 +322,37 @@ psubsw xmm5, xmm6 ; x2*t2-x6 = tm26 - ; use:xmm3,xmm5,xmm6,xmm7 frozen: xmm0,xmm4,xmm1,xmm2 + ; use:xmm3,xmm5,xmm6,xmm7 frozen: xmm0,xmm4,xmm1,xmm2 movdqa xmm3, [%1+0*16] ; x0 movdqa xmm6, [%1+4*16] ; x4 + movdqa [%1 ], xmm2 ; we spill 1 reg to perform safe butterflies + + movdqa xmm2, xmm3 psubsw xmm3, xmm6 ; x0-x4 = tm04 - paddsw xmm6, xmm6 ; 2.x4 - paddsw xmm6, xmm3 ; x0+x4 = tp04 + paddsw xmm6, xmm2 ; x0+x4 = tp04 - psubsw xmm3, xmm5 ; tm04-tm26 = a2 - psubsw xmm6, xmm7 ; tp04-tp26 = a3 - paddsw xmm5, xmm5 ; 2.tm26 - paddsw xmm7, xmm7 ; 2.tp26 - paddsw xmm5, xmm3 ; tm04+tm26 = a1 - paddsw xmm7, xmm6 ; tp04+tp26 = a0 - - psubsw xmm5, xmm0 ; a1-b1 - psubsw xmm3, xmm4 ; a2-b2 - paddsw xmm0, xmm0 ; 2.b1 - paddsw xmm4, xmm4 ; 2.b2 - paddsw xmm0, xmm5 ; a1+b1 - paddsw xmm4, xmm3 ; a2+b2 - - psraw xmm5, 6 ; out6 - psraw xmm3, 6 ; out5 - psraw xmm0, 6 ; out1 - psraw xmm4, 6 ; out2 + movdqa xmm2, xmm6 + psubsw xmm6, xmm7 + paddsw xmm7, xmm2 + movdqa xmm2, xmm3 + psubsw xmm3, xmm5 + paddsw xmm5, xmm2 + + movdqa xmm2, xmm5 + psubsw xmm5, xmm0 + paddsw xmm0, xmm2 + movdqa xmm2, xmm3 + psubsw xmm3, xmm4 + paddsw xmm4, xmm2 + + movdqa xmm2, [%1] + + psraw xmm5, 6 ; out6 + psraw xmm3, 6 ; out5 + psraw xmm0, 6 ; out1 + psraw xmm4, 6 ; out2 movdqa [%1+6*16], xmm5 movdqa [%1+5*16], xmm3 @@ -427,35 +368,19 @@ paddsw xmm1, xmm0 ; a0+b0 paddsw xmm2, xmm4 ; a3+b3 - psraw xmm1, 6 ; out0 - psraw xmm7, 6 ; out7 - psraw xmm2, 6 ; out3 - psraw xmm6, 6 ; out4 + psraw xmm1, 6 ; out0 + psraw xmm7, 6 ; out7 + psraw xmm2, 6 ; out3 + psraw xmm6, 6 ; out4 + + ; store result movdqa [%1+0*16], xmm1 movdqa [%1+3*16], xmm2 movdqa [%1+4*16], xmm6 movdqa [%1+7*16], xmm7 -%endmacro - -;----------------------------------------------------------------------------- -; Function idct (the straight forward version) -;----------------------------------------------------------------------------- -ALIGN 16 -idct_sse2_skal: - mov ecx, [esp+4] - iMTX_MULT 0, iTab1, Idct_Rnd0, 11 - iMTX_MULT 1, iTab2, Idct_Rnd1, 11 - iMTX_MULT 2, iTab3, Idct_Rnd2, 11 - iMTX_MULT 3, iTab4, Idct_Rnd3, 11 - iMTX_MULT 4, iTab1, Idct_Rnd4, 11 - iMTX_MULT 5, iTab4, Idct_Rnd5, 11 - iMTX_MULT 6, iTab3, Idct_Rnd6, 11 - iMTX_MULT 7, iTab2, Idct_Rnd7, 11 - iLLM_PASS ecx+0 - ret -.endfunc +%endmacro ;----------------------------------------------------------------------------- ; Helper macro TEST_ROW (test a null row) @@ -473,62 +398,59 @@ ;----------------------------------------------------------------------------- ; Function idct (this one skips null rows) ;----------------------------------------------------------------------------- +; IEEE1180 and Walken compatible version -ALIGN 16 -idct_sse2_sparse_skal: +align 16 +idct_sse2_skal: mov ecx, [esp+ 4] ; Src TEST_ROW ecx, .Row0_Round - iMTX_MULT 0, iTab1, Idct_Rnd0, 11 + iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11 jmp .Row1 .Row0_Round - movq mm0, [Idct_Sparse_Rnd0] - movq [ecx ], mm0 - movq [ecx+8], mm0 + movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0] + movdqa [ecx ], xmm0 .Row1 TEST_ROW ecx+16, .Row1_Round - iMTX_MULT 1, iTab2, Idct_Rnd1, 11 + iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11 jmp .Row2 .Row1_Round - movq mm0, [Idct_Sparse_Rnd1] - movq [ecx+16 ], mm0 - movq [ecx+16+8], mm0 + movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1] + movdqa [ecx+16 ], xmm0 .Row2 TEST_ROW ecx+32, .Row2_Round - iMTX_MULT 2, iTab3, Idct_Rnd2, 11 + iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11 jmp .Row3 .Row2_Round - movq mm0, [Idct_Sparse_Rnd2] - movq [ecx+32 ], mm0 - movq [ecx+32+8], mm0 + movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2] + movdqa [ecx+32 ], xmm0 .Row3 TEST_ROW ecx+48, .Row4 - iMTX_MULT 3, iTab4, Idct_Rnd3, 11 - jmp .Row4 + iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11 .Row4 TEST_ROW ecx+64, .Row5 - iMTX_MULT 4, iTab1, Idct_Rnd4, 11 - jmp .Row5 + iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11 .Row5 TEST_ROW ecx+80, .Row6 - iMTX_MULT 5, iTab4, Idct_Rnd5, 11 + iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11 .Row6 TEST_ROW ecx+96, .Row7 - iMTX_MULT 6, iTab3, Idct_Rnd6, 11 + iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11 .Row7 TEST_ROW ecx+112, .End - iMTX_MULT 7, iTab2, Idct_Rnd7, 11 + iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11 .End - iLLM_PASS ecx+0 + iLLM_PASS ecx + ret .endfunc