--- trunk/xvidcore/src/dct/x86_asm/fdct_sse2_skal.asm 2004/03/22 22:36:25 1382 +++ branches/release-1_2-branch/xvidcore/src/dct/x86_asm/fdct_sse2_skal.asm 2008/12/01 17:27:03 1838 @@ -19,20 +19,11 @@ ; * along with this program; if not, write to the Free Software ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ; * -; * $Id: fdct_sse2_skal.asm,v 1.2 2004-03-22 22:36:23 edgomez Exp $ +; * $Id: fdct_sse2_skal.asm,v 1.10.2.1 2008-12-01 17:27:03 Isibaar Exp $ ; * ; ***************************************************************************/ -BITS 32 - -%macro cglobal 1 - %ifdef PREFIX - global _%1 - %define %1 _%1 - %else - global %1 - %endif -%endmacro +%include "nasm.inc" ;----------------------------------------------------------------------------- ; @@ -65,36 +56,6 @@ ; ; * Some more details at: http://skal.planet-d.net/coding/dct.html ; -; -;////////////////////////////////////////////////////////////////////// -; -; == Mean square errors == -; 0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.000 [0.001] -; 0.035 0.029 0.032 0.032 0.031 0.032 0.034 0.035 [0.032] -; 0.026 0.028 0.027 0.027 0.025 0.028 0.028 0.025 [0.027] -; 0.037 0.032 0.031 0.030 0.028 0.029 0.026 0.031 [0.030] -; 0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.001 [0.001] -; 0.025 0.024 0.022 0.022 0.022 0.022 0.023 0.023 [0.023] -; 0.026 0.028 0.025 0.028 0.030 0.025 0.026 0.027 [0.027] -; 0.021 0.020 0.020 0.022 0.020 0.022 0.017 0.019 [0.020] -; -; == Abs Mean errors == -; 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 [0.000] -; 0.020 0.001 0.003 0.003 0.000 0.004 0.002 0.003 [0.002] -; 0.000 0.001 0.001 0.001 0.001 0.004 0.000 0.000 [0.000] -; 0.027 0.001 0.000 0.002 0.002 0.002 0.001 0.000 [0.003] -; 0.000 0.000 0.000 0.000 0.000 0.001 0.000 0.001 [-0.000] -; 0.001 0.003 0.001 0.001 0.002 0.001 0.000 0.000 [-0.000] -; 0.000 0.002 0.002 0.001 0.001 0.002 0.001 0.000 [-0.000] -; 0.000 0.002 0.001 0.002 0.001 0.002 0.001 0.001 [-0.000] -; -; ========================= -; Peak error: 1.0000 -; Peak MSE: 0.0365 -; Overall MSE: 0.0201 -; Peak ME: 0.0265 -; Overall ME: 0.0006 -; ;----------------------------------------------------------------------------- ; ; -=IDCT=- @@ -103,51 +64,15 @@ ; descaling) require some unpairable shifting and packing, all on ; the same CPU unit. ; -; THIS IDCT IS NOT IEEE-COMPLIANT: IT WILL FAIL THE [-300,300] -; INPUT RANGE TEST (because of overflow). But the [-256,255] one -; is OK, and I'm fine with it (for now;) -; -; == Mean square errors == -; 0.007 0.006 0.005 0.007 0.006 0.007 0.005 0.007 [0.006] -; 0.006 0.008 0.007 0.007 0.007 0.008 0.008 0.008 [0.007] -; 0.008 0.008 0.008 0.008 0.007 0.009 0.010 0.007 [0.008] -; 0.007 0.007 0.006 0.007 0.008 0.007 0.006 0.008 [0.007] -; 0.007 0.006 0.006 0.006 0.006 0.005 0.006 0.006 [0.006] -; 0.008 0.007 0.006 0.008 0.007 0.008 0.009 0.009 [0.008] -; 0.008 0.006 0.010 0.008 0.008 0.008 0.007 0.007 [0.008] -; 0.007 0.006 0.006 0.007 0.007 0.006 0.006 0.007 [0.006] -; -; == Abs Mean errors == -; 0.001 0.000 0.000 0.001 0.001 0.000 0.000 0.000 [0.000] -; 0.000 0.002 0.002 0.000 0.001 0.001 0.000 0.002 [0.000] -; 0.001 0.002 0.001 0.001 0.001 0.001 0.000 0.001 [-0.001] -; 0.000 0.002 0.000 0.000 0.001 0.000 0.000 0.001 [-0.000] -; 0.000 0.001 0.001 0.001 0.000 0.001 0.000 0.001 [0.000] -; 0.000 0.001 0.001 0.001 0.001 0.000 0.001 0.000 [0.000] -; 0.001 0.001 0.002 0.001 0.001 0.002 0.001 0.001 [0.001] -; 0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000 [0.000] -; -; ========================= -; -; Peak error: 1.0000 -; Peak MSE: 0.0096 -; Overall MSE: 0.0070 -; Peak ME: 0.0024 -; Overall ME: 0.0001 -; ;----------------------------------------------------------------------------- ;============================================================================= ; Read only data ;============================================================================= -%ifdef FORMAT_COFF -SECTION .rodata data -%else -SECTION .rodata data align=16 -%endif +DATA -ALIGN 16 +ALIGN SECTION_ALIGN tan1: times 8 dw 0x32ec ; tan( pi/16) tan2: times 8 dw 0x6a0a ; tan(2pi/16) (=sqrt(2)-1) tan3: times 8 dw 0xab0e ; tan(3pi/16)-1 @@ -157,7 +82,7 @@ ; Inverse DCT tables ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN iTab1: dw 0x4000, 0x539f, 0x4000, 0x22a3 dw 0x4000, 0xdd5d, 0x4000, 0xac61 @@ -198,29 +123,27 @@ dw 0x3b21, 0x14c3, 0x979e, 0xc4df dw 0x14c3, 0x587e, 0x587e, 0x979e - ; the original rounding trick is by - ; Michel Lespinasse (hi Walken!) - -ALIGN 16 -Idct_Rnd0: dd 65535, 65535, 65535, 65535 -Idct_Rnd1: dd 3612, 3612, 3612, 3612 -Idct_Rnd2: dd 2271, 2271, 2271, 2271 -Idct_Rnd3: dd 1203, 1203, 1203, 1203 -Idct_Rnd4: dd 1023, 1023, 1023, 1023 -Idct_Rnd5: dd 102, 102, 102, 102 -Idct_Rnd6: dd 398, 398, 398, 398 -Idct_Rnd7: dd 469, 469, 469, 469 - -Idct_Sparse_Rnd0: times 4 dw (65535>>11) -Idct_Sparse_Rnd1: times 4 dw ( 3612>>11) -Idct_Sparse_Rnd2: times 4 dw ( 2271>>11) +ALIGN SECTION_ALIGN +Walken_Idct_Rounders: + dd 65536, 65536, 65536, 65536 + dd 3597, 3597, 3597, 3597 + dd 2260, 2260, 2260, 2260 + dd 1203, 1203, 1203, 1203 + dd 0, 0, 0, 0 + dd 120, 120, 120, 120 + dd 512, 512, 512, 512 + dd 512, 512, 512, 512 + + times 8 dw (65536>>11) + times 8 dw ( 3597>>11) + times 8 dw ( 2260>>11) ; other rounders are zero... ;----------------------------------------------------------------------------- ; Forward DCT tables ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN fTab1: dw 0x4000, 0x4000, 0x58c5, 0x4b42, dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7, @@ -262,7 +185,7 @@ dw 0x28ba, 0x9dac, 0x14c3, 0xc4df -ALIGN 16 +ALIGN SECTION_ALIGN Fdct_Rnd0: dw 6,8,8,8, 6,8,8,8 Fdct_Rnd1: dw 8,8,8,8, 8,8,8,8 Fdct_Rnd2: dw 10,8,8,8, 8,8,8,8 @@ -272,10 +195,9 @@ ; Code ;============================================================================= -SECTION .text +SECTION .rotext align=SECTION_ALIGN cglobal idct_sse2_skal -cglobal idct_sse2_sparse_skal cglobal fdct_sse2_skal ;----------------------------------------------------------------------------- @@ -284,10 +206,11 @@ %macro iMTX_MULT 4 ; %1=src, %2 = Table to use, %3=rounder, %4=Shift - movdqa xmm0, [ecx+%1*16] ; xmm0 = [01234567] + movdqa xmm0, [_ECX+%1*16] ; xmm0 = [01234567] + + pshuflw xmm0, xmm0, 11011000b ; [02134567] ; these two shufflings could be + pshufhw xmm0, xmm0, 11011000b ; [02134657] ; integrated in zig-zag orders - pshuflw xmm0, xmm0, 11011000b ; [0213] - pshufhw xmm0, xmm0, 11011000b ; [02134657] pshufd xmm4, xmm0, 00000000b ; [02020202] pshufd xmm5, xmm0, 10101010b ; [46464646] pshufd xmm6, xmm0, 01010101b ; [13131313] @@ -309,9 +232,10 @@ psrad xmm4, %4 ; => out [7654] packssdw xmm6, xmm4 ; [01237654] + pshufhw xmm6, xmm6, 00011011b ; [01234567] - movdqa [ecx+%1*16], xmm6 + movdqa [_ECX+%1*16], xmm6 %endmacro @@ -321,14 +245,14 @@ %macro iLLM_PASS 1 ; %1: src/dst - movdqa xmm0, [tan3] ; t3-1 - movdqa xmm3, [%1+16*3] ; x3 + movdqa xmm0, [tan3] ; t3-1 + movdqa xmm3, [%1+16*3] ; x3 movdqa xmm1, xmm0 ; t3-1 - movdqa xmm5, [%1+16*5] ; x5 + movdqa xmm5, [%1+16*5] ; x5 - movdqa xmm4, [tan1] ; t1 - movdqa xmm6, [%1+16*1] ; x1 - movdqa xmm7, [%1+16*7] ; x7 + movdqa xmm4, [tan1] ; t1 + movdqa xmm6, [%1+16*1] ; x1 + movdqa xmm7, [%1+16*7] ; x7 movdqa xmm2, xmm4 ; t1 pmulhw xmm0, xmm3 ; x3*(t3-1) @@ -364,9 +288,9 @@ paddsw xmm0, xmm0 ; 2.(t1+t2) = b1 paddsw xmm4, xmm4 ; 2.(t1-t2) = b2 - movdqa xmm7, [tan2] ; t2 - movdqa xmm3, [%1+2*16] ; x2 - movdqa xmm6, [%1+6*16] ; x6 + movdqa xmm7, [tan2] ; t2 + movdqa xmm3, [%1+2*16] ; x2 + movdqa xmm6, [%1+6*16] ; x6 movdqa xmm5, xmm7 ; t2 pmulhw xmm7, xmm6 ; x6*t2 @@ -376,33 +300,37 @@ psubsw xmm5, xmm6 ; x2*t2-x6 = tm26 - ; use:xmm3,xmm5,xmm6,xmm7 frozen: xmm0,xmm4,xmm1,xmm2 + ; use:xmm3,xmm5,xmm6,xmm7 frozen: xmm0,xmm4,xmm1,xmm2 movdqa xmm3, [%1+0*16] ; x0 movdqa xmm6, [%1+4*16] ; x4 + movdqa [%1 ], xmm2 ; we spill 1 reg to perform safe butterflies + + movdqa xmm2, xmm3 psubsw xmm3, xmm6 ; x0-x4 = tm04 - paddsw xmm6, xmm6 ; 2.x4 - paddsw xmm6, xmm3 ; x0+x4 = tp04 + paddsw xmm6, xmm2 ; x0+x4 = tp04 - psubsw xmm3, xmm5 ; tm04-tm26 = a2 - psubsw xmm6, xmm7 ; tp04-tp26 = a3 - paddsw xmm5, xmm5 ; 2.tm26 - paddsw xmm7, xmm7 ; 2.tp26 - paddsw xmm5, xmm3 ; tm04+tm26 = a1 - paddsw xmm7, xmm6 ; tp04+tp26 = a0 - - psubsw xmm5, xmm0 ; a1-b1 - psubsw xmm3, xmm4 ; a2-b2 - paddsw xmm0, xmm0 ; 2.b1 - paddsw xmm4, xmm4 ; 2.b2 - paddsw xmm0, xmm5 ; a1+b1 - paddsw xmm4, xmm3 ; a2+b2 - - psraw xmm5, 6 ; out6 - psraw xmm3, 6 ; out5 - psraw xmm0, 6 ; out1 - psraw xmm4, 6 ; out2 + movdqa xmm2, xmm6 + psubsw xmm6, xmm7 + paddsw xmm7, xmm2 + movdqa xmm2, xmm3 + psubsw xmm3, xmm5 + paddsw xmm5, xmm2 + + movdqa xmm2, xmm5 + psubsw xmm5, xmm0 + paddsw xmm0, xmm2 + movdqa xmm2, xmm3 + psubsw xmm3, xmm4 + paddsw xmm4, xmm2 + + movdqa xmm2, [%1] + + psraw xmm5, 6 ; out6 + psraw xmm3, 6 ; out5 + psraw xmm0, 6 ; out1 + psraw xmm4, 6 ; out2 movdqa [%1+6*16], xmm5 movdqa [%1+5*16], xmm3 @@ -418,108 +346,94 @@ paddsw xmm1, xmm0 ; a0+b0 paddsw xmm2, xmm4 ; a3+b3 - psraw xmm1, 6 ; out0 - psraw xmm7, 6 ; out7 - psraw xmm2, 6 ; out3 - psraw xmm6, 6 ; out4 + psraw xmm1, 6 ; out0 + psraw xmm7, 6 ; out7 + psraw xmm2, 6 ; out3 + psraw xmm6, 6 ; out4 + + ; store result movdqa [%1+0*16], xmm1 movdqa [%1+3*16], xmm2 movdqa [%1+4*16], xmm6 movdqa [%1+7*16], xmm7 -%endmacro - -;----------------------------------------------------------------------------- -; Function idct (the straight forward version) -;----------------------------------------------------------------------------- -ALIGN 16 -idct_sse2_skal: - mov ecx, [esp+4] - iMTX_MULT 0, iTab1, Idct_Rnd0, 11 - iMTX_MULT 1, iTab2, Idct_Rnd1, 11 - iMTX_MULT 2, iTab3, Idct_Rnd2, 11 - iMTX_MULT 3, iTab4, Idct_Rnd3, 11 - iMTX_MULT 4, iTab1, Idct_Rnd4, 11 - iMTX_MULT 5, iTab4, Idct_Rnd5, 11 - iMTX_MULT 6, iTab3, Idct_Rnd6, 11 - iMTX_MULT 7, iTab2, Idct_Rnd7, 11 - iLLM_PASS ecx+0 - ret +%endmacro ;----------------------------------------------------------------------------- ; Helper macro TEST_ROW (test a null row) ;----------------------------------------------------------------------------- %macro TEST_ROW 2 ; %1:src, %2:label x8 - mov eax, [%1 ] - mov edx, [%1+ 8] - or eax, [%1+ 4] - or edx, [%1+12] - or eax, edx + mov _EAX, [%1 ] + mov _EDX, [%1+ 8] + or _EAX, [%1+ 4] + or _EDX, [%1+12] + or _EAX, _EDX jz near %2 %endmacro ;----------------------------------------------------------------------------- ; Function idct (this one skips null rows) ;----------------------------------------------------------------------------- +; IEEE1180 and Walken compatible version -ALIGN 16 -idct_sse2_sparse_skal: +ALIGN SECTION_ALIGN +idct_sse2_skal: + + PUSH_XMM6_XMM7 - mov ecx, [esp+ 4] ; Src + mov _ECX, prm1 ; Src - TEST_ROW ecx, .Row0_Round - iMTX_MULT 0, iTab1, Idct_Rnd0, 11 + TEST_ROW _ECX, .Row0_Round + iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11 jmp .Row1 -.Row0_Round - movq mm0, [Idct_Sparse_Rnd0] - movq [ecx ], mm0 - movq [ecx+8], mm0 - -.Row1 - TEST_ROW ecx+16, .Row1_Round - iMTX_MULT 1, iTab2, Idct_Rnd1, 11 +.Row0_Round: + movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0] + movdqa [_ECX ], xmm0 + +.Row1: + TEST_ROW _ECX+16, .Row1_Round + iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11 jmp .Row2 -.Row1_Round - movq mm0, [Idct_Sparse_Rnd1] - movq [ecx+16 ], mm0 - movq [ecx+16+8], mm0 - -.Row2 - TEST_ROW ecx+32, .Row2_Round - iMTX_MULT 2, iTab3, Idct_Rnd2, 11 +.Row1_Round: + movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1] + movdqa [_ECX+16 ], xmm0 + +.Row2: + TEST_ROW _ECX+32, .Row2_Round + iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11 jmp .Row3 -.Row2_Round - movq mm0, [Idct_Sparse_Rnd2] - movq [ecx+32 ], mm0 - movq [ecx+32+8], mm0 - -.Row3 - TEST_ROW ecx+48, .Row4 - iMTX_MULT 3, iTab4, Idct_Rnd3, 11 - jmp .Row4 - -.Row4 - TEST_ROW ecx+64, .Row5 - iMTX_MULT 4, iTab1, Idct_Rnd4, 11 - jmp .Row5 - -.Row5 - TEST_ROW ecx+80, .Row6 - iMTX_MULT 5, iTab4, Idct_Rnd5, 11 - -.Row6 - TEST_ROW ecx+96, .Row7 - iMTX_MULT 6, iTab3, Idct_Rnd6, 11 - -.Row7 - TEST_ROW ecx+112, .End - iMTX_MULT 7, iTab2, Idct_Rnd7, 11 -.End +.Row2_Round: + movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2] + movdqa [_ECX+32 ], xmm0 + +.Row3: + TEST_ROW _ECX+48, .Row4 + iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11 + +.Row4: + TEST_ROW _ECX+64, .Row5 + iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11 + +.Row5: + TEST_ROW _ECX+80, .Row6 + iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11 + +.Row6: + TEST_ROW _ECX+96, .Row7 + iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11 + +.Row7: + TEST_ROW _ECX+112, .End + iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11 +.End: - iLLM_PASS ecx+0 + iLLM_PASS _ECX + + POP_XMM6_XMM7 ret +ENDFUNC ;----------------------------------------------------------------------------- ; Helper macro fLLM_PASS @@ -631,7 +545,7 @@ %macro fMTX_MULT 3 ; %1=src, %2 = Coeffs, %3=rounders - movdqa xmm0, [ecx+%1*16+0] ; xmm0 = [0123][4567] + movdqa xmm0, [_ECX+%1*16+0] ; xmm0 = [0123][4567] pshufhw xmm1, xmm0, 00011011b ; xmm1 = [----][7654] pshufd xmm0, xmm0, 01000100b pshufd xmm1, xmm1, 11101110b @@ -665,17 +579,18 @@ psraw xmm0, 4 ; => [-2048, 2047] - movdqa [ecx+%1*16+0], xmm0 + movdqa [_ECX+%1*16+0], xmm0 %endmacro ;----------------------------------------------------------------------------- ; Function Forward DCT ;----------------------------------------------------------------------------- -ALIGN 16 +ALIGN SECTION_ALIGN fdct_sse2_skal: - mov ecx, [esp+4] - fLLM_PASS ecx+0, 3 + PUSH_XMM6_XMM7 + mov _ECX, prm1 + fLLM_PASS _ECX+0, 3 fMTX_MULT 0, fTab1, Fdct_Rnd0 fMTX_MULT 1, fTab2, Fdct_Rnd2 fMTX_MULT 2, fTab3, Fdct_Rnd1 @@ -684,4 +599,13 @@ fMTX_MULT 5, fTab4, Fdct_Rnd1 fMTX_MULT 6, fTab3, Fdct_Rnd1 fMTX_MULT 7, fTab2, Fdct_Rnd1 + + POP_XMM6_XMM7 ret +ENDFUNC + + +%ifidn __OUTPUT_FORMAT__,elf +section ".note.GNU-stack" noalloc noexec nowrite progbits +%endif +