19 |
; * along with this program; if not, write to the Free Software |
; * along with this program; if not, write to the Free Software |
20 |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
; * |
; * |
22 |
; * $Id: fdct_sse2_skal.asm,v 1.3 2004-07-24 11:46:08 edgomez Exp $ |
; * $Id: fdct_sse2_skal.asm,v 1.9 2008-11-11 20:46:24 Isibaar Exp $ |
23 |
; * |
; * |
24 |
; ***************************************************************************/ |
; ***************************************************************************/ |
25 |
|
|
27 |
|
|
28 |
; cglobal <name>
;   Declares <name> as a global (exported) symbol. One parameter: the symbol name.
;   - PREFIX defined:     the exported symbol is _<name>, and <name> is %define'd
;                         so that later uses of <name> expand to the prefixed form.
;   - MARK_FUNCS defined: the global declaration carries
;                         ":function <name>.endfunc-<name>" and ENDFUNC expands to
;                         ".endfunc"; otherwise ENDFUNC expands to nothing.
;                         (presumably NASM's ELF symbol type/size extension to
;                         GLOBAL -- verify against the NASM manual)
; NOTE(review): several directives in this span appear twice, separated by bare
; line numbers; this looks like two revisions rendered side by side rather than
; assemblable source -- confirm against the repository copy.
%macro cglobal 1 |
%macro cglobal 1 |
29 |
%ifdef PREFIX |
%ifdef PREFIX |
30 |

%ifdef MARK_FUNCS |
31 |

global _%1:function %1.endfunc-%1 |
32 |

; NOTE(review): this alias expands <name> to "_<name>:function <name>.endfunc-<name>",
; i.e. it includes the type/size suffix and not only the prefixed name -- confirm
; that is intended for code that later writes "<name>:" as a label.
%define %1 _%1:function %1.endfunc-%1 |
33 |

%define ENDFUNC .endfunc |
34 |

%else |
35 |
global _%1 |
global _%1 |
36 |
%define %1 _%1 |
%define %1 _%1 |
37 |

%define ENDFUNC |
38 |

%endif |
39 |

%else |
40 |

%ifdef MARK_FUNCS |
41 |

global %1:function %1.endfunc-%1 |
42 |

%define ENDFUNC .endfunc |
43 |
%else |
%else |
44 |
global %1 |
global %1 |
45 |

%define ENDFUNC |
46 |

%endif |
47 |
%endif |
%endif |
48 |
%endmacro |
%endmacro |
49 |
|
|
78 |
; |
; |
79 |
; * Some more details at: http://skal.planet-d.net/coding/dct.html |
; * Some more details at: http://skal.planet-d.net/coding/dct.html |
80 |
; |
; |
|
; |
|
|
;////////////////////////////////////////////////////////////////////// |
|
|
; |
|
|
; == Mean square errors == |
|
|
; 0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.000 [0.001] |
|
|
; 0.035 0.029 0.032 0.032 0.031 0.032 0.034 0.035 [0.032] |
|
|
; 0.026 0.028 0.027 0.027 0.025 0.028 0.028 0.025 [0.027] |
|
|
; 0.037 0.032 0.031 0.030 0.028 0.029 0.026 0.031 [0.030] |
|
|
; 0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.001 [0.001] |
|
|
; 0.025 0.024 0.022 0.022 0.022 0.022 0.023 0.023 [0.023] |
|
|
; 0.026 0.028 0.025 0.028 0.030 0.025 0.026 0.027 [0.027] |
|
|
; 0.021 0.020 0.020 0.022 0.020 0.022 0.017 0.019 [0.020] |
|
|
; |
|
|
; == Abs Mean errors == |
|
|
; 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 [0.000] |
|
|
; 0.020 0.001 0.003 0.003 0.000 0.004 0.002 0.003 [0.002] |
|
|
; 0.000 0.001 0.001 0.001 0.001 0.004 0.000 0.000 [0.000] |
|
|
; 0.027 0.001 0.000 0.002 0.002 0.002 0.001 0.000 [0.003] |
|
|
; 0.000 0.000 0.000 0.000 0.000 0.001 0.000 0.001 [-0.000] |
|
|
; 0.001 0.003 0.001 0.001 0.002 0.001 0.000 0.000 [-0.000] |
|
|
; 0.000 0.002 0.002 0.001 0.001 0.002 0.001 0.000 [-0.000] |
|
|
; 0.000 0.002 0.001 0.002 0.001 0.002 0.001 0.001 [-0.000] |
|
|
; |
|
|
; ========================= |
|
|
; Peak error: 1.0000 |
|
|
; Peak MSE: 0.0365 |
|
|
; Overall MSE: 0.0201 |
|
|
; Peak ME: 0.0265 |
|
|
; Overall ME: 0.0006 |
|
|
; |
|
81 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
82 |
; |
; |
83 |
; -=IDCT=- |
; -=IDCT=- |
86 |
; descaling) require some unpairable shifting and packing, all on |
; descaling) require some unpairable shifting and packing, all on |
87 |
; the same CPU unit. |
; the same CPU unit. |
88 |
; |
; |
|
; THIS IDCT IS NOT IEEE-COMPLIANT: IT WILL FAIL THE [-300,300] |
|
|
; INPUT RANGE TEST (because of overflow). But the [-256,255] one |
|
|
; is OK, and I'm fine with it (for now;) |
|
|
; |
|
|
; == Mean square errors == |
|
|
; 0.007 0.006 0.005 0.007 0.006 0.007 0.005 0.007 [0.006] |
|
|
; 0.006 0.008 0.007 0.007 0.007 0.008 0.008 0.008 [0.007] |
|
|
; 0.008 0.008 0.008 0.008 0.007 0.009 0.010 0.007 [0.008] |
|
|
; 0.007 0.007 0.006 0.007 0.008 0.007 0.006 0.008 [0.007] |
|
|
; 0.007 0.006 0.006 0.006 0.006 0.005 0.006 0.006 [0.006] |
|
|
; 0.008 0.007 0.006 0.008 0.007 0.008 0.009 0.009 [0.008] |
|
|
; 0.008 0.006 0.010 0.008 0.008 0.008 0.007 0.007 [0.008] |
|
|
; 0.007 0.006 0.006 0.007 0.007 0.006 0.006 0.007 [0.006] |
|
|
; |
|
|
; == Abs Mean errors == |
|
|
; 0.001 0.000 0.000 0.001 0.001 0.000 0.000 0.000 [0.000] |
|
|
; 0.000 0.002 0.002 0.000 0.001 0.001 0.000 0.002 [0.000] |
|
|
; 0.001 0.002 0.001 0.001 0.001 0.001 0.000 0.001 [-0.001] |
|
|
; 0.000 0.002 0.000 0.000 0.001 0.000 0.000 0.001 [-0.000] |
|
|
; 0.000 0.001 0.001 0.001 0.000 0.001 0.000 0.001 [0.000] |
|
|
; 0.000 0.001 0.001 0.001 0.001 0.000 0.001 0.000 [0.000] |
|
|
; 0.001 0.001 0.002 0.001 0.001 0.002 0.001 0.001 [0.001] |
|
|
; 0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000 [0.000] |
|
|
; |
|
|
; ========================= |
|
|
; |
|
|
; Peak error: 1.0000 |
|
|
; Peak MSE: 0.0096 |
|
|
; Overall MSE: 0.0070 |
|
|
; Peak ME: 0.0024 |
|
|
; Overall ME: 0.0001 |
|
|
; |
|
89 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
90 |
|
|
91 |
;============================================================================= |
;============================================================================= |
149 |
dw 0x3b21, 0x14c3, 0x979e, 0xc4df |
dw 0x3b21, 0x14c3, 0x979e, 0xc4df |
150 |
dw 0x14c3, 0x587e, 0x587e, 0x979e |
dw 0x14c3, 0x587e, 0x587e, 0x979e |
151 |
|
|
|
; the original rounding trick is by |
|
|
; Michel Lespinasse (hi Walken!) <walken@zoy.org> |
|
|
|
|
152 |
ALIGN 16 |
ALIGN 16 |
153 |
Idct_Rnd0: dd 65535, 65535, 65535, 65535 |
Walken_Idct_Rounders: |
154 |
Idct_Rnd1: dd 3612, 3612, 3612, 3612 |
dd 65536, 65536, 65536, 65536 |
155 |
Idct_Rnd2: dd 2271, 2271, 2271, 2271 |
dd 3597, 3597, 3597, 3597 |
156 |
Idct_Rnd3: dd 1203, 1203, 1203, 1203 |
dd 2260, 2260, 2260, 2260 |
157 |
Idct_Rnd4: dd 1023, 1023, 1023, 1023 |
dd 1203, 1203, 1203, 1203 |
158 |
Idct_Rnd5: dd 102, 102, 102, 102 |
dd 0, 0, 0, 0 |
159 |
Idct_Rnd6: dd 398, 398, 398, 398 |
dd 120, 120, 120, 120 |
160 |
Idct_Rnd7: dd 469, 469, 469, 469 |
dd 512, 512, 512, 512 |
161 |
|
dd 512, 512, 512, 512 |
162 |
Idct_Sparse_Rnd0: times 4 dw (65535>>11) |
|
163 |
Idct_Sparse_Rnd1: times 4 dw ( 3612>>11) |
times 8 dw (65536>>11) |
164 |
Idct_Sparse_Rnd2: times 4 dw ( 2271>>11) |
times 8 dw ( 3597>>11) |
165 |
|
times 8 dw ( 2260>>11) |
166 |
; other rounders are zero... |
; other rounders are zero... |
167 |
|
|
168 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
224 |
SECTION .text |
SECTION .text |
225 |
|
|
226 |
cglobal idct_sse2_skal |
cglobal idct_sse2_skal |
|
cglobal idct_sse2_sparse_skal |
|
227 |
cglobal fdct_sse2_skal |
cglobal fdct_sse2_skal |
228 |
|
|
229 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
234 |
|
|
235 |
movdqa xmm0, [ecx+%1*16] ; xmm0 = [01234567] |
movdqa xmm0, [ecx+%1*16] ; xmm0 = [01234567] |
236 |
|
|
237 |
pshuflw xmm0, xmm0, 11011000b ; [0213] |
pshuflw xmm0, xmm0, 11011000b ; [02134567] ; these two shufflings could be |
238 |
pshufhw xmm0, xmm0, 11011000b ; [02134657] |
pshufhw xmm0, xmm0, 11011000b ; [02134657] ; integrated in zig-zag orders |
239 |
|
|
240 |
pshufd xmm4, xmm0, 00000000b ; [02020202] |
pshufd xmm4, xmm0, 00000000b ; [02020202] |
241 |
pshufd xmm5, xmm0, 10101010b ; [46464646] |
pshufd xmm5, xmm0, 10101010b ; [46464646] |
242 |
pshufd xmm6, xmm0, 01010101b ; [13131313] |
pshufd xmm6, xmm0, 01010101b ; [13131313] |
258 |
psrad xmm4, %4 ; => out [7654] |
psrad xmm4, %4 ; => out [7654] |
259 |
|
|
260 |
packssdw xmm6, xmm4 ; [01237654] |
packssdw xmm6, xmm4 ; [01237654] |
261 |
|
|
262 |
pshufhw xmm6, xmm6, 00011011b ; [01234567] |
pshufhw xmm6, xmm6, 00011011b ; [01234567] |
263 |
|
|
264 |
movdqa [ecx+%1*16], xmm6 |
movdqa [ecx+%1*16], xmm6 |
331 |
movdqa xmm3, [%1+0*16] ; x0 |
movdqa xmm3, [%1+0*16] ; x0 |
332 |
movdqa xmm6, [%1+4*16] ; x4 |
movdqa xmm6, [%1+4*16] ; x4 |
333 |
|
|
334 |
|
movdqa [%1 ], xmm2 ; we spill 1 reg to perform safe butterflies |
335 |
|
|
336 |
|
movdqa xmm2, xmm3 |
337 |
psubsw xmm3, xmm6 ; x0-x4 = tm04 |
psubsw xmm3, xmm6 ; x0-x4 = tm04 |
338 |
paddsw xmm6, xmm6 ; 2.x4 |
paddsw xmm6, xmm2 ; x0+x4 = tp04 |
|
paddsw xmm6, xmm3 ; x0+x4 = tp04 |
|
339 |
|
|
340 |
psubsw xmm3, xmm5 ; tm04-tm26 = a2 |
movdqa xmm2, xmm6 |
341 |
psubsw xmm6, xmm7 ; tp04-tp26 = a3 |
psubsw xmm6, xmm7 |
342 |
paddsw xmm5, xmm5 ; 2.tm26 |
paddsw xmm7, xmm2 |
343 |
paddsw xmm7, xmm7 ; 2.tp26 |
movdqa xmm2, xmm3 |
344 |
paddsw xmm5, xmm3 ; tm04+tm26 = a1 |
psubsw xmm3, xmm5 |
345 |
paddsw xmm7, xmm6 ; tp04+tp26 = a0 |
paddsw xmm5, xmm2 |
346 |
|
|
347 |
psubsw xmm5, xmm0 ; a1-b1 |
movdqa xmm2, xmm5 |
348 |
psubsw xmm3, xmm4 ; a2-b2 |
psubsw xmm5, xmm0 |
349 |
paddsw xmm0, xmm0 ; 2.b1 |
paddsw xmm0, xmm2 |
350 |
paddsw xmm4, xmm4 ; 2.b2 |
movdqa xmm2, xmm3 |
351 |
paddsw xmm0, xmm5 ; a1+b1 |
psubsw xmm3, xmm4 |
352 |
paddsw xmm4, xmm3 ; a2+b2 |
paddsw xmm4, xmm2 |
353 |
|
|
354 |
|
movdqa xmm2, [%1] |
355 |
|
|
356 |
psraw xmm5, 6 ; out6 |
psraw xmm5, 6 ; out6 |
357 |
psraw xmm3, 6 ; out5 |
psraw xmm3, 6 ; out5 |
377 |
psraw xmm2, 6 ; out3 |
psraw xmm2, 6 ; out3 |
378 |
psraw xmm6, 6 ; out4 |
psraw xmm6, 6 ; out4 |
379 |
|
|
380 |
|
; store result |
381 |
|
|
382 |
movdqa [%1+0*16], xmm1 |
movdqa [%1+0*16], xmm1 |
383 |
movdqa [%1+3*16], xmm2 |
movdqa [%1+3*16], xmm2 |
384 |
movdqa [%1+4*16], xmm6 |
movdqa [%1+4*16], xmm6 |
385 |
movdqa [%1+7*16], xmm7 |
movdqa [%1+7*16], xmm7 |
|
%endmacro |
|
386 |
|
|
387 |
;----------------------------------------------------------------------------- |
%endmacro |
|
; Function idct (the straightforward version) |
|
|
;----------------------------------------------------------------------------- |
|
|
|
|
|
ALIGN 16 |
|
|
; idct_sse2_skal -- straightforward IDCT: every row is processed, no null-row test.
;   In:  [esp+4] = pointer to the coefficient block, loaded into ecx.
;        The row macro reads and writes [ecx + row*16] with movdqa, so the block
;        is presumably required to be 16-byte aligned -- TODO confirm with callers.
;   Out: the block is transformed in place (both macros store back through ecx).
;   Uses ecx and xmm registers inside the macros; exact clobber list not visible here.
; NOTE(review): a second "idct_sse2_skal:" label appears further down in this text,
; on the null-row-skipping function; the two cannot both be assembled -- confirm
; which revision is intended.
idct_sse2_skal: |


; ecx = first stack argument (block base used by all macro invocations below)
mov ecx, [esp+4] |


; Row pass: one iMTX_MULT per row index 0..7, each with a coefficient table,
; a per-row rounder and a final right shift of 11.
; Tables are shared pairwise: rows 0/4 -> iTab1, 1/7 -> iTab2, 2/6 -> iTab3, 3/5 -> iTab4.
iMTX_MULT 0, iTab1, Idct_Rnd0, 11 |


iMTX_MULT 1, iTab2, Idct_Rnd1, 11 |


iMTX_MULT 2, iTab3, Idct_Rnd2, 11 |


iMTX_MULT 3, iTab4, Idct_Rnd3, 11 |


iMTX_MULT 4, iTab1, Idct_Rnd4, 11 |


iMTX_MULT 5, iTab4, Idct_Rnd5, 11 |


iMTX_MULT 6, iTab3, Idct_Rnd6, 11 |


iMTX_MULT 7, iTab2, Idct_Rnd7, 11 |


; Second pass over the whole block (works across rows [base+k*16]);
; presumably the column pass -- verify against the iLLM_PASS definition.
iLLM_PASS ecx+0 |


ret |
|
388 |
|
|
389 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
390 |
; Helper macro TEST_ROW (test a null row) |
; Helper macro TEST_ROW (test a null row) |
402 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
403 |
; Function idct (this one skips null rows) |
; Function idct (this one skips null rows) |
404 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
405 |
|
; IEEE1180 and Walken compatible version |
406 |
|
|
407 |
ALIGN 16 |
ALIGN 16 |
408 |
idct_sse2_sparse_skal: |
idct_sse2_skal: |
409 |
|
|
410 |
mov ecx, [esp+ 4] ; Src |
mov ecx, [esp+ 4] ; Src |
411 |
|
|
412 |
TEST_ROW ecx, .Row0_Round |
TEST_ROW ecx, .Row0_Round |
413 |
iMTX_MULT 0, iTab1, Idct_Rnd0, 11 |
iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11 |
414 |
jmp .Row1 |
jmp .Row1 |
415 |
.Row0_Round |
.Row0_Round: |
416 |
movq mm0, [Idct_Sparse_Rnd0] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0] |
417 |
movq [ecx ], mm0 |
movdqa [ecx ], xmm0 |
|
movq [ecx+8], mm0 |
|
418 |
|
|
419 |
.Row1 |
.Row1: |
420 |
TEST_ROW ecx+16, .Row1_Round |
TEST_ROW ecx+16, .Row1_Round |
421 |
iMTX_MULT 1, iTab2, Idct_Rnd1, 11 |
iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11 |
422 |
jmp .Row2 |
jmp .Row2 |
423 |
.Row1_Round |
.Row1_Round: |
424 |
movq mm0, [Idct_Sparse_Rnd1] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1] |
425 |
movq [ecx+16 ], mm0 |
movdqa [ecx+16 ], xmm0 |
|
movq [ecx+16+8], mm0 |
|
426 |
|
|
427 |
.Row2 |
.Row2: |
428 |
TEST_ROW ecx+32, .Row2_Round |
TEST_ROW ecx+32, .Row2_Round |
429 |
iMTX_MULT 2, iTab3, Idct_Rnd2, 11 |
iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11 |
430 |
jmp .Row3 |
jmp .Row3 |
431 |
.Row2_Round |
.Row2_Round: |
432 |
movq mm0, [Idct_Sparse_Rnd2] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2] |
433 |
movq [ecx+32 ], mm0 |
movdqa [ecx+32 ], xmm0 |
|
movq [ecx+32+8], mm0 |
|
434 |
|
|
435 |
.Row3 |
.Row3: |
436 |
TEST_ROW ecx+48, .Row4 |
TEST_ROW ecx+48, .Row4 |
437 |
iMTX_MULT 3, iTab4, Idct_Rnd3, 11 |
iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11 |
|
jmp .Row4 |
|
438 |
|
|
439 |
.Row4 |
.Row4: |
440 |
TEST_ROW ecx+64, .Row5 |
TEST_ROW ecx+64, .Row5 |
441 |
iMTX_MULT 4, iTab1, Idct_Rnd4, 11 |
iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11 |
|
jmp .Row5 |
|
442 |
|
|
443 |
.Row5 |
.Row5: |
444 |
TEST_ROW ecx+80, .Row6 |
TEST_ROW ecx+80, .Row6 |
445 |
iMTX_MULT 5, iTab4, Idct_Rnd5, 11 |
iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11 |
446 |
|
|
447 |
.Row6 |
.Row6: |
448 |
TEST_ROW ecx+96, .Row7 |
TEST_ROW ecx+96, .Row7 |
449 |
iMTX_MULT 6, iTab3, Idct_Rnd6, 11 |
iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11 |
450 |
|
|
451 |
.Row7 |
.Row7: |
452 |
TEST_ROW ecx+112, .End |
TEST_ROW ecx+112, .End |
453 |
iMTX_MULT 7, iTab2, Idct_Rnd7, 11 |
iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11 |
454 |
.End |
.End: |
455 |
|
|
456 |
|
iLLM_PASS ecx |
457 |
|
|
|
iLLM_PASS ecx+0 |
|
458 |
ret |
ret |
459 |
|
ENDFUNC |
460 |
|
|
461 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
462 |
; Helper macro fLLM_PASS |
; Helper macro fLLM_PASS |
622 |
fMTX_MULT 6, fTab3, Fdct_Rnd1 |
fMTX_MULT 6, fTab3, Fdct_Rnd1 |
623 |
fMTX_MULT 7, fTab2, Fdct_Rnd1 |
fMTX_MULT 7, fTab2, Fdct_Rnd1 |
624 |
ret |
ret |
625 |
|
ENDFUNC |
626 |
|
|
627 |
|
|
628 |
|
%ifidn __OUTPUT_FORMAT__,elf |
629 |
|
section ".note.GNU-stack" noalloc noexec nowrite progbits |
630 |
|
%endif |
631 |
|
|