19 |
; * along with this program; if not, write to the Free Software |
; * along with this program; if not, write to the Free Software |
20 |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
; * |
; * |
22 |
; * $Id: fdct_sse2_skal.asm,v 1.9 2008-11-11 20:46:24 Isibaar Exp $ |
; * $Id: fdct_sse2_skal.asm,v 1.15 2009-09-16 17:07:58 Isibaar Exp $ |
23 |
; * |
; * |
24 |
; ***************************************************************************/ |
; ***************************************************************************/ |
25 |
|
|
26 |
BITS 32 |
%include "nasm.inc" |
|
|
|
|
%macro cglobal 1 |
|
|
%ifdef PREFIX |
|
|
%ifdef MARK_FUNCS |
|
|
global _%1:function %1.endfunc-%1 |
|
|
%define %1 _%1:function %1.endfunc-%1 |
|
|
%define ENDFUNC .endfunc |
|
|
%else |
|
|
global _%1 |
|
|
%define %1 _%1 |
|
|
%define ENDFUNC |
|
|
%endif |
|
|
%else |
|
|
%ifdef MARK_FUNCS |
|
|
global %1:function %1.endfunc-%1 |
|
|
%define ENDFUNC .endfunc |
|
|
%else |
|
|
global %1 |
|
|
%define ENDFUNC |
|
|
%endif |
|
|
%endif |
|
|
%endmacro |
|
27 |
|
|
28 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
29 |
; |
; |
70 |
; Read only data |
; Read only data |
71 |
;============================================================================= |
;============================================================================= |
72 |
|
|
73 |
%ifdef FORMAT_COFF |
DATA |
|
SECTION .rodata |
|
|
%else |
|
|
SECTION .rodata align=16 |
|
|
%endif |
|
74 |
|
|
75 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
76 |
tan1: times 8 dw 0x32ec ; tan( pi/16) |
tan1: times 8 dw 0x32ec ; tan( pi/16) |
77 |
tan2: times 8 dw 0x6a0a ; tan(2pi/16) (=sqrt(2)-1) |
tan2: times 8 dw 0x6a0a ; tan(2pi/16) (=sqrt(2)-1) |
78 |
tan3: times 8 dw 0xab0e ; tan(3pi/16)-1 |
tan3: times 8 dw 0xab0e ; tan(3pi/16)-1 |
82 |
; Inverse DCT tables |
; Inverse DCT tables |
83 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
84 |
|
|
85 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
86 |
iTab1: |
iTab1: |
87 |
dw 0x4000, 0x539f, 0x4000, 0x22a3 |
dw 0x4000, 0x539f, 0x4000, 0x22a3 |
88 |
dw 0x4000, 0xdd5d, 0x4000, 0xac61 |
dw 0x4000, 0xdd5d, 0x4000, 0xac61 |
123 |
dw 0x3b21, 0x14c3, 0x979e, 0xc4df |
dw 0x3b21, 0x14c3, 0x979e, 0xc4df |
124 |
dw 0x14c3, 0x587e, 0x587e, 0x979e |
dw 0x14c3, 0x587e, 0x587e, 0x979e |
125 |
|
|
126 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
127 |
Walken_Idct_Rounders: |
Walken_Idct_Rounders: |
128 |
dd 65536, 65536, 65536, 65536 |
dd 65536, 65536, 65536, 65536 |
129 |
dd 3597, 3597, 3597, 3597 |
dd 3597, 3597, 3597, 3597 |
143 |
; Forward DCT tables |
; Forward DCT tables |
144 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
145 |
|
|
146 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
147 |
fTab1: |
fTab1: |
148 |
dw 0x4000, 0x4000, 0x58c5, 0x4b42, |
dw 0x4000, 0x4000, 0x58c5, 0x4b42, |
149 |
dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7, |
dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7, |
185 |
dw 0x28ba, 0x9dac, 0x14c3, 0xc4df |
dw 0x28ba, 0x9dac, 0x14c3, 0xc4df |
186 |
|
|
187 |
|
|
188 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
189 |
Fdct_Rnd0: dw 6,8,8,8, 6,8,8,8 |
Fdct_Rnd0: dw 6,8,8,8, 6,8,8,8 |
190 |
Fdct_Rnd1: dw 8,8,8,8, 8,8,8,8 |
Fdct_Rnd1: dw 8,8,8,8, 8,8,8,8 |
191 |
Fdct_Rnd2: dw 10,8,8,8, 8,8,8,8 |
Fdct_Rnd2: dw 10,8,8,8, 8,8,8,8 |
195 |
; Code |
; Code |
196 |
;============================================================================= |
;============================================================================= |
197 |
|
|
198 |
SECTION .text |
TEXT |
199 |
|
|
200 |
cglobal idct_sse2_skal |
cglobal idct_sse2_skal |
201 |
cglobal fdct_sse2_skal |
cglobal fdct_sse2_skal |
206 |
|
|
207 |
%macro iMTX_MULT 4 ; %1=src, %2 = Table to use, %3=rounder, %4=Shift |
%macro iMTX_MULT 4 ; %1=src, %2 = Table to use, %3=rounder, %4=Shift |
208 |
|
|
209 |
movdqa xmm0, [ecx+%1*16] ; xmm0 = [01234567] |
movdqa xmm0, [_ECX+%1*16] ; xmm0 = [01234567] |
210 |
|
|
211 |
pshuflw xmm0, xmm0, 11011000b ; [02134567] ; these two shufflings could be |
pshuflw xmm0, xmm0, 11011000b ; [02134567] ; these two shufflings could be |
212 |
pshufhw xmm0, xmm0, 11011000b ; [02134657] ; integrated in zig-zag orders |
pshufhw xmm0, xmm0, 11011000b ; [02134657] ; integrated in zig-zag orders |
235 |
|
|
236 |
pshufhw xmm6, xmm6, 00011011b ; [01234567] |
pshufhw xmm6, xmm6, 00011011b ; [01234567] |
237 |
|
|
238 |
movdqa [ecx+%1*16], xmm6 |
movdqa [_ECX+%1*16], xmm6 |
239 |
|
|
240 |
%endmacro |
%endmacro |
241 |
|
|
365 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
366 |
|
|
367 |
%macro TEST_ROW 2 ; %1:src, %2:label x8 |
%macro TEST_ROW 2 ; %1:src, %2:label x8 |
368 |
mov eax, [%1 ] |
mov _EAX, [%1 ] |
369 |
mov edx, [%1+ 8] |
mov _EDX, [%1+ 8] |
370 |
or eax, [%1+ 4] |
or _EAX, [%1+ 4] |
371 |
or edx, [%1+12] |
or _EDX, [%1+12] |
372 |
or eax, edx |
or _EAX, _EDX |
373 |
jz near %2 |
jz near %2 |
374 |
%endmacro |
%endmacro |
375 |
|
|
378 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
379 |
; IEEE1180 and Walken compatible version |
; IEEE1180 and Walken compatible version |
380 |
|
|
381 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
382 |
idct_sse2_skal: |
idct_sse2_skal: |
383 |
|
|
384 |
mov ecx, [esp+ 4] ; Src |
PUSH_XMM6_XMM7 |
385 |
|
|
386 |
|
mov _ECX, prm1 ; Src |
387 |
|
|
388 |
TEST_ROW ecx, .Row0_Round |
TEST_ROW _ECX, .Row0_Round |
389 |
iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11 |
iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11 |
390 |
jmp .Row1 |
jmp .Row1 |
391 |
.Row0_Round: |
.Row0_Round: |
392 |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0] |
393 |
movdqa [ecx ], xmm0 |
movdqa [_ECX ], xmm0 |
394 |
|
|
395 |
.Row1: |
.Row1: |
396 |
TEST_ROW ecx+16, .Row1_Round |
TEST_ROW _ECX+16, .Row1_Round |
397 |
iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11 |
iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11 |
398 |
jmp .Row2 |
jmp .Row2 |
399 |
.Row1_Round: |
.Row1_Round: |
400 |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1] |
401 |
movdqa [ecx+16 ], xmm0 |
movdqa [_ECX+16 ], xmm0 |
402 |
|
|
403 |
.Row2: |
.Row2: |
404 |
TEST_ROW ecx+32, .Row2_Round |
TEST_ROW _ECX+32, .Row2_Round |
405 |
iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11 |
iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11 |
406 |
jmp .Row3 |
jmp .Row3 |
407 |
.Row2_Round: |
.Row2_Round: |
408 |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2] |
409 |
movdqa [ecx+32 ], xmm0 |
movdqa [_ECX+32 ], xmm0 |
410 |
|
|
411 |
.Row3: |
.Row3: |
412 |
TEST_ROW ecx+48, .Row4 |
TEST_ROW _ECX+48, .Row4 |
413 |
iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11 |
iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11 |
414 |
|
|
415 |
.Row4: |
.Row4: |
416 |
TEST_ROW ecx+64, .Row5 |
TEST_ROW _ECX+64, .Row5 |
417 |
iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11 |
iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11 |
418 |
|
|
419 |
.Row5: |
.Row5: |
420 |
TEST_ROW ecx+80, .Row6 |
TEST_ROW _ECX+80, .Row6 |
421 |
iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11 |
iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11 |
422 |
|
|
423 |
.Row6: |
.Row6: |
424 |
TEST_ROW ecx+96, .Row7 |
TEST_ROW _ECX+96, .Row7 |
425 |
iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11 |
iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11 |
426 |
|
|
427 |
.Row7: |
.Row7: |
428 |
TEST_ROW ecx+112, .End |
TEST_ROW _ECX+112, .End |
429 |
iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11 |
iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11 |
430 |
.End: |
.End: |
431 |
|
|
432 |
iLLM_PASS ecx |
iLLM_PASS _ECX |
433 |
|
|
434 |
|
POP_XMM6_XMM7 |
435 |
ret |
ret |
436 |
ENDFUNC |
ENDFUNC |
437 |
|
|
545 |
|
|
546 |
%macro fMTX_MULT 3 ; %1=src, %2 = Coeffs, %3=rounders |
%macro fMTX_MULT 3 ; %1=src, %2 = Coeffs, %3=rounders |
547 |
|
|
548 |
movdqa xmm0, [ecx+%1*16+0] ; xmm0 = [0123][4567] |
movdqa xmm0, [_ECX+%1*16+0] ; xmm0 = [0123][4567] |
549 |
pshufhw xmm1, xmm0, 00011011b ; xmm1 = [----][7654] |
pshufhw xmm1, xmm0, 00011011b ; xmm1 = [----][7654] |
550 |
pshufd xmm0, xmm0, 01000100b |
pshufd xmm0, xmm0, 01000100b |
551 |
pshufd xmm1, xmm1, 11101110b |
pshufd xmm1, xmm1, 11101110b |
579 |
|
|
580 |
psraw xmm0, 4 ; => [-2048, 2047] |
psraw xmm0, 4 ; => [-2048, 2047] |
581 |
|
|
582 |
movdqa [ecx+%1*16+0], xmm0 |
movdqa [_ECX+%1*16+0], xmm0 |
583 |
%endmacro |
%endmacro |
584 |
|
|
585 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
586 |
; Function Forward DCT |
; Function Forward DCT |
587 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
588 |
|
|
589 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
590 |
fdct_sse2_skal: |
fdct_sse2_skal: |
591 |
mov ecx, [esp+4] |
PUSH_XMM6_XMM7 |
592 |
fLLM_PASS ecx+0, 3 |
mov _ECX, prm1 |
593 |
|
fLLM_PASS _ECX+0, 3 |
594 |
fMTX_MULT 0, fTab1, Fdct_Rnd0 |
fMTX_MULT 0, fTab1, Fdct_Rnd0 |
595 |
fMTX_MULT 1, fTab2, Fdct_Rnd2 |
fMTX_MULT 1, fTab2, Fdct_Rnd2 |
596 |
fMTX_MULT 2, fTab3, Fdct_Rnd1 |
fMTX_MULT 2, fTab3, Fdct_Rnd1 |
599 |
fMTX_MULT 5, fTab4, Fdct_Rnd1 |
fMTX_MULT 5, fTab4, Fdct_Rnd1 |
600 |
fMTX_MULT 6, fTab3, Fdct_Rnd1 |
fMTX_MULT 6, fTab3, Fdct_Rnd1 |
601 |
fMTX_MULT 7, fTab2, Fdct_Rnd1 |
fMTX_MULT 7, fTab2, Fdct_Rnd1 |
602 |
|
|
603 |
|
POP_XMM6_XMM7 |
604 |
ret |
ret |
605 |
ENDFUNC |
ENDFUNC |
606 |
|
|
607 |
|
; Mac-specific workaround for misaligned DCT tables |
608 |
|
ALIGN SECTION_ALIGN |
609 |
|
times 8 dw 0 |
610 |
|
|
611 |
%ifidn __OUTPUT_FORMAT__,elf |
NON_EXEC_STACK |
|
section ".note.GNU-stack" noalloc noexec nowrite progbits |
|
|
%endif |
|
|
|
|