19 |
; * along with this program; if not, write to the Free Software |
; * along with this program; if not, write to the Free Software |
20 |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
; * |
; * |
22 |
; * $Id: fdct_sse2_skal.asm,v 1.4 2004-08-22 11:46:09 edgomez Exp $ |
; * $Id: fdct_sse2_skal.asm,v 1.12 2008-12-04 14:41:50 Isibaar Exp $ |
23 |
; * |
; * |
24 |
; ***************************************************************************/ |
; ***************************************************************************/ |
25 |
|
|
26 |
BITS 32 |
%include "nasm.inc" |
|
|
|
|
; cglobal <name> -- export <name> as a global symbol.
;   PREFIX defined     : the exported symbol is _<name>, and <name> is
;                        %define'd so that later uses of <name> map onto it.
;   MARK_FUNCS defined : the global declaration carries the ":function" suffix.
; NOTE(review): when PREFIX and MARK_FUNCS are both defined, <name> is
; %define'd to "_<name>:function", so the suffix is substituted at every later
; use of <name> (labels included), not only in the global line -- confirm
; whether that combination is ever built.
%macro cglobal 1 |
|
|
%ifdef PREFIX |
|
|
%ifdef MARK_FUNCS |
|
|
global _%1:function |
|
|
%define %1 _%1:function |
|
|
%else |
|
|
global _%1 |
|
|
%define %1 _%1 |
|
|
%endif |
|
|
%else |
|
|
%ifdef MARK_FUNCS |
|
|
global %1:function |
|
|
%else |
|
|
global %1 |
|
|
%endif |
|
|
%endif |
|
|
%endmacro |
|
27 |
|
|
28 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
29 |
; |
; |
56 |
; |
; |
57 |
; * Some more details at: http://skal.planet-d.net/coding/dct.html |
; * Some more details at: http://skal.planet-d.net/coding/dct.html |
58 |
; |
; |
|
; |
|
|
;////////////////////////////////////////////////////////////////////// |
|
|
; |
|
|
; == Mean square errors == |
|
|
; 0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.000 [0.001] |
|
|
; 0.035 0.029 0.032 0.032 0.031 0.032 0.034 0.035 [0.032] |
|
|
; 0.026 0.028 0.027 0.027 0.025 0.028 0.028 0.025 [0.027] |
|
|
; 0.037 0.032 0.031 0.030 0.028 0.029 0.026 0.031 [0.030] |
|
|
; 0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.001 [0.001] |
|
|
; 0.025 0.024 0.022 0.022 0.022 0.022 0.023 0.023 [0.023] |
|
|
; 0.026 0.028 0.025 0.028 0.030 0.025 0.026 0.027 [0.027] |
|
|
; 0.021 0.020 0.020 0.022 0.020 0.022 0.017 0.019 [0.020] |
|
|
; |
|
|
; == Abs Mean errors == |
|
|
; 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 [0.000] |
|
|
; 0.020 0.001 0.003 0.003 0.000 0.004 0.002 0.003 [0.002] |
|
|
; 0.000 0.001 0.001 0.001 0.001 0.004 0.000 0.000 [0.000] |
|
|
; 0.027 0.001 0.000 0.002 0.002 0.002 0.001 0.000 [0.003] |
|
|
; 0.000 0.000 0.000 0.000 0.000 0.001 0.000 0.001 [-0.000] |
|
|
; 0.001 0.003 0.001 0.001 0.002 0.001 0.000 0.000 [-0.000] |
|
|
; 0.000 0.002 0.002 0.001 0.001 0.002 0.001 0.000 [-0.000] |
|
|
; 0.000 0.002 0.001 0.002 0.001 0.002 0.001 0.001 [-0.000] |
|
|
; |
|
|
; ========================= |
|
|
; Peak error: 1.0000 |
|
|
; Peak MSE: 0.0365 |
|
|
; Overall MSE: 0.0201 |
|
|
; Peak ME: 0.0265 |
|
|
; Overall ME: 0.0006 |
|
|
; |
|
59 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
60 |
; |
; |
61 |
; -=IDCT=- |
; -=IDCT=- |
64 |
; descaling) require some unpairable shifting and packing, all on |
; descaling) require some unpairable shifting and packing, all on |
65 |
; the same CPU unit. |
; the same CPU unit. |
66 |
; |
; |
|
; THIS IDCT IS NOT IEEE-COMPLIANT: IT WILL FAIL THE [-300,300] |
|
|
; INPUT RANGE TEST (because of overflow). But the [-256,255] one |
|
|
; is OK, and I'm fine with it (for now;) |
|
|
; |
|
|
; == Mean square errors == |
|
|
; 0.007 0.006 0.005 0.007 0.006 0.007 0.005 0.007 [0.006] |
|
|
; 0.006 0.008 0.007 0.007 0.007 0.008 0.008 0.008 [0.007] |
|
|
; 0.008 0.008 0.008 0.008 0.007 0.009 0.010 0.007 [0.008] |
|
|
; 0.007 0.007 0.006 0.007 0.008 0.007 0.006 0.008 [0.007] |
|
|
; 0.007 0.006 0.006 0.006 0.006 0.005 0.006 0.006 [0.006] |
|
|
; 0.008 0.007 0.006 0.008 0.007 0.008 0.009 0.009 [0.008] |
|
|
; 0.008 0.006 0.010 0.008 0.008 0.008 0.007 0.007 [0.008] |
|
|
; 0.007 0.006 0.006 0.007 0.007 0.006 0.006 0.007 [0.006] |
|
|
; |
|
|
; == Abs Mean errors == |
|
|
; 0.001 0.000 0.000 0.001 0.001 0.000 0.000 0.000 [0.000] |
|
|
; 0.000 0.002 0.002 0.000 0.001 0.001 0.000 0.002 [0.000] |
|
|
; 0.001 0.002 0.001 0.001 0.001 0.001 0.000 0.001 [-0.001] |
|
|
; 0.000 0.002 0.000 0.000 0.001 0.000 0.000 0.001 [-0.000] |
|
|
; 0.000 0.001 0.001 0.001 0.000 0.001 0.000 0.001 [0.000] |
|
|
; 0.000 0.001 0.001 0.001 0.001 0.000 0.001 0.000 [0.000] |
|
|
; 0.001 0.001 0.002 0.001 0.001 0.002 0.001 0.001 [0.001] |
|
|
; 0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000 [0.000] |
|
|
; |
|
|
; ========================= |
|
|
; |
|
|
; Peak error: 1.0000 |
|
|
; Peak MSE: 0.0096 |
|
|
; Overall MSE: 0.0070 |
|
|
; Peak ME: 0.0024 |
|
|
; Overall ME: 0.0001 |
|
|
; |
|
67 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
68 |
|
|
69 |
;============================================================================= |
;============================================================================= |
70 |
; Read only data |
; Read only data |
71 |
;============================================================================= |
;============================================================================= |
72 |
|
|
73 |
%ifdef FORMAT_COFF |
DATA |
|
SECTION .rodata |
|
|
%else |
|
|
SECTION .rodata align=16 |
|
|
%endif |
|
74 |
|
|
75 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
76 |
tan1: times 8 dw 0x32ec ; tan( pi/16) |
tan1: times 8 dw 0x32ec ; tan( pi/16) |
77 |
tan2: times 8 dw 0x6a0a ; tan(2pi/16) (=sqrt(2)-1) |
tan2: times 8 dw 0x6a0a ; tan(2pi/16) (=sqrt(2)-1) |
78 |
tan3: times 8 dw 0xab0e ; tan(3pi/16)-1 |
tan3: times 8 dw 0xab0e ; tan(3pi/16)-1 |
82 |
; Inverse DCT tables |
; Inverse DCT tables |
83 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
84 |
|
|
85 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
86 |
iTab1: |
iTab1: |
87 |
dw 0x4000, 0x539f, 0x4000, 0x22a3 |
dw 0x4000, 0x539f, 0x4000, 0x22a3 |
88 |
dw 0x4000, 0xdd5d, 0x4000, 0xac61 |
dw 0x4000, 0xdd5d, 0x4000, 0xac61 |
123 |
dw 0x3b21, 0x14c3, 0x979e, 0xc4df |
dw 0x3b21, 0x14c3, 0x979e, 0xc4df |
124 |
dw 0x14c3, 0x587e, 0x587e, 0x979e |
dw 0x14c3, 0x587e, 0x587e, 0x979e |
125 |
|
|
126 |
; the original rounding trick is by |
ALIGN SECTION_ALIGN |
127 |
; Michel Lespinasse (hi Walken!) <walken@zoy.org> |
Walken_Idct_Rounders: |
128 |
|
dd 65536, 65536, 65536, 65536 |
129 |
ALIGN 16 |
dd 3597, 3597, 3597, 3597 |
130 |
Idct_Rnd0: dd 65535, 65535, 65535, 65535 |
dd 2260, 2260, 2260, 2260 |
131 |
Idct_Rnd1: dd 3612, 3612, 3612, 3612 |
dd 1203, 1203, 1203, 1203 |
132 |
Idct_Rnd2: dd 2271, 2271, 2271, 2271 |
dd 0, 0, 0, 0 |
133 |
Idct_Rnd3: dd 1203, 1203, 1203, 1203 |
dd 120, 120, 120, 120 |
134 |
Idct_Rnd4: dd 1023, 1023, 1023, 1023 |
dd 512, 512, 512, 512 |
135 |
Idct_Rnd5: dd 102, 102, 102, 102 |
dd 512, 512, 512, 512 |
136 |
Idct_Rnd6: dd 398, 398, 398, 398 |
|
137 |
Idct_Rnd7: dd 469, 469, 469, 469 |
times 8 dw (65536>>11) |
138 |
|
times 8 dw ( 3597>>11) |
139 |
Idct_Sparse_Rnd0: times 4 dw (65535>>11) |
times 8 dw ( 2260>>11) |
|
Idct_Sparse_Rnd1: times 4 dw ( 3612>>11) |
|
|
Idct_Sparse_Rnd2: times 4 dw ( 2271>>11) |
|
140 |
; other rounders are zero... |
; other rounders are zero... |
141 |
|
|
142 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
143 |
; Forward DCT tables |
; Forward DCT tables |
144 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
145 |
|
|
146 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
147 |
fTab1: |
fTab1: |
148 |
dw 0x4000, 0x4000, 0x58c5, 0x4b42, |
dw 0x4000, 0x4000, 0x58c5, 0x4b42, |
149 |
dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7, |
dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7, |
185 |
dw 0x28ba, 0x9dac, 0x14c3, 0xc4df |
dw 0x28ba, 0x9dac, 0x14c3, 0xc4df |
186 |
|
|
187 |
|
|
188 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
189 |
Fdct_Rnd0: dw 6,8,8,8, 6,8,8,8 |
Fdct_Rnd0: dw 6,8,8,8, 6,8,8,8 |
190 |
Fdct_Rnd1: dw 8,8,8,8, 8,8,8,8 |
Fdct_Rnd1: dw 8,8,8,8, 8,8,8,8 |
191 |
Fdct_Rnd2: dw 10,8,8,8, 8,8,8,8 |
Fdct_Rnd2: dw 10,8,8,8, 8,8,8,8 |
195 |
; Code |
; Code |
196 |
;============================================================================= |
;============================================================================= |
197 |
|
|
198 |
SECTION .text |
TEXT |
199 |
|
|
200 |
cglobal idct_sse2_skal |
cglobal idct_sse2_skal |
|
cglobal idct_sse2_sparse_skal |
|
201 |
cglobal fdct_sse2_skal |
cglobal fdct_sse2_skal |
202 |
|
|
203 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
206 |
|
|
207 |
%macro iMTX_MULT 4 ; %1=src, %2 = Table to use, %3=rounder, %4=Shift |
%macro iMTX_MULT 4 ; %1=src, %2 = Table to use, %3=rounder, %4=Shift |
208 |
|
|
209 |
movdqa xmm0, [ecx+%1*16] ; xmm0 = [01234567] |
movdqa xmm0, [_ECX+%1*16] ; xmm0 = [01234567] |
210 |
|
|
211 |
|
pshuflw xmm0, xmm0, 11011000b ; [02134567] ; these two shufflings could be |
212 |
|
pshufhw xmm0, xmm0, 11011000b ; [02134657] ; integrated in zig-zag orders |
213 |
|
|
|
pshuflw xmm0, xmm0, 11011000b ; [0213] |
|
|
pshufhw xmm0, xmm0, 11011000b ; [02134657] |
|
214 |
pshufd xmm4, xmm0, 00000000b ; [02020202] |
pshufd xmm4, xmm0, 00000000b ; [02020202] |
215 |
pshufd xmm5, xmm0, 10101010b ; [46464646] |
pshufd xmm5, xmm0, 10101010b ; [46464646] |
216 |
pshufd xmm6, xmm0, 01010101b ; [13131313] |
pshufd xmm6, xmm0, 01010101b ; [13131313] |
232 |
psrad xmm4, %4 ; => out [7654] |
psrad xmm4, %4 ; => out [7654] |
233 |
|
|
234 |
packssdw xmm6, xmm4 ; [01237654] |
packssdw xmm6, xmm4 ; [01237654] |
235 |
|
|
236 |
pshufhw xmm6, xmm6, 00011011b ; [01234567] |
pshufhw xmm6, xmm6, 00011011b ; [01234567] |
237 |
|
|
238 |
movdqa [ecx+%1*16], xmm6 |
movdqa [_ECX+%1*16], xmm6 |
239 |
|
|
240 |
%endmacro |
%endmacro |
241 |
|
|
305 |
movdqa xmm3, [%1+0*16] ; x0 |
movdqa xmm3, [%1+0*16] ; x0 |
306 |
movdqa xmm6, [%1+4*16] ; x4 |
movdqa xmm6, [%1+4*16] ; x4 |
307 |
|
|
308 |
|
movdqa [%1 ], xmm2 ; we spill 1 reg to perform safe butterflies |
309 |
|
|
310 |
|
movdqa xmm2, xmm3 |
311 |
psubsw xmm3, xmm6 ; x0-x4 = tm04 |
psubsw xmm3, xmm6 ; x0-x4 = tm04 |
312 |
paddsw xmm6, xmm6 ; 2.x4 |
paddsw xmm6, xmm2 ; x0+x4 = tp04 |
|
paddsw xmm6, xmm3 ; x0+x4 = tp04 |
|
313 |
|
|
314 |
psubsw xmm3, xmm5 ; tm04-tm26 = a2 |
movdqa xmm2, xmm6 |
315 |
psubsw xmm6, xmm7 ; tp04-tp26 = a3 |
psubsw xmm6, xmm7 |
316 |
paddsw xmm5, xmm5 ; 2.tm26 |
paddsw xmm7, xmm2 |
317 |
paddsw xmm7, xmm7 ; 2.tp26 |
movdqa xmm2, xmm3 |
318 |
paddsw xmm5, xmm3 ; tm04+tm26 = a1 |
psubsw xmm3, xmm5 |
319 |
paddsw xmm7, xmm6 ; tp04+tp26 = a0 |
paddsw xmm5, xmm2 |
320 |
|
|
321 |
psubsw xmm5, xmm0 ; a1-b1 |
movdqa xmm2, xmm5 |
322 |
psubsw xmm3, xmm4 ; a2-b2 |
psubsw xmm5, xmm0 |
323 |
paddsw xmm0, xmm0 ; 2.b1 |
paddsw xmm0, xmm2 |
324 |
paddsw xmm4, xmm4 ; 2.b2 |
movdqa xmm2, xmm3 |
325 |
paddsw xmm0, xmm5 ; a1+b1 |
psubsw xmm3, xmm4 |
326 |
paddsw xmm4, xmm3 ; a2+b2 |
paddsw xmm4, xmm2 |
327 |
|
|
328 |
|
movdqa xmm2, [%1] |
329 |
|
|
330 |
psraw xmm5, 6 ; out6 |
psraw xmm5, 6 ; out6 |
331 |
psraw xmm3, 6 ; out5 |
psraw xmm3, 6 ; out5 |
351 |
psraw xmm2, 6 ; out3 |
psraw xmm2, 6 ; out3 |
352 |
psraw xmm6, 6 ; out4 |
psraw xmm6, 6 ; out4 |
353 |
|
|
354 |
|
; store result |
355 |
|
|
356 |
movdqa [%1+0*16], xmm1 |
movdqa [%1+0*16], xmm1 |
357 |
movdqa [%1+3*16], xmm2 |
movdqa [%1+3*16], xmm2 |
358 |
movdqa [%1+4*16], xmm6 |
movdqa [%1+4*16], xmm6 |
359 |
movdqa [%1+7*16], xmm7 |
movdqa [%1+7*16], xmm7 |
|
%endmacro |
|
|
|
|
|
;----------------------------------------------------------------------------- |
|
|
; Function idct (the straightforward version) |
|
|
;----------------------------------------------------------------------------- |
|
360 |
|
|
361 |
ALIGN 16 |
%endmacro |
|
; idct_sse2_skal -- unconditional variant: every one of the 8 rows is run
; through iMTX_MULT (no null-row test), then iLLM_PASS is applied to the block.
;   In : dword at [esp+4] = pointer to the block (presumably the first stack
;        argument of a 32-bit cdecl-style call -- confirm against the C
;        prototype). iMTX_MULT addresses row r as [ecx + r*16].
;   Clobbers: ecx, plus whatever iMTX_MULT / iLLM_PASS use.
; iMTX_MULT arguments are: row index, coefficient table, rounder, shift.
; The table order by row is iTab1, iTab2, iTab3, iTab4, iTab1, iTab4, iTab3,
; iTab2; each row has its own rounder Idct_Rnd0..7 and all rows shift by 11.
idct_sse2_skal: |
|
|
mov ecx, [esp+4] |
|
|
iMTX_MULT 0, iTab1, Idct_Rnd0, 11 |
|
|
iMTX_MULT 1, iTab2, Idct_Rnd1, 11 |
|
|
iMTX_MULT 2, iTab3, Idct_Rnd2, 11 |
|
|
iMTX_MULT 3, iTab4, Idct_Rnd3, 11 |
|
|
iMTX_MULT 4, iTab1, Idct_Rnd4, 11 |
|
|
iMTX_MULT 5, iTab4, Idct_Rnd5, 11 |
|
|
iMTX_MULT 6, iTab3, Idct_Rnd6, 11 |
|
|
iMTX_MULT 7, iTab2, Idct_Rnd7, 11 |
|
|
; second pass over the whole block, starting at its base address
iLLM_PASS ecx+0 |
|
|
ret |
|
362 |
|
|
363 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
364 |
; Helper macro TEST_ROW (test a null row) |
; Helper macro TEST_ROW (test a null row) |
365 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
366 |
|
|
367 |
; TEST_ROW src, label -- branch to <label> when the row at <src> is all zero.
; Loads the values at src+0 and src+8, ORs in those at src+4 and src+12, ORs
; the two partial results together and takes `jz near` to %2 when the combined
; value is zero. Clobbers the two scratch registers and the flags.
; Each statement appears twice below: once with eax/edx and once with
; _EAX/_EDX (two revisions of the same macro shown together).
; NOTE(review): _EAX/_EDX are not defined in the visible text (presumably
; supplied by the included nasm.inc). The 0/4/8/12 offsets span exactly
; 16 bytes only for 32-bit operands -- confirm what they expand to.
%macro TEST_ROW 2 ; %1:src, %2:label x8 |
%macro TEST_ROW 2 ; %1:src, %2:label x8 |
368 |
mov eax, [%1 ] |
mov _EAX, [%1 ] |
369 |
mov edx, [%1+ 8] |
mov _EDX, [%1+ 8] |
370 |
or eax, [%1+ 4] |
or _EAX, [%1+ 4] |
371 |
or edx, [%1+12] |
or _EDX, [%1+12] |
372 |
or eax, edx |
or _EAX, _EDX |
373 |
jz near %2 |
jz near %2 |
374 |
%endmacro |
%endmacro |
375 |
|
|
376 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
377 |
; Function idct (this one skips null rows) |
; Function idct (this one skips null rows) |
378 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
379 |
|
; IEEE1180 and Walken compatible version |
380 |
|
|
381 |
; Null-row-skipping inverse DCT. Two revisions are shown interleaved here:
;   * lines using ecx / [esp+4] / Idct_RndN / Idct_Sparse_RndN / movq mm0 and
;     labels without a trailing colon appear to belong to
;     idct_sse2_sparse_skal;
;   * lines using _ECX / prm1 / Walken_Idct_Rounders / movdqa xmm0 /
;     PUSH_XMM6_XMM7 / POP_XMM6_XMM7 / ENDFUNC appear to belong to the newer
;     idct_sse2_skal.
; Flow, common to both: for each row r = 0..7, TEST_ROW checks the 16 bytes at
; src + 16*r. A non-null row goes through iMTX_MULT r (table order iTab1,
; iTab2, iTab3, iTab4, iTab1, iTab4, iTab3, iTab2; shift 11). A null row 0..2
; is overwritten with a precomputed constant (.RowN_Round). A null row 3..7 is
; simply skipped by jumping to the next row label. iLLM_PASS then runs on the
; whole block.
; NOTE(review): the mm0 variant writes an MMX register and no emms is visible
; in this function before ret -- confirm the MMX/x87 state is handled
; elsewhere.
; NOTE(review): the labels written without ':' (.Row0_Round, .Row1, ...) rely
; on the assembler accepting a lone identifier as a label -- confirm the build
; does not warn or fail on them.
ALIGN 16 |
ALIGN SECTION_ALIGN |
382 |
idct_sse2_sparse_skal: |
idct_sse2_skal: |
383 |
|
|
384 |
mov ecx, [esp+ 4] ; Src |
PUSH_XMM6_XMM7 |
385 |
|
|
386 |
TEST_ROW ecx, .Row0_Round |
mov _ECX, prm1 ; Src |
387 |
iMTX_MULT 0, iTab1, Idct_Rnd0, 11 |
|
388 |
|
TEST_ROW _ECX, .Row0_Round |
389 |
|
iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11 |
390 |
jmp .Row1 |
jmp .Row1 |
391 |
; Null row 0: fill the 16-byte row with a constant. The mm0 variant stores
; 8 bytes twice; the xmm0 variant stores 16 bytes once.
; NOTE(review): the offset here is written "8*0" while rows 1 and 2 use
; "16*1" / "16*2". The value is the same for row 0, but the spelling is
; inconsistent.
.Row0_Round |
.Row0_Round: |
392 |
movq mm0, [Idct_Sparse_Rnd0] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0] |
393 |
movq [ecx ], mm0 |
movdqa [_ECX ], xmm0 |
394 |
movq [ecx+8], mm0 |
|
395 |
|
.Row1: |
396 |
.Row1 |
TEST_ROW _ECX+16, .Row1_Round |
397 |
TEST_ROW ecx+16, .Row1_Round |
iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11 |
|
iMTX_MULT 1, iTab2, Idct_Rnd1, 11 |
|
398 |
jmp .Row2 |
jmp .Row2 |
399 |
; Null row 1: same constant-fill scheme as row 0.
.Row1_Round |
.Row1_Round: |
400 |
movq mm0, [Idct_Sparse_Rnd1] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1] |
401 |
movq [ecx+16 ], mm0 |
movdqa [_ECX+16 ], xmm0 |
402 |
movq [ecx+16+8], mm0 |
|
403 |
|
.Row2: |
404 |
.Row2 |
TEST_ROW _ECX+32, .Row2_Round |
405 |
TEST_ROW ecx+32, .Row2_Round |
iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11 |
|
iMTX_MULT 2, iTab3, Idct_Rnd2, 11 |
|
406 |
jmp .Row3 |
jmp .Row3 |
407 |
; Null row 2: same constant-fill scheme as rows 0 and 1.
.Row2_Round |
.Row2_Round: |
408 |
movq mm0, [Idct_Sparse_Rnd2] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2] |
409 |
movq [ecx+32 ], mm0 |
movdqa [_ECX+32 ], xmm0 |
410 |
movq [ecx+32+8], mm0 |
|
411 |
|
; Rows 3..7: a null row is left untouched and control goes straight to the
; next row label (the data section notes that the other rounders are zero).
; NOTE(review): the "jmp .Row4" / "jmp .Row5" lines of the ecx variant appear
; to target the label that immediately follows them -- likely redundant.
.Row3: |
412 |
.Row3 |
TEST_ROW _ECX+48, .Row4 |
413 |
TEST_ROW ecx+48, .Row4 |
iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11 |
414 |
iMTX_MULT 3, iTab4, Idct_Rnd3, 11 |
|
415 |
jmp .Row4 |
.Row4: |
416 |
|
TEST_ROW _ECX+64, .Row5 |
417 |
.Row4 |
iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11 |
418 |
TEST_ROW ecx+64, .Row5 |
|
419 |
iMTX_MULT 4, iTab1, Idct_Rnd4, 11 |
.Row5: |
420 |
jmp .Row5 |
TEST_ROW _ECX+80, .Row6 |
421 |
|
iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11 |
422 |
.Row5 |
|
423 |
TEST_ROW ecx+80, .Row6 |
.Row6: |
424 |
iMTX_MULT 5, iTab4, Idct_Rnd5, 11 |
TEST_ROW _ECX+96, .Row7 |
425 |
|
iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11 |
426 |
.Row6 |
|
427 |
TEST_ROW ecx+96, .Row7 |
.Row7: |
428 |
iMTX_MULT 6, iTab3, Idct_Rnd6, 11 |
TEST_ROW _ECX+112, .End |
429 |
|
iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11 |
430 |
.Row7 |
.End: |
431 |
TEST_ROW ecx+112, .End |
|
432 |
iMTX_MULT 7, iTab2, Idct_Rnd7, 11 |
; All rows handled: run the second pass over the whole block.
iLLM_PASS _ECX |
|
.End |
|
433 |
|
|
434 |
iLLM_PASS ecx+0 |
POP_XMM6_XMM7 |
435 |
ret |
ret |
436 |
|
ENDFUNC |
437 |
|
|
438 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
439 |
; Helper macro fLLM_PASS |
; Helper macro fLLM_PASS |
545 |
|
|
546 |
%macro fMTX_MULT 3 ; %1=src, %2 = Coeffs, %3=rounders |
%macro fMTX_MULT 3 ; %1=src, %2 = Coeffs, %3=rounders |
547 |
|
|
548 |
movdqa xmm0, [ecx+%1*16+0] ; xmm0 = [0123][4567] |
movdqa xmm0, [_ECX+%1*16+0] ; xmm0 = [0123][4567] |
549 |
pshufhw xmm1, xmm0, 00011011b ; xmm1 = [----][7654] |
pshufhw xmm1, xmm0, 00011011b ; xmm1 = [----][7654] |
550 |
pshufd xmm0, xmm0, 01000100b |
pshufd xmm0, xmm0, 01000100b |
551 |
pshufd xmm1, xmm1, 11101110b |
pshufd xmm1, xmm1, 11101110b |
579 |
|
|
580 |
psraw xmm0, 4 ; => [-2048, 2047] |
psraw xmm0, 4 ; => [-2048, 2047] |
581 |
|
|
582 |
movdqa [ecx+%1*16+0], xmm0 |
movdqa [_ECX+%1*16+0], xmm0 |
583 |
%endmacro |
%endmacro |
584 |
|
|
585 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
586 |
; Function Forward DCT |
; Function Forward DCT |
587 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
588 |
|
|
589 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
590 |
fdct_sse2_skal: |
fdct_sse2_skal: |
591 |
mov ecx, [esp+4] |
PUSH_XMM6_XMM7 |
592 |
fLLM_PASS ecx+0, 3 |
mov _ECX, prm1 |
593 |
|
fLLM_PASS _ECX+0, 3 |
594 |
fMTX_MULT 0, fTab1, Fdct_Rnd0 |
fMTX_MULT 0, fTab1, Fdct_Rnd0 |
595 |
fMTX_MULT 1, fTab2, Fdct_Rnd2 |
fMTX_MULT 1, fTab2, Fdct_Rnd2 |
596 |
fMTX_MULT 2, fTab3, Fdct_Rnd1 |
fMTX_MULT 2, fTab3, Fdct_Rnd1 |
599 |
fMTX_MULT 5, fTab4, Fdct_Rnd1 |
fMTX_MULT 5, fTab4, Fdct_Rnd1 |
600 |
fMTX_MULT 6, fTab3, Fdct_Rnd1 |
fMTX_MULT 6, fTab3, Fdct_Rnd1 |
601 |
fMTX_MULT 7, fTab2, Fdct_Rnd1 |
fMTX_MULT 7, fTab2, Fdct_Rnd1 |
602 |
|
|
603 |
|
POP_XMM6_XMM7 |
604 |
ret |
ret |
605 |
|
ENDFUNC |
606 |
|
|
607 |
|
|
608 |
|
%ifidn __OUTPUT_FORMAT__,elf |
609 |
|
section ".note.GNU-stack" noalloc noexec nowrite progbits |
610 |
|
%endif |
611 |
|
|