--- branches/dev-api-4/xvidcore/src/dct/x86_asm/idct_mmx.asm 2003/10/27 00:50:05 1189
+++ branches/dev-api-4/xvidcore/src/dct/x86_asm/idct_mmx.asm 2003/10/27 01:03:06 1190
@@ -1,3 +1,30 @@
+;/****************************************************************************
+; *
+; * XVID MPEG-4 VIDEO CODEC
+; * - MMX and XMM inverse discrete cosine transform -
+; *
+; * Copyright(C) 2001 Peter Ross
+; *
+; * This program is free software; you can redistribute it and/or modify it
+; * under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 2 of the License, or
+; * (at your option) any later version.
+; *
+; * This program is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+; *
+; * $Id: idct_mmx.asm,v 1.6.2.1 2003-10-27 01:03:06 edgomez Exp $
+; *
+; ***************************************************************************/
+
+; ****************************************************************************
+;
 ; Originally provided by Intel at AP-922
 ; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
 ; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
@@ -5,23 +32,23 @@
 ; New macro implements a column part for precise iDCT
 ; The routine precision now satisfies IEEE standard 1180-1990.
 ;
-; Copyright (c) 2000-2001 Peter Gubanov
-; Rounding trick Copyright (c) 2000 Michel Lespinasse
+; Copyright(C) 2000-2001 Peter Gubanov
+; Rounding trick Copyright(C) 2000 Michel Lespinasse
 ;
 ; http://www.elecard.com/peter/idct.html
 ; http://www.linuxvideo.org/mpeg2dec/
 ;
-;=============================================================================
+; ***************************************************************************/
 ;
 ; These examples contain code fragments for first stage iDCT 8x8
 ; (for rows) and first stage DCT 8x8 (for columns)
 ;
-;=============================================================================
-;
-; 04.11.2001 nasm conversion; peter ross
-;
-bits 32
+BITS 32
+
+;=============================================================================
+; Macros and other preprocessor constants
+;=============================================================================

 %macro cglobal 1
 %ifdef PREFIX
@@ -32,65 +59,94 @@
 %endif
 %endmacro

-%define BITS_INV_ACC 5 ; 4 or 5 for IEEE
+%define BITS_INV_ACC 5 ; 4 or 5 for IEEE
 %define SHIFT_INV_ROW 16 - BITS_INV_ACC
 %define SHIFT_INV_COL 1 + BITS_INV_ACC
 %define RND_INV_ROW 1024 * (6 - BITS_INV_ACC) ; 1 << (SHIFT_INV_ROW-1)
 %define RND_INV_COL 16 * (BITS_INV_ACC - 3) ; 1 << (SHIFT_INV_COL-1)
-%define RND_INV_CORR RND_INV_COL - 1 ; correction -1.0 and round
+%define RND_INV_CORR RND_INV_COL - 1 ; correction -1.0 and round

-%define BITS_FRW_ACC 3 ; 2 or 3 for accuracy
+%define BITS_FRW_ACC 3 ; 2 or 3 for accuracy
 %define SHIFT_FRW_COL BITS_FRW_ACC
 %define SHIFT_FRW_ROW BITS_FRW_ACC + 17
-%define RND_FRW_ROW 262144 * (BITS_FRW_ACC - 1) ; 1 << (SHIFT_FRW_ROW-1)
+%define RND_FRW_ROW 262144*(BITS_FRW_ACC - 1) ; 1 << (SHIFT_FRW_ROW-1)

-%ifdef FORMAT_COFF
-section .data data
-%else
-section .data data align=16
-%endif
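The "; 1 << (SHIFT_*-1)" comments above assert that every RND_* constant is exactly half of the divisor applied by its matching shift. A small standalone C sketch (illustrative only; the identifiers simply mirror the %define names, and the identities hold for both permitted accuracy settings) confirms the arithmetic:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    const int BITS_INV_ACC  = 5;                         /* 4 or 5 for IEEE */
    const int SHIFT_INV_ROW = 16 - BITS_INV_ACC;         /* 11 */
    const int SHIFT_INV_COL = 1 + BITS_INV_ACC;          /* 6  */
    const int RND_INV_ROW   = 1024 * (6 - BITS_INV_ACC);
    const int RND_INV_COL   = 16 * (BITS_INV_ACC - 3);

    const int BITS_FRW_ACC  = 3;                         /* 2 or 3 for accuracy */
    const int SHIFT_FRW_ROW = BITS_FRW_ACC + 17;         /* 20 */
    const int RND_FRW_ROW   = 262144 * (BITS_FRW_ACC - 1);

    /* Each rounder is half the final divisor, i.e. 1 << (shift - 1). */
    assert(RND_INV_ROW == 1 << (SHIFT_INV_ROW - 1));     /* 1024   */
    assert(RND_INV_COL == 1 << (SHIFT_INV_COL - 1));     /* 32     */
    assert(RND_FRW_ROW == 1 << (SHIFT_FRW_ROW - 1));     /* 524288 */

    printf("rounding constants match their shifts\n");
    return 0;
}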
+;============================================================================= +; Local Data (Read Only) +;============================================================================= -align 16 +SECTION .rodata -one_corr dw 1, 1, 1, 1 -round_inv_row dd RND_INV_ROW, RND_INV_ROW -round_inv_col dw RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL -round_inv_corr dw RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR -round_frw_row dd RND_FRW_ROW, RND_FRW_ROW - tg_1_16 dw 13036, 13036, 13036, 13036 ; tg * (2<<16) + 0.5 - tg_2_16 dw 27146, 27146, 27146, 27146 ; tg * (2<<16) + 0.5 - tg_3_16 dw -21746, -21746, -21746, -21746 ; tg * (2<<16) + 0.5 - cos_4_16 dw -19195, -19195, -19195, -19195 ; cos * (2<<16) + 0.5 -ocos_4_16 dw 23170, 23170, 23170, 23170 ; cos * (2<<15) + 0.5 +;----------------------------------------------------------------------------- +; Various memory constants (trigonometric values or rounding values) +;----------------------------------------------------------------------------- - otg_3_16 dw 21895, 21895, 21895, 21895 ; tg * (2<<16) + 0.5 +ALIGN 16 +one_corr: + dw 1, 1, 1, 1 +round_inv_row: + dd RND_INV_ROW, RND_INV_ROW +round_inv_col: + dw RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL +round_inv_corr: + dw RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR +round_frw_row: + dd RND_FRW_ROW, RND_FRW_ROW +tg_1_16: + dw 13036, 13036, 13036, 13036 ; tg * (2<<16) + 0.5 +tg_2_16: + dw 27146, 27146, 27146, 27146 ; tg * (2<<16) + 0.5 +tg_3_16: + dw -21746, -21746, -21746, -21746 ; tg * (2<<16) + 0.5 +cos_4_16: + dw -19195, -19195, -19195, -19195 ; cos * (2<<16) + 0.5 +ocos_4_16: + dw 23170, 23170, 23170, 23170 ; cos * (2<<15) + 0.5 +otg_3_16: + dw 21895, 21895, 21895, 21895 ; tg * (2<<16) + 0.5 %if SHIFT_INV_ROW == 12 ; assume SHIFT_INV_ROW == 12 -rounder_0 dd 65536, 65536 -rounder_4 dd 0, 0 -rounder_1 dd 7195, 7195 -rounder_7 dd 1024, 1024 -rounder_2 dd 4520, 4520 -rounder_6 dd 1024, 1024 -rounder_3 dd 2407, 2407 -rounder_5 dd 240, 240 +rounder_0: + dd 65536, 65536 +rounder_4: + dd 0, 0 +rounder_1: + dd 7195, 7195 +rounder_7 + dd 1024, 1024 +rounder_2: + dd 4520, 4520 +rounder_6: + dd 1024, 1024 +rounder_3: + dd 2407, 2407 +rounder_5: + dd 240, 240 %elif SHIFT_INV_ROW == 11 ; assume SHIFT_INV_ROW == 11 -rounder_0 dd 65536, 65536 -rounder_4 dd 0, 0 -rounder_1 dd 3597, 3597 -rounder_7 dd 512, 512 -rounder_2 dd 2260, 2260 -rounder_6 dd 512, 512 -rounder_3 dd 1203, 1203 -rounder_5 dd 120, 120 +rounder_0: + dd 65536, 65536 +rounder_4: + dd 0, 0 +rounder_1: + dd 3597, 3597 +rounder_7: + dd 512, 512 +rounder_2: + dd 2260, 2260 +rounder_6: + dd 512, 512 +rounder_3: + dd 1203, 1203 +rounder_5: + dd 120, 120 %else -%error invalid _SHIFT_INV_ROW_ +%error invalid SHIFT_INV_ROW %endif -;============================================================================= +;----------------------------------------------------------------------------- ; ; The first stage iDCT 8x8 - inverse DCTs of rows ; @@ -143,266 +199,211 @@ ; For the constants used, ; FIX(float_const) = (short) (float_const * (1<<15) + 0.5) ; -;============================================================================= +;----------------------------------------------------------------------------- -;============================================================================= -; MMX code -;============================================================================= +;----------------------------------------------------------------------------- +; Tables for mmx processors 
+;----------------------------------------------------------------------------- ; Table for rows 0,4 - constants are multiplied by cos_4_16 - -tab_i_04 dw 16384, 16384, 16384, -16384 ; movq-> w06 w04 w02 w00 - dw 21407, 8867, 8867, -21407 ; w07 w05 w03 w01 - dw 16384, -16384, 16384, 16384 ; w14 w12 w10 w08 - dw -8867, 21407, -21407, -8867 ; w15 w13 w11 w09 - dw 22725, 12873, 19266, -22725 ; w22 w20 w18 w16 - dw 19266, 4520, -4520, -12873 ; w23 w21 w19 w17 - dw 12873, 4520, 4520, 19266 ; w30 w28 w26 w24 - dw -22725, 19266, -12873, -22725 ; w31 w29 w27 w25 +tab_i_04_mmx: + dw 16384, 16384, 16384, -16384 ; movq-> w06 w04 w02 w00 + dw 21407, 8867, 8867, -21407 ; w07 w05 w03 w01 + dw 16384, -16384, 16384, 16384 ; w14 w12 w10 w08 + dw -8867, 21407, -21407, -8867 ; w15 w13 w11 w09 + dw 22725, 12873, 19266, -22725 ; w22 w20 w18 w16 + dw 19266, 4520, -4520, -12873 ; w23 w21 w19 w17 + dw 12873, 4520, 4520, 19266 ; w30 w28 w26 w24 + dw -22725, 19266, -12873, -22725 ; w31 w29 w27 w25 ; Table for rows 1,7 - constants are multiplied by cos_1_16 - -tab_i_17 dw 22725, 22725, 22725, -22725 ; movq-> w06 w04 w02 w00 - dw 29692, 12299, 12299, -29692 ; w07 w05 w03 w01 - dw 22725, -22725, 22725, 22725 ; w14 w12 w10 w08 - dw -12299, 29692, -29692, -12299 ; w15 w13 w11 w09 - dw 31521, 17855, 26722, -31521 ; w22 w20 w18 w16 - dw 26722, 6270, -6270, -17855 ; w23 w21 w19 w17 - dw 17855, 6270, 6270, 26722 ; w30 w28 w26 w24 - dw -31521, 26722, -17855, -31521 ; w31 w29 w27 w25 +tab_i_17_mmx: + dw 22725, 22725, 22725, -22725 ; movq-> w06 w04 w02 w00 + dw 29692, 12299, 12299, -29692 ; w07 w05 w03 w01 + dw 22725, -22725, 22725, 22725 ; w14 w12 w10 w08 + dw -12299, 29692, -29692, -12299 ; w15 w13 w11 w09 + dw 31521, 17855, 26722, -31521 ; w22 w20 w18 w16 + dw 26722, 6270, -6270, -17855 ; w23 w21 w19 w17 + dw 17855, 6270, 6270, 26722 ; w30 w28 w26 w24 + dw -31521, 26722, -17855, -31521 ; w31 w29 w27 w25 ; Table for rows 2,6 - constants are multiplied by cos_2_16 - -tab_i_26 dw 21407, 21407, 21407, -21407 ; movq-> w06 w04 w02 w00 - dw 27969, 11585, 11585, -27969 ; w07 w05 w03 w01 - dw 21407, -21407, 21407, 21407 ; w14 w12 w10 w08 - dw -11585, 27969, -27969, -11585 ; w15 w13 w11 w09 - dw 29692, 16819, 25172, -29692 ; w22 w20 w18 w16 - dw 25172, 5906, -5906, -16819 ; w23 w21 w19 w17 - dw 16819, 5906, 5906, 25172 ; w30 w28 w26 w24 - dw -29692, 25172, -16819, -29692 ; w31 w29 w27 w25 +tab_i_26_mmx: + dw 21407, 21407, 21407, -21407 ; movq-> w06 w04 w02 w00 + dw 27969, 11585, 11585, -27969 ; w07 w05 w03 w01 + dw 21407, -21407, 21407, 21407 ; w14 w12 w10 w08 + dw -11585, 27969, -27969, -11585 ; w15 w13 w11 w09 + dw 29692, 16819, 25172, -29692 ; w22 w20 w18 w16 + dw 25172, 5906, -5906, -16819 ; w23 w21 w19 w17 + dw 16819, 5906, 5906, 25172 ; w30 w28 w26 w24 + dw -29692, 25172, -16819, -29692 ; w31 w29 w27 w25 ; Table for rows 3,5 - constants are multiplied by cos_3_16 - -tab_i_35 dw 19266, 19266, 19266, -19266 ; movq-> w06 w04 w02 w00 - dw 25172, 10426, 10426, -25172 ; w07 w05 w03 w01 - dw 19266, -19266, 19266, 19266 ; w14 w12 w10 w08 - dw -10426, 25172, -25172, -10426 ; w15 w13 w11 w09 - dw 26722, 15137, 22654, -26722 ; w22 w20 w18 w16 - dw 22654, 5315, -5315, -15137 ; w23 w21 w19 w17 - dw 15137, 5315, 5315, 22654 ; w30 w28 w26 w24 - dw -26722, 22654, -15137, -26722 ; w31 w29 w27 w25 +tab_i_35_mmx: + dw 19266, 19266, 19266, -19266 ; movq-> w06 w04 w02 w00 + dw 25172, 10426, 10426, -25172 ; w07 w05 w03 w01 + dw 19266, -19266, 19266, 19266 ; w14 w12 w10 w08 + dw -10426, 25172, -25172, -10426 ; w15 w13 w11 w09 + dw 26722, 15137, 
22654, -26722 ; w22 w20 w18 w16 + dw 22654, 5315, -5315, -15137 ; w23 w21 w19 w17 + dw 15137, 5315, 5315, 22654 ; w30 w28 w26 w24 + dw -26722, 22654, -15137, -26722 ; w31 w29 w27 w25 ;----------------------------------------------------------------------------- - -; -; DCT_8_INV_ROW_1 INP, OUT, TABLE, ROUNDER -; - -%macro DCT_8_INV_ROW_1 4 - - movq mm0, [%1] ; 0 ; x3 x2 x1 x0 - - movq mm1, [%1+8] ; 1 ; x7 x6 x5 x4 - movq mm2, mm0 ; 2 ; x3 x2 x1 x0 - - movq mm3, [%3] ; 3 ; w06 w04 w02 w00 - punpcklwd mm0, mm1 ; x5 x1 x4 x0 - - movq mm5, mm0 ; 5 ; x5 x1 x4 x0 - punpckldq mm0, mm0 ; x4 x0 x4 x0 - - movq mm4, [%3+8] ; 4 ; w07 w05 w03 w01 - punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2 - - pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00 - movq mm6, mm2 ; 6 ; x7 x3 x6 x2 - - movq mm1, [%3+32] ; 1 ; w22 w20 w18 w16 - punpckldq mm2, mm2 ; x6 x2 x6 x2 - - pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01 - punpckhdq mm5, mm5 ; x5 x1 x5 x1 - - pmaddwd mm0, [%3+16] ; x4*w14+x0*w12 x4*w10+x0*w08 - punpckhdq mm6, mm6 ; x7 x3 x7 x3 - - movq mm7, [%3+40] ; 7 ; w23 w21 w19 w17 - pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16 - - paddd mm3, [%4] ; +%4 - pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17 - - pmaddwd mm2, [%3+24] ; x6*w15+x2*w13 x6*w11+x2*w09 - paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) - - pmaddwd mm5, [%3+48] ; x5*w30+x1*w28 x5*w26+x1*w24 - movq mm4, mm3 ; 4 ; a1 a0 - - pmaddwd mm6, [%3+56] ; x7*w31+x3*w29 x7*w27+x3*w25 - paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) - - paddd mm0, [%4] ; +%4 - psubd mm3, mm1 ; a1-b1 a0-b0 - - psrad mm3, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 - paddd mm1, mm4 ; 4 ; a1+b1 a0+b0 - - paddd mm0, mm2 ; 2 ; a3=sum(even3) a2=sum(even2) - psrad mm1, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 - - paddd mm5, mm6 ; 6 ; b3=sum(odd3) b2=sum(odd2) - movq mm4, mm0 ; 4 ; a3 a2 - - paddd mm0, mm5 ; a3+b3 a2+b2 - psubd mm4, mm5 ; 5 ; a3-b3 a2-b2 - - psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 - psrad mm4, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 - - packssdw mm1, mm0 ; 0 ; y3 y2 y1 y0 - packssdw mm4, mm3 ; 3 ; y6 y7 y4 y5 - - movq mm7, mm4 ; 7 ; y6 y7 y4 y5 - psrld mm4, 16 ; 0 y6 0 y4 - - pslld mm7, 16 ; y7 0 y5 0 - movq [%2], mm1 ; 1 ; save y3 y2 y1 y0 - - por mm7, mm4 ; 4 ; y7 y6 y5 y4 - movq [%2+8], mm7 ; 7 ; save y7 y6 y5 y4 -%endmacro - - - - -;============================================================================= -; code for Pentium III -;============================================================================= +; Tables for xmm processors +;----------------------------------------------------------------------------- ; %3 for rows 0,4 - constants are multiplied by cos_4_16 - -tab_i_04_sse dw 16384, 21407, 16384, 8867 ; movq-> w05 w04 w01 w00 - dw 16384, 8867, -16384, -21407 ; w07 w06 w03 w02 - dw 16384, -8867, 16384, -21407 ; w13 w12 w09 w08 - dw -16384, 21407, 16384, -8867 ; w15 w14 w11 w10 - dw 22725, 19266, 19266, -4520 ; w21 w20 w17 w16 - dw 12873, 4520, -22725, -12873 ; w23 w22 w19 w18 - dw 12873, -22725, 4520, -12873 ; w29 w28 w25 w24 - dw 4520, 19266, 19266, -22725 ; w31 w30 w27 w26 +tab_i_04_xmm: + dw 16384, 21407, 16384, 8867 ; movq-> w05 w04 w01 w00 + dw 16384, 8867, -16384, -21407 ; w07 w06 w03 w02 + dw 16384, -8867, 16384, -21407 ; w13 w12 w09 w08 + dw -16384, 21407, 16384, -8867 ; w15 w14 w11 w10 + dw 22725, 19266, 19266, -4520 ; w21 w20 w17 w16 + dw 12873, 4520, -22725, -12873 ; w23 w22 w19 w18 + dw 12873, -22725, 4520, -12873 ; w29 w28 w25 w24 + dw 4520, 19266, 19266, -22725 ; w31 w30 w27 w26 ; %3 for rows 1,7 - constants are multiplied by cos_1_16 - -tab_i_17_sse dw 
22725, 29692, 22725, 12299 ; movq-> w05 w04 w01 w00 - dw 22725, 12299, -22725, -29692 ; w07 w06 w03 w02 - dw 22725, -12299, 22725, -29692 ; w13 w12 w09 w08 - dw -22725, 29692, 22725, -12299 ; w15 w14 w11 w10 - dw 31521, 26722, 26722, -6270 ; w21 w20 w17 w16 - dw 17855, 6270, -31521, -17855 ; w23 w22 w19 w18 - dw 17855, -31521, 6270, -17855 ; w29 w28 w25 w24 - dw 6270, 26722, 26722, -31521 ; w31 w30 w27 w26 +tab_i_17_xmm: + dw 22725, 29692, 22725, 12299 ; movq-> w05 w04 w01 w00 + dw 22725, 12299, -22725, -29692 ; w07 w06 w03 w02 + dw 22725, -12299, 22725, -29692 ; w13 w12 w09 w08 + dw -22725, 29692, 22725, -12299 ; w15 w14 w11 w10 + dw 31521, 26722, 26722, -6270 ; w21 w20 w17 w16 + dw 17855, 6270, -31521, -17855 ; w23 w22 w19 w18 + dw 17855, -31521, 6270, -17855 ; w29 w28 w25 w24 + dw 6270, 26722, 26722, -31521 ; w31 w30 w27 w26 ; %3 for rows 2,6 - constants are multiplied by cos_2_16 - -tab_i_26_sse dw 21407, 27969, 21407, 11585 ; movq-> w05 w04 w01 w00 - dw 21407, 11585, -21407, -27969 ; w07 w06 w03 w02 - dw 21407, -11585, 21407, -27969 ; w13 w12 w09 w08 - dw -21407, 27969, 21407, -11585 ; w15 w14 w11 w10 - dw 29692, 25172, 25172, -5906 ; w21 w20 w17 w16 - dw 16819, 5906, -29692, -16819 ; w23 w22 w19 w18 - dw 16819, -29692, 5906, -16819 ; w29 w28 w25 w24 - dw 5906, 25172, 25172, -29692 ; w31 w30 w27 w26 +tab_i_26_xmm: + dw 21407, 27969, 21407, 11585 ; movq-> w05 w04 w01 w00 + dw 21407, 11585, -21407, -27969 ; w07 w06 w03 w02 + dw 21407, -11585, 21407, -27969 ; w13 w12 w09 w08 + dw -21407, 27969, 21407, -11585 ; w15 w14 w11 w10 + dw 29692, 25172, 25172, -5906 ; w21 w20 w17 w16 + dw 16819, 5906, -29692, -16819 ; w23 w22 w19 w18 + dw 16819, -29692, 5906, -16819 ; w29 w28 w25 w24 + dw 5906, 25172, 25172, -29692 ; w31 w30 w27 w26 ; %3 for rows 3,5 - constants are multiplied by cos_3_16 +tab_i_35_xmm: + dw 19266, 25172, 19266, 10426 ; movq-> w05 w04 w01 w00 + dw 19266, 10426, -19266, -25172 ; w07 w06 w03 w02 + dw 19266, -10426, 19266, -25172 ; w13 w12 w09 w08 + dw -19266, 25172, 19266, -10426 ; w15 w14 w11 w10 + dw 26722, 22654, 22654, -5315 ; w21 w20 w17 w16 + dw 15137, 5315, -26722, -15137 ; w23 w22 w19 w18 + dw 15137, -26722, 5315, -15137 ; w29 w28 w25 w24 + dw 5315, 22654, 22654, -26722 ; w31 w30 w27 w26 -tab_i_35_sse dw 19266, 25172, 19266, 10426 ; movq-> w05 w04 w01 w00 - dw 19266, 10426, -19266, -25172 ; w07 w06 w03 w02 - dw 19266, -10426, 19266, -25172 ; w13 w12 w09 w08 - dw -19266, 25172, 19266, -10426 ; w15 w14 w11 w10 - dw 26722, 22654, 22654, -5315 ; w21 w20 w17 w16 - dw 15137, 5315, -26722, -15137 ; w23 w22 w19 w18 - dw 15137, -26722, 5315, -15137 ; w29 w28 w25 w24 - dw 5315, 22654, 22654, -26722 ; w31 w30 w27 w26 +;============================================================================= +; Helper macros for the code +;============================================================================= ;----------------------------------------------------------------------------- +; DCT_8_INV_ROW_MMX INP, OUT, TABLE, ROUNDER +;----------------------------------------------------------------------------- -; -; DCT_8_INV_ROW_1_sse INP, OUT, TABLE, ROUNDER -; - -%macro DCT_8_INV_ROW_1_sse 4 - - movq mm0, [%1] ; 0 ; x3 x2 x1 x0 - - movq mm1, [%1+8] ; 1 ; x7 x6 x5 x4 - movq mm2, mm0 ; 2 ; x3 x2 x1 x0 - - movq mm3, [%3] ; 3 ; w05 w04 w01 w00 - pshufw mm0, mm0, 10001000b ; x2 x0 x2 x0 - - movq mm4, [%3+8] ; 4 ; w07 w06 w03 w02 - movq mm5, mm1 ; 5 ; x7 x6 x5 x4 - pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 - - movq mm6, [%3+32] ; 6 ; w21 w20 w17 w16 - pshufw mm1, mm1, 10001000b ; x6 x4 x6 
x4 - pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 - - movq mm7, [%3+40] ; 7 ; w23 w22 w19 w18 - pshufw mm2, mm2, 11011101b ; x3 x1 x3 x1 - pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 - - pshufw mm5, mm5, 11011101b ; x7 x5 x7 x5 - pmaddwd mm7, mm5 ; x7*w23+x5*w22 x7*w19+x5*w18 - - paddd mm3, [%4] ; +%4 - - pmaddwd mm0, [%3+16] ; x2*w13+x0*w12 x2*w09+x0*w08 - paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) - - pmaddwd mm1, [%3+24] ; x6*w15+x4*w14 x6*w11+x4*w10 - movq mm4, mm3 ; 4 ; a1 a0 - - pmaddwd mm2, [%3+48] ; x3*w29+x1*w28 x3*w25+x1*w24 - paddd mm6, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) - - pmaddwd mm5, [%3+56] ; x7*w31+x5*w30 x7*w27+x5*w26 - paddd mm3, mm6 ; a1+b1 a0+b0 - - paddd mm0, [%4] ; +%4 - psrad mm3, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 - - paddd mm0, mm1 ; 1 ; a3=sum(even3) a2=sum(even2) - psubd mm4, mm6 ; 6 ; a1-b1 a0-b0 - - movq mm7, mm0 ; 7 ; a3 a2 - paddd mm2, mm5 ; 5 ; b3=sum(odd3) b2=sum(odd2) - - paddd mm0, mm2 ; a3+b3 a2+b2 - psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 - - psubd mm7, mm2 ; 2 ; a3-b3 a2-b2 - psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 - - psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 - - packssdw mm3, mm0 ; 0 ; y3 y2 y1 y0 - - packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 - - movq [%2], mm3 ; 3 ; save y3 y2 y1 y0 - pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 - - movq [%2+8], mm7 ; 7 ; save y7 y6 y5 y4 - +%macro DCT_8_INV_ROW_MMX 4 + movq mm0, [%1] ; 0 ; x3 x2 x1 x0 + movq mm1, [%1+8] ; 1 ; x7 x6 x5 x4 + movq mm2, mm0 ; 2 ; x3 x2 x1 x0 + movq mm3, [%3] ; 3 ; w06 w04 w02 w00 + punpcklwd mm0, mm1 ; x5 x1 x4 x0 + movq mm5, mm0 ; 5 ; x5 x1 x4 x0 + punpckldq mm0, mm0 ; x4 x0 x4 x0 + movq mm4, [%3+8] ; 4 ; w07 w05 w03 w01 + punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2 + pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00 + movq mm6, mm2 ; 6 ; x7 x3 x6 x2 + movq mm1, [%3+32] ; 1 ; w22 w20 w18 w16 + punpckldq mm2, mm2 ; x6 x2 x6 x2 + pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01 + punpckhdq mm5, mm5 ; x5 x1 x5 x1 + pmaddwd mm0, [%3+16] ; x4*w14+x0*w12 x4*w10+x0*w08 + punpckhdq mm6, mm6 ; x7 x3 x7 x3 + movq mm7, [%3+40] ; 7 ; w23 w21 w19 w17 + pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16 + paddd mm3, [%4] ; +%4 + pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17 + pmaddwd mm2, [%3+24] ; x6*w15+x2*w13 x6*w11+x2*w09 + paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) + pmaddwd mm5, [%3+48] ; x5*w30+x1*w28 x5*w26+x1*w24 + movq mm4, mm3 ; 4 ; a1 a0 + pmaddwd mm6, [%3+56] ; x7*w31+x3*w29 x7*w27+x3*w25 + paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) + paddd mm0, [%4] ; +%4 + psubd mm3, mm1 ; a1-b1 a0-b0 + psrad mm3, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 + paddd mm1, mm4 ; 4 ; a1+b1 a0+b0 + paddd mm0, mm2 ; 2 ; a3=sum(even3) a2=sum(even2) + psrad mm1, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 + paddd mm5, mm6 ; 6 ; b3=sum(odd3) b2=sum(odd2) + movq mm4, mm0 ; 4 ; a3 a2 + paddd mm0, mm5 ; a3+b3 a2+b2 + psubd mm4, mm5 ; 5 ; a3-b3 a2-b2 + psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 + psrad mm4, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 + packssdw mm1, mm0 ; 0 ; y3 y2 y1 y0 + packssdw mm4, mm3 ; 3 ; y6 y7 y4 y5 + movq mm7, mm4 ; 7 ; y6 y7 y4 y5 + psrld mm4, 16 ; 0 y6 0 y4 + pslld mm7, 16 ; y7 0 y5 0 + movq [%2], mm1 ; 1 ; save y3 y2 y1 y0 + por mm7, mm4 ; 4 ; y7 y6 y5 y4 + movq [%2+8], mm7 ; 7 ; save y7 y6 y5 y4 %endmacro +;----------------------------------------------------------------------------- +; DCT_8_INV_ROW_XMM INP, OUT, TABLE, ROUNDER +;----------------------------------------------------------------------------- -;============================================================================= -; 
-;============================================================================= +%macro DCT_8_INV_ROW_XMM 4 + movq mm0, [%1] ; 0 ; x3 x2 x1 x0 + movq mm1, [%1+8] ; 1 ; x7 x6 x5 x4 + movq mm2, mm0 ; 2 ; x3 x2 x1 x0 + movq mm3, [%3] ; 3 ; w05 w04 w01 w00 + pshufw mm0, mm0, 10001000b ; x2 x0 x2 x0 + movq mm4, [%3+8] ; 4 ; w07 w06 w03 w02 + movq mm5, mm1 ; 5 ; x7 x6 x5 x4 + pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 + movq mm6, [%3+32] ; 6 ; w21 w20 w17 w16 + pshufw mm1, mm1, 10001000b ; x6 x4 x6 x4 + pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 + movq mm7, [%3+40] ; 7 ; w23 w22 w19 w18 + pshufw mm2, mm2, 11011101b ; x3 x1 x3 x1 + pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 + pshufw mm5, mm5, 11011101b ; x7 x5 x7 x5 + pmaddwd mm7, mm5 ; x7*w23+x5*w22 x7*w19+x5*w18 + paddd mm3, [%4] ; +%4 + pmaddwd mm0, [%3+16] ; x2*w13+x0*w12 x2*w09+x0*w08 + paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) + pmaddwd mm1, [%3+24] ; x6*w15+x4*w14 x6*w11+x4*w10 + movq mm4, mm3 ; 4 ; a1 a0 + pmaddwd mm2, [%3+48] ; x3*w29+x1*w28 x3*w25+x1*w24 + paddd mm6, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) + pmaddwd mm5, [%3+56] ; x7*w31+x5*w30 x7*w27+x5*w26 + paddd mm3, mm6 ; a1+b1 a0+b0 + paddd mm0, [%4] ; +%4 + psrad mm3, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 + paddd mm0, mm1 ; 1 ; a3=sum(even3) a2=sum(even2) + psubd mm4, mm6 ; 6 ; a1-b1 a0-b0 + movq mm7, mm0 ; 7 ; a3 a2 + paddd mm2, mm5 ; 5 ; b3=sum(odd3) b2=sum(odd2) + paddd mm0, mm2 ; a3+b3 a2+b2 + psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 + psubd mm7, mm2 ; 2 ; a3-b3 a2-b2 + psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 + psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 + packssdw mm3, mm0 ; 0 ; y3 y2 y1 y0 + packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 + movq [%2], mm3 ; 3 ; save y3 y2 y1 y0 + pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 + movq [%2+8], mm7 ; 7 ; save y7 y6 y5 y4 +%endmacro -;============================================================================= +;----------------------------------------------------------------------------- ; ; The first stage DCT 8x8 - forward DCTs of columns ; @@ -459,569 +460,156 @@ ; y[3] = tm765 - tm465 * tg_3_16; ;} ; -;============================================================================= - - -; -; DCT_8_FRW_COL_4 INP, OUT -; - -%macro DCT_8_FRW_COL_4 2 - - LOCAL x0, x1, x2, x3, x4, x5, x6, x7 - LOCAL y0, y1, y2, y3, y4, y5, y6, y7 - x0 equ [%1 + 0*16] - x1 equ [%1 + 1*16] - x2 equ [%1 + 2*16] - x3 equ [%1 + 3*16] - x4 equ [%1 + 4*16] - x5 equ [%1 + 5*16] - x6 equ [%1 + 6*16] - x7 equ [%1 + 7*16] - y0 equ [%2 + 0*16] - y1 equ [%2 + 1*16] - y2 equ [%2 + 2*16] - y3 equ [%2 + 3*16] - y4 equ [%2 + 4*16] - y5 equ [%2 + 5*16] - y6 equ [%2 + 6*16] - y7 equ [%2 + 7*16] - movq mm0, x1 ; 0 ; x1 - movq mm1, x6 ; 1 ; x6 - movq mm2, mm0 ; 2 ; x1 - movq mm3, x2 ; 3 ; x2 - paddsw mm0, mm1 ; t1 = x[1] + x[6] - movq mm4, x5 ; 4 ; x5 - psllw mm0, SHIFT_FRW_COL ; t1 - movq mm5, x0 ; 5 ; x0 - paddsw mm4, mm3 ; t2 = x[2] + x[5] - paddsw mm5, x7 ; t0 = x[0] + x[7] - psllw mm4, SHIFT_FRW_COL ; t2 - movq mm6, mm0 ; 6 ; t1 - psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6] - movq mm1, [tg_2_16] ; 1 ; tg_2_16 - psubsw mm0, mm4 ; tm12 = t1 - t2 - movq mm7, x3 ; 7 ; x3 - pmulhw mm1, mm0 ; tm12*tg_2_16 - paddsw mm7, x4 ; t3 = x[3] + x[4] - psllw mm5, SHIFT_FRW_COL ; t0 - paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2 - psllw mm7, SHIFT_FRW_COL ; t3 - movq mm4, mm5 ; 4 ; t0 - psubsw mm5, mm7 ; tm03 = t0 - t3 - paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16 - paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3 - por mm1, [one_corr] ; correction y2 +0.5 - psllw mm2, SHIFT_FRW_COL+1 ; t6 - pmulhw 
mm5, [tg_2_16] ; tm03*tg_2_16 - movq mm7, mm4 ; 7 ; tp03 - psubsw mm3, x5 ; t5 = x[2] - x[5] - psubsw mm4, mm6 ; y4 = tp03 - tp12 - movq y2, mm1 ; 1 ; save y2 - paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12 - movq mm1, x3 ; 1 ; x3 - psllw mm3, SHIFT_FRW_COL+1 ; t5 - psubsw mm1, x4 ; t4 = x[3] - x[4] - movq mm6, mm2 ; 6 ; t6 - movq y4, mm4 ; 4 ; save y4 - paddsw mm2, mm3 ; t6 + t5 - pmulhw mm2, [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16 - psubsw mm6, mm3 ; 3 ; t6 - t5 - pmulhw mm6, [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16 - psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12 - por mm5, [one_corr] ; correction y6 +0.5 - psllw mm1, SHIFT_FRW_COL ; t4 - por mm2, [one_corr] ; correction tp65 +0.5 - movq mm4, mm1 ; 4 ; t4 - movq mm3, x0 ; 3 ; x0 - paddsw mm1, mm6 ; tp465 = t4 + tm65 - psubsw mm3, x7 ; t7 = x[0] - x[7] - psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65 - movq mm0, [tg_1_16] ; 0 ; tg_1_16 - psllw mm3, SHIFT_FRW_COL ; t7 - movq mm6, [tg_3_16] ; 6 ; tg_3_16 - pmulhw mm0, mm1 ; tp465*tg_1_16 - movq y0, mm7 ; 7 ; save y0 - pmulhw mm6, mm4 ; tm465*tg_3_16 - movq y6, mm5 ; 5 ; save y6 - movq mm7, mm3 ; 7 ; t7 - movq mm5, [tg_3_16] ; 5 ; tg_3_16 - psubsw mm7, mm2 ; tm765 = t7 - tp65 - paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65 - pmulhw mm5, mm7 ; tm765*tg_3_16 - paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16 - paddsw mm6, mm4 ; tm465*tg_3_16 - pmulhw mm3, [tg_1_16] ; tp765*tg_1_16 - por mm0, [one_corr] ; correction y1 +0.5 - paddsw mm5, mm7 ; tm765*tg_3_16 - psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16 - movq y1, mm0 ; 0 ; save y1 - paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465 - movq y3, mm7 ; 7 ; save y3 - psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465 - movq y5, mm5 ; 5 ; save y5 - movq y7, mm3 ; 3 ; save y7 -%endmacro - +;----------------------------------------------------------------------------- -; +;----------------------------------------------------------------------------- ; DCT_8_INV_COL_4 INP,OUT -; - -%macro DCT_8_INV_COL_4 2 - movq mm0, [tg_3_16] - - movq mm3, [%1+16*3] - movq mm1, mm0 ; tg_3_16 - - movq mm5, [%1+16*5] - pmulhw mm0, mm3 ; x3*(tg_3_16-1) - - movq mm4, [tg_1_16] - pmulhw mm1, mm5 ; x5*(tg_3_16-1) - - movq mm7, [%1+16*7] - movq mm2, mm4 ; tg_1_16 - - movq mm6, [%1+16*1] - pmulhw mm4, mm7 ; x7*tg_1_16 - - paddsw mm0, mm3 ; x3*tg_3_16 - pmulhw mm2, mm6 ; x1*tg_1_16 - - paddsw mm1, mm3 ; x3+x5*(tg_3_16-1) - psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 - - movq mm3, [ocos_4_16] - paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 - - paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 - psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17 - - movq mm5, mm4 ; tp17 - movq mm6, mm2 ; tm17 - - paddsw mm5, mm1 ; tp17+tp35 = b0 - psubsw mm6, mm0 ; tm17-tm35 = b3 - - psubsw mm4, mm1 ; tp17-tp35 = t1 - paddsw mm2, mm0 ; tm17+tm35 = t2 - - movq mm7, [tg_2_16] - movq mm1, mm4 ; t1 - -; movq [SCRATCH+0], mm5 ; save b0 - movq [%2+3*16], mm5 ; save b0 - paddsw mm1, mm2 ; t1+t2 - -; movq [SCRATCH+8], mm6 ; save b3 - movq [%2+5*16], mm6 ; save b3 - psubsw mm4, mm2 ; t1-t2 - - movq mm5, [%1+2*16] - movq mm0, mm7 ; tg_2_16 - - movq mm6, [%1+6*16] - pmulhw mm0, mm5 ; x2*tg_2_16 +;----------------------------------------------------------------------------- - pmulhw mm7, mm6 ; x6*tg_2_16 +%macro DCT_8_INV_COL 2 + movq mm0, [tg_3_16] + movq mm3, [%1+16*3] + movq mm1, mm0 ; tg_3_16 + movq mm5, [%1+16*5] + pmulhw mm0, mm3 ; x3*(tg_3_16-1) + movq mm4, [tg_1_16] + pmulhw mm1, mm5 ; x5*(tg_3_16-1) + movq mm7, [%1+16*7] + movq mm2, mm4 ; tg_1_16 + movq mm6, [%1+16*1] + pmulhw mm4, mm7 ; x7*tg_1_16 + paddsw mm0, mm3 ; x3*tg_3_16 + pmulhw mm2, mm6 ; 
x1*tg_1_16 + paddsw mm1, mm3 ; x3+x5*(tg_3_16-1) + psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 + movq mm3, [ocos_4_16] + paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 + paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 + psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17 + movq mm5, mm4 ; tp17 + movq mm6, mm2 ; tm17 + paddsw mm5, mm1 ; tp17+tp35 = b0 + psubsw mm6, mm0 ; tm17-tm35 = b3 + psubsw mm4, mm1 ; tp17-tp35 = t1 + paddsw mm2, mm0 ; tm17+tm35 = t2 + movq mm7, [tg_2_16] + movq mm1, mm4 ; t1 +; movq [SCRATCH+0], mm5 ; save b0 + movq [%2+3*16], mm5 ; save b0 + paddsw mm1, mm2 ; t1+t2 +; movq [SCRATCH+8], mm6 ; save b3 + movq [%2+5*16], mm6 ; save b3 + psubsw mm4, mm2 ; t1-t2 + movq mm5, [%1+2*16] + movq mm0, mm7 ; tg_2_16 + movq mm6, [%1+6*16] + pmulhw mm0, mm5 ; x2*tg_2_16 + pmulhw mm7, mm6 ; x6*tg_2_16 ; slot - pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2 + pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2 ; slot - movq mm2, [%1+0*16] - pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2 - - psubsw mm0, mm6 ; t2*tg_2_16-x6 = tm26 - movq mm3, mm2 ; x0 - - movq mm6, [%1+4*16] - paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26 - - paddsw mm2, mm6 ; x0+x4 = tp04 - psubsw mm3, mm6 ; x0-x4 = tm04 - - movq mm5, mm2 ; tp04 - movq mm6, mm3 ; tm04 - - psubsw mm2, mm7 ; tp04-tp26 = a3 - paddsw mm3, mm0 ; tm04+tm26 = a1 - - paddsw mm1, mm1 ; b1 - paddsw mm4, mm4 ; b2 - - paddsw mm5, mm7 ; tp04+tp26 = a0 - psubsw mm6, mm0 ; tm04-tm26 = a2 - - movq mm7, mm3 ; a1 - movq mm0, mm6 ; a2 - - paddsw mm3, mm1 ; a1+b1 - paddsw mm6, mm4 ; a2+b2 - - psraw mm3, SHIFT_INV_COL ; dst1 + movq mm2, [%1+0*16] + pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2 + psubsw mm0, mm6 ; t2*tg_2_16-x6 = tm26 + movq mm3, mm2 ; x0 + movq mm6, [%1+4*16] + paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26 + paddsw mm2, mm6 ; x0+x4 = tp04 + psubsw mm3, mm6 ; x0-x4 = tm04 + movq mm5, mm2 ; tp04 + movq mm6, mm3 ; tm04 + psubsw mm2, mm7 ; tp04-tp26 = a3 + paddsw mm3, mm0 ; tm04+tm26 = a1 + paddsw mm1, mm1 ; b1 + paddsw mm4, mm4 ; b2 + paddsw mm5, mm7 ; tp04+tp26 = a0 + psubsw mm6, mm0 ; tm04-tm26 = a2 + movq mm7, mm3 ; a1 + movq mm0, mm6 ; a2 + paddsw mm3, mm1 ; a1+b1 + paddsw mm6, mm4 ; a2+b2 + psraw mm3, SHIFT_INV_COL ; dst1 psubsw mm7, mm1 ; a1-b1 - - psraw mm6, SHIFT_INV_COL ; dst2 - psubsw mm0, mm4 ; a2-b2 - -; movq mm1, [SCRATCH+0] ; load b0 - movq mm1, [%2+3*16] ; load b0 - psraw mm7, SHIFT_INV_COL ; dst6 - - movq mm4, mm5 ; a0 - psraw mm0, SHIFT_INV_COL ; dst5 - - movq [%2+1*16], mm3 - paddsw mm5, mm1 ; a0+b0 - - movq [%2+2*16], mm6 + psraw mm6, SHIFT_INV_COL ; dst2 + psubsw mm0, mm4 ; a2-b2 +; movq mm1, [SCRATCH+0] ; load b0 + movq mm1, [%2+3*16] ; load b0 + psraw mm7, SHIFT_INV_COL ; dst6 + movq mm4, mm5 ; a0 + psraw mm0, SHIFT_INV_COL ; dst5 + movq [%2+1*16], mm3 + paddsw mm5, mm1 ; a0+b0 + movq [%2+2*16], mm6 psubsw mm4, mm1 ; a0-b0 - -; movq mm3, [SCRATCH+8] ; load b3 - movq mm3, [%2+5*16] ; load b3 - psraw mm5, SHIFT_INV_COL ; dst0 - - movq mm6, mm2 ; a3 - psraw mm4, SHIFT_INV_COL ; dst7 - - movq [%2+5*16], mm0 - paddsw mm2, mm3 ; a3+b3 - - movq [%2+6*16], mm7 - psubsw mm6, mm3 ; a3-b3 - - movq [%2+0*16], mm5 - psraw mm2, SHIFT_INV_COL ; dst3 - - movq [%2+7*16], mm4 - psraw mm6, SHIFT_INV_COL ; dst4 - - movq [%2+3*16], mm2 - - movq [%2+4*16], mm6 +; movq mm3, [SCRATCH+8] ; load b3 + movq mm3, [%2+5*16] ; load b3 + psraw mm5, SHIFT_INV_COL ; dst0 + movq mm6, mm2 ; a3 + psraw mm4, SHIFT_INV_COL ; dst7 + movq [%2+5*16], mm0 + paddsw mm2, mm3 ; a3+b3 + movq [%2+6*16], mm7 + psubsw mm6, mm3 ; a3-b3 + movq [%2+0*16], mm5 + psraw mm2, SHIFT_INV_COL ; dst3 + movq [%2+7*16], mm4 + psraw mm6, SHIFT_INV_COL 
; dst4 + movq [%2+3*16], mm2 + movq [%2+4*16], mm6 %endmacro - - -section .text - ;============================================================================= -; -; void idct_mmx (short * const src_result); -; +; Code ;============================================================================= -align 16 -cglobal idct_mmx -idct_mmx - mov eax, dword [esp + 4] - - DCT_8_INV_ROW_1 eax+0, eax+0, tab_i_04, rounder_0 - DCT_8_INV_ROW_1 eax+16, eax+16, tab_i_17, rounder_1 - DCT_8_INV_ROW_1 eax+32, eax+32, tab_i_26, rounder_2 - DCT_8_INV_ROW_1 eax+48, eax+48, tab_i_35, rounder_3 - DCT_8_INV_ROW_1 eax+64, eax+64, tab_i_04, rounder_4 - DCT_8_INV_ROW_1 eax+80, eax+80, tab_i_35, rounder_5 - DCT_8_INV_ROW_1 eax+96, eax+96, tab_i_26, rounder_6 - DCT_8_INV_ROW_1 eax+112, eax+112, tab_i_17, rounder_7 +SECTION .text - DCT_8_INV_COL_4 eax+0,eax+0 - DCT_8_INV_COL_4 eax+8,eax+8 - - ret +;----------------------------------------------------------------------------- +; void idct_mmx(uint16_t block[64]); +;----------------------------------------------------------------------------- +ALIGN 16 +cglobal idct_mmx +idct_mmx: + mov eax, dword [esp + 4] + ;; Process each row + DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, tab_i_04_mmx, rounder_0 + DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, tab_i_17_mmx, rounder_1 + DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, tab_i_26_mmx, rounder_2 + DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, tab_i_35_mmx, rounder_3 + DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, tab_i_04_mmx, rounder_4 + DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, tab_i_35_mmx, rounder_5 + DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, tab_i_26_mmx, rounder_6 + DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, tab_i_17_mmx, rounder_7 + + ;; Process the columns (4 at a time) + DCT_8_INV_COL eax+0, eax+0 + DCT_8_INV_COL eax+8, eax+8 -;============================================================================= -; -; void idct_sse (short * const src_result); -; -;============================================================================= + ret + +;----------------------------------------------------------------------------- +; void idct_xmm(uint16_t block[64]); +;----------------------------------------------------------------------------- -align 16 +ALIGN 16 cglobal idct_xmm -idct_xmm - mov eax, dword [esp + 4] - - DCT_8_INV_ROW_1_sse eax+0, eax+0, tab_i_04_sse, rounder_0 - DCT_8_INV_ROW_1_sse eax+16, eax+16, tab_i_17_sse, rounder_1 - DCT_8_INV_ROW_1_sse eax+32, eax+32, tab_i_26_sse, rounder_2 - DCT_8_INV_ROW_1_sse eax+48, eax+48, tab_i_35_sse, rounder_3 - DCT_8_INV_ROW_1_sse eax+64, eax+64, tab_i_04_sse, rounder_4 - DCT_8_INV_ROW_1_sse eax+80, eax+80, tab_i_35_sse, rounder_5 - DCT_8_INV_ROW_1_sse eax+96, eax+96, tab_i_26_sse, rounder_6 - DCT_8_INV_ROW_1_sse eax+112, eax+112, tab_i_17_sse, rounder_7 - - DCT_8_INV_COL_4 eax+0, eax+0 - DCT_8_INV_COL_4 eax+8, eax+8 - - ret - -;============================================================================= -; The code below this line is for SSE2-equipped processors -; By Dmitry Rozhdestvensky -;============================================================================= - -section .data - -align 16 - -tab_i_04_s2 dw 16384, 21407, 16384, 8867 ; movq-> w05 w04 w01 w00 - dw 16384, -8867, 16384, -21407 ; w13 w12 w09 w08 - dw 16384, 8867, -16384, -21407 ; w07 w06 w03 w02 - dw -16384, 21407, 16384, -8867 ; w15 w14 w11 w10 - dw 22725, 19266, 19266, -4520 ; w21 w20 w17 w16 - dw 12873, -22725, 4520, -12873 ; w29 w28 w25 w24 - dw 12873, 4520, -22725, -12873 ; w23 w22 w19 w18 - dw 4520, 19266, 19266, -22725 ; w31 w30 w27 w26 - -; Table 
for rows 1,7 - constants are multiplied by cos_1_16 - -tab_i_17_s2 dw 22725, 29692, 22725, 12299 ; movq-> w05 w04 w01 w00 - dw 22725, -12299, 22725, -29692 ; w13 w12 w09 w08 - dw 22725, 12299, -22725, -29692 ; w07 w06 w03 w02 - dw -22725, 29692, 22725, -12299 ; w15 w14 w11 w10 - dw 31521, 26722, 26722, -6270 ; w21 w20 w17 w16 - dw 17855, -31521, 6270, -17855 ; w29 w28 w25 w24 - dw 17855, 6270, -31521, -17855 ; w23 w22 w19 w18 - dw 6270, 26722, 26722, -31521 ; w31 w30 w27 w26 - -; Table for rows 2,6 - constants are multiplied by cos_2_16 - -tab_i_26_s2 dw 21407, 27969, 21407, 11585 ; movq-> w05 w04 w01 w00 - dw 21407, -11585, 21407, -27969 ; w13 w12 w09 w08 - dw 21407, 11585, -21407, -27969 ; w07 w06 w03 w02 - dw -21407, 27969, 21407, -11585 ; w15 w14 w11 w10 - dw 29692, 25172, 25172, -5906 ; w21 w20 w17 w16 - dw 16819, -29692, 5906, -16819 ; w29 w28 w25 w24 - dw 16819, 5906, -29692, -16819 ; w23 w22 w19 w18 - dw 5906, 25172, 25172, -29692 ; w31 w30 w27 w26 - -; Table for rows 3,5 - constants are multiplied by cos_3_16 - -tab_i_35_s2 dw 19266, 25172, 19266, 10426 ; movq-> w05 w04 w01 w00 - dw 19266, -10426, 19266, -25172 ; w13 w12 w09 w08 - dw 19266, 10426, -19266, -25172 ; w07 w06 w03 w02 - dw -19266, 25172, 19266, -10426 ; w15 w14 w11 w10 - dw 26722, 22654, 22654, -5315 ; w21 w20 w17 w16 - dw 15137, -26722, 5315, -15137 ; w29 w28 w25 w24 - dw 15137, 5315, -26722, -15137 ; w23 w22 w19 w18 - dw 5315, 22654, 22654, -26722 ; w31 w30 w27 w26 - -%if SHIFT_INV_ROW == 12 ; assume SHIFT_INV_ROW == 12 -rounder_2_0 dd 65536, 65536 - dd 65536, 65536 -rounder_2_4 dd 0, 0 - dd 0, 0 -rounder_2_1 dd 7195, 7195 - dd 7195, 7195 -rounder_2_7 dd 1024, 1024 - dd 1024, 1024 -rounder_2_2 dd 4520, 4520 - dd 4520, 4520 -rounder_2_6 dd 1024, 1024 - dd 1024, 1024 -rounder_2_3 dd 2407, 2407 - dd 2407, 2407 -rounder_2_5 dd 240, 240 - dd 240, 240 - -%elif SHIFT_INV_ROW == 11 ; assume SHIFT_INV_ROW == 11 -rounder_2_0 dd 65536, 65536 - dd 65536, 65536 -rounder_2_4 dd 0, 0 - dd 0, 0 -rounder_2_1 dd 3597, 3597 - dd 3597, 3597 -rounder_2_7 dd 512, 512 - dd 512, 512 -rounder_2_2 dd 2260, 2260 - dd 2260, 2260 -rounder_2_6 dd 512, 512 - dd 512, 512 -rounder_2_3 dd 1203, 1203 - dd 1203, 1203 -rounder_2_5 dd 120, 120 - dd 120, 120 -%else - -%error invalid _SHIFT_INV_ROW_ - -%endif - - tg_1_16_2 dw 13036, 13036, 13036, 13036 ; tg * (2<<16) + 0.5 - dw 13036, 13036, 13036, 13036 - tg_2_16_2 dw 27146, 27146, 27146, 27146 ; tg * (2<<16) + 0.5 - dw 27146, 27146, 27146, 27146 - tg_3_16_2 dw -21746, -21746, -21746, -21746 ; tg * (2<<16) + 0.5 - dw -21746, -21746, -21746, -21746 -ocos_4_16_2 dw 23170, 23170, 23170, 23170 ; cos * (2<<15) + 0.5 - dw 23170, 23170, 23170, 23170 - -%macro DCT_8_INV_ROW_1_sse2 4 - - pshufhw xmm1,[%1],11011000b ;x 75643210 - pshuflw xmm1,xmm1,11011000b ;x 75643120 - pshufd xmm0,xmm1,00000000b ;x 20202020 - pmaddwd xmm0,[%3] ;w 13 12 9 8 5410 - ;a 3210 first part - - pshufd xmm2,xmm1,10101010b ;x 64646464 - pmaddwd xmm2,[%3+16] ;w 15 14 11 10 7632 - ;a 3210 second part - - paddd xmm2,xmm0 ;a 3210 ready - paddd xmm2,[%4] ;must be 4 dwords long, not 2 as for sse1 - movdqa xmm5,xmm2 - - pshufd xmm3,xmm1,01010101b ;x 31313131 - pmaddwd xmm3,[%3+32] ;w 29 28 25 24 21 20 17 16 - ;b 3210 first part - - pshufd xmm4,xmm1,11111111b ;x 75757575 - pmaddwd xmm4,[%3+48] ;w 31 30 27 26 23 22 19 18 - ;b 3210 second part - paddd xmm3,xmm4 ;b 3210 ready - - paddd xmm2,xmm3 ;will be y 3210 - psubd xmm5,xmm3 ;will be y 4567 - psrad xmm2,SHIFT_INV_ROW - psrad xmm5,SHIFT_INV_ROW - packssdw xmm2,xmm5 ;y 45673210 - pshufhw 
xmm6,xmm2,00011011b ;y 76543210 - movdqa [%2],xmm6 - -%endmacro - -%macro DCT_8_INV_COL_4_sse2 2 - - movdqa xmm0,[%1+16*0] ;x0 (all columns) - movdqa xmm2,[%1+16*4] ;x4 - movdqa xmm1,xmm0 - - movdqa xmm4,[%1+16*2] ;x2 - movdqa xmm5,[%1+16*6] ;x6 - movdqa xmm6,[tg_2_16_2] - movdqa xmm7,xmm6 - - paddsw xmm0,xmm2 ;u04=x0+x4 - psubsw xmm1,xmm2 ;v04=x0-x4 - movdqa xmm3,xmm0 - movdqa xmm2,xmm1 - - pmulhw xmm6,xmm4 - pmulhw xmm7,xmm5 - psubsw xmm6,xmm5 ;v26=x2*T2-x6 - paddsw xmm7,xmm4 ;u26=x6*T2+x2 - - paddsw xmm1,xmm6 ;a1=v04+v26 - paddsw xmm0,xmm7 ;a0=u04+u26 - psubsw xmm2,xmm6 ;a2=v04-v26 - psubsw xmm3,xmm7 ;a3=u04-u26 - - movdqa [%2+16*0],xmm0 ;store a3-a0 to - movdqa [%2+16*6],xmm1 ;free registers - movdqa [%2+16*2],xmm2 - movdqa [%2+16*4],xmm3 - - movdqa xmm0,[%1+16*1] ;x1 - movdqa xmm1,[%1+16*7] ;x7 - movdqa xmm2,[tg_1_16_2] - movdqa xmm3,xmm2 - - movdqa xmm4,[%1+16*3] ;x3 - movdqa xmm5,[%1+16*5] ;x5 - movdqa xmm6,[tg_3_16_2] - movdqa xmm7,xmm6 - - pmulhw xmm2,xmm0 - pmulhw xmm3,xmm1 - psubsw xmm2,xmm1 ;v17=x1*T1-x7 - paddsw xmm3,xmm0 ;u17=x7*T1+x1 - movdqa xmm0,xmm3 ;u17 - movdqa xmm1,xmm2 ;v17 - - pmulhw xmm6,xmm4 ;x3*(t3-1) - pmulhw xmm7,xmm5 ;x5*(t3-1) - paddsw xmm6,xmm4 - paddsw xmm7,xmm5 - psubsw xmm6,xmm5 ;v35=x3*T3-x5 - paddsw xmm7,xmm4 ;u35=x5*T3+x3 - - movdqa xmm4,[ocos_4_16_2] - - paddsw xmm0,xmm7 ;b0=u17+u35 - psubsw xmm1,xmm6 ;b3=v17-v35 - psubsw xmm3,xmm7 ;u12=u17-v35 - paddsw xmm2,xmm6 ;v12=v17+v35 - - movdqa xmm5,xmm3 - paddsw xmm3,xmm2 ;tb1 - psubsw xmm5,xmm2 ;tb2 - pmulhw xmm5,xmm4 - pmulhw xmm4,xmm3 - paddsw xmm5,xmm5 - paddsw xmm4,xmm4 - - movdqa xmm6,[%2+16*0] ;a0 - movdqa xmm7,xmm6 - movdqa xmm2,[%2+16*4] ;a3 - movdqa xmm3,xmm2 - - paddsw xmm6,xmm0 - psubsw xmm7,xmm0 - psraw xmm6,SHIFT_INV_COL ;y0=a0+b0 - psraw xmm7,SHIFT_INV_COL ;y7=a0-b0 - movdqa [%2+16*0],xmm6 - movdqa [%2+16*7],xmm7 - - paddsw xmm2,xmm1 - psubsw xmm3,xmm1 - psraw xmm2,SHIFT_INV_COL ;y3=a3+b3 - psraw xmm3,SHIFT_INV_COL ;y4=a3-b3 - movdqa [%2+16*3],xmm2 - movdqa [%2+16*4],xmm3 - - movdqa xmm0,[%2+16*6] ;a1 - movdqa xmm1,xmm0 - movdqa xmm6,[%2+16*2] ;a2 - movdqa xmm7,xmm6 - - - paddsw xmm0,xmm4 - psubsw xmm1,xmm4 - psraw xmm0,SHIFT_INV_COL ;y1=a1+b1 - psraw xmm1,SHIFT_INV_COL ;y6=a1-b1 - movdqa [%2+16*1],xmm0 - movdqa [%2+16*6],xmm1 - - paddsw xmm6,xmm5 - psubsw xmm7,xmm5 - psraw xmm6,SHIFT_INV_COL ;y2=a2+b2 - psraw xmm7,SHIFT_INV_COL ;y5=a2-b2 - movdqa [%2+16*2],xmm6 - movdqa [%2+16*5],xmm7 - -%endmacro - -section .text - -align 16 -cglobal idct_sse2 -idct_sse2 - - mov eax, dword [esp + 4] - - DCT_8_INV_ROW_1_sse2 eax+0, eax+0, tab_i_04_s2, rounder_2_0 - DCT_8_INV_ROW_1_sse2 eax+16, eax+16, tab_i_17_s2, rounder_2_1 - DCT_8_INV_ROW_1_sse2 eax+32, eax+32, tab_i_26_s2, rounder_2_2 - DCT_8_INV_ROW_1_sse2 eax+48, eax+48, tab_i_35_s2, rounder_2_3 - DCT_8_INV_ROW_1_sse2 eax+64, eax+64, tab_i_04_s2, rounder_2_4 - DCT_8_INV_ROW_1_sse2 eax+80, eax+80, tab_i_35_s2, rounder_2_5 - DCT_8_INV_ROW_1_sse2 eax+96, eax+96, tab_i_26_s2, rounder_2_6 - DCT_8_INV_ROW_1_sse2 eax+112, eax+112, tab_i_17_s2, rounder_2_7 +idct_xmm: + mov eax, dword [esp + 4] - DCT_8_INV_COL_4_sse2 eax, eax - ;DCT_8_INV_COL_4 eax+8, eax+8 + ;; Process each row + DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, tab_i_04_xmm, rounder_0 + DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, tab_i_17_xmm, rounder_1 + DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, tab_i_26_xmm, rounder_2 + DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, tab_i_35_xmm, rounder_3 + DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, tab_i_04_xmm, rounder_4 + DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, tab_i_35_xmm, rounder_5 + 
DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, tab_i_26_xmm, rounder_6 + DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, tab_i_17_xmm, rounder_7 + + ;; Process the columns (4 at a time) + DCT_8_INV_COL eax+0, eax+0 + DCT_8_INV_COL eax+8, eax+8 - ret + ret
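Both entry points follow the same calling convention: a single cdecl argument, fetched from [esp + 4], pointing to an 8x8 block of 16-bit coefficients that is transformed in place, eight row passes followed by two column passes of four columns each; neither routine issues emms. A minimal, hypothetical 32-bit C caller might look like the sketch below (the wrapper name and use_xmm flag are invented for illustration; int16_t follows the removed "short * const" prototype, since the coefficients are signed):

#include <stdint.h>

void idct_mmx(int16_t *block);   /* DCT_8_INV_ROW_MMX rows + DCT_8_INV_COL columns */
void idct_xmm(int16_t *block);   /* pshufw-based rows, same column code            */

/* Hypothetical wrapper: inverse-transform one 8x8 coefficient block in place. */
static void idct_block(int16_t block[64], int use_xmm)
{
    if (use_xmm)
        idct_xmm(block);         /* requires pshufw (PIII / extended-MMX CPUs) */
    else
        idct_mmx(block);
    /* The caller must execute emms before any x87 floating-point code runs. */
}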