--- branches/dev-api-4/xvidcore/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2003/10/27 01:03:06 1190
+++ branches/dev-api-4/xvidcore/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2003/10/28 22:23:03 1192
@@ -2,7 +2,7 @@
 ; *
 ; * XVID MPEG-4 VIDEO CODEC
 ; * - MMX and XMM forward discrete cosine transform -
-; * 
+; *
 ; * Copyright(C) 2003 Edouard Gomez
 ; *
 ; * This program is free software; you can redistribute it and/or modify it
@@ -19,7 +19,7 @@
 ; * along with this program; if not, write to the Free Software
 ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ; *
-; * $Id: fdct_mmx_ffmpeg.asm,v 1.1.2.1 2003-10-27 01:03:06 edgomez Exp $
+; * $Id: fdct_mmx_ffmpeg.asm,v 1.1.2.2 2003-10-28 22:23:03 edgomez Exp $
 ; *
 ; ***************************************************************************/
@@ -48,9 +48,9 @@
 ; Macros and other preprocessor constants
 ;=============================================================================
-%macro cglobal 1 
+%macro cglobal 1
 %ifdef PREFIX
- global _%1 
+ global _%1
 %define %1 _%1
 %else
 global %1
@@ -74,100 +74,100 @@
 ALIGN 8
 tab_frw_01234567:
- dw 16384, 16384, -8867, -21407
- dw 16384, 16384, 21407, 8867
- dw 16384, -16384, 21407, -8867
- dw -16384, 16384, 8867, -21407
- dw 22725, 19266, -22725, -12873
- dw 12873, 4520, 19266, -4520
- dw 12873, -22725, 19266, -22725
- dw 4520, 19266, 4520, -12873
-
- dw 22725, 22725, -12299, -29692
- dw 22725, 22725, 29692, 12299
- dw 22725, -22725, 29692, -12299
- dw -22725, 22725, 12299, -29692
- dw 31521, 26722, -31521, -17855
- dw 17855, 6270, 26722, -6270
- dw 17855, -31521, 26722, -31521
- dw 6270, 26722, 6270, -17855
-
- dw 21407, 21407, -11585, -27969
- dw 21407, 21407, 27969, 11585
- dw 21407, -21407, 27969, -11585
- dw -21407, 21407, 11585, -27969
- dw 29692, 25172, -29692, -16819
- dw 16819, 5906, 25172, -5906
- dw 16819, -29692, 25172, -29692
- dw 5906, 25172, 5906, -16819
-
- dw 19266, 19266, -10426, -25172
- dw 19266, 19266, 25172, 10426
- dw 19266, -19266, 25172, -10426
- dw -19266, 19266, 10426, -25172
- dw 26722, 22654, -26722, -15137
- dw 15137, 5315, 22654, -5315
- dw 15137, -26722, 22654, -26722
- dw 5315, 22654, 5315, -15137
-
- dw 16384, 16384, -8867, -21407
- dw 16384, 16384, 21407, 8867
- dw 16384, -16384, 21407, -8867
- dw -16384, 16384, 8867, -21407
- dw 22725, 19266, -22725, -12873
- dw 12873, 4520, 19266, -4520
- dw 12873, -22725, 19266, -22725
- dw 4520, 19266, 4520, -12873
-
- dw 19266, 19266, -10426, -25172
- dw 19266, 19266, 25172, 10426
- dw 19266, -19266, 25172, -10426
- dw -19266, 19266, 10426, -25172
- dw 26722, 22654, -26722, -15137
- dw 15137, 5315, 22654, -5315
- dw 15137, -26722, 22654, -26722
- dw 5315, 22654, 5315, -15137
-
- dw 21407, 21407, -11585, -27969
- dw 21407, 21407, 27969, 11585
- dw 21407, -21407, 27969, -11585
- dw -21407, 21407, 11585, -27969
- dw 29692, 25172, -29692, -16819
- dw 16819, 5906, 25172, -5906
- dw 16819, -29692, 25172, -29692
- dw 5906, 25172, 5906, -16819,
-
- dw 22725, 22725, -12299, -29692
- dw 22725, 22725, 29692, 12299
- dw 22725, -22725, 29692, -12299
- dw -22725, 22725, 12299, -29692
- dw 31521, 26722, -31521, -17855
- dw 17855, 6270, 26722, -6270
- dw 17855, -31521, 26722, -31521
- dw 6270, 26722, 6270, -17855
+ dw 16384, 16384, -8867, -21407
+ dw 16384, 16384, 21407, 8867
+ dw 16384, -16384, 21407, -8867
+ dw -16384, 16384, 8867, -21407
+ dw 22725, 19266, -22725, -12873
+ dw 12873, 4520, 19266, -4520
+ dw 12873, -22725, 19266, -22725
+ dw 4520, 19266, 4520, -12873
+
+ dw 22725, 22725, -12299, -29692
+ dw 22725, 22725, 29692, 12299
+ dw 22725, -22725, 29692, -12299
+ dw -22725, 22725, 12299, -29692
+ dw 31521, 26722, -31521, -17855
+ dw 17855, 6270, 26722, -6270
+ dw 17855, -31521, 26722, -31521
+ dw 6270, 26722, 6270, -17855
+
+ dw 21407, 21407, -11585, -27969
+ dw 21407, 21407, 27969, 11585
+ dw 21407, -21407, 27969, -11585
+ dw -21407, 21407, 11585, -27969
+ dw 29692, 25172, -29692, -16819
+ dw 16819, 5906, 25172, -5906
+ dw 16819, -29692, 25172, -29692
+ dw 5906, 25172, 5906, -16819
+
+ dw 19266, 19266, -10426, -25172
+ dw 19266, 19266, 25172, 10426
+ dw 19266, -19266, 25172, -10426
+ dw -19266, 19266, 10426, -25172
+ dw 26722, 22654, -26722, -15137
+ dw 15137, 5315, 22654, -5315
+ dw 15137, -26722, 22654, -26722
+ dw 5315, 22654, 5315, -15137
+
+ dw 16384, 16384, -8867, -21407
+ dw 16384, 16384, 21407, 8867
+ dw 16384, -16384, 21407, -8867
+ dw -16384, 16384, 8867, -21407
+ dw 22725, 19266, -22725, -12873
+ dw 12873, 4520, 19266, -4520
+ dw 12873, -22725, 19266, -22725
+ dw 4520, 19266, 4520, -12873
+
+ dw 19266, 19266, -10426, -25172
+ dw 19266, 19266, 25172, 10426
+ dw 19266, -19266, 25172, -10426
+ dw -19266, 19266, 10426, -25172
+ dw 26722, 22654, -26722, -15137
+ dw 15137, 5315, 22654, -5315
+ dw 15137, -26722, 22654, -26722
+ dw 5315, 22654, 5315, -15137
+
+ dw 21407, 21407, -11585, -27969
+ dw 21407, 21407, 27969, 11585
+ dw 21407, -21407, 27969, -11585
+ dw -21407, 21407, 11585, -27969
+ dw 29692, 25172, -29692, -16819
+ dw 16819, 5906, 25172, -5906
+ dw 16819, -29692, 25172, -29692
+ dw 5906, 25172, 5906, -16819,
+
+ dw 22725, 22725, -12299, -29692
+ dw 22725, 22725, 29692, 12299
+ dw 22725, -22725, 29692, -12299
+ dw -22725, 22725, 12299, -29692
+ dw 31521, 26722, -31521, -17855
+ dw 17855, 6270, 26722, -6270
+ dw 17855, -31521, 26722, -31521
+ dw 6270, 26722, 6270, -17855
 ALIGN 8
 fdct_one_corr:
- dw 1, 1, 1, 1
+ dw 1, 1, 1, 1
 ALIGN 8
 fdct_tg_all_16:
- dw 13036, 13036, 13036, 13036
- dw 27146, 27146, 27146, 27146
- dw -21746, -21746, -21746, -21746
+ dw 13036, 13036, 13036, 13036
+ dw 27146, 27146, 27146, 27146
+ dw -21746, -21746, -21746, -21746
 ALIGN 8
 cos_4_16:
- dw -19195, -19195, -19195, -19195
+ dw -19195, -19195, -19195, -19195
 ALIGN 8
 ocos_4_16:
- dw 23170, 23170, 23170, 23170
+ dw 23170, 23170, 23170, 23170
 ALIGN 8
-fdct_r_row:
- dd RND_FRW_ROW, RND_FRW_ROW
-
+fdct_r_row:
+ dd RND_FRW_ROW, RND_FRW_ROW
+
 ;=============================================================================
 ; Factorized parts of the code turned into macros for better understanding
 ;=============================================================================
@@ -178,81 +178,81 @@
 ;; - in, register name holding the in address
 ;; - column number to process
 %macro FDCT_COLUMN_COMMON 3
- movq mm0, [%2 + %3*2 + 1*16]
- movq mm1, [%2 + %3*2 + 6*16]
- movq mm2, mm0
- movq mm3, [%2 + %3*2 + 2*16]
- paddsw mm0, mm1
- movq mm4, [%2 + %3*2 + 5*16]
- psllw mm0, SHIFT_FRW_COL
- movq mm5, [%2 + %3*2 + 0*16]
- paddsw mm4, mm3
- paddsw mm5, [%2 + %3*2 + 7*16]
- psllw mm4, SHIFT_FRW_COL
- movq mm6, mm0
- psubsw mm2, mm1
- movq mm1, [fdct_tg_all_16 + 4*2]
- psubsw mm0, mm4
- movq mm7, [%2 + %3*2 + 3*16]
- pmulhw mm1, mm0
- paddsw mm7, [%2 + %3*2 + 4*16]
- psllw mm5, SHIFT_FRW_COL
- paddsw mm6, mm4
- psllw mm7, SHIFT_FRW_COL
- movq mm4, mm5
- psubsw mm5, mm7
- paddsw mm1, mm5
- paddsw mm4, mm7
- por mm1, [fdct_one_corr]
- psllw mm2, SHIFT_FRW_COL + 1
- pmulhw mm5, [fdct_tg_all_16 + 4*2]
- movq mm7, mm4
- psubsw mm3, [%2 + %3*2 + 5*16]
- psubsw mm4, mm6
- movq [%1 + %3*2 + 2*16], mm1
- paddsw mm7, mm6
- movq mm1, [%2 + %3*2 + 3*16]
- psllw mm3, SHIFT_FRW_COL + 1
- psubsw mm1, [%2 + %3*2 + 4*16]
- movq mm6, mm2
- movq [%1 + %3*2 + 4*16], mm4
- paddsw mm2, mm3
- pmulhw mm2, [ocos_4_16]
- psubsw mm6, mm3
- pmulhw mm6, [ocos_4_16]
- psubsw mm5, mm0
- por mm5, [fdct_one_corr]
- psllw mm1, SHIFT_FRW_COL
- por mm2, [fdct_one_corr]
- movq mm4, mm1
- movq mm3, [%2 + %3*2 + 0*16]
- paddsw mm1, mm6
- psubsw mm3, [%2 + %3*2 + 7*16]
- psubsw mm4, mm6
- movq mm0, [fdct_tg_all_16 + 0*2]
- psllw mm3, SHIFT_FRW_COL
- movq mm6, [fdct_tg_all_16 + 8*2]
- pmulhw mm0, mm1
- movq [%1 + %3*2 + 0*16], mm7
- pmulhw mm6, mm4
- movq [%1 + %3*2 + 6*16], mm5
- movq mm7, mm3
- movq mm5, [fdct_tg_all_16 + 8*2]
- psubsw mm7, mm2
- paddsw mm3, mm2
- pmulhw mm5, mm7
- paddsw mm0, mm3
- paddsw mm6, mm4
- pmulhw mm3, [fdct_tg_all_16 + 0*2]
- por mm0, [fdct_one_corr]
- paddsw mm5, mm7
- psubsw mm7, mm6
- movq [%1 + %3*2 + 1*16], mm0
- paddsw mm5, mm4
- movq [%1 + %3*2 + 3*16], mm7
- psubsw mm3, mm1
- movq [%1 + %3*2 + 5*16], mm5
- movq [%1 + %3*2 + 7*16], mm3
+ movq mm0, [%2 + %3*2 + 1*16]
+ movq mm1, [%2 + %3*2 + 6*16]
+ movq mm2, mm0
+ movq mm3, [%2 + %3*2 + 2*16]
+ paddsw mm0, mm1
+ movq mm4, [%2 + %3*2 + 5*16]
+ psllw mm0, SHIFT_FRW_COL
+ movq mm5, [%2 + %3*2 + 0*16]
+ paddsw mm4, mm3
+ paddsw mm5, [%2 + %3*2 + 7*16]
+ psllw mm4, SHIFT_FRW_COL
+ movq mm6, mm0
+ psubsw mm2, mm1
+ movq mm1, [fdct_tg_all_16 + 4*2]
+ psubsw mm0, mm4
+ movq mm7, [%2 + %3*2 + 3*16]
+ pmulhw mm1, mm0
+ paddsw mm7, [%2 + %3*2 + 4*16]
+ psllw mm5, SHIFT_FRW_COL
+ paddsw mm6, mm4
+ psllw mm7, SHIFT_FRW_COL
+ movq mm4, mm5
+ psubsw mm5, mm7
+ paddsw mm1, mm5
+ paddsw mm4, mm7
+ por mm1, [fdct_one_corr]
+ psllw mm2, SHIFT_FRW_COL + 1
+ pmulhw mm5, [fdct_tg_all_16 + 4*2]
+ movq mm7, mm4
+ psubsw mm3, [%2 + %3*2 + 5*16]
+ psubsw mm4, mm6
+ movq [%1 + %3*2 + 2*16], mm1
+ paddsw mm7, mm6
+ movq mm1, [%2 + %3*2 + 3*16]
+ psllw mm3, SHIFT_FRW_COL + 1
+ psubsw mm1, [%2 + %3*2 + 4*16]
+ movq mm6, mm2
+ movq [%1 + %3*2 + 4*16], mm4
+ paddsw mm2, mm3
+ pmulhw mm2, [ocos_4_16]
+ psubsw mm6, mm3
+ pmulhw mm6, [ocos_4_16]
+ psubsw mm5, mm0
+ por mm5, [fdct_one_corr]
+ psllw mm1, SHIFT_FRW_COL
+ por mm2, [fdct_one_corr]
+ movq mm4, mm1
+ movq mm3, [%2 + %3*2 + 0*16]
+ paddsw mm1, mm6
+ psubsw mm3, [%2 + %3*2 + 7*16]
+ psubsw mm4, mm6
+ movq mm0, [fdct_tg_all_16 + 0*2]
+ psllw mm3, SHIFT_FRW_COL
+ movq mm6, [fdct_tg_all_16 + 8*2]
+ pmulhw mm0, mm1
+ movq [%1 + %3*2 + 0*16], mm7
+ pmulhw mm6, mm4
+ movq [%1 + %3*2 + 6*16], mm5
+ movq mm7, mm3
+ movq mm5, [fdct_tg_all_16 + 8*2]
+ psubsw mm7, mm2
+ paddsw mm3, mm2
+ pmulhw mm5, mm7
+ paddsw mm0, mm3
+ paddsw mm6, mm4
+ pmulhw mm3, [fdct_tg_all_16 + 0*2]
+ por mm0, [fdct_one_corr]
+ paddsw mm5, mm7
+ psubsw mm7, mm6
+ movq [%1 + %3*2 + 1*16], mm0
+ paddsw mm5, mm4
+ movq [%1 + %3*2 + 3*16], mm7
+ psubsw mm3, mm1
+ movq [%1 + %3*2 + 5*16], mm5
+ movq [%1 + %3*2 + 7*16], mm3
 %endmacro
@@ -261,52 +261,52 @@
 ;; Macro for row DCT using MMX punpcklw instructions
 ;; %1 -
 ;; - out, register name holding the out address
 ;; - in, register name holding the in address
 ;; - table coefficients address (register or absolute)
 %macro FDCT_ROW_MMX 3
- movd mm1, [%2 + 6*2]
- punpcklwd mm1, [%2 + 4*2]
- movq mm2, mm1
- psrlq mm1, 0x20
- movq mm0, [%2 + 0*2]
- punpcklwd mm1, mm2
- movq mm5, mm0
- paddsw mm0, mm1
- psubsw mm5, mm1
- movq mm1, mm0
- movq mm6, mm5
- punpckldq mm3, mm5
- punpckhdq mm6, mm3
- movq mm3, [%3 + 0*2]
- movq mm4, [%3 + 4*2]
- punpckldq mm2, mm0
- pmaddwd mm3, mm0
- punpckhdq mm1, mm2
- movq mm2, [%3 + 16*2]
- pmaddwd mm4, mm1
- pmaddwd mm0, [%3 + 8*2]
- movq mm7, [%3 + 20*2]
- pmaddwd mm2, mm5
- paddd mm3, [fdct_r_row]
- pmaddwd mm7, mm6
- pmaddwd mm1, [%3 + 12*2]
- paddd mm3, mm4
- pmaddwd mm5, [%3 + 24*2]
- pmaddwd mm6, [%3 + 28*2]
- paddd mm2, mm7
- paddd mm0, [fdct_r_row]
- psrad mm3, SHIFT_FRW_ROW
- paddd mm2, [fdct_r_row]
- paddd mm0, mm1
- paddd mm5, [fdct_r_row]
- psrad mm2, SHIFT_FRW_ROW
- paddd mm5, mm6
- psrad mm0, SHIFT_FRW_ROW
- psrad mm5, SHIFT_FRW_ROW
- packssdw mm3, mm0
- packssdw mm2, mm5
- movq mm6, mm3
- punpcklwd mm3, mm2
- punpckhwd mm6, mm2
- movq [%1 + 0*2], mm3
- movq [%1 + 4*2], mm6
+ movd mm1, [%2 + 6*2]
+ punpcklwd mm1, [%2 + 4*2]
+ movq mm2, mm1
+ psrlq mm1, 0x20
+ movq mm0, [%2 + 0*2]
+ punpcklwd mm1, mm2
+ movq mm5, mm0
+ paddsw mm0, mm1
+ psubsw mm5, mm1
+ movq mm1, mm0
+ movq mm6, mm5
+ punpckldq mm3, mm5
+ punpckhdq mm6, mm3
+ movq mm3, [%3 + 0*2]
+ movq mm4, [%3 + 4*2]
+ punpckldq mm2, mm0
+ pmaddwd mm3, mm0
+ punpckhdq mm1, mm2
+ movq mm2, [%3 + 16*2]
+ pmaddwd mm4, mm1
+ pmaddwd mm0, [%3 + 8*2]
+ movq mm7, [%3 + 20*2]
+ pmaddwd mm2, mm5
+ paddd mm3, [fdct_r_row]
+ pmaddwd mm7, mm6
+ pmaddwd mm1, [%3 + 12*2]
+ paddd mm3, mm4
+ pmaddwd mm5, [%3 + 24*2]
+ pmaddwd mm6, [%3 + 28*2]
+ paddd mm2, mm7
+ paddd mm0, [fdct_r_row]
+ psrad mm3, SHIFT_FRW_ROW
+ paddd mm2, [fdct_r_row]
+ paddd mm0, mm1
+ paddd mm5, [fdct_r_row]
+ psrad mm2, SHIFT_FRW_ROW
+ paddd mm5, mm6
+ psrad mm0, SHIFT_FRW_ROW
+ psrad mm5, SHIFT_FRW_ROW
+ packssdw mm3, mm0
+ packssdw mm2, mm5
+ movq mm6, mm3
+ punpcklwd mm3, mm2
+ punpckhwd mm6, mm2
+ movq [%1 + 0*2], mm3
+ movq [%1 + 4*2], mm6
 %endmacro
 ;; Macro for column DCT using XMM instuction pshufw
@@ -316,45 +316,45 @@
 ;; - table coefficient address
 %macro FDCT_ROW_XMM 3
 ;; fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
- pshufw mm5, [%2 + 4*2], 0x1B
- movq mm0, [%2 + 0*2]
- movq mm1, mm0
- paddsw mm0, mm5
- psubsw mm1, mm5
- pshufw mm2, mm0, 0x4E
- pshufw mm3, mm1, 0x4E
- movq mm4, [%3 + 0*2]
- movq mm6, [%3 + 4*2]
- movq mm5, [%3 + 16*2]
- movq mm7, [%3 + 20*2]
- pmaddwd mm4, mm0
- pmaddwd mm5, mm1
- pmaddwd mm6, mm2
- pmaddwd mm7, mm3
- pmaddwd mm0, [%3 + 8*2]
- pmaddwd mm2, [%3 + 12*2]
- pmaddwd mm1, [%3 + 24*2]
- pmaddwd mm3, [%3 + 28*2]
- paddd mm4, mm6
- paddd mm5, mm7
- paddd mm0, mm2
- paddd mm1, mm3
- movq mm7, [fdct_r_row]
- paddd mm4, mm7
- paddd mm5, mm7
- paddd mm0, mm7
- paddd mm1, mm7
- psrad mm4, SHIFT_FRW_ROW
- psrad mm5, SHIFT_FRW_ROW
- psrad mm0, SHIFT_FRW_ROW
- psrad mm1, SHIFT_FRW_ROW
- packssdw mm4, mm0
- packssdw mm5, mm1
- movq mm2, mm4
- punpcklwd mm4, mm5
- punpckhwd mm2, mm5
- movq [%1 + 0*2], mm4
- movq [%1 + 4*2], mm2
+ pshufw mm5, [%2 + 4*2], 0x1B
+ movq mm0, [%2 + 0*2]
+ movq mm1, mm0
+ paddsw mm0, mm5
+ psubsw mm1, mm5
+ pshufw mm2, mm0, 0x4E
+ pshufw mm3, mm1, 0x4E
+ movq mm4, [%3 + 0*2]
+ movq mm6, [%3 + 4*2]
+ movq mm5, [%3 + 16*2]
+ movq mm7, [%3 + 20*2]
+ pmaddwd mm4, mm0
+ pmaddwd mm5, mm1
+ pmaddwd mm6, mm2
+ pmaddwd mm7, mm3
+ pmaddwd mm0, [%3 + 8*2]
+ pmaddwd mm2, [%3 + 12*2]
+ pmaddwd mm1, [%3 + 24*2]
+ pmaddwd mm3, [%3 + 28*2]
+ paddd mm4, mm6
+ paddd mm5, mm7
+ paddd mm0, mm2
+ paddd mm1, mm3
+ movq mm7, [fdct_r_row]
+ paddd mm4, mm7
+ paddd mm5, mm7
+ paddd mm0, mm7
+ paddd mm1, mm7
+ psrad mm4, SHIFT_FRW_ROW
+ psrad mm5, SHIFT_FRW_ROW
+ psrad mm0, SHIFT_FRW_ROW
+ psrad mm1, SHIFT_FRW_ROW
+ packssdw mm4, mm0
+ packssdw mm5, mm1
+ movq mm2, mm4
+ punpcklwd mm4, mm5
+ punpckhwd mm2, mm5
+ movq [%1 + 0*2], mm4
+ movq [%1 + 4*2], mm2
 %endmacro
 %macro MAKE_FDCT_FUNC 2
@@ -362,33 +362,33 @@
 cglobal %1
 %1:
 ;; Move the destination/source address to the eax register
- mov eax, [esp + 4]
+ mov eax, [esp + 4]
 ;; Process the columns (4 at a time)
- FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
- FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
+ FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
+ FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
 %ifdef UNROLLED_LOOP ; Unrolled loop version
 %assign i 0
 %rep 8
 ;; Process the 'i'th row
- %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
+ %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
 %assign i i+1
 %endrep
 %else
- mov ecx, 8
- mov edx, tab_frw_01234567
+ mov ecx, 8
+ mov edx, tab_frw_01234567
 ALIGN 8
 .loop
- %2 eax, eax, edx
- add eax, 2*8
- add edx, 2*32
- dec ecx
- jne .loop
+ %2 eax, eax, edx
+ add eax, 2*8
+ add edx, 2*32
+ dec ecx
+ jne .loop
 %endif
-
- ret
+
+ ret
 %endmacro
 ;=============================================================================