;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - MMX and XMM forward discrete cosine transform -
; *
; *  Copyright(C) 2003 Edouard Gomez <ed.gomez@free.fr>
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: fdct_mmx_ffmpeg.asm,v 1.1.2.2 2003-10-28 22:23:03 edgomez Exp $
; *
; ***************************************************************************/

;/****************************************************************************
; *
; *  Initial, but incomplete version provided by Intel at AppNote AP-922
; *    http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
; *  Copyright (C) 1999 Intel Corporation
; *
; *  Completed and corrected in fdctmm32.c/fdctmm32.doc
; *    http://members.tripod.com/~liaor/
; *  Copyright (C) 2000 - Royce Shih-Wea Liao <liaor@iname.com>
; *
; *  Minimizing coefficients reordering changing the tables constants order
; *    http://ffmpeg.sourceforge.net/
; *  Copyright (C) 2001 Fabrice Bellard.
; *
; *  The version coded here is just a port to NASM syntax from the FFMPEG's
; *  version. So all credits go to the previous authors for all their
; *  respective work in order to have a nice/fast mmx fDCT.
; ***************************************************************************/

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%macro cglobal 1
	%ifdef PREFIX
		global _%1
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

;;; Define this if you want an unrolled version of the code
%define UNROLLED_LOOP

%define BITS_FRW_ACC   3
%define SHIFT_FRW_COL  BITS_FRW_ACC
%define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17)
%define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
%define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))

;=============================================================================
; Local Data (Read Only)
;=============================================================================

SECTION .rodata

ALIGN 8
tab_frw_01234567:
  dw  16384,   16384,   -8867,  -21407
  dw  16384,   16384,   21407,    8867
  dw  16384,  -16384,   21407,   -8867
  dw -16384,   16384,    8867,  -21407
  dw  22725,   19266,  -22725,  -12873
  dw  12873,    4520,   19266,   -4520
  dw  12873,  -22725,   19266,  -22725
  dw   4520,   19266,    4520,  -12873

  dw  22725,   22725,  -12299,  -29692
  dw  22725,   22725,   29692,   12299
  dw  22725,  -22725,   29692,  -12299
  dw -22725,   22725,   12299,  -29692
  dw  31521,   26722,  -31521,  -17855
  dw  17855,    6270,   26722,   -6270
  dw  17855,  -31521,   26722,  -31521
  dw   6270,   26722,    6270,  -17855

  dw  21407,   21407,  -11585,  -27969
  dw  21407,   21407,   27969,   11585
  dw  21407,  -21407,   27969,  -11585
  dw -21407,   21407,   11585,  -27969
  dw  29692,   25172,  -29692,  -16819
  dw  16819,    5906,   25172,   -5906
  dw  16819,  -29692,   25172,  -29692
  dw   5906,   25172,    5906,  -16819

  dw  19266,   19266,  -10426,  -25172
  dw  19266,   19266,   25172,   10426
  dw  19266,  -19266,   25172,  -10426
  dw -19266,   19266,   10426,  -25172
  dw  26722,   22654,  -26722,  -15137
  dw  15137,    5315,   22654,   -5315
  dw  15137,  -26722,   22654,  -26722
  dw   5315,   22654,    5315,  -15137

  dw  16384,   16384,   -8867,  -21407
  dw  16384,   16384,   21407,    8867
  dw  16384,  -16384,   21407,   -8867
  dw -16384,   16384,    8867,  -21407
  dw  22725,   19266,  -22725,  -12873
  dw  12873,    4520,   19266,   -4520
  dw  12873,  -22725,   19266,  -22725
  dw   4520,   19266,    4520,  -12873

  dw  19266,   19266,  -10426,  -25172
  dw  19266,   19266,   25172,   10426
  dw  19266,  -19266,   25172,  -10426
  dw -19266,   19266,   10426,  -25172
  dw  26722,   22654,  -26722,  -15137
  dw  15137,    5315,   22654,   -5315
  dw  15137,  -26722,   22654,  -26722
  dw   5315,   22654,    5315,  -15137

  dw  21407,   21407,  -11585,  -27969
  dw  21407,   21407,   27969,   11585
  dw  21407,  -21407,   27969,  -11585
  dw -21407,   21407,   11585,  -27969
  dw  29692,   25172,  -29692,  -16819
  dw  16819,    5906,   25172,   -5906
  dw  16819,  -29692,   25172,  -29692
  dw   5906,   25172,    5906,  -16819,

  dw  22725,   22725,  -12299,  -29692
  dw  22725,   22725,   29692,   12299
  dw  22725,  -22725,   29692,  -12299
  dw -22725,   22725,   12299,  -29692
  dw  31521,   26722,  -31521,  -17855
  dw  17855,    6270,   26722,   -6270
  dw  17855,  -31521,   26722,  -31521
  dw   6270,   26722,    6270,  -17855

ALIGN 8
fdct_one_corr:
  dw 1, 1, 1, 1

ALIGN 8
fdct_tg_all_16:
  dw  13036,	13036,	13036,	13036
  dw  27146,	27146,	27146,	27146
  dw -21746, -21746, -21746, -21746

ALIGN 8
cos_4_16:
  dw -19195, -19195, -19195, -19195

ALIGN 8
ocos_4_16:
  dw 23170, 23170, 23170, 23170

ALIGN 8
fdct_r_row:
  dd RND_FRW_ROW, RND_FRW_ROW

;=============================================================================
; Factorized parts of the code turned into macros for better understanding
;=============================================================================

	;; Macro for column DCT
	;; FDCT_COLUMN_MMX(int16_t *out, const int16_t *in, int offset);
	;;  - out, register name holding the out address
	;;  - in, register name holding the in address
	;;  - column number to process
%macro FDCT_COLUMN_COMMON 3
  movq mm0, [%2 + %3*2 + 1*16]
  movq mm1, [%2 + %3*2 + 6*16]
  movq mm2, mm0
  movq mm3, [%2 + %3*2 + 2*16]
  paddsw mm0, mm1
  movq mm4, [%2 + %3*2 + 5*16]
  psllw mm0, SHIFT_FRW_COL
  movq mm5, [%2 + %3*2 + 0*16]
  paddsw mm4, mm3
  paddsw mm5, [%2 + %3*2 + 7*16]
  psllw mm4, SHIFT_FRW_COL
  movq mm6, mm0
  psubsw mm2, mm1
  movq mm1, [fdct_tg_all_16 + 4*2]
  psubsw mm0, mm4
  movq mm7, [%2 + %3*2 + 3*16]
  pmulhw mm1, mm0
  paddsw mm7, [%2 + %3*2 + 4*16]
  psllw mm5, SHIFT_FRW_COL
  paddsw mm6, mm4
  psllw mm7, SHIFT_FRW_COL
  movq mm4, mm5
  psubsw mm5, mm7
  paddsw mm1, mm5
  paddsw mm4, mm7
  por mm1, [fdct_one_corr]
  psllw mm2, SHIFT_FRW_COL + 1
  pmulhw mm5, [fdct_tg_all_16 + 4*2]
  movq mm7, mm4
  psubsw mm3, [%2 + %3*2 + 5*16]
  psubsw mm4, mm6
  movq [%1 + %3*2 + 2*16], mm1
  paddsw mm7, mm6
  movq mm1, [%2 + %3*2 + 3*16]
  psllw mm3, SHIFT_FRW_COL + 1
  psubsw mm1, [%2 + %3*2 + 4*16]
  movq mm6, mm2
  movq [%1 + %3*2 + 4*16], mm4
  paddsw mm2, mm3
  pmulhw mm2, [ocos_4_16]
  psubsw mm6, mm3
  pmulhw mm6, [ocos_4_16]
  psubsw mm5, mm0
  por mm5, [fdct_one_corr]
  psllw mm1, SHIFT_FRW_COL
  por mm2, [fdct_one_corr]
  movq mm4, mm1
  movq mm3, [%2 + %3*2 + 0*16]
  paddsw mm1, mm6
  psubsw mm3, [%2 + %3*2 + 7*16]
  psubsw mm4, mm6
  movq mm0, [fdct_tg_all_16 + 0*2]
  psllw mm3, SHIFT_FRW_COL
  movq mm6, [fdct_tg_all_16 + 8*2]
  pmulhw mm0, mm1
  movq [%1 + %3*2 + 0*16], mm7
  pmulhw mm6, mm4
  movq [%1 + %3*2 + 6*16], mm5
  movq mm7, mm3
  movq mm5, [fdct_tg_all_16 + 8*2]
  psubsw mm7, mm2
  paddsw mm3, mm2
  pmulhw mm5, mm7
  paddsw mm0, mm3
  paddsw mm6, mm4
  pmulhw mm3, [fdct_tg_all_16 + 0*2]
  por mm0, [fdct_one_corr]
  paddsw mm5, mm7
  psubsw mm7, mm6
  movq [%1 + %3*2 + 1*16], mm0
  paddsw mm5, mm4
  movq [%1 + %3*2 + 3*16], mm7
  psubsw mm3, mm1
  movq [%1 + %3*2 + 5*16], mm5
  movq [%1 + %3*2 + 7*16], mm3
%endmacro

	;; Macro for row DCT using MMX punpcklw instructions
	;; FDCT_ROW_MMX(int16_t *out, const int16_t *in, const int16_t *table);
	;;  - out, register name holding the out address
	;;  - in, register name holding the in address
	;;  - table coefficients address (register or absolute)
%macro FDCT_ROW_MMX 3
  movd mm1, [%2 + 6*2]
  punpcklwd mm1, [%2 + 4*2]
  movq mm2, mm1
  psrlq mm1, 0x20
  movq mm0, [%2 + 0*2]
  punpcklwd mm1, mm2
  movq mm5, mm0
  paddsw mm0, mm1
  psubsw mm5, mm1
  movq mm1, mm0
  movq mm6, mm5
  punpckldq mm3, mm5
  punpckhdq mm6, mm3
  movq mm3, [%3 + 0*2]
  movq mm4, [%3 + 4*2]
  punpckldq mm2, mm0
  pmaddwd mm3, mm0
  punpckhdq mm1, mm2
  movq mm2, [%3 + 16*2]
  pmaddwd mm4, mm1
  pmaddwd mm0, [%3 + 8*2]
  movq mm7, [%3 + 20*2]
  pmaddwd mm2, mm5
  paddd mm3, [fdct_r_row]
  pmaddwd mm7, mm6
  pmaddwd mm1, [%3 + 12*2]
  paddd mm3, mm4
  pmaddwd mm5, [%3 + 24*2]
  pmaddwd mm6, [%3 + 28*2]
  paddd mm2, mm7
  paddd mm0, [fdct_r_row]
  psrad mm3, SHIFT_FRW_ROW
  paddd mm2, [fdct_r_row]
  paddd mm0, mm1
  paddd mm5, [fdct_r_row]
  psrad mm2, SHIFT_FRW_ROW
  paddd mm5, mm6
  psrad mm0, SHIFT_FRW_ROW
  psrad mm5, SHIFT_FRW_ROW
  packssdw mm3, mm0
  packssdw mm2, mm5
  movq mm6, mm3
  punpcklwd mm3, mm2
  punpckhwd mm6, mm2
  movq [%1 + 0*2], mm3
  movq [%1 + 4*2], mm6
%endmacro

	;; Macro for column DCT using XMM instuction pshufw
	;; FDCT_ROW_XMM(int16_t *out, const int16_t *in, const int16_t *table);
	;;  - out, register name holding the out address
	;;  - in, register name holding the in address
	;;  - table coefficient address
%macro FDCT_ROW_XMM 3
	;; fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
  pshufw mm5, [%2 + 4*2], 0x1B
  movq mm0, [%2 + 0*2]
  movq mm1, mm0
  paddsw mm0, mm5
  psubsw mm1, mm5
  pshufw mm2, mm0, 0x4E
  pshufw mm3, mm1, 0x4E
  movq mm4, [%3 +  0*2]
  movq mm6, [%3 +  4*2]
  movq mm5, [%3 + 16*2]
  movq mm7, [%3 + 20*2]
  pmaddwd mm4, mm0
  pmaddwd mm5, mm1
  pmaddwd mm6, mm2
  pmaddwd mm7, mm3
  pmaddwd mm0, [%3 +  8*2]
  pmaddwd mm2, [%3 + 12*2]
  pmaddwd mm1, [%3 + 24*2]
  pmaddwd mm3, [%3 + 28*2]
  paddd mm4, mm6
  paddd mm5, mm7
  paddd mm0, mm2
  paddd mm1, mm3
  movq mm7, [fdct_r_row]
  paddd mm4, mm7
  paddd mm5, mm7
  paddd mm0, mm7
  paddd mm1, mm7
  psrad mm4, SHIFT_FRW_ROW
  psrad mm5, SHIFT_FRW_ROW
  psrad mm0, SHIFT_FRW_ROW
  psrad mm1, SHIFT_FRW_ROW
  packssdw mm4, mm0
  packssdw mm5, mm1
  movq mm2, mm4
  punpcklwd mm4, mm5
  punpckhwd mm2, mm5
  movq [%1 + 0*2], mm4
  movq [%1 + 4*2], mm2
%endmacro

%macro MAKE_FDCT_FUNC 2
ALIGN 16
cglobal %1
%1:
	;; Move the destination/source address to the eax register
  mov eax, [esp + 4]

	;; Process the columns (4 at a time)
  FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
  FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7

%ifdef UNROLLED_LOOP
	; Unrolled loop version
%assign i 0
%rep 8
	;; Process the 'i'th row
  %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
	%assign i i+1
%endrep
%else
  mov ecx, 8
  mov edx, tab_frw_01234567
ALIGN 8
.loop
  %2 eax, eax, edx
  add eax, 2*8
  add edx, 2*32
  dec ecx
  jne .loop
%endif

  ret
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

;-----------------------------------------------------------------------------
; void fdct_mmx_ffmpeg(int16_t block[64]);
;-----------------------------------------------------------------------------

MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW_MMX

;-----------------------------------------------------------------------------
; void fdct_xmm_ffmpeg(int16_t block[64]);
;-----------------------------------------------------------------------------

MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM