;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - MPEG4 Quantization H263 implementation / MMX optimized -
; *
; *  Copyright(C) 2001-2003 Peter Ross <pross@xvid.org>
; *               2002-2003 Pascal Massimino <skal@planet-d.net>
; *		  2004 Andre Werthmann <wertmann@aei.mpg.de>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: quantize_h263_mmx.asm,v 1.1 2005-01-05 23:02:15 edgomez Exp $
; *
; ****************************************************************************/

; enable dequant saturate [-2048,2047], test purposes only.
%define SATURATE

BITS 64

%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function %1.endfunc-%1
			%define %1 _%1:function %1.endfunc-%1
		%else
			global _%1
			%define %1 _%1
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

;=============================================================================
; Read only Local data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

ALIGN 16
plus_one:
	times 8 dw 1

;-----------------------------------------------------------------------------
;
; subtract by Q/2 table
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_sub:
%assign quant 1
%rep 31
	times 4 dw  quant / 2
	%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated seperately
; (3dnow2 provides _pmulhuw_ which wont cause overflow)
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_div:
%assign quant 1
%rep 31
	times 4 dw  (1<<16) / (quant*2) + 1
	%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
;
; add by (odd(Q) ? Q : Q - 1) table
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_add:
%assign quant 1
%rep 31
	%if quant % 2 != 0
	times 4 dw  quant
	%else
	times 4 dw quant - 1
	%endif
	%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
;
; multiple by 2Q table
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_mul:
%assign quant 1
%rep 31
	times 4 dw  quant*2
	%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
;
; saturation limits
;
;-----------------------------------------------------------------------------

ALIGN 16
sse2_2047:
	times 8 dw 2047

ALIGN 16
mmx_2047:
	times 4 dw 2047

ALIGN 8
mmx_32768_minus_2048:
	times 4 dw (32768-2048)

mmx_32767_minus_2047:
	times 4 dw (32767-2047)


;=============================================================================
; Code
;=============================================================================

SECTION .text align=16

cglobal quant_h263_intra_x86_64
cglobal quant_h263_inter_x86_64
cglobal dequant_h263_intra_x86_64
cglobal dequant_h263_inter_x86_64

;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_x86_64(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint32_t dcscalar,
;                               const uint16_t *mpeg_matrices);
; Port of the 32bit mmx cousin
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_intra_x86_64:
  mov rax, rdx			; quant
			; rsi is data
			; rdi is coeff
  mov r8, rcx			; save dscalar

  xor rcx, rcx
  cmp rax, 1
  jz .q1loop

  lea r9, [mmx_div wrt rip]
  movq mm7, [r9 + rax * 8 - 8]

ALIGN 16
.loop
  movq mm0, [rsi + 8*rcx]           ; mm0 = [1st]
  movq mm3, [rsi + 8*rcx + 8]
  pxor mm1, mm1                     ; mm1 = 0
  pxor mm4, mm4                     ;
  pcmpgtw mm1, mm0                  ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3                  ;
  pxor mm0, mm1                     ; mm0 = |mm0|
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; displace
  psubw mm3, mm4                    ;
  pmulhw mm0, mm7                   ; mm0 = (mm0 / 2Q) >> 16
  pmulhw mm3, mm7                   ;
  pxor mm0, mm1                     ; mm0 *= sign(mm0)
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; undisplace
  psubw mm3, mm4                    ;
  movq [rdi + 8*rcx], mm0
  movq [rdi + 8*rcx + 8], mm3

  add rcx, 2
  cmp rcx, 16
  jnz .loop

.done

    ; caclulate  data[0] // (int32_t)dcscalar)
  mov rcx, r8				; dscalar
  mov rdx, rcx
  movsx eax, word [rsi]			; data[0] with sign extend
  shr rdx, 1                    ; edx = dcscalar /2
  cmp eax, 0 
  jg .gtzero

  sub rax, rdx
  jmp short .mul

.gtzero
  add rax, rdx
.mul
  cdq ; expand eax -> edx:eax
  idiv ecx		; eax = edx:eax / dcscalar
  mov [rdi], ax		; coeff[0] = ax

  xor rax, rax      ; return(0);

  ret

ALIGN 16
.q1loop
  movq mm0, [rsi + 8*rcx]           ; mm0 = [1st]
  movq mm3, [rsi + 8*rcx + 8]
  pxor mm1, mm1                     ; mm1 = 0
  pxor mm4, mm4                     ;
  pcmpgtw mm1, mm0                  ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3                  ;
  pxor mm0, mm1                     ; mm0 = |mm0|
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; displace
  psubw mm3, mm4                    ;
  psrlw mm0, 1                      ; mm0 >>= 1   (/2)
  psrlw mm3, 1                      ;
  pxor mm0, mm1                     ; mm0 *= sign(mm0)
  pxor mm3, mm4
  psubw mm0, mm1                    ; undisplace
  psubw mm3, mm4                    ;
  movq [rdi + 8*rcx], mm0
  movq [rdi + 8*rcx + 8], mm3

  add rcx, 2
  cmp rcx, 16
  jnz .q1loop

  jmp .done
.endfunc


;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_x86_64(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint16_t *mpeg_matrices);
; Port of the 32bit mmx cousin
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_inter_x86_64:
  mov rax, rdx				; quant
  			; rsi is data
			; rdi is coeff

  xor rcx, rcx

  pxor mm5, mm5                     ; sum
  lea r9, [mmx_sub wrt rip]
  movq mm6, [r9 + rax * 8 - 8] ; sub

  cmp rax, 1
  jz .q1loop

  lea r9, [mmx_div wrt rip]
  movq mm7, [r9 + rax * 8 - 8] ; divider

ALIGN 8
.loop
  movq mm0, [rsi + 8*rcx]           ; mm0 = [1st]
  movq mm3, [rsi + 8*rcx + 8]
  pxor mm1, mm1                     ; mm1 = 0
  pxor mm4, mm4                     ;
  pcmpgtw mm1, mm0                  ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3                  ;
  pxor mm0, mm1                     ; mm0 = |mm0|
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; displace
  psubw mm3, mm4                    ;
  psubusw mm0, mm6                  ; mm0 -= sub (unsigned, dont go < 0)
  psubusw mm3, mm6                  ;
  pmulhw mm0, mm7                   ; mm0 = (mm0 / 2Q) >> 16
  pmulhw mm3, mm7                   ;
  paddw mm5, mm0                    ; sum += mm0
  pxor mm0, mm1                     ; mm0 *= sign(mm0)
  paddw mm5, mm3                    ;
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; undisplace
  psubw mm3, mm4
  movq [rdi + 8*rcx], mm0
  movq [rdi + 8*rcx + 8], mm3

  add rcx, 2
  cmp rcx, 16
  jnz .loop

.done
  pmaddwd mm5, [plus_one wrt rip]
  movq mm0, mm5
  psrlq mm5, 32
  paddd mm0, mm5

  movd rax, mm0     ; return sum

  ret

ALIGN 8
.q1loop
  movq mm0, [rsi + 8*rcx]           ; mm0 = [1st]
  movq mm3, [rsi + 8*rcx+ 8]        ;
  pxor mm1, mm1                     ; mm1 = 0
  pxor mm4, mm4                     ;
  pcmpgtw mm1, mm0                  ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3                  ;
  pxor mm0, mm1                     ; mm0 = |mm0|
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; displace
  psubw mm3, mm4                    ;
  psubusw mm0, mm6                  ; mm0 -= sub (unsigned, dont go < 0)
  psubusw mm3, mm6                  ;
  psrlw mm0, 1                      ; mm0 >>= 1   (/2)
  psrlw mm3, 1                      ;
  paddw mm5, mm0                    ; sum += mm0
  pxor mm0, mm1                     ; mm0 *= sign(mm0)
  paddw mm5, mm3                    ;
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; undisplace
  psubw mm3, mm4
  movq [rdi + 8*rcx], mm0
  movq [rdi + 8*rcx + 8], mm3

  add rcx, 2
  cmp rcx, 16
  jnz .q1loop

  jmp .done
.endfunc


;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_x86_64(int16_t *data,
;                                 const int16_t const *coeff,
;                                 const uint32_t quant,
;                                 const uint32_t dcscalar,
;                                 const uint16_t *mpeg_matrices);
; port of the 32bit xmm cousin
;-----------------------------------------------------------------------------

  ; this is the same as dequant_inter_mmx, except that we're
  ; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)

ALIGN 16
dequant_h263_intra_x86_64:

  mov rax, rdx				; quant
  mov [rsp-8], rcx			; save dscalar
  mov rcx, rsi				; coeff
  mov rdx, rdi				; data

  lea r9, [mmx_add wrt rip]
  movq mm6, [r9 + rax*8 - 8]   ; quant or quant-1
  lea r9, [mmx_mul wrt rip]
  movq mm7, [r9 + rax*8 - 8]   ; 2*quant
  mov rax, -16

ALIGN 16
.loop
  movq mm0, [rcx+8*rax+8*16]        ; c  = coeff[i]
  movq mm3, [rcx+8*rax+8*16 + 8]    ; c' = coeff[i+1]
  pxor mm1, mm1
  pxor mm4, mm4
  pcmpgtw mm1, mm0                  ; sign(c)
  pcmpgtw mm4, mm3                  ; sign(c')
  pxor mm2, mm2
  pxor mm5, mm5
  pcmpeqw mm2, mm0                  ; c is zero
  pcmpeqw mm5, mm3                  ; c' is zero
  pandn mm2, mm6                    ; offset = isZero ? 0 : quant_add
  pandn mm5, mm6
  pxor mm0, mm1                     ; negate if negative
  pxor mm3, mm4                     ; negate if negative
  psubw mm0, mm1
  psubw mm3, mm4
  pmullw mm0, mm7                   ; *= 2Q
  pmullw mm3, mm7                   ; *= 2Q
  paddw mm0, mm2                    ; + offset
  paddw mm3, mm5                    ; + offset
  paddw mm0, mm1                    ; negate back
  paddw mm3, mm4                    ; negate back

   ; saturates to +2047
  movq mm2, [mmx_2047 wrt rip]
  pminsw mm0, mm2
  add rax, 2
  pminsw mm3, mm2

  pxor mm0, mm1
  pxor mm3, mm4
  movq [rdx + 8*rax + 8*16   - 2*8], mm0
  movq [rdx + 8*rax + 8*16+8 - 2*8], mm3
  jnz near .loop

    ; deal with DC
  movd mm0, [rcx]
  pmullw mm0, [rsp-8]		; dscalar
  movq mm2, [mmx_32767_minus_2047 wrt rip]
  paddsw mm0, mm2
  psubsw mm0, mm2
  movq mm2, [mmx_32768_minus_2048 wrt rip]
  psubsw mm0, mm2
  paddsw mm0, mm2
  movd rax, mm0
  mov [rdx], ax

  xor rax, rax
  ret
.endfunc


;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_x86_64(int16_t * data,
;                                 const int16_t * const coeff,
;                                 const uint32_t quant,
;                                 const uint16_t *mpeg_matrices);
; Port of the 32bit xmm cousin
;-----------------------------------------------------------------------------

  ; this is the same as dequant_inter_mmx,
  ; except that we're saturating using 'pminsw' (saves 2 cycles/loop)

ALIGN 16
dequant_h263_inter_x86_64:

  mov rax, rdx			; quant
  mov rcx, rsi			; coeff
  mov rdx, rdi			; data

  lea r9, [mmx_add wrt rip]
  movq mm6, [r9 + rax*8 - 8]  ; quant or quant-1
  lea r9, [mmx_mul wrt rip]
  movq mm7, [r9 + rax*8 - 8]  ; 2*quant
  mov rax, -16

ALIGN 16
.loop
  movq mm0, [rcx+8*rax+8*16]      ; c  = coeff[i]
  movq mm3, [rcx+8*rax+8*16 + 8]  ; c' = coeff[i+1]
  pxor mm1, mm1
  pxor mm4, mm4
  pcmpgtw mm1, mm0  ; sign(c)
  pcmpgtw mm4, mm3  ; sign(c')
  pxor mm2, mm2
  pxor mm5, mm5
  pcmpeqw mm2, mm0  ; c is zero
  pcmpeqw mm5, mm3  ; c' is zero
  pandn mm2, mm6    ; offset = isZero ? 0 : quant_add
  pandn mm5, mm6
  pxor mm0, mm1     ; negate if negative
  pxor mm3, mm4     ; negate if negative
  psubw mm0, mm1
  psubw mm3, mm4
  pmullw mm0, mm7   ; *= 2Q
  pmullw mm3, mm7   ; *= 2Q
  paddw mm0, mm2    ; + offset
  paddw mm3, mm5    ; + offset
  paddw mm0, mm1    ; start restoring sign
  paddw mm3, mm4    ; start restoring sign
                            ; saturates to +2047
  movq mm2, [mmx_2047 wrt rip]
  pminsw mm0, mm2
  add rax, 2
  pminsw mm3, mm2

  pxor mm0, mm1 ; finish restoring sign
  pxor mm3, mm4 ; finish restoring sign
  movq [rdx + 8*rax + 8*16   - 2*8], mm0
  movq [rdx + 8*rax + 8*16+8 - 2*8], mm3
  jnz near .loop

  xor rax, rax
  ret
.endfunc