; [svn] / branches / dev-api-4 / xvidcore / src / quant / x86_asm / quantize4_xmm.asm Repository:
; ViewVC logotype
;
; View of /branches/dev-api-4/xvidcore/src/quant/x86_asm/quantize4_xmm.asm
;
; Parent Directory Parent Directory | Revision Log Revision Log
;
;
; Revision 1089 - (download) (annotate)
; Wed Jul 16 23:00:08 2003 UTC (20 years, 8 months ago) by edgomez
; File size: 21127 byte(s)
; Fixed quant4_intra_xmm and quant_intra_3dne bug for DC<0.
;/**************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - mmx quantization/dequantization -
; *
; *  Copyright(C) 2001-2003 XviD Team <xvid-devel@xvid.org> 
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: quantize4_xmm.asm,v 1.2.2.1 2003-07-16 22:59:15 edgomez Exp $
; *
; *************************************************************************/
;/**************************************************************************
; *   quant4 bugs have been fixed: (a) overflow bug for matrix elements
; *   equal to 1 or 2 is fixed by substituting pmulhw with pmulhuw (iSSE)
; *   and using multiplier 0ffffh instead of 10001h (for matrix element = 1;
; *   in that case, 1 is added before multiplying, that additional 1 comes
; *   from intra_matrix1; (b) rounding error for large coefficients and matrix
; *   elements is fixed by two-step approach: first approximation (rounded
; *   down) is found as usual; the result is multiplied by the matrix element
; *   and mismatch is used to calculate the correction.
; *************************************************************************/
; _3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines
;
;---------------------------------------------------------------------------
; 09.12.2002  Athlon optimizations contributed by Jaan Kalda 
;---------------------------------------------------------------------------


; data/text alignment														   
; ALIGN is the byte alignment requested for the tables and the code entry
; points / loop heads further down. SATURATE is only defined here; nothing in
; the visible part of this file tests it.
%define ALIGN 8
%define SATURATE

bits 32

; The data section is declared with an explicit 8-byte alignment unless
; FORMAT_COFF is defined, in which case the align= qualifier is left out.
%ifndef FORMAT_COFF
SECTION .data data align=8
%else
SECTION .data data
%endif

; cglobal <name>
; Exports <name>. When PREFIX is defined the exported symbol is _<name> and
; every later use of <name> in this file is redirected to _<name>.
%macro cglobal 1
	%ifndef PREFIX
		global %1
	%else
		global _%1
		%define %1 _%1
	%endif
%endmacro

; cextern <name>
; Imports <name>. When PREFIX is defined the imported symbol is _<name> and
; every later use of <name> in this file is redirected to _<name>.
%macro cextern 1
	%ifndef PREFIX
		extern %1
	%else
		extern _%1
		%define %1 _%1
	%endif
%endmacro
; Eight zero bytes. The routines below keep the address of this qword in a
; register and reload mm0/mm3 (or mm1/mm3) from it to clear them.
align 8
mmzero:
	dd 0, 0

; Four words equal to 1, used as the pmaddwd operand that folds the four
; word lanes of the running sum in quant4_inter_xmm.
mmx_one:
	times 4 dw 1

;===========================================================================
;
; divide by 2Q table 
;
;===========================================================================

; Reciprocal tables: one qword (four identical words) per quantizer 1..31,
; addressed by the code as [table + quant*8 - 8].
;
; mmx_divs[q] = (1 << 15) / q + 1
;   Read by the .loop variants of the quant4 routines, which are reached for
;   quant 2..19; a single pmulhuw with this word approximates x / (2*q).
align ALIGN
mmx_divs:
%assign i 1
%rep 31
	times 4 dw ((1 << 15) / i + 1)
	%assign i i+1
%endrep

; mmx_div[q] = (1 << 16) / q + 1
;   Read by the .lloop variants only (quant > 19); pmulhuw with this word is
;   followed there by a right shift of 1 to reach x / (2*q).
;   For q == 1 the expression is 65537, which does not fit in a word. The
;   value is masked explicitly so the assembler no longer has to truncate it
;   (and warn); the stored word is 1, exactly what truncation produced before.
;   That entry is not a usable reciprocal, but it is never read because
;   quant == 1 is handled by the .q1loop variants.
align ALIGN
mmx_div:
%assign i 1
%rep 31
	times 4 dw (((1 << 16) / i + 1) & 0FFFFh)
	%assign i i+1
%endrep


;===========================================================================
;
; intra matrix 
;
;===========================================================================

; FIXX <d>: emits one word holding (1 << 16) / d + 1.
; The macro is not invoked in the visible part of this file.
%macro FIXX 1
	dw ((1 << 16) / (%1)) + 1
%endmacro

; Intra quantization matrix and its derived tables; all four are defined
; in another module and indexed here with the same byte offsets as the
; coefficient block.
cextern intra_matrix
cextern intra_matrix1
cextern intra_matrix_fix
cextern intra_matrix_fixl

;===========================================================================
;
; inter matrix
;
;===========================================================================

; Inter counterparts of the four tables above.
cextern inter_matrix
cextern inter_matrix1
cextern inter_matrix_fix
cextern inter_matrix_fixl


; VM18P / VM18Q: numerator and denominator of the fraction used to build the
; quantd table below (quantd[q] = (VM18P*q + VM18Q/2) / VM18Q).
%define VM18P	3
%define VM18Q	4
; Multi-byte fillers that change no architectural state other than flags
; (nop3 and nop6 are "add reg, 0" and therefore do rewrite EFLAGS).
; nop4: bytes 8D 74 26 00 = lea esi,[esi+0] with an explicit SIB and disp8.
%define nop4	db	08Dh,074h,026h,0
; nop3: add esp,0 with a byte immediate.
%define nop3	add	esp,byte 0
; nop2: register-to-itself move.
%define nop2	mov	esp,esp
; nop7: bytes 8D 2C 2D 00 00 00 00 = lea ebp,[ebp*1+0] with a 32-bit displacement.
%define nop7	db	08dh,02ch,02dh,0,0,0,0
; nop6: add ebp,0 with a dword immediate.
%define nop6	add	ebp,dword 0

;===========================================================================
;
; quantd table 
;
;===========================================================================


; quantd[q] for q = 1..31, four identical words per entry, addressed as
; [quantd + quant*8 - 8]. Each word is (VM18P*q + VM18Q/2) / VM18Q, the
; rounding term the intra quantizer adds before the division by 2*quant.
quantd:
%assign i 1
%rep 31
	times 4 dw ((VM18P * i + VM18Q / 2) / VM18Q)
%assign i i+1
%endrep

;===========================================================================
;
; multiply-by-quant table (each word holds the quantizer itself; the
; dequantizers shift it left after loading)
;
;===========================================================================


; mmx_mul_quant[q] for q = 1..31: four words equal to q, addressed as
; [mmx_mul_quant + quant*8 - 8] by the dequant routines.
mmx_mul_quant:
%assign i 1
%rep 31
	times 4 dw i
%assign i i+1
%endrep

;===========================================================================
;
; saturation limits 
;
;===========================================================================

; Word-replicated saturation constants. None of them is referenced by the
; routines visible in this file.
align 16

mmx_32767_minus_2047:
	times 4 dw (32767-2047)
mmx_32768_minus_2048:
	times 4 dw (32768-2048)
mmx_2047:
	times 4 dw 2047
mmx_minus_2048:
	times 4 dw (-2048)
zero:
	times 4 dw 0

; int_div[d] = (1 << 17) / d + 1 for d = 1..255, with a zero placeholder at
; index 0. quant4_intra_xmm multiplies by int_div[dcscalar] and shifts right
; by 17 instead of executing a division.
int_div:
	dd 0
%assign i 1
%rep 255
	dd ((1 << 17) / i) + 1
%assign i i+1
%endrep

section .text

;===========================================================================
;
; void quant4_intra_xmm(int16_t * coeff, 
;					const int16_t const * data,
;					const uint32_t quant,
;					const uint32_t dcscalar);
;
;===========================================================================

align ALIGN
cglobal quant4_intra_xmm
;---------------------------------------------------------------------------
; Quantizes the 64 words at data into coeff using the intra matrix tables,
; then overwrites coeff[0] with data[0] divided by dcscalar.
;
; Arguments are read from the stack (offsets relative to esp at entry):
;   [esp+4] coeff (written), [esp+8] data (read), [esp+12] quant,
;   [esp+16] dcscalar.
; Register roles inside the loops:
;   eax = data, edx = coeff, ecx = quant
;   esi = loop counter, -14..0 in steps of 2; every access uses
;         8*esi+112 / 8*esi+120, so eight iterations cover bytes 0..127
;   edi = address of mmzero, used to clear mm0/mm3 for the next iteration
;   mm1/mm4 = sign masks of the eight words being processed
; Three loop variants share the same front end and differ only in the final
; division by 2*quant:
;   quant == 1  -> .q1loop (plain shift right by 1)
;   quant > 19  -> .lloop  (mmx_div reciprocal, then shift right by 1)
;   otherwise   -> .loop   (mmx_divs reciprocal)
; esi, edi and ebx are pushed on entry and restored in .done. No emms is
; issued here.
;---------------------------------------------------------------------------
quant4_intra_xmm
	mov	eax, [esp  + 8]		; eax = data
	mov	ecx, [esp  + 12]	; ecx = quant
	mov	edx, [esp  + 4]		; edx = coeff
	push	esi
	push	edi
	push	ebx
	nop
	mov	edi,mmzero		; edi -> qword of zeros
	mov	esi,-14			; loop counter, see header
	pxor	mm0,mm0			; mm0/mm3 must be zero when a loop body starts
	pxor	mm3,mm3
	cmp	ecx,byte 1
	je	near .q1loop		; quant == 1
	cmp	ecx,byte 19
	jg	near .lloop		; quant > 19
	nop6				; filler before the aligned loop head

		 
align ALIGN
; --- quant in 2..19 --------------------------------------------------------
.loop
	movq	mm1, [eax + 8*esi+112]		; mm1 = four source words
	psubw	mm0, mm1			; mm0 = 0 - src
	movq	mm4, [eax + 8*esi + 120]	; mm4 = the next four source words
	psubw	mm3, mm4			; mm3 = 0 - src'
	pmaxsw	mm0, mm1			; mm0 = max(-src, src) = |src|
	pmaxsw	mm3,mm4
	nop2					; filler (mov esp,esp)
	psraw	mm1, 15	; mm1 = sign mask of src (0 or -1 per word)
	psraw	mm4, 15
	psllw	mm0, 4	; level << 4
	psllw	mm3, 4
	paddw	mm0, [intra_matrix1 + 8*esi+112]	; add the per-coefficient term from intra_matrix1
	paddw	mm3, [intra_matrix1 + 8*esi+120]
	movq	mm5, [intra_matrix_fixl + 8*esi+112]
	movq	mm7, [intra_matrix_fixl + 8*esi+120]
	pmulhuw	mm5, mm0	; mm5 = high word of value * fixl: first approximation of value / matrix
	pmulhuw	mm7, mm3
	mov	esp, esp	; filler, no effect
	movq	mm2, [intra_matrix + 8*esi+112]
	movq	mm6, [intra_matrix + 8*esi+120]
	pmullw	mm2, mm5	; approximation * matrix element
	pmullw	mm6, mm7
	psubw	mm0, mm2	; mm0 = mismatch = value - approximation * matrix
	psubw	mm3, mm6
	nop4			; filler (lea esi,[esi+0])
	movq	mm2, [quantd + ecx * 8 - 8]	; mm2 = quantd[quant]
	movq	mm6, [mmx_divs + ecx * 8 - 8] 	; mm6 = (1 << 15) / quant + 1
	paddw	mm5, mm2	; approximation + quantd
	paddw	mm7, mm2
	mov	esp, esp	; filler, no effect
	pmulhuw	mm0, [intra_matrix_fix + 8*esi+112]	; correction derived from the mismatch
	pmulhuw	mm3, [intra_matrix_fix + 8*esi+120]
	paddw	mm5, mm0	; corrected quotient + quantd
	paddw	mm7, mm3
	movq	mm0, [edi]	; clear mm0/mm3 for the next iteration
	movq	mm3, [edi]
	pmulhuw	mm5, mm6		; mm5 = high word of mm5 * mmx_divs[quant], i.e. divide by 2*quant
	pmulhuw	mm7, mm6		;
	pxor	mm5, mm1		; restore the sign: xor with the mask ...
	pxor	mm7, mm4		;
	psubw	mm5, mm1		; ... then subtract the mask (two's-complement negate where src < 0)
	psubw	mm7, mm4		;
	movq	[edx + 8*esi+112], mm5	; store eight quantized words
	movq	[edx + 8*esi +120], mm7
	add	esi, byte 2
	jng	near .loop		; loop while esi <= 0; falls through to .done

.done
; coeff[0] = data[0] / dcscalar, computed without a division:
; half of dcscalar is added with the sign of data[0], the result is multiplied
; by int_div[dcscalar] = (1 << 17) / dcscalar + 1 and shifted right by 17;
; a negative result is then incremented by one.
; Stack here: three saved registers, so arguments sit 12 bytes higher than
; the return address slot ([esp+12+4] = coeff, [esp+12+16] = dcscalar).
	mov	esi, [esp + 12 + 16]	; esi = dcscalar (table index below)
  	movsx	ecx, word [eax]		; ecx = data[0], sign-extended
 	mov	edi, ecx
 	mov	edx, [esp + 12 + 16]	; edx = dcscalar
 	shr	edx, 1			; edx = dcscalar / 2
 	sar	edi, 31			; edi = 0 or -1 according to the sign of data[0]
 	xor	edx, edi		; edx = dcscalar/2, or -(dcscalar/2) - 1 when data[0] < 0
 	sub	ecx, edi		; ecx = data[0], or data[0] + 1 when negative
 	add	ecx, edx		; ecx = data[0] +/- dcscalar/2
  	mov	edx, [dword esp + 12 + 4]	; edx = coeff again (it was used as scratch above)
	mov	esi, [int_div+4*esi]	; esi = int_div[dcscalar]
	imul	ecx, esi
	sar	ecx, 17			; sets SF for the cmovs below
	lea	ebx, [byte ecx + 1]	; ebx = result + 1 (lea leaves the flags alone)
	cmovs	ecx, ebx		; negative result: use result + 1
;	idiv	cx			; ecx = edi:ecx / dcscalar
		
	mov	ebx, [esp]		; restore the saved registers without pop
	mov	edi, [esp+4]
	mov	esi, [esp+8]
	add	esp, byte 12
	mov	[edx], cx		; coeff[0] = low word of ecx

	ret				

align ALIGN
; --- quant == 1 ------------------------------------------------------------
; Same steps as .loop; the division by 2*quant is a shift right by 1.
.q1loop
	movq	mm1, [eax + 8*esi+112]			; mm1 = four source words
	psubw	mm0, mm1				; mm0 = -src
	movq	mm4, [eax + 8*esi+120]			; mm4 = the next four source words
	psubw	mm3, mm4				; mm3 = -src'
	pmaxsw	mm0, mm1				; |src|
	pmaxsw	mm3, mm4
	nop2
	psraw	mm1, 15					; sign mask of src
	psraw	mm4, 15 		
	psllw	mm0, 4					; level << 4
	psllw	mm3, 4			
	paddw	mm0, [intra_matrix1 + 8*esi+112] 	; mm0 is the value to be divided
	paddw	mm3, [intra_matrix1 + 8*esi+120] 	; intra_matrix1 contains the fix for division by 1
	movq	mm5, [intra_matrix_fixl + 8*esi+112]	; reciprocal that rounds down
	movq	mm7, [intra_matrix_fixl + 8*esi+120]
	pmulhuw	mm5, mm0
	pmulhuw	mm7, mm3  				; mm5/mm7: first approximation of the division
	mov	esp, esp
	movq	mm2, [intra_matrix + 8*esi+112]
	movq	mm6, [intra_matrix + 8*esi+120]		; matrix elements
	pmullw	mm2, mm5 				; test value <= original
	pmullw	mm6, mm7 
	psubw	mm0, mm2 				; mismatch
	psubw	mm3, mm6
	nop4
	movq	mm2, [quantd + ecx * 8 - 8]		; quantd[1]
	paddw	mm5, mm2 				; first approximation with quantd
	paddw	mm7, mm2
	mov	esp, esp
	pmulhuw	mm0, [intra_matrix_fix + 8*esi+112]	; correction
	pmulhuw	mm3, [intra_matrix_fix + 8*esi+120]
	paddw	mm5, mm0				; final result with quantd
	paddw	mm7, mm3
	movq	mm0, [edi]				; clear mm0/mm3 for the next iteration
	movq	mm3, [edi]
	mov	esp, esp
	psrlw	mm5, 1			;  (level + quantd) /2  (quant = 1)
	psrlw	mm7, 1		
	pxor	mm5, mm1		; restore the sign: xor with the mask ...
	pxor	mm7, mm4		;
	psubw	mm5, mm1		; ... then subtract the mask
	psubw	mm7, mm4		;
	movq	[edx + 8*esi+112], mm5
	movq	[edx + 8*esi +120], mm7
	add	esi, byte 2
	jng	near .q1loop
	jmp	near .done

align 8
; --- quant > 19 ------------------------------------------------------------
; Same steps as .loop; the division uses mmx_div ((1 << 16) / quant + 1)
; followed by a shift right by 1.
.lloop
	movq	mm1, [eax + 8*esi+112]		; mm1 = four source words
	psubw	mm0, mm1 ; mm0 = -src
	movq	mm4, [eax + 8*esi+120]	; mm4 = the next four source words
	psubw	mm3, mm4 ; mm3 = -src'
	pmaxsw	mm0, mm1 ;|src|
	pmaxsw	mm3, mm4
	nop2
	psraw	mm1, 15 ;sign mask of src
	psraw	mm4, 15 		
	psllw	mm0, 4			; level << 4
	psllw	mm3, 4			;		
	paddw	mm0, [intra_matrix1 + 8*esi+112] ; mm0 is the value to be divided; intra_matrix1 contains the fix for division by 1
	paddw	mm3, [intra_matrix1 + 8*esi+120]
	movq	mm5, [intra_matrix_fixl + 8*esi+112] 
	movq	mm7, [intra_matrix_fixl + 8*esi+120]
	pmulhuw	mm5, mm0
	pmulhuw	mm7, mm3  ; mm5/mm7: first approximation of the division
	mov	esp, esp
	movq	mm2, [intra_matrix + 8*esi+112]
	movq	mm6, [intra_matrix + 8*esi+120]
	pmullw	mm2, mm5 ;test value <= original
	pmullw	mm6, mm7 
	psubw	mm0, mm2 ;mismatch
	psubw	mm3, mm6
	nop4
	movq	mm2, [quantd + ecx * 8 - 8]	; quantd[quant]
	movq	mm6, [mmx_div + ecx * 8 - 8] ; (1 << 16) / quant + 1; this path is only taken for quant > 19
	paddw	mm5, mm2 ;first approximation with quantd
	paddw	mm7, mm2
	mov	esp, esp
	pmulhuw	mm0, [intra_matrix_fix + 8*esi+112] ;correction
	pmulhuw	mm3, [intra_matrix_fix + 8*esi+120]
	paddw	mm5, mm0 ;final result with quantd
	paddw	mm7, mm3
	movq	mm0, [edi]		; clear mm0/mm3 for the next iteration
	movq	mm3, [edi]
	mov	esp, esp
	pmulhuw	mm5, mm6		; high word of mm5 * mmx_div[quant]
	pmulhuw	mm7, mm6		;  (level + quantd) / quant
	psrlw	  mm5, 1			; (level + quantd) / (2*quant)
	psrlw	mm7, 1		
	pxor	mm5, mm1		; restore the sign: xor with the mask ...
	pxor	mm7, mm4		;
	psubw	mm5, mm1		; ... then subtract the mask
	psubw	mm7, mm4		;
	movq	[edx + 8*esi+112], mm5
	movq	[edx + 8*esi +120], mm7
	add	esi,byte 2
	jng	near .lloop 
	jmp	near .done

;===========================================================================
;
; uint32_t quant4_inter_xmm(int16_t * coeff,
;					const int16_t const * data,
;					const uint32_t quant);
;
;===========================================================================

align ALIGN
cglobal quant4_inter_xmm
;---------------------------------------------------------------------------
; Quantizes the 64 words at data into coeff using the inter matrix tables
; and returns in eax the sum of the quantized magnitudes (the values are
; accumulated before their sign is restored).
;
; Arguments (offsets relative to esp at entry):
;   [esp+4] coeff (written), [esp+8] data (read), [esp+12] quant.
; Register roles inside the loops:
;   eax = data, edx = coeff, ecx = quant
;   esi = loop counter, -14..0 in steps of 2 (eight iterations, 16 bytes each)
;   edi = address of mmzero, used to clear mm0/mm3 for the next iteration
;   ebx = 8-byte aligned, 16-byte scratch area inside the 24 bytes reserved
;         on the stack:
;           [ebx]   = magnitudes of the second quad of the previous iteration
;           [ebx+8] = running sum (four word lanes)
; Loop variants are selected exactly as in quant4_intra_xmm: quant == 1 ->
; .q1loop, quant > 19 -> .lloop, otherwise .loop. Unlike the intra routine no
; quantd term is added.
; In every variant esi is incremented in the middle of the body; the stores
; at the end therefore use an extra -16 displacement, and the closing jng
; still sees the flags of that add because the instructions in between are
; MMX operations and moves.
; esi, edi and ebx are saved on entry and restored in .done. No emms is
; issued here.
;---------------------------------------------------------------------------
quant4_inter_xmm
	mov	eax, [esp  + 8]		; eax = data
	mov	ecx, [esp  + 12]	; ecx = quant
	mov	edx, [esp  + 4]		; edx = coeff
	push	esi
	push	edi
	push	ebx
	nop
	mov edi,mmzero			; edi -> qword of zeros
	mov esi,-14			; loop counter, see header
	mov ebx,esp			; value is overwritten by the lea below before any use
	sub esp,byte 24			; reserve scratch space
	lea ebx,[esp+8]
	and ebx,byte -8 ;align 8 (ebx..ebx+15 stays inside the reserved 24 bytes)
	pxor mm0,mm0
	pxor mm3,mm3
	movq [byte ebx],mm0		; pending second quad = 0
	db 0Fh, 7Fh, 44h, 23h, 8 ;movq [ebx+8],mm0  (running sum = 0; hand-encoded with a SIB byte)
	cmp	ecx,byte 1
	je	near .q1loop		; quant == 1
	cmp	ecx,byte 19
	jg near .lloop			; quant > 19
	nop
		 
align ALIGN
; --- quant in 2..19 --------------------------------------------------------
.loop
	movq	mm1, [eax + 8*esi+112]		; mm1 = four source words
	psubw 	mm0,mm1 ; mm0 = -src
	movq	mm4, [eax + 8*esi + 120]	; mm4 = the next four source words
	psubw	mm3,mm4 ; mm3 = -src'
	pmaxsw  mm0,mm1 ;|src|
	pmaxsw  mm3,mm4
	nop2
	psraw   mm1,15 ;sign mask of src
	psraw   mm4,15 		
	psllw   mm0, 4			; level << 4
	psllw   mm3, 4			;		
	paddw   mm0, [inter_matrix1 + 8*esi+112] 	; add the per-coefficient term from inter_matrix1
	paddw   mm3, [inter_matrix1 + 8*esi+120]
	movq    mm5,[inter_matrix_fixl + 8*esi+112] 
	movq    mm7,[inter_matrix_fixl + 8*esi+120]
	pmulhuw mm5,mm0			; first approximation of value / matrix
	pmulhuw mm7,mm3  
	mov esp,esp			; filler, no effect
	movq   mm2,[inter_matrix + 8*esi+112]
	movq   mm6,[inter_matrix + 8*esi+120]
	pmullw mm2,mm5 			; approximation * matrix element
	pmullw mm6,mm7 
	psubw mm0,mm2 			; mismatch
	psubw mm3,mm6
	movq mm2,[byte ebx]		; mm2 = second-quad magnitudes of the previous iteration
	movq mm6,[mmx_divs + ecx * 8 - 8] 	; (1 << 15) / quant + 1
	pmulhuw mm0,[inter_matrix_fix + 8*esi+112] 	; correction derived from the mismatch
	pmulhuw mm3,[inter_matrix_fix + 8*esi+120]
	paddw mm2,[ebx+8]   ; mm2 = running sum + pending quad
	paddw mm5,mm0			; corrected quotient
	paddw mm7,mm3
	movq mm0,[edi]			; clear mm0/mm3 for the next iteration
	movq mm3,[edi]		
	pmulhuw	mm5, mm6		; divide by 2*quant via the reciprocal
	pmulhuw	mm7, mm6		;
	add esi,byte 2			; sets the flags tested by the jng at the end
	paddw 	mm2,mm5 ;sum += first-quad magnitudes
	movq 	[ebx],mm7 ;keep second-quad magnitudes for the next iteration / .done
	pxor	mm5, mm1		; restore the sign: xor with the mask ...
	pxor	mm7, mm4		;
	psubw	mm5, mm1		; ... then subtract the mask
	psubw	mm7, mm4		;
	db 0Fh, 7Fh, 54h, 23h, 08 ;movq 	[ebx+8],mm2 ;store sum
	movq	[edx + 8*esi+112-16], mm5	; -16 compensates for the early add esi
	movq	[edx + 8*esi +120-16], mm7
	jng 	near .loop 

.done
; Finish the sum of magnitudes and return it in eax.
	paddw mm2,[ebx]			; add the still-pending second quad of the last iteration
	mov ebx,[esp+24]		; restore saved registers (24 bytes of scratch sit below them)
	mov edi,[esp+4+24]
	mov esi,[esp+8+24]
	add esp,byte 12+24
	pmaddwd mm2, [mmx_one]		; word lanes -> two dword partial sums
	punpckldq mm0,mm2 ;mm0 (zero) low kept, low dword of mm2 moved to the high half
	paddd mm0,mm2			; high dword of mm0 = low + high partial sum
	punpckhdq mm0,mm0 ;get result to low
	movd	eax, mm0		; return value

	ret

align ALIGN
; --- quant == 1 ------------------------------------------------------------
; Same steps as .loop; the division by 2*quant is a shift right by 1.
.q1loop
	movq	mm1, [eax + 8*esi+112]		; mm1 = four source words
	psubw 	mm0,mm1 ; mm0 = -src
	movq	mm4, [eax + 8*esi+120]	; mm4 = the next four source words
	psubw	mm3,mm4 ; mm3 = -src'
	pmaxsw  mm0,mm1 ;|src|
	pmaxsw  mm3,mm4
	nop2
	psraw   mm1,15 ;sign mask of src
	psraw   mm4,15 		
	psllw   mm0, 4								; level << 4
	psllw   mm3, 4			
	paddw   mm0, [inter_matrix1 + 8*esi+112] 	; mm0 is the value to be divided
	paddw   mm3, [inter_matrix1 + 8*esi+120] 	; inter_matrix1 contains the fix for division by 1
	movq    mm5,[inter_matrix_fixl + 8*esi+112] ; reciprocal that rounds down
	movq    mm7,[inter_matrix_fixl + 8*esi+120]
	pmulhuw mm5,mm0
	pmulhuw mm7,mm3  							; mm5/mm7: first approximation of the division
	mov esp,esp
	movq   mm2,[inter_matrix + 8*esi+112]
	movq   mm6,[inter_matrix + 8*esi+120]		; matrix elements
	pmullw mm2,mm5 								;test value <= original
	pmullw mm6,mm7 
	psubw mm0,mm2 								;mismatch
	psubw mm3,mm6
	movq mm2,[byte ebx]			; pending second quad of the previous iteration
	pmulhuw mm0,[inter_matrix_fix + 8*esi+112]  ;correction
	pmulhuw mm3,[inter_matrix_fix + 8*esi+120]
	paddw mm2,[ebx+8]   ; running sum + pending quad
	paddw mm5,mm0 								;final result
	paddw mm7,mm3
	movq mm0,[edi]				; clear mm0/mm3 for the next iteration
	movq mm3,[edi]
	psrlw   mm5, 1			;  (level ) /2  (quant = 1)
	psrlw   mm7, 1		
	add esi,byte 2			; sets the flags tested by the jng at the end
	paddw 	mm2,mm5 ;sum += first-quad magnitudes
	movq 	[ebx],mm7 ;keep second-quad magnitudes
	pxor	mm5, mm1		; restore the sign: xor with the mask ...
	pxor	mm7, mm4		;
	psubw	mm5, mm1		; ... then subtract the mask
	psubw	mm7, mm4		;
	movq 	[ebx+8],mm2 ;store sum		
	movq	[edx + 8*esi+112-16], mm5	; -16 compensates for the early add esi
	movq	[edx + 8*esi +120-16], mm7
	jng	near .q1loop
	jmp	near .done

align 8
; --- quant > 19 ------------------------------------------------------------
; Same steps as .loop; the division uses mmx_div ((1 << 16) / quant + 1)
; followed by a shift right by 1.
.lloop
	movq	mm1, [eax + 8*esi+112]		; mm1 = four source words
	psubw 	mm0,mm1 ; mm0 = -src
	movq	mm4, [eax + 8*esi+120]	; mm4 = the next four source words
	psubw	mm3,mm4 ; mm3 = -src'
	pmaxsw  mm0,mm1 ;|src|
	pmaxsw  mm3,mm4
	nop2
	psraw   mm1,15 ;sign mask of src
	psraw   mm4,15 		
	psllw   mm0, 4			; level << 4
	psllw   mm3, 4			;		
	paddw   mm0, [inter_matrix1 + 8*esi+112] ; mm0 is the value to be divided; inter_matrix1 contains the fix for division by 1
	paddw   mm3, [inter_matrix1 + 8*esi+120]
	movq    mm5,[inter_matrix_fixl + 8*esi+112] 
	movq    mm7,[inter_matrix_fixl + 8*esi+120]
	pmulhuw mm5,mm0
	pmulhuw mm7,mm3  ; mm5/mm7: first approximation of the division
	mov esp,esp
	movq   mm2,[inter_matrix + 8*esi+112]
	movq   mm6,[inter_matrix + 8*esi+120]
	pmullw mm2,mm5 ;test value <= original
	pmullw mm6,mm7 
	psubw mm0,mm2 ;mismatch
	psubw mm3,mm6
	movq mm2,[byte ebx]		; pending second quad of the previous iteration
	movq mm6,[mmx_div + ecx * 8 - 8] ; (1 << 16) / quant + 1; this path is only taken for quant > 19
	pmulhuw mm0,[inter_matrix_fix + 8*esi+112] ;correction
	pmulhuw mm3,[inter_matrix_fix + 8*esi+120]
	paddw mm2,[ebx+8]   ; running sum + pending quad
	paddw mm5,mm0 ;final result
	paddw mm7,mm3
	movq mm0,[edi]			; clear mm0/mm3 for the next iteration
	movq mm3,[edi]
	pmulhuw	mm5, mm6		; high word of mm5 * mmx_div[quant]
	pmulhuw	mm7, mm6		;  (level ) / quant
	add esi,byte 2			; sets the flags tested by the jng at the end
	psrlw   mm5, 1			; (level ) / (2*quant)
	paddw 	mm2,mm5 ;sum += first-quad magnitudes
	psrlw   mm7, 1		
	movq 	[ebx],mm7 ;keep second-quad magnitudes
	pxor	mm5, mm1		; restore the sign: xor with the mask ...
	pxor	mm7, mm4		;
	psubw	mm5, mm1		; ... then subtract the mask
	psubw	mm7, mm4		;
	db 0Fh, 7Fh, 54h, 23h, 08 ;movq 	[ebx+8],mm2 ;store sum
	movq	[edx + 8*esi+112-16], mm5	; -16 compensates for the early add esi
	movq	[edx + 8*esi +120-16], mm7
	jng 	near .lloop 
	jmp	near .done


;===========================================================================
;
; void dequant4_intra_3dne(int16_t *data,
;                    const int16_t const *coeff,
;                    const uint32_t quant,
;                    const uint32_t dcscalar);
;
;===========================================================================

  ;   Note: in order to saturate 'easily', we pre-shift the quantizer
  ; left by 2 bits (a factor of 4). Then, the high word of
  ; (coeff[]*matrix[i]*quant) is used to build a saturating mask. It is
  ; non-zero only when an overflow occurred.
  ; We thus avoid packing/unpacking toward double-word.
  ; Moreover, we perform the mult (matrix[i]*quant) first, instead of, e.g.,
  ; (coeff[i]*matrix[i]). This is less prone to overflow if coeff[] are not
  ; checked. Input ranges are: coeff in [-127,127], inter_matrix in [1..255],
  ; and quant in [1..31].
  ;
  ;********************************************************************
; DEQUANT4INTRAMMX <row>
; Dequantizes the eight coefficients of one 16-byte row (row index 0..7 is
; the macro argument) with the intra matrix and clamps the result.
; Expected on entry:
;   ecx = coeff (source), edx = data (destination)
;   eax -> 16 bytes of scratch: [eax] = qword of zeros,
;          [eax+8] = four words of the pre-shifted quantizer
;   mm0 = mm3 = 0
;   for row 0 only: mm2 and mm7 already hold the pre-shifted quantizer
;   (for the other rows the macro reloads them from [eax+8])
; On exit mm0 and mm3 are zero again; mm1, mm2, mm4-mm7 are overwritten.
; No general-purpose register and no flag is modified.
%macro DEQUANT4INTRAMMX 1
	movq mm1, [byte ecx+ 16 * %1]   ; mm1 = c  = first four coefficients of the row
	movq mm4, [ecx+ 16 * %1 +8]; mm4 = c' = last four coefficients of the row
	psubw mm0,mm1			; mm0 = -c
	psubw mm3,mm4			; mm3 = -c'
	pmaxsw mm0,mm1			; mm0 = |c|
	pmaxsw mm3,mm4			; mm3 = |c'|
	psraw mm1,15			; mm1/mm4 = sign masks (0 or -1 per word)
	psraw mm4,15
%if %1  
	movq mm2,[eax+8]   ;preshifted quant
	movq mm7,[eax+8]  
%endif
	pmullw mm2,  [intra_matrix + 16 * %1 ]  ; matrix[i]*quant
	pmullw mm7,  [intra_matrix + 16 * %1 +8]  ; matrix[i+1]*quant
	movq mm5,mm0 			; keep |c| for the low-word product
	movq mm6,mm3
	pmulhw mm0, mm2   ; high of coeff*(matrix*quant)
	pmulhw mm3, mm7   ; high of coeff*(matrix*quant)
	pmullw mm2, mm5   ; low  of coeff*(matrix*quant)
	pmullw mm7, mm6   ; low  of coeff*(matrix*quant)
	pcmpgtw mm0, [eax]		; overflow mask: -1 where the high word is > 0
	pcmpgtw mm3, [eax]
	paddusw mm2, mm0		; overflowed lanes saturate to 0FFFFh
	paddusw mm7, mm3
	psrlw mm2, 5			; >> 5; an overflowed lane becomes 2047
	psrlw mm7, 5 
	pxor mm2, mm1  ; start negating back
	pxor mm7, mm4  ; start negating back
	psubusw mm1, mm0		; drop the +1 of the negation on overflowed lanes,
	psubusw mm4, mm3		; so a negative overflow ends up as -2048
	movq mm0,[eax] ;zero
	movq mm3,[eax] ;zero
	psubw mm2, mm1 ; finish negating back  
	psubw mm7, mm4 ; finish negating back   
	movq [byte edx + 16 * %1], mm2   ; data[i]
	movq [edx + 16 * %1  +8], mm7   ; data[i+1]
%endmacro

align 16
cglobal dequant4_intra_3dne
;---------------------------------------------------------------------------
; Dequantizes the 64 words at coeff into data with the intra matrix (eight
; DEQUANT4INTRAMMX expansions), then overwrites data[0] with
; coeff[0] * dcscalar clamped to [-2048, 2047].
;
; Arguments (offsets relative to esp at entry):
;   [esp+4] data (written), [esp+8] coeff (read), [esp+12] quant,
;   [esp+16] dcscalar.
; Register roles:
;   ecx = coeff, edx = data
;   eax = quant at first, then pointer to the 16-byte aligned-to-8 scratch
;         area used by the macro ([eax] = zeros, [eax+8] = quant << 2)
;   ebx = DC value being computed, esi = clamp limit
; The scalar DC instructions are interleaved between the macro expansions;
; the macro only executes MMX instructions and moves, so flags set by a cmp
; before an expansion are still valid for the cmov after it.
; ebx and esi are saved on entry and restored before returning. No emms is
; issued here.
;---------------------------------------------------------------------------
dequant4_intra_3dne:
	mov eax, [esp+12] ; quant
	mov ecx, [esp+8]  ; coeff
	movq mm7, [mmx_mul_quant  + eax*8 - 8]	; four words of quant
	psllw mm7, 2   ; << 2. See the note above the macro.
	mov edx, [esp+4]  ; data
	push ebx	
	movsx ebx,word [ecx]		; ebx = coeff[0], sign-extended
	pxor mm0, mm0   		; macro precondition: mm0 = mm3 = 0
	pxor mm3, mm3   
	push esi
	lea eax,[esp-28]		; together with the sub/and below: eax ends up
	sub esp,byte 32			; 8-byte aligned inside the 32 reserved bytes
	and eax,byte -8 ;points to qword aligned space on stack (assumes esp is 4-byte aligned)
	movq [eax],mm0			; [eax]   = zeros
	movq [eax+8],mm7		; [eax+8] = pre-shifted quant
	imul ebx,[esp+16+8+32]    ; ebx = coeff[0] * dcscalar (16 = 4th argument, 8 = two pushes, 32 = scratch)
	movq mm2,mm7			; macro precondition for row 0
  

align 4

	DEQUANT4INTRAMMX 0

	mov esi,-2048			; lower clamp limit
	nop
	cmp ebx,esi			; flags consumed by the cmovl after the next expansion

	DEQUANT4INTRAMMX 1

	cmovl ebx, esi			; DC = max(DC, -2048)
	neg esi
	sub esi, byte 1 ;2047

	DEQUANT4INTRAMMX 2

	cmp ebx, esi
	cmovg ebx, esi			; DC = min(DC, 2047)
	lea ebp, [byte ebp]		; filler, leaves ebp unchanged

	DEQUANT4INTRAMMX 3

	mov esi, [esp+32]		; restore esi (pushed last)
	mov [byte edx], bx		; data[0] = clamped DC (row 0 was already stored)
	mov ebx, [esp+32+4]		; restore ebx
	
	DEQUANT4INTRAMMX 4
	DEQUANT4INTRAMMX 5
	DEQUANT4INTRAMMX 6
	DEQUANT4INTRAMMX 7

	add esp, byte 32+8		; release scratch and the two saved registers

 ret

;===========================================================================
;
; void dequant4_inter_3dne(int16_t * data,
;                    const int16_t * const coeff,
;                    const uint32_t quant);
;
;===========================================================================

    ; Note:  We use (2*c + sgn(c) - sgn(-c)) as multiplier
    ; so we handle the 3 cases: c<0, c==0, and c>0 in one shot.
    ; sgn(x) is the result of 'pcmpgtw 0,x':  0 if x>=0, -1 if x<0.
    ; It's mixed with the extraction of the absolute value.

align 16
cglobal dequant4_inter_3dne
;---------------------------------------------------------------------------
; Dequantizes the 64 words at coeff into data with the inter matrix and then
; applies mismatch control to data[63].
;
; Arguments (offsets relative to esp at entry):
;   [esp+4] data (written), [esp+8] coeff (read), [esp+12] quant.
; Register roles:
;   ecx = coeff, edx = data
;   eax = quant at first, then loop counter -14..0 in steps of 2 (eight
;         iterations of 16 bytes; accesses use 8*eax + 7*16)
;   esi = address of mmzero
;   mm7 = four words of 2*quant, mm6 = xor of every output quad
;   mm1/mm3 = sign masks of the two quads being processed (must be zero when
;             an iteration starts)
; eax is incremented in the middle of the loop body; the stores therefore
; carry an extra -2*8 displacement and the closing jng still sees the flags
; of that add, since only MMX instructions and moves follow it.
; esi is saved on entry and restored before returning. No emms is issued
; here.
;---------------------------------------------------------------------------
dequant4_inter_3dne:
	mov    edx, [esp+ 4]        ; data
	mov    ecx, [esp+ 8]        ; coeff
	mov    eax, [esp+12]        ; quant
	movq mm7, [mmx_mul_quant  + eax*8 - 8]	; four words of quant
	mov eax, -14			; loop counter, see header
	paddw mm7, mm7    ; << 1
	pxor mm6, mm6 ; mismatch sum
	push esi
	mov esi,mmzero			; esi -> qword of zeros
	pxor mm1,mm1
	pxor mm3,mm3
	nop
	nop4				; filler (lea esi,[esi+0])

align 16
.loop
	movq mm0, [ecx+8*eax + 7*16   ]   ; mm0 = c  = four coefficients
	pcmpgtw mm1, mm0  ; mm1 = sgn(c)    (preserved)
	movq mm2, [ecx+8*eax + 7*16 +8]   ; mm2 = c' = the next four coefficients
	pcmpgtw mm3, mm2  ; mm3 = sgn(c')   (preserved)
	paddsw mm0, mm1   ; c += sgn(c)
	paddsw mm2, mm3   ; c += sgn(c')
	paddw mm0, mm0    ; c *= 2
	paddw mm2, mm2    ; c'*= 2

	movq mm4, [esi]			; mm4 = mm5 = 0
	movq mm5, [esi]
	psubw mm4, mm0    ; -c
	psubw mm5, mm2    ; -c'

	psraw mm4, 16     ; mm4 = sgn(-c)
	psraw mm5, 16     ; mm5 = sgn(-c')
	psubsw mm0, mm4   ; c  -= sgn(-c)
	psubsw mm2, mm5   ; c' -= sgn(-c')
	pxor mm0, mm1     ; finish changing sign if needed
	pxor mm2, mm3     ; finish changing sign if needed
					; now mm0/mm2 = 2*|c| + 1 for c != 0, and 0 for c == 0

	; we're short on register, here. Poor pairing...

	movq mm4, mm7     ; (matrix*quant)
	nop
	pmullw mm4,  [inter_matrix + 8*eax + 7*16]
	movq mm5, mm4
	pmulhw mm5, mm0   ; high of c*(matrix*quant)
	pmullw mm0, mm4   ; low  of c*(matrix*quant)

	movq mm4, mm7     ; (matrix*quant)
	pmullw mm4,  [inter_matrix + 8*eax + 7*16 + 8]
	add eax,byte 2			; sets the flags tested by the jng at the end

	pcmpgtw mm5, [esi]		; overflow mask: -1 where the high word is > 0
	paddusw mm0, mm5		; overflowed lanes saturate to 0FFFFh
	psrlw mm0, 5			; >> 5; an overflowed lane becomes 2047
	pxor mm0, mm1     ; start restoring sign
	psubusw mm1, mm5		; drop the +1 of the negation on overflowed lanes

	movq mm5, mm4
	pmulhw mm5, mm2   ; high of c*(matrix*quant)
	pmullw mm2, mm4   ; low  of c*(matrix*quant)
	psubw mm0, mm1    ; finish restoring sign

	pcmpgtw mm5, [esi]		; same saturation sequence for the second quad
	paddusw mm2, mm5
	psrlw mm2, 5
	pxor mm2, mm3    ; start restoring sign
	psubusw mm3, mm5
	psubw mm2, mm3   ; finish restoring sign
	movq mm1, [esi]			; clear mm1/mm3 for the next iteration
	movq mm3, [byte esi]
	pxor mm6, mm0     ; mismatch control
	movq [edx + 8*eax + 7*16 -2*8   ], mm0   ; data[i]
	pxor mm6, mm2     ; mismatch control
	movq [edx + 8*eax + 7*16 -2*8 +8], mm2   ; data[i+1]

	jng  .loop			; loop while eax <= 0 (flags from the add above)
	nop

	; mismatch control: xor the four word lanes of mm6 together; if bit 0 of
	; the result is clear, toggle bit 0 of data[63]

	pshufw mm0,mm6,01010101b 	; broadcast word 1
	pshufw mm1,mm6,10101010b 	; broadcast word 2
	pshufw mm2,mm6,11111111b 	; broadcast word 3
	pxor mm6, mm0
	pxor mm1, mm2
	pxor mm6, mm1			; low word of mm6 = xor of all four lanes
	movd eax, mm6
	and eax,byte 1			; keep bit 0
	xor eax,byte 1			; eax = 1 when that bit was 0
	mov esi,[esp]			; restore esi
	add esp,byte 4
	xor word [edx + 2*63], ax	; conditionally toggle bit 0 of data[63]

	ret

; No admin address has been configured
; ViewVC Help
; Powered by ViewVC 1.0.4