;/*
; * Simple IDCT MMX
; *
; * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
; *
; * This library is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2 of the License, or (at your option) any later version.
; *
; * This library is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with this library; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
; *
; * Ported to nasm by Peter Ross <pross@xvid.org>
; */

bits 32


;===========================================================================
; data
;===========================================================================

%ifdef FORMAT_COFF
section .data
align 8
%else
section .data data align=8
%endif

wm1010	dw		0, 0xffff, 0, 0xffff	; mask: keeps words 1 and 3, i.e. everything but the two DC terms
d40000	dd		0x40000, 0				; rounding bias for the DC-only fast path (.skip1)


%define ROW_SHIFT 11
%define COL_SHIFT 20
%define C0 23170	;cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 23170.475006
%define C1 22725	;cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 22725.260826
%define C2 21407	;cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 21406.727617
%define C3 19266	;cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 19265.545870
%define C4 16383	;cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 = 16384.000000
%define C5 12873	;cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 12872.826198
%define C6 8867		;cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 8866.956905
%define C7 4520		;cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 4520.335430
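
; The constants above can be regenerated with a small C program. This is an
; illustrative sketch, not part of the build; M_PI is assumed to be exposed
; by <math.h> (some compilers need _USE_MATH_DEFINES or similar):
;
;   #include <math.h>
;   #include <stdio.h>
;
;   int main(void)
;   {
;       for (int i = 0; i < 8; i++)   /* C4 is then lowered by one to 16383 */
;           printf("C%d = %.0f\n", i,
;                  floor(cos(i * M_PI / 16.0) * sqrt(2.0) * (1 << 14) + 0.5));
;       return 0;
;   }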

coeffs
	dw	1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0		; 0
	dw	1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0		; 8

	dw	C4,  C4,  C4,  C4		; 16
	dw	C4, -C4,  C4, -C4		; 24
 
	dw	C2,  C6,  C2,  C6		; 32
	dw	C6, -C2,  C6, -C2		; 40
 
	dw	C1,  C3,  C1,  C3		; 48
	dw	C5,  C7,  C5,  C7		; 56
 
	dw	C3, -C7,  C3, -C7		; 64
	dw	-C1, -C5, -C1, -C5		; 72
 
	dw	C5, -C1,  C5, -C1		; 80
	dw	C7,  C3,  C7,  C3		; 88
 
	dw	C7, -C5,  C7, -C5		; 96
	dw	C3, -C1,  C3, -C1		; 104
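
; Each quadword above packs two (even, odd) coefficient pairs, so a single
; pmaddwd forms two 32-bit dot products at once. A rough C sketch of what
; `pmaddwd mmX, [coeffs+N]` computes, with int16_t a[4] (register) and
; b[4] (memory) as illustrative names:
;
;   int32_t lo = (int32_t)a[0]*b[0] + (int32_t)a[1]*b[1];   /* low dword  */
;   int32_t hi = (int32_t)a[2]*b[2] + (int32_t)a[3]*b[3];   /* high dword */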


;===========================================================================
; text
;===========================================================================
section .text
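
;---------------------------------------------------------------------------
; Both passes implement the classic even/odd butterfly; as a worked sketch,
; reconstructed from the inline comments of the macros below:
;
;   A0 = C4*x0 + C2*x2 + C4*x4 + C6*x6    B0 = C1*x1 + C3*x3 + C5*x5 + C7*x7
;   A1 = C4*x0 + C6*x2 - C4*x4 - C2*x6    B1 = C3*x1 - C7*x3 - C1*x5 - C5*x7
;   A2 = C4*x0 - C6*x2 - C4*x4 + C2*x6    B2 = C5*x1 - C1*x3 + C7*x5 + C3*x7
;   A3 = C4*x0 - C2*x2 + C4*x4 - C6*x6    B3 = C7*x1 - C5*x3 + C3*x5 - C1*x7
;
;   out[i] = (Ai + Bi) >> shift,  out[7-i] = (Ai - Bi) >> shift,  i = 0..3
;
; with shift = ROW_SHIFT in the row pass and COL_SHIFT in the column pass.
;---------------------------------------------------------------------------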

;---------------------------------------------------------------------------
; Row pass over one interleaved row pair (r*/R* in the comments below), with
; a fast path (.skip1) for the common case where at most the two DC
; coefficients are non-zero.
;---------------------------------------------------------------------------
%macro	DC_COND_IDCT	8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm3,[src5]				; R7	R5	r7	r5
	movq mm4,[wm1010]
	pand mm4,mm0				; drop the two DC words from the test
	por mm4,mm1					; OR every remaining coefficient...
	por mm4,mm2
	por mm4,mm3					; ...into one 64-bit accumulator
	packssdw mm4,mm4			; saturate dwords to words: non-zero stays non-zero
	movd eax,mm4
	or eax,eax					; eax == 0 iff only the DC terms were set
	jz near .skip1
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1
	rounder_op	mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	paddd mm4,mm5				; A0		a0
	psubd mm6,mm5				; A3		a3
	movq mm5,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm5,mm3				; C7R7+C5R5	C7r7+C5r5
	rounder_op	mm0, rounder_arg
	paddd mm1,mm0				; A1		a1
	paddd mm0,mm0
	psubd mm0,mm1				; A2		a2
	pmaddwd mm2,[coeffs+64]		; -C7R3+C3R1	-C7r3+C3r1
	paddd mm7,mm5				; B0		b0
	movq mm5,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm5,mm3				; -C5R7-C1R5	-C5r7-C1r5
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	paddd mm5,mm2				; B1		b1
	psrad mm7,shift
	psrad mm4,shift
	movq mm2,mm1				; A1		a1
	paddd mm1,mm5				; A1+B1		a1+b1
	psubd mm2,mm5				; A1-B1		a1-b1
	psrad mm1,shift
	psrad mm2,shift
	packssdw mm7,mm1			; A1+B1	a1+b1	A0+B0	a0+b0
	packssdw mm2,mm4			; A0-B0	a0-b0	A1-B1	a1-b1
	movq [dst],mm7
	movq mm1,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+80]		;-C1	C5	-C1 	C5
	movq [dst + 24],mm2
	pmaddwd	mm4,mm1				; -C1R3+C5R1	-C1r3+C5r1
	movq mm7,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm1,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1
	pmaddwd mm7,mm3				; C3R7+C7R5	C3r7+C7r5
	movq mm2,mm0				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5
	paddd mm4,mm7				; B2		b2
	paddd mm2,mm4				; A2+B2		a2+b2
	psubd mm0,mm4				; a2-B2		a2-b2
	psrad mm2,shift
	psrad mm0,shift
	movq mm4,mm6				; A3		a3
	paddd mm3,mm1				; B3		b3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm4,mm3				; a3-B3		a3-b3
	psrad mm6,shift
	packssdw mm2,mm6			; A3+B3	a3+b3	A2+B2	a2+b2
	movq [ dst + 8],mm2
	psrad mm4,shift
	packssdw mm4,mm0			; A2-B2	a2-b2	A3-B3	a3-b3
	movq [ dst + 16],mm4
	jmp short .skip2
.skip1
	pslld mm0,16				; move each row's DC word into its dword's high half
	paddd mm0,[d40000]			; rounding bias (1<<18) on the first row's dword only
	psrad mm0,13				; net effect: dc << 3, the DC-only row result
	packssdw mm0,mm0			; replicate the two row results across all four words
	movq [ dst ],mm0
	movq [ dst + 8],mm0
	movq [ dst + 16],mm0
	movq [ dst + 24],mm0
.skip2
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro



;---------------------------------------------------------------------------
; Row pass over one interleaved row pair; if all eight inputs are zero it
; branches to `bt`, a column path specialised for that zero pattern.
;---------------------------------------------------------------------------
%macro	Z_COND_IDCT	9
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
%define	bt			%9
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm3,[src5]				; R7	R5	r7	r5
	movq mm4,mm0
	por mm4,mm1					; OR all four input quadwords together
	por mm4,mm2
	por mm4,mm3
	packssdw mm4,mm4			; collapse the 64-bit result into 32 bits
	movd eax,mm4
	or eax,eax					; eax == 0 iff the whole row pair is zero
	jz near bt
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1
	rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	paddd mm4,mm5				; A0		a0
	psubd mm6,mm5				; A3		a3
	movq mm5,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm5,mm3				; C7R7+C5R5	C7r7+C5r5
	rounder_op mm0, rounder_arg
	paddd mm1,mm0				; A1		a1
	paddd mm0,mm0
	psubd mm0,mm1				; A2		a2
	pmaddwd mm2,[coeffs+64]		; -C7R3+C3R1	-C7r3+C3r1
	paddd mm7,mm5				; B0		b0
	movq mm5,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm5,mm3				; -C5R7-C1R5	-C5r7-C1r5
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	paddd mm5,mm2				; B1		b1
	psrad mm7,shift
	psrad mm4,shift
	movq mm2,mm1				; A1		a1
	paddd mm1,mm5				; A1+B1		a1+b1
	psubd mm2,mm5				; A1-B1		a1-b1
	psrad mm1,shift
	psrad mm2,shift
	packssdw mm7,mm1			; A1+B1	a1+b1	A0+B0	a0+b0
	packssdw mm2,mm4			; A0-B0	a0-b0	A1-B1	a1-b1
	movq [ dst ],mm7
	movq mm1,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+80]		; -C1	C5	-C1 	C5
	movq [ dst + 24 ],mm2
	pmaddwd mm4,mm1				; -C1R3+C5R1	-C1r3+C5r1
	movq mm7,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm1,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1
	pmaddwd mm7,mm3				; C3R7+C7R5	C3r7+C7r5
	movq mm2,mm0				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5
	paddd mm4,mm7				; B2		b2
	paddd mm2,mm4				; A2+B2		a2+b2
	psubd mm0,mm4				; a2-B2		a2-b2
	psrad mm2,shift
	psrad mm0,shift
	movq mm4,mm6				; A3		a3
	paddd mm3,mm1				; B3		b3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm4,mm3				; a3-B3		a3-b3
	psrad mm6,shift
	packssdw mm2,mm6			; A3+B3	a3+b3	A2+B2	a2+b2
	movq [ dst + 8],mm2
	psrad mm4,shift
	packssdw mm4,mm0			; A2-B2	a2-b2	A3-B3	a3-b3
	movq [dst + 16],mm4
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%undef	bt
%endmacro



; Column pass, two columns per call: the general case, all four row-pair
; results (src0/src4/src1/src5) may be non-zero.
%macro	IDCT0		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm3,[src5]				; R7	R5	r7	r5
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	; rounder_op mm0, rounder_arg
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1
	paddd mm4,mm5				; A0		a0
	psubd mm6,mm5				; A3		a3
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	paddd mm0,mm1				; A1		a1
	psubd mm5,mm1				; A2		a2
	movq mm1,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm1,mm3				; C7R7+C5R5	C7r7+C5r5
	pmaddwd mm2,[coeffs+64]		; -C7R3+C3R1	-C7r3+C3r1
	paddd mm7,mm1				; B0		b0
	movq mm1,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm1,mm3				; -C5R7-C1R5	-C5r7-C1r5
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	paddd mm1,mm2				; B1		b1
	psrad mm7,shift
	psrad mm4,shift
	movq mm2,mm0				; A1		a1
	paddd mm0,mm1				; A1+B1		a1+b1
	psubd mm2,mm1				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm2,shift
	packssdw mm7,mm7			; A0+B0	a0+b0
	movd [ dst ],mm7
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [ dst + 16],mm0
	packssdw mm2,mm2			; A1-B1	a1-b1
	movd [ dst + 96 ],mm2
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [ dst + 112],mm4
	movq mm0,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+80]		; -C1	C5	-C1 	C5
	pmaddwd mm4,mm0				; -C1R3+C5R1	-C1r3+C5r1
	movq mm7,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm0,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1
	pmaddwd mm7,mm3				; C3R7+C7R5	C3r7+C7r5
	movq mm2,mm5				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5
	paddd mm4,mm7				; B2		b2
	paddd mm2,mm4				; A2+B2		a2+b2
	psubd mm5,mm4				; a2-B2		a2-b2
	psrad mm2,shift
	psrad mm5,shift
	movq mm4,mm6				; A3		a3
	paddd mm3,mm0				; B3		b3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm4,mm3				; a3-B3		a3-b3
	psrad mm6,shift
	psrad mm4,shift
	packssdw mm2,mm2			; A2+B2	a2+b2
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [ dst + 32 ],mm2
	packssdw mm4,mm4			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [ dst + 48 ],mm6
	movd [ dst + 64 ],mm4
	movd [ dst + 80 ],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro



; Column pass, two columns per call: src1 (rows 2-3) is known to be zero.
%macro	IDCT4		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm3,[src5]				; R7	R5	r7	r5
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	; rounder_op mm0, rounder_arg
	paddd mm4,mm5				; A0		a0
	psubd mm6,mm5				; A3		a3
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	paddd mm0,mm1				; A1		a1
	psubd mm5,mm1				; A2		a2
	movq mm1,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm1,mm3				; C7R7+C5R5	C7r7+C5r5
	movq mm7,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm7,mm3				; -C5R7-C1R5	-C5r7-C1r5
	paddd mm1,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm1				; A0-B0		a0-b0
	psrad mm1,shift
	psrad mm4,shift
	movq mm2,mm0				; A1		a1
	paddd mm0,mm7				; A1+B1		a1+b1
	psubd mm2,mm7				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm2,shift
	packssdw mm1,mm1			; A0+B0	a0+b0
	movd [ dst ],mm1
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [ dst + 16 ],mm0
	packssdw mm2,mm2			; A1-B1	a1-b1
	movd [ dst + 96 ],mm2
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [ dst + 112 ],mm4
	movq mm1,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm1,mm3				; C3R7+C7R5	C3r7+C7r5
	movq mm2,mm5				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5
	paddd mm2,mm1				; A2+B2		a2+b2
	psubd mm5,mm1				; a2-B2		a2-b2
	psrad mm2,shift
	psrad mm5,shift
	movq mm1,mm6				; A3		a3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm1,mm3				; a3-B3		a3-b3
	psrad mm6,shift
	psrad mm1,shift
	packssdw mm2,mm2			; A2+B2	a2+b2
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [dst + 32],mm2
	packssdw mm1,mm1			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [dst + 48],mm6
	movd [dst + 64],mm1
	movd [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro



; Column pass, two columns per call: src1 (rows 2-3) and src4 (rows 4-5)
; are known to be zero.
%macro	IDCT6		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm3,[src5]				; R7	R5	r7	r5
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	; rounder_op mm0, rounder_arg
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	movq mm1,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm1,mm3				; C7R7+C5R5	C7r7+C5r5
	movq mm7,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm7,mm3				; -C5R7-C1R5	-C5r7-C1r5
	paddd mm1,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm1				; A0-B0		a0-b0
	psrad mm1,shift
	psrad mm4,shift
	movq mm2,mm0				; A1		a1
	paddd mm0,mm7				; A1+B1		a1+b1
	psubd mm2,mm7				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm2,shift
	packssdw mm1,mm1			; A0+B0	a0+b0
	movd [ dst ],mm1
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [ dst + 16 ],mm0
	packssdw mm2,mm2			; A1-B1	a1-b1
	movd [ dst + 96 ],mm2
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [ dst + 112 ],mm4
	movq mm1,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm1,mm3				; C3R7+C7R5	C3r7+C7r5
	movq mm2,mm5				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5
	paddd mm2,mm1				; A2+B2		a2+b2
	psubd mm5,mm1				; a2-B2		a2-b2
	psrad mm2,shift
	psrad mm5,shift
	movq mm1,mm6				; A3		a3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm1,mm3				; a3-B3		a3-b3
	psrad mm6,shift
	psrad mm1,shift
	packssdw mm2,mm2			; A2+B2	a2+b2
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [dst + 32],mm2
	packssdw mm1,mm1			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [dst + 48],mm6
	movd [dst + 64],mm1
	movd [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro




; Column pass, two columns per call: src4 (rows 4-5) is known to be zero.
%macro	IDCT2		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm3,[src5]				; R7	R5	r7	r5
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	; rounder_op mm0, rounder_arg
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	movq mm1,[coeffs+56]		; C7	C5	C7	C5
	pmaddwd mm1,mm3				; C7R7+C5R5	C7r7+C5r5
	pmaddwd mm2,[coeffs+64]		; -C7R3+C3R1	-C7r3+C3r1
	paddd mm7,mm1				; B0		b0
	movq mm1,[coeffs+72]		; -C5	-C1	-C5	-C1
	pmaddwd mm1,mm3				; -C5R7-C1R5	-C5r7-C1r5
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	paddd mm1,mm2				; B1		b1
	psrad mm7,shift
	psrad mm4,shift
	movq mm2,mm0				; A1		a1
	paddd mm0,mm1				; A1+B1		a1+b1
	psubd mm2,mm1				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm2,shift
	packssdw mm7,mm7			; A0+B0	a0+b0
	movd [dst],mm7
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [dst + 16],mm0
	packssdw mm2,mm2			; A1-B1	a1-b1
	movd [dst + 96],mm2
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [dst + 112],mm4
	movq mm0,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+80]		; -C1	C5	-C1 	C5
	pmaddwd mm4,mm0				; -C1R3+C5R1	-C1r3+C5r1
	movq mm7,[coeffs+88]		; C3	C7	C3 	C7
	pmaddwd mm0,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1
	pmaddwd mm7,mm3				; C3R7+C7R5	C3r7+C7r5
	movq mm2,mm5				; A2		a2
	pmaddwd mm3,[coeffs+104]	; -C1R7+C3R5	-C1r7+C3r5
	paddd mm4,mm7				; B2		b2
	paddd mm2,mm4				; A2+B2		a2+b2
	psubd mm5,mm4				; a2-B2		a2-b2
	psrad mm2,shift
	psrad mm5,shift
	movq mm4,mm6				; A3		a3
	paddd mm3,mm0				; B3		b3
	paddd mm6,mm3				; A3+B3		a3+b3
	psubd mm4,mm3				; a3-B3		a3-b3
	psrad mm6,shift
	psrad mm4,shift
	packssdw mm2,mm2			; A2+B2	a2+b2
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [dst + 32],mm2
	packssdw mm4,mm4			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [dst + 48],mm6
	movd [dst + 64],mm4
	movd [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro



; Column pass, two columns per call: src4 (rows 4-5) and src5 (rows 6-7)
; are known to be zero.
%macro	IDCT3		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	; rounder_op mm0, rounder_arg
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	movq mm3,[coeffs+64]
	pmaddwd mm3,mm2				; -C7R3+C3R1	-C7r3+C3r1
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	psrad mm7,shift
	psrad mm4,shift
	movq mm1,mm0				; A1		a1
	paddd mm0,mm3				; A1+B1		a1+b1
	psubd mm1,mm3				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm1,shift
	packssdw mm7,mm7			; A0+B0	a0+b0
	movd [dst],mm7
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [dst + 16],mm0
	packssdw mm1,mm1			; A1-B1	a1-b1
	movd [dst + 96],mm1
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [dst + 112],mm4
	movq mm4,[coeffs+80]		; -C1	C5	-C1 	C5
	pmaddwd mm4,mm2				; -C1R3+C5R1	-C1r3+C5r1
	pmaddwd mm2,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1
	movq mm1,mm5				; A2		a2
	paddd mm1,mm4				; A2+B2		a2+b2
	psubd mm5,mm4				; a2-B2		a2-b2
	psrad mm1,shift
	psrad mm5,shift
	movq mm4,mm6				; A3		a3
	paddd mm6,mm2				; A3+B3		a3+b3
	psubd mm4,mm2				; a3-B3		a3-b3
	psrad mm6,shift
	psrad mm4,shift
	packssdw mm1,mm1			; A2+B2	a2+b2
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [dst + 32],mm1
	packssdw mm4,mm4			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [dst + 48],mm6
	movd [dst + 64],mm4
	movd [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro



; Column pass, four columns per call (movq stores): only src0 (rows 0-1)
; and src4 (rows 4-5) are non-zero.
%macro	IDCT5		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	paddd mm4,mm5				; A0		a0
	; rounder_op mm0, rounder_arg
	psubd mm6,mm5				; A3		a3
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	paddd mm0,mm1				; A1		a1
	psubd mm5,mm1				; A2		a2
	movq mm2,[src0 + 8]			; R4	R0	r4	r0
	movq mm3,[src4 + 8]			; R6	R2	r6	r2
	movq mm1,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm1,mm2				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm2,mm7				; -C4R4+C4R0	-C4r4+C4r0
	movq mm7,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm7,mm3				; C6R6+C2R2	C6r6+C2r2
	pmaddwd mm3,[coeffs+40]		; -C2R6+C6R2	-C2r6+C6r2
	; rounder_op mm1, rounder_arg
	paddd mm7,mm1				; A0		a0
	paddd mm1,mm1				; 2C0		2c0
	; rounder_op mm2, rounder_arg
	psubd mm1,mm7				; A3		a3
	paddd mm3,mm2				; A1		a1
	paddd mm2,mm2				; 2C1		2c1
	psubd mm2,mm3				; A2		a2
	psrad mm4,shift
	psrad mm7,shift
	psrad mm3,shift
	packssdw mm4,mm7			; A0	a0
	movq [dst],mm4
	psrad mm0,shift
	packssdw mm0,mm3			; A1	a1
	movq [dst + 16],mm0
	movq [dst + 96],mm0
	movq [dst + 112],mm4
	psrad mm5,shift
	psrad mm6,shift
	psrad mm2,shift
	packssdw mm5,mm2			; A2-B2	a2-b2
	movq [dst + 32],mm5
	psrad mm1,shift
	packssdw mm6,mm1			; A3+B3	a3+b3
	movq [dst + 48],mm6
	movq [dst + 64],mm6
	movq [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro


; Column pass, two columns per call: src5 (rows 6-7) is known to be zero.
%macro	IDCT1		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm1,[src4]				; R6	R2	r6	r2
	movq mm2,[src1]				; R3	R1	r3	r1
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	movq mm5,[coeffs+32]		; C6	C2	C6	C2
	pmaddwd mm5,mm1				; C6R6+C2R2	C6r6+C2r2
	movq mm6,[coeffs+40]		; -C2	C6	-C2	C6
	pmaddwd mm1,mm6				; -C2R6+C6R2	-C2r6+C6r2
	; rounder_op mm4, rounder_arg
	movq mm6,mm4				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+48]		; C3	C1	C3	C1
	; rounder_op mm0, rounder_arg
	pmaddwd mm7,mm2				; C3R3+C1R1	C3r3+C1r1
	paddd mm4,mm5				; A0		a0
	psubd mm6,mm5				; A3		a3
	movq mm5,mm0				; -C4R4+C4R0	-C4r4+C4r0
	paddd mm0,mm1				; A1		a1
	psubd mm5,mm1				; A2		a2
	movq mm1,[coeffs+64]
	pmaddwd mm1,mm2				; -C7R3+C3R1	-C7r3+C3r1
	paddd mm7,mm4				; A0+B0		a0+b0
	paddd mm4,mm4				; 2A0		2a0
	psubd mm4,mm7				; A0-B0		a0-b0
	psrad mm7,shift
	psrad mm4,shift
	movq mm3,mm0				; A1		a1
	paddd mm0,mm1				; A1+B1		a1+b1
	psubd mm3,mm1				; A1-B1		a1-b1
	psrad mm0,shift
	psrad mm3,shift
	packssdw mm7,mm7			; A0+B0	a0+b0
	movd [dst],mm7
	packssdw mm0,mm0			; A1+B1	a1+b1
	movd [dst + 16],mm0
	packssdw mm3,mm3			; A1-B1	a1-b1
	movd [dst + 96],mm3
	packssdw mm4,mm4			; A0-B0	a0-b0
	movd [dst + 112],mm4
	movq mm4,[coeffs+80]		; -C1	C5	-C1 	C5
	pmaddwd mm4,mm2				; -C1R3+C5R1	-C1r3+C5r1
	pmaddwd mm2,[coeffs+96]		; -C5R3+C7R1	-C5r3+C7r1
	movq mm3,mm5				; A2		a2
	paddd mm3,mm4				; A2+B2		a2+b2
	psubd mm5,mm4				; a2-B2		a2-b2
	psrad mm3,shift
	psrad mm5,shift
	movq mm4,mm6				; A3		a3
	paddd mm6,mm2				; A3+B3		a3+b3
	psubd mm4,mm2				; a3-B3		a3-b3
	psrad mm6,shift
	packssdw mm3,mm3			; A2+B2	a2+b2
	movd [dst + 32],mm3
	psrad mm4,shift
	packssdw mm6,mm6			; A3+B3	a3+b3
	movd [dst + 48],mm6
	packssdw mm4,mm4			; A3-B3	a3-b3
	packssdw mm5,mm5			; A2-B2	a2-b2
	movd [dst + 64],mm4
	movd [dst + 80],mm5
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro




; Column pass, four columns per call (movq stores): only src0 (rows 0-1)
; is non-zero.
%macro	IDCT7		8
%define	src0		%1
%define	src4		%2
%define	src1		%3
%define	src5		%4
%define	dst			%5
%define	rounder_op	%6
%define	rounder_arg	%7
%define	shift		%8
	movq mm0,[src0]				; R4	R0	r4	r0
	movq mm4,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm4,mm0				; C4R4+C4R0	C4r4+C4r0
	movq mm5,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm0,mm5				; -C4R4+C4R0	-C4r4+C4r0
	; rounder_op mm4, rounder_arg
	; rounder_op mm0, rounder_arg
	psrad mm4,shift
	psrad mm0,shift
	movq mm2,[src0 + 8]			; R4	R0	r4	r0
	movq mm1,[coeffs+16]		; C4	C4	C4	C4
	pmaddwd mm1,mm2				; C4R4+C4R0	C4r4+C4r0
	movq mm7,[coeffs+24]		; -C4	C4	-C4	C4
	pmaddwd mm2,mm7				; -C4R4+C4R0	-C4r4+C4r0
	movq mm7,[coeffs+32]		; C6	C2	C6	C2
	; rounder_op mm1, rounder_arg
	; rounder_op mm2, rounder_arg
	psrad mm1,shift
	packssdw mm4,mm1			; A0	a0
	movq [dst],mm4
	psrad mm2,shift
	packssdw mm0,mm2			; A1	a1
	movq [dst + 16],mm0
	movq [dst + 96],mm0
	movq [dst + 112],mm4
	movq [dst + 32],mm0
	movq [dst + 48],mm4
	movq [dst + 64],mm4
	movq [dst + 80],mm0
%undef	src0
%undef	src4
%undef	src1
%undef	src5
%undef	dst
%undef	rounder_op
%undef	rounder_arg
%undef	shift
%endmacro



; export helper: prepend an underscore when PREFIX is defined, for targets
; whose C symbols are underscore-decorated
%macro cglobal 1
	%ifdef PREFIX
		global _%1 
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro


; void simple_idct_mmx(int16_t * const block);
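;
; A hedged usage sketch in C: the transform runs in place on the 64
; coefficients, which must already be in the interleaved row-pair order
; implied by the r/R register comments above. No emms is executed here, so
; the caller has to clear the MMX state before any x87 code runs;
; _mm_empty() from <mmintrin.h> is the usual way:
;
;   int16_t block[64];        /* 8x8 coefficients, suitably reordered */
;   simple_idct_mmx(block);   /* in-place inverse DCT */
;   _mm_empty();              /* leave the FPU usable again */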
align 16
cglobal simple_idct_mmx
simple_idct_mmx
	sub esp, 128				; 128-byte scratch buffer for the row-pass output
	mov edx, [esp+128+4]		; edx = block, the function's only argument
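
	; Rows 0-1 always go through DC_COND_IDCT; each following row pair goes
	; through Z_COND_IDCT, which branches to a specialised column path
	; (.four, .two, .one, ...) as soon as an all-zero row pair is found, so
	; the column pass can skip scratch inputs that are known to be zero.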

;				src0,	src4,	src1,	src5,	dst,	rndop,	rndarg,		shift,	bt

	DC_COND_IDCT edx+0,	edx+8,	edx+16,	edx+24,	esp,	paddd,	[coeffs+8],	11
	Z_COND_IDCT	edx+32,	edx+40,	edx+48,	edx+56,	esp+32,	paddd,	[coeffs],	11,		.four
	Z_COND_IDCT	edx+64,	edx+72,	edx+80,	edx+88,	esp+64,	paddd,	[coeffs],	11,		.two
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.one
	IDCT0		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT0		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT0		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT0		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

align 16
.four
	Z_COND_IDCT	edx+64,	edx+72,	edx+80,	edx+88,	esp+64,	paddd,	[coeffs],	11,		.six
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.five
	IDCT4		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT4		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT4		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT4		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

align 16
.six
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.seven
	IDCT6		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT6		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT6		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT6		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

align 16
.two
	Z_COND_IDCT	edx+96,	edx+104,edx+112,edx+120,esp+96,	paddd,	[coeffs],	11,		.three
	IDCT2		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT2		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT2		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT2		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

align 16
.three
	IDCT3		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT3		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT3		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT3		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

align 16
.five
	; IDCT5 stores four columns per call (movq), so two calls cover all
	; eight columns; the interleaved pair below is intentionally disabled
	IDCT5		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	; IDCT5		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT5		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	; IDCT5		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

align 16
.one
	IDCT1		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	IDCT1		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT1		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	IDCT1		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20
	jmp	.ret

align 16
.seven
	; IDCT7 stores four columns per call (movq), so two calls cover all
	; eight columns; the interleaved pair below is intentionally disabled
	IDCT7		esp,	esp+64,	esp+32,	esp+96,	edx,	nop,	0,			20
	; IDCT7		esp+8,	esp+72,	esp+40,	esp+104,edx+4,	nop,	0,			20
	IDCT7		esp+16,	esp+80,	esp+48,	esp+112,edx+8,	nop,	0,			20
	; IDCT7		esp+24,	esp+88,	esp+56,	esp+120,edx+12,	nop,	0,			20

.ret
	add esp, 128
	ret


