;/**************************************************************************
; *
; *	XVID MPEG-4 VIDEO CODEC
; *	colorspace rgb
; *
; *	This program is free software; you can redistribute it and/or modify
; *	it under the terms of the GNU General Public License as published by
; *	the Free Software Foundation; either version 2 of the License, or
; *	(at your option) any later version.
; *
; *	This program is distributed in the hope that it will be useful,
; *	but WITHOUT ANY WARRANTY; without even the implied warranty of
; *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *	GNU General Public License for more details.
; *
; *	You should have received a copy of the GNU General Public License
; *	along with this program; if not, write to the Free Software
; *	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; *************************************************************************/

;/**************************************************************************
; *
; *	History:
; *
; *	10.10.2001	initial version; (c)2002 peter ross <pross@xvid.org>
; *
; *************************************************************************/


bits 32

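; cglobal: declare a global symbol, prepending an underscore when PREFIX is
; defined (needed by the C symbol naming convention on some platforms)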
%macro cglobal 1 
	%ifdef PREFIX
		global _%1 
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro


section .data
align 16

;===========================================================================
; RGB->YV12 yuv constants
;===========================================================================
%define Y_R		0.257
%define Y_G		0.504
%define Y_B		0.098
%define Y_ADD	16

%define U_R		0.148
%define U_G		0.291
%define U_B		0.439
%define U_ADD	128

%define V_R		0.439
%define V_G		0.368
%define V_B		0.071
%define V_ADD	128
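
; For reference: these match the usual ITU-R BT.601 "studio swing"
; coefficients, i.e. the forward transform they describe is approximately
;
;	Y = 16  + 0.257*R + 0.504*G + 0.098*B
;	U = 128 - 0.148*R - 0.291*G + 0.439*B
;	V = 128 + 0.439*R - 0.368*G - 0.071*B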

;===========================================================================
; RGB->YV12 multiplication matrices
;===========================================================================
;				FIX(Y_B)	FIX(Y_G)	FIX(Y_R)		
y_mul	dw		 25,		 129,		 66,			0
u_mul	dw		 112,		-74,		-38,			0
v_mul	dw		-18,		-94,		 112,			0
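
; FIX(x) above is x in 8-bit fixed point, i.e. round(x * 256):
; FIX(0.098) = 25, FIX(0.504) = 129, FIX(0.257) = 66, and so on.  The words
; are stored in B, G, R order to match the BGR byte layout of the source
; pixels; the fourth (zero) word makes pmaddwd ignore the unused high word
; of each unpacked pixel.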


;===========================================================================
; YV12->RGB data 
;===========================================================================
%define SCALEBITS 6
Y_SUB		dw  16,  16,  16,  16
U_SUB		dw 128, 128, 128, 128
V_SUB		dw 128, 128, 128, 128

Y_MUL		dw  74,  74,  74,  74

UG_MUL		dw  25,  25,  25,  25
VG_MUL		dw  52,  52,  52,  52

UB_MUL		dw 129, 129, 129, 129
VR_MUL		dw 102, 102, 102, 102
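
; For reference, these constants give the inverse transform in 6-bit fixed
; point (SCALEBITS = 6), approximately:
;
;	R = ((Y - 16)*74 + (V - 128)*102              ) >> 6
;	G = ((Y - 16)*74 - (U - 128)*25 - (V - 128)*52) >> 6
;	B = ((Y - 16)*74 + (U - 128)*129              ) >> 6
;
; i.e. 74/64, 102/64, 25/64, 52/64 and 129/64 are the usual BT.601
; YCbCr->RGB coefficients 1.164, 1.596, 0.391, 0.813 and 2.018.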



section .text

%include "colorspace_mmx.inc"
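
; colorspace_mmx.inc provides the MAKE_COLORSPACE wrapper used at the end of
; this file.  Judging from the register usage below (not verified against the
; .inc file itself), the convention it sets up is:
;	esi = y_ptr, ebx = u_ptr, ecx = v_ptr, edi = rgb pointer,
;	eax = y stride, edx = rgb stride.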


;------------------------------------------------------------------------------
; BGR_TO_YV12( BYTES )
;
; BYTES		3=bgr(24-bit), 4=bgra(32-bit)
;
; bytes=3/4, pixels = 2, vpixels=2
;------------------------------------------------------------------------------
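; Outline of one BGR_TO_YV12 step: a 2x2 block of BGR pixels becomes four Y
; samples plus one U and one V sample (4:2:0 subsampling).
;	- each pixel is unpacked to words and pmaddwd'ed against y_mul, so the
;	  per-pixel luma sum is scaled by 256; "shr edx, 8" removes that scale
;	  before Y_ADD (16) is added.
;	- the four unpacked pixels are also summed into mm6; that sum is
;	  pmaddwd'ed against u_mul/v_mul, giving chroma scaled by 4*256, hence
;	  "shr edx, 10" before U_ADD/V_ADD (128) is added.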
%macro BGR_TO_YV12_INIT		2
		movq mm7, [y_mul]		; keep the y coefficients in mm7 for the whole loop
%endmacro


%macro BGR_TO_YV12			2
		; y_out
		pxor mm4, mm4
		pxor mm5, mm5
		movd mm0, [edi]			; x_ptr[0...]
		movd mm2, [edi+edx]		; x_ptr[x_stride...]
		punpcklbw mm0, mm4		; [ x| r| g| b] (words, high to low)
		punpcklbw mm2, mm5		; [ x| r| g| b]
		movq mm6, mm0			; start the 2x2 sum [ x|r4|g4|b4]
		paddw mm6, mm2			; += pixel below
		pmaddwd mm0, mm7		; *= y_mul -> [r*66 | b*25+g*129]
		pmaddwd mm2, mm7		; *= y_mul
		movq mm4, mm0
		movq mm5, mm2
		psrlq mm4, 32			; isolate the r term
		psrlq mm5, 32
		paddd mm0, mm4			; low dword = b+g+r terms = 256*(Y-16)
		paddd mm2, mm5

		pxor mm4, mm4
		pxor mm5, mm5
		movd mm1, [edi+%1]		; src[%1...]
		movd mm3, [edi+edx+%1]	; src[x_stride+%1...]
		punpcklbw mm1, mm4		; [ x| r| g| b]
		punpcklbw mm3, mm5		; [ x| r| g| b]
		paddw mm6, mm1			; += third pixel of the 2x2 block
		paddw mm6, mm3			; += fourth pixel -> [ x|r4|g4|b4]
		pmaddwd mm1, mm7		; *= y_mul
		pmaddwd mm3, mm7		; *= y_mul
		movq mm4, mm1
		movq mm5, mm3
		psrlq mm4, 32			; isolate the r term
		psrlq mm5, 32
		paddd mm1, mm4			; low dword = 256*(Y-16)
		paddd mm3, mm5

		push	edx

		movd edx, mm0
		shr edx, 8
		add edx, Y_ADD
		mov [esi], dl			; y_ptr[0]

		movd edx, mm1
		shr edx, 8
		add edx, Y_ADD
		mov [esi + 1], dl		; y_ptr[1]

		movd edx, mm2
		shr edx, 8
		add edx, Y_ADD
		mov [esi + eax + 0], dl			; y_ptr[y_stride + 0]

		movd edx, mm3
		shr edx, 8
		add edx, Y_ADD
		mov [esi + eax + 1], dl			; y_ptr[y_stride + 1]

		; u_ptr, v_ptr

		movq mm0, mm6			; 2x2 sums [ x|r4|g4|b4]
		pmaddwd mm6, [v_mul]		; *= v_mul
		pmaddwd mm0, [u_mul]		; *= u_mul
		movq mm1, mm0
		movq mm2, mm6
		psrlq mm1, 32
		psrlq mm2, 32
		paddd mm0, mm1			; u term, scaled by 4*256
		paddd mm2, mm6			; v term, scaled by 4*256

		movd edx, mm0
		shr edx, 10			; /(4*256)
		add edx, U_ADD
		mov [ebx], dl			; u_ptr[0]

		movd edx, mm2
		shr edx, 10			; /(4*256)
		add edx, V_ADD
		mov [ecx], dl			; v_ptr[0]

		pop edx
%endmacro
;------------------------------------------------------------------------------





;------------------------------------------------------------------------------
; YV12_TO_BGR( BYTES )
;
; BYTES		3=bgr(24-bit), 4=bgra(32-bit)
;
; bytes=3/4, pixels = 8, vpixels=2
;------------------------------------------------------------------------------
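; Outline of one YV12_TO_BGR step: 8 pixels on each of two rows are written
; per iteration, sharing one row of 4 U and 4 V samples (4:2:0).  The chroma
; terms ((U,V) - 128 times the *_MUL constants) are computed once, duplicated
; to pixel pairs with punpcklwd/punpckhwd and added to the scaled luma.  Six
; 8-byte stack temporaries (TEMP_Y1/2, TEMP_G1/2, TEMP_B1/2) hold partial
; results; they are the 48 bytes of local stack space requested in the
; MAKE_COLORSPACE lines at the end of this file.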
%macro YV12_TO_BGR_INIT		2
		pxor mm7, mm7			; clear mm7
%endmacro


%macro YV12_TO_BGR			2
%define TEMP_Y1  esp
%define TEMP_Y2  esp + 8
%define TEMP_G1  esp + 16
%define TEMP_G2  esp + 24
%define TEMP_B1  esp + 32
%define TEMP_B2  esp + 40
	movd mm2, [ebx]		; u_ptr[0]
	movd mm3, [ecx]		; v_ptr[0]

	punpcklbw mm2, mm7		; u3u2u1u0 -> mm2
	punpcklbw mm3, mm7		; v3v2v1v0 -> mm3

	psubsw mm2, [U_SUB]		; U - 128
	psubsw mm3, [V_SUB]		; V - 128

	movq mm4, mm2
	movq mm5, mm3

	pmullw mm2, [UG_MUL]
	pmullw mm3, [VG_MUL]

	movq mm6, mm2			; u3u2u1u0 -> mm6
	punpckhwd mm2, mm2		; u3u3u2u2 -> mm2
	punpcklwd mm6, mm6		; u1u1u0u0 -> mm6

	pmullw mm4, [UB_MUL]		; (u - 128) * UB_MUL (blue term) -> mm4

	movq mm0, mm3
	punpckhwd mm3, mm3		; v3v3v2v2 -> mm3
	punpcklwd mm0, mm0		; v1v1v0v0 -> mm0

	paddsw mm2, mm3
	paddsw mm6, mm0

	pmullw mm5, [VR_MUL]		; (v - 128) * VR_MUL (red term) -> mm5

	movq mm0, [esi]			; y7y6y5y4y3y2y1y0 -> mm0

	movq mm1, mm0
	punpckhbw mm1, mm7		; y7y6y5y4 -> mm1
	punpcklbw mm0, mm7		; y3y2y1y0 -> mm0

	psubsw mm0, [Y_SUB]		; Y - Y_SUB
	psubsw mm1, [Y_SUB]		; Y - Y_SUB

	pmullw mm1, [Y_MUL] 
	pmullw mm0, [Y_MUL]

	movq [TEMP_Y2], mm1		; y7y6y5y4 -> TEMP_Y2
	movq [TEMP_Y1], mm0		; y3y2y1y0 -> TEMP_Y1

	psubsw mm1, mm2			; g7g6g5g4 -> mm1
	psubsw mm0, mm6			; g3g2g1g0 -> mm0

	psraw mm1, SCALEBITS
	psraw mm0, SCALEBITS

	packuswb mm0, mm1		;g7g6g5g4g3g2g1g0 -> mm0

	movq [TEMP_G1], mm0

	movq mm0, [esi+eax]			; y7y6y5y4y3y2y1y0 -> mm0

	movq mm1, mm0

	punpckhbw mm1, mm7		; y7y6y5y4 -> mm1
	punpcklbw mm0, mm7		; y3y2y1y0 -> mm0

	psubsw mm0, [Y_SUB]		; Y - Y_SUB
	psubsw mm1, [Y_SUB]		; Y - Y_SUB

	pmullw mm1, [Y_MUL] 
	pmullw mm0, [Y_MUL]

	movq mm3, mm1
	psubsw mm1, mm2			; g7g6g5g4 -> mm1

	movq mm2, mm0
	psubsw mm0, mm6			; g3g2g1g0 -> mm0

	psraw mm1, SCALEBITS
	psraw mm0, SCALEBITS

	packuswb mm0, mm1		; g7g6g5g4g3g2g1g0 -> mm0

	movq [TEMP_G2], mm0

	movq mm0, mm4
	punpckhwd mm4, mm4		; u3u3u2u2 -> mm4
	punpcklwd mm0, mm0		; u1u1u0u0 -> mm0

	movq mm1, mm3			; y7y6y5y4 -> mm1
	paddsw mm3, mm4			; b7b6b5b4 -> mm3

	movq mm7, mm2			; y3y2y1y0 -> mm7

	paddsw mm2, mm0			; b3b2b1b0 -> mm2

	psraw mm3, SCALEBITS
	psraw mm2, SCALEBITS

	packuswb mm2, mm3		; b7b6b5b4b3b2b1b0 -> mm2

	movq [TEMP_B2], mm2

	movq mm3, [TEMP_Y2]
	movq mm2, [TEMP_Y1]

	movq mm6, mm3			; TEMP_Y2 -> mm6
	paddsw mm3, mm4			; b7b6b5b4 -> mm3

	movq mm4, mm2			; TEMP_Y1 -> mm4
	paddsw mm2, mm0			; b3b2b1b0 -> mm2

	psraw mm3, SCALEBITS
	psraw mm2, SCALEBITS

	packuswb mm2, mm3		; b7b6b5b4b3b2b1b0 -> mm2

	movq [TEMP_B1], mm2

	movq mm0, mm5
	punpckhwd mm5, mm5		; v3v3v2v2 -> mm5
	punpcklwd mm0, mm0		; v1v1v0v0 -> mm0

	paddsw mm1, mm5			; r7r6r5r4 -> mm1
	paddsw mm7, mm0			; r3r2r1r0 -> mm7

	psraw mm1, SCALEBITS
	psraw mm7, SCALEBITS

	packuswb mm7, mm1		; r7r6r5r4r3r2r1r0 -> mm7 (TEMP_R2)

	paddsw mm6, mm5			; r7r6r5r4 -> mm6
	paddsw mm4, mm0			; r3r2r1r0 -> mm4

	psraw mm6, SCALEBITS
	psraw mm4, SCALEBITS

	packuswb mm4, mm6		; r7r6r5r4r3r2r1r0 -> mm4 (TEMP_R1)
	
	movq mm0, [TEMP_B1]
	movq mm1, [TEMP_G1]

	movq mm6, mm7			; keep packed row-2 r in mm6 for the second row

	movq mm2, mm0
	punpcklbw mm2, mm4		; r3b3r2b2r1b1r0b0 -> mm2
	punpckhbw mm0, mm4		; r7b7r6b6r5b5r4b4 -> mm0

	pxor mm7, mm7			; zero for byte unpacking

	movq mm3, mm1
	punpcklbw mm1, mm7		; 0g30g20g10g0 -> mm1
	punpckhbw mm3, mm7		; 0g70g60g50g4 -> mm3

	movq mm4, mm2
	punpcklbw mm2, mm1		; 0r1g1b10r0g0b0 -> mm2
	punpckhbw mm4, mm1		; 0r3g3b30r2g2b2 -> mm4

	movq mm5, mm0
	punpcklbw mm0, mm3		; 0r5g5b50r4g4b4 -> mm0
	punpckhbw mm5, mm3		; 0r7g7b70r6g6b6 -> mm5

%if %1 == 3		; BGR (24-bit)
	movd [edi], mm2			
	psrlq mm2, 32

	movd [edi + 3], mm2
	movd [edi + 6], mm4		

	psrlq mm4, 32

	movd [edi + 9], mm4
	movd [edi + 12], mm0	

	psrlq mm0, 32

	movd [edi + 15], mm0
	movd [edi + 18], mm5	

	psrlq mm5, 32

	movd [edi + 21], mm5	

	movq mm0, [TEMP_B2]
	movq mm1, [TEMP_G2]

	movq mm2, mm0
	punpcklbw mm2, mm6		; r3b3r2b2r1b1r0b0 -> mm2
	punpckhbw mm0, mm6		; r7b7r6b6r5b5r4b4 -> mm0

	movq mm3, mm1 
	punpcklbw mm1, mm7		; 0g30g20g10g0 -> mm1
	punpckhbw mm3, mm7		; 0g70g60g50g4 -> mm3

	movq mm4, mm2
	punpcklbw mm2, mm1		; 0r1g1b10r0g0b0 -> mm2
	punpckhbw mm4, mm1		; 0r3g3b30r2g2b2 -> mm4

	movq mm5, mm0
	punpcklbw mm0, mm3		; 0r5g5b50r4g4b4 -> mm0
	punpckhbw mm5, mm3		; 0r7g7b70r6g6b6 -> mm5

	movd [edi+edx], mm2
	psrlq mm2, 32

	movd [edi+edx + 3], mm2
	movd [edi+edx + 6], mm4

	psrlq mm4, 32

	movd [edi+edx + 9], mm4
	movd [edi+edx + 12], mm0

	psrlq mm0, 32

	movd [edi+edx + 15], mm0
	movd [edi+edx + 18], mm5

	psrlq mm5, 32

	movd [edi+edx + 21], mm5

%else		; BGRA (32-bit)
	movq [edi], mm2			
	movq [edi + 8], mm4		
	movq [edi + 16], mm0	
	movq [edi + 24], mm5	

	movq mm0, [TEMP_B2]
	movq mm1, [TEMP_G2]

	movq mm2, mm0
	punpcklbw mm2, mm6		; r3b3r2b2r1b1r0b0 -> mm2
	punpckhbw mm0, mm6		; r7b7r6b6r5b5r4b4 -> mm0

	movq mm3, mm1 
	punpcklbw mm1, mm7		; 0g30g20g10g0 -> mm1
	punpckhbw mm3, mm7		; 0g70g60g50g4 -> mm3

	movq mm4, mm2
	punpcklbw mm2, mm1		; 0r1g1b10r0g0b0 -> mm2
	punpckhbw mm4, mm1		; 0r3g3b30r2g2b2 -> mm4

	movq mm5, mm0
	punpcklbw mm0, mm3		; 0r5g5b50r4g4b4 -> mm0
	punpckhbw mm5, mm3		; 0r7g7b70r6g6b6 -> mm5

	movq [edi + edx], mm2
	movq [edi + edx + 8], mm4
	movq [edi + edx + 16], mm0
	movq [edi + edx + 24], mm5
%endif

%undef TEMP_Y1
%undef TEMP_Y2
%undef TEMP_G1
%undef TEMP_G2
%undef TEMP_B1
%undef TEMP_B2
%endmacro
;------------------------------------------------------------------------------




; input

MAKE_COLORSPACE  bgr_to_yv12_mmx,0,    3,2,2,  BGR_TO_YV12,  3, -1
MAKE_COLORSPACE  bgra_to_yv12_mmx,0,   4,2,2,  BGR_TO_YV12,  4, -1

; output

MAKE_COLORSPACE  yv12_to_bgr_mmx,48,   3,8,2,  YV12_TO_BGR,  3, -1
MAKE_COLORSPACE  yv12_to_bgra_mmx,48,  4,8,2,  YV12_TO_BGR,  4, -1
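
; The MAKE_COLORSPACE parameters are defined in colorspace_mmx.inc; from the
; invocations above they appear to be: function name, bytes of local stack
; space, bytes per pixel, horizontal pixels per step, vertical pixels per
; step, the per-block conversion macro, and the arguments handed to that
; macro.  This file generates bgr_to_yv12_mmx, bgra_to_yv12_mmx,
; yv12_to_bgr_mmx and yv12_to_bgra_mmx.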

