trunk/xvidcore/src/image/x86_asm/rgb_to_yv12_mmx.asm
Revision 651 - Sun Nov 17 00:20:30 2002 UTC by edgomez - File size: 11273 byte(s) - Log: License changed
;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *	 mmx rgb24/rgb32 to yv12 colorspace conversion
; *
; *  Copyright(C) 2002 Peter Ross <pross@xvid.org>
; *
; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; *  XviD is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; *  Under section 8 of the GNU General Public License, the copyright
; *  holders of XVID explicitly forbid distribution in the following
; *  countries:
; *
; *    - Japan
; *    - United States of America
; *
; *  Linking XviD statically or dynamically with other modules is making a
; *  combined work based on XviD.  Thus, the terms and conditions of the
; *  GNU General Public License cover the whole combination.
; *
; *  As a special exception, the copyright holders of XviD give you
; *  permission to link XviD with independent modules that communicate with
; *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; *  license terms of these independent modules, and to copy and distribute
; *  the resulting combined work under terms of your choice, provided that
; *  every copy of the combined work is accompanied by a complete copy of
; *  the source code of XviD (the version of XviD used to produce the
; *  combined work), being distributed under the terms of the GNU General
; *  Public License plus this exception.  An independent module is a module
; *  which is not derived from or based on XviD.
; *
; *  Note that people who make modified versions of XviD are not obligated
; *  to grant this special exception for their modified versions; it is
; *  their choice whether to do so.  The GNU General Public License gives
; *  permission to release a modified version without this exception; this
; *  exception also makes it possible to release a modified version which
; *  carries forward this exception.
; *
; * $Id: rgb_to_yv12_mmx.asm,v 1.3 2002-11-17 00:20:30 edgomez Exp $
; *
; ****************************************************************************/

bits 32


section .data

%macro cglobal 1 
	%ifdef PREFIX
		global _%1 
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

align 16


;===========================================================================
; yuv constants
;===========================================================================

%define Y_R		0.257
%define Y_G		0.504
%define Y_B		0.098
%define Y_ADD	16

%define U_R		0.148
%define U_G		0.291
%define U_B		0.439
%define U_ADD	128

%define V_R		0.439
%define V_G		0.368
%define V_B		0.071
%define V_ADD	128


;===========================================================================
; multiplication matrices
;===========================================================================

; %define SCALEBITS 8

y_mul	dw		25		; FIX(Y_B)
		dw		129		; FIX(Y_G)
		dw		66		; FIX(Y_R)
		dw		0

u_mul	dw		112		; FIX(U_B)
		dw		-74		; -FIX(U_G)
		dw		-38		; -FIX(U_R)
		dw		0

v_mul	dw		-18		; -FIX(V_B)
		dw		-94		; -FIX(V_G)
		dw		112		; FIX(V_R)
		dw		0
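
;---------------------------------------------------------------------------
; The word tables above are the BT.601 studio-swing coefficients scaled by
; 2^SCALEBITS (= 256); presumably they were generated with something like
; the C sketch below (the FIX macro and the printing program are
; illustrative assumptions, not part of this file):
;
;   #include <stdio.h>
;
;   #define SCALEBITS 8
;   #define FIX(x)    ((int)((x) * (1 << SCALEBITS) + 0.5))
;
;   int main(void)
;   {
;       /* y_mul = {  FIX(Y_B),  FIX(Y_G),  FIX(Y_R), 0 } = {  25, 129,  66, 0 } */
;       /* u_mul = {  FIX(U_B), -FIX(U_G), -FIX(U_R), 0 } = { 112, -74, -38, 0 } */
;       /* v_mul = { -FIX(V_B), -FIX(V_G),  FIX(V_R), 0 } = { -18, -94, 112, 0 } */
;       printf("%d %d %d\n",  FIX(0.098),  FIX(0.504),  FIX(0.257));
;       printf("%d %d %d\n",  FIX(0.439), -FIX(0.291), -FIX(0.148));
;       printf("%d %d %d\n", -FIX(0.071), -FIX(0.368),  FIX(0.439));
;       return 0;
;   }
;---------------------------------------------------------------------------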



section .text

;===========================================================================
;
;	void rgb24_to_yv12_mmx(uint8_t * const y_out,
;						uint8_t * const u_out,
;						uint8_t * const v_out,
;						const uint8_t * const src,
;						const uint32_t width,
;						const uint32_t height,
;						const uint32_t stride)
;
; always flips
;
;===========================================================================
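
;---------------------------------------------------------------------------
; For reference, a scalar C sketch of the computation performed below.  It is
; illustrative only: it assumes even width/height, a packed bottom-up BGR
; source with width*3 bytes per row, arithmetic right shifts on signed ints,
; and the function name rgb24_to_yv12_ref is not taken from XviD.
;
;   #include <stdint.h>
;
;   static void rgb24_to_yv12_ref(uint8_t *y_out, uint8_t *u_out, uint8_t *v_out,
;                                 const uint8_t *src,
;                                 uint32_t width, uint32_t height, uint32_t stride)
;   {
;       for (uint32_t y = 0; y < height; y += 2)
;       for (uint32_t x = 0; x < width;  x += 2) {
;           int r4 = 0, g4 = 0, b4 = 0;
;           for (int dy = 0; dy < 2; dy++)
;           for (int dx = 0; dx < 2; dx++) {
;               /* flip: output row y+dy comes from source row height-1-(y+dy) */
;               const uint8_t *p = src + (height - 1 - (y + dy)) * width * 3
;                                      + (x + dx) * 3;
;               int b = p[0], g = p[1], r = p[2];
;               b4 += b; g4 += g; r4 += r;
;               y_out[(y + dy) * stride + x + dx] =
;                   (uint8_t)(16 + ((25 * b + 129 * g + 66 * r) >> 8));
;           }
;           /* chroma: one sample per 2x2 block, >>10 = 8 scale bits + /4 average */
;           u_out[(y / 2) * (stride / 2) + x / 2] =
;               (uint8_t)(128 + (( 112 * b4 - 74 * g4 -  38 * r4) >> 10));
;           v_out[(y / 2) * (stride / 2) + x / 2] =
;               (uint8_t)(128 + (( -18 * b4 - 94 * g4 + 112 * r4) >> 10));
;       }
;   }
;---------------------------------------------------------------------------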

align 16
cglobal rgb24_to_yv12_mmx
rgb24_to_yv12_mmx

		push ebx
		push ecx
		push esi
		push edi			
		push ebp			; STACK BASE = 20

		; global constants

		mov eax, [esp + 20 + 28]	; stride
		mov ecx, [esp + 20 + 20]	; width
		mov ebx, eax
		sub ebx, ecx				
		shr ebx, 1					; ebx = (stride-width) / 2;
		push ebx					; [esp + 20] = uv_dif
							; STACK BASE = 24

		add eax, eax
		sub eax, ecx				; eax = 2*stride - width
		push eax					; [esp + 16] = y_dif
							; STACK BASE = 28

		mov ebx, ecx				; 
		shr ebx, 1					;
		push ebx					; [esp + 12] = width/2
							; STACK BASE = 32

		mov edx, ecx
		add ecx, edx
		add ecx, edx				; ecx = 3*width   (use 4 for rgb32)
		push ecx					; [esp + 8] = width3
							; STACK BASE = 36

		mov edx, ecx
		add edx, ecx
		add edx, ecx				; edx = 3*width3
		push edx					; [esp + 4] = src_dif
							; STACK BASE = 40

		mov esi, [esp + 40 + 16]	; src
		mov ebp, [esp + 40 + 24]	; ebp = height
		mov eax, ebp
		sub eax, 2
		mul ecx
		add esi, eax				; src += (height-2) * width3
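		; the source is treated as a bottom-up bitmap: starting at the
		; second-to-last row and rewinding by src_dif after each row pair
		; walks the source upward while the y/u/v pointers move forward,
		; hence the "always flips" note above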

		mov edi, [esp + 40 + 4]		; y_out
		mov ecx, [esp + 40 + 8]		; u_out
		mov edx, [esp + 40 + 12]	; v_out
		movq mm7, [y_mul]		

		shr ebp, 1				; ebp = height / 2
		push ebp				; [esp+0] = tmp
								; STACK BASE = 44
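
		; local stack layout used inside the loops:
		;   [esp +  0] = rows remaining (height/2)
		;   [esp +  4] = src_dif = 3*width3  (with the +width3 consumed per
		;                row pair this steps the source back two rows)
		;   [esp +  8] = width3  = 3*width   (bytes per source row)
		;   [esp + 12] = width/2
		;   [esp + 16] = y_dif   = 2*stride - width
		;   [esp + 20] = uv_dif  = (stride - width)/2
		;   [esp + 44 + 4 ... + 28] = the original arguments (y_out ... stride)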

.yloop
		mov ebp, [esp + 12]		; ebp = width /2 

.xloop
			; y_out

			mov ebx, [esp + 8]			; ebx = width3

			pxor mm4, mm4
			pxor mm5, mm5
			movd mm0, [esi]			; src[0...]
			movd mm2, [esi+ebx]		; src[width3...]
			punpcklbw mm0, mm4		; [  |b |g |r ]
			punpcklbw mm2, mm5		; [  |b |g |r ]
			movq mm6, mm0			; = [  |b4|g4|r4]
			paddw mm6, mm2			; +[  |b4|g4|r4]
			pmaddwd mm0, mm7		; *= Y_MUL
			pmaddwd mm2, mm7		; *= Y_MUL
			movq mm4, mm0			; [r]
			movq mm5, mm2			; [r]
			psrlq mm4, 32			; +[g]
			psrlq mm5, 32			; +[g]
			paddd mm0, mm4			; +[b]
			paddd mm2, mm5			; +[b]

			pxor mm4, mm4
			pxor mm5, mm5
			movd mm1, [esi+3]		; src[3...]
			movd mm3, [esi+ebx+3]	; src[width3+3...]
			punpcklbw mm1, mm4		; [  |b |g |r ]
			punpcklbw mm3, mm5		; [  |b |g |r ]
			paddw mm6, mm1			; +[  |b4|g4|r4]
			paddw mm6, mm3			; +[  |b4|g4|r4]
			pmaddwd mm1, mm7		; *= Y_MUL
			pmaddwd mm3, mm7		; *= Y_MUL
			movq mm4, mm1			; [r]
			movq mm5, mm3			; [r]
			psrlq mm4, 32			; +[g]
			psrlq mm5, 32			; +[g]
			paddd mm1, mm4			; +[b]
			paddd mm3, mm5			; +[b]

			mov ebx, [esp + 44 + 28]	; stride

			movd eax, mm0
			shr eax, 8
			add eax, Y_ADD
			mov [edi + ebx], al

			movd eax, mm1
			shr eax, 8
			add eax, Y_ADD
			mov [edi + ebx + 1], al

			movd eax, mm2
			shr eax, 8
			add eax, Y_ADD
			mov [edi], al

			movd eax, mm3
			shr eax, 8
			add eax, Y_ADD
			mov [edi + 1], al

			; u_out, v_out
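			; mm6 now holds the sum over the four pixels of this 2x2 block,
			; so the shift by 10 below is 8 scale bits plus 2 bits for the
			; implicit divide-by-4 average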

			movq mm0, mm6			; = [  |b4|g4|r4]
			pmaddwd mm6, [v_mul]		; *= V_MUL
			pmaddwd mm0, [u_mul]		; *= U_MUL
			movq mm1, mm0
			movq mm2, mm6
			psrlq mm1, 32
			psrlq mm2, 32
			paddd mm0, mm1
			paddd mm2, mm6

			movd eax, mm0
			shr eax, 10
			add eax, U_ADD
			mov [ecx], al

			movd eax, mm2
			shr eax, 10
			add eax, V_ADD
			mov [edx], al

			add esi, 2 * 3			; (use 4 for rgb32)
			add edi, 2
			inc ecx
			inc edx

			dec ebp
			jnz near .xloop

		sub esi, [esp + 4]			; src  -= src_dif
		add edi, [esp + 16]			; y_out += y_dif
		add ecx, [esp + 20]			; u_out += uv_dif
		add edx, [esp + 20]			; v_out += uv_dif

		dec dword [esp+0]
		jnz near .yloop

		emms

		add esp, 24
		pop ebp
		pop edi
		pop esi
		pop ecx
		pop ebx

		ret
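
;---------------------------------------------------------------------------
; A hypothetical caller, to illustrate the expected buffer sizes (sketch
; only; convert_frame and the malloc-based layout are assumptions, not
; taken from XviD):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   extern void rgb24_to_yv12_mmx(uint8_t *y_out, uint8_t *u_out, uint8_t *v_out,
;                                 const uint8_t *src, uint32_t width,
;                                 uint32_t height, uint32_t stride);
;
;   void convert_frame(const uint8_t *rgb, uint32_t w, uint32_t h, uint32_t stride)
;   {
;       uint8_t *y = malloc((size_t)stride * h);             /* luma plane    */
;       uint8_t *u = malloc((size_t)(stride / 2) * (h / 2)); /* chroma planes */
;       uint8_t *v = malloc((size_t)(stride / 2) * (h / 2)); /* (half stride) */
;       rgb24_to_yv12_mmx(y, u, v, rgb, w, h, stride);       /* w, h even     */
;       /* ... feed the YV12 planes to the encoder ... */
;       free(y); free(u); free(v);
;   }
;---------------------------------------------------------------------------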



;===========================================================================
;
;	void rgb32_to_yv12_mmx(uint8_t * const y_out,
;						uint8_t * const u_out,
;						uint8_t * const v_out,
;						const uint8_t * const src,
;						const uint32_t width,
;						const uint32_t height,
;						const uint32_t stride)
;
; always flips
;
;===========================================================================
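
;---------------------------------------------------------------------------
; Same computation as rgb24_to_yv12_mmx above, but with 4 bytes per source
; pixel (b, g, r, ignored filler byte) and 8-byte loads that fetch two
; pixels at a time.
;---------------------------------------------------------------------------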

align 16
cglobal rgb32_to_yv12_mmx
rgb32_to_yv12_mmx

		push ebx
		push ecx
		push esi
		push edi			
		push ebp			; STACK BASE = 20

		; global constants

		mov eax, [esp + 20 + 28]	; stride
		mov ecx, [esp + 20 + 20]	; width
		mov ebx, eax
		sub ebx, ecx				
		shr ebx, 1					; ebx = (stride-width) / 2;
		push ebx					; [esp + 20] = uv_dif
							; STACK BASE = 24

		add eax, eax
		sub eax, ecx				; eax = 2*stride - width
		push eax					; [esp + 16] = y_dif
							; STACK BASE = 28

		mov ebx, ecx				; 
		shr ebx, 1					;
		push ebx					; [esp + 12] = width/2
							; STACK BASE = 32

		mov edx, ecx
		shl ecx, 2					; ecx = 4*width (4 bytes per rgb32 pixel)
		push ecx					; [esp + 8] = width4
							; STACK BASE = 36

		mov edx, ecx
		add edx, ecx
		add edx, ecx				; edx = 3*width4
		push edx					; [esp + 4] = src_dif
							; STACK BASE = 40

		mov esi, [esp + 40 + 16]	; src
		mov ebp, [esp + 40 + 24]	; ebp = height
		mov eax, ebp
		sub eax, 2
		mul ecx
		add esi, eax				; src += (height-2) * width4

		mov edi, [esp + 40 + 4]		; y_out
		mov ecx, [esp + 40 + 8]		; u_out
		mov edx, [esp + 40 + 12]	; v_out
		movq mm7, [y_mul]		

		shr ebp, 1				; ebp = height / 2
		push ebp				; [esp+0] = tmp
								; STACK BASE = 44

.yloop
		mov ebp, [esp + 12]		; ebp = width /2 

.xloop
			; y_out

			mov ebx, [esp + 8]			; ebx = width4

			pxor mm4, mm4
			movq mm0, [esi]			; src[4...       |0...     ]
			movq mm2, [esi+ebx]		; src[width4+4...|width4...]
			movq mm1, mm0
			movq mm3, mm2
			punpcklbw mm0, mm4		; [  |b |g |r ]
			punpcklbw mm2, mm4		; [  |b |g |r ]
			punpckhbw mm1, mm4		; [  |b |g |r ]
			punpckhbw mm3, mm4		; [  |b |g |r ]
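			; mm0/mm2 hold pixel 0 of each of the two rows, mm1/mm3 hold
			; pixel 1: a single 8-byte movq per row fetches both 4-byte
			; rgb32 pixels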

			movq mm6, mm0			; = [  |b4|g4|r4]
			paddw mm6, mm2			; +[  |b4|g4|r4]
			pmaddwd mm0, mm7		; *= Y_MUL
			pmaddwd mm2, mm7		; *= Y_MUL
			movq mm4, mm0			; [r]
			movq mm5, mm2			; [r]
			psrlq mm4, 32			; +[g]
			psrlq mm5, 32			; +[g]
			paddd mm0, mm4			; +[b]
			paddd mm2, mm5			; +[b]

			paddw mm6, mm1			; +[  |b4|g4|r4]
			paddw mm6, mm3			; +[  |b4|g4|r4]
			pmaddwd mm1, mm7		; *= Y_MUL
			pmaddwd mm3, mm7		; *= Y_MUL
			movq mm4, mm1			; [r]
			movq mm5, mm3			; [r]
			psrlq mm4, 32			; +[g]
			psrlq mm5, 32			; +[g]
			paddd mm1, mm4			; +[b]
			paddd mm3, mm5			; +[b]

			mov ebx, [esp + 44 + 28]	; stride

			movd eax, mm0
			shr eax, 8
			add eax, Y_ADD
			mov [edi + ebx], al

			movd eax, mm1
			shr eax, 8
			add eax, Y_ADD
			mov [edi + ebx + 1], al

			movd eax, mm2
			shr eax, 8
			add eax, Y_ADD
			mov [edi], al

			movd eax, mm3
			shr eax, 8
			add eax, Y_ADD
			mov [edi + 1], al

			; u_out, v_out

			movq mm0, mm6			; = [  |b4|g4|r4]
			pmaddwd mm6, [v_mul]		; *= V_MUL
			pmaddwd mm0, [u_mul]		; *= U_MUL
			movq mm1, mm0
			movq mm2, mm6
			psrlq mm1, 32
			psrlq mm2, 32
			paddd mm0, mm1
			paddd mm2, mm6

			movd eax, mm0
			shr eax, 10
			add eax, U_ADD
			mov [ecx], al

			movd eax, mm2
			shr eax, 10
			add eax, V_ADD
			mov [edx], al

			add esi, 2 * 4			; two rgb32 pixels
			add edi, 2
			inc ecx
			inc edx

			dec ebp
			jnz near .xloop

		sub esi, [esp + 4]			; src  -= src_dif
		add edi, [esp + 16]			; y_out += y_dif
		add ecx, [esp + 20]			; u_out += uv_dif
		add edx, [esp + 20]			; v_out += uv_dif

		dec dword [esp+0]
		jnz near .yloop

		emms

		add esp, 24
		pop ebp
		pop edi
		pop esi
		pop ecx
		pop ebx

		ret
