[svn] / branches / dev-api-4 / xvidcore / src / image / x86_asm / yuyv_to_yv12_mmx.asm Repository:
ViewVC logotype

View of /branches/dev-api-4/xvidcore/src/image/x86_asm/yuyv_to_yv12_mmx.asm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 886 - (download) (annotate)
Fri Feb 21 14:49:29 2003 UTC (16 years, 3 months ago)
File size: 6828 byte(s)
This commit was manufactured by cvs2svn to create branch 'dev-api-4'.
;/**************************************************************************
; *
; *	XVID MPEG-4 VIDEO CODEC
; *	mmx yuyv/uyvy to yuv planar
; *
; *	This program is free software; you can redistribute it and/or modify
; *	it under the terms of the GNU General Public License as published by
; *	the Free Software Foundation; either version 2 of the License, or
; *	(at your option) any later version.
; *
; *	This program is distributed in the hope that it will be useful,
; *	but WITHOUT ANY WARRANTY; without even the implied warranty of
; *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *	GNU General Public License for more details.
; *
; *	You should have received a copy of the GNU General Public License
; *	along with this program; if not, write to the Free Software
; *	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; *************************************************************************/

;/**************************************************************************
; *
; *	History:
; *
; *	30.11.2001	initial version; (c)2001 peter ross <pross@cs.rmit.edu.au>
; *
; *************************************************************************/

bits 32


section .data

%macro cglobal 1 
	%ifdef PREFIX
		global _%1 
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

align 16


section .data


;===========================================================================
; masks for extracting yuv components
;===========================================================================
;			y     u     y     v     y     u     y     v

mask1	db	0xff, 0,    0xff, 0,    0xff, 0,    0xff, 0
mask2	db	0,    0xff, 0,    0xff, 0,    0xff, 0,    0xff


section .text

;===========================================================================
;
;	void yuyv_to_yv12_mmx(uint8_t * const y_out,
;						uint8_t * const u_out,
;						uint8_t * const v_out,
;						const uint8_t * const src,
;						const uint32_t width,
;						const uint32_t height,
;						const uint32_t stride);
;
;	width must be multiple of 8
;	does not flip
;	~30% faster than plain c
;
;===========================================================================

align 16
cglobal yuyv_to_yv12_mmx
yuyv_to_yv12_mmx

		push ebx
		push ecx
		push esi
		push edi			
		push ebp		; STACK BASE = 20

		; some global consants

		mov ecx, [esp + 20 + 20]	; width
		mov eax, [esp + 20 + 28]	; stride
		sub eax, ecx				; eax = stride - width
		mov edx, eax
		add edx, [esp + 20 + 28]	; edx = y_dif + stride
		push edx					; [esp + 12] = y_dif

		shr eax, 1
		push eax					; [esp + 8] = uv_dif

		shr ecx, 3
		push ecx					; [esp + 4] = width/8

		sub	esp, 4					; [esp + 0] = tmp_height_counter
						; STACK_BASE = 36
		
		movq mm6, [mask1]
		movq mm7, [mask2]

		mov edi, [esp + 36 + 4]		; y_out
		mov ebx, [esp + 36 + 8]		; u_out
		mov edx, [esp + 36 + 12]	; v_out
		mov esi, [esp + 36 + 16]	; src

		mov eax, [esp + 36 + 20]
		mov ebp, [esp + 36 + 24]
		mov ecx, [esp + 36 + 28]	; ecx = stride
		shr ebp, 1					; ebp = height /= 2
		add eax, eax				; eax = 2 * width

.yloop
		mov [esp], ebp
		mov ebp, [esp + 4]			; width/8
.xloop
		movq mm2, [esi]				; y 1st row
		movq mm3, [esi + 8]
		movq mm0, mm2
		movq mm1, mm3
		pand mm2, mm6 ; mask1
		pand mm3, mm6 ; mask1
		pand mm0, mm7 ; mask2
		pand mm1, mm7 ; mask2
		packuswb mm2, mm3
		psrlq mm0, 8
		psrlq mm1, 8
		movq [edi], mm2

		movq mm4, [esi + eax]		; y 2nd row
		movq mm5, [esi + eax + 8]
		movq mm2, mm4
		movq mm3, mm5
		pand mm4, mm6 ; mask1
		pand mm5, mm6 ; mask1
		pand mm2, mm7 ; mask2
		pand mm3, mm7 ; mask2
		packuswb mm4, mm5
		psrlq mm2, 8
		psrlq mm3, 8
		movq [edi + ecx], mm4

		paddw mm0, mm2			; uv avg 1st & 2nd
		paddw mm1, mm3
		psrlw mm0, 1
		psrlw mm1, 1
		packuswb mm0, mm1
		movq mm2, mm0
		pand mm0, mm6 ; mask1
		pand mm2, mm7 ; mask2
		packuswb mm0, mm0
		psrlq mm2, 8
		movd [ebx], mm0
		packuswb mm2, mm2
		movd [edx], mm2

		add	esi, 16
		add	edi, 8
		add	ebx, 4
		add	edx, 4
		dec ebp
		jnz near .xloop

		mov ebp, [esp]

		add esi, eax			; += width2
		add edi, [esp + 12]		; += y_dif + stride
		add ebx, [esp + 8]		; += uv_dif
		add edx, [esp + 8]		; += uv_dif

		dec ebp
		jnz near .yloop

		emms

		add esp, 16
		pop ebp
		pop edi
		pop esi
		pop ecx
		pop ebx

		ret


;===========================================================================
;
;	void uyvy_to_yv12_mmx(uint8_t * const y_out,
;						uint8_t * const u_out,
;						uint8_t * const v_out,
;						const uint8_t * const src,
;						const uint32_t width,
;						const uint32_t height,
;						const uint32_t stride);
;
;	width must be multiple of 8
;	does not flip
;	~30% faster than plain c
;
;===========================================================================

align 16
cglobal uyvy_to_yv12_mmx
uyvy_to_yv12_mmx

		push ebx
		push ecx
		push esi
		push edi			
		push ebp		; STACK BASE = 20

		; some global consants

		mov ecx, [esp + 20 + 20]	; width
		mov eax, [esp + 20 + 28]	; stride
		sub eax, ecx				; eax = stride - width
		mov edx, eax
		add edx, [esp + 20 + 28]	; edx = y_dif + stride
		push edx					; [esp + 12] = y_dif

		shr eax, 1
		push eax					; [esp + 8] = uv_dif

		shr ecx, 3
		push ecx					; [esp + 4] = width/8

		sub	esp, 4					; [esp + 0] = tmp_height_counter
						; STACK_BASE = 36
		
		movq mm6, [mask1]
		movq mm7, [mask2]

		mov edi, [esp + 36 + 4]		; y_out
		mov ebx, [esp + 36 + 8]		; u_out
		mov edx, [esp + 36 + 12]	; v_out
		mov esi, [esp + 36 + 16]	; src

		mov eax, [esp + 36 + 20]
		mov ebp, [esp + 36 + 24]
		mov ecx, [esp + 36 + 28]	; ecx = stride
		shr ebp, 1					; ebp = height /= 2
		add eax, eax				; eax = 2 * width

.yloop
		mov [esp], ebp
		mov ebp, [esp + 4]			; width/8
.xloop
		movq mm2, [esi]				; y 1st row
		movq mm3, [esi + 8]
		movq mm0, mm2
		movq mm1, mm3
		pand mm2, mm7 ; mask2
		pand mm3, mm7 ; mask2
		psrlq mm2, 8
		psrlq mm3, 8
		pand mm0, mm6 ; mask1
		pand mm1, mm6 ; mask1
		packuswb mm2, mm3
		movq [edi], mm2

		movq mm4, [esi + eax]		; y 2nd row
		movq mm5, [esi + eax + 8]
		movq mm2, mm4
		movq mm3, mm5
		pand mm4, mm7 ; mask2
		pand mm5, mm7 ; mask2
		psrlq mm4, 8
		psrlq mm5, 8
		pand mm2, mm6 ; mask1
		pand mm3, mm6 ; mask1
		packuswb mm4, mm5
		movq [edi + ecx], mm4

		paddw mm0, mm2			; uv avg 1st & 2nd
		paddw mm1, mm3
		psrlw mm0, 1
		psrlw mm1, 1
		packuswb mm0, mm1
		movq mm2, mm0
		pand mm0, mm6 ; mask1
		pand mm2, mm7 ; mask2
		packuswb mm0, mm0
		psrlq mm2, 8
		movd [ebx], mm0
		packuswb mm2, mm2
		movd [edx], mm2

		add	esi, 16
		add	edi, 8
		add	ebx, 4
		add	edx, 4
		dec ebp
		jnz near .xloop

		mov ebp, [esp]

		add esi, eax			; += width2
		add edi, [esp + 12]		; += y_dif + stride
		add ebx, [esp + 8]		; += uv_dif
		add edx, [esp + 8]		; += uv_dif

		dec ebp
		jnz near .yloop

		emms

		add esp, 16
		pop ebp
		pop edi
		pop esi
		pop ecx
		pop ebx

		ret

No admin address has been configured
ViewVC Help
Powered by ViewVC 1.0.4