Annotation of /branches/dev-api-4/xvidcore/src/image/x86_asm/yv12_to_yuyv_mmx.asm

Revision 651 - (view) (download)
Original Path: trunk/xvidcore/src/image/x86_asm/yv12_to_yuyv_mmx.asm

1 :	chl	434	;/*****************************************************************************
2 :	Isibaar	3	; *
3 :	chl	434	; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx yuv planar to yuyv/uyvy conversion
5 :	Isibaar	3	; *
6 :	chl	434	; * Copyright(C) 2002 Peter Ross <pross@xvid.org>
7 :	Isibaar	3	; *
8 :	edgomez	651	; * This file is part of XviD, a free MPEG-4 video encoder/decoder
9 :	Isibaar	3	; *
10 :	edgomez	651	; * XviD is free software; you can redistribute it and/or modify it
11 :			; * under the terms of the GNU General Public License as published by
12 :	chl	434	; * the Free Software Foundation; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :	Isibaar	3	; *
15 :	chl	434	; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :	Isibaar	3	; *
20 :	chl	434	; * You should have received a copy of the GNU General Public License
21 :			; * along with this program; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :	Isibaar	3	; *
24 :	edgomez	651	; * Under section 8 of the GNU General Public License, the copyright
25 :			; * holders of XVID explicitly forbid distribution in the following
26 :			; * countries:
27 :			; *
28 :			; * - Japan
29 :			; * - United States of America
30 :			; *
31 :			; * Linking XviD statically or dynamically with other modules is making a
32 :			; * combined work based on XviD. Thus, the terms and conditions of the
33 :			; * GNU General Public License cover the whole combination.
34 :			; *
35 :			; * As a special exception, the copyright holders of XviD give you
36 :			; * permission to link XviD with independent modules that communicate with
37 :			; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
38 :			; * license terms of these independent modules, and to copy and distribute
39 :			; * the resulting combined work under terms of your choice, provided that
40 :			; * every copy of the combined work is accompanied by a complete copy of
41 :			; * the source code of XviD (the version of XviD used to produce the
42 :			; * combined work), being distributed under the terms of the GNU General
43 :			; * Public License plus this exception. An independent module is a module
44 :			; * which is not derived from or based on XviD.
45 :			; *
46 :			; * Note that people who make modified versions of XviD are not obligated
47 :			; * to grant this special exception for their modified versions; it is
48 :			; * their choice whether to do so. The GNU General Public License gives
49 :			; * permission to release a modified version without this exception; this
50 :			; * exception also makes it possible to release a modified version which
51 :			; * carries forward this exception.
52 :			; *
53 :			; * $Id: yv12_to_yuyv_mmx.asm,v 1.3 2002-11-17 00:20:30 edgomez Exp $
54 :			; *
55 :	chl	434	; ****************************************************************************/
56 :	Isibaar	3
bits 32                                 ; assemble as 32-bit code


section .data

;---------------------------------------------------------------------------
; cglobal <symbol>
;
; Export <symbol> from this object file.
;   PREFIX not defined : emits `global <symbol>`.
;   PREFIX defined     : emits `global _<symbol>` and makes <symbol> an alias
;                        of _<symbol>, so a later `<symbol>:` label in this
;                        file defines the underscore-prefixed name.
;---------------------------------------------------------------------------
%macro cglobal 1
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
%endmacro

align 16                                ; no data follows in the visible file


section .text
75 :
76 :
;===========================================================================
;
; void yv12_to_yuyv_mmx(
;			uint8_t * dst,
;			int dst_stride,
;			uint8_t * y_src,
;			uint8_t * u_src,
;			uint8_t * v_src,
;			int y_stride,
;			int uv_stride,
;			int width,
;			int height);
;
; Packs planar Y/U/V into interleaved bytes Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
; Each chroma row is used for two consecutive luma rows.
;
; Arguments are read from the stack (first argument at [esp + 4] on entry)
; and the function returns with a plain `ret`.
;
;   dst_stride          counted in pixels (2 output bytes each): after every
;                       row dst advances by 2*(dst_stride - width) bytes
;   y_stride, uv_stride added to the source pointers as byte offsets
;   width               must be a multiple of 8; rows are converted in
;                       groups of 8 pixels
;   height              rows are converted in pairs (height/2 pairs);
;                       height < 0 reads the source bottom-up using -height
;                       rows
;
; If width/8 or |height|/2 is zero the function returns without reading or
; writing any pixel (previously the row-pair counter wrapped around and one
; 8-pixel group was always converted).
;
; Preserved: ebx, ecx, esi, edi, ebp.   Clobbered: eax, edx, mm0-mm3, flags.
; When height < 0 the y_stride / uv_stride argument slots on the stack are
; negated in place.
;
; ~10% faster than plain c (original author's note)
;
;===========================================================================

; Stack layout after the prologue: 5 saved registers (20 bytes) plus one
; local dword (4 bytes) sit between esp and the return address.
%define YUYV_FRAME        24
%define YUYV_DST_DIF      esp
%define YUYV_DST          esp + YUYV_FRAME + 4
%define YUYV_DST_STRIDE   esp + YUYV_FRAME + 8
%define YUYV_Y_SRC        esp + YUYV_FRAME + 12
%define YUYV_U_SRC        esp + YUYV_FRAME + 16
%define YUYV_V_SRC        esp + YUYV_FRAME + 20
%define YUYV_Y_STRIDE     esp + YUYV_FRAME + 24
%define YUYV_UV_STRIDE    esp + YUYV_FRAME + 28
%define YUYV_WIDTH        esp + YUYV_FRAME + 32
%define YUYV_HEIGHT       esp + YUYV_FRAME + 36

;---------------------------------------------------------------------------
; YUYV_ROW: convert one row, 8 pixels per iteration.
;   in : esi = luma row, ecx = U row, edx = V row, edi = dst,
;        ebx = number of 8-pixel groups (must be > 0)
;   out: edi advanced by 16 bytes per group; eax = ebx
;   clobbers: eax, mm0-mm3, flags
;---------------------------------------------------------------------------
%macro YUYV_ROW 0
        xor     eax, eax                ; group index = 0
%%group:
        movd    mm0, [ecx + 4*eax]      ; [    |uuuu]
        movd    mm1, [edx + 4*eax]      ; [    |vvvv]
        movq    mm2, [esi + 8*eax]      ; [yyyy|yyyy]

        punpcklbw mm0, mm1              ; [vuvu|vuvu]
        movq    mm3, mm2
        punpcklbw mm2, mm0              ; [vyuy|vyuy]  pixels 0-3
        punpckhbw mm3, mm0              ; [vyuy|vyuy]  pixels 4-7
        movq    [edi], mm2
        movq    [edi + 8], mm3

        inc     eax
        add     edi, 16

        cmp     eax, ebx
        jb      %%group
%endmacro

align 16
cglobal yv12_to_yuyv_mmx
yv12_to_yuyv_mmx:

        push    ebx
        push    ecx
        push    esi
        push    edi
        push    ebp
        sub     esp, 4                  ; local dword: dst_dif

        ; ---- per-call constants ----

        mov     ebx, [YUYV_WIDTH]
        mov     eax, [YUYV_DST_STRIDE]
        sub     eax, ebx
        add     eax, eax                ; eax = 2*(dst_stride - width)
        mov     [YUYV_DST_DIF], eax     ; bytes to skip at the end of a row

        shr     ebx, 3                  ; ebx = width / 8
        jz      near .done              ; no complete 8-pixel group

        mov     edi, [YUYV_DST]

        ; ---- row direction ----

        mov     ebp, [YUYV_HEIGHT]
        test    ebp, ebp
        jl      .flip

        mov     esi, [YUYV_Y_SRC]
        mov     ecx, [YUYV_U_SRC]
        mov     edx, [YUYV_V_SRC]
        shr     ebp, 1                  ; ebp = number of row pairs
        jz      near .done              ; fewer than two rows
        jmp     short .yloop

.flip:
        neg     ebp                     ; ebp = -height

        mov     eax, [YUYV_Y_STRIDE]
        lea     edx, [ebp - 1]
        mul     edx                     ; eax = (height - 1) * y_stride
        mov     esi, [YUYV_Y_SRC]
        add     esi, eax                ; start at the last luma row

        shr     ebp, 1                  ; ebp = number of row pairs
        jz      near .done              ; fewer than two rows

        mov     eax, [YUYV_UV_STRIDE]
        lea     edx, [ebp - 1]
        mul     edx                     ; eax = (height/2 - 1) * uv_stride

        mov     ecx, [YUYV_U_SRC]       ; loaded after mul: mul overwrites edx
        mov     edx, [YUYV_V_SRC]
        add     ecx, eax                ; start at the last U row
        add     edx, eax                ; start at the last V row

        neg     dword [YUYV_Y_STRIDE]   ; walk the source upwards from here on
        neg     dword [YUYV_UV_STRIDE]

.yloop:
        YUYV_ROW                        ; first row of the pair

        add     edi, [YUYV_DST_DIF]
        add     esi, [YUYV_Y_STRIDE]

        YUYV_ROW                        ; second row, same chroma row

        add     edi, [YUYV_DST_DIF]
        add     esi, [YUYV_Y_STRIDE]
        add     ecx, [YUYV_UV_STRIDE]
        add     edx, [YUYV_UV_STRIDE]

        dec     ebp
        jnz     near .yloop

.done:
        emms

        add     esp, 4                  ; drop dst_dif
        pop     ebp
        pop     edi
        pop     esi
        pop     ecx
        pop     ebx

        ret
215 :
216 :
217 :
218 :
219 :
;===========================================================================
;
; void yv12_to_uyvy_mmx(
;			uint8_t * dst,
;			int dst_stride,
;			uint8_t * y_src,
;			uint8_t * u_src,
;			uint8_t * v_src,
;			int y_stride,
;			int uv_stride,
;			int width,
;			int height);
;
; Packs planar Y/U/V into interleaved bytes U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...
; Each chroma row is used for two consecutive luma rows.
;
; Arguments are read from the stack (first argument at [esp + 4] on entry)
; and the function returns with a plain `ret`.
;
;   dst_stride          counted in pixels (2 output bytes each): after every
;                       row dst advances by 2*(dst_stride - width) bytes
;   y_stride, uv_stride added to the source pointers as byte offsets
;   width               must be a multiple of 8; rows are converted in
;                       groups of 8 pixels
;   height              rows are converted in pairs (height/2 pairs);
;                       height < 0 reads the source bottom-up using -height
;                       rows
;
; If width/8 or |height|/2 is zero the function returns without reading or
; writing any pixel (previously the row-pair counter wrapped around and one
; 8-pixel group was always converted).
;
; Preserved: ebx, ecx, esi, edi, ebp.   Clobbered: eax, edx, mm0-mm2, flags.
; When height < 0 the y_stride / uv_stride argument slots on the stack are
; negated in place.
;
; ~10% faster than plain c (original author's note)
;
;===========================================================================

; Stack layout after the prologue: 5 saved registers (20 bytes) plus one
; local dword (4 bytes) sit between esp and the return address.
%define UYVY_FRAME        24
%define UYVY_DST_DIF      esp
%define UYVY_DST          esp + UYVY_FRAME + 4
%define UYVY_DST_STRIDE   esp + UYVY_FRAME + 8
%define UYVY_Y_SRC        esp + UYVY_FRAME + 12
%define UYVY_U_SRC        esp + UYVY_FRAME + 16
%define UYVY_V_SRC        esp + UYVY_FRAME + 20
%define UYVY_Y_STRIDE     esp + UYVY_FRAME + 24
%define UYVY_UV_STRIDE    esp + UYVY_FRAME + 28
%define UYVY_WIDTH        esp + UYVY_FRAME + 32
%define UYVY_HEIGHT       esp + UYVY_FRAME + 36

;---------------------------------------------------------------------------
; UYVY_ROW: convert one row, 8 pixels per iteration.
;   in : esi = luma row, ecx = U row, edx = V row, edi = dst,
;        ebx = number of 8-pixel groups (must be > 0)
;   out: edi advanced by 16 bytes per group; eax = ebx
;   clobbers: eax, mm0-mm2, flags
;---------------------------------------------------------------------------
%macro UYVY_ROW 0
        xor     eax, eax                ; group index = 0
%%group:
        movd    mm0, [ecx + 4*eax]      ; [    |uuuu]
        movd    mm1, [edx + 4*eax]      ; [    |vvvv]
        movq    mm2, [esi + 8*eax]      ; [yyyy|yyyy]

        punpcklbw mm0, mm1              ; [vuvu|vuvu]
        movq    mm1, mm0
        punpcklbw mm0, mm2              ; [yvyu|yvyu]  pixels 0-3
        punpckhbw mm1, mm2              ; [yvyu|yvyu]  pixels 4-7
        movq    [edi], mm0
        movq    [edi + 8], mm1

        inc     eax
        add     edi, 16

        cmp     eax, ebx
        jb      %%group
%endmacro

align 16
cglobal yv12_to_uyvy_mmx
yv12_to_uyvy_mmx:

        push    ebx
        push    ecx
        push    esi
        push    edi
        push    ebp
        sub     esp, 4                  ; local dword: dst_dif

        ; ---- per-call constants ----

        mov     ebx, [UYVY_WIDTH]
        mov     eax, [UYVY_DST_STRIDE]
        sub     eax, ebx
        add     eax, eax                ; eax = 2*(dst_stride - width)
        mov     [UYVY_DST_DIF], eax     ; bytes to skip at the end of a row

        shr     ebx, 3                  ; ebx = width / 8
        jz      near .done              ; no complete 8-pixel group

        mov     edi, [UYVY_DST]

        ; ---- row direction ----

        mov     ebp, [UYVY_HEIGHT]
        test    ebp, ebp
        jl      .flip

        mov     esi, [UYVY_Y_SRC]
        mov     ecx, [UYVY_U_SRC]
        mov     edx, [UYVY_V_SRC]
        shr     ebp, 1                  ; ebp = number of row pairs
        jz      near .done              ; fewer than two rows
        jmp     short .yloop

.flip:
        neg     ebp                     ; ebp = -height

        mov     eax, [UYVY_Y_STRIDE]
        lea     edx, [ebp - 1]
        mul     edx                     ; eax = (height - 1) * y_stride
        mov     esi, [UYVY_Y_SRC]
        add     esi, eax                ; start at the last luma row

        shr     ebp, 1                  ; ebp = number of row pairs
        jz      near .done              ; fewer than two rows

        mov     eax, [UYVY_UV_STRIDE]
        lea     edx, [ebp - 1]
        mul     edx                     ; eax = (height/2 - 1) * uv_stride

        mov     ecx, [UYVY_U_SRC]       ; loaded after mul: mul overwrites edx
        mov     edx, [UYVY_V_SRC]
        add     ecx, eax                ; start at the last U row
        add     edx, eax                ; start at the last V row

        neg     dword [UYVY_Y_STRIDE]   ; walk the source upwards from here on
        neg     dword [UYVY_UV_STRIDE]

.yloop:
        UYVY_ROW                        ; first row of the pair

        add     edi, [UYVY_DST_DIF]
        add     esi, [UYVY_Y_STRIDE]

        UYVY_ROW                        ; second row, same chroma row

        add     edi, [UYVY_DST_DIF]
        add     esi, [UYVY_Y_STRIDE]
        add     ecx, [UYVY_UV_STRIDE]
        add     edx, [UYVY_UV_STRIDE]

        dec     ebp
        jnz     near .yloop

.done:
        emms

        add     esp, 4                  ; drop dst_dif
        pop     ebp
        pop     edi
        pop     esi
        pop     ecx
        pop     ebx

        ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4