Annotation of /trunk/xvidcore/src/image/x86_asm/yuyv_to_yv12_mmx.asm

Revision 651 - (view) (download)

1 :	chl	434	;/*****************************************************************************
2 :	Isibaar	3	; *
3 :	chl	434	; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx yuyv/uyvy to yuv planar conversion
5 :	Isibaar	3	; *
6 :	chl	434	; * Copyright(C) 2002 Peter Ross <pross@xvid.org>
7 :	Isibaar	3	; *
8 :	edgomez	651	; * This file is part of XviD, a free MPEG-4 video encoder/decoder
9 :	Isibaar	3	; *
10 :	edgomez	651	; * XviD is free software; you can redistribute it and/or modify it
11 :			; * under the terms of the GNU General Public License as published by
12 :	chl	434	; * the Free Software Foundation; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :	Isibaar	3	; *
15 :	chl	434	; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :	Isibaar	3	; *
20 :	chl	434	; * You should have received a copy of the GNU General Public License
21 :			; * along with this program; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :	Isibaar	3	; *
24 :	edgomez	651	; * Under section 8 of the GNU General Public License, the copyright
25 :			; * holders of XVID explicitly forbid distribution in the following
26 :			; * countries:
27 :			; *
28 :			; * - Japan
29 :			; * - United States of America
30 :			; *
31 :			; * Linking XviD statically or dynamically with other modules is making a
32 :			; * combined work based on XviD. Thus, the terms and conditions of the
33 :			; * GNU General Public License cover the whole combination.
34 :			; *
35 :			; * As a special exception, the copyright holders of XviD give you
36 :			; * permission to link XviD with independent modules that communicate with
37 :			; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
38 :			; * license terms of these independent modules, and to copy and distribute
39 :			; * the resulting combined work under terms of your choice, provided that
40 :			; * every copy of the combined work is accompanied by a complete copy of
41 :			; * the source code of XviD (the version of XviD used to produce the
42 :			; * combined work), being distributed under the terms of the GNU General
43 :			; * Public License plus this exception. An independent module is a module
44 :			; * which is not derived from or based on XviD.
45 :			; *
46 :			; * Note that people who make modified versions of XviD are not obligated
47 :			; * to grant this special exception for their modified versions; it is
48 :			; * their choice whether to do so. The GNU General Public License gives
49 :			; * permission to release a modified version without this exception; this
50 :			; * exception also makes it possible to release a modified version which
51 :			; * carries forward this exception.
52 :			; *
53 :			; * $Id: yuyv_to_yv12_mmx.asm,v 1.3 2002-11-17 00:20:30 edgomez Exp $
54 :			; *
55 :	chl	434	; ****************************************************************************/
56 :	Isibaar	3
bits 32

section .data

;---------------------------------------------------------------------------
; cglobal <symbol>
;
; Exports <symbol> as a global. When PREFIX is defined at assembly time the
; exported name gets a leading underscore, and every later use of the plain
; name in this file is redirected to the underscored one.
;---------------------------------------------------------------------------
%macro cglobal 1
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
%endmacro
70 :
section .data

align 16

;===========================================================================
; Byte-select masks for packed 4:2:2 data (8 bytes = 4 pixels per quadword).
;
;   mask1 keeps the even-numbered bytes (0, 2, 4, 6) of a quadword
;   mask2 keeps the odd-numbered bytes  (1, 3, 5, 7) of a quadword
;
; Each word below is stored little-endian, so 0x00ff emits the byte pair
; ff 00 and 0xff00 emits 00 ff.
;===========================================================================

mask1:  times 4 dw 0x00ff
mask2:  times 4 dw 0xff00


section .text
87 :
;===========================================================================
;
; void yuyv_to_yv12_mmx(uint8_t * const y_out,
;                       uint8_t * const u_out,
;                       uint8_t * const v_out,
;                       const uint8_t * const src,
;                       const uint32_t width,
;                       const uint32_t height,
;                       const uint32_t stride);
;
; Converts a packed YUYV image (byte order Y0 U Y1 V) into three planes.
; Two source rows are consumed per pass: both luma rows are written out and
; the chroma of the two rows is averaged as (a + b) >> 1 (truncating).
;
; In:   all arguments are read from the stack (32-bit C calling convention)
;       src     packed input, rows are 2 * width bytes apart
;       stride  distance in bytes between rows of y_out; the chroma
;               pointers advance by width/2 + (stride - width)/2 per row
; Out:  nothing is returned
; Clobb: eax, edx, mm0-mm7, flags (ebx, ecx, esi, edi, ebp are restored)
;
; width must be multiple of 8; pixels beyond the last full group of 8 are
; not converted. Rows are processed in pairs, so an odd last row is skipped.
; If width < 8 or height < 2 the routine returns without touching memory.
; does not flip
; ~30% faster than plain c (original author's note)
;
; Stack frame inside the loops (esp-relative):
;   [esp +  0]  parked row-pair counter
;   [esp +  4]  width / 8
;   [esp +  8]  uv_dif = (stride - width) / 2
;   [esp + 12]  y_dif + stride = 2 * stride - width
;   [esp + 36]  return address, arguments follow at +4, +8, ...
;
;===========================================================================

align 16
cglobal yuyv_to_yv12_mmx
yuyv_to_yv12_mmx:

        push    ebx
        push    ecx
        push    esi
        push    edi
        push    ebp                     ; 5 saved regs: return address at [esp + 20]

        ; some global constants

        mov     ecx, [esp + 20 + 20]    ; ecx = width
        mov     eax, [esp + 20 + 28]    ; eax = stride
        sub     eax, ecx                ; eax = y_dif = stride - width
        mov     edx, eax
        add     edx, [esp + 20 + 28]    ; edx = y_dif + stride
        push    edx                     ; [esp + 12] = y_dif + stride

        shr     eax, 1
        push    eax                     ; [esp + 8] = uv_dif

        shr     ecx, 3
        push    ecx                     ; [esp + 4] = width/8

        sub     esp, 4                  ; [esp + 0] = tmp_height_counter
                                        ; return address now at [esp + 36]

        ; Both loops below test their counter only after the first pass, so
        ; a zero count would wrap around. Bail out before touching anything.
        test    ecx, ecx                ; ecx still holds width/8
        jz      near .done
        mov     ebp, [esp + 36 + 24]    ; ebp = height
        shr     ebp, 1                  ; ebp = number of row pairs
        jz      near .done

        movq    mm6, [mask1]            ; even bytes: luma in YUYV
        movq    mm7, [mask2]            ; odd bytes: chroma in YUYV

        mov     edi, [esp + 36 + 4]     ; y_out
        mov     ebx, [esp + 36 + 8]     ; u_out
        mov     edx, [esp + 36 + 12]    ; v_out
        mov     esi, [esp + 36 + 16]    ; src

        mov     eax, [esp + 36 + 20]    ; eax = width
        mov     ecx, [esp + 36 + 28]    ; ecx = stride
        add     eax, eax                ; eax = 2 * width = source row pitch

.yloop:
        mov     [esp], ebp              ; park row-pair counter, ebp is reused
        mov     ebp, [esp + 4]          ; ebp = width/8 groups in this row
.xloop:
        movq    mm2, [esi]              ; 1st row, pixels 0-3
        movq    mm3, [esi + 8]          ; 1st row, pixels 4-7
        movq    mm0, mm2
        movq    mm1, mm3
        pand    mm2, mm6                ; luma, one per word
        pand    mm3, mm6
        pand    mm0, mm7                ; chroma, in the high byte of each word
        pand    mm1, mm7
        packuswb mm2, mm3               ; 8 luma bytes
        psrlq   mm0, 8                  ; chroma down to the low byte
        psrlq   mm1, 8
        movq    [edi], mm2

        movq    mm4, [esi + eax]        ; 2nd row, pixels 0-3
        movq    mm5, [esi + eax + 8]    ; 2nd row, pixels 4-7
        movq    mm2, mm4
        movq    mm3, mm5
        pand    mm4, mm6                ; luma
        pand    mm5, mm6
        pand    mm2, mm7                ; chroma
        pand    mm3, mm7
        packuswb mm4, mm5
        psrlq   mm2, 8
        psrlq   mm3, 8
        movq    [edi + ecx], mm4        ; next output row is stride bytes on

        paddw   mm0, mm2                ; sum chroma of 1st and 2nd row
        paddw   mm1, mm3
        psrlw   mm0, 1                  ; halve (truncating average)
        psrlw   mm1, 1
        packuswb mm0, mm1               ; bytes: u v u v u v u v
        movq    mm2, mm0
        pand    mm0, mm6                ; u samples, one per word
        pand    mm2, mm7                ; v samples, high byte of each word
        packuswb mm0, mm0               ; 4 u bytes in the low dword
        psrlq   mm2, 8
        movd    [ebx], mm0
        packuswb mm2, mm2               ; 4 v bytes in the low dword
        movd    [edx], mm2

        add     esi, 16                 ; 8 packed pixels consumed
        add     edi, 8
        add     ebx, 4
        add     edx, 4
        dec     ebp
        jnz     near .xloop

        mov     ebp, [esp]              ; restore row-pair counter

        add     esi, eax                ; skip the 2nd source row already read
        add     edi, [esp + 12]         ; += y_dif + stride (two rows down)
        add     ebx, [esp + 8]          ; += uv_dif
        add     edx, [esp + 8]          ; += uv_dif

        dec     ebp
        jnz     near .yloop

        emms

.done:
        add     esp, 16                 ; drop the four local slots
        pop     ebp
        pop     edi
        pop     esi
        pop     ecx
        pop     ebx

        ret
217 :
218 :
;===========================================================================
;
; void uyvy_to_yv12_mmx(uint8_t * const y_out,
;                       uint8_t * const u_out,
;                       uint8_t * const v_out,
;                       const uint8_t * const src,
;                       const uint32_t width,
;                       const uint32_t height,
;                       const uint32_t stride);
;
; Converts a packed UYVY image (byte order U Y0 V Y1) into three planes.
; Two source rows are consumed per pass: both luma rows are written out and
; the chroma of the two rows is averaged as (a + b) >> 1 (truncating).
;
; In:   all arguments are read from the stack (32-bit C calling convention)
;       src     packed input, rows are 2 * width bytes apart
;       stride  distance in bytes between rows of y_out; the chroma
;               pointers advance by width/2 + (stride - width)/2 per row
; Out:  nothing is returned
; Clobb: eax, edx, mm0-mm7, flags (ebx, ecx, esi, edi, ebp are restored)
;
; width must be multiple of 8; pixels beyond the last full group of 8 are
; not converted. Rows are processed in pairs, so an odd last row is skipped.
; If width < 8 or height < 2 the routine returns without touching memory.
; does not flip
; ~30% faster than plain c (original author's note)
;
; Stack frame inside the loops (esp-relative):
;   [esp +  0]  parked row-pair counter
;   [esp +  4]  width / 8
;   [esp +  8]  uv_dif = (stride - width) / 2
;   [esp + 12]  y_dif + stride = 2 * stride - width
;   [esp + 36]  return address, arguments follow at +4, +8, ...
;
;===========================================================================

align 16
cglobal uyvy_to_yv12_mmx
uyvy_to_yv12_mmx:

        push    ebx
        push    ecx
        push    esi
        push    edi
        push    ebp                     ; 5 saved regs: return address at [esp + 20]

        ; some global constants

        mov     ecx, [esp + 20 + 20]    ; ecx = width
        mov     eax, [esp + 20 + 28]    ; eax = stride
        sub     eax, ecx                ; eax = y_dif = stride - width
        mov     edx, eax
        add     edx, [esp + 20 + 28]    ; edx = y_dif + stride
        push    edx                     ; [esp + 12] = y_dif + stride

        shr     eax, 1
        push    eax                     ; [esp + 8] = uv_dif

        shr     ecx, 3
        push    ecx                     ; [esp + 4] = width/8

        sub     esp, 4                  ; [esp + 0] = tmp_height_counter
                                        ; return address now at [esp + 36]

        ; Both loops below test their counter only after the first pass, so
        ; a zero count would wrap around. Bail out before touching anything.
        test    ecx, ecx                ; ecx still holds width/8
        jz      near .done
        mov     ebp, [esp + 36 + 24]    ; ebp = height
        shr     ebp, 1                  ; ebp = number of row pairs
        jz      near .done

        movq    mm6, [mask1]            ; even bytes: chroma in UYVY
        movq    mm7, [mask2]            ; odd bytes: luma in UYVY

        mov     edi, [esp + 36 + 4]     ; y_out
        mov     ebx, [esp + 36 + 8]     ; u_out
        mov     edx, [esp + 36 + 12]    ; v_out
        mov     esi, [esp + 36 + 16]    ; src

        mov     eax, [esp + 36 + 20]    ; eax = width
        mov     ecx, [esp + 36 + 28]    ; ecx = stride
        add     eax, eax                ; eax = 2 * width = source row pitch

.yloop:
        mov     [esp], ebp              ; park row-pair counter, ebp is reused
        mov     ebp, [esp + 4]          ; ebp = width/8 groups in this row
.xloop:
        movq    mm2, [esi]              ; 1st row, pixels 0-3
        movq    mm3, [esi + 8]          ; 1st row, pixels 4-7
        movq    mm0, mm2
        movq    mm1, mm3
        pand    mm2, mm7                ; luma, in the high byte of each word
        pand    mm3, mm7
        psrlq   mm2, 8                  ; luma down to the low byte
        psrlq   mm3, 8
        pand    mm0, mm6                ; chroma, one per word
        pand    mm1, mm6
        packuswb mm2, mm3               ; 8 luma bytes
        movq    [edi], mm2

        movq    mm4, [esi + eax]        ; 2nd row, pixels 0-3
        movq    mm5, [esi + eax + 8]    ; 2nd row, pixels 4-7
        movq    mm2, mm4
        movq    mm3, mm5
        pand    mm4, mm7                ; luma
        pand    mm5, mm7
        psrlq   mm4, 8
        psrlq   mm5, 8
        pand    mm2, mm6                ; chroma
        pand    mm3, mm6
        packuswb mm4, mm5
        movq    [edi + ecx], mm4        ; next output row is stride bytes on

        paddw   mm0, mm2                ; sum chroma of 1st and 2nd row
        paddw   mm1, mm3
        psrlw   mm0, 1                  ; halve (truncating average)
        psrlw   mm1, 1
        packuswb mm0, mm1               ; bytes: u v u v u v u v
        movq    mm2, mm0
        pand    mm0, mm6                ; u samples, one per word
        pand    mm2, mm7                ; v samples, high byte of each word
        packuswb mm0, mm0               ; 4 u bytes in the low dword
        psrlq   mm2, 8
        movd    [ebx], mm0
        packuswb mm2, mm2               ; 4 v bytes in the low dword
        movd    [edx], mm2

        add     esi, 16                 ; 8 packed pixels consumed
        add     edi, 8
        add     ebx, 4
        add     edx, 4
        dec     ebp
        jnz     near .xloop

        mov     ebp, [esp]              ; restore row-pair counter

        add     esi, eax                ; skip the 2nd source row already read
        add     edi, [esp + 12]         ; += y_dif + stride (two rows down)
        add     ebx, [esp + 8]          ; += uv_dif
        add     edx, [esp + 8]          ; += uv_dif

        dec     ebp
        jnz     near .yloop

        emms

.done:
        add     esp, 16                 ; drop the four local slots
        pop     ebp
        pop     edi
        pop     esi
        pop     ecx
        pop     ebx

        ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4