Annotation of /trunk/xvidcore/src/image/x86_asm/yv12_to_rgb32_mmx.asm

Revision 651 - (view) (download)

1 :	chl	434	;/*****************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx yuv planar to rgb conversion
5 :			; *
6 :			; * Copyright (C) 2001 - Michael Militzer <isibaar@xvid.org>
7 :			; *
8 :	edgomez	651	; * This file is part of XviD, a free MPEG-4 video encoder/decoder
9 :	chl	434	; *
10 :	edgomez	651	; * XviD is free software; you can redistribute it and/or modify it
11 :			; * under the terms of the GNU General Public License as published by
12 :	chl	434	; * the Free Software Foundation; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :			; *
15 :			; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :			; *
20 :			; * You should have received a copy of the GNU General Public License
21 :			; * along with this program; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :			; *
24 :	edgomez	651	; * Under section 8 of the GNU General Public License, the copyright
25 :			; * holders of XVID explicitly forbid distribution in the following
26 :			; * countries:
27 :	chl	434	; *
28 :	edgomez	651	; * - Japan
29 :			; * - United States of America
30 :			; *
31 :			; * Linking XviD statically or dynamically with other modules is making a
32 :			; * combined work based on XviD. Thus, the terms and conditions of the
33 :			; * GNU General Public License cover the whole combination.
34 :			; *
35 :			; * As a special exception, the copyright holders of XviD give you
36 :			; * permission to link XviD with independent modules that communicate with
37 :			; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
38 :			; * license terms of these independent modules, and to copy and distribute
39 :			; * the resulting combined work under terms of your choice, provided that
40 :			; * every copy of the combined work is accompanied by a complete copy of
41 :			; * the source code of XviD (the version of XviD used to produce the
42 :			; * combined work), being distributed under the terms of the GNU General
43 :			; * Public License plus this exception. An independent module is a module
44 :			; * which is not derived from or based on XviD.
45 :			; *
46 :			; * Note that people who make modified versions of XviD are not obligated
47 :			; * to grant this special exception for their modified versions; it is
48 :			; * their choice whether to do so. The GNU General Public License gives
49 :			; * permission to release a modified version without this exception; this
50 :			; * exception also makes it possible to release a modified version which
51 :			; * carries forward this exception.
52 :			; *
53 :			; * $Id: yv12_to_rgb32_mmx.asm,v 1.4 2002-11-17 00:20:30 edgomez Exp $
54 :			; *
55 :	chl	434	; ****************************************************************************/
56 :	edgomez	332	;
57 :			;------------------------------------------------------------------------------
58 :			; NB: contrary to the C implementation, this code does the conversion
59 :			; using direct calculations. Input data width must be a multiple of 8
60 :			; and height must be even.
61 :			; This implementation is less precise than the c version but is
62 :			; more than twice as fast :-)
63 :			;------------------------------------------------------------------------------
64 :
65 :			BITS 32 ; assemble everything below as 32-bit code
66 :
67 :			; cglobal <name>: export <name>, with a leading underscore when PREFIX is defined.
68 :			%macro cglobal 1 ; takes exactly one parameter: the symbol name
69 :			%ifdef PREFIX ; PREFIX defined: use the underscore-decorated name
70 :			global _%1 ; export the symbol as _<name>
71 :			%define %1 _%1 ; later uses of <name> (including its label) expand to _<name>
72 :			%else ; PREFIX not defined: use the plain name
73 :			global %1 ; export the symbol as <name>
74 :			%endif
75 :			%endmacro
76 :
77 :
78 :			%define SCALEBITS 6 ; right-shift (psraw count) applied to every colour sum, i.e. the multipliers below carry 6 fractional bits
79 :
80 :			; NOTE(review): this ALIGN comes before SECTION .data, so it pads whatever section is active here rather than .data - confirm intent.
81 :			ALIGN 16
82 :
83 :			SECTION .data
84 :			; Each table holds four identical 16-bit words, one per lane of a 64-bit MMX register.
85 :			Y_SUB dw 16, 16, 16, 16 ; subtracted from every luma sample
86 :			U_SUB dw 128, 128, 128, 128 ; subtracted from every U sample
87 :			V_SUB dw 128, 128, 128, 128 ; subtracted from every V sample
88 :			; presumably the usual BT.601 factors (1.164, 0.391, 0.813, 2.018, 1.596) times 2^SCALEBITS - TODO confirm
89 :			Y_MUL dw 74, 74, 74, 74 ; multiplier for (Y - 16)
90 :
91 :			UG_MUL dw 25, 25, 25, 25 ; multiplier for (U - 128) in the term subtracted for green
92 :			VG_MUL dw 52, 52, 52, 52 ; multiplier for (V - 128) in the term subtracted for green
93 :
94 :			UB_MUL dw 129, 129, 129, 129 ; multiplier for (U - 128) in the term added for blue
95 :			VR_MUL dw 102, 102, 102, 102 ; multiplier for (V - 128) in the term added for red
96 :
97 :
98 :			ALIGN 16
99 :
100 :			SECTION .text
101 :
102 :			;------------------------------------------------------------------------------
103 :			; Convert planar Y/U/V to 32-bit pixels stored as bytes B, G, R, 0; two rows per pass.
104 :			; void yv12_to_rgb32_mmx(uint8_t *dst,
105 :			; int dst_stride,
106 :			; uint8_t *y_src,
107 :			; uint8_t *u_src,
108 :			; uint8_t *v_src,
109 :			; int y_stride, int uv_stride,
110 :			; int width, int height);
111 :			; All arguments are read from the stack. height < 0 selects the flipped path (source rows read bottom-up).
112 :			;------------------------------------------------------------------------------
113 :			; Saves/restores ebx, esi, edi, ebp; overwrites eax, ecx, edx and mm0-mm7; executes emms before returning.
114 :			cglobal yv12_to_rgb32_mmx
115 :			yv12_to_rgb32_mmx:
116 :			; NOTE(review): width < 8 or |height| < 2 gives a loop count of 0, and dec/jnz then wraps instead of exiting.
117 :			push ebx
118 :			push esi
119 :			push edi
120 :			push ebp
121 :			; after the 4 pushes and the locals, argument k (0-based) is at [esp + 20 + 4*k + localsize]
122 :			; local vars alloc
123 :			%define localsize 72 ; bytes reserved below the saved registers
124 :			%define TEMP_Y1 esp ; scaled luma of first row, pixels 3..0 (4 words)
125 :			%define TEMP_Y2 esp + 8 ; scaled luma of first row, pixels 7..4 (4 words)
126 :			%define TEMP_G1 esp + 16 ; 8 green bytes, first row
127 :			%define TEMP_G2 esp + 24 ; 8 green bytes, second row
128 :			%define TEMP_B1 esp + 32 ; 8 blue bytes, first row
129 :			%define TEMP_B2 esp + 40 ; 8 blue bytes, second row
130 :			%define y_dif esp + 48 ; added to both Y pointers after each row pair
131 :			%define dst_dif esp + 52 ; added to both dst pointers after each row pair
132 :			%define uv_dif esp + 56 ; added to the U and V pointers after each row pair
133 :			%define height esp + 60 ; |height|, written and read by the flipped path only
134 :			%define width_8 esp + 64 ; width / 8 = x_loop iterations per row pair
135 :			%define height_2 esp + 68 ; remaining row pairs (y_loop counter spill)
136 :			sub esp, localsize
137 :
138 :			; function code
139 :			mov eax, [esp + 52 + localsize] ; height -> eax
140 :			cmp eax, 0x00
141 :			jge near dont_flip ; flip? (only when height is negative)
142 :			; ---- flipped setup: source pointers start at the last rows and step backwards ----
143 :			neg eax ; neg height
144 :			mov [height], eax ; |height| -> local
145 :
146 :			mov esi, [esp + 48 + localsize] ; width -> esi
147 :
148 :			mov ebp, [esp + 40 + localsize] ; y_stride -> ebp
149 :			mov ebx, ebp
150 :			shl ebx, 1 ; 2 * y_stride -> ebx
151 :			neg ebx
152 :			sub ebx, esi ; y_dif = -2 * y_stride - width -> ebx
153 :
154 :			mov [y_dif], ebx
155 :
156 :			sub eax, 1 ; height - 1 -> eax
157 :			mul ebp ; (height - 1) * y_stride -> eax (edx is overwritten by mul)
158 :			mov ecx, eax
159 :			mov eax, [esp + 28 + localsize] ; y_src -> eax
160 :			add eax, ecx ; y_src + (height - 1) * y_stride (last Y row) -> eax
161 :			mov ebx, eax
162 :			sub ebx, ebp ; y_src2 (the row before it) -> ebx
163 :
164 :			mov ecx, [esp + 24 + localsize] ; dst_stride -> ecx
165 :			mov edx, ecx
166 :			shl edx, 3 ; 8 * dst_stride -> edx
167 :			mov ecx, edx ; 8 * dst_stride -> ecx
168 :			shl esi, 2 ; 4 * width -> esi (4 output bytes per pixel)
169 :			sub ecx, esi ; 8 * dst_stride - 4 * width -> ecx
170 :
171 :			mov [dst_dif], ecx
172 :
173 :			mov esi, [esp + 20 + localsize] ; dst -> esi
174 :			mov edi, esi
175 :			shr edx, 1 ; 4 * dst_stride -> edx
176 :			add edi, edx ; dst2 = dst + 4 * dst_stride -> edi (NOTE(review): implies dst_stride is in pixels - confirm)
177 :
178 :			mov ebp, [esp + 48 + localsize] ; width -> ebp
179 :			mov ecx, ebp ; width -> ecx
180 :			shr ecx, 1 ; width / 2 (chroma bytes consumed per row pair) -> ecx
181 :			shr ebp, 3 ; width / 8 -> ebp
182 :			mov [width_8], ebp
183 :
184 :			mov ebp, [esp + 44 + localsize] ; uv_stride -> ebp
185 :			mov edx, ebp
186 :			neg edx
187 :			sub edx, ecx ; uv_dif = -uv_stride - width / 2 -> edx
188 :			mov [uv_dif], edx
189 :
190 :			mov edx, ebp ; uv_stride -> edx
191 :			mov ebp, eax ; park the Y pointer in ebp while eax is used by mul
192 :			mov eax, [height] ; height -> eax
193 :			shr eax, 1 ; height / 2 -> eax
194 :
195 :			mov ecx, [esp + 32 + localsize] ; u_src -> ecx
196 :			sub eax, 1 ; height / 2 - 1 -> eax
197 :			mul edx ; (height / 2 - 1) * uv_stride -> eax (edx overwritten)
198 :			add ecx, eax ; last U row -> ecx
199 :
200 :			mov edx, [esp + 36 + localsize] ; v_src -> edx
201 :			add edx, eax ; last V row -> edx
202 :
203 :			mov eax, ebp ; restore the Y pointer -> eax
204 :
205 :			mov ebp, [height] ; height -> ebp
206 :			shr ebp, 1 ; height / 2 -> ebp
207 :
208 :			pxor mm7, mm7 ; mm7 = 0, used to widen bytes to words
209 :			jmp y_loop
210 :
211 :			; ---- normal setup: source pointers start at the first rows and step forwards ----
212 :			dont_flip:
213 :			mov esi, [esp + 48 + localsize] ; width -> esi
214 :
215 :			mov ebp, [esp + 40 + localsize] ; y_stride -> ebp
216 :			mov ebx, ebp
217 :			shl ebx, 1 ; 2 * y_stride -> ebx
218 :			sub ebx, esi ; y_dif = 2 * y_stride - width -> ebx
219 :
220 :			mov [y_dif], ebx
221 :
222 :			mov eax, [esp + 28 + localsize] ; y_src -> eax
223 :			mov ebx, eax
224 :			add ebx, ebp ; y_src2 = y_src + y_stride -> ebx
225 :
226 :			mov ecx, [esp + 24 + localsize] ; dst_stride -> ecx
227 :			shl ecx, 3 ; 8 * dst_stride -> ecx
228 :			mov edx, ecx ; 8 * dst_stride -> edx
229 :			shl esi, 2 ; 4 * width -> esi (4 output bytes per pixel)
230 :			sub ecx, esi ; 8 * dst_stride - 4 * width -> ecx
231 :
232 :			mov [dst_dif], ecx
233 :
234 :			mov esi, [esp + 20 + localsize] ; dst -> esi
235 :			mov edi, esi
236 :			shr edx, 1 ; 4 * dst_stride -> edx
237 :			add edi, edx ; dst2 = dst + 4 * dst_stride -> edi
238 :
239 :			mov ebp, [esp + 48 + localsize] ; width -> ebp
240 :			mov ecx, ebp ; width -> ecx
241 :			shr ecx, 1 ; width / 2 (chroma bytes consumed per row pair) -> ecx
242 :			shr ebp, 3 ; width / 8 -> ebp
243 :			mov [width_8], ebp
244 :
245 :			mov ebp, [esp + 44 + localsize] ; uv_stride -> ebp
246 :			sub ebp, ecx ; uv_dif = uv_stride - width / 2 -> ebp
247 :			mov [uv_dif], ebp
248 :
249 :			mov ecx, [esp + 32 + localsize] ; u_src -> ecx
250 :			mov edx, [esp + 36 + localsize] ; v_src -> edx
251 :
252 :			mov ebp, [esp + 52 + localsize] ; height -> ebp
253 :			shr ebp, 1 ; height / 2 -> ebp
254 :
255 :			pxor mm7, mm7 ; mm7 = 0, used to widen bytes to words
256 :			; loop state: eax/ebx = Y rows 1/2, ecx = U, edx = V, esi/edi = dst rows 1/2, ebp = counter, mm7 = 0
257 :			y_loop:
258 :			mov [height_2], ebp ; spill the row-pair counter; ebp becomes the column counter
259 :			mov ebp, [width_8] ; x_loop iterations for this row pair
260 :			; each iteration: 4 U + 4 V + 8 Y bytes from each of two rows -> 8 pixels (32 bytes) in each output row
261 :			x_loop:
262 :			movd mm2, [ecx] ; 4 U bytes -> mm2
263 :			movd mm3, [edx] ; 4 V bytes -> mm3
264 :
265 :			punpcklbw mm2, mm7 ; u3u2u1u0 -> mm2
266 :			punpcklbw mm3, mm7 ; v3v2v1v0 -> mm3
267 :
268 :			psubsw mm2, [U_SUB] ; U - 128
269 :			psubsw mm3, [V_SUB] ; V - 128
270 :
271 :			movq mm4, mm2 ; copy of U - 128 for the blue term
272 :			movq mm5, mm3 ; copy of V - 128 for the red term
273 :
274 :			pmullw mm2, [UG_MUL] ; (U - 128) * UG_MUL
275 :			pmullw mm3, [VG_MUL] ; (V - 128) * VG_MUL
276 :
277 :			movq mm6, mm2 ; u3u2u1u0 -> mm6
278 :			punpckhwd mm2, mm2 ; u3u3u2u2 -> mm2 (each chroma word duplicated: one sample serves two columns)
279 :			punpcklwd mm6, mm6 ; u1u1u0u0 -> mm6
280 :
281 :			pmullw mm4, [UB_MUL] ; B_ADD -> mm4
282 :
283 :			movq mm0, mm3
284 :			punpckhwd mm3, mm3 ; v3v3v2v2 -> mm3
285 :			punpcklwd mm0, mm0 ; v1v1v0v0 -> mm0
286 :
287 :			paddsw mm2, mm3 ; green term (U part + V part) for pixels 7..4 -> mm2
288 :			paddsw mm6, mm0 ; green term (U part + V part) for pixels 3..0 -> mm6
289 :
290 :			pmullw mm5, [VR_MUL] ; R_ADD -> mm5
291 :
292 :			movq mm0, [eax] ; y7y6y5y4y3y2y1y0 -> mm0
293 :
294 :			movq mm1, mm0
295 :			punpckhbw mm1, mm7 ; y7y6y5y4 -> mm1
296 :			punpcklbw mm0, mm7 ; y3y2y1y0 -> mm0
297 :
298 :			psubsw mm0, [Y_SUB] ; Y - Y_SUB
299 :			psubsw mm1, [Y_SUB] ; Y - Y_SUB
300 :
301 :			pmullw mm1, [Y_MUL] ; (Y - Y_SUB) * Y_MUL
302 :			pmullw mm0, [Y_MUL] ; (Y - Y_SUB) * Y_MUL
303 :
304 :			movq [TEMP_Y2], mm1 ; scaled y7y6y5y4 -> TEMP_Y2 (reused for blue and red)
305 :			movq [TEMP_Y1], mm0 ; scaled y3y2y1y0 -> TEMP_Y1 (reused for blue and red)
306 :
307 :			psubsw mm1, mm2 ; g7g6g5g4 -> mm1
308 :			psubsw mm0, mm6 ; g3g2g1g0 -> mm0
309 :
310 :			psraw mm1, SCALEBITS
311 :			psraw mm0, SCALEBITS
312 :
313 :			packuswb mm0, mm1 ; g7g6g5g4g3g2g1g0 -> mm0 (words saturated to unsigned bytes)
314 :
315 :			movq [TEMP_G1], mm0 ; green bytes of the first row
316 :
317 :			movq mm0, [ebx] ; y7y6y5y4y3y2y1y0 of the second row -> mm0
318 :
319 :			movq mm1, mm0
320 :
321 :			punpckhbw mm1, mm7 ; y7y6y5y4 -> mm1
322 :			punpcklbw mm0, mm7 ; y3y2y1y0 -> mm0
323 :
324 :			psubsw mm0, [Y_SUB] ; Y - Y_SUB
325 :			psubsw mm1, [Y_SUB] ; Y - Y_SUB
326 :
327 :			pmullw mm1, [Y_MUL] ; (Y - Y_SUB) * Y_MUL
328 :			pmullw mm0, [Y_MUL] ; (Y - Y_SUB) * Y_MUL
329 :
330 :			movq mm3, mm1 ; keep scaled y7y6y5y4 of the second row -> mm3
331 :			psubsw mm1, mm2 ; g7g6g5g4 -> mm1
332 :
333 :			movq mm2, mm0 ; keep scaled y3y2y1y0 of the second row -> mm2 (its green term is no longer needed)
334 :			psubsw mm0, mm6 ; g3g2g1g0 -> mm0
335 :
336 :			psraw mm1, SCALEBITS
337 :			psraw mm0, SCALEBITS
338 :
339 :			packuswb mm0, mm1 ; g7g6g5g4g3g2g1g0 -> mm0
340 :
341 :			movq [TEMP_G2], mm0 ; green bytes of the second row
342 :
343 :			movq mm0, mm4 ; B_ADD (u3u2u1u0) -> mm0
344 :			punpckhwd mm4, mm4 ; u3u3u2u2 -> mm4
345 :			punpcklwd mm0, mm0 ; u1u1u0u0 -> mm0
346 :
347 :			movq mm1, mm3 ; y7y6y5y4 -> mm1
348 :			paddsw mm3, mm4 ; b7b6b5b4 -> mm3
349 :
350 :			movq mm7, mm2 ; y3y2y1y0 -> mm7 (mm7 stops being zero until the pxor below)
351 :
352 :			paddsw mm2, mm0 ; b3b2b1b0 -> mm2
353 :
354 :			psraw mm3, SCALEBITS
355 :			psraw mm2, SCALEBITS
356 :
357 :			packuswb mm2, mm3 ; b7b6b5b4b3b2b1b0 -> mm2
358 :
359 :			movq [TEMP_B2], mm2 ; blue bytes of the second row
360 :
361 :			movq mm3, [TEMP_Y2] ; scaled y7y6y5y4 of the first row -> mm3
362 :			movq mm2, [TEMP_Y1] ; scaled y3y2y1y0 of the first row -> mm2
363 :
364 :			movq mm6, mm3 ; TEMP_Y2 -> mm6
365 :			paddsw mm3, mm4 ; b7b6b5b4 -> mm3
366 :
367 :			movq mm4, mm2 ; TEMP_Y1 -> mm4
368 :			paddsw mm2, mm0 ; b3b2b1b0 -> mm2
369 :
370 :			psraw mm3, SCALEBITS
371 :			psraw mm2, SCALEBITS
372 :
373 :			packuswb mm2, mm3 ; b7b6b5b4b3b2b1b0 -> mm2
374 :
375 :			movq [TEMP_B1], mm2 ; blue bytes of the first row
376 :
377 :			movq mm0, mm5 ; R_ADD (v3v2v1v0) -> mm0
378 :			punpckhwd mm5, mm5 ; v3v3v2v2 -> mm5
379 :			punpcklwd mm0, mm0 ; v1v1v0v0 -> mm0
380 :
381 :			paddsw mm1, mm5 ; r7r6r5r4 -> mm1
382 :			paddsw mm7, mm0 ; r3r2r1r0 -> mm7
383 :
384 :			psraw mm1, SCALEBITS
385 :			psraw mm7, SCALEBITS
386 :
387 :			packuswb mm7, mm1 ; r7r6r5r4r3r2r1r0 -> mm7 (TEMP_R2)
388 :
389 :			paddsw mm6, mm5 ; r7r6r5r4 -> mm6
390 :			paddsw mm4, mm0 ; r3r2r1r0 -> mm4
391 :
392 :			psraw mm6, SCALEBITS
393 :			psraw mm4, SCALEBITS
394 :
395 :			packuswb mm4, mm6 ; r7r6r5r4r3r2r1r0 -> mm4 (TEMP_R1)
396 :			; ---- interleave and store the first output row ----
397 :			movq mm0, [TEMP_B1]
398 :			movq mm1, [TEMP_G1]
399 :
400 :			movq mm6, mm7 ; second-row red -> mm6, so mm7 can be zeroed again
401 :
402 :			movq mm2, mm0
403 :			punpcklbw mm2, mm4 ; r3b3r2b2r1b1r0b0 -> mm2
404 :			punpckhbw mm0, mm4 ; r7b7r6b6r5b5r4b4 -> mm0
405 :
406 :			pxor mm7, mm7 ; mm7 = 0 again (also needed by the next iteration's unpacks)
407 :
408 :			movq mm3, mm1
409 :			punpcklbw mm1, mm7 ; 0g30g20g10g0 -> mm1
410 :			punpckhbw mm3, mm7 ; 0g70g60g50g4 -> mm3
411 :
412 :			movq mm4, mm2
413 :			punpcklbw mm2, mm1 ; 0r1g1b10r0g0b0 -> mm2
414 :			punpckhbw mm4, mm1 ; 0r3g3b30r2g2b2 -> mm4
415 :
416 :			movq mm5, mm0
417 :			punpcklbw mm0, mm3 ; 0r5g5b50r4g4b4 -> mm0
418 :			punpckhbw mm5, mm3 ; 0r7g7b70r6g6b6 -> mm5
419 :
420 :			movq [esi], mm2 ; pixels 0-1 of the first row
421 :			movq [esi + 8], mm4 ; pixels 2-3
422 :			movq [esi + 16], mm0 ; pixels 4-5
423 :			movq [esi + 24], mm5 ; pixels 6-7
424 :			; ---- interleave and store the second output row (red comes from mm6) ----
425 :			movq mm0, [TEMP_B2]
426 :			movq mm1, [TEMP_G2]
427 :
428 :			movq mm2, mm0
429 :			punpcklbw mm2, mm6 ; r3b3r2b2r1b1r0b0 -> mm2
430 :			punpckhbw mm0, mm6 ; r7b7r6b6r5b5r4b4 -> mm0
431 :
432 :			movq mm3, mm1
433 :			punpcklbw mm1, mm7 ; 0g30g20g10g0 -> mm1
434 :			punpckhbw mm3, mm7 ; 0g70g60g50g4 -> mm3
435 :
436 :			movq mm4, mm2
437 :			punpcklbw mm2, mm1 ; 0r1g1b10r0g0b0 -> mm2
438 :			punpckhbw mm4, mm1 ; 0r3g3b30r2g2b2 -> mm4
439 :
440 :			movq mm5, mm0
441 :			punpcklbw mm0, mm3 ; 0r5g5b50r4g4b4 -> mm0
442 :			punpckhbw mm5, mm3 ; 0r7g7b70r6g6b6 -> mm5
443 :
444 :			movq [edi], mm2 ; pixels 0-1 of the second row
445 :			movq [edi + 8], mm4 ; pixels 2-3
446 :			movq [edi + 16], mm0 ; pixels 4-5
447 :			movq [edi + 24], mm5 ; pixels 6-7
448 :
449 :			add esi, 32 ; 8 pixels * 4 bytes written per row
450 :			add edi, 32
451 :
452 :			add eax, 8 ; 8 luma bytes consumed per row
453 :			add ebx, 8
454 :			add ecx, 4 ; 4 chroma bytes consumed per plane
455 :			add edx, 4
456 :
457 :			dec ebp ; one fewer 8-pixel group left in this row pair
458 :
459 :			jnz near x_loop
460 :			; ---- end of row pair: move every pointer to the start of its next row pair ----
461 :			add esi, [dst_dif]
462 :			add edi, [dst_dif]
463 :
464 :			add eax, [y_dif]
465 :			add ebx, [y_dif]
466 :
467 :			add ecx, [uv_dif]
468 :			add edx, [uv_dif]
469 :
470 :			mov ebp, [height_2] ; reload the row-pair counter spilled at y_loop
471 :			dec ebp
472 :			jnz near y_loop
473 :
474 :			emms ; leave MMX state so the FPU registers are usable again
475 :
476 :			;; Local vars deallocation
477 :			add esp, localsize
478 :			%undef TEMP_Y1
479 :			%undef TEMP_Y2
480 :			%undef TEMP_G1
481 :			%undef TEMP_G2
482 :			%undef TEMP_B1
483 :			%undef TEMP_B2
484 :			%undef y_dif
485 :			%undef dst_dif
486 :			%undef uv_dif
487 :			%undef height
488 :			%undef width_8
489 :			%undef height_2
490 :			%undef localsize
491 :			; restore the saved registers in reverse push order
492 :			pop ebp
493 :			pop edi
494 :			pop esi
495 :			pop ebx
496 :
497 :			ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4