Annotation of /trunk/xvidcore/src/image/x86_asm/yuv_to_yv12_mmx.asm

Revision 651 - (view) (download)

1 :	chl	434	;/*****************************************************************************
2 :			; *
3 :			; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx yuv planar to yv12 conversion
5 :			; *
6 :			; * Copyright (C) 2001 - Michael Militzer <isibaar@xvid.org>
7 :			; *
8 :	edgomez	651	; * This file is part of XviD, a free MPEG-4 video encoder/decoder
9 :	chl	434	; *
10 :	edgomez	651	; * XviD is free software; you can redistribute it and/or modify it
11 :			; * under the terms of the GNU General Public License as published by
12 :	chl	434	; * the Free Software Foundation; either version 2 of the License, or
13 :			; * (at your option) any later version.
14 :			; *
15 :			; * This program is distributed in the hope that it will be useful,
16 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 :			; * GNU General Public License for more details.
19 :			; *
20 :			; * You should have received a copy of the GNU General Public License
21 :			; * along with this program; if not, write to the Free Software
22 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 :			; *
24 :	edgomez	651	; * Under section 8 of the GNU General Public License, the copyright
25 :			; * holders of XVID explicitly forbid distribution in the following
26 :			; * countries:
27 :			; *
28 :			; * - Japan
29 :			; * - United States of America
30 :			; *
31 :			; * Linking XviD statically or dynamically with other modules is making a
32 :			; * combined work based on XviD. Thus, the terms and conditions of the
33 :			; * GNU General Public License cover the whole combination.
34 :			; *
35 :			; * As a special exception, the copyright holders of XviD give you
36 :			; * permission to link XviD with independent modules that communicate with
37 :			; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
38 :			; * license terms of these independent modules, and to copy and distribute
39 :			; * the resulting combined work under terms of your choice, provided that
40 :			; * every copy of the combined work is accompanied by a complete copy of
41 :			; * the source code of XviD (the version of XviD used to produce the
42 :			; * combined work), being distributed under the terms of the GNU General
43 :			; * Public License plus this exception. An independent module is a module
44 :			; * which is not derived from or based on XviD.
45 :			; *
46 :			; * Note that people who make modified versions of XviD are not obligated
47 :			; * to grant this special exception for their modified versions; it is
48 :			; * their choice whether to do so. The GNU General Public License gives
49 :			; * permission to release a modified version without this exception; this
50 :			; * exception also makes it possible to release a modified version which
51 :			; * carries forward this exception.
52 :			; *
53 :			; * $Id: yuv_to_yv12_mmx.asm,v 1.7 2002-11-17 00:20:30 edgomez Exp $
54 :			; *
55 :	chl	434	; ****************************************************************************/
56 :	edgomez	328
BITS 32

;------------------------------------------------------------------------------
;
; cglobal symbol
;
; Declares `symbol` as a global (exported) label.
;
; When PREFIX is defined, the exported name is `_symbol`, and `symbol` itself
; is %define'd to `_symbol` so that the label definition and every later use
; in this file resolve to the underscored name.
; (PREFIX is presumably supplied by the build for targets whose C symbols
; carry a leading underscore -- confirm against the build scripts.)
;
;------------------------------------------------------------------------------

%macro cglobal 1
	%ifndef PREFIX
		global %1
	%else
		global _%1
		%define %1 _%1
	%endif
%endmacro

SECTION .text

ALIGN 64
71 :
;------------------------------------------------------------------------------
;
; YV12_COPY_PLANE_XMM tail_bytes
;
; Private helper of yuv_to_yv12_xmm: copies one plane from a packed source
; into a strided destination using non temporal stores.
;
; %1 : granularity of the per-row tail copy in bytes (16 for luma, 8 for
;      chroma).  Must be a power of two in the range 8..64.  Row bytes that
;      lie beyond the last full multiple of %1 are NOT copied.
;
; In : esi = source      (rows stored back to back, no padding)
;      edi = destination
;      ecx = row width in bytes
;      eax = number of rows (<= 0 copies nothing)
;      ebx = bytes to skip in the destination after each row
; Out: esi, edi advanced past the data that was read/written
; Clobbers: eax, ecx, edx, ebp, mm0-mm7, flags
;
;------------------------------------------------------------------------------

%macro YV12_COPY_PLANE_XMM 1
	test eax, eax
	jle near %%done			; no rows -> nothing to do (the row
					; counter must never start at 0)

	mov edx, ecx
	mov ebp, ecx
	shr edx, 6			; edx = 64 byte blocks per row
	and ebp, (64 - %1)		; ebp = (width % 64) rounded down to a
					;       multiple of %1
	shr ebp, 3			; ebp = 8 byte tail copies per row

%%row:
	mov ecx, edx
	test ecx, ecx
	jz %%tail			; row shorter than 64 bytes: skip the
					; block loop, "dec ecx" would wrap

%%block64:
	prefetchnta [esi + 64]		; non temporal prefetch
	prefetchnta [esi + 96]

	movq mm1, [esi]			; read 64 bytes from src
	movq mm2, [esi + 8]
	movq mm3, [esi + 16]
	movq mm4, [esi + 24]
	movq mm5, [esi + 32]
	movq mm6, [esi + 40]
	movq mm7, [esi + 48]
	movq mm0, [esi + 56]

	movntq [edi], mm1		; write 64 bytes to the plane
	movntq [edi + 8], mm2
	movntq [edi + 16], mm3
	movntq [edi + 24], mm4
	movntq [edi + 32], mm5
	movntq [edi + 40], mm6
	movntq [edi + 48], mm7
	movntq [edi + 56], mm0

	add esi, 64
	add edi, 64
	dec ecx
	jnz %%block64

%%tail:
	mov ecx, ebp
	test ecx, ecx
	jz %%next_row			; width is a multiple of 64: no tail

%%tail8:
	movq mm0, [esi]			; copy the rest of the row, 8 bytes
	movntq [edi], mm0		; at a time

	add esi, 8
	add edi, 8
	dec ecx
	jnz %%tail8

%%next_row:
	add edi, ebx			; skip the destination padding
	dec eax
	jnz near %%row

%%done:
%endmacro

;------------------------------------------------------------------------------
;
; void yuv_to_yv12_xmm(uint8_t *y_out,
;                      uint8_t *u_out,
;                      uint8_t *v_out,
;                      uint8_t *src,
;                      int width, int height, int stride);
;
; Copies a packed planar image (height rows of width luma bytes, followed by
; height/2 rows of width/2 U bytes, followed by height/2 rows of width/2
; V bytes, all without padding) from src into three separate planes.
; Luma rows are written `stride` bytes apart, chroma rows `stride/2` bytes
; apart.
;
; Arguments are read from the stack and the function returns with a plain
; `ret` (the caller removes the arguments).  ebx, esi, edi and ebp are
; preserved; eax, ecx, edx and mm0-mm7 are clobbered.  The MMX state is
; cleared with emms before returning.
;
; Uses movntq, prefetchnta and sfence, so the CPU must implement those
; instructions; checking for them is left to the caller.
;
; Attention: This code assumes that width is a multiple of 16
;            (any such width is handled, including widths below 64).
;            A height <= 0 copies nothing.
;
;------------------------------------------------------------------------------

cglobal yuv_to_yv12_xmm
yuv_to_yv12_xmm:

	push ebx
	push esi
	push edi
	push ebp

	; stack arguments: 4 saved registers + return address = 20 bytes
%define ARG_Y_OUT	esp + 20
%define ARG_U_OUT	esp + 24
%define ARG_V_OUT	esp + 28
%define ARG_SRC		esp + 32
%define ARG_WIDTH	esp + 36
%define ARG_HEIGHT	esp + 40
%define ARG_STRIDE	esp + 44

	mov esi, [ARG_SRC]		; src -> esi, runs through all planes

	; ---- Y plane : width x height, rows `stride` apart ----
	mov edi, [ARG_Y_OUT]		; y_out -> edi
	mov ecx, [ARG_WIDTH]		; width -> ecx
	mov eax, [ARG_HEIGHT]		; height -> eax
	mov ebx, [ARG_STRIDE]		; stride -> ebx
	sub ebx, ecx			; stride - width -> ebx

	YV12_COPY_PLANE_XMM 16

	; ---- U plane : width/2 x height/2, rows `stride/2` apart ----
	mov edi, [ARG_U_OUT]		; u_out -> edi
	mov ecx, [ARG_WIDTH]
	mov eax, [ARG_HEIGHT]
	mov ebx, [ARG_STRIDE]
	shr ecx, 1			; width / 2 -> ecx
	sar eax, 1			; height / 2 -> eax (sign kept so that
					; a negative height stays "no rows")
	shr ebx, 1			; stride / 2 -> ebx
	sub ebx, ecx			; stride / 2 - width / 2 -> ebx

	YV12_COPY_PLANE_XMM 8

	; ---- V plane : same geometry as the U plane ----
	mov edi, [ARG_V_OUT]		; v_out -> edi
	mov ecx, [ARG_WIDTH]
	mov eax, [ARG_HEIGHT]
	mov ebx, [ARG_STRIDE]
	shr ecx, 1			; width / 2 -> ecx
	sar eax, 1			; height / 2 -> eax
	shr ebx, 1			; stride / 2 -> ebx
	sub ebx, ecx			; stride / 2 - width / 2 -> ebx

	YV12_COPY_PLANE_XMM 8

%undef ARG_Y_OUT
%undef ARG_U_OUT
%undef ARG_V_OUT
%undef ARG_SRC
%undef ARG_WIDTH
%undef ARG_HEIGHT
%undef ARG_STRIDE

	sfence				; movntq stores are weakly ordered:
					; make them globally visible before
					; the caller touches the planes
	emms

	pop ebp
	pop edi
	pop esi
	pop ebx

	ret
322 :
323 :
324 :
;------------------------------------------------------------------------------
;
; YV12_COPY_PLANE_MMX tail_bytes
;
; Private helper of yuv_to_yv12_mmx: copies one plane from a packed source
; into a strided destination using plain MMX loads and stores.
;
; %1 : granularity of the per-row tail copy in bytes (16 for luma, 8 for
;      chroma).  Must be a power of two in the range 8..64.  Row bytes that
;      lie beyond the last full multiple of %1 are NOT copied.
;
; In : esi = source      (rows stored back to back, no padding)
;      edi = destination
;      ecx = row width in bytes
;      eax = number of rows (<= 0 copies nothing)
;      ebx = bytes to skip in the destination after each row
; Out: esi, edi advanced past the data that was read/written
; Clobbers: eax, ecx, edx, ebp, mm0-mm7, flags
;
;------------------------------------------------------------------------------

%macro YV12_COPY_PLANE_MMX 1
	test eax, eax
	jle near %%done			; no rows -> nothing to do (the row
					; counter must never start at 0)

	mov edx, ecx
	mov ebp, ecx
	shr edx, 6			; edx = 64 byte blocks per row
	and ebp, (64 - %1)		; ebp = (width % 64) rounded down to a
					;       multiple of %1
	shr ebp, 3			; ebp = 8 byte tail copies per row

%%row:
	mov ecx, edx
	test ecx, ecx
	jz %%tail			; row shorter than 64 bytes: skip the
					; block loop, "dec ecx" would wrap

%%block64:
	movq mm1, [esi]			; read 64 bytes from src
	movq mm2, [esi + 8]
	movq mm3, [esi + 16]
	movq mm4, [esi + 24]
	movq mm5, [esi + 32]
	movq mm6, [esi + 40]
	movq mm7, [esi + 48]
	movq mm0, [esi + 56]

	movq [edi], mm1			; write 64 bytes to the plane
	movq [edi + 8], mm2
	movq [edi + 16], mm3
	movq [edi + 24], mm4
	movq [edi + 32], mm5
	movq [edi + 40], mm6
	movq [edi + 48], mm7
	movq [edi + 56], mm0

	add esi, 64
	add edi, 64
	dec ecx
	jnz %%block64

%%tail:
	mov ecx, ebp
	test ecx, ecx
	jz %%next_row			; width is a multiple of 64: no tail

%%tail8:
	movq mm0, [esi]			; copy the rest of the row, 8 bytes
	movq [edi], mm0			; at a time

	add esi, 8
	add edi, 8
	dec ecx
	jnz %%tail8

%%next_row:
	add edi, ebx			; skip the destination padding
	dec eax
	jnz near %%row

%%done:
%endmacro

;------------------------------------------------------------------------------
;
; void yuv_to_yv12_mmx(uint8_t *y_out,
;                      uint8_t *u_out,
;                      uint8_t *v_out,
;                      uint8_t *src,
;                      int width, int height, int stride);
;
; Copies a packed planar image (height rows of width luma bytes, followed by
; height/2 rows of width/2 U bytes, followed by height/2 rows of width/2
; V bytes, all without padding) from src into three separate planes.
; Luma rows are written `stride` bytes apart, chroma rows `stride/2` bytes
; apart.
;
; Arguments are read from the stack and the function returns with a plain
; `ret` (the caller removes the arguments).  ebx, esi, edi and ebp are
; preserved; eax, ecx, edx and mm0-mm7 are clobbered.  The MMX state is
; cleared with emms before returning.
;
; Attention: This code assumes that width is a multiple of 16
;            (any such width is handled, including widths below 64).
;            A height <= 0 copies nothing.
;
;------------------------------------------------------------------------------

cglobal yuv_to_yv12_mmx
yuv_to_yv12_mmx:

	push ebx
	push esi
	push edi
	push ebp

	; stack arguments: 4 saved registers + return address = 20 bytes
%define ARG_Y_OUT	esp + 20
%define ARG_U_OUT	esp + 24
%define ARG_V_OUT	esp + 28
%define ARG_SRC		esp + 32
%define ARG_WIDTH	esp + 36
%define ARG_HEIGHT	esp + 40
%define ARG_STRIDE	esp + 44

	mov esi, [ARG_SRC]		; src -> esi, runs through all planes

	; ---- Y plane : width x height, rows `stride` apart ----
	mov edi, [ARG_Y_OUT]		; y_out -> edi
	mov ecx, [ARG_WIDTH]		; width -> ecx
	mov eax, [ARG_HEIGHT]		; height -> eax
	mov ebx, [ARG_STRIDE]		; stride -> ebx
	sub ebx, ecx			; stride - width -> ebx

	YV12_COPY_PLANE_MMX 16

	; ---- U plane : width/2 x height/2, rows `stride/2` apart ----
	mov edi, [ARG_U_OUT]		; u_out -> edi
	mov ecx, [ARG_WIDTH]
	mov eax, [ARG_HEIGHT]
	mov ebx, [ARG_STRIDE]
	shr ecx, 1			; width / 2 -> ecx
	sar eax, 1			; height / 2 -> eax (sign kept so that
					; a negative height stays "no rows")
	shr ebx, 1			; stride / 2 -> ebx
	sub ebx, ecx			; stride / 2 - width / 2 -> ebx

	YV12_COPY_PLANE_MMX 8

	; ---- V plane : same geometry as the U plane ----
	mov edi, [ARG_V_OUT]		; v_out -> edi
	mov ecx, [ARG_WIDTH]
	mov eax, [ARG_HEIGHT]
	mov ebx, [ARG_STRIDE]
	shr ecx, 1			; width / 2 -> ecx
	sar eax, 1			; height / 2 -> eax
	shr ebx, 1			; stride / 2 -> ebx
	sub ebx, ecx			; stride / 2 - width / 2 -> ebx

	YV12_COPY_PLANE_MMX 8

%undef ARG_Y_OUT
%undef ARG_U_OUT
%undef ARG_V_OUT
%undef ARG_SRC
%undef ARG_WIDTH
%undef ARG_HEIGHT
%undef ARG_STRIDE

	emms

	pop ebp
	pop edi
	pop esi
	pop ebx

	ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4