Parent Directory | Revision Log
Revision 434 - (view) (download)
1 : | chl | 434 | ;/***************************************************************************** |
2 : | ; * | ||
3 : | ; * XVID MPEG-4 VIDEO CODEC | ||
4 : | ; * mmx yuv planar to yv12 conversion | ||
5 : | ; * | ||
6 : | ; * Copyright (C) 2001 - Michael Militzer <isibaar@xvid.org> | ||
7 : | ; * | ||
8 : | ; * This program is an implementation of a part of one or more MPEG-4 | ||
9 : | ; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending | ||
10 : | ; * to use this software module in hardware or software products are | ||
11 : | ; * advised that its use may infringe existing patents or copyrights, and | ||
12 : | ; * any such use would be at such party's own risk. The original | ||
13 : | ; * developer of this software module and his/her company, and subsequent | ||
14 : | ; * editors and their companies, will have no liability for use of this | ||
15 : | ; * software or modifications or derivatives thereof. | ||
16 : | ; * | ||
17 : | ; * This program is free software; you can redistribute it and/or modify | ||
18 : | ; * it under the terms of the GNU General Public License as published by | ||
19 : | ; * the Free Software Foundation; either version 2 of the License, or | ||
20 : | ; * (at your option) any later version. | ||
21 : | ; * | ||
22 : | ; * This program is distributed in the hope that it will be useful, | ||
23 : | ; * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
24 : | ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
25 : | ; * GNU General Public License for more details. | ||
26 : | ; * | ||
27 : | ; * You should have received a copy of the GNU General Public License | ||
28 : | ; * along with this program; if not, write to the Free Software | ||
29 : | ; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
30 : | ; * | ||
31 : | ; * $Id: yuv_to_yv12_mmx.asm,v 1.6 2002-09-06 17:48:58 chl Exp $ | ||
32 : | ; * | ||
33 : | ; ****************************************************************************/ | ||
34 : | edgomez | 328 | |
BITS 32

;------------------------------------------------------------------------------
; cglobal <symbol>
;
; Exports <symbol> from this object file.  When PREFIX is defined the
; exported name gets a leading underscore, and <symbol> is aliased to the
; underscored name so that the label written in the source resolves to the
; exported symbol.
;------------------------------------------------------------------------------
%macro cglobal 1
	%ifndef PREFIX
		global %1
	%else
		global _%1
		%define %1 _%1
	%endif
%endmacro

SECTION .text

ALIGN 64
49 : | |||
;------------------------------------------------------------------------------
;
; void yuv_to_yv12_xmm(uint8_t *y_out,
;                      uint8_t *u_out,
;                      uint8_t *v_out,
;                      uint8_t *src,
;                      int width, int height, int stride);
;
; Copies a tightly packed planar YUV 4:2:0 image (Y plane, then U, then V,
; stored back to back at `src` with no row padding) into three separate
; destination planes.  The luma plane is written with a row pitch of
; `stride` bytes, the chroma planes with a row pitch of `stride / 2` bytes.
; Chroma planes are (width / 2) x (height / 2).
;
; ABI:      32-bit cdecl, all arguments on the stack
; Saves:    ebx, esi, edi, ebp
; Clobbers: eax, ecx, edx, mm0-mm7, flags (MMX state is cleared with emms)
;
; Requires the SSE integer extensions (prefetchnta, movntq, sfence).  CPUs
; that only implement plain MMX must use yuv_to_yv12_mmx instead.
;
; Attention: This code assumes that width is a multiple of 16.  Bytes
;            beyond the last full 16-byte (luma) / 8-byte (chroma) chunk of
;            a row are neither read nor written.
;
;------------------------------------------------------------------------------

; Stack offsets of the arguments once the four callee-saved registers have
; been pushed (4 * 4 bytes of saved registers + 4 bytes of return address).
%define ARG_Y_OUT	20
%define ARG_U_OUT	24
%define ARG_V_OUT	28
%define ARG_SRC		32
%define ARG_WIDTH	36
%define ARG_HEIGHT	40
%define ARG_STRIDE	44

;------------------------------------------------------------------------------
; XMM_COPY_PLANE tail_chunk
;
; Copies one plane, row by row, using non-temporal stores.
;
;   %1 = size in bytes of one tail chunk (16 for luma, 8 for chroma)
;
; In:   esi = source (packed; simply keeps advancing)
;       edi = destination plane
;       edx = bytes per row
;       ebx = destination padding per row (row pitch - bytes per row)
;       eax = number of rows
; Out:  esi / edi advanced past the data that was copied
; Keeps:    ebx, edx
; Clobbers: eax, ecx, ebp, mm0-mm7, flags
;
; Both the 64-byte block loop and the tail loop are skipped when their
; iteration count is zero, and the whole plane is skipped when there are no
; rows, so short rows and empty planes are handled safely.
;------------------------------------------------------------------------------
%macro XMM_COPY_PLANE 1
	test eax, eax
	jz near %%done			; no rows at all -> nothing to copy

%%row:
	mov ecx, edx
	shr ecx, 6			; ecx = number of 64-byte blocks in a row
	jz %%tail			; row shorter than 64 bytes

%%block64:
	prefetchnta [esi + 64]		; non temporal prefetch of the next block
	prefetchnta [esi + 96]

	movq mm1, [esi]			; read 64 bytes from src
	movq mm2, [esi + 8]
	movq mm3, [esi + 16]
	movq mm4, [esi + 24]
	movq mm5, [esi + 32]
	movq mm6, [esi + 40]
	movq mm7, [esi + 48]
	movq mm0, [esi + 56]

	movntq [edi], mm1		; write 64 bytes, bypassing the cache
	movntq [edi + 8], mm2
	movntq [edi + 16], mm3
	movntq [edi + 24], mm4
	movntq [edi + 32], mm5
	movntq [edi + 40], mm6
	movntq [edi + 48], mm7
	movntq [edi + 56], mm0

	add esi, 64
	add edi, 64
	dec ecx
	jnz %%block64

%%tail:
	mov ebp, edx
	and ebp, 63			; bytes left over after the 64-byte blocks
%if %1 == 16
	shr ebp, 4			; ebp = number of 16-byte tail chunks
%else
	shr ebp, 3			; ebp = number of 8-byte tail chunks
%endif
	jz %%next_row			; row was a multiple of 64 bytes

%%chunk:
	movq mm1, [esi]
%if %1 == 16
	movq mm2, [esi + 8]
%endif
	movntq [edi], mm1
%if %1 == 16
	movntq [edi + 8], mm2
%endif

	add esi, %1
	add edi, %1
	dec ebp
	jnz %%chunk

%%next_row:
	add edi, ebx			; skip the destination row padding
	dec eax
	jnz near %%row

%%done:
%endmacro


cglobal yuv_to_yv12_xmm
yuv_to_yv12_xmm:

	push ebx
	push esi
	push edi
	push ebp

	mov esi, [esp + ARG_SRC]	; src -> esi (shared by all three planes)

	; ---- luma plane: width x height, pitch = stride ----
	mov edi, [esp + ARG_Y_OUT]	; y_out -> edi
	mov edx, [esp + ARG_WIDTH]	; width -> edx
	mov ebx, [esp + ARG_STRIDE]
	sub ebx, edx			; stride - width -> ebx
	mov eax, [esp + ARG_HEIGHT]	; height -> eax
	XMM_COPY_PLANE 16

	; ---- chroma geometry, shared by U and V ----
	mov edx, [esp + ARG_WIDTH]
	shr edx, 1			; width / 2 -> edx
	mov ebx, [esp + ARG_STRIDE]
	shr ebx, 1			; stride / 2
	sub ebx, edx			; stride / 2 - width / 2 -> ebx

	; ---- U plane ----
	mov edi, [esp + ARG_U_OUT]	; u_out -> edi
	mov eax, [esp + ARG_HEIGHT]
	shr eax, 1			; height / 2 -> eax
	XMM_COPY_PLANE 8

	; ---- V plane ----
	mov edi, [esp + ARG_V_OUT]	; v_out -> edi
	mov eax, [esp + ARG_HEIGHT]
	shr eax, 1			; height / 2 -> eax
	XMM_COPY_PLANE 8

	sfence				; make the weakly-ordered movntq stores
					; globally visible before returning
	emms

	pop ebp
	pop edi
	pop esi
	pop ebx

	ret

%undef ARG_Y_OUT
%undef ARG_U_OUT
%undef ARG_V_OUT
%undef ARG_SRC
%undef ARG_WIDTH
%undef ARG_HEIGHT
%undef ARG_STRIDE
300 : | |||
301 : | |||
302 : | |||
;------------------------------------------------------------------------------
;
; void yuv_to_yv12_mmx(uint8_t *y_out,
;                      uint8_t *u_out,
;                      uint8_t *v_out,
;                      uint8_t *src,
;                      int width, int height, int stride);
;
; Copies a tightly packed planar YUV 4:2:0 image (Y plane, then U, then V,
; stored back to back at `src` with no row padding) into three separate
; destination planes.  The luma plane is written with a row pitch of
; `stride` bytes, the chroma planes with a row pitch of `stride / 2` bytes.
; Chroma planes are (width / 2) x (height / 2).
;
; Only plain MMX loads and stores (movq) are used.
;
; ABI:      32-bit cdecl, all arguments on the stack
; Saves:    ebx, esi, edi, ebp
; Clobbers: eax, ecx, edx, mm0-mm7, flags (MMX state is cleared with emms)
;
; Attention: This code assumes that width is a multiple of 16.  Bytes
;            beyond the last full 16-byte (luma) / 8-byte (chroma) chunk of
;            a row are neither read nor written.
;
;------------------------------------------------------------------------------

; Stack offsets of the arguments once the four callee-saved registers have
; been pushed (4 * 4 bytes of saved registers + 4 bytes of return address).
%define ARG_Y_OUT	20
%define ARG_U_OUT	24
%define ARG_V_OUT	28
%define ARG_SRC		32
%define ARG_WIDTH	36
%define ARG_HEIGHT	40
%define ARG_STRIDE	44

;------------------------------------------------------------------------------
; MMX_COPY_PLANE tail_chunk
;
; Copies one plane, row by row, using ordinary MMX stores.
;
;   %1 = size in bytes of one tail chunk (16 for luma, 8 for chroma)
;
; In:   esi = source (packed; simply keeps advancing)
;       edi = destination plane
;       edx = bytes per row
;       ebx = destination padding per row (row pitch - bytes per row)
;       eax = number of rows
; Out:  esi / edi advanced past the data that was copied
; Keeps:    ebx, edx
; Clobbers: eax, ecx, ebp, mm0-mm7, flags
;
; Both the 64-byte block loop and the tail loop are skipped when their
; iteration count is zero, and the whole plane is skipped when there are no
; rows, so short rows and empty planes are handled safely.
;------------------------------------------------------------------------------
%macro MMX_COPY_PLANE 1
	test eax, eax
	jz near %%done			; no rows at all -> nothing to copy

%%row:
	mov ecx, edx
	shr ecx, 6			; ecx = number of 64-byte blocks in a row
	jz %%tail			; row shorter than 64 bytes

%%block64:
	movq mm1, [esi]			; read 64 bytes from src
	movq mm2, [esi + 8]
	movq mm3, [esi + 16]
	movq mm4, [esi + 24]
	movq mm5, [esi + 32]
	movq mm6, [esi + 40]
	movq mm7, [esi + 48]
	movq mm0, [esi + 56]

	movq [edi], mm1			; write 64 bytes to the destination plane
	movq [edi + 8], mm2
	movq [edi + 16], mm3
	movq [edi + 24], mm4
	movq [edi + 32], mm5
	movq [edi + 40], mm6
	movq [edi + 48], mm7
	movq [edi + 56], mm0

	add esi, 64
	add edi, 64
	dec ecx
	jnz %%block64

%%tail:
	mov ebp, edx
	and ebp, 63			; bytes left over after the 64-byte blocks
%if %1 == 16
	shr ebp, 4			; ebp = number of 16-byte tail chunks
%else
	shr ebp, 3			; ebp = number of 8-byte tail chunks
%endif
	jz %%next_row			; row was a multiple of 64 bytes

%%chunk:
	movq mm1, [esi]
%if %1 == 16
	movq mm2, [esi + 8]
%endif
	movq [edi], mm1
%if %1 == 16
	movq [edi + 8], mm2
%endif

	add esi, %1
	add edi, %1
	dec ebp
	jnz %%chunk

%%next_row:
	add edi, ebx			; skip the destination row padding
	dec eax
	jnz near %%row

%%done:
%endmacro


ALIGN 16

cglobal yuv_to_yv12_mmx
yuv_to_yv12_mmx:

	push ebx
	push esi
	push edi
	push ebp

	mov esi, [esp + ARG_SRC]	; src -> esi (shared by all three planes)

	; ---- luma plane: width x height, pitch = stride ----
	mov edi, [esp + ARG_Y_OUT]	; y_out -> edi
	mov edx, [esp + ARG_WIDTH]	; width -> edx
	mov ebx, [esp + ARG_STRIDE]
	sub ebx, edx			; stride - width -> ebx
	mov eax, [esp + ARG_HEIGHT]	; height -> eax
	MMX_COPY_PLANE 16

	; ---- chroma geometry, shared by U and V ----
	mov edx, [esp + ARG_WIDTH]
	shr edx, 1			; width / 2 -> edx
	mov ebx, [esp + ARG_STRIDE]
	shr ebx, 1			; stride / 2
	sub ebx, edx			; stride / 2 - width / 2 -> ebx

	; ---- U plane ----
	mov edi, [esp + ARG_U_OUT]	; u_out -> edi
	mov eax, [esp + ARG_HEIGHT]
	shr eax, 1			; height / 2 -> eax
	MMX_COPY_PLANE 8

	; ---- V plane ----
	mov edi, [esp + ARG_V_OUT]	; v_out -> edi
	mov eax, [esp + ARG_HEIGHT]
	shr eax, 1			; height / 2 -> eax
	MMX_COPY_PLANE 8

	emms

	pop ebp
	pop edi
	pop esi
	pop ebx

	ret

%undef ARG_Y_OUT
%undef ARG_U_OUT
%undef ARG_V_OUT
%undef ARG_SRC
%undef ARG_WIDTH
%undef ARG_HEIGHT
%undef ARG_STRIDE
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |