Parent Directory
|
Revision Log
Revision 331 - (view) (download)
1 : | edgomez | 328 | ;------------------------------------------------------------------------------ |
2 : | ; | ||
3 : | ; This file is part of XviD, a free MPEG-4 video encoder/decoder | ||
4 : | ; | ||
5 : | ; This program is free software; you can redistribute it and/or modify it | ||
6 : | ; under the terms of the GNU General Public License as published by | ||
7 : | ; the Free Software Foundation; either version 2 of the License, or | ||
8 : | ; (at your option) any later version. | ||
9 : | ; | ||
10 : | ; This program is distributed in the hope that it will be useful, but | ||
11 : | ; WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 : | ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 : | ; GNU General Public License for more details. | ||
14 : | ; | ||
15 : | ; You should have received a copy of the GNU General Public License | ||
16 : | ; along with this program; if not, write to the Free Software | ||
17 : | ; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 : | ; | ||
19 : | ;------------------------------------------------------------------------------ | ||
20 : | ;------------------------------------------------------------------------------ | ||
21 : | ; | ||
22 : | ; yuv_to_yuv.asm, MMX optimized color conversion | ||
23 : | ; | ||
24 : | ; Copyright (C) 2001 - Michael Militzer <isibaar@xvid.org> | ||
25 : | ; | ||
26 : | ; For more information visit the XviD homepage: http://www.xvid.org | ||
27 : | ; | ||
28 : | ;------------------------------------------------------------------------------ | ||
29 : | ;------------------------------------------------------------------------------ | ||
30 : | ; | ||
31 : | ; Revision history: | ||
32 : | ; | ||
33 : | ; 24.11.2001 initial version (Isibaar) | ||
34 : | ; 23.07.2002 thread safe (edgomez) | ||
35 : | ; | ||
36 : | edgomez | 331 | ; $Id: yuv_to_yv12_mmx.asm,v 1.5 2002-07-23 15:38:18 edgomez Exp $ |
37 : | edgomez | 328 | ; |
38 : | ;------------------------------------------------------------------------------ | ||
39 : | |||
BITS 32

;------------------------------------------------------------------------------
; cglobal <symbol>
;
; Exports <symbol> as a global.  When PREFIX is defined the exported name is
; _<symbol>, and <symbol> is redefined to expand to _<symbol> so that a
; following "<symbol>:" label definition picks up the underscore as well.
;------------------------------------------------------------------------------
%macro cglobal 1
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
%endmacro

SECTION .text

ALIGN 64
54 : | |||
;------------------------------------------------------------------------------
;
; void yuv_to_yv12_xmm(uint8_t *y_out,
;                      uint8_t *u_out,
;                      uint8_t *v_out,
;                      uint8_t *src,
;                      int width, int height, int stride);
;
; Copies three planes out of one sequentially read source buffer:
;   - height     rows of width     bytes  -> y_out (rows stride     apart)
;   - height / 2 rows of width / 2 bytes  -> u_out (rows stride / 2 apart)
;   - height / 2 rows of width / 2 bytes  -> v_out (rows stride / 2 apart)
; Source rows are contiguous; only the destination pointer is advanced by
; the row padding.
;
; Calling:  32-bit, all arguments are read from the stack, plain "ret"
;           (the caller removes the arguments).
; Clobbers: eax, ecx, edx, mm0-mm7, flags.  ebx, esi, edi, ebp are saved and
;           restored.  emms is executed before returning.
;
; Uses prefetchnta, movntq and sfence.
; NOTE(review): the previous comment said this "probably also runs on
; PentiumII class cpu's"; these instructions belong to the SSE (or AMD
; extended MMX) instruction set, so confirm the caller's CPU feature check.
;
; Attention: This code assumes that width is a multiple of 16
;
; Rows shorter than one 64-byte block and planes with zero rows are handled
; (the block loops are skipped instead of being entered with a zero count).
;
;------------------------------------------------------------------------------


cglobal yuv_to_yv12_xmm
yuv_to_yv12_xmm:

    push    ebx
    push    esi
    push    edi
    push    ebp

    ; Four registers are now on the stack: the return address is at
    ; [esp + 16] and the first argument (y_out) at [esp + 20].
    ;
    ; Register roles inside every plane copy:
    ;   esi = source pointer (runs on from plane to plane)
    ;   edi = destination pointer
    ;   eax = rows left
    ;   ebx = destination row padding (row pitch - row width)
    ;   edx = 64-byte blocks per row
    ;   ebp = tail blocks per row (16 bytes for Y, 8 bytes for U/V)
    ;   ecx = working block counter

    ;--------------------------------------------------------------------------
    ; Y plane
    ;--------------------------------------------------------------------------
    mov     eax, [esp + 40]             ; eax = height
    mov     ebx, [esp + 44]             ; ebx = stride
    mov     esi, [esp + 32]             ; esi = src
    mov     edi, [esp + 20]             ; edi = y_out
    mov     ecx, [esp + 36]             ; ecx = width

    sub     ebx, ecx                    ; ebx = stride - width

    mov     edx, ecx
    shr     edx, 6                      ; edx = width / 64
    mov     ebp, ecx
    and     ebp, 63
    shr     ebp, 4                      ; ebp = (width % 64) / 16

    test    eax, eax
    jz      near .y_done                ; zero rows: do not enter the loops

.y_row:
    mov     ecx, edx
    test    ecx, ecx
    jz      .y_tail                     ; row holds no full 64-byte block

.y_block64:
    prefetchnta [esi + 64]              ; non temporal prefetch
    prefetchnta [esi + 96]

    movq    mm1, [esi]                  ; read from src
    movq    mm2, [esi + 8]
    movq    mm3, [esi + 16]
    movq    mm4, [esi + 24]
    movq    mm5, [esi + 32]
    movq    mm6, [esi + 40]
    movq    mm7, [esi + 48]
    movq    mm0, [esi + 56]

    movntq  [edi], mm1                  ; write to y_out
    movntq  [edi + 8], mm2
    movntq  [edi + 16], mm3
    movntq  [edi + 24], mm4
    movntq  [edi + 32], mm5
    movntq  [edi + 40], mm6
    movntq  [edi + 48], mm7
    movntq  [edi + 56], mm0

    add     esi, 64
    add     edi, 64
    dec     ecx
    jnz     .y_block64

.y_tail:
    mov     ecx, ebp
    test    ecx, ecx
    jz      .y_next_row                 ; width is a multiple of 64

.y_block16:
    movq    mm1, [esi]                  ; read from src
    movq    mm2, [esi + 8]

    movntq  [edi], mm1                  ; write to y_out
    movntq  [edi + 8], mm2

    add     esi, 16
    add     edi, 16
    dec     ecx
    jnz     .y_block16

.y_next_row:
    add     edi, ebx                    ; skip the destination row padding
    dec     eax
    jnz     near .y_row

.y_done:

    ;--------------------------------------------------------------------------
    ; U plane
    ;--------------------------------------------------------------------------
    mov     eax, [esp + 40]             ; eax = height
    mov     ebx, [esp + 44]             ; ebx = stride
    mov     ecx, [esp + 36]             ; ecx = width
    mov     edi, [esp + 24]             ; edi = u_out

    shr     ecx, 1                      ; ecx = width / 2
    shr     ebx, 1                      ; ebx = stride / 2
    shr     eax, 1                      ; eax = height / 2

    sub     ebx, ecx                    ; ebx = stride / 2 - width / 2

    mov     edx, ecx
    shr     edx, 6                      ; edx = (width / 2) / 64
    mov     ebp, ecx
    and     ebp, 63
    shr     ebp, 3                      ; ebp = ((width / 2) % 64) / 8

    test    eax, eax
    jz      near .done                  ; no chroma rows: skip U and V

.u_row:
    mov     ecx, edx
    test    ecx, ecx
    jz      .u_tail                     ; row holds no full 64-byte block

.u_block64:
    prefetchnta [esi + 64]              ; non temporal prefetch
    prefetchnta [esi + 96]

    movq    mm1, [esi]                  ; read from src
    movq    mm2, [esi + 8]
    movq    mm3, [esi + 16]
    movq    mm4, [esi + 24]
    movq    mm5, [esi + 32]
    movq    mm6, [esi + 40]
    movq    mm7, [esi + 48]
    movq    mm0, [esi + 56]

    movntq  [edi], mm1                  ; write to u_out
    movntq  [edi + 8], mm2
    movntq  [edi + 16], mm3
    movntq  [edi + 24], mm4
    movntq  [edi + 32], mm5
    movntq  [edi + 40], mm6
    movntq  [edi + 48], mm7
    movntq  [edi + 56], mm0

    add     esi, 64
    add     edi, 64
    dec     ecx
    jnz     .u_block64

.u_tail:
    mov     ecx, ebp
    test    ecx, ecx
    jz      .u_next_row                 ; no 8-byte tail blocks

.u_block8:
    movq    mm1, [esi]                  ; read from src
    movntq  [edi], mm1                  ; write to u_out

    add     esi, 8
    add     edi, 8
    dec     ecx
    jnz     .u_block8

.u_next_row:
    add     edi, ebx                    ; skip the destination row padding
    dec     eax
    jnz     near .u_row

    ;--------------------------------------------------------------------------
    ; V plane
    ;
    ; Same row geometry as the U plane: ebx, edx and ebp were not modified by
    ; the U copy and are reused as they are.  eax is non-zero here because
    ; the zero-row case already branched to .done above.
    ;--------------------------------------------------------------------------
    mov     eax, [esp + 40]             ; eax = height
    mov     edi, [esp + 28]             ; edi = v_out
    shr     eax, 1                      ; eax = height / 2

.v_row:
    mov     ecx, edx
    test    ecx, ecx
    jz      .v_tail                     ; row holds no full 64-byte block

.v_block64:
    prefetchnta [esi + 64]              ; non temporal prefetch
    prefetchnta [esi + 96]

    movq    mm1, [esi]                  ; read from src
    movq    mm2, [esi + 8]
    movq    mm3, [esi + 16]
    movq    mm4, [esi + 24]
    movq    mm5, [esi + 32]
    movq    mm6, [esi + 40]
    movq    mm7, [esi + 48]
    movq    mm0, [esi + 56]

    movntq  [edi], mm1                  ; write to v_out
    movntq  [edi + 8], mm2
    movntq  [edi + 16], mm3
    movntq  [edi + 24], mm4
    movntq  [edi + 32], mm5
    movntq  [edi + 40], mm6
    movntq  [edi + 48], mm7
    movntq  [edi + 56], mm0

    add     esi, 64
    add     edi, 64
    dec     ecx
    jnz     .v_block64

.v_tail:
    mov     ecx, ebp
    test    ecx, ecx
    jz      .v_next_row                 ; no 8-byte tail blocks

.v_block8:
    movq    mm1, [esi]                  ; read from src
    movntq  [edi], mm1                  ; write to v_out

    add     esi, 8
    add     edi, 8
    dec     ecx
    jnz     .v_block8

.v_next_row:
    add     edi, ebx                    ; skip the destination row padding
    dec     eax
    jnz     near .v_row

.done:
    sfence                              ; order the non-temporal stores before
                                        ; anything that follows the return

    pop     ebp
    pop     edi
    pop     esi
    pop     ebx

    emms                                ; leave MMX state, FPU usable again

    ret
305 : | |||
306 : | |||
307 : | |||
;------------------------------------------------------------------------------
;
; void yuv_to_yv12_mmx(uint8_t *y_out,
;                      uint8_t *u_out,
;                      uint8_t *v_out,
;                      uint8_t *src,
;                      int width, int height, int stride);
;
; Copies three planes out of one sequentially read source buffer:
;   - height     rows of width     bytes  -> y_out (rows stride     apart)
;   - height / 2 rows of width / 2 bytes  -> u_out (rows stride / 2 apart)
;   - height / 2 rows of width / 2 bytes  -> v_out (rows stride / 2 apart)
; Source rows are contiguous; only the destination pointer is advanced by
; the row padding.
;
; Calling:  32-bit, all arguments are read from the stack, plain "ret"
;           (the caller removes the arguments).
; Clobbers: eax, ecx, edx, mm0-mm7, flags.  ebx, esi, edi, ebp are saved and
;           restored.  emms is executed before returning.
;
; Only movq loads/stores are used for the copy (no prefetch, no
; non-temporal stores).
;
; Attention: This code assumes that width is a multiple of 16
;
; Rows shorter than one 64-byte block and planes with zero rows are handled
; (the block loops are skipped instead of being entered with a zero count).
;
;------------------------------------------------------------------------------

cglobal yuv_to_yv12_mmx
yuv_to_yv12_mmx:

    push    ebx
    push    esi
    push    edi
    push    ebp

    ; Four registers are now on the stack: the return address is at
    ; [esp + 16] and the first argument (y_out) at [esp + 20].
    ;
    ; Register roles inside every plane copy:
    ;   esi = source pointer (runs on from plane to plane)
    ;   edi = destination pointer
    ;   eax = rows left
    ;   ebx = destination row padding (row pitch - row width)
    ;   edx = 64-byte blocks per row
    ;   ebp = tail blocks per row (16 bytes for Y, 8 bytes for U/V)
    ;   ecx = working block counter

    ;--------------------------------------------------------------------------
    ; Y plane
    ;--------------------------------------------------------------------------
    mov     eax, [esp + 40]             ; eax = height
    mov     ebx, [esp + 44]             ; ebx = stride
    mov     esi, [esp + 32]             ; esi = src
    mov     edi, [esp + 20]             ; edi = y_out
    mov     ecx, [esp + 36]             ; ecx = width

    sub     ebx, ecx                    ; ebx = stride - width

    mov     edx, ecx
    shr     edx, 6                      ; edx = width / 64
    mov     ebp, ecx
    and     ebp, 63
    shr     ebp, 4                      ; ebp = (width % 64) / 16

    test    eax, eax
    jz      near .y_done                ; zero rows: do not enter the loops

.y_row:
    mov     ecx, edx
    test    ecx, ecx
    jz      .y_tail                     ; row holds no full 64-byte block

.y_block64:
    movq    mm1, [esi]                  ; read from src
    movq    mm2, [esi + 8]
    movq    mm3, [esi + 16]
    movq    mm4, [esi + 24]
    movq    mm5, [esi + 32]
    movq    mm6, [esi + 40]
    movq    mm7, [esi + 48]
    movq    mm0, [esi + 56]

    movq    [edi], mm1                  ; write to y_out
    movq    [edi + 8], mm2
    movq    [edi + 16], mm3
    movq    [edi + 24], mm4
    movq    [edi + 32], mm5
    movq    [edi + 40], mm6
    movq    [edi + 48], mm7
    movq    [edi + 56], mm0

    add     esi, 64
    add     edi, 64
    dec     ecx
    jnz     .y_block64

.y_tail:
    mov     ecx, ebp
    test    ecx, ecx
    jz      .y_next_row                 ; width is a multiple of 64

.y_block16:
    movq    mm1, [esi]                  ; read from src
    movq    mm2, [esi + 8]

    movq    [edi], mm1                  ; write to y_out
    movq    [edi + 8], mm2

    add     esi, 16
    add     edi, 16
    dec     ecx
    jnz     .y_block16

.y_next_row:
    add     edi, ebx                    ; skip the destination row padding
    dec     eax
    jnz     near .y_row

.y_done:

    ;--------------------------------------------------------------------------
    ; U plane
    ;--------------------------------------------------------------------------
    mov     eax, [esp + 40]             ; eax = height
    mov     ebx, [esp + 44]             ; ebx = stride
    mov     ecx, [esp + 36]             ; ecx = width
    mov     edi, [esp + 24]             ; edi = u_out

    shr     ecx, 1                      ; ecx = width / 2
    shr     ebx, 1                      ; ebx = stride / 2
    shr     eax, 1                      ; eax = height / 2

    sub     ebx, ecx                    ; ebx = stride / 2 - width / 2

    mov     edx, ecx
    shr     edx, 6                      ; edx = (width / 2) / 64
    mov     ebp, ecx
    and     ebp, 63
    shr     ebp, 3                      ; ebp = ((width / 2) % 64) / 8

    test    eax, eax
    jz      near .done                  ; no chroma rows: skip U and V

.u_row:
    mov     ecx, edx
    test    ecx, ecx
    jz      .u_tail                     ; row holds no full 64-byte block

.u_block64:
    movq    mm1, [esi]                  ; read from src
    movq    mm2, [esi + 8]
    movq    mm3, [esi + 16]
    movq    mm4, [esi + 24]
    movq    mm5, [esi + 32]
    movq    mm6, [esi + 40]
    movq    mm7, [esi + 48]
    movq    mm0, [esi + 56]

    movq    [edi], mm1                  ; write to u_out
    movq    [edi + 8], mm2
    movq    [edi + 16], mm3
    movq    [edi + 24], mm4
    movq    [edi + 32], mm5
    movq    [edi + 40], mm6
    movq    [edi + 48], mm7
    movq    [edi + 56], mm0

    add     esi, 64
    add     edi, 64
    dec     ecx
    jnz     .u_block64

.u_tail:
    mov     ecx, ebp
    test    ecx, ecx
    jz      .u_next_row                 ; no 8-byte tail blocks

.u_block8:
    movq    mm1, [esi]                  ; read from src
    movq    [edi], mm1                  ; write to u_out

    add     esi, 8
    add     edi, 8
    dec     ecx
    jnz     .u_block8

.u_next_row:
    add     edi, ebx                    ; skip the destination row padding
    dec     eax
    jnz     near .u_row

    ;--------------------------------------------------------------------------
    ; V plane
    ;
    ; Same row geometry as the U plane: ebx, edx and ebp were not modified by
    ; the U copy and are reused as they are.  eax is non-zero here because
    ; the zero-row case already branched to .done above.
    ;--------------------------------------------------------------------------
    mov     eax, [esp + 40]             ; eax = height
    mov     edi, [esp + 28]             ; edi = v_out
    shr     eax, 1                      ; eax = height / 2

.v_row:
    mov     ecx, edx
    test    ecx, ecx
    jz      .v_tail                     ; row holds no full 64-byte block

.v_block64:
    movq    mm1, [esi]                  ; read from src
    movq    mm2, [esi + 8]
    movq    mm3, [esi + 16]
    movq    mm4, [esi + 24]
    movq    mm5, [esi + 32]
    movq    mm6, [esi + 40]
    movq    mm7, [esi + 48]
    movq    mm0, [esi + 56]

    movq    [edi], mm1                  ; write to v_out
    movq    [edi + 8], mm2
    movq    [edi + 16], mm3
    movq    [edi + 24], mm4
    movq    [edi + 32], mm5
    movq    [edi + 40], mm6
    movq    [edi + 48], mm7
    movq    [edi + 56], mm0

    add     esi, 64
    add     edi, 64
    dec     ecx
    jnz     .v_block64

.v_tail:
    mov     ecx, ebp
    test    ecx, ecx
    jz      .v_next_row                 ; no 8-byte tail blocks

.v_block8:
    movq    mm1, [esi]                  ; read from src
    movq    [edi], mm1                  ; write to v_out

    add     esi, 8
    add     edi, 8
    dec     ecx
    jnz     .v_block8

.v_next_row:
    add     edi, ebx                    ; skip the destination row padding
    dec     eax
    jnz     near .v_row

.done:
    pop     ebp
    pop     edi
    pop     esi
    pop     ebx

    emms                                ; leave MMX state, FPU usable again

    ret
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |