;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - MMX and XMM YV12->YV12 conversion -
; *
; *  Copyright(C) 2001 Michael Militzer <isibaar@xvid.org>
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: colorspace_yuv_mmx.asm,v 1.8 2008-11-11 20:46:24 Isibaar Exp $
; *
; ***************************************************************************/

BITS 32

%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
      %define ENDFUNC .endfunc
    %else
      global _%1
      %define %1 _%1
      %define ENDFUNC
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
      %define ENDFUNC .endfunc
    %else
      global %1
      %define ENDFUNC
    %endif
  %endif
%endmacro
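
; Expansion sketch (assuming PREFIX and MARK_FUNCS are both defined):
; "cglobal foo" then roughly expands to
;   global _foo:function foo.endfunc-foo
; i.e. the symbol is exported with a leading underscore and, on ELF targets,
; with a function-size annotation that ends at the .endfunc label which
; ENDFUNC places at the end of the function body.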

;=============================================================================
; Helper macros
;=============================================================================

;------------------------------------------------------------------------------
; PLANE_COPY ( DST, DST_STRIDE, SRC, SRC_STRIDE, WIDTH, HEIGHT, OPT )
; DST         dst buffer
; DST_STRIDE  dst stride
; SRC         src buffer
; SRC_STRIDE  src stride
; WIDTH       width
; HEIGHT      height
; OPT         0=plain mmx, 1=xmm
;------------------------------------------------------------------------------

%macro PLANE_COPY 7
  %define DST        %1
  %define DST_STRIDE %2
  %define SRC        %3
  %define SRC_STRIDE %4
  %define WIDTH      %5
  %define HEIGHT     %6
  %define OPT        %7

  mov esi, SRC
  mov edi, DST
  mov ebp, HEIGHT             ; $ebp$ = height
  mov eax, WIDTH
  mov ebx, eax                ; $ebx$ = width
  shr eax, 6                  ; $eax$ = width / 64
  and ebx, 63                 ; remainder = width % 64
  mov edx, ebx
  shr ebx, 4                  ; $ebx$ = remainder / 16
  and edx, 15                 ; $edx$ = remainder % 16

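; Worked example: a width of 184 pixels is handled as two 64-byte blocks,
; three 16-byte blocks and 8 single bytes (184 = 2*64 + 3*16 + 8), matching
; the three loops below.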
%%loop64_start_pc:
  push edi
  push esi
  mov ecx, eax                ; width64
  test eax, eax
  jz %%loop16_start_pc

%%loop64_pc:
%if OPT == 1                  ; xmm
  prefetchnta [esi + 64]      ; non temporal prefetch
  prefetchnta [esi + 96]
%endif
  movq mm1, [esi]             ; read from src
  movq mm2, [esi +  8]
  movq mm3, [esi + 16]
  movq mm4, [esi + 24]
  movq mm5, [esi + 32]
  movq mm6, [esi + 40]
  movq mm7, [esi + 48]
  movq mm0, [esi + 56]

%if OPT == 0                  ; plain mmx
  movq [edi     ], mm1        ; write to dst
  movq [edi +  8], mm2
  movq [edi + 16], mm3
  movq [edi + 24], mm4
  movq [edi + 32], mm5
  movq [edi + 40], mm6
  movq [edi + 48], mm7
  movq [edi + 56], mm0
%else
  movntq [edi     ], mm1      ; write to dst (non-temporal)
  movntq [edi +  8], mm2
  movntq [edi + 16], mm3
  movntq [edi + 24], mm4
  movntq [edi + 32], mm5
  movntq [edi + 40], mm6
  movntq [edi + 48], mm7
  movntq [edi + 56], mm0
%endif

  add esi, 64
  add edi, 64
  loop %%loop64_pc

%%loop16_start_pc:
  mov ecx, ebx                ; width16
  test ebx, ebx
  jz %%loop1_start_pc

%%loop16_pc:
  movq mm1, [esi]
  movq mm2, [esi + 8]
%if OPT == 0                  ; plain mmx
  movq [edi    ], mm1
  movq [edi + 8], mm2
%else
  movntq [edi    ], mm1
  movntq [edi + 8], mm2
%endif

  add esi, 16
  add edi, 16
  loop %%loop16_pc

%%loop1_start_pc:
  mov ecx, edx
  rep movsb

  pop esi
  pop edi
  add esi, SRC_STRIDE
  add edi, DST_STRIDE
  dec ebp
  jg near %%loop64_start_pc
%endmacro
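
; Invocation sketch (hypothetical; in this file PLANE_COPY is only
; instantiated from MAKE_YV12_TO_YV12 below). Copying one luma plane with
; the plain-MMX path would look like:
;
;   PLANE_COPY [y_dst], [y_dst_stride], [y_src], [y_src_stride], [width], [height], 0
;
; Note the macro clobbers eax, ebx, ecx, edx, esi, edi and ebp, so the
; caller is expected to have saved whichever of those it still needs.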

;------------------------------------------------------------------------------
; PLANE_FILL ( DST, DST_STRIDE, WIDTH, HEIGHT, OPT )
; DST         dst buffer
; DST_STRIDE  dst stride
; WIDTH       width
; HEIGHT      height
; OPT         0=plain mmx, 1=xmm
;------------------------------------------------------------------------------

%macro PLANE_FILL 5
  %define DST        %1
  %define DST_STRIDE %2
  %define WIDTH      %3
  %define HEIGHT     %4
  %define OPT        %5

  mov esi, WIDTH
  mov ebp, HEIGHT             ; $ebp$ = height
  mov edi, DST

  mov eax, 0x80808080
  mov ebx, esi
  shr esi, 6                  ; $esi$ = width / 64
  and ebx, 63                 ; $ebx$ = remainder = width % 64
  movd mm0, eax
  mov edx, ebx
  shr ebx, 4                  ; $ebx$ = remainder / 16
  and edx, 15                 ; $edx$ = remainder % 16
  punpckldq mm0, mm0

%%loop64_start_pf:
  push edi
  mov ecx, esi                ; width64
  test esi, esi
  jz %%loop16_start_pf

%%loop64_pf:
%if OPT == 0                  ; plain mmx
  movq [edi     ], mm0        ; write to dst
  movq [edi +  8], mm0
  movq [edi + 16], mm0
  movq [edi + 24], mm0
  movq [edi + 32], mm0
  movq [edi + 40], mm0
  movq [edi + 48], mm0
  movq [edi + 56], mm0
%else
  movntq [edi     ], mm0      ; write to dst (non-temporal)
  movntq [edi +  8], mm0
  movntq [edi + 16], mm0
  movntq [edi + 24], mm0
  movntq [edi + 32], mm0
  movntq [edi + 40], mm0
  movntq [edi + 48], mm0
  movntq [edi + 56], mm0
%endif

  add edi, 64
  loop %%loop64_pf

%%loop16_start_pf:
  mov ecx, ebx                ; width16
  test ebx, ebx
  jz %%loop1_start_pf

%%loop16_pf:
%if OPT == 0                  ; plain mmx
  movq [edi    ], mm0
  movq [edi + 8], mm0
%else
  movntq [edi    ], mm0
  movntq [edi + 8], mm0
%endif

  add edi, 16
  loop %%loop16_pf

%%loop1_start_pf:
  mov ecx, edx
  rep stosb

  pop edi
  add edi, DST_STRIDE
  dec ebp
  jg near %%loop64_start_pf
%endmacro
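
; PLANE_FILL writes 0x80 to every byte of the plane; for YV12 chroma, 0x80 is
; the neutral (grey) value, which is why .UVFill_0x80 below uses it when the
; source carries no U/V planes. Invocation sketch (hypothetical, with the
; same register-clobbering caveats as PLANE_COPY):
;
;   PLANE_FILL [u_dst], [uv_dst_stride], [width2], [height2], 0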

;------------------------------------------------------------------------------
; MAKE_YV12_TO_YV12( NAME, OPT )
; NAME        function name
; OPT         0=plain mmx, 1=xmm
;------------------------------------------------------------------------------

%macro MAKE_YV12_TO_YV12 2
  %define NAME %1
  %define OPT  %2

ALIGN 16
cglobal NAME
NAME:
  %define pushsize  16
  %define localsize 12

  %define vflip           esp + localsize + pushsize + 52
  %define height          esp + localsize + pushsize + 48
  %define width           esp + localsize + pushsize + 44
  %define uv_src_stride   esp + localsize + pushsize + 40
  %define y_src_stride    esp + localsize + pushsize + 36
  %define v_src           esp + localsize + pushsize + 32
  %define u_src           esp + localsize + pushsize + 28
  %define y_src           esp + localsize + pushsize + 24
  %define uv_dst_stride   esp + localsize + pushsize + 20
  %define y_dst_stride    esp + localsize + pushsize + 16
  %define v_dst           esp + localsize + pushsize + 12
  %define u_dst           esp + localsize + pushsize + 8
  %define y_dst           esp + localsize + pushsize + 4

  push ebx
  push esi
  push edi
  push ebp

  %define width2  esp + localsize - 4
  %define height2 esp + localsize - 8

  sub esp, localsize

  mov eax, [width]
  mov ebx, [height]
  shr eax, 1                  ; $eax$ = width / 2
  shr ebx, 1                  ; $ebx$ = height / 2
  mov [width2], eax
  mov [height2], ebx

  mov eax, [vflip]
  test eax, eax
  jz near .go

  ; flipping support
  mov eax, [height]
  mov esi, [y_src]
  mov ecx, [y_src_stride]
  sub eax, 1
  imul eax, ecx
  add esi, eax                ; y_src += (height-1) * y_src_stride
  neg ecx
  mov [y_src], esi
  mov [y_src_stride], ecx     ; y_src_stride = -y_src_stride

  mov eax, [height2]
  mov esi, [u_src]
  mov edi, [v_src]
  mov ecx, [uv_src_stride]
  test esi, esi
  jz .go
  test edi, edi
  jz .go
  sub eax, 1                  ; $eax$ = height2 - 1
  imul eax, ecx
  add esi, eax                ; u_src += (height2-1) * uv_src_stride
  add edi, eax                ; v_src += (height2-1) * uv_src_stride
  neg ecx
  mov [u_src], esi
  mov [v_src], edi
  mov [uv_src_stride], ecx    ; uv_src_stride = -uv_src_stride

.go:

  PLANE_COPY [y_dst], [y_dst_stride], [y_src], [y_src_stride], [width], [height], OPT

  mov eax, [u_src]
  or  eax, [v_src]
  jz near .UVFill_0x80
  PLANE_COPY [u_dst], [uv_dst_stride], [u_src], [uv_src_stride], [width2], [height2], OPT
  PLANE_COPY [v_dst], [uv_dst_stride], [v_src], [uv_src_stride], [width2], [height2], OPT

.Done_UVPlane:
  add esp, localsize
  pop ebp
  pop edi
  pop esi
  pop ebx
  ret

.UVFill_0x80:
  PLANE_FILL [u_dst], [uv_dst_stride], [width2], [height2], OPT
  PLANE_FILL [v_dst], [uv_dst_stride], [width2], [height2], OPT
  jmp near .Done_UVPlane
ENDFUNC
%endmacro
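
; Assumed C-side declaration of the emitted functions (a sketch; the argument
; order is inferred from the stack offsets defined above):
;
;   void yv12_to_yv12_mmx(uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
;                         int y_dst_stride, int uv_dst_stride,
;                         uint8_t *y_src, uint8_t *u_src, uint8_t *v_src,
;                         int y_src_stride, int uv_src_stride,
;                         int width, int height, int vflip);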

;=============================================================================
; Code
;=============================================================================

SECTION .text

MAKE_YV12_TO_YV12 yv12_to_yv12_mmx, 0

MAKE_YV12_TO_YV12 yv12_to_yv12_xmm, 1

%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif