1 |
;------------------------------------------------------------------------------ |
;/**************************************************************************** |
2 |
; |
; * |
3 |
; This file is part of XviD, a free MPEG-4 video encoder/decoder |
; * XVID MPEG-4 VIDEO CODEC |
4 |
; |
; * - MMX and XMM YV12->YV12 conversion - |
5 |
; This program is free software; you can redistribute it and/or modify it |
; * |
6 |
; under the terms of the GNU General Public License as published by |
; * Copyright(C) 2001-2008 Michael Militzer <michael@xvid.org> |
7 |
; the Free Software Foundation; either version 2 of the License, or |
; * |
8 |
; (at your option) any later version. |
; * This program is free software; you can redistribute it and/or modify it |
9 |
; |
; * under the terms of the GNU General Public License as published by |
10 |
; This program is distributed in the hope that it will be useful, but |
; * the Free Software Foundation; either version 2 of the License, or |
11 |
; WITHOUT ANY WARRANTY; without even the implied warranty of |
; * (at your option) any later version. |
12 |
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
; * |
13 |
; GNU General Public License for more details. |
; * This program is distributed in the hope that it will be useful, |
14 |
; |
; * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
; You should have received a copy of the GNU General Public License |
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
; along with this program; if not, write to the Free Software |
; * GNU General Public License for more details. |
17 |
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
; * |
18 |
; |
; * You should have received a copy of the GNU General Public License |
19 |
;------------------------------------------------------------------------------ |
; * along with this program; if not, write to the Free Software |
20 |
;------------------------------------------------------------------------------ |
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
; |
; * |
22 |
; yuv_to_yuv.asm, MMX optimized color conversion |
; * $Id: colorspace_yuv_mmx.asm,v 1.9 2008-11-26 01:04:34 Isibaar Exp $ |
23 |
; |
; * |
24 |
; Copyright (C) 2001 - Michael Militzer <isibaar@xvid.org> |
; ***************************************************************************/ |
25 |
; |
|
26 |
; For more information visit the XviD homepage: http://www.xvid.org |
%include "nasm.inc" |
27 |
; |
|
28 |
;------------------------------------------------------------------------------ |
;============================================================================= |
29 |
;------------------------------------------------------------------------------ |
; Helper macros |
30 |
; |
;============================================================================= |
|
; Revision history: |
|
|
; |
|
|
; 24.11.2001 initial version (Isibaar) |
|
|
; 23.07.2002 thread safe (edgomez) |
|
|
; |
|
|
; $Id: colorspace_yuv_mmx.asm,v 1.2 2003-02-15 15:22:18 edgomez Exp $ |
|
|
; |
|
|
;------------------------------------------------------------------------------ |
|
|
|
|
|
BITS 32 |
|
|
|
|
|
%macro cglobal 1 |
|
|
%ifdef PREFIX |
|
|
global _%1 |
|
|
%define %1 _%1 |
|
|
%else |
|
|
global %1 |
|
|
%endif |
|
|
%endmacro |
|
|
|
|
|
SECTION .text |
|
|
|
|
|
ALIGN 64 |
|
31 |
|
|
32 |
;------------------------------------------------------------------------------ |
;------------------------------------------------------------------------------ |
33 |
; PLANE_COPY ( DST, DST_DIF, SRC, SRC_DIF, WIDTH, HEIGHT, OPT ) |
; PLANE_COPY ( DST, DST_STRIDE, SRC, SRC_STRIDE, WIDTH, HEIGHT, OPT ) |
34 |
; DST dst buffer |
; DST dst buffer |
35 |
; DST_DIF dst stride difference (e.g. stride - width) |
; DST_STRIDE dst stride |
36 |
; SRC src destination buffer |
; SRC src destination buffer |
37 |
; SRC_DIF src stride difference (e.g. stride - width) |
; SRC_STRIDE src stride |
38 |
; WIDTH width |
; WIDTH width |
39 |
; HEIGHT height |
; HEIGHT height |
40 |
; OPT 0=plain mmx, 1=xmm |
; OPT 0=plain mmx, 1=xmm |
41 |
|
; |
42 |
|
; |
43 |
|
; Trashes: DST, SRC, WIDTH, HEIGHT, _EBX, _ECX, _EDX |
44 |
;------------------------------------------------------------------------------ |
;------------------------------------------------------------------------------ |
45 |
|
|
46 |
%macro PLANE_COPY 7 |
%macro PLANE_COPY 7 |
47 |
%define DST %1 |
%define DST %1 |
48 |
%define DST_DIF %2 |
%define DST_STRIDE %2 |
49 |
%define SRC %3 |
%define SRC %3 |
50 |
%define SRC_DIF %4 |
%define SRC_STRIDE %4 |
51 |
%define WIDTH %5 |
%define WIDTH %5 |
52 |
%define HEIGHT %6 |
%define HEIGHT %6 |
53 |
%define OPT %7 |
%define OPT %7 |
54 |
|
|
55 |
mov eax, WIDTH |
mov _EBX, WIDTH |
56 |
mov ebp, HEIGHT ; $ebp$ = height |
shr WIDTH, 6 ; $_EAX$ = width / 64 |
57 |
mov esi, SRC |
and _EBX, 63 ; remainder = width % 64 |
58 |
mov edi, DST |
mov _EDX, _EBX |
59 |
|
shr _EBX, 4 ; $_EBX$ = remainder / 16 |
60 |
mov ebx, eax |
and _EDX, 15 ; $_EDX$ = remainder % 16 |
61 |
shr eax, 6 ; $eax$ = width / 64 |
|
62 |
and ebx, 63 ; remainder = width % 64 |
%%loop64_start_pc: |
63 |
mov edx, ebx |
push DST |
64 |
shr ebx, 4 ; $ebx$ = reaminder / 16 |
push SRC |
65 |
and edx, 15 ; $edx$ = remainder % 16 |
|
66 |
|
mov _ECX, WIDTH ; width64 |
67 |
%%loop64_start |
test WIDTH, WIDTH |
68 |
or eax, eax |
jz %%loop16_start_pc |
69 |
jz %%loop16_start |
|
70 |
mov ecx, eax ; width64 |
%%loop64_pc: |
|
%%loop64: |
|
71 |
%if OPT == 1 ; xmm |
%if OPT == 1 ; xmm |
72 |
prefetchnta [esi + 64] ; non temporal prefetch |
prefetchnta [SRC + 64] ; non temporal prefetch |
73 |
prefetchnta [esi + 96] |
prefetchnta [SRC + 96] |
74 |
%endif |
%endif |
75 |
movq mm1, [esi] ; read from src |
movq mm1, [SRC ] ; read from src |
76 |
movq mm2, [esi + 8] |
movq mm2, [SRC + 8] |
77 |
movq mm3, [esi + 16] |
movq mm3, [SRC + 16] |
78 |
movq mm4, [esi + 24] |
movq mm4, [SRC + 24] |
79 |
movq mm5, [esi + 32] |
movq mm5, [SRC + 32] |
80 |
movq mm6, [esi + 40] |
movq mm6, [SRC + 40] |
81 |
movq mm7, [esi + 48] |
movq mm7, [SRC + 48] |
82 |
movq mm0, [esi + 56] |
movq mm0, [SRC + 56] |
83 |
|
|
84 |
%if OPT == 0 ; plain mmx |
%if OPT == 0 ; plain mmx |
85 |
movq [edi], mm1 ; write to y_out |
movq [DST ], mm1 ; write to y_out |
86 |
movq [edi + 8], mm2 |
movq [DST + 8], mm2 |
87 |
movq [edi + 16], mm3 |
movq [DST + 16], mm3 |
88 |
movq [edi + 24], mm4 |
movq [DST + 24], mm4 |
89 |
movq [edi + 32], mm5 |
movq [DST + 32], mm5 |
90 |
movq [edi + 40], mm6 |
movq [DST + 40], mm6 |
91 |
movq [edi + 48], mm7 |
movq [DST + 48], mm7 |
92 |
movq [edi + 56], mm0 |
movq [DST + 56], mm0 |
93 |
%else |
%else |
94 |
movntq [edi], mm1 ; write to y_out |
movntq [DST ], mm1 ; write to y_out |
95 |
movntq [edi + 8], mm2 |
movntq [DST + 8], mm2 |
96 |
movntq [edi + 16], mm3 |
movntq [DST + 16], mm3 |
97 |
movntq [edi + 24], mm4 |
movntq [DST + 24], mm4 |
98 |
movntq [edi + 32], mm5 |
movntq [DST + 32], mm5 |
99 |
movntq [edi + 40], mm6 |
movntq [DST + 40], mm6 |
100 |
movntq [edi + 48], mm7 |
movntq [DST + 48], mm7 |
101 |
movntq [edi + 56], mm0 |
movntq [DST + 56], mm0 |
102 |
%endif |
%endif |
103 |
|
|
104 |
add esi, 64 |
add SRC, 64 |
105 |
add edi, 64 |
add DST, 64 |
106 |
dec ecx |
loop %%loop64_pc |
107 |
jnz %%loop64 |
|
108 |
|
%%loop16_start_pc: |
109 |
|
mov _ECX, _EBX ; width16 |
110 |
%%loop16_start |
test _EBX, _EBX |
111 |
or ebx, ebx |
jz %%loop1_start_pc |
112 |
jz %%loop1_start |
|
113 |
mov ecx, ebx ; width16 |
%%loop16_pc: |
114 |
%%loop16: |
movq mm1, [SRC] |
115 |
movq mm1, [esi] |
movq mm2, [SRC + 8] |
|
movq mm2, [esi + 8] |
|
116 |
%if OPT == 0 ; plain mmx |
%if OPT == 0 ; plain mmx |
117 |
movq [edi], mm1 |
movq [DST], mm1 |
118 |
movq [edi + 8], mm2 |
movq [DST + 8], mm2 |
119 |
%else |
%else |
120 |
movntq [edi], mm1 |
movntq [DST], mm1 |
121 |
movntq [edi + 8], mm2 |
movntq [DST + 8], mm2 |
122 |
%endif |
%endif |
123 |
|
|
124 |
add esi, 16 |
add SRC, 16 |
125 |
add edi, 16 |
add DST, 16 |
126 |
dec ecx |
loop %%loop16_pc |
|
jnz %%loop16 |
|
|
|
|
127 |
|
|
128 |
%%loop1_start |
%%loop1_start_pc: |
129 |
mov ecx, edx |
mov _ECX, _EDX |
130 |
rep movsb |
rep movsb |
131 |
|
|
132 |
add esi, SRC_DIF |
pop SRC |
133 |
add edi, DST_DIF |
pop DST |
134 |
dec ebp |
|
135 |
jnz near %%loop64_start |
%ifdef ARCH_IS_X86_64 |
136 |
|
movsx _ECX, SRC_STRIDE |
137 |
|
add SRC, _ECX |
138 |
|
mov ecx, DST_STRIDE |
139 |
|
add DST, _ECX |
140 |
|
%else |
141 |
|
add SRC, SRC_STRIDE |
142 |
|
add DST, DST_STRIDE |
143 |
|
%endif |
144 |
|
dec HEIGHT |
145 |
|
jg near %%loop64_start_pc |
146 |
|
|
147 |
|
%undef DST |
148 |
|
%undef DST_STRIDE |
149 |
|
%undef SRC |
150 |
|
%undef SRC_STRIDE |
151 |
|
%undef WIDTH |
152 |
|
%undef HEIGHT |
153 |
|
%undef OPT |
154 |
%endmacro |
%endmacro |
155 |
|
|
156 |
;------------------------------------------------------------------------------ |
;------------------------------------------------------------------------------ |
157 |
|
; PLANE_FILL ( DST, DST_STRIDE, WIDTH, HEIGHT, OPT ) |
158 |
|
; DST dst buffer |
159 |
|
; DST_STRIDE dst stride |
160 |
|
; WIDTH width |
161 |
|
; HEIGHT height |
162 |
|
; OPT 0=plain mmx, 1=xmm |
163 |
|
; |
164 |
|
; Trashes: DST, WIDTH, HEIGHT, _EBX, _ECX, _EDX, _EAX |
165 |
|
;------------------------------------------------------------------------------ |
166 |
|
|
167 |
|
%macro PLANE_FILL 5 |
168 |
|
%define DST %1 |
169 |
|
%define DST_STRIDE %2 |
170 |
|
%define WIDTH %3 |
171 |
|
%define HEIGHT %4 |
172 |
|
%define OPT %5 |
173 |
|
|
174 |
|
mov _EAX, 0x80808080 |
175 |
|
mov _EBX, WIDTH |
176 |
|
shr WIDTH, 6 ; $_ESI$ = width / 64 |
177 |
|
and _EBX, 63 ; _EBX = remainder = width % 64 |
178 |
|
movd mm0, eax |
179 |
|
mov _EDX, _EBX |
180 |
|
shr _EBX, 4 ; $_EBX$ = remainder / 16 |
181 |
|
and _EDX, 15 ; $_EDX$ = remainder % 16 |
182 |
|
punpckldq mm0, mm0 |
183 |
|
|
184 |
|
%%loop64_start_pf: |
185 |
|
push DST |
186 |
|
mov _ECX, WIDTH ; width64 |
187 |
|
test WIDTH, WIDTH |
188 |
|
jz %%loop16_start_pf |
189 |
|
|
190 |
|
%%loop64_pf: |
191 |
|
|
192 |
|
%if OPT == 0 ; plain mmx |
193 |
|
movq [DST ], mm0 ; write to y_out |
194 |
|
movq [DST + 8], mm0 |
195 |
|
movq [DST + 16], mm0 |
196 |
|
movq [DST + 24], mm0 |
197 |
|
movq [DST + 32], mm0 |
198 |
|
movq [DST + 40], mm0 |
199 |
|
movq [DST + 48], mm0 |
200 |
|
movq [DST + 56], mm0 |
201 |
|
%else |
202 |
|
movntq [DST ], mm0 ; write to y_out |
203 |
|
movntq [DST + 8], mm0 |
204 |
|
movntq [DST + 16], mm0 |
205 |
|
movntq [DST + 24], mm0 |
206 |
|
movntq [DST + 32], mm0 |
207 |
|
movntq [DST + 40], mm0 |
208 |
|
movntq [DST + 48], mm0 |
209 |
|
movntq [DST + 56], mm0 |
210 |
|
%endif |
211 |
|
|
212 |
|
add DST, 64 |
213 |
|
loop %%loop64_pf |
214 |
|
|
215 |
|
%%loop16_start_pf: |
216 |
|
mov _ECX, _EBX ; width16 |
217 |
|
test _EBX, _EBX |
218 |
|
jz %%loop1_start_pf |
219 |
|
|
220 |
|
%%loop16_pf: |
221 |
|
%if OPT == 0 ; plain mmx |
222 |
|
movq [DST ], mm0 |
223 |
|
movq [DST + 8], mm0 |
224 |
|
%else |
225 |
|
movntq [DST ], mm0 |
226 |
|
movntq [DST + 8], mm0 |
227 |
|
%endif |
228 |
|
|
229 |
|
add DST, 16 |
230 |
|
loop %%loop16_pf |
231 |
|
|
232 |
|
%%loop1_start_pf: |
233 |
|
mov _ECX, _EDX |
234 |
|
rep stosb |
235 |
|
|
236 |
|
pop DST |
237 |
|
|
238 |
|
%ifdef ARCH_IS_X86_64 |
239 |
|
mov ecx, DST_STRIDE |
240 |
|
add DST, _ECX |
241 |
|
%else |
242 |
|
add DST, DST_STRIDE |
243 |
|
%endif |
244 |
|
|
245 |
|
dec HEIGHT |
246 |
|
jg near %%loop64_start_pf |
247 |
|
|
248 |
|
%undef DST |
249 |
|
%undef DST_STRIDE |
250 |
|
%undef WIDTH |
251 |
|
%undef HEIGHT |
252 |
|
%undef OPT |
253 |
|
%endmacro |
254 |
|
|
255 |
;------------------------------------------------------------------------------ |
;------------------------------------------------------------------------------ |
256 |
; MAKE_YV12_TO_YV12( NAME, OPT ) |
; MAKE_YV12_TO_YV12( NAME, OPT ) |
265 |
;------------------------------------------------------------------------------ |
;------------------------------------------------------------------------------ |
266 |
%macro MAKE_YV12_TO_YV12 2 |
%macro MAKE_YV12_TO_YV12 2 |
267 |
%define NAME %1 |
%define NAME %1 |
268 |
%define OPT %2 |
%define XMM_OPT %2 |
269 |
align 16 |
ALIGN SECTION_ALIGN |
270 |
cglobal NAME |
cglobal NAME |
271 |
NAME |
NAME: |
272 |
%define pushsize 16 |
|
273 |
%define localsize 24 |
push _EBX ; _ESP + localsize + 3*PTR_SIZE |
274 |
|
|
275 |
%define vflip esp + localsize + pushsize + 52 |
%define localsize 2*4 |
|
%define height esp + localsize + pushsize + 48 |
|
|
%define width esp + localsize + pushsize + 44 |
|
|
%define uv_src_stride esp + localsize + pushsize + 40 |
|
|
%define y_src_stride esp + localsize + pushsize + 36 |
|
|
%define v_src esp + localsize + pushsize + 32 |
|
|
%define u_src esp + localsize + pushsize + 28 |
|
|
%define y_src esp + localsize + pushsize + 24 |
|
|
%define uv_dst_stride esp + localsize + pushsize + 20 |
|
|
%define y_dst_stride esp + localsize + pushsize + 16 |
|
|
%define v_dst esp + localsize + pushsize + 12 |
|
|
%define u_dst esp + localsize + pushsize + 8 |
|
|
%define y_dst esp + localsize + pushsize + 4 |
|
|
%define _ip esp + localsize + pushsize + 0 |
|
|
|
|
|
push ebx ; esp + localsize + 16 |
|
|
push esi ; esp + localsize + 8 |
|
|
push edi ; esp + localsize + 4 |
|
|
push ebp ; esp + localsize + 0 |
|
|
|
|
|
%define width2 esp + localsize - 4 |
|
|
%define height2 esp + localsize - 8 |
|
|
%define y_src_dif esp + localsize - 12 |
|
|
%define y_dst_dif esp + localsize - 16 |
|
|
%define uv_src_dif esp + localsize - 20 |
|
|
%define uv_dst_dif esp + localsize - 24 |
|
276 |
|
|
277 |
sub esp, localsize |
%ifdef ARCH_IS_X86_64 |
278 |
|
|
279 |
|
%ifndef WINDOWS |
280 |
|
%define pushsize 2*PTR_SIZE |
281 |
|
%define shadow 0 |
282 |
|
%else |
283 |
|
%define pushsize 4*PTR_SIZE |
284 |
|
%define shadow 32 + 16 |
285 |
|
%endif |
286 |
|
|
287 |
|
%define prm_vflip dword [_ESP + localsize + pushsize + shadow + 7*PTR_SIZE] |
288 |
|
%define prm_height dword [_ESP + localsize + pushsize + shadow + 6*PTR_SIZE] |
289 |
|
%define prm_width dword [_ESP + localsize + pushsize + shadow + 5*PTR_SIZE] |
290 |
|
%define prm_uv_src_stride dword [_ESP + localsize + pushsize + shadow + 4*PTR_SIZE] |
291 |
|
%define prm_y_src_stride dword [_ESP + localsize + pushsize + shadow + 3*PTR_SIZE] |
292 |
|
%define prm_v_src [_ESP + localsize + pushsize + shadow + 2*PTR_SIZE] |
293 |
|
%define prm_u_src [_ESP + localsize + pushsize + shadow + 1*PTR_SIZE] |
294 |
|
|
295 |
|
%ifdef WINDOWS |
296 |
|
push _ESI ; _ESP + localsize + 2*PTR_SIZE |
297 |
|
push _EDI ; _ESP + localsize + 1*PTR_SIZE |
298 |
|
push _EBP ; _ESP + localsize + 0*PTR_SIZE |
299 |
|
|
300 |
|
sub _ESP, localsize |
301 |
|
|
302 |
|
%define prm_y_src _ESI |
303 |
|
%define prm_uv_dst_stride TMP0d |
304 |
|
%define prm_y_dst_stride prm4d |
305 |
|
%define prm_v_dst prm3 |
306 |
|
%define prm_u_dst TMP1 |
307 |
|
%define prm_y_dst _EDI |
308 |
|
|
309 |
|
mov _EDI, prm1 |
310 |
|
mov TMP1, prm2 |
311 |
|
|
312 |
|
mov _ESI, [_ESP + localsize + pushsize + shadow - 1*PTR_SIZE] |
313 |
|
mov TMP0d, dword [_ESP + localsize + pushsize + shadow - 2*PTR_SIZE] |
314 |
|
|
315 |
|
%else |
316 |
|
push _EBP ; _ESP + localsize + 0*PTR_SIZE |
317 |
|
|
318 |
|
sub _ESP, localsize |
319 |
|
|
320 |
|
%define prm_y_src _ESI |
321 |
|
%define prm_uv_dst_stride prm5d |
322 |
|
%define prm_y_dst_stride TMP1d |
323 |
|
%define prm_v_dst prm6 |
324 |
|
%define prm_u_dst TMP0 |
325 |
|
%define prm_y_dst _EDI |
326 |
|
|
327 |
|
mov TMP0, prm2 |
328 |
|
mov _ESI, prm6 |
329 |
|
|
330 |
|
mov prm6, prm3 |
331 |
|
mov TMP1d, prm4d |
332 |
|
%endif |
333 |
|
|
334 |
mov eax, [width] |
%define _ip _ESP + localsize + pushsize + 0 |
335 |
mov ebx, [height] |
|
336 |
|
%else |
337 |
|
|
338 |
|
%define pushsize 4*PTR_SIZE |
339 |
|
|
340 |
|
%define prm_vflip [_ESP + localsize + pushsize + 13*PTR_SIZE] |
341 |
|
%define prm_height [_ESP + localsize + pushsize + 12*PTR_SIZE] |
342 |
|
%define prm_width [_ESP + localsize + pushsize + 11*PTR_SIZE] |
343 |
|
%define prm_uv_src_stride [_ESP + localsize + pushsize + 10*PTR_SIZE] |
344 |
|
%define prm_y_src_stride [_ESP + localsize + pushsize + 9*PTR_SIZE] |
345 |
|
%define prm_v_src [_ESP + localsize + pushsize + 8*PTR_SIZE] |
346 |
|
%define prm_u_src [_ESP + localsize + pushsize + 7*PTR_SIZE] |
347 |
|
|
348 |
|
%define prm_y_src _ESI |
349 |
|
%define prm_uv_dst_stride [_ESP + localsize + pushsize + 5*PTR_SIZE] |
350 |
|
%define prm_y_dst_stride [_ESP + localsize + pushsize + 4*PTR_SIZE] |
351 |
|
%define prm_v_dst [_ESP + localsize + pushsize + 3*PTR_SIZE] |
352 |
|
%define prm_u_dst [_ESP + localsize + pushsize + 2*PTR_SIZE] |
353 |
|
%define prm_y_dst _EDI |
354 |
|
|
355 |
|
%define _ip _ESP + localsize + pushsize + 0 |
356 |
|
|
357 |
|
push _ESI ; _ESP + localsize + 2*PTR_SIZE |
358 |
|
push _EDI ; _ESP + localsize + 1*PTR_SIZE |
359 |
|
push _EBP ; _ESP + localsize + 0*PTR_SIZE |
360 |
|
|
361 |
|
sub _ESP, localsize |
362 |
|
|
363 |
|
mov _ESI, [_ESP + localsize + pushsize + 6*PTR_SIZE] |
364 |
|
mov _EDI, [_ESP + localsize + pushsize + 1*PTR_SIZE] |
365 |
|
|
366 |
|
%endif |
367 |
|
|
368 |
|
%define width2 dword [_ESP + localsize - 1*4] |
369 |
|
%define height2 dword [_ESP + localsize - 2*4] |
370 |
|
|
371 |
|
mov eax, prm_width |
372 |
|
mov ebx, prm_height |
373 |
shr eax, 1 ; calculate widht/2, heigh/2 |
shr eax, 1 ; calculate widht/2, heigh/2 |
374 |
shr ebx, 1 |
shr ebx, 1 |
375 |
mov [width2], eax |
mov width2, eax |
376 |
mov [height2], ebx |
mov height2, ebx |
377 |
|
|
378 |
mov ebp, [vflip] |
mov eax, prm_vflip |
379 |
or ebp, ebp |
test eax, eax |
380 |
jz near .dont_flip |
jz near .go |
381 |
|
|
382 |
; flipping support |
; flipping support |
383 |
mov eax, [height] |
mov eax, prm_height |
384 |
mov esi, [y_src] |
mov ecx, prm_y_src_stride |
385 |
mov edx, [y_src_stride] |
sub eax, 1 |
386 |
push edx |
imul eax, ecx |
387 |
mul edx |
add _ESI, _EAX ; y_src += (height-1) * y_src_stride |
388 |
pop edx |
neg ecx |
389 |
add esi, eax ; y_src += (height-1) * y_src_stride |
mov prm_y_src_stride, ecx ; y_src_stride = -y_src_stride |
390 |
neg edx |
|
391 |
mov [y_src], esi |
mov eax, height2 |
392 |
mov [y_src_stride], edx ; y_src_stride = -y_src_stride |
mov _EDX, prm_u_src |
393 |
|
mov _EBP, prm_v_src |
394 |
mov eax, [height2] |
mov ecx, prm_uv_src_stride |
395 |
mov esi, [u_src] |
test _EDX, _EDX |
396 |
mov edi, [v_src] |
jz .go |
397 |
mov edx, [uv_src_stride] |
test _EBP, _EBP |
398 |
sub eax, 1 ; ebp = height2 - 1 |
jz .go |
399 |
push edx |
sub eax, 1 ; _EAX = height2 - 1 |
400 |
mul edx |
imul eax, ecx |
401 |
pop edx |
add _EDX, _EAX ; u_src += (height2-1) * uv_src_stride |
402 |
add esi, eax ; u_src += (height2-1) * uv_src_stride |
add _EBP, _EAX ; v_src += (height2-1) * uv_src_stride |
403 |
add edi, eax ; v_src += (height2-1) * uv_src_stride |
neg ecx |
404 |
neg edx |
mov prm_u_src, _EDX |
405 |
mov [u_src], esi |
mov prm_v_src, _EBP |
406 |
mov [v_src], edi |
mov prm_uv_src_stride, ecx ; uv_src_stride = -uv_src_stride |
407 |
mov [uv_src_stride], edx ; uv_src_stride = -uv_src_stride |
|
408 |
|
.go: |
409 |
.dont_flip |
mov eax, prm_width |
410 |
|
mov ebp, prm_height |
411 |
mov eax, [y_src_stride] |
PLANE_COPY _EDI, prm_y_dst_stride, _ESI, prm_y_src_stride, _EAX, _EBP, XMM_OPT |
412 |
mov ebx, [y_dst_stride] |
|
413 |
mov ecx, [uv_src_stride] |
mov _EAX, prm_u_src |
414 |
mov edx, [uv_dst_stride] |
or _EAX, prm_v_src |
415 |
sub eax, [width] |
jz near .UVFill_0x80 |
416 |
sub ebx, [width] |
|
417 |
sub ecx, [width2] |
mov eax, width2 |
418 |
sub edx, [width2] |
mov ebp, height2 |
419 |
mov [y_src_dif], eax ; y_src_dif = y_src_stride - width |
mov _ESI, prm_u_src |
420 |
mov [y_dst_dif], ebx ; y_dst_dif = y_dst_stride - width |
mov _EDI, prm_u_dst |
421 |
mov [uv_src_dif], ecx ; uv_src_dif = uv_src_stride - width2 |
PLANE_COPY _EDI, prm_uv_dst_stride, _ESI, prm_uv_src_stride, _EAX, _EBP, XMM_OPT |
422 |
mov [uv_dst_dif], edx ; uv_dst_dif = uv_dst_stride - width2 |
|
423 |
|
mov eax, width2 |
424 |
PLANE_COPY [y_dst], [y_dst_dif], [y_src], [y_src_dif], [width], [height], OPT |
mov ebp, height2 |
425 |
PLANE_COPY [u_dst], [uv_dst_dif], [u_src], [uv_src_dif], [width2], [height2], OPT |
mov _ESI, prm_v_src |
426 |
PLANE_COPY [v_dst], [uv_dst_dif], [v_src], [uv_src_dif], [width2], [height2], OPT |
mov _EDI, prm_v_dst |
427 |
|
PLANE_COPY _EDI, prm_uv_dst_stride, _ESI, prm_uv_src_stride, _EAX, _EBP, XMM_OPT |
428 |
add esp, localsize |
|
429 |
pop ebp |
.Done_UVPlane: |
430 |
pop edi |
add _ESP, localsize |
431 |
pop esi |
|
432 |
pop ebx |
pop _EBP |
433 |
|
%ifndef ARCH_IS_X86_64 |
434 |
|
pop _EDI |
435 |
|
pop _ESI |
436 |
|
%else |
437 |
|
%ifdef WINDOWS |
438 |
|
pop _EDI |
439 |
|
pop _ESI |
440 |
|
%endif |
441 |
|
%endif |
442 |
|
pop _EBX |
443 |
|
|
444 |
ret |
ret |
445 |
|
|
446 |
|
.UVFill_0x80: |
447 |
|
|
448 |
|
mov esi, width2 |
449 |
|
mov ebp, height2 |
450 |
|
mov _EDI, prm_u_dst |
451 |
|
PLANE_FILL _EDI, prm_uv_dst_stride, _ESI, _EBP, XMM_OPT |
452 |
|
|
453 |
|
mov esi, width2 |
454 |
|
mov ebp, height2 |
455 |
|
mov _EDI, prm_v_dst |
456 |
|
PLANE_FILL _EDI, prm_uv_dst_stride, _ESI, _EBP, XMM_OPT |
457 |
|
|
458 |
|
jmp near .Done_UVPlane |
459 |
|
|
460 |
|
ENDFUNC |
461 |
|
|
462 |
|
%undef NAME |
463 |
|
%undef XMM_OPT |
464 |
%endmacro |
%endmacro |
|
;------------------------------------------------------------------------------ |
|
465 |
|
|
466 |
|
;============================================================================= |
467 |
|
; Code |
468 |
|
;============================================================================= |
469 |
|
|
470 |
|
SECTION .rotext align=SECTION_ALIGN |
471 |
|
|
472 |
MAKE_YV12_TO_YV12 yv12_to_yv12_mmx, 0 |
MAKE_YV12_TO_YV12 yv12_to_yv12_mmx, 0 |
473 |
|
|
474 |
MAKE_YV12_TO_YV12 yv12_to_yv12_xmm, 1 |
MAKE_YV12_TO_YV12 yv12_to_yv12_xmm, 1 |
475 |
|
|
476 |
|
%ifidn __OUTPUT_FORMAT__,elf |
477 |
|
section ".note.GNU-stack" noalloc noexec nowrite progbits |
478 |
|
%endif |
479 |
|
|