Annotation of /trunk/xvidcore/src/image/x86_asm/yuyv_to_yv12_mmx.asm

Revision 434 - (view) (download)

1 :	chl	434	;/*****************************************************************************
2 :	Isibaar	3	; *
3 :	chl	434	; * XVID MPEG-4 VIDEO CODEC
4 :			; * mmx yuyv/uyvy to yuv planar conversion
5 :	Isibaar	3	; *
6 :	chl	434	; * Copyright(C) 2002 Peter Ross <pross@xvid.org>
7 :	Isibaar	3	; *
8 :	chl	434	; * This program is an implementation of a part of one or more MPEG-4
9 :			; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
10 :			; * to use this software module in hardware or software products are
11 :			; * advised that its use may infringe existing patents or copyrights, and
12 :			; * any such use would be at such party's own risk. The original
13 :			; * developer of this software module and his/her company, and subsequent
14 :			; * editors and their companies, will have no liability for use of this
15 :			; * software or modifications or derivatives thereof.
16 :	Isibaar	3	; *
17 :	chl	434	; * This program is free software; you can redistribute it and/or modify
18 :			; * it under the terms of the GNU General Public License as published by
19 :			; * the Free Software Foundation; either version 2 of the License, or
20 :			; * (at your option) any later version.
21 :	Isibaar	3	; *
22 :	chl	434	; * This program is distributed in the hope that it will be useful,
23 :			; * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 :			; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 :			; * GNU General Public License for more details.
26 :	Isibaar	3	; *
27 :	chl	434	; * You should have received a copy of the GNU General Public License
28 :			; * along with this program; if not, write to the Free Software
29 :			; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 :	Isibaar	3	; *
31 :	chl	434	; ****************************************************************************/
32 :	Isibaar	3
bits 32


section .data

; cglobal <name>
;   Exports <name> as a global symbol. When PREFIX is defined, the symbol
;   is exported with a leading underscore and <name> is redefined to the
;   underscored form, so later uses of the plain name (including the label
;   that defines it) refer to the exported symbol.
%macro cglobal 1
	%ifndef PREFIX
		global %1
	%else
		global _%1
		%define %1 _%1
	%endif
%endmacro
46 :
section .data

align 16

;===========================================================================
; masks for extracting yuv components
;===========================================================================
; Packed source layout is 2 bytes per pixel (y u y v y u y v ... for yuyv).
;   mask1 keeps the even-numbered bytes (0, 2, 4, 6) of a quadword
;   mask2 keeps the odd-numbered bytes  (1, 3, 5, 7) of a quadword

mask1:	times 4 db 0xff, 0x00
mask2:	times 4 db 0x00, 0xff

60 :
61 :
section .text

;===========================================================================
;
; void yuyv_to_yv12_mmx(uint8_t * const y_out,
;                       uint8_t * const u_out,
;                       uint8_t * const v_out,
;                       const uint8_t * const src,
;                       const uint32_t width,
;                       const uint32_t height,
;                       const uint32_t stride);
;
; Converts packed YUYV (bytes: y0 u y1 v) into separate y/u/v planes.
; Two source rows are consumed per pass of the outer loop; their chroma is
; averaged as (row0 + row1) >> 1 (truncating).
;
; Arguments are read from the stack (32-bit, all passed on the stack):
;   src rows are 2*width bytes apart
;   y_out rows are 'stride' bytes apart
;   u_out/v_out advance by stride/2 bytes per pair of source rows
;
; Preserves: ebx, ecx, esi, edi, ebp
; Clobbers:  eax, edx, mm0-mm7, flags (emms is executed before returning)
;
; width must be multiple of 8
; does not flip
; ~30% faster than plain c (original author's figure)
;
; If width < 8 or height < 2 there is no complete 8x2 block to convert and
; the function returns without touching the output planes.
;
;===========================================================================

align 16
cglobal yuyv_to_yv12_mmx
yuyv_to_yv12_mmx:

	push ebx
	push ecx
	push esi
	push edi
	push ebp			; STACK BASE = 20

	; some global constants (kept on the stack, offsets below are the
	; final ones, i.e. valid once STACK BASE = 36)

	mov ecx, [esp + 20 + 20]	; width
	mov eax, [esp + 20 + 28]	; stride
	sub eax, ecx			; eax = y_dif = stride - width
	mov edx, eax
	add edx, [esp + 20 + 28]	; edx = y_dif + stride
	push edx			; [esp + 12] = y_dif + stride

	shr eax, 1
	push eax			; [esp + 8] = uv_dif = y_dif / 2

	shr ecx, 3
	push ecx			; [esp + 4] = width/8

	sub esp, 4			; [esp + 0] = tmp_height_counter
					; STACK BASE = 36

	movq mm6, [mask1]		; mm6 = even-byte mask
	movq mm7, [mask2]		; mm7 = odd-byte mask

	mov edi, [esp + 36 + 4]		; y_out
	mov ebx, [esp + 36 + 8]		; u_out
	mov edx, [esp + 36 + 12]	; v_out
	mov esi, [esp + 36 + 16]	; src

	mov eax, [esp + 36 + 20]	; width
	mov ebp, [esp + 36 + 24]	; height
	mov ecx, [esp + 36 + 28]	; ecx = stride
	shr ebp, 1			; ebp = height /= 2
	add eax, eax			; eax = 2 * width (src row size in bytes)

	; Both loops are dec/jnz loops: a zero start count would wrap around
	; and run ~2^32 times, so bail out when there is nothing to do.
	test ebp, ebp			; height/2 == 0 ?
	jz near .done
	cmp dword [esp + 4], 0		; width/8 == 0 ?
	je near .done

.yloop:
	mov [esp], ebp			; park row-pair counter, ebp is reused
	mov ebp, [esp + 4]		; ebp = width/8
.xloop:
	movq mm2, [esi]			; y 1st row
	movq mm3, [esi + 8]
	movq mm0, mm2
	movq mm1, mm3
	pand mm2, mm6			; mask1: keep y bytes (as words)
	pand mm3, mm6			; mask1
	pand mm0, mm7			; mask2: keep u/v bytes
	pand mm1, mm7			; mask2
	packuswb mm2, mm3		; 8 y bytes
	psrlq mm0, 8			; u/v moved down into the low byte of each word
	psrlq mm1, 8
	movq [edi], mm2

	movq mm4, [esi + eax]		; y 2nd row
	movq mm5, [esi + eax + 8]
	movq mm2, mm4
	movq mm3, mm5
	pand mm4, mm6			; mask1
	pand mm5, mm6			; mask1
	pand mm2, mm7			; mask2
	pand mm3, mm7			; mask2
	packuswb mm4, mm5
	psrlq mm2, 8
	psrlq mm3, 8
	movq [edi + ecx], mm4		; y row below (edi + stride)

	paddw mm0, mm2			; uv avg 1st & 2nd
	paddw mm1, mm3
	psrlw mm0, 1
	psrlw mm1, 1
	packuswb mm0, mm1		; bytes: u v u v u v u v
	movq mm2, mm0
	pand mm0, mm6			; mask1: u as words
	pand mm2, mm7			; mask2: v in high bytes
	packuswb mm0, mm0		; 4 u bytes in the low dword
	psrlq mm2, 8			; v as words
	movd [ebx], mm0
	packuswb mm2, mm2		; 4 v bytes in the low dword
	movd [edx], mm2

	add esi, 16			; 8 source pixels
	add edi, 8
	add ebx, 4
	add edx, 4
	dec ebp
	jnz near .xloop

	mov ebp, [esp]			; restore row-pair counter

	add esi, eax			; += width2 (skip the 2nd row already read)
	add edi, [esp + 12]		; += y_dif + stride
	add ebx, [esp + 8]		; += uv_dif
	add edx, [esp + 8]		; += uv_dif

	dec ebp
	jnz near .yloop

.done:
	emms

	add esp, 16			; drop the four local slots
	pop ebp
	pop edi
	pop esi
	pop ecx
	pop ebx

	ret
193 :
194 :
;===========================================================================
;
; void uyvy_to_yv12_mmx(uint8_t * const y_out,
;                       uint8_t * const u_out,
;                       uint8_t * const v_out,
;                       const uint8_t * const src,
;                       const uint32_t width,
;                       const uint32_t height,
;                       const uint32_t stride);
;
; Converts packed UYVY (bytes: u y0 v y1) into separate y/u/v planes.
; Two source rows are consumed per pass of the outer loop; their chroma is
; averaged as (row0 + row1) >> 1 (truncating).
;
; Arguments are read from the stack (32-bit, all passed on the stack):
;   src rows are 2*width bytes apart
;   y_out rows are 'stride' bytes apart
;   u_out/v_out advance by stride/2 bytes per pair of source rows
;
; Preserves: ebx, ecx, esi, edi, ebp
; Clobbers:  eax, edx, mm0-mm7, flags (emms is executed before returning)
;
; width must be multiple of 8
; does not flip
; ~30% faster than plain c (original author's figure)
;
; If width < 8 or height < 2 there is no complete 8x2 block to convert and
; the function returns without touching the output planes.
;
;===========================================================================

align 16
cglobal uyvy_to_yv12_mmx
uyvy_to_yv12_mmx:

	push ebx
	push ecx
	push esi
	push edi
	push ebp			; STACK BASE = 20

	; some global constants (kept on the stack, offsets below are the
	; final ones, i.e. valid once STACK BASE = 36)

	mov ecx, [esp + 20 + 20]	; width
	mov eax, [esp + 20 + 28]	; stride
	sub eax, ecx			; eax = y_dif = stride - width
	mov edx, eax
	add edx, [esp + 20 + 28]	; edx = y_dif + stride
	push edx			; [esp + 12] = y_dif + stride

	shr eax, 1
	push eax			; [esp + 8] = uv_dif = y_dif / 2

	shr ecx, 3
	push ecx			; [esp + 4] = width/8

	sub esp, 4			; [esp + 0] = tmp_height_counter
					; STACK BASE = 36

	movq mm6, [mask1]		; mm6 = even-byte mask
	movq mm7, [mask2]		; mm7 = odd-byte mask

	mov edi, [esp + 36 + 4]		; y_out
	mov ebx, [esp + 36 + 8]		; u_out
	mov edx, [esp + 36 + 12]	; v_out
	mov esi, [esp + 36 + 16]	; src

	mov eax, [esp + 36 + 20]	; width
	mov ebp, [esp + 36 + 24]	; height
	mov ecx, [esp + 36 + 28]	; ecx = stride
	shr ebp, 1			; ebp = height /= 2
	add eax, eax			; eax = 2 * width (src row size in bytes)

	; Both loops are dec/jnz loops: a zero start count would wrap around
	; and run ~2^32 times, so bail out when there is nothing to do.
	test ebp, ebp			; height/2 == 0 ?
	jz near .done
	cmp dword [esp + 4], 0		; width/8 == 0 ?
	je near .done

.yloop:
	mov [esp], ebp			; park row-pair counter, ebp is reused
	mov ebp, [esp + 4]		; ebp = width/8
.xloop:
	movq mm2, [esi]			; y 1st row
	movq mm3, [esi + 8]
	movq mm0, mm2
	movq mm1, mm3
	pand mm2, mm7			; mask2: keep y bytes (odd positions)
	pand mm3, mm7			; mask2
	psrlq mm2, 8			; y moved down into the low byte of each word
	psrlq mm3, 8
	pand mm0, mm6			; mask1: keep u/v bytes (as words)
	pand mm1, mm6			; mask1
	packuswb mm2, mm3		; 8 y bytes
	movq [edi], mm2

	movq mm4, [esi + eax]		; y 2nd row
	movq mm5, [esi + eax + 8]
	movq mm2, mm4
	movq mm3, mm5
	pand mm4, mm7			; mask2
	pand mm5, mm7			; mask2
	psrlq mm4, 8
	psrlq mm5, 8
	pand mm2, mm6			; mask1
	pand mm3, mm6			; mask1
	packuswb mm4, mm5
	movq [edi + ecx], mm4		; y row below (edi + stride)

	paddw mm0, mm2			; uv avg 1st & 2nd
	paddw mm1, mm3
	psrlw mm0, 1
	psrlw mm1, 1
	packuswb mm0, mm1		; bytes: u v u v u v u v
	movq mm2, mm0
	pand mm0, mm6			; mask1: u as words
	pand mm2, mm7			; mask2: v in high bytes
	packuswb mm0, mm0		; 4 u bytes in the low dword
	psrlq mm2, 8			; v as words
	movd [ebx], mm0
	packuswb mm2, mm2		; 4 v bytes in the low dword
	movd [edx], mm2

	add esi, 16			; 8 source pixels
	add edi, 8
	add ebx, 4
	add edx, 4
	dec ebp
	jnz near .xloop

	mov ebp, [esp]			; restore row-pair counter

	add esi, eax			; += width2 (skip the 2nd row already read)
	add edi, [esp + 12]		; += y_dif + stride
	add ebx, [esp + 8]		; += uv_dif
	add edx, [esp + 8]		; += uv_dif

	dec ebp
	jnz near .yloop

.done:
	emms

	add esp, 16			; drop the four local slots
	pop ebp
	pop edi
	pop esi
	pop ecx
	pop ebx

	ret

No admin address has been configured	ViewVC Help
Powered by ViewVC 1.0.4