Parent Directory | Revision Log
Revision 1988 - (view) (download)
1 : | edgomez | 1412 | /***************************************************************************** |
2 : | * | ||
3 : | * XVID MPEG-4 VIDEO CODEC | ||
4 : | * - Colorspace conversion functions with altivec optimization - | ||
5 : | * | ||
 *  Copyright(C) 2004 Christoph Nägeli <chn@kbw.ch>
7 : | * | ||
8 : | * This program is free software ; you can redistribute it and/or modify | ||
9 : | * it under the terms of the GNU General Public License as published by | ||
10 : | * the Free Software Foundation ; either version 2 of the License, or | ||
11 : | * (at your option) any later version. | ||
12 : | * | ||
13 : | * This program is distributed in the hope that it will be useful, | ||
14 : | * but WITHOUT ANY WARRANTY ; without even the implied warranty of | ||
15 : | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 : | * GNU General Public License for more details. | ||
17 : | * | ||
18 : | * You should have received a copy of the GNU General Public License | ||
19 : | * along with this program ; if not, write to the Free Software | ||
20 : | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
21 : | * | ||
22 : | Isibaar | 1988 | * $Id$ |
23 : | edgomez | 1412 | * |
24 : | ****************************************************************************/ | ||
25 : | |||
26 : | #ifdef HAVE_ALTIVEC_H | ||
27 : | #include <altivec.h> | ||
28 : | #endif | ||
29 : | |||
30 : | #include "../../portab.h" | ||
31 : | edgomez | 1606 | #include "../colorspace.h" |
32 : | edgomez | 1412 | |
33 : | #undef DEBUG | ||
34 : | #include <stdio.h> | ||
35 : | |||
36 : | |||
37 : | /********** generic altivec RGB to YV12 colorspace macro **********/ | ||
38 : | |||
/*
 * MAKE_COLORSPACE_ALTIVEC_FROM_RGB
 * Expands to a packed-RGB -> planar YV12 conversion function.
 *   NAME    : name of the generated function
 *   SIZE    : bytes per source pixel (4 for the 32bit RGB formats)
 *   PIXELS  : pixels converted per inner-loop iteration
 *   VPIXELS : source rows consumed per outer-loop iteration
 *   FUNC    : per-tile conversion macro; FUNC##_ROW is expanded once per row
 *   C1..C4  : channel selectors, forwarded unchanged to FUNC
 * Width is rounded up to a multiple of 16 pixels; vflip is honoured by
 * walking the source image bottom-up with a negated stride.  vec_dstt
 * data-stream prefetches are started here and stopped with vec_dssall.
 */
#define MAKE_COLORSPACE_ALTIVEC_FROM_RGB(NAME,SIZE,PIXELS,VPIXELS,FUNC,C1,C2,C3,C4) \
void \
NAME(uint8_t *x_ptr, int x_stride, \
	uint8_t *y_ptr, uint8_t *u_ptr, uint8_t *v_ptr, \
	int y_stride, int uv_stride, \
	int width, int height, int vflip) \
{ \
	/* width rounded up to a whole number of 16 pixel tiles */ \
	int fixed_width = (width + 15) & ~15; \
	/* per-row pointer corrections (stride minus bytes consumed per row) */ \
	int x_dif = x_stride - (SIZE) * fixed_width; \
	int y_dif = y_stride - fixed_width; \
	int uv_dif = uv_stride - (fixed_width / 2); \
	int x, y; \
	unsigned prefetch_constant; \
\
	register vector unsigned int shift_consts[4]; \
\
	vector unsigned char y_add; \
	vector unsigned char u_add; \
	vector unsigned char v_add; \
\
	vector unsigned short vec_fix_ins[3]; \
\
	/* copy the global fixed-point coefficient table into locals */ \
	vec_st(vec_ldl(0, &g_vec_fix_ins[0]), 0, &vec_fix_ins[0]); \
	vec_st(vec_ldl(0, &g_vec_fix_ins[1]), 0, &vec_fix_ins[1]); \
	vec_st(vec_ldl(0, &g_vec_fix_ins[2]), 0, &vec_fix_ins[2]); \
\
	/* shift counts 24, 16, 8 and 0 built from splat-able 5bit immediates */ \
	shift_consts[0] = vec_add(vec_splat_u32(12), vec_splat_u32(12)); \
	shift_consts[1] = vec_add(vec_splat_u32(8), vec_splat_u32(8)); \
	shift_consts[2] = vec_splat_u32(8); \
	shift_consts[3] = vec_splat_u32(0); \
\
	/* prefetch two streams of source data, one row apart */ \
	prefetch_constant = build_prefetch(16, 2, (short)x_stride); \
	vec_dstt(x_ptr, prefetch_constant, 0); \
	vec_dstt(x_ptr + (x_stride << 1), prefetch_constant, 1); \
\
	/* splat the Y/U/V bias bytes across whole vectors */ \
	*((unsigned char*)&y_add) = Y_ADD_IN; \
	*((unsigned char*)&u_add) = U_ADD_IN; \
	*((unsigned char*)&v_add) = V_ADD_IN; \
\
	y_add = vec_splat(y_add, 0); \
	u_add = vec_splat(u_add, 0); \
	v_add = vec_splat(v_add, 0); \
\
	if(vflip) { \
		/* start at the last source row and walk upwards */ \
		x_ptr += (height - 1) * x_stride; \
		x_dif = -(SIZE) * fixed_width - x_stride; \
		x_stride = -x_stride; \
	} \
\
	for(y = 0; y < height; y += (VPIXELS)) { \
		FUNC##_ROW(SIZE,C1,C2,C3,C4); \
		for(x = 0; x < fixed_width; x += (PIXELS)) { \
			FUNC(SIZE,C1,C2,C3,C4); \
			x_ptr += (PIXELS)*(SIZE); \
			y_ptr += (PIXELS); \
			u_ptr += (PIXELS)/2; \
			v_ptr += (PIXELS)/2; \
		} \
		x_ptr += x_dif + (VPIXELS-1) * x_stride; \
		y_ptr += y_dif + (VPIXELS-1) * y_stride; \
		u_ptr += uv_dif + ((VPIXELS/2)-1)*uv_stride; \
		v_ptr += uv_dif + ((VPIXELS/2)-1)*uv_stride; \
	} \
	/* stop all data-stream prefetches */ \
	vec_dssall(); \
}
104 : | |||
105 : | |||
106 : | /********** generic altivec YUV to YV12 colorspace macro **********/ | ||
107 : | |||
/*
 * MAKE_COLORSPACE_ALTIVEC_FROM_YUV
 * Expands to a packed-YUV (e.g. YUYV/UYVY) -> planar YV12 conversion
 * function.  Parameters are as in MAKE_COLORSPACE_ALTIVEC_FROM_RGB; the
 * vector locals declared here (p0/p1, lum0/lum1, u0/u1, v0/v1, t) are
 * the working set expected by the FUNC macros (YUYV_TO_YV12_ALTIVEC).
 */
#define MAKE_COLORSPACE_ALTIVEC_FROM_YUV(NAME,SIZE,PIXELS,VPIXELS,FUNC,C1,C2,C3,C4) \
void \
NAME(uint8_t *x_ptr, int x_stride, \
	uint8_t *y_ptr, uint8_t *u_ptr, uint8_t *v_ptr, \
	int y_stride, int uv_stride, \
	int width, int height, int vflip) \
{ \
	/* width rounded up to a whole number of 16 pixel tiles */ \
	int fixed_width = (width + 15) & ~15; \
	int x_dif = x_stride - (SIZE)*fixed_width; \
	int y_dif = y_stride - fixed_width; \
	int uv_dif = uv_stride - (fixed_width / 2); \
	int x, y; \
\
	unsigned prefetch_constant; \
\
	vector unsigned int p0, p1; \
	vector unsigned char lum0, lum1; \
	vector unsigned char u0, u1; \
	vector unsigned char v0, v1; \
	vector unsigned char t; \
\
	/* prefetch two streams of source data, one row apart */ \
	prefetch_constant = build_prefetch(16, 2, (short)x_stride); \
	vec_dstt(x_ptr, prefetch_constant, 0); \
	vec_dstt(x_ptr + (x_stride << 1), prefetch_constant, 1); \
\
	if(vflip) { \
		/* start at the last source row and walk upwards */ \
		x_ptr += (height - 1) * x_stride; \
		x_dif = -(SIZE)*fixed_width - x_stride; \
		x_stride = -x_stride; \
	} \
\
	for(y = 0; y < height; y += (VPIXELS)) { \
		FUNC##_ROW(SIZE,C1,C2,C3,C4); \
		for(x = 0; x < fixed_width; x += (PIXELS)) { \
			FUNC(SIZE,C1,C2,C3,C4); \
			x_ptr += (PIXELS)*(SIZE); \
			y_ptr += (PIXELS); \
			u_ptr += (PIXELS)/2; \
			v_ptr += (PIXELS)/2; \
		} \
		x_ptr += x_dif + (VPIXELS-1) * x_stride; \
		y_ptr += y_dif + (VPIXELS-1) * y_stride; \
		u_ptr += uv_dif + ((VPIXELS/2)-1) * uv_stride; \
		v_ptr += uv_dif + ((VPIXELS/2)-1) * uv_stride; \
	} \
	/* stop all data-stream prefetches */ \
	vec_dssall(); \
}
155 : | |||
156 : | |||
157 : | /********** generic altivec YV12 to YUV colorspace macro **********/ | ||
158 : | |||
/*
 * MAKE_COLORSPACE_ALTIVEC_TO_YUV
 * Expands to a planar YV12 -> packed-YUV conversion function (output
 * direction).  Prefetch streams follow the three source planes; the
 * vector locals are the working set expected by WRITE_YUYV_ALTIVEC.
 * NOTE(review): the generated function stores with aligned vec_st to
 * x_ptr, hence the "_unaligned_c" instantiations below are only used
 * behind the alignment check in CHECK_COLORSPACE_ALTIVEC_TO_YUV.
 */
#define MAKE_COLORSPACE_ALTIVEC_TO_YUV(NAME,SIZE,PIXELS,VPIXELS,FUNC,C1,C2,C3,C4) \
void \
NAME(uint8_t *x_ptr, int x_stride, \
	uint8_t *y_ptr, uint8_t *u_ptr, uint8_t *v_ptr, \
	int y_stride, int uv_stride, \
	int width, int height, int vflip) \
{ \
	/* width rounded up to a whole number of 16 pixel tiles */ \
	int fixed_width = (width + 15) & ~15; \
	int x_dif = x_stride - (SIZE)*fixed_width; \
	int y_dif = y_stride - fixed_width; \
	int uv_dif = uv_stride - (fixed_width / 2); \
	int x, y; \
\
	vector unsigned char y_vec; \
	vector unsigned char u_vec; \
	vector unsigned char v_vec; \
	vector unsigned char p0, p1, ptmp; \
	vector unsigned char mask; \
	vector unsigned char mask_stencil; \
	vector unsigned char t; \
	vector unsigned char m4; \
	vector unsigned char vec4; \
\
	unsigned prefetch_constant_y; \
	unsigned prefetch_constant_uv; \
\
	/* prefetch the Y plane (4 blocks/row) and both chroma planes */ \
	prefetch_constant_y = build_prefetch(16, 4, (short)y_stride); \
	prefetch_constant_uv = build_prefetch(16, 2, (short)uv_stride); \
\
	vec_dstt(y_ptr, prefetch_constant_y, 0); \
	vec_dstt(u_ptr, prefetch_constant_uv, 1); \
	vec_dstt(v_ptr, prefetch_constant_uv, 2); \
\
	/* mask_stencil: 0xff in every 4th byte (one output channel slot), \
	   m4: permute indices 0,0,0,0,1,1,1,1,... (each byte repeated 4x), \
	   vec4: constant 4, used to address the second output half */ \
	mask_stencil = (vector unsigned char)vec_mergeh( (vector unsigned short)vec_mergeh(vec_splat_u8(-1), vec_splat_u8(0)), vec_splat_u16(0) ); \
	m4 = vec_sr(vec_lvsl(0, (unsigned char*)0), vec_splat_u8(2)); \
	vec4 = vec_splat_u8(4); \
\
	if(vflip) { \
		/* start at the last destination row and walk upwards */ \
		x_ptr += (height - 1) * x_stride; \
		x_dif = -(SIZE)*fixed_width - x_stride; \
		x_stride = -x_stride; \
	} \
\
	for(y = 0; y < height; y += (VPIXELS)) { \
		FUNC##_ROW(SIZE,C1,C2,C3,C4); \
		for(x = 0; x < fixed_width; x += (PIXELS)) { \
			FUNC(SIZE,C1,C2,C3,C4); \
			x_ptr += (PIXELS)*(SIZE); \
			y_ptr += (PIXELS); \
			u_ptr += (PIXELS)/2; \
			v_ptr += (PIXELS)/2; \
		} \
		x_ptr += x_dif + (VPIXELS-1) * x_stride; \
		y_ptr += y_dif + (VPIXELS-1) * y_stride; \
		u_ptr += uv_dif + ((VPIXELS/2)-1) * uv_stride; \
		v_ptr += uv_dif + ((VPIXELS/2)-1) * uv_stride; \
	} \
	/* stop all data-stream prefetches */ \
	vec_dssall(); \
}
218 : | |||
219 : | /********** colorspace input (xxx_to_yv12) functions **********/ | ||
220 : | |||
221 : | /* rgb -> yuv def's | ||
222 : | |||
	the following constants are the "official spec" ones, taken from
	"Video Demystified" (ISBN 1-878707-09-4)
225 : | |||
226 : | rgb<->yuv _is_ lossy, since most programs do the conversion differently | ||
227 : | |||
228 : | SCALEBITS/FIX taken from ffmpeg | ||
229 : | */ | ||
230 : | |||
/* BT.601-style RGB -> YCbCr coefficients (studio-swing: Y in 16..235,
 * chroma in 16..240 with a 128 bias) */
#define Y_R_IN 0.257
#define Y_G_IN 0.504
#define Y_B_IN 0.098
#define Y_ADD_IN 16

#define U_R_IN 0.148
#define U_G_IN 0.291
#define U_B_IN 0.439
#define U_ADD_IN 128

#define V_R_IN 0.439
#define V_G_IN 0.368
#define V_B_IN 0.071
#define V_ADD_IN 128

/* convert a floating-point coefficient to SCALEBITS_IN-bit fixed point,
 * rounded to nearest */
#define SCALEBITS_IN 8
#define FIX_IN(x) ((uint16_t) ((x) * (1L<<SCALEBITS_IN) + 0.5))
248 : | |||
249 : | |||
/*
 * Build an AltiVec data-stream (vec_dst*) control word:
 * bits 24..31 = block size (vectors), bits 16..23 = block count,
 * bits 0..15  = signed 16bit stride between blocks.
 * The stride is masked to 16 bits: a negative stride would otherwise be
 * sign-extended to 32 bits by integer promotion and the OR would clobber
 * the block size/count fields.
 */
static inline unsigned
build_prefetch(unsigned char block_size, unsigned char block_count, short stride)
{
	return (((unsigned)block_size << 24) | ((unsigned)block_count << 16) |
	        ((unsigned)stride & 0xFFFFu));
}
255 : | |||
256 : | const static vector unsigned short g_vec_fix_ins [3] = { | ||
257 : | (vector unsigned short)AVV( SCALEBITS_IN, FIX_IN(Y_R_IN), FIX_IN(Y_G_IN), FIX_IN(Y_B_IN), 0, 0, 0, 0), | ||
258 : | (vector unsigned short)AVV( SCALEBITS_IN + 2, -FIX_IN(U_R_IN), -FIX_IN(U_G_IN), FIX_IN(U_B_IN), 0, 0, 0, 0), | ||
259 : | (vector unsigned short)AVV( SCALEBITS_IN + 2, FIX_IN(V_R_IN), -FIX_IN(V_G_IN), -FIX_IN(V_B_IN), 0, 0, 0, 0) | ||
260 : | }; | ||
261 : | |||
262 : | /* RGB Input */ | ||
/* RGB Input */
/*
 * READ_RGB_Y_ALTIVEC
 * Loads 8 packed 32bit pixels from source row ROW, isolates the three
 * colour channels (C1..C3 index shift_consts[], i.e. the right-shift
 * that brings each channel into the low byte of its word), accumulates
 * the channels into r##UVID/g##UVID/b##UVID for the later chroma pass,
 * then computes 8 luma bytes and merges them into the Y plane without
 * disturbing the remaining destination bytes.
 */
#define READ_RGB_Y_ALTIVEC(SIZE,ROW,UVID,C1,C2,C3,C4) \
	p0 = vec_ld(0, (unsigned int*)(x_ptr + (ROW) * x_stride)); \
	p1 = vec_ld(16, (unsigned int*)(x_ptr + (ROW) * x_stride)); \
\
	/* build a 0x000000ff mask in every 32bit element */ \
	mask = vec_mergeh((vector unsigned char)shift_consts[3], vec_splat_u8(-1)); \
	mask = (vector unsigned char)vec_mergeh((vector unsigned short)shift_consts[3], (vector unsigned short)mask); \
\
	/* channel C1: shift to the low byte, mask, pack to 8 shorts */ \
	t0 = vec_sr(p0, shift_consts[C1]); \
	t0 = vec_sel(shift_consts[3], t0, (vector unsigned int)mask); \
	t1 = vec_sr(p1, shift_consts[C1]); \
	t1 = vec_sel(shift_consts[3], t1, (vector unsigned int)mask); \
	r = vec_pack(t0, t1); \
	r##UVID = vec_add(r##UVID, r); \
\
	/* channel C2 */ \
	t0 = vec_sr(p0, shift_consts[C2]); \
	t0 = vec_sel(shift_consts[3], t0, (vector unsigned int)mask); \
	t1 = vec_sr(p1, shift_consts[C2]); \
	t1 = vec_sel(shift_consts[3], t1, (vector unsigned int)mask); \
	g = vec_pack(t0, t1); \
	g##UVID = vec_add(g##UVID, g); \
\
	/* channel C3 */ \
	t0 = vec_sr(p0, shift_consts[C3]); \
	t0 = vec_sel(shift_consts[3], t0, (vector unsigned int)mask); \
	t1 = vec_sr(p1, shift_consts[C3]); \
	t1 = vec_sel(shift_consts[3], t1, (vector unsigned int)mask); \
	b = vec_pack(t0, t1); \
	b##UVID = vec_add(b##UVID, b); \
\
	/* luma = (cr*r + cg*g + cb*b) >> SCALEBITS_IN, then add Y_ADD_IN */ \
	lum = vec_mladd(r, vec_splat(vec_fix_ins[0], 1), (vector unsigned short)shift_consts[3]); \
	lum = vec_mladd(g, vec_splat(vec_fix_ins[0], 2), lum); \
	lum = vec_mladd(b, vec_splat(vec_fix_ins[0], 3), lum); \
	lum = vec_sr(lum, vec_splat(vec_fix_ins[0], 0)); \
	y_vec = vec_pack(lum, (vector unsigned short)shift_consts[3]); \
	y_vec = vec_add(y_vec, y_add); \
\
	/* rotate the 8 luma bytes to the destination alignment and merge \
	   them with the existing Y plane contents before storing */ \
	mask = vec_pack((vector unsigned short)shift_consts[3], vec_splat_u16(-1)); \
	mask = vec_perm(mask, mask, vec_lvsl(0, y_ptr + (ROW)*y_stride)); \
	y_vec = vec_perm(y_vec, y_vec, vec_lvsl(0, y_ptr + (ROW)*y_stride)); \
	y_vec = vec_sel(y_vec, vec_ld(0, y_ptr + (ROW)*y_stride), mask); \
	vec_st(y_vec, 0, y_ptr + (ROW)*y_stride)
303 : | |||
/*
 * READ_RGB_UV_ALTIVEC
 * Consumes the channel accumulators filled by two READ_RGB_Y_ALTIVEC
 * expansions (two rows of 8 pixels) and produces 4 U and 4 V bytes for
 * chroma row UV_ROW.  vec_sum4s folds each pair of adjacent shorts into
 * a 32bit sum (a 2x2 pixel block per element); the coefficient rows for
 * U/V already include the extra >>2 needed to average the 4 samples.
 */
#define READ_RGB_UV_ALTIVEC(UV_ROW,UVID) \
	/* fold adjacent pixel pairs: each int now holds a 2x2 block sum */ \
	r##UVID = (vector unsigned short)vec_sum4s((vector signed short)r##UVID, (vector signed int)shift_consts[3]); \
	g##UVID = (vector unsigned short)vec_sum4s((vector signed short)g##UVID, (vector signed int)shift_consts[3]); \
	b##UVID = (vector unsigned short)vec_sum4s((vector signed short)b##UVID, (vector signed int)shift_consts[3]); \
\
	/* U: vec_mulo on the short view multiplies the low 16 bits of each \
	   32bit sum by the coefficient, then shift by SCALEBITS_IN + 2 */ \
	t3 = vec_mulo((vector signed short)r##UVID, (vector signed short)vec_splat(vec_fix_ins[1], 1)); \
	t3 = vec_add(t3, vec_mulo((vector signed short)g##UVID, (vector signed short)vec_splat(vec_fix_ins[1], 2))); \
	t3 = vec_add(t3, vec_mulo((vector signed short)b##UVID, (vector signed short)vec_splat(vec_fix_ins[1], 3))); \
	t3 = vec_sr(t3, (vector unsigned int)vec_mergeh((vector unsigned short)shift_consts[3], vec_splat(vec_fix_ins[1], 0))); \
\
	u_vec = vec_pack(vec_pack((vector unsigned int)t3, shift_consts[3]), (vector unsigned short)shift_consts[3]); \
	u_vec = vec_add(u_vec, u_add); \
\
	/* merge the 4 new U bytes into the destination (unaligned-safe) */ \
	mask = vec_pack(vec_splat_u16(-1), (vector unsigned short)shift_consts[3]); \
	mask = (vector unsigned char)vec_pack((vector unsigned int)mask, shift_consts[3]); \
	mask = vec_perm(mask, mask, vec_lvsr(0, u_ptr + (UV_ROW)*uv_stride)); \
	u_vec = vec_perm(u_vec, u_vec, vec_lvsr(0, u_ptr + (UV_ROW)*uv_stride)); \
	u_vec = vec_sel(vec_ld(0, u_ptr + (UV_ROW)*uv_stride), u_vec, mask); \
	vec_st(u_vec, 0, u_ptr + (UV_ROW)*uv_stride); \
\
	/* V: same scheme with the third coefficient row */ \
	t3 = vec_mulo((vector signed short)r##UVID, (vector signed short)vec_splat(vec_fix_ins[2], 1)); \
	t3 = vec_add(t3, vec_mulo((vector signed short)g##UVID, (vector signed short)vec_splat(vec_fix_ins[2], 2))); \
	t3 = vec_add(t3, vec_mulo((vector signed short)b##UVID, (vector signed short)vec_splat(vec_fix_ins[2], 3))); \
	t3 = vec_sr(t3, (vector unsigned int)vec_mergeh((vector unsigned short)shift_consts[3], vec_splat(vec_fix_ins[2], 0))); \
\
	v_vec = vec_pack(vec_pack((vector unsigned int)t3, shift_consts[3]), (vector unsigned short)shift_consts[3]); \
	v_vec = vec_add(v_vec, v_add); \
\
	/* merge the 4 new V bytes into the destination (unaligned-safe) */ \
	mask = vec_pack(vec_splat_u16(-1), (vector unsigned short)shift_consts[3]); \
	mask = (vector unsigned char)vec_pack((vector unsigned int)mask, shift_consts[3]); \
	mask = vec_perm(mask, mask, vec_lvsr(0, v_ptr + (UV_ROW) * uv_stride)); \
	v_vec = vec_perm(v_vec, v_vec, vec_lvsr(0, v_ptr + (UV_ROW) * uv_stride)); \
	v_vec = vec_sel(vec_ld(0, v_ptr + (UV_ROW) * uv_stride), v_vec, mask); \
	vec_st(v_vec, 0, v_ptr + (UV_ROW) * uv_stride)
338 : | |||
339 : | |||
/* per-row hook required by MAKE_COLORSPACE_ALTIVEC_FROM_RGB; RGB input
 * needs no per-row setup */
#define RGB_TO_YV12_ALTIVEC_ROW(SIZE,C1,C2,C3,C4) \
	/* nothing */

/*
 * RGB_TO_YV12_ALTIVEC
 * Converts one 8x2 pixel tile: two luma rows plus one chroma row.
 * Expands inside the generated function body, so it may declare the
 * vector locals used by the READ_RGB_* helpers.
 */
#define RGB_TO_YV12_ALTIVEC(SIZE,C1,C2,C3,C4) \
	vector unsigned int p0, p1; \
	vector unsigned int t0, t1; \
	vector unsigned short r, g, b, r0, g0, b0; \
	vector unsigned short lum; \
	vector unsigned char mask; \
	vector unsigned char y_vec; \
	vector unsigned char u_vec; \
	vector unsigned char v_vec; \
	vector signed int t3; \
\
	/* keep the prefetch streams pointed at the data ahead */ \
	vec_dstt(x_ptr, prefetch_constant, 0); \
	vec_dstt(x_ptr + (x_stride << 1), prefetch_constant, 1); \
\
	/* clear the channel accumulators for this tile */ \
	r0 = g0 = b0 = (vector unsigned short)shift_consts[3]; \
\
	READ_RGB_Y_ALTIVEC(SIZE, 0, 0, C1, C2, C3, C4); \
	READ_RGB_Y_ALTIVEC(SIZE, 1, 0, C1, C2, C3, C4); \
	READ_RGB_UV_ALTIVEC(0, 0)
362 : | |||
363 : | |||
364 : | /* YUV input */ | ||
365 : | |||
/*
 * READ_YUYV_Y_ALTIVEC
 * Loads 32 bytes (16 packed-YUV pixels) from source row ROW.  C1 and C3
 * are the byte offsets of the two luma samples within each 4 byte pixel
 * pair, C2/C4 those of the U and V samples (YUYV: 0,1,2,3 — UYVY:
 * 1,0,3,2).  Stores 16 luma bytes to the Y plane and leaves the gathered
 * chroma bytes in u##ROW / v##ROW for READ_YUYV_UV_ALTIVEC.
 */
#define READ_YUYV_Y_ALTIVEC(ROW,C1,C2,C3,C4) \
	p0 = vec_ld(0, (unsigned int*)(x_ptr + (ROW)*x_stride)); \
	p1 = vec_ld(16, (unsigned int*)(x_ptr + (ROW)*x_stride)); \
\
	/* permute indices 0,4,8,... + C1: pick byte C1 of every pixel pair */ \
	t = vec_lvsl(0, (unsigned char*)0); \
	t = vec_sl(t, vec_splat_u8(2)); \
	t = vec_add(t, vec_splat_u8(C1)); \
\
	lum0 = (vector unsigned char)vec_perm(p0, p0, t); \
	lum1 = (vector unsigned char)vec_perm(p1, p1, t); \
\
	/* same gather for the second luma byte (offset C3) */ \
	t = vec_lvsl(0, (unsigned char*)0); \
	t = vec_sl(t, vec_splat_u8(2)); \
	t = vec_add(t, vec_splat_u8(C3)); \
\
	/* interleave the two luma gathers back into pixel order */ \
	lum0 = vec_mergeh(lum0, (vector unsigned char)vec_perm(p0, p0, t)); \
	lum1 = vec_mergeh(lum1, (vector unsigned char)vec_perm(p1, p1, t)); \
\
	/* combine both halves and store 16 Y bytes */ \
	lum0 = vec_sel(lum0, lum1, vec_pack(vec_splat_u16(0), vec_splat_u16(-1))); \
	vec_st(lum0, 0, y_ptr + (ROW)*y_stride); \
\
	/* gather the U bytes (offset C2) of both 16 byte halves */ \
	t = vec_lvsl(0, (unsigned char*)0); \
	t = vec_sl(t, vec_splat_u8(2)); \
	t = vec_add(t, vec_splat_u8(C2)); \
\
	lum0 = (vector unsigned char)vec_perm(p0, p0, t); \
	lum1 = (vector unsigned char)vec_perm(p1, p1, t); \
	lum1 = vec_perm(lum1, lum1, vec_lvsr(4, (unsigned char*)0)); \
	t = vec_pack(vec_pack(vec_splat_u32(0), vec_splat_u32(-1)), vec_splat_u16(-1)); \
	u##ROW = vec_sel(lum0, lum1, t); \
\
	/* gather the V bytes (offset C4) the same way */ \
	t = vec_lvsl(0, (unsigned char*)0); \
	t = vec_sl(t, vec_splat_u8(2)); \
	t = vec_add(t, vec_splat_u8(C4)); \
\
	lum0 = (vector unsigned char)vec_perm(p0, p0, t); \
	lum1 = (vector unsigned char)vec_perm(p1, p1, t); \
	lum1 = vec_perm(lum1, lum1, vec_lvsr(4, (unsigned char*)0)); \
	t = vec_pack(vec_pack(vec_splat_u32(0), vec_splat_u32(-1)), vec_splat_u16(-1)); \
	v##ROW = vec_sel(lum0, lum1, t);
406 : | |||
/*
 * READ_YUYV_UV_ALTIVEC
 * Averages the chroma gathered from two source rows (vertical 2:1
 * subsampling) and merges the resulting 8 U and 8 V bytes into chroma
 * row UV_ROW without disturbing the rest of the destination vectors.
 */
#define READ_YUYV_UV_ALTIVEC(UV_ROW,ROW1,ROW2,C1,C2,C3,C4) \
	u##ROW1 = vec_avg(u##ROW1, u##ROW2); \
	/* keep the first 8 bytes of new data, destination bytes elsewhere */ \
	t = vec_pack(vec_splat_u16(0), vec_splat_u16(-1)); \
	t = vec_perm(t, t, vec_lvsl(0, u_ptr + (UV_ROW)*uv_stride)); \
	u##ROW1 = vec_perm(u##ROW1, u##ROW1, vec_lvsl(0, u_ptr + (UV_ROW)*uv_stride)); \
	u##ROW1 = vec_sel(u##ROW1, vec_ld(0, u_ptr + (UV_ROW)*uv_stride), t); \
	vec_st(u##ROW1, 0, u_ptr + (UV_ROW)*uv_stride); \
\
	v##ROW1 = vec_avg(v##ROW1, v##ROW2); \
	t = vec_pack(vec_splat_u16(0), vec_splat_u16(-1)); \
	t = vec_perm(t, t, vec_lvsl(0, v_ptr + (UV_ROW)*uv_stride)); \
	v##ROW1 = vec_perm(v##ROW1, v##ROW1, vec_lvsl(0, v_ptr + (UV_ROW)*uv_stride)); \
	v##ROW1 = vec_sel(v##ROW1, vec_ld(0, v_ptr + (UV_ROW)*uv_stride), t); \
	vec_st(v##ROW1, 0, v_ptr + (UV_ROW)*uv_stride);
421 : | |||
422 : | |||
/* per-row hook required by MAKE_COLORSPACE_ALTIVEC_FROM_YUV; no per-row
 * setup is needed for packed YUV input */
#define YUYV_TO_YV12_ALTIVEC_ROW(SIZE,C1,C2,C3,C4) \
	/*nothing*/

/*
 * YUYV_TO_YV12_ALTIVEC
 * Converts one 16x2 pixel tile of packed YUV: two luma rows plus one
 * vertically averaged chroma row.
 */
#define YUYV_TO_YV12_ALTIVEC(SIZE,C1,C2,C3,C4) \
	/* keep the prefetch streams pointed at the data ahead */ \
	vec_dstt(x_ptr, prefetch_constant, 0); \
	vec_dstt(x_ptr + (x_stride << 1), prefetch_constant, 1); \
\
	READ_YUYV_Y_ALTIVEC (0, C1,C2,C3,C4) \
	READ_YUYV_Y_ALTIVEC (1, C1,C2,C3,C4) \
	READ_YUYV_UV_ALTIVEC(0, 0, 1, C1,C2,C3,C4)
433 : | |||
/* Instantiate the 32bit packed RGB -> YV12 converters.  C1..C3 index
 * shift_consts[] (shift counts 24/16/8/0) and therefore select where
 * each colour channel sits inside the 32bit pixel for that format. */
MAKE_COLORSPACE_ALTIVEC_FROM_RGB(bgra_to_yv12_altivec_c, 4, 8, 2, RGB_TO_YV12_ALTIVEC, 2, 1, 0, 0)
MAKE_COLORSPACE_ALTIVEC_FROM_RGB(abgr_to_yv12_altivec_c, 4, 8, 2, RGB_TO_YV12_ALTIVEC, 3, 2, 1, 0)
MAKE_COLORSPACE_ALTIVEC_FROM_RGB(rgba_to_yv12_altivec_c, 4, 8, 2, RGB_TO_YV12_ALTIVEC, 0, 1, 2, 0)
MAKE_COLORSPACE_ALTIVEC_FROM_RGB(argb_to_yv12_altivec_c, 4, 8, 2, RGB_TO_YV12_ALTIVEC, 1, 2, 3, 0)

/* Packed YUV -> YV12: C1..C4 are the byte offsets of Y0/U/Y1/V within
 * each 4 byte pixel pair */
MAKE_COLORSPACE_ALTIVEC_FROM_YUV(yuyv_to_yv12_altivec_c, 2, 16, 2, YUYV_TO_YV12_ALTIVEC, 0, 1, 2, 3)
MAKE_COLORSPACE_ALTIVEC_FROM_YUV(uyvy_to_yv12_altivec_c, 2, 16, 2, YUYV_TO_YV12_ALTIVEC, 1, 0, 3, 2)
441 : | |||
442 : | |||
/*
 * WRITE_YUYV_ALTIVEC
 * Builds 32 bytes (16 pixels) of packed YUV output for row ROW from 16
 * luma bytes and 8 U / 8 V bytes.  mask_stencil (0xff every 4th byte)
 * rotated by vec_lvsr(Cn) selects the output slot of each channel; m4
 * spreads source bytes so each lands in its pixel pair, m4+vec4 does
 * the same for the second 16 byte half.  The stores are aligned, hence
 * the alignment check in the *_altivec_c dispatchers.
 * NOTE(review): UV_ROW is currently unused — u_ptr/v_ptr are read
 * directly, which matches the UV_ROW == 0 instantiations below.
 */
#define WRITE_YUYV_ALTIVEC(ROW, UV_ROW, C1,C2,C3,C4) \
	p0 = vec_splat_u8(0); \
	p1 = vec_splat_u8(0); \
\
	/* unaligned-safe load of 16 luma bytes */ \
	y_vec = vec_perm(vec_ld(0, y_ptr + (ROW)*y_stride), vec_ld(16, y_ptr + (ROW)*y_stride), vec_lvsl(0, y_ptr + (ROW)*y_stride)); \
	/* C1: first luma byte of each pixel pair (even luma samples) */ \
	t = vec_perm(y_vec, y_vec, vec_sl(vec_lvsl(0, (unsigned char*)0), vec_splat_u8(1))); \
	mask = vec_perm(mask_stencil, mask_stencil, vec_lvsr(C1, (unsigned char*)0)); \
\
	p0 = vec_sel(p0, vec_perm(t, t, m4), mask); \
	ptmp = vec_perm(t,t, vec_add(m4, vec4));\
	p1 = vec_sel(p1, ptmp, mask); \
\
	/* C3: second luma byte of each pixel pair (odd luma samples) */ \
	ptmp = vec_add(vec_sl(vec_lvsl(0, (unsigned char*)0), vec_splat_u8(1)), vec_splat_u8(1)); \
	t = vec_perm(y_vec, y_vec, ptmp); \
	mask = vec_perm(mask_stencil, mask_stencil, vec_lvsr(C3, (unsigned char*)0)); \
\
	p0 = vec_sel(p0, vec_perm(t, t, m4), mask); \
	ptmp = vec_perm(t, t, vec_add(m4, vec4)); \
	p1 = vec_sel(p1, ptmp, mask); \
\
	/* C2: U bytes (unaligned-safe load, one byte per pixel pair) */ \
	u_vec = vec_perm(vec_ld(0,u_ptr), vec_ld(16, u_ptr), vec_lvsl(0, u_ptr)); \
	mask = vec_perm(mask_stencil, mask_stencil, vec_lvsr(C2, (unsigned char*)0)); \
\
	p0 = vec_sel(p0, vec_perm(u_vec, u_vec, m4), mask); \
	ptmp = vec_perm(u_vec, u_vec, vec_add(m4, vec4)); \
	p1 = vec_sel(p1, ptmp, mask); \
\
	/* C4: V bytes */ \
	v_vec = vec_perm(vec_ld(0, v_ptr), vec_ld(16, v_ptr), vec_lvsl(0, v_ptr)); \
	mask = vec_perm(mask_stencil, mask_stencil, vec_lvsr(C4, (unsigned char*)0)); \
\
	p0 = vec_sel(p0, vec_perm(v_vec, v_vec, m4), mask); \
	ptmp = vec_perm(v_vec, v_vec, vec_add(m4, vec4)); \
	p1 = vec_sel(p1, ptmp, mask); \
\
	/* aligned store of the assembled 32 output bytes */ \
	vec_st(p0, 0, x_ptr + (ROW)*x_stride); \
	vec_st(p1, 16, x_ptr + (ROW)*x_stride)
483 : | |||
484 : | |||
/* per-row hook required by MAKE_COLORSPACE_ALTIVEC_TO_YUV; no per-row
 * setup is needed for packed YUV output */
#define YV12_TO_YUYV_ALTIVEC_ROW(SIZE,C1,C2,C3,C4) \
	/*nothing*/

/*
 * YV12_TO_YUYV_ALTIVEC
 * Emits one 16x2 pixel output tile: two packed rows sharing one chroma
 * row, refreshing the plane prefetch streams first.
 */
#define YV12_TO_YUYV_ALTIVEC(SIZE,C1,C2,C3,C4) \
	vec_dstt(y_ptr, prefetch_constant_y, 0); \
	vec_dstt(u_ptr, prefetch_constant_uv, 1); \
	vec_dstt(v_ptr, prefetch_constant_uv, 2); \
\
	WRITE_YUYV_ALTIVEC(0, 0, C1,C2,C3,C4); \
	WRITE_YUYV_ALTIVEC(1, 0, C1,C2,C3,C4)
495 : | |||
496 : | |||
497 : | edgomez | 1606 | MAKE_COLORSPACE_ALTIVEC_TO_YUV(yv12_to_yuyv_altivec_unaligned_c, 2, 16, 2, YV12_TO_YUYV_ALTIVEC, 0, 1, 2, 3) |
498 : | MAKE_COLORSPACE_ALTIVEC_TO_YUV(yv12_to_uyvy_altivec_unaligned_c, 2, 16, 2, YV12_TO_YUYV_ALTIVEC, 1, 0, 3, 2) | ||
499 : | edgomez | 1412 | |
500 : | edgomez | 1606 | |
/* These intermediate functions are used because gcc v3.3 seems to produce an
   invalid register usage with the fallback directly integrated in the altivec
   routine (!!!) */

/*
 * CHECK_COLORSPACE_ALTIVEC_TO_YUV
 * Expands to a dispatcher: run the AltiVec routine FAST only when both
 * the destination pointer and its stride are 16 byte aligned (the fast
 * path stores with aligned vec_st); otherwise call the plain C FALLBACK.
 */
#define CHECK_COLORSPACE_ALTIVEC_TO_YUV(NAME,FAST,FALLBACK) \
void \
NAME(uint8_t *x_ptr, int x_stride, \
	uint8_t *y_ptr, uint8_t *u_ptr, uint8_t *v_ptr, \
	int y_stride, int uv_stride, \
	int width, int height, int vflip) \
{\
	/* size_t instead of uint32_t: casting a pointer to uint32_t \
	 * truncates on 64 bit targets; only the low 4 bits matter here */\
	if( ((size_t)x_ptr & 15) | (x_stride & 15) )\
		FALLBACK(x_ptr, x_stride, y_ptr, u_ptr, v_ptr, y_stride, uv_stride, width, height, vflip);\
	else\
		FAST(x_ptr, x_stride, y_ptr, u_ptr, v_ptr, y_stride, uv_stride, width, height, vflip);\
}

CHECK_COLORSPACE_ALTIVEC_TO_YUV(yv12_to_yuyv_altivec_c, yv12_to_yuyv_altivec_unaligned_c, yv12_to_yuyv_c)
CHECK_COLORSPACE_ALTIVEC_TO_YUV(yv12_to_uyvy_altivec_c, yv12_to_uyvy_altivec_unaligned_c, yv12_to_uyvy_c)
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |