/*****************************************************************************
 *
 *  XVID MPEG-4 VIDEO CODEC
 *  - 8x8 block-based halfpel interpolation with altivec optimization -
 *
 *  Copyright(C) 2004 Christoph Naegeli <chn@kbw.ch>
 *
 *  This program is free software ; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation ; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program ; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * $Id: interpolate8x8_altivec.c,v 1.2 2004-10-17 10:20:15 edgomez Exp $
 *
 ****************************************************************************/

#ifdef HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "../../portab.h"

#undef DEBUG
#include <stdio.h>

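/* Helper that packs an AltiVec data-stream (DST) prefetch control word of
 * the kind consumed by vec_dst()/vec_dstt(): block size (in 16-byte vectors,
 * 0 encoding 32) in bits 24-28, block count in bits 16-23 and the byte
 * stride between blocks in the low 16 bits. Note that this helper appears
 * to be unused in this file. */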
static inline unsigned
build_prefetch(unsigned char block_size, unsigned char block_count, short stride)
{
    if(block_size > 31)
        block_size = 0;

    return ((block_size << 24) | (block_count << 16) | stride);
}

#define NO_ROUNDING

#define ROUNDING \
    s1 = vec_and(vec_add(s1, s2), vec_splat_u8(1)); \
    d = vec_sub(d, s1);
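
/* vec_avg() rounds upward: (a + b + 1) >> 1. When MPEG-4 rounding control
 * is 1 the halfpel value must be (a + b) >> 1, so ROUNDING subtracts the
 * low bit of (a + b) from the average, turning it into a floor division. */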

#define INTERPOLATE8X8_HALFPEL_H(round) \
    s1 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
    s2 = vec_perm(s1, s1, s2_mask); \
    d = vec_avg(s1, s2); \
    round; \
    mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst)); \
    d = vec_perm(d, d, vec_lvsl(0, dst)); \
    d = vec_sel(d, vec_ld(0, dst), mask); \
    vec_st(d, 0, dst); \
    dst += stride; \
    src += stride
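
/* Per row this is equivalent to the scalar loop (sketch, not part of the
 * original source):
 *
 *   for (x = 0; x < 8; x++)
 *       dst[x] = (src[x] + src[x+1] + 1 - rounding) >> 1;
 */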

/* This function assumes:
 *  dst is 8 byte aligned
 *  src is unaligned
 *  stride is a multiple of 8
 */
void
interpolate8x8_halfpel_h_altivec_c( uint8_t *dst,
                                    uint8_t *src,
                                    const uint32_t stride,
                                    const uint32_t rounding)
{
    register vector unsigned char s1, s2;
    register vector unsigned char d;
    register vector unsigned char mask;
    register vector unsigned char s2_mask;
    register vector unsigned char mask_stencil;

#ifdef DEBUG
    /* Dump alignment errors if DEBUG is defined */
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_h_altivec_c:incorrect align, dst: %p\n", (void*)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_h_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

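    /* s2_mask rotates a vector left by one byte so lane i holds src[i+1].
     * mask_stencil is 8 zero bytes followed by 8 0xFF bytes; permuted by
     * vec_lvsl(0, dst) it tells vec_sel() which 8-byte half of the 16-byte
     * destination vector must keep its old memory contents. */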
    s2_mask = vec_lvsl(1, (unsigned char*)0);
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    if(rounding) {
        INTERPOLATE8X8_HALFPEL_H(ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(ROUNDING);

        INTERPOLATE8X8_HALFPEL_H(ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(ROUNDING);
    }
    else {
        INTERPOLATE8X8_HALFPEL_H(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(NO_ROUNDING);

        INTERPOLATE8X8_HALFPEL_H(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_H(NO_ROUNDING);
    }
}

#define INTERPOLATE8X8_HALFPEL_V(round) \
    s1 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
    s2 = vec_perm(vec_ld(0, src + stride), vec_ld(16, src + stride), vec_lvsl(0, src + stride)); \
    d = vec_avg(s1, s2); \
    round; \
    mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst)); \
    d = vec_perm(d, d, vec_lvsl(0, dst)); \
    d = vec_sel(d, vec_ld(0, dst), mask); \
    vec_st(d, 0, dst); \
    dst += stride; \
    src += stride
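
/* Scalar equivalent of one row (sketch, not part of the original source):
 *
 *   for (x = 0; x < 8; x++)
 *       dst[x] = (src[x] + src[x+stride] + 1 - rounding) >> 1;
 */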

/*
 * This function assumes
 *  dst is 8 byte aligned
 *  src is unaligned
 *  stride is a multiple of 8
 */
void
interpolate8x8_halfpel_v_altivec_c( uint8_t *dst,
                                    uint8_t *src,
                                    const uint32_t stride,
                                    const uint32_t rounding)
{
    vector unsigned char s1, s2;
    vector unsigned char d;
    vector unsigned char mask;
    vector unsigned char mask_stencil;

#ifdef DEBUG
    /* if this is on, print alignment errors */
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_v_altivec_c:incorrect align, dst: %p\n", (void*)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_v_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    if(rounding) {
        INTERPOLATE8X8_HALFPEL_V(ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(ROUNDING);

        INTERPOLATE8X8_HALFPEL_V(ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(ROUNDING);
    }
    else {
        INTERPOLATE8X8_HALFPEL_V(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(NO_ROUNDING);

        INTERPOLATE8X8_HALFPEL_V(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(NO_ROUNDING);
        INTERPOLATE8X8_HALFPEL_V(NO_ROUNDING);
    }
}


#define INTERPOLATE8X8_HALFPEL_HV(adding) \
    t = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
    s1 = (vector unsigned short)vec_mergeh(zerovec, t); \
    t = vec_perm(vec_ld(1, src), vec_ld(17, src), vec_lvsl(1, src)); \
    s2 = (vector unsigned short)vec_mergeh(zerovec, t); \
    t = vec_perm(vec_ld(0, src + stride), vec_ld(16, src + stride), vec_lvsl(0, src + stride)); \
    s3 = (vector unsigned short)vec_mergeh(zerovec, t); \
    t = vec_perm(vec_ld(1, src + stride), vec_ld(17, src + stride), vec_lvsl(1, src + stride)); \
    s4 = (vector unsigned short)vec_mergeh(zerovec, t); \
    s1 = vec_add(s1, vec_add(s2, vec_add(s3, s4))); \
    s1 = vec_add(s1, adding); \
    s1 = vec_sr(s1, two); \
    t = vec_pack(s1, s1); \
    mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst)); \
    t = vec_sel(t, vec_ld(0, dst), mask); \
    vec_st(t, 0, dst); \
    dst += stride; \
    src += stride
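
/* One row is the 2x2 neighbourhood average (sketch, not part of the
 * original source):
 *
 *   for (x = 0; x < 8; x++)
 *       dst[x] = (src[x] + src[x+1] + src[x+stride] + src[x+stride+1]
 *                 + 2 - rounding) >> 2;
 *
 * The "adding" argument supplies the bias: one when rounding is on,
 * two when it is off. */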

void
interpolate8x8_halfpel_hv_altivec_c(uint8_t *dst,
                                    uint8_t *src,
                                    const uint32_t stride,
                                    const uint32_t rounding)
{
    vector unsigned short s1, s2, s3, s4;
    vector unsigned char t;
    vector unsigned short one, two;
    vector unsigned char zerovec;
    vector unsigned char mask;
    vector unsigned char mask_stencil;

    /* Initialisation stuff */
    zerovec = vec_splat_u8(0);
    one = vec_splat_u16(1);
    two = vec_splat_u16(2);
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    if(rounding) {
        INTERPOLATE8X8_HALFPEL_HV(one);
        INTERPOLATE8X8_HALFPEL_HV(one);
        INTERPOLATE8X8_HALFPEL_HV(one);
        INTERPOLATE8X8_HALFPEL_HV(one);

        INTERPOLATE8X8_HALFPEL_HV(one);
        INTERPOLATE8X8_HALFPEL_HV(one);
        INTERPOLATE8X8_HALFPEL_HV(one);
        INTERPOLATE8X8_HALFPEL_HV(one);
    }
    else {
        INTERPOLATE8X8_HALFPEL_HV(two);
        INTERPOLATE8X8_HALFPEL_HV(two);
        INTERPOLATE8X8_HALFPEL_HV(two);
        INTERPOLATE8X8_HALFPEL_HV(two);

        INTERPOLATE8X8_HALFPEL_HV(two);
        INTERPOLATE8X8_HALFPEL_HV(two);
        INTERPOLATE8X8_HALFPEL_HV(two);
        INTERPOLATE8X8_HALFPEL_HV(two);
    }
}

/*
 * This function assumes:
 *  dst is 8 byte aligned
 *  src1 is unaligned
 *  src2 is unaligned
 *  stride is a multiple of 8
 *  rounding is smaller than max signed short + 2
 */

void
interpolate8x8_avg2_altivec_c( uint8_t *dst,
                               const uint8_t *src1,
                               const uint8_t *src2,
                               const uint32_t stride,
                               const uint32_t rounding,
                               const uint32_t height)
{
    uint32_t i;
    vector unsigned char t;
    vector unsigned char mask;
    vector unsigned char mask_stencil;
    vector unsigned char zerovec;
    vector signed short s1, s2;
    vector signed short d;
    vector signed short round;

#ifdef DEBUG
    /* If this is on, print alignment errors */
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_avg2_altivec_c:incorrect align, dst: %p\n", (void*)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_avg2_altivec_c:incorrect stride, stride: %u\n", stride);
    if(rounding > (32767 + 2))
        fprintf(stderr, "interpolate8x8_avg2_altivec_c:incorrect rounding, rounding: %u\n", rounding);
#endif

    /* initialisation */
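    /* The scalar 1 - rounding is written into element 0 through a type pun
     * and vec_splat() then copies it across all eight lanes, so each pixel
     * pair below is averaged as (src1 + src2 + 1 - rounding) >> 1. */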
    zerovec = vec_splat_u8(0);
    *((short*)&round) = 1 - rounding;
    round = vec_splat(round, 0);
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    for(i = 0; i < height; i++) {

        t = vec_perm(vec_ld(0, src1), vec_ld(16, src1), vec_lvsl(0, src1));
        d = vec_add((vector signed short)zerovec, round);
        s1 = (vector signed short)vec_mergeh(zerovec, t);

        t = vec_perm(vec_ld(0, src2), vec_ld(16, src2), vec_lvsl(0, src2));
        d = vec_add(d, s1);
        s2 = (vector signed short)vec_mergeh(zerovec, t);

        d = vec_add(d, s2);
        d = vec_sr(d, vec_splat_u16(1));

        t = vec_pack((vector unsigned short)d, (vector unsigned short)zerovec);
        mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst));
        t = vec_perm(t, t, vec_lvsl(0, dst));
        t = vec_sel(t, vec_ld(0, dst), mask);
        vec_st(t, 0, dst);

        dst += stride;
        src1 += stride;
        src2 += stride;
    }
}


#define INTERPOLATE8X8_AVG4() \
    d = r; \
    \
    t = vec_perm(vec_ld(0, src1), vec_ld(16, src1), vec_lvsl(0, src1)); \
    s = (vector signed short)vec_mergeh(zerovec, t); \
    d = vec_add(d, s); \
    \
    t = vec_perm(vec_ld(0, src2), vec_ld(16, src2), vec_lvsl(0, src2)); \
    s = (vector signed short)vec_mergeh(zerovec, t); \
    d = vec_add(d, s); \
    \
    t = vec_perm(vec_ld(0, src3), vec_ld(16, src3), vec_lvsl(0, src3)); \
    s = (vector signed short)vec_mergeh(zerovec, t); \
    d = vec_add(d, s); \
    \
    t = vec_perm(vec_ld(0, src4), vec_ld(16, src4), vec_lvsl(0, src4)); \
    s = (vector signed short)vec_mergeh(zerovec, t); \
    d = vec_add(d, s); \
    \
    d = vec_sr(d, shift); \
    \
    t = vec_pack((vector unsigned short)d, (vector unsigned short)zerovec); \
    mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst)); \
    t = vec_perm(t, t, vec_lvsl(0, dst)); \
    t = vec_sel(t, vec_ld(0, dst), mask); \
    vec_st(t, 0, dst); \
    \
    dst += stride; \
    src1 += stride; \
    src2 += stride; \
    src3 += stride; \
    src4 += stride
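
/* Scalar equivalent of one row (sketch, not part of the original source):
 *
 *   for (x = 0; x < 8; x++)
 *       dst[x] = (src1[x] + src2[x] + src3[x] + src4[x]
 *                 + 2 - rounding) >> 2;
 */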

/* This function assumes:
 *  dst is 8 byte aligned
 *  src1, src2, src3, src4 are unaligned
 *  stride is a multiple of 8
 */

void
interpolate8x8_avg4_altivec_c(uint8_t *dst,
                              const uint8_t *src1, const uint8_t *src2,
                              const uint8_t *src3, const uint8_t *src4,
                              const uint32_t stride, const uint32_t rounding)
{
    vector signed short r;
    register vector signed short s, d;
    register vector unsigned short shift;
    register vector unsigned char t;
    register vector unsigned char zerovec;
    register vector unsigned char mask;
    register vector unsigned char mask_stencil;

#ifdef DEBUG
    /* if debug is set, print alignment errors */
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_avg4_altivec_c:incorrect align, dst: %p\n", (void*)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_avg4_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

    /* Initialization */
    zerovec = vec_splat_u8(0);
    *((short*)&r) = 2 - rounding;
    r = vec_splat(r, 0);
    shift = vec_splat_u16(2);
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    /* interpolate */
    INTERPOLATE8X8_AVG4();
    INTERPOLATE8X8_AVG4();
    INTERPOLATE8X8_AVG4();
    INTERPOLATE8X8_AVG4();

    INTERPOLATE8X8_AVG4();
    INTERPOLATE8X8_AVG4();
    INTERPOLATE8X8_AVG4();
    INTERPOLATE8X8_AVG4();
}

/*
 * This function assumes:
 *  dst is 8 byte aligned
 *  src is unaligned
 *  stride is a multiple of 8
 *  rounding is ignored
 */
void
interpolate8x8_halfpel_add_altivec_c(uint8_t *dst, const uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
    interpolate8x8_avg2_altivec_c(dst, dst, src, stride, 0, 8);
}
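
/* "Add" here means folding the prediction into the existing destination
 * block: dst = (dst + src + 1) >> 1 per pixel. That is exactly a two-source
 * average of dst and src, which is why interpolate8x8_avg2_altivec_c() is
 * reused above with rounding forced to 0 (so its bias term becomes +1). */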

#define INTERPOLATE8X8_HALFPEL_H_ADD_ROUND() \
    mask_dst = vec_lvsl(0,dst); \
    s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src)); \
    d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst); \
    \
    s2 = vec_perm(s1,s1,rot1); \
    tmp = vec_avg(s1,s2); \
    s1 = vec_sub(tmp,vec_and(vec_xor(s1,s2),one)); \
    \
    d = vec_avg(s1,d); \
    \
    mask = vec_perm(mask_stencil, mask_stencil, mask_dst); \
    d = vec_perm(d,d,mask_dst); \
    d = vec_sel(d,vec_ld(0,dst),mask); \
    vec_st(d,0,dst); \
    \
    dst += stride; \
    src += stride
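
/* The low bit of a + b equals the low bit of a ^ b, so
 * vec_avg(a,b) - ((a ^ b) & 1) yields the floor average (a + b) >> 1
 * without widening to 16 bit; the result is then vec_avg()'d with the
 * destination to fold the prediction into the existing block. */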

#define INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND() \
    mask_dst = vec_lvsl(0,dst); \
    s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src)); \
    d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst); \
    \
    s1 = vec_avg(s1, vec_perm(s1,s1,rot1)); \
    d = vec_avg(s1,d); \
    \
    mask = vec_perm(mask_stencil,mask_stencil,mask_dst); \
    d = vec_perm(d,d,mask_dst); \
    d = vec_sel(d,vec_ld(0,dst),mask); \
    vec_st(d,0,dst); \
    \
    dst += stride; \
    src += stride

/*
 * This function assumes:
 *  dst is 8 byte aligned
 *  src is unaligned
 *  stride is a multiple of 8
 */
void
interpolate8x8_halfpel_h_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
    register vector unsigned char s1,s2;
    register vector unsigned char d;
    register vector unsigned char tmp;

    register vector unsigned char mask_dst;
    register vector unsigned char one;
    register vector unsigned char rot1;

    register vector unsigned char mask_stencil;
    register vector unsigned char mask;

#ifdef DEBUG
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_h_add_altivec_c:incorrect align, dst: %p\n", (void*)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_h_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

    /* initialization */
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
    one = vec_splat_u8(1);
    rot1 = vec_lvsl(1,(unsigned char*)0);

    if(rounding) {
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();

        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
    }
    else {

        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();

        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
    }
}


#define INTERPOLATE8X8_HALFPEL_V_ADD_ROUND() \
    src += stride; \
    mask_dst = vec_lvsl(0,dst); \
    s2 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src)); \
    d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst); \
    \
    tmp = vec_avg(s1,s2); \
    s1 = vec_sub(tmp,vec_and(vec_xor(s1,s2),vec_splat_u8(1))); \
    d = vec_avg(s1,d); \
    \
    mask = vec_perm(mask_stencil,mask_stencil,mask_dst); \
    d = vec_perm(d,d,mask_dst); \
    d = vec_sel(d,vec_ld(0,dst),mask); \
    vec_st(d,0,dst); \
    \
    s1 = s2; \
    \
    dst += stride
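
/* The vertical variants are software pipelined: s1 carries the row loaded
 * by the previous iteration, each step loads only the next row into s2 and
 * finishes with s1 = s2, so every source row is read just once. */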

#define INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND() \
    src += stride; \
    mask_dst = vec_lvsl(0,dst); \
    s2 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src)); \
    d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst); \
    \
    s1 = vec_avg(s1,s2); \
    d = vec_avg(s1,d); \
    \
    mask = vec_perm(mask_stencil,mask_stencil,mask_dst); \
    d = vec_perm(d,d,mask_dst); \
    d = vec_sel(d,vec_ld(0,dst),mask); \
    vec_st(d,0,dst); \
    \
    s1 = s2; \
    dst += stride

/*
 * This function assumes:
 *  dst: 8 byte aligned
 *  src: unaligned
 *  stride is a multiple of 8
 */

void
interpolate8x8_halfpel_v_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
    register vector unsigned char s1,s2;
    register vector unsigned char tmp;
    register vector unsigned char d;

    register vector unsigned char mask;
    register vector unsigned char mask_dst;
    register vector unsigned char mask_stencil;

#ifdef DEBUG
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_v_add_altivec_c:incorrect align, dst: %p\n", (void*)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_v_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

    /* initialization */
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    if(rounding) {

        /* Interpolate vertical with rounding */
        s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));

        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();

        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
    }
    else {
        /* Interpolate vertical without rounding */
        s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));

        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();

        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
    }
}


#define INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND() \
    src += stride; \
    mask_dst = vec_lvsl(0,dst); \
    c10 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src)); \
    d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst); \
    c11 = vec_perm(c10,c10,rot1); \
    \
    s00 = (vector unsigned short)vec_mergeh(zero,c00); \
    s01 = (vector unsigned short)vec_mergeh(zero,c01); \
    s10 = (vector unsigned short)vec_mergeh(zero,c10); \
    s11 = (vector unsigned short)vec_mergeh(zero,c11); \
    \
    s00 = vec_add(s00,s10); \
    s01 = vec_add(s01,s11); \
    s00 = vec_add(s00,s01); \
    s00 = vec_add(s00,one); \
    \
    s00 = vec_sr(s00,two); \
    s00 = vec_add(s00, (vector unsigned short)vec_mergeh(zero,d)); \
    s00 = vec_sr(s00,one); \
    \
    d = vec_pack(s00,s00); \
    mask = vec_perm(mask_stencil,mask_stencil,mask_dst); \
    d = vec_sel(d,vec_ld(0,dst),mask); \
    vec_st(d,0,dst); \
    \
    c00 = c10; \
    c01 = c11; \
    dst += stride
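
/* In the rounding case the four-pixel sum is biased by 1 and shifted by 2
 * to form the halfpel value, then added to the destination pixel and
 * shifted once more, i.e. dst = (dst + ((sum + 1) >> 2)) >> 1, all in
 * 16-bit lanes. The previous row (c00/c01) is carried between iterations. */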

#define INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND() \
    src += stride; \
    mask_dst = vec_lvsl(0,dst); \
    c10 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src)); \
    d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst); \
    c11 = vec_perm(c10,c10,rot1); \
    \
    s00 = (vector unsigned short)vec_mergeh(zero,c00); \
    s01 = (vector unsigned short)vec_mergeh(zero,c01); \
    s10 = (vector unsigned short)vec_mergeh(zero,c10); \
    s11 = (vector unsigned short)vec_mergeh(zero,c11); \
    \
    s00 = vec_add(s00,s10); \
    s01 = vec_add(s01,s11); \
    s00 = vec_add(s00,s01); \
    s00 = vec_add(s00,two); \
    s00 = vec_sr(s00,two); \
    \
    c00 = vec_pack(s00,s00); \
    d = vec_avg(d,c00); \
    \
    mask = vec_perm(mask_stencil,mask_stencil,mask_dst); \
    d = vec_perm(d,d,mask_dst); \
    d = vec_sel(d,vec_ld(0,dst),mask); \
    vec_st(d,0,dst); \
    \
    c00 = c10; \
    c01 = c11; \
    dst += stride
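
/* Without rounding the halfpel value is (sum + 2) >> 2; it is packed back
 * to bytes and combined with the destination through vec_avg(), giving
 * dst = (dst + halfpel + 1) >> 1. */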

/*
 * This function assumes:
 *  dst: 8 byte aligned
 *  src: unaligned
 *  stride: multiple of 8
 */

void
interpolate8x8_halfpel_hv_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
    register vector unsigned char c00,c10,c01,c11;
    register vector unsigned short s00,s10,s01,s11;
    register vector unsigned char d;

    register vector unsigned char mask;
    register vector unsigned char mask_stencil;

    register vector unsigned char rot1;
    register vector unsigned char mask_dst;
    register vector unsigned char zero;
    register vector unsigned short one,two;

#ifdef DEBUG
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_hv_add_altivec_c:incorrect align, dst: %p\n", (void*)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_hv_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

    /* initialization */
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
    rot1 = vec_lvsl(1,(unsigned char*)0);
    zero = vec_splat_u8(0);
    one = vec_splat_u16(1);
    two = vec_splat_u16(2);

    if(rounding) {

        /* Load the first row 'manually' */
        c00 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));
        c01 = vec_perm(c00,c00,rot1);

        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();

        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
    }
    else {

        /* Load the first row 'manually' */
        c00 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));
        c01 = vec_perm(c00,c00,rot1);

        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();

        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
    }
}