 * along with this program ; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * $Id$
 *
 ****************************************************************************/

/* ... */
#ifdef DEBUG
    /* Dump alignment errors if DEBUG is defined */
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_h_altivec_c:incorrect align, dst: %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_h_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

/* ... */
#ifdef DEBUG
    /* if this is on, print alignment errors */
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_v_altivec_c:incorrect align, dst: %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_v_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

/* ... */
    s3 = (vector unsigned short)vec_mergeh(zerovec, t); \
    t = vec_perm(vec_ld(1, src + stride), vec_ld(17, src + stride), vec_lvsl(1, src + stride)); \
    s4 = (vector unsigned short)vec_mergeh(zerovec, t); \
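    /* sum the four source samples pairwise (a balanced add tree), */ \
    /* add the rounding term, then divide by four                  */ \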
    s1 = vec_add(s1, s2); \
    s3 = vec_add(s3, s4); \
    s1 = vec_add(s1, s3); \
    s1 = vec_add(s1, adding); \
    s1 = vec_sr(s1, two); \
    t = vec_pack(s1, s1); \

/* ... */
#ifdef DEBUG
    /* If this is on, print alignment errors */
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_avg2_altivec_c:incorrect align, dst: %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_avg2_altivec_c:incorrect stride, stride: %u\n", stride);
    if(rounding > (32767 + 2))

/* ... */
#ifdef DEBUG
    /* if debug is set, print alignment errors */
    if(((unsigned)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_avg4_altivec_c:incorrect align, dst: %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_avg4_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

/* ... */
    INTERPOLATE8X8_AVG4();
}

/*
 * This function assumes:
 *   dst is 8 byte aligned
 *   src is unaligned
 *   stride is a multiple of 8
 *   rounding is ignored
 */
void
interpolate8x8_halfpel_add_altivec_c(uint8_t *dst, const uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
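    /* Reuse avg2 to average the existing prediction in dst with src.  */
    /* Assuming avg2 follows the usual (a + b + 1 - rounding) >> 1     */
    /* convention, passing 0 gives dst[i] = (dst[i] + src[i] + 1) >> 1 */
    /* over the 8x8 block.                                             */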
    interpolate8x8_avg2_altivec_c(dst, dst, src, stride, 0, 8);
}

#define INTERPOLATE8X8_HALFPEL_H_ADD_ROUND() \
    mask_dst = vec_lvsl(0, dst); \
    s1 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
    d = vec_perm(vec_ld(0, dst), vec_ld(16, dst), mask_dst); \
    \
    s2 = vec_perm(s1, s1, rot1); \
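    /* vec_avg() rounds up: (a + b + 1) >> 1. Subtracting (a ^ b) & 1 */ \
    /* converts it into the round-down average (a + b) >> 1 that the  */ \
    /* rounding == 1 halfpel case requires.                           */ \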
    tmp = vec_avg(s1, s2); \
    s1 = vec_xor(s1, s2); \
    s1 = vec_sub(tmp, vec_and(s1, one)); \
    \
    d = vec_avg(s1, d); \
    \
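    /* Store only the 8 block bytes: mask_stencil, rotated to the dst */ \
    /* alignment, makes vec_sel keep the untouched half of the        */ \
    /* 16-byte vector that vec_st writes back.                        */ \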
    mask = vec_perm(mask_stencil, mask_stencil, mask_dst); \
    d = vec_perm(d, d, mask_dst); \
    d = vec_sel(d, vec_ld(0, dst), mask); \
    vec_st(d, 0, dst); \
    \
    dst += stride; \
    src += stride

#define INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND() \
    mask_dst = vec_lvsl(0, dst); \
    s1 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
    d = vec_perm(vec_ld(0, dst), vec_ld(16, dst), mask_dst); \
    \
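    /* No correction needed here: vec_avg's (a + b + 1) >> 1 is */ \
    /* exactly the rounding == 0 halfpel average.               */ \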
    s1 = vec_avg(s1, vec_perm(s1, s1, rot1)); \
    d = vec_avg(s1, d); \
    \
    mask = vec_perm(mask_stencil, mask_stencil, mask_dst); \
    d = vec_perm(d, d, mask_dst); \
    d = vec_sel(d, vec_ld(0, dst), mask); \
    vec_st(d, 0, dst); \
    \
    dst += stride; \
    src += stride

441 |
/* This function assumes: |
/* |
442 |
|
* This function assumes: |
443 |
* dst is 8 byte aligned |
* dst is 8 byte aligned |
444 |
* src is unaligned |
* src is unaligned |
445 |
* stride is a muliple of 8 |
* stride is a multiple of 8 |
446 |
|
*/ |
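
/*
 * A scalar sketch of what one row of this function computes (illustration
 * only, assuming vec_avg(a,b) == (a + b + 1) >> 1):
 *
 *   for (x = 0; x < 8; x++) {
 *       int p = rounding ? (src[x] + src[x + 1]) >> 1
 *                        : (src[x] + src[x + 1] + 1) >> 1;
 *       dst[x] = (dst[x] + p + 1) >> 1;
 *   }
 */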
void
interpolate8x8_halfpel_h_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
    register vector unsigned char s1, s2;
    register vector unsigned char d;
    register vector unsigned char tmp;

    register vector unsigned char mask_dst;
    register vector unsigned char one;
    register vector unsigned char rot1;

    register vector unsigned char mask_stencil;
    register vector unsigned char mask;

#ifdef DEBUG
    if(((unsigned)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_h_add_altivec_c:incorrect align, dst: %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_h_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

    /* initialization */
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
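    /* 8 bytes of 0x00 followed by 8 bytes of 0xff: the vec_sel mask */
    /* the macros use to leave half of each 16-byte store untouched  */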
    one = vec_splat_u8(1);
    rot1 = vec_lvsl(1, (unsigned char*)0);

    if(rounding) {
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();

        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
    }
    else {
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();

        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
    }
}

#define INTERPOLATE8X8_HALFPEL_V_ADD_ROUND() \
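    /* s1 already holds the previous source row (loaded by the caller */ \
    /* or by the previous invocation), so each row is read only once  */ \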
    src += stride; \
    mask_dst = vec_lvsl(0, dst); \
    s2 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
    d = vec_perm(vec_ld(0, dst), vec_ld(16, dst), mask_dst); \
    \
    tmp = vec_avg(s1, s2); \
    s1 = vec_xor(s1, s2); \
    s1 = vec_sub(tmp, vec_and(s1, vec_splat_u8(1))); \
    d = vec_avg(s1, d); \
    \
    mask = vec_perm(mask_stencil, mask_stencil, mask_dst); \
    d = vec_perm(d, d, mask_dst); \
    d = vec_sel(d, vec_ld(0, dst), mask); \
    vec_st(d, 0, dst); \
    \
    s1 = s2; \
    \
    dst += stride

|
#define INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND()\ |
522 |
|
src += stride;\ |
523 |
|
mask_dst = vec_lvsl(0,dst);\ |
524 |
|
s2 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));\ |
525 |
|
d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst);\ |
526 |
|
\ |
527 |
|
s1 = vec_avg(s1,s2);\ |
528 |
|
d = vec_avg(s1,d);\ |
529 |
|
\ |
530 |
|
mask = vec_perm(mask_stencil,mask_stencil,mask_dst);\ |
531 |
|
d = vec_perm(d,d,mask_dst);\ |
532 |
|
d = vec_sel(d,vec_ld(0,dst),mask);\ |
533 |
|
vec_st(d,0,dst);\ |
534 |
|
\ |
535 |
|
s1 = s2;\ |
536 |
|
dst += stride |
537 |
|
|
/*
 * This function assumes:
 *   dst: 8 byte aligned
 *   src: unaligned
 *   stride is a multiple of 8
 */

void
interpolate8x8_halfpel_v_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
    register vector unsigned char s1, s2;
    register vector unsigned char tmp;
    register vector unsigned char d;

    register vector unsigned char mask;
    register vector unsigned char mask_dst;
    register vector unsigned char mask_stencil;

#ifdef DEBUG
    if(((unsigned)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_v_add_altivec_c:incorrect align, dst: %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_v_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

    /* initialization */
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    if(rounding) {

        /* Interpolate vertical with rounding */
        s1 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src));

        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();

        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
    }
    else {
        /* Interpolate vertical without rounding */
        s1 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src));

        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();

        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
    }
}

#define INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND() \
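    /* c00/c01 hold the previous source row and its one-pixel-right  */ \
    /* neighbour (loaded by the caller or the previous invocation);  */ \
    /* only the current row (c10/c11) has to be loaded per iteration */ \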
    src += stride; \
    mask_dst = vec_lvsl(0, dst); \
    c10 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
    d = vec_perm(vec_ld(0, dst), vec_ld(16, dst), mask_dst); \
    c11 = vec_perm(c10, c10, rot1); \
    \
    s00 = (vector unsigned short)vec_mergeh(zero, c00); \
    s01 = (vector unsigned short)vec_mergeh(zero, c01); \
    s10 = (vector unsigned short)vec_mergeh(zero, c10); \
    s11 = (vector unsigned short)vec_mergeh(zero, c11); \
    \
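    /* 2x2 neighbourhood sum with rounding bias 1, then >> 2; the dst */ \
    /* add below uses add + shift, i.e. a round-down average          */ \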
    s00 = vec_add(s00, s10); \
    s01 = vec_add(s01, s11); \
    s00 = vec_add(s00, s01); \
    s00 = vec_add(s00, one); \
    \
    s00 = vec_sr(s00, two); \
    s00 = vec_add(s00, (vector unsigned short)vec_mergeh(zero, d)); \
    s00 = vec_sr(s00, one); \
    \
    d = vec_pack(s00, s00); \
    mask = vec_perm(mask_stencil, mask_stencil, mask_dst); \
    d = vec_sel(d, vec_ld(0, dst), mask); \
    vec_st(d, 0, dst); \
    \
    c00 = c10; \
    c01 = c11; \
    dst += stride

#define INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND() \
    src += stride; \
    mask_dst = vec_lvsl(0, dst); \
    c10 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
    d = vec_perm(vec_ld(0, dst), vec_ld(16, dst), mask_dst); \
    c11 = vec_perm(c10, c10, rot1); \
    \
    s00 = (vector unsigned short)vec_mergeh(zero, c00); \
    s01 = (vector unsigned short)vec_mergeh(zero, c01); \
    s10 = (vector unsigned short)vec_mergeh(zero, c10); \
    s11 = (vector unsigned short)vec_mergeh(zero, c11); \
    \
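    /* 2x2 neighbourhood sum with bias 2, then >> 2; the dst add */ \
    /* below uses vec_avg, i.e. a round-up average               */ \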
    s00 = vec_add(s00, s10); \
    s01 = vec_add(s01, s11); \
    s00 = vec_add(s00, s01); \
    s00 = vec_add(s00, two); \
    s00 = vec_sr(s00, two); \
    \
    c00 = vec_pack(s00, s00); \
    d = vec_avg(d, c00); \
    \
    mask = vec_perm(mask_stencil, mask_stencil, mask_dst); \
    d = vec_perm(d, d, mask_dst); \
    d = vec_sel(d, vec_ld(0, dst), mask); \
    vec_st(d, 0, dst); \
    \
    c00 = c10; \
    c01 = c11; \
    dst += stride

/*
 * This function assumes:
 *   dst: 8 byte aligned
 *   src: unaligned
 *   stride: multiple of 8
 */

void
interpolate8x8_halfpel_hv_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
    register vector unsigned char c00, c10, c01, c11;
    register vector unsigned short s00, s10, s01, s11;
    register vector unsigned char d;

    register vector unsigned char mask;
    register vector unsigned char mask_stencil;

    register vector unsigned char rot1;
    register vector unsigned char mask_dst;
    register vector unsigned char zero;
    register vector unsigned short one, two;

#ifdef DEBUG
    if(((unsigned)dst) & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_hv_add_altivec_c:incorrect align, dst: %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "interpolate8x8_halfpel_hv_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

    /* initialization */
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
    rot1 = vec_lvsl(1, (unsigned char*)0);
    zero = vec_splat_u8(0);
    one = vec_splat_u16(1);
    two = vec_splat_u16(2);

    if(rounding) {

        /* Load the first row 'manually' */
        c00 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src));
        c01 = vec_perm(c00, c00, rot1);

        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();

        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
    }
    else {

        /* Load the first row 'manually' */
        c00 = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src));
        c01 = vec_perm(c00, c00, rot1);

        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();

        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
        INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
    }
}