19 |
* along with this program ; if not, write to the Free Software |
* along with this program ; if not, write to the Free Software |
20 |
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
* |
* |
22 |
* $Id: gmc.c,v 1.3 2004-04-02 21:29:21 edgomez Exp $ |
* $Id$ |
23 |
* |
* |
24 |
****************************************************************************/ |
****************************************************************************/ |
25 |
|
|
27 |
#include "../global.h" |
#include "../global.h" |
28 |
#include "../encoder.h" |
#include "../encoder.h" |
29 |
#include "gmc.h" |
#include "gmc.h" |
30 |
|
#include "../utils/emms.h" |
31 |
|
|
32 |
#include <stdio.h> |
#include <stdio.h> |
33 |
|
|
34 |
|
/* initialized by init_GMC(), for 3points */

/* Runtime-dispatched 16x16 luma GMC predictor: set by init_GMC() to the
 * C reference implementation or the MMX-era SIMD path. */
static
void (*Predict_16x16_func)(const NEW_GMC_DATA * const This,
						   uint8_t *dst, const uint8_t *src,
						   int dststride, int srcstride, int x, int y, int rounding) = 0;

/* Runtime-dispatched 8x8 chroma GMC predictor: fills both U and V
 * destination blocks in one call (they share the same warp offsets). */
static
void (*Predict_8x8_func)(const NEW_GMC_DATA * const This,
						 uint8_t *uDst, const uint8_t *uSrc,
						 uint8_t *vDst, const uint8_t *vSrc,
						 int dststride, int srcstride, int x, int y, int rounding) = 0;
44 |
|
|
45 |
|
/****************************************************************************/
/* this is borrowed from bitstream.c until we find a common solution */

/* Returns the number of significant bits of 'value', i.e.
 * floor(log2(value)) + 1; the C path returns 0 for value == 0.
 * NOTE(review): the MSVC inline-asm path uses BSR, whose result is
 * undefined when the operand is 0 -- presumably callers never pass 0;
 * confirm against the call sites. */
static uint32_t __inline
log2bin(uint32_t value)
{
/* Changed by Chenm001 */
#if !defined(_MSC_VER) || defined(ARCH_IS_X86_64)
	int n = 0;

	/* shift out bits until none remain, counting them */
	while (value) {
		value >>= 1;
		n++;
	}
	return n;
#else
	/* BSR puts the index of the highest set bit in eax; INC turns that
	 * into a bit count.  The function result is returned implicitly
	 * through eax (MSVC convention for __asm functions). */
	__asm {
		bsr eax, value
		inc eax
	}
#endif
}
66 |
|
|
67 |
|
/* 16*sizeof(int) -> 1 or 2 cachelines */
/* table lookup might be faster! (still to be benchmarked) */

/*
static int log2bin_table[16] =
	{ 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4};
*/
/*    1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 */

/* Integer division of a by b, rounding the quotient away from zero. */
#define RDIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b))
/* Arithmetic shift-right of a by b bits with round-to-nearest. */
#define RSHIFT(a,b) ( (a)>0 ? ((a) + (1<<((b)-1)))>>(b) : ((a) + (1<<((b)-1))-1)>>(b))

/* MTab[i] packs the complementary bilinear weights (16-i, i) into one
 * 32-bit word ((16-i)<<16 | i), so a single multiply applies both
 * weights to a pair of pixels packed in the 16-bit halves of a word. */
#define MLT(i) (((16-(i))<<16) + (i))
static const uint32_t MTab[16] = {
	MLT( 0), MLT( 1), MLT( 2), MLT( 3), MLT( 4), MLT( 5), MLT( 6), MLT( 7),
	MLT( 8), MLT( 9), MLT(10), MLT(11), MLT(12), MLT(13), MLT(14), MLT(15)
};
#undef MLT
85 |
|
|
86 |
/* ************************************************************ |
/* ************************************************************ |
87 |
* Pts = 2 or 3 |
* Pts = 2 or 3 |
88 |
* |
* |
91 |
 * Conversely, *dst is the macroblock top-left address. |
 * Conversely, *dst is the macroblock top-left address. |
92 |
*/ |
*/ |
93 |
|
|
94 |
|
static |
95 |
void Predict_16x16_C(const NEW_GMC_DATA * const This, |
void Predict_16x16_C(const NEW_GMC_DATA * const This, |
96 |
uint8_t *dst, const uint8_t *src, |
uint8_t *dst, const uint8_t *src, |
97 |
int dststride, int srcstride, int x, int y, int rounding) |
int dststride, int srcstride, int x, int y, int rounding) |
151 |
} |
} |
152 |
} |
} |
153 |
|
|
154 |
|
static |
155 |
void Predict_8x8_C(const NEW_GMC_DATA * const This, |
void Predict_8x8_C(const NEW_GMC_DATA * const This, |
156 |
uint8_t *uDst, const uint8_t *uSrc, |
uint8_t *uDst, const uint8_t *uSrc, |
157 |
uint8_t *vDst, const uint8_t *vSrc, |
uint8_t *vDst, const uint8_t *vSrc, |
231 |
} |
} |
232 |
} |
} |
233 |
|
|
234 |
|
static |
235 |
void get_average_mv_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv, |
void get_average_mv_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv, |
236 |
int x, int y, int qpel) |
int x, int y, int qpel) |
237 |
{ |
{ |
262 |
* simplified version for 1 warp point |
* simplified version for 1 warp point |
263 |
*/ |
*/ |
264 |
|
|
265 |
|
static |
266 |
void Predict_1pt_16x16_C(const NEW_GMC_DATA * const This, |
void Predict_1pt_16x16_C(const NEW_GMC_DATA * const This, |
267 |
uint8_t *Dst, const uint8_t *Src, |
uint8_t *Dst, const uint8_t *Src, |
268 |
int dststride, int srcstride, int x, int y, int rounding) |
int dststride, int srcstride, int x, int y, int rounding) |
280 |
int i, j; |
int i, j; |
281 |
|
|
282 |
int32_t Offset; |
int32_t Offset; |
283 |
if (vo>=(-16*4) && vo<=H) Offset = (vo>>4)*srcstride; |
if (vo>=(-16<<4) && vo<=H) Offset = (vo>>4)*srcstride; |
284 |
else { |
else { |
285 |
if (vo>H) Offset = ( H>>4)*srcstride; |
if (vo>H) Offset = ( H>>4)*srcstride; |
286 |
else Offset =-16*srcstride; |
else Offset =-16*srcstride; |
287 |
rj = MTab[0]; |
rj = MTab[0]; |
288 |
} |
} |
289 |
if (uo>=(-16*4) && uo<=W) Offset += (uo>>4); |
if (uo>=(-16<<4) && uo<=W) Offset += (uo>>4); |
290 |
else { |
else { |
291 |
if (uo>W) Offset += (W>>4); |
if (uo>W) Offset += (W>>4); |
292 |
else Offset -= 16; |
else Offset -= 16; |
314 |
} |
} |
315 |
} |
} |
316 |
|
|
317 |
|
static |
318 |
void Predict_1pt_8x8_C(const NEW_GMC_DATA * const This, |
void Predict_1pt_8x8_C(const NEW_GMC_DATA * const This, |
319 |
uint8_t *uDst, const uint8_t *uSrc, |
uint8_t *uDst, const uint8_t *uSrc, |
320 |
uint8_t *vDst, const uint8_t *vSrc, |
uint8_t *vDst, const uint8_t *vSrc, |
332 |
int i, j; |
int i, j; |
333 |
|
|
334 |
int32_t Offset; |
int32_t Offset; |
335 |
if (vo>=(-8*4) && vo<=H) Offset = (vo>>4)*srcstride; |
if (vo>=(-8<<4) && vo<=H) Offset = (vo>>4)*srcstride; |
336 |
else { |
else { |
337 |
if (vo>H) Offset = ( H>>4)*srcstride; |
if (vo>H) Offset = ( H>>4)*srcstride; |
338 |
else Offset =-8*srcstride; |
else Offset =-8*srcstride; |
339 |
rrj = MTab[0]; |
rrj = MTab[0]; |
340 |
} |
} |
341 |
if (uo>=(-8*4) && uo<=W) Offset += (uo>>4); |
if (uo>=(-8<<4) && uo<=W) Offset += (uo>>4); |
342 |
else { |
else { |
343 |
if (uo>W) Offset += ( W>>4); |
if (uo>W) Offset += ( W>>4); |
344 |
else Offset -= 8; |
else Offset -= 8; |
377 |
} |
} |
378 |
} |
} |
379 |
|
|
380 |
|
static |
381 |
void get_average_mv_1pt_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv, |
void get_average_mv_1pt_C(const NEW_GMC_DATA * const Dsp, VECTOR * const mv, |
382 |
int x, int y, int qpel) |
int x, int y, int qpel) |
383 |
{ |
{ |
385 |
mv->y = RSHIFT(Dsp->Vo<<qpel, 3); |
mv->y = RSHIFT(Dsp->Vo<<qpel, 3); |
386 |
} |
} |
387 |
|
|
388 |
|
#if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64) |
389 |
|
/* *************************************************************
 * MMX core function
 */

/* Runtime-selected "linear batch" kernel: bilinearly filters 8 output
 * pixels whose source positions advance uniformly by one pel per pixel.
 * init_GMC() points this at the MMX, SSE2 or SSE4.1 variant. */
static
void (*GMC_Core_Lin_8)(uint8_t *Dst, const uint16_t * Offsets,
					   const uint8_t * const Src0, const int BpS, const int Rounder) = 0;

extern void xvid_GMC_Core_Lin_8_mmx(uint8_t *Dst, const uint16_t * Offsets,
									const uint8_t * const Src0, const int BpS, const int Rounder);

extern void xvid_GMC_Core_Lin_8_sse2(uint8_t *Dst, const uint16_t * Offsets,
									 const uint8_t * const Src0, const int BpS, const int Rounder);

extern void xvid_GMC_Core_Lin_8_sse41(uint8_t *Dst, const uint16_t * Offsets,
									  const uint8_t * const Src0, const int BpS, const int Rounder);

/* *************************************************************/
407 |
|
|
408 |
|
/* Generic (non-batched) core: bilinearly interpolates 8 output pixels.
 * Offsets[i] / Offsets[i+16] hold pixel i's source coordinates u / v in
 * 1/16-pel fixed point: the high bits (>>4) give the integer pel
 * position, the low 4 bits (&0x0f) index MTab for the blend weights.
 * Rounder is pre-shifted so it lands in the final >>24 normalization. */
static void GMC_Core_Non_Lin_8(uint8_t *Dst,
							   const uint16_t * Offsets,
							   const uint8_t * const Src0, const int srcstride,
							   const int Rounder)
{
	int i;
	for(i=0; i<8; ++i)
	{
		uint32_t u = Offsets[i	];
		uint32_t v = Offsets[i+16];
		/* packed weight pairs: (16-frac)<<16 | frac (see MTab) */
		const uint32_t ri = MTab[u&0x0f];
		const uint32_t rj = MTab[v&0x0f];
		uint32_t f0, f1;
		const uint8_t * const Src = Src0 + (u>>4) + (v>>4)*srcstride;
		/* pack the two neighbours of each source row into the low and
		 * high 16-bit halves of one word, so one multiply blends both */
		f0	= Src[0];
		f0 |= Src[1] << 16;
		f1	= Src[srcstride +0];
		f1 |= Src[srcstride +1] << 16;
		f0 = (ri*f0)>>16;			/* horizontal blend, top row -> low half */
		f1 = (ri*f1) & 0x0fff0000;	/* horizontal blend, bottom row -> high half */
		f0 |= f1;					/* repack both rows into one word */
		f0 = ( rj*f0 + Rounder ) >> 24;	/* vertical blend + rounding */
		Dst[i] = (uint8_t)f0;
	}
}
433 |
|
|
434 |
|
//////////////////////////////////////////////////////////

/* 16x16 luma GMC prediction, SIMD-assisted path.
 * For each of the 16 output rows it computes the 16 warped source
 * coordinates (u stored in Offsets[0..15], v in Offsets[16..31], in
 * 1/16-pel fixed point after the <<rho rescale), then emits the row
 * 8 pixels at a time: the fast GMC_Core_Lin_8 kernel when the integer
 * pel position advances by exactly 1 per pixel, the generic
 * GMC_Core_Non_Lin_8 otherwise. */
static
void Predict_16x16_mmx(const NEW_GMC_DATA * const This,
					   uint8_t *dst, const uint8_t *src,
					   int dststride, int srcstride, int x, int y, int rounding)
{
	const int W	= This->sW;
	const int H	= This->sH;
	const int rho = 3 - This->accuracy;	/* precision left to strip from 16.16 coords */
	const int Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
	/* frame bounds rescaled into the 16.16 coordinate domain, for the
	 * unsigned 0<=coord<bound range test below */
	const uint32_t W2 = W<<(16-rho);
	const uint32_t H2 = H<<(16-rho);

	const int dUx = This->dU[0];
	const int dVx = This->dV[0];
	const int dUy = This->dU[1];
	const int dVy = This->dV[1];

	/* warped source coordinate of this macroblock's top-left pixel */
	int Uo = This->Uo + 16*(dUy*y + dUx*x);
	int Vo = This->Vo + 16*(dVy*y + dVx*x);

	int i, j;

	DECLARE_ALIGNED_MATRIX(Offsets, 2,16, uint16_t, CACHE_LINE);
	for(j=16; j>0; --j)
	{
		int32_t U = Uo, V = Vo;
		Uo += dUy; Vo += dVy;
		/* unsigned compare checks 0<=coord<bound for both row endpoints */
		if ( W2>(uint32_t)U && W2>(uint32_t)(U+15*dUx) &&
			 H2>(uint32_t)V && H2>(uint32_t)(V+15*dVx) )
		{
			uint32_t UV1, UV2;
			for(i=0; i<16; ++i)
			{
				uint32_t u = ( U >> 16 ) << rho;
				uint32_t v = ( V >> 16 ) << rho;
				U += dUx; V += dVx;
				Offsets[   i] = u;
				Offsets[16+i] = v;
			}
			// batch 8 input pixels when linearity says it's ok

			/* linear iff the integer-pel part (masked 0xfff0) advances by
			 * exactly one pel (16 units) per pixel across the 8-batch */
			UV1 = (Offsets[0] | (Offsets[16]<<16)) & 0xfff0fff0U;
			UV2 = (Offsets[7] | (Offsets[23]<<16)) & 0xfff0fff0U;
			if (UV1+7*16==UV2)
				GMC_Core_Lin_8(dst, Offsets, src + (Offsets[0]>>4) + (Offsets[16]>>4)*srcstride, srcstride, Rounder);
			else
				GMC_Core_Non_Lin_8(dst, Offsets, src, srcstride, Rounder);
			UV1 = (Offsets[ 8] | (Offsets[24]<<16)) & 0xfff0fff0U;
			UV2 = (Offsets[15] | (Offsets[31]<<16)) & 0xfff0fff0U;
			if (UV1+7*16==UV2)
				GMC_Core_Lin_8(dst+8, Offsets+8, src + (Offsets[8]>>4) + (Offsets[24]>>4)*srcstride, srcstride, Rounder);
			else
				GMC_Core_Non_Lin_8(dst+8, Offsets+8, src, srcstride, Rounder);
		}
		else
		{
			/* part of the row falls outside the frame: clamp each
			 * coordinate to [0,W] x [0,H] before filtering */
			for(i=0; i<16; ++i)
			{
				int u = ( U >> 16 ) << rho;
				int v = ( V >> 16 ) << rho;
				U += dUx; V += dVx;

				Offsets[   i] = (u<0) ? 0 : (u>=W) ? W : u;
				Offsets[16+i] = (v<0) ? 0 : (v>=H) ? H : v;
			}
			// due to boundary clipping, we cannot infer the 8-pixels batchability
			// simply by using the linearity. Oh well, not a big deal...
			GMC_Core_Non_Lin_8(dst,   Offsets,   src, srcstride, Rounder);
			GMC_Core_Non_Lin_8(dst+8, Offsets+8, src, srcstride, Rounder);
		}
		dst += dststride;
	}
}
509 |
|
|
510 |
|
/* 8x8 chroma GMC prediction, SIMD-assisted path: same structure as
 * Predict_16x16_mmx at half resolution, producing U and V planes from
 * one shared set of warp offsets.
 * NOTE(review): the in-bounds test uses U+15*dUx although a chroma row
 * is only 8 pixels wide (last pixel uses U+7*dUx).  This is merely
 * conservative -- it can only fall back to the clipping path more often
 * than strictly needed -- but confirm it is intentional. */
static
void Predict_8x8_mmx(const NEW_GMC_DATA * const This,
					 uint8_t *uDst, const uint8_t *uSrc,
					 uint8_t *vDst, const uint8_t *vSrc,
					 int dststride, int srcstride, int x, int y, int rounding)
{
	const int W = This->sW >> 1;	/* chroma plane dimensions */
	const int H = This->sH >> 1;
	const int rho = 3-This->accuracy;	/* precision left to strip from 16.16 coords */
	const int32_t Rounder = ( 128 - (rounding<<(2*rho)) ) << 16;
	/* plane bounds rescaled into the 16.16 coordinate domain */
	const uint32_t W2 = W<<(16-rho);
	const uint32_t H2 = H<<(16-rho);

	const int dUx = This->dU[0];
	const int dVx = This->dV[0];
	const int dUy = This->dU[1];
	const int dVy = This->dV[1];

	/* warped chroma coordinate of this block's top-left pixel */
	int Uo = This->Uco + 8*(dUy*y + dUx*x);
	int Vo = This->Vco + 8*(dVy*y + dVx*x);

	DECLARE_ALIGNED_MATRIX(Offsets, 2,16, uint16_t, CACHE_LINE);
	int i, j;
	for(j=8; j>0; --j)
	{
		int32_t U = Uo, V = Vo;
		Uo += dUy; Vo += dVy;
		/* unsigned compare checks 0<=coord<bound (see NOTE above re 15*dUx) */
		if ( W2>(uint32_t)U && W2>(uint32_t)(U+15*dUx) &&
			 H2>(uint32_t)V && H2>(uint32_t)(V+15*dVx) )
		{
			uint32_t UV1, UV2;
			for(i=0; i<8; ++i)
			{
				int32_t u = ( U >> 16 ) << rho;
				int32_t v = ( V >> 16 ) << rho;
				U += dUx; V += dVx;
				Offsets[   i] = u;
				Offsets[16+i] = v;
			}

			// batch 8 input pixels when linearity says it's ok
			/* linear iff the integer-pel part advances by one pel per pixel */
			UV1 = (Offsets[ 0] | (Offsets[16]<<16)) & 0xfff0fff0U;
			UV2 = (Offsets[ 7] | (Offsets[23]<<16)) & 0xfff0fff0U;
			if (UV1+7*16==UV2)
			{
				/* both planes share the same starting offset */
				const uint32_t Off = (Offsets[0]>>4) + (Offsets[16]>>4)*srcstride;
				GMC_Core_Lin_8(uDst, Offsets, uSrc+Off, srcstride, Rounder);
				GMC_Core_Lin_8(vDst, Offsets, vSrc+Off, srcstride, Rounder);
			}
			else {
				GMC_Core_Non_Lin_8(uDst, Offsets, uSrc, srcstride, Rounder);
				GMC_Core_Non_Lin_8(vDst, Offsets, vSrc, srcstride, Rounder);
			}
		}
		else
		{
			/* out-of-frame coordinates: clamp to [0,W] x [0,H] */
			for(i=0; i<8; ++i)
			{
				int u = ( U >> 16 ) << rho;
				int v = ( V >> 16 ) << rho;
				U += dUx; V += dVx;
				Offsets[   i] = (u<0) ? 0 : (u>=W) ? W : u;
				Offsets[16+i] = (v<0) ? 0 : (v>=H) ? H : v;
			}
			GMC_Core_Non_Lin_8(uDst, Offsets, uSrc, srcstride, Rounder);
			GMC_Core_Non_Lin_8(vDst, Offsets, vSrc, srcstride, Rounder);
		}
		uDst += dststride;
		vDst += dststride;
	}
}
581 |
|
|
582 |
|
#endif /* ARCH_IS_IA32 */ |
583 |
|
|
584 |
|
/* ************************************************************* |
585 |
|
* will initialize internal pointers |
586 |
|
*/ |
587 |
|
|
588 |
|
void init_GMC(const unsigned int cpu_flags) |
589 |
|
{ |
590 |
|
Predict_16x16_func = Predict_16x16_C; |
591 |
|
Predict_8x8_func = Predict_8x8_C; |
592 |
|
|
593 |
|
#if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64) |
594 |
|
if ((cpu_flags & XVID_CPU_MMX) || (cpu_flags & XVID_CPU_MMXEXT) || |
595 |
|
(cpu_flags & XVID_CPU_3DNOW) || (cpu_flags & XVID_CPU_3DNOWEXT) || |
596 |
|
(cpu_flags & XVID_CPU_SSE) || (cpu_flags & XVID_CPU_SSE2) || |
597 |
|
(cpu_flags & XVID_CPU_SSE3) || (cpu_flags & XVID_CPU_SSE41)) |
598 |
|
{ |
599 |
|
Predict_16x16_func = Predict_16x16_mmx; |
600 |
|
Predict_8x8_func = Predict_8x8_mmx; |
601 |
|
|
602 |
|
if (cpu_flags & XVID_CPU_SSE41) |
603 |
|
GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_sse41; |
604 |
|
else if (cpu_flags & XVID_CPU_SSE2) |
605 |
|
GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_sse2; |
606 |
|
else |
607 |
|
GMC_Core_Lin_8 = xvid_GMC_Core_Lin_8_mmx; |
608 |
|
} |
609 |
|
#endif |
610 |
|
} |
611 |
|
|
612 |
/* ************************************************************* |
/* ************************************************************* |
613 |
* Warning! It's Accuracy being passed, not 'resolution'! |
* Warning! It's Accuracy being passed, not 'resolution'! |
614 |
*/ |
*/ |
696 |
gmc->Uco = (gmc->Uco + gmc->dU[0] + gmc->dU[1])>>2; |
gmc->Uco = (gmc->Uco + gmc->dU[0] + gmc->dU[1])>>2; |
697 |
gmc->Vco = (gmc->Vco + gmc->dV[0] + gmc->dV[1])>>2; |
gmc->Vco = (gmc->Vco + gmc->dV[0] + gmc->dV[1])>>2; |
698 |
|
|
699 |
gmc->predict_16x16 = Predict_16x16_C; |
gmc->predict_16x16 = Predict_16x16_func; |
700 |
gmc->predict_8x8 = Predict_8x8_C; |
gmc->predict_8x8 = Predict_8x8_func; |
701 |
gmc->get_average_mv = get_average_mv_C; |
gmc->get_average_mv = get_average_mv_C; |
702 |
} |
} |
703 |
} |
} |
745 |
|
|
746 |
pMBs[mbnum].mcsel = 0; /* until mode decision */ |
pMBs[mbnum].mcsel = 0; /* until mode decision */ |
747 |
} |
} |
748 |
|
emms(); |
749 |
} |
} |