--- branches/dev-api-4/xvidcore/src/utils/mbtransquant.c 2003/05/09 22:03:13 1011 +++ branches/dev-api-4/xvidcore/src/utils/mbtransquant.c 2003/05/11 13:26:14 1012 @@ -21,12 +21,13 @@ * along with this program ; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * $Id: mbtransquant.c,v 1.21.2.10 2003-05-09 22:03:13 chl Exp $ + * $Id: mbtransquant.c,v 1.21.2.11 2003-05-11 13:26:14 chl Exp $ * ****************************************************************************/ -#include +#include #include +#include #include "../portab.h" #include "mbfunctions.h" @@ -612,8 +613,9 @@ ************************************************************************/ -static int -dct_quantize_trellis_inter_mpeg_c (int16_t *qcoeff, const int16_t *data, int quant) +static int +dct_quantize_trellis_mpeg_c(int16_t *const Out, const int16_t *const In, int Q, + const uint16_t * const Zigzag, int Non_Zero) { return 63; } @@ -753,46 +755,271 @@ } ////////////////////////////////////////////////////////// +// this routine has been strippen of all debug code +////////////////////////////////////////////////////////// + +static int +dct_quantize_trellis_h263_c(int16_t *const Out, const int16_t *const In, int Q, const uint16_t * const Zigzag, int Non_Zero) +{ + + // Note: We should search last non-zero coeffs on *real* DCT input coeffs (In[]), + // not quantized one (Out[]). However, it only improves the result *very* + // slightly (~0.01dB), whereas speed drops to crawling level :) + // Well, actually, taking 1 more coeff past Non_Zero into account sometimes helps, + + typedef struct { int16_t Run, Level; } NODE; + + NODE Nodes[65], Last; + uint32_t Run_Costs0[64+1]; + uint32_t * const Run_Costs = Run_Costs0 + 1; + const int Mult = 2*Q; + const int Bias = (Q-1) | 1; + const int Lev0 = Mult + Bias; + const int Lambda = Trellis_Lambda_Tabs[Q-1]; // it's 1/lambda, actually + + int Run_Start = -1; + Run_Costs[-1] = 2<<16; // source (w/ CBP penalty) + uint32_t Min_Cost = 2<<16; + + int Last_Node = -1; + uint32_t Last_Cost = 0; + + int i, j; + + Non_Zero = Find_Last(Out, Zigzag, Non_Zero); + if (Non_Zero<0) + return -1; + + for(i=0; i<=Non_Zero; i++) + { + const int AC = In[Zigzag[i]]; + const int Level1 = Out[Zigzag[i]]; + const int Dist0 = Lambda* AC*AC; + uint32_t Best_Cost = 0xf0000000; + Last_Cost += Dist0; + + if ((uint32_t)(Level1+1)<3) // very specialized loop for -1,0,+1 + { + int dQ; + int Run; + uint32_t Cost0; + + if (AC<0) { + Nodes[i].Level = -1; + dQ = Lev0 + AC; + } else { + Nodes[i].Level = 1; + dQ = Lev0 - AC; + } + Cost0 = Lambda*dQ*dQ; + + Nodes[i].Run = 1; + Best_Cost = (Code_Len20[0]<<16) + Run_Costs[i-1]+Cost0; + for(Run=i-Run_Start; Run>0; --Run) + { + const uint32_t Cost_Base = Cost0 + Run_Costs[i-Run]; + const uint32_t Cost = Cost_Base + (Code_Len20[Run-1]<<16); + const uint32_t lCost = Cost_Base + (Code_Len24[Run-1]<<16); + + // TODO: what about tie-breaks? Should we favor short runs or + // long runs? Although the error is the same, it would not be + // spread the same way along high and low frequencies... + + // (I'd say: favour short runs => hifreq errors (HVS) -- gruel ) + + if (Cost1) { + dQ1 = Level1*Mult-AC + Bias; + dQ2 = dQ1 - Mult; + Level2 = Level1-1; + Tbl_L1 = (Level1<=24) ? B16_17_Code_Len[Level1-1] : Code_Len0; + Tbl_L2 = (Level2<=24) ? B16_17_Code_Len[Level2-1] : Code_Len0; + Tbl_L1_Last = (Level1<=6) ? B16_17_Code_Len_Last[Level1-1] : Code_Len0; + Tbl_L2_Last = (Level2<=6) ? B16_17_Code_Len_Last[Level2-1] : Code_Len0; + } else { // Level1<-1 + dQ1 = Level1*Mult-AC - Bias; + dQ2 = dQ1 + Mult; + Level2 = Level1 + 1; + Tbl_L1 = (Level1>=-24) ? B16_17_Code_Len[Level1^-1] : Code_Len0; + Tbl_L2 = (Level2>=-24) ? B16_17_Code_Len[Level2^-1] : Code_Len0; + Tbl_L1_Last = (Level1>=- 6) ? B16_17_Code_Len_Last[Level1^-1] : Code_Len0; + Tbl_L2_Last = (Level2>=- 6) ? B16_17_Code_Len_Last[Level2^-1] : Code_Len0; + } + Dist1 = Lambda*dQ1*dQ1; + Dist2 = Lambda*dQ2*dQ2; + dDist21 = Dist2-Dist1; + + for(Run=i-Run_Start; Run>0; --Run) + { + const uint32_t Cost_Base = Dist1 + Run_Costs[i-Run]; + uint32_t Cost1, Cost2; + int bLevel; + +// for sub-optimal (but slightly worth it, speed-wise) search, uncomment the following: +// if (Cost_Base>=Best_Cost) continue; +// (? doesn't seem to have any effect -- gruel ) + + Cost1 = Cost_Base + (Tbl_L1[Run-1]<<16); + Cost2 = Cost_Base + (Tbl_L2[Run-1]<<16) + dDist21; + + if (Cost2Min_Cost+(1<<16) ) + Run_Start++; + + // spread on preceding coeffs the cost incurred by skipping this one + for(j=Run_Start; j=0) { + Out[Zigzag[i]] = Nodes[i].Level; + i -= Nodes[i].Run; + } + return Last_Node; +} + + + + + + + + + + + +////////////////////////////////////////////////////////// +// original version including heavy debugging info +////////////////////////////////////////////////////////// + + +#ifdef DBGTRELL #define DBG 0 -static uint32_t Evaluate_Cost(const int16_t *C, int Mult, int Bias, +static inline uint32_t Evaluate_Cost(const int16_t *C, int Mult, int Bias, const uint16_t * Zigzag, int Max, int Lambda) { #if (DBG>0) const int16_t * const Ref = C + 6*64; int Last = Max; - while(Last>=0 && C[Zigzag[Last]]==0) Last--; int Bits = 0; + int Dist = 0; + int i; + uint32_t Cost; + + while(Last>=0 && C[Zigzag[Last]]==0) + Last--; + if (Last>=0) { - Bits = 2; // CBP int j=0, j0=0; int Run, Level; + + Bits = 2; // CBP while(j=-24 && Level<=24) Bits += B16_17_Code_Len[(Level<0) ? -Level-1 : Level-1][Run]; - else Bits += 30; + if (Level>=-24 && Level<=24) + Bits += B16_17_Code_Len[(Level<0) ? -Level-1 : Level-1][Run]; + else + Bits += 30; } Level = C[Zigzag[Last]]; Run = j - j0; - if (Level>=-6 && Level<=6) Bits += B16_17_Code_Len_Last[(Level<0) ? -Level-1 : Level-1][Run]; - else Bits += 30; + if (Level>=-6 && Level<=6) + Bits += B16_17_Code_Len_Last[(Level<0) ? -Level-1 : Level-1][Run]; + else + Bits += 30; } - int Dist = 0; - int i; for(i=0; i<=Last; ++i) { int V = C[Zigzag[i]]*Mult; - if (V>0) V += Bias; - else if (V<0) V -= Bias; + if (V>0) + V += Bias; + else + if (V<0) + V -= Bias; V -= Ref[Zigzag[i]]; Dist += V*V; } - uint32_t Cost = Lambda*Dist + (Bits<<16); + Cost = Lambda*Dist + (Bits<<16); if (DBG==1) printf( " Last:%2d/%2d Cost = [(Bits=%5.0d) + Lambda*(Dist=%6.0d) = %d ] >>12= %d ", Last,Max, Bits, Dist, Cost, Cost>>12 ); return Cost; @@ -815,8 +1042,8 @@ typedef struct { int16_t Run, Level; } NODE; NODE Nodes[65], Last; - uint32_t Run_Costs0[64+1], * const Run_Costs = Run_Costs0 + 1; - + uint32_t Run_Costs0[64+1]; + uint32_t * const Run_Costs = Run_Costs0 + 1; const int Mult = 2*Q; const int Bias = (Q-1) | 1; const int Lev0 = Mult + Bias; @@ -829,12 +1056,12 @@ int Last_Node = -1; uint32_t Last_Cost = 0; + int i, j; + #if (DBG>0) Last.Level = 0; Last.Run = -1; // just initialize to smthg #endif - int i, j; - Non_Zero = Find_Last(Out, Zigzag, Non_Zero); if (Non_Zero<0) return -1; @@ -849,8 +1076,9 @@ if ((uint32_t)(Level1+1)<3) // very specialized loop for -1,0,+1 { - int dQ; - int Run; + int dQ; + int Run; + uint32_t Cost0; if (AC<0) { Nodes[i].Level = -1; @@ -859,32 +1087,32 @@ Nodes[i].Level = 1; dQ = Lev0 - AC; } - const uint32_t Cost0 = Lambda*dQ*dQ; - + Cost0 = Lambda*dQ*dQ; + Nodes[i].Run = 1; Best_Cost = (Code_Len20[0]<<16) + Run_Costs[i-1]+Cost0; for(Run=i-Run_Start; Run>0; --Run) { const uint32_t Cost_Base = Cost0 + Run_Costs[i-Run]; const uint32_t Cost = Cost_Base + (Code_Len20[Run-1]<<16); + const uint32_t lCost = Cost_Base + (Code_Len24[Run-1]<<16); + // TODO: what about tie-breaks? Should we favor short runs or // long runs? Although the error is the same, it would not be // spread the same way along high and low frequencies... - if (Cost1) { dQ1 = Level1*Mult-AC + Bias; dQ2 = dQ1 - Mult; @@ -916,8 +1146,7 @@ Tbl_L2 = (Level2<=24) ? B16_17_Code_Len[Level2-1] : Code_Len0; Tbl_L1_Last = (Level1<=6) ? B16_17_Code_Len_Last[Level1-1] : Code_Len0; Tbl_L2_Last = (Level2<=6) ? B16_17_Code_Len_Last[Level2-1] : Code_Len0; - } - else { // Level1<-1 + } else { // Level1<-1 dQ1 = Level1*Mult-AC - Bias; dQ2 = dQ1 + Mult; Level2 = Level1 + 1; @@ -926,28 +1155,29 @@ Tbl_L1_Last = (Level1>=- 6) ? B16_17_Code_Len_Last[Level1^-1] : Code_Len0; Tbl_L2_Last = (Level2>=- 6) ? B16_17_Code_Len_Last[Level2^-1] : Code_Len0; } - const uint32_t Dist1 = Lambda*dQ1*dQ1; - const uint32_t Dist2 = Lambda*dQ2*dQ2; - const int dDist21 = Dist2-Dist1; + Dist1 = Lambda*dQ1*dQ1; + Dist2 = Lambda*dQ2*dQ2; + dDist21 = Dist2-Dist1; for(Run=i-Run_Start; Run>0; --Run) { const uint32_t Cost_Base = Dist1 + Run_Costs[i-Run]; + uint32_t Cost1, Cost2; + int bLevel; // for sub-optimal (but slightly worth it, speed-wise) search, uncomment the following: // if (Cost_Base>=Best_Cost) continue; - uint32_t Cost1, Cost2; - int bLevel; - Cost1 = Cost_Base + (Tbl_L1[Run-1]<<16); Cost2 = Cost_Base + (Tbl_L2[Run-1]<<16) + dDist21; - if (Cost2Min_Cost+(1<<16) ) Run_Start++; @@ -1016,7 +1250,7 @@ return -1; // reconstruct optimal sequence backward with surviving paths - bzero(Out, 64*sizeof(*Out)); + memset(Out, 0x00, 64*sizeof(*Out)); Out[Zigzag[Last_Node]] = Last.Level; i = Last_Node - Last.Run; while(i>=0) { @@ -1037,3 +1271,5 @@ } #undef DBG + +#endif