--- branches/release-1_3-branch/xvidcore/src/dct/fdct.c 2011/05/18 08:06:18 1983 +++ branches/release-1_3-branch/xvidcore/src/dct/fdct.c 2011/05/18 08:51:47 1984 @@ -1,288 +1,312 @@ -/***************************************************************************** - * - * XVID MPEG-4 VIDEO CODEC - * - Forward DCT - - * - * These routines are from Independent JPEG Group's free JPEG software - * Copyright (C) 1991-1998, Thomas G. Lane (see the file README.IJG) - * - * This program is free software ; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation ; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY ; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program ; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * $Id: fdct.c,v 1.7 2004-03-22 22:36:23 edgomez Exp $ - * - ****************************************************************************/ - -/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */ - -/* - * Disclaimer of Warranty - * - * These software programs are available to the user without any license fee or - * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims - * any and all warranties, whether express, implied, or statuary, including any - * implied warranties or merchantability or of fitness for a particular - * purpose. In no event shall the copyright-holder be liable for any - * incidental, punitive, or consequential damages of any kind whatsoever - * arising from the use of these programs. - * - * This disclaimer of warranty extends to the user of these programs and user's - * customers, employees, agents, transferees, successors, and assigns. - * - * The MPEG Software Simulation Group does not represent or warrant that the - * programs furnished hereunder are free of infringement of any third-party - * patents. - * - * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware, - * are subject to royalty fees to patent holders. Many of these patents are - * general enough such that they are unavoidable regardless of implementation - * design. - * - */ - -/* This routine is a slow-but-accurate integer implementation of the - * forward DCT (Discrete Cosine Transform). Taken from the IJG software - * - * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT - * on each column. Direct algorithms are also available, but they are - * much more complex and seem not to be any faster when reduced to code. - * - * This implementation is based on an algorithm described in - * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT - * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, - * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. - * The primary algorithm described there uses 11 multiplies and 29 adds. - * We use their alternate method with 12 multiplies and 32 adds. - * The advantage of this method is that no data path contains more than one - * multiplication; this allows a very simple and accurate implementation in - * scaled fixed-point arithmetic, with a minimal number of shifts. - * - * The poop on this scaling stuff is as follows: - * - * Each 1-D DCT step produces outputs which are a factor of sqrt(N) - * larger than the true DCT outputs. The final outputs are therefore - * a factor of N larger than desired; since N=8 this can be cured by - * a simple right shift at the end of the algorithm. The advantage of - * this arrangement is that we save two multiplications per 1-D DCT, - * because the y0 and y4 outputs need not be divided by sqrt(N). - * In the IJG code, this factor of 8 is removed by the quantization step - * (in jcdctmgr.c), here it is removed. - * - * We have to do addition and subtraction of the integer inputs, which - * is no problem, and multiplication by fractional constants, which is - * a problem to do in integer arithmetic. We multiply all the constants - * by CONST_SCALE and convert them to integer constants (thus retaining - * CONST_BITS bits of precision in the constants). After doing a - * multiplication we have to divide the product by CONST_SCALE, with proper - * rounding, to produce the correct output. This division can be done - * cheaply as a right shift of CONST_BITS bits. We postpone shifting - * as long as possible so that partial sums can be added together with - * full fractional precision. - * - * The outputs of the first pass are scaled up by PASS1_BITS bits so that - * they are represented to better-than-integral precision. These outputs - * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word - * with the recommended scaling. (For 12-bit sample data, the intermediate - * array is INT32 anyway.) - * - * To avoid overflow of the 32-bit intermediate results in pass 2, we must - * have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis - * shows that the values given below are the most effective. - * - * We can gain a little more speed, with a further compromise in accuracy, - * by omitting the addition in a descaling shift. This yields an incorrectly - * rounded result half the time... - */ - -#include "fdct.h" - -#define USE_ACCURATE_ROUNDING - -#define RIGHT_SHIFT(x, shft) ((x) >> (shft)) - -#ifdef USE_ACCURATE_ROUNDING -#define ONE ((int) 1) -#define DESCALE(x, n) RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n) -#else -#define DESCALE(x, n) RIGHT_SHIFT(x, n) -#endif - -#define CONST_BITS 13 -#define PASS1_BITS 2 - -#define FIX_0_298631336 ((int) 2446) /* FIX(0.298631336) */ -#define FIX_0_390180644 ((int) 3196) /* FIX(0.390180644) */ -#define FIX_0_541196100 ((int) 4433) /* FIX(0.541196100) */ -#define FIX_0_765366865 ((int) 6270) /* FIX(0.765366865) */ -#define FIX_0_899976223 ((int) 7373) /* FIX(0.899976223) */ -#define FIX_1_175875602 ((int) 9633) /* FIX(1.175875602) */ -#define FIX_1_501321110 ((int) 12299) /* FIX(1.501321110) */ -#define FIX_1_847759065 ((int) 15137) /* FIX(1.847759065) */ -#define FIX_1_961570560 ((int) 16069) /* FIX(1.961570560) */ -#define FIX_2_053119869 ((int) 16819) /* FIX(2.053119869) */ -#define FIX_2_562915447 ((int) 20995) /* FIX(2.562915447) */ -#define FIX_3_072711026 ((int) 25172) /* FIX(3.072711026) */ - -/* function pointer */ -fdctFuncPtr fdct; - -/* - * Perform an integer forward DCT on one block of samples. - */ - -void -fdct_int32(short *const block) -{ - int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int tmp10, tmp11, tmp12, tmp13; - int z1, z2, z3, z4, z5; - short *blkptr; - int *dataptr; - int data[64]; - int i; - - /* Pass 1: process rows. */ - /* Note results are scaled up by sqrt(8) compared to a true DCT; */ - /* furthermore, we scale the results by 2**PASS1_BITS. */ - - dataptr = data; - blkptr = block; - for (i = 0; i < 8; i++) { - tmp0 = blkptr[0] + blkptr[7]; - tmp7 = blkptr[0] - blkptr[7]; - tmp1 = blkptr[1] + blkptr[6]; - tmp6 = blkptr[1] - blkptr[6]; - tmp2 = blkptr[2] + blkptr[5]; - tmp5 = blkptr[2] - blkptr[5]; - tmp3 = blkptr[3] + blkptr[4]; - tmp4 = blkptr[3] - blkptr[4]; - - /* Even part per LL&M figure 1 --- note that published figure is faulty; - * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". - */ - - tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; - tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - - dataptr[0] = (tmp10 + tmp11) << PASS1_BITS; - dataptr[4] = (tmp10 - tmp11) << PASS1_BITS; - - z1 = (tmp12 + tmp13) * FIX_0_541196100; - dataptr[2] = - DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS); - dataptr[6] = - DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS); - - /* Odd part per figure 8 --- note paper omits factor of sqrt(2). - * cK represents cos(K*pi/16). - * i0..i3 in the paper are tmp4..tmp7 here. - */ - - z1 = tmp4 + tmp7; - z2 = tmp5 + tmp6; - z3 = tmp4 + tmp6; - z4 = tmp5 + tmp7; - z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ - - tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ - tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ - tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ - z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ - z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ - z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ - - z3 += z5; - z4 += z5; - - dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS); - dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS); - dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS); - dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS); - - dataptr += 8; /* advance pointer to next row */ - blkptr += 8; - } - - /* Pass 2: process columns. - * We remove the PASS1_BITS scaling, but leave the results scaled up - * by an overall factor of 8. - */ - - dataptr = data; - for (i = 0; i < 8; i++) { - tmp0 = dataptr[0] + dataptr[56]; - tmp7 = dataptr[0] - dataptr[56]; - tmp1 = dataptr[8] + dataptr[48]; - tmp6 = dataptr[8] - dataptr[48]; - tmp2 = dataptr[16] + dataptr[40]; - tmp5 = dataptr[16] - dataptr[40]; - tmp3 = dataptr[24] + dataptr[32]; - tmp4 = dataptr[24] - dataptr[32]; - - /* Even part per LL&M figure 1 --- note that published figure is faulty; - * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". - */ - - tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; - tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - - dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS); - dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS); - - z1 = (tmp12 + tmp13) * FIX_0_541196100; - dataptr[16] = - DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS); - dataptr[48] = - DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS); - - /* Odd part per figure 8 --- note paper omits factor of sqrt(2). - * cK represents cos(K*pi/16). - * i0..i3 in the paper are tmp4..tmp7 here. - */ - - z1 = tmp4 + tmp7; - z2 = tmp5 + tmp6; - z3 = tmp4 + tmp6; - z4 = tmp5 + tmp7; - z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ - - tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ - tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ - tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ - z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ - z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ - z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ - - z3 += z5; - z4 += z5; - - dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS); - dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS); - dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS); - dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS); - - dataptr++; /* advance pointer to next column */ - } - /* descale */ - for (i = 0; i < 64; i++) - block[i] = (short int) DESCALE(data[i], 3); -} +/***************************************************************************** + * + * XVID MPEG-4 VIDEO CODEC + * - Forward DCT - + * + * Copyright (C) 2006-2011 Xvid Solutions GmbH + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program ; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * $Id: fdct.c,v 1.7 2004-03-22 22:36:23 edgomez Exp $ + * + ****************************************************************************/ + +/* + * Authors: Skal + * + * "Fast and precise" LLM implementation of FDCT/IDCT, where + * rotations are decomposed using: + * tmp = (x+y).cos t + * x' = tmp + y.(sin t - cos t) + * y' = tmp - x.(sin t + cos t) + * + * See details at the end of this file... + * + * Reference (e.g.): + * Loeffler C., Ligtenberg A., and Moschytz C.S.: + * Practical Fast 1D DCT Algorithm with Eleven Multiplications, + * Proc. ICASSP 1989, 988-991. + * + * IEEE-1180-like error specs for FDCT: + * Peak error: 1.0000 + * Peak MSE: 0.0340 + * Overall MSE: 0.0200 + * Peak ME: 0.0191 + * Overall ME: -0.0033 + * + ********************************************************/ + +#include "fdct.h" + +/* function pointer */ +fdctFuncPtr fdct; + +/* +////////////////////////////////////////////////////////// +*/ + +#define BUTF(a, b, tmp) \ + (tmp) = (a)+(b); \ + (b) = (a)-(b); \ + (a) = (tmp) + +#define LOAD_BUTF(m1, m2, a, b, tmp, S) \ + (m1) = (S)[(a)] + (S)[(b)]; \ + (m2) = (S)[(a)] - (S)[(b)] + +#define ROTATE(m1,m2,c,k1,k2,tmp,Fix,Rnd) \ + (tmp) = ( (m1) + (m2) )*(c); \ + (m1) *= k1; \ + (m2) *= k2; \ + (tmp) += (Rnd); \ + (m1) = ((m1)+(tmp))>>(Fix); \ + (m2) = ((m2)+(tmp))>>(Fix); + +#define ROTATE2(m1,m2,c,k1,k2,tmp) \ + (tmp) = ( (m1) + (m2) )*(c); \ + (m1) *= k1; \ + (m2) *= k2; \ + (m1) = (m1)+(tmp); \ + (m2) = (m2)+(tmp); + +#define ROTATE0(m1,m2,c,k1,k2,tmp) \ + (m1) = ( (m2) )*(c); \ + (m2) = (m2)*k2+(m1); + +#define SHIFTL(x,n) ((x)<<(n)) +#define SHIFTR(x, n) ((x)>>(n)) +#define HALF(n) (1<<((n)-1)) + +#define IPASS 3 +#define FPASS 2 +#define FIX 16 + +#if 1 + +#define ROT6_C 35468 +#define ROT6_SmC 50159 +#define ROT6_SpC 121095 +#define ROT17_C 77062 +#define ROT17_SmC 25571 +#define ROT17_SpC 128553 +#define ROT37_C 58981 +#define ROT37_SmC 98391 +#define ROT37_SpC 19571 +#define ROT13_C 167963 +#define ROT13_SmC 134553 +#define ROT13_SpC 201373 + +#else + +#include +#define FX(x) ( (int)floor((x)*(1<0; --i) + { + int mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, Spill; + + // even + + LOAD_BUTF(mm1,mm6, 1, 6, mm0, pIn); + LOAD_BUTF(mm2,mm5, 2, 5, mm0, pIn); + LOAD_BUTF(mm3,mm4, 3, 4, mm0, pIn); + LOAD_BUTF(mm0,mm7, 0, 7, Spill, pIn); + + BUTF(mm1, mm2, Spill); + BUTF(mm0, mm3, Spill); + + ROTATE(mm3, mm2, ROT6_C, ROT6_SmC, -ROT6_SpC, Spill, FIX-FPASS, HALF(FIX-FPASS)); + pIn[2] = mm3; + pIn[6] = mm2; + + BUTF(mm0, mm1, Spill); + pIn[0] = SHIFTL(mm0, FPASS); + pIn[4] = SHIFTL(mm1, FPASS); + + + // odd + + mm3 = mm5 + mm7; + mm2 = mm4 + mm6; + ROTATE(mm2, mm3, ROT17_C, -ROT17_SpC, -ROT17_SmC, mm0, FIX-FPASS, HALF(FIX-FPASS)); + ROTATE(mm4, mm7, -ROT37_C, ROT37_SpC, ROT37_SmC, mm0, FIX-FPASS, HALF(FIX-FPASS)); + mm7 += mm3; + mm4 += mm2; + pIn[1] = mm7; + pIn[7] = mm4; + + ROTATE(mm5, mm6, -ROT13_C, ROT13_SmC, ROT13_SpC, mm0, FIX-FPASS, HALF(FIX-FPASS)); + mm5 += mm3; + mm6 += mm2; + pIn[3] = mm6; + pIn[5] = mm5; + + pIn += 8; + } + + pIn = In; + for(i=8; i>0; --i) + { + int mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, Spill; + + // even + + LOAD_BUTF(mm1,mm6, 1*8, 6*8, mm0, pIn); + LOAD_BUTF(mm2,mm5, 2*8, 5*8, mm0, pIn); + BUTF(mm1, mm2, mm0); + + LOAD_BUTF(mm3,mm4, 3*8, 4*8, mm0, pIn); + LOAD_BUTF(mm0,mm7, 0*8, 7*8, Spill, pIn); + BUTF(mm0, mm3, Spill); + + ROTATE(mm3, mm2, ROT6_C, ROT6_SmC, -ROT6_SpC, Spill, 0, HALF(FIX+FPASS+3)); + pIn[2*8] = (int16_t)SHIFTR(mm3,FIX+FPASS+3); + pIn[6*8] = (int16_t)SHIFTR(mm2,FIX+FPASS+3); + + mm0 += HALF(FPASS+3) - 1; + BUTF(mm0, mm1, Spill); + pIn[0*8] = (int16_t)SHIFTR(mm0, FPASS+3); + pIn[4*8] = (int16_t)SHIFTR(mm1, FPASS+3); + + // odd + + mm3 = mm5 + mm7; + mm2 = mm4 + mm6; + + ROTATE(mm2, mm3, ROT17_C, -ROT17_SpC, -ROT17_SmC, mm0, 0, HALF(FIX+FPASS+3)); + ROTATE2(mm4, mm7, -ROT37_C, ROT37_SpC, ROT37_SmC, mm0); + mm7 += mm3; + mm4 += mm2; + pIn[7*8] = (int16_t)SHIFTR(mm4,FIX+FPASS+3); + pIn[1*8] = (int16_t)SHIFTR(mm7,FIX+FPASS+3); + + ROTATE2(mm5, mm6, -ROT13_C, ROT13_SmC, ROT13_SpC, mm0); + mm5 += mm3; + mm6 += mm2; + pIn[5*8] = (int16_t)SHIFTR(mm5,FIX+FPASS+3); + pIn[3*8] = (int16_t)SHIFTR(mm6,FIX+FPASS+3); + + pIn++; + } +} +#undef FIX +#undef FPASS +#undef IPASS + +#undef BUTF +#undef LOAD_BUTF +#undef ROTATE +#undef ROTATE2 +#undef SHIFTL +#undef SHIFTR + +////////////////////////////////////////////////////////// +// - Data flow schematics for FDCT - +// Output is scaled by 2.sqrt(2) +// Initial butterflies (in0/in7, etc.) are not fully depicted. +// Note: Rot6 coeffs are multiplied by sqrt(2). +////////////////////////////////////////////////////////// +/* + <---------Stage1 =even part=-----------> + + in3 mm3 +_____.___-___________.____* out6 + x \ / | + in4 mm4 \ | + / \ | + in0 mm0 +_____o___+__.___-___ | ___* out4 + x \ / | + in7 mm7 \ (Rot6) + / \ | + in1 mm1 +_____o___+__o___+___ | ___* out0 + x \ / | + in6 mm6 / | + / \ | + in2 mm2 +_____.___-___________o____* out2 + x + in5 mm5 + + <---------Stage2 =odd part=----------------> + + mm7*___._________.___-___[xSqrt2]___* out3 + | \ / + (Rot3) \ + | / \ + mm5*__ | ___o____o___+___.___-______* out7 + | | \ / + | (Rot1) \ + | | / \ + mm6*__ |____.____o___+___o___+______* out1 + | \ / + | / + | / \ + mm4*___o_________.___-___[xSqrt2]___* out5 + + + + Alternative schematics for stage 2: + ----------------------------------- + + mm7 *___[xSqrt2]____o___+____o_______* out1 + \ / | + / (Rot1) + / \ | + mm6 *____o___+______.___-___ | __.___* out5 + \ / | | + / | | + / \ | | + mm5 *____.___-______.___-____.__ | __* out7 + \ / | + \ (Rot3) + / \ | + mm4 *___[xSqrt2]____o___+________o___* out3 + +*/