/* * jrevdct.c * * Copyright (C) 1991, 1992, Thomas G. Lane. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * */ /* This is an experimental modification of this file. * The MPEG-Player is used as a real environment testcontainer to examine the * performance of the DCT algorithms. * It turned out, (in other words - it could be prooved at least on an intel- * based machine), that a higher decoding speed (and finally an up to 20 % * higher frame rate) can be achieved when following the proposals of Arai, * Agiu and Nakajima - and the extensions made by Feig. * * Please find a detailed description of the implementation in * E.Feig,E.Linzer,Discrete Cosine Transform Algorithms for Image Data * Compression, Proceedings Electronic Imaging '90 Eas, pp.84-7, * Boston, MA(Oct.29th-Nov.1rst,1990) or see * W.B:PenneBaker,J.L.Mitchell, JPEG Still Image Compression Standard, * VAN Nostrand Reinhold 1993 * * The Java-Source and more documentations of the MPEG_player is available * on-line at: * http://rnvs.informatik.tu-chemnitz.de/~ja/MPEG/MPEG_Play.html * * The previous documentation was left intact since there are some analogies concerning the * used fixpoint arithmetic. * Have fun. * */ /* * This file contains the basic inverse-DCT transformation subroutine. * * This implementation is based on an algorithm described in * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. * The primary algorithm described there uses 11 multiplies and 29 adds. * We use their alternate method with 12 multiplies and 32 adds. * The advantage of this method is that no data path contains more than one * multiplication; this allows a very simple and accurate implementation in * scaled fixed-point arithmetic, with a minimal number of shifts. * * I've made lots of modifications to attempt to take advantage of the * sparse nature of the DCT matrices we're getting. Although the logic * is cumbersome, it's straightforward and the resulting code is much * faster. * * A better way to do this would be to pass in the DCT block as a sparse * matrix, perhaps with the difference cases encoded. */ #include #include #include "video.h" #include "proto.h" #include "jrevdct.h" #define GLOBAL /* a function referenced thru EXTERNs */ /* We assume that right shift corresponds to signed division by 2 with * rounding towards minus infinity. This is correct for typical "arithmetic * shift" instructions that shift in copies of the sign bit. But some * C compilers implement >> with an unsigned shift. For these machines you * must define RIGHT_SHIFT_IS_UNSIGNED. * RIGHT_SHIFT provides a proper signed right shift of an INT32 quantity. * It is only applied with constant shift counts. SHIFT_TEMPS must be * included in the variables of any routine using RIGHT_SHIFT. */ #ifdef RIGHT_SHIFT_IS_UNSIGNED #define SHIFT_TEMPS INT32 shift_temp; #define RIGHT_SHIFT(x,shft) \ ((shift_temp = (x)) < 0 ? \ (shift_temp >> (shft)) | ((~((INT32) 0)) << (32-(shft))) : \ (shift_temp >> (shft))) #else #define SHIFT_TEMPS #define RIGHT_SHIFT(x,shft) ((x) >> (shft)) #endif /* * This routine is specialized to the case DCTSIZE = 8. */ #if DCTSIZE != 8 Sorry, this code only copes with 8 x8 DCTs. /* deliberate syntax err */ #endif /* * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT * on each column. Direct algorithms are also available, but they are * much more complex and seem not to be any faster when reduced to code. * * The poop on this scaling stuff is as follows: * * Each 1-D IDCT step produces outputs which are a factor of sqrt(N) * larger than the true IDCT outputs. The final outputs are therefore * a factor of N larger than desired; since N=8 this can be cured by * a simple right shift at the end of the algorithm. The advantage of * this arrangement is that we save two multiplications per 1-D IDCT, * because the y0 and y4 inputs need not be divided by sqrt(N). * * We have to do addition and subtraction of the integer inputs, which * is no problem, and multiplication by fractional constants, which is * a problem to do in integer arithmetic. We multiply all the constants * by CONST_SCALE and convert them to integer constants (thus retaining * CONST_BITS bits of precision in the constants). After doing a * multiplication we have to divide the product by CONST_SCALE, with proper * rounding, to produce the correct output. This division can be done * cheaply as a right shift of CONST_BITS bits. We postpone shifting * as long as possible so that partial sums can be added together with * full fractional precision. * * The outputs of the first pass are scaled up by PASS1_BITS bits so that * they are represented to better-than-integral precision. These outputs * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word * with the recommended scaling. (To scale up 12-bit sample data further, an * intermediate INT32 array would be needed.) * * To avoid overflow of the 32-bit intermediate results in pass 2, we must * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis * shows that the values given below are the most effective. */ #ifdef EIGHT_BIT_SAMPLES #define PASS1_BITS 2 #else #define PASS1_BITS 1 /* lose a little precision to avoid overflow */ #endif #define ONE ((INT32) 1) #define CONST_SCALE (ONE << CONST_BITS) /* Convert a positive real constant to an integer scaled by CONST_SCALE. * IMPORTANT: if your compiler doesn't do this arithmetic at compile time, * you will pay a significant penalty in run time. In that case, figure * the correct integer constant values and insert them by hand. */ #define FIX(x) ((INT32) ((x) * CONST_SCALE + 0.5)) /* When adding two opposite-signed fixes, the 0.5 cancels */ #define FIX2(x) ((INT32) ((x) * CONST_SCALE)) /* Descale and correctly round an INT32 value that's scaled by N bits. * We assume RIGHT_SHIFT rounds towards minus infinity, so adding * the fudge factor is correct for either sign of X. */ #define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n) /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result. * For 8-bit samples with the recommended scaling, all the variable * and constant values involved are no more than 16 bits wide, so a * 16x16->32 bit multiply can be used instead of a full 32x32 multiply; * this provides a useful speedup on many machines. * There is no way to specify a 16x16->32 multiply in portable C, but * some C compilers will do the right thing if you provide the correct * combination of casts. * NB: for 12-bit samples, a full 32-bit multiplication will be needed. */ #ifdef EIGHT_BIT_SAMPLES #ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ #define MULTIPLY(var,const) (((INT16) (var)) * ((INT16) (const))) #endif #ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */ #define MULTIPLY(var,const) (((INT16) (var)) * ((INT32) (const))) #endif #endif #ifndef MULTIPLY /* default definition */ #define MULTIPLY(var,const) ((var) * (const)) #endif #ifndef NO_SPARSE_DCT #define SPARSE_SCALE_FACTOR 8 #endif /* Precomputed idct value arrays. */ static DCTELEM PreIDCT[64][64]; /* Arrays temporarily used in j_rev_dct() */ int matr1[64]; int matr2[64]; /* *-------------------------------------------------------------- * * init_pre_idct -- * * Pre-computes singleton coefficient IDCT values. * * Results: * None. * * Side effects: * None. * *-------------------------------------------------------------- */ void init_pre_idct() { int i,j,k,l; /* printf("Init_pre_idct called !\n"); */ for (i=0; i<64; i++) { memset((char *) PreIDCT[i], 0L, 64*sizeof(DCTELEM)); PreIDCT[i][i] = 1 << 11; j_rev_dct(PreIDCT[i]); for (k = 0; k < 8; k++) { for (l = 0; l < 8; l++) { PreIDCT[i][k * 8 + l] = (int) (PreIDCT[i][k * 8 + l] * cos(PI * k / 16.0) * cos(PI * l / 16.0)); } } /* dct_dump(PreIDCT[i]); */ /* printf("\n"); */ } } /* *-------------------------------------------------------------- * * j_rev_dct_sparse -- * * Performs the original inverse DCT on one block of * coefficients. * * Results: * None. * * Side effects: * None. * *-------------------------------------------------------------- */ void j_rev_dct_sparse(data, pos) DCTBLOCK data; int pos; { int val, co; int p,i; int *tmpptr; // DC value if (pos == 0) { val = (data[0] >> VAL_BITS); p=0; data[0] = data[1] = data[2] = data[3] = data[4] = data[5] = data[6] = data[7] = data[8] = data[9] = data[10] = data[11] = data[12] = data[13] = data[14] = data[15] = data[16] = data[17] = data[18] = data[19] = data[20] = data[21] = data[22] = data[23] = data[24] = data[25] = data[26] = data[27] = data[28] = data[29] = data[30] = data[31] = data[32] = data[33] = data[34] = data[35] = data[36] = data[37] = data[38] = data[39] = data[40] = data[41] = data[42] = data[43] = data[44] = data[45] = data[46] = data[47] = data[48] = data[49] = data[50] = data[51] = data[52] = data[53] = data[54] = data[55] = data[56] = data[57] = data[58] = data[59] = data[60] = data[61] = data[62] = data[63] = val; return; } /* AC values: */ /* perform the IDFT using the lookup table */ co = data[pos]; tmpptr = PreIDCT[pos]; for (p = 0; p<64; p++) { data[p] = (tmpptr[p] * co) >> (VAL_BITS-2); } /* j_rev_dct(data); */ } /* *-------------------------------------------------------------- * * j_rev_dct * * The inverse DCT function. No tricks, no major optimizations. * * Results: * None. * * Side effects: * None. * *-------------------------------------------------------------- */ void j_rev_dct(DCTBLOCK coeff) { int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int co1, co2, co3, co5, co6, co7, co35, co17; int n0, n1, n2, n3; int m0,m1, m2, m3, m4, m5, m6, m7, m8, tmp; int l0 = 0, l1 = 0, l2 = 0, l3 = 0; int g0, g1, g2, g3; int i, j, p; short k; register int xxx; /* compute B1 (horizontal / vertical algoritm): */ /* (the vertical part is in tensor product) */ /* coeff[0]<<=8; *//* changed, see parseblock.c */ for (k = 0; k < 64; k += 8) { matr1[k] = coeff[k + 0]; matr1[k + 1] = coeff[k + 4]; matr1[k + 2] = matr1[k + 3] = coeff[k + 2]; matr1[k + 2] -= coeff[k + 6]; matr1[k + 3] += coeff[k + 6]; matr1[k + 4] = xxx = coeff[k + 5]; matr1[k + 4] -= coeff[k + 3]; xxx += coeff[k + 3]; matr1[k + 5] = matr1[k + 6] = coeff[k + 1]; matr1[k + 7] = (matr1[k + 5] += coeff[k + 7]); matr1[k + 6] -= coeff[k + 7]; matr1[k + 7] += xxx; matr1[k + 5] -= xxx; } p = 0; /* line 0, M x M */ tmp4 = (co3 = matr1[24]) - (co5 = matr1[40]); tmp6 = (co1 = matr1[8]) - (co7 = matr1[56]); tmp = C6 * (tmp6 - tmp4); matr2[p++] = matr1[0] << CONST_BITS_JA; matr2[p++] = matr1[32] << CONST_BITS_JA; matr2[p++] = ((co2 = matr1[16]) - (co6 = matr1[48])) * C4; matr2[p++] = (co2 + co6) << CONST_BITS_JA; matr2[p++] = Q * tmp4 - tmp; matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) * C4; matr2[p++] = R * tmp6 - tmp; matr2[p++] = (co17 + co35) << CONST_BITS_JA; /* line 1, M x M */ tmp4 = (co3 = matr1[25]) - (co5 = matr1[41]); tmp6 = (co1 = matr1[9]) - (co7 = matr1[57]); tmp = C6 * (tmp6 - tmp4); matr2[p++] = matr1[1] << CONST_BITS_JA; matr2[p++] = matr1[33] << CONST_BITS_JA; matr2[p++] = ((co2 = matr1[17]) - (co6 = matr1[49])) * C4; matr2[p++] = (co2 + co6) << CONST_BITS_JA; matr2[p++] = Q * tmp4 - tmp; matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) * C4; matr2[p++] = R * tmp6 - tmp; matr2[p++] = (co17 + co35) << CONST_BITS_JA; /* line 2, M x M */ tmp4 = (co3 = matr1[26]) - (co5 = matr1[42]); tmp6 = (co1 = matr1[10]) - (co7 = matr1[58]); tmp = Q * (tmp6 - tmp4); matr2[p++] = C4 * matr1[2]; matr2[p++] = C4 * matr1[34]; matr2[p++] = ((co2 = matr1[18]) - (co6 = matr1[50])) << TWO; matr2[p++] = C4 * (co2 + co6); matr2[p++] = C4Q * tmp4 - tmp; matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) << TWO; matr2[p++] = C4R * tmp6 - tmp; matr2[p++] = C4 * (co17 + co35); /* line 3, M x M */ tmp4 = (co3 = matr1[27]) - (co5 = matr1[43]); tmp6 = (co1 = matr1[11]) - (co7 = matr1[59]); tmp = C6 * (tmp6 - tmp4); matr2[p++] = matr1[3] << CONST_BITS_JA; matr2[p++] = matr1[35] << CONST_BITS_JA; matr2[p++] = ((co2 = matr1[19]) - (co6 = matr1[51])) * C4; matr2[p++] = (co2 + co6) << CONST_BITS_JA; matr2[p++] = Q * tmp4 - tmp; matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) * C4; matr2[p++] = R * tmp6 - tmp; matr2[p++] = (co17 + co35) << CONST_BITS_JA; /* line 4, M x M */ matr2[p++] = matr1[4]; matr2[p++] = matr1[36]; matr2[p++] = (co2 = matr1[20]) - (co6 = matr1[52]); matr2[p] = co2 + co6; l0 = l2 = -(co3 = matr1[28]) + (co5 = matr1[44]); p += 2; matr2[p] = (co17 = (co1 = matr1[12]) + (co7 = matr1[60])) - (co35 = co3 + co5); l3 = -(l1 = co1 - co7); p += 2; matr2[p++] = co17 + co35; /* line 5, M x M */ tmp4 = (co3 = matr1[29]) - (co5 = matr1[45]); tmp6 = (co1 = matr1[13]) - (co7 = matr1[61]); tmp = Q * (tmp6 - tmp4); matr2[p++] = C4 * matr1[5]; matr2[p++] = C4 * matr1[37]; matr2[p++] = ((co2 = matr1[16 + 5]) - (co6 = matr1[48 + 5])) << TWO; matr2[p++] = C4 * (co2 + co6); matr2[p++] = C4Q * tmp4 - tmp; matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) << TWO; matr2[p++] = C4R * tmp6 - tmp; matr2[p++] = C4 * (co17 + co35); /* line 6, M x M */ matr2[p++] = matr1[6]; matr2[p++] = matr1[38]; matr2[p++] = (co2 = matr1[22]) - (co6 = matr1[54]); matr2[p] = co2 + co6; l1 += (tmp4 = -(co3 = matr1[30]) + (co5 = matr1[46])); l3 += tmp4; p += 2; matr2[p] = (co17 = (co1 = matr1[14]) + (co7 = matr1[62])) - (co35 = co3 + co5); l2 += (tmp6 = co1 - co7); l0 -= tmp6; p += 2; matr2[p++] = co17 + co35; /* line 7, M x M */ tmp4 = (co3 = matr1[24 + 7]) - (co5 = matr1[40 + 7]); tmp6 = (co1 = matr1[8 + 7]) - (co7 = matr1[56 + 7]); tmp = C6 * (tmp6 - tmp4); matr2[p++] = matr1[7] << CONST_BITS_JA; matr2[p++] = matr1[32 + 7] << CONST_BITS_JA; matr2[p++] = ((co2 = matr1[16 + 7]) - (co6 = matr1[48 + 7])) * C4; matr2[p++] = (co2 + co6) << CONST_BITS_JA; matr2[p++] = Q * tmp4 - tmp; matr2[p++] = ((co17 = co1 + co7) - (co35 = co3 + co5)) * C4; matr2[p++] = R * tmp6 - tmp; matr2[p++] = (co17 + co35) << CONST_BITS_JA; /* completing line 4 and 6, O = J(NxM) */ g0 = C4 * (l0 + l1); g1 = C4 * (l0 - l1); g2 = l2 << TWO; g3 = l3 << TWO; matr2[36] = g0 + g2; matr2[38] = g1 + g3; matr2[52] = g1 - g3; matr2[54] = g2 - g0; tmp = C6 * (matr2[32] + matr2[48]); matr2[32] = -Q * matr2[32] - tmp; matr2[48] = R * matr2[48] - tmp; tmp = C6 * (matr2[33] + matr2[49]); matr2[33] = -Q * matr2[33] - tmp; matr2[49] = R * matr2[49] - tmp; tmp = Q * (matr2[34] + matr2[50]); matr2[34] = -C4Q * matr2[34] - tmp; matr2[50] = C4R * matr2[50] - tmp; tmp = C6 * (matr2[35] + matr2[51]); matr2[35] = -Q * matr2[35] - tmp; matr2[51] = R * matr2[51] - tmp; tmp = Q * (matr2[37] + matr2[53]); matr2[37] = -C4Q * matr2[37] - tmp; matr2[53] = C4R * matr2[53] - tmp; tmp = C6 * (matr2[39] + matr2[55]); matr2[39] = -Q * matr2[39] - tmp; matr2[55] = R * matr2[55] - tmp; for (p = 0; p < 64; p += 8) { matr1[p] = (tmp4 = (n3 = matr2[p] + matr2[p + 1]) + matr2[p + 3]) + matr2[p + 7]; matr1[p + 3] = (tmp6 = n3 - matr2[p + 3]) - (tmp7 = matr2[p + 4] - (tmp1 = (tmp2 = matr2[p + 6] - matr2[p + 7]) - matr2[p + 5])); matr1[p + 4] = tmp6 + tmp7; matr1[p + 1] = (tmp3 = (n1 = matr2[p] - matr2[p + 1]) + (n2 = matr2[p + 2] - matr2[p + 3])) + tmp2; matr1[p + 5] = (n1 - n2) + tmp1; // no tmp because of the caching of (n1 -n2) matr1[p + 2] = (n1 - n2) - tmp1; matr1[p + 6] = tmp3 - tmp2; matr1[p + 7] = tmp4 - matr2[p + 7]; } for (p = i = 0; p < 64; p += 8, i++) { coeff[p] = ((tmp4 = (n3 = matr1[i] + matr1[8 + i]) + matr1[24 + i]) + matr1[56 + i]) >> ALLBITS; coeff[p + 3] = ((tmp6 = n3 - matr1[24 + i]) - (tmp7 = matr1[32 + i] - (tmp1 = (tmp2 = matr1[48 + i] - matr1[56 + i]) - matr1[40 + i]))) >> ALLBITS; coeff[p + 4] = (tmp6 + tmp7) >> ALLBITS; coeff[p + 1] = ((tmp3 = (n1 = matr1[i] - matr1[8 + i]) + (n2 = matr1[16 + i] - matr1[24 + i])) + tmp2) >> ALLBITS; coeff[p + 5] = ((n1 - n2) + tmp1) >> ALLBITS; coeff[p + 2] = ((n1 - n2) - tmp1) >> ALLBITS; coeff[p + 6] = (tmp3 - tmp2) >> ALLBITS; coeff[p + 7] = (tmp4 - matr1[56 + i]) >> ALLBITS; } } /* *-------------------------------------------------------------- * * norm_quant_matrix() * * Remember that we're actually do an inverse Discrete Fourier Transformation here. * To get out the DCT-values ritght after the quantization step, we have to work our * conversion constants into the quantisation tables since Quantization means to * multiply the DCT-Result with empirically defined quantization factors. * * Results: * None. * * Side effects: * None. * *-------------------------------------------------------------- */ void norm_quant_matrix(m1) unsigned short m1[8][8]; { int i, j; double d; for (j = 0; j < 8; j++) { for (i = 0; i < 8; i++) { d = (double) m1[j][i]; if (i == 0 && j == 0) { d /= 8.0; } else if (i == 0 || j == 0) { d /= 8.0 / sqrt(2.0); } else { d /= 4.0; } m1[j][i] = (int) (d * (1 << VAL_BITS) * cos(PI * i / 16.0) * cos(PI * j / 16.0) + 0.5); } } } /* *-------------------------------------------------------------- * * dct_dump() * * Prints out an entire DCTBLOCK (64 Elements in 8 x 8 Order). * For debugging purpose only. * * Results: * None. * * Side effects: * None. * *-------------------------------------------------------------- */ void dct_dump(m1) DCTBLOCK m1; { int i, j; printf("\n"); for (j = 0; j < 8; j++) { for (i = 0; i < 8; i++) { printf("%d\t", (m1[j*8+i])); } printf("\n"); } }