xref: /plan9/sys/src/cmd/gs/jpeg/jidctint.c (revision 593dc095aefb2a85c828727bbfa9da139a49bdf4)
17dd7cddfSDavid du Colombier /*
27dd7cddfSDavid du Colombier  * jidctint.c
37dd7cddfSDavid du Colombier  *
4*593dc095SDavid du Colombier  * Copyright (C) 1991-1998, Thomas G. Lane.
57dd7cddfSDavid du Colombier  * This file is part of the Independent JPEG Group's software.
67dd7cddfSDavid du Colombier  * For conditions of distribution and use, see the accompanying README file.
77dd7cddfSDavid du Colombier  *
87dd7cddfSDavid du Colombier  * This file contains a slow-but-accurate integer implementation of the
97dd7cddfSDavid du Colombier  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
107dd7cddfSDavid du Colombier  * must also perform dequantization of the input coefficients.
117dd7cddfSDavid du Colombier  *
127dd7cddfSDavid du Colombier  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
137dd7cddfSDavid du Colombier  * on each row (or vice versa, but it's more convenient to emit a row at
147dd7cddfSDavid du Colombier  * a time).  Direct algorithms are also available, but they are much more
157dd7cddfSDavid du Colombier  * complex and seem not to be any faster when reduced to code.
167dd7cddfSDavid du Colombier  *
177dd7cddfSDavid du Colombier  * This implementation is based on an algorithm described in
187dd7cddfSDavid du Colombier  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
197dd7cddfSDavid du Colombier  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
207dd7cddfSDavid du Colombier  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
217dd7cddfSDavid du Colombier  * The primary algorithm described there uses 11 multiplies and 29 adds.
227dd7cddfSDavid du Colombier  * We use their alternate method with 12 multiplies and 32 adds.
237dd7cddfSDavid du Colombier  * The advantage of this method is that no data path contains more than one
247dd7cddfSDavid du Colombier  * multiplication; this allows a very simple and accurate implementation in
257dd7cddfSDavid du Colombier  * scaled fixed-point arithmetic, with a minimal number of shifts.
267dd7cddfSDavid du Colombier  */
277dd7cddfSDavid du Colombier 
287dd7cddfSDavid du Colombier #define JPEG_INTERNALS
297dd7cddfSDavid du Colombier #include "jinclude.h"
307dd7cddfSDavid du Colombier #include "jpeglib.h"
317dd7cddfSDavid du Colombier #include "jdct.h"		/* Private declarations for DCT subsystem */
327dd7cddfSDavid du Colombier 
337dd7cddfSDavid du Colombier #ifdef DCT_ISLOW_SUPPORTED
347dd7cddfSDavid du Colombier 
357dd7cddfSDavid du Colombier 
367dd7cddfSDavid du Colombier /*
377dd7cddfSDavid du Colombier  * This module is specialized to the case DCTSIZE = 8.
387dd7cddfSDavid du Colombier  */
397dd7cddfSDavid du Colombier 
407dd7cddfSDavid du Colombier #if DCTSIZE != 8
417dd7cddfSDavid du Colombier   Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
427dd7cddfSDavid du Colombier #endif
437dd7cddfSDavid du Colombier 
447dd7cddfSDavid du Colombier 
457dd7cddfSDavid du Colombier /*
467dd7cddfSDavid du Colombier  * The poop on this scaling stuff is as follows:
477dd7cddfSDavid du Colombier  *
487dd7cddfSDavid du Colombier  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
497dd7cddfSDavid du Colombier  * larger than the true IDCT outputs.  The final outputs are therefore
507dd7cddfSDavid du Colombier  * a factor of N larger than desired; since N=8 this can be cured by
517dd7cddfSDavid du Colombier  * a simple right shift at the end of the algorithm.  The advantage of
527dd7cddfSDavid du Colombier  * this arrangement is that we save two multiplications per 1-D IDCT,
537dd7cddfSDavid du Colombier  * because the y0 and y4 inputs need not be divided by sqrt(N).
547dd7cddfSDavid du Colombier  *
557dd7cddfSDavid du Colombier  * We have to do addition and subtraction of the integer inputs, which
567dd7cddfSDavid du Colombier  * is no problem, and multiplication by fractional constants, which is
577dd7cddfSDavid du Colombier  * a problem to do in integer arithmetic.  We multiply all the constants
587dd7cddfSDavid du Colombier  * by CONST_SCALE and convert them to integer constants (thus retaining
597dd7cddfSDavid du Colombier  * CONST_BITS bits of precision in the constants).  After doing a
607dd7cddfSDavid du Colombier  * multiplication we have to divide the product by CONST_SCALE, with proper
617dd7cddfSDavid du Colombier  * rounding, to produce the correct output.  This division can be done
627dd7cddfSDavid du Colombier  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
637dd7cddfSDavid du Colombier  * as long as possible so that partial sums can be added together with
647dd7cddfSDavid du Colombier  * full fractional precision.
657dd7cddfSDavid du Colombier  *
667dd7cddfSDavid du Colombier  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
677dd7cddfSDavid du Colombier  * they are represented to better-than-integral precision.  These outputs
687dd7cddfSDavid du Colombier  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
697dd7cddfSDavid du Colombier  * with the recommended scaling.  (To scale up 12-bit sample data further, an
707dd7cddfSDavid du Colombier  * intermediate INT32 array would be needed.)
717dd7cddfSDavid du Colombier  *
727dd7cddfSDavid du Colombier  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
737dd7cddfSDavid du Colombier  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
747dd7cddfSDavid du Colombier  * shows that the values given below are the most effective.
757dd7cddfSDavid du Colombier  */
767dd7cddfSDavid du Colombier 
777dd7cddfSDavid du Colombier #if BITS_IN_JSAMPLE == 8
787dd7cddfSDavid du Colombier #define CONST_BITS  13
797dd7cddfSDavid du Colombier #define PASS1_BITS  2
807dd7cddfSDavid du Colombier #else
817dd7cddfSDavid du Colombier #define CONST_BITS  13
827dd7cddfSDavid du Colombier #define PASS1_BITS  1		/* lose a little precision to avoid overflow */
837dd7cddfSDavid du Colombier #endif
847dd7cddfSDavid du Colombier 
857dd7cddfSDavid du Colombier /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
867dd7cddfSDavid du Colombier  * causing a lot of useless floating-point operations at run time.
877dd7cddfSDavid du Colombier  * To get around this we use the following pre-calculated constants.
887dd7cddfSDavid du Colombier  * If you change CONST_BITS you may want to add appropriate values.
897dd7cddfSDavid du Colombier  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
907dd7cddfSDavid du Colombier  */
917dd7cddfSDavid du Colombier 
927dd7cddfSDavid du Colombier #if CONST_BITS == 13
937dd7cddfSDavid du Colombier #define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
947dd7cddfSDavid du Colombier #define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
957dd7cddfSDavid du Colombier #define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
967dd7cddfSDavid du Colombier #define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
977dd7cddfSDavid du Colombier #define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
987dd7cddfSDavid du Colombier #define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
997dd7cddfSDavid du Colombier #define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
1007dd7cddfSDavid du Colombier #define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
1017dd7cddfSDavid du Colombier #define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
1027dd7cddfSDavid du Colombier #define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
1037dd7cddfSDavid du Colombier #define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
1047dd7cddfSDavid du Colombier #define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
1057dd7cddfSDavid du Colombier #else
1067dd7cddfSDavid du Colombier #define FIX_0_298631336  FIX(0.298631336)
1077dd7cddfSDavid du Colombier #define FIX_0_390180644  FIX(0.390180644)
1087dd7cddfSDavid du Colombier #define FIX_0_541196100  FIX(0.541196100)
1097dd7cddfSDavid du Colombier #define FIX_0_765366865  FIX(0.765366865)
1107dd7cddfSDavid du Colombier #define FIX_0_899976223  FIX(0.899976223)
1117dd7cddfSDavid du Colombier #define FIX_1_175875602  FIX(1.175875602)
1127dd7cddfSDavid du Colombier #define FIX_1_501321110  FIX(1.501321110)
1137dd7cddfSDavid du Colombier #define FIX_1_847759065  FIX(1.847759065)
1147dd7cddfSDavid du Colombier #define FIX_1_961570560  FIX(1.961570560)
1157dd7cddfSDavid du Colombier #define FIX_2_053119869  FIX(2.053119869)
1167dd7cddfSDavid du Colombier #define FIX_2_562915447  FIX(2.562915447)
1177dd7cddfSDavid du Colombier #define FIX_3_072711026  FIX(3.072711026)
1187dd7cddfSDavid du Colombier #endif
1197dd7cddfSDavid du Colombier 
1207dd7cddfSDavid du Colombier 
1217dd7cddfSDavid du Colombier /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
1227dd7cddfSDavid du Colombier  * For 8-bit samples with the recommended scaling, all the variable
1237dd7cddfSDavid du Colombier  * and constant values involved are no more than 16 bits wide, so a
1247dd7cddfSDavid du Colombier  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
1257dd7cddfSDavid du Colombier  * For 12-bit samples, a full 32-bit multiplication will be needed.
1267dd7cddfSDavid du Colombier  */
1277dd7cddfSDavid du Colombier 
1287dd7cddfSDavid du Colombier #if BITS_IN_JSAMPLE == 8
1297dd7cddfSDavid du Colombier #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
1307dd7cddfSDavid du Colombier #else
1317dd7cddfSDavid du Colombier #define MULTIPLY(var,const)  ((var) * (const))
1327dd7cddfSDavid du Colombier #endif
1337dd7cddfSDavid du Colombier 
1347dd7cddfSDavid du Colombier 
1357dd7cddfSDavid du Colombier /* Dequantize a coefficient by multiplying it by the multiplier-table
1367dd7cddfSDavid du Colombier  * entry; produce an int result.  In this module, both inputs and result
1377dd7cddfSDavid du Colombier  * are 16 bits or less, so either int or short multiply will work.
1387dd7cddfSDavid du Colombier  */
1397dd7cddfSDavid du Colombier 
1407dd7cddfSDavid du Colombier #define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
1417dd7cddfSDavid du Colombier 
1427dd7cddfSDavid du Colombier 
1437dd7cddfSDavid du Colombier /*
1447dd7cddfSDavid du Colombier  * Perform dequantization and inverse DCT on one block of coefficients.
1457dd7cddfSDavid du Colombier  */
1467dd7cddfSDavid du Colombier 
1477dd7cddfSDavid du Colombier GLOBAL(void)
1487dd7cddfSDavid du Colombier jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1497dd7cddfSDavid du Colombier 		 JCOEFPTR coef_block,
1507dd7cddfSDavid du Colombier 		 JSAMPARRAY output_buf, JDIMENSION output_col)
1517dd7cddfSDavid du Colombier {
1527dd7cddfSDavid du Colombier   INT32 tmp0, tmp1, tmp2, tmp3;
1537dd7cddfSDavid du Colombier   INT32 tmp10, tmp11, tmp12, tmp13;
1547dd7cddfSDavid du Colombier   INT32 z1, z2, z3, z4, z5;
1557dd7cddfSDavid du Colombier   JCOEFPTR inptr;
1567dd7cddfSDavid du Colombier   ISLOW_MULT_TYPE * quantptr;
1577dd7cddfSDavid du Colombier   int * wsptr;
1587dd7cddfSDavid du Colombier   JSAMPROW outptr;
1597dd7cddfSDavid du Colombier   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1607dd7cddfSDavid du Colombier   int ctr;
1617dd7cddfSDavid du Colombier   int workspace[DCTSIZE2];	/* buffers data between passes */
1627dd7cddfSDavid du Colombier   SHIFT_TEMPS
1637dd7cddfSDavid du Colombier 
1647dd7cddfSDavid du Colombier   /* Pass 1: process columns from input, store into work array. */
1657dd7cddfSDavid du Colombier   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
1667dd7cddfSDavid du Colombier   /* furthermore, we scale the results by 2**PASS1_BITS. */
1677dd7cddfSDavid du Colombier 
1687dd7cddfSDavid du Colombier   inptr = coef_block;
1697dd7cddfSDavid du Colombier   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1707dd7cddfSDavid du Colombier   wsptr = workspace;
1717dd7cddfSDavid du Colombier   for (ctr = DCTSIZE; ctr > 0; ctr--) {
1727dd7cddfSDavid du Colombier     /* Due to quantization, we will usually find that many of the input
1737dd7cddfSDavid du Colombier      * coefficients are zero, especially the AC terms.  We can exploit this
1747dd7cddfSDavid du Colombier      * by short-circuiting the IDCT calculation for any column in which all
1757dd7cddfSDavid du Colombier      * the AC terms are zero.  In that case each output is equal to the
1767dd7cddfSDavid du Colombier      * DC coefficient (with scale factor as needed).
1777dd7cddfSDavid du Colombier      * With typical images and quantization tables, half or more of the
1787dd7cddfSDavid du Colombier      * column DCT calculations can be simplified this way.
1797dd7cddfSDavid du Colombier      */
1807dd7cddfSDavid du Colombier 
181*593dc095SDavid du Colombier     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
182*593dc095SDavid du Colombier 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
183*593dc095SDavid du Colombier 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
184*593dc095SDavid du Colombier 	inptr[DCTSIZE*7] == 0) {
1857dd7cddfSDavid du Colombier       /* AC terms all zero */
1867dd7cddfSDavid du Colombier       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
1877dd7cddfSDavid du Colombier 
1887dd7cddfSDavid du Colombier       wsptr[DCTSIZE*0] = dcval;
1897dd7cddfSDavid du Colombier       wsptr[DCTSIZE*1] = dcval;
1907dd7cddfSDavid du Colombier       wsptr[DCTSIZE*2] = dcval;
1917dd7cddfSDavid du Colombier       wsptr[DCTSIZE*3] = dcval;
1927dd7cddfSDavid du Colombier       wsptr[DCTSIZE*4] = dcval;
1937dd7cddfSDavid du Colombier       wsptr[DCTSIZE*5] = dcval;
1947dd7cddfSDavid du Colombier       wsptr[DCTSIZE*6] = dcval;
1957dd7cddfSDavid du Colombier       wsptr[DCTSIZE*7] = dcval;
1967dd7cddfSDavid du Colombier 
1977dd7cddfSDavid du Colombier       inptr++;			/* advance pointers to next column */
1987dd7cddfSDavid du Colombier       quantptr++;
1997dd7cddfSDavid du Colombier       wsptr++;
2007dd7cddfSDavid du Colombier       continue;
2017dd7cddfSDavid du Colombier     }
2027dd7cddfSDavid du Colombier 
2037dd7cddfSDavid du Colombier     /* Even part: reverse the even part of the forward DCT. */
2047dd7cddfSDavid du Colombier     /* The rotator is sqrt(2)*c(-6). */
2057dd7cddfSDavid du Colombier 
2067dd7cddfSDavid du Colombier     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2077dd7cddfSDavid du Colombier     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2087dd7cddfSDavid du Colombier 
2097dd7cddfSDavid du Colombier     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
2107dd7cddfSDavid du Colombier     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
2117dd7cddfSDavid du Colombier     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
2127dd7cddfSDavid du Colombier 
2137dd7cddfSDavid du Colombier     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2147dd7cddfSDavid du Colombier     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2157dd7cddfSDavid du Colombier 
2167dd7cddfSDavid du Colombier     tmp0 = (z2 + z3) << CONST_BITS;
2177dd7cddfSDavid du Colombier     tmp1 = (z2 - z3) << CONST_BITS;
2187dd7cddfSDavid du Colombier 
2197dd7cddfSDavid du Colombier     tmp10 = tmp0 + tmp3;
2207dd7cddfSDavid du Colombier     tmp13 = tmp0 - tmp3;
2217dd7cddfSDavid du Colombier     tmp11 = tmp1 + tmp2;
2227dd7cddfSDavid du Colombier     tmp12 = tmp1 - tmp2;
2237dd7cddfSDavid du Colombier 
2247dd7cddfSDavid du Colombier     /* Odd part per figure 8; the matrix is unitary and hence its
2257dd7cddfSDavid du Colombier      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
2267dd7cddfSDavid du Colombier      */
2277dd7cddfSDavid du Colombier 
2287dd7cddfSDavid du Colombier     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2297dd7cddfSDavid du Colombier     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2307dd7cddfSDavid du Colombier     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2317dd7cddfSDavid du Colombier     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2327dd7cddfSDavid du Colombier 
2337dd7cddfSDavid du Colombier     z1 = tmp0 + tmp3;
2347dd7cddfSDavid du Colombier     z2 = tmp1 + tmp2;
2357dd7cddfSDavid du Colombier     z3 = tmp0 + tmp2;
2367dd7cddfSDavid du Colombier     z4 = tmp1 + tmp3;
2377dd7cddfSDavid du Colombier     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
2387dd7cddfSDavid du Colombier 
2397dd7cddfSDavid du Colombier     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
2407dd7cddfSDavid du Colombier     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
2417dd7cddfSDavid du Colombier     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
2427dd7cddfSDavid du Colombier     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
2437dd7cddfSDavid du Colombier     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
2447dd7cddfSDavid du Colombier     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
2457dd7cddfSDavid du Colombier     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
2467dd7cddfSDavid du Colombier     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
2477dd7cddfSDavid du Colombier 
2487dd7cddfSDavid du Colombier     z3 += z5;
2497dd7cddfSDavid du Colombier     z4 += z5;
2507dd7cddfSDavid du Colombier 
2517dd7cddfSDavid du Colombier     tmp0 += z1 + z3;
2527dd7cddfSDavid du Colombier     tmp1 += z2 + z4;
2537dd7cddfSDavid du Colombier     tmp2 += z2 + z3;
2547dd7cddfSDavid du Colombier     tmp3 += z1 + z4;
2557dd7cddfSDavid du Colombier 
2567dd7cddfSDavid du Colombier     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
2577dd7cddfSDavid du Colombier 
2587dd7cddfSDavid du Colombier     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
2597dd7cddfSDavid du Colombier     wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
2607dd7cddfSDavid du Colombier     wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
2617dd7cddfSDavid du Colombier     wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
2627dd7cddfSDavid du Colombier     wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
2637dd7cddfSDavid du Colombier     wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
2647dd7cddfSDavid du Colombier     wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
2657dd7cddfSDavid du Colombier     wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
2667dd7cddfSDavid du Colombier 
2677dd7cddfSDavid du Colombier     inptr++;			/* advance pointers to next column */
2687dd7cddfSDavid du Colombier     quantptr++;
2697dd7cddfSDavid du Colombier     wsptr++;
2707dd7cddfSDavid du Colombier   }
2717dd7cddfSDavid du Colombier 
2727dd7cddfSDavid du Colombier   /* Pass 2: process rows from work array, store into output array. */
2737dd7cddfSDavid du Colombier   /* Note that we must descale the results by a factor of 8 == 2**3, */
2747dd7cddfSDavid du Colombier   /* and also undo the PASS1_BITS scaling. */
2757dd7cddfSDavid du Colombier 
2767dd7cddfSDavid du Colombier   wsptr = workspace;
2777dd7cddfSDavid du Colombier   for (ctr = 0; ctr < DCTSIZE; ctr++) {
2787dd7cddfSDavid du Colombier     outptr = output_buf[ctr] + output_col;
2797dd7cddfSDavid du Colombier     /* Rows of zeroes can be exploited in the same way as we did with columns.
2807dd7cddfSDavid du Colombier      * However, the column calculation has created many nonzero AC terms, so
2817dd7cddfSDavid du Colombier      * the simplification applies less often (typically 5% to 10% of the time).
2827dd7cddfSDavid du Colombier      * On machines with very fast multiplication, it's possible that the
2837dd7cddfSDavid du Colombier      * test takes more time than it's worth.  In that case this section
2847dd7cddfSDavid du Colombier      * may be commented out.
2857dd7cddfSDavid du Colombier      */
2867dd7cddfSDavid du Colombier 
2877dd7cddfSDavid du Colombier #ifndef NO_ZERO_ROW_TEST
288*593dc095SDavid du Colombier     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
289*593dc095SDavid du Colombier 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
2907dd7cddfSDavid du Colombier       /* AC terms all zero */
2917dd7cddfSDavid du Colombier       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
2927dd7cddfSDavid du Colombier 				  & RANGE_MASK];
2937dd7cddfSDavid du Colombier 
2947dd7cddfSDavid du Colombier       outptr[0] = dcval;
2957dd7cddfSDavid du Colombier       outptr[1] = dcval;
2967dd7cddfSDavid du Colombier       outptr[2] = dcval;
2977dd7cddfSDavid du Colombier       outptr[3] = dcval;
2987dd7cddfSDavid du Colombier       outptr[4] = dcval;
2997dd7cddfSDavid du Colombier       outptr[5] = dcval;
3007dd7cddfSDavid du Colombier       outptr[6] = dcval;
3017dd7cddfSDavid du Colombier       outptr[7] = dcval;
3027dd7cddfSDavid du Colombier 
3037dd7cddfSDavid du Colombier       wsptr += DCTSIZE;		/* advance pointer to next row */
3047dd7cddfSDavid du Colombier       continue;
3057dd7cddfSDavid du Colombier     }
3067dd7cddfSDavid du Colombier #endif
3077dd7cddfSDavid du Colombier 
3087dd7cddfSDavid du Colombier     /* Even part: reverse the even part of the forward DCT. */
3097dd7cddfSDavid du Colombier     /* The rotator is sqrt(2)*c(-6). */
3107dd7cddfSDavid du Colombier 
3117dd7cddfSDavid du Colombier     z2 = (INT32) wsptr[2];
3127dd7cddfSDavid du Colombier     z3 = (INT32) wsptr[6];
3137dd7cddfSDavid du Colombier 
3147dd7cddfSDavid du Colombier     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
3157dd7cddfSDavid du Colombier     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
3167dd7cddfSDavid du Colombier     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
3177dd7cddfSDavid du Colombier 
3187dd7cddfSDavid du Colombier     tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
3197dd7cddfSDavid du Colombier     tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
3207dd7cddfSDavid du Colombier 
3217dd7cddfSDavid du Colombier     tmp10 = tmp0 + tmp3;
3227dd7cddfSDavid du Colombier     tmp13 = tmp0 - tmp3;
3237dd7cddfSDavid du Colombier     tmp11 = tmp1 + tmp2;
3247dd7cddfSDavid du Colombier     tmp12 = tmp1 - tmp2;
3257dd7cddfSDavid du Colombier 
3267dd7cddfSDavid du Colombier     /* Odd part per figure 8; the matrix is unitary and hence its
3277dd7cddfSDavid du Colombier      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
3287dd7cddfSDavid du Colombier      */
3297dd7cddfSDavid du Colombier 
3307dd7cddfSDavid du Colombier     tmp0 = (INT32) wsptr[7];
3317dd7cddfSDavid du Colombier     tmp1 = (INT32) wsptr[5];
3327dd7cddfSDavid du Colombier     tmp2 = (INT32) wsptr[3];
3337dd7cddfSDavid du Colombier     tmp3 = (INT32) wsptr[1];
3347dd7cddfSDavid du Colombier 
3357dd7cddfSDavid du Colombier     z1 = tmp0 + tmp3;
3367dd7cddfSDavid du Colombier     z2 = tmp1 + tmp2;
3377dd7cddfSDavid du Colombier     z3 = tmp0 + tmp2;
3387dd7cddfSDavid du Colombier     z4 = tmp1 + tmp3;
3397dd7cddfSDavid du Colombier     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
3407dd7cddfSDavid du Colombier 
3417dd7cddfSDavid du Colombier     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
3427dd7cddfSDavid du Colombier     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
3437dd7cddfSDavid du Colombier     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
3447dd7cddfSDavid du Colombier     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
3457dd7cddfSDavid du Colombier     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
3467dd7cddfSDavid du Colombier     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
3477dd7cddfSDavid du Colombier     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
3487dd7cddfSDavid du Colombier     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
3497dd7cddfSDavid du Colombier 
3507dd7cddfSDavid du Colombier     z3 += z5;
3517dd7cddfSDavid du Colombier     z4 += z5;
3527dd7cddfSDavid du Colombier 
3537dd7cddfSDavid du Colombier     tmp0 += z1 + z3;
3547dd7cddfSDavid du Colombier     tmp1 += z2 + z4;
3557dd7cddfSDavid du Colombier     tmp2 += z2 + z3;
3567dd7cddfSDavid du Colombier     tmp3 += z1 + z4;
3577dd7cddfSDavid du Colombier 
3587dd7cddfSDavid du Colombier     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
3597dd7cddfSDavid du Colombier 
3607dd7cddfSDavid du Colombier     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
3617dd7cddfSDavid du Colombier 					  CONST_BITS+PASS1_BITS+3)
3627dd7cddfSDavid du Colombier 			    & RANGE_MASK];
3637dd7cddfSDavid du Colombier     outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
3647dd7cddfSDavid du Colombier 					  CONST_BITS+PASS1_BITS+3)
3657dd7cddfSDavid du Colombier 			    & RANGE_MASK];
3667dd7cddfSDavid du Colombier     outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
3677dd7cddfSDavid du Colombier 					  CONST_BITS+PASS1_BITS+3)
3687dd7cddfSDavid du Colombier 			    & RANGE_MASK];
3697dd7cddfSDavid du Colombier     outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
3707dd7cddfSDavid du Colombier 					  CONST_BITS+PASS1_BITS+3)
3717dd7cddfSDavid du Colombier 			    & RANGE_MASK];
3727dd7cddfSDavid du Colombier     outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
3737dd7cddfSDavid du Colombier 					  CONST_BITS+PASS1_BITS+3)
3747dd7cddfSDavid du Colombier 			    & RANGE_MASK];
3757dd7cddfSDavid du Colombier     outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
3767dd7cddfSDavid du Colombier 					  CONST_BITS+PASS1_BITS+3)
3777dd7cddfSDavid du Colombier 			    & RANGE_MASK];
3787dd7cddfSDavid du Colombier     outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
3797dd7cddfSDavid du Colombier 					  CONST_BITS+PASS1_BITS+3)
3807dd7cddfSDavid du Colombier 			    & RANGE_MASK];
3817dd7cddfSDavid du Colombier     outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
3827dd7cddfSDavid du Colombier 					  CONST_BITS+PASS1_BITS+3)
3837dd7cddfSDavid du Colombier 			    & RANGE_MASK];
3847dd7cddfSDavid du Colombier 
3857dd7cddfSDavid du Colombier     wsptr += DCTSIZE;		/* advance pointer to next row */
3867dd7cddfSDavid du Colombier   }
3877dd7cddfSDavid du Colombier }
3887dd7cddfSDavid du Colombier 
3897dd7cddfSDavid du Colombier #endif /* DCT_ISLOW_SUPPORTED */
390