media/libjpeg/jidctint.c

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  * jidctint.c
     3  *
     4  * Copyright (C) 1991-1998, Thomas G. Lane.
     5  * Modification developed 2002-2009 by Guido Vollbeding.
     6  * This file is part of the Independent JPEG Group's software.
     7  * For conditions of distribution and use, see the accompanying README file.
     8  *
     9  * This file contains a slow-but-accurate integer implementation of the
    10  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
    11  * must also perform dequantization of the input coefficients.
    12  *
    13  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
    14  * on each row (or vice versa, but it's more convenient to emit a row at
    15  * a time).  Direct algorithms are also available, but they are much more
    16  * complex and seem not to be any faster when reduced to code.
    17  *
    18  * This implementation is based on an algorithm described in
    19  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
    20  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
    21  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
    22  * The primary algorithm described there uses 11 multiplies and 29 adds.
    23  * We use their alternate method with 12 multiplies and 32 adds.
    24  * The advantage of this method is that no data path contains more than one
    25  * multiplication; this allows a very simple and accurate implementation in
    26  * scaled fixed-point arithmetic, with a minimal number of shifts.
    27  *
    28  * We also provide IDCT routines with various output sample block sizes for
    29  * direct resolution reduction or enlargement without additional resampling:
    30  * NxN (N=1...16) pixels for one 8x8 input DCT block.
    31  *
    32  * For N<8 we simply take the corresponding low-frequency coefficients of
    33  * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
    34  * to yield the downscaled outputs.
    35  * This can be seen as direct low-pass downsampling from the DCT domain
    36  * point of view rather than the usual spatial domain point of view,
    37  * yielding significant computational savings and results at least
    38  * as good as common bilinear (averaging) spatial downsampling.
    39  *
    40  * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as
    41  * lower frequencies and higher frequencies assumed to be zero.
    42  * It turns out that the computational effort is similar to the 8x8 IDCT
    43  * regarding the output size.
    44  * Furthermore, the scaling and descaling is the same for all IDCT sizes.
    45  *
    46  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
    47  * since there would be too many additional constants to pre-calculate.
    48  */
    50 #define JPEG_INTERNALS
    51 #include "jinclude.h"
    52 #include "jpeglib.h"
    53 #include "jdct.h"		/* Private declarations for DCT subsystem */
    55 #ifdef DCT_ISLOW_SUPPORTED
    58 /*
    59  * This module is specialized to the case DCTSIZE = 8.
    60  */
    62 #if DCTSIZE != 8
    63   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
    64 #endif
    67 /*
    68  * The poop on this scaling stuff is as follows:
    69  *
    70  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
    71  * larger than the true IDCT outputs.  The final outputs are therefore
    72  * a factor of N larger than desired; since N=8 this can be cured by
    73  * a simple right shift at the end of the algorithm.  The advantage of
    74  * this arrangement is that we save two multiplications per 1-D IDCT,
    75  * because the y0 and y4 inputs need not be divided by sqrt(N).
    76  *
    77  * We have to do addition and subtraction of the integer inputs, which
    78  * is no problem, and multiplication by fractional constants, which is
    79  * a problem to do in integer arithmetic.  We multiply all the constants
    80  * by CONST_SCALE and convert them to integer constants (thus retaining
    81  * CONST_BITS bits of precision in the constants).  After doing a
    82  * multiplication we have to divide the product by CONST_SCALE, with proper
    83  * rounding, to produce the correct output.  This division can be done
    84  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
    85  * as long as possible so that partial sums can be added together with
    86  * full fractional precision.
    87  *
    88  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
    89  * they are represented to better-than-integral precision.  These outputs
    90  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
    91  * with the recommended scaling.  (To scale up 12-bit sample data further, an
    92  * intermediate INT32 array would be needed.)
    93  *
    94  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
    95  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
    96  * shows that the values given below are the most effective.
    97  */
    99 #if BITS_IN_JSAMPLE == 8
   100 #define CONST_BITS  13
   101 #define PASS1_BITS  2
   102 #else
   103 #define CONST_BITS  13
   104 #define PASS1_BITS  1		/* lose a little precision to avoid overflow */
   105 #endif
   107 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
   108  * causing a lot of useless floating-point operations at run time.
   109  * To get around this we use the following pre-calculated constants.
   110  * If you change CONST_BITS you may want to add appropriate values.
   111  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
   112  */
   114 #if CONST_BITS == 13
   115 #define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
   116 #define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
   117 #define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
   118 #define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
   119 #define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
   120 #define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
   121 #define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
   122 #define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
   123 #define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
   124 #define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
   125 #define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
   126 #define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
   127 #else
   128 #define FIX_0_298631336  FIX(0.298631336)
   129 #define FIX_0_390180644  FIX(0.390180644)
   130 #define FIX_0_541196100  FIX(0.541196100)
   131 #define FIX_0_765366865  FIX(0.765366865)
   132 #define FIX_0_899976223  FIX(0.899976223)
   133 #define FIX_1_175875602  FIX(1.175875602)
   134 #define FIX_1_501321110  FIX(1.501321110)
   135 #define FIX_1_847759065  FIX(1.847759065)
   136 #define FIX_1_961570560  FIX(1.961570560)
   137 #define FIX_2_053119869  FIX(2.053119869)
   138 #define FIX_2_562915447  FIX(2.562915447)
   139 #define FIX_3_072711026  FIX(3.072711026)
   140 #endif
   143 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
   144  * For 8-bit samples with the recommended scaling, all the variable
   145  * and constant values involved are no more than 16 bits wide, so a
   146  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
   147  * For 12-bit samples, a full 32-bit multiplication will be needed.
   148  */
   150 #if BITS_IN_JSAMPLE == 8
   151 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
   152 #else
   153 #define MULTIPLY(var,const)  ((var) * (const))
   154 #endif
   157 /* Dequantize a coefficient by multiplying it by the multiplier-table
   158  * entry; produce an int result.  In this module, both inputs and result
   159  * are 16 bits or less, so either int or short multiply will work.
   160  */
   162 #define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
   165 /*
   166  * Perform dequantization and inverse DCT on one block of coefficients.
   167  */
   169 GLOBAL(void)
   170 jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   171 		 JCOEFPTR coef_block,
   172 		 JSAMPARRAY output_buf, JDIMENSION output_col)
   173 {
   174   INT32 tmp0, tmp1, tmp2, tmp3;
   175   INT32 tmp10, tmp11, tmp12, tmp13;
   176   INT32 z1, z2, z3, z4, z5;
   177   JCOEFPTR inptr;
   178   ISLOW_MULT_TYPE * quantptr;
   179   int * wsptr;
   180   JSAMPROW outptr;
   181   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   182   int ctr;
   183   int workspace[DCTSIZE2];	/* buffers data between passes */
   184   SHIFT_TEMPS
   186   /* Pass 1: process columns from input, store into work array. */
   187   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
   188   /* furthermore, we scale the results by 2**PASS1_BITS. */
   190   inptr = coef_block;
   191   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   192   wsptr = workspace;
   193   for (ctr = DCTSIZE; ctr > 0; ctr--) {
   194     /* Due to quantization, we will usually find that many of the input
   195      * coefficients are zero, especially the AC terms.  We can exploit this
   196      * by short-circuiting the IDCT calculation for any column in which all
   197      * the AC terms are zero.  In that case each output is equal to the
   198      * DC coefficient (with scale factor as needed).
   199      * With typical images and quantization tables, half or more of the
   200      * column DCT calculations can be simplified this way.
   201      */
   203     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
   204 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
   205 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
   206 	inptr[DCTSIZE*7] == 0) {
   207       /* AC terms all zero */
   208       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
   210       wsptr[DCTSIZE*0] = dcval;
   211       wsptr[DCTSIZE*1] = dcval;
   212       wsptr[DCTSIZE*2] = dcval;
   213       wsptr[DCTSIZE*3] = dcval;
   214       wsptr[DCTSIZE*4] = dcval;
   215       wsptr[DCTSIZE*5] = dcval;
   216       wsptr[DCTSIZE*6] = dcval;
   217       wsptr[DCTSIZE*7] = dcval;
   219       inptr++;			/* advance pointers to next column */
   220       quantptr++;
   221       wsptr++;
   222       continue;
   223     }
   225     /* Even part: reverse the even part of the forward DCT. */
   226     /* The rotator is sqrt(2)*c(-6). */
   228     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   229     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   231     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
   232     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
   233     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
   235     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   236     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   238     tmp0 = (z2 + z3) << CONST_BITS;
   239     tmp1 = (z2 - z3) << CONST_BITS;
   241     tmp10 = tmp0 + tmp3;
   242     tmp13 = tmp0 - tmp3;
   243     tmp11 = tmp1 + tmp2;
   244     tmp12 = tmp1 - tmp2;
   246     /* Odd part per figure 8; the matrix is unitary and hence its
   247      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
   248      */
   250     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   251     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   252     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   253     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   255     z1 = tmp0 + tmp3;
   256     z2 = tmp1 + tmp2;
   257     z3 = tmp0 + tmp2;
   258     z4 = tmp1 + tmp3;
   259     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
   261     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
   262     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
   263     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
   264     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
   265     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
   266     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
   267     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
   268     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
   270     z3 += z5;
   271     z4 += z5;
   273     tmp0 += z1 + z3;
   274     tmp1 += z2 + z4;
   275     tmp2 += z2 + z3;
   276     tmp3 += z1 + z4;
   278     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
   280     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
   281     wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
   282     wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
   283     wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
   284     wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
   285     wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
   286     wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
   287     wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
   289     inptr++;			/* advance pointers to next column */
   290     quantptr++;
   291     wsptr++;
   292   }
   294   /* Pass 2: process rows from work array, store into output array. */
   295   /* Note that we must descale the results by a factor of 8 == 2**3, */
   296   /* and also undo the PASS1_BITS scaling. */
   298   wsptr = workspace;
   299   for (ctr = 0; ctr < DCTSIZE; ctr++) {
   300     outptr = output_buf[ctr] + output_col;
   301     /* Rows of zeroes can be exploited in the same way as we did with columns.
   302      * However, the column calculation has created many nonzero AC terms, so
   303      * the simplification applies less often (typically 5% to 10% of the time).
   304      * On machines with very fast multiplication, it's possible that the
   305      * test takes more time than it's worth.  In that case this section
   306      * may be commented out.
   307      */
   309 #ifndef NO_ZERO_ROW_TEST
   310     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
   311 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
   312       /* AC terms all zero */
   313       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
   314 				  & RANGE_MASK];
   316       outptr[0] = dcval;
   317       outptr[1] = dcval;
   318       outptr[2] = dcval;
   319       outptr[3] = dcval;
   320       outptr[4] = dcval;
   321       outptr[5] = dcval;
   322       outptr[6] = dcval;
   323       outptr[7] = dcval;
   325       wsptr += DCTSIZE;		/* advance pointer to next row */
   326       continue;
   327     }
   328 #endif
   330     /* Even part: reverse the even part of the forward DCT. */
   331     /* The rotator is sqrt(2)*c(-6). */
   333     z2 = (INT32) wsptr[2];
   334     z3 = (INT32) wsptr[6];
   336     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
   337     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
   338     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
   340     tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
   341     tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
   343     tmp10 = tmp0 + tmp3;
   344     tmp13 = tmp0 - tmp3;
   345     tmp11 = tmp1 + tmp2;
   346     tmp12 = tmp1 - tmp2;
   348     /* Odd part per figure 8; the matrix is unitary and hence its
   349      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
   350      */
   352     tmp0 = (INT32) wsptr[7];
   353     tmp1 = (INT32) wsptr[5];
   354     tmp2 = (INT32) wsptr[3];
   355     tmp3 = (INT32) wsptr[1];
   357     z1 = tmp0 + tmp3;
   358     z2 = tmp1 + tmp2;
   359     z3 = tmp0 + tmp2;
   360     z4 = tmp1 + tmp3;
   361     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
   363     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
   364     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
   365     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
   366     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
   367     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
   368     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
   369     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
   370     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
   372     z3 += z5;
   373     z4 += z5;
   375     tmp0 += z1 + z3;
   376     tmp1 += z2 + z4;
   377     tmp2 += z2 + z3;
   378     tmp3 += z1 + z4;
   380     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
   382     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
   383 					  CONST_BITS+PASS1_BITS+3)
   384 			    & RANGE_MASK];
   385     outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
   386 					  CONST_BITS+PASS1_BITS+3)
   387 			    & RANGE_MASK];
   388     outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
   389 					  CONST_BITS+PASS1_BITS+3)
   390 			    & RANGE_MASK];
   391     outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
   392 					  CONST_BITS+PASS1_BITS+3)
   393 			    & RANGE_MASK];
   394     outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
   395 					  CONST_BITS+PASS1_BITS+3)
   396 			    & RANGE_MASK];
   397     outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
   398 					  CONST_BITS+PASS1_BITS+3)
   399 			    & RANGE_MASK];
   400     outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
   401 					  CONST_BITS+PASS1_BITS+3)
   402 			    & RANGE_MASK];
   403     outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
   404 					  CONST_BITS+PASS1_BITS+3)
   405 			    & RANGE_MASK];
   407     wsptr += DCTSIZE;		/* advance pointer to next row */
   408   }
   409 }
   411 #ifdef IDCT_SCALING_SUPPORTED
   414 /*
   415  * Perform dequantization and inverse DCT on one block of coefficients,
   416  * producing a 7x7 output block.
   417  *
   418  * Optimized algorithm with 12 multiplications in the 1-D kernel.
   419  * cK represents sqrt(2) * cos(K*pi/14).
   420  */
   422 GLOBAL(void)
   423 jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   424 	       JCOEFPTR coef_block,
   425 	       JSAMPARRAY output_buf, JDIMENSION output_col)
   426 {
   427   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
   428   INT32 z1, z2, z3;
   429   JCOEFPTR inptr;
   430   ISLOW_MULT_TYPE * quantptr;
   431   int * wsptr;
   432   JSAMPROW outptr;
   433   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   434   int ctr;
   435   int workspace[7*7];	/* buffers data between passes */
   436   SHIFT_TEMPS
   438   /* Pass 1: process columns from input, store into work array. */
   440   inptr = coef_block;
   441   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   442   wsptr = workspace;
   443   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
   444     /* Even part */
   446     tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   447     tmp13 <<= CONST_BITS;
   448     /* Add fudge factor here for final descale. */
   449     tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
   451     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   452     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   453     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   455     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
   456     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
   457     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
   458     tmp0 = z1 + z3;
   459     z2 -= tmp0;
   460     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
   461     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
   462     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
   463     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
   465     /* Odd part */
   467     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   468     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   469     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   471     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
   472     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
   473     tmp0 = tmp1 - tmp2;
   474     tmp1 += tmp2;
   475     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
   476     tmp1 += tmp2;
   477     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
   478     tmp0 += z2;
   479     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
   481     /* Final output stage */
   483     wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
   484     wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
   485     wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
   486     wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
   487     wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
   488     wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
   489     wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
   490   }
   492   /* Pass 2: process 7 rows from work array, store into output array. */
   494   wsptr = workspace;
   495   for (ctr = 0; ctr < 7; ctr++) {
   496     outptr = output_buf[ctr] + output_col;
   498     /* Even part */
   500     /* Add fudge factor here for final descale. */
   501     tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   502     tmp13 <<= CONST_BITS;
   504     z1 = (INT32) wsptr[2];
   505     z2 = (INT32) wsptr[4];
   506     z3 = (INT32) wsptr[6];
   508     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
   509     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
   510     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
   511     tmp0 = z1 + z3;
   512     z2 -= tmp0;
   513     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
   514     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
   515     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
   516     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
   518     /* Odd part */
   520     z1 = (INT32) wsptr[1];
   521     z2 = (INT32) wsptr[3];
   522     z3 = (INT32) wsptr[5];
   524     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
   525     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
   526     tmp0 = tmp1 - tmp2;
   527     tmp1 += tmp2;
   528     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
   529     tmp1 += tmp2;
   530     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
   531     tmp0 += z2;
   532     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
   534     /* Final output stage */
   536     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   537 					      CONST_BITS+PASS1_BITS+3)
   538 			    & RANGE_MASK];
   539     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   540 					      CONST_BITS+PASS1_BITS+3)
   541 			    & RANGE_MASK];
   542     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
   543 					      CONST_BITS+PASS1_BITS+3)
   544 			    & RANGE_MASK];
   545     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
   546 					      CONST_BITS+PASS1_BITS+3)
   547 			    & RANGE_MASK];
   548     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
   549 					      CONST_BITS+PASS1_BITS+3)
   550 			    & RANGE_MASK];
   551     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
   552 					      CONST_BITS+PASS1_BITS+3)
   553 			    & RANGE_MASK];
   554     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
   555 					      CONST_BITS+PASS1_BITS+3)
   556 			    & RANGE_MASK];
   558     wsptr += 7;		/* advance pointer to next row */
   559   }
   560 }
   563 /*
   564  * Perform dequantization and inverse DCT on one block of coefficients,
   565  * producing a reduced-size 6x6 output block.
   566  *
   567  * Optimized algorithm with 3 multiplications in the 1-D kernel.
   568  * cK represents sqrt(2) * cos(K*pi/12).
   569  */
   571 GLOBAL(void)
   572 jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   573 	       JCOEFPTR coef_block,
   574 	       JSAMPARRAY output_buf, JDIMENSION output_col)
   575 {
   576   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
   577   INT32 z1, z2, z3;
   578   JCOEFPTR inptr;
   579   ISLOW_MULT_TYPE * quantptr;
   580   int * wsptr;
   581   JSAMPROW outptr;
   582   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   583   int ctr;
   584   int workspace[6*6];	/* buffers data between passes */
   585   SHIFT_TEMPS
   587   /* Pass 1: process columns from input, store into work array. */
   589   inptr = coef_block;
   590   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   591   wsptr = workspace;
   592   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
   593     /* Even part */
   595     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   596     tmp0 <<= CONST_BITS;
   597     /* Add fudge factor here for final descale. */
   598     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
   599     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   600     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
   601     tmp1 = tmp0 + tmp10;
   602     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
   603     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   604     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
   605     tmp10 = tmp1 + tmp0;
   606     tmp12 = tmp1 - tmp0;
   608     /* Odd part */
   610     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   611     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   612     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   613     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
   614     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
   615     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
   616     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
   618     /* Final output stage */
   620     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
   621     wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
   622     wsptr[6*1] = (int) (tmp11 + tmp1);
   623     wsptr[6*4] = (int) (tmp11 - tmp1);
   624     wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
   625     wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
   626   }
   628   /* Pass 2: process 6 rows from work array, store into output array. */
   630   wsptr = workspace;
   631   for (ctr = 0; ctr < 6; ctr++) {
   632     outptr = output_buf[ctr] + output_col;
   634     /* Even part */
   636     /* Add fudge factor here for final descale. */
   637     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   638     tmp0 <<= CONST_BITS;
   639     tmp2 = (INT32) wsptr[4];
   640     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
   641     tmp1 = tmp0 + tmp10;
   642     tmp11 = tmp0 - tmp10 - tmp10;
   643     tmp10 = (INT32) wsptr[2];
   644     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
   645     tmp10 = tmp1 + tmp0;
   646     tmp12 = tmp1 - tmp0;
   648     /* Odd part */
   650     z1 = (INT32) wsptr[1];
   651     z2 = (INT32) wsptr[3];
   652     z3 = (INT32) wsptr[5];
   653     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
   654     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
   655     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
   656     tmp1 = (z1 - z2 - z3) << CONST_BITS;
   658     /* Final output stage */
   660     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   661 					      CONST_BITS+PASS1_BITS+3)
   662 			    & RANGE_MASK];
   663     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   664 					      CONST_BITS+PASS1_BITS+3)
   665 			    & RANGE_MASK];
   666     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
   667 					      CONST_BITS+PASS1_BITS+3)
   668 			    & RANGE_MASK];
   669     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
   670 					      CONST_BITS+PASS1_BITS+3)
   671 			    & RANGE_MASK];
   672     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
   673 					      CONST_BITS+PASS1_BITS+3)
   674 			    & RANGE_MASK];
   675     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
   676 					      CONST_BITS+PASS1_BITS+3)
   677 			    & RANGE_MASK];
   679     wsptr += 6;		/* advance pointer to next row */
   680   }
   681 }
   684 /*
   685  * Perform dequantization and inverse DCT on one block of coefficients,
   686  * producing a reduced-size 5x5 output block.
   687  *
   688  * Optimized algorithm with 5 multiplications in the 1-D kernel.
   689  * cK represents sqrt(2) * cos(K*pi/10).
   690  */
   692 GLOBAL(void)
   693 jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   694 	       JCOEFPTR coef_block,
   695 	       JSAMPARRAY output_buf, JDIMENSION output_col)
   696 {
   697   INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
   698   INT32 z1, z2, z3;
   699   JCOEFPTR inptr;
   700   ISLOW_MULT_TYPE * quantptr;
   701   int * wsptr;
   702   JSAMPROW outptr;
   703   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   704   int ctr;
   705   int workspace[5*5];	/* buffers data between passes */
   706   SHIFT_TEMPS
   708   /* Pass 1: process columns from input, store into work array. */
   710   inptr = coef_block;
   711   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   712   wsptr = workspace;
   713   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
   714     /* Even part */
   716     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   717     tmp12 <<= CONST_BITS;
   718     /* Add fudge factor here for final descale. */
   719     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
   720     tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   721     tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   722     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
   723     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
   724     z3 = tmp12 + z2;
   725     tmp10 = z3 + z1;
   726     tmp11 = z3 - z1;
   727     tmp12 -= z2 << 2;
   729     /* Odd part */
   731     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   732     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   734     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
   735     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
   736     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
   738     /* Final output stage */
   740     wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
   741     wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
   742     wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
   743     wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
   744     wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
   745   }
   747   /* Pass 2: process 5 rows from work array, store into output array. */
   749   wsptr = workspace;
   750   for (ctr = 0; ctr < 5; ctr++) {
   751     outptr = output_buf[ctr] + output_col;
   753     /* Even part */
   755     /* Add fudge factor here for final descale. */
   756     tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   757     tmp12 <<= CONST_BITS;
   758     tmp0 = (INT32) wsptr[2];
   759     tmp1 = (INT32) wsptr[4];
   760     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
   761     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
   762     z3 = tmp12 + z2;
   763     tmp10 = z3 + z1;
   764     tmp11 = z3 - z1;
   765     tmp12 -= z2 << 2;
   767     /* Odd part */
   769     z2 = (INT32) wsptr[1];
   770     z3 = (INT32) wsptr[3];
   772     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
   773     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
   774     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
   776     /* Final output stage */
   778     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   779 					      CONST_BITS+PASS1_BITS+3)
   780 			    & RANGE_MASK];
   781     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   782 					      CONST_BITS+PASS1_BITS+3)
   783 			    & RANGE_MASK];
   784     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
   785 					      CONST_BITS+PASS1_BITS+3)
   786 			    & RANGE_MASK];
   787     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
   788 					      CONST_BITS+PASS1_BITS+3)
   789 			    & RANGE_MASK];
   790     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
   791 					      CONST_BITS+PASS1_BITS+3)
   792 			    & RANGE_MASK];
   794     wsptr += 5;		/* advance pointer to next row */
   795   }
   796 }
   799 /*
   800  * Perform dequantization and inverse DCT on one block of coefficients,
   801  * producing a reduced-size 3x3 output block.
   802  *
   803  * Optimized algorithm with 2 multiplications in the 1-D kernel.
   804  * cK represents sqrt(2) * cos(K*pi/6).
        *
        * cinfo      - passed to IDCT_range_limit() to obtain the range-limit table.
        * compptr    - its dct_table is read as the ISLOW_MULT_TYPE multiplier table.
        * coef_block - input coefficients, addressed with a row stride of DCTSIZE;
        *              only rows 0..2 of the first 3 columns are read.
        * output_buf - output rows; rows 0..2 are written.
        * output_col - offset added to each row pointer; 3 samples are stored there.
        *
        * Pass 1 applies the 1-D kernel to 3 input columns and stores a 3x3 array
        * of intermediates in workspace[]; pass 2 applies the same kernel to each
        * workspace row and stores the results through the range_limit[] table.
        *
        * Scaling by 2^CONST_BITS is written as a multiplication by
        * (ONE << CONST_BITS) rather than a left shift: the scaled values may be
        * negative, and left-shifting a negative signed value is undefined
        * behavior in C.  (Assumes ONE is the INT32 constant 1 defined earlier in
        * this file -- not visible from this function.)
   805  */
   807 GLOBAL(void)
   808 jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   809 	       JCOEFPTR coef_block,
   810 	       JSAMPARRAY output_buf, JDIMENSION output_col)
   811 {
   812   INT32 tmp0, tmp2, tmp10, tmp12;
   813   JCOEFPTR inptr;
   814   ISLOW_MULT_TYPE * quantptr;
   815   int * wsptr;
   816   JSAMPROW outptr;
   817   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   818   int ctr;
   819   int workspace[3*3];	/* buffers data between passes */
   820   SHIFT_TEMPS
   822   /* Pass 1: process columns from input, store into work array. */
         /* One column per iteration; wsptr[3*k] receives output row k. */
   824   inptr = coef_block;
   825   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   826   wsptr = workspace;
   827   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
   828     /* Even part */
   830     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
           /* Multiply, not "<<=": tmp0 may be negative here. */
   831     tmp0 *= (ONE << CONST_BITS);
   832     /* Add fudge factor here for final descale. */
           /* The fudge is half of the 2^(CONST_BITS-PASS1_BITS) divisor used below. */
   833     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
   834     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   835     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
   836     tmp10 = tmp0 + tmp12;
   837     tmp2 = tmp0 - tmp12 - tmp12;
   839     /* Odd part */
   841     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   842     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
   844     /* Final output stage */
   846     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
   847     wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
   848     wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
   849   }
   851   /* Pass 2: process 3 rows from work array, store into output array. */
   853   wsptr = workspace;
   854   for (ctr = 0; ctr < 3; ctr++) {
   855     outptr = output_buf[ctr] + output_col;
   857     /* Even part */
   859     /* Add fudge factor here for final descale. */
           /* After scaling by 2^CONST_BITS the added term equals half of the
            * 2^(CONST_BITS+PASS1_BITS+3) divisor used in the output stage. */
   860     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
           /* Multiply, not "<<=": the workspace value may be negative. */
   861     tmp0 *= (ONE << CONST_BITS);
   862     tmp2 = (INT32) wsptr[2];
   863     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
   864     tmp10 = tmp0 + tmp12;
   865     tmp2 = tmp0 - tmp12 - tmp12;
   867     /* Odd part */
   869     tmp12 = (INT32) wsptr[1];
   870     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
   872     /* Final output stage */
           /* Each descaled value is masked with RANGE_MASK and used as an index
            * into range_limit[] to produce the stored sample. */
   874     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   875 					      CONST_BITS+PASS1_BITS+3)
   876 			    & RANGE_MASK];
   877     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   878 					      CONST_BITS+PASS1_BITS+3)
   879 			    & RANGE_MASK];
   880     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
   881 					      CONST_BITS+PASS1_BITS+3)
   882 			    & RANGE_MASK];
   884     wsptr += 3;		/* advance pointer to next row */
   885   }
   886 }
   889 /*
   890  * Perform dequantization and inverse DCT on one block of coefficients,
   891  * producing a 9x9 output block.
   892  *
   893  * Optimized algorithm with 10 multiplications in the 1-D kernel.
   894  * cK represents sqrt(2) * cos(K*pi/18).
        *
        * cinfo      - passed to IDCT_range_limit() to obtain the range-limit table.
        * compptr    - its dct_table is read as the ISLOW_MULT_TYPE multiplier table.
        * coef_block - input coefficients, addressed with a row stride of DCTSIZE;
        *              rows 0..7 of 8 columns are read.
        * output_buf - output rows; rows 0..8 are written.
        * output_col - offset added to each row pointer; 9 samples are stored there.
        *
        * Pass 1 turns each of the 8 input columns into 9 intermediates, giving a
        * workspace of 9 rows by 8 columns; pass 2 turns each workspace row into
        * 9 output samples stored through the range_limit[] table.
        *
        * Scaling by 2^CONST_BITS is written as a multiplication by
        * (ONE << CONST_BITS) rather than a left shift: the scaled values may be
        * negative, and left-shifting a negative signed value is undefined
        * behavior in C.  (Assumes ONE is the INT32 constant 1 defined earlier in
        * this file -- not visible from this function.)
   895  */
   897 GLOBAL(void)
   898 jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   899 	       JCOEFPTR coef_block,
   900 	       JSAMPARRAY output_buf, JDIMENSION output_col)
   901 {
   902   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
   903   INT32 z1, z2, z3, z4;
   904   JCOEFPTR inptr;
   905   ISLOW_MULT_TYPE * quantptr;
   906   int * wsptr;
   907   JSAMPROW outptr;
   908   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   909   int ctr;
   910   int workspace[8*9];	/* buffers data between passes */
   911   SHIFT_TEMPS
   913   /* Pass 1: process columns from input, store into work array. */
         /* One column per iteration; wsptr[8*k] receives output row k. */
   915   inptr = coef_block;
   916   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   917   wsptr = workspace;
   918   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   919     /* Even part */
   921     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
           /* Multiply, not "<<=": tmp0 may be negative here. */
   922     tmp0 *= (ONE << CONST_BITS);
   923     /* Add fudge factor here for final descale. */
           /* The fudge is half of the 2^(CONST_BITS-PASS1_BITS) divisor used below. */
   924     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
   926     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   927     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   928     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   930     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
   931     tmp1 = tmp0 + tmp3;
   932     tmp2 = tmp0 - tmp3 - tmp3;
   934     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
   935     tmp11 = tmp2 + tmp0;
   936     tmp14 = tmp2 - tmp0 - tmp0;
   938     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
   939     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
   940     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
   942     tmp10 = tmp1 + tmp0 - tmp3;
   943     tmp12 = tmp1 - tmp0 + tmp2;
   944     tmp13 = tmp1 - tmp2 + tmp3;
   946     /* Odd part */
           /* tmp0..tmp3 are reused from here on as the odd-part terms. */
   948     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   949     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   950     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   951     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   953     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
   955     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
   956     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
   957     tmp0 = tmp2 + tmp3 - z2;
   958     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
   959     tmp2 += z2 - tmp1;
   960     tmp3 += z2 + tmp1;
   961     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
   963     /* Final output stage */
   965     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
   966     wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
   967     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
   968     wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
   969     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
   970     wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
   971     wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
   972     wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
   973     wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
   974   }
   976   /* Pass 2: process 9 rows from work array, store into output array. */
   978   wsptr = workspace;
   979   for (ctr = 0; ctr < 9; ctr++) {
   980     outptr = output_buf[ctr] + output_col;
   982     /* Even part */
   984     /* Add fudge factor here for final descale. */
           /* After scaling by 2^CONST_BITS the added term equals half of the
            * 2^(CONST_BITS+PASS1_BITS+3) divisor used in the output stage. */
   985     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
           /* Multiply, not "<<=": the workspace value may be negative. */
   986     tmp0 *= (ONE << CONST_BITS);
   988     z1 = (INT32) wsptr[2];
   989     z2 = (INT32) wsptr[4];
   990     z3 = (INT32) wsptr[6];
   992     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
   993     tmp1 = tmp0 + tmp3;
   994     tmp2 = tmp0 - tmp3 - tmp3;
   996     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
   997     tmp11 = tmp2 + tmp0;
   998     tmp14 = tmp2 - tmp0 - tmp0;
  1000     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
  1001     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
  1002     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
  1004     tmp10 = tmp1 + tmp0 - tmp3;
  1005     tmp12 = tmp1 - tmp0 + tmp2;
  1006     tmp13 = tmp1 - tmp2 + tmp3;
  1008     /* Odd part */
  1010     z1 = (INT32) wsptr[1];
  1011     z2 = (INT32) wsptr[3];
  1012     z3 = (INT32) wsptr[5];
  1013     z4 = (INT32) wsptr[7];
  1015     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
  1017     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
  1018     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
  1019     tmp0 = tmp2 + tmp3 - z2;
  1020     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
  1021     tmp2 += z2 - tmp1;
  1022     tmp3 += z2 + tmp1;
  1023     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
  1025     /* Final output stage */
           /* Each descaled value is masked with RANGE_MASK and used as an index
            * into range_limit[] to produce the stored sample. */
  1027     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
  1028 					      CONST_BITS+PASS1_BITS+3)
  1029 			    & RANGE_MASK];
  1030     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
  1031 					      CONST_BITS+PASS1_BITS+3)
  1032 			    & RANGE_MASK];
  1033     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
  1034 					      CONST_BITS+PASS1_BITS+3)
  1035 			    & RANGE_MASK];
  1036     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
  1037 					      CONST_BITS+PASS1_BITS+3)
  1038 			    & RANGE_MASK];
  1039     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
  1040 					      CONST_BITS+PASS1_BITS+3)
  1041 			    & RANGE_MASK];
  1042     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
  1043 					      CONST_BITS+PASS1_BITS+3)
  1044 			    & RANGE_MASK];
  1045     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
  1046 					      CONST_BITS+PASS1_BITS+3)
  1047 			    & RANGE_MASK];
  1048     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
  1049 					      CONST_BITS+PASS1_BITS+3)
  1050 			    & RANGE_MASK];
  1051     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
  1052 					      CONST_BITS+PASS1_BITS+3)
  1053 			    & RANGE_MASK];
  1055     wsptr += 8;		/* advance pointer to next row */
  1056   }
  1057 }
  1060 /*
  1061  * Perform dequantization and inverse DCT on one block of coefficients,
  1062  * producing a 10x10 output block.
  1063  *
  1064  * Optimized algorithm with 12 multiplications in the 1-D kernel.
  1065  * cK represents sqrt(2) * cos(K*pi/20).
        *
        * cinfo      - passed to IDCT_range_limit() to obtain the range-limit table.
        * compptr    - its dct_table is read as the ISLOW_MULT_TYPE multiplier table.
        * coef_block - input coefficients, addressed with a row stride of DCTSIZE;
        *              rows 0..7 of 8 columns are read.
        * output_buf - output rows; rows 0..9 are written.
        * output_col - offset added to each row pointer; 10 samples are stored there.
        *
        * Pass 1 turns each of the 8 input columns into 10 intermediates, giving a
        * workspace of 10 rows by 8 columns; pass 2 turns each workspace row into
        * 10 output samples stored through the range_limit[] table.
        *
        * Scaling by a power of two is written as a multiplication (by
        * (ONE << n), or by 2) rather than a left shift: the scaled values may be
        * negative, and left-shifting a negative signed value is undefined
        * behavior in C.  (Assumes ONE is the INT32 constant 1 defined earlier in
        * this file -- not visible from this function.)
  1066  */
  1068 GLOBAL(void)
  1069 jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  1070 		 JCOEFPTR coef_block,
  1071 		 JSAMPARRAY output_buf, JDIMENSION output_col)
  1072 {
  1073   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  1074   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
  1075   INT32 z1, z2, z3, z4, z5;
  1076   JCOEFPTR inptr;
  1077   ISLOW_MULT_TYPE * quantptr;
  1078   int * wsptr;
  1079   JSAMPROW outptr;
  1080   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  1081   int ctr;
  1082   int workspace[8*10];	/* buffers data between passes */
  1083   SHIFT_TEMPS
  1085   /* Pass 1: process columns from input, store into work array. */
         /* One column per iteration; wsptr[8*k] receives output row k. */
  1087   inptr = coef_block;
  1088   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
  1089   wsptr = workspace;
  1090   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
  1091     /* Even part */
  1093     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
           /* Multiply, not "<<=": z3 may be negative here. */
  1094     z3 *= (ONE << CONST_BITS);
  1095     /* Add fudge factor here for final descale. */
           /* The fudge is half of the 2^(CONST_BITS-PASS1_BITS) divisor used below. */
  1096     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
  1097     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
  1098     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
  1099     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
  1100     tmp10 = z3 + z1;
  1101     tmp11 = z3 - z2;
           /* tmp22 is descaled right away; it is stored below without a shift. */
  1103     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) * 2),    /* c0 = (c4-c8)*2 */
  1104 			CONST_BITS-PASS1_BITS);
  1106     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
  1107     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
  1109     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
  1110     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
  1111     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
  1113     tmp20 = tmp10 + tmp12;
  1114     tmp24 = tmp10 - tmp12;
  1115     tmp21 = tmp11 + tmp13;
  1116     tmp23 = tmp11 - tmp13;
  1118     /* Odd part */
           /* tmp10..tmp14 are reused from here on as the odd-part terms. */
  1120     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
  1121     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
  1122     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
  1123     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
  1125     tmp11 = z2 + z4;
  1126     tmp13 = z2 - z4;
  1128     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
  1129     z5 = z3 * (ONE << CONST_BITS);
  1131     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
  1132     z4 = z5 + tmp12;
  1134     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
  1135     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
  1137     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
  1138     z4 = z5 - tmp12 - (tmp13 * (ONE << (CONST_BITS - 1)));
           /* tmp12 carries only the PASS1_BITS scale, matching the already
            * descaled tmp22 it is combined with below. */
  1140     tmp12 = (z1 - tmp13 - z3) * (ONE << PASS1_BITS);
  1142     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
  1143     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
  1145     /* Final output stage */
  1147     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
  1148     wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
  1149     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
  1150     wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
  1151     wsptr[8*2] = (int) (tmp22 + tmp12);
  1152     wsptr[8*7] = (int) (tmp22 - tmp12);
  1153     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
  1154     wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
  1155     wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
  1156     wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
  1157   }
  1159   /* Pass 2: process 10 rows from work array, store into output array. */
  1161   wsptr = workspace;
  1162   for (ctr = 0; ctr < 10; ctr++) {
  1163     outptr = output_buf[ctr] + output_col;
  1165     /* Even part */
  1167     /* Add fudge factor here for final descale. */
           /* After scaling by 2^CONST_BITS the added term equals half of the
            * 2^(CONST_BITS+PASS1_BITS+3) divisor used in the output stage. */
  1168     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
           /* Multiply, not "<<=": the workspace value may be negative. */
  1169     z3 *= (ONE << CONST_BITS);
  1170     z4 = (INT32) wsptr[4];
  1171     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
  1172     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
  1173     tmp10 = z3 + z1;
  1174     tmp11 = z3 - z2;
  1176     tmp22 = z3 - ((z1 - z2) * 2);                /* c0 = (c4-c8)*2 */
  1178     z2 = (INT32) wsptr[2];
  1179     z3 = (INT32) wsptr[6];
  1181     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
  1182     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
  1183     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
  1185     tmp20 = tmp10 + tmp12;
  1186     tmp24 = tmp10 - tmp12;
  1187     tmp21 = tmp11 + tmp13;
  1188     tmp23 = tmp11 - tmp13;
  1190     /* Odd part */
  1192     z1 = (INT32) wsptr[1];
  1193     z2 = (INT32) wsptr[3];
  1194     z3 = (INT32) wsptr[5];
           /* z3 is pre-scaled by 2^CONST_BITS and used in that form below. */
  1195     z3 *= (ONE << CONST_BITS);
  1196     z4 = (INT32) wsptr[7];
  1198     tmp11 = z2 + z4;
  1199     tmp13 = z2 - z4;
  1201     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
  1203     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
  1204     z4 = z3 + tmp12;
  1206     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
  1207     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
  1209     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
  1210     z4 = z3 - tmp12 - (tmp13 * (ONE << (CONST_BITS - 1)));
  1212     tmp12 = ((z1 - tmp13) * (ONE << CONST_BITS)) - z3;
  1214     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
  1215     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
  1217     /* Final output stage */
           /* Each descaled value is masked with RANGE_MASK and used as an index
            * into range_limit[] to produce the stored sample. */
  1219     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
  1220 					      CONST_BITS+PASS1_BITS+3)
  1221 			    & RANGE_MASK];
  1222     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
  1223 					      CONST_BITS+PASS1_BITS+3)
  1224 			    & RANGE_MASK];
  1225     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
  1226 					      CONST_BITS+PASS1_BITS+3)
  1227 			    & RANGE_MASK];
  1228     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
  1229 					      CONST_BITS+PASS1_BITS+3)
  1230 			    & RANGE_MASK];
  1231     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
  1232 					      CONST_BITS+PASS1_BITS+3)
  1233 			    & RANGE_MASK];
  1234     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
  1235 					      CONST_BITS+PASS1_BITS+3)
  1236 			    & RANGE_MASK];
  1237     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
  1238 					      CONST_BITS+PASS1_BITS+3)
  1239 			    & RANGE_MASK];
  1240     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
  1241 					      CONST_BITS+PASS1_BITS+3)
  1242 			    & RANGE_MASK];
  1243     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
  1244 					      CONST_BITS+PASS1_BITS+3)
  1245 			    & RANGE_MASK];
  1246     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
  1247 					      CONST_BITS+PASS1_BITS+3)
  1248 			    & RANGE_MASK];
  1250     wsptr += 8;		/* advance pointer to next row */
  1251   }
  1252 }
  1255 /*
  1256  * Perform dequantization and inverse DCT on one block of coefficients,
  1257  * producing a 11x11 output block.
  1258  *
  1259  * Optimized algorithm with 24 multiplications in the 1-D kernel.
  1260  * cK represents sqrt(2) * cos(K*pi/22).
        *
        * cinfo      - passed to IDCT_range_limit() to obtain the range-limit table.
        * compptr    - its dct_table is read as the ISLOW_MULT_TYPE multiplier table.
        * coef_block - input coefficients, addressed with a row stride of DCTSIZE;
        *              rows 0..7 of 8 columns are read.
        * output_buf - output rows; rows 0..10 are written.
        * output_col - offset added to each row pointer; 11 samples are stored there.
        *
        * Pass 1 turns each of the 8 input columns into 11 intermediates, giving a
        * workspace of 11 rows by 8 columns; pass 2 turns each workspace row into
        * 11 output samples stored through the range_limit[] table.
        *
        * Scaling by 2^CONST_BITS is written as a multiplication by
        * (ONE << CONST_BITS) rather than a left shift: the scaled values may be
        * negative, and left-shifting a negative signed value is undefined
        * behavior in C.  (Assumes ONE is the INT32 constant 1 defined earlier in
        * this file -- not visible from this function.)
  1261  */
  1263 GLOBAL(void)
  1264 jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  1265 		 JCOEFPTR coef_block,
  1266 		 JSAMPARRAY output_buf, JDIMENSION output_col)
  1267 {
  1268   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  1269   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
  1270   INT32 z1, z2, z3, z4;
  1271   JCOEFPTR inptr;
  1272   ISLOW_MULT_TYPE * quantptr;
  1273   int * wsptr;
  1274   JSAMPROW outptr;
  1275   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  1276   int ctr;
  1277   int workspace[8*11];	/* buffers data between passes */
  1278   SHIFT_TEMPS
  1280   /* Pass 1: process columns from input, store into work array. */
         /* One column per iteration; wsptr[8*k] receives output row k. */
  1282   inptr = coef_block;
  1283   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
  1284   wsptr = workspace;
  1285   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
  1286     /* Even part */
  1288     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
           /* Multiply, not "<<=": tmp10 may be negative here. */
  1289     tmp10 *= (ONE << CONST_BITS);
  1290     /* Add fudge factor here for final descale. */
           /* The fudge is half of the 2^(CONST_BITS-PASS1_BITS) divisor used below. */
  1291     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
  1293     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
  1294     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
  1295     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
  1297     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
  1298     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
  1299     z4 = z1 + z3;
  1300     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
  1301     z4 -= z2;
  1302     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
  1303     tmp21 = tmp20 + tmp23 + tmp25 -
  1304 	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
  1305     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
  1306     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
  1307     tmp24 += tmp25;
  1308     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
  1309     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
  1310 	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
  1311     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
  1313     /* Odd part */
           /* tmp10 is reused from here on as an odd-part term. */
  1315     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
  1316     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
  1317     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
  1318     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
  1320     tmp11 = z1 + z2;
  1321     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
  1322     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
  1323     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
  1324     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
  1325     tmp10 = tmp11 + tmp12 + tmp13 -
  1326 	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
  1327     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
  1328     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
  1329     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
  1330     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
  1331     tmp11 += z1;
  1332     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
  1333     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
  1334 	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
  1335 	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
  1337     /* Final output stage */
  1339     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
  1340     wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
  1341     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
  1342     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
  1343     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
  1344     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
  1345     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
  1346     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
  1347     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
  1348     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
  1349     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
  1350   }
  1352   /* Pass 2: process 11 rows from work array, store into output array. */
  1354   wsptr = workspace;
  1355   for (ctr = 0; ctr < 11; ctr++) {
  1356     outptr = output_buf[ctr] + output_col;
  1358     /* Even part */
  1360     /* Add fudge factor here for final descale. */
           /* After scaling by 2^CONST_BITS the added term equals half of the
            * 2^(CONST_BITS+PASS1_BITS+3) divisor used in the output stage. */
  1361     tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
           /* Multiply, not "<<=": the workspace value may be negative. */
  1362     tmp10 *= (ONE << CONST_BITS);
  1364     z1 = (INT32) wsptr[2];
  1365     z2 = (INT32) wsptr[4];
  1366     z3 = (INT32) wsptr[6];
  1368     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
  1369     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
  1370     z4 = z1 + z3;
  1371     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
  1372     z4 -= z2;
  1373     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
  1374     tmp21 = tmp20 + tmp23 + tmp25 -
  1375 	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
  1376     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
  1377     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
  1378     tmp24 += tmp25;
  1379     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
  1380     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
  1381 	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
  1382     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
  1384     /* Odd part */
  1386     z1 = (INT32) wsptr[1];
  1387     z2 = (INT32) wsptr[3];
  1388     z3 = (INT32) wsptr[5];
  1389     z4 = (INT32) wsptr[7];
  1391     tmp11 = z1 + z2;
  1392     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
  1393     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
  1394     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
  1395     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
  1396     tmp10 = tmp11 + tmp12 + tmp13 -
  1397 	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
  1398     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
  1399     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
  1400     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
  1401     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
  1402     tmp11 += z1;
  1403     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
  1404     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
  1405 	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
  1406 	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
  1408     /* Final output stage */
           /* Each descaled value is masked with RANGE_MASK and used as an index
            * into range_limit[] to produce the stored sample. */
  1410     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
  1411 					       CONST_BITS+PASS1_BITS+3)
  1412 			     & RANGE_MASK];
  1413     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
  1414 					       CONST_BITS+PASS1_BITS+3)
  1415 			     & RANGE_MASK];
  1416     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
  1417 					       CONST_BITS+PASS1_BITS+3)
  1418 			     & RANGE_MASK];
  1419     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
  1420 					       CONST_BITS+PASS1_BITS+3)
  1421 			     & RANGE_MASK];
  1422     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
  1423 					       CONST_BITS+PASS1_BITS+3)
  1424 			     & RANGE_MASK];
  1425     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
  1426 					       CONST_BITS+PASS1_BITS+3)
  1427 			     & RANGE_MASK];
  1428     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
  1429 					       CONST_BITS+PASS1_BITS+3)
  1430 			     & RANGE_MASK];
  1431     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
  1432 					       CONST_BITS+PASS1_BITS+3)
  1433 			     & RANGE_MASK];
  1434     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
  1435 					       CONST_BITS+PASS1_BITS+3)
  1436 			     & RANGE_MASK];
  1437     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
  1438 					       CONST_BITS+PASS1_BITS+3)
  1439 			     & RANGE_MASK];
  1440     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
  1441 					       CONST_BITS+PASS1_BITS+3)
  1442 			     & RANGE_MASK];
  1444     wsptr += 8;		/* advance pointer to next row */
  1445   }
  1446 }
  1449 /*
  1450  * Perform dequantization and inverse DCT on one block of coefficients,
  1451  * producing a 12x12 output block.
  1453  * Optimized algorithm with 15 multiplications in the 1-D kernel.
  1454  * cK represents sqrt(2) * cos(K*pi/24).
  1455  */
  1457 GLOBAL(void)
  1458 jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  1459 		 JCOEFPTR coef_block,
  1460 		 JSAMPARRAY output_buf, JDIMENSION output_col)
  1462   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  1463   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
  1464   INT32 z1, z2, z3, z4;
  1465   JCOEFPTR inptr;
  1466   ISLOW_MULT_TYPE * quantptr;
  1467   int * wsptr;
  1468   JSAMPROW outptr;
  1469   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  1470   int ctr;
  1471   int workspace[8*12];	/* buffers data between passes */
  1472   SHIFT_TEMPS
  1474   /* Pass 1: process columns from input, store into work array. */
  1476   inptr = coef_block;
  1477   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
  1478   wsptr = workspace;
  1479   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
  1480     /* Even part */
  1482     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
  1483     z3 <<= CONST_BITS;
  1484     /* Add fudge factor here for final descale. */
  1485     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
  1487     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
  1488     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
  1490     tmp10 = z3 + z4;
  1491     tmp11 = z3 - z4;
  1493     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
  1494     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
  1495     z1 <<= CONST_BITS;
  1496     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
  1497     z2 <<= CONST_BITS;
  1499     tmp12 = z1 - z2;
  1501     tmp21 = z3 + tmp12;
  1502     tmp24 = z3 - tmp12;
  1504     tmp12 = z4 + z2;
  1506     tmp20 = tmp10 + tmp12;
  1507     tmp25 = tmp10 - tmp12;
  1509     tmp12 = z4 - z1 - z2;
  1511     tmp22 = tmp11 + tmp12;
  1512     tmp23 = tmp11 - tmp12;
  1514     /* Odd part */
  1516     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
  1517     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
  1518     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
  1519     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
  1521     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
  1522     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
  1524     tmp10 = z1 + z3;
  1525     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
  1526     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
  1527     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
  1528     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
  1529     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
  1530     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
  1531     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
  1532 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
  1534     z1 -= z4;
  1535     z2 -= z3;
  1536     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
  1537     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
  1538     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
  1540     /* Final output stage */
  1542     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
  1543     wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
  1544     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
  1545     wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
  1546     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
  1547     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
  1548     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
  1549     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
  1550     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
  1551     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
  1552     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
  1553     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
  1556   /* Pass 2: process 12 rows from work array, store into output array. */
  1558   wsptr = workspace;
  1559   for (ctr = 0; ctr < 12; ctr++) {
  1560     outptr = output_buf[ctr] + output_col;
  1562     /* Even part */
  1564     /* Add fudge factor here for final descale. */
  1565     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
  1566     z3 <<= CONST_BITS;
  1568     z4 = (INT32) wsptr[4];
  1569     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
  1571     tmp10 = z3 + z4;
  1572     tmp11 = z3 - z4;
  1574     z1 = (INT32) wsptr[2];
  1575     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
  1576     z1 <<= CONST_BITS;
  1577     z2 = (INT32) wsptr[6];
  1578     z2 <<= CONST_BITS;
  1580     tmp12 = z1 - z2;
  1582     tmp21 = z3 + tmp12;
  1583     tmp24 = z3 - tmp12;
  1585     tmp12 = z4 + z2;
  1587     tmp20 = tmp10 + tmp12;
  1588     tmp25 = tmp10 - tmp12;
  1590     tmp12 = z4 - z1 - z2;
  1592     tmp22 = tmp11 + tmp12;
  1593     tmp23 = tmp11 - tmp12;
  1595     /* Odd part */
  1597     z1 = (INT32) wsptr[1];
  1598     z2 = (INT32) wsptr[3];
  1599     z3 = (INT32) wsptr[5];
  1600     z4 = (INT32) wsptr[7];
  1602     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
  1603     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
  1605     tmp10 = z1 + z3;
  1606     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
  1607     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
  1608     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
  1609     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
  1610     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
  1611     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
  1612     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
  1613 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
  1615     z1 -= z4;
  1616     z2 -= z3;
  1617     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
  1618     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
  1619     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
  1621     /* Final output stage */
  1623     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
  1624 					       CONST_BITS+PASS1_BITS+3)
  1625 			     & RANGE_MASK];
  1626     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
  1627 					       CONST_BITS+PASS1_BITS+3)
  1628 			     & RANGE_MASK];
  1629     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
  1630 					       CONST_BITS+PASS1_BITS+3)
  1631 			     & RANGE_MASK];
  1632     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
  1633 					       CONST_BITS+PASS1_BITS+3)
  1634 			     & RANGE_MASK];
  1635     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
  1636 					       CONST_BITS+PASS1_BITS+3)
  1637 			     & RANGE_MASK];
  1638     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
  1639 					       CONST_BITS+PASS1_BITS+3)
  1640 			     & RANGE_MASK];
  1641     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
  1642 					       CONST_BITS+PASS1_BITS+3)
  1643 			     & RANGE_MASK];
  1644     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
  1645 					       CONST_BITS+PASS1_BITS+3)
  1646 			     & RANGE_MASK];
  1647     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
  1648 					       CONST_BITS+PASS1_BITS+3)
  1649 			     & RANGE_MASK];
  1650     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
  1651 					       CONST_BITS+PASS1_BITS+3)
  1652 			     & RANGE_MASK];
  1653     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
  1654 					       CONST_BITS+PASS1_BITS+3)
  1655 			     & RANGE_MASK];
  1656     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
  1657 					       CONST_BITS+PASS1_BITS+3)
  1658 			     & RANGE_MASK];
  1660     wsptr += 8;		/* advance pointer to next row */
  1665 /*
  1666  * Perform dequantization and inverse DCT on one block of coefficients,
  1667  * producing a 13x13 output block.
  1669  * Optimized algorithm with 29 multiplications in the 1-D kernel.
  1670  * cK represents sqrt(2) * cos(K*pi/26).
  1671  */
  1673 GLOBAL(void)
  1674 jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  1675 		 JCOEFPTR coef_block,
  1676 		 JSAMPARRAY output_buf, JDIMENSION output_col)
  1678   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  1679   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
  1680   INT32 z1, z2, z3, z4;
  1681   JCOEFPTR inptr;
  1682   ISLOW_MULT_TYPE * quantptr;
  1683   int * wsptr;
  1684   JSAMPROW outptr;
  1685   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  1686   int ctr;
  1687   int workspace[8*13];	/* buffers data between passes */
  1688   SHIFT_TEMPS
  1690   /* Pass 1: process columns from input, store into work array. */
  1692   inptr = coef_block;
  1693   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
  1694   wsptr = workspace;
  1695   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
  1696     /* Even part */
  1698     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
  1699     z1 <<= CONST_BITS;
  1700     /* Add fudge factor here for final descale. */
  1701     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
  1703     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
  1704     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
  1705     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
  1707     tmp10 = z3 + z4;
  1708     tmp11 = z3 - z4;
  1710     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
  1711     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
  1713     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
  1714     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
  1716     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
  1717     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
  1719     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
  1720     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
  1722     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
  1723     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
  1725     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
  1726     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
  1728     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
  1730     /* Odd part */
  1732     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
  1733     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
  1734     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
  1735     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
  1737     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
  1738     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
  1739     tmp15 = z1 + z4;
  1740     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
  1741     tmp10 = tmp11 + tmp12 + tmp13 -
  1742 	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
  1743     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
  1744     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
  1745     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
  1746     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
  1747     tmp11 += tmp14;
  1748     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
  1749     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
  1750     tmp12 += tmp14;
  1751     tmp13 += tmp14;
  1752     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
  1753     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
  1754 	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
  1755     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
  1756     tmp14 += z1;
  1757     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
  1758 	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
  1760     /* Final output stage */
  1762     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
  1763     wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
  1764     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
  1765     wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
  1766     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
  1767     wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
  1768     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
  1769     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
  1770     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
  1771     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
  1772     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
  1773     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
  1774     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
  1777   /* Pass 2: process 13 rows from work array, store into output array. */
  1779   wsptr = workspace;
  1780   for (ctr = 0; ctr < 13; ctr++) {
  1781     outptr = output_buf[ctr] + output_col;
  1783     /* Even part */
  1785     /* Add fudge factor here for final descale. */
  1786     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
  1787     z1 <<= CONST_BITS;
  1789     z2 = (INT32) wsptr[2];
  1790     z3 = (INT32) wsptr[4];
  1791     z4 = (INT32) wsptr[6];
  1793     tmp10 = z3 + z4;
  1794     tmp11 = z3 - z4;
  1796     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
  1797     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
  1799     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
  1800     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
  1802     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
  1803     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
  1805     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
  1806     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
  1808     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
  1809     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
  1811     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
  1812     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
  1814     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
  1816     /* Odd part */
  1818     z1 = (INT32) wsptr[1];
  1819     z2 = (INT32) wsptr[3];
  1820     z3 = (INT32) wsptr[5];
  1821     z4 = (INT32) wsptr[7];
  1823     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
  1824     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
  1825     tmp15 = z1 + z4;
  1826     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
  1827     tmp10 = tmp11 + tmp12 + tmp13 -
  1828 	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
  1829     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
  1830     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
  1831     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
  1832     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
  1833     tmp11 += tmp14;
  1834     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
  1835     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
  1836     tmp12 += tmp14;
  1837     tmp13 += tmp14;
  1838     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
  1839     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
  1840 	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
  1841     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
  1842     tmp14 += z1;
  1843     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
  1844 	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
  1846     /* Final output stage */
  1848     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
  1849 					       CONST_BITS+PASS1_BITS+3)
  1850 			     & RANGE_MASK];
  1851     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
  1852 					       CONST_BITS+PASS1_BITS+3)
  1853 			     & RANGE_MASK];
  1854     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
  1855 					       CONST_BITS+PASS1_BITS+3)
  1856 			     & RANGE_MASK];
  1857     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
  1858 					       CONST_BITS+PASS1_BITS+3)
  1859 			     & RANGE_MASK];
  1860     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
  1861 					       CONST_BITS+PASS1_BITS+3)
  1862 			     & RANGE_MASK];
  1863     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
  1864 					       CONST_BITS+PASS1_BITS+3)
  1865 			     & RANGE_MASK];
  1866     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
  1867 					       CONST_BITS+PASS1_BITS+3)
  1868 			     & RANGE_MASK];
  1869     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
  1870 					       CONST_BITS+PASS1_BITS+3)
  1871 			     & RANGE_MASK];
  1872     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
  1873 					       CONST_BITS+PASS1_BITS+3)
  1874 			     & RANGE_MASK];
  1875     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
  1876 					       CONST_BITS+PASS1_BITS+3)
  1877 			     & RANGE_MASK];
  1878     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
  1879 					       CONST_BITS+PASS1_BITS+3)
  1880 			     & RANGE_MASK];
  1881     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
  1882 					       CONST_BITS+PASS1_BITS+3)
  1883 			     & RANGE_MASK];
  1884     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
  1885 					       CONST_BITS+PASS1_BITS+3)
  1886 			     & RANGE_MASK];
  1888     wsptr += 8;		/* advance pointer to next row */
  1893 /*
  1894  * Perform dequantization and inverse DCT on one block of coefficients,
  1895  * producing a 14x14 output block.
  1897  * Optimized algorithm with 20 multiplications in the 1-D kernel.
  1898  * cK represents sqrt(2) * cos(K*pi/28).
  1899  */
  1901 GLOBAL(void)
  1902 jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  1903 		 JCOEFPTR coef_block,
  1904 		 JSAMPARRAY output_buf, JDIMENSION output_col)
  1906   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
  1907   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
  1908   INT32 z1, z2, z3, z4;
  1909   JCOEFPTR inptr;
  1910   ISLOW_MULT_TYPE * quantptr;
  1911   int * wsptr;
  1912   JSAMPROW outptr;
  1913   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  1914   int ctr;
  1915   int workspace[8*14];	/* buffers data between passes */
  1916   SHIFT_TEMPS
  1918   /* Pass 1: process columns from input, store into work array. */
  1920   inptr = coef_block;
  1921   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
  1922   wsptr = workspace;
  1923   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
  1924     /* Even part */
  1926     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
  1927     z1 <<= CONST_BITS;
  1928     /* Add fudge factor here for final descale. */
  1929     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
  1930     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
  1931     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
  1932     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
  1933     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
  1935     tmp10 = z1 + z2;
  1936     tmp11 = z1 + z3;
  1937     tmp12 = z1 - z4;
  1939     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
  1940 			CONST_BITS-PASS1_BITS);
  1942     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
  1943     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
  1945     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
  1947     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
  1948     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
  1949     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
  1950 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
  1952     tmp20 = tmp10 + tmp13;
  1953     tmp26 = tmp10 - tmp13;
  1954     tmp21 = tmp11 + tmp14;
  1955     tmp25 = tmp11 - tmp14;
  1956     tmp22 = tmp12 + tmp15;
  1957     tmp24 = tmp12 - tmp15;
  1959     /* Odd part */
  1961     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
  1962     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
  1963     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
  1964     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
  1965     tmp13 = z4 << CONST_BITS;
  1967     tmp14 = z1 + z3;
  1968     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
  1969     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
  1970     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
  1971     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
  1972     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
  1973     z1    -= z2;
  1974     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
  1975     tmp16 += tmp15;
  1976     z1    += z4;
  1977     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
  1978     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
  1979     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
  1980     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
  1981     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
  1982     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
  1984     tmp13 = (z1 - z3) << PASS1_BITS;
  1986     /* Final output stage */
  1988     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
  1989     wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
  1990     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
  1991     wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
  1992     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
  1993     wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
  1994     wsptr[8*3]  = (int) (tmp23 + tmp13);
  1995     wsptr[8*10] = (int) (tmp23 - tmp13);
  1996     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
  1997     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
  1998     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
  1999     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
  2000     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
  2001     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
  2004   /* Pass 2: process 14 rows from work array, store into output array. */
  2006   wsptr = workspace;
  2007   for (ctr = 0; ctr < 14; ctr++) {
  2008     outptr = output_buf[ctr] + output_col;
  2010     /* Even part */
  2012     /* Add fudge factor here for final descale. */
  2013     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
  2014     z1 <<= CONST_BITS;
  2015     z4 = (INT32) wsptr[4];
  2016     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
  2017     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
  2018     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
  2020     tmp10 = z1 + z2;
  2021     tmp11 = z1 + z3;
  2022     tmp12 = z1 - z4;
  2024     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
  2026     z1 = (INT32) wsptr[2];
  2027     z2 = (INT32) wsptr[6];
  2029     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
  2031     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
  2032     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
  2033     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
  2034 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
  2036     tmp20 = tmp10 + tmp13;
  2037     tmp26 = tmp10 - tmp13;
  2038     tmp21 = tmp11 + tmp14;
  2039     tmp25 = tmp11 - tmp14;
  2040     tmp22 = tmp12 + tmp15;
  2041     tmp24 = tmp12 - tmp15;
  2043     /* Odd part */
  2045     z1 = (INT32) wsptr[1];
  2046     z2 = (INT32) wsptr[3];
  2047     z3 = (INT32) wsptr[5];
  2048     z4 = (INT32) wsptr[7];
  2049     z4 <<= CONST_BITS;
  2051     tmp14 = z1 + z3;
  2052     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
  2053     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
  2054     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
  2055     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
  2056     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
  2057     z1    -= z2;
  2058     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
  2059     tmp16 += tmp15;
  2060     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
  2061     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
  2062     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
  2063     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
  2064     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
  2065     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
  2067     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
  2069     /* Final output stage */
  2071     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
  2072 					       CONST_BITS+PASS1_BITS+3)
  2073 			     & RANGE_MASK];
  2074     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
  2075 					       CONST_BITS+PASS1_BITS+3)
  2076 			     & RANGE_MASK];
  2077     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
  2078 					       CONST_BITS+PASS1_BITS+3)
  2079 			     & RANGE_MASK];
  2080     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
  2081 					       CONST_BITS+PASS1_BITS+3)
  2082 			     & RANGE_MASK];
  2083     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
  2084 					       CONST_BITS+PASS1_BITS+3)
  2085 			     & RANGE_MASK];
  2086     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
  2087 					       CONST_BITS+PASS1_BITS+3)
  2088 			     & RANGE_MASK];
  2089     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
  2090 					       CONST_BITS+PASS1_BITS+3)
  2091 			     & RANGE_MASK];
  2092     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
  2093 					       CONST_BITS+PASS1_BITS+3)
  2094 			     & RANGE_MASK];
  2095     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
  2096 					       CONST_BITS+PASS1_BITS+3)
  2097 			     & RANGE_MASK];
  2098     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
  2099 					       CONST_BITS+PASS1_BITS+3)
  2100 			     & RANGE_MASK];
  2101     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
  2102 					       CONST_BITS+PASS1_BITS+3)
  2103 			     & RANGE_MASK];
  2104     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
  2105 					       CONST_BITS+PASS1_BITS+3)
  2106 			     & RANGE_MASK];
  2107     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
  2108 					       CONST_BITS+PASS1_BITS+3)
  2109 			     & RANGE_MASK];
  2110     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
  2111 					       CONST_BITS+PASS1_BITS+3)
  2112 			     & RANGE_MASK];
  2114     wsptr += 8;		/* advance pointer to next row */
  2119 /*
  2120  * Perform dequantization and inverse DCT on one block of coefficients,
  2121  * producing a 15x15 output block.
  2123  * Optimized algorithm with 22 multiplications in the 1-D kernel.
  2124  * cK represents sqrt(2) * cos(K*pi/30).
  2125  */
  2127 GLOBAL(void)
  2128 jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  2129 		 JCOEFPTR coef_block,
  2130 		 JSAMPARRAY output_buf, JDIMENSION output_col)
  2132   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
  2133   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
  2134   INT32 z1, z2, z3, z4;
  2135   JCOEFPTR inptr;
  2136   ISLOW_MULT_TYPE * quantptr;
  2137   int * wsptr;
  2138   JSAMPROW outptr;
  2139   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  2140   int ctr;
  2141   int workspace[8*15];	/* buffers data between passes */
  2142   SHIFT_TEMPS
  2144   /* Pass 1: process columns from input, store into work array. */
  2146   inptr = coef_block;
  2147   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
  2148   wsptr = workspace;
  2149   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
  2150     /* Even part */
  2152     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
  2153     z1 <<= CONST_BITS;
  2154     /* Add fudge factor here for final descale. */
  2155     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
  2157     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
  2158     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
  2159     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
  2161     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
  2162     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
  2164     tmp12 = z1 - tmp10;
  2165     tmp13 = z1 + tmp11;
  2166     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
  2168     z4 = z2 - z3;
  2169     z3 += z2;
  2170     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
  2171     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
  2172     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
  2174     tmp20 = tmp13 + tmp10 + tmp11;
  2175     tmp23 = tmp12 - tmp10 + tmp11 + z2;
  2177     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
  2178     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
  2180     tmp25 = tmp13 - tmp10 - tmp11;
  2181     tmp26 = tmp12 + tmp10 - tmp11 - z2;
  2183     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
  2184     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
  2186     tmp21 = tmp12 + tmp10 + tmp11;
  2187     tmp24 = tmp13 - tmp10 + tmp11;
  2188     tmp11 += tmp11;
  2189     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
  2190     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
  2192     /* Odd part */
  2194     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
  2195     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
  2196     z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
  2197     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
  2198     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
  2200     tmp13 = z2 - z4;
  2201     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
  2202     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
  2203     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
  2205     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
  2206     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
  2207     z2 = z1 - z4;
  2208     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
  2210     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
  2211     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
  2212     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
  2213     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
  2214     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
  2215     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
  2217     /* Final output stage */
  2219     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
  2220     wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
  2221     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
  2222     wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
  2223     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
  2224     wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
  2225     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
  2226     wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
  2227     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
  2228     wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
  2229     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
  2230     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
  2231     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
  2232     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
  2233     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
  2236   /* Pass 2: process 15 rows from work array, store into output array. */
  2238   wsptr = workspace;
  2239   for (ctr = 0; ctr < 15; ctr++) {
  2240     outptr = output_buf[ctr] + output_col;
  2242     /* Even part */
  2244     /* Add fudge factor here for final descale. */
  2245     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
  2246     z1 <<= CONST_BITS;
  2248     z2 = (INT32) wsptr[2];
  2249     z3 = (INT32) wsptr[4];
  2250     z4 = (INT32) wsptr[6];
  2252     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
  2253     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
  2255     tmp12 = z1 - tmp10;
  2256     tmp13 = z1 + tmp11;
  2257     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
  2259     z4 = z2 - z3;
  2260     z3 += z2;
  2261     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
  2262     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
  2263     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
  2265     tmp20 = tmp13 + tmp10 + tmp11;
  2266     tmp23 = tmp12 - tmp10 + tmp11 + z2;
  2268     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
  2269     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
  2271     tmp25 = tmp13 - tmp10 - tmp11;
  2272     tmp26 = tmp12 + tmp10 - tmp11 - z2;
  2274     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
  2275     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
  2277     tmp21 = tmp12 + tmp10 + tmp11;
  2278     tmp24 = tmp13 - tmp10 + tmp11;
  2279     tmp11 += tmp11;
  2280     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
  2281     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
  2283     /* Odd part */
  2285     z1 = (INT32) wsptr[1];
  2286     z2 = (INT32) wsptr[3];
  2287     z4 = (INT32) wsptr[5];
  2288     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
  2289     z4 = (INT32) wsptr[7];
  2291     tmp13 = z2 - z4;
  2292     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
  2293     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
  2294     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
  2296     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
  2297     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
  2298     z2 = z1 - z4;
  2299     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
  2301     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
  2302     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
  2303     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
  2304     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
  2305     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
  2306     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
  2308     /* Final output stage */
  2310     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
  2311 					       CONST_BITS+PASS1_BITS+3)
  2312 			     & RANGE_MASK];
  2313     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
  2314 					       CONST_BITS+PASS1_BITS+3)
  2315 			     & RANGE_MASK];
  2316     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
  2317 					       CONST_BITS+PASS1_BITS+3)
  2318 			     & RANGE_MASK];
  2319     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
  2320 					       CONST_BITS+PASS1_BITS+3)
  2321 			     & RANGE_MASK];
  2322     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
  2323 					       CONST_BITS+PASS1_BITS+3)
  2324 			     & RANGE_MASK];
  2325     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
  2326 					       CONST_BITS+PASS1_BITS+3)
  2327 			     & RANGE_MASK];
  2328     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
  2329 					       CONST_BITS+PASS1_BITS+3)
  2330 			     & RANGE_MASK];
  2331     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
  2332 					       CONST_BITS+PASS1_BITS+3)
  2333 			     & RANGE_MASK];
  2334     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
  2335 					       CONST_BITS+PASS1_BITS+3)
  2336 			     & RANGE_MASK];
  2337     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
  2338 					       CONST_BITS+PASS1_BITS+3)
  2339 			     & RANGE_MASK];
  2340     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
  2341 					       CONST_BITS+PASS1_BITS+3)
  2342 			     & RANGE_MASK];
  2343     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
  2344 					       CONST_BITS+PASS1_BITS+3)
  2345 			     & RANGE_MASK];
  2346     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
  2347 					       CONST_BITS+PASS1_BITS+3)
  2348 			     & RANGE_MASK];
  2349     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
  2350 					       CONST_BITS+PASS1_BITS+3)
  2351 			     & RANGE_MASK];
  2352     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
  2353 					       CONST_BITS+PASS1_BITS+3)
  2354 			     & RANGE_MASK];
  2356     wsptr += 8;		/* advance pointer to next row */
  2361 /*
  2362  * Perform dequantization and inverse DCT on one block of coefficients,
  2363  * producing a 16x16 output block.
  2365  * Optimized algorithm with 28 multiplications in the 1-D kernel.
  2366  * cK represents sqrt(2) * cos(K*pi/32).
  2367  */
  2369 GLOBAL(void)
  2370 jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  2371 		 JCOEFPTR coef_block,
  2372 		 JSAMPARRAY output_buf, JDIMENSION output_col)
  2374   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
  2375   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
  2376   INT32 z1, z2, z3, z4;
  2377   JCOEFPTR inptr;
  2378   ISLOW_MULT_TYPE * quantptr;
  2379   int * wsptr;
  2380   JSAMPROW outptr;
  2381   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  2382   int ctr;
  2383   int workspace[8*16];	/* buffers data between passes */
  2384   SHIFT_TEMPS
  2386   /* Pass 1: process columns from input, store into work array. */
  2388   inptr = coef_block;
  2389   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
  2390   wsptr = workspace;
  2391   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
  2392     /* Even part */
  2394     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
  2395     tmp0 <<= CONST_BITS;
  2396     /* Add fudge factor here for final descale. */
  2397     tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
  2399     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
  2400     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
  2401     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
  2403     tmp10 = tmp0 + tmp1;
  2404     tmp11 = tmp0 - tmp1;
  2405     tmp12 = tmp0 + tmp2;
  2406     tmp13 = tmp0 - tmp2;
  2408     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
  2409     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
  2410     z3 = z1 - z2;
  2411     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
  2412     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
  2414     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
  2415     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
  2416     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
  2417     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
  2419     tmp20 = tmp10 + tmp0;
  2420     tmp27 = tmp10 - tmp0;
  2421     tmp21 = tmp12 + tmp1;
  2422     tmp26 = tmp12 - tmp1;
  2423     tmp22 = tmp13 + tmp2;
  2424     tmp25 = tmp13 - tmp2;
  2425     tmp23 = tmp11 + tmp3;
  2426     tmp24 = tmp11 - tmp3;
  2428     /* Odd part */
  2430     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
  2431     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
  2432     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
  2433     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
  2435     tmp11 = z1 + z3;
  2437     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
  2438     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
  2439     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
  2440     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
  2441     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
  2442     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
  2443     tmp0  = tmp1 + tmp2 + tmp3 -
  2444 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
  2445     tmp13 = tmp10 + tmp11 + tmp12 -
  2446 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
  2447     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
  2448     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
  2449     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
  2450     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
  2451     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
  2452     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
  2453     z2    += z4;
  2454     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
  2455     tmp1  += z1;
  2456     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
  2457     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
  2458     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
  2459     tmp12 += z2;
  2460     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
  2461     tmp2  += z2;
  2462     tmp3  += z2;
  2463     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
  2464     tmp10 += z2;
  2465     tmp11 += z2;
  2467     /* Final output stage */
  2469     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
  2470     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
  2471     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
  2472     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
  2473     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
  2474     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
  2475     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
  2476     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
  2477     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
  2478     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
  2479     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
  2480     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
  2481     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
  2482     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
  2483     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
  2484     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
  2487   /* Pass 2: process 16 rows from work array, store into output array. */
  2489   wsptr = workspace;
  2490   for (ctr = 0; ctr < 16; ctr++) {
  2491     outptr = output_buf[ctr] + output_col;
  2493     /* Even part */
  2495     /* Add fudge factor here for final descale. */
  2496     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
  2497     tmp0 <<= CONST_BITS;
  2499     z1 = (INT32) wsptr[4];
  2500     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
  2501     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
  2503     tmp10 = tmp0 + tmp1;
  2504     tmp11 = tmp0 - tmp1;
  2505     tmp12 = tmp0 + tmp2;
  2506     tmp13 = tmp0 - tmp2;
  2508     z1 = (INT32) wsptr[2];
  2509     z2 = (INT32) wsptr[6];
  2510     z3 = z1 - z2;
  2511     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
  2512     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
  2514     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
  2515     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
  2516     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
  2517     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
  2519     tmp20 = tmp10 + tmp0;
  2520     tmp27 = tmp10 - tmp0;
  2521     tmp21 = tmp12 + tmp1;
  2522     tmp26 = tmp12 - tmp1;
  2523     tmp22 = tmp13 + tmp2;
  2524     tmp25 = tmp13 - tmp2;
  2525     tmp23 = tmp11 + tmp3;
  2526     tmp24 = tmp11 - tmp3;
  2528     /* Odd part */
  2530     z1 = (INT32) wsptr[1];
  2531     z2 = (INT32) wsptr[3];
  2532     z3 = (INT32) wsptr[5];
  2533     z4 = (INT32) wsptr[7];
  2535     tmp11 = z1 + z3;
  2537     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
  2538     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
  2539     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
  2540     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
  2541     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
  2542     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
  2543     tmp0  = tmp1 + tmp2 + tmp3 -
  2544 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
  2545     tmp13 = tmp10 + tmp11 + tmp12 -
  2546 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
  2547     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
  2548     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
  2549     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
  2550     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
  2551     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
  2552     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
  2553     z2    += z4;
  2554     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
  2555     tmp1  += z1;
  2556     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
  2557     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
  2558     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
  2559     tmp12 += z2;
  2560     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
  2561     tmp2  += z2;
  2562     tmp3  += z2;
  2563     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
  2564     tmp10 += z2;
  2565     tmp11 += z2;
  2567     /* Final output stage */
  2569     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
  2570 					       CONST_BITS+PASS1_BITS+3)
  2571 			     & RANGE_MASK];
  2572     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
  2573 					       CONST_BITS+PASS1_BITS+3)
  2574 			     & RANGE_MASK];
  2575     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
  2576 					       CONST_BITS+PASS1_BITS+3)
  2577 			     & RANGE_MASK];
  2578     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
  2579 					       CONST_BITS+PASS1_BITS+3)
  2580 			     & RANGE_MASK];
  2581     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
  2582 					       CONST_BITS+PASS1_BITS+3)
  2583 			     & RANGE_MASK];
  2584     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
  2585 					       CONST_BITS+PASS1_BITS+3)
  2586 			     & RANGE_MASK];
  2587     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
  2588 					       CONST_BITS+PASS1_BITS+3)
  2589 			     & RANGE_MASK];
  2590     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
  2591 					       CONST_BITS+PASS1_BITS+3)
  2592 			     & RANGE_MASK];
  2593     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
  2594 					       CONST_BITS+PASS1_BITS+3)
  2595 			     & RANGE_MASK];
  2596     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
  2597 					       CONST_BITS+PASS1_BITS+3)
  2598 			     & RANGE_MASK];
  2599     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
  2600 					       CONST_BITS+PASS1_BITS+3)
  2601 			     & RANGE_MASK];
  2602     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
  2603 					       CONST_BITS+PASS1_BITS+3)
  2604 			     & RANGE_MASK];
  2605     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
  2606 					       CONST_BITS+PASS1_BITS+3)
  2607 			     & RANGE_MASK];
  2608     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
  2609 					       CONST_BITS+PASS1_BITS+3)
  2610 			     & RANGE_MASK];
  2611     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
  2612 					       CONST_BITS+PASS1_BITS+3)
  2613 			     & RANGE_MASK];
  2614     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
  2615 					       CONST_BITS+PASS1_BITS+3)
  2616 			     & RANGE_MASK];
  2618     wsptr += 8;		/* advance pointer to next row */
  2622 #endif /* IDCT_SCALING_SUPPORTED */
  2623 #endif /* DCT_ISLOW_SUPPORTED */

mercurial