PageRenderTime 124ms CodeModel.GetById 22ms app.highlight 88ms RepoModel.GetById 1ms app.codeStats 1ms

/src/FreeImage/Source/LibJPEG/jidctint.c

https://bitbucket.org/cabalistic/ogredeps/
C | 5137 lines | 3541 code | 880 blank | 716 comment | 106 complexity | 4ea27dba5316cdba15d8294ad0364aa8 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/*
   2 * jidctint.c
   3 *
   4 * Copyright (C) 1991-1998, Thomas G. Lane.
   5 * Modification developed 2002-2009 by Guido Vollbeding.
   6 * This file is part of the Independent JPEG Group's software.
   7 * For conditions of distribution and use, see the accompanying README file.
   8 *
   9 * This file contains a slow-but-accurate integer implementation of the
  10 * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
  11 * must also perform dequantization of the input coefficients.
  12 *
  13 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
  14 * on each row (or vice versa, but it's more convenient to emit a row at
  15 * a time).  Direct algorithms are also available, but they are much more
  16 * complex and seem not to be any faster when reduced to code.
  17 *
  18 * This implementation is based on an algorithm described in
  19 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  20 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  21 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  22 * The primary algorithm described there uses 11 multiplies and 29 adds.
  23 * We use their alternate method with 12 multiplies and 32 adds.
  24 * The advantage of this method is that no data path contains more than one
  25 * multiplication; this allows a very simple and accurate implementation in
  26 * scaled fixed-point arithmetic, with a minimal number of shifts.
  27 *
  28 * We also provide IDCT routines with various output sample block sizes for
  29 * direct resolution reduction or enlargement and for direct resolving the
  30 * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
  31 * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
  32 *
  33 * For N<8 we simply take the corresponding low-frequency coefficients of
  34 * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
  35 * to yield the downscaled outputs.
  36 * This can be seen as direct low-pass downsampling from the DCT domain
  37 * point of view rather than the usual spatial domain point of view,
  38 * yielding significant computational savings and results at least
  39 * as good as common bilinear (averaging) spatial downsampling.
  40 *
  41 * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as
  42 * lower frequencies and higher frequencies assumed to be zero.
  43 * It turns out that the computational effort is similar to the 8x8 IDCT
  44 * regarding the output size.
  45 * Furthermore, the scaling and descaling is the same for all IDCT sizes.
  46 *
  47 * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
  48 * since there would be too many additional constants to pre-calculate.
  49 */
  50
  51#define JPEG_INTERNALS
  52#include "jinclude.h"
  53#include "jpeglib.h"
  54#include "jdct.h"		/* Private declarations for DCT subsystem */
  55
  56#ifdef DCT_ISLOW_SUPPORTED
  57
  58
  59/*
  60 * This module is specialized to the case DCTSIZE = 8.
  61 */
  62
  63#if DCTSIZE != 8
  64  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
  65#endif
  66
  67
  68/*
  69 * The poop on this scaling stuff is as follows:
  70 *
  71 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
  72 * larger than the true IDCT outputs.  The final outputs are therefore
  73 * a factor of N larger than desired; since N=8 this can be cured by
  74 * a simple right shift at the end of the algorithm.  The advantage of
  75 * this arrangement is that we save two multiplications per 1-D IDCT,
  76 * because the y0 and y4 inputs need not be divided by sqrt(N).
  77 *
  78 * We have to do addition and subtraction of the integer inputs, which
  79 * is no problem, and multiplication by fractional constants, which is
  80 * a problem to do in integer arithmetic.  We multiply all the constants
  81 * by CONST_SCALE and convert them to integer constants (thus retaining
  82 * CONST_BITS bits of precision in the constants).  After doing a
  83 * multiplication we have to divide the product by CONST_SCALE, with proper
  84 * rounding, to produce the correct output.  This division can be done
  85 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
  86 * as long as possible so that partial sums can be added together with
  87 * full fractional precision.
  88 *
  89 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
  90 * they are represented to better-than-integral precision.  These outputs
  91 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
  92 * with the recommended scaling.  (To scale up 12-bit sample data further, an
  93 * intermediate INT32 array would be needed.)
  94 *
  95 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  96 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
  97 * shows that the values given below are the most effective.
  98 */
  99
 100#if BITS_IN_JSAMPLE == 8
 101#define CONST_BITS  13
 102#define PASS1_BITS  2
 103#else
 104#define CONST_BITS  13
 105#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
 106#endif
 107
 108/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 109 * causing a lot of useless floating-point operations at run time.
 110 * To get around this we use the following pre-calculated constants.
 111 * If you change CONST_BITS you may want to add appropriate values.
 112 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 113 */
 114
 115#if CONST_BITS == 13
 116#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
 117#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
 118#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
 119#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
 120#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
 121#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
 122#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
 123#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
 124#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
 125#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
 126#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
 127#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
 128#else
 129#define FIX_0_298631336  FIX(0.298631336)
 130#define FIX_0_390180644  FIX(0.390180644)
 131#define FIX_0_541196100  FIX(0.541196100)
 132#define FIX_0_765366865  FIX(0.765366865)
 133#define FIX_0_899976223  FIX(0.899976223)
 134#define FIX_1_175875602  FIX(1.175875602)
 135#define FIX_1_501321110  FIX(1.501321110)
 136#define FIX_1_847759065  FIX(1.847759065)
 137#define FIX_1_961570560  FIX(1.961570560)
 138#define FIX_2_053119869  FIX(2.053119869)
 139#define FIX_2_562915447  FIX(2.562915447)
 140#define FIX_3_072711026  FIX(3.072711026)
 141#endif
 142
 143
 144/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
 145 * For 8-bit samples with the recommended scaling, all the variable
 146 * and constant values involved are no more than 16 bits wide, so a
 147 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 148 * For 12-bit samples, a full 32-bit multiplication will be needed.
 149 */
 150
 151#if BITS_IN_JSAMPLE == 8
 152#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
 153#else
 154#define MULTIPLY(var,const)  ((var) * (const))
 155#endif
 156
 157
 158/* Dequantize a coefficient by multiplying it by the multiplier-table
 159 * entry; produce an int result.  In this module, both inputs and result
 160 * are 16 bits or less, so either int or short multiply will work.
 161 */
 162
 163#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
 164
 165
 166/*
 167 * Perform dequantization and inverse DCT on one block of coefficients.
 168 */
 169
 170GLOBAL(void)
 171jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 172		 JCOEFPTR coef_block,
 173		 JSAMPARRAY output_buf, JDIMENSION output_col)
 174{
 175  INT32 tmp0, tmp1, tmp2, tmp3;
 176  INT32 tmp10, tmp11, tmp12, tmp13;
 177  INT32 z1, z2, z3;
 178  JCOEFPTR inptr;
 179  ISLOW_MULT_TYPE * quantptr;
 180  int * wsptr;
 181  JSAMPROW outptr;
 182  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 183  int ctr;
 184  int workspace[DCTSIZE2];	/* buffers data between passes */
 185  SHIFT_TEMPS
 186
 187  /* Pass 1: process columns from input, store into work array. */
 188  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
 189  /* furthermore, we scale the results by 2**PASS1_BITS. */
 190
 191  inptr = coef_block;
 192  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 193  wsptr = workspace;
 194  for (ctr = DCTSIZE; ctr > 0; ctr--) {
 195    /* Due to quantization, we will usually find that many of the input
 196     * coefficients are zero, especially the AC terms.  We can exploit this
 197     * by short-circuiting the IDCT calculation for any column in which all
 198     * the AC terms are zero.  In that case each output is equal to the
 199     * DC coefficient (with scale factor as needed).
 200     * With typical images and quantization tables, half or more of the
 201     * column DCT calculations can be simplified this way.
 202     */
 203
 204    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
 205	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
 206	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
 207	inptr[DCTSIZE*7] == 0) {
 208      /* AC terms all zero */
 209      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
 210
 211      wsptr[DCTSIZE*0] = dcval;
 212      wsptr[DCTSIZE*1] = dcval;
 213      wsptr[DCTSIZE*2] = dcval;
 214      wsptr[DCTSIZE*3] = dcval;
 215      wsptr[DCTSIZE*4] = dcval;
 216      wsptr[DCTSIZE*5] = dcval;
 217      wsptr[DCTSIZE*6] = dcval;
 218      wsptr[DCTSIZE*7] = dcval;
 219
 220      inptr++;			/* advance pointers to next column */
 221      quantptr++;
 222      wsptr++;
 223      continue;
 224    }
 225
 226    /* Even part: reverse the even part of the forward DCT. */
 227    /* The rotator is sqrt(2)*c(-6). */
 228    
 229    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 230    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
 231
 232    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
 233    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
 234    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
 235
 236    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 237    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 238    z2 <<= CONST_BITS;
 239    z3 <<= CONST_BITS;
 240    /* Add fudge factor here for final descale. */
 241    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
 242
 243    tmp0 = z2 + z3;
 244    tmp1 = z2 - z3;
 245
 246    tmp10 = tmp0 + tmp2;
 247    tmp13 = tmp0 - tmp2;
 248    tmp11 = tmp1 + tmp3;
 249    tmp12 = tmp1 - tmp3;
 250
 251    /* Odd part per figure 8; the matrix is unitary and hence its
 252     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
 253     */
 254
 255    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
 256    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
 257    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 258    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 259    
 260    z2 = tmp0 + tmp2;
 261    z3 = tmp1 + tmp3;
 262
 263    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
 264    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
 265    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
 266    z2 += z1;
 267    z3 += z1;
 268
 269    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
 270    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
 271    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
 272    tmp0 += z1 + z2;
 273    tmp3 += z1 + z3;
 274
 275    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
 276    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
 277    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
 278    tmp1 += z1 + z3;
 279    tmp2 += z1 + z2;
 280
 281    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 282
 283    wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
 284    wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
 285    wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
 286    wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
 287    wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
 288    wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
 289    wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
 290    wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
 291    
 292    inptr++;			/* advance pointers to next column */
 293    quantptr++;
 294    wsptr++;
 295  }
 296
 297  /* Pass 2: process rows from work array, store into output array. */
 298  /* Note that we must descale the results by a factor of 8 == 2**3, */
 299  /* and also undo the PASS1_BITS scaling. */
 300
 301  wsptr = workspace;
 302  for (ctr = 0; ctr < DCTSIZE; ctr++) {
 303    outptr = output_buf[ctr] + output_col;
 304    /* Rows of zeroes can be exploited in the same way as we did with columns.
 305     * However, the column calculation has created many nonzero AC terms, so
 306     * the simplification applies less often (typically 5% to 10% of the time).
 307     * On machines with very fast multiplication, it's possible that the
 308     * test takes more time than it's worth.  In that case this section
 309     * may be commented out.
 310     */
 311
 312#ifndef NO_ZERO_ROW_TEST
 313    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
 314	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
 315      /* AC terms all zero */
 316      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
 317				  & RANGE_MASK];
 318
 319      outptr[0] = dcval;
 320      outptr[1] = dcval;
 321      outptr[2] = dcval;
 322      outptr[3] = dcval;
 323      outptr[4] = dcval;
 324      outptr[5] = dcval;
 325      outptr[6] = dcval;
 326      outptr[7] = dcval;
 327
 328      wsptr += DCTSIZE;		/* advance pointer to next row */
 329      continue;
 330    }
 331#endif
 332
 333    /* Even part: reverse the even part of the forward DCT. */
 334    /* The rotator is sqrt(2)*c(-6). */
 335    
 336    z2 = (INT32) wsptr[2];
 337    z3 = (INT32) wsptr[6];
 338
 339    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
 340    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
 341    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
 342
 343    /* Add fudge factor here for final descale. */
 344    z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 345    z3 = (INT32) wsptr[4];
 346
 347    tmp0 = (z2 + z3) << CONST_BITS;
 348    tmp1 = (z2 - z3) << CONST_BITS;
 349    
 350    tmp10 = tmp0 + tmp2;
 351    tmp13 = tmp0 - tmp2;
 352    tmp11 = tmp1 + tmp3;
 353    tmp12 = tmp1 - tmp3;
 354
 355    /* Odd part per figure 8; the matrix is unitary and hence its
 356     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
 357     */
 358
 359    tmp0 = (INT32) wsptr[7];
 360    tmp1 = (INT32) wsptr[5];
 361    tmp2 = (INT32) wsptr[3];
 362    tmp3 = (INT32) wsptr[1];
 363
 364    z2 = tmp0 + tmp2;
 365    z3 = tmp1 + tmp3;
 366
 367    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
 368    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
 369    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
 370    z2 += z1;
 371    z3 += z1;
 372
 373    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
 374    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
 375    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
 376    tmp0 += z1 + z2;
 377    tmp3 += z1 + z3;
 378
 379    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
 380    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
 381    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
 382    tmp1 += z1 + z3;
 383    tmp2 += z1 + z2;
 384
 385    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 386
 387    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
 388					      CONST_BITS+PASS1_BITS+3)
 389			    & RANGE_MASK];
 390    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
 391					      CONST_BITS+PASS1_BITS+3)
 392			    & RANGE_MASK];
 393    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
 394					      CONST_BITS+PASS1_BITS+3)
 395			    & RANGE_MASK];
 396    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
 397					      CONST_BITS+PASS1_BITS+3)
 398			    & RANGE_MASK];
 399    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
 400					      CONST_BITS+PASS1_BITS+3)
 401			    & RANGE_MASK];
 402    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
 403					      CONST_BITS+PASS1_BITS+3)
 404			    & RANGE_MASK];
 405    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
 406					      CONST_BITS+PASS1_BITS+3)
 407			    & RANGE_MASK];
 408    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
 409					      CONST_BITS+PASS1_BITS+3)
 410			    & RANGE_MASK];
 411
 412    wsptr += DCTSIZE;		/* advance pointer to next row */
 413  }
 414}
 415
 416#ifdef IDCT_SCALING_SUPPORTED
 417
 418
 419/*
 420 * Perform dequantization and inverse DCT on one block of coefficients,
 421 * producing a 7x7 output block.
 422 *
 423 * Optimized algorithm with 12 multiplications in the 1-D kernel.
 424 * cK represents sqrt(2) * cos(K*pi/14).
 425 */
 426
 427GLOBAL(void)
 428jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 429	       JCOEFPTR coef_block,
 430	       JSAMPARRAY output_buf, JDIMENSION output_col)
 431{
 432  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
 433  INT32 z1, z2, z3;
 434  JCOEFPTR inptr;
 435  ISLOW_MULT_TYPE * quantptr;
 436  int * wsptr;
 437  JSAMPROW outptr;
 438  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 439  int ctr;
 440  int workspace[7*7];	/* buffers data between passes */
 441  SHIFT_TEMPS
 442
 443  /* Pass 1: process columns from input, store into work array. */
 444
 445  inptr = coef_block;
 446  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 447  wsptr = workspace;
 448  for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
 449    /* Even part */
 450
 451    tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 452    tmp13 <<= CONST_BITS;
 453    /* Add fudge factor here for final descale. */
 454    tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
 455
 456    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 457    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 458    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
 459
 460    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
 461    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
 462    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
 463    tmp0 = z1 + z3;
 464    z2 -= tmp0;
 465    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
 466    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
 467    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
 468    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
 469
 470    /* Odd part */
 471
 472    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 473    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 474    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
 475
 476    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
 477    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
 478    tmp0 = tmp1 - tmp2;
 479    tmp1 += tmp2;
 480    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
 481    tmp1 += tmp2;
 482    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
 483    tmp0 += z2;
 484    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
 485
 486    /* Final output stage */
 487
 488    wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
 489    wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
 490    wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
 491    wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
 492    wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
 493    wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
 494    wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
 495  }
 496
 497  /* Pass 2: process 7 rows from work array, store into output array. */
 498
 499  wsptr = workspace;
 500  for (ctr = 0; ctr < 7; ctr++) {
 501    outptr = output_buf[ctr] + output_col;
 502
 503    /* Even part */
 504
 505    /* Add fudge factor here for final descale. */
 506    tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 507    tmp13 <<= CONST_BITS;
 508
 509    z1 = (INT32) wsptr[2];
 510    z2 = (INT32) wsptr[4];
 511    z3 = (INT32) wsptr[6];
 512
 513    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
 514    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
 515    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
 516    tmp0 = z1 + z3;
 517    z2 -= tmp0;
 518    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
 519    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
 520    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
 521    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
 522
 523    /* Odd part */
 524
 525    z1 = (INT32) wsptr[1];
 526    z2 = (INT32) wsptr[3];
 527    z3 = (INT32) wsptr[5];
 528
 529    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
 530    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
 531    tmp0 = tmp1 - tmp2;
 532    tmp1 += tmp2;
 533    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
 534    tmp1 += tmp2;
 535    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
 536    tmp0 += z2;
 537    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
 538
 539    /* Final output stage */
 540
 541    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
 542					      CONST_BITS+PASS1_BITS+3)
 543			    & RANGE_MASK];
 544    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
 545					      CONST_BITS+PASS1_BITS+3)
 546			    & RANGE_MASK];
 547    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
 548					      CONST_BITS+PASS1_BITS+3)
 549			    & RANGE_MASK];
 550    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
 551					      CONST_BITS+PASS1_BITS+3)
 552			    & RANGE_MASK];
 553    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
 554					      CONST_BITS+PASS1_BITS+3)
 555			    & RANGE_MASK];
 556    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
 557					      CONST_BITS+PASS1_BITS+3)
 558			    & RANGE_MASK];
 559    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
 560					      CONST_BITS+PASS1_BITS+3)
 561			    & RANGE_MASK];
 562
 563    wsptr += 7;		/* advance pointer to next row */
 564  }
 565}
 566
 567
 568/*
 569 * Perform dequantization and inverse DCT on one block of coefficients,
 570 * producing a reduced-size 6x6 output block.
 571 *
 572 * Optimized algorithm with 3 multiplications in the 1-D kernel.
 573 * cK represents sqrt(2) * cos(K*pi/12).
 574 */
 575
 576GLOBAL(void)
 577jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 578	       JCOEFPTR coef_block,
 579	       JSAMPARRAY output_buf, JDIMENSION output_col)
 580{
 581  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
 582  INT32 z1, z2, z3;
 583  JCOEFPTR inptr;
 584  ISLOW_MULT_TYPE * quantptr;
 585  int * wsptr;
 586  JSAMPROW outptr;
 587  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 588  int ctr;
 589  int workspace[6*6];	/* buffers data between passes */
 590  SHIFT_TEMPS
 591
 592  /* Pass 1: process columns from input, store into work array. */
 593
 594  inptr = coef_block;
 595  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 596  wsptr = workspace;
 597  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
 598    /* Even part */
 599
 600    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 601    tmp0 <<= CONST_BITS;
 602    /* Add fudge factor here for final descale. */
 603    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
 604    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 605    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
 606    tmp1 = tmp0 + tmp10;
 607    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
 608    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 609    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
 610    tmp10 = tmp1 + tmp0;
 611    tmp12 = tmp1 - tmp0;
 612
 613    /* Odd part */
 614
 615    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 616    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 617    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
 618    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
 619    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
 620    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
 621    tmp1 = (z1 - z2 - z3) << PASS1_BITS;
 622
 623    /* Final output stage */
 624
 625    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
 626    wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
 627    wsptr[6*1] = (int) (tmp11 + tmp1);
 628    wsptr[6*4] = (int) (tmp11 - tmp1);
 629    wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
 630    wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
 631  }
 632
 633  /* Pass 2: process 6 rows from work array, store into output array. */
 634
 635  wsptr = workspace;
 636  for (ctr = 0; ctr < 6; ctr++) {
 637    outptr = output_buf[ctr] + output_col;
 638
 639    /* Even part */
 640
 641    /* Add fudge factor here for final descale. */
 642    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 643    tmp0 <<= CONST_BITS;
 644    tmp2 = (INT32) wsptr[4];
 645    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
 646    tmp1 = tmp0 + tmp10;
 647    tmp11 = tmp0 - tmp10 - tmp10;
 648    tmp10 = (INT32) wsptr[2];
 649    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
 650    tmp10 = tmp1 + tmp0;
 651    tmp12 = tmp1 - tmp0;
 652
 653    /* Odd part */
 654
 655    z1 = (INT32) wsptr[1];
 656    z2 = (INT32) wsptr[3];
 657    z3 = (INT32) wsptr[5];
 658    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
 659    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
 660    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
 661    tmp1 = (z1 - z2 - z3) << CONST_BITS;
 662
 663    /* Final output stage */
 664
 665    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
 666					      CONST_BITS+PASS1_BITS+3)
 667			    & RANGE_MASK];
 668    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
 669					      CONST_BITS+PASS1_BITS+3)
 670			    & RANGE_MASK];
 671    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
 672					      CONST_BITS+PASS1_BITS+3)
 673			    & RANGE_MASK];
 674    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
 675					      CONST_BITS+PASS1_BITS+3)
 676			    & RANGE_MASK];
 677    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
 678					      CONST_BITS+PASS1_BITS+3)
 679			    & RANGE_MASK];
 680    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
 681					      CONST_BITS+PASS1_BITS+3)
 682			    & RANGE_MASK];
 683
 684    wsptr += 6;		/* advance pointer to next row */
 685  }
 686}
 687
 688
 689/*
 690 * Perform dequantization and inverse DCT on one block of coefficients,
 691 * producing a reduced-size 5x5 output block.
 692 *
 693 * Optimized algorithm with 5 multiplications in the 1-D kernel.
 694 * cK represents sqrt(2) * cos(K*pi/10).
 695 */
 696
 697GLOBAL(void)
 698jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 699	       JCOEFPTR coef_block,
 700	       JSAMPARRAY output_buf, JDIMENSION output_col)
 701{
 702  INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
 703  INT32 z1, z2, z3;
 704  JCOEFPTR inptr;
 705  ISLOW_MULT_TYPE * quantptr;
 706  int * wsptr;
 707  JSAMPROW outptr;
 708  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 709  int ctr;
 710  int workspace[5*5];	/* buffers data between passes */
 711  SHIFT_TEMPS
 712
 713  /* Pass 1: process columns from input, store into work array. */
 714
 715  inptr = coef_block;
 716  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 717  wsptr = workspace;
 718  for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
 719    /* Even part */
 720
 721    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 722    tmp12 <<= CONST_BITS;
 723    /* Add fudge factor here for final descale. */
 724    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
 725    tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 726    tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 727    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
 728    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
 729    z3 = tmp12 + z2;
 730    tmp10 = z3 + z1;
 731    tmp11 = z3 - z1;
 732    tmp12 -= z2 << 2;
 733
 734    /* Odd part */
 735
 736    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 737    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 738
 739    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
 740    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
 741    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
 742
 743    /* Final output stage */
 744
 745    wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
 746    wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
 747    wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
 748    wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
 749    wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
 750  }
 751
 752  /* Pass 2: process 5 rows from work array, store into output array. */
 753
 754  wsptr = workspace;
 755  for (ctr = 0; ctr < 5; ctr++) {
 756    outptr = output_buf[ctr] + output_col;
 757
 758    /* Even part */
 759
 760    /* Add fudge factor here for final descale. */
 761    tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 762    tmp12 <<= CONST_BITS;
 763    tmp0 = (INT32) wsptr[2];
 764    tmp1 = (INT32) wsptr[4];
 765    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
 766    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
 767    z3 = tmp12 + z2;
 768    tmp10 = z3 + z1;
 769    tmp11 = z3 - z1;
 770    tmp12 -= z2 << 2;
 771
 772    /* Odd part */
 773
 774    z2 = (INT32) wsptr[1];
 775    z3 = (INT32) wsptr[3];
 776
 777    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
 778    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
 779    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
 780
 781    /* Final output stage */
 782
 783    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
 784					      CONST_BITS+PASS1_BITS+3)
 785			    & RANGE_MASK];
 786    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
 787					      CONST_BITS+PASS1_BITS+3)
 788			    & RANGE_MASK];
 789    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
 790					      CONST_BITS+PASS1_BITS+3)
 791			    & RANGE_MASK];
 792    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
 793					      CONST_BITS+PASS1_BITS+3)
 794			    & RANGE_MASK];
 795    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
 796					      CONST_BITS+PASS1_BITS+3)
 797			    & RANGE_MASK];
 798
 799    wsptr += 5;		/* advance pointer to next row */
 800  }
 801}
 802
 803
 804/*
 805 * Perform dequantization and inverse DCT on one block of coefficients,
 806 * producing a reduced-size 4x4 output block.
 807 *
 808 * Optimized algorithm with 3 multiplications in the 1-D kernel.
 809 * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
 810 */
 811
 812GLOBAL(void)
 813jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 814	       JCOEFPTR coef_block,
 815	       JSAMPARRAY output_buf, JDIMENSION output_col)
 816{
 817  INT32 tmp0, tmp2, tmp10, tmp12;
 818  INT32 z1, z2, z3;
 819  JCOEFPTR inptr;
 820  ISLOW_MULT_TYPE * quantptr;
 821  int * wsptr;
 822  JSAMPROW outptr;
 823  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 824  int ctr;
 825  int workspace[4*4];	/* buffers data between passes */
 826  SHIFT_TEMPS
 827
 828  /* Pass 1: process columns from input, store into work array. */
 829
 830  inptr = coef_block;
 831  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 832  wsptr = workspace;
 833  for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
 834    /* Even part */
 835
 836    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 837    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 838    
 839    tmp10 = (tmp0 + tmp2) << PASS1_BITS;
 840    tmp12 = (tmp0 - tmp2) << PASS1_BITS;
 841
 842    /* Odd part */
 843    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
 844
 845    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 846    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 847
 848    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
 849    /* Add fudge factor here for final descale. */
 850    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
 851    tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
 852		       CONST_BITS-PASS1_BITS);
 853    tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
 854		       CONST_BITS-PASS1_BITS);
 855
 856    /* Final output stage */
 857
 858    wsptr[4*0] = (int) (tmp10 + tmp0);
 859    wsptr[4*3] = (int) (tmp10 - tmp0);
 860    wsptr[4*1] = (int) (tmp12 + tmp2);
 861    wsptr[4*2] = (int) (tmp12 - tmp2);
 862  }
 863
 864  /* Pass 2: process 4 rows from work array, store into output array. */
 865
 866  wsptr = workspace;
 867  for (ctr = 0; ctr < 4; ctr++) {
 868    outptr = output_buf[ctr] + output_col;
 869
 870    /* Even part */
 871
 872    /* Add fudge factor here for final descale. */
 873    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 874    tmp2 = (INT32) wsptr[2];
 875
 876    tmp10 = (tmp0 + tmp2) << CONST_BITS;
 877    tmp12 = (tmp0 - tmp2) << CONST_BITS;
 878
 879    /* Odd part */
 880    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
 881
 882    z2 = (INT32) wsptr[1];
 883    z3 = (INT32) wsptr[3];
 884
 885    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
 886    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
 887    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
 888
 889    /* Final output stage */
 890
 891    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
 892					      CONST_BITS+PASS1_BITS+3)
 893			    & RANGE_MASK];
 894    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
 895					      CONST_BITS+PASS1_BITS+3)
 896			    & RANGE_MASK];
 897    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
 898					      CONST_BITS+PASS1_BITS+3)
 899			    & RANGE_MASK];
 900    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
 901					      CONST_BITS+PASS1_BITS+3)
 902			    & RANGE_MASK];
 903
 904    wsptr += 4;		/* advance pointer to next row */
 905  }
 906}
 907
 908
 909/*
 910 * Perform dequantization and inverse DCT on one block of coefficients,
 911 * producing a reduced-size 3x3 output block.
 912 *
 913 * Optimized algorithm with 2 multiplications in the 1-D kernel.
 914 * cK represents sqrt(2) * cos(K*pi/6).
 915 */
 916
 917GLOBAL(void)
 918jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 919	       JCOEFPTR coef_block,
 920	       JSAMPARRAY output_buf, JDIMENSION output_col)
 921{
 922  INT32 tmp0, tmp2, tmp10, tmp12;
 923  JCOEFPTR inptr;
 924  ISLOW_MULT_TYPE * quantptr;
 925  int * wsptr;
 926  JSAMPROW outptr;
 927  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
 928  int ctr;
 929  int workspace[3*3];	/* buffers data between passes */
 930  SHIFT_TEMPS
 931
 932  /* Pass 1: process columns from input, store into work array. */
 933
 934  inptr = coef_block;
 935  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
 936  wsptr = workspace;
 937  for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
 938    /* Even part */
 939
 940    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 941    tmp0 <<= CONST_BITS;
 942    /* Add fudge factor here for final descale. */
 943    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
 944    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 945    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
 946    tmp10 = tmp0 + tmp12;
 947    tmp2 = tmp0 - tmp12 - tmp12;
 948
 949    /* Odd part */
 950
 951    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 952    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
 953
 954    /* Final output stage */
 955
 956    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
 957    wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
 958    wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
 959  }
 960
 961  /* Pass 2: process 3 rows from work array, store into output array. */
 962
 963  wsptr = workspace;
 964  for (ctr = 0; ctr < 3; ctr++) {
 965    outptr = output_buf[ctr] + output_col;
 966
 967    /* Even part */
 968
 969    /* Add fudge factor here for final descale. */
 970    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
 971    tmp0 <<= CONST_BITS;
 972    tmp2 = (INT32) wsptr[2];
 973    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
 974    tmp10 = tmp0 + tmp12;
 975    tmp2 = tmp0 - tmp12 - tmp12;
 976
 977    /* Odd part */
 978
 979    tmp12 = (INT32) wsptr[1];
 980    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
 981
 982    /* Final output stage */
 983
 984    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
 985					      CONST_BITS+PASS1_BITS+3)
 986			    & RANGE_MASK];
 987    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
 988					      CONST_BITS+PASS1_BITS+3)
 989			    & RANGE_MASK];
 990    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
 991					      CONST_BITS+PASS1_BITS+3)
 992			    & RANGE_MASK];
 993
 994    wsptr += 3;		/* advance pointer to next row */
 995  }
 996}
 997
 998
 999/*
1000 * Perform dequantization and inverse DCT on one block of coefficients,
1001 * producing a reduced-size 2x2 output block.
1002 *
1003 * Multiplication-less algorithm.
1004 */
1005
1006GLOBAL(void)
1007jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1008	       JCOEFPTR coef_block,
1009	       JSAMPARRAY output_buf, JDIMENSION output_col)
1010{
1011  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1012  ISLOW_MULT_TYPE * quantptr;
1013  JSAMPROW outptr;
1014  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1015  SHIFT_TEMPS
1016
1017  /* Pass 1: process columns from input. */
1018
1019  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1020
1021  /* Column 0 */
1022  tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
1023  tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
1024  /* Add fudge factor here for final descale. */
1025  tmp4 += ONE << 2;
1026
1027  tmp0 = tmp4 + tmp5;
1028  tmp2 = tmp4 - tmp5;
1029
1030  /* Column 1 */
1031  tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
1032  tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
1033
1034  tmp1 = tmp4 + tmp5;
1035  tmp3 = tmp4 - tmp5;
1036
1037  /* Pass 2: process 2 rows, store into output array. */
1038
1039  /* Row 0 */
1040  outptr = output_buf[0] + output_col;
1041
1042  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
1043  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
1044
1045  /* Row 1 */
1046  outptr = output_buf[1] + output_col;
1047
1048  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
1049  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
1050}
1051
1052
1053/*
1054 * Perform dequantization and inverse DCT on one block of coefficients,
1055 * producing a reduced-size 1x1 output block.
1056 *
1057 * We hardly need an inverse DCT routine for this: just take the
1058 * average pixel value, which is one-eighth of the DC coefficient.
1059 */
1060
1061GLOBAL(void)
1062jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1063	       JCOEFPTR coef_block,
1064	       JSAMPARRAY output_buf, JDIMENSION output_col)
1065{
1066  int dcval;
1067  ISLOW_MULT_TYPE * quantptr;
1068  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1069  SHIFT_TEMPS
1070
1071  /* 1x1 is trivial: just take the DC coefficient divided by 8. */
1072  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1073  dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
1074  dcval = (int) DESCALE((INT32) dcval, 3);
1075
1076  output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
1077}
1078
1079
1080/*
1081 * Perform dequantization and inverse DCT on one block of coefficients,
1082 * producing a 9x9 output block.
1083 *
1084 * Optimized algorithm with 10 multiplications in the 1-D kernel.
1085 * cK represents sqrt(2) * cos(K*pi/18).
1086 */
1087
1088GLOBAL(void)
1089jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1090	       JCOEFPTR coef_block,
1091	       JSAMPARRAY output_buf, JDIMENSION output_col)
1092{
1093  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
1094  INT32 z1, z2, z3, z4;
1095  JCOEFPTR inptr;
1096  ISLOW_MULT_TYPE * quantptr;
1097  int * wsptr;
1098  JSAMPROW outptr;
1099  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1100  int ctr;
1101  int workspace[8*9];	/* buffers data between passes */
1102  SHIFT_TEMPS
1103
1104  /* Pass 1: process columns from input, store into work array. */
1105
1106  inptr = coef_block;
1107  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1108  wsptr = workspace;
1109  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1110    /* Even part */
1111
1112    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1113    tmp0 <<= CONST_BITS;
1114    /* Add fudge factor here for final descale. */
1115    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
1116
1117    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1118    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1119    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1120
1121    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1122    tmp1 = tmp0 + tmp3;
1123    tmp2 = tmp0 - tmp3 - tmp3;
1124
1125    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1126    tmp11 = tmp2 + tmp0;
1127    tmp14 = tmp2 - tmp0 - tmp0;
1128
1129    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1130    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1131    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1132
1133    tmp10 = tmp1 + tmp0 - tmp3;
1134    tmp12 = tmp1 - tmp0 + tmp2;
1135    tmp13 = tmp1 - tmp2 + tmp3;
1136
1137    /* Odd part */
1138
1139    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1140    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1141    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1142    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1143
1144    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1145
1146    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1147    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1148    tmp0 = tmp2 + tmp3 - z2;
1149    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1150    tmp2 += z2 - tmp1;
1151    tmp3 += z2 + tmp1;
1152    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1153
1154    /* Final output stage */
1155
1156    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
1157    wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
1158    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
1159    wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
1160    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
1161    wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
1162    wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
1163    wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
1164    wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
1165  }
1166
1167  /* Pass 2: process 9 rows from work array, store into output array. */
1168
1169  wsptr = workspace;
1170  for (ctr = 0; ctr < 9; ctr++) {
1171    outptr = output_buf[ctr] + output_col;
1172
1173    /* Even part */
1174
1175    /* Add fudge factor here for final descale. */
1176    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1177    tmp0 <<= CONST_BITS;
1178
1179    z1 = (INT32) wsptr[2];
1180    z2 = (INT32) wsptr[4];
1181    z3 = (INT32) wsptr[6];
1182
1183    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1184    tmp1 = tmp0 + tmp3;
1185    tmp2 = tmp0 - tmp3 - tmp3;
1186
1187    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1188    tmp11 = tmp2 + tmp0;
1189    tmp14 = tmp2 - tmp0 - tmp0;
1190
1191    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1192    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1193    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1194
1195    tmp10 = tmp1 + tmp0 - tmp3;
1196    tmp12 = tmp1 - tmp0 + tmp2;
1197    tmp13 = tmp1 - tmp2 + tmp3;
1198
1199    /* Odd part */
1200
1201    z1 = (INT32) wsptr[1];
1202    z2 = (INT32) wsptr[3];
1203    z3 = (INT32) wsptr[5];
1204    z4 = (INT32) wsptr[7];
1205
1206    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1207
1208    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1209    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1210    tmp0 = tmp2 + tmp3 - z2;
1211    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1212    tmp2 += z2 - tmp1;
1213    tmp3 += z2 + tmp1;
1214    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1215
1216    /* Final output stage */
1217
1218    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1219					      CONST_BITS+PASS1_BITS+3)
1220			    & RANGE_MASK];
1221    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1222					      CONST_BITS+PASS1_BITS+3)
1223			    & RANGE_MASK];
1224    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
1225					      CONST_BITS+PASS1_BITS+3)
1226			    & RANGE_MASK];
1227    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
1228					      CONST_BITS+PASS1_BITS+3)
1229			    & RANGE_MASK];
1230    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
1231					      CONST_BITS+PASS1_BITS+3)
1232			    & RANGE_MASK];
1233    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
1234					      CONST_BITS+PASS1_BITS+3)
1235			    & RANGE_MASK];
1236    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
1237					      CONST_BITS+PASS1_BITS+3)
1238			    & RANGE_MASK];
1239    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
1240					      CONST_BITS+PASS1_BITS+3)
1241			    & RANGE_MASK];
1242    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
1243					      CONST_BITS+PASS1_BITS+3)
1244			    & RANGE_MASK];
1245
1246    wsptr += 8;		/* advance pointer to next row */
1247  }
1248}
1249
1250
1251/*
1252 * Perform dequantization and inverse DCT on one block of coefficients,
1253 * producing a 10x10 output block.
1254 *
1255 * Optimized algorithm with 12 multiplications in the 1-D kernel.
1256 * cK represents sqrt(2) * cos(K*pi/20).
1257 */
1258
1259GLOBAL(void)
1260jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1261		 JCOEFPTR coef_block,
1262		 JSAMPARRAY output_buf, JDIMENSION output_col)
1263{
1264  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1265  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
1266  INT32 z1, z2, z3, z4, z5;
1267  JCOEFPTR inptr;
1268  ISLOW_MULT_TYPE * quantptr;
1269  int * wsptr;
1270  JSAMPROW outptr;
1271  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1272  int ctr;
1273  int workspace[8*10];	/* buffers data between passes */
1274  SHIFT_TEMPS
1275
1276  /* Pass 1: process columns from input, store into work array. */
1277
1278  inptr = coef_block;
1279  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1280  wsptr = workspace;
1281  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1282    /* Even part */
1283
1284    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1285    z3 <<= CONST_BITS;
1286    /* Add fudge factor here for final descale. */
1287    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1288    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1289    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1290    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1291    tmp10 = z3 + z1;
1292    tmp11 = z3 - z2;
1293
1294    tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
1295			CONST_BITS-PASS1_BITS);
1296
1297    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1298    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1299
1300    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1301    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1302    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1303
1304    tmp20 = tmp10 + tmp12;
1305    tmp24 = tmp10 - tmp12;
1306    tmp21 = tmp11 + tmp13;
1307    tmp23 = tmp11 - tmp13;
1308
1309    /* Odd part */
1310
1311    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1312    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1313    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1314    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1315
1316    tmp11 = z2 + z4;
1317    tmp13 = z2 - z4;
1318
1319    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1320    z5 = z3 << CONST_BITS;
1321
1322    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1323    z4 = z5 + tmp12;
1324
1325    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1326    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1327
1328    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1329    z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
1330
1331    tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
1332
1333    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1334    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1335
1336    /* Final output stage */
1337
1338    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1339    wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1340    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1341    wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1342    wsptr[8*2] = (int) (tmp22 + tmp12);
1343    wsptr[8*7] = (int) (tmp22 - tmp12);
1344    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1345    wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1346    wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1347    wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1348  }
1349
1350  /* Pass 2: process 10 rows from work array, store into output array. */
1351
1352  wsptr = workspace;
1353  for (ctr = 0; ctr < 10; ctr++) {
1354    outptr = output_buf[ctr] + output_col;
1355
1356    /* Even part */
1357
1358    /* Add fudge factor here for final descale. */
1359    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1360    z3 <<= CONST_BITS;
1361    z4 = (INT32) wsptr[4];
1362    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1363    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1364    tmp10 = z3 + z1;
1365    tmp11 = z3 - z2;
1366
1367    tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
1368
1369    z2 = (INT32) wsptr[2];
1370    z3 = (INT32) wsptr[6];
1371
1372    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1373    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1374    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1375
1376    tmp20 = tmp10 + tmp12;
1377    tmp24 = tmp10 - tmp12;
1378    tmp21 = tmp11 + tmp13;
1379    tmp23 = tmp11 - tmp13;
1380
1381    /* Odd part */
1382
1383    z1 = (INT32) wsptr[1];
1384    z2 = (INT32) wsptr[3];
1385    z3 = (INT32) wsptr[5];
1386    z3 <<= CONST_BITS;
1387    z4 = (INT32) wsptr[7];
1388
1389    tmp11 = z2 + z4;
1390    tmp13 = z2 - z4;
1391
1392    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1393
1394    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1395    z4 = z3 + tmp12;
1396
1397    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1398    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1399
1400    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1401    z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
1402
1403    tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
1404
1405    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1406    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1407
1408    /* Final output stage */
1409
1410    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1411					      CONST_BITS+PASS1_BITS+3)
1412			    & RANGE_MASK];
1413    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1414					      CONST_BITS+PASS1_BITS+3)
1415			    & RANGE_MASK];
1416    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1417					      CONST_BITS+PASS1_BITS+3)
1418			    & RANGE_MASK];
1419    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1420					      CONST_BITS+PASS1_BITS+3)
1421			    & RANGE_MASK];
1422    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1423					      CONST_BITS+PASS1_BITS+3)
1424			    & RANGE_MASK];
1425    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1426					      CONST_BITS+PASS1_BITS+3)
1427			    & RANGE_MASK];
1428    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1429					      CONST_BITS+PASS1_BITS+3)
1430			    & RANGE_MASK];
1431    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1432					      CONST_BITS+PASS1_BITS+3)
1433			    & RANGE_MASK];
1434    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1435					      CONST_BITS+PASS1_BITS+3)
1436			    & RANGE_MASK];
1437    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1438					      CONST_BITS+PASS1_BITS+3)
1439			    & RANGE_MASK];
1440
1441    wsptr += 8;		/* advance pointer to next row */
1442  }
1443}
1444
1445
1446/*
1447 * Perform dequantization and inverse DCT on one block of coefficients,
1448 * producing a 11x11 output block.
1449 *
1450 * Optimized algorithm with 24 multiplications in the 1-D kernel.
1451 * cK represents sqrt(2) * cos(K*pi/22).
1452 */
1453
1454GLOBAL(void)
1455jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1456		 JCOEFPTR coef_block,
1457		 JSAMPARRAY output_buf, JDIMENSION output_col)
1458{
1459  INT32 tmp10, tmp11, tmp

Large files are truncated, but you can click here to view the full file