media/libvpx/vp9/common/vp9_idct.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/common/vp9_idct.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1416 @@
     1.4 +/*
     1.5 + *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS.  All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include <assert.h>
    1.15 +#include <math.h>
    1.16 +
    1.17 +#include "./vpx_config.h"
    1.18 +#include "./vp9_rtcd.h"
    1.19 +#include "vp9/common/vp9_systemdependent.h"
    1.20 +#include "vp9/common/vp9_blockd.h"
    1.21 +#include "vp9/common/vp9_common.h"
    1.22 +#include "vp9/common/vp9_idct.h"
    1.23 +
    1.24 +void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
    1.25 +/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
    1.26 +   0.5 shifts per pixel. */
    1.27 +  int i;
    1.28 +  int16_t output[16];
    1.29 +  int a1, b1, c1, d1, e1;
    1.30 +  const int16_t *ip = input;
    1.31 +  int16_t *op = output;
    1.32 +
    1.33 +  for (i = 0; i < 4; i++) {
    1.34 +    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    1.35 +    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    1.36 +    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    1.37 +    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    1.38 +    a1 += c1;
    1.39 +    d1 -= b1;
    1.40 +    e1 = (a1 - d1) >> 1;
    1.41 +    b1 = e1 - b1;
    1.42 +    c1 = e1 - c1;
    1.43 +    a1 -= b1;
    1.44 +    d1 += c1;
    1.45 +    op[0] = a1;
    1.46 +    op[1] = b1;
    1.47 +    op[2] = c1;
    1.48 +    op[3] = d1;
    1.49 +    ip += 4;
    1.50 +    op += 4;
    1.51 +  }
    1.52 +
    1.53 +  ip = output;
    1.54 +  for (i = 0; i < 4; i++) {
    1.55 +    a1 = ip[4 * 0];
    1.56 +    c1 = ip[4 * 1];
    1.57 +    d1 = ip[4 * 2];
    1.58 +    b1 = ip[4 * 3];
    1.59 +    a1 += c1;
    1.60 +    d1 -= b1;
    1.61 +    e1 = (a1 - d1) >> 1;
    1.62 +    b1 = e1 - b1;
    1.63 +    c1 = e1 - c1;
    1.64 +    a1 -= b1;
    1.65 +    d1 += c1;
    1.66 +    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
    1.67 +    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
    1.68 +    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
    1.69 +    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
    1.70 +
    1.71 +    ip++;
    1.72 +    dest++;
    1.73 +  }
    1.74 +}
    1.75 +
    1.76 +void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
    1.77 +  int i;
    1.78 +  int a1, e1;
    1.79 +  int16_t tmp[4];
    1.80 +  const int16_t *ip = in;
    1.81 +  int16_t *op = tmp;
    1.82 +
    1.83 +  a1 = ip[0] >> UNIT_QUANT_SHIFT;
    1.84 +  e1 = a1 >> 1;
    1.85 +  a1 -= e1;
    1.86 +  op[0] = a1;
    1.87 +  op[1] = op[2] = op[3] = e1;
    1.88 +
    1.89 +  ip = tmp;
    1.90 +  for (i = 0; i < 4; i++) {
    1.91 +    e1 = ip[0] >> 1;
    1.92 +    a1 = ip[0] - e1;
    1.93 +    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
    1.94 +    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
    1.95 +    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
    1.96 +    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
    1.97 +    ip++;
    1.98 +    dest++;
    1.99 +  }
   1.100 +}
   1.101 +
   1.102 +static void idct4_1d(const int16_t *input, int16_t *output) {
   1.103 +  int16_t step[4];
   1.104 +  int temp1, temp2;
   1.105 +  // stage 1
   1.106 +  temp1 = (input[0] + input[2]) * cospi_16_64;
   1.107 +  temp2 = (input[0] - input[2]) * cospi_16_64;
   1.108 +  step[0] = dct_const_round_shift(temp1);
   1.109 +  step[1] = dct_const_round_shift(temp2);
   1.110 +  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   1.111 +  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
   1.112 +  step[2] = dct_const_round_shift(temp1);
   1.113 +  step[3] = dct_const_round_shift(temp2);
   1.114 +
   1.115 +  // stage 2
   1.116 +  output[0] = step[0] + step[3];
   1.117 +  output[1] = step[1] + step[2];
   1.118 +  output[2] = step[1] - step[2];
   1.119 +  output[3] = step[0] - step[3];
   1.120 +}
   1.121 +
   1.122 +void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
   1.123 +  int16_t out[4 * 4];
   1.124 +  int16_t *outptr = out;
   1.125 +  int i, j;
   1.126 +  int16_t temp_in[4], temp_out[4];
   1.127 +
   1.128 +  // Rows
   1.129 +  for (i = 0; i < 4; ++i) {
   1.130 +    idct4_1d(input, outptr);
   1.131 +    input += 4;
   1.132 +    outptr += 4;
   1.133 +  }
   1.134 +
   1.135 +  // Columns
   1.136 +  for (i = 0; i < 4; ++i) {
   1.137 +    for (j = 0; j < 4; ++j)
   1.138 +      temp_in[j] = out[j * 4 + i];
   1.139 +    idct4_1d(temp_in, temp_out);
   1.140 +    for (j = 0; j < 4; ++j)
   1.141 +      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
   1.142 +                                  + dest[j * stride + i]);
   1.143 +  }
   1.144 +}
   1.145 +
   1.146 +void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
   1.147 +  int i;
   1.148 +  int a1;
   1.149 +  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   1.150 +  out = dct_const_round_shift(out * cospi_16_64);
   1.151 +  a1 = ROUND_POWER_OF_TWO(out, 4);
   1.152 +
   1.153 +  for (i = 0; i < 4; i++) {
   1.154 +    dest[0] = clip_pixel(dest[0] + a1);
   1.155 +    dest[1] = clip_pixel(dest[1] + a1);
   1.156 +    dest[2] = clip_pixel(dest[2] + a1);
   1.157 +    dest[3] = clip_pixel(dest[3] + a1);
   1.158 +    dest += dest_stride;
   1.159 +  }
   1.160 +}
   1.161 +
   1.162 +static void idct8_1d(const int16_t *input, int16_t *output) {
   1.163 +  int16_t step1[8], step2[8];
   1.164 +  int temp1, temp2;
   1.165 +  // stage 1
   1.166 +  step1[0] = input[0];
   1.167 +  step1[2] = input[4];
   1.168 +  step1[1] = input[2];
   1.169 +  step1[3] = input[6];
   1.170 +  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   1.171 +  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
   1.172 +  step1[4] = dct_const_round_shift(temp1);
   1.173 +  step1[7] = dct_const_round_shift(temp2);
   1.174 +  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   1.175 +  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
   1.176 +  step1[5] = dct_const_round_shift(temp1);
   1.177 +  step1[6] = dct_const_round_shift(temp2);
   1.178 +
   1.179 +  // stage 2 & stage 3 - even half
   1.180 +  idct4_1d(step1, step1);
   1.181 +
   1.182 +  // stage 2 - odd half
   1.183 +  step2[4] = step1[4] + step1[5];
   1.184 +  step2[5] = step1[4] - step1[5];
   1.185 +  step2[6] = -step1[6] + step1[7];
   1.186 +  step2[7] = step1[6] + step1[7];
   1.187 +
   1.188 +  // stage 3 -odd half
   1.189 +  step1[4] = step2[4];
   1.190 +  temp1 = (step2[6] - step2[5]) * cospi_16_64;
   1.191 +  temp2 = (step2[5] + step2[6]) * cospi_16_64;
   1.192 +  step1[5] = dct_const_round_shift(temp1);
   1.193 +  step1[6] = dct_const_round_shift(temp2);
   1.194 +  step1[7] = step2[7];
   1.195 +
   1.196 +  // stage 4
   1.197 +  output[0] = step1[0] + step1[7];
   1.198 +  output[1] = step1[1] + step1[6];
   1.199 +  output[2] = step1[2] + step1[5];
   1.200 +  output[3] = step1[3] + step1[4];
   1.201 +  output[4] = step1[3] - step1[4];
   1.202 +  output[5] = step1[2] - step1[5];
   1.203 +  output[6] = step1[1] - step1[6];
   1.204 +  output[7] = step1[0] - step1[7];
   1.205 +}
   1.206 +
   1.207 +void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
   1.208 +  int16_t out[8 * 8];
   1.209 +  int16_t *outptr = out;
   1.210 +  int i, j;
   1.211 +  int16_t temp_in[8], temp_out[8];
   1.212 +
   1.213 +  // First transform rows
   1.214 +  for (i = 0; i < 8; ++i) {
   1.215 +    idct8_1d(input, outptr);
   1.216 +    input += 8;
   1.217 +    outptr += 8;
   1.218 +  }
   1.219 +
   1.220 +  // Then transform columns
   1.221 +  for (i = 0; i < 8; ++i) {
   1.222 +    for (j = 0; j < 8; ++j)
   1.223 +      temp_in[j] = out[j * 8 + i];
   1.224 +    idct8_1d(temp_in, temp_out);
   1.225 +    for (j = 0; j < 8; ++j)
   1.226 +      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
   1.227 +                                  + dest[j * stride + i]);
   1.228 +  }
   1.229 +}
   1.230 +
   1.231 +void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   1.232 +  int i, j;
   1.233 +  int a1;
   1.234 +  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   1.235 +  out = dct_const_round_shift(out * cospi_16_64);
   1.236 +  a1 = ROUND_POWER_OF_TWO(out, 5);
   1.237 +  for (j = 0; j < 8; ++j) {
   1.238 +    for (i = 0; i < 8; ++i)
   1.239 +      dest[i] = clip_pixel(dest[i] + a1);
   1.240 +    dest += stride;
   1.241 +  }
   1.242 +}
   1.243 +
   1.244 +static void iadst4_1d(const int16_t *input, int16_t *output) {
   1.245 +  int s0, s1, s2, s3, s4, s5, s6, s7;
   1.246 +
   1.247 +  int x0 = input[0];
   1.248 +  int x1 = input[1];
   1.249 +  int x2 = input[2];
   1.250 +  int x3 = input[3];
   1.251 +
   1.252 +  if (!(x0 | x1 | x2 | x3)) {
   1.253 +    output[0] = output[1] = output[2] = output[3] = 0;
   1.254 +    return;
   1.255 +  }
   1.256 +
   1.257 +  s0 = sinpi_1_9 * x0;
   1.258 +  s1 = sinpi_2_9 * x0;
   1.259 +  s2 = sinpi_3_9 * x1;
   1.260 +  s3 = sinpi_4_9 * x2;
   1.261 +  s4 = sinpi_1_9 * x2;
   1.262 +  s5 = sinpi_2_9 * x3;
   1.263 +  s6 = sinpi_4_9 * x3;
   1.264 +  s7 = x0 - x2 + x3;
   1.265 +
   1.266 +  x0 = s0 + s3 + s5;
   1.267 +  x1 = s1 - s4 - s6;
   1.268 +  x2 = sinpi_3_9 * s7;
   1.269 +  x3 = s2;
   1.270 +
   1.271 +  s0 = x0 + x3;
   1.272 +  s1 = x1 + x3;
   1.273 +  s2 = x2;
   1.274 +  s3 = x0 + x1 - x3;
   1.275 +
   1.276 +  // 1-D transform scaling factor is sqrt(2).
   1.277 +  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   1.278 +  // + 1b (addition) = 29b.
   1.279 +  // Hence the output bit depth is 15b.
   1.280 +  output[0] = dct_const_round_shift(s0);
   1.281 +  output[1] = dct_const_round_shift(s1);
   1.282 +  output[2] = dct_const_round_shift(s2);
   1.283 +  output[3] = dct_const_round_shift(s3);
   1.284 +}
   1.285 +
   1.286 +void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
   1.287 +                         int tx_type) {
   1.288 +  const transform_2d IHT_4[] = {
   1.289 +    { idct4_1d, idct4_1d  },  // DCT_DCT  = 0
   1.290 +    { iadst4_1d, idct4_1d  },   // ADST_DCT = 1
   1.291 +    { idct4_1d, iadst4_1d },    // DCT_ADST = 2
   1.292 +    { iadst4_1d, iadst4_1d }      // ADST_ADST = 3
   1.293 +  };
   1.294 +
   1.295 +  int i, j;
   1.296 +  int16_t out[4 * 4];
   1.297 +  int16_t *outptr = out;
   1.298 +  int16_t temp_in[4], temp_out[4];
   1.299 +
   1.300 +  // inverse transform row vectors
   1.301 +  for (i = 0; i < 4; ++i) {
   1.302 +    IHT_4[tx_type].rows(input, outptr);
   1.303 +    input  += 4;
   1.304 +    outptr += 4;
   1.305 +  }
   1.306 +
   1.307 +  // inverse transform column vectors
   1.308 +  for (i = 0; i < 4; ++i) {
   1.309 +    for (j = 0; j < 4; ++j)
   1.310 +      temp_in[j] = out[j * 4 + i];
   1.311 +    IHT_4[tx_type].cols(temp_in, temp_out);
   1.312 +    for (j = 0; j < 4; ++j)
   1.313 +      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
   1.314 +                                  + dest[j * stride + i]);
   1.315 +  }
   1.316 +}
   1.317 +static void iadst8_1d(const int16_t *input, int16_t *output) {
   1.318 +  int s0, s1, s2, s3, s4, s5, s6, s7;
   1.319 +
   1.320 +  int x0 = input[7];
   1.321 +  int x1 = input[0];
   1.322 +  int x2 = input[5];
   1.323 +  int x3 = input[2];
   1.324 +  int x4 = input[3];
   1.325 +  int x5 = input[4];
   1.326 +  int x6 = input[1];
   1.327 +  int x7 = input[6];
   1.328 +
   1.329 +  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
   1.330 +    output[0] = output[1] = output[2] = output[3] = output[4]
   1.331 +              = output[5] = output[6] = output[7] = 0;
   1.332 +    return;
   1.333 +  }
   1.334 +
   1.335 +  // stage 1
   1.336 +  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
   1.337 +  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
   1.338 +  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
   1.339 +  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
   1.340 +  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
   1.341 +  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
   1.342 +  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   1.343 +  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
   1.344 +
   1.345 +  x0 = dct_const_round_shift(s0 + s4);
   1.346 +  x1 = dct_const_round_shift(s1 + s5);
   1.347 +  x2 = dct_const_round_shift(s2 + s6);
   1.348 +  x3 = dct_const_round_shift(s3 + s7);
   1.349 +  x4 = dct_const_round_shift(s0 - s4);
   1.350 +  x5 = dct_const_round_shift(s1 - s5);
   1.351 +  x6 = dct_const_round_shift(s2 - s6);
   1.352 +  x7 = dct_const_round_shift(s3 - s7);
   1.353 +
   1.354 +  // stage 2
   1.355 +  s0 = x0;
   1.356 +  s1 = x1;
   1.357 +  s2 = x2;
   1.358 +  s3 = x3;
   1.359 +  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
   1.360 +  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
   1.361 +  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
   1.362 +  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
   1.363 +
   1.364 +  x0 = s0 + s2;
   1.365 +  x1 = s1 + s3;
   1.366 +  x2 = s0 - s2;
   1.367 +  x3 = s1 - s3;
   1.368 +  x4 = dct_const_round_shift(s4 + s6);
   1.369 +  x5 = dct_const_round_shift(s5 + s7);
   1.370 +  x6 = dct_const_round_shift(s4 - s6);
   1.371 +  x7 = dct_const_round_shift(s5 - s7);
   1.372 +
   1.373 +  // stage 3
   1.374 +  s2 = cospi_16_64 * (x2 + x3);
   1.375 +  s3 = cospi_16_64 * (x2 - x3);
   1.376 +  s6 = cospi_16_64 * (x6 + x7);
   1.377 +  s7 = cospi_16_64 * (x6 - x7);
   1.378 +
   1.379 +  x2 = dct_const_round_shift(s2);
   1.380 +  x3 = dct_const_round_shift(s3);
   1.381 +  x6 = dct_const_round_shift(s6);
   1.382 +  x7 = dct_const_round_shift(s7);
   1.383 +
   1.384 +  output[0] =  x0;
   1.385 +  output[1] = -x4;
   1.386 +  output[2] =  x6;
   1.387 +  output[3] = -x2;
   1.388 +  output[4] =  x3;
   1.389 +  output[5] = -x7;
   1.390 +  output[6] =  x5;
   1.391 +  output[7] = -x1;
   1.392 +}
   1.393 +
   1.394 +static const transform_2d IHT_8[] = {
   1.395 +  { idct8_1d,  idct8_1d  },  // DCT_DCT  = 0
   1.396 +  { iadst8_1d, idct8_1d  },  // ADST_DCT = 1
   1.397 +  { idct8_1d,  iadst8_1d },  // DCT_ADST = 2
   1.398 +  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
   1.399 +};
   1.400 +
   1.401 +void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
   1.402 +                         int tx_type) {
   1.403 +  int i, j;
   1.404 +  int16_t out[8 * 8];
   1.405 +  int16_t *outptr = out;
   1.406 +  int16_t temp_in[8], temp_out[8];
   1.407 +  const transform_2d ht = IHT_8[tx_type];
   1.408 +
   1.409 +  // inverse transform row vectors
   1.410 +  for (i = 0; i < 8; ++i) {
   1.411 +    ht.rows(input, outptr);
   1.412 +    input += 8;
   1.413 +    outptr += 8;
   1.414 +  }
   1.415 +
   1.416 +  // inverse transform column vectors
   1.417 +  for (i = 0; i < 8; ++i) {
   1.418 +    for (j = 0; j < 8; ++j)
   1.419 +      temp_in[j] = out[j * 8 + i];
   1.420 +    ht.cols(temp_in, temp_out);
   1.421 +    for (j = 0; j < 8; ++j)
   1.422 +      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
   1.423 +                                  + dest[j * stride + i]);
   1.424 +  }
   1.425 +}
   1.426 +
   1.427 +void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   1.428 +  int16_t out[8 * 8] = { 0 };
   1.429 +  int16_t *outptr = out;
   1.430 +  int i, j;
   1.431 +  int16_t temp_in[8], temp_out[8];
   1.432 +
   1.433 +  // First transform rows
   1.434 +  // only first 4 row has non-zero coefs
   1.435 +  for (i = 0; i < 4; ++i) {
   1.436 +    idct8_1d(input, outptr);
   1.437 +    input += 8;
   1.438 +    outptr += 8;
   1.439 +  }
   1.440 +
   1.441 +  // Then transform columns
   1.442 +  for (i = 0; i < 8; ++i) {
   1.443 +    for (j = 0; j < 8; ++j)
   1.444 +      temp_in[j] = out[j * 8 + i];
   1.445 +    idct8_1d(temp_in, temp_out);
   1.446 +    for (j = 0; j < 8; ++j)
   1.447 +      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
   1.448 +                                  + dest[j * stride + i]);
   1.449 +  }
   1.450 +}
   1.451 +
   1.452 +static void idct16_1d(const int16_t *input, int16_t *output) {
   1.453 +  int16_t step1[16], step2[16];
   1.454 +  int temp1, temp2;
   1.455 +
   1.456 +  // stage 1
   1.457 +  step1[0] = input[0/2];
   1.458 +  step1[1] = input[16/2];
   1.459 +  step1[2] = input[8/2];
   1.460 +  step1[3] = input[24/2];
   1.461 +  step1[4] = input[4/2];
   1.462 +  step1[5] = input[20/2];
   1.463 +  step1[6] = input[12/2];
   1.464 +  step1[7] = input[28/2];
   1.465 +  step1[8] = input[2/2];
   1.466 +  step1[9] = input[18/2];
   1.467 +  step1[10] = input[10/2];
   1.468 +  step1[11] = input[26/2];
   1.469 +  step1[12] = input[6/2];
   1.470 +  step1[13] = input[22/2];
   1.471 +  step1[14] = input[14/2];
   1.472 +  step1[15] = input[30/2];
   1.473 +
   1.474 +  // stage 2
   1.475 +  step2[0] = step1[0];
   1.476 +  step2[1] = step1[1];
   1.477 +  step2[2] = step1[2];
   1.478 +  step2[3] = step1[3];
   1.479 +  step2[4] = step1[4];
   1.480 +  step2[5] = step1[5];
   1.481 +  step2[6] = step1[6];
   1.482 +  step2[7] = step1[7];
   1.483 +
   1.484 +  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   1.485 +  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
   1.486 +  step2[8] = dct_const_round_shift(temp1);
   1.487 +  step2[15] = dct_const_round_shift(temp2);
   1.488 +
   1.489 +  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   1.490 +  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
   1.491 +  step2[9] = dct_const_round_shift(temp1);
   1.492 +  step2[14] = dct_const_round_shift(temp2);
   1.493 +
   1.494 +  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   1.495 +  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
   1.496 +  step2[10] = dct_const_round_shift(temp1);
   1.497 +  step2[13] = dct_const_round_shift(temp2);
   1.498 +
   1.499 +  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   1.500 +  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
   1.501 +  step2[11] = dct_const_round_shift(temp1);
   1.502 +  step2[12] = dct_const_round_shift(temp2);
   1.503 +
   1.504 +  // stage 3
   1.505 +  step1[0] = step2[0];
   1.506 +  step1[1] = step2[1];
   1.507 +  step1[2] = step2[2];
   1.508 +  step1[3] = step2[3];
   1.509 +
   1.510 +  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   1.511 +  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
   1.512 +  step1[4] = dct_const_round_shift(temp1);
   1.513 +  step1[7] = dct_const_round_shift(temp2);
   1.514 +  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   1.515 +  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
   1.516 +  step1[5] = dct_const_round_shift(temp1);
   1.517 +  step1[6] = dct_const_round_shift(temp2);
   1.518 +
   1.519 +  step1[8] = step2[8] + step2[9];
   1.520 +  step1[9] = step2[8] - step2[9];
   1.521 +  step1[10] = -step2[10] + step2[11];
   1.522 +  step1[11] = step2[10] + step2[11];
   1.523 +  step1[12] = step2[12] + step2[13];
   1.524 +  step1[13] = step2[12] - step2[13];
   1.525 +  step1[14] = -step2[14] + step2[15];
   1.526 +  step1[15] = step2[14] + step2[15];
   1.527 +
   1.528 +  // stage 4
   1.529 +  temp1 = (step1[0] + step1[1]) * cospi_16_64;
   1.530 +  temp2 = (step1[0] - step1[1]) * cospi_16_64;
   1.531 +  step2[0] = dct_const_round_shift(temp1);
   1.532 +  step2[1] = dct_const_round_shift(temp2);
   1.533 +  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   1.534 +  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
   1.535 +  step2[2] = dct_const_round_shift(temp1);
   1.536 +  step2[3] = dct_const_round_shift(temp2);
   1.537 +  step2[4] = step1[4] + step1[5];
   1.538 +  step2[5] = step1[4] - step1[5];
   1.539 +  step2[6] = -step1[6] + step1[7];
   1.540 +  step2[7] = step1[6] + step1[7];
   1.541 +
   1.542 +  step2[8] = step1[8];
   1.543 +  step2[15] = step1[15];
   1.544 +  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   1.545 +  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
   1.546 +  step2[9] = dct_const_round_shift(temp1);
   1.547 +  step2[14] = dct_const_round_shift(temp2);
   1.548 +  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   1.549 +  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
   1.550 +  step2[10] = dct_const_round_shift(temp1);
   1.551 +  step2[13] = dct_const_round_shift(temp2);
   1.552 +  step2[11] = step1[11];
   1.553 +  step2[12] = step1[12];
   1.554 +
   1.555 +  // stage 5
   1.556 +  step1[0] = step2[0] + step2[3];
   1.557 +  step1[1] = step2[1] + step2[2];
   1.558 +  step1[2] = step2[1] - step2[2];
   1.559 +  step1[3] = step2[0] - step2[3];
   1.560 +  step1[4] = step2[4];
   1.561 +  temp1 = (step2[6] - step2[5]) * cospi_16_64;
   1.562 +  temp2 = (step2[5] + step2[6]) * cospi_16_64;
   1.563 +  step1[5] = dct_const_round_shift(temp1);
   1.564 +  step1[6] = dct_const_round_shift(temp2);
   1.565 +  step1[7] = step2[7];
   1.566 +
   1.567 +  step1[8] = step2[8] + step2[11];
   1.568 +  step1[9] = step2[9] + step2[10];
   1.569 +  step1[10] = step2[9] - step2[10];
   1.570 +  step1[11] = step2[8] - step2[11];
   1.571 +  step1[12] = -step2[12] + step2[15];
   1.572 +  step1[13] = -step2[13] + step2[14];
   1.573 +  step1[14] = step2[13] + step2[14];
   1.574 +  step1[15] = step2[12] + step2[15];
   1.575 +
   1.576 +  // stage 6
   1.577 +  step2[0] = step1[0] + step1[7];
   1.578 +  step2[1] = step1[1] + step1[6];
   1.579 +  step2[2] = step1[2] + step1[5];
   1.580 +  step2[3] = step1[3] + step1[4];
   1.581 +  step2[4] = step1[3] - step1[4];
   1.582 +  step2[5] = step1[2] - step1[5];
   1.583 +  step2[6] = step1[1] - step1[6];
   1.584 +  step2[7] = step1[0] - step1[7];
   1.585 +  step2[8] = step1[8];
   1.586 +  step2[9] = step1[9];
   1.587 +  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   1.588 +  temp2 = (step1[10] + step1[13]) * cospi_16_64;
   1.589 +  step2[10] = dct_const_round_shift(temp1);
   1.590 +  step2[13] = dct_const_round_shift(temp2);
   1.591 +  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   1.592 +  temp2 = (step1[11] + step1[12]) * cospi_16_64;
   1.593 +  step2[11] = dct_const_round_shift(temp1);
   1.594 +  step2[12] = dct_const_round_shift(temp2);
   1.595 +  step2[14] = step1[14];
   1.596 +  step2[15] = step1[15];
   1.597 +
   1.598 +  // stage 7
   1.599 +  output[0] = step2[0] + step2[15];
   1.600 +  output[1] = step2[1] + step2[14];
   1.601 +  output[2] = step2[2] + step2[13];
   1.602 +  output[3] = step2[3] + step2[12];
   1.603 +  output[4] = step2[4] + step2[11];
   1.604 +  output[5] = step2[5] + step2[10];
   1.605 +  output[6] = step2[6] + step2[9];
   1.606 +  output[7] = step2[7] + step2[8];
   1.607 +  output[8] = step2[7] - step2[8];
   1.608 +  output[9] = step2[6] - step2[9];
   1.609 +  output[10] = step2[5] - step2[10];
   1.610 +  output[11] = step2[4] - step2[11];
   1.611 +  output[12] = step2[3] - step2[12];
   1.612 +  output[13] = step2[2] - step2[13];
   1.613 +  output[14] = step2[1] - step2[14];
   1.614 +  output[15] = step2[0] - step2[15];
   1.615 +}
   1.616 +
   1.617 +void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
   1.618 +  int16_t out[16 * 16];
   1.619 +  int16_t *outptr = out;
   1.620 +  int i, j;
   1.621 +  int16_t temp_in[16], temp_out[16];
   1.622 +
   1.623 +  // First transform rows
   1.624 +  for (i = 0; i < 16; ++i) {
   1.625 +    idct16_1d(input, outptr);
   1.626 +    input += 16;
   1.627 +    outptr += 16;
   1.628 +  }
   1.629 +
   1.630 +  // Then transform columns
   1.631 +  for (i = 0; i < 16; ++i) {
   1.632 +    for (j = 0; j < 16; ++j)
   1.633 +      temp_in[j] = out[j * 16 + i];
   1.634 +    idct16_1d(temp_in, temp_out);
   1.635 +    for (j = 0; j < 16; ++j)
   1.636 +      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
   1.637 +                                  + dest[j * stride + i]);
   1.638 +  }
   1.639 +}
   1.640 +
   1.641 +static void iadst16_1d(const int16_t *input, int16_t *output) {
   1.642 +  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
   1.643 +
   1.644 +  int x0 = input[15];
   1.645 +  int x1 = input[0];
   1.646 +  int x2 = input[13];
   1.647 +  int x3 = input[2];
   1.648 +  int x4 = input[11];
   1.649 +  int x5 = input[4];
   1.650 +  int x6 = input[9];
   1.651 +  int x7 = input[6];
   1.652 +  int x8 = input[7];
   1.653 +  int x9 = input[8];
   1.654 +  int x10 = input[5];
   1.655 +  int x11 = input[10];
   1.656 +  int x12 = input[3];
   1.657 +  int x13 = input[12];
   1.658 +  int x14 = input[1];
   1.659 +  int x15 = input[14];
   1.660 +
   1.661 +  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
   1.662 +           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
   1.663 +    output[0] = output[1] = output[2] = output[3] = output[4]
   1.664 +              = output[5] = output[6] = output[7] = output[8]
   1.665 +              = output[9] = output[10] = output[11] = output[12]
   1.666 +              = output[13] = output[14] = output[15] = 0;
   1.667 +    return;
   1.668 +  }
   1.669 +
   1.670 +  // stage 1
   1.671 +  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
   1.672 +  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
   1.673 +  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
   1.674 +  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
   1.675 +  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
   1.676 +  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
   1.677 +  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
   1.678 +  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
   1.679 +  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
   1.680 +  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
   1.681 +  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
   1.682 +  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
   1.683 +  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
   1.684 +  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
   1.685 +  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   1.686 +  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
   1.687 +
   1.688 +  x0 = dct_const_round_shift(s0 + s8);
   1.689 +  x1 = dct_const_round_shift(s1 + s9);
   1.690 +  x2 = dct_const_round_shift(s2 + s10);
   1.691 +  x3 = dct_const_round_shift(s3 + s11);
   1.692 +  x4 = dct_const_round_shift(s4 + s12);
   1.693 +  x5 = dct_const_round_shift(s5 + s13);
   1.694 +  x6 = dct_const_round_shift(s6 + s14);
   1.695 +  x7 = dct_const_round_shift(s7 + s15);
   1.696 +  x8  = dct_const_round_shift(s0 - s8);
   1.697 +  x9  = dct_const_round_shift(s1 - s9);
   1.698 +  x10 = dct_const_round_shift(s2 - s10);
   1.699 +  x11 = dct_const_round_shift(s3 - s11);
   1.700 +  x12 = dct_const_round_shift(s4 - s12);
   1.701 +  x13 = dct_const_round_shift(s5 - s13);
   1.702 +  x14 = dct_const_round_shift(s6 - s14);
   1.703 +  x15 = dct_const_round_shift(s7 - s15);
   1.704 +
   1.705 +  // stage 2
   1.706 +  s0 = x0;
   1.707 +  s1 = x1;
   1.708 +  s2 = x2;
   1.709 +  s3 = x3;
   1.710 +  s4 = x4;
   1.711 +  s5 = x5;
   1.712 +  s6 = x6;
   1.713 +  s7 = x7;
   1.714 +  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
   1.715 +  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
   1.716 +  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
   1.717 +  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
   1.718 +  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
   1.719 +  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
   1.720 +  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
   1.721 +  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
   1.722 +
   1.723 +  x0 = s0 + s4;
   1.724 +  x1 = s1 + s5;
   1.725 +  x2 = s2 + s6;
   1.726 +  x3 = s3 + s7;
   1.727 +  x4 = s0 - s4;
   1.728 +  x5 = s1 - s5;
   1.729 +  x6 = s2 - s6;
   1.730 +  x7 = s3 - s7;
   1.731 +  x8 = dct_const_round_shift(s8 + s12);
   1.732 +  x9 = dct_const_round_shift(s9 + s13);
   1.733 +  x10 = dct_const_round_shift(s10 + s14);
   1.734 +  x11 = dct_const_round_shift(s11 + s15);
   1.735 +  x12 = dct_const_round_shift(s8 - s12);
   1.736 +  x13 = dct_const_round_shift(s9 - s13);
   1.737 +  x14 = dct_const_round_shift(s10 - s14);
   1.738 +  x15 = dct_const_round_shift(s11 - s15);
   1.739 +
   1.740 +  // stage 3
   1.741 +  s0 = x0;
   1.742 +  s1 = x1;
   1.743 +  s2 = x2;
   1.744 +  s3 = x3;
   1.745 +  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
   1.746 +  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
   1.747 +  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
   1.748 +  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
   1.749 +  s8 = x8;
   1.750 +  s9 = x9;
   1.751 +  s10 = x10;
   1.752 +  s11 = x11;
   1.753 +  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
   1.754 +  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
   1.755 +  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
   1.756 +  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
   1.757 +
   1.758 +  x0 = s0 + s2;
   1.759 +  x1 = s1 + s3;
   1.760 +  x2 = s0 - s2;
   1.761 +  x3 = s1 - s3;
   1.762 +  x4 = dct_const_round_shift(s4 + s6);
   1.763 +  x5 = dct_const_round_shift(s5 + s7);
   1.764 +  x6 = dct_const_round_shift(s4 - s6);
   1.765 +  x7 = dct_const_round_shift(s5 - s7);
   1.766 +  x8 = s8 + s10;
   1.767 +  x9 = s9 + s11;
   1.768 +  x10 = s8 - s10;
   1.769 +  x11 = s9 - s11;
   1.770 +  x12 = dct_const_round_shift(s12 + s14);
   1.771 +  x13 = dct_const_round_shift(s13 + s15);
   1.772 +  x14 = dct_const_round_shift(s12 - s14);
   1.773 +  x15 = dct_const_round_shift(s13 - s15);
   1.774 +
   1.775 +  // stage 4
   1.776 +  s2 = (- cospi_16_64) * (x2 + x3);
   1.777 +  s3 = cospi_16_64 * (x2 - x3);
   1.778 +  s6 = cospi_16_64 * (x6 + x7);
   1.779 +  s7 = cospi_16_64 * (- x6 + x7);
   1.780 +  s10 = cospi_16_64 * (x10 + x11);
   1.781 +  s11 = cospi_16_64 * (- x10 + x11);
   1.782 +  s14 = (- cospi_16_64) * (x14 + x15);
   1.783 +  s15 = cospi_16_64 * (x14 - x15);
   1.784 +
   1.785 +  x2 = dct_const_round_shift(s2);
   1.786 +  x3 = dct_const_round_shift(s3);
   1.787 +  x6 = dct_const_round_shift(s6);
   1.788 +  x7 = dct_const_round_shift(s7);
   1.789 +  x10 = dct_const_round_shift(s10);
   1.790 +  x11 = dct_const_round_shift(s11);
   1.791 +  x14 = dct_const_round_shift(s14);
   1.792 +  x15 = dct_const_round_shift(s15);
   1.793 +
   1.794 +  output[0] =  x0;
   1.795 +  output[1] = -x8;
   1.796 +  output[2] =  x12;
   1.797 +  output[3] = -x4;
   1.798 +  output[4] =  x6;
   1.799 +  output[5] =  x14;
   1.800 +  output[6] =  x10;
   1.801 +  output[7] =  x2;
   1.802 +  output[8] =  x3;
   1.803 +  output[9] =  x11;
   1.804 +  output[10] =  x15;
   1.805 +  output[11] =  x7;
   1.806 +  output[12] =  x5;
   1.807 +  output[13] = -x13;
   1.808 +  output[14] =  x9;
   1.809 +  output[15] = -x1;
   1.810 +}
   1.811 +
   1.812 +static const transform_2d IHT_16[] = {
   1.813 +  { idct16_1d,  idct16_1d  },  // DCT_DCT  = 0
   1.814 +  { iadst16_1d, idct16_1d  },  // ADST_DCT = 1
   1.815 +  { idct16_1d,  iadst16_1d },  // DCT_ADST = 2
   1.816 +  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
   1.817 +};
   1.818 +
   1.819 +void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
   1.820 +                            int tx_type) {
   1.821 +  int i, j;
   1.822 +  int16_t out[16 * 16];
   1.823 +  int16_t *outptr = out;
   1.824 +  int16_t temp_in[16], temp_out[16];
   1.825 +  const transform_2d ht = IHT_16[tx_type];
   1.826 +
   1.827 +  // Rows
   1.828 +  for (i = 0; i < 16; ++i) {
   1.829 +    ht.rows(input, outptr);
   1.830 +    input += 16;
   1.831 +    outptr += 16;
   1.832 +  }
   1.833 +
   1.834 +  // Columns
   1.835 +  for (i = 0; i < 16; ++i) {
   1.836 +    for (j = 0; j < 16; ++j)
   1.837 +      temp_in[j] = out[j * 16 + i];
   1.838 +    ht.cols(temp_in, temp_out);
   1.839 +    for (j = 0; j < 16; ++j)
   1.840 +      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
   1.841 +                                        + dest[j * stride + i]);
   1.842 +  }
   1.843 +}
   1.844 +
   1.845 +void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   1.846 +  int16_t out[16 * 16] = { 0 };
   1.847 +  int16_t *outptr = out;
   1.848 +  int i, j;
   1.849 +  int16_t temp_in[16], temp_out[16];
   1.850 +
   1.851 +  // First transform rows. Since all non-zero dct coefficients are in
   1.852 +  // upper-left 4x4 area, we only need to calculate first 4 rows here.
   1.853 +  for (i = 0; i < 4; ++i) {
   1.854 +    idct16_1d(input, outptr);
   1.855 +    input += 16;
   1.856 +    outptr += 16;
   1.857 +  }
   1.858 +
   1.859 +  // Then transform columns
   1.860 +  for (i = 0; i < 16; ++i) {
   1.861 +    for (j = 0; j < 16; ++j)
   1.862 +      temp_in[j] = out[j*16 + i];
   1.863 +    idct16_1d(temp_in, temp_out);
   1.864 +    for (j = 0; j < 16; ++j)
   1.865 +      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
   1.866 +                                  + dest[j * stride + i]);
   1.867 +  }
   1.868 +}
   1.869 +
   1.870 +void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   1.871 +  int i, j;
   1.872 +  int a1;
   1.873 +  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   1.874 +  out = dct_const_round_shift(out * cospi_16_64);
   1.875 +  a1 = ROUND_POWER_OF_TWO(out, 6);
   1.876 +  for (j = 0; j < 16; ++j) {
   1.877 +    for (i = 0; i < 16; ++i)
   1.878 +      dest[i] = clip_pixel(dest[i] + a1);
   1.879 +    dest += stride;
   1.880 +  }
   1.881 +}
   1.882 +
// 1-D 32-point inverse DCT over a single row or column of coefficients.
// Implemented as a multi-stage butterfly network; every fixed-point
// multiply by a cospi_* constant is rounded back to integer precision
// with dct_const_round_shift.
// NOTE(review): intermediates are held in 16-bit step arrays, so this
// relies on coefficient magnitudes staying within the range the bitstream
// guarantees — confirm against the spec's range-limit requirements.
static void idct32_1d(const int16_t *input, int16_t *output) {
  int16_t step1[32], step2[32];
  int temp1, temp2;

  // stage 1
  // Even-indexed inputs are reordered into step1[0..15]; odd-indexed
  // inputs are paired and rotated into step1[16..31].
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = dct_const_round_shift(temp1);
  step1[31] = dct_const_round_shift(temp2);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = dct_const_round_shift(temp1);
  step1[30] = dct_const_round_shift(temp2);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = dct_const_round_shift(temp1);
  step1[28] = dct_const_round_shift(temp2);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = dct_const_round_shift(temp1);
  step1[24] = dct_const_round_shift(temp2);

  // stage 2
  // Lower half passes through; step1[8..15] are rotated; step1[16..31]
  // are combined in add/sub butterflies.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  step2[16] = step1[16] + step1[17];
  step2[17] = step1[16] - step1[17];
  step2[18] = -step1[18] + step1[19];
  step2[19] = step1[18] + step1[19];
  step2[20] = step1[20] + step1[21];
  step2[21] = step1[20] - step1[21];
  step2[22] = -step1[22] + step1[23];
  step2[23] = step1[22] + step1[23];
  step2[24] = step1[24] + step1[25];
  step2[25] = step1[24] - step1[25];
  step2[26] = -step1[26] + step1[27];
  step2[27] = step1[26] + step1[27];
  step2[28] = step1[28] + step1[29];
  step2[29] = step1[28] - step1[29];
  step2[30] = -step1[30] + step1[31];
  step2[31] = step1[30] + step1[31];

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = dct_const_round_shift(temp1);
  step1[30] = dct_const_round_shift(temp2);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = step1[16] + step1[19];
  step2[17] = step1[17] + step1[18];
  step2[18] = step1[17] - step1[18];
  step2[19] = step1[16] - step1[19];
  step2[20] = -step1[20] + step1[23];
  step2[21] = -step1[21] + step1[22];
  step2[22] = step1[21] + step1[22];
  step2[23] = step1[20] + step1[23];

  step2[24] = step1[24] + step1[27];
  step2[25] = step1[25] + step1[26];
  step2[26] = step1[25] - step1[26];
  step2[27] = step1[24] - step1[27];
  step2[28] = -step1[28] + step1[31];
  step2[29] = -step1[29] + step1[30];
  step2[30] = step1[29] + step1[30];
  step2[31] = step1[28] + step1[31];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = dct_const_round_shift(temp1);
  step1[28] = dct_const_round_shift(temp2);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = step1[16] + step1[23];
  step2[17] = step1[17] + step1[22];
  step2[18] = step1[18] + step1[21];
  step2[19] = step1[19] + step1[20];
  step2[20] = step1[19] - step1[20];
  step2[21] = step1[18] - step1[21];
  step2[22] = step1[17] - step1[22];
  step2[23] = step1[16] - step1[23];

  step2[24] = -step1[24] + step1[31];
  step2[25] = -step1[25] + step1[30];
  step2[26] = -step1[26] + step1[29];
  step2[27] = -step1[27] + step1[28];
  step2[28] = step1[27] + step1[28];
  step2[29] = step1[26] + step1[29];
  step2[30] = step1[25] + step1[30];
  step2[31] = step1[24] + step1[31];

  // stage 7
  step1[0] = step2[0] + step2[15];
  step1[1] = step2[1] + step2[14];
  step1[2] = step2[2] + step2[13];
  step1[3] = step2[3] + step2[12];
  step1[4] = step2[4] + step2[11];
  step1[5] = step2[5] + step2[10];
  step1[6] = step2[6] + step2[9];
  step1[7] = step2[7] + step2[8];
  step1[8] = step2[7] - step2[8];
  step1[9] = step2[6] - step2[9];
  step1[10] = step2[5] - step2[10];
  step1[11] = step2[4] - step2[11];
  step1[12] = step2[3] - step2[12];
  step1[13] = step2[2] - step2[13];
  step1[14] = step2[1] - step2[14];
  step1[15] = step2[0] - step2[15];

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = dct_const_round_shift(temp1);
  step1[24] = dct_const_round_shift(temp2);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  // Combine mirrored halves: output[i] and output[31-i] are the sum and
  // difference of the same pair.
  output[0] = step1[0] + step1[31];
  output[1] = step1[1] + step1[30];
  output[2] = step1[2] + step1[29];
  output[3] = step1[3] + step1[28];
  output[4] = step1[4] + step1[27];
  output[5] = step1[5] + step1[26];
  output[6] = step1[6] + step1[25];
  output[7] = step1[7] + step1[24];
  output[8] = step1[8] + step1[23];
  output[9] = step1[9] + step1[22];
  output[10] = step1[10] + step1[21];
  output[11] = step1[11] + step1[20];
  output[12] = step1[12] + step1[19];
  output[13] = step1[13] + step1[18];
  output[14] = step1[14] + step1[17];
  output[15] = step1[15] + step1[16];
  output[16] = step1[15] - step1[16];
  output[17] = step1[14] - step1[17];
  output[18] = step1[13] - step1[18];
  output[19] = step1[12] - step1[19];
  output[20] = step1[11] - step1[20];
  output[21] = step1[10] - step1[21];
  output[22] = step1[9] - step1[22];
  output[23] = step1[8] - step1[23];
  output[24] = step1[7] - step1[24];
  output[25] = step1[6] - step1[25];
  output[26] = step1[5] - step1[26];
  output[27] = step1[4] - step1[27];
  output[28] = step1[3] - step1[28];
  output[29] = step1[2] - step1[29];
  output[30] = step1[1] - step1[30];
  output[31] = step1[0] - step1[31];
}
  1.1249 +
  1.1250 +void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
  1.1251 +  int16_t out[32 * 32];
  1.1252 +  int16_t *outptr = out;
  1.1253 +  int i, j;
  1.1254 +  int16_t temp_in[32], temp_out[32];
  1.1255 +
  1.1256 +  // Rows
  1.1257 +  for (i = 0; i < 32; ++i) {
  1.1258 +    int16_t zero_coeff[16];
  1.1259 +    for (j = 0; j < 16; ++j)
  1.1260 +      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
  1.1261 +    for (j = 0; j < 8; ++j)
  1.1262 +      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
  1.1263 +    for (j = 0; j < 4; ++j)
  1.1264 +      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
  1.1265 +    for (j = 0; j < 2; ++j)
  1.1266 +      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
  1.1267 +
  1.1268 +    if (zero_coeff[0] | zero_coeff[1])
  1.1269 +      idct32_1d(input, outptr);
  1.1270 +    else
  1.1271 +      vpx_memset(outptr, 0, sizeof(int16_t) * 32);
  1.1272 +    input += 32;
  1.1273 +    outptr += 32;
  1.1274 +  }
  1.1275 +
  1.1276 +  // Columns
  1.1277 +  for (i = 0; i < 32; ++i) {
  1.1278 +    for (j = 0; j < 32; ++j)
  1.1279 +      temp_in[j] = out[j * 32 + i];
  1.1280 +    idct32_1d(temp_in, temp_out);
  1.1281 +    for (j = 0; j < 32; ++j)
  1.1282 +      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
  1.1283 +                                        + dest[j * stride + i]);
  1.1284 +  }
  1.1285 +}
  1.1286 +
  1.1287 +void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
  1.1288 +  int16_t out[32 * 32] = {0};
  1.1289 +  int16_t *outptr = out;
  1.1290 +  int i, j;
  1.1291 +  int16_t temp_in[32], temp_out[32];
  1.1292 +
  1.1293 +  // Rows
  1.1294 +  // only upper-left 8x8 has non-zero coeff
  1.1295 +  for (i = 0; i < 8; ++i) {
  1.1296 +    idct32_1d(input, outptr);
  1.1297 +    input += 32;
  1.1298 +    outptr += 32;
  1.1299 +  }
  1.1300 +
  1.1301 +  // Columns
  1.1302 +  for (i = 0; i < 32; ++i) {
  1.1303 +    for (j = 0; j < 32; ++j)
  1.1304 +      temp_in[j] = out[j * 32 + i];
  1.1305 +    idct32_1d(temp_in, temp_out);
  1.1306 +    for (j = 0; j < 32; ++j)
  1.1307 +      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
  1.1308 +                                  + dest[j * stride + i]);
  1.1309 +  }
  1.1310 +}
  1.1311 +
  1.1312 +void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  1.1313 +  int i, j;
  1.1314 +  int a1;
  1.1315 +
  1.1316 +  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  1.1317 +  out = dct_const_round_shift(out * cospi_16_64);
  1.1318 +  a1 = ROUND_POWER_OF_TWO(out, 6);
  1.1319 +
  1.1320 +  for (j = 0; j < 32; ++j) {
  1.1321 +    for (i = 0; i < 32; ++i)
  1.1322 +      dest[i] = clip_pixel(dest[i] + a1);
  1.1323 +    dest += stride;
  1.1324 +  }
  1.1325 +}
  1.1326 +
  1.1327 +// idct
  1.1328 +void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  1.1329 +  if (eob > 1)
  1.1330 +    vp9_idct4x4_16_add(input, dest, stride);
  1.1331 +  else
  1.1332 +    vp9_idct4x4_1_add(input, dest, stride);
  1.1333 +}
  1.1334 +
  1.1335 +
  1.1336 +void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  1.1337 +  if (eob > 1)
  1.1338 +    vp9_iwht4x4_16_add(input, dest, stride);
  1.1339 +  else
  1.1340 +    vp9_iwht4x4_1_add(input, dest, stride);
  1.1341 +}
  1.1342 +
  1.1343 +void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  1.1344 +  // If dc is 1, then input[0] is the reconstructed value, do not need
  1.1345 +  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
  1.1346 +
  1.1347 +  // The calculation can be simplified if there are not many non-zero dct
  1.1348 +  // coefficients. Use eobs to decide what to do.
  1.1349 +  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
  1.1350 +  // Combine that with code here.
  1.1351 +  if (eob) {
  1.1352 +    if (eob == 1)
  1.1353 +      // DC only DCT coefficient
  1.1354 +      vp9_idct8x8_1_add(input, dest, stride);
  1.1355 +    else if (eob <= 10)
  1.1356 +      vp9_idct8x8_10_add(input, dest, stride);
  1.1357 +    else
  1.1358 +      vp9_idct8x8_64_add(input, dest, stride);
  1.1359 +  }
  1.1360 +}
  1.1361 +
  1.1362 +void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
  1.1363 +                       int eob) {
  1.1364 +  /* The calculation can be simplified if there are not many non-zero dct
  1.1365 +   * coefficients. Use eobs to separate different cases. */
  1.1366 +  if (eob) {
  1.1367 +    if (eob == 1)
  1.1368 +      /* DC only DCT coefficient. */
  1.1369 +      vp9_idct16x16_1_add(input, dest, stride);
  1.1370 +    else if (eob <= 10)
  1.1371 +      vp9_idct16x16_10_add(input, dest, stride);
  1.1372 +    else
  1.1373 +      vp9_idct16x16_256_add(input, dest, stride);
  1.1374 +  }
  1.1375 +}
  1.1376 +
  1.1377 +void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
  1.1378 +                       int eob) {
  1.1379 +  if (eob) {
  1.1380 +    if (eob == 1)
  1.1381 +      vp9_idct32x32_1_add(input, dest, stride);
  1.1382 +    else if (eob <= 34)
  1.1383 +      // non-zero coeff only in upper-left 8x8
  1.1384 +      vp9_idct32x32_34_add(input, dest, stride);
  1.1385 +    else
  1.1386 +      vp9_idct32x32_1024_add(input, dest, stride);
  1.1387 +  }
  1.1388 +}
  1.1389 +
  1.1390 +// iht
  1.1391 +void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
  1.1392 +                    int stride, int eob) {
  1.1393 +  if (tx_type == DCT_DCT)
  1.1394 +    vp9_idct4x4_add(input, dest, stride, eob);
  1.1395 +  else
  1.1396 +    vp9_iht4x4_16_add(input, dest, stride, tx_type);
  1.1397 +}
  1.1398 +
  1.1399 +void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
  1.1400 +                    int stride, int eob) {
  1.1401 +  if (tx_type == DCT_DCT) {
  1.1402 +    vp9_idct8x8_add(input, dest, stride, eob);
  1.1403 +  } else {
  1.1404 +    if (eob > 0) {
  1.1405 +      vp9_iht8x8_64_add(input, dest, stride, tx_type);
  1.1406 +    }
  1.1407 +  }
  1.1408 +}
  1.1409 +
  1.1410 +void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
  1.1411 +                      int stride, int eob) {
  1.1412 +  if (tx_type == DCT_DCT) {
  1.1413 +    vp9_idct16x16_add(input, dest, stride, eob);
  1.1414 +  } else {
  1.1415 +    if (eob > 0) {
  1.1416 +      vp9_iht16x16_256_add(input, dest, stride, tx_type);
  1.1417 +    }
  1.1418 +  }
  1.1419 +}

mercurial