media/libvpx/vp9/encoder/vp9_dct.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/encoder/vp9_dct.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1401 @@
     1.4 +/*
     1.5 + *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS.  All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include <assert.h>
    1.15 +#include <math.h>
    1.16 +
    1.17 +#include "./vpx_config.h"
    1.18 +#include "./vp9_rtcd.h"
    1.19 +
    1.20 +#include "vp9/common/vp9_blockd.h"
    1.21 +#include "vp9/common/vp9_idct.h"
    1.22 +#include "vp9/common/vp9_systemdependent.h"
    1.23 +
    1.24 +#include "vp9/encoder/vp9_dct.h"
    1.25 +
    1.26 +static INLINE int fdct_round_shift(int input) {
    1.27 +  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
    1.28 +  assert(INT16_MIN <= rv && rv <= INT16_MAX);
    1.29 +  return rv;
    1.30 +}
    1.31 +
    1.32 +static void fdct4(const int16_t *input, int16_t *output) {
    1.33 +  int16_t step[4];
    1.34 +  int temp1, temp2;
    1.35 +
    1.36 +  step[0] = input[0] + input[3];
    1.37 +  step[1] = input[1] + input[2];
    1.38 +  step[2] = input[1] - input[2];
    1.39 +  step[3] = input[0] - input[3];
    1.40 +
    1.41 +  temp1 = (step[0] + step[1]) * cospi_16_64;
    1.42 +  temp2 = (step[0] - step[1]) * cospi_16_64;
    1.43 +  output[0] = fdct_round_shift(temp1);
    1.44 +  output[2] = fdct_round_shift(temp2);
    1.45 +  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
    1.46 +  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
    1.47 +  output[1] = fdct_round_shift(temp1);
    1.48 +  output[3] = fdct_round_shift(temp2);
    1.49 +}
    1.50 +
    1.51 +void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
    1.52 +  // The 2D transform is done with two passes which are actually pretty
    1.53 +  // similar. In the first one, we transform the columns and transpose
    1.54 +  // the results. In the second one, we transform the rows. To achieve that,
    1.55 +  // as the first pass results are transposed, we tranpose the columns (that
    1.56 +  // is the transposed rows) and transpose the results (so that it goes back
    1.57 +  // in normal/row positions).
    1.58 +  int pass;
    1.59 +  // We need an intermediate buffer between passes.
    1.60 +  int16_t intermediate[4 * 4];
    1.61 +  const int16_t *in = input;
    1.62 +  int16_t *out = intermediate;
    1.63 +  // Do the two transform/transpose passes
    1.64 +  for (pass = 0; pass < 2; ++pass) {
    1.65 +    /*canbe16*/ int input[4];
    1.66 +    /*canbe16*/ int step[4];
    1.67 +    /*needs32*/ int temp1, temp2;
    1.68 +    int i;
    1.69 +    for (i = 0; i < 4; ++i) {
    1.70 +      // Load inputs.
    1.71 +      if (0 == pass) {
    1.72 +        input[0] = in[0 * stride] * 16;
    1.73 +        input[1] = in[1 * stride] * 16;
    1.74 +        input[2] = in[2 * stride] * 16;
    1.75 +        input[3] = in[3 * stride] * 16;
    1.76 +        if (i == 0 && input[0]) {
    1.77 +          input[0] += 1;
    1.78 +        }
    1.79 +      } else {
    1.80 +        input[0] = in[0 * 4];
    1.81 +        input[1] = in[1 * 4];
    1.82 +        input[2] = in[2 * 4];
    1.83 +        input[3] = in[3 * 4];
    1.84 +      }
    1.85 +      // Transform.
    1.86 +      step[0] = input[0] + input[3];
    1.87 +      step[1] = input[1] + input[2];
    1.88 +      step[2] = input[1] - input[2];
    1.89 +      step[3] = input[0] - input[3];
    1.90 +      temp1 = (step[0] + step[1]) * cospi_16_64;
    1.91 +      temp2 = (step[0] - step[1]) * cospi_16_64;
    1.92 +      out[0] = fdct_round_shift(temp1);
    1.93 +      out[2] = fdct_round_shift(temp2);
    1.94 +      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
    1.95 +      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
    1.96 +      out[1] = fdct_round_shift(temp1);
    1.97 +      out[3] = fdct_round_shift(temp2);
    1.98 +      // Do next column (which is a transposed row in second/horizontal pass)
    1.99 +      in++;
   1.100 +      out += 4;
   1.101 +    }
   1.102 +    // Setup in/out for next pass.
   1.103 +    in = intermediate;
   1.104 +    out = output;
   1.105 +  }
   1.106 +
   1.107 +  {
   1.108 +    int i, j;
   1.109 +    for (i = 0; i < 4; ++i) {
   1.110 +      for (j = 0; j < 4; ++j)
   1.111 +        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
   1.112 +    }
   1.113 +  }
   1.114 +}
   1.115 +
   1.116 +static void fadst4(const int16_t *input, int16_t *output) {
   1.117 +  int x0, x1, x2, x3;
   1.118 +  int s0, s1, s2, s3, s4, s5, s6, s7;
   1.119 +
   1.120 +  x0 = input[0];
   1.121 +  x1 = input[1];
   1.122 +  x2 = input[2];
   1.123 +  x3 = input[3];
   1.124 +
   1.125 +  if (!(x0 | x1 | x2 | x3)) {
   1.126 +    output[0] = output[1] = output[2] = output[3] = 0;
   1.127 +    return;
   1.128 +  }
   1.129 +
   1.130 +  s0 = sinpi_1_9 * x0;
   1.131 +  s1 = sinpi_4_9 * x0;
   1.132 +  s2 = sinpi_2_9 * x1;
   1.133 +  s3 = sinpi_1_9 * x1;
   1.134 +  s4 = sinpi_3_9 * x2;
   1.135 +  s5 = sinpi_4_9 * x3;
   1.136 +  s6 = sinpi_2_9 * x3;
   1.137 +  s7 = x0 + x1 - x3;
   1.138 +
   1.139 +  x0 = s0 + s2 + s5;
   1.140 +  x1 = sinpi_3_9 * s7;
   1.141 +  x2 = s1 - s3 + s6;
   1.142 +  x3 = s4;
   1.143 +
   1.144 +  s0 = x0 + x3;
   1.145 +  s1 = x1;
   1.146 +  s2 = x2 - x3;
   1.147 +  s3 = x2 - x0 + x3;
   1.148 +
   1.149 +  // 1-D transform scaling factor is sqrt(2).
   1.150 +  output[0] = fdct_round_shift(s0);
   1.151 +  output[1] = fdct_round_shift(s1);
   1.152 +  output[2] = fdct_round_shift(s2);
   1.153 +  output[3] = fdct_round_shift(s3);
   1.154 +}
   1.155 +
   1.156 +static const transform_2d FHT_4[] = {
   1.157 +  { fdct4,  fdct4  },  // DCT_DCT  = 0
   1.158 +  { fadst4, fdct4  },  // ADST_DCT = 1
   1.159 +  { fdct4,  fadst4 },  // DCT_ADST = 2
   1.160 +  { fadst4, fadst4 }   // ADST_ADST = 3
   1.161 +};
   1.162 +
   1.163 +void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
   1.164 +                        int stride, int tx_type) {
   1.165 +  int16_t out[4 * 4];
   1.166 +  int16_t *outptr = &out[0];
   1.167 +  int i, j;
   1.168 +  int16_t temp_in[4], temp_out[4];
   1.169 +  const transform_2d ht = FHT_4[tx_type];
   1.170 +
   1.171 +  // Columns
   1.172 +  for (i = 0; i < 4; ++i) {
   1.173 +    for (j = 0; j < 4; ++j)
   1.174 +      temp_in[j] = input[j * stride + i] * 16;
   1.175 +    if (i == 0 && temp_in[0])
   1.176 +      temp_in[0] += 1;
   1.177 +    ht.cols(temp_in, temp_out);
   1.178 +    for (j = 0; j < 4; ++j)
   1.179 +      outptr[j * 4 + i] = temp_out[j];
   1.180 +  }
   1.181 +
   1.182 +  // Rows
   1.183 +  for (i = 0; i < 4; ++i) {
   1.184 +    for (j = 0; j < 4; ++j)
   1.185 +      temp_in[j] = out[j + i * 4];
   1.186 +    ht.rows(temp_in, temp_out);
   1.187 +    for (j = 0; j < 4; ++j)
   1.188 +      output[j + i * 4] = (temp_out[j] + 1) >> 2;
   1.189 +  }
   1.190 +}
   1.191 +
   1.192 +static void fdct8(const int16_t *input, int16_t *output) {
   1.193 +  /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
   1.194 +  /*needs32*/ int t0, t1, t2, t3;
   1.195 +  /*canbe16*/ int x0, x1, x2, x3;
   1.196 +
   1.197 +  // stage 1
   1.198 +  s0 = input[0] + input[7];
   1.199 +  s1 = input[1] + input[6];
   1.200 +  s2 = input[2] + input[5];
   1.201 +  s3 = input[3] + input[4];
   1.202 +  s4 = input[3] - input[4];
   1.203 +  s5 = input[2] - input[5];
   1.204 +  s6 = input[1] - input[6];
   1.205 +  s7 = input[0] - input[7];
   1.206 +
   1.207 +  // fdct4(step, step);
   1.208 +  x0 = s0 + s3;
   1.209 +  x1 = s1 + s2;
   1.210 +  x2 = s1 - s2;
   1.211 +  x3 = s0 - s3;
   1.212 +  t0 = (x0 + x1) * cospi_16_64;
   1.213 +  t1 = (x0 - x1) * cospi_16_64;
   1.214 +  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
   1.215 +  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
   1.216 +  output[0] = fdct_round_shift(t0);
   1.217 +  output[2] = fdct_round_shift(t2);
   1.218 +  output[4] = fdct_round_shift(t1);
   1.219 +  output[6] = fdct_round_shift(t3);
   1.220 +
   1.221 +  // Stage 2
   1.222 +  t0 = (s6 - s5) * cospi_16_64;
   1.223 +  t1 = (s6 + s5) * cospi_16_64;
   1.224 +  t2 = fdct_round_shift(t0);
   1.225 +  t3 = fdct_round_shift(t1);
   1.226 +
   1.227 +  // Stage 3
   1.228 +  x0 = s4 + t2;
   1.229 +  x1 = s4 - t2;
   1.230 +  x2 = s7 - t3;
   1.231 +  x3 = s7 + t3;
   1.232 +
   1.233 +  // Stage 4
   1.234 +  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
   1.235 +  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
   1.236 +  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
   1.237 +  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
   1.238 +  output[1] = fdct_round_shift(t0);
   1.239 +  output[3] = fdct_round_shift(t2);
   1.240 +  output[5] = fdct_round_shift(t1);
   1.241 +  output[7] = fdct_round_shift(t3);
   1.242 +}
   1.243 +
   1.244 +void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
   1.245 +  int i, j;
   1.246 +  int16_t intermediate[64];
   1.247 +
   1.248 +  // Transform columns
   1.249 +  {
   1.250 +    int16_t *output = intermediate;
   1.251 +    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
   1.252 +    /*needs32*/ int t0, t1, t2, t3;
   1.253 +    /*canbe16*/ int x0, x1, x2, x3;
   1.254 +
   1.255 +    int i;
   1.256 +    for (i = 0; i < 8; i++) {
   1.257 +      // stage 1
   1.258 +      s0 = (input[0 * stride] + input[7 * stride]) * 4;
   1.259 +      s1 = (input[1 * stride] + input[6 * stride]) * 4;
   1.260 +      s2 = (input[2 * stride] + input[5 * stride]) * 4;
   1.261 +      s3 = (input[3 * stride] + input[4 * stride]) * 4;
   1.262 +      s4 = (input[3 * stride] - input[4 * stride]) * 4;
   1.263 +      s5 = (input[2 * stride] - input[5 * stride]) * 4;
   1.264 +      s6 = (input[1 * stride] - input[6 * stride]) * 4;
   1.265 +      s7 = (input[0 * stride] - input[7 * stride]) * 4;
   1.266 +
   1.267 +      // fdct4(step, step);
   1.268 +      x0 = s0 + s3;
   1.269 +      x1 = s1 + s2;
   1.270 +      x2 = s1 - s2;
   1.271 +      x3 = s0 - s3;
   1.272 +      t0 = (x0 + x1) * cospi_16_64;
   1.273 +      t1 = (x0 - x1) * cospi_16_64;
   1.274 +      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
   1.275 +      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
   1.276 +      output[0 * 8] = fdct_round_shift(t0);
   1.277 +      output[2 * 8] = fdct_round_shift(t2);
   1.278 +      output[4 * 8] = fdct_round_shift(t1);
   1.279 +      output[6 * 8] = fdct_round_shift(t3);
   1.280 +
   1.281 +      // Stage 2
   1.282 +      t0 = (s6 - s5) * cospi_16_64;
   1.283 +      t1 = (s6 + s5) * cospi_16_64;
   1.284 +      t2 = fdct_round_shift(t0);
   1.285 +      t3 = fdct_round_shift(t1);
   1.286 +
   1.287 +      // Stage 3
   1.288 +      x0 = s4 + t2;
   1.289 +      x1 = s4 - t2;
   1.290 +      x2 = s7 - t3;
   1.291 +      x3 = s7 + t3;
   1.292 +
   1.293 +      // Stage 4
   1.294 +      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
   1.295 +      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
   1.296 +      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
   1.297 +      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
   1.298 +      output[1 * 8] = fdct_round_shift(t0);
   1.299 +      output[3 * 8] = fdct_round_shift(t2);
   1.300 +      output[5 * 8] = fdct_round_shift(t1);
   1.301 +      output[7 * 8] = fdct_round_shift(t3);
   1.302 +      input++;
   1.303 +      output++;
   1.304 +    }
   1.305 +  }
   1.306 +
   1.307 +  // Rows
   1.308 +  for (i = 0; i < 8; ++i) {
   1.309 +    fdct8(&intermediate[i * 8], &final_output[i * 8]);
   1.310 +    for (j = 0; j < 8; ++j)
   1.311 +      final_output[j + i * 8] /= 2;
   1.312 +  }
   1.313 +}
   1.314 +
   1.315 +void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
   1.316 +  // The 2D transform is done with two passes which are actually pretty
   1.317 +  // similar. In the first one, we transform the columns and transpose
   1.318 +  // the results. In the second one, we transform the rows. To achieve that,
   1.319 +  // as the first pass results are transposed, we tranpose the columns (that
   1.320 +  // is the transposed rows) and transpose the results (so that it goes back
   1.321 +  // in normal/row positions).
   1.322 +  int pass;
   1.323 +  // We need an intermediate buffer between passes.
   1.324 +  int16_t intermediate[256];
   1.325 +  const int16_t *in = input;
   1.326 +  int16_t *out = intermediate;
   1.327 +  // Do the two transform/transpose passes
   1.328 +  for (pass = 0; pass < 2; ++pass) {
   1.329 +    /*canbe16*/ int step1[8];
   1.330 +    /*canbe16*/ int step2[8];
   1.331 +    /*canbe16*/ int step3[8];
   1.332 +    /*canbe16*/ int input[8];
   1.333 +    /*needs32*/ int temp1, temp2;
   1.334 +    int i;
   1.335 +    for (i = 0; i < 16; i++) {
   1.336 +      if (0 == pass) {
   1.337 +        // Calculate input for the first 8 results.
   1.338 +        input[0] = (in[0 * stride] + in[15 * stride]) * 4;
   1.339 +        input[1] = (in[1 * stride] + in[14 * stride]) * 4;
   1.340 +        input[2] = (in[2 * stride] + in[13 * stride]) * 4;
   1.341 +        input[3] = (in[3 * stride] + in[12 * stride]) * 4;
   1.342 +        input[4] = (in[4 * stride] + in[11 * stride]) * 4;
   1.343 +        input[5] = (in[5 * stride] + in[10 * stride]) * 4;
   1.344 +        input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
   1.345 +        input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
   1.346 +        // Calculate input for the next 8 results.
   1.347 +        step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
   1.348 +        step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
   1.349 +        step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
   1.350 +        step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
   1.351 +        step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
   1.352 +        step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
   1.353 +        step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
   1.354 +        step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
   1.355 +      } else {
   1.356 +        // Calculate input for the first 8 results.
   1.357 +        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
   1.358 +        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
   1.359 +        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
   1.360 +        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
   1.361 +        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
   1.362 +        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
   1.363 +        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
   1.364 +        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
   1.365 +        // Calculate input for the next 8 results.
   1.366 +        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
   1.367 +        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
   1.368 +        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
   1.369 +        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
   1.370 +        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
   1.371 +        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
   1.372 +        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
   1.373 +        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
   1.374 +      }
   1.375 +      // Work on the first eight values; fdct8(input, even_results);
   1.376 +      {
   1.377 +        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
   1.378 +        /*needs32*/ int t0, t1, t2, t3;
   1.379 +        /*canbe16*/ int x0, x1, x2, x3;
   1.380 +
   1.381 +        // stage 1
   1.382 +        s0 = input[0] + input[7];
   1.383 +        s1 = input[1] + input[6];
   1.384 +        s2 = input[2] + input[5];
   1.385 +        s3 = input[3] + input[4];
   1.386 +        s4 = input[3] - input[4];
   1.387 +        s5 = input[2] - input[5];
   1.388 +        s6 = input[1] - input[6];
   1.389 +        s7 = input[0] - input[7];
   1.390 +
   1.391 +        // fdct4(step, step);
   1.392 +        x0 = s0 + s3;
   1.393 +        x1 = s1 + s2;
   1.394 +        x2 = s1 - s2;
   1.395 +        x3 = s0 - s3;
   1.396 +        t0 = (x0 + x1) * cospi_16_64;
   1.397 +        t1 = (x0 - x1) * cospi_16_64;
   1.398 +        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
   1.399 +        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
   1.400 +        out[0] = fdct_round_shift(t0);
   1.401 +        out[4] = fdct_round_shift(t2);
   1.402 +        out[8] = fdct_round_shift(t1);
   1.403 +        out[12] = fdct_round_shift(t3);
   1.404 +
   1.405 +        // Stage 2
   1.406 +        t0 = (s6 - s5) * cospi_16_64;
   1.407 +        t1 = (s6 + s5) * cospi_16_64;
   1.408 +        t2 = fdct_round_shift(t0);
   1.409 +        t3 = fdct_round_shift(t1);
   1.410 +
   1.411 +        // Stage 3
   1.412 +        x0 = s4 + t2;
   1.413 +        x1 = s4 - t2;
   1.414 +        x2 = s7 - t3;
   1.415 +        x3 = s7 + t3;
   1.416 +
   1.417 +        // Stage 4
   1.418 +        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
   1.419 +        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
   1.420 +        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
   1.421 +        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
   1.422 +        out[2] = fdct_round_shift(t0);
   1.423 +        out[6] = fdct_round_shift(t2);
   1.424 +        out[10] = fdct_round_shift(t1);
   1.425 +        out[14] = fdct_round_shift(t3);
   1.426 +      }
   1.427 +      // Work on the next eight values; step1 -> odd_results
   1.428 +      {
   1.429 +        // step 2
   1.430 +        temp1 = (step1[5] - step1[2]) * cospi_16_64;
   1.431 +        temp2 = (step1[4] - step1[3]) * cospi_16_64;
   1.432 +        step2[2] = fdct_round_shift(temp1);
   1.433 +        step2[3] = fdct_round_shift(temp2);
   1.434 +        temp1 = (step1[4] + step1[3]) * cospi_16_64;
   1.435 +        temp2 = (step1[5] + step1[2]) * cospi_16_64;
   1.436 +        step2[4] = fdct_round_shift(temp1);
   1.437 +        step2[5] = fdct_round_shift(temp2);
   1.438 +        // step 3
   1.439 +        step3[0] = step1[0] + step2[3];
   1.440 +        step3[1] = step1[1] + step2[2];
   1.441 +        step3[2] = step1[1] - step2[2];
   1.442 +        step3[3] = step1[0] - step2[3];
   1.443 +        step3[4] = step1[7] - step2[4];
   1.444 +        step3[5] = step1[6] - step2[5];
   1.445 +        step3[6] = step1[6] + step2[5];
   1.446 +        step3[7] = step1[7] + step2[4];
   1.447 +        // step 4
   1.448 +        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
   1.449 +        temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
   1.450 +        step2[1] = fdct_round_shift(temp1);
   1.451 +        step2[2] = fdct_round_shift(temp2);
   1.452 +        temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
   1.453 +        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
   1.454 +        step2[5] = fdct_round_shift(temp1);
   1.455 +        step2[6] = fdct_round_shift(temp2);
   1.456 +        // step 5
   1.457 +        step1[0] = step3[0] + step2[1];
   1.458 +        step1[1] = step3[0] - step2[1];
   1.459 +        step1[2] = step3[3] - step2[2];
   1.460 +        step1[3] = step3[3] + step2[2];
   1.461 +        step1[4] = step3[4] + step2[5];
   1.462 +        step1[5] = step3[4] - step2[5];
   1.463 +        step1[6] = step3[7] - step2[6];
   1.464 +        step1[7] = step3[7] + step2[6];
   1.465 +        // step 6
   1.466 +        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
   1.467 +        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
   1.468 +        out[1] = fdct_round_shift(temp1);
   1.469 +        out[9] = fdct_round_shift(temp2);
   1.470 +        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
   1.471 +        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
   1.472 +        out[5] = fdct_round_shift(temp1);
   1.473 +        out[13] = fdct_round_shift(temp2);
   1.474 +        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
   1.475 +        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
   1.476 +        out[3] = fdct_round_shift(temp1);
   1.477 +        out[11] = fdct_round_shift(temp2);
   1.478 +        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
   1.479 +        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
   1.480 +        out[7] = fdct_round_shift(temp1);
   1.481 +        out[15] = fdct_round_shift(temp2);
   1.482 +      }
   1.483 +      // Do next column (which is a transposed row in second/horizontal pass)
   1.484 +      in++;
   1.485 +      out += 16;
   1.486 +    }
   1.487 +    // Setup in/out for next pass.
   1.488 +    in = intermediate;
   1.489 +    out = output;
   1.490 +  }
   1.491 +}
   1.492 +
   1.493 +static void fadst8(const int16_t *input, int16_t *output) {
   1.494 +  int s0, s1, s2, s3, s4, s5, s6, s7;
   1.495 +
   1.496 +  int x0 = input[7];
   1.497 +  int x1 = input[0];
   1.498 +  int x2 = input[5];
   1.499 +  int x3 = input[2];
   1.500 +  int x4 = input[3];
   1.501 +  int x5 = input[4];
   1.502 +  int x6 = input[1];
   1.503 +  int x7 = input[6];
   1.504 +
   1.505 +  // stage 1
   1.506 +  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
   1.507 +  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
   1.508 +  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
   1.509 +  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
   1.510 +  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
   1.511 +  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
   1.512 +  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   1.513 +  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
   1.514 +
   1.515 +  x0 = fdct_round_shift(s0 + s4);
   1.516 +  x1 = fdct_round_shift(s1 + s5);
   1.517 +  x2 = fdct_round_shift(s2 + s6);
   1.518 +  x3 = fdct_round_shift(s3 + s7);
   1.519 +  x4 = fdct_round_shift(s0 - s4);
   1.520 +  x5 = fdct_round_shift(s1 - s5);
   1.521 +  x6 = fdct_round_shift(s2 - s6);
   1.522 +  x7 = fdct_round_shift(s3 - s7);
   1.523 +
   1.524 +  // stage 2
   1.525 +  s0 = x0;
   1.526 +  s1 = x1;
   1.527 +  s2 = x2;
   1.528 +  s3 = x3;
   1.529 +  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
   1.530 +  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
   1.531 +  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
   1.532 +  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
   1.533 +
   1.534 +  x0 = s0 + s2;
   1.535 +  x1 = s1 + s3;
   1.536 +  x2 = s0 - s2;
   1.537 +  x3 = s1 - s3;
   1.538 +  x4 = fdct_round_shift(s4 + s6);
   1.539 +  x5 = fdct_round_shift(s5 + s7);
   1.540 +  x6 = fdct_round_shift(s4 - s6);
   1.541 +  x7 = fdct_round_shift(s5 - s7);
   1.542 +
   1.543 +  // stage 3
   1.544 +  s2 = cospi_16_64 * (x2 + x3);
   1.545 +  s3 = cospi_16_64 * (x2 - x3);
   1.546 +  s6 = cospi_16_64 * (x6 + x7);
   1.547 +  s7 = cospi_16_64 * (x6 - x7);
   1.548 +
   1.549 +  x2 = fdct_round_shift(s2);
   1.550 +  x3 = fdct_round_shift(s3);
   1.551 +  x6 = fdct_round_shift(s6);
   1.552 +  x7 = fdct_round_shift(s7);
   1.553 +
   1.554 +  output[0] =   x0;
   1.555 +  output[1] = - x4;
   1.556 +  output[2] =   x6;
   1.557 +  output[3] = - x2;
   1.558 +  output[4] =   x3;
   1.559 +  output[5] = - x7;
   1.560 +  output[6] =   x5;
   1.561 +  output[7] = - x1;
   1.562 +}
   1.563 +
   1.564 +static const transform_2d FHT_8[] = {
   1.565 +  { fdct8,  fdct8  },  // DCT_DCT  = 0
   1.566 +  { fadst8, fdct8  },  // ADST_DCT = 1
   1.567 +  { fdct8,  fadst8 },  // DCT_ADST = 2
   1.568 +  { fadst8, fadst8 }   // ADST_ADST = 3
   1.569 +};
   1.570 +
   1.571 +void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
   1.572 +                        int stride, int tx_type) {
   1.573 +  int16_t out[64];
   1.574 +  int16_t *outptr = &out[0];
   1.575 +  int i, j;
   1.576 +  int16_t temp_in[8], temp_out[8];
   1.577 +  const transform_2d ht = FHT_8[tx_type];
   1.578 +
   1.579 +  // Columns
   1.580 +  for (i = 0; i < 8; ++i) {
   1.581 +    for (j = 0; j < 8; ++j)
   1.582 +      temp_in[j] = input[j * stride + i] * 4;
   1.583 +    ht.cols(temp_in, temp_out);
   1.584 +    for (j = 0; j < 8; ++j)
   1.585 +      outptr[j * 8 + i] = temp_out[j];
   1.586 +  }
   1.587 +
   1.588 +  // Rows
   1.589 +  for (i = 0; i < 8; ++i) {
   1.590 +    for (j = 0; j < 8; ++j)
   1.591 +      temp_in[j] = out[j + i * 8];
   1.592 +    ht.rows(temp_in, temp_out);
   1.593 +    for (j = 0; j < 8; ++j)
   1.594 +      output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
   1.595 +  }
   1.596 +}
   1.597 +
   1.598 +/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
   1.599 +   pixel. */
   1.600 +void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
   1.601 +  int i;
   1.602 +  int a1, b1, c1, d1, e1;
   1.603 +  const int16_t *ip = input;
   1.604 +  int16_t *op = output;
   1.605 +
   1.606 +  for (i = 0; i < 4; i++) {
   1.607 +    a1 = ip[0 * stride];
   1.608 +    b1 = ip[1 * stride];
   1.609 +    c1 = ip[2 * stride];
   1.610 +    d1 = ip[3 * stride];
   1.611 +
   1.612 +    a1 += b1;
   1.613 +    d1 = d1 - c1;
   1.614 +    e1 = (a1 - d1) >> 1;
   1.615 +    b1 = e1 - b1;
   1.616 +    c1 = e1 - c1;
   1.617 +    a1 -= c1;
   1.618 +    d1 += b1;
   1.619 +    op[0] = a1;
   1.620 +    op[4] = c1;
   1.621 +    op[8] = d1;
   1.622 +    op[12] = b1;
   1.623 +
   1.624 +    ip++;
   1.625 +    op++;
   1.626 +  }
   1.627 +  ip = output;
   1.628 +  op = output;
   1.629 +
   1.630 +  for (i = 0; i < 4; i++) {
   1.631 +    a1 = ip[0];
   1.632 +    b1 = ip[1];
   1.633 +    c1 = ip[2];
   1.634 +    d1 = ip[3];
   1.635 +
   1.636 +    a1 += b1;
   1.637 +    d1 -= c1;
   1.638 +    e1 = (a1 - d1) >> 1;
   1.639 +    b1 = e1 - b1;
   1.640 +    c1 = e1 - c1;
   1.641 +    a1 -= c1;
   1.642 +    d1 += b1;
   1.643 +    op[0] = a1 * UNIT_QUANT_FACTOR;
   1.644 +    op[1] = c1 * UNIT_QUANT_FACTOR;
   1.645 +    op[2] = d1 * UNIT_QUANT_FACTOR;
   1.646 +    op[3] = b1 * UNIT_QUANT_FACTOR;
   1.647 +
   1.648 +    ip += 4;
   1.649 +    op += 4;
   1.650 +  }
   1.651 +}
   1.652 +
   1.653 +// Rewrote to use same algorithm as others.
   1.654 +static void fdct16(const int16_t in[16], int16_t out[16]) {
   1.655 +  /*canbe16*/ int step1[8];
   1.656 +  /*canbe16*/ int step2[8];
   1.657 +  /*canbe16*/ int step3[8];
   1.658 +  /*canbe16*/ int input[8];
   1.659 +  /*needs32*/ int temp1, temp2;
   1.660 +
   1.661 +  // step 1
   1.662 +  input[0] = in[0] + in[15];
   1.663 +  input[1] = in[1] + in[14];
   1.664 +  input[2] = in[2] + in[13];
   1.665 +  input[3] = in[3] + in[12];
   1.666 +  input[4] = in[4] + in[11];
   1.667 +  input[5] = in[5] + in[10];
   1.668 +  input[6] = in[6] + in[ 9];
   1.669 +  input[7] = in[7] + in[ 8];
   1.670 +
   1.671 +  step1[0] = in[7] - in[ 8];
   1.672 +  step1[1] = in[6] - in[ 9];
   1.673 +  step1[2] = in[5] - in[10];
   1.674 +  step1[3] = in[4] - in[11];
   1.675 +  step1[4] = in[3] - in[12];
   1.676 +  step1[5] = in[2] - in[13];
   1.677 +  step1[6] = in[1] - in[14];
   1.678 +  step1[7] = in[0] - in[15];
   1.679 +
   1.680 +  // fdct8(step, step);
   1.681 +  {
   1.682 +    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
   1.683 +    /*needs32*/ int t0, t1, t2, t3;
   1.684 +    /*canbe16*/ int x0, x1, x2, x3;
   1.685 +
   1.686 +    // stage 1
   1.687 +    s0 = input[0] + input[7];
   1.688 +    s1 = input[1] + input[6];
   1.689 +    s2 = input[2] + input[5];
   1.690 +    s3 = input[3] + input[4];
   1.691 +    s4 = input[3] - input[4];
   1.692 +    s5 = input[2] - input[5];
   1.693 +    s6 = input[1] - input[6];
   1.694 +    s7 = input[0] - input[7];
   1.695 +
   1.696 +    // fdct4(step, step);
   1.697 +    x0 = s0 + s3;
   1.698 +    x1 = s1 + s2;
   1.699 +    x2 = s1 - s2;
   1.700 +    x3 = s0 - s3;
   1.701 +    t0 = (x0 + x1) * cospi_16_64;
   1.702 +    t1 = (x0 - x1) * cospi_16_64;
   1.703 +    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
   1.704 +    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
   1.705 +    out[0] = fdct_round_shift(t0);
   1.706 +    out[4] = fdct_round_shift(t2);
   1.707 +    out[8] = fdct_round_shift(t1);
   1.708 +    out[12] = fdct_round_shift(t3);
   1.709 +
   1.710 +    // Stage 2
   1.711 +    t0 = (s6 - s5) * cospi_16_64;
   1.712 +    t1 = (s6 + s5) * cospi_16_64;
   1.713 +    t2 = fdct_round_shift(t0);
   1.714 +    t3 = fdct_round_shift(t1);
   1.715 +
   1.716 +    // Stage 3
   1.717 +    x0 = s4 + t2;
   1.718 +    x1 = s4 - t2;
   1.719 +    x2 = s7 - t3;
   1.720 +    x3 = s7 + t3;
   1.721 +
   1.722 +    // Stage 4
   1.723 +    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
   1.724 +    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
   1.725 +    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
   1.726 +    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
   1.727 +    out[2] = fdct_round_shift(t0);
   1.728 +    out[6] = fdct_round_shift(t2);
   1.729 +    out[10] = fdct_round_shift(t1);
   1.730 +    out[14] = fdct_round_shift(t3);
   1.731 +  }
   1.732 +
   1.733 +  // step 2
   1.734 +  temp1 = (step1[5] - step1[2]) * cospi_16_64;
   1.735 +  temp2 = (step1[4] - step1[3]) * cospi_16_64;
   1.736 +  step2[2] = fdct_round_shift(temp1);
   1.737 +  step2[3] = fdct_round_shift(temp2);
   1.738 +  temp1 = (step1[4] + step1[3]) * cospi_16_64;
   1.739 +  temp2 = (step1[5] + step1[2]) * cospi_16_64;
   1.740 +  step2[4] = fdct_round_shift(temp1);
   1.741 +  step2[5] = fdct_round_shift(temp2);
   1.742 +
   1.743 +  // step 3
   1.744 +  step3[0] = step1[0] + step2[3];
   1.745 +  step3[1] = step1[1] + step2[2];
   1.746 +  step3[2] = step1[1] - step2[2];
   1.747 +  step3[3] = step1[0] - step2[3];
   1.748 +  step3[4] = step1[7] - step2[4];
   1.749 +  step3[5] = step1[6] - step2[5];
   1.750 +  step3[6] = step1[6] + step2[5];
   1.751 +  step3[7] = step1[7] + step2[4];
   1.752 +
   1.753 +  // step 4
   1.754 +  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
   1.755 +  temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
   1.756 +  step2[1] = fdct_round_shift(temp1);
   1.757 +  step2[2] = fdct_round_shift(temp2);
   1.758 +  temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
   1.759 +  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
   1.760 +  step2[5] = fdct_round_shift(temp1);
   1.761 +  step2[6] = fdct_round_shift(temp2);
   1.762 +
   1.763 +  // step 5
   1.764 +  step1[0] = step3[0] + step2[1];
   1.765 +  step1[1] = step3[0] - step2[1];
   1.766 +  step1[2] = step3[3] - step2[2];
   1.767 +  step1[3] = step3[3] + step2[2];
   1.768 +  step1[4] = step3[4] + step2[5];
   1.769 +  step1[5] = step3[4] - step2[5];
   1.770 +  step1[6] = step3[7] - step2[6];
   1.771 +  step1[7] = step3[7] + step2[6];
   1.772 +
   1.773 +  // step 6
   1.774 +  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
   1.775 +  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
   1.776 +  out[1] = fdct_round_shift(temp1);
   1.777 +  out[9] = fdct_round_shift(temp2);
   1.778 +
   1.779 +  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
   1.780 +  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
   1.781 +  out[5] = fdct_round_shift(temp1);
   1.782 +  out[13] = fdct_round_shift(temp2);
   1.783 +
   1.784 +  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
   1.785 +  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
   1.786 +  out[3] = fdct_round_shift(temp1);
   1.787 +  out[11] = fdct_round_shift(temp2);
   1.788 +
   1.789 +  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
   1.790 +  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
   1.791 +  out[7] = fdct_round_shift(temp1);
   1.792 +  out[15] = fdct_round_shift(temp2);
   1.793 +}
   1.794 +
   1.795 +static void fadst16(const int16_t *input, int16_t *output) {
   1.796 +  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
   1.797 +
   1.798 +  int x0 = input[15];
   1.799 +  int x1 = input[0];
   1.800 +  int x2 = input[13];
   1.801 +  int x3 = input[2];
   1.802 +  int x4 = input[11];
   1.803 +  int x5 = input[4];
   1.804 +  int x6 = input[9];
   1.805 +  int x7 = input[6];
   1.806 +  int x8 = input[7];
   1.807 +  int x9 = input[8];
   1.808 +  int x10 = input[5];
   1.809 +  int x11 = input[10];
   1.810 +  int x12 = input[3];
   1.811 +  int x13 = input[12];
   1.812 +  int x14 = input[1];
   1.813 +  int x15 = input[14];
   1.814 +
   1.815 +  // stage 1
   1.816 +  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
   1.817 +  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
   1.818 +  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
   1.819 +  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
   1.820 +  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
   1.821 +  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
   1.822 +  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
   1.823 +  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
   1.824 +  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
   1.825 +  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
   1.826 +  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
   1.827 +  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
   1.828 +  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
   1.829 +  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
   1.830 +  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   1.831 +  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
   1.832 +
   1.833 +  x0 = fdct_round_shift(s0 + s8);
   1.834 +  x1 = fdct_round_shift(s1 + s9);
   1.835 +  x2 = fdct_round_shift(s2 + s10);
   1.836 +  x3 = fdct_round_shift(s3 + s11);
   1.837 +  x4 = fdct_round_shift(s4 + s12);
   1.838 +  x5 = fdct_round_shift(s5 + s13);
   1.839 +  x6 = fdct_round_shift(s6 + s14);
   1.840 +  x7 = fdct_round_shift(s7 + s15);
   1.841 +  x8  = fdct_round_shift(s0 - s8);
   1.842 +  x9  = fdct_round_shift(s1 - s9);
   1.843 +  x10 = fdct_round_shift(s2 - s10);
   1.844 +  x11 = fdct_round_shift(s3 - s11);
   1.845 +  x12 = fdct_round_shift(s4 - s12);
   1.846 +  x13 = fdct_round_shift(s5 - s13);
   1.847 +  x14 = fdct_round_shift(s6 - s14);
   1.848 +  x15 = fdct_round_shift(s7 - s15);
   1.849 +
   1.850 +  // stage 2
   1.851 +  s0 = x0;
   1.852 +  s1 = x1;
   1.853 +  s2 = x2;
   1.854 +  s3 = x3;
   1.855 +  s4 = x4;
   1.856 +  s5 = x5;
   1.857 +  s6 = x6;
   1.858 +  s7 = x7;
   1.859 +  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
   1.860 +  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
   1.861 +  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
   1.862 +  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
   1.863 +  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
   1.864 +  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
   1.865 +  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
   1.866 +  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
   1.867 +
   1.868 +  x0 = s0 + s4;
   1.869 +  x1 = s1 + s5;
   1.870 +  x2 = s2 + s6;
   1.871 +  x3 = s3 + s7;
   1.872 +  x4 = s0 - s4;
   1.873 +  x5 = s1 - s5;
   1.874 +  x6 = s2 - s6;
   1.875 +  x7 = s3 - s7;
   1.876 +  x8 = fdct_round_shift(s8 + s12);
   1.877 +  x9 = fdct_round_shift(s9 + s13);
   1.878 +  x10 = fdct_round_shift(s10 + s14);
   1.879 +  x11 = fdct_round_shift(s11 + s15);
   1.880 +  x12 = fdct_round_shift(s8 - s12);
   1.881 +  x13 = fdct_round_shift(s9 - s13);
   1.882 +  x14 = fdct_round_shift(s10 - s14);
   1.883 +  x15 = fdct_round_shift(s11 - s15);
   1.884 +
   1.885 +  // stage 3
   1.886 +  s0 = x0;
   1.887 +  s1 = x1;
   1.888 +  s2 = x2;
   1.889 +  s3 = x3;
   1.890 +  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
   1.891 +  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
   1.892 +  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
   1.893 +  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
   1.894 +  s8 = x8;
   1.895 +  s9 = x9;
   1.896 +  s10 = x10;
   1.897 +  s11 = x11;
   1.898 +  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
   1.899 +  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
   1.900 +  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
   1.901 +  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
   1.902 +
   1.903 +  x0 = s0 + s2;
   1.904 +  x1 = s1 + s3;
   1.905 +  x2 = s0 - s2;
   1.906 +  x3 = s1 - s3;
   1.907 +  x4 = fdct_round_shift(s4 + s6);
   1.908 +  x5 = fdct_round_shift(s5 + s7);
   1.909 +  x6 = fdct_round_shift(s4 - s6);
   1.910 +  x7 = fdct_round_shift(s5 - s7);
   1.911 +  x8 = s8 + s10;
   1.912 +  x9 = s9 + s11;
   1.913 +  x10 = s8 - s10;
   1.914 +  x11 = s9 - s11;
   1.915 +  x12 = fdct_round_shift(s12 + s14);
   1.916 +  x13 = fdct_round_shift(s13 + s15);
   1.917 +  x14 = fdct_round_shift(s12 - s14);
   1.918 +  x15 = fdct_round_shift(s13 - s15);
   1.919 +
   1.920 +  // stage 4
   1.921 +  s2 = (- cospi_16_64) * (x2 + x3);
   1.922 +  s3 = cospi_16_64 * (x2 - x3);
   1.923 +  s6 = cospi_16_64 * (x6 + x7);
   1.924 +  s7 = cospi_16_64 * (- x6 + x7);
   1.925 +  s10 = cospi_16_64 * (x10 + x11);
   1.926 +  s11 = cospi_16_64 * (- x10 + x11);
   1.927 +  s14 = (- cospi_16_64) * (x14 + x15);
   1.928 +  s15 = cospi_16_64 * (x14 - x15);
   1.929 +
   1.930 +  x2 = fdct_round_shift(s2);
   1.931 +  x3 = fdct_round_shift(s3);
   1.932 +  x6 = fdct_round_shift(s6);
   1.933 +  x7 = fdct_round_shift(s7);
   1.934 +  x10 = fdct_round_shift(s10);
   1.935 +  x11 = fdct_round_shift(s11);
   1.936 +  x14 = fdct_round_shift(s14);
   1.937 +  x15 = fdct_round_shift(s15);
   1.938 +
   1.939 +  output[0] = x0;
   1.940 +  output[1] = - x8;
   1.941 +  output[2] = x12;
   1.942 +  output[3] = - x4;
   1.943 +  output[4] = x6;
   1.944 +  output[5] = x14;
   1.945 +  output[6] = x10;
   1.946 +  output[7] = x2;
   1.947 +  output[8] = x3;
   1.948 +  output[9] =  x11;
   1.949 +  output[10] = x15;
   1.950 +  output[11] = x7;
   1.951 +  output[12] = x5;
   1.952 +  output[13] = - x13;
   1.953 +  output[14] = x9;
   1.954 +  output[15] = - x1;
   1.955 +}
   1.956 +
   1.957 +static const transform_2d FHT_16[] = {
   1.958 +  { fdct16,  fdct16  },  // DCT_DCT  = 0
   1.959 +  { fadst16, fdct16  },  // ADST_DCT = 1
   1.960 +  { fdct16,  fadst16 },  // DCT_ADST = 2
   1.961 +  { fadst16, fadst16 }   // ADST_ADST = 3
   1.962 +};
   1.963 +
   1.964 +void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
   1.965 +                          int stride, int tx_type) {
   1.966 +  int16_t out[256];
   1.967 +  int16_t *outptr = &out[0];
   1.968 +  int i, j;
   1.969 +  int16_t temp_in[16], temp_out[16];
   1.970 +  const transform_2d ht = FHT_16[tx_type];
   1.971 +
   1.972 +  // Columns
   1.973 +  for (i = 0; i < 16; ++i) {
   1.974 +    for (j = 0; j < 16; ++j)
   1.975 +      temp_in[j] = input[j * stride + i] * 4;
   1.976 +    ht.cols(temp_in, temp_out);
   1.977 +    for (j = 0; j < 16; ++j)
   1.978 +      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   1.979 +//      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   1.980 +  }
   1.981 +
   1.982 +  // Rows
   1.983 +  for (i = 0; i < 16; ++i) {
   1.984 +    for (j = 0; j < 16; ++j)
   1.985 +      temp_in[j] = out[j + i * 16];
   1.986 +    ht.rows(temp_in, temp_out);
   1.987 +    for (j = 0; j < 16; ++j)
   1.988 +      output[j + i * 16] = temp_out[j];
   1.989 +  }
   1.990 +}
   1.991 +
   1.992 +static INLINE int dct_32_round(int input) {
   1.993 +  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
   1.994 +  assert(-131072 <= rv && rv <= 131071);
   1.995 +  return rv;
   1.996 +}
   1.997 +
   1.998 +static INLINE int half_round_shift(int input) {
   1.999 +  int rv = (input + 1 + (input < 0)) >> 2;
  1.1000 +  return rv;
  1.1001 +}
  1.1002 +
  1.1003 +static void dct32_1d(const int *input, int *output, int round) {
  1.1004 +  int step[32];
  1.1005 +  // Stage 1
  1.1006 +  step[0] = input[0] + input[(32 - 1)];
  1.1007 +  step[1] = input[1] + input[(32 - 2)];
  1.1008 +  step[2] = input[2] + input[(32 - 3)];
  1.1009 +  step[3] = input[3] + input[(32 - 4)];
  1.1010 +  step[4] = input[4] + input[(32 - 5)];
  1.1011 +  step[5] = input[5] + input[(32 - 6)];
  1.1012 +  step[6] = input[6] + input[(32 - 7)];
  1.1013 +  step[7] = input[7] + input[(32 - 8)];
  1.1014 +  step[8] = input[8] + input[(32 - 9)];
  1.1015 +  step[9] = input[9] + input[(32 - 10)];
  1.1016 +  step[10] = input[10] + input[(32 - 11)];
  1.1017 +  step[11] = input[11] + input[(32 - 12)];
  1.1018 +  step[12] = input[12] + input[(32 - 13)];
  1.1019 +  step[13] = input[13] + input[(32 - 14)];
  1.1020 +  step[14] = input[14] + input[(32 - 15)];
  1.1021 +  step[15] = input[15] + input[(32 - 16)];
  1.1022 +  step[16] = -input[16] + input[(32 - 17)];
  1.1023 +  step[17] = -input[17] + input[(32 - 18)];
  1.1024 +  step[18] = -input[18] + input[(32 - 19)];
  1.1025 +  step[19] = -input[19] + input[(32 - 20)];
  1.1026 +  step[20] = -input[20] + input[(32 - 21)];
  1.1027 +  step[21] = -input[21] + input[(32 - 22)];
  1.1028 +  step[22] = -input[22] + input[(32 - 23)];
  1.1029 +  step[23] = -input[23] + input[(32 - 24)];
  1.1030 +  step[24] = -input[24] + input[(32 - 25)];
  1.1031 +  step[25] = -input[25] + input[(32 - 26)];
  1.1032 +  step[26] = -input[26] + input[(32 - 27)];
  1.1033 +  step[27] = -input[27] + input[(32 - 28)];
  1.1034 +  step[28] = -input[28] + input[(32 - 29)];
  1.1035 +  step[29] = -input[29] + input[(32 - 30)];
  1.1036 +  step[30] = -input[30] + input[(32 - 31)];
  1.1037 +  step[31] = -input[31] + input[(32 - 32)];
  1.1038 +
  1.1039 +  // Stage 2
  1.1040 +  output[0] = step[0] + step[16 - 1];
  1.1041 +  output[1] = step[1] + step[16 - 2];
  1.1042 +  output[2] = step[2] + step[16 - 3];
  1.1043 +  output[3] = step[3] + step[16 - 4];
  1.1044 +  output[4] = step[4] + step[16 - 5];
  1.1045 +  output[5] = step[5] + step[16 - 6];
  1.1046 +  output[6] = step[6] + step[16 - 7];
  1.1047 +  output[7] = step[7] + step[16 - 8];
  1.1048 +  output[8] = -step[8] + step[16 - 9];
  1.1049 +  output[9] = -step[9] + step[16 - 10];
  1.1050 +  output[10] = -step[10] + step[16 - 11];
  1.1051 +  output[11] = -step[11] + step[16 - 12];
  1.1052 +  output[12] = -step[12] + step[16 - 13];
  1.1053 +  output[13] = -step[13] + step[16 - 14];
  1.1054 +  output[14] = -step[14] + step[16 - 15];
  1.1055 +  output[15] = -step[15] + step[16 - 16];
  1.1056 +
  1.1057 +  output[16] = step[16];
  1.1058 +  output[17] = step[17];
  1.1059 +  output[18] = step[18];
  1.1060 +  output[19] = step[19];
  1.1061 +
  1.1062 +  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
  1.1063 +  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
  1.1064 +  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
  1.1065 +  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
  1.1066 +
  1.1067 +  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
  1.1068 +  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
  1.1069 +  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
  1.1070 +  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
  1.1071 +
  1.1072 +  output[28] = step[28];
  1.1073 +  output[29] = step[29];
  1.1074 +  output[30] = step[30];
  1.1075 +  output[31] = step[31];
  1.1076 +
  1.1077 +  // dump the magnitude by 4, hence the intermediate values are within
  1.1078 +  // the range of 16 bits.
  1.1079 +  if (round) {
  1.1080 +    output[0] = half_round_shift(output[0]);
  1.1081 +    output[1] = half_round_shift(output[1]);
  1.1082 +    output[2] = half_round_shift(output[2]);
  1.1083 +    output[3] = half_round_shift(output[3]);
  1.1084 +    output[4] = half_round_shift(output[4]);
  1.1085 +    output[5] = half_round_shift(output[5]);
  1.1086 +    output[6] = half_round_shift(output[6]);
  1.1087 +    output[7] = half_round_shift(output[7]);
  1.1088 +    output[8] = half_round_shift(output[8]);
  1.1089 +    output[9] = half_round_shift(output[9]);
  1.1090 +    output[10] = half_round_shift(output[10]);
  1.1091 +    output[11] = half_round_shift(output[11]);
  1.1092 +    output[12] = half_round_shift(output[12]);
  1.1093 +    output[13] = half_round_shift(output[13]);
  1.1094 +    output[14] = half_round_shift(output[14]);
  1.1095 +    output[15] = half_round_shift(output[15]);
  1.1096 +
  1.1097 +    output[16] = half_round_shift(output[16]);
  1.1098 +    output[17] = half_round_shift(output[17]);
  1.1099 +    output[18] = half_round_shift(output[18]);
  1.1100 +    output[19] = half_round_shift(output[19]);
  1.1101 +    output[20] = half_round_shift(output[20]);
  1.1102 +    output[21] = half_round_shift(output[21]);
  1.1103 +    output[22] = half_round_shift(output[22]);
  1.1104 +    output[23] = half_round_shift(output[23]);
  1.1105 +    output[24] = half_round_shift(output[24]);
  1.1106 +    output[25] = half_round_shift(output[25]);
  1.1107 +    output[26] = half_round_shift(output[26]);
  1.1108 +    output[27] = half_round_shift(output[27]);
  1.1109 +    output[28] = half_round_shift(output[28]);
  1.1110 +    output[29] = half_round_shift(output[29]);
  1.1111 +    output[30] = half_round_shift(output[30]);
  1.1112 +    output[31] = half_round_shift(output[31]);
  1.1113 +  }
  1.1114 +
  1.1115 +  // Stage 3
  1.1116 +  step[0] = output[0] + output[(8 - 1)];
  1.1117 +  step[1] = output[1] + output[(8 - 2)];
  1.1118 +  step[2] = output[2] + output[(8 - 3)];
  1.1119 +  step[3] = output[3] + output[(8 - 4)];
  1.1120 +  step[4] = -output[4] + output[(8 - 5)];
  1.1121 +  step[5] = -output[5] + output[(8 - 6)];
  1.1122 +  step[6] = -output[6] + output[(8 - 7)];
  1.1123 +  step[7] = -output[7] + output[(8 - 8)];
  1.1124 +  step[8] = output[8];
  1.1125 +  step[9] = output[9];
  1.1126 +  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
  1.1127 +  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
  1.1128 +  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
  1.1129 +  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
  1.1130 +  step[14] = output[14];
  1.1131 +  step[15] = output[15];
  1.1132 +
  1.1133 +  step[16] = output[16] + output[23];
  1.1134 +  step[17] = output[17] + output[22];
  1.1135 +  step[18] = output[18] + output[21];
  1.1136 +  step[19] = output[19] + output[20];
  1.1137 +  step[20] = -output[20] + output[19];
  1.1138 +  step[21] = -output[21] + output[18];
  1.1139 +  step[22] = -output[22] + output[17];
  1.1140 +  step[23] = -output[23] + output[16];
  1.1141 +  step[24] = -output[24] + output[31];
  1.1142 +  step[25] = -output[25] + output[30];
  1.1143 +  step[26] = -output[26] + output[29];
  1.1144 +  step[27] = -output[27] + output[28];
  1.1145 +  step[28] = output[28] + output[27];
  1.1146 +  step[29] = output[29] + output[26];
  1.1147 +  step[30] = output[30] + output[25];
  1.1148 +  step[31] = output[31] + output[24];
  1.1149 +
  1.1150 +  // Stage 4
  1.1151 +  output[0] = step[0] + step[3];
  1.1152 +  output[1] = step[1] + step[2];
  1.1153 +  output[2] = -step[2] + step[1];
  1.1154 +  output[3] = -step[3] + step[0];
  1.1155 +  output[4] = step[4];
  1.1156 +  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
  1.1157 +  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
  1.1158 +  output[7] = step[7];
  1.1159 +  output[8] = step[8] + step[11];
  1.1160 +  output[9] = step[9] + step[10];
  1.1161 +  output[10] = -step[10] + step[9];
  1.1162 +  output[11] = -step[11] + step[8];
  1.1163 +  output[12] = -step[12] + step[15];
  1.1164 +  output[13] = -step[13] + step[14];
  1.1165 +  output[14] = step[14] + step[13];
  1.1166 +  output[15] = step[15] + step[12];
  1.1167 +
  1.1168 +  output[16] = step[16];
  1.1169 +  output[17] = step[17];
  1.1170 +  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
  1.1171 +  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
  1.1172 +  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
  1.1173 +  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
  1.1174 +  output[22] = step[22];
  1.1175 +  output[23] = step[23];
  1.1176 +  output[24] = step[24];
  1.1177 +  output[25] = step[25];
  1.1178 +  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
  1.1179 +  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
  1.1180 +  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
  1.1181 +  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
  1.1182 +  output[30] = step[30];
  1.1183 +  output[31] = step[31];
  1.1184 +
  1.1185 +  // Stage 5
  1.1186 +  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
  1.1187 +  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
  1.1188 +  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
  1.1189 +  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
  1.1190 +  step[4] = output[4] + output[5];
  1.1191 +  step[5] = -output[5] + output[4];
  1.1192 +  step[6] = -output[6] + output[7];
  1.1193 +  step[7] = output[7] + output[6];
  1.1194 +  step[8] = output[8];
  1.1195 +  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
  1.1196 +  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
  1.1197 +  step[11] = output[11];
  1.1198 +  step[12] = output[12];
  1.1199 +  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
  1.1200 +  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
  1.1201 +  step[15] = output[15];
  1.1202 +
  1.1203 +  step[16] = output[16] + output[19];
  1.1204 +  step[17] = output[17] + output[18];
  1.1205 +  step[18] = -output[18] + output[17];
  1.1206 +  step[19] = -output[19] + output[16];
  1.1207 +  step[20] = -output[20] + output[23];
  1.1208 +  step[21] = -output[21] + output[22];
  1.1209 +  step[22] = output[22] + output[21];
  1.1210 +  step[23] = output[23] + output[20];
  1.1211 +  step[24] = output[24] + output[27];
  1.1212 +  step[25] = output[25] + output[26];
  1.1213 +  step[26] = -output[26] + output[25];
  1.1214 +  step[27] = -output[27] + output[24];
  1.1215 +  step[28] = -output[28] + output[31];
  1.1216 +  step[29] = -output[29] + output[30];
  1.1217 +  step[30] = output[30] + output[29];
  1.1218 +  step[31] = output[31] + output[28];
  1.1219 +
  1.1220 +  // Stage 6
  1.1221 +  output[0] = step[0];
  1.1222 +  output[1] = step[1];
  1.1223 +  output[2] = step[2];
  1.1224 +  output[3] = step[3];
  1.1225 +  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
  1.1226 +  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
  1.1227 +  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
  1.1228 +  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
  1.1229 +  output[8] = step[8] + step[9];
  1.1230 +  output[9] = -step[9] + step[8];
  1.1231 +  output[10] = -step[10] + step[11];
  1.1232 +  output[11] = step[11] + step[10];
  1.1233 +  output[12] = step[12] + step[13];
  1.1234 +  output[13] = -step[13] + step[12];
  1.1235 +  output[14] = -step[14] + step[15];
  1.1236 +  output[15] = step[15] + step[14];
  1.1237 +
  1.1238 +  output[16] = step[16];
  1.1239 +  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
  1.1240 +  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
  1.1241 +  output[19] = step[19];
  1.1242 +  output[20] = step[20];
  1.1243 +  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
  1.1244 +  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
  1.1245 +  output[23] = step[23];
  1.1246 +  output[24] = step[24];
  1.1247 +  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
  1.1248 +  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
  1.1249 +  output[27] = step[27];
  1.1250 +  output[28] = step[28];
  1.1251 +  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
  1.1252 +  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
  1.1253 +  output[31] = step[31];
  1.1254 +
  1.1255 +  // Stage 7
  1.1256 +  step[0] = output[0];
  1.1257 +  step[1] = output[1];
  1.1258 +  step[2] = output[2];
  1.1259 +  step[3] = output[3];
  1.1260 +  step[4] = output[4];
  1.1261 +  step[5] = output[5];
  1.1262 +  step[6] = output[6];
  1.1263 +  step[7] = output[7];
  1.1264 +  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
  1.1265 +  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
  1.1266 +  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
  1.1267 +  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
  1.1268 +  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
  1.1269 +  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
  1.1270 +  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
  1.1271 +  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
  1.1272 +
  1.1273 +  step[16] = output[16] + output[17];
  1.1274 +  step[17] = -output[17] + output[16];
  1.1275 +  step[18] = -output[18] + output[19];
  1.1276 +  step[19] = output[19] + output[18];
  1.1277 +  step[20] = output[20] + output[21];
  1.1278 +  step[21] = -output[21] + output[20];
  1.1279 +  step[22] = -output[22] + output[23];
  1.1280 +  step[23] = output[23] + output[22];
  1.1281 +  step[24] = output[24] + output[25];
  1.1282 +  step[25] = -output[25] + output[24];
  1.1283 +  step[26] = -output[26] + output[27];
  1.1284 +  step[27] = output[27] + output[26];
  1.1285 +  step[28] = output[28] + output[29];
  1.1286 +  step[29] = -output[29] + output[28];
  1.1287 +  step[30] = -output[30] + output[31];
  1.1288 +  step[31] = output[31] + output[30];
  1.1289 +
  1.1290 +  // Final stage --- outputs indices are bit-reversed.
  1.1291 +  output[0]  = step[0];
  1.1292 +  output[16] = step[1];
  1.1293 +  output[8]  = step[2];
  1.1294 +  output[24] = step[3];
  1.1295 +  output[4]  = step[4];
  1.1296 +  output[20] = step[5];
  1.1297 +  output[12] = step[6];
  1.1298 +  output[28] = step[7];
  1.1299 +  output[2]  = step[8];
  1.1300 +  output[18] = step[9];
  1.1301 +  output[10] = step[10];
  1.1302 +  output[26] = step[11];
  1.1303 +  output[6]  = step[12];
  1.1304 +  output[22] = step[13];
  1.1305 +  output[14] = step[14];
  1.1306 +  output[30] = step[15];
  1.1307 +
  1.1308 +  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
  1.1309 +  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
  1.1310 +  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
  1.1311 +  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
  1.1312 +  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
  1.1313 +  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
  1.1314 +  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
  1.1315 +  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
  1.1316 +  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
  1.1317 +  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
  1.1318 +  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
  1.1319 +  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
  1.1320 +  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
  1.1321 +  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
  1.1322 +  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
  1.1323 +  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
  1.1324 +}
  1.1325 +
  1.1326 +void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
  1.1327 +  int i, j;
  1.1328 +  int output[32 * 32];
  1.1329 +
  1.1330 +  // Columns
  1.1331 +  for (i = 0; i < 32; ++i) {
  1.1332 +    int temp_in[32], temp_out[32];
  1.1333 +    for (j = 0; j < 32; ++j)
  1.1334 +      temp_in[j] = input[j * stride + i] * 4;
  1.1335 +    dct32_1d(temp_in, temp_out, 0);
  1.1336 +    for (j = 0; j < 32; ++j)
  1.1337 +      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  1.1338 +  }
  1.1339 +
  1.1340 +  // Rows
  1.1341 +  for (i = 0; i < 32; ++i) {
  1.1342 +    int temp_in[32], temp_out[32];
  1.1343 +    for (j = 0; j < 32; ++j)
  1.1344 +      temp_in[j] = output[j + i * 32];
  1.1345 +    dct32_1d(temp_in, temp_out, 0);
  1.1346 +    for (j = 0; j < 32; ++j)
  1.1347 +      out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
  1.1348 +  }
  1.1349 +}
  1.1350 +
  1.1351 +// Note that although we use dct_32_round in dct32_1d computation flow,
  1.1352 +// this 2d fdct32x32 for rate-distortion optimization loop is operating
  1.1353 +// within 16 bits precision.
  1.1354 +void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
  1.1355 +  int i, j;
  1.1356 +  int output[32 * 32];
  1.1357 +
  1.1358 +  // Columns
  1.1359 +  for (i = 0; i < 32; ++i) {
  1.1360 +    int temp_in[32], temp_out[32];
  1.1361 +    for (j = 0; j < 32; ++j)
  1.1362 +      temp_in[j] = input[j * stride + i] * 4;
  1.1363 +    dct32_1d(temp_in, temp_out, 0);
  1.1364 +    for (j = 0; j < 32; ++j)
  1.1365 +      // TODO(cd): see quality impact of only doing
  1.1366 +      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
  1.1367 +      //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
  1.1368 +      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  1.1369 +  }
  1.1370 +
  1.1371 +  // Rows
  1.1372 +  for (i = 0; i < 32; ++i) {
  1.1373 +    int temp_in[32], temp_out[32];
  1.1374 +    for (j = 0; j < 32; ++j)
  1.1375 +      temp_in[j] = output[j + i * 32];
  1.1376 +    dct32_1d(temp_in, temp_out, 1);
  1.1377 +    for (j = 0; j < 32; ++j)
  1.1378 +      out[j + i * 32] = temp_out[j];
  1.1379 +  }
  1.1380 +}
  1.1381 +
  1.1382 +void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
  1.1383 +                int stride) {
  1.1384 +  if (tx_type == DCT_DCT)
  1.1385 +    vp9_fdct4x4(input, output, stride);
  1.1386 +  else
  1.1387 +    vp9_short_fht4x4(input, output, stride, tx_type);
  1.1388 +}
  1.1389 +
  1.1390 +void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
  1.1391 +                int stride) {
  1.1392 +  if (tx_type == DCT_DCT)
  1.1393 +    vp9_fdct8x8(input, output, stride);
  1.1394 +  else
  1.1395 +    vp9_short_fht8x8(input, output, stride, tx_type);
  1.1396 +}
  1.1397 +
  1.1398 +void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
  1.1399 +                  int stride) {
  1.1400 +  if (tx_type == DCT_DCT)
  1.1401 +    vp9_fdct16x16(input, output, stride);
  1.1402 +  else
  1.1403 +    vp9_short_fht16x16(input, output, stride, tx_type);
  1.1404 +}

mercurial