1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp9/encoder/vp9_dct.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1401 @@ 1.4 +/* 1.5 + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 +#include <assert.h> 1.15 +#include <math.h> 1.16 + 1.17 +#include "./vpx_config.h" 1.18 +#include "./vp9_rtcd.h" 1.19 + 1.20 +#include "vp9/common/vp9_blockd.h" 1.21 +#include "vp9/common/vp9_idct.h" 1.22 +#include "vp9/common/vp9_systemdependent.h" 1.23 + 1.24 +#include "vp9/encoder/vp9_dct.h" 1.25 + 1.26 +static INLINE int fdct_round_shift(int input) { 1.27 + int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 1.28 + assert(INT16_MIN <= rv && rv <= INT16_MAX); 1.29 + return rv; 1.30 +} 1.31 + 1.32 +static void fdct4(const int16_t *input, int16_t *output) { 1.33 + int16_t step[4]; 1.34 + int temp1, temp2; 1.35 + 1.36 + step[0] = input[0] + input[3]; 1.37 + step[1] = input[1] + input[2]; 1.38 + step[2] = input[1] - input[2]; 1.39 + step[3] = input[0] - input[3]; 1.40 + 1.41 + temp1 = (step[0] + step[1]) * cospi_16_64; 1.42 + temp2 = (step[0] - step[1]) * cospi_16_64; 1.43 + output[0] = fdct_round_shift(temp1); 1.44 + output[2] = fdct_round_shift(temp2); 1.45 + temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 1.46 + temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 1.47 + output[1] = fdct_round_shift(temp1); 1.48 + output[3] = fdct_round_shift(temp2); 1.49 +} 1.50 + 1.51 +void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { 1.52 + // The 2D transform is done with two passes which are actually pretty 1.53 + // similar. In the first one, we transform the columns and transpose 1.54 + // the results. In the second one, we transform the rows. To achieve that, 1.55 + // as the first pass results are transposed, we transpose the columns (that 1.56 + // is the transposed rows) and transpose the results (so that it goes back 1.57 + // in normal/row positions). 1.58 + int pass; 1.59 + // We need an intermediate buffer between passes. 1.60 + int16_t intermediate[4 * 4]; 1.61 + const int16_t *in = input; 1.62 + int16_t *out = intermediate; 1.63 + // Do the two transform/transpose passes 1.64 + for (pass = 0; pass < 2; ++pass) { 1.65 + /*canbe16*/ int input[4]; 1.66 + /*canbe16*/ int step[4]; 1.67 + /*needs32*/ int temp1, temp2; 1.68 + int i; 1.69 + for (i = 0; i < 4; ++i) { 1.70 + // Load inputs. 1.71 + if (0 == pass) { 1.72 + input[0] = in[0 * stride] * 16; 1.73 + input[1] = in[1 * stride] * 16; 1.74 + input[2] = in[2 * stride] * 16; 1.75 + input[3] = in[3 * stride] * 16; 1.76 + if (i == 0 && input[0]) { 1.77 + input[0] += 1; 1.78 + } 1.79 + } else { 1.80 + input[0] = in[0 * 4]; 1.81 + input[1] = in[1 * 4]; 1.82 + input[2] = in[2 * 4]; 1.83 + input[3] = in[3 * 4]; 1.84 + } 1.85 + // Transform.
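// Assuming cospi_k_64 is the fixed-point approximation of cos(k * pi / 64)
// scaled by 1 << DCT_CONST_BITS (see vp9/common/vp9_idct.h), and with
// fdct_round_shift() rounding those fractional bits away again, the 4-point
// butterfly below computes
//   out[0] = (x0 + x1 + x2 + x3) * cos(pi / 4)
//   out[2] = (x0 - x1 - x2 + x3) * cos(pi / 4)
//   out[1] = (x0 - x3) * cos(pi / 8)     + (x1 - x2) * cos(3 * pi / 8)
//   out[3] = (x0 - x3) * cos(3 * pi / 8) - (x1 - x2) * cos(pi / 8)
// where x0..x3 are the four values loaded above.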
1.86 + step[0] = input[0] + input[3]; 1.87 + step[1] = input[1] + input[2]; 1.88 + step[2] = input[1] - input[2]; 1.89 + step[3] = input[0] - input[3]; 1.90 + temp1 = (step[0] + step[1]) * cospi_16_64; 1.91 + temp2 = (step[0] - step[1]) * cospi_16_64; 1.92 + out[0] = fdct_round_shift(temp1); 1.93 + out[2] = fdct_round_shift(temp2); 1.94 + temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 1.95 + temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 1.96 + out[1] = fdct_round_shift(temp1); 1.97 + out[3] = fdct_round_shift(temp2); 1.98 + // Do next column (which is a transposed row in second/horizontal pass) 1.99 + in++; 1.100 + out += 4; 1.101 + } 1.102 + // Setup in/out for next pass. 1.103 + in = intermediate; 1.104 + out = output; 1.105 + } 1.106 + 1.107 + { 1.108 + int i, j; 1.109 + for (i = 0; i < 4; ++i) { 1.110 + for (j = 0; j < 4; ++j) 1.111 + output[j + i * 4] = (output[j + i * 4] + 1) >> 2; 1.112 + } 1.113 + } 1.114 +} 1.115 + 1.116 +static void fadst4(const int16_t *input, int16_t *output) { 1.117 + int x0, x1, x2, x3; 1.118 + int s0, s1, s2, s3, s4, s5, s6, s7; 1.119 + 1.120 + x0 = input[0]; 1.121 + x1 = input[1]; 1.122 + x2 = input[2]; 1.123 + x3 = input[3]; 1.124 + 1.125 + if (!(x0 | x1 | x2 | x3)) { 1.126 + output[0] = output[1] = output[2] = output[3] = 0; 1.127 + return; 1.128 + } 1.129 + 1.130 + s0 = sinpi_1_9 * x0; 1.131 + s1 = sinpi_4_9 * x0; 1.132 + s2 = sinpi_2_9 * x1; 1.133 + s3 = sinpi_1_9 * x1; 1.134 + s4 = sinpi_3_9 * x2; 1.135 + s5 = sinpi_4_9 * x3; 1.136 + s6 = sinpi_2_9 * x3; 1.137 + s7 = x0 + x1 - x3; 1.138 + 1.139 + x0 = s0 + s2 + s5; 1.140 + x1 = sinpi_3_9 * s7; 1.141 + x2 = s1 - s3 + s6; 1.142 + x3 = s4; 1.143 + 1.144 + s0 = x0 + x3; 1.145 + s1 = x1; 1.146 + s2 = x2 - x3; 1.147 + s3 = x2 - x0 + x3; 1.148 + 1.149 + // 1-D transform scaling factor is sqrt(2). 
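// s0..s3 at this point are 32-bit accumulations of the fixed-point
// sinpi_*_9 products; fdct_round_shift() rounds the DCT_CONST_BITS
// fractional bits away so the four ADST outputs land back in 16-bit
// coefficient range (the assert inside fdct_round_shift() checks that).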
1.150 + output[0] = fdct_round_shift(s0); 1.151 + output[1] = fdct_round_shift(s1); 1.152 + output[2] = fdct_round_shift(s2); 1.153 + output[3] = fdct_round_shift(s3); 1.154 +} 1.155 + 1.156 +static const transform_2d FHT_4[] = { 1.157 + { fdct4, fdct4 }, // DCT_DCT = 0 1.158 + { fadst4, fdct4 }, // ADST_DCT = 1 1.159 + { fdct4, fadst4 }, // DCT_ADST = 2 1.160 + { fadst4, fadst4 } // ADST_ADST = 3 1.161 +}; 1.162 + 1.163 +void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, 1.164 + int stride, int tx_type) { 1.165 + int16_t out[4 * 4]; 1.166 + int16_t *outptr = &out[0]; 1.167 + int i, j; 1.168 + int16_t temp_in[4], temp_out[4]; 1.169 + const transform_2d ht = FHT_4[tx_type]; 1.170 + 1.171 + // Columns 1.172 + for (i = 0; i < 4; ++i) { 1.173 + for (j = 0; j < 4; ++j) 1.174 + temp_in[j] = input[j * stride + i] * 16; 1.175 + if (i == 0 && temp_in[0]) 1.176 + temp_in[0] += 1; 1.177 + ht.cols(temp_in, temp_out); 1.178 + for (j = 0; j < 4; ++j) 1.179 + outptr[j * 4 + i] = temp_out[j]; 1.180 + } 1.181 + 1.182 + // Rows 1.183 + for (i = 0; i < 4; ++i) { 1.184 + for (j = 0; j < 4; ++j) 1.185 + temp_in[j] = out[j + i * 4]; 1.186 + ht.rows(temp_in, temp_out); 1.187 + for (j = 0; j < 4; ++j) 1.188 + output[j + i * 4] = (temp_out[j] + 1) >> 2; 1.189 + } 1.190 +} 1.191 + 1.192 +static void fdct8(const int16_t *input, int16_t *output) { 1.193 + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 1.194 + /*needs32*/ int t0, t1, t2, t3; 1.195 + /*canbe16*/ int x0, x1, x2, x3; 1.196 + 1.197 + // stage 1 1.198 + s0 = input[0] + input[7]; 1.199 + s1 = input[1] + input[6]; 1.200 + s2 = input[2] + input[5]; 1.201 + s3 = input[3] + input[4]; 1.202 + s4 = input[3] - input[4]; 1.203 + s5 = input[2] - input[5]; 1.204 + s6 = input[1] - input[6]; 1.205 + s7 = input[0] - input[7]; 1.206 + 1.207 + // fdct4(step, step); 1.208 + x0 = s0 + s3; 1.209 + x1 = s1 + s2; 1.210 + x2 = s1 - s2; 1.211 + x3 = s0 - s3; 1.212 + t0 = (x0 + x1) * cospi_16_64; 1.213 + t1 = (x0 - x1) * cospi_16_64; 1.214 + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 1.215 + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 1.216 + output[0] = fdct_round_shift(t0); 1.217 + output[2] = fdct_round_shift(t2); 1.218 + output[4] = fdct_round_shift(t1); 1.219 + output[6] = fdct_round_shift(t3); 1.220 + 1.221 + // Stage 2 1.222 + t0 = (s6 - s5) * cospi_16_64; 1.223 + t1 = (s6 + s5) * cospi_16_64; 1.224 + t2 = fdct_round_shift(t0); 1.225 + t3 = fdct_round_shift(t1); 1.226 + 1.227 + // Stage 3 1.228 + x0 = s4 + t2; 1.229 + x1 = s4 - t2; 1.230 + x2 = s7 - t3; 1.231 + x3 = s7 + t3; 1.232 + 1.233 + // Stage 4 1.234 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 1.235 + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 1.236 + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 1.237 + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 1.238 + output[1] = fdct_round_shift(t0); 1.239 + output[3] = fdct_round_shift(t2); 1.240 + output[5] = fdct_round_shift(t1); 1.241 + output[7] = fdct_round_shift(t3); 1.242 +} 1.243 + 1.244 +void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { 1.245 + int i, j; 1.246 + int16_t intermediate[64]; 1.247 + 1.248 + // Transform columns 1.249 + { 1.250 + int16_t *output = intermediate; 1.251 + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 1.252 + /*needs32*/ int t0, t1, t2, t3; 1.253 + /*canbe16*/ int x0, x1, x2, x3; 1.254 + 1.255 + int i; 1.256 + for (i = 0; i < 8; i++) { 1.257 + // stage 1 1.258 + s0 = (input[0 * stride] + input[7 * stride]) * 4; 1.259 + s1 = (input[1 * stride] + input[6 * stride]) * 4; 1.260 + s2 = (input[2 * stride] + input[5 * 
stride]) * 4; 1.261 + s3 = (input[3 * stride] + input[4 * stride]) * 4; 1.262 + s4 = (input[3 * stride] - input[4 * stride]) * 4; 1.263 + s5 = (input[2 * stride] - input[5 * stride]) * 4; 1.264 + s6 = (input[1 * stride] - input[6 * stride]) * 4; 1.265 + s7 = (input[0 * stride] - input[7 * stride]) * 4; 1.266 + 1.267 + // fdct4(step, step); 1.268 + x0 = s0 + s3; 1.269 + x1 = s1 + s2; 1.270 + x2 = s1 - s2; 1.271 + x3 = s0 - s3; 1.272 + t0 = (x0 + x1) * cospi_16_64; 1.273 + t1 = (x0 - x1) * cospi_16_64; 1.274 + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 1.275 + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 1.276 + output[0 * 8] = fdct_round_shift(t0); 1.277 + output[2 * 8] = fdct_round_shift(t2); 1.278 + output[4 * 8] = fdct_round_shift(t1); 1.279 + output[6 * 8] = fdct_round_shift(t3); 1.280 + 1.281 + // Stage 2 1.282 + t0 = (s6 - s5) * cospi_16_64; 1.283 + t1 = (s6 + s5) * cospi_16_64; 1.284 + t2 = fdct_round_shift(t0); 1.285 + t3 = fdct_round_shift(t1); 1.286 + 1.287 + // Stage 3 1.288 + x0 = s4 + t2; 1.289 + x1 = s4 - t2; 1.290 + x2 = s7 - t3; 1.291 + x3 = s7 + t3; 1.292 + 1.293 + // Stage 4 1.294 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 1.295 + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 1.296 + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 1.297 + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 1.298 + output[1 * 8] = fdct_round_shift(t0); 1.299 + output[3 * 8] = fdct_round_shift(t2); 1.300 + output[5 * 8] = fdct_round_shift(t1); 1.301 + output[7 * 8] = fdct_round_shift(t3); 1.302 + input++; 1.303 + output++; 1.304 + } 1.305 + } 1.306 + 1.307 + // Rows 1.308 + for (i = 0; i < 8; ++i) { 1.309 + fdct8(&intermediate[i * 8], &final_output[i * 8]); 1.310 + for (j = 0; j < 8; ++j) 1.311 + final_output[j + i * 8] /= 2; 1.312 + } 1.313 +} 1.314 + 1.315 +void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { 1.316 + // The 2D transform is done with two passes which are actually pretty 1.317 + // similar. In the first one, we transform the columns and transpose 1.318 + // the results. In the second one, we transform the rows. To achieve that, 1.319 + // as the first pass results are transposed, we transpose the columns (that 1.320 + // is the transposed rows) and transpose the results (so that it goes back 1.321 + // in normal/row positions). 1.322 + int pass; 1.323 + // We need an intermediate buffer between passes. 1.324 + int16_t intermediate[256]; 1.325 + const int16_t *in = input; 1.326 + int16_t *out = intermediate; 1.327 + // Do the two transform/transpose passes 1.328 + for (pass = 0; pass < 2; ++pass) { 1.329 + /*canbe16*/ int step1[8]; 1.330 + /*canbe16*/ int step2[8]; 1.331 + /*canbe16*/ int step3[8]; 1.332 + /*canbe16*/ int input[8]; 1.333 + /*needs32*/ int temp1, temp2; 1.334 + int i; 1.335 + for (i = 0; i < 16; i++) { 1.336 + if (0 == pass) { 1.337 + // Calculate input for the first 8 results. 1.338 + input[0] = (in[0 * stride] + in[15 * stride]) * 4; 1.339 + input[1] = (in[1 * stride] + in[14 * stride]) * 4; 1.340 + input[2] = (in[2 * stride] + in[13 * stride]) * 4; 1.341 + input[3] = (in[3 * stride] + in[12 * stride]) * 4; 1.342 + input[4] = (in[4 * stride] + in[11 * stride]) * 4; 1.343 + input[5] = (in[5 * stride] + in[10 * stride]) * 4; 1.344 + input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; 1.345 + input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; 1.346 + // Calculate input for the next 8 results.
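// The 16-point transform is decomposed even/odd from here on: the eight
// sums computed above go through a regular 8-point DCT and become the
// even-indexed outputs, while the eight differences collected in step1[]
// go through the rotation stages further below and become the odd-indexed
// outputs.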
1.347 + step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; 1.348 + step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; 1.349 + step1[2] = (in[5 * stride] - in[10 * stride]) * 4; 1.350 + step1[3] = (in[4 * stride] - in[11 * stride]) * 4; 1.351 + step1[4] = (in[3 * stride] - in[12 * stride]) * 4; 1.352 + step1[5] = (in[2 * stride] - in[13 * stride]) * 4; 1.353 + step1[6] = (in[1 * stride] - in[14 * stride]) * 4; 1.354 + step1[7] = (in[0 * stride] - in[15 * stride]) * 4; 1.355 + } else { 1.356 + // Calculate input for the first 8 results. 1.357 + input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); 1.358 + input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); 1.359 + input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); 1.360 + input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); 1.361 + input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); 1.362 + input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); 1.363 + input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); 1.364 + input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); 1.365 + // Calculate input for the next 8 results. 1.366 + step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); 1.367 + step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); 1.368 + step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); 1.369 + step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); 1.370 + step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); 1.371 + step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); 1.372 + step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); 1.373 + step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); 1.374 + } 1.375 + // Work on the first eight values; fdct8(input, even_results); 1.376 + { 1.377 + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 1.378 + /*needs32*/ int t0, t1, t2, t3; 1.379 + /*canbe16*/ int x0, x1, x2, x3; 1.380 + 1.381 + // stage 1 1.382 + s0 = input[0] + input[7]; 1.383 + s1 = input[1] + input[6]; 1.384 + s2 = input[2] + input[5]; 1.385 + s3 = input[3] + input[4]; 1.386 + s4 = input[3] - input[4]; 1.387 + s5 = input[2] - input[5]; 1.388 + s6 = input[1] - input[6]; 1.389 + s7 = input[0] - input[7]; 1.390 + 1.391 + // fdct4(step, step); 1.392 + x0 = s0 + s3; 1.393 + x1 = s1 + s2; 1.394 + x2 = s1 - s2; 1.395 + x3 = s0 - s3; 1.396 + t0 = (x0 + x1) * cospi_16_64; 1.397 + t1 = (x0 - x1) * cospi_16_64; 1.398 + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 1.399 + t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 1.400 + out[0] = fdct_round_shift(t0); 1.401 + out[4] = fdct_round_shift(t2); 1.402 + out[8] = fdct_round_shift(t1); 1.403 + out[12] = fdct_round_shift(t3); 1.404 + 1.405 + // Stage 2 1.406 + t0 = (s6 - s5) * cospi_16_64; 1.407 + t1 = (s6 + s5) * cospi_16_64; 1.408 + t2 = fdct_round_shift(t0); 1.409 + t3 = fdct_round_shift(t1); 1.410 + 1.411 + // Stage 3 1.412 + x0 = s4 + t2; 1.413 + x1 = s4 - t2; 1.414 + x2 = s7 - t3; 1.415 + x3 = s7 + t3; 1.416 + 1.417 + // Stage 4 1.418 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 1.419 + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 1.420 + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 1.421 + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 1.422 + out[2] = fdct_round_shift(t0); 1.423 + out[6] = fdct_round_shift(t2); 1.424 + out[10] = fdct_round_shift(t1); 1.425 + out[14] = fdct_round_shift(t3); 1.426 + } 1.427 + // Work on the next eight values; step1 -> odd_results 1.428 + { 1.429 + // step 2 1.430 + temp1 = (step1[5] - step1[2]) * cospi_16_64; 1.431 + temp2 = 
(step1[4] - step1[3]) * cospi_16_64; 1.432 + step2[2] = fdct_round_shift(temp1); 1.433 + step2[3] = fdct_round_shift(temp2); 1.434 + temp1 = (step1[4] + step1[3]) * cospi_16_64; 1.435 + temp2 = (step1[5] + step1[2]) * cospi_16_64; 1.436 + step2[4] = fdct_round_shift(temp1); 1.437 + step2[5] = fdct_round_shift(temp2); 1.438 + // step 3 1.439 + step3[0] = step1[0] + step2[3]; 1.440 + step3[1] = step1[1] + step2[2]; 1.441 + step3[2] = step1[1] - step2[2]; 1.442 + step3[3] = step1[0] - step2[3]; 1.443 + step3[4] = step1[7] - step2[4]; 1.444 + step3[5] = step1[6] - step2[5]; 1.445 + step3[6] = step1[6] + step2[5]; 1.446 + step3[7] = step1[7] + step2[4]; 1.447 + // step 4 1.448 + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 1.449 + temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; 1.450 + step2[1] = fdct_round_shift(temp1); 1.451 + step2[2] = fdct_round_shift(temp2); 1.452 + temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; 1.453 + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 1.454 + step2[5] = fdct_round_shift(temp1); 1.455 + step2[6] = fdct_round_shift(temp2); 1.456 + // step 5 1.457 + step1[0] = step3[0] + step2[1]; 1.458 + step1[1] = step3[0] - step2[1]; 1.459 + step1[2] = step3[3] - step2[2]; 1.460 + step1[3] = step3[3] + step2[2]; 1.461 + step1[4] = step3[4] + step2[5]; 1.462 + step1[5] = step3[4] - step2[5]; 1.463 + step1[6] = step3[7] - step2[6]; 1.464 + step1[7] = step3[7] + step2[6]; 1.465 + // step 6 1.466 + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 1.467 + temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 1.468 + out[1] = fdct_round_shift(temp1); 1.469 + out[9] = fdct_round_shift(temp2); 1.470 + temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 1.471 + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 1.472 + out[5] = fdct_round_shift(temp1); 1.473 + out[13] = fdct_round_shift(temp2); 1.474 + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 1.475 + temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 1.476 + out[3] = fdct_round_shift(temp1); 1.477 + out[11] = fdct_round_shift(temp2); 1.478 + temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 1.479 + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 1.480 + out[7] = fdct_round_shift(temp1); 1.481 + out[15] = fdct_round_shift(temp2); 1.482 + } 1.483 + // Do next column (which is a transposed row in second/horizontal pass) 1.484 + in++; 1.485 + out += 16; 1.486 + } 1.487 + // Setup in/out for next pass. 
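// Because out advanced by 16 after every column, pass 0 leaves the column
// results transposed in intermediate[]; pass 1 reads them back the same
// way (in[0 * 16], in[1 * 16], ...) and transposes once more, so the
// final coefficients end up in normal row order in output[].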
1.488 + in = intermediate; 1.489 + out = output; 1.490 + } 1.491 +} 1.492 + 1.493 +static void fadst8(const int16_t *input, int16_t *output) { 1.494 + int s0, s1, s2, s3, s4, s5, s6, s7; 1.495 + 1.496 + int x0 = input[7]; 1.497 + int x1 = input[0]; 1.498 + int x2 = input[5]; 1.499 + int x3 = input[2]; 1.500 + int x4 = input[3]; 1.501 + int x5 = input[4]; 1.502 + int x6 = input[1]; 1.503 + int x7 = input[6]; 1.504 + 1.505 + // stage 1 1.506 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 1.507 + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 1.508 + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 1.509 + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 1.510 + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 1.511 + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 1.512 + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 1.513 + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 1.514 + 1.515 + x0 = fdct_round_shift(s0 + s4); 1.516 + x1 = fdct_round_shift(s1 + s5); 1.517 + x2 = fdct_round_shift(s2 + s6); 1.518 + x3 = fdct_round_shift(s3 + s7); 1.519 + x4 = fdct_round_shift(s0 - s4); 1.520 + x5 = fdct_round_shift(s1 - s5); 1.521 + x6 = fdct_round_shift(s2 - s6); 1.522 + x7 = fdct_round_shift(s3 - s7); 1.523 + 1.524 + // stage 2 1.525 + s0 = x0; 1.526 + s1 = x1; 1.527 + s2 = x2; 1.528 + s3 = x3; 1.529 + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 1.530 + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 1.531 + s6 = - cospi_24_64 * x6 + cospi_8_64 * x7; 1.532 + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 1.533 + 1.534 + x0 = s0 + s2; 1.535 + x1 = s1 + s3; 1.536 + x2 = s0 - s2; 1.537 + x3 = s1 - s3; 1.538 + x4 = fdct_round_shift(s4 + s6); 1.539 + x5 = fdct_round_shift(s5 + s7); 1.540 + x6 = fdct_round_shift(s4 - s6); 1.541 + x7 = fdct_round_shift(s5 - s7); 1.542 + 1.543 + // stage 3 1.544 + s2 = cospi_16_64 * (x2 + x3); 1.545 + s3 = cospi_16_64 * (x2 - x3); 1.546 + s6 = cospi_16_64 * (x6 + x7); 1.547 + s7 = cospi_16_64 * (x6 - x7); 1.548 + 1.549 + x2 = fdct_round_shift(s2); 1.550 + x3 = fdct_round_shift(s3); 1.551 + x6 = fdct_round_shift(s6); 1.552 + x7 = fdct_round_shift(s7); 1.553 + 1.554 + output[0] = x0; 1.555 + output[1] = - x4; 1.556 + output[2] = x6; 1.557 + output[3] = - x2; 1.558 + output[4] = x3; 1.559 + output[5] = - x7; 1.560 + output[6] = x5; 1.561 + output[7] = - x1; 1.562 +} 1.563 + 1.564 +static const transform_2d FHT_8[] = { 1.565 + { fdct8, fdct8 }, // DCT_DCT = 0 1.566 + { fadst8, fdct8 }, // ADST_DCT = 1 1.567 + { fdct8, fadst8 }, // DCT_ADST = 2 1.568 + { fadst8, fadst8 } // ADST_ADST = 3 1.569 +}; 1.570 + 1.571 +void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, 1.572 + int stride, int tx_type) { 1.573 + int16_t out[64]; 1.574 + int16_t *outptr = &out[0]; 1.575 + int i, j; 1.576 + int16_t temp_in[8], temp_out[8]; 1.577 + const transform_2d ht = FHT_8[tx_type]; 1.578 + 1.579 + // Columns 1.580 + for (i = 0; i < 8; ++i) { 1.581 + for (j = 0; j < 8; ++j) 1.582 + temp_in[j] = input[j * stride + i] * 4; 1.583 + ht.cols(temp_in, temp_out); 1.584 + for (j = 0; j < 8; ++j) 1.585 + outptr[j * 8 + i] = temp_out[j]; 1.586 + } 1.587 + 1.588 + // Rows 1.589 + for (i = 0; i < 8; ++i) { 1.590 + for (j = 0; j < 8; ++j) 1.591 + temp_in[j] = out[j + i * 8]; 1.592 + ht.rows(temp_in, temp_out); 1.593 + for (j = 0; j < 8; ++j) 1.594 + output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; 1.595 + } 1.596 +} 1.597 + 1.598 +/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per 1.599 + pixel. 
*/ 1.600 +void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { 1.601 + int i; 1.602 + int a1, b1, c1, d1, e1; 1.603 + const int16_t *ip = input; 1.604 + int16_t *op = output; 1.605 + 1.606 + for (i = 0; i < 4; i++) { 1.607 + a1 = ip[0 * stride]; 1.608 + b1 = ip[1 * stride]; 1.609 + c1 = ip[2 * stride]; 1.610 + d1 = ip[3 * stride]; 1.611 + 1.612 + a1 += b1; 1.613 + d1 = d1 - c1; 1.614 + e1 = (a1 - d1) >> 1; 1.615 + b1 = e1 - b1; 1.616 + c1 = e1 - c1; 1.617 + a1 -= c1; 1.618 + d1 += b1; 1.619 + op[0] = a1; 1.620 + op[4] = c1; 1.621 + op[8] = d1; 1.622 + op[12] = b1; 1.623 + 1.624 + ip++; 1.625 + op++; 1.626 + } 1.627 + ip = output; 1.628 + op = output; 1.629 + 1.630 + for (i = 0; i < 4; i++) { 1.631 + a1 = ip[0]; 1.632 + b1 = ip[1]; 1.633 + c1 = ip[2]; 1.634 + d1 = ip[3]; 1.635 + 1.636 + a1 += b1; 1.637 + d1 -= c1; 1.638 + e1 = (a1 - d1) >> 1; 1.639 + b1 = e1 - b1; 1.640 + c1 = e1 - c1; 1.641 + a1 -= c1; 1.642 + d1 += b1; 1.643 + op[0] = a1 * UNIT_QUANT_FACTOR; 1.644 + op[1] = c1 * UNIT_QUANT_FACTOR; 1.645 + op[2] = d1 * UNIT_QUANT_FACTOR; 1.646 + op[3] = b1 * UNIT_QUANT_FACTOR; 1.647 + 1.648 + ip += 4; 1.649 + op += 4; 1.650 + } 1.651 +} 1.652 + 1.653 +// Rewrote to use same algorithm as others. 1.654 +static void fdct16(const int16_t in[16], int16_t out[16]) { 1.655 + /*canbe16*/ int step1[8]; 1.656 + /*canbe16*/ int step2[8]; 1.657 + /*canbe16*/ int step3[8]; 1.658 + /*canbe16*/ int input[8]; 1.659 + /*needs32*/ int temp1, temp2; 1.660 + 1.661 + // step 1 1.662 + input[0] = in[0] + in[15]; 1.663 + input[1] = in[1] + in[14]; 1.664 + input[2] = in[2] + in[13]; 1.665 + input[3] = in[3] + in[12]; 1.666 + input[4] = in[4] + in[11]; 1.667 + input[5] = in[5] + in[10]; 1.668 + input[6] = in[6] + in[ 9]; 1.669 + input[7] = in[7] + in[ 8]; 1.670 + 1.671 + step1[0] = in[7] - in[ 8]; 1.672 + step1[1] = in[6] - in[ 9]; 1.673 + step1[2] = in[5] - in[10]; 1.674 + step1[3] = in[4] - in[11]; 1.675 + step1[4] = in[3] - in[12]; 1.676 + step1[5] = in[2] - in[13]; 1.677 + step1[6] = in[1] - in[14]; 1.678 + step1[7] = in[0] - in[15]; 1.679 + 1.680 + // fdct8(step, step); 1.681 + { 1.682 + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 1.683 + /*needs32*/ int t0, t1, t2, t3; 1.684 + /*canbe16*/ int x0, x1, x2, x3; 1.685 + 1.686 + // stage 1 1.687 + s0 = input[0] + input[7]; 1.688 + s1 = input[1] + input[6]; 1.689 + s2 = input[2] + input[5]; 1.690 + s3 = input[3] + input[4]; 1.691 + s4 = input[3] - input[4]; 1.692 + s5 = input[2] - input[5]; 1.693 + s6 = input[1] - input[6]; 1.694 + s7 = input[0] - input[7]; 1.695 + 1.696 + // fdct4(step, step); 1.697 + x0 = s0 + s3; 1.698 + x1 = s1 + s2; 1.699 + x2 = s1 - s2; 1.700 + x3 = s0 - s3; 1.701 + t0 = (x0 + x1) * cospi_16_64; 1.702 + t1 = (x0 - x1) * cospi_16_64; 1.703 + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 1.704 + t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 1.705 + out[0] = fdct_round_shift(t0); 1.706 + out[4] = fdct_round_shift(t2); 1.707 + out[8] = fdct_round_shift(t1); 1.708 + out[12] = fdct_round_shift(t3); 1.709 + 1.710 + // Stage 2 1.711 + t0 = (s6 - s5) * cospi_16_64; 1.712 + t1 = (s6 + s5) * cospi_16_64; 1.713 + t2 = fdct_round_shift(t0); 1.714 + t3 = fdct_round_shift(t1); 1.715 + 1.716 + // Stage 3 1.717 + x0 = s4 + t2; 1.718 + x1 = s4 - t2; 1.719 + x2 = s7 - t3; 1.720 + x3 = s7 + t3; 1.721 + 1.722 + // Stage 4 1.723 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 1.724 + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 1.725 + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 1.726 + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 1.727 + out[2] = 
fdct_round_shift(t0); 1.728 + out[6] = fdct_round_shift(t2); 1.729 + out[10] = fdct_round_shift(t1); 1.730 + out[14] = fdct_round_shift(t3); 1.731 + } 1.732 + 1.733 + // step 2 1.734 + temp1 = (step1[5] - step1[2]) * cospi_16_64; 1.735 + temp2 = (step1[4] - step1[3]) * cospi_16_64; 1.736 + step2[2] = fdct_round_shift(temp1); 1.737 + step2[3] = fdct_round_shift(temp2); 1.738 + temp1 = (step1[4] + step1[3]) * cospi_16_64; 1.739 + temp2 = (step1[5] + step1[2]) * cospi_16_64; 1.740 + step2[4] = fdct_round_shift(temp1); 1.741 + step2[5] = fdct_round_shift(temp2); 1.742 + 1.743 + // step 3 1.744 + step3[0] = step1[0] + step2[3]; 1.745 + step3[1] = step1[1] + step2[2]; 1.746 + step3[2] = step1[1] - step2[2]; 1.747 + step3[3] = step1[0] - step2[3]; 1.748 + step3[4] = step1[7] - step2[4]; 1.749 + step3[5] = step1[6] - step2[5]; 1.750 + step3[6] = step1[6] + step2[5]; 1.751 + step3[7] = step1[7] + step2[4]; 1.752 + 1.753 + // step 4 1.754 + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 1.755 + temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; 1.756 + step2[1] = fdct_round_shift(temp1); 1.757 + step2[2] = fdct_round_shift(temp2); 1.758 + temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; 1.759 + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 1.760 + step2[5] = fdct_round_shift(temp1); 1.761 + step2[6] = fdct_round_shift(temp2); 1.762 + 1.763 + // step 5 1.764 + step1[0] = step3[0] + step2[1]; 1.765 + step1[1] = step3[0] - step2[1]; 1.766 + step1[2] = step3[3] - step2[2]; 1.767 + step1[3] = step3[3] + step2[2]; 1.768 + step1[4] = step3[4] + step2[5]; 1.769 + step1[5] = step3[4] - step2[5]; 1.770 + step1[6] = step3[7] - step2[6]; 1.771 + step1[7] = step3[7] + step2[6]; 1.772 + 1.773 + // step 6 1.774 + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 1.775 + temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 1.776 + out[1] = fdct_round_shift(temp1); 1.777 + out[9] = fdct_round_shift(temp2); 1.778 + 1.779 + temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 1.780 + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 1.781 + out[5] = fdct_round_shift(temp1); 1.782 + out[13] = fdct_round_shift(temp2); 1.783 + 1.784 + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 1.785 + temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 1.786 + out[3] = fdct_round_shift(temp1); 1.787 + out[11] = fdct_round_shift(temp2); 1.788 + 1.789 + temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 1.790 + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 1.791 + out[7] = fdct_round_shift(temp1); 1.792 + out[15] = fdct_round_shift(temp2); 1.793 +} 1.794 + 1.795 +static void fadst16(const int16_t *input, int16_t *output) { 1.796 + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; 1.797 + 1.798 + int x0 = input[15]; 1.799 + int x1 = input[0]; 1.800 + int x2 = input[13]; 1.801 + int x3 = input[2]; 1.802 + int x4 = input[11]; 1.803 + int x5 = input[4]; 1.804 + int x6 = input[9]; 1.805 + int x7 = input[6]; 1.806 + int x8 = input[7]; 1.807 + int x9 = input[8]; 1.808 + int x10 = input[5]; 1.809 + int x11 = input[10]; 1.810 + int x12 = input[3]; 1.811 + int x13 = input[12]; 1.812 + int x14 = input[1]; 1.813 + int x15 = input[14]; 1.814 + 1.815 + // stage 1 1.816 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 1.817 + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 1.818 + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 1.819 + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 1.820 + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 1.821 + s5 = x4 * cospi_23_64 - x5 * 
cospi_9_64; 1.822 + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 1.823 + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 1.824 + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 1.825 + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 1.826 + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 1.827 + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 1.828 + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 1.829 + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 1.830 + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 1.831 + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 1.832 + 1.833 + x0 = fdct_round_shift(s0 + s8); 1.834 + x1 = fdct_round_shift(s1 + s9); 1.835 + x2 = fdct_round_shift(s2 + s10); 1.836 + x3 = fdct_round_shift(s3 + s11); 1.837 + x4 = fdct_round_shift(s4 + s12); 1.838 + x5 = fdct_round_shift(s5 + s13); 1.839 + x6 = fdct_round_shift(s6 + s14); 1.840 + x7 = fdct_round_shift(s7 + s15); 1.841 + x8 = fdct_round_shift(s0 - s8); 1.842 + x9 = fdct_round_shift(s1 - s9); 1.843 + x10 = fdct_round_shift(s2 - s10); 1.844 + x11 = fdct_round_shift(s3 - s11); 1.845 + x12 = fdct_round_shift(s4 - s12); 1.846 + x13 = fdct_round_shift(s5 - s13); 1.847 + x14 = fdct_round_shift(s6 - s14); 1.848 + x15 = fdct_round_shift(s7 - s15); 1.849 + 1.850 + // stage 2 1.851 + s0 = x0; 1.852 + s1 = x1; 1.853 + s2 = x2; 1.854 + s3 = x3; 1.855 + s4 = x4; 1.856 + s5 = x5; 1.857 + s6 = x6; 1.858 + s7 = x7; 1.859 + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 1.860 + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 1.861 + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 1.862 + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 1.863 + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 1.864 + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 1.865 + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 1.866 + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 1.867 + 1.868 + x0 = s0 + s4; 1.869 + x1 = s1 + s5; 1.870 + x2 = s2 + s6; 1.871 + x3 = s3 + s7; 1.872 + x4 = s0 - s4; 1.873 + x5 = s1 - s5; 1.874 + x6 = s2 - s6; 1.875 + x7 = s3 - s7; 1.876 + x8 = fdct_round_shift(s8 + s12); 1.877 + x9 = fdct_round_shift(s9 + s13); 1.878 + x10 = fdct_round_shift(s10 + s14); 1.879 + x11 = fdct_round_shift(s11 + s15); 1.880 + x12 = fdct_round_shift(s8 - s12); 1.881 + x13 = fdct_round_shift(s9 - s13); 1.882 + x14 = fdct_round_shift(s10 - s14); 1.883 + x15 = fdct_round_shift(s11 - s15); 1.884 + 1.885 + // stage 3 1.886 + s0 = x0; 1.887 + s1 = x1; 1.888 + s2 = x2; 1.889 + s3 = x3; 1.890 + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 1.891 + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 1.892 + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 1.893 + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 1.894 + s8 = x8; 1.895 + s9 = x9; 1.896 + s10 = x10; 1.897 + s11 = x11; 1.898 + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 1.899 + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 1.900 + s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 1.901 + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 1.902 + 1.903 + x0 = s0 + s2; 1.904 + x1 = s1 + s3; 1.905 + x2 = s0 - s2; 1.906 + x3 = s1 - s3; 1.907 + x4 = fdct_round_shift(s4 + s6); 1.908 + x5 = fdct_round_shift(s5 + s7); 1.909 + x6 = fdct_round_shift(s4 - s6); 1.910 + x7 = fdct_round_shift(s5 - s7); 1.911 + x8 = s8 + s10; 1.912 + x9 = s9 + s11; 1.913 + x10 = s8 - s10; 1.914 + x11 = s9 - s11; 1.915 + x12 = fdct_round_shift(s12 + s14); 1.916 + x13 = fdct_round_shift(s13 + s15); 1.917 + x14 = fdct_round_shift(s12 - s14); 1.918 + x15 = fdct_round_shift(s13 - s15); 1.919 + 1.920 + // stage 4 1.921 + s2 = (- cospi_16_64) * (x2 + x3); 1.922 + s3 = cospi_16_64 * (x2 - x3); 1.923 + s6 = cospi_16_64 * (x6 + x7); 1.924 + s7 = 
cospi_16_64 * (- x6 + x7); 1.925 + s10 = cospi_16_64 * (x10 + x11); 1.926 + s11 = cospi_16_64 * (- x10 + x11); 1.927 + s14 = (- cospi_16_64) * (x14 + x15); 1.928 + s15 = cospi_16_64 * (x14 - x15); 1.929 + 1.930 + x2 = fdct_round_shift(s2); 1.931 + x3 = fdct_round_shift(s3); 1.932 + x6 = fdct_round_shift(s6); 1.933 + x7 = fdct_round_shift(s7); 1.934 + x10 = fdct_round_shift(s10); 1.935 + x11 = fdct_round_shift(s11); 1.936 + x14 = fdct_round_shift(s14); 1.937 + x15 = fdct_round_shift(s15); 1.938 + 1.939 + output[0] = x0; 1.940 + output[1] = - x8; 1.941 + output[2] = x12; 1.942 + output[3] = - x4; 1.943 + output[4] = x6; 1.944 + output[5] = x14; 1.945 + output[6] = x10; 1.946 + output[7] = x2; 1.947 + output[8] = x3; 1.948 + output[9] = x11; 1.949 + output[10] = x15; 1.950 + output[11] = x7; 1.951 + output[12] = x5; 1.952 + output[13] = - x13; 1.953 + output[14] = x9; 1.954 + output[15] = - x1; 1.955 +} 1.956 + 1.957 +static const transform_2d FHT_16[] = { 1.958 + { fdct16, fdct16 }, // DCT_DCT = 0 1.959 + { fadst16, fdct16 }, // ADST_DCT = 1 1.960 + { fdct16, fadst16 }, // DCT_ADST = 2 1.961 + { fadst16, fadst16 } // ADST_ADST = 3 1.962 +}; 1.963 + 1.964 +void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, 1.965 + int stride, int tx_type) { 1.966 + int16_t out[256]; 1.967 + int16_t *outptr = &out[0]; 1.968 + int i, j; 1.969 + int16_t temp_in[16], temp_out[16]; 1.970 + const transform_2d ht = FHT_16[tx_type]; 1.971 + 1.972 + // Columns 1.973 + for (i = 0; i < 16; ++i) { 1.974 + for (j = 0; j < 16; ++j) 1.975 + temp_in[j] = input[j * stride + i] * 4; 1.976 + ht.cols(temp_in, temp_out); 1.977 + for (j = 0; j < 16; ++j) 1.978 + outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 1.979 +// outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1.980 + } 1.981 + 1.982 + // Rows 1.983 + for (i = 0; i < 16; ++i) { 1.984 + for (j = 0; j < 16; ++j) 1.985 + temp_in[j] = out[j + i * 16]; 1.986 + ht.rows(temp_in, temp_out); 1.987 + for (j = 0; j < 16; ++j) 1.988 + output[j + i * 16] = temp_out[j]; 1.989 + } 1.990 +} 1.991 + 1.992 +static INLINE int dct_32_round(int input) { 1.993 + int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 1.994 + assert(-131072 <= rv && rv <= 131071); 1.995 + return rv; 1.996 +} 1.997 + 1.998 +static INLINE int half_round_shift(int input) { 1.999 + int rv = (input + 1 + (input < 0)) >> 2; 1.1000 + return rv; 1.1001 +} 1.1002 + 1.1003 +static void dct32_1d(const int *input, int *output, int round) { 1.1004 + int step[32]; 1.1005 + // Stage 1 1.1006 + step[0] = input[0] + input[(32 - 1)]; 1.1007 + step[1] = input[1] + input[(32 - 2)]; 1.1008 + step[2] = input[2] + input[(32 - 3)]; 1.1009 + step[3] = input[3] + input[(32 - 4)]; 1.1010 + step[4] = input[4] + input[(32 - 5)]; 1.1011 + step[5] = input[5] + input[(32 - 6)]; 1.1012 + step[6] = input[6] + input[(32 - 7)]; 1.1013 + step[7] = input[7] + input[(32 - 8)]; 1.1014 + step[8] = input[8] + input[(32 - 9)]; 1.1015 + step[9] = input[9] + input[(32 - 10)]; 1.1016 + step[10] = input[10] + input[(32 - 11)]; 1.1017 + step[11] = input[11] + input[(32 - 12)]; 1.1018 + step[12] = input[12] + input[(32 - 13)]; 1.1019 + step[13] = input[13] + input[(32 - 14)]; 1.1020 + step[14] = input[14] + input[(32 - 15)]; 1.1021 + step[15] = input[15] + input[(32 - 16)]; 1.1022 + step[16] = -input[16] + input[(32 - 17)]; 1.1023 + step[17] = -input[17] + input[(32 - 18)]; 1.1024 + step[18] = -input[18] + input[(32 - 19)]; 1.1025 + step[19] = -input[19] + input[(32 - 20)]; 1.1026 + step[20] = -input[20] + input[(32 - 
21)]; 1.1027 + step[21] = -input[21] + input[(32 - 22)]; 1.1028 + step[22] = -input[22] + input[(32 - 23)]; 1.1029 + step[23] = -input[23] + input[(32 - 24)]; 1.1030 + step[24] = -input[24] + input[(32 - 25)]; 1.1031 + step[25] = -input[25] + input[(32 - 26)]; 1.1032 + step[26] = -input[26] + input[(32 - 27)]; 1.1033 + step[27] = -input[27] + input[(32 - 28)]; 1.1034 + step[28] = -input[28] + input[(32 - 29)]; 1.1035 + step[29] = -input[29] + input[(32 - 30)]; 1.1036 + step[30] = -input[30] + input[(32 - 31)]; 1.1037 + step[31] = -input[31] + input[(32 - 32)]; 1.1038 + 1.1039 + // Stage 2 1.1040 + output[0] = step[0] + step[16 - 1]; 1.1041 + output[1] = step[1] + step[16 - 2]; 1.1042 + output[2] = step[2] + step[16 - 3]; 1.1043 + output[3] = step[3] + step[16 - 4]; 1.1044 + output[4] = step[4] + step[16 - 5]; 1.1045 + output[5] = step[5] + step[16 - 6]; 1.1046 + output[6] = step[6] + step[16 - 7]; 1.1047 + output[7] = step[7] + step[16 - 8]; 1.1048 + output[8] = -step[8] + step[16 - 9]; 1.1049 + output[9] = -step[9] + step[16 - 10]; 1.1050 + output[10] = -step[10] + step[16 - 11]; 1.1051 + output[11] = -step[11] + step[16 - 12]; 1.1052 + output[12] = -step[12] + step[16 - 13]; 1.1053 + output[13] = -step[13] + step[16 - 14]; 1.1054 + output[14] = -step[14] + step[16 - 15]; 1.1055 + output[15] = -step[15] + step[16 - 16]; 1.1056 + 1.1057 + output[16] = step[16]; 1.1058 + output[17] = step[17]; 1.1059 + output[18] = step[18]; 1.1060 + output[19] = step[19]; 1.1061 + 1.1062 + output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); 1.1063 + output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); 1.1064 + output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); 1.1065 + output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); 1.1066 + 1.1067 + output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); 1.1068 + output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); 1.1069 + output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); 1.1070 + output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); 1.1071 + 1.1072 + output[28] = step[28]; 1.1073 + output[29] = step[29]; 1.1074 + output[30] = step[30]; 1.1075 + output[31] = step[31]; 1.1076 + 1.1077 + // dump the magnitude by 4, hence the intermediate values are within 1.1078 + // the range of 16 bits. 
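// half_round_shift() is a divide-by-4 with rounding; it is applied only
// when the caller passes round != 0 (the second pass of
// vp9_fdct32x32_rd_c below), trading a little precision for intermediate
// values that stay within 16 bits.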
1.1079 + if (round) { 1.1080 + output[0] = half_round_shift(output[0]); 1.1081 + output[1] = half_round_shift(output[1]); 1.1082 + output[2] = half_round_shift(output[2]); 1.1083 + output[3] = half_round_shift(output[3]); 1.1084 + output[4] = half_round_shift(output[4]); 1.1085 + output[5] = half_round_shift(output[5]); 1.1086 + output[6] = half_round_shift(output[6]); 1.1087 + output[7] = half_round_shift(output[7]); 1.1088 + output[8] = half_round_shift(output[8]); 1.1089 + output[9] = half_round_shift(output[9]); 1.1090 + output[10] = half_round_shift(output[10]); 1.1091 + output[11] = half_round_shift(output[11]); 1.1092 + output[12] = half_round_shift(output[12]); 1.1093 + output[13] = half_round_shift(output[13]); 1.1094 + output[14] = half_round_shift(output[14]); 1.1095 + output[15] = half_round_shift(output[15]); 1.1096 + 1.1097 + output[16] = half_round_shift(output[16]); 1.1098 + output[17] = half_round_shift(output[17]); 1.1099 + output[18] = half_round_shift(output[18]); 1.1100 + output[19] = half_round_shift(output[19]); 1.1101 + output[20] = half_round_shift(output[20]); 1.1102 + output[21] = half_round_shift(output[21]); 1.1103 + output[22] = half_round_shift(output[22]); 1.1104 + output[23] = half_round_shift(output[23]); 1.1105 + output[24] = half_round_shift(output[24]); 1.1106 + output[25] = half_round_shift(output[25]); 1.1107 + output[26] = half_round_shift(output[26]); 1.1108 + output[27] = half_round_shift(output[27]); 1.1109 + output[28] = half_round_shift(output[28]); 1.1110 + output[29] = half_round_shift(output[29]); 1.1111 + output[30] = half_round_shift(output[30]); 1.1112 + output[31] = half_round_shift(output[31]); 1.1113 + } 1.1114 + 1.1115 + // Stage 3 1.1116 + step[0] = output[0] + output[(8 - 1)]; 1.1117 + step[1] = output[1] + output[(8 - 2)]; 1.1118 + step[2] = output[2] + output[(8 - 3)]; 1.1119 + step[3] = output[3] + output[(8 - 4)]; 1.1120 + step[4] = -output[4] + output[(8 - 5)]; 1.1121 + step[5] = -output[5] + output[(8 - 6)]; 1.1122 + step[6] = -output[6] + output[(8 - 7)]; 1.1123 + step[7] = -output[7] + output[(8 - 8)]; 1.1124 + step[8] = output[8]; 1.1125 + step[9] = output[9]; 1.1126 + step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); 1.1127 + step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); 1.1128 + step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); 1.1129 + step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); 1.1130 + step[14] = output[14]; 1.1131 + step[15] = output[15]; 1.1132 + 1.1133 + step[16] = output[16] + output[23]; 1.1134 + step[17] = output[17] + output[22]; 1.1135 + step[18] = output[18] + output[21]; 1.1136 + step[19] = output[19] + output[20]; 1.1137 + step[20] = -output[20] + output[19]; 1.1138 + step[21] = -output[21] + output[18]; 1.1139 + step[22] = -output[22] + output[17]; 1.1140 + step[23] = -output[23] + output[16]; 1.1141 + step[24] = -output[24] + output[31]; 1.1142 + step[25] = -output[25] + output[30]; 1.1143 + step[26] = -output[26] + output[29]; 1.1144 + step[27] = -output[27] + output[28]; 1.1145 + step[28] = output[28] + output[27]; 1.1146 + step[29] = output[29] + output[26]; 1.1147 + step[30] = output[30] + output[25]; 1.1148 + step[31] = output[31] + output[24]; 1.1149 + 1.1150 + // Stage 4 1.1151 + output[0] = step[0] + step[3]; 1.1152 + output[1] = step[1] + step[2]; 1.1153 + output[2] = -step[2] + step[1]; 1.1154 + output[3] = -step[3] + step[0]; 1.1155 + output[4] = step[4]; 1.1156 + output[5] = dct_32_round((-step[5] + step[6]) * 
cospi_16_64); 1.1157 + output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); 1.1158 + output[7] = step[7]; 1.1159 + output[8] = step[8] + step[11]; 1.1160 + output[9] = step[9] + step[10]; 1.1161 + output[10] = -step[10] + step[9]; 1.1162 + output[11] = -step[11] + step[8]; 1.1163 + output[12] = -step[12] + step[15]; 1.1164 + output[13] = -step[13] + step[14]; 1.1165 + output[14] = step[14] + step[13]; 1.1166 + output[15] = step[15] + step[12]; 1.1167 + 1.1168 + output[16] = step[16]; 1.1169 + output[17] = step[17]; 1.1170 + output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); 1.1171 + output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); 1.1172 + output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); 1.1173 + output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); 1.1174 + output[22] = step[22]; 1.1175 + output[23] = step[23]; 1.1176 + output[24] = step[24]; 1.1177 + output[25] = step[25]; 1.1178 + output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); 1.1179 + output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); 1.1180 + output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); 1.1181 + output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); 1.1182 + output[30] = step[30]; 1.1183 + output[31] = step[31]; 1.1184 + 1.1185 + // Stage 5 1.1186 + step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); 1.1187 + step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); 1.1188 + step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); 1.1189 + step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); 1.1190 + step[4] = output[4] + output[5]; 1.1191 + step[5] = -output[5] + output[4]; 1.1192 + step[6] = -output[6] + output[7]; 1.1193 + step[7] = output[7] + output[6]; 1.1194 + step[8] = output[8]; 1.1195 + step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); 1.1196 + step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); 1.1197 + step[11] = output[11]; 1.1198 + step[12] = output[12]; 1.1199 + step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); 1.1200 + step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); 1.1201 + step[15] = output[15]; 1.1202 + 1.1203 + step[16] = output[16] + output[19]; 1.1204 + step[17] = output[17] + output[18]; 1.1205 + step[18] = -output[18] + output[17]; 1.1206 + step[19] = -output[19] + output[16]; 1.1207 + step[20] = -output[20] + output[23]; 1.1208 + step[21] = -output[21] + output[22]; 1.1209 + step[22] = output[22] + output[21]; 1.1210 + step[23] = output[23] + output[20]; 1.1211 + step[24] = output[24] + output[27]; 1.1212 + step[25] = output[25] + output[26]; 1.1213 + step[26] = -output[26] + output[25]; 1.1214 + step[27] = -output[27] + output[24]; 1.1215 + step[28] = -output[28] + output[31]; 1.1216 + step[29] = -output[29] + output[30]; 1.1217 + step[30] = output[30] + output[29]; 1.1218 + step[31] = output[31] + output[28]; 1.1219 + 1.1220 + // Stage 6 1.1221 + output[0] = step[0]; 1.1222 + output[1] = step[1]; 1.1223 + output[2] = step[2]; 1.1224 + output[3] = step[3]; 1.1225 + output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); 1.1226 + output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); 1.1227 + output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); 1.1228 + output[7] = 
dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); 1.1229 + output[8] = step[8] + step[9]; 1.1230 + output[9] = -step[9] + step[8]; 1.1231 + output[10] = -step[10] + step[11]; 1.1232 + output[11] = step[11] + step[10]; 1.1233 + output[12] = step[12] + step[13]; 1.1234 + output[13] = -step[13] + step[12]; 1.1235 + output[14] = -step[14] + step[15]; 1.1236 + output[15] = step[15] + step[14]; 1.1237 + 1.1238 + output[16] = step[16]; 1.1239 + output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); 1.1240 + output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); 1.1241 + output[19] = step[19]; 1.1242 + output[20] = step[20]; 1.1243 + output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); 1.1244 + output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); 1.1245 + output[23] = step[23]; 1.1246 + output[24] = step[24]; 1.1247 + output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); 1.1248 + output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); 1.1249 + output[27] = step[27]; 1.1250 + output[28] = step[28]; 1.1251 + output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); 1.1252 + output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); 1.1253 + output[31] = step[31]; 1.1254 + 1.1255 + // Stage 7 1.1256 + step[0] = output[0]; 1.1257 + step[1] = output[1]; 1.1258 + step[2] = output[2]; 1.1259 + step[3] = output[3]; 1.1260 + step[4] = output[4]; 1.1261 + step[5] = output[5]; 1.1262 + step[6] = output[6]; 1.1263 + step[7] = output[7]; 1.1264 + step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); 1.1265 + step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); 1.1266 + step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); 1.1267 + step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); 1.1268 + step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); 1.1269 + step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); 1.1270 + step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); 1.1271 + step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); 1.1272 + 1.1273 + step[16] = output[16] + output[17]; 1.1274 + step[17] = -output[17] + output[16]; 1.1275 + step[18] = -output[18] + output[19]; 1.1276 + step[19] = output[19] + output[18]; 1.1277 + step[20] = output[20] + output[21]; 1.1278 + step[21] = -output[21] + output[20]; 1.1279 + step[22] = -output[22] + output[23]; 1.1280 + step[23] = output[23] + output[22]; 1.1281 + step[24] = output[24] + output[25]; 1.1282 + step[25] = -output[25] + output[24]; 1.1283 + step[26] = -output[26] + output[27]; 1.1284 + step[27] = output[27] + output[26]; 1.1285 + step[28] = output[28] + output[29]; 1.1286 + step[29] = -output[29] + output[28]; 1.1287 + step[30] = -output[30] + output[31]; 1.1288 + step[31] = output[31] + output[30]; 1.1289 + 1.1290 + // Final stage --- outputs indices are bit-reversed. 
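// "Bit-reversed" here means the destination index is the 5-bit bit
// reversal of the step index: step[1] -> output[16], step[2] -> output[8],
// step[3] -> output[24], step[16] -> output[1], and so on.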
1.1291 + output[0] = step[0]; 1.1292 + output[16] = step[1]; 1.1293 + output[8] = step[2]; 1.1294 + output[24] = step[3]; 1.1295 + output[4] = step[4]; 1.1296 + output[20] = step[5]; 1.1297 + output[12] = step[6]; 1.1298 + output[28] = step[7]; 1.1299 + output[2] = step[8]; 1.1300 + output[18] = step[9]; 1.1301 + output[10] = step[10]; 1.1302 + output[26] = step[11]; 1.1303 + output[6] = step[12]; 1.1304 + output[22] = step[13]; 1.1305 + output[14] = step[14]; 1.1306 + output[30] = step[15]; 1.1307 + 1.1308 + output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); 1.1309 + output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); 1.1310 + output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); 1.1311 + output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); 1.1312 + output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); 1.1313 + output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); 1.1314 + output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); 1.1315 + output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); 1.1316 + output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); 1.1317 + output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); 1.1318 + output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); 1.1319 + output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); 1.1320 + output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); 1.1321 + output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); 1.1322 + output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); 1.1323 + output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); 1.1324 +} 1.1325 + 1.1326 +void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { 1.1327 + int i, j; 1.1328 + int output[32 * 32]; 1.1329 + 1.1330 + // Columns 1.1331 + for (i = 0; i < 32; ++i) { 1.1332 + int temp_in[32], temp_out[32]; 1.1333 + for (j = 0; j < 32; ++j) 1.1334 + temp_in[j] = input[j * stride + i] * 4; 1.1335 + dct32_1d(temp_in, temp_out, 0); 1.1336 + for (j = 0; j < 32; ++j) 1.1337 + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1.1338 + } 1.1339 + 1.1340 + // Rows 1.1341 + for (i = 0; i < 32; ++i) { 1.1342 + int temp_in[32], temp_out[32]; 1.1343 + for (j = 0; j < 32; ++j) 1.1344 + temp_in[j] = output[j + i * 32]; 1.1345 + dct32_1d(temp_in, temp_out, 0); 1.1346 + for (j = 0; j < 32; ++j) 1.1347 + out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 1.1348 + } 1.1349 +} 1.1350 + 1.1351 +// Note that although we use dct_32_round in the dct32_1d computation flow, 1.1352 +// this 2-D fdct32x32, which is used in the rate-distortion optimization loop, 1.1353 +// operates within 16-bit precision.
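// Compared with vp9_fdct32x32_c above, the difference is that the row pass
// calls dct32_1d() with round == 1, so the stage-2 results are scaled down
// by 4 (half_round_shift) and the row outputs are then stored without a
// further shift; this is what keeps the intermediate values within 16-bit
// range, at a small cost in accuracy.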
1.1354 +void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { 1.1355 + int i, j; 1.1356 + int output[32 * 32]; 1.1357 + 1.1358 + // Columns 1.1359 + for (i = 0; i < 32; ++i) { 1.1360 + int temp_in[32], temp_out[32]; 1.1361 + for (j = 0; j < 32; ++j) 1.1362 + temp_in[j] = input[j * stride + i] * 4; 1.1363 + dct32_1d(temp_in, temp_out, 0); 1.1364 + for (j = 0; j < 32; ++j) 1.1365 + // TODO(cd): see quality impact of only doing 1.1366 + // output[j * 32 + i] = (temp_out[j] + 1) >> 2; 1.1367 + // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c 1.1368 + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1.1369 + } 1.1370 + 1.1371 + // Rows 1.1372 + for (i = 0; i < 32; ++i) { 1.1373 + int temp_in[32], temp_out[32]; 1.1374 + for (j = 0; j < 32; ++j) 1.1375 + temp_in[j] = output[j + i * 32]; 1.1376 + dct32_1d(temp_in, temp_out, 1); 1.1377 + for (j = 0; j < 32; ++j) 1.1378 + out[j + i * 32] = temp_out[j]; 1.1379 + } 1.1380 +} 1.1381 + 1.1382 +void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, 1.1383 + int stride) { 1.1384 + if (tx_type == DCT_DCT) 1.1385 + vp9_fdct4x4(input, output, stride); 1.1386 + else 1.1387 + vp9_short_fht4x4(input, output, stride, tx_type); 1.1388 +} 1.1389 + 1.1390 +void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, 1.1391 + int stride) { 1.1392 + if (tx_type == DCT_DCT) 1.1393 + vp9_fdct8x8(input, output, stride); 1.1394 + else 1.1395 + vp9_short_fht8x8(input, output, stride, tx_type); 1.1396 +} 1.1397 + 1.1398 +void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, 1.1399 + int stride) { 1.1400 + if (tx_type == DCT_DCT) 1.1401 + vp9_fdct16x16(input, output, stride); 1.1402 + else 1.1403 + vp9_short_fht16x16(input, output, stride, tx_type); 1.1404 +}
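For reference, a minimal sketch of how one of these entry points could be exercised, assuming it is compiled and linked inside the libvpx source tree (the prototype is copied from this file; the exact coefficient values are not asserted):

#include <stdint.h>
#include <stdio.h>

/* Prototype as defined in this file (normally reached through vp9_rtcd.h). */
void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);

int main(void) {
  /* A constant 4x4 residual block: after the forward DCT only the DC
     coefficient (coeff[0]) should be significantly non-zero. */
  int16_t block[4 * 4];
  int16_t coeff[4 * 4];
  int i;
  for (i = 0; i < 16; ++i)
    block[i] = 10;
  vp9_fdct4x4_c(block, coeff, 4);  /* stride is in int16_t samples */
  for (i = 0; i < 16; ++i)
    printf("%d%c", coeff[i], (i % 4 == 3) ? '\n' : ' ');
  return 0;
}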