--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp9/common/vp9_idct.c	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,1416 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+   0.5 shifts per pixel. */
+  int i;
+  int16_t output[16];
+  int a1, b1, c1, d1, e1;
+  const int16_t *ip = input;
+  int16_t *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] >> UNIT_QUANT_SHIFT;
+    c1 = ip[1] >> UNIT_QUANT_SHIFT;
+    d1 = ip[2] >> UNIT_QUANT_SHIFT;
+    b1 = ip[3] >> UNIT_QUANT_SHIFT;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = a1;
+    op[1] = b1;
+    op[2] = c1;
+    op[3] = d1;
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
+    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
+    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
+    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
+
+    ip++;
+    dest++;
+  }
+}
+
+void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
+  int i;
+  int a1, e1;
+  int16_t tmp[4];
+  const int16_t *ip = in;
+  int16_t *op = tmp;
+
+  a1 = ip[0] >> UNIT_QUANT_SHIFT;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  op[0] = a1;
+  op[1] = op[2] = op[3] = e1;
+
+  ip = tmp;
+  for (i = 0; i < 4; i++) {
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
+    ip++;
+    dest++;
+  }
+}
+
+static void idct4_1d(const int16_t *input, int16_t *output) {
+  int16_t step[4];
+  int temp1, temp2;
+  // stage 1
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = dct_const_round_shift(temp1);
+  step[1] = dct_const_round_shift(temp2);
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = dct_const_round_shift(temp1);
+  step[3] = dct_const_round_shift(temp2);
+
+  // stage 2
+  output[0] = step[0] + step[3];
+  output[1] = step[1] + step[2];
+  output[2] = step[1] - step[2];
+  output[3] = step[0] - step[3];
+}
+
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
+  int16_t out[4 * 4];
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[4], temp_out[4];
+
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    idct4_1d(input, outptr);
+    input += 4;
+    outptr += 4;
+  }
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    idct4_1d(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                        + dest[j * stride + i]);
+  }
+}
+
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
+  int i;
+  int a1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  for (i = 0; i < 4; i++) {
+    dest[0] = clip_pixel(dest[0] + a1);
+    dest[1] = clip_pixel(dest[1] + a1);
+    dest[2] = clip_pixel(dest[2] + a1);
+    dest[3] = clip_pixel(dest[3] + a1);
+    dest += dest_stride;
+  }
+}
+
+static void idct8_1d(const int16_t *input, int16_t *output) {
+  int16_t step1[8], step2[8];
+  int temp1, temp2;
+  // stage 1
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = dct_const_round_shift(temp1);
+  step1[7] = dct_const_round_shift(temp2);
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+
+  // stage 2 & stage 3 - even half
+  idct4_1d(step1, step1);
+
+  // stage 2 - odd half
+  step2[4] = step1[4] + step1[5];
+  step2[5] = step1[4] - step1[5];
+  step2[6] = -step1[6] + step1[7];
+  step2[7] = step1[6] + step1[7];
+
+  // stage 3 -odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = step1[0] + step1[7];
+  output[1] = step1[1] + step1[6];
+  output[2] = step1[2] + step1[5];
+  output[3] = step1[3] + step1[4];
+  output[4] = step1[3] - step1[4];
+  output[5] = step1[2] - step1[5];
+  output[6] = step1[1] - step1[6];
+  output[7] = step1[0] - step1[7];
+}
+
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
+  int16_t out[8 * 8];
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[8], temp_out[8];
+
+  // First transform rows
+  for (i = 0; i < 8; ++i) {
+    idct8_1d(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    idct8_1d(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                        + dest[j * stride + i]);
+  }
+}
+
+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  int a1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i)
+      dest[i] = clip_pixel(dest[i] + a1);
+    dest += stride;
+  }
+}
+
+static void iadst4_1d(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+
+  int x0 = input[0];
+  int x1 = input[1];
+  int x2 = input[2];
+  int x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+
+  x0 = s0 + s3 + s5;
+  x1 = s1 - s4 - s6;
+  x2 = sinpi_3_9 * s7;
+  x3 = s2;
+
+  s0 = x0 + x3;
+  s1 = x1 + x3;
+  s2 = x2;
+  s3 = x0 + x1 - x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = dct_const_round_shift(s0);
+  output[1] = dct_const_round_shift(s1);
+  output[2] = dct_const_round_shift(s2);
+  output[3] = dct_const_round_shift(s3);
+}
+
+void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
+  const transform_2d IHT_4[] = {
+    { idct4_1d, idct4_1d },    // DCT_DCT  = 0
+    { iadst4_1d, idct4_1d },   // ADST_DCT = 1
+    { idct4_1d, iadst4_1d },   // DCT_ADST = 2
+    { iadst4_1d, iadst4_1d }   // ADST_ADST = 3
+  };
+
+  int i, j;
+  int16_t out[4 * 4];
+  int16_t *outptr = out;
+  int16_t temp_in[4], temp_out[4];
+
+  // inverse transform row vectors
+  for (i = 0; i < 4; ++i) {
+    IHT_4[tx_type].rows(input, outptr);
+    input += 4;
+    outptr += 4;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    IHT_4[tx_type].cols(temp_in, temp_out);
+    for (j = 0; j < 4; ++j)
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                        + dest[j * stride + i]);
+  }
+}
+static void iadst8_1d(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7;
+
+  int x0 = input[7];
+  int x1 = input[0];
+  int x2 = input[5];
+  int x3 = input[2];
+  int x4 = input[3];
+  int x5 = input[4];
+  int x6 = input[1];
+  int x7 = input[6];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+  x0 = dct_const_round_shift(s0 + s4);
+  x1 = dct_const_round_shift(s1 + s5);
+  x2 = dct_const_round_shift(s2 + s6);
+  x3 = dct_const_round_shift(s3 + s7);
+  x4 = dct_const_round_shift(s0 - s4);
+  x5 = dct_const_round_shift(s1 - s5);
+  x6 = dct_const_round_shift(s2 - s6);
+  x7 = dct_const_round_shift(s3 - s7);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+
+  output[0] = x0;
+  output[1] = -x4;
+  output[2] = x6;
+  output[3] = -x2;
+  output[4] = x3;
+  output[5] = -x7;
+  output[6] = x5;
+  output[7] = -x1;
+}
+
+static const transform_2d IHT_8[] = {
+  { idct8_1d, idct8_1d },    // DCT_DCT  = 0
+  { iadst8_1d, idct8_1d },   // ADST_DCT = 1
+  { idct8_1d, iadst8_1d },   // DCT_ADST = 2
+  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
+};
+
+void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
+  int i, j;
+  int16_t out[8 * 8];
+  int16_t *outptr = out;
+  int16_t temp_in[8], temp_out[8];
+  const transform_2d ht = IHT_8[tx_type];
+
+  // inverse transform row vectors
+  for (i = 0; i < 8; ++i) {
+    ht.rows(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                        + dest[j * stride + i]);
+  }
+}
+
+void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
+  int16_t out[8 * 8] = { 0 };
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[8], temp_out[8];
+
+  // First transform rows
+  // only first 4 row has non-zero coefs
+  for (i = 0; i < 4; ++i) {
+    idct8_1d(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    idct8_1d(temp_in, temp_out);
+    for (j = 0; j < 8; ++j)
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                        + dest[j * stride + i]);
+  }
+}
+
+static void idct16_1d(const int16_t *input, int16_t *output) {
+  int16_t step1[16], step2[16];
+  int temp1, temp2;
+
+  // stage 1
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = dct_const_round_shift(temp1);
+  step2[15] = dct_const_round_shift(temp2);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = dct_const_round_shift(temp1);
+  step1[7] = dct_const_round_shift(temp2);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+
+  step1[8] = step2[8] + step2[9];
+  step1[9] = step2[8] - step2[9];
+  step1[10] = -step2[10] + step2[11];
+  step1[11] = step2[10] + step2[11];
+  step1[12] = step2[12] + step2[13];
+  step1[13] = step2[12] - step2[13];
+  step1[14] = -step2[14] + step2[15];
+  step1[15] = step2[14] + step2[15];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = dct_const_round_shift(temp1);
+  step2[1] = dct_const_round_shift(temp2);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = dct_const_round_shift(temp1);
+  step2[3] = dct_const_round_shift(temp2);
+  step2[4] = step1[4] + step1[5];
+  step2[5] = step1[4] - step1[5];
+  step2[6] = -step1[6] + step1[7];
+  step2[7] = step1[6] + step1[7];
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = step2[0] + step2[3];
+  step1[1] = step2[1] + step2[2];
+  step1[2] = step2[1] - step2[2];
+  step1[3] = step2[0] - step2[3];
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+  step1[7] = step2[7];
+
+  step1[8] = step2[8] + step2[11];
+  step1[9] = step2[9] + step2[10];
+  step1[10] = step2[9] - step2[10];
+  step1[11] = step2[8] - step2[11];
+  step1[12] = -step2[12] + step2[15];
+  step1[13] = -step2[13] + step2[14];
+  step1[14] = step2[13] + step2[14];
+  step1[15] = step2[12] + step2[15];
+
+  // stage 6
+  step2[0] = step1[0] + step1[7];
+  step2[1] = step1[1] + step1[6];
+  step2[2] = step1[2] + step1[5];
+  step2[3] = step1[3] + step1[4];
+  step2[4] = step1[3] - step1[4];
+  step2[5] = step1[2] - step1[5];
+  step2[6] = step1[1] - step1[6];
+  step2[7] = step1[0] - step1[7];
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = step2[0] + step2[15];
+  output[1] = step2[1] + step2[14];
+  output[2] = step2[2] + step2[13];
+  output[3] = step2[3] + step2[12];
+  output[4] = step2[4] + step2[11];
+  output[5] = step2[5] + step2[10];
+  output[6] = step2[6] + step2[9];
+  output[7] = step2[7] + step2[8];
+  output[8] = step2[7] - step2[8];
+  output[9] = step2[6] - step2[9];
+  output[10] = step2[5] - step2[10];
+  output[11] = step2[4] - step2[11];
+  output[12] = step2[3] - step2[12];
+  output[13] = step2[2] - step2[13];
+  output[14] = step2[1] - step2[14];
+  output[15] = step2[0] - step2[15];
+}
+
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
+  int16_t out[16 * 16];
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[16], temp_out[16];
+
+  // First transform rows
+  for (i = 0; i < 16; ++i) {
+    idct16_1d(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    idct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                        + dest[j * stride + i]);
+  }
+}
+
+static void iadst16_1d(const int16_t *input, int16_t *output) {
+  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+  int x0 = input[15];
+  int x1 = input[0];
+  int x2 = input[13];
+  int x3 = input[2];
+  int x4 = input[11];
+  int x5 = input[4];
+  int x6 = input[9];
+  int x7 = input[6];
+  int x8 = input[7];
+  int x9 = input[8];
+  int x10 = input[5];
+  int x11 = input[10];
+  int x12 = input[3];
+  int x13 = input[12];
+  int x14 = input[1];
+  int x15 = input[14];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+        | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+  x0 = dct_const_round_shift(s0 + s8);
+  x1 = dct_const_round_shift(s1 + s9);
+  x2 = dct_const_round_shift(s2 + s10);
+  x3 = dct_const_round_shift(s3 + s11);
+  x4 = dct_const_round_shift(s4 + s12);
+  x5 = dct_const_round_shift(s5 + s13);
+  x6 = dct_const_round_shift(s6 + s14);
+  x7 = dct_const_round_shift(s7 + s15);
+  x8 = dct_const_round_shift(s0 - s8);
+  x9 = dct_const_round_shift(s1 - s9);
+  x10 = dct_const_round_shift(s2 - s10);
+  x11 = dct_const_round_shift(s3 - s11);
+  x12 = dct_const_round_shift(s4 - s12);
+  x13 = dct_const_round_shift(s5 - s13);
+  x14 = dct_const_round_shift(s6 - s14);
+  x15 = dct_const_round_shift(s7 - s15);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = dct_const_round_shift(s8 + s12);
+  x9 = dct_const_round_shift(s9 + s13);
+  x10 = dct_const_round_shift(s10 + s14);
+  x11 = dct_const_round_shift(s11 + s15);
+  x12 = dct_const_round_shift(s8 - s12);
+  x13 = dct_const_round_shift(s9 - s13);
+  x14 = dct_const_round_shift(s10 - s14);
+  x15 = dct_const_round_shift(s11 - s15);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = dct_const_round_shift(s12 + s14);
+  x13 = dct_const_round_shift(s13 + s15);
+  x14 = dct_const_round_shift(s12 - s14);
+  x15 = dct_const_round_shift(s13 - s15);
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+  x10 = dct_const_round_shift(s10);
+  x11 = dct_const_round_shift(s11);
+  x14 = dct_const_round_shift(s14);
+  x15 = dct_const_round_shift(s15);
+
+  output[0] = x0;
+  output[1] = -x8;
+  output[2] = x12;
+  output[3] = -x4;
+  output[4] = x6;
+  output[5] = x14;
+  output[6] = x10;
+  output[7] = x2;
+  output[8] = x3;
+  output[9] = x11;
+  output[10] = x15;
+  output[11] = x7;
+  output[12] = x5;
+  output[13] = -x13;
+  output[14] = x9;
+  output[15] = -x1;
+}
+
+static const transform_2d IHT_16[] = {
+  { idct16_1d, idct16_1d },    // DCT_DCT  = 0
+  { iadst16_1d, idct16_1d },   // ADST_DCT = 1
+  { idct16_1d, iadst16_1d },   // DCT_ADST = 2
+  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
+};
+
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
+  int i, j;
+  int16_t out[16 * 16];
+  int16_t *outptr = out;
+  int16_t temp_in[16], temp_out[16];
+  const transform_2d ht = IHT_16[tx_type];
+
+  // Rows
+  for (i = 0; i < 16; ++i) {
+    ht.rows(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                        + dest[j * stride + i]);
+  }
+}
+
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
+  int16_t out[16 * 16] = { 0 };
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[16], temp_out[16];
+
+  // First transform rows. Since all non-zero dct coefficients are in
+  // upper-left 4x4 area, we only need to calculate first 4 rows here.
+  for (i = 0; i < 4; ++i) {
+    idct16_1d(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j*16 + i];
+    idct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                        + dest[j * stride + i]);
+  }
+}
+
+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  int a1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i)
+      dest[i] = clip_pixel(dest[i] + a1);
+    dest += stride;
+  }
+}
+
+static void idct32_1d(const int16_t *input, int16_t *output) {
+  int16_t step1[32], step2[32];
+  int temp1, temp2;
+
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = dct_const_round_shift(temp1);
+  step1[31] = dct_const_round_shift(temp2);
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = dct_const_round_shift(temp1);
+  step1[30] = dct_const_round_shift(temp2);
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = dct_const_round_shift(temp1);
+  step1[29] = dct_const_round_shift(temp2);
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = dct_const_round_shift(temp1);
+  step1[28] = dct_const_round_shift(temp2);
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = dct_const_round_shift(temp1);
+  step1[27] = dct_const_round_shift(temp2);
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = dct_const_round_shift(temp1);
+  step1[25] = dct_const_round_shift(temp2);
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = dct_const_round_shift(temp1);
+  step1[24] = dct_const_round_shift(temp2);
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = dct_const_round_shift(temp1);
+  step2[15] = dct_const_round_shift(temp2);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+
+  step2[16] = step1[16] + step1[17];
+  step2[17] = step1[16] - step1[17];
+  step2[18] = -step1[18] + step1[19];
+  step2[19] = step1[18] + step1[19];
+  step2[20] = step1[20] + step1[21];
+  step2[21] = step1[20] - step1[21];
+  step2[22] = -step1[22] + step1[23];
+  step2[23] = step1[22] + step1[23];
+  step2[24] = step1[24] + step1[25];
+  step2[25] = step1[24] - step1[25];
+  step2[26] = -step1[26] + step1[27];
+  step2[27] = step1[26] + step1[27];
+  step2[28] = step1[28] + step1[29];
+  step2[29] = step1[28] - step1[29];
+  step2[30] = -step1[30] + step1[31];
+  step2[31] = step1[30] + step1[31];
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = dct_const_round_shift(temp1);
+  step1[7] = dct_const_round_shift(temp2);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+
+  step1[8] = step2[8] + step2[9];
+  step1[9] = step2[8] - step2[9];
+  step1[10] = -step2[10] + step2[11];
+  step1[11] = step2[10] + step2[11];
+  step1[12] = step2[12] + step2[13];
+  step1[13] = step2[12] - step2[13];
+  step1[14] = -step2[14] + step2[15];
+  step1[15] = step2[14] + step2[15];
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = dct_const_round_shift(temp1);
+  step1[30] = dct_const_round_shift(temp2);
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = dct_const_round_shift(temp1);
+  step1[29] = dct_const_round_shift(temp2);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = dct_const_round_shift(temp1);
+  step1[25] = dct_const_round_shift(temp2);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = dct_const_round_shift(temp1);
+  step2[1] = dct_const_round_shift(temp2);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = dct_const_round_shift(temp1);
+  step2[3] = dct_const_round_shift(temp2);
+  step2[4] = step1[4] + step1[5];
+  step2[5] = step1[4] - step1[5];
+  step2[6] = -step1[6] + step1[7];
+  step2[7] = step1[6] + step1[7];
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = dct_const_round_shift(temp1);
+  step2[14] = dct_const_round_shift(temp2);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = step1[16] + step1[19];
+  step2[17] = step1[17] + step1[18];
+  step2[18] = step1[17] - step1[18];
+  step2[19] = step1[16] - step1[19];
+  step2[20] = -step1[20] + step1[23];
+  step2[21] = -step1[21] + step1[22];
+  step2[22] = step1[21] + step1[22];
+  step2[23] = step1[20] + step1[23];
+
+  step2[24] = step1[24] + step1[27];
+  step2[25] = step1[25] + step1[26];
+  step2[26] = step1[25] - step1[26];
+  step2[27] = step1[24] - step1[27];
+  step2[28] = -step1[28] + step1[31];
+  step2[29] = -step1[29] + step1[30];
+  step2[30] = step1[29] + step1[30];
+  step2[31] = step1[28] + step1[31];
+
+  // stage 5
+  step1[0] = step2[0] + step2[3];
+  step1[1] = step2[1] + step2[2];
+  step1[2] = step2[1] - step2[2];
+  step1[3] = step2[0] - step2[3];
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = dct_const_round_shift(temp1);
+  step1[6] = dct_const_round_shift(temp2);
+  step1[7] = step2[7];
+
+  step1[8] = step2[8] + step2[11];
+  step1[9] = step2[9] + step2[10];
+  step1[10] = step2[9] - step2[10];
+  step1[11] = step2[8] - step2[11];
+  step1[12] = -step2[12] + step2[15];
+  step1[13] = -step2[13] + step2[14];
+  step1[14] = step2[13] + step2[14];
+  step1[15] = step2[12] + step2[15];
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = dct_const_round_shift(temp1);
+  step1[29] = dct_const_round_shift(temp2);
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = dct_const_round_shift(temp1);
+  step1[28] = dct_const_round_shift(temp2);
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = dct_const_round_shift(temp1);
+  step1[27] = dct_const_round_shift(temp2);
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = step1[0] + step1[7];
+  step2[1] = step1[1] + step1[6];
+  step2[2] = step1[2] + step1[5];
+  step2[3] = step1[3] + step1[4];
+  step2[4] = step1[3] - step1[4];
+  step2[5] = step1[2] - step1[5];
+  step2[6] = step1[1] - step1[6];
+  step2[7] = step1[0] - step1[7];
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = dct_const_round_shift(temp1);
+  step2[13] = dct_const_round_shift(temp2);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = dct_const_round_shift(temp1);
+  step2[12] = dct_const_round_shift(temp2);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  step2[16] = step1[16] + step1[23];
+  step2[17] = step1[17] + step1[22];
+  step2[18] = step1[18] + step1[21];
+  step2[19] = step1[19] + step1[20];
+  step2[20] = step1[19] - step1[20];
+  step2[21] = step1[18] - step1[21];
+  step2[22] = step1[17] - step1[22];
+  step2[23] = step1[16] - step1[23];
+
+  step2[24] = -step1[24] + step1[31];
+  step2[25] = -step1[25] + step1[30];
+  step2[26] = -step1[26] + step1[29];
+  step2[27] = -step1[27] + step1[28];
+  step2[28] = step1[27] + step1[28];
+  step2[29] = step1[26] + step1[29];
+  step2[30] = step1[25] + step1[30];
+  step2[31] = step1[24] + step1[31];
+
+  // stage 7
+  step1[0] = step2[0] + step2[15];
+  step1[1] = step2[1] + step2[14];
+  step1[2] = step2[2] + step2[13];
+  step1[3] = step2[3] + step2[12];
+  step1[4] = step2[4] + step2[11];
+  step1[5] = step2[5] + step2[10];
+  step1[6] = step2[6] + step2[9];
+  step1[7] = step2[7] + step2[8];
+  step1[8] = step2[7] - step2[8];
+  step1[9] = step2[6] - step2[9];
+  step1[10] = step2[5] - step2[10];
+  step1[11] = step2[4] - step2[11];
+  step1[12] = step2[3] - step2[12];
+  step1[13] = step2[2] - step2[13];
+  step1[14] = step2[1] - step2[14];
+  step1[15] = step2[0] - step2[15];
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = dct_const_round_shift(temp1);
+  step1[27] = dct_const_round_shift(temp2);
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = dct_const_round_shift(temp1);
+  step1[26] = dct_const_round_shift(temp2);
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = dct_const_round_shift(temp1);
+  step1[25] = dct_const_round_shift(temp2);
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = dct_const_round_shift(temp1);
+  step1[24] = dct_const_round_shift(temp2);
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
+  output[0] = step1[0] + step1[31];
+  output[1] = step1[1] + step1[30];
+  output[2] = step1[2] + step1[29];
+  output[3] = step1[3] + step1[28];
+  output[4] = step1[4] + step1[27];
+  output[5] = step1[5] + step1[26];
+  output[6] = step1[6] + step1[25];
+  output[7] = step1[7] + step1[24];
+  output[8] = step1[8] + step1[23];
+  output[9] = step1[9] + step1[22];
+  output[10] = step1[10] + step1[21];
+  output[11] = step1[11] + step1[20];
+  output[12] = step1[12] + step1[19];
+  output[13] = step1[13] + step1[18];
+  output[14] = step1[14] + step1[17];
+  output[15] = step1[15] + step1[16];
+  output[16] = step1[15] - step1[16];
+  output[17] = step1[14] - step1[17];
+  output[18] = step1[13] - step1[18];
+  output[19] = step1[12] - step1[19];
+  output[20] = step1[11] - step1[20];
+  output[21] = step1[10] - step1[21];
+  output[22] = step1[9] - step1[22];
+  output[23] = step1[8] - step1[23];
+  output[24] = step1[7] - step1[24];
+  output[25] = step1[6] - step1[25];
+  output[26] = step1[5] - step1[26];
+  output[27] = step1[4] - step1[27];
+  output[28] = step1[3] - step1[28];
+  output[29] = step1[2] - step1[29];
+  output[30] = step1[1] - step1[30];
+  output[31] = step1[0] - step1[31];
+}
+
+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
+  int16_t out[32 * 32];
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[32], temp_out[32];
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    int16_t zero_coeff[16];
+    for (j = 0; j < 16; ++j)
+      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+    for (j = 0; j < 8; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 4; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 2; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+    if (zero_coeff[0] | zero_coeff[1])
+      idct32_1d(input, outptr);
+    else
+      vpx_memset(outptr, 0, sizeof(int16_t) * 32);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_1d(temp_in, temp_out);
+    for (j = 0; j < 32; ++j)
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                        + dest[j * stride + i]);
+  }
+}
+
+void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
+  int16_t out[32 * 32] = {0};
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[32], temp_out[32];
+
+  // Rows
+  // only upper-left 8x8 has non-zero coeff
+  for (i = 0; i < 8; ++i) {
+    idct32_1d(input, outptr);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_1d(temp_in, temp_out);
+    for (j = 0; j < 32; ++j)
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                        + dest[j * stride + i]);
+  }
+}
+
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  int a1;
+
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  for (j = 0; j < 32; ++j) {
+    for (i = 0; i < 32; ++i)
+      dest[i] = clip_pixel(dest[i] + a1);
+    dest += stride;
+  }
+}
+
+// idct
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+  if (eob > 1)
+    vp9_idct4x4_16_add(input, dest, stride);
+  else
+    vp9_idct4x4_1_add(input, dest, stride);
+}
+
+
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+  if (eob > 1)
+    vp9_iwht4x4_16_add(input, dest, stride);
+  else
+    vp9_iwht4x4_1_add(input, dest, stride);
+}
+
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+  // If dc is 1, then input[0] is the reconstructed value, do not need
+  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
+
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+  // Combine that with code here.
+  if (eob) {
+    if (eob == 1)
+      // DC only DCT coefficient
+      vp9_idct8x8_1_add(input, dest, stride);
+    else if (eob <= 10)
+      vp9_idct8x8_10_add(input, dest, stride);
+    else
+      vp9_idct8x8_64_add(input, dest, stride);
+  }
+}
+
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob) {
+  /* The calculation can be simplified if there are not many non-zero dct
+   * coefficients. Use eobs to separate different cases. */
+  if (eob) {
+    if (eob == 1)
+      /* DC only DCT coefficient. */
+      vp9_idct16x16_1_add(input, dest, stride);
+    else if (eob <= 10)
+      vp9_idct16x16_10_add(input, dest, stride);
+    else
+      vp9_idct16x16_256_add(input, dest, stride);
+  }
+}
+
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob) {
+  if (eob) {
+    if (eob == 1)
+      vp9_idct32x32_1_add(input, dest, stride);
+    else if (eob <= 34)
+      // non-zero coeff only in upper-left 8x8
+      vp9_idct32x32_34_add(input, dest, stride);
+    else
+      vp9_idct32x32_1024_add(input, dest, stride);
+  }
+}
+
+// iht
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob) {
+  if (tx_type == DCT_DCT)
+    vp9_idct4x4_add(input, dest, stride, eob);
+  else
+    vp9_iht4x4_16_add(input, dest, stride, tx_type);
+}
+
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob) {
+  if (tx_type == DCT_DCT) {
+    vp9_idct8x8_add(input, dest, stride, eob);
+  } else {
+    if (eob > 0) {
+      vp9_iht8x8_64_add(input, dest, stride, tx_type);
+    }
+  }
+}
+
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                      int stride, int eob) {
+  if (tx_type == DCT_DCT) {
+    vp9_idct16x16_add(input, dest, stride, eob);
+  } else {
+    if (eob > 0) {
+      vp9_iht16x16_256_add(input, dest, stride, tx_type);
+    }
+  }
+}
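Note on the eob == 1 shortcuts above: when only the DC coefficient is non-zero, the two 1-D passes each reduce to a multiply by cospi_16_64 followed by the rounding shift, so every pixel of the block receives the same residual a1 (final shift 4 for 4x4, 5 for 8x8, 6 for 16x16 and 32x32). The standalone sketch below reproduces that arithmetic outside the library; the SKETCH_* names are invented here, and the constant values (cospi_16_64 = 11585, DCT_CONST_BITS = 14) are assumed to match vp9_idct.h rather than taken from this patch.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_COSPI_16_64 11585        /* assumed: round(cos(pi/4) * 2^14) */
#define SKETCH_DCT_CONST_BITS 14        /* assumed: DCT_CONST_BITS          */
#define SKETCH_ROUND_POW2(v, n) (((v) + (1 << ((n) - 1))) >> (n))

/* Mirrors the a1 computation used by the vp9_idctNxN_1_add_c paths. */
static int dc_only_residual(int16_t dc_coeff, int final_shift) {
  int out = SKETCH_ROUND_POW2(dc_coeff * SKETCH_COSPI_16_64,
                              SKETCH_DCT_CONST_BITS);   /* row pass    */
  out = SKETCH_ROUND_POW2(out * SKETCH_COSPI_16_64,
                          SKETCH_DCT_CONST_BITS);       /* column pass */
  return SKETCH_ROUND_POW2(out, final_shift);           /* block-size rounding */
}

int main(void) {
  /* A dequantized DC coefficient of 64 adds roughly 1 to every pixel of an
     8x8 block: two passes scale by ~0.707 each (~0.5 total), then >> 5. */
  printf("8x8 DC-only residual: %d\n", dc_only_residual(64, 5));
  return 0;
}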