media/libvpx/vp9/common/vp9_idct.c

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 22 Jan 2015 13:21:57 +0100
branch       TOR_BUG_9701
changeset    15:b8a032363ba2
permissions  -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 *
michael@0 4 * Use of this source code is governed by a BSD-style license
michael@0 5 * that can be found in the LICENSE file in the root of the source
michael@0 6 * tree. An additional intellectual property rights grant can be found
michael@0 7 * in the file PATENTS. All contributing project authors may
michael@0 8 * be found in the AUTHORS file in the root of the source tree.
michael@0 9 */
michael@0 10
michael@0 11 #include <assert.h>
michael@0 12 #include <math.h>
michael@0 13
michael@0 14 #include "./vpx_config.h"
michael@0 15 #include "./vp9_rtcd.h"
michael@0 16 #include "vp9/common/vp9_systemdependent.h"
michael@0 17 #include "vp9/common/vp9_blockd.h"
michael@0 18 #include "vp9/common/vp9_common.h"
michael@0 19 #include "vp9/common/vp9_idct.h"
michael@0 20
michael@0 21 void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 22 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
michael@0 23 0.5 shifts per pixel. */
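  /* Each 4-sample butterfly below uses 7 adds/subtracts and 1 shift; the row
     pass and the column pass together run it 8 times over the 16 pixels of
     the block, i.e. 56/16 = 3.5 adds and 8/16 = 0.5 shifts per pixel. */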
michael@0 24 int i;
michael@0 25 int16_t output[16];
michael@0 26 int a1, b1, c1, d1, e1;
michael@0 27 const int16_t *ip = input;
michael@0 28 int16_t *op = output;
michael@0 29
michael@0 30 for (i = 0; i < 4; i++) {
michael@0 31 a1 = ip[0] >> UNIT_QUANT_SHIFT;
michael@0 32 c1 = ip[1] >> UNIT_QUANT_SHIFT;
michael@0 33 d1 = ip[2] >> UNIT_QUANT_SHIFT;
michael@0 34 b1 = ip[3] >> UNIT_QUANT_SHIFT;
michael@0 35 a1 += c1;
michael@0 36 d1 -= b1;
michael@0 37 e1 = (a1 - d1) >> 1;
michael@0 38 b1 = e1 - b1;
michael@0 39 c1 = e1 - c1;
michael@0 40 a1 -= b1;
michael@0 41 d1 += c1;
michael@0 42 op[0] = a1;
michael@0 43 op[1] = b1;
michael@0 44 op[2] = c1;
michael@0 45 op[3] = d1;
michael@0 46 ip += 4;
michael@0 47 op += 4;
michael@0 48 }
michael@0 49
michael@0 50 ip = output;
michael@0 51 for (i = 0; i < 4; i++) {
michael@0 52 a1 = ip[4 * 0];
michael@0 53 c1 = ip[4 * 1];
michael@0 54 d1 = ip[4 * 2];
michael@0 55 b1 = ip[4 * 3];
michael@0 56 a1 += c1;
michael@0 57 d1 -= b1;
michael@0 58 e1 = (a1 - d1) >> 1;
michael@0 59 b1 = e1 - b1;
michael@0 60 c1 = e1 - c1;
michael@0 61 a1 -= b1;
michael@0 62 d1 += c1;
michael@0 63 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
michael@0 64 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
michael@0 65 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
michael@0 66 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
michael@0 67
michael@0 68 ip++;
michael@0 69 dest++;
michael@0 70 }
michael@0 71 }
michael@0 72
michael@0 73 void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
michael@0 74 int i;
michael@0 75 int a1, e1;
michael@0 76 int16_t tmp[4];
michael@0 77 const int16_t *ip = in;
michael@0 78 int16_t *op = tmp;
michael@0 79
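  /* DC-only shortcut: the row pass leaves non-zero values only in the first
     row (tmp[0..3]), and for a column whose single non-zero entry is t the
     full butterfly reduces to t - (t >> 1) for row 0 and t >> 1 for the
     other three rows, which is exactly what the loop below adds into dest. */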
michael@0 80 a1 = ip[0] >> UNIT_QUANT_SHIFT;
michael@0 81 e1 = a1 >> 1;
michael@0 82 a1 -= e1;
michael@0 83 op[0] = a1;
michael@0 84 op[1] = op[2] = op[3] = e1;
michael@0 85
michael@0 86 ip = tmp;
michael@0 87 for (i = 0; i < 4; i++) {
michael@0 88 e1 = ip[0] >> 1;
michael@0 89 a1 = ip[0] - e1;
michael@0 90 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
michael@0 91 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
michael@0 92 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
michael@0 93 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
michael@0 94 ip++;
michael@0 95 dest++;
michael@0 96 }
michael@0 97 }
michael@0 98
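/* 4-point inverse DCT butterfly. The cospi_k_64 constants are cos(k*pi/64)
   in Q14 fixed point (see vp9_idct.h), and dct_const_round_shift() rounds
   each product back down to coefficient precision. */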
michael@0 99 static void idct4_1d(const int16_t *input, int16_t *output) {
michael@0 100 int16_t step[4];
michael@0 101 int temp1, temp2;
michael@0 102 // stage 1
michael@0 103 temp1 = (input[0] + input[2]) * cospi_16_64;
michael@0 104 temp2 = (input[0] - input[2]) * cospi_16_64;
michael@0 105 step[0] = dct_const_round_shift(temp1);
michael@0 106 step[1] = dct_const_round_shift(temp2);
michael@0 107 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
michael@0 108 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
michael@0 109 step[2] = dct_const_round_shift(temp1);
michael@0 110 step[3] = dct_const_round_shift(temp2);
michael@0 111
michael@0 112 // stage 2
michael@0 113 output[0] = step[0] + step[3];
michael@0 114 output[1] = step[1] + step[2];
michael@0 115 output[2] = step[1] - step[2];
michael@0 116 output[3] = step[0] - step[3];
michael@0 117 }
michael@0 118
michael@0 119 void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 120 int16_t out[4 * 4];
michael@0 121 int16_t *outptr = out;
michael@0 122 int i, j;
michael@0 123 int16_t temp_in[4], temp_out[4];
michael@0 124
michael@0 125 // Rows
michael@0 126 for (i = 0; i < 4; ++i) {
michael@0 127 idct4_1d(input, outptr);
michael@0 128 input += 4;
michael@0 129 outptr += 4;
michael@0 130 }
michael@0 131
michael@0 132 // Columns
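  // The column pass also folds in the final descaling: a round-and-shift by
  // 4 here, and by 5 and 6 in the 8x8 and 16x16/32x32 functions below.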
michael@0 133 for (i = 0; i < 4; ++i) {
michael@0 134 for (j = 0; j < 4; ++j)
michael@0 135 temp_in[j] = out[j * 4 + i];
michael@0 136 idct4_1d(temp_in, temp_out);
michael@0 137 for (j = 0; j < 4; ++j)
michael@0 138 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
michael@0 139 + dest[j * stride + i]);
michael@0 140 }
michael@0 141 }
michael@0 142
michael@0 143 void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
michael@0 144 int i;
michael@0 145 int a1;
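  // DC-only path: input[0] picks up the same cospi_16_64 scaling that the
  // row pass and the column pass of idct4_1d would each apply, then the same
  // final round-and-shift by 4 as vp9_idct4x4_16_add_c, and the resulting
  // constant a1 is added to every pixel of the 4x4 block.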
michael@0 146 int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
michael@0 147 out = dct_const_round_shift(out * cospi_16_64);
michael@0 148 a1 = ROUND_POWER_OF_TWO(out, 4);
michael@0 149
michael@0 150 for (i = 0; i < 4; i++) {
michael@0 151 dest[0] = clip_pixel(dest[0] + a1);
michael@0 152 dest[1] = clip_pixel(dest[1] + a1);
michael@0 153 dest[2] = clip_pixel(dest[2] + a1);
michael@0 154 dest[3] = clip_pixel(dest[3] + a1);
michael@0 155 dest += dest_stride;
michael@0 156 }
michael@0 157 }
michael@0 158
michael@0 159 static void idct8_1d(const int16_t *input, int16_t *output) {
michael@0 160 int16_t step1[8], step2[8];
michael@0 161 int temp1, temp2;
michael@0 162 // stage 1
michael@0 163 step1[0] = input[0];
michael@0 164 step1[2] = input[4];
michael@0 165 step1[1] = input[2];
michael@0 166 step1[3] = input[6];
michael@0 167 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
michael@0 168 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
michael@0 169 step1[4] = dct_const_round_shift(temp1);
michael@0 170 step1[7] = dct_const_round_shift(temp2);
michael@0 171 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
michael@0 172 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
michael@0 173 step1[5] = dct_const_round_shift(temp1);
michael@0 174 step1[6] = dct_const_round_shift(temp2);
michael@0 175
michael@0 176 // stage 2 & stage 3 - even half
michael@0 177 idct4_1d(step1, step1);
michael@0 178
michael@0 179 // stage 2 - odd half
michael@0 180 step2[4] = step1[4] + step1[5];
michael@0 181 step2[5] = step1[4] - step1[5];
michael@0 182 step2[6] = -step1[6] + step1[7];
michael@0 183 step2[7] = step1[6] + step1[7];
michael@0 184
michael@0 185 // stage 3 - odd half
michael@0 186 step1[4] = step2[4];
michael@0 187 temp1 = (step2[6] - step2[5]) * cospi_16_64;
michael@0 188 temp2 = (step2[5] + step2[6]) * cospi_16_64;
michael@0 189 step1[5] = dct_const_round_shift(temp1);
michael@0 190 step1[6] = dct_const_round_shift(temp2);
michael@0 191 step1[7] = step2[7];
michael@0 192
michael@0 193 // stage 4
michael@0 194 output[0] = step1[0] + step1[7];
michael@0 195 output[1] = step1[1] + step1[6];
michael@0 196 output[2] = step1[2] + step1[5];
michael@0 197 output[3] = step1[3] + step1[4];
michael@0 198 output[4] = step1[3] - step1[4];
michael@0 199 output[5] = step1[2] - step1[5];
michael@0 200 output[6] = step1[1] - step1[6];
michael@0 201 output[7] = step1[0] - step1[7];
michael@0 202 }
michael@0 203
michael@0 204 void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 205 int16_t out[8 * 8];
michael@0 206 int16_t *outptr = out;
michael@0 207 int i, j;
michael@0 208 int16_t temp_in[8], temp_out[8];
michael@0 209
michael@0 210 // First transform rows
michael@0 211 for (i = 0; i < 8; ++i) {
michael@0 212 idct8_1d(input, outptr);
michael@0 213 input += 8;
michael@0 214 outptr += 8;
michael@0 215 }
michael@0 216
michael@0 217 // Then transform columns
michael@0 218 for (i = 0; i < 8; ++i) {
michael@0 219 for (j = 0; j < 8; ++j)
michael@0 220 temp_in[j] = out[j * 8 + i];
michael@0 221 idct8_1d(temp_in, temp_out);
michael@0 222 for (j = 0; j < 8; ++j)
michael@0 223 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
michael@0 224 + dest[j * stride + i]);
michael@0 225 }
michael@0 226 }
michael@0 227
michael@0 228 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 229 int i, j;
michael@0 230 int a1;
michael@0 231 int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
michael@0 232 out = dct_const_round_shift(out * cospi_16_64);
michael@0 233 a1 = ROUND_POWER_OF_TWO(out, 5);
michael@0 234 for (j = 0; j < 8; ++j) {
michael@0 235 for (i = 0; i < 8; ++i)
michael@0 236 dest[i] = clip_pixel(dest[i] + a1);
michael@0 237 dest += stride;
michael@0 238 }
michael@0 239 }
michael@0 240
michael@0 241 static void iadst4_1d(const int16_t *input, int16_t *output) {
michael@0 242 int s0, s1, s2, s3, s4, s5, s6, s7;
michael@0 243
michael@0 244 int x0 = input[0];
michael@0 245 int x1 = input[1];
michael@0 246 int x2 = input[2];
michael@0 247 int x3 = input[3];
michael@0 248
michael@0 249 if (!(x0 | x1 | x2 | x3)) {
michael@0 250 output[0] = output[1] = output[2] = output[3] = 0;
michael@0 251 return;
michael@0 252 }
michael@0 253
michael@0 254 s0 = sinpi_1_9 * x0;
michael@0 255 s1 = sinpi_2_9 * x0;
michael@0 256 s2 = sinpi_3_9 * x1;
michael@0 257 s3 = sinpi_4_9 * x2;
michael@0 258 s4 = sinpi_1_9 * x2;
michael@0 259 s5 = sinpi_2_9 * x3;
michael@0 260 s6 = sinpi_4_9 * x3;
michael@0 261 s7 = x0 - x2 + x3;
michael@0 262
michael@0 263 x0 = s0 + s3 + s5;
michael@0 264 x1 = s1 - s4 - s6;
michael@0 265 x2 = sinpi_3_9 * s7;
michael@0 266 x3 = s2;
michael@0 267
michael@0 268 s0 = x0 + x3;
michael@0 269 s1 = x1 + x3;
michael@0 270 s2 = x2;
michael@0 271 s3 = x0 + x1 - x3;
michael@0 272
michael@0 273 // 1-D transform scaling factor is sqrt(2).
michael@0 274 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
michael@0 275 // + 1b (addition) = 29b.
michael@0 276 // Hence the output bit depth is 15b.
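  // (dct_const_round_shift() rounds and shifts right by DCT_CONST_BITS = 14,
  // which is what brings the 29b intermediates down to the 15b outputs.)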
michael@0 277 output[0] = dct_const_round_shift(s0);
michael@0 278 output[1] = dct_const_round_shift(s1);
michael@0 279 output[2] = dct_const_round_shift(s2);
michael@0 280 output[3] = dct_const_round_shift(s3);
michael@0 281 }
michael@0 282
michael@0 283 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
michael@0 284 int tx_type) {
michael@0 285 const transform_2d IHT_4[] = {
michael@0 286 { idct4_1d, idct4_1d }, // DCT_DCT = 0
michael@0 287 { iadst4_1d, idct4_1d }, // ADST_DCT = 1
michael@0 288 { idct4_1d, iadst4_1d }, // DCT_ADST = 2
michael@0 289 { iadst4_1d, iadst4_1d } // ADST_ADST = 3
michael@0 290 };
michael@0 291
michael@0 292 int i, j;
michael@0 293 int16_t out[4 * 4];
michael@0 294 int16_t *outptr = out;
michael@0 295 int16_t temp_in[4], temp_out[4];
michael@0 296
michael@0 297 // inverse transform row vectors
michael@0 298 for (i = 0; i < 4; ++i) {
michael@0 299 IHT_4[tx_type].rows(input, outptr);
michael@0 300 input += 4;
michael@0 301 outptr += 4;
michael@0 302 }
michael@0 303
michael@0 304 // inverse transform column vectors
michael@0 305 for (i = 0; i < 4; ++i) {
michael@0 306 for (j = 0; j < 4; ++j)
michael@0 307 temp_in[j] = out[j * 4 + i];
michael@0 308 IHT_4[tx_type].cols(temp_in, temp_out);
michael@0 309 for (j = 0; j < 4; ++j)
michael@0 310 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
michael@0 311 + dest[j * stride + i]);
michael@0 312 }
michael@0 313 }
michael@0 314 static void iadst8_1d(const int16_t *input, int16_t *output) {
michael@0 315 int s0, s1, s2, s3, s4, s5, s6, s7;
michael@0 316
michael@0 317 int x0 = input[7];
michael@0 318 int x1 = input[0];
michael@0 319 int x2 = input[5];
michael@0 320 int x3 = input[2];
michael@0 321 int x4 = input[3];
michael@0 322 int x5 = input[4];
michael@0 323 int x6 = input[1];
michael@0 324 int x7 = input[6];
michael@0 325
michael@0 326 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
michael@0 327 output[0] = output[1] = output[2] = output[3] = output[4]
michael@0 328 = output[5] = output[6] = output[7] = 0;
michael@0 329 return;
michael@0 330 }
michael@0 331
michael@0 332 // stage 1
michael@0 333 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
michael@0 334 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
michael@0 335 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
michael@0 336 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
michael@0 337 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
michael@0 338 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
michael@0 339 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
michael@0 340 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
michael@0 341
michael@0 342 x0 = dct_const_round_shift(s0 + s4);
michael@0 343 x1 = dct_const_round_shift(s1 + s5);
michael@0 344 x2 = dct_const_round_shift(s2 + s6);
michael@0 345 x3 = dct_const_round_shift(s3 + s7);
michael@0 346 x4 = dct_const_round_shift(s0 - s4);
michael@0 347 x5 = dct_const_round_shift(s1 - s5);
michael@0 348 x6 = dct_const_round_shift(s2 - s6);
michael@0 349 x7 = dct_const_round_shift(s3 - s7);
michael@0 350
michael@0 351 // stage 2
michael@0 352 s0 = x0;
michael@0 353 s1 = x1;
michael@0 354 s2 = x2;
michael@0 355 s3 = x3;
michael@0 356 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
michael@0 357 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
michael@0 358 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
michael@0 359 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
michael@0 360
michael@0 361 x0 = s0 + s2;
michael@0 362 x1 = s1 + s3;
michael@0 363 x2 = s0 - s2;
michael@0 364 x3 = s1 - s3;
michael@0 365 x4 = dct_const_round_shift(s4 + s6);
michael@0 366 x5 = dct_const_round_shift(s5 + s7);
michael@0 367 x6 = dct_const_round_shift(s4 - s6);
michael@0 368 x7 = dct_const_round_shift(s5 - s7);
michael@0 369
michael@0 370 // stage 3
michael@0 371 s2 = cospi_16_64 * (x2 + x3);
michael@0 372 s3 = cospi_16_64 * (x2 - x3);
michael@0 373 s6 = cospi_16_64 * (x6 + x7);
michael@0 374 s7 = cospi_16_64 * (x6 - x7);
michael@0 375
michael@0 376 x2 = dct_const_round_shift(s2);
michael@0 377 x3 = dct_const_round_shift(s3);
michael@0 378 x6 = dct_const_round_shift(s6);
michael@0 379 x7 = dct_const_round_shift(s7);
michael@0 380
michael@0 381 output[0] = x0;
michael@0 382 output[1] = -x4;
michael@0 383 output[2] = x6;
michael@0 384 output[3] = -x2;
michael@0 385 output[4] = x3;
michael@0 386 output[5] = -x7;
michael@0 387 output[6] = x5;
michael@0 388 output[7] = -x1;
michael@0 389 }
michael@0 390
michael@0 391 static const transform_2d IHT_8[] = {
michael@0 392 { idct8_1d, idct8_1d }, // DCT_DCT = 0
michael@0 393 { iadst8_1d, idct8_1d }, // ADST_DCT = 1
michael@0 394 { idct8_1d, iadst8_1d }, // DCT_ADST = 2
michael@0 395 { iadst8_1d, iadst8_1d } // ADST_ADST = 3
michael@0 396 };
michael@0 397
michael@0 398 void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
michael@0 399 int tx_type) {
michael@0 400 int i, j;
michael@0 401 int16_t out[8 * 8];
michael@0 402 int16_t *outptr = out;
michael@0 403 int16_t temp_in[8], temp_out[8];
michael@0 404 const transform_2d ht = IHT_8[tx_type];
michael@0 405
michael@0 406 // inverse transform row vectors
michael@0 407 for (i = 0; i < 8; ++i) {
michael@0 408 ht.rows(input, outptr);
michael@0 409 input += 8;
michael@0 410 outptr += 8;
michael@0 411 }
michael@0 412
michael@0 413 // inverse transform column vectors
michael@0 414 for (i = 0; i < 8; ++i) {
michael@0 415 for (j = 0; j < 8; ++j)
michael@0 416 temp_in[j] = out[j * 8 + i];
michael@0 417 ht.cols(temp_in, temp_out);
michael@0 418 for (j = 0; j < 8; ++j)
michael@0 419 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
michael@0 420 + dest[j * stride + i]);
michael@0 421 }
michael@0 422 }
michael@0 423
michael@0 424 void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 425 int16_t out[8 * 8] = { 0 };
michael@0 426 int16_t *outptr = out;
michael@0 427 int i, j;
michael@0 428 int16_t temp_in[8], temp_out[8];
michael@0 429
michael@0 430 // First transform rows
michael@0 431 // only the first 4 rows have non-zero coefficients
michael@0 432 for (i = 0; i < 4; ++i) {
michael@0 433 idct8_1d(input, outptr);
michael@0 434 input += 8;
michael@0 435 outptr += 8;
michael@0 436 }
michael@0 437
michael@0 438 // Then transform columns
michael@0 439 for (i = 0; i < 8; ++i) {
michael@0 440 for (j = 0; j < 8; ++j)
michael@0 441 temp_in[j] = out[j * 8 + i];
michael@0 442 idct8_1d(temp_in, temp_out);
michael@0 443 for (j = 0; j < 8; ++j)
michael@0 444 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
michael@0 445 + dest[j * stride + i]);
michael@0 446 }
michael@0 447 }
michael@0 448
michael@0 449 static void idct16_1d(const int16_t *input, int16_t *output) {
michael@0 450 int16_t step1[16], step2[16];
michael@0 451 int temp1, temp2;
michael@0 452
michael@0 453 // stage 1
michael@0 454 step1[0] = input[0/2];
michael@0 455 step1[1] = input[16/2];
michael@0 456 step1[2] = input[8/2];
michael@0 457 step1[3] = input[24/2];
michael@0 458 step1[4] = input[4/2];
michael@0 459 step1[5] = input[20/2];
michael@0 460 step1[6] = input[12/2];
michael@0 461 step1[7] = input[28/2];
michael@0 462 step1[8] = input[2/2];
michael@0 463 step1[9] = input[18/2];
michael@0 464 step1[10] = input[10/2];
michael@0 465 step1[11] = input[26/2];
michael@0 466 step1[12] = input[6/2];
michael@0 467 step1[13] = input[22/2];
michael@0 468 step1[14] = input[14/2];
michael@0 469 step1[15] = input[30/2];
michael@0 470
michael@0 471 // stage 2
michael@0 472 step2[0] = step1[0];
michael@0 473 step2[1] = step1[1];
michael@0 474 step2[2] = step1[2];
michael@0 475 step2[3] = step1[3];
michael@0 476 step2[4] = step1[4];
michael@0 477 step2[5] = step1[5];
michael@0 478 step2[6] = step1[6];
michael@0 479 step2[7] = step1[7];
michael@0 480
michael@0 481 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
michael@0 482 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
michael@0 483 step2[8] = dct_const_round_shift(temp1);
michael@0 484 step2[15] = dct_const_round_shift(temp2);
michael@0 485
michael@0 486 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
michael@0 487 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
michael@0 488 step2[9] = dct_const_round_shift(temp1);
michael@0 489 step2[14] = dct_const_round_shift(temp2);
michael@0 490
michael@0 491 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
michael@0 492 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
michael@0 493 step2[10] = dct_const_round_shift(temp1);
michael@0 494 step2[13] = dct_const_round_shift(temp2);
michael@0 495
michael@0 496 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
michael@0 497 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
michael@0 498 step2[11] = dct_const_round_shift(temp1);
michael@0 499 step2[12] = dct_const_round_shift(temp2);
michael@0 500
michael@0 501 // stage 3
michael@0 502 step1[0] = step2[0];
michael@0 503 step1[1] = step2[1];
michael@0 504 step1[2] = step2[2];
michael@0 505 step1[3] = step2[3];
michael@0 506
michael@0 507 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
michael@0 508 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
michael@0 509 step1[4] = dct_const_round_shift(temp1);
michael@0 510 step1[7] = dct_const_round_shift(temp2);
michael@0 511 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
michael@0 512 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
michael@0 513 step1[5] = dct_const_round_shift(temp1);
michael@0 514 step1[6] = dct_const_round_shift(temp2);
michael@0 515
michael@0 516 step1[8] = step2[8] + step2[9];
michael@0 517 step1[9] = step2[8] - step2[9];
michael@0 518 step1[10] = -step2[10] + step2[11];
michael@0 519 step1[11] = step2[10] + step2[11];
michael@0 520 step1[12] = step2[12] + step2[13];
michael@0 521 step1[13] = step2[12] - step2[13];
michael@0 522 step1[14] = -step2[14] + step2[15];
michael@0 523 step1[15] = step2[14] + step2[15];
michael@0 524
michael@0 525 // stage 4
michael@0 526 temp1 = (step1[0] + step1[1]) * cospi_16_64;
michael@0 527 temp2 = (step1[0] - step1[1]) * cospi_16_64;
michael@0 528 step2[0] = dct_const_round_shift(temp1);
michael@0 529 step2[1] = dct_const_round_shift(temp2);
michael@0 530 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
michael@0 531 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
michael@0 532 step2[2] = dct_const_round_shift(temp1);
michael@0 533 step2[3] = dct_const_round_shift(temp2);
michael@0 534 step2[4] = step1[4] + step1[5];
michael@0 535 step2[5] = step1[4] - step1[5];
michael@0 536 step2[6] = -step1[6] + step1[7];
michael@0 537 step2[7] = step1[6] + step1[7];
michael@0 538
michael@0 539 step2[8] = step1[8];
michael@0 540 step2[15] = step1[15];
michael@0 541 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
michael@0 542 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
michael@0 543 step2[9] = dct_const_round_shift(temp1);
michael@0 544 step2[14] = dct_const_round_shift(temp2);
michael@0 545 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
michael@0 546 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
michael@0 547 step2[10] = dct_const_round_shift(temp1);
michael@0 548 step2[13] = dct_const_round_shift(temp2);
michael@0 549 step2[11] = step1[11];
michael@0 550 step2[12] = step1[12];
michael@0 551
michael@0 552 // stage 5
michael@0 553 step1[0] = step2[0] + step2[3];
michael@0 554 step1[1] = step2[1] + step2[2];
michael@0 555 step1[2] = step2[1] - step2[2];
michael@0 556 step1[3] = step2[0] - step2[3];
michael@0 557 step1[4] = step2[4];
michael@0 558 temp1 = (step2[6] - step2[5]) * cospi_16_64;
michael@0 559 temp2 = (step2[5] + step2[6]) * cospi_16_64;
michael@0 560 step1[5] = dct_const_round_shift(temp1);
michael@0 561 step1[6] = dct_const_round_shift(temp2);
michael@0 562 step1[7] = step2[7];
michael@0 563
michael@0 564 step1[8] = step2[8] + step2[11];
michael@0 565 step1[9] = step2[9] + step2[10];
michael@0 566 step1[10] = step2[9] - step2[10];
michael@0 567 step1[11] = step2[8] - step2[11];
michael@0 568 step1[12] = -step2[12] + step2[15];
michael@0 569 step1[13] = -step2[13] + step2[14];
michael@0 570 step1[14] = step2[13] + step2[14];
michael@0 571 step1[15] = step2[12] + step2[15];
michael@0 572
michael@0 573 // stage 6
michael@0 574 step2[0] = step1[0] + step1[7];
michael@0 575 step2[1] = step1[1] + step1[6];
michael@0 576 step2[2] = step1[2] + step1[5];
michael@0 577 step2[3] = step1[3] + step1[4];
michael@0 578 step2[4] = step1[3] - step1[4];
michael@0 579 step2[5] = step1[2] - step1[5];
michael@0 580 step2[6] = step1[1] - step1[6];
michael@0 581 step2[7] = step1[0] - step1[7];
michael@0 582 step2[8] = step1[8];
michael@0 583 step2[9] = step1[9];
michael@0 584 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
michael@0 585 temp2 = (step1[10] + step1[13]) * cospi_16_64;
michael@0 586 step2[10] = dct_const_round_shift(temp1);
michael@0 587 step2[13] = dct_const_round_shift(temp2);
michael@0 588 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
michael@0 589 temp2 = (step1[11] + step1[12]) * cospi_16_64;
michael@0 590 step2[11] = dct_const_round_shift(temp1);
michael@0 591 step2[12] = dct_const_round_shift(temp2);
michael@0 592 step2[14] = step1[14];
michael@0 593 step2[15] = step1[15];
michael@0 594
michael@0 595 // stage 7
michael@0 596 output[0] = step2[0] + step2[15];
michael@0 597 output[1] = step2[1] + step2[14];
michael@0 598 output[2] = step2[2] + step2[13];
michael@0 599 output[3] = step2[3] + step2[12];
michael@0 600 output[4] = step2[4] + step2[11];
michael@0 601 output[5] = step2[5] + step2[10];
michael@0 602 output[6] = step2[6] + step2[9];
michael@0 603 output[7] = step2[7] + step2[8];
michael@0 604 output[8] = step2[7] - step2[8];
michael@0 605 output[9] = step2[6] - step2[9];
michael@0 606 output[10] = step2[5] - step2[10];
michael@0 607 output[11] = step2[4] - step2[11];
michael@0 608 output[12] = step2[3] - step2[12];
michael@0 609 output[13] = step2[2] - step2[13];
michael@0 610 output[14] = step2[1] - step2[14];
michael@0 611 output[15] = step2[0] - step2[15];
michael@0 612 }
michael@0 613
michael@0 614 void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 615 int16_t out[16 * 16];
michael@0 616 int16_t *outptr = out;
michael@0 617 int i, j;
michael@0 618 int16_t temp_in[16], temp_out[16];
michael@0 619
michael@0 620 // First transform rows
michael@0 621 for (i = 0; i < 16; ++i) {
michael@0 622 idct16_1d(input, outptr);
michael@0 623 input += 16;
michael@0 624 outptr += 16;
michael@0 625 }
michael@0 626
michael@0 627 // Then transform columns
michael@0 628 for (i = 0; i < 16; ++i) {
michael@0 629 for (j = 0; j < 16; ++j)
michael@0 630 temp_in[j] = out[j * 16 + i];
michael@0 631 idct16_1d(temp_in, temp_out);
michael@0 632 for (j = 0; j < 16; ++j)
michael@0 633 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
michael@0 634 + dest[j * stride + i]);
michael@0 635 }
michael@0 636 }
michael@0 637
michael@0 638 static void iadst16_1d(const int16_t *input, int16_t *output) {
michael@0 639 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
michael@0 640
michael@0 641 int x0 = input[15];
michael@0 642 int x1 = input[0];
michael@0 643 int x2 = input[13];
michael@0 644 int x3 = input[2];
michael@0 645 int x4 = input[11];
michael@0 646 int x5 = input[4];
michael@0 647 int x6 = input[9];
michael@0 648 int x7 = input[6];
michael@0 649 int x8 = input[7];
michael@0 650 int x9 = input[8];
michael@0 651 int x10 = input[5];
michael@0 652 int x11 = input[10];
michael@0 653 int x12 = input[3];
michael@0 654 int x13 = input[12];
michael@0 655 int x14 = input[1];
michael@0 656 int x15 = input[14];
michael@0 657
michael@0 658 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
michael@0 659 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
michael@0 660 output[0] = output[1] = output[2] = output[3] = output[4]
michael@0 661 = output[5] = output[6] = output[7] = output[8]
michael@0 662 = output[9] = output[10] = output[11] = output[12]
michael@0 663 = output[13] = output[14] = output[15] = 0;
michael@0 664 return;
michael@0 665 }
michael@0 666
michael@0 667 // stage 1
michael@0 668 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
michael@0 669 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
michael@0 670 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
michael@0 671 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
michael@0 672 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
michael@0 673 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
michael@0 674 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
michael@0 675 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
michael@0 676 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
michael@0 677 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
michael@0 678 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
michael@0 679 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
michael@0 680 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
michael@0 681 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
michael@0 682 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
michael@0 683 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
michael@0 684
michael@0 685 x0 = dct_const_round_shift(s0 + s8);
michael@0 686 x1 = dct_const_round_shift(s1 + s9);
michael@0 687 x2 = dct_const_round_shift(s2 + s10);
michael@0 688 x3 = dct_const_round_shift(s3 + s11);
michael@0 689 x4 = dct_const_round_shift(s4 + s12);
michael@0 690 x5 = dct_const_round_shift(s5 + s13);
michael@0 691 x6 = dct_const_round_shift(s6 + s14);
michael@0 692 x7 = dct_const_round_shift(s7 + s15);
michael@0 693 x8 = dct_const_round_shift(s0 - s8);
michael@0 694 x9 = dct_const_round_shift(s1 - s9);
michael@0 695 x10 = dct_const_round_shift(s2 - s10);
michael@0 696 x11 = dct_const_round_shift(s3 - s11);
michael@0 697 x12 = dct_const_round_shift(s4 - s12);
michael@0 698 x13 = dct_const_round_shift(s5 - s13);
michael@0 699 x14 = dct_const_round_shift(s6 - s14);
michael@0 700 x15 = dct_const_round_shift(s7 - s15);
michael@0 701
michael@0 702 // stage 2
michael@0 703 s0 = x0;
michael@0 704 s1 = x1;
michael@0 705 s2 = x2;
michael@0 706 s3 = x3;
michael@0 707 s4 = x4;
michael@0 708 s5 = x5;
michael@0 709 s6 = x6;
michael@0 710 s7 = x7;
michael@0 711 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
michael@0 712 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
michael@0 713 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
michael@0 714 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
michael@0 715 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
michael@0 716 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
michael@0 717 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
michael@0 718 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
michael@0 719
michael@0 720 x0 = s0 + s4;
michael@0 721 x1 = s1 + s5;
michael@0 722 x2 = s2 + s6;
michael@0 723 x3 = s3 + s7;
michael@0 724 x4 = s0 - s4;
michael@0 725 x5 = s1 - s5;
michael@0 726 x6 = s2 - s6;
michael@0 727 x7 = s3 - s7;
michael@0 728 x8 = dct_const_round_shift(s8 + s12);
michael@0 729 x9 = dct_const_round_shift(s9 + s13);
michael@0 730 x10 = dct_const_round_shift(s10 + s14);
michael@0 731 x11 = dct_const_round_shift(s11 + s15);
michael@0 732 x12 = dct_const_round_shift(s8 - s12);
michael@0 733 x13 = dct_const_round_shift(s9 - s13);
michael@0 734 x14 = dct_const_round_shift(s10 - s14);
michael@0 735 x15 = dct_const_round_shift(s11 - s15);
michael@0 736
michael@0 737 // stage 3
michael@0 738 s0 = x0;
michael@0 739 s1 = x1;
michael@0 740 s2 = x2;
michael@0 741 s3 = x3;
michael@0 742 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
michael@0 743 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
michael@0 744 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
michael@0 745 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
michael@0 746 s8 = x8;
michael@0 747 s9 = x9;
michael@0 748 s10 = x10;
michael@0 749 s11 = x11;
michael@0 750 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
michael@0 751 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
michael@0 752 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
michael@0 753 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
michael@0 754
michael@0 755 x0 = s0 + s2;
michael@0 756 x1 = s1 + s3;
michael@0 757 x2 = s0 - s2;
michael@0 758 x3 = s1 - s3;
michael@0 759 x4 = dct_const_round_shift(s4 + s6);
michael@0 760 x5 = dct_const_round_shift(s5 + s7);
michael@0 761 x6 = dct_const_round_shift(s4 - s6);
michael@0 762 x7 = dct_const_round_shift(s5 - s7);
michael@0 763 x8 = s8 + s10;
michael@0 764 x9 = s9 + s11;
michael@0 765 x10 = s8 - s10;
michael@0 766 x11 = s9 - s11;
michael@0 767 x12 = dct_const_round_shift(s12 + s14);
michael@0 768 x13 = dct_const_round_shift(s13 + s15);
michael@0 769 x14 = dct_const_round_shift(s12 - s14);
michael@0 770 x15 = dct_const_round_shift(s13 - s15);
michael@0 771
michael@0 772 // stage 4
michael@0 773 s2 = (- cospi_16_64) * (x2 + x3);
michael@0 774 s3 = cospi_16_64 * (x2 - x3);
michael@0 775 s6 = cospi_16_64 * (x6 + x7);
michael@0 776 s7 = cospi_16_64 * (- x6 + x7);
michael@0 777 s10 = cospi_16_64 * (x10 + x11);
michael@0 778 s11 = cospi_16_64 * (- x10 + x11);
michael@0 779 s14 = (- cospi_16_64) * (x14 + x15);
michael@0 780 s15 = cospi_16_64 * (x14 - x15);
michael@0 781
michael@0 782 x2 = dct_const_round_shift(s2);
michael@0 783 x3 = dct_const_round_shift(s3);
michael@0 784 x6 = dct_const_round_shift(s6);
michael@0 785 x7 = dct_const_round_shift(s7);
michael@0 786 x10 = dct_const_round_shift(s10);
michael@0 787 x11 = dct_const_round_shift(s11);
michael@0 788 x14 = dct_const_round_shift(s14);
michael@0 789 x15 = dct_const_round_shift(s15);
michael@0 790
michael@0 791 output[0] = x0;
michael@0 792 output[1] = -x8;
michael@0 793 output[2] = x12;
michael@0 794 output[3] = -x4;
michael@0 795 output[4] = x6;
michael@0 796 output[5] = x14;
michael@0 797 output[6] = x10;
michael@0 798 output[7] = x2;
michael@0 799 output[8] = x3;
michael@0 800 output[9] = x11;
michael@0 801 output[10] = x15;
michael@0 802 output[11] = x7;
michael@0 803 output[12] = x5;
michael@0 804 output[13] = -x13;
michael@0 805 output[14] = x9;
michael@0 806 output[15] = -x1;
michael@0 807 }
michael@0 808
michael@0 809 static const transform_2d IHT_16[] = {
michael@0 810 { idct16_1d, idct16_1d }, // DCT_DCT = 0
michael@0 811 { iadst16_1d, idct16_1d }, // ADST_DCT = 1
michael@0 812 { idct16_1d, iadst16_1d }, // DCT_ADST = 2
michael@0 813 { iadst16_1d, iadst16_1d } // ADST_ADST = 3
michael@0 814 };
michael@0 815
michael@0 816 void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
michael@0 817 int tx_type) {
michael@0 818 int i, j;
michael@0 819 int16_t out[16 * 16];
michael@0 820 int16_t *outptr = out;
michael@0 821 int16_t temp_in[16], temp_out[16];
michael@0 822 const transform_2d ht = IHT_16[tx_type];
michael@0 823
michael@0 824 // Rows
michael@0 825 for (i = 0; i < 16; ++i) {
michael@0 826 ht.rows(input, outptr);
michael@0 827 input += 16;
michael@0 828 outptr += 16;
michael@0 829 }
michael@0 830
michael@0 831 // Columns
michael@0 832 for (i = 0; i < 16; ++i) {
michael@0 833 for (j = 0; j < 16; ++j)
michael@0 834 temp_in[j] = out[j * 16 + i];
michael@0 835 ht.cols(temp_in, temp_out);
michael@0 836 for (j = 0; j < 16; ++j)
michael@0 837 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
michael@0 838 + dest[j * stride + i]);
michael@0 839 }
michael@0 840 }
michael@0 841
michael@0 842 void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 843 int16_t out[16 * 16] = { 0 };
michael@0 844 int16_t *outptr = out;
michael@0 845 int i, j;
michael@0 846 int16_t temp_in[16], temp_out[16];
michael@0 847
michael@0 848 // First transform rows. Since all non-zero dct coefficients are in the
michael@0 849 // upper-left 4x4 area, we only need to calculate the first 4 rows here.
michael@0 850 for (i = 0; i < 4; ++i) {
michael@0 851 idct16_1d(input, outptr);
michael@0 852 input += 16;
michael@0 853 outptr += 16;
michael@0 854 }
michael@0 855
michael@0 856 // Then transform columns
michael@0 857 for (i = 0; i < 16; ++i) {
michael@0 858 for (j = 0; j < 16; ++j)
michael@0 859 temp_in[j] = out[j*16 + i];
michael@0 860 idct16_1d(temp_in, temp_out);
michael@0 861 for (j = 0; j < 16; ++j)
michael@0 862 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
michael@0 863 + dest[j * stride + i]);
michael@0 864 }
michael@0 865 }
michael@0 866
michael@0 867 void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 868 int i, j;
michael@0 869 int a1;
michael@0 870 int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
michael@0 871 out = dct_const_round_shift(out * cospi_16_64);
michael@0 872 a1 = ROUND_POWER_OF_TWO(out, 6);
michael@0 873 for (j = 0; j < 16; ++j) {
michael@0 874 for (i = 0; i < 16; ++i)
michael@0 875 dest[i] = clip_pixel(dest[i] + a1);
michael@0 876 dest += stride;
michael@0 877 }
michael@0 878 }
michael@0 879
michael@0 880 static void idct32_1d(const int16_t *input, int16_t *output) {
michael@0 881 int16_t step1[32], step2[32];
michael@0 882 int temp1, temp2;
michael@0 883
michael@0 884 // stage 1
michael@0 885 step1[0] = input[0];
michael@0 886 step1[1] = input[16];
michael@0 887 step1[2] = input[8];
michael@0 888 step1[3] = input[24];
michael@0 889 step1[4] = input[4];
michael@0 890 step1[5] = input[20];
michael@0 891 step1[6] = input[12];
michael@0 892 step1[7] = input[28];
michael@0 893 step1[8] = input[2];
michael@0 894 step1[9] = input[18];
michael@0 895 step1[10] = input[10];
michael@0 896 step1[11] = input[26];
michael@0 897 step1[12] = input[6];
michael@0 898 step1[13] = input[22];
michael@0 899 step1[14] = input[14];
michael@0 900 step1[15] = input[30];
michael@0 901
michael@0 902 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
michael@0 903 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
michael@0 904 step1[16] = dct_const_round_shift(temp1);
michael@0 905 step1[31] = dct_const_round_shift(temp2);
michael@0 906
michael@0 907 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
michael@0 908 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
michael@0 909 step1[17] = dct_const_round_shift(temp1);
michael@0 910 step1[30] = dct_const_round_shift(temp2);
michael@0 911
michael@0 912 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
michael@0 913 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
michael@0 914 step1[18] = dct_const_round_shift(temp1);
michael@0 915 step1[29] = dct_const_round_shift(temp2);
michael@0 916
michael@0 917 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
michael@0 918 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
michael@0 919 step1[19] = dct_const_round_shift(temp1);
michael@0 920 step1[28] = dct_const_round_shift(temp2);
michael@0 921
michael@0 922 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
michael@0 923 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
michael@0 924 step1[20] = dct_const_round_shift(temp1);
michael@0 925 step1[27] = dct_const_round_shift(temp2);
michael@0 926
michael@0 927 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
michael@0 928 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
michael@0 929 step1[21] = dct_const_round_shift(temp1);
michael@0 930 step1[26] = dct_const_round_shift(temp2);
michael@0 931
michael@0 932 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
michael@0 933 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
michael@0 934 step1[22] = dct_const_round_shift(temp1);
michael@0 935 step1[25] = dct_const_round_shift(temp2);
michael@0 936
michael@0 937 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
michael@0 938 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
michael@0 939 step1[23] = dct_const_round_shift(temp1);
michael@0 940 step1[24] = dct_const_round_shift(temp2);
michael@0 941
michael@0 942 // stage 2
michael@0 943 step2[0] = step1[0];
michael@0 944 step2[1] = step1[1];
michael@0 945 step2[2] = step1[2];
michael@0 946 step2[3] = step1[3];
michael@0 947 step2[4] = step1[4];
michael@0 948 step2[5] = step1[5];
michael@0 949 step2[6] = step1[6];
michael@0 950 step2[7] = step1[7];
michael@0 951
michael@0 952 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
michael@0 953 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
michael@0 954 step2[8] = dct_const_round_shift(temp1);
michael@0 955 step2[15] = dct_const_round_shift(temp2);
michael@0 956
michael@0 957 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
michael@0 958 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
michael@0 959 step2[9] = dct_const_round_shift(temp1);
michael@0 960 step2[14] = dct_const_round_shift(temp2);
michael@0 961
michael@0 962 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
michael@0 963 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
michael@0 964 step2[10] = dct_const_round_shift(temp1);
michael@0 965 step2[13] = dct_const_round_shift(temp2);
michael@0 966
michael@0 967 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
michael@0 968 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
michael@0 969 step2[11] = dct_const_round_shift(temp1);
michael@0 970 step2[12] = dct_const_round_shift(temp2);
michael@0 971
michael@0 972 step2[16] = step1[16] + step1[17];
michael@0 973 step2[17] = step1[16] - step1[17];
michael@0 974 step2[18] = -step1[18] + step1[19];
michael@0 975 step2[19] = step1[18] + step1[19];
michael@0 976 step2[20] = step1[20] + step1[21];
michael@0 977 step2[21] = step1[20] - step1[21];
michael@0 978 step2[22] = -step1[22] + step1[23];
michael@0 979 step2[23] = step1[22] + step1[23];
michael@0 980 step2[24] = step1[24] + step1[25];
michael@0 981 step2[25] = step1[24] - step1[25];
michael@0 982 step2[26] = -step1[26] + step1[27];
michael@0 983 step2[27] = step1[26] + step1[27];
michael@0 984 step2[28] = step1[28] + step1[29];
michael@0 985 step2[29] = step1[28] - step1[29];
michael@0 986 step2[30] = -step1[30] + step1[31];
michael@0 987 step2[31] = step1[30] + step1[31];
michael@0 988
michael@0 989 // stage 3
michael@0 990 step1[0] = step2[0];
michael@0 991 step1[1] = step2[1];
michael@0 992 step1[2] = step2[2];
michael@0 993 step1[3] = step2[3];
michael@0 994
michael@0 995 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
michael@0 996 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
michael@0 997 step1[4] = dct_const_round_shift(temp1);
michael@0 998 step1[7] = dct_const_round_shift(temp2);
michael@0 999 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
michael@0 1000 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
michael@0 1001 step1[5] = dct_const_round_shift(temp1);
michael@0 1002 step1[6] = dct_const_round_shift(temp2);
michael@0 1003
michael@0 1004 step1[8] = step2[8] + step2[9];
michael@0 1005 step1[9] = step2[8] - step2[9];
michael@0 1006 step1[10] = -step2[10] + step2[11];
michael@0 1007 step1[11] = step2[10] + step2[11];
michael@0 1008 step1[12] = step2[12] + step2[13];
michael@0 1009 step1[13] = step2[12] - step2[13];
michael@0 1010 step1[14] = -step2[14] + step2[15];
michael@0 1011 step1[15] = step2[14] + step2[15];
michael@0 1012
michael@0 1013 step1[16] = step2[16];
michael@0 1014 step1[31] = step2[31];
michael@0 1015 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
michael@0 1016 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
michael@0 1017 step1[17] = dct_const_round_shift(temp1);
michael@0 1018 step1[30] = dct_const_round_shift(temp2);
michael@0 1019 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
michael@0 1020 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
michael@0 1021 step1[18] = dct_const_round_shift(temp1);
michael@0 1022 step1[29] = dct_const_round_shift(temp2);
michael@0 1023 step1[19] = step2[19];
michael@0 1024 step1[20] = step2[20];
michael@0 1025 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
michael@0 1026 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
michael@0 1027 step1[21] = dct_const_round_shift(temp1);
michael@0 1028 step1[26] = dct_const_round_shift(temp2);
michael@0 1029 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
michael@0 1030 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
michael@0 1031 step1[22] = dct_const_round_shift(temp1);
michael@0 1032 step1[25] = dct_const_round_shift(temp2);
michael@0 1033 step1[23] = step2[23];
michael@0 1034 step1[24] = step2[24];
michael@0 1035 step1[27] = step2[27];
michael@0 1036 step1[28] = step2[28];
michael@0 1037
michael@0 1038 // stage 4
michael@0 1039 temp1 = (step1[0] + step1[1]) * cospi_16_64;
michael@0 1040 temp2 = (step1[0] - step1[1]) * cospi_16_64;
michael@0 1041 step2[0] = dct_const_round_shift(temp1);
michael@0 1042 step2[1] = dct_const_round_shift(temp2);
michael@0 1043 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
michael@0 1044 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
michael@0 1045 step2[2] = dct_const_round_shift(temp1);
michael@0 1046 step2[3] = dct_const_round_shift(temp2);
michael@0 1047 step2[4] = step1[4] + step1[5];
michael@0 1048 step2[5] = step1[4] - step1[5];
michael@0 1049 step2[6] = -step1[6] + step1[7];
michael@0 1050 step2[7] = step1[6] + step1[7];
michael@0 1051
michael@0 1052 step2[8] = step1[8];
michael@0 1053 step2[15] = step1[15];
michael@0 1054 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
michael@0 1055 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
michael@0 1056 step2[9] = dct_const_round_shift(temp1);
michael@0 1057 step2[14] = dct_const_round_shift(temp2);
michael@0 1058 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
michael@0 1059 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
michael@0 1060 step2[10] = dct_const_round_shift(temp1);
michael@0 1061 step2[13] = dct_const_round_shift(temp2);
michael@0 1062 step2[11] = step1[11];
michael@0 1063 step2[12] = step1[12];
michael@0 1064
michael@0 1065 step2[16] = step1[16] + step1[19];
michael@0 1066 step2[17] = step1[17] + step1[18];
michael@0 1067 step2[18] = step1[17] - step1[18];
michael@0 1068 step2[19] = step1[16] - step1[19];
michael@0 1069 step2[20] = -step1[20] + step1[23];
michael@0 1070 step2[21] = -step1[21] + step1[22];
michael@0 1071 step2[22] = step1[21] + step1[22];
michael@0 1072 step2[23] = step1[20] + step1[23];
michael@0 1073
michael@0 1074 step2[24] = step1[24] + step1[27];
michael@0 1075 step2[25] = step1[25] + step1[26];
michael@0 1076 step2[26] = step1[25] - step1[26];
michael@0 1077 step2[27] = step1[24] - step1[27];
michael@0 1078 step2[28] = -step1[28] + step1[31];
michael@0 1079 step2[29] = -step1[29] + step1[30];
michael@0 1080 step2[30] = step1[29] + step1[30];
michael@0 1081 step2[31] = step1[28] + step1[31];
michael@0 1082
michael@0 1083 // stage 5
michael@0 1084 step1[0] = step2[0] + step2[3];
michael@0 1085 step1[1] = step2[1] + step2[2];
michael@0 1086 step1[2] = step2[1] - step2[2];
michael@0 1087 step1[3] = step2[0] - step2[3];
michael@0 1088 step1[4] = step2[4];
michael@0 1089 temp1 = (step2[6] - step2[5]) * cospi_16_64;
michael@0 1090 temp2 = (step2[5] + step2[6]) * cospi_16_64;
michael@0 1091 step1[5] = dct_const_round_shift(temp1);
michael@0 1092 step1[6] = dct_const_round_shift(temp2);
michael@0 1093 step1[7] = step2[7];
michael@0 1094
michael@0 1095 step1[8] = step2[8] + step2[11];
michael@0 1096 step1[9] = step2[9] + step2[10];
michael@0 1097 step1[10] = step2[9] - step2[10];
michael@0 1098 step1[11] = step2[8] - step2[11];
michael@0 1099 step1[12] = -step2[12] + step2[15];
michael@0 1100 step1[13] = -step2[13] + step2[14];
michael@0 1101 step1[14] = step2[13] + step2[14];
michael@0 1102 step1[15] = step2[12] + step2[15];
michael@0 1103
michael@0 1104 step1[16] = step2[16];
michael@0 1105 step1[17] = step2[17];
michael@0 1106 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
michael@0 1107 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
michael@0 1108 step1[18] = dct_const_round_shift(temp1);
michael@0 1109 step1[29] = dct_const_round_shift(temp2);
michael@0 1110 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
michael@0 1111 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
michael@0 1112 step1[19] = dct_const_round_shift(temp1);
michael@0 1113 step1[28] = dct_const_round_shift(temp2);
michael@0 1114 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
michael@0 1115 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
michael@0 1116 step1[20] = dct_const_round_shift(temp1);
michael@0 1117 step1[27] = dct_const_round_shift(temp2);
michael@0 1118 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
michael@0 1119 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
michael@0 1120 step1[21] = dct_const_round_shift(temp1);
michael@0 1121 step1[26] = dct_const_round_shift(temp2);
michael@0 1122 step1[22] = step2[22];
michael@0 1123 step1[23] = step2[23];
michael@0 1124 step1[24] = step2[24];
michael@0 1125 step1[25] = step2[25];
michael@0 1126 step1[30] = step2[30];
michael@0 1127 step1[31] = step2[31];
michael@0 1128
michael@0 1129 // stage 6
michael@0 1130 step2[0] = step1[0] + step1[7];
michael@0 1131 step2[1] = step1[1] + step1[6];
michael@0 1132 step2[2] = step1[2] + step1[5];
michael@0 1133 step2[3] = step1[3] + step1[4];
michael@0 1134 step2[4] = step1[3] - step1[4];
michael@0 1135 step2[5] = step1[2] - step1[5];
michael@0 1136 step2[6] = step1[1] - step1[6];
michael@0 1137 step2[7] = step1[0] - step1[7];
michael@0 1138 step2[8] = step1[8];
michael@0 1139 step2[9] = step1[9];
michael@0 1140 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
michael@0 1141 temp2 = (step1[10] + step1[13]) * cospi_16_64;
michael@0 1142 step2[10] = dct_const_round_shift(temp1);
michael@0 1143 step2[13] = dct_const_round_shift(temp2);
michael@0 1144 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
michael@0 1145 temp2 = (step1[11] + step1[12]) * cospi_16_64;
michael@0 1146 step2[11] = dct_const_round_shift(temp1);
michael@0 1147 step2[12] = dct_const_round_shift(temp2);
michael@0 1148 step2[14] = step1[14];
michael@0 1149 step2[15] = step1[15];
michael@0 1150
michael@0 1151 step2[16] = step1[16] + step1[23];
michael@0 1152 step2[17] = step1[17] + step1[22];
michael@0 1153 step2[18] = step1[18] + step1[21];
michael@0 1154 step2[19] = step1[19] + step1[20];
michael@0 1155 step2[20] = step1[19] - step1[20];
michael@0 1156 step2[21] = step1[18] - step1[21];
michael@0 1157 step2[22] = step1[17] - step1[22];
michael@0 1158 step2[23] = step1[16] - step1[23];
michael@0 1159
michael@0 1160 step2[24] = -step1[24] + step1[31];
michael@0 1161 step2[25] = -step1[25] + step1[30];
michael@0 1162 step2[26] = -step1[26] + step1[29];
michael@0 1163 step2[27] = -step1[27] + step1[28];
michael@0 1164 step2[28] = step1[27] + step1[28];
michael@0 1165 step2[29] = step1[26] + step1[29];
michael@0 1166 step2[30] = step1[25] + step1[30];
michael@0 1167 step2[31] = step1[24] + step1[31];
michael@0 1168
michael@0 1169 // stage 7
michael@0 1170 step1[0] = step2[0] + step2[15];
michael@0 1171 step1[1] = step2[1] + step2[14];
michael@0 1172 step1[2] = step2[2] + step2[13];
michael@0 1173 step1[3] = step2[3] + step2[12];
michael@0 1174 step1[4] = step2[4] + step2[11];
michael@0 1175 step1[5] = step2[5] + step2[10];
michael@0 1176 step1[6] = step2[6] + step2[9];
michael@0 1177 step1[7] = step2[7] + step2[8];
michael@0 1178 step1[8] = step2[7] - step2[8];
michael@0 1179 step1[9] = step2[6] - step2[9];
michael@0 1180 step1[10] = step2[5] - step2[10];
michael@0 1181 step1[11] = step2[4] - step2[11];
michael@0 1182 step1[12] = step2[3] - step2[12];
michael@0 1183 step1[13] = step2[2] - step2[13];
michael@0 1184 step1[14] = step2[1] - step2[14];
michael@0 1185 step1[15] = step2[0] - step2[15];
michael@0 1186
michael@0 1187 step1[16] = step2[16];
michael@0 1188 step1[17] = step2[17];
michael@0 1189 step1[18] = step2[18];
michael@0 1190 step1[19] = step2[19];
michael@0 1191 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
michael@0 1192 temp2 = (step2[20] + step2[27]) * cospi_16_64;
michael@0 1193 step1[20] = dct_const_round_shift(temp1);
michael@0 1194 step1[27] = dct_const_round_shift(temp2);
michael@0 1195 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
michael@0 1196 temp2 = (step2[21] + step2[26]) * cospi_16_64;
michael@0 1197 step1[21] = dct_const_round_shift(temp1);
michael@0 1198 step1[26] = dct_const_round_shift(temp2);
michael@0 1199 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
michael@0 1200 temp2 = (step2[22] + step2[25]) * cospi_16_64;
michael@0 1201 step1[22] = dct_const_round_shift(temp1);
michael@0 1202 step1[25] = dct_const_round_shift(temp2);
michael@0 1203 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
michael@0 1204 temp2 = (step2[23] + step2[24]) * cospi_16_64;
michael@0 1205 step1[23] = dct_const_round_shift(temp1);
michael@0 1206 step1[24] = dct_const_round_shift(temp2);
michael@0 1207 step1[28] = step2[28];
michael@0 1208 step1[29] = step2[29];
michael@0 1209 step1[30] = step2[30];
michael@0 1210 step1[31] = step2[31];
michael@0 1211
michael@0 1212 // final stage
michael@0 1213 output[0] = step1[0] + step1[31];
michael@0 1214 output[1] = step1[1] + step1[30];
michael@0 1215 output[2] = step1[2] + step1[29];
michael@0 1216 output[3] = step1[3] + step1[28];
michael@0 1217 output[4] = step1[4] + step1[27];
michael@0 1218 output[5] = step1[5] + step1[26];
michael@0 1219 output[6] = step1[6] + step1[25];
michael@0 1220 output[7] = step1[7] + step1[24];
michael@0 1221 output[8] = step1[8] + step1[23];
michael@0 1222 output[9] = step1[9] + step1[22];
michael@0 1223 output[10] = step1[10] + step1[21];
michael@0 1224 output[11] = step1[11] + step1[20];
michael@0 1225 output[12] = step1[12] + step1[19];
michael@0 1226 output[13] = step1[13] + step1[18];
michael@0 1227 output[14] = step1[14] + step1[17];
michael@0 1228 output[15] = step1[15] + step1[16];
michael@0 1229 output[16] = step1[15] - step1[16];
michael@0 1230 output[17] = step1[14] - step1[17];
michael@0 1231 output[18] = step1[13] - step1[18];
michael@0 1232 output[19] = step1[12] - step1[19];
michael@0 1233 output[20] = step1[11] - step1[20];
michael@0 1234 output[21] = step1[10] - step1[21];
michael@0 1235 output[22] = step1[9] - step1[22];
michael@0 1236 output[23] = step1[8] - step1[23];
michael@0 1237 output[24] = step1[7] - step1[24];
michael@0 1238 output[25] = step1[6] - step1[25];
michael@0 1239 output[26] = step1[5] - step1[26];
michael@0 1240 output[27] = step1[4] - step1[27];
michael@0 1241 output[28] = step1[3] - step1[28];
michael@0 1242 output[29] = step1[2] - step1[29];
michael@0 1243 output[30] = step1[1] - step1[30];
michael@0 1244 output[31] = step1[0] - step1[31];
michael@0 1245 }
michael@0 1246
michael@0 1247 void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 1248 int16_t out[32 * 32];
michael@0 1249 int16_t *outptr = out;
michael@0 1250 int i, j;
michael@0 1251 int16_t temp_in[32], temp_out[32];
michael@0 1252
michael@0 1253 // Rows
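  // A row whose 32 coefficients are all zero produces an all-zero 1-D result,
  // so OR the coefficients together in a small reduction tree and memset the
  // output row instead of running idct32_1d on it.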
michael@0 1254 for (i = 0; i < 32; ++i) {
michael@0 1255 int16_t zero_coeff[16];
michael@0 1256 for (j = 0; j < 16; ++j)
michael@0 1257 zero_coeff[j] = input[2 * j] | input[2 * j + 1];
michael@0 1258 for (j = 0; j < 8; ++j)
michael@0 1259 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
michael@0 1260 for (j = 0; j < 4; ++j)
michael@0 1261 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
michael@0 1262 for (j = 0; j < 2; ++j)
michael@0 1263 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
michael@0 1264
michael@0 1265 if (zero_coeff[0] | zero_coeff[1])
michael@0 1266 idct32_1d(input, outptr);
michael@0 1267 else
michael@0 1268 vpx_memset(outptr, 0, sizeof(int16_t) * 32);
michael@0 1269 input += 32;
michael@0 1270 outptr += 32;
michael@0 1271 }
michael@0 1272
michael@0 1273 // Columns
michael@0 1274 for (i = 0; i < 32; ++i) {
michael@0 1275 for (j = 0; j < 32; ++j)
michael@0 1276 temp_in[j] = out[j * 32 + i];
michael@0 1277 idct32_1d(temp_in, temp_out);
michael@0 1278 for (j = 0; j < 32; ++j)
michael@0 1279 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
michael@0 1280 + dest[j * stride + i]);
michael@0 1281 }
michael@0 1282 }
michael@0 1283
michael@0 1284 void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 1285 int16_t out[32 * 32] = {0};
michael@0 1286 int16_t *outptr = out;
michael@0 1287 int i, j;
michael@0 1288 int16_t temp_in[32], temp_out[32];
michael@0 1289
michael@0 1290 // Rows
michael@0 1291 // only the upper-left 8x8 block has non-zero coefficients
michael@0 1292 for (i = 0; i < 8; ++i) {
michael@0 1293 idct32_1d(input, outptr);
michael@0 1294 input += 32;
michael@0 1295 outptr += 32;
michael@0 1296 }
michael@0 1297
michael@0 1298 // Columns
michael@0 1299 for (i = 0; i < 32; ++i) {
michael@0 1300 for (j = 0; j < 32; ++j)
michael@0 1301 temp_in[j] = out[j * 32 + i];
michael@0 1302 idct32_1d(temp_in, temp_out);
michael@0 1303 for (j = 0; j < 32; ++j)
michael@0 1304 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
michael@0 1305 + dest[j * stride + i]);
michael@0 1306 }
michael@0 1307 }
michael@0 1308
michael@0 1309 void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
michael@0 1310 int i, j;
michael@0 1311 int a1;
michael@0 1312
michael@0 1313 int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
michael@0 1314 out = dct_const_round_shift(out * cospi_16_64);
michael@0 1315 a1 = ROUND_POWER_OF_TWO(out, 6);
michael@0 1316
michael@0 1317 for (j = 0; j < 32; ++j) {
michael@0 1318 for (i = 0; i < 32; ++i)
michael@0 1319 dest[i] = clip_pixel(dest[i] + a1);
michael@0 1320 dest += stride;
michael@0 1321 }
michael@0 1322 }
michael@0 1323
michael@0 1324 // idct
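// eob counts coefficients up to and including the last non-zero one in scan
// order, so eob == 1 means a DC-only block (the "DC only" branches below)
// and larger values select progressively fuller inverse transforms.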
michael@0 1325 void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
michael@0 1326 if (eob > 1)
michael@0 1327 vp9_idct4x4_16_add(input, dest, stride);
michael@0 1328 else
michael@0 1329 vp9_idct4x4_1_add(input, dest, stride);
michael@0 1330 }
michael@0 1331
michael@0 1332
michael@0 1333 void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
michael@0 1334 if (eob > 1)
michael@0 1335 vp9_iwht4x4_16_add(input, dest, stride);
michael@0 1336 else
michael@0 1337 vp9_iwht4x4_1_add(input, dest, stride);
michael@0 1338 }
michael@0 1339
michael@0 1340 void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
michael@0 1341 // If dc is 1, then input[0] is the reconstructed value and does not need
michael@0 1342 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.
michael@0 1343
michael@0 1344 // The calculation can be simplified if there are not many non-zero dct
michael@0 1345 // coefficients. Use eobs to decide what to do.
michael@0 1346 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
michael@0 1347 // Combine that with code here.
michael@0 1348 if (eob) {
michael@0 1349 if (eob == 1)
michael@0 1350 // DC only DCT coefficient
michael@0 1351 vp9_idct8x8_1_add(input, dest, stride);
michael@0 1352 else if (eob <= 10)
michael@0 1353 vp9_idct8x8_10_add(input, dest, stride);
michael@0 1354 else
michael@0 1355 vp9_idct8x8_64_add(input, dest, stride);
michael@0 1356 }
michael@0 1357 }
michael@0 1358
michael@0 1359 void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
michael@0 1360 int eob) {
michael@0 1361 /* The calculation can be simplified if there are not many non-zero dct
michael@0 1362 * coefficients. Use eobs to separate different cases. */
michael@0 1363 if (eob) {
michael@0 1364 if (eob == 1)
michael@0 1365 /* DC only DCT coefficient. */
michael@0 1366 vp9_idct16x16_1_add(input, dest, stride);
michael@0 1367 else if (eob <= 10)
michael@0 1368 vp9_idct16x16_10_add(input, dest, stride);
michael@0 1369 else
michael@0 1370 vp9_idct16x16_256_add(input, dest, stride);
michael@0 1371 }
michael@0 1372 }
michael@0 1373
michael@0 1374 void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
michael@0 1375 int eob) {
michael@0 1376 if (eob) {
michael@0 1377 if (eob == 1)
michael@0 1378 vp9_idct32x32_1_add(input, dest, stride);
michael@0 1379 else if (eob <= 34)
michael@0 1380 // non-zero coefficients only in the upper-left 8x8 block
michael@0 1381 vp9_idct32x32_34_add(input, dest, stride);
michael@0 1382 else
michael@0 1383 vp9_idct32x32_1024_add(input, dest, stride);
michael@0 1384 }
michael@0 1385 }
michael@0 1386
michael@0 1387 // iht
michael@0 1388 void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
michael@0 1389 int stride, int eob) {
michael@0 1390 if (tx_type == DCT_DCT)
michael@0 1391 vp9_idct4x4_add(input, dest, stride, eob);
michael@0 1392 else
michael@0 1393 vp9_iht4x4_16_add(input, dest, stride, tx_type);
michael@0 1394 }
michael@0 1395
michael@0 1396 void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
michael@0 1397 int stride, int eob) {
michael@0 1398 if (tx_type == DCT_DCT) {
michael@0 1399 vp9_idct8x8_add(input, dest, stride, eob);
michael@0 1400 } else {
michael@0 1401 if (eob > 0) {
michael@0 1402 vp9_iht8x8_64_add(input, dest, stride, tx_type);
michael@0 1403 }
michael@0 1404 }
michael@0 1405 }
michael@0 1406
michael@0 1407 void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
michael@0 1408 int stride, int eob) {
michael@0 1409 if (tx_type == DCT_DCT) {
michael@0 1410 vp9_idct16x16_add(input, dest, stride, eob);
michael@0 1411 } else {
michael@0 1412 if (eob > 0) {
michael@0 1413 vp9_iht16x16_256_add(input, dest, stride, tx_type);
michael@0 1414 }
michael@0 1415 }
michael@0 1416 }
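
/* A minimal caller sketch (not part of this file; dqcoeff and dest_stride are
 * hypothetical names): after dequantizing a block's coefficients and reading
 * its eob and tx_type, a decoder adds the inverse transform on top of the
 * prediction already sitting in dest, e.g. for a 4x4 block:
 *
 *   vp9_iht4x4_add(tx_type, dqcoeff, dest, dest_stride, eob);
 *
 * DCT_DCT blocks fall through to vp9_idct4x4_add(), which picks the DC-only
 * or full path based on eob as above.
 */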
