media/libvpx/vp9/common/vp9_idct.c

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Wed, 31 Dec 2014 06:09:35 +0100
changeset    0:6474c204b198
permissions  -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <math.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
   0.5 shifts per pixel. */
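/* (Each 1-D pass below uses 7 adds and 1 shift per 4-sample vector; two
   passes over the 16 samples of the block give 14 adds and 2 shifts per
   4 pixels, which is where the 3.5 adds and 0.5 shifts per pixel quoted
   above come from.) */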
  int i;
  int16_t output[16];
  int a1, b1, c1, d1, e1;
  const int16_t *ip = input;
  int16_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = a1;
    op[1] = b1;
    op[2] = c1;
    op[3] = d1;
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);

    ip++;
    dest++;
  }
}

void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
  int i;
  int a1, e1;
  int16_t tmp[4];
  const int16_t *ip = in;
  int16_t *op = tmp;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = a1;
  op[1] = op[2] = op[3] = e1;

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
    ip++;
    dest++;
  }
}

static void idct4_1d(const int16_t *input, int16_t *output) {
  int16_t step[4];
  int temp1, temp2;
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = dct_const_round_shift(temp1);
  step[1] = dct_const_round_shift(temp2);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = dct_const_round_shift(temp1);
  step[3] = dct_const_round_shift(temp2);

  // stage 2
  output[0] = step[0] + step[3];
  output[1] = step[1] + step[2];
  output[2] = step[1] - step[2];
  output[3] = step[0] - step[3];
}

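/* The trigonometric constants (cospi_*_64, sinpi_*_9) and the rounding
 * helpers used above come from vp9/common/vp9_idct.h and
 * vp9/common/vp9_common.h.  As a rough sketch (paraphrased from those
 * headers, not a definition that belongs in this file): cospi_N_64 is
 * approximately round(16384 * cos(N * pi / 64)), and dct_const_round_shift()
 * rounds that 14-bit fixed-point scaling back down, along the lines of
 *
 *   // paraphrase of the header helpers, assuming DCT_CONST_BITS == 14
 *   #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
 *   static INLINE int16_t dct_const_round_shift(int input) {
 *     return (int16_t)ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
 *   }
 *
 * so every dct_const_round_shift(x * cospi_N_64) in this file is a rounded
 * fixed-point multiply by cos(N * pi / 64).
 */
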
void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[4 * 4];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    idct4_1d(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    idct4_1d(temp_in, temp_out);
    for (j = 0; j < 4; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
                                  + dest[j * stride + i]);
  }
}

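/* Note the final rounding shift after the column pass: it is 4 here for the
 * 4x4 transform, 5 in the 8x8 functions below, and 6 for 16x16 and 32x32,
 * matching the size-dependent scaling applied by the corresponding forward
 * transforms.
 */
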
void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
  int i;
  int a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel(dest[0] + a1);
    dest[1] = clip_pixel(dest[1] + a1);
    dest[2] = clip_pixel(dest[2] + a1);
    dest[3] = clip_pixel(dest[3] + a1);
    dest += dest_stride;
  }
}

static void idct8_1d(const int16_t *input, int16_t *output) {
  int16_t step1[8], step2[8];
  int temp1, temp2;
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  // stage 2 & stage 3 - even half
  idct4_1d(step1, step1);

  // stage 2 - odd half
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  // stage 4
  output[0] = step1[0] + step1[7];
  output[1] = step1[1] + step1[6];
  output[2] = step1[2] + step1[5];
  output[3] = step1[3] + step1[4];
  output[4] = step1[3] - step1[4];
  output[5] = step1[2] - step1[5];
  output[6] = step1[1] - step1[6];
  output[7] = step1[0] - step1[7];
}

void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[8 * 8];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    idct8_1d(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_1d(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                  + dest[j * stride + i]);
  }
}

void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int i, j;
  int a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = clip_pixel(dest[i] + a1);
    dest += stride;
  }
}

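/* In this DC-only path (and its 4x4/16x16/32x32 counterparts), the two
 * dct_const_round_shift(x * cospi_16_64) steps each scale the DC coefficient
 * by roughly cos(pi/4) = 1/sqrt(2), so together they halve it; the final
 * ROUND_POWER_OF_TWO then applies the size-dependent shift, and every pixel
 * of the block receives the same offset a1.
 */
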
static void iadst4_1d(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[0];
  int x1 = input[1];
  int x2 = input[2];
  int x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = dct_const_round_shift(s0);
  output[1] = dct_const_round_shift(s1);
  output[2] = dct_const_round_shift(s2);
  output[3] = dct_const_round_shift(s3);
}

void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
                         int tx_type) {
  const transform_2d IHT_4[] = {
    { idct4_1d,  idct4_1d  },  // DCT_DCT  = 0
    { iadst4_1d, idct4_1d  },  // ADST_DCT = 1
    { idct4_1d,  iadst4_1d },  // DCT_ADST = 2
    { iadst4_1d, iadst4_1d }   // ADST_ADST = 3
  };

  int i, j;
  int16_t out[4 * 4];
  int16_t *outptr = out;
  int16_t temp_in[4], temp_out[4];

  // inverse transform row vectors
  for (i = 0; i < 4; ++i) {
    IHT_4[tx_type].rows(input, outptr);
    input  += 4;
    outptr += 4;
  }

  // inverse transform column vectors
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    IHT_4[tx_type].cols(temp_in, temp_out);
    for (j = 0; j < 4; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
                                  + dest[j * stride + i]);
  }
}

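/* transform_2d and the IHT tables rely on a declaration in
 * vp9/common/vp9_idct.h.  A sketch of what it presumably looks like
 * (paraphrased, not a definition to copy into this file):
 *
 *   typedef void (*transform_1d)(const int16_t *, int16_t *);
 *   typedef struct { transform_1d cols, rows; } transform_2d;
 *
 * Each table entry therefore just selects which 1-D kernel runs over the
 * rows and which over the columns of the block.
 */
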
static void iadst8_1d(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[7];
  int x1 = input[0];
  int x2 = input[5];
  int x3 = input[2];
  int x4 = input[3];
  int x5 = input[4];
  int x6 = input[1];
  int x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = dct_const_round_shift(s0 + s4);
  x1 = dct_const_round_shift(s1 + s5);
  x2 = dct_const_round_shift(s2 + s6);
  x3 = dct_const_round_shift(s3 + s7);
  x4 = dct_const_round_shift(s0 - s4);
  x5 = dct_const_round_shift(s1 - s5);
  x6 = dct_const_round_shift(s2 - s6);
  x7 = dct_const_round_shift(s3 - s7);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);

  output[0] =  x0;
  output[1] = -x4;
  output[2] =  x6;
  output[3] = -x2;
  output[4] =  x3;
  output[5] = -x7;
  output[6] =  x5;
  output[7] = -x1;
}

static const transform_2d IHT_8[] = {
  { idct8_1d,  idct8_1d  },  // DCT_DCT  = 0
  { iadst8_1d, idct8_1d  },  // ADST_DCT = 1
  { idct8_1d,  iadst8_1d },  // DCT_ADST = 2
  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
};

void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
                         int tx_type) {
  int i, j;
  int16_t out[8 * 8];
  int16_t *outptr = out;
  int16_t temp_in[8], temp_out[8];
  const transform_2d ht = IHT_8[tx_type];

  // inverse transform row vectors
  for (i = 0; i < 8; ++i) {
    ht.rows(input, outptr);
    input += 8;
    outptr += 8;
  }

  // inverse transform column vectors
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    ht.cols(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                  + dest[j * stride + i]);
  }
}

void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[8 * 8] = { 0 };
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[8], temp_out[8];

  // First transform rows
  // only the first 4 rows have non-zero coefs
  for (i = 0; i < 4; ++i) {
    idct8_1d(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_1d(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                  + dest[j * stride + i]);
  }
}

static void idct16_1d(const int16_t *input, int16_t *output) {
  int16_t step1[16], step2[16];
  int temp1, temp2;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = step2[0] + step2[15];
  output[1] = step2[1] + step2[14];
  output[2] = step2[2] + step2[13];
  output[3] = step2[3] + step2[12];
  output[4] = step2[4] + step2[11];
  output[5] = step2[5] + step2[10];
  output[6] = step2[6] + step2[9];
  output[7] = step2[7] + step2[8];
  output[8] = step2[7] - step2[8];
  output[9] = step2[6] - step2[9];
  output[10] = step2[5] - step2[10];
  output[11] = step2[4] - step2[11];
  output[12] = step2[3] - step2[12];
  output[13] = step2[2] - step2[13];
  output[14] = step2[1] - step2[14];
  output[15] = step2[0] - step2[15];
}

void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[16 * 16];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_1d(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_1d(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                  + dest[j * stride + i]);
  }
}

static void iadst16_1d(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

  int x0 = input[15];
  int x1 = input[0];
  int x2 = input[13];
  int x3 = input[2];
  int x4 = input[11];
  int x5 = input[4];
  int x6 = input[9];
  int x7 = input[6];
  int x8 = input[7];
  int x9 = input[8];
  int x10 = input[5];
  int x11 = input[10];
  int x12 = input[3];
  int x13 = input[12];
  int x14 = input[1];
  int x15 = input[14];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = dct_const_round_shift(s0 + s8);
  x1 = dct_const_round_shift(s1 + s9);
  x2 = dct_const_round_shift(s2 + s10);
  x3 = dct_const_round_shift(s3 + s11);
  x4 = dct_const_round_shift(s4 + s12);
  x5 = dct_const_round_shift(s5 + s13);
  x6 = dct_const_round_shift(s6 + s14);
  x7 = dct_const_round_shift(s7 + s15);
  x8  = dct_const_round_shift(s0 - s8);
  x9  = dct_const_round_shift(s1 - s9);
  x10 = dct_const_round_shift(s2 - s10);
  x11 = dct_const_round_shift(s3 - s11);
  x12 = dct_const_round_shift(s4 - s12);
  x13 = dct_const_round_shift(s5 - s13);
  x14 = dct_const_round_shift(s6 - s14);
  x15 = dct_const_round_shift(s7 - s15);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = dct_const_round_shift(s8 + s12);
  x9 = dct_const_round_shift(s9 + s13);
  x10 = dct_const_round_shift(s10 + s14);
  x11 = dct_const_round_shift(s11 + s15);
  x12 = dct_const_round_shift(s8 - s12);
  x13 = dct_const_round_shift(s9 - s13);
  x14 = dct_const_round_shift(s10 - s14);
  x15 = dct_const_round_shift(s11 - s15);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = dct_const_round_shift(s12 + s14);
  x13 = dct_const_round_shift(s13 + s15);
  x14 = dct_const_round_shift(s12 - s14);
  x15 = dct_const_round_shift(s13 - s15);

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);
  x10 = dct_const_round_shift(s10);
  x11 = dct_const_round_shift(s11);
  x14 = dct_const_round_shift(s14);
  x15 = dct_const_round_shift(s15);

  output[0] =  x0;
  output[1] = -x8;
  output[2] =  x12;
  output[3] = -x4;
  output[4] =  x6;
  output[5] =  x14;
  output[6] =  x10;
  output[7] =  x2;
  output[8] =  x3;
  output[9] =  x11;
  output[10] =  x15;
  output[11] =  x7;
  output[12] =  x5;
  output[13] = -x13;
  output[14] =  x9;
  output[15] = -x1;
}

static const transform_2d IHT_16[] = {
  { idct16_1d,  idct16_1d  },  // DCT_DCT  = 0
  { iadst16_1d, idct16_1d  },  // ADST_DCT = 1
  { idct16_1d,  iadst16_1d },  // DCT_ADST = 2
  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
};

void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  int i, j;
  int16_t out[16 * 16];
  int16_t *outptr = out;
  int16_t temp_in[16], temp_out[16];
  const transform_2d ht = IHT_16[tx_type];

  // Rows
  for (i = 0; i < 16; ++i) {
    ht.rows(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    ht.cols(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                        + dest[j * stride + i]);
  }
}

void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[16 * 16] = { 0 };
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  for (i = 0; i < 4; ++i) {
    idct16_1d(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_1d(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                  + dest[j * stride + i]);
  }
}

void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int i, j;
  int a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = clip_pixel(dest[i] + a1);
    dest += stride;
  }
}

static void idct32_1d(const int16_t *input, int16_t *output) {
  int16_t step1[32], step2[32];
  int temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = dct_const_round_shift(temp1);
  step1[31] = dct_const_round_shift(temp2);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = dct_const_round_shift(temp1);
  step1[30] = dct_const_round_shift(temp2);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = dct_const_round_shift(temp1);
  step1[28] = dct_const_round_shift(temp2);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = dct_const_round_shift(temp1);
  step1[24] = dct_const_round_shift(temp2);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  step2[16] = step1[16] + step1[17];
  step2[17] = step1[16] - step1[17];
  step2[18] = -step1[18] + step1[19];
  step2[19] = step1[18] + step1[19];
  step2[20] = step1[20] + step1[21];
  step2[21] = step1[20] - step1[21];
  step2[22] = -step1[22] + step1[23];
  step2[23] = step1[22] + step1[23];
  step2[24] = step1[24] + step1[25];
  step2[25] = step1[24] - step1[25];
  step2[26] = -step1[26] + step1[27];
  step2[27] = step1[26] + step1[27];
  step2[28] = step1[28] + step1[29];
  step2[29] = step1[28] - step1[29];
  step2[30] = -step1[30] + step1[31];
  step2[31] = step1[30] + step1[31];

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = dct_const_round_shift(temp1);
  step1[30] = dct_const_round_shift(temp2);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = step1[16] + step1[19];
  step2[17] = step1[17] + step1[18];
  step2[18] = step1[17] - step1[18];
  step2[19] = step1[16] - step1[19];
  step2[20] = -step1[20] + step1[23];
  step2[21] = -step1[21] + step1[22];
  step2[22] = step1[21] + step1[22];
  step2[23] = step1[20] + step1[23];

  step2[24] = step1[24] + step1[27];
  step2[25] = step1[25] + step1[26];
  step2[26] = step1[25] - step1[26];
  step2[27] = step1[24] - step1[27];
  step2[28] = -step1[28] + step1[31];
  step2[29] = -step1[29] + step1[30];
  step2[30] = step1[29] + step1[30];
  step2[31] = step1[28] + step1[31];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = dct_const_round_shift(temp1);
  step1[28] = dct_const_round_shift(temp2);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = step1[16] + step1[23];
  step2[17] = step1[17] + step1[22];
  step2[18] = step1[18] + step1[21];
  step2[19] = step1[19] + step1[20];
  step2[20] = step1[19] - step1[20];
  step2[21] = step1[18] - step1[21];
  step2[22] = step1[17] - step1[22];
  step2[23] = step1[16] - step1[23];

  step2[24] = -step1[24] + step1[31];
  step2[25] = -step1[25] + step1[30];
  step2[26] = -step1[26] + step1[29];
  step2[27] = -step1[27] + step1[28];
  step2[28] = step1[27] + step1[28];
  step2[29] = step1[26] + step1[29];
  step2[30] = step1[25] + step1[30];
  step2[31] = step1[24] + step1[31];

  // stage 7
  step1[0] = step2[0] + step2[15];
  step1[1] = step2[1] + step2[14];
  step1[2] = step2[2] + step2[13];
  step1[3] = step2[3] + step2[12];
  step1[4] = step2[4] + step2[11];
  step1[5] = step2[5] + step2[10];
  step1[6] = step2[6] + step2[9];
  step1[7] = step2[7] + step2[8];
  step1[8] = step2[7] - step2[8];
  step1[9] = step2[6] - step2[9];
  step1[10] = step2[5] - step2[10];
  step1[11] = step2[4] - step2[11];
  step1[12] = step2[3] - step2[12];
  step1[13] = step2[2] - step2[13];
  step1[14] = step2[1] - step2[14];
  step1[15] = step2[0] - step2[15];

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = dct_const_round_shift(temp1);
  step1[24] = dct_const_round_shift(temp2);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = step1[0] + step1[31];
  output[1] = step1[1] + step1[30];
  output[2] = step1[2] + step1[29];
  output[3] = step1[3] + step1[28];
  output[4] = step1[4] + step1[27];
  output[5] = step1[5] + step1[26];
  output[6] = step1[6] + step1[25];
  output[7] = step1[7] + step1[24];
  output[8] = step1[8] + step1[23];
  output[9] = step1[9] + step1[22];
  output[10] = step1[10] + step1[21];
  output[11] = step1[11] + step1[20];
  output[12] = step1[12] + step1[19];
  output[13] = step1[13] + step1[18];
  output[14] = step1[14] + step1[17];
  output[15] = step1[15] + step1[16];
  output[16] = step1[15] - step1[16];
  output[17] = step1[14] - step1[17];
  output[18] = step1[13] - step1[18];
  output[19] = step1[12] - step1[19];
  output[20] = step1[11] - step1[20];
  output[21] = step1[10] - step1[21];
  output[22] = step1[9] - step1[22];
  output[23] = step1[8] - step1[23];
  output[24] = step1[7] - step1[24];
  output[25] = step1[6] - step1[25];
  output[26] = step1[5] - step1[26];
  output[27] = step1[4] - step1[27];
  output[28] = step1[3] - step1[28];
  output[29] = step1[2] - step1[29];
  output[30] = step1[1] - step1[30];
  output[31] = step1[0] - step1[31];
}

void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[32 * 32];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    int16_t zero_coeff[16];
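    // The pairwise OR-reductions below fold all 32 coefficients of the row
    // into zero_coeff[0] | zero_coeff[1]; if that is zero the whole row is
    // zero and its 1-D transform can be skipped, since its output would be
    // all zeros as well.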
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      idct32_1d(input, outptr);
    else
      vpx_memset(outptr, 0, sizeof(int16_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_1d(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                        + dest[j * stride + i]);
  }
}

void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[32 * 32] = {0};
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[32], temp_out[32];

  // Rows
  // only the upper-left 8x8 area has non-zero coefficients
  for (i = 0; i < 8; ++i) {
    idct32_1d(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_1d(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                  + dest[j * stride + i]);
  }
}

void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int i, j;
  int a1;

  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = clip_pixel(dest[i] + a1);
    dest += stride;
  }
}

// idct
void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob > 1)
    vp9_idct4x4_16_add(input, dest, stride);
  else
    vp9_idct4x4_1_add(input, dest, stride);
}

void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob > 1)
    vp9_iwht4x4_16_add(input, dest, stride);
  else
    vp9_iwht4x4_1_add(input, dest, stride);
}

void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  // If dc is 1, then input[0] is the reconstructed value and does not need
  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.

  // The calculation can be simplified if there are not many non-zero dct
  // coefficients. Use eobs to decide what to do.
  // TODO(yunqingwang): The "eobs = 1" case is also handled in
  // vp9_short_idct8x8_c. Combine that with the code here.
  if (eob) {
    if (eob == 1)
      // DC only DCT coefficient
      vp9_idct8x8_1_add(input, dest, stride);
    else if (eob <= 10)
      vp9_idct8x8_10_add(input, dest, stride);
    else
      vp9_idct8x8_64_add(input, dest, stride);
  }
}

void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  /* The calculation can be simplified if there are not many non-zero dct
   * coefficients. Use eobs to separate different cases. */
  if (eob) {
    if (eob == 1)
      /* DC only DCT coefficient. */
      vp9_idct16x16_1_add(input, dest, stride);
    else if (eob <= 10)
      vp9_idct16x16_10_add(input, dest, stride);
    else
      vp9_idct16x16_256_add(input, dest, stride);
  }
}

void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  if (eob) {
    if (eob == 1)
      vp9_idct32x32_1_add(input, dest, stride);
    else if (eob <= 34)
      // non-zero coeff only in upper-left 8x8
      vp9_idct32x32_34_add(input, dest, stride);
    else
      vp9_idct32x32_1024_add(input, dest, stride);
  }
}

// iht
void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                    int stride, int eob) {
  if (tx_type == DCT_DCT)
    vp9_idct4x4_add(input, dest, stride, eob);
  else
    vp9_iht4x4_16_add(input, dest, stride, tx_type);
}

void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                    int stride, int eob) {
  if (tx_type == DCT_DCT) {
    vp9_idct8x8_add(input, dest, stride, eob);
  } else {
    if (eob > 0) {
      vp9_iht8x8_64_add(input, dest, stride, tx_type);
    }
  }
}

void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                      int stride, int eob) {
  if (tx_type == DCT_DCT) {
    vp9_idct16x16_add(input, dest, stride, eob);
  } else {
    if (eob > 0) {
      vp9_iht16x16_256_add(input, dest, stride, tx_type);
    }
  }
}
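
/* Typical call pattern (a sketch, not code from this file): after a block's
 * coefficients have been dequantized into an int16_t buffer, the decoder
 * calls the matching wrapper and passes the end-of-block count so the cheap
 * special cases above can be taken, e.g.
 *
 *   vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, eob);
 *
 * where dqcoeff, dst, dst_stride and eob stand for whatever the caller tracks
 * for the current block; the wrappers fall back to the full idct/iht when eob
 * is large.
 */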
