media/libvpx/vp9/common/vp9_idct.c

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Wed, 31 Dec 2014 06:09:35 +0100
changeset    0:6474c204b198
permissions  -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <math.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
   0.5 shifts per pixel. */
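/* (Each 1-D pass below uses 7 adds and 1 shift per 4-sample vector; two
   passes over the 16 samples of the block give 14 adds and 2 shifts per
   4 pixels, which is where the 3.5 adds and 0.5 shifts per pixel quoted
   above come from.) */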
  int i;
  int16_t output[16];
  int a1, b1, c1, d1, e1;
  const int16_t *ip = input;
  int16_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = a1;
    op[1] = b1;
    op[2] = c1;
    op[3] = d1;
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);

    ip++;
    dest++;
  }
}

void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
  int i;
  int a1, e1;
  int16_t tmp[4];
  const int16_t *ip = in;
  int16_t *op = tmp;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = a1;
  op[1] = op[2] = op[3] = e1;

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
    ip++;
    dest++;
  }
}

static void idct4_1d(const int16_t *input, int16_t *output) {
  int16_t step[4];
  int temp1, temp2;
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = dct_const_round_shift(temp1);
  step[1] = dct_const_round_shift(temp2);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = dct_const_round_shift(temp1);
  step[3] = dct_const_round_shift(temp2);

  // stage 2
  output[0] = step[0] + step[3];
  output[1] = step[1] + step[2];
  output[2] = step[1] - step[2];
  output[3] = step[0] - step[3];
}

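/* The trigonometric constants (cospi_*_64, sinpi_*_9) and the rounding
 * helpers used above come from vp9/common/vp9_idct.h and
 * vp9/common/vp9_common.h.  As a rough sketch (paraphrased from those
 * headers, not a definition that belongs in this file): cospi_N_64 is
 * approximately round(16384 * cos(N * pi / 64)), and dct_const_round_shift()
 * rounds that 14-bit fixed-point scaling back down, along the lines of
 *
 *   // paraphrase of the header helpers, assuming DCT_CONST_BITS == 14
 *   #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
 *   static INLINE int16_t dct_const_round_shift(int input) {
 *     return (int16_t)ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
 *   }
 *
 * so every dct_const_round_shift(x * cospi_N_64) in this file is a rounded
 * fixed-point multiply by cos(N * pi / 64).
 */
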
void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[4 * 4];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    idct4_1d(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    idct4_1d(temp_in, temp_out);
    for (j = 0; j < 4; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
                                  + dest[j * stride + i]);
  }
}

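/* Note the final rounding shift after the column pass: it is 4 here for the
 * 4x4 transform, 5 in the 8x8 functions below, and 6 for 16x16 and 32x32,
 * matching the size-dependent scaling applied by the corresponding forward
 * transforms.
 */
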
void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
  int i;
  int a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel(dest[0] + a1);
    dest[1] = clip_pixel(dest[1] + a1);
    dest[2] = clip_pixel(dest[2] + a1);
    dest[3] = clip_pixel(dest[3] + a1);
    dest += dest_stride;
  }
}

static void idct8_1d(const int16_t *input, int16_t *output) {
  int16_t step1[8], step2[8];
  int temp1, temp2;
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  // stage 2 & stage 3 - even half
  idct4_1d(step1, step1);

  // stage 2 - odd half
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  // stage 4
  output[0] = step1[0] + step1[7];
  output[1] = step1[1] + step1[6];
  output[2] = step1[2] + step1[5];
  output[3] = step1[3] + step1[4];
  output[4] = step1[3] - step1[4];
  output[5] = step1[2] - step1[5];
  output[6] = step1[1] - step1[6];
  output[7] = step1[0] - step1[7];
}

void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[8 * 8];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    idct8_1d(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_1d(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                  + dest[j * stride + i]);
  }
}

void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int i, j;
  int a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = clip_pixel(dest[i] + a1);
    dest += stride;
  }
}

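/* In this DC-only path (and its 4x4/16x16/32x32 counterparts), the two
 * dct_const_round_shift(x * cospi_16_64) steps each scale the DC coefficient
 * by roughly cos(pi/4) = 1/sqrt(2), so together they halve it; the final
 * ROUND_POWER_OF_TWO then applies the size-dependent shift, and every pixel
 * of the block receives the same offset a1.
 */
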
static void iadst4_1d(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[0];
  int x1 = input[1];
  int x2 = input[2];
  int x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = dct_const_round_shift(s0);
  output[1] = dct_const_round_shift(s1);
  output[2] = dct_const_round_shift(s2);
  output[3] = dct_const_round_shift(s3);
}

void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
                         int tx_type) {
  const transform_2d IHT_4[] = {
    { idct4_1d,  idct4_1d  },  // DCT_DCT  = 0
    { iadst4_1d, idct4_1d  },  // ADST_DCT = 1
    { idct4_1d,  iadst4_1d },  // DCT_ADST = 2
    { iadst4_1d, iadst4_1d }   // ADST_ADST = 3
  };

  int i, j;
  int16_t out[4 * 4];
  int16_t *outptr = out;
  int16_t temp_in[4], temp_out[4];

  // inverse transform row vectors
  for (i = 0; i < 4; ++i) {
    IHT_4[tx_type].rows(input, outptr);
    input  += 4;
    outptr += 4;
  }

  // inverse transform column vectors
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    IHT_4[tx_type].cols(temp_in, temp_out);
    for (j = 0; j < 4; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
                                  + dest[j * stride + i]);
  }
}

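/* transform_2d and the IHT tables rely on a declaration in
 * vp9/common/vp9_idct.h.  A sketch of what it presumably looks like
 * (paraphrased, not a definition to copy into this file):
 *
 *   typedef void (*transform_1d)(const int16_t *, int16_t *);
 *   typedef struct { transform_1d cols, rows; } transform_2d;
 *
 * Each table entry therefore just selects which 1-D kernel runs over the
 * rows and which over the columns of the block.
 */
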
static void iadst8_1d(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[7];
  int x1 = input[0];
  int x2 = input[5];
  int x3 = input[2];
  int x4 = input[3];
  int x5 = input[4];
  int x6 = input[1];
  int x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = dct_const_round_shift(s0 + s4);
  x1 = dct_const_round_shift(s1 + s5);
  x2 = dct_const_round_shift(s2 + s6);
  x3 = dct_const_round_shift(s3 + s7);
  x4 = dct_const_round_shift(s0 - s4);
  x5 = dct_const_round_shift(s1 - s5);
  x6 = dct_const_round_shift(s2 - s6);
  x7 = dct_const_round_shift(s3 - s7);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);

  output[0] =  x0;
  output[1] = -x4;
  output[2] =  x6;
  output[3] = -x2;
  output[4] =  x3;
  output[5] = -x7;
  output[6] =  x5;
  output[7] = -x1;
}

static const transform_2d IHT_8[] = {
  { idct8_1d,  idct8_1d  },  // DCT_DCT  = 0
  { iadst8_1d, idct8_1d  },  // ADST_DCT = 1
  { idct8_1d,  iadst8_1d },  // DCT_ADST = 2
  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
};

void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
                         int tx_type) {
  int i, j;
  int16_t out[8 * 8];
  int16_t *outptr = out;
  int16_t temp_in[8], temp_out[8];
  const transform_2d ht = IHT_8[tx_type];

  // inverse transform row vectors
  for (i = 0; i < 8; ++i) {
    ht.rows(input, outptr);
    input += 8;
    outptr += 8;
  }

  // inverse transform column vectors
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    ht.cols(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                  + dest[j * stride + i]);
  }
}

void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[8 * 8] = { 0 };
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[8], temp_out[8];

  // First transform rows
  // only the first 4 rows have non-zero coefs
  for (i = 0; i < 4; ++i) {
    idct8_1d(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_1d(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                                  + dest[j * stride + i]);
  }
}

static void idct16_1d(const int16_t *input, int16_t *output) {
  int16_t step1[16], step2[16];
  int temp1, temp2;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = step2[0] + step2[15];
  output[1] = step2[1] + step2[14];
  output[2] = step2[2] + step2[13];
  output[3] = step2[3] + step2[12];
  output[4] = step2[4] + step2[11];
  output[5] = step2[5] + step2[10];
  output[6] = step2[6] + step2[9];
  output[7] = step2[7] + step2[8];
  output[8] = step2[7] - step2[8];
  output[9] = step2[6] - step2[9];
  output[10] = step2[5] - step2[10];
  output[11] = step2[4] - step2[11];
  output[12] = step2[3] - step2[12];
  output[13] = step2[2] - step2[13];
  output[14] = step2[1] - step2[14];
  output[15] = step2[0] - step2[15];
}

void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[16 * 16];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_1d(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_1d(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                  + dest[j * stride + i]);
  }
}

static void iadst16_1d(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

  int x0 = input[15];
  int x1 = input[0];
  int x2 = input[13];
  int x3 = input[2];
  int x4 = input[11];
  int x5 = input[4];
  int x6 = input[9];
  int x7 = input[6];
  int x8 = input[7];
  int x9 = input[8];
  int x10 = input[5];
  int x11 = input[10];
  int x12 = input[3];
  int x13 = input[12];
  int x14 = input[1];
  int x15 = input[14];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = dct_const_round_shift(s0 + s8);
  x1 = dct_const_round_shift(s1 + s9);
  x2 = dct_const_round_shift(s2 + s10);
  x3 = dct_const_round_shift(s3 + s11);
  x4 = dct_const_round_shift(s4 + s12);
  x5 = dct_const_round_shift(s5 + s13);
  x6 = dct_const_round_shift(s6 + s14);
  x7 = dct_const_round_shift(s7 + s15);
  x8  = dct_const_round_shift(s0 - s8);
  x9  = dct_const_round_shift(s1 - s9);
  x10 = dct_const_round_shift(s2 - s10);
  x11 = dct_const_round_shift(s3 - s11);
  x12 = dct_const_round_shift(s4 - s12);
  x13 = dct_const_round_shift(s5 - s13);
  x14 = dct_const_round_shift(s6 - s14);
  x15 = dct_const_round_shift(s7 - s15);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = dct_const_round_shift(s8 + s12);
  x9 = dct_const_round_shift(s9 + s13);
  x10 = dct_const_round_shift(s10 + s14);
  x11 = dct_const_round_shift(s11 + s15);
  x12 = dct_const_round_shift(s8 - s12);
  x13 = dct_const_round_shift(s9 - s13);
  x14 = dct_const_round_shift(s10 - s14);
  x15 = dct_const_round_shift(s11 - s15);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = dct_const_round_shift(s12 + s14);
  x13 = dct_const_round_shift(s13 + s15);
  x14 = dct_const_round_shift(s12 - s14);
  x15 = dct_const_round_shift(s13 - s15);

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);
  x10 = dct_const_round_shift(s10);
  x11 = dct_const_round_shift(s11);
  x14 = dct_const_round_shift(s14);
  x15 = dct_const_round_shift(s15);

  output[0] =  x0;
  output[1] = -x8;
  output[2] =  x12;
  output[3] = -x4;
  output[4] =  x6;
  output[5] =  x14;
  output[6] =  x10;
  output[7] =  x2;
  output[8] =  x3;
  output[9] =  x11;
  output[10] =  x15;
  output[11] =  x7;
  output[12] =  x5;
  output[13] = -x13;
  output[14] =  x9;
  output[15] = -x1;
}

static const transform_2d IHT_16[] = {
  { idct16_1d,  idct16_1d  },  // DCT_DCT  = 0
  { iadst16_1d, idct16_1d  },  // ADST_DCT = 1
  { idct16_1d,  iadst16_1d },  // DCT_ADST = 2
  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
};

void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  int i, j;
  int16_t out[16 * 16];
  int16_t *outptr = out;
  int16_t temp_in[16], temp_out[16];
  const transform_2d ht = IHT_16[tx_type];

  // Rows
  for (i = 0; i < 16; ++i) {
    ht.rows(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    ht.cols(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                        + dest[j * stride + i]);
  }
}

void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[16 * 16] = { 0 };
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  for (i = 0; i < 4; ++i) {
    idct16_1d(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_1d(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                  + dest[j * stride + i]);
  }
}

void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int i, j;
  int a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = clip_pixel(dest[i] + a1);
    dest += stride;
  }
}

static void idct32_1d(const int16_t *input, int16_t *output) {
  int16_t step1[32], step2[32];
  int temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = dct_const_round_shift(temp1);
  step1[31] = dct_const_round_shift(temp2);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = dct_const_round_shift(temp1);
  step1[30] = dct_const_round_shift(temp2);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = dct_const_round_shift(temp1);
  step1[28] = dct_const_round_shift(temp2);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = dct_const_round_shift(temp1);
  step1[24] = dct_const_round_shift(temp2);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  step2[16] = step1[16] + step1[17];
  step2[17] = step1[16] - step1[17];
  step2[18] = -step1[18] + step1[19];
  step2[19] = step1[18] + step1[19];
  step2[20] = step1[20] + step1[21];
  step2[21] = step1[20] - step1[21];
  step2[22] = -step1[22] + step1[23];
  step2[23] = step1[22] + step1[23];
  step2[24] = step1[24] + step1[25];
  step2[25] = step1[24] - step1[25];
  step2[26] = -step1[26] + step1[27];
  step2[27] = step1[26] + step1[27];
  step2[28] = step1[28] + step1[29];
  step2[29] = step1[28] - step1[29];
  step2[30] = -step1[30] + step1[31];
  step2[31] = step1[30] + step1[31];

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = dct_const_round_shift(temp1);
  step1[30] = dct_const_round_shift(temp2);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = step1[16] + step1[19];
  step2[17] = step1[17] + step1[18];
  step2[18] = step1[17] - step1[18];
  step2[19] = step1[16] - step1[19];
  step2[20] = -step1[20] + step1[23];
  step2[21] = -step1[21] + step1[22];
  step2[22] = step1[21] + step1[22];
  step2[23] = step1[20] + step1[23];

  step2[24] = step1[24] + step1[27];
  step2[25] = step1[25] + step1[26];
  step2[26] = step1[25] - step1[26];
  step2[27] = step1[24] - step1[27];
  step2[28] = -step1[28] + step1[31];
  step2[29] = -step1[29] + step1[30];
  step2[30] = step1[29] + step1[30];
  step2[31] = step1[28] + step1[31];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = dct_const_round_shift(temp1);
  step1[29] = dct_const_round_shift(temp2);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = dct_const_round_shift(temp1);
  step1[28] = dct_const_round_shift(temp2);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = step1[16] + step1[23];
  step2[17] = step1[17] + step1[22];
  step2[18] = step1[18] + step1[21];
  step2[19] = step1[19] + step1[20];
  step2[20] = step1[19] - step1[20];
  step2[21] = step1[18] - step1[21];
  step2[22] = step1[17] - step1[22];
  step2[23] = step1[16] - step1[23];

  step2[24] = -step1[24] + step1[31];
  step2[25] = -step1[25] + step1[30];
  step2[26] = -step1[26] + step1[29];
  step2[27] = -step1[27] + step1[28];
  step2[28] = step1[27] + step1[28];
  step2[29] = step1[26] + step1[29];
  step2[30] = step1[25] + step1[30];
  step2[31] = step1[24] + step1[31];

  // stage 7
  step1[0] = step2[0] + step2[15];
  step1[1] = step2[1] + step2[14];
  step1[2] = step2[2] + step2[13];
  step1[3] = step2[3] + step2[12];
  step1[4] = step2[4] + step2[11];
  step1[5] = step2[5] + step2[10];
  step1[6] = step2[6] + step2[9];
  step1[7] = step2[7] + step2[8];
  step1[8] = step2[7] - step2[8];
  step1[9] = step2[6] - step2[9];
  step1[10] = step2[5] - step2[10];
  step1[11] = step2[4] - step2[11];
  step1[12] = step2[3] - step2[12];
  step1[13] = step2[2] - step2[13];
  step1[14] = step2[1] - step2[14];
  step1[15] = step2[0] - step2[15];

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = dct_const_round_shift(temp1);
  step1[27] = dct_const_round_shift(temp2);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = dct_const_round_shift(temp1);
  step1[26] = dct_const_round_shift(temp2);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = dct_const_round_shift(temp1);
  step1[25] = dct_const_round_shift(temp2);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = dct_const_round_shift(temp1);
  step1[24] = dct_const_round_shift(temp2);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = step1[0] + step1[31];
  output[1] = step1[1] + step1[30];
  output[2] = step1[2] + step1[29];
  output[3] = step1[3] + step1[28];
  output[4] = step1[4] + step1[27];
  output[5] = step1[5] + step1[26];
  output[6] = step1[6] + step1[25];
  output[7] = step1[7] + step1[24];
  output[8] = step1[8] + step1[23];
  output[9] = step1[9] + step1[22];
  output[10] = step1[10] + step1[21];
  output[11] = step1[11] + step1[20];
  output[12] = step1[12] + step1[19];
  output[13] = step1[13] + step1[18];
  output[14] = step1[14] + step1[17];
  output[15] = step1[15] + step1[16];
  output[16] = step1[15] - step1[16];
  output[17] = step1[14] - step1[17];
  output[18] = step1[13] - step1[18];
  output[19] = step1[12] - step1[19];
  output[20] = step1[11] - step1[20];
  output[21] = step1[10] - step1[21];
  output[22] = step1[9] - step1[22];
  output[23] = step1[8] - step1[23];
  output[24] = step1[7] - step1[24];
  output[25] = step1[6] - step1[25];
  output[26] = step1[5] - step1[26];
  output[27] = step1[4] - step1[27];
  output[28] = step1[3] - step1[28];
  output[29] = step1[2] - step1[29];
  output[30] = step1[1] - step1[30];
  output[31] = step1[0] - step1[31];
}

void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[32 * 32];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    int16_t zero_coeff[16];
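    // The pairwise OR-reductions below fold all 32 coefficients of the row
    // into zero_coeff[0] | zero_coeff[1]; if that is zero the whole row is
    // zero and its 1-D transform can be skipped, since its output would be
    // all zeros as well.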
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      idct32_1d(input, outptr);
    else
      vpx_memset(outptr, 0, sizeof(int16_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_1d(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                        + dest[j * stride + i]);
  }
}

void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[32 * 32] = {0};
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[32], temp_out[32];

  // Rows
  // only the upper-left 8x8 area has non-zero coefficients
  for (i = 0; i < 8; ++i) {
    idct32_1d(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_1d(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
                                  + dest[j * stride + i]);
  }
}

void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int i, j;
  int a1;

  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = clip_pixel(dest[i] + a1);
    dest += stride;
  }
}

// idct
void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob > 1)
    vp9_idct4x4_16_add(input, dest, stride);
  else
    vp9_idct4x4_1_add(input, dest, stride);
}

void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob > 1)
    vp9_iwht4x4_16_add(input, dest, stride);
  else
    vp9_iwht4x4_1_add(input, dest, stride);
}

void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  // If dc is 1, then input[0] is the reconstructed value and does not need
  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.

  // The calculation can be simplified if there are not many non-zero dct
  // coefficients. Use eobs to decide what to do.
  // TODO(yunqingwang): The "eobs = 1" case is also handled in
  // vp9_short_idct8x8_c. Combine that with the code here.
  if (eob) {
    if (eob == 1)
      // DC only DCT coefficient
      vp9_idct8x8_1_add(input, dest, stride);
    else if (eob <= 10)
      vp9_idct8x8_10_add(input, dest, stride);
    else
      vp9_idct8x8_64_add(input, dest, stride);
  }
}

void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  /* The calculation can be simplified if there are not many non-zero dct
   * coefficients. Use eobs to separate different cases. */
  if (eob) {
    if (eob == 1)
      /* DC only DCT coefficient. */
      vp9_idct16x16_1_add(input, dest, stride);
    else if (eob <= 10)
      vp9_idct16x16_10_add(input, dest, stride);
    else
      vp9_idct16x16_256_add(input, dest, stride);
  }
}

void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  if (eob) {
    if (eob == 1)
      vp9_idct32x32_1_add(input, dest, stride);
    else if (eob <= 34)
      // non-zero coeff only in upper-left 8x8
      vp9_idct32x32_34_add(input, dest, stride);
    else
      vp9_idct32x32_1024_add(input, dest, stride);
  }
}

// iht
void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                    int stride, int eob) {
  if (tx_type == DCT_DCT)
    vp9_idct4x4_add(input, dest, stride, eob);
  else
    vp9_iht4x4_16_add(input, dest, stride, tx_type);
}

void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                    int stride, int eob) {
  if (tx_type == DCT_DCT) {
    vp9_idct8x8_add(input, dest, stride, eob);
  } else {
    if (eob > 0) {
      vp9_iht8x8_64_add(input, dest, stride, tx_type);
    }
  }
}

void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                      int stride, int eob) {
  if (tx_type == DCT_DCT) {
    vp9_idct16x16_add(input, dest, stride, eob);
  } else {
    if (eob > 0) {
      vp9_iht16x16_256_add(input, dest, stride, tx_type);
    }
  }
}
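
/* Typical call pattern (a sketch, not code from this file): after a block's
 * coefficients have been dequantized into an int16_t buffer, the decoder
 * calls the matching wrapper and passes the end-of-block count so the cheap
 * special cases above can be taken, e.g.
 *
 *   vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, eob);
 *
 * where dqcoeff, dst, dst_stride and eob stand for whatever the caller tracks
 * for the current block; the wrappers fall back to the full idct/iht when eob
 * is large.
 */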
