The Tor Browser: media/libvpx/vp9/encoder/vp9

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*

     2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

     3  *

     4  *  Use of this source code is governed by a BSD-style license

     5  *  that can be found in the LICENSE file in the root of the source

     6  *  tree. An additional intellectual property rights grant can be found

     7  *  in the file PATENTS.  All contributing project authors may

     8  *  be found in the AUTHORS file in the root of the source tree.

     9  */

    11 #include <assert.h>

    12 #include <math.h>

    14 #include "./vpx_config.h"

    15 #include "./vp9_rtcd.h"

    17 #include "vp9/common/vp9_blockd.h"

    18 #include "vp9/common/vp9_idct.h"

    19 #include "vp9/common/vp9_systemdependent.h"

    21 #include "vp9/encoder/vp9_dct.h"

    23 static INLINE int fdct_round_shift(int input) {

    24   int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

    25   assert(INT16_MIN <= rv && rv <= INT16_MAX);

    26   return rv;

    27 }

    29 static void fdct4(const int16_t *input, int16_t *output) {

    30   int16_t step[4];

    31   int temp1, temp2;

    33   step[0] = input[0] + input[3];

    34   step[1] = input[1] + input[2];

    35   step[2] = input[1] - input[2];

    36   step[3] = input[0] - input[3];

    38   temp1 = (step[0] + step[1]) * cospi_16_64;

    39   temp2 = (step[0] - step[1]) * cospi_16_64;

    40   output[0] = fdct_round_shift(temp1);

    41   output[2] = fdct_round_shift(temp2);

    42   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;

    43   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;

    44   output[1] = fdct_round_shift(temp1);

    45   output[3] = fdct_round_shift(temp2);

    46 }

    48 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {

    49   // The 2D transform is done with two passes which are actually pretty

    50   // similar. In the first one, we transform the columns and transpose

    51   // the results. In the second one, we transform the rows. To achieve that,

    52   // as the first pass results are transposed, we tranpose the columns (that

    53   // is the transposed rows) and transpose the results (so that it goes back

    54   // in normal/row positions).

    55   int pass;

    56   // We need an intermediate buffer between passes.

    57   int16_t intermediate[4 * 4];

    58   const int16_t *in = input;

    59   int16_t *out = intermediate;

    60   // Do the two transform/transpose passes

    61   for (pass = 0; pass < 2; ++pass) {

    62     /*canbe16*/ int input[4];

    63     /*canbe16*/ int step[4];

    64     /*needs32*/ int temp1, temp2;

    65     int i;

    66     for (i = 0; i < 4; ++i) {

    67       // Load inputs.

    68       if (0 == pass) {

    69         input[0] = in[0 * stride] * 16;

    70         input[1] = in[1 * stride] * 16;

    71         input[2] = in[2 * stride] * 16;

    72         input[3] = in[3 * stride] * 16;

    73         if (i == 0 && input[0]) {

    74           input[0] += 1;

    75         }

    76       } else {

    77         input[0] = in[0 * 4];

    78         input[1] = in[1 * 4];

    79         input[2] = in[2 * 4];

    80         input[3] = in[3 * 4];

    81       }

    82       // Transform.

    83       step[0] = input[0] + input[3];

    84       step[1] = input[1] + input[2];

    85       step[2] = input[1] - input[2];

    86       step[3] = input[0] - input[3];

    87       temp1 = (step[0] + step[1]) * cospi_16_64;

    88       temp2 = (step[0] - step[1]) * cospi_16_64;

    89       out[0] = fdct_round_shift(temp1);

    90       out[2] = fdct_round_shift(temp2);

    91       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;

    92       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;

    93       out[1] = fdct_round_shift(temp1);

    94       out[3] = fdct_round_shift(temp2);

    95       // Do next column (which is a transposed row in second/horizontal pass)

    96       in++;

    97       out += 4;

    98     }

    99     // Setup in/out for next pass.

   100     in = intermediate;

   101     out = output;

   102   }

   104   {

   105     int i, j;

   106     for (i = 0; i < 4; ++i) {

   107       for (j = 0; j < 4; ++j)

   108         output[j + i * 4] = (output[j + i * 4] + 1) >> 2;

   109     }

   110   }

   111 }

   113 static void fadst4(const int16_t *input, int16_t *output) {

   114   int x0, x1, x2, x3;

   115   int s0, s1, s2, s3, s4, s5, s6, s7;

   117   x0 = input[0];

   118   x1 = input[1];

   119   x2 = input[2];

   120   x3 = input[3];

   122   if (!(x0 | x1 | x2 | x3)) {

   123     output[0] = output[1] = output[2] = output[3] = 0;

   124     return;

   125   }

   127   s0 = sinpi_1_9 * x0;

   128   s1 = sinpi_4_9 * x0;

   129   s2 = sinpi_2_9 * x1;

   130   s3 = sinpi_1_9 * x1;

   131   s4 = sinpi_3_9 * x2;

   132   s5 = sinpi_4_9 * x3;

   133   s6 = sinpi_2_9 * x3;

   134   s7 = x0 + x1 - x3;

   136   x0 = s0 + s2 + s5;

   137   x1 = sinpi_3_9 * s7;

   138   x2 = s1 - s3 + s6;

   139   x3 = s4;

   141   s0 = x0 + x3;

   142   s1 = x1;

   143   s2 = x2 - x3;

   144   s3 = x2 - x0 + x3;

   146   // 1-D transform scaling factor is sqrt(2).

   147   output[0] = fdct_round_shift(s0);

   148   output[1] = fdct_round_shift(s1);

   149   output[2] = fdct_round_shift(s2);

   150   output[3] = fdct_round_shift(s3);

   151 }

   153 static const transform_2d FHT_4[] = {

   154   { fdct4,  fdct4  },  // DCT_DCT  = 0

   155   { fadst4, fdct4  },  // ADST_DCT = 1

   156   { fdct4,  fadst4 },  // DCT_ADST = 2

   157   { fadst4, fadst4 }   // ADST_ADST = 3

   158 };

   160 void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,

   161                         int stride, int tx_type) {

   162   int16_t out[4 * 4];

   163   int16_t *outptr = &out[0];

   164   int i, j;

   165   int16_t temp_in[4], temp_out[4];

   166   const transform_2d ht = FHT_4[tx_type];

   168   // Columns

   169   for (i = 0; i < 4; ++i) {

   170     for (j = 0; j < 4; ++j)

   171       temp_in[j] = input[j * stride + i] * 16;

   172     if (i == 0 && temp_in[0])

   173       temp_in[0] += 1;

   174     ht.cols(temp_in, temp_out);

   175     for (j = 0; j < 4; ++j)

   176       outptr[j * 4 + i] = temp_out[j];

   177   }

   179   // Rows

   180   for (i = 0; i < 4; ++i) {

   181     for (j = 0; j < 4; ++j)

   182       temp_in[j] = out[j + i * 4];

   183     ht.rows(temp_in, temp_out);

   184     for (j = 0; j < 4; ++j)

   185       output[j + i * 4] = (temp_out[j] + 1) >> 2;

   186   }

   187 }

   189 static void fdct8(const int16_t *input, int16_t *output) {

   190   /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

   191   /*needs32*/ int t0, t1, t2, t3;

   192   /*canbe16*/ int x0, x1, x2, x3;

   194   // stage 1

   195   s0 = input[0] + input[7];

   196   s1 = input[1] + input[6];

   197   s2 = input[2] + input[5];

   198   s3 = input[3] + input[4];

   199   s4 = input[3] - input[4];

   200   s5 = input[2] - input[5];

   201   s6 = input[1] - input[6];

   202   s7 = input[0] - input[7];

   204   // fdct4(step, step);

   205   x0 = s0 + s3;

   206   x1 = s1 + s2;

   207   x2 = s1 - s2;

   208   x3 = s0 - s3;

   209   t0 = (x0 + x1) * cospi_16_64;

   210   t1 = (x0 - x1) * cospi_16_64;

   211   t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;

   212   t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;

   213   output[0] = fdct_round_shift(t0);

   214   output[2] = fdct_round_shift(t2);

   215   output[4] = fdct_round_shift(t1);

   216   output[6] = fdct_round_shift(t3);

   218   // Stage 2

   219   t0 = (s6 - s5) * cospi_16_64;

   220   t1 = (s6 + s5) * cospi_16_64;

   221   t2 = fdct_round_shift(t0);

   222   t3 = fdct_round_shift(t1);

   224   // Stage 3

   225   x0 = s4 + t2;

   226   x1 = s4 - t2;

   227   x2 = s7 - t3;

   228   x3 = s7 + t3;

   230   // Stage 4

   231   t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;

   232   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;

   233   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

   234   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;

   235   output[1] = fdct_round_shift(t0);

   236   output[3] = fdct_round_shift(t2);

   237   output[5] = fdct_round_shift(t1);

   238   output[7] = fdct_round_shift(t3);

   239 }

   241 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {

   242   int i, j;

   243   int16_t intermediate[64];

   245   // Transform columns

   246   {

   247     int16_t *output = intermediate;

   248     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

   249     /*needs32*/ int t0, t1, t2, t3;

   250     /*canbe16*/ int x0, x1, x2, x3;

   252     int i;

   253     for (i = 0; i < 8; i++) {

   254       // stage 1

   255       s0 = (input[0 * stride] + input[7 * stride]) * 4;

   256       s1 = (input[1 * stride] + input[6 * stride]) * 4;

   257       s2 = (input[2 * stride] + input[5 * stride]) * 4;

   258       s3 = (input[3 * stride] + input[4 * stride]) * 4;

   259       s4 = (input[3 * stride] - input[4 * stride]) * 4;

   260       s5 = (input[2 * stride] - input[5 * stride]) * 4;

   261       s6 = (input[1 * stride] - input[6 * stride]) * 4;

   262       s7 = (input[0 * stride] - input[7 * stride]) * 4;

   264       // fdct4(step, step);

   265       x0 = s0 + s3;

   266       x1 = s1 + s2;

   267       x2 = s1 - s2;

   268       x3 = s0 - s3;

   269       t0 = (x0 + x1) * cospi_16_64;

   270       t1 = (x0 - x1) * cospi_16_64;

   271       t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;

   272       t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;

   273       output[0 * 8] = fdct_round_shift(t0);

   274       output[2 * 8] = fdct_round_shift(t2);

   275       output[4 * 8] = fdct_round_shift(t1);

   276       output[6 * 8] = fdct_round_shift(t3);

   278       // Stage 2

   279       t0 = (s6 - s5) * cospi_16_64;

   280       t1 = (s6 + s5) * cospi_16_64;

   281       t2 = fdct_round_shift(t0);

   282       t3 = fdct_round_shift(t1);

   284       // Stage 3

   285       x0 = s4 + t2;

   286       x1 = s4 - t2;

   287       x2 = s7 - t3;

   288       x3 = s7 + t3;

   290       // Stage 4

   291       t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;

   292       t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;

   293       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

   294       t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;

   295       output[1 * 8] = fdct_round_shift(t0);

   296       output[3 * 8] = fdct_round_shift(t2);

   297       output[5 * 8] = fdct_round_shift(t1);

   298       output[7 * 8] = fdct_round_shift(t3);

   299       input++;

   300       output++;

   301     }

   302   }

   304   // Rows

   305   for (i = 0; i < 8; ++i) {

   306     fdct8(&intermediate[i * 8], &final_output[i * 8]);

   307     for (j = 0; j < 8; ++j)

   308       final_output[j + i * 8] /= 2;

   309   }

   310 }

   312 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {

   313   // The 2D transform is done with two passes which are actually pretty

   314   // similar. In the first one, we transform the columns and transpose

   315   // the results. In the second one, we transform the rows. To achieve that,

   316   // as the first pass results are transposed, we tranpose the columns (that

   317   // is the transposed rows) and transpose the results (so that it goes back

   318   // in normal/row positions).

   319   int pass;

   320   // We need an intermediate buffer between passes.

   321   int16_t intermediate[256];

   322   const int16_t *in = input;

   323   int16_t *out = intermediate;

   324   // Do the two transform/transpose passes

   325   for (pass = 0; pass < 2; ++pass) {

   326     /*canbe16*/ int step1[8];

   327     /*canbe16*/ int step2[8];

   328     /*canbe16*/ int step3[8];

   329     /*canbe16*/ int input[8];

   330     /*needs32*/ int temp1, temp2;

   331     int i;

   332     for (i = 0; i < 16; i++) {

   333       if (0 == pass) {

   334         // Calculate input for the first 8 results.

   335         input[0] = (in[0 * stride] + in[15 * stride]) * 4;

   336         input[1] = (in[1 * stride] + in[14 * stride]) * 4;

   337         input[2] = (in[2 * stride] + in[13 * stride]) * 4;

   338         input[3] = (in[3 * stride] + in[12 * stride]) * 4;

   339         input[4] = (in[4 * stride] + in[11 * stride]) * 4;

   340         input[5] = (in[5 * stride] + in[10 * stride]) * 4;

   341         input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;

   342         input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;

   343         // Calculate input for the next 8 results.

   344         step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;

   345         step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;

   346         step1[2] = (in[5 * stride] - in[10 * stride]) * 4;

   347         step1[3] = (in[4 * stride] - in[11 * stride]) * 4;

   348         step1[4] = (in[3 * stride] - in[12 * stride]) * 4;

   349         step1[5] = (in[2 * stride] - in[13 * stride]) * 4;

   350         step1[6] = (in[1 * stride] - in[14 * stride]) * 4;

   351         step1[7] = (in[0 * stride] - in[15 * stride]) * 4;

   352       } else {

   353         // Calculate input for the first 8 results.

   354         input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);

   355         input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);

   356         input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);

   357         input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);

   358         input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);

   359         input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);

   360         input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);

   361         input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);

   362         // Calculate input for the next 8 results.

   363         step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);

   364         step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);

   365         step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);

   366         step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);

   367         step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);

   368         step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);

   369         step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);

   370         step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);

   371       }

   372       // Work on the first eight values; fdct8(input, even_results);

   373       {

   374         /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

   375         /*needs32*/ int t0, t1, t2, t3;

   376         /*canbe16*/ int x0, x1, x2, x3;

   378         // stage 1

   379         s0 = input[0] + input[7];

   380         s1 = input[1] + input[6];

   381         s2 = input[2] + input[5];

   382         s3 = input[3] + input[4];

   383         s4 = input[3] - input[4];

   384         s5 = input[2] - input[5];

   385         s6 = input[1] - input[6];

   386         s7 = input[0] - input[7];

   388         // fdct4(step, step);

   389         x0 = s0 + s3;

   390         x1 = s1 + s2;

   391         x2 = s1 - s2;

   392         x3 = s0 - s3;

   393         t0 = (x0 + x1) * cospi_16_64;

   394         t1 = (x0 - x1) * cospi_16_64;

   395         t2 = x3 * cospi_8_64  + x2 * cospi_24_64;

   396         t3 = x3 * cospi_24_64 - x2 * cospi_8_64;

   397         out[0] = fdct_round_shift(t0);

   398         out[4] = fdct_round_shift(t2);

   399         out[8] = fdct_round_shift(t1);

   400         out[12] = fdct_round_shift(t3);

   402         // Stage 2

   403         t0 = (s6 - s5) * cospi_16_64;

   404         t1 = (s6 + s5) * cospi_16_64;

   405         t2 = fdct_round_shift(t0);

   406         t3 = fdct_round_shift(t1);

   408         // Stage 3

   409         x0 = s4 + t2;

   410         x1 = s4 - t2;

   411         x2 = s7 - t3;

   412         x3 = s7 + t3;

   414         // Stage 4

   415         t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;

   416         t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;

   417         t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

   418         t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;

   419         out[2] = fdct_round_shift(t0);

   420         out[6] = fdct_round_shift(t2);

   421         out[10] = fdct_round_shift(t1);

   422         out[14] = fdct_round_shift(t3);

   423       }

   424       // Work on the next eight values; step1 -> odd_results

   425       {

   426         // step 2

   427         temp1 = (step1[5] - step1[2]) * cospi_16_64;

   428         temp2 = (step1[4] - step1[3]) * cospi_16_64;

   429         step2[2] = fdct_round_shift(temp1);

   430         step2[3] = fdct_round_shift(temp2);

   431         temp1 = (step1[4] + step1[3]) * cospi_16_64;

   432         temp2 = (step1[5] + step1[2]) * cospi_16_64;

   433         step2[4] = fdct_round_shift(temp1);

   434         step2[5] = fdct_round_shift(temp2);

   435         // step 3

   436         step3[0] = step1[0] + step2[3];

   437         step3[1] = step1[1] + step2[2];

   438         step3[2] = step1[1] - step2[2];

   439         step3[3] = step1[0] - step2[3];

   440         step3[4] = step1[7] - step2[4];

   441         step3[5] = step1[6] - step2[5];

   442         step3[6] = step1[6] + step2[5];

   443         step3[7] = step1[7] + step2[4];

   444         // step 4

   445         temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;

   446         temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;

   447         step2[1] = fdct_round_shift(temp1);

   448         step2[2] = fdct_round_shift(temp2);

   449         temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;

   450         temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;

   451         step2[5] = fdct_round_shift(temp1);

   452         step2[6] = fdct_round_shift(temp2);

   453         // step 5

   454         step1[0] = step3[0] + step2[1];

   455         step1[1] = step3[0] - step2[1];

   456         step1[2] = step3[3] - step2[2];

   457         step1[3] = step3[3] + step2[2];

   458         step1[4] = step3[4] + step2[5];

   459         step1[5] = step3[4] - step2[5];

   460         step1[6] = step3[7] - step2[6];

   461         step1[7] = step3[7] + step2[6];

   462         // step 6

   463         temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;

   464         temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;

   465         out[1] = fdct_round_shift(temp1);

   466         out[9] = fdct_round_shift(temp2);

   467         temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;

   468         temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;

   469         out[5] = fdct_round_shift(temp1);

   470         out[13] = fdct_round_shift(temp2);

   471         temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;

   472         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;

   473         out[3] = fdct_round_shift(temp1);

   474         out[11] = fdct_round_shift(temp2);

   475         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;

   476         temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;

   477         out[7] = fdct_round_shift(temp1);

   478         out[15] = fdct_round_shift(temp2);

   479       }

   480       // Do next column (which is a transposed row in second/horizontal pass)

   481       in++;

   482       out += 16;

   483     }

   484     // Setup in/out for next pass.

   485     in = intermediate;

   486     out = output;

   487   }

   488 }

   490 static void fadst8(const int16_t *input, int16_t *output) {

   491   int s0, s1, s2, s3, s4, s5, s6, s7;

   493   int x0 = input[7];

   494   int x1 = input[0];

   495   int x2 = input[5];

   496   int x3 = input[2];

   497   int x4 = input[3];

   498   int x5 = input[4];

   499   int x6 = input[1];

   500   int x7 = input[6];

   502   // stage 1

   503   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;

   504   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;

   505   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

   506   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

   507   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

   508   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;

   509   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;

   510   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

   512   x0 = fdct_round_shift(s0 + s4);

   513   x1 = fdct_round_shift(s1 + s5);

   514   x2 = fdct_round_shift(s2 + s6);

   515   x3 = fdct_round_shift(s3 + s7);

   516   x4 = fdct_round_shift(s0 - s4);

   517   x5 = fdct_round_shift(s1 - s5);

   518   x6 = fdct_round_shift(s2 - s6);

   519   x7 = fdct_round_shift(s3 - s7);

   521   // stage 2

   522   s0 = x0;

   523   s1 = x1;

   524   s2 = x2;

   525   s3 = x3;

   526   s4 = cospi_8_64  * x4 + cospi_24_64 * x5;

   527   s5 = cospi_24_64 * x4 - cospi_8_64  * x5;

   528   s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;

   529   s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;

   531   x0 = s0 + s2;

   532   x1 = s1 + s3;

   533   x2 = s0 - s2;

   534   x3 = s1 - s3;

   535   x4 = fdct_round_shift(s4 + s6);

   536   x5 = fdct_round_shift(s5 + s7);

   537   x6 = fdct_round_shift(s4 - s6);

   538   x7 = fdct_round_shift(s5 - s7);

   540   // stage 3

   541   s2 = cospi_16_64 * (x2 + x3);

   542   s3 = cospi_16_64 * (x2 - x3);

   543   s6 = cospi_16_64 * (x6 + x7);

   544   s7 = cospi_16_64 * (x6 - x7);

   546   x2 = fdct_round_shift(s2);

   547   x3 = fdct_round_shift(s3);

   548   x6 = fdct_round_shift(s6);

   549   x7 = fdct_round_shift(s7);

   551   output[0] =   x0;

   552   output[1] = - x4;

   553   output[2] =   x6;

   554   output[3] = - x2;

   555   output[4] =   x3;

   556   output[5] = - x7;

   557   output[6] =   x5;

   558   output[7] = - x1;

   559 }

   561 static const transform_2d FHT_8[] = {

   562   { fdct8,  fdct8  },  // DCT_DCT  = 0

   563   { fadst8, fdct8  },  // ADST_DCT = 1

   564   { fdct8,  fadst8 },  // DCT_ADST = 2

   565   { fadst8, fadst8 }   // ADST_ADST = 3

   566 };

   568 void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,

   569                         int stride, int tx_type) {

   570   int16_t out[64];

   571   int16_t *outptr = &out[0];

   572   int i, j;

   573   int16_t temp_in[8], temp_out[8];

   574   const transform_2d ht = FHT_8[tx_type];

   576   // Columns

   577   for (i = 0; i < 8; ++i) {

   578     for (j = 0; j < 8; ++j)

   579       temp_in[j] = input[j * stride + i] * 4;

   580     ht.cols(temp_in, temp_out);

   581     for (j = 0; j < 8; ++j)

   582       outptr[j * 8 + i] = temp_out[j];

   583   }

   585   // Rows

   586   for (i = 0; i < 8; ++i) {

   587     for (j = 0; j < 8; ++j)

   588       temp_in[j] = out[j + i * 8];

   589     ht.rows(temp_in, temp_out);

   590     for (j = 0; j < 8; ++j)

   591       output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;

   592   }

   593 }

   595 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per

   596    pixel. */

   597 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {

   598   int i;

   599   int a1, b1, c1, d1, e1;

   600   const int16_t *ip = input;

   601   int16_t *op = output;

   603   for (i = 0; i < 4; i++) {

   604     a1 = ip[0 * stride];

   605     b1 = ip[1 * stride];

   606     c1 = ip[2 * stride];

   607     d1 = ip[3 * stride];

   609     a1 += b1;

   610     d1 = d1 - c1;

   611     e1 = (a1 - d1) >> 1;

   612     b1 = e1 - b1;

   613     c1 = e1 - c1;

   614     a1 -= c1;

   615     d1 += b1;

   616     op[0] = a1;

   617     op[4] = c1;

   618     op[8] = d1;

   619     op[12] = b1;

   621     ip++;

   622     op++;

   623   }

   624   ip = output;

   625   op = output;

   627   for (i = 0; i < 4; i++) {

   628     a1 = ip[0];

   629     b1 = ip[1];

   630     c1 = ip[2];

   631     d1 = ip[3];

   633     a1 += b1;

   634     d1 -= c1;

   635     e1 = (a1 - d1) >> 1;

   636     b1 = e1 - b1;

   637     c1 = e1 - c1;

   638     a1 -= c1;

   639     d1 += b1;

   640     op[0] = a1 * UNIT_QUANT_FACTOR;

   641     op[1] = c1 * UNIT_QUANT_FACTOR;

   642     op[2] = d1 * UNIT_QUANT_FACTOR;

   643     op[3] = b1 * UNIT_QUANT_FACTOR;

   645     ip += 4;

   646     op += 4;

   647   }

   648 }

   650 // Rewrote to use same algorithm as others.

   651 static void fdct16(const int16_t in[16], int16_t out[16]) {

   652   /*canbe16*/ int step1[8];

   653   /*canbe16*/ int step2[8];

   654   /*canbe16*/ int step3[8];

   655   /*canbe16*/ int input[8];

   656   /*needs32*/ int temp1, temp2;

   658   // step 1

   659   input[0] = in[0] + in[15];

   660   input[1] = in[1] + in[14];

   661   input[2] = in[2] + in[13];

   662   input[3] = in[3] + in[12];

   663   input[4] = in[4] + in[11];

   664   input[5] = in[5] + in[10];

   665   input[6] = in[6] + in[ 9];

   666   input[7] = in[7] + in[ 8];

   668   step1[0] = in[7] - in[ 8];

   669   step1[1] = in[6] - in[ 9];

   670   step1[2] = in[5] - in[10];

   671   step1[3] = in[4] - in[11];

   672   step1[4] = in[3] - in[12];

   673   step1[5] = in[2] - in[13];

   674   step1[6] = in[1] - in[14];

   675   step1[7] = in[0] - in[15];

   677   // fdct8(step, step);

   678   {

   679     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

   680     /*needs32*/ int t0, t1, t2, t3;

   681     /*canbe16*/ int x0, x1, x2, x3;

   683     // stage 1

   684     s0 = input[0] + input[7];

   685     s1 = input[1] + input[6];

   686     s2 = input[2] + input[5];

   687     s3 = input[3] + input[4];

   688     s4 = input[3] - input[4];

   689     s5 = input[2] - input[5];

   690     s6 = input[1] - input[6];

   691     s7 = input[0] - input[7];

   693     // fdct4(step, step);

   694     x0 = s0 + s3;

   695     x1 = s1 + s2;

   696     x2 = s1 - s2;

   697     x3 = s0 - s3;

   698     t0 = (x0 + x1) * cospi_16_64;

   699     t1 = (x0 - x1) * cospi_16_64;

   700     t2 = x3 * cospi_8_64  + x2 * cospi_24_64;

   701     t3 = x3 * cospi_24_64 - x2 * cospi_8_64;

   702     out[0] = fdct_round_shift(t0);

   703     out[4] = fdct_round_shift(t2);

   704     out[8] = fdct_round_shift(t1);

   705     out[12] = fdct_round_shift(t3);

   707     // Stage 2

   708     t0 = (s6 - s5) * cospi_16_64;

   709     t1 = (s6 + s5) * cospi_16_64;

   710     t2 = fdct_round_shift(t0);

   711     t3 = fdct_round_shift(t1);

   713     // Stage 3

   714     x0 = s4 + t2;

   715     x1 = s4 - t2;

   716     x2 = s7 - t3;

   717     x3 = s7 + t3;

   719     // Stage 4

   720     t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;

   721     t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;

   722     t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

   723     t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;

   724     out[2] = fdct_round_shift(t0);

   725     out[6] = fdct_round_shift(t2);

   726     out[10] = fdct_round_shift(t1);

   727     out[14] = fdct_round_shift(t3);

   728   }

   730   // step 2

   731   temp1 = (step1[5] - step1[2]) * cospi_16_64;

   732   temp2 = (step1[4] - step1[3]) * cospi_16_64;

   733   step2[2] = fdct_round_shift(temp1);

   734   step2[3] = fdct_round_shift(temp2);

   735   temp1 = (step1[4] + step1[3]) * cospi_16_64;

   736   temp2 = (step1[5] + step1[2]) * cospi_16_64;

   737   step2[4] = fdct_round_shift(temp1);

   738   step2[5] = fdct_round_shift(temp2);

   740   // step 3

   741   step3[0] = step1[0] + step2[3];

   742   step3[1] = step1[1] + step2[2];

   743   step3[2] = step1[1] - step2[2];

   744   step3[3] = step1[0] - step2[3];

   745   step3[4] = step1[7] - step2[4];

   746   step3[5] = step1[6] - step2[5];

   747   step3[6] = step1[6] + step2[5];

   748   step3[7] = step1[7] + step2[4];

   750   // step 4

   751   temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;

   752   temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;

   753   step2[1] = fdct_round_shift(temp1);

   754   step2[2] = fdct_round_shift(temp2);

   755   temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;

   756   temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;

   757   step2[5] = fdct_round_shift(temp1);

   758   step2[6] = fdct_round_shift(temp2);

   760   // step 5

   761   step1[0] = step3[0] + step2[1];

   762   step1[1] = step3[0] - step2[1];

   763   step1[2] = step3[3] - step2[2];

   764   step1[3] = step3[3] + step2[2];

   765   step1[4] = step3[4] + step2[5];

   766   step1[5] = step3[4] - step2[5];

   767   step1[6] = step3[7] - step2[6];

   768   step1[7] = step3[7] + step2[6];

   770   // step 6

   771   temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;

   772   temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;

   773   out[1] = fdct_round_shift(temp1);

   774   out[9] = fdct_round_shift(temp2);

   776   temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;

   777   temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;

   778   out[5] = fdct_round_shift(temp1);

   779   out[13] = fdct_round_shift(temp2);

   781   temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;

   782   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;

   783   out[3] = fdct_round_shift(temp1);

   784   out[11] = fdct_round_shift(temp2);

   786   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;

   787   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;

   788   out[7] = fdct_round_shift(temp1);

   789   out[15] = fdct_round_shift(temp2);

   790 }

   792 static void fadst16(const int16_t *input, int16_t *output) {

   793   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

   795   int x0 = input[15];

   796   int x1 = input[0];

   797   int x2 = input[13];

   798   int x3 = input[2];

   799   int x4 = input[11];

   800   int x5 = input[4];

   801   int x6 = input[9];

   802   int x7 = input[6];

   803   int x8 = input[7];

   804   int x9 = input[8];

   805   int x10 = input[5];

   806   int x11 = input[10];

   807   int x12 = input[3];

   808   int x13 = input[12];

   809   int x14 = input[1];

   810   int x15 = input[14];

   812   // stage 1

   813   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;

   814   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;

   815   s2 = x2 * cospi_5_64  + x3 * cospi_27_64;

   816   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;

   817   s4 = x4 * cospi_9_64  + x5 * cospi_23_64;

   818   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;

   819   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;

   820   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;

   821   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;

   822   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;

   823   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;

   824   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;

   825   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;

   826   s13 = x12 * cospi_7_64  - x13 * cospi_25_64;

   827   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;

   828   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

   830   x0 = fdct_round_shift(s0 + s8);

   831   x1 = fdct_round_shift(s1 + s9);

   832   x2 = fdct_round_shift(s2 + s10);

   833   x3 = fdct_round_shift(s3 + s11);

   834   x4 = fdct_round_shift(s4 + s12);

   835   x5 = fdct_round_shift(s5 + s13);

   836   x6 = fdct_round_shift(s6 + s14);

   837   x7 = fdct_round_shift(s7 + s15);

   838   x8  = fdct_round_shift(s0 - s8);

   839   x9  = fdct_round_shift(s1 - s9);

   840   x10 = fdct_round_shift(s2 - s10);

   841   x11 = fdct_round_shift(s3 - s11);

   842   x12 = fdct_round_shift(s4 - s12);

   843   x13 = fdct_round_shift(s5 - s13);

   844   x14 = fdct_round_shift(s6 - s14);

   845   x15 = fdct_round_shift(s7 - s15);

   847   // stage 2

   848   s0 = x0;

   849   s1 = x1;

   850   s2 = x2;

   851   s3 = x3;

   852   s4 = x4;

   853   s5 = x5;

   854   s6 = x6;

   855   s7 = x7;

   856   s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;

   857   s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;

   858   s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;

   859   s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;

   860   s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;

   861   s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;

   862   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;

   863   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

   865   x0 = s0 + s4;

   866   x1 = s1 + s5;

   867   x2 = s2 + s6;

   868   x3 = s3 + s7;

   869   x4 = s0 - s4;

   870   x5 = s1 - s5;

   871   x6 = s2 - s6;

   872   x7 = s3 - s7;

   873   x8 = fdct_round_shift(s8 + s12);

   874   x9 = fdct_round_shift(s9 + s13);

   875   x10 = fdct_round_shift(s10 + s14);

   876   x11 = fdct_round_shift(s11 + s15);

   877   x12 = fdct_round_shift(s8 - s12);

   878   x13 = fdct_round_shift(s9 - s13);

   879   x14 = fdct_round_shift(s10 - s14);

   880   x15 = fdct_round_shift(s11 - s15);

   882   // stage 3

   883   s0 = x0;

   884   s1 = x1;

   885   s2 = x2;

   886   s3 = x3;

   887   s4 = x4 * cospi_8_64  + x5 * cospi_24_64;

   888   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;

   889   s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;

   890   s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;

   891   s8 = x8;

   892   s9 = x9;

   893   s10 = x10;

   894   s11 = x11;

   895   s12 = x12 * cospi_8_64  + x13 * cospi_24_64;

   896   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;

   897   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;

   898   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

   900   x0 = s0 + s2;

   901   x1 = s1 + s3;

   902   x2 = s0 - s2;

   903   x3 = s1 - s3;

   904   x4 = fdct_round_shift(s4 + s6);

   905   x5 = fdct_round_shift(s5 + s7);

   906   x6 = fdct_round_shift(s4 - s6);

   907   x7 = fdct_round_shift(s5 - s7);

   908   x8 = s8 + s10;

   909   x9 = s9 + s11;

   910   x10 = s8 - s10;

   911   x11 = s9 - s11;

   912   x12 = fdct_round_shift(s12 + s14);

   913   x13 = fdct_round_shift(s13 + s15);

   914   x14 = fdct_round_shift(s12 - s14);

   915   x15 = fdct_round_shift(s13 - s15);

   917   // stage 4

   918   s2 = (- cospi_16_64) * (x2 + x3);

   919   s3 = cospi_16_64 * (x2 - x3);

   920   s6 = cospi_16_64 * (x6 + x7);

   921   s7 = cospi_16_64 * (- x6 + x7);

   922   s10 = cospi_16_64 * (x10 + x11);

   923   s11 = cospi_16_64 * (- x10 + x11);

   924   s14 = (- cospi_16_64) * (x14 + x15);

   925   s15 = cospi_16_64 * (x14 - x15);

   927   x2 = fdct_round_shift(s2);

   928   x3 = fdct_round_shift(s3);

   929   x6 = fdct_round_shift(s6);

   930   x7 = fdct_round_shift(s7);

   931   x10 = fdct_round_shift(s10);

   932   x11 = fdct_round_shift(s11);

   933   x14 = fdct_round_shift(s14);

   934   x15 = fdct_round_shift(s15);

   936   output[0] = x0;

   937   output[1] = - x8;

   938   output[2] = x12;

   939   output[3] = - x4;

   940   output[4] = x6;

   941   output[5] = x14;

   942   output[6] = x10;

   943   output[7] = x2;

   944   output[8] = x3;

   945   output[9] =  x11;

   946   output[10] = x15;

   947   output[11] = x7;

   948   output[12] = x5;

   949   output[13] = - x13;

   950   output[14] = x9;

   951   output[15] = - x1;

   952 }

   954 static const transform_2d FHT_16[] = {

   955   { fdct16,  fdct16  },  // DCT_DCT  = 0

   956   { fadst16, fdct16  },  // ADST_DCT = 1

   957   { fdct16,  fadst16 },  // DCT_ADST = 2

   958   { fadst16, fadst16 }   // ADST_ADST = 3

   959 };

   961 void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,

   962                           int stride, int tx_type) {

   963   int16_t out[256];

   964   int16_t *outptr = &out[0];

   965   int i, j;

   966   int16_t temp_in[16], temp_out[16];

   967   const transform_2d ht = FHT_16[tx_type];

   969   // Columns

   970   for (i = 0; i < 16; ++i) {

   971     for (j = 0; j < 16; ++j)

   972       temp_in[j] = input[j * stride + i] * 4;

   973     ht.cols(temp_in, temp_out);

   974     for (j = 0; j < 16; ++j)

   975       outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;

   976 //      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

   977   }

   979   // Rows

   980   for (i = 0; i < 16; ++i) {

   981     for (j = 0; j < 16; ++j)

   982       temp_in[j] = out[j + i * 16];

   983     ht.rows(temp_in, temp_out);

   984     for (j = 0; j < 16; ++j)

   985       output[j + i * 16] = temp_out[j];

   986   }

   987 }

   989 static INLINE int dct_32_round(int input) {

   990   int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

   991   assert(-131072 <= rv && rv <= 131071);

   992   return rv;

   993 }

   995 static INLINE int half_round_shift(int input) {

   996   int rv = (input + 1 + (input < 0)) >> 2;

   997   return rv;

   998 }

  1000 static void dct32_1d(const int *input, int *output, int round) {

  1001   int step[32];

  1002   // Stage 1

  1003   step[0] = input[0] + input[(32 - 1)];

  1004   step[1] = input[1] + input[(32 - 2)];

  1005   step[2] = input[2] + input[(32 - 3)];

  1006   step[3] = input[3] + input[(32 - 4)];

  1007   step[4] = input[4] + input[(32 - 5)];

  1008   step[5] = input[5] + input[(32 - 6)];

  1009   step[6] = input[6] + input[(32 - 7)];

  1010   step[7] = input[7] + input[(32 - 8)];

  1011   step[8] = input[8] + input[(32 - 9)];

  1012   step[9] = input[9] + input[(32 - 10)];

  1013   step[10] = input[10] + input[(32 - 11)];

  1014   step[11] = input[11] + input[(32 - 12)];

  1015   step[12] = input[12] + input[(32 - 13)];

  1016   step[13] = input[13] + input[(32 - 14)];

  1017   step[14] = input[14] + input[(32 - 15)];

  1018   step[15] = input[15] + input[(32 - 16)];

  1019   step[16] = -input[16] + input[(32 - 17)];

  1020   step[17] = -input[17] + input[(32 - 18)];

  1021   step[18] = -input[18] + input[(32 - 19)];

  1022   step[19] = -input[19] + input[(32 - 20)];

  1023   step[20] = -input[20] + input[(32 - 21)];

  1024   step[21] = -input[21] + input[(32 - 22)];

  1025   step[22] = -input[22] + input[(32 - 23)];

  1026   step[23] = -input[23] + input[(32 - 24)];

  1027   step[24] = -input[24] + input[(32 - 25)];

  1028   step[25] = -input[25] + input[(32 - 26)];

  1029   step[26] = -input[26] + input[(32 - 27)];

  1030   step[27] = -input[27] + input[(32 - 28)];

  1031   step[28] = -input[28] + input[(32 - 29)];

  1032   step[29] = -input[29] + input[(32 - 30)];

  1033   step[30] = -input[30] + input[(32 - 31)];

  1034   step[31] = -input[31] + input[(32 - 32)];

  1036   // Stage 2

  1037   output[0] = step[0] + step[16 - 1];

  1038   output[1] = step[1] + step[16 - 2];

  1039   output[2] = step[2] + step[16 - 3];

  1040   output[3] = step[3] + step[16 - 4];

  1041   output[4] = step[4] + step[16 - 5];

  1042   output[5] = step[5] + step[16 - 6];

  1043   output[6] = step[6] + step[16 - 7];

  1044   output[7] = step[7] + step[16 - 8];

  1045   output[8] = -step[8] + step[16 - 9];

  1046   output[9] = -step[9] + step[16 - 10];

  1047   output[10] = -step[10] + step[16 - 11];

  1048   output[11] = -step[11] + step[16 - 12];

  1049   output[12] = -step[12] + step[16 - 13];

  1050   output[13] = -step[13] + step[16 - 14];

  1051   output[14] = -step[14] + step[16 - 15];

  1052   output[15] = -step[15] + step[16 - 16];

  1054   output[16] = step[16];

  1055   output[17] = step[17];

  1056   output[18] = step[18];

  1057   output[19] = step[19];

  1059   output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);

  1060   output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);

  1061   output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);

  1062   output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);

  1064   output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);

  1065   output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);

  1066   output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);

  1067   output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);

  1069   output[28] = step[28];

  1070   output[29] = step[29];

  1071   output[30] = step[30];

  1072   output[31] = step[31];

  1074   // dump the magnitude by 4, hence the intermediate values are within

  1075   // the range of 16 bits.

  1076   if (round) {

  1077     output[0] = half_round_shift(output[0]);

  1078     output[1] = half_round_shift(output[1]);

  1079     output[2] = half_round_shift(output[2]);

  1080     output[3] = half_round_shift(output[3]);

  1081     output[4] = half_round_shift(output[4]);

  1082     output[5] = half_round_shift(output[5]);

  1083     output[6] = half_round_shift(output[6]);

  1084     output[7] = half_round_shift(output[7]);

  1085     output[8] = half_round_shift(output[8]);

  1086     output[9] = half_round_shift(output[9]);

  1087     output[10] = half_round_shift(output[10]);

  1088     output[11] = half_round_shift(output[11]);

  1089     output[12] = half_round_shift(output[12]);

  1090     output[13] = half_round_shift(output[13]);

  1091     output[14] = half_round_shift(output[14]);

  1092     output[15] = half_round_shift(output[15]);

  1094     output[16] = half_round_shift(output[16]);

  1095     output[17] = half_round_shift(output[17]);

  1096     output[18] = half_round_shift(output[18]);

  1097     output[19] = half_round_shift(output[19]);

  1098     output[20] = half_round_shift(output[20]);

  1099     output[21] = half_round_shift(output[21]);

  1100     output[22] = half_round_shift(output[22]);

  1101     output[23] = half_round_shift(output[23]);

  1102     output[24] = half_round_shift(output[24]);

  1103     output[25] = half_round_shift(output[25]);

  1104     output[26] = half_round_shift(output[26]);

  1105     output[27] = half_round_shift(output[27]);

  1106     output[28] = half_round_shift(output[28]);

  1107     output[29] = half_round_shift(output[29]);

  1108     output[30] = half_round_shift(output[30]);

  1109     output[31] = half_round_shift(output[31]);

  1110   }

  1112   // Stage 3

  1113   step[0] = output[0] + output[(8 - 1)];

  1114   step[1] = output[1] + output[(8 - 2)];

  1115   step[2] = output[2] + output[(8 - 3)];

  1116   step[3] = output[3] + output[(8 - 4)];

  1117   step[4] = -output[4] + output[(8 - 5)];

  1118   step[5] = -output[5] + output[(8 - 6)];

  1119   step[6] = -output[6] + output[(8 - 7)];

  1120   step[7] = -output[7] + output[(8 - 8)];

  1121   step[8] = output[8];

  1122   step[9] = output[9];

  1123   step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);

  1124   step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);

  1125   step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);

  1126   step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);

  1127   step[14] = output[14];

  1128   step[15] = output[15];

  1130   step[16] = output[16] + output[23];

  1131   step[17] = output[17] + output[22];

  1132   step[18] = output[18] + output[21];

  1133   step[19] = output[19] + output[20];

  1134   step[20] = -output[20] + output[19];

  1135   step[21] = -output[21] + output[18];

  1136   step[22] = -output[22] + output[17];

  1137   step[23] = -output[23] + output[16];

  1138   step[24] = -output[24] + output[31];

  1139   step[25] = -output[25] + output[30];

  1140   step[26] = -output[26] + output[29];

  1141   step[27] = -output[27] + output[28];

  1142   step[28] = output[28] + output[27];

  1143   step[29] = output[29] + output[26];

  1144   step[30] = output[30] + output[25];

  1145   step[31] = output[31] + output[24];

  1147   // Stage 4

  1148   output[0] = step[0] + step[3];

  1149   output[1] = step[1] + step[2];

  1150   output[2] = -step[2] + step[1];

  1151   output[3] = -step[3] + step[0];

  1152   output[4] = step[4];

  1153   output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);

  1154   output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);

  1155   output[7] = step[7];

  1156   output[8] = step[8] + step[11];

  1157   output[9] = step[9] + step[10];

  1158   output[10] = -step[10] + step[9];

  1159   output[11] = -step[11] + step[8];

  1160   output[12] = -step[12] + step[15];

  1161   output[13] = -step[13] + step[14];

  1162   output[14] = step[14] + step[13];

  1163   output[15] = step[15] + step[12];

  1165   output[16] = step[16];

  1166   output[17] = step[17];

  1167   output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);

  1168   output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);

  1169   output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);

  1170   output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);

  1171   output[22] = step[22];

  1172   output[23] = step[23];

  1173   output[24] = step[24];

  1174   output[25] = step[25];

  1175   output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);

  1176   output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);

  1177   output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);

  1178   output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);

  1179   output[30] = step[30];

  1180   output[31] = step[31];

  1182   // Stage 5

  1183   step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);

  1184   step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);

  1185   step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);

  1186   step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);

  1187   step[4] = output[4] + output[5];

  1188   step[5] = -output[5] + output[4];

  1189   step[6] = -output[6] + output[7];

  1190   step[7] = output[7] + output[6];

  1191   step[8] = output[8];

  1192   step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);

  1193   step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);

  1194   step[11] = output[11];

  1195   step[12] = output[12];

  1196   step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);

  1197   step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);

  1198   step[15] = output[15];

  1200   step[16] = output[16] + output[19];

  1201   step[17] = output[17] + output[18];

  1202   step[18] = -output[18] + output[17];

  1203   step[19] = -output[19] + output[16];

  1204   step[20] = -output[20] + output[23];

  1205   step[21] = -output[21] + output[22];

  1206   step[22] = output[22] + output[21];

  1207   step[23] = output[23] + output[20];

  1208   step[24] = output[24] + output[27];

  1209   step[25] = output[25] + output[26];

  1210   step[26] = -output[26] + output[25];

  1211   step[27] = -output[27] + output[24];

  1212   step[28] = -output[28] + output[31];

  1213   step[29] = -output[29] + output[30];

  1214   step[30] = output[30] + output[29];

  1215   step[31] = output[31] + output[28];

  1217   // Stage 6

  1218   output[0] = step[0];

  1219   output[1] = step[1];

  1220   output[2] = step[2];

  1221   output[3] = step[3];

  1222   output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);

  1223   output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);

  1224   output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);

  1225   output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);

  1226   output[8] = step[8] + step[9];

  1227   output[9] = -step[9] + step[8];

  1228   output[10] = -step[10] + step[11];

  1229   output[11] = step[11] + step[10];

  1230   output[12] = step[12] + step[13];

  1231   output[13] = -step[13] + step[12];

  1232   output[14] = -step[14] + step[15];

  1233   output[15] = step[15] + step[14];

  1235   output[16] = step[16];

  1236   output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);

  1237   output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);

  1238   output[19] = step[19];

  1239   output[20] = step[20];

  1240   output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);

  1241   output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);

  1242   output[23] = step[23];

  1243   output[24] = step[24];

  1244   output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);

  1245   output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);

  1246   output[27] = step[27];

  1247   output[28] = step[28];

  1248   output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);

  1249   output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);

  1250   output[31] = step[31];

  1252   // Stage 7

  1253   step[0] = output[0];

  1254   step[1] = output[1];

  1255   step[2] = output[2];

  1256   step[3] = output[3];

  1257   step[4] = output[4];

  1258   step[5] = output[5];

  1259   step[6] = output[6];

  1260   step[7] = output[7];

  1261   step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);

  1262   step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);

  1263   step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);

  1264   step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);

  1265   step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);

  1266   step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);

  1267   step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);

  1268   step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);

  1270   step[16] = output[16] + output[17];

  1271   step[17] = -output[17] + output[16];

  1272   step[18] = -output[18] + output[19];

  1273   step[19] = output[19] + output[18];

  1274   step[20] = output[20] + output[21];

  1275   step[21] = -output[21] + output[20];

  1276   step[22] = -output[22] + output[23];

  1277   step[23] = output[23] + output[22];

  1278   step[24] = output[24] + output[25];

  1279   step[25] = -output[25] + output[24];

  1280   step[26] = -output[26] + output[27];

  1281   step[27] = output[27] + output[26];

  1282   step[28] = output[28] + output[29];

  1283   step[29] = -output[29] + output[28];

  1284   step[30] = -output[30] + output[31];

  1285   step[31] = output[31] + output[30];

  1287   // Final stage --- outputs indices are bit-reversed.

  1288   output[0]  = step[0];

  1289   output[16] = step[1];

  1290   output[8]  = step[2];

  1291   output[24] = step[3];

  1292   output[4]  = step[4];

  1293   output[20] = step[5];

  1294   output[12] = step[6];

  1295   output[28] = step[7];

  1296   output[2]  = step[8];

  1297   output[18] = step[9];

  1298   output[10] = step[10];

  1299   output[26] = step[11];

  1300   output[6]  = step[12];

  1301   output[22] = step[13];

  1302   output[14] = step[14];

  1303   output[30] = step[15];

  1305   output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);

  1306   output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);

  1307   output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);

  1308   output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);

  1309   output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);

  1310   output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);

  1311   output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);

  1312   output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);

  1313   output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);

  1314   output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);

  1315   output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);

  1316   output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);

  1317   output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);

  1318   output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);

  1319   output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);

  1320   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);

  1321 }

  1323 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {

  1324   int i, j;

  1325   int output[32 * 32];

  1327   // Columns

  1328   for (i = 0; i < 32; ++i) {

  1329     int temp_in[32], temp_out[32];

  1330     for (j = 0; j < 32; ++j)

  1331       temp_in[j] = input[j * stride + i] * 4;

  1332     dct32_1d(temp_in, temp_out, 0);

  1333     for (j = 0; j < 32; ++j)

  1334       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

  1335   }

  1337   // Rows

  1338   for (i = 0; i < 32; ++i) {

  1339     int temp_in[32], temp_out[32];

  1340     for (j = 0; j < 32; ++j)

  1341       temp_in[j] = output[j + i * 32];

  1342     dct32_1d(temp_in, temp_out, 0);

  1343     for (j = 0; j < 32; ++j)

  1344       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;

  1345   }

  1346 }

  1348 // Note that although we use dct_32_round in dct32_1d computation flow,

  1349 // this 2d fdct32x32 for rate-distortion optimization loop is operating

  1350 // within 16 bits precision.

  1351 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {

  1352   int i, j;

  1353   int output[32 * 32];

  1355   // Columns

  1356   for (i = 0; i < 32; ++i) {

  1357     int temp_in[32], temp_out[32];

  1358     for (j = 0; j < 32; ++j)

  1359       temp_in[j] = input[j * stride + i] * 4;

  1360     dct32_1d(temp_in, temp_out, 0);

  1361     for (j = 0; j < 32; ++j)

  1362       // TODO(cd): see quality impact of only doing

  1363       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;

  1364       //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c

  1365       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

  1366   }

  1368   // Rows

  1369   for (i = 0; i < 32; ++i) {

  1370     int temp_in[32], temp_out[32];

  1371     for (j = 0; j < 32; ++j)

  1372       temp_in[j] = output[j + i * 32];

  1373     dct32_1d(temp_in, temp_out, 1);

  1374     for (j = 0; j < 32; ++j)

  1375       out[j + i * 32] = temp_out[j];

  1376   }

  1377 }

  1379 void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,

  1380                 int stride) {

  1381   if (tx_type == DCT_DCT)

  1382     vp9_fdct4x4(input, output, stride);

  1383   else

  1384     vp9_short_fht4x4(input, output, stride, tx_type);

  1385 }

  1387 void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,

  1388                 int stride) {

  1389   if (tx_type == DCT_DCT)

  1390     vp9_fdct8x8(input, output, stride);

  1391   else

  1392     vp9_short_fht8x8(input, output, stride, tx_type);

  1393 }

  1395 void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,

  1396                   int stride) {

  1397   if (tx_type == DCT_DCT)

  1398     vp9_fdct16x16(input, output, stride);

  1399   else

  1400     vp9_short_fht16x16(input, output, stride, tx_type);

  1401 }

The Tor Browser / file revision

media/libvpx/vp9/encoder/vp9_dct.c@b8a032363ba2

media/libvpx/vp9/encoder/vp9_dct.c