media/libvpx/vp9/encoder/vp9_dct.c

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701, complying with the disk avoidance requirements
documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 /*
     2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS.  All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include <assert.h>
    12 #include <math.h>
    14 #include "./vpx_config.h"
    15 #include "./vp9_rtcd.h"
    17 #include "vp9/common/vp9_blockd.h"
    18 #include "vp9/common/vp9_idct.h"
    19 #include "vp9/common/vp9_systemdependent.h"
    21 #include "vp9/encoder/vp9_dct.h"
    23 static INLINE int fdct_round_shift(int input) {
    24   int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
    25   assert(INT16_MIN <= rv && rv <= INT16_MAX);
    26   return rv;
    27 }
    29 static void fdct4(const int16_t *input, int16_t *output) {
    30   int16_t step[4];
    31   int temp1, temp2;
    33   step[0] = input[0] + input[3];
    34   step[1] = input[1] + input[2];
    35   step[2] = input[1] - input[2];
    36   step[3] = input[0] - input[3];
    38   temp1 = (step[0] + step[1]) * cospi_16_64;
    39   temp2 = (step[0] - step[1]) * cospi_16_64;
    40   output[0] = fdct_round_shift(temp1);
    41   output[2] = fdct_round_shift(temp2);
    42   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
    43   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
    44   output[1] = fdct_round_shift(temp1);
    45   output[3] = fdct_round_shift(temp2);
    46 }
    48 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
    49   // The 2D transform is done with two passes which are actually pretty
    50   // similar. In the first one, we transform the columns and transpose
    51   // the results. In the second one, we transform the rows. To achieve that,
    52   // as the first pass results are transposed, we tranpose the columns (that
    53   // is the transposed rows) and transpose the results (so that it goes back
    54   // in normal/row positions).
    55   int pass;
    56   // We need an intermediate buffer between passes.
    57   int16_t intermediate[4 * 4];
    58   const int16_t *in = input;
    59   int16_t *out = intermediate;
    60   // Do the two transform/transpose passes
    61   for (pass = 0; pass < 2; ++pass) {
    62     /*canbe16*/ int input[4];
    63     /*canbe16*/ int step[4];
    64     /*needs32*/ int temp1, temp2;
    65     int i;
    66     for (i = 0; i < 4; ++i) {
    67       // Load inputs.
    68       if (0 == pass) {
    69         input[0] = in[0 * stride] * 16;
    70         input[1] = in[1 * stride] * 16;
    71         input[2] = in[2 * stride] * 16;
    72         input[3] = in[3 * stride] * 16;
    73         if (i == 0 && input[0]) {
    74           input[0] += 1;
    75         }
    76       } else {
    77         input[0] = in[0 * 4];
    78         input[1] = in[1 * 4];
    79         input[2] = in[2 * 4];
    80         input[3] = in[3 * 4];
    81       }
    82       // Transform.
    83       step[0] = input[0] + input[3];
    84       step[1] = input[1] + input[2];
    85       step[2] = input[1] - input[2];
    86       step[3] = input[0] - input[3];
    87       temp1 = (step[0] + step[1]) * cospi_16_64;
    88       temp2 = (step[0] - step[1]) * cospi_16_64;
    89       out[0] = fdct_round_shift(temp1);
    90       out[2] = fdct_round_shift(temp2);
    91       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
    92       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
    93       out[1] = fdct_round_shift(temp1);
    94       out[3] = fdct_round_shift(temp2);
    95       // Do next column (which is a transposed row in second/horizontal pass)
    96       in++;
    97       out += 4;
    98     }
    99     // Setup in/out for next pass.
   100     in = intermediate;
   101     out = output;
   102   }
   104   {
   105     int i, j;
   106     for (i = 0; i < 4; ++i) {
   107       for (j = 0; j < 4; ++j)
   108         output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
   109     }
   110   }
   111 }
   113 static void fadst4(const int16_t *input, int16_t *output) {
   114   int x0, x1, x2, x3;
   115   int s0, s1, s2, s3, s4, s5, s6, s7;
   117   x0 = input[0];
   118   x1 = input[1];
   119   x2 = input[2];
   120   x3 = input[3];
   122   if (!(x0 | x1 | x2 | x3)) {
   123     output[0] = output[1] = output[2] = output[3] = 0;
   124     return;
   125   }
   127   s0 = sinpi_1_9 * x0;
   128   s1 = sinpi_4_9 * x0;
   129   s2 = sinpi_2_9 * x1;
   130   s3 = sinpi_1_9 * x1;
   131   s4 = sinpi_3_9 * x2;
   132   s5 = sinpi_4_9 * x3;
   133   s6 = sinpi_2_9 * x3;
   134   s7 = x0 + x1 - x3;
   136   x0 = s0 + s2 + s5;
   137   x1 = sinpi_3_9 * s7;
   138   x2 = s1 - s3 + s6;
   139   x3 = s4;
   141   s0 = x0 + x3;
   142   s1 = x1;
   143   s2 = x2 - x3;
   144   s3 = x2 - x0 + x3;
   146   // 1-D transform scaling factor is sqrt(2).
   147   output[0] = fdct_round_shift(s0);
   148   output[1] = fdct_round_shift(s1);
   149   output[2] = fdct_round_shift(s2);
   150   output[3] = fdct_round_shift(s3);
   151 }
   153 static const transform_2d FHT_4[] = {
   154   { fdct4,  fdct4  },  // DCT_DCT  = 0
   155   { fadst4, fdct4  },  // ADST_DCT = 1
   156   { fdct4,  fadst4 },  // DCT_ADST = 2
   157   { fadst4, fadst4 }   // ADST_ADST = 3
   158 };
   160 void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
   161                         int stride, int tx_type) {
   162   int16_t out[4 * 4];
   163   int16_t *outptr = &out[0];
   164   int i, j;
   165   int16_t temp_in[4], temp_out[4];
   166   const transform_2d ht = FHT_4[tx_type];
   168   // Columns
   169   for (i = 0; i < 4; ++i) {
   170     for (j = 0; j < 4; ++j)
   171       temp_in[j] = input[j * stride + i] * 16;
   172     if (i == 0 && temp_in[0])
   173       temp_in[0] += 1;
   174     ht.cols(temp_in, temp_out);
   175     for (j = 0; j < 4; ++j)
   176       outptr[j * 4 + i] = temp_out[j];
   177   }
   179   // Rows
   180   for (i = 0; i < 4; ++i) {
   181     for (j = 0; j < 4; ++j)
   182       temp_in[j] = out[j + i * 4];
   183     ht.rows(temp_in, temp_out);
   184     for (j = 0; j < 4; ++j)
   185       output[j + i * 4] = (temp_out[j] + 1) >> 2;
   186   }
   187 }
   189 static void fdct8(const int16_t *input, int16_t *output) {
   190   /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
   191   /*needs32*/ int t0, t1, t2, t3;
   192   /*canbe16*/ int x0, x1, x2, x3;
   194   // stage 1
   195   s0 = input[0] + input[7];
   196   s1 = input[1] + input[6];
   197   s2 = input[2] + input[5];
   198   s3 = input[3] + input[4];
   199   s4 = input[3] - input[4];
   200   s5 = input[2] - input[5];
   201   s6 = input[1] - input[6];
   202   s7 = input[0] - input[7];
   204   // fdct4(step, step);
   205   x0 = s0 + s3;
   206   x1 = s1 + s2;
   207   x2 = s1 - s2;
   208   x3 = s0 - s3;
   209   t0 = (x0 + x1) * cospi_16_64;
   210   t1 = (x0 - x1) * cospi_16_64;
   211   t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
   212   t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
   213   output[0] = fdct_round_shift(t0);
   214   output[2] = fdct_round_shift(t2);
   215   output[4] = fdct_round_shift(t1);
   216   output[6] = fdct_round_shift(t3);
   218   // Stage 2
   219   t0 = (s6 - s5) * cospi_16_64;
   220   t1 = (s6 + s5) * cospi_16_64;
   221   t2 = fdct_round_shift(t0);
   222   t3 = fdct_round_shift(t1);
   224   // Stage 3
   225   x0 = s4 + t2;
   226   x1 = s4 - t2;
   227   x2 = s7 - t3;
   228   x3 = s7 + t3;
   230   // Stage 4
   231   t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
   232   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
   233   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
   234   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
   235   output[1] = fdct_round_shift(t0);
   236   output[3] = fdct_round_shift(t2);
   237   output[5] = fdct_round_shift(t1);
   238   output[7] = fdct_round_shift(t3);
   239 }
   241 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
   242   int i, j;
   243   int16_t intermediate[64];
   245   // Transform columns
   246   {
   247     int16_t *output = intermediate;
   248     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
   249     /*needs32*/ int t0, t1, t2, t3;
   250     /*canbe16*/ int x0, x1, x2, x3;
   252     int i;
   253     for (i = 0; i < 8; i++) {
   254       // stage 1
   255       s0 = (input[0 * stride] + input[7 * stride]) * 4;
   256       s1 = (input[1 * stride] + input[6 * stride]) * 4;
   257       s2 = (input[2 * stride] + input[5 * stride]) * 4;
   258       s3 = (input[3 * stride] + input[4 * stride]) * 4;
   259       s4 = (input[3 * stride] - input[4 * stride]) * 4;
   260       s5 = (input[2 * stride] - input[5 * stride]) * 4;
   261       s6 = (input[1 * stride] - input[6 * stride]) * 4;
   262       s7 = (input[0 * stride] - input[7 * stride]) * 4;
   264       // fdct4(step, step);
   265       x0 = s0 + s3;
   266       x1 = s1 + s2;
   267       x2 = s1 - s2;
   268       x3 = s0 - s3;
   269       t0 = (x0 + x1) * cospi_16_64;
   270       t1 = (x0 - x1) * cospi_16_64;
   271       t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
   272       t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
   273       output[0 * 8] = fdct_round_shift(t0);
   274       output[2 * 8] = fdct_round_shift(t2);
   275       output[4 * 8] = fdct_round_shift(t1);
   276       output[6 * 8] = fdct_round_shift(t3);
   278       // Stage 2
   279       t0 = (s6 - s5) * cospi_16_64;
   280       t1 = (s6 + s5) * cospi_16_64;
   281       t2 = fdct_round_shift(t0);
   282       t3 = fdct_round_shift(t1);
   284       // Stage 3
   285       x0 = s4 + t2;
   286       x1 = s4 - t2;
   287       x2 = s7 - t3;
   288       x3 = s7 + t3;
   290       // Stage 4
   291       t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
   292       t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
   293       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
   294       t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
   295       output[1 * 8] = fdct_round_shift(t0);
   296       output[3 * 8] = fdct_round_shift(t2);
   297       output[5 * 8] = fdct_round_shift(t1);
   298       output[7 * 8] = fdct_round_shift(t3);
   299       input++;
   300       output++;
   301     }
   302   }
   304   // Rows
   305   for (i = 0; i < 8; ++i) {
   306     fdct8(&intermediate[i * 8], &final_output[i * 8]);
   307     for (j = 0; j < 8; ++j)
   308       final_output[j + i * 8] /= 2;
   309   }
   310 }
   312 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
   313   // The 2D transform is done with two passes which are actually pretty
   314   // similar. In the first one, we transform the columns and transpose
   315   // the results. In the second one, we transform the rows. To achieve that,
   316   // as the first pass results are transposed, we tranpose the columns (that
   317   // is the transposed rows) and transpose the results (so that it goes back
   318   // in normal/row positions).
   319   int pass;
   320   // We need an intermediate buffer between passes.
   321   int16_t intermediate[256];
   322   const int16_t *in = input;
   323   int16_t *out = intermediate;
   324   // Do the two transform/transpose passes
   325   for (pass = 0; pass < 2; ++pass) {
   326     /*canbe16*/ int step1[8];
   327     /*canbe16*/ int step2[8];
   328     /*canbe16*/ int step3[8];
   329     /*canbe16*/ int input[8];
   330     /*needs32*/ int temp1, temp2;
   331     int i;
   332     for (i = 0; i < 16; i++) {
   333       if (0 == pass) {
   334         // Calculate input for the first 8 results.
   335         input[0] = (in[0 * stride] + in[15 * stride]) * 4;
   336         input[1] = (in[1 * stride] + in[14 * stride]) * 4;
   337         input[2] = (in[2 * stride] + in[13 * stride]) * 4;
   338         input[3] = (in[3 * stride] + in[12 * stride]) * 4;
   339         input[4] = (in[4 * stride] + in[11 * stride]) * 4;
   340         input[5] = (in[5 * stride] + in[10 * stride]) * 4;
   341         input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
   342         input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
   343         // Calculate input for the next 8 results.
   344         step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
   345         step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
   346         step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
   347         step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
   348         step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
   349         step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
   350         step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
   351         step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
   352       } else {
   353         // Calculate input for the first 8 results.
   354         input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
   355         input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
   356         input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
   357         input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
   358         input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
   359         input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
   360         input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
   361         input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
   362         // Calculate input for the next 8 results.
   363         step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
   364         step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
   365         step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
   366         step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
   367         step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
   368         step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
   369         step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
   370         step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
   371       }
   372       // Work on the first eight values; fdct8(input, even_results);
   373       {
   374         /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
   375         /*needs32*/ int t0, t1, t2, t3;
   376         /*canbe16*/ int x0, x1, x2, x3;
   378         // stage 1
   379         s0 = input[0] + input[7];
   380         s1 = input[1] + input[6];
   381         s2 = input[2] + input[5];
   382         s3 = input[3] + input[4];
   383         s4 = input[3] - input[4];
   384         s5 = input[2] - input[5];
   385         s6 = input[1] - input[6];
   386         s7 = input[0] - input[7];
   388         // fdct4(step, step);
   389         x0 = s0 + s3;
   390         x1 = s1 + s2;
   391         x2 = s1 - s2;
   392         x3 = s0 - s3;
   393         t0 = (x0 + x1) * cospi_16_64;
   394         t1 = (x0 - x1) * cospi_16_64;
   395         t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
   396         t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
   397         out[0] = fdct_round_shift(t0);
   398         out[4] = fdct_round_shift(t2);
   399         out[8] = fdct_round_shift(t1);
   400         out[12] = fdct_round_shift(t3);
   402         // Stage 2
   403         t0 = (s6 - s5) * cospi_16_64;
   404         t1 = (s6 + s5) * cospi_16_64;
   405         t2 = fdct_round_shift(t0);
   406         t3 = fdct_round_shift(t1);
   408         // Stage 3
   409         x0 = s4 + t2;
   410         x1 = s4 - t2;
   411         x2 = s7 - t3;
   412         x3 = s7 + t3;
   414         // Stage 4
   415         t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
   416         t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
   417         t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
   418         t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
   419         out[2] = fdct_round_shift(t0);
   420         out[6] = fdct_round_shift(t2);
   421         out[10] = fdct_round_shift(t1);
   422         out[14] = fdct_round_shift(t3);
   423       }
   424       // Work on the next eight values; step1 -> odd_results
   425       {
   426         // step 2
   427         temp1 = (step1[5] - step1[2]) * cospi_16_64;
   428         temp2 = (step1[4] - step1[3]) * cospi_16_64;
   429         step2[2] = fdct_round_shift(temp1);
   430         step2[3] = fdct_round_shift(temp2);
   431         temp1 = (step1[4] + step1[3]) * cospi_16_64;
   432         temp2 = (step1[5] + step1[2]) * cospi_16_64;
   433         step2[4] = fdct_round_shift(temp1);
   434         step2[5] = fdct_round_shift(temp2);
   435         // step 3
   436         step3[0] = step1[0] + step2[3];
   437         step3[1] = step1[1] + step2[2];
   438         step3[2] = step1[1] - step2[2];
   439         step3[3] = step1[0] - step2[3];
   440         step3[4] = step1[7] - step2[4];
   441         step3[5] = step1[6] - step2[5];
   442         step3[6] = step1[6] + step2[5];
   443         step3[7] = step1[7] + step2[4];
   444         // step 4
   445         temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
   446         temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
   447         step2[1] = fdct_round_shift(temp1);
   448         step2[2] = fdct_round_shift(temp2);
   449         temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
   450         temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
   451         step2[5] = fdct_round_shift(temp1);
   452         step2[6] = fdct_round_shift(temp2);
   453         // step 5
   454         step1[0] = step3[0] + step2[1];
   455         step1[1] = step3[0] - step2[1];
   456         step1[2] = step3[3] - step2[2];
   457         step1[3] = step3[3] + step2[2];
   458         step1[4] = step3[4] + step2[5];
   459         step1[5] = step3[4] - step2[5];
   460         step1[6] = step3[7] - step2[6];
   461         step1[7] = step3[7] + step2[6];
   462         // step 6
   463         temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
   464         temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
   465         out[1] = fdct_round_shift(temp1);
   466         out[9] = fdct_round_shift(temp2);
   467         temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
   468         temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
   469         out[5] = fdct_round_shift(temp1);
   470         out[13] = fdct_round_shift(temp2);
   471         temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
   472         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
   473         out[3] = fdct_round_shift(temp1);
   474         out[11] = fdct_round_shift(temp2);
   475         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
   476         temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
   477         out[7] = fdct_round_shift(temp1);
   478         out[15] = fdct_round_shift(temp2);
   479       }
   480       // Do next column (which is a transposed row in second/horizontal pass)
   481       in++;
   482       out += 16;
   483     }
   484     // Setup in/out for next pass.
   485     in = intermediate;
   486     out = output;
   487   }
   488 }
   490 static void fadst8(const int16_t *input, int16_t *output) {
   491   int s0, s1, s2, s3, s4, s5, s6, s7;
   493   int x0 = input[7];
   494   int x1 = input[0];
   495   int x2 = input[5];
   496   int x3 = input[2];
   497   int x4 = input[3];
   498   int x5 = input[4];
   499   int x6 = input[1];
   500   int x7 = input[6];
   502   // stage 1
   503   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
   504   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
   505   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
   506   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
   507   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
   508   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
   509   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   510   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
   512   x0 = fdct_round_shift(s0 + s4);
   513   x1 = fdct_round_shift(s1 + s5);
   514   x2 = fdct_round_shift(s2 + s6);
   515   x3 = fdct_round_shift(s3 + s7);
   516   x4 = fdct_round_shift(s0 - s4);
   517   x5 = fdct_round_shift(s1 - s5);
   518   x6 = fdct_round_shift(s2 - s6);
   519   x7 = fdct_round_shift(s3 - s7);
   521   // stage 2
   522   s0 = x0;
   523   s1 = x1;
   524   s2 = x2;
   525   s3 = x3;
   526   s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
   527   s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
   528   s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
   529   s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
   531   x0 = s0 + s2;
   532   x1 = s1 + s3;
   533   x2 = s0 - s2;
   534   x3 = s1 - s3;
   535   x4 = fdct_round_shift(s4 + s6);
   536   x5 = fdct_round_shift(s5 + s7);
   537   x6 = fdct_round_shift(s4 - s6);
   538   x7 = fdct_round_shift(s5 - s7);
   540   // stage 3
   541   s2 = cospi_16_64 * (x2 + x3);
   542   s3 = cospi_16_64 * (x2 - x3);
   543   s6 = cospi_16_64 * (x6 + x7);
   544   s7 = cospi_16_64 * (x6 - x7);
   546   x2 = fdct_round_shift(s2);
   547   x3 = fdct_round_shift(s3);
   548   x6 = fdct_round_shift(s6);
   549   x7 = fdct_round_shift(s7);
   551   output[0] =   x0;
   552   output[1] = - x4;
   553   output[2] =   x6;
   554   output[3] = - x2;
   555   output[4] =   x3;
   556   output[5] = - x7;
   557   output[6] =   x5;
   558   output[7] = - x1;
   559 }
   561 static const transform_2d FHT_8[] = {
   562   { fdct8,  fdct8  },  // DCT_DCT  = 0
   563   { fadst8, fdct8  },  // ADST_DCT = 1
   564   { fdct8,  fadst8 },  // DCT_ADST = 2
   565   { fadst8, fadst8 }   // ADST_ADST = 3
   566 };
   568 void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
   569                         int stride, int tx_type) {
   570   int16_t out[64];
   571   int16_t *outptr = &out[0];
   572   int i, j;
   573   int16_t temp_in[8], temp_out[8];
   574   const transform_2d ht = FHT_8[tx_type];
   576   // Columns
   577   for (i = 0; i < 8; ++i) {
   578     for (j = 0; j < 8; ++j)
   579       temp_in[j] = input[j * stride + i] * 4;
   580     ht.cols(temp_in, temp_out);
   581     for (j = 0; j < 8; ++j)
   582       outptr[j * 8 + i] = temp_out[j];
   583   }
   585   // Rows
   586   for (i = 0; i < 8; ++i) {
   587     for (j = 0; j < 8; ++j)
   588       temp_in[j] = out[j + i * 8];
   589     ht.rows(temp_in, temp_out);
   590     for (j = 0; j < 8; ++j)
   591       output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
   592   }
   593 }
   595 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
   596    pixel. */
   597 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
   598   int i;
   599   int a1, b1, c1, d1, e1;
   600   const int16_t *ip = input;
   601   int16_t *op = output;
   603   for (i = 0; i < 4; i++) {
   604     a1 = ip[0 * stride];
   605     b1 = ip[1 * stride];
   606     c1 = ip[2 * stride];
   607     d1 = ip[3 * stride];
   609     a1 += b1;
   610     d1 = d1 - c1;
   611     e1 = (a1 - d1) >> 1;
   612     b1 = e1 - b1;
   613     c1 = e1 - c1;
   614     a1 -= c1;
   615     d1 += b1;
   616     op[0] = a1;
   617     op[4] = c1;
   618     op[8] = d1;
   619     op[12] = b1;
   621     ip++;
   622     op++;
   623   }
   624   ip = output;
   625   op = output;
   627   for (i = 0; i < 4; i++) {
   628     a1 = ip[0];
   629     b1 = ip[1];
   630     c1 = ip[2];
   631     d1 = ip[3];
   633     a1 += b1;
   634     d1 -= c1;
   635     e1 = (a1 - d1) >> 1;
   636     b1 = e1 - b1;
   637     c1 = e1 - c1;
   638     a1 -= c1;
   639     d1 += b1;
   640     op[0] = a1 * UNIT_QUANT_FACTOR;
   641     op[1] = c1 * UNIT_QUANT_FACTOR;
   642     op[2] = d1 * UNIT_QUANT_FACTOR;
   643     op[3] = b1 * UNIT_QUANT_FACTOR;
   645     ip += 4;
   646     op += 4;
   647   }
   648 }
   650 // Rewrote to use same algorithm as others.
   651 static void fdct16(const int16_t in[16], int16_t out[16]) {
   652   /*canbe16*/ int step1[8];
   653   /*canbe16*/ int step2[8];
   654   /*canbe16*/ int step3[8];
   655   /*canbe16*/ int input[8];
   656   /*needs32*/ int temp1, temp2;
   658   // step 1
   659   input[0] = in[0] + in[15];
   660   input[1] = in[1] + in[14];
   661   input[2] = in[2] + in[13];
   662   input[3] = in[3] + in[12];
   663   input[4] = in[4] + in[11];
   664   input[5] = in[5] + in[10];
   665   input[6] = in[6] + in[ 9];
   666   input[7] = in[7] + in[ 8];
   668   step1[0] = in[7] - in[ 8];
   669   step1[1] = in[6] - in[ 9];
   670   step1[2] = in[5] - in[10];
   671   step1[3] = in[4] - in[11];
   672   step1[4] = in[3] - in[12];
   673   step1[5] = in[2] - in[13];
   674   step1[6] = in[1] - in[14];
   675   step1[7] = in[0] - in[15];
   677   // fdct8(step, step);
   678   {
   679     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
   680     /*needs32*/ int t0, t1, t2, t3;
   681     /*canbe16*/ int x0, x1, x2, x3;
   683     // stage 1
   684     s0 = input[0] + input[7];
   685     s1 = input[1] + input[6];
   686     s2 = input[2] + input[5];
   687     s3 = input[3] + input[4];
   688     s4 = input[3] - input[4];
   689     s5 = input[2] - input[5];
   690     s6 = input[1] - input[6];
   691     s7 = input[0] - input[7];
   693     // fdct4(step, step);
   694     x0 = s0 + s3;
   695     x1 = s1 + s2;
   696     x2 = s1 - s2;
   697     x3 = s0 - s3;
   698     t0 = (x0 + x1) * cospi_16_64;
   699     t1 = (x0 - x1) * cospi_16_64;
   700     t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
   701     t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
   702     out[0] = fdct_round_shift(t0);
   703     out[4] = fdct_round_shift(t2);
   704     out[8] = fdct_round_shift(t1);
   705     out[12] = fdct_round_shift(t3);
   707     // Stage 2
   708     t0 = (s6 - s5) * cospi_16_64;
   709     t1 = (s6 + s5) * cospi_16_64;
   710     t2 = fdct_round_shift(t0);
   711     t3 = fdct_round_shift(t1);
   713     // Stage 3
   714     x0 = s4 + t2;
   715     x1 = s4 - t2;
   716     x2 = s7 - t3;
   717     x3 = s7 + t3;
   719     // Stage 4
   720     t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
   721     t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
   722     t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
   723     t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
   724     out[2] = fdct_round_shift(t0);
   725     out[6] = fdct_round_shift(t2);
   726     out[10] = fdct_round_shift(t1);
   727     out[14] = fdct_round_shift(t3);
   728   }
   730   // step 2
   731   temp1 = (step1[5] - step1[2]) * cospi_16_64;
   732   temp2 = (step1[4] - step1[3]) * cospi_16_64;
   733   step2[2] = fdct_round_shift(temp1);
   734   step2[3] = fdct_round_shift(temp2);
   735   temp1 = (step1[4] + step1[3]) * cospi_16_64;
   736   temp2 = (step1[5] + step1[2]) * cospi_16_64;
   737   step2[4] = fdct_round_shift(temp1);
   738   step2[5] = fdct_round_shift(temp2);
   740   // step 3
   741   step3[0] = step1[0] + step2[3];
   742   step3[1] = step1[1] + step2[2];
   743   step3[2] = step1[1] - step2[2];
   744   step3[3] = step1[0] - step2[3];
   745   step3[4] = step1[7] - step2[4];
   746   step3[5] = step1[6] - step2[5];
   747   step3[6] = step1[6] + step2[5];
   748   step3[7] = step1[7] + step2[4];
   750   // step 4
   751   temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
   752   temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
   753   step2[1] = fdct_round_shift(temp1);
   754   step2[2] = fdct_round_shift(temp2);
   755   temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
   756   temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
   757   step2[5] = fdct_round_shift(temp1);
   758   step2[6] = fdct_round_shift(temp2);
   760   // step 5
   761   step1[0] = step3[0] + step2[1];
   762   step1[1] = step3[0] - step2[1];
   763   step1[2] = step3[3] - step2[2];
   764   step1[3] = step3[3] + step2[2];
   765   step1[4] = step3[4] + step2[5];
   766   step1[5] = step3[4] - step2[5];
   767   step1[6] = step3[7] - step2[6];
   768   step1[7] = step3[7] + step2[6];
   770   // step 6
   771   temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
   772   temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
   773   out[1] = fdct_round_shift(temp1);
   774   out[9] = fdct_round_shift(temp2);
   776   temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
   777   temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
   778   out[5] = fdct_round_shift(temp1);
   779   out[13] = fdct_round_shift(temp2);
   781   temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
   782   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
   783   out[3] = fdct_round_shift(temp1);
   784   out[11] = fdct_round_shift(temp2);
   786   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
   787   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
   788   out[7] = fdct_round_shift(temp1);
   789   out[15] = fdct_round_shift(temp2);
   790 }
   792 static void fadst16(const int16_t *input, int16_t *output) {
   793   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
   795   int x0 = input[15];
   796   int x1 = input[0];
   797   int x2 = input[13];
   798   int x3 = input[2];
   799   int x4 = input[11];
   800   int x5 = input[4];
   801   int x6 = input[9];
   802   int x7 = input[6];
   803   int x8 = input[7];
   804   int x9 = input[8];
   805   int x10 = input[5];
   806   int x11 = input[10];
   807   int x12 = input[3];
   808   int x13 = input[12];
   809   int x14 = input[1];
   810   int x15 = input[14];
   812   // stage 1
   813   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
   814   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
   815   s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
   816   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
   817   s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
   818   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
   819   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
   820   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
   821   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
   822   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
   823   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
   824   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
   825   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
   826   s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
   827   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   828   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
   830   x0 = fdct_round_shift(s0 + s8);
   831   x1 = fdct_round_shift(s1 + s9);
   832   x2 = fdct_round_shift(s2 + s10);
   833   x3 = fdct_round_shift(s3 + s11);
   834   x4 = fdct_round_shift(s4 + s12);
   835   x5 = fdct_round_shift(s5 + s13);
   836   x6 = fdct_round_shift(s6 + s14);
   837   x7 = fdct_round_shift(s7 + s15);
   838   x8  = fdct_round_shift(s0 - s8);
   839   x9  = fdct_round_shift(s1 - s9);
   840   x10 = fdct_round_shift(s2 - s10);
   841   x11 = fdct_round_shift(s3 - s11);
   842   x12 = fdct_round_shift(s4 - s12);
   843   x13 = fdct_round_shift(s5 - s13);
   844   x14 = fdct_round_shift(s6 - s14);
   845   x15 = fdct_round_shift(s7 - s15);
   847   // stage 2
   848   s0 = x0;
   849   s1 = x1;
   850   s2 = x2;
   851   s3 = x3;
   852   s4 = x4;
   853   s5 = x5;
   854   s6 = x6;
   855   s7 = x7;
   856   s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
   857   s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
   858   s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
   859   s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
   860   s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
   861   s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
   862   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
   863   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
   865   x0 = s0 + s4;
   866   x1 = s1 + s5;
   867   x2 = s2 + s6;
   868   x3 = s3 + s7;
   869   x4 = s0 - s4;
   870   x5 = s1 - s5;
   871   x6 = s2 - s6;
   872   x7 = s3 - s7;
   873   x8 = fdct_round_shift(s8 + s12);
   874   x9 = fdct_round_shift(s9 + s13);
   875   x10 = fdct_round_shift(s10 + s14);
   876   x11 = fdct_round_shift(s11 + s15);
   877   x12 = fdct_round_shift(s8 - s12);
   878   x13 = fdct_round_shift(s9 - s13);
   879   x14 = fdct_round_shift(s10 - s14);
   880   x15 = fdct_round_shift(s11 - s15);
   882   // stage 3
   883   s0 = x0;
   884   s1 = x1;
   885   s2 = x2;
   886   s3 = x3;
   887   s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
   888   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
   889   s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
   890   s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
   891   s8 = x8;
   892   s9 = x9;
   893   s10 = x10;
   894   s11 = x11;
   895   s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
   896   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
   897   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
   898   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
   900   x0 = s0 + s2;
   901   x1 = s1 + s3;
   902   x2 = s0 - s2;
   903   x3 = s1 - s3;
   904   x4 = fdct_round_shift(s4 + s6);
   905   x5 = fdct_round_shift(s5 + s7);
   906   x6 = fdct_round_shift(s4 - s6);
   907   x7 = fdct_round_shift(s5 - s7);
   908   x8 = s8 + s10;
   909   x9 = s9 + s11;
   910   x10 = s8 - s10;
   911   x11 = s9 - s11;
   912   x12 = fdct_round_shift(s12 + s14);
   913   x13 = fdct_round_shift(s13 + s15);
   914   x14 = fdct_round_shift(s12 - s14);
   915   x15 = fdct_round_shift(s13 - s15);
   917   // stage 4
   918   s2 = (- cospi_16_64) * (x2 + x3);
   919   s3 = cospi_16_64 * (x2 - x3);
   920   s6 = cospi_16_64 * (x6 + x7);
   921   s7 = cospi_16_64 * (- x6 + x7);
   922   s10 = cospi_16_64 * (x10 + x11);
   923   s11 = cospi_16_64 * (- x10 + x11);
   924   s14 = (- cospi_16_64) * (x14 + x15);
   925   s15 = cospi_16_64 * (x14 - x15);
   927   x2 = fdct_round_shift(s2);
   928   x3 = fdct_round_shift(s3);
   929   x6 = fdct_round_shift(s6);
   930   x7 = fdct_round_shift(s7);
   931   x10 = fdct_round_shift(s10);
   932   x11 = fdct_round_shift(s11);
   933   x14 = fdct_round_shift(s14);
   934   x15 = fdct_round_shift(s15);
   936   output[0] = x0;
   937   output[1] = - x8;
   938   output[2] = x12;
   939   output[3] = - x4;
   940   output[4] = x6;
   941   output[5] = x14;
   942   output[6] = x10;
   943   output[7] = x2;
   944   output[8] = x3;
   945   output[9] =  x11;
   946   output[10] = x15;
   947   output[11] = x7;
   948   output[12] = x5;
   949   output[13] = - x13;
   950   output[14] = x9;
   951   output[15] = - x1;
   952 }
   954 static const transform_2d FHT_16[] = {
   955   { fdct16,  fdct16  },  // DCT_DCT  = 0
   956   { fadst16, fdct16  },  // ADST_DCT = 1
   957   { fdct16,  fadst16 },  // DCT_ADST = 2
   958   { fadst16, fadst16 }   // ADST_ADST = 3
   959 };
   961 void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
   962                           int stride, int tx_type) {
   963   int16_t out[256];
   964   int16_t *outptr = &out[0];
   965   int i, j;
   966   int16_t temp_in[16], temp_out[16];
   967   const transform_2d ht = FHT_16[tx_type];
   969   // Columns
   970   for (i = 0; i < 16; ++i) {
   971     for (j = 0; j < 16; ++j)
   972       temp_in[j] = input[j * stride + i] * 4;
   973     ht.cols(temp_in, temp_out);
   974     for (j = 0; j < 16; ++j)
   975       outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   976 //      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   977   }
   979   // Rows
   980   for (i = 0; i < 16; ++i) {
   981     for (j = 0; j < 16; ++j)
   982       temp_in[j] = out[j + i * 16];
   983     ht.rows(temp_in, temp_out);
   984     for (j = 0; j < 16; ++j)
   985       output[j + i * 16] = temp_out[j];
   986   }
   987 }
   989 static INLINE int dct_32_round(int input) {
   990   int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
   991   assert(-131072 <= rv && rv <= 131071);
   992   return rv;
   993 }
   995 static INLINE int half_round_shift(int input) {
   996   int rv = (input + 1 + (input < 0)) >> 2;
   997   return rv;
   998 }
  1000 static void dct32_1d(const int *input, int *output, int round) {
         // 32-point 1-D DCT butterfly network used by the 32x32 forward
         // transforms that follow in this file.
         //   input  - 32 values to transform; only read in Stage 1.
         //   output - 32 results; also used as working storage between stages.
         //   round  - when non-zero, every Stage 2 value is passed through
         //            half_round_shift() before Stage 3.
         // step[] and output[] alternate as source and destination from one
         // stage to the next, so the stage order must not be changed.
  1001   int step[32];
  1002   // Stage 1: fold the input about its midpoint.
         // step[0..15]  = input[k] + input[31 - k]
         // step[16..31] = input[31 - k] - input[k]
  1003   step[0] = input[0] + input[(32 - 1)];
  1004   step[1] = input[1] + input[(32 - 2)];
  1005   step[2] = input[2] + input[(32 - 3)];
  1006   step[3] = input[3] + input[(32 - 4)];
  1007   step[4] = input[4] + input[(32 - 5)];
  1008   step[5] = input[5] + input[(32 - 6)];
  1009   step[6] = input[6] + input[(32 - 7)];
  1010   step[7] = input[7] + input[(32 - 8)];
  1011   step[8] = input[8] + input[(32 - 9)];
  1012   step[9] = input[9] + input[(32 - 10)];
  1013   step[10] = input[10] + input[(32 - 11)];
  1014   step[11] = input[11] + input[(32 - 12)];
  1015   step[12] = input[12] + input[(32 - 13)];
  1016   step[13] = input[13] + input[(32 - 14)];
  1017   step[14] = input[14] + input[(32 - 15)];
  1018   step[15] = input[15] + input[(32 - 16)];
  1019   step[16] = -input[16] + input[(32 - 17)];
  1020   step[17] = -input[17] + input[(32 - 18)];
  1021   step[18] = -input[18] + input[(32 - 19)];
  1022   step[19] = -input[19] + input[(32 - 20)];
  1023   step[20] = -input[20] + input[(32 - 21)];
  1024   step[21] = -input[21] + input[(32 - 22)];
  1025   step[22] = -input[22] + input[(32 - 23)];
  1026   step[23] = -input[23] + input[(32 - 24)];
  1027   step[24] = -input[24] + input[(32 - 25)];
  1028   step[25] = -input[25] + input[(32 - 26)];
  1029   step[26] = -input[26] + input[(32 - 27)];
  1030   step[27] = -input[27] + input[(32 - 28)];
  1031   step[28] = -input[28] + input[(32 - 29)];
  1032   step[29] = -input[29] + input[(32 - 30)];
  1033   step[30] = -input[30] + input[(32 - 31)];
  1034   step[31] = -input[31] + input[(32 - 32)];
  1036   // Stage 2: fold step[0..15] about its midpoint into output[0..15].
         // output[16..19] and output[28..31] are copied through unchanged;
         // output[20..27] are sum/difference pairs scaled by cospi_16_64.
  1037   output[0] = step[0] + step[16 - 1];
  1038   output[1] = step[1] + step[16 - 2];
  1039   output[2] = step[2] + step[16 - 3];
  1040   output[3] = step[3] + step[16 - 4];
  1041   output[4] = step[4] + step[16 - 5];
  1042   output[5] = step[5] + step[16 - 6];
  1043   output[6] = step[6] + step[16 - 7];
  1044   output[7] = step[7] + step[16 - 8];
  1045   output[8] = -step[8] + step[16 - 9];
  1046   output[9] = -step[9] + step[16 - 10];
  1047   output[10] = -step[10] + step[16 - 11];
  1048   output[11] = -step[11] + step[16 - 12];
  1049   output[12] = -step[12] + step[16 - 13];
  1050   output[13] = -step[13] + step[16 - 14];
  1051   output[14] = -step[14] + step[16 - 15];
  1052   output[15] = -step[15] + step[16 - 16];
  1054   output[16] = step[16];
  1055   output[17] = step[17];
  1056   output[18] = step[18];
  1057   output[19] = step[19];
  1059   output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
  1060   output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
  1061   output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
  1062   output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
  1064   output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
  1065   output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
  1066   output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
  1067   output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
  1069   output[28] = step[28];
  1070   output[29] = step[29];
  1071   output[30] = step[30];
  1072   output[31] = step[31];
  1074   // Damp the magnitude by 4 (half_round_shift shifts right by 2 with a
  1075   // rounding bias) so that the intermediate values stay within 16 bits.
  1076   if (round) {
  1077     output[0] = half_round_shift(output[0]);
  1078     output[1] = half_round_shift(output[1]);
  1079     output[2] = half_round_shift(output[2]);
  1080     output[3] = half_round_shift(output[3]);
  1081     output[4] = half_round_shift(output[4]);
  1082     output[5] = half_round_shift(output[5]);
  1083     output[6] = half_round_shift(output[6]);
  1084     output[7] = half_round_shift(output[7]);
  1085     output[8] = half_round_shift(output[8]);
  1086     output[9] = half_round_shift(output[9]);
  1087     output[10] = half_round_shift(output[10]);
  1088     output[11] = half_round_shift(output[11]);
  1089     output[12] = half_round_shift(output[12]);
  1090     output[13] = half_round_shift(output[13]);
  1091     output[14] = half_round_shift(output[14]);
  1092     output[15] = half_round_shift(output[15]);
  1094     output[16] = half_round_shift(output[16]);
  1095     output[17] = half_round_shift(output[17]);
  1096     output[18] = half_round_shift(output[18]);
  1097     output[19] = half_round_shift(output[19]);
  1098     output[20] = half_round_shift(output[20]);
  1099     output[21] = half_round_shift(output[21]);
  1100     output[22] = half_round_shift(output[22]);
  1101     output[23] = half_round_shift(output[23]);
  1102     output[24] = half_round_shift(output[24]);
  1103     output[25] = half_round_shift(output[25]);
  1104     output[26] = half_round_shift(output[26]);
  1105     output[27] = half_round_shift(output[27]);
  1106     output[28] = half_round_shift(output[28]);
  1107     output[29] = half_round_shift(output[29]);
  1108     output[30] = half_round_shift(output[30]);
  1109     output[31] = half_round_shift(output[31]);
         // NOTE(review): the closing brace of the `if (round)` block (listing
         // lines 1110-1111) is not present in this listing.
  1112   // Stage 3: fold output[0..7] about its midpoint into step[0..7].
         // step[8], [9], [14], [15] are copied through; step[10..13] are
         // sum/difference pairs scaled by cospi_16_64; step[16..23] and
         // step[24..31] are each folded about their own midpoint.
  1113   step[0] = output[0] + output[(8 - 1)];
  1114   step[1] = output[1] + output[(8 - 2)];
  1115   step[2] = output[2] + output[(8 - 3)];
  1116   step[3] = output[3] + output[(8 - 4)];
  1117   step[4] = -output[4] + output[(8 - 5)];
  1118   step[5] = -output[5] + output[(8 - 6)];
  1119   step[6] = -output[6] + output[(8 - 7)];
  1120   step[7] = -output[7] + output[(8 - 8)];
  1121   step[8] = output[8];
  1122   step[9] = output[9];
  1123   step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
  1124   step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
  1125   step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
  1126   step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
  1127   step[14] = output[14];
  1128   step[15] = output[15];
  1130   step[16] = output[16] + output[23];
  1131   step[17] = output[17] + output[22];
  1132   step[18] = output[18] + output[21];
  1133   step[19] = output[19] + output[20];
  1134   step[20] = -output[20] + output[19];
  1135   step[21] = -output[21] + output[18];
  1136   step[22] = -output[22] + output[17];
  1137   step[23] = -output[23] + output[16];
  1138   step[24] = -output[24] + output[31];
  1139   step[25] = -output[25] + output[30];
  1140   step[26] = -output[26] + output[29];
  1141   step[27] = -output[27] + output[28];
  1142   step[28] = output[28] + output[27];
  1143   step[29] = output[29] + output[26];
  1144   step[30] = output[30] + output[25];
  1145   step[31] = output[31] + output[24];
  1147   // Stage 4: fold step[0..3], step[8..11] and step[12..15]; scale the
         // step[5]/step[6] pair by cospi_16_64; combine step[18..21] with
         // step[26..29] using the cospi_8_64 / cospi_24_64 weights.
  1148   output[0] = step[0] + step[3];
  1149   output[1] = step[1] + step[2];
  1150   output[2] = -step[2] + step[1];
  1151   output[3] = -step[3] + step[0];
  1152   output[4] = step[4];
  1153   output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
  1154   output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
  1155   output[7] = step[7];
  1156   output[8] = step[8] + step[11];
  1157   output[9] = step[9] + step[10];
  1158   output[10] = -step[10] + step[9];
  1159   output[11] = -step[11] + step[8];
  1160   output[12] = -step[12] + step[15];
  1161   output[13] = -step[13] + step[14];
  1162   output[14] = step[14] + step[13];
  1163   output[15] = step[15] + step[12];
  1165   output[16] = step[16];
  1166   output[17] = step[17];
  1167   output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
  1168   output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
  1169   output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
  1170   output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
  1171   output[22] = step[22];
  1172   output[23] = step[23];
  1173   output[24] = step[24];
  1174   output[25] = step[25];
  1175   output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
  1176   output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
  1177   output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
  1178   output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
  1179   output[30] = step[30];
  1180   output[31] = step[31];
  1182   // Stage 5: step[0..3] receive their last weighted combinations here
         // (later stages only copy them); the remaining groups continue with
         // sum/difference folds and cospi_8_64 / cospi_24_64 combinations.
  1183   step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
  1184   step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
  1185   step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
  1186   step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
  1187   step[4] = output[4] + output[5];
  1188   step[5] = -output[5] + output[4];
  1189   step[6] = -output[6] + output[7];
  1190   step[7] = output[7] + output[6];
  1191   step[8] = output[8];
  1192   step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
  1193   step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
  1194   step[11] = output[11];
  1195   step[12] = output[12];
  1196   step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
  1197   step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
  1198   step[15] = output[15];
  1200   step[16] = output[16] + output[19];
  1201   step[17] = output[17] + output[18];
  1202   step[18] = -output[18] + output[17];
  1203   step[19] = -output[19] + output[16];
  1204   step[20] = -output[20] + output[23];
  1205   step[21] = -output[21] + output[22];
  1206   step[22] = output[22] + output[21];
  1207   step[23] = output[23] + output[20];
  1208   step[24] = output[24] + output[27];
  1209   step[25] = output[25] + output[26];
  1210   step[26] = -output[26] + output[25];
  1211   step[27] = -output[27] + output[24];
  1212   step[28] = -output[28] + output[31];
  1213   step[29] = -output[29] + output[30];
  1214   step[30] = output[30] + output[29];
  1215   step[31] = output[31] + output[28];
  1217   // Stage 6: output[4..7] receive their last weighted combinations
         // (cospi_4/28 and cospi_12/20); output[8..15] are pairwise
         // sums/differences; selected entries of output[16..31] are combined
         // with the same four weights.
  1218   output[0] = step[0];
  1219   output[1] = step[1];
  1220   output[2] = step[2];
  1221   output[3] = step[3];
  1222   output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
  1223   output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
  1224   output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
  1225   output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
  1226   output[8] = step[8] + step[9];
  1227   output[9] = -step[9] + step[8];
  1228   output[10] = -step[10] + step[11];
  1229   output[11] = step[11] + step[10];
  1230   output[12] = step[12] + step[13];
  1231   output[13] = -step[13] + step[12];
  1232   output[14] = -step[14] + step[15];
  1233   output[15] = step[15] + step[14];
  1235   output[16] = step[16];
  1236   output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
  1237   output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
  1238   output[19] = step[19];
  1239   output[20] = step[20];
  1240   output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
  1241   output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
  1242   output[23] = step[23];
  1243   output[24] = step[24];
  1244   output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
  1245   output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
  1246   output[27] = step[27];
  1247   output[28] = step[28];
  1248   output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
  1249   output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
  1250   output[31] = step[31];
  1252   // Stage 7: step[0..7] are copied through; step[8..15] receive their
         // last weighted combinations; step[16..31] are pairwise
         // sums/differences of neighbouring entries.
  1253   step[0] = output[0];
  1254   step[1] = output[1];
  1255   step[2] = output[2];
  1256   step[3] = output[3];
  1257   step[4] = output[4];
  1258   step[5] = output[5];
  1259   step[6] = output[6];
  1260   step[7] = output[7];
  1261   step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
  1262   step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
  1263   step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
  1264   step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
  1265   step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
  1266   step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
  1267   step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
  1268   step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
  1270   step[16] = output[16] + output[17];
  1271   step[17] = -output[17] + output[16];
  1272   step[18] = -output[18] + output[19];
  1273   step[19] = output[19] + output[18];
  1274   step[20] = output[20] + output[21];
  1275   step[21] = -output[21] + output[20];
  1276   step[22] = -output[22] + output[23];
  1277   step[23] = output[23] + output[22];
  1278   step[24] = output[24] + output[25];
  1279   step[25] = -output[25] + output[24];
  1280   step[26] = -output[26] + output[27];
  1281   step[27] = output[27] + output[26];
  1282   step[28] = output[28] + output[29];
  1283   step[29] = -output[29] + output[28];
  1284   step[30] = -output[30] + output[31];
  1285   step[31] = output[31] + output[30];
  1287   // Final stage --- output indices are bit-reversed.
         // step[0..15] are stored to the even output indices as they are;
         // step[16..31] are combined with the odd-numbered cospi weights and
         // stored to the odd output indices.
  1288   output[0]  = step[0];
  1289   output[16] = step[1];
  1290   output[8]  = step[2];
  1291   output[24] = step[3];
  1292   output[4]  = step[4];
  1293   output[20] = step[5];
  1294   output[12] = step[6];
  1295   output[28] = step[7];
  1296   output[2]  = step[8];
  1297   output[18] = step[9];
  1298   output[10] = step[10];
  1299   output[26] = step[11];
  1300   output[6]  = step[12];
  1301   output[22] = step[13];
  1302   output[14] = step[14];
  1303   output[30] = step[15];
  1305   output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
  1306   output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
  1307   output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
  1308   output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
  1309   output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
  1310   output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
  1311   output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
  1312   output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
  1313   output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
  1314   output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
  1315   output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
  1316   output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
  1317   output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
  1318   output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
  1319   output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
  1320   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
         // NOTE(review): the function's closing brace (listing lines 1321-1322)
         // is not present in this listing.
  1323 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
  1324   int i, j;
  1325   int output[32 * 32];
  1327   // Columns
  1328   for (i = 0; i < 32; ++i) {
  1329     int temp_in[32], temp_out[32];
  1330     for (j = 0; j < 32; ++j)
  1331       temp_in[j] = input[j * stride + i] * 4;
  1332     dct32_1d(temp_in, temp_out, 0);
  1333     for (j = 0; j < 32; ++j)
  1334       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  1337   // Rows
  1338   for (i = 0; i < 32; ++i) {
  1339     int temp_in[32], temp_out[32];
  1340     for (j = 0; j < 32; ++j)
  1341       temp_in[j] = output[j + i * 32];
  1342     dct32_1d(temp_in, temp_out, 0);
  1343     for (j = 0; j < 32; ++j)
  1344       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
  1348 // Note that although we use dct_32_round in dct32_1d computation flow,
  1349 // this 2d fdct32x32 for rate-distortion optimization loop is operating
  1350 // within 16 bits precision.
  1351 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
  1352   int i, j;
  1353   int output[32 * 32];
  1355   // Columns
  1356   for (i = 0; i < 32; ++i) {
  1357     int temp_in[32], temp_out[32];
  1358     for (j = 0; j < 32; ++j)
  1359       temp_in[j] = input[j * stride + i] * 4;
  1360     dct32_1d(temp_in, temp_out, 0);
  1361     for (j = 0; j < 32; ++j)
  1362       // TODO(cd): see quality impact of only doing
  1363       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
  1364       //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
  1365       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  1368   // Rows
  1369   for (i = 0; i < 32; ++i) {
  1370     int temp_in[32], temp_out[32];
  1371     for (j = 0; j < 32; ++j)
  1372       temp_in[j] = output[j + i * 32];
  1373     dct32_1d(temp_in, temp_out, 1);
  1374     for (j = 0; j < 32; ++j)
  1375       out[j + i * 32] = temp_out[j];
  1379 void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
  1380                 int stride) {
  1381   if (tx_type == DCT_DCT)
  1382     vp9_fdct4x4(input, output, stride);
  1383   else
  1384     vp9_short_fht4x4(input, output, stride, tx_type);
  1387 void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
  1388                 int stride) {
  1389   if (tx_type == DCT_DCT)
  1390     vp9_fdct8x8(input, output, stride);
  1391   else
  1392     vp9_short_fht8x8(input, output, stride, tx_type);
  1395 void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
  1396                   int stride) {
  1397   if (tx_type == DCT_DCT)
  1398     vp9_fdct16x16(input, output, stride);
  1399   else
  1400     vp9_short_fht16x16(input, output, stride, tx_type);

mercurial