media/libvpx/vp9/common/x86/vp9_asm_stubs.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned from the upstream tor-browser origin at tag tor-browser-31.3.0esr-4.5-1-build1
(revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f) for hacking purposes.

     1 /*
     2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS.  All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include <assert.h>
    13 #include "./vpx_config.h"
    14 #include "./vp9_rtcd.h"
    15 #include "vpx_ports/mem.h"
///////////////////////////////////////////////////////////////////////////
// 1/8-pel bilinear filter coefficient table for the MMX SIMD code.      //
// Entry i holds the two bilinear taps (128 - 8*i) and (8*i), each       //
// replicated four times so a full register lane can be loaded at once;  //
// the two taps of every entry sum to 128 (7-bit fixed-point weights).   //
// NOTE(review): not referenced elsewhere in this file -- presumably     //
// consumed by MMX assembly; confirm against the .asm sources before     //
// removing.                                                             //
///////////////////////////////////////////////////////////////////////////
DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
  { 128, 128, 128, 128,  0,  0,  0,  0 },
  { 120, 120, 120, 120,  8,  8,  8,  8 },
  { 112, 112, 112, 112, 16, 16, 16, 16 },
  { 104, 104, 104, 104, 24, 24, 24, 24 },
  {  96, 96, 96, 96, 32, 32, 32, 32 },
  {  88, 88, 88, 88, 40, 40, 40, 40 },
  {  80, 80, 80, 80, 48, 48, 48, 48 },
  {  72, 72, 72, 72, 56, 56, 56, 56 },
  {  64, 64, 64, 64, 64, 64, 64, 64 },
  {  56, 56, 56, 56, 72, 72, 72, 72 },
  {  48, 48, 48, 48, 80, 80, 80, 80 },
  {  40, 40, 40, 40, 88, 88, 88, 88 },
  {  32, 32, 32, 32, 96, 96, 96, 96 },
  {  24, 24, 24, 24, 104, 104, 104, 104 },
  {  16, 16, 16, 16, 112, 112, 112, 112 },
  {   8,  8,  8,  8, 120, 120, 120, 120 }
};
/* Common signature for the assembly 8-tap 1-D filter kernels declared
 * below. Each kernel filters a fixed-width column strip (the width is
 * encoded in the kernel's name) of output_height rows from src_ptr into
 * output_ptr, reading the 8 taps from filter. Pitches are in bytes. */
typedef void filter8_1dfunction (
  const unsigned char *src_ptr,
  const unsigned int src_pitch,
  unsigned char *output_ptr,
  unsigned int out_pitch,
  unsigned int output_height,
  const short *filter
);
#if HAVE_SSSE3
/* SSSE3 assembly 8-tap kernels: _h8 = horizontal pass, _v8 = vertical
 * pass, for strip widths 16 / 8 / 4.
 * NOTE(review): the _avg variants presumably average the filtered result
 * with the existing destination pixels (they back the *_avg_* convolve
 * wrappers below) -- confirm against the .asm sources. */
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
    62 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
    63                                uint8_t *dst, ptrdiff_t dst_stride,
    64                                const int16_t *filter_x, int x_step_q4,
    65                                const int16_t *filter_y, int y_step_q4,
    66                                int w, int h) {
    67   /* Ensure the filter can be compressed to int16_t. */
    68   if (x_step_q4 == 16 && filter_x[3] != 128) {
    69     while (w >= 16) {
    70       vp9_filter_block1d16_h8_ssse3(src, src_stride,
    71                                     dst, dst_stride,
    72                                     h, filter_x);
    73       src += 16;
    74       dst += 16;
    75       w -= 16;
    76     }
    77     while (w >= 8) {
    78       vp9_filter_block1d8_h8_ssse3(src, src_stride,
    79                                    dst, dst_stride,
    80                                    h, filter_x);
    81       src += 8;
    82       dst += 8;
    83       w -= 8;
    84     }
    85     while (w >= 4) {
    86       vp9_filter_block1d4_h8_ssse3(src, src_stride,
    87                                    dst, dst_stride,
    88                                    h, filter_x);
    89       src += 4;
    90       dst += 4;
    91       w -= 4;
    92     }
    93   }
    94   if (w) {
    95     vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
    96                           filter_x, x_step_q4, filter_y, y_step_q4,
    97                           w, h);
    98   }
    99 }
   101 void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
   102                               uint8_t *dst, ptrdiff_t dst_stride,
   103                               const int16_t *filter_x, int x_step_q4,
   104                               const int16_t *filter_y, int y_step_q4,
   105                               int w, int h) {
   106   if (y_step_q4 == 16 && filter_y[3] != 128) {
   107     while (w >= 16) {
   108       vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
   109                                     dst, dst_stride,
   110                                     h, filter_y);
   111       src += 16;
   112       dst += 16;
   113       w -= 16;
   114     }
   115     while (w >= 8) {
   116       vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
   117                                    dst, dst_stride,
   118                                    h, filter_y);
   119       src += 8;
   120       dst += 8;
   121       w -= 8;
   122     }
   123     while (w >= 4) {
   124       vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
   125                                    dst, dst_stride,
   126                                    h, filter_y);
   127       src += 4;
   128       dst += 4;
   129       w -= 4;
   130     }
   131   }
   132   if (w) {
   133     vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
   134                          filter_x, x_step_q4, filter_y, y_step_q4,
   135                          w, h);
   136   }
   137 }
   139 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
   140                                uint8_t *dst, ptrdiff_t dst_stride,
   141                                const int16_t *filter_x, int x_step_q4,
   142                                const int16_t *filter_y, int y_step_q4,
   143                                int w, int h) {
   144   if (x_step_q4 == 16 && filter_x[3] != 128) {
   145     while (w >= 16) {
   146       vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,
   147                                     dst, dst_stride,
   148                                     h, filter_x);
   149       src += 16;
   150       dst += 16;
   151       w -= 16;
   152     }
   153     while (w >= 8) {
   154       vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
   155                                    dst, dst_stride,
   156                                    h, filter_x);
   157       src += 8;
   158       dst += 8;
   159       w -= 8;
   160     }
   161     while (w >= 4) {
   162       vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
   163                                    dst, dst_stride,
   164                                    h, filter_x);
   165       src += 4;
   166       dst += 4;
   167       w -= 4;
   168     }
   169   }
   170   if (w) {
   171     vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
   172                               filter_x, x_step_q4, filter_y, y_step_q4,
   173                               w, h);
   174   }
   175 }
   177 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
   178                               uint8_t *dst, ptrdiff_t dst_stride,
   179                               const int16_t *filter_x, int x_step_q4,
   180                               const int16_t *filter_y, int y_step_q4,
   181                               int w, int h) {
   182   if (y_step_q4 == 16 && filter_y[3] != 128) {
   183     while (w >= 16) {
   184       vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
   185                                     dst, dst_stride,
   186                                     h, filter_y);
   187       src += 16;
   188       dst += 16;
   189       w -= 16;
   190     }
   191     while (w >= 8) {
   192       vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
   193                                    dst, dst_stride,
   194                                    h, filter_y);
   195       src += 8;
   196       dst += 8;
   197       w -= 8;
   198     }
   199     while (w >= 4) {
   200       vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
   201                                    dst, dst_stride,
   202                                    h, filter_y);
   203       src += 4;
   204       dst += 4;
   205       w -= 4;
   206     }
   207   }
   208   if (w) {
   209     vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
   210                              filter_x, x_step_q4, filter_y, y_step_q4,
   211                              w, h);
   212   }
   213 }
   215 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
   216                          uint8_t *dst, ptrdiff_t dst_stride,
   217                          const int16_t *filter_x, int x_step_q4,
   218                          const int16_t *filter_y, int y_step_q4,
   219                          int w, int h) {
   220   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
   222   assert(w <= 64);
   223   assert(h <= 64);
   224   if (x_step_q4 == 16 && y_step_q4 == 16) {
   225     vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
   226                               filter_x, x_step_q4, filter_y, y_step_q4,
   227                               w, h + 7);
   228     vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
   229                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   230   } else {
   231     vp9_convolve8_c(src, src_stride, dst, dst_stride,
   232                     filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   233   }
   234 }
   236 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
   237                          uint8_t *dst, ptrdiff_t dst_stride,
   238                          const int16_t *filter_x, int x_step_q4,
   239                          const int16_t *filter_y, int y_step_q4,
   240                          int w, int h) {
   241   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
   243   assert(w <= 64);
   244   assert(h <= 64);
   245   if (x_step_q4 == 16 && y_step_q4 == 16) {
   246     vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
   247                               filter_x, x_step_q4, filter_y, y_step_q4,
   248                               w, h + 7);
   249     vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
   250                                  filter_x, x_step_q4, filter_y, y_step_q4,
   251                                  w, h);
   252   } else {
   253     vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
   254                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   255   }
   256 }
   257 #endif
#if HAVE_SSE2
/* SSE2 assembly 8-tap kernels, mirroring the SSSE3 set above: _h8 =
 * horizontal pass, _v8 = vertical pass, for strip widths 16 / 8 / 4.
 * NOTE(review): the _avg variants presumably average the filtered result
 * with the existing destination pixels -- confirm against the .asm
 * sources. */
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_sse2;
filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
   273 void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
   274                                uint8_t *dst, ptrdiff_t dst_stride,
   275                                const int16_t *filter_x, int x_step_q4,
   276                                const int16_t *filter_y, int y_step_q4,
   277                                int w, int h) {
   278   /* Ensure the filter can be compressed to int16_t. */
   279   if (x_step_q4 == 16 && filter_x[3] != 128) {
   280     while (w >= 16) {
   281       vp9_filter_block1d16_h8_sse2(src, src_stride,
   282                                     dst, dst_stride,
   283                                     h, filter_x);
   284       src += 16;
   285       dst += 16;
   286       w -= 16;
   287     }
   288     while (w >= 8) {
   289       vp9_filter_block1d8_h8_sse2(src, src_stride,
   290                                    dst, dst_stride,
   291                                    h, filter_x);
   292       src += 8;
   293       dst += 8;
   294       w -= 8;
   295     }
   296     while (w >= 4) {
   297       vp9_filter_block1d4_h8_sse2(src, src_stride,
   298                                    dst, dst_stride,
   299                                    h, filter_x);
   300       src += 4;
   301       dst += 4;
   302       w -= 4;
   303     }
   304   }
   305   if (w) {
   306     vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
   307                           filter_x, x_step_q4, filter_y, y_step_q4,
   308                           w, h);
   309   }
   310 }
   312 void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
   313                               uint8_t *dst, ptrdiff_t dst_stride,
   314                               const int16_t *filter_x, int x_step_q4,
   315                               const int16_t *filter_y, int y_step_q4,
   316                               int w, int h) {
   317   if (y_step_q4 == 16 && filter_y[3] != 128) {
   318     while (w >= 16) {
   319       vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
   320                                     dst, dst_stride,
   321                                     h, filter_y);
   322       src += 16;
   323       dst += 16;
   324       w -= 16;
   325     }
   326     while (w >= 8) {
   327       vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
   328                                    dst, dst_stride,
   329                                    h, filter_y);
   330       src += 8;
   331       dst += 8;
   332       w -= 8;
   333     }
   334     while (w >= 4) {
   335       vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
   336                                    dst, dst_stride,
   337                                    h, filter_y);
   338       src += 4;
   339       dst += 4;
   340       w -= 4;
   341     }
   342   }
   343   if (w) {
   344     vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
   345                          filter_x, x_step_q4, filter_y, y_step_q4,
   346                          w, h);
   347   }
   348 }
   350 void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
   351                                uint8_t *dst, ptrdiff_t dst_stride,
   352                                const int16_t *filter_x, int x_step_q4,
   353                                const int16_t *filter_y, int y_step_q4,
   354                                int w, int h) {
   355   if (x_step_q4 == 16 && filter_x[3] != 128) {
   356     while (w >= 16) {
   357       vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
   358                                     dst, dst_stride,
   359                                     h, filter_x);
   360       src += 16;
   361       dst += 16;
   362       w -= 16;
   363     }
   364     while (w >= 8) {
   365       vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
   366                                    dst, dst_stride,
   367                                    h, filter_x);
   368       src += 8;
   369       dst += 8;
   370       w -= 8;
   371     }
   372     while (w >= 4) {
   373       vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
   374                                    dst, dst_stride,
   375                                    h, filter_x);
   376       src += 4;
   377       dst += 4;
   378       w -= 4;
   379     }
   380   }
   381   if (w) {
   382     vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
   383                               filter_x, x_step_q4, filter_y, y_step_q4,
   384                               w, h);
   385   }
   386 }
   388 void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
   389                               uint8_t *dst, ptrdiff_t dst_stride,
   390                               const int16_t *filter_x, int x_step_q4,
   391                               const int16_t *filter_y, int y_step_q4,
   392                               int w, int h) {
   393   if (y_step_q4 == 16 && filter_y[3] != 128) {
   394     while (w >= 16) {
   395       vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
   396                                     dst, dst_stride,
   397                                     h, filter_y);
   398       src += 16;
   399       dst += 16;
   400       w -= 16;
   401     }
   402     while (w >= 8) {
   403       vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
   404                                    dst, dst_stride,
   405                                    h, filter_y);
   406       src += 8;
   407       dst += 8;
   408       w -= 8;
   409     }
   410     while (w >= 4) {
   411       vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
   412                                    dst, dst_stride,
   413                                    h, filter_y);
   414       src += 4;
   415       dst += 4;
   416       w -= 4;
   417     }
   418   }
   419   if (w) {
   420     vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
   421                              filter_x, x_step_q4, filter_y, y_step_q4,
   422                              w, h);
   423   }
   424 }
   426 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
   427                          uint8_t *dst, ptrdiff_t dst_stride,
   428                          const int16_t *filter_x, int x_step_q4,
   429                          const int16_t *filter_y, int y_step_q4,
   430                          int w, int h) {
   431   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
   433   assert(w <= 64);
   434   assert(h <= 64);
   435   if (x_step_q4 == 16 && y_step_q4 == 16) {
   436     vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
   437                               filter_x, x_step_q4, filter_y, y_step_q4,
   438                               w, h + 7);
   439     vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
   440                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   441   } else {
   442     vp9_convolve8_c(src, src_stride, dst, dst_stride,
   443                     filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   444   }
   445 }
   447 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
   448                          uint8_t *dst, ptrdiff_t dst_stride,
   449                          const int16_t *filter_x, int x_step_q4,
   450                          const int16_t *filter_y, int y_step_q4,
   451                          int w, int h) {
   452   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
   454   assert(w <= 64);
   455   assert(h <= 64);
   456   if (x_step_q4 == 16 && y_step_q4 == 16) {
   457     vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
   458                               filter_x, x_step_q4, filter_y, y_step_q4,
   459                               w, h + 7);
   460     vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
   461                                  filter_x, x_step_q4, filter_y, y_step_q4,
   462                                  w, h);
   463   } else {
   464     vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
   465                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   466   }
   467 }
   468 #endif

mercurial