media/libvpx/vp9/common/x86/vp9_asm_stubs.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned from the upstream tor-browser origin at tag tor-browser-31.3.0esr-4.5-1-build1
(revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f) for hacking purposes.

     1 /*
     2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS.  All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include <assert.h>
    13 #include "./vpx_config.h"
    14 #include "./vp9_rtcd.h"
    15 #include "vpx_ports/mem.h"
///////////////////////////////////////////////////////////////////////////
// 1/8-pel bilinear filter coefficient table for the MMX SIMD code.      //
// Entry i holds the two bilinear taps (128 - 8*i) and (8*i), each       //
// replicated four times so a full register lane can be loaded at once;  //
// the two taps of every entry sum to 128 (7-bit fixed-point weights).   //
// NOTE(review): not referenced elsewhere in this file -- presumably     //
// consumed by MMX assembly; confirm against the .asm sources before     //
// removing.                                                             //
///////////////////////////////////////////////////////////////////////////
DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
  { 128, 128, 128, 128,  0,  0,  0,  0 },
  { 120, 120, 120, 120,  8,  8,  8,  8 },
  { 112, 112, 112, 112, 16, 16, 16, 16 },
  { 104, 104, 104, 104, 24, 24, 24, 24 },
  {  96, 96, 96, 96, 32, 32, 32, 32 },
  {  88, 88, 88, 88, 40, 40, 40, 40 },
  {  80, 80, 80, 80, 48, 48, 48, 48 },
  {  72, 72, 72, 72, 56, 56, 56, 56 },
  {  64, 64, 64, 64, 64, 64, 64, 64 },
  {  56, 56, 56, 56, 72, 72, 72, 72 },
  {  48, 48, 48, 48, 80, 80, 80, 80 },
  {  40, 40, 40, 40, 88, 88, 88, 88 },
  {  32, 32, 32, 32, 96, 96, 96, 96 },
  {  24, 24, 24, 24, 104, 104, 104, 104 },
  {  16, 16, 16, 16, 112, 112, 112, 112 },
  {   8,  8,  8,  8, 120, 120, 120, 120 }
};
/* Common signature for the assembly 8-tap 1-D filter kernels declared
 * below. Each kernel filters a fixed-width column strip (the width is
 * encoded in the kernel's name) of output_height rows from src_ptr into
 * output_ptr, reading the 8 taps from filter. Pitches are in bytes. */
typedef void filter8_1dfunction (
  const unsigned char *src_ptr,
  const unsigned int src_pitch,
  unsigned char *output_ptr,
  unsigned int out_pitch,
  unsigned int output_height,
  const short *filter
);
#if HAVE_SSSE3
/* SSSE3 assembly 8-tap kernels: _h8 = horizontal pass, _v8 = vertical
 * pass, for strip widths 16 / 8 / 4.
 * NOTE(review): the _avg variants presumably average the filtered result
 * with the existing destination pixels (they back the *_avg_* convolve
 * wrappers below) -- confirm against the .asm sources. */
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
    62 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
    63                                uint8_t *dst, ptrdiff_t dst_stride,
    64                                const int16_t *filter_x, int x_step_q4,
    65                                const int16_t *filter_y, int y_step_q4,
    66                                int w, int h) {
    67   /* Ensure the filter can be compressed to int16_t. */
    68   if (x_step_q4 == 16 && filter_x[3] != 128) {
    69     while (w >= 16) {
    70       vp9_filter_block1d16_h8_ssse3(src, src_stride,
    71                                     dst, dst_stride,
    72                                     h, filter_x);
    73       src += 16;
    74       dst += 16;
    75       w -= 16;
    76     }
    77     while (w >= 8) {
    78       vp9_filter_block1d8_h8_ssse3(src, src_stride,
    79                                    dst, dst_stride,
    80                                    h, filter_x);
    81       src += 8;
    82       dst += 8;
    83       w -= 8;
    84     }
    85     while (w >= 4) {
    86       vp9_filter_block1d4_h8_ssse3(src, src_stride,
    87                                    dst, dst_stride,
    88                                    h, filter_x);
    89       src += 4;
    90       dst += 4;
    91       w -= 4;
    92     }
    93   }
    94   if (w) {
    95     vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
    96                           filter_x, x_step_q4, filter_y, y_step_q4,
    97                           w, h);
    98   }
    99 }
   101 void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
   102                               uint8_t *dst, ptrdiff_t dst_stride,
   103                               const int16_t *filter_x, int x_step_q4,
   104                               const int16_t *filter_y, int y_step_q4,
   105                               int w, int h) {
   106   if (y_step_q4 == 16 && filter_y[3] != 128) {
   107     while (w >= 16) {
   108       vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
   109                                     dst, dst_stride,
   110                                     h, filter_y);
   111       src += 16;
   112       dst += 16;
   113       w -= 16;
   114     }
   115     while (w >= 8) {
   116       vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
   117                                    dst, dst_stride,
   118                                    h, filter_y);
   119       src += 8;
   120       dst += 8;
   121       w -= 8;
   122     }
   123     while (w >= 4) {
   124       vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
   125                                    dst, dst_stride,
   126                                    h, filter_y);
   127       src += 4;
   128       dst += 4;
   129       w -= 4;
   130     }
   131   }
   132   if (w) {
   133     vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
   134                          filter_x, x_step_q4, filter_y, y_step_q4,
   135                          w, h);
   136   }
   137 }
   139 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
   140                                uint8_t *dst, ptrdiff_t dst_stride,
   141                                const int16_t *filter_x, int x_step_q4,
   142                                const int16_t *filter_y, int y_step_q4,
   143                                int w, int h) {
   144   if (x_step_q4 == 16 && filter_x[3] != 128) {
   145     while (w >= 16) {
   146       vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,
   147                                     dst, dst_stride,
   148                                     h, filter_x);
   149       src += 16;
   150       dst += 16;
   151       w -= 16;
   152     }
   153     while (w >= 8) {
   154       vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
   155                                    dst, dst_stride,
   156                                    h, filter_x);
   157       src += 8;
   158       dst += 8;
   159       w -= 8;
   160     }
   161     while (w >= 4) {
   162       vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
   163                                    dst, dst_stride,
   164                                    h, filter_x);
   165       src += 4;
   166       dst += 4;
   167       w -= 4;
   168     }
   169   }
   170   if (w) {
   171     vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
   172                               filter_x, x_step_q4, filter_y, y_step_q4,
   173                               w, h);
   174   }
   175 }
   177 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
   178                               uint8_t *dst, ptrdiff_t dst_stride,
   179                               const int16_t *filter_x, int x_step_q4,
   180                               const int16_t *filter_y, int y_step_q4,
   181                               int w, int h) {
   182   if (y_step_q4 == 16 && filter_y[3] != 128) {
   183     while (w >= 16) {
   184       vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
   185                                     dst, dst_stride,
   186                                     h, filter_y);
   187       src += 16;
   188       dst += 16;
   189       w -= 16;
   190     }
   191     while (w >= 8) {
   192       vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
   193                                    dst, dst_stride,
   194                                    h, filter_y);
   195       src += 8;
   196       dst += 8;
   197       w -= 8;
   198     }
   199     while (w >= 4) {
   200       vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
   201                                    dst, dst_stride,
   202                                    h, filter_y);
   203       src += 4;
   204       dst += 4;
   205       w -= 4;
   206     }
   207   }
   208   if (w) {
   209     vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
   210                              filter_x, x_step_q4, filter_y, y_step_q4,
   211                              w, h);
   212   }
   213 }
   215 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
   216                          uint8_t *dst, ptrdiff_t dst_stride,
   217                          const int16_t *filter_x, int x_step_q4,
   218                          const int16_t *filter_y, int y_step_q4,
   219                          int w, int h) {
   220   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
   222   assert(w <= 64);
   223   assert(h <= 64);
   224   if (x_step_q4 == 16 && y_step_q4 == 16) {
   225     vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
   226                               filter_x, x_step_q4, filter_y, y_step_q4,
   227                               w, h + 7);
   228     vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
   229                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   230   } else {
   231     vp9_convolve8_c(src, src_stride, dst, dst_stride,
   232                     filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   233   }
   234 }
   236 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
   237                          uint8_t *dst, ptrdiff_t dst_stride,
   238                          const int16_t *filter_x, int x_step_q4,
   239                          const int16_t *filter_y, int y_step_q4,
   240                          int w, int h) {
   241   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
   243   assert(w <= 64);
   244   assert(h <= 64);
   245   if (x_step_q4 == 16 && y_step_q4 == 16) {
   246     vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
   247                               filter_x, x_step_q4, filter_y, y_step_q4,
   248                               w, h + 7);
   249     vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
   250                                  filter_x, x_step_q4, filter_y, y_step_q4,
   251                                  w, h);
   252   } else {
   253     vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
   254                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   255   }
   256 }
   257 #endif
#if HAVE_SSE2
/* SSE2 assembly 8-tap kernels, mirroring the SSSE3 set above: _h8 =
 * horizontal pass, _v8 = vertical pass, for strip widths 16 / 8 / 4.
 * NOTE(review): the _avg variants presumably average the filtered result
 * with the existing destination pixels -- confirm against the .asm
 * sources. */
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_sse2;
filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
   273 void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
   274                                uint8_t *dst, ptrdiff_t dst_stride,
   275                                const int16_t *filter_x, int x_step_q4,
   276                                const int16_t *filter_y, int y_step_q4,
   277                                int w, int h) {
   278   /* Ensure the filter can be compressed to int16_t. */
   279   if (x_step_q4 == 16 && filter_x[3] != 128) {
   280     while (w >= 16) {
   281       vp9_filter_block1d16_h8_sse2(src, src_stride,
   282                                     dst, dst_stride,
   283                                     h, filter_x);
   284       src += 16;
   285       dst += 16;
   286       w -= 16;
   287     }
   288     while (w >= 8) {
   289       vp9_filter_block1d8_h8_sse2(src, src_stride,
   290                                    dst, dst_stride,
   291                                    h, filter_x);
   292       src += 8;
   293       dst += 8;
   294       w -= 8;
   295     }
   296     while (w >= 4) {
   297       vp9_filter_block1d4_h8_sse2(src, src_stride,
   298                                    dst, dst_stride,
   299                                    h, filter_x);
   300       src += 4;
   301       dst += 4;
   302       w -= 4;
   303     }
   304   }
   305   if (w) {
   306     vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
   307                           filter_x, x_step_q4, filter_y, y_step_q4,
   308                           w, h);
   309   }
   310 }
   312 void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
   313                               uint8_t *dst, ptrdiff_t dst_stride,
   314                               const int16_t *filter_x, int x_step_q4,
   315                               const int16_t *filter_y, int y_step_q4,
   316                               int w, int h) {
   317   if (y_step_q4 == 16 && filter_y[3] != 128) {
   318     while (w >= 16) {
   319       vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
   320                                     dst, dst_stride,
   321                                     h, filter_y);
   322       src += 16;
   323       dst += 16;
   324       w -= 16;
   325     }
   326     while (w >= 8) {
   327       vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
   328                                    dst, dst_stride,
   329                                    h, filter_y);
   330       src += 8;
   331       dst += 8;
   332       w -= 8;
   333     }
   334     while (w >= 4) {
   335       vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
   336                                    dst, dst_stride,
   337                                    h, filter_y);
   338       src += 4;
   339       dst += 4;
   340       w -= 4;
   341     }
   342   }
   343   if (w) {
   344     vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
   345                          filter_x, x_step_q4, filter_y, y_step_q4,
   346                          w, h);
   347   }
   348 }
   350 void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
   351                                uint8_t *dst, ptrdiff_t dst_stride,
   352                                const int16_t *filter_x, int x_step_q4,
   353                                const int16_t *filter_y, int y_step_q4,
   354                                int w, int h) {
   355   if (x_step_q4 == 16 && filter_x[3] != 128) {
   356     while (w >= 16) {
   357       vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
   358                                     dst, dst_stride,
   359                                     h, filter_x);
   360       src += 16;
   361       dst += 16;
   362       w -= 16;
   363     }
   364     while (w >= 8) {
   365       vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
   366                                    dst, dst_stride,
   367                                    h, filter_x);
   368       src += 8;
   369       dst += 8;
   370       w -= 8;
   371     }
   372     while (w >= 4) {
   373       vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
   374                                    dst, dst_stride,
   375                                    h, filter_x);
   376       src += 4;
   377       dst += 4;
   378       w -= 4;
   379     }
   380   }
   381   if (w) {
   382     vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
   383                               filter_x, x_step_q4, filter_y, y_step_q4,
   384                               w, h);
   385   }
   386 }
   388 void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
   389                               uint8_t *dst, ptrdiff_t dst_stride,
   390                               const int16_t *filter_x, int x_step_q4,
   391                               const int16_t *filter_y, int y_step_q4,
   392                               int w, int h) {
   393   if (y_step_q4 == 16 && filter_y[3] != 128) {
   394     while (w >= 16) {
   395       vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
   396                                     dst, dst_stride,
   397                                     h, filter_y);
   398       src += 16;
   399       dst += 16;
   400       w -= 16;
   401     }
   402     while (w >= 8) {
   403       vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
   404                                    dst, dst_stride,
   405                                    h, filter_y);
   406       src += 8;
   407       dst += 8;
   408       w -= 8;
   409     }
   410     while (w >= 4) {
   411       vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
   412                                    dst, dst_stride,
   413                                    h, filter_y);
   414       src += 4;
   415       dst += 4;
   416       w -= 4;
   417     }
   418   }
   419   if (w) {
   420     vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
   421                              filter_x, x_step_q4, filter_y, y_step_q4,
   422                              w, h);
   423   }
   424 }
   426 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
   427                          uint8_t *dst, ptrdiff_t dst_stride,
   428                          const int16_t *filter_x, int x_step_q4,
   429                          const int16_t *filter_y, int y_step_q4,
   430                          int w, int h) {
   431   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
   433   assert(w <= 64);
   434   assert(h <= 64);
   435   if (x_step_q4 == 16 && y_step_q4 == 16) {
   436     vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
   437                               filter_x, x_step_q4, filter_y, y_step_q4,
   438                               w, h + 7);
   439     vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
   440                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   441   } else {
   442     vp9_convolve8_c(src, src_stride, dst, dst_stride,
   443                     filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   444   }
   445 }
   447 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
   448                          uint8_t *dst, ptrdiff_t dst_stride,
   449                          const int16_t *filter_x, int x_step_q4,
   450                          const int16_t *filter_y, int y_step_q4,
   451                          int w, int h) {
   452   DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
   454   assert(w <= 64);
   455   assert(h <= 64);
   456   if (x_step_q4 == 16 && y_step_q4 == 16) {
   457     vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
   458                               filter_x, x_step_q4, filter_y, y_step_q4,
   459                               w, h + 7);
   460     vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
   461                                  filter_x, x_step_q4, filter_y, y_step_q4,
   462                                  w, h);
   463   } else {
   464     vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
   465                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   466   }
   467 }
   468 #endif

mercurial