media/libvpx/vp9/encoder/x86/vp9_variance_sse2.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

     1 /*
     2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS.  All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include "./vpx_config.h"
    13 #include "vp9/encoder/vp9_variance.h"
    14 #include "vp9/common/vp9_pragmas.h"
    15 #include "vpx_ports/mem.h"
    17 extern unsigned int vp9_get4x4var_mmx
    18 (
    19   const unsigned char *src_ptr,
    20   int  source_stride,
    21   const unsigned char *ref_ptr,
    22   int  recon_stride,
    23   unsigned int *SSE,
    24   int *Sum
    25 );
    27 unsigned int vp9_get_mb_ss_sse2
    28 (
    29   const int16_t *src_ptr
    30 );
    31 unsigned int vp9_get16x16var_sse2
    32 (
    33   const unsigned char *src_ptr,
    34   int source_stride,
    35   const unsigned char *ref_ptr,
    36   int recon_stride,
    37   unsigned int *SSE,
    38   int *Sum
    39 );
    40 unsigned int vp9_get8x8var_sse2
    41 (
    42   const unsigned char *src_ptr,
    43   int source_stride,
    44   const unsigned char *ref_ptr,
    45   int recon_stride,
    46   unsigned int *SSE,
    47   int *Sum
    48 );
    49 void vp9_half_horiz_vert_variance8x_h_sse2
    50 (
    51   const unsigned char *ref_ptr,
    52   int ref_pixels_per_line,
    53   const unsigned char *src_ptr,
    54   int src_pixels_per_line,
    55   unsigned int Height,
    56   int *sum,
    57   unsigned int *sumsquared
    58 );
    59 void vp9_half_horiz_vert_variance16x_h_sse2
    60 (
    61   const unsigned char *ref_ptr,
    62   int ref_pixels_per_line,
    63   const unsigned char *src_ptr,
    64   int src_pixels_per_line,
    65   unsigned int Height,
    66   int *sum,
    67   unsigned int *sumsquared
    68 );
    69 void vp9_half_horiz_variance8x_h_sse2
    70 (
    71   const unsigned char *ref_ptr,
    72   int ref_pixels_per_line,
    73   const unsigned char *src_ptr,
    74   int src_pixels_per_line,
    75   unsigned int Height,
    76   int *sum,
    77   unsigned int *sumsquared
    78 );
    79 void vp9_half_horiz_variance16x_h_sse2
    80 (
    81   const unsigned char *ref_ptr,
    82   int ref_pixels_per_line,
    83   const unsigned char *src_ptr,
    84   int src_pixels_per_line,
    85   unsigned int Height,
    86   int *sum,
    87   unsigned int *sumsquared
    88 );
    89 void vp9_half_vert_variance8x_h_sse2
    90 (
    91   const unsigned char *ref_ptr,
    92   int ref_pixels_per_line,
    93   const unsigned char *src_ptr,
    94   int src_pixels_per_line,
    95   unsigned int Height,
    96   int *sum,
    97   unsigned int *sumsquared
    98 );
    99 void vp9_half_vert_variance16x_h_sse2
   100 (
   101   const unsigned char *ref_ptr,
   102   int ref_pixels_per_line,
   103   const unsigned char *src_ptr,
   104   int src_pixels_per_line,
   105   unsigned int Height,
   106   int *sum,
   107   unsigned int *sumsquared
   108 );
   110 typedef unsigned int (*get_var_sse2) (
   111   const unsigned char *src_ptr,
   112   int source_stride,
   113   const unsigned char *ref_ptr,
   114   int recon_stride,
   115   unsigned int *SSE,
   116   int *Sum
   117 );
   119 static void variance_sse2(const unsigned char *src_ptr, int  source_stride,
   120                         const unsigned char *ref_ptr, int  recon_stride,
   121                         int  w, int  h, unsigned int *sse, int *sum,
   122                         get_var_sse2 var_fn, int block_size) {
   123   unsigned int sse0;
   124   int sum0;
   125   int i, j;
   127   *sse = 0;
   128   *sum = 0;
   130   for (i = 0; i < h; i += block_size) {
   131     for (j = 0; j < w; j += block_size) {
   132       var_fn(src_ptr + source_stride * i + j, source_stride,
   133              ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
   134       *sse += sse0;
   135       *sum += sum0;
   136     }
   137   }
   138 }
   140 unsigned int vp9_variance4x4_sse2(
   141   const unsigned char *src_ptr,
   142   int  source_stride,
   143   const unsigned char *ref_ptr,
   144   int  recon_stride,
   145   unsigned int *sse) {
   146   unsigned int var;
   147   int avg;
   149   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
   150                   &var, &avg, vp9_get4x4var_mmx, 4);
   151   *sse = var;
   152   return (var - (((unsigned int)avg * avg) >> 4));
   153 }
   155 unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
   156                                   int  source_stride,
   157                                   const uint8_t *ref_ptr,
   158                                   int  recon_stride,
   159                                   unsigned int *sse) {
   160   unsigned int var;
   161   int avg;
   163   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
   164                   &var, &avg, vp9_get4x4var_mmx, 4);
   165   *sse = var;
   166   return (var - (((unsigned int)avg * avg) >> 5));
   167 }
   169 unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
   170                                   int  source_stride,
   171                                   const uint8_t *ref_ptr,
   172                                   int  recon_stride,
   173                                   unsigned int *sse) {
   174   unsigned int var;
   175   int avg;
   177   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
   178                   &var, &avg, vp9_get4x4var_mmx, 4);
   179   *sse = var;
   180   return (var - (((unsigned int)avg * avg) >> 5));
   181 }
   183 unsigned int vp9_variance8x8_sse2
   184 (
   185   const unsigned char *src_ptr,
   186   int  source_stride,
   187   const unsigned char *ref_ptr,
   188   int  recon_stride,
   189   unsigned int *sse) {
   190   unsigned int var;
   191   int avg;
   193   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
   194                   &var, &avg, vp9_get8x8var_sse2, 8);
   195   *sse = var;
   196   return (var - (((unsigned int)avg * avg) >> 6));
   197 }
   199 unsigned int vp9_variance16x8_sse2
   200 (
   201   const unsigned char *src_ptr,
   202   int  source_stride,
   203   const unsigned char *ref_ptr,
   204   int  recon_stride,
   205   unsigned int *sse) {
   206   unsigned int var;
   207   int avg;
   209   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
   210                   &var, &avg, vp9_get8x8var_sse2, 8);
   211   *sse = var;
   212   return (var - (((unsigned int)avg * avg) >> 7));
   213 }
   215 unsigned int vp9_variance8x16_sse2
   216 (
   217   const unsigned char *src_ptr,
   218   int  source_stride,
   219   const unsigned char *ref_ptr,
   220   int  recon_stride,
   221   unsigned int *sse) {
   222   unsigned int var;
   223   int avg;
   225   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
   226                 &var, &avg, vp9_get8x8var_sse2, 8);
   227   *sse = var;
   228   return (var - (((unsigned int)avg * avg) >> 7));
   229 }
   231 unsigned int vp9_variance16x16_sse2
   232 (
   233   const unsigned char *src_ptr,
   234   int  source_stride,
   235   const unsigned char *ref_ptr,
   236   int  recon_stride,
   237   unsigned int *sse) {
   238   unsigned int var;
   239   int avg;
   241   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
   242                 &var, &avg, vp9_get16x16var_sse2, 16);
   243   *sse = var;
   244   return (var - (((unsigned int)avg * avg) >> 8));
   245 }
   247 unsigned int vp9_mse16x16_sse2(
   248   const unsigned char *src_ptr,
   249   int  source_stride,
   250   const unsigned char *ref_ptr,
   251   int  recon_stride,
   252   unsigned int *sse) {
   253   unsigned int sse0;
   254   int sum0;
   255   vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
   256                        &sum0);
   257   *sse = sse0;
   258   return sse0;
   259 }
   261 unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
   262                                     int  source_stride,
   263                                     const uint8_t *ref_ptr,
   264                                     int  recon_stride,
   265                                     unsigned int *sse) {
   266   unsigned int var;
   267   int avg;
   269   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
   270                 &var, &avg, vp9_get16x16var_sse2, 16);
   271   *sse = var;
   272   return (var - (((int64_t)avg * avg) >> 10));
   273 }
   275 unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
   276                                     int  source_stride,
   277                                     const uint8_t *ref_ptr,
   278                                     int  recon_stride,
   279                                     unsigned int *sse) {
   280   unsigned int var;
   281   int avg;
   283   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
   284                 &var, &avg, vp9_get16x16var_sse2, 16);
   285   *sse = var;
   286   return (var - (((int64_t)avg * avg) >> 9));
   287 }
   289 unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
   290                                     int  source_stride,
   291                                     const uint8_t *ref_ptr,
   292                                     int  recon_stride,
   293                                     unsigned int *sse) {
   294   unsigned int var;
   295   int avg;
   297   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
   298                 &var, &avg, vp9_get16x16var_sse2, 16);
   299   *sse = var;
   300   return (var - (((int64_t)avg * avg) >> 9));
   301 }
   303 unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
   304                                     int  source_stride,
   305                                     const uint8_t *ref_ptr,
   306                                     int  recon_stride,
   307                                     unsigned int *sse) {
   308   unsigned int var;
   309   int avg;
   311   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
   312                 &var, &avg, vp9_get16x16var_sse2, 16);
   313   *sse = var;
   314   return (var - (((int64_t)avg * avg) >> 12));
   315 }
   317 unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
   318                                     int  source_stride,
   319                                     const uint8_t *ref_ptr,
   320                                     int  recon_stride,
   321                                     unsigned int *sse) {
   322   unsigned int var;
   323   int avg;
   325   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
   326                 &var, &avg, vp9_get16x16var_sse2, 16);
   327   *sse = var;
   328   return (var - (((int64_t)avg * avg) >> 11));
   329 }
   331 unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
   332                                     int  source_stride,
   333                                     const uint8_t *ref_ptr,
   334                                     int  recon_stride,
   335                                     unsigned int *sse) {
   336   unsigned int var;
   337   int avg;
   339   variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
   340                 &var, &avg, vp9_get16x16var_sse2, 16);
   341   *sse = var;
   342   return (var - (((int64_t)avg * avg) >> 11));
   343 }
   345 #define DECL(w, opt) \
   346 int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
   347                                         ptrdiff_t src_stride, \
   348                                         int x_offset, int y_offset, \
   349                                         const uint8_t *dst, \
   350                                         ptrdiff_t dst_stride, \
   351                                         int height, unsigned int *sse)
   352 #define DECLS(opt1, opt2) \
   353 DECL(4, opt2); \
   354 DECL(8, opt1); \
   355 DECL(16, opt1)
   357 DECLS(sse2, sse);
   358 DECLS(ssse3, ssse3);
   359 #undef DECLS
   360 #undef DECL
   362 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
   363 unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
   364                                                      int src_stride, \
   365                                                      int x_offset, \
   366                                                      int y_offset, \
   367                                                      const uint8_t *dst, \
   368                                                      int dst_stride, \
   369                                                      unsigned int *sse_ptr) { \
   370   unsigned int sse; \
   371   int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
   372                                                 y_offset, dst, dst_stride, \
   373                                                 h, &sse); \
   374   if (w > wf) { \
   375     unsigned int sse2; \
   376     int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
   377                                                    x_offset, y_offset, \
   378                                                    dst + 16, dst_stride, \
   379                                                    h, &sse2); \
   380     se += se2; \
   381     sse += sse2; \
   382     if (w > wf * 2) { \
   383       se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
   384                                                  x_offset, y_offset, \
   385                                                  dst + 32, dst_stride, \
   386                                                  h, &sse2); \
   387       se += se2; \
   388       sse += sse2; \
   389       se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
   390                                                  x_offset, y_offset, \
   391                                                  dst + 48, dst_stride, \
   392                                                  h, &sse2); \
   393       se += se2; \
   394       sse += sse2; \
   395     } \
   396   } \
   397   *sse_ptr = sse; \
   398   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
   399 }
   401 #define FNS(opt1, opt2) \
   402 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
   403 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
   404 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
   405 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
   406 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
   407 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
   408 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
   409 FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
   410 FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
   411 FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
   412 FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
   413 FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
   414 FN(4,   4,  4, 2, 2, opt2, (unsigned int))
   416 FNS(sse2, sse);
   417 FNS(ssse3, ssse3);
   419 #undef FNS
   420 #undef FN
   422 #define DECL(w, opt) \
   423 int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
   424                                             ptrdiff_t src_stride, \
   425                                             int x_offset, int y_offset, \
   426                                             const uint8_t *dst, \
   427                                             ptrdiff_t dst_stride, \
   428                                             const uint8_t *sec, \
   429                                             ptrdiff_t sec_stride, \
   430                                             int height, unsigned int *sse)
   431 #define DECLS(opt1, opt2) \
   432 DECL(4, opt2); \
   433 DECL(8, opt1); \
   434 DECL(16, opt1)
   436 DECLS(sse2, sse);
   437 DECLS(ssse3, ssse3);
   438 #undef DECL
   439 #undef DECLS
   441 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
   442 unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
   443                                                          int src_stride, \
   444                                                          int x_offset, \
   445                                                          int y_offset, \
   446                                                          const uint8_t *dst, \
   447                                                          int dst_stride, \
   448                                                          unsigned int *sseptr, \
   449                                                          const uint8_t *sec) { \
   450   unsigned int sse; \
   451   int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
   452                                                     y_offset, dst, dst_stride, \
   453                                                     sec, w, h, &sse); \
   454   if (w > wf) { \
   455     unsigned int sse2; \
   456     int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
   457                                                        x_offset, y_offset, \
   458                                                        dst + 16, dst_stride, \
   459                                                        sec + 16, w, h, &sse2); \
   460     se += se2; \
   461     sse += sse2; \
   462     if (w > wf * 2) { \
   463       se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
   464                                                      x_offset, y_offset, \
   465                                                      dst + 32, dst_stride, \
   466                                                      sec + 32, w, h, &sse2); \
   467       se += se2; \
   468       sse += sse2; \
   469       se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
   470                                                      x_offset, y_offset, \
   471                                                      dst + 48, dst_stride, \
   472                                                      sec + 48, w, h, &sse2); \
   473       se += se2; \
   474       sse += sse2; \
   475     } \
   476   } \
   477   *sseptr = sse; \
   478   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
   479 }
   481 #define FNS(opt1, opt2) \
   482 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
   483 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
   484 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
   485 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
   486 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
   487 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
   488 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
   489 FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
   490 FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
   491 FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
   492 FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
   493 FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
   494 FN(4,   4,  4, 2, 2, opt2, (unsigned int))
   496 FNS(sse2, sse);
   497 FNS(ssse3, ssse3);
   499 #undef FNS
   500 #undef FN
   502 unsigned int vp9_variance_halfpixvar16x16_h_sse2(
   503   const unsigned char *src_ptr,
   504   int  src_pixels_per_line,
   505   const unsigned char *dst_ptr,
   506   int  dst_pixels_per_line,
   507   unsigned int *sse) {
   508   int xsum0;
   509   unsigned int xxsum0;
   511   vp9_half_horiz_variance16x_h_sse2(
   512     src_ptr, src_pixels_per_line,
   513     dst_ptr, dst_pixels_per_line, 16,
   514     &xsum0, &xxsum0);
   516   *sse = xxsum0;
   517   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
   518 }
   521 unsigned int vp9_variance_halfpixvar16x16_v_sse2(
   522   const unsigned char *src_ptr,
   523   int  src_pixels_per_line,
   524   const unsigned char *dst_ptr,
   525   int  dst_pixels_per_line,
   526   unsigned int *sse) {
   527   int xsum0;
   528   unsigned int xxsum0;
   529   vp9_half_vert_variance16x_h_sse2(
   530     src_ptr, src_pixels_per_line,
   531     dst_ptr, dst_pixels_per_line, 16,
   532     &xsum0, &xxsum0);
   534   *sse = xxsum0;
   535   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
   536 }
   539 unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
   540   const unsigned char *src_ptr,
   541   int  src_pixels_per_line,
   542   const unsigned char *dst_ptr,
   543   int  dst_pixels_per_line,
   544   unsigned int *sse) {
   545   int xsum0;
   546   unsigned int xxsum0;
   548   vp9_half_horiz_vert_variance16x_h_sse2(
   549     src_ptr, src_pixels_per_line,
   550     dst_ptr, dst_pixels_per_line, 16,
   551     &xsum0, &xxsum0);
   553   *sse = xxsum0;
   554   return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
   555 }

mercurial