The Tor Browser: media/libyuv/util/ssim.cc@b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*

     2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.

     3  *

     4  *  Use of this source code is governed by a BSD-style license

     5  *  that can be found in the LICENSE file in the root of the source

     6  *  tree. An additional intellectual property rights grant can be found

     7  *  in the file PATENTS. All contributing project authors may

     8  *  be found in the AUTHORS file in the root of the source tree.

     9  */

    11 #include "../util/ssim.h"  // NOLINT

    13 #include <math.h>

    14 #include <string.h>

    16 #ifdef __cplusplus

    17 extern "C" {

    18 #endif

    20 typedef unsigned int uint32;     // NOLINT

    21 typedef unsigned short uint16;   // NOLINT

    23 #if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \

    24   (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))

    25 #define __SSE2__

    26 #endif

    27 #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

    28 #include <emmintrin.h>

    29 #endif

    31 #ifdef _OPENMP

    32 #include <omp.h>

    33 #endif

    35 // SSIM

    36 enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };

    38 // Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)

    39 // The maximum value (11 x 11) must be less than 128 to avoid sign

    40 // problems during the calls to _mm_mullo_epi16().

    41 static const int K[KERNEL_SIZE] = {

    42   1, 3, 7, 11, 7, 3, 1    // ~11 * exp(-0.3 * i * i)

    43 };

    44 static const double kiW[KERNEL + 1 + 1] = {

    45   1. / 1089.,   // 1 / sum(i:0..6, j..6) K[i]*K[j]

    46   1. / 1089.,   // 1 / sum(i:0..6, j..6) K[i]*K[j]

    47   1. / 1056.,   // 1 / sum(i:0..5, j..6) K[i]*K[j]

    48   1. / 957.,    // 1 / sum(i:0..4, j..6) K[i]*K[j]

    49   1. / 726.,    // 1 / sum(i:0..3, j..6) K[i]*K[j]

    50 };

    52 #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

    54 #define PWEIGHT(A, B)  static_cast<uint16>(K[(A)] * K[(B)])   // weight product

    55 #define MAKE_WEIGHT(L)                                               \

    56   { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3),  \

    57         PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }

    59 // We need this union trick to be able to initialize constant static __m128i

    60 // values. We can't call _mm_set_epi16() for static compile-time initialization.

    61 static const struct {

    62   union {

    63     uint16 i16_[8];

    64     __m128i m_;

    65   } values_;

    66 } W0 = MAKE_WEIGHT(0),

    67   W1 = MAKE_WEIGHT(1),

    68   W2 = MAKE_WEIGHT(2),

    69   W3 = MAKE_WEIGHT(3);

    70   // ... the rest is symmetric.

    71 #undef MAKE_WEIGHT

    72 #undef PWEIGHT

    73 #endif

    75 // Common final expression for SSIM, once the weighted sums are known.

    76 static double FinalizeSSIM(double iw, double xm, double ym,

    77                            double xxm, double xym, double yym) {

    78   const double iwx = xm * iw;

    79   const double iwy = ym * iw;

    80   double sxx = xxm * iw - iwx * iwx;

    81   double syy = yym * iw - iwy * iwy;

    82   // small errors are possible, due to rounding. Clamp to zero.

    83   if (sxx < 0.) sxx = 0.;

    84   if (syy < 0.) syy = 0.;

    85   const double sxsy = sqrt(sxx * syy);

    86   const double sxy = xym * iw - iwx * iwy;

    87   static const double C11 = (0.01 * 0.01) * (255 * 255);

    88   static const double C22 = (0.03 * 0.03) * (255 * 255);

    89   static const double C33 = (0.015 * 0.015) * (255 * 255);

    90   const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);

    91   const double c = (2. * sxsy      + C22) / (sxx + syy + C22);

    92   const double s = (sxy + C33) / (sxsy + C33);

    93   return l * c * s;

    94 }

    96 // GetSSIM() does clipping.  GetSSIMFullKernel() does not

    98 // TODO(skal): use summed tables?

    99 // Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)

   100 // with a diff of 255, squared. The maximum error is thus 0x4388241,

   101 // which fits into 32 bits integers.

   102 double GetSSIM(const uint8 *org, const uint8 *rec,

   103                int xo, int yo, int W, int H, int stride) {

   104   uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

   105   org += (yo - KERNEL) * stride;

   106   org += (xo - KERNEL);

   107   rec += (yo - KERNEL) * stride;

   108   rec += (xo - KERNEL);

   109   for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {

   110     if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;

   111     const int Wy = K[y_];

   112     for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {

   113       const int Wxy = Wy * K[x_];

   114       if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {

   115         const int org_x = org[x_];

   116         const int rec_x = rec[x_];

   117         ws += Wxy;

   118         xm  += Wxy * org_x;

   119         ym  += Wxy * rec_x;

   120         xxm += Wxy * org_x * org_x;

   121         xym += Wxy * org_x * rec_x;

   122         yym += Wxy * rec_x * rec_x;

   123       }

   124     }

   125   }

   126   return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);

   127 }

   129 double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,

   130                          int xo, int yo, int stride,

   131                          double area_weight) {

   132   uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

   134 #if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

   136   org += yo * stride + xo;

   137   rec += yo * stride + xo;

   138   for (int y = 1; y <= KERNEL; y++) {

   139     const int dy1 = y * stride;

   140     const int dy2 = y * stride;

   141     const int Wy = K[KERNEL + y];

   143     for (int x = 1; x <= KERNEL; x++) {

   144       // Compute the contributions of upper-left (ul), upper-right (ur)

   145       // lower-left (ll) and lower-right (lr) points (see the diagram below).

   146       // Symmetric Kernel will have same weight on those points.

   147       //       -  -  -  -  -  -  -

   148       //       -  ul -  -  -  ur -

   149       //       -  -  -  -  -  -  -

   150       //       -  -  -  0  -  -  -

   151       //       -  -  -  -  -  -  -

   152       //       -  ll -  -  -  lr -

   153       //       -  -  -  -  -  -  -

   154       const int Wxy = Wy * K[KERNEL + x];

   155       const int ul1 = org[-dy1 - x];

   156       const int ur1 = org[-dy1 + x];

   157       const int ll1 = org[dy1 - x];

   158       const int lr1 = org[dy1 + x];

   160       const int ul2 = rec[-dy2 - x];

   161       const int ur2 = rec[-dy2 + x];

   162       const int ll2 = rec[dy2 - x];

   163       const int lr2 = rec[dy2 + x];

   165       xm  += Wxy * (ul1 + ur1 + ll1 + lr1);

   166       ym  += Wxy * (ul2 + ur2 + ll2 + lr2);

   167       xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);

   168       xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);

   169       yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);

   170     }

   172     // Compute the contributions of up (u), down (d), left (l) and right (r)

   173     // points across the main axes (see the diagram below).

   174     // Symmetric Kernel will have same weight on those points.

   175     //       -  -  -  -  -  -  -

   176     //       -  -  -  u  -  -  -

   177     //       -  -  -  -  -  -  -

   178     //       -  l  -  0  -  r  -

   179     //       -  -  -  -  -  -  -

   180     //       -  -  -  d  -  -  -

   181     //       -  -  -  -  -  -  -

   182     const int Wxy = Wy * K[KERNEL];

   183     const int u1 = org[-dy1];

   184     const int d1 = org[dy1];

   185     const int l1 = org[-y];

   186     const int r1 = org[y];

   188     const int u2 = rec[-dy2];

   189     const int d2 = rec[dy2];

   190     const int l2 = rec[-y];

   191     const int r2 = rec[y];

   193     xm  += Wxy * (u1 + d1 + l1 + r1);

   194     ym  += Wxy * (u2 + d2 + l2 + r2);

   195     xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);

   196     xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);

   197     yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);

   198   }

   200   // Lastly the contribution of (x0, y0) point.

   201   const int Wxy = K[KERNEL] * K[KERNEL];

   202   const int s1 = org[0];

   203   const int s2 = rec[0];

   205   xm  += Wxy * s1;

   206   ym  += Wxy * s2;

   207   xxm += Wxy * s1 * s1;

   208   xym += Wxy * s1 * s2;

   209   yym += Wxy * s2 * s2;

   211 #else   // __SSE2__

   213   org += (yo - KERNEL) * stride + (xo - KERNEL);

   214   rec += (yo - KERNEL) * stride + (xo - KERNEL);

   216   const __m128i zero = _mm_setzero_si128();

   217   __m128i x = zero;

   218   __m128i y = zero;

   219   __m128i xx = zero;

   220   __m128i xy = zero;

   221   __m128i yy = zero;

   223 // Read 8 pixels at line #L, and convert to 16bit, perform weighting

   224 // and acccumulate.

   225 #define LOAD_LINE_PAIR(L, WEIGHT) do {                                       \

   226   const __m128i v0 =                                                         \

   227       _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \

   228   const __m128i v1 =                                                         \

   229       _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \

   230   const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                            \

   231   const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                            \

   232   const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);              \

   233   const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);              \

   234   x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                       \

   235   y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                       \

   236   x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                       \

   237   y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                       \

   238   xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                           \

   239   xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                           \

   240   yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                           \

   241 } while (0)

   243 #define ADD_AND_STORE_FOUR_EPI32(M, OUT) do {                                \

   244   uint32 tmp[4];                                                             \

   245   _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M));                    \

   246   (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];                                 \

   247 } while (0)

   249   LOAD_LINE_PAIR(0, W0);

   250   LOAD_LINE_PAIR(1, W1);

   251   LOAD_LINE_PAIR(2, W2);

   252   LOAD_LINE_PAIR(3, W3);

   253   LOAD_LINE_PAIR(4, W2);

   254   LOAD_LINE_PAIR(5, W1);

   255   LOAD_LINE_PAIR(6, W0);

   257   ADD_AND_STORE_FOUR_EPI32(x, xm);

   258   ADD_AND_STORE_FOUR_EPI32(y, ym);

   259   ADD_AND_STORE_FOUR_EPI32(xx, xxm);

   260   ADD_AND_STORE_FOUR_EPI32(xy, xym);

   261   ADD_AND_STORE_FOUR_EPI32(yy, yym);

   263 #undef LOAD_LINE_PAIR

   264 #undef ADD_AND_STORE_FOUR_EPI32

   265 #endif

   267   return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);

   268 }

   270 static int start_max(int x, int y) { return (x > y) ? x : y; }

   272 double CalcSSIM(const uint8 *org, const uint8 *rec,

   273                 const int image_width, const int image_height) {

   274   double SSIM = 0.;

   275   const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;

   276   const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;

   277   const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);

   278   const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);

   279   const int stride = image_width;

   281   for (int j = 0; j < KERNEL_Y; ++j) {

   282     for (int i = 0; i < image_width; ++i) {

   283       SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);

   284     }

   285   }

   287 #ifdef _OPENMP

   288   #pragma omp parallel for reduction(+: SSIM)

   289 #endif

   290   for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {

   291     for (int i = 0; i < KERNEL_X; ++i) {

   292       SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);

   293     }

   294     for (int i = KERNEL_X; i < start_x; ++i) {

   295       SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);

   296     }

   297     if (start_x < image_width) {

   298       // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we

   299       // copy the 8 rightmost pixels on a cache area, and pad this area with

   300       // zeros which won't contribute to the overall SSIM value (but we need

   301       // to pass the correct normalizing constant!). By using this cache, we can

   302       // still call GetSSIMFullKernel() instead of the slower GetSSIM().

   303       // NOTE: we could use similar method for the left-most pixels too.

   304       const int kScratchWidth = 8;

   305       const int kScratchStride = kScratchWidth + KERNEL + 1;

   306       uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };

   307       uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };

   309       for (int k = 0; k < KERNEL_SIZE; ++k) {

   310         const int offset =

   311             (j - KERNEL + k) * stride + image_width - kScratchWidth;

   312         memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);

   313         memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);

   314       }

   315       for (int k = 0;  k <= KERNEL_X + 1; ++k) {

   316         SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,

   317                                   KERNEL + k, KERNEL, kScratchStride, kiW[k]);

   318       }

   319     }

   320   }

   322   for (int j = start_y; j < image_height; ++j) {

   323     for (int i = 0; i < image_width; ++i) {

   324       SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);

   325     }

   326   }

   327   return SSIM;

   328 }

   330 double CalcLSSIM(double ssim) {

   331   return -10.0 * log10(1.0 - ssim);

   332 }

   334 #ifdef __cplusplus

   335 }  // extern "C"

   336 #endif

The Tor Browser / file revision

media/libyuv/util/ssim.cc@b8a032363ba2

media/libyuv/util/ssim.cc