media/libyuv/util/ssim.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libyuv/util/ssim.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,337 @@
     1.4 +/*
     1.5 + *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS. All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include "../util/ssim.h"  // NOLINT
    1.15 +
    1.16 +#include <math.h>
    1.17 +#include <string.h>
    1.18 +
    1.19 +#ifdef __cplusplus
    1.20 +extern "C" {
    1.21 +#endif
    1.22 +
    1.23 +typedef unsigned int uint32;     // NOLINT
    1.24 +typedef unsigned short uint16;   // NOLINT
    1.25 +
    1.26 +#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
    1.27 +  (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
    1.28 +#define __SSE2__
    1.29 +#endif
    1.30 +#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
    1.31 +#include <emmintrin.h>
    1.32 +#endif
    1.33 +
    1.34 +#ifdef _OPENMP
    1.35 +#include <omp.h>
    1.36 +#endif
    1.37 +
    1.38 +// SSIM
    1.39 +enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
    1.40 +
    1.41 +// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
    1.42 +// The maximum value (11 x 11) must be less than 128 to avoid sign
    1.43 +// problems during the calls to _mm_mullo_epi16().
    1.44 +static const int K[KERNEL_SIZE] = {
    1.45 +  1, 3, 7, 11, 7, 3, 1    // ~11 * exp(-0.3 * i * i)
    1.46 +};
    1.47 +static const double kiW[KERNEL + 1 + 1] = {
    1.48 +  1. / 1089.,   // 1 / sum(i:0..6, j..6) K[i]*K[j]
    1.49 +  1. / 1089.,   // 1 / sum(i:0..6, j..6) K[i]*K[j]
    1.50 +  1. / 1056.,   // 1 / sum(i:0..5, j..6) K[i]*K[j]
    1.51 +  1. / 957.,    // 1 / sum(i:0..4, j..6) K[i]*K[j]
    1.52 +  1. / 726.,    // 1 / sum(i:0..3, j..6) K[i]*K[j]
    1.53 +};
    1.54 +
    1.55 +#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
    1.56 +
    1.57 +#define PWEIGHT(A, B)  static_cast<uint16>(K[(A)] * K[(B)])   // weight product
    1.58 +#define MAKE_WEIGHT(L)                                               \
    1.59 +  { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3),  \
    1.60 +        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }
    1.61 +
    1.62 +// We need this union trick to be able to initialize constant static __m128i
    1.63 +// values. We can't call _mm_set_epi16() for static compile-time initialization.
    1.64 +static const struct {
    1.65 +  union {
    1.66 +    uint16 i16_[8];
    1.67 +    __m128i m_;
    1.68 +  } values_;
    1.69 +} W0 = MAKE_WEIGHT(0),
    1.70 +  W1 = MAKE_WEIGHT(1),
    1.71 +  W2 = MAKE_WEIGHT(2),
    1.72 +  W3 = MAKE_WEIGHT(3);
    1.73 +  // ... the rest is symmetric.
    1.74 +#undef MAKE_WEIGHT
    1.75 +#undef PWEIGHT
    1.76 +#endif
    1.77 +
    1.78 +// Common final expression for SSIM, once the weighted sums are known.
    1.79 +static double FinalizeSSIM(double iw, double xm, double ym,
    1.80 +                           double xxm, double xym, double yym) {
    1.81 +  const double iwx = xm * iw;
    1.82 +  const double iwy = ym * iw;
    1.83 +  double sxx = xxm * iw - iwx * iwx;
    1.84 +  double syy = yym * iw - iwy * iwy;
    1.85 +  // small errors are possible, due to rounding. Clamp to zero.
    1.86 +  if (sxx < 0.) sxx = 0.;
    1.87 +  if (syy < 0.) syy = 0.;
    1.88 +  const double sxsy = sqrt(sxx * syy);
    1.89 +  const double sxy = xym * iw - iwx * iwy;
    1.90 +  static const double C11 = (0.01 * 0.01) * (255 * 255);
    1.91 +  static const double C22 = (0.03 * 0.03) * (255 * 255);
    1.92 +  static const double C33 = (0.015 * 0.015) * (255 * 255);
    1.93 +  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
    1.94 +  const double c = (2. * sxsy      + C22) / (sxx + syy + C22);
    1.95 +  const double s = (sxy + C33) / (sxsy + C33);
    1.96 +  return l * c * s;
    1.97 +}
    1.98 +
    1.99 +// GetSSIM() does clipping.  GetSSIMFullKernel() does not
   1.100 +
   1.101 +// TODO(skal): use summed tables?
   1.102 +// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
   1.103 +// with a diff of 255, squared. The maximum error is thus 0x4388241,
   1.104 +// which fits into 32 bits integers.
   1.105 +double GetSSIM(const uint8 *org, const uint8 *rec,
   1.106 +               int xo, int yo, int W, int H, int stride) {
   1.107 +  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
   1.108 +  org += (yo - KERNEL) * stride;
   1.109 +  org += (xo - KERNEL);
   1.110 +  rec += (yo - KERNEL) * stride;
   1.111 +  rec += (xo - KERNEL);
   1.112 +  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
   1.113 +    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
   1.114 +    const int Wy = K[y_];
   1.115 +    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
   1.116 +      const int Wxy = Wy * K[x_];
   1.117 +      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
   1.118 +        const int org_x = org[x_];
   1.119 +        const int rec_x = rec[x_];
   1.120 +        ws += Wxy;
   1.121 +        xm  += Wxy * org_x;
   1.122 +        ym  += Wxy * rec_x;
   1.123 +        xxm += Wxy * org_x * org_x;
   1.124 +        xym += Wxy * org_x * rec_x;
   1.125 +        yym += Wxy * rec_x * rec_x;
   1.126 +      }
   1.127 +    }
   1.128 +  }
   1.129 +  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
   1.130 +}
   1.131 +
   1.132 +double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
   1.133 +                         int xo, int yo, int stride,
   1.134 +                         double area_weight) {
   1.135 +  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
   1.136 +
   1.137 +#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)
   1.138 +
   1.139 +  org += yo * stride + xo;
   1.140 +  rec += yo * stride + xo;
   1.141 +  for (int y = 1; y <= KERNEL; y++) {
   1.142 +    const int dy1 = y * stride;
   1.143 +    const int dy2 = y * stride;
   1.144 +    const int Wy = K[KERNEL + y];
   1.145 +
   1.146 +    for (int x = 1; x <= KERNEL; x++) {
   1.147 +      // Compute the contributions of upper-left (ul), upper-right (ur)
   1.148 +      // lower-left (ll) and lower-right (lr) points (see the diagram below).
   1.149 +      // Symmetric Kernel will have same weight on those points.
   1.150 +      //       -  -  -  -  -  -  -
   1.151 +      //       -  ul -  -  -  ur -
   1.152 +      //       -  -  -  -  -  -  -
   1.153 +      //       -  -  -  0  -  -  -
   1.154 +      //       -  -  -  -  -  -  -
   1.155 +      //       -  ll -  -  -  lr -
   1.156 +      //       -  -  -  -  -  -  -
   1.157 +      const int Wxy = Wy * K[KERNEL + x];
   1.158 +      const int ul1 = org[-dy1 - x];
   1.159 +      const int ur1 = org[-dy1 + x];
   1.160 +      const int ll1 = org[dy1 - x];
   1.161 +      const int lr1 = org[dy1 + x];
   1.162 +
   1.163 +      const int ul2 = rec[-dy2 - x];
   1.164 +      const int ur2 = rec[-dy2 + x];
   1.165 +      const int ll2 = rec[dy2 - x];
   1.166 +      const int lr2 = rec[dy2 + x];
   1.167 +
   1.168 +      xm  += Wxy * (ul1 + ur1 + ll1 + lr1);
   1.169 +      ym  += Wxy * (ul2 + ur2 + ll2 + lr2);
   1.170 +      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
   1.171 +      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
   1.172 +      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
   1.173 +    }
   1.174 +
   1.175 +    // Compute the contributions of up (u), down (d), left (l) and right (r)
   1.176 +    // points across the main axes (see the diagram below).
   1.177 +    // Symmetric Kernel will have same weight on those points.
   1.178 +    //       -  -  -  -  -  -  -
   1.179 +    //       -  -  -  u  -  -  -
   1.180 +    //       -  -  -  -  -  -  -
   1.181 +    //       -  l  -  0  -  r  -
   1.182 +    //       -  -  -  -  -  -  -
   1.183 +    //       -  -  -  d  -  -  -
   1.184 +    //       -  -  -  -  -  -  -
   1.185 +    const int Wxy = Wy * K[KERNEL];
   1.186 +    const int u1 = org[-dy1];
   1.187 +    const int d1 = org[dy1];
   1.188 +    const int l1 = org[-y];
   1.189 +    const int r1 = org[y];
   1.190 +
   1.191 +    const int u2 = rec[-dy2];
   1.192 +    const int d2 = rec[dy2];
   1.193 +    const int l2 = rec[-y];
   1.194 +    const int r2 = rec[y];
   1.195 +
   1.196 +    xm  += Wxy * (u1 + d1 + l1 + r1);
   1.197 +    ym  += Wxy * (u2 + d2 + l2 + r2);
   1.198 +    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
   1.199 +    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
   1.200 +    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
   1.201 +  }
   1.202 +
   1.203 +  // Lastly the contribution of (x0, y0) point.
   1.204 +  const int Wxy = K[KERNEL] * K[KERNEL];
   1.205 +  const int s1 = org[0];
   1.206 +  const int s2 = rec[0];
   1.207 +
   1.208 +  xm  += Wxy * s1;
   1.209 +  ym  += Wxy * s2;
   1.210 +  xxm += Wxy * s1 * s1;
   1.211 +  xym += Wxy * s1 * s2;
   1.212 +  yym += Wxy * s2 * s2;
   1.213 +
   1.214 +#else   // __SSE2__
   1.215 +
   1.216 +  org += (yo - KERNEL) * stride + (xo - KERNEL);
   1.217 +  rec += (yo - KERNEL) * stride + (xo - KERNEL);
   1.218 +
   1.219 +  const __m128i zero = _mm_setzero_si128();
   1.220 +  __m128i x = zero;
   1.221 +  __m128i y = zero;
   1.222 +  __m128i xx = zero;
   1.223 +  __m128i xy = zero;
   1.224 +  __m128i yy = zero;
   1.225 +
   1.226 +// Read 8 pixels at line #L, and convert to 16bit, perform weighting
   1.227 +// and acccumulate.
   1.228 +#define LOAD_LINE_PAIR(L, WEIGHT) do {                                       \
   1.229 +  const __m128i v0 =                                                         \
   1.230 +      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
   1.231 +  const __m128i v1 =                                                         \
   1.232 +      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
   1.233 +  const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                            \
   1.234 +  const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                            \
   1.235 +  const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);              \
   1.236 +  const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);              \
   1.237 +  x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                       \
   1.238 +  y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                       \
   1.239 +  x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                       \
   1.240 +  y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                       \
   1.241 +  xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                           \
   1.242 +  xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                           \
   1.243 +  yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                           \
   1.244 +} while (0)
   1.245 +
   1.246 +#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do {                                \
   1.247 +  uint32 tmp[4];                                                             \
   1.248 +  _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M));                    \
   1.249 +  (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];                                 \
   1.250 +} while (0)
   1.251 +
   1.252 +  LOAD_LINE_PAIR(0, W0);
   1.253 +  LOAD_LINE_PAIR(1, W1);
   1.254 +  LOAD_LINE_PAIR(2, W2);
   1.255 +  LOAD_LINE_PAIR(3, W3);
   1.256 +  LOAD_LINE_PAIR(4, W2);
   1.257 +  LOAD_LINE_PAIR(5, W1);
   1.258 +  LOAD_LINE_PAIR(6, W0);
   1.259 +
   1.260 +  ADD_AND_STORE_FOUR_EPI32(x, xm);
   1.261 +  ADD_AND_STORE_FOUR_EPI32(y, ym);
   1.262 +  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
   1.263 +  ADD_AND_STORE_FOUR_EPI32(xy, xym);
   1.264 +  ADD_AND_STORE_FOUR_EPI32(yy, yym);
   1.265 +
   1.266 +#undef LOAD_LINE_PAIR
   1.267 +#undef ADD_AND_STORE_FOUR_EPI32
   1.268 +#endif
   1.269 +
   1.270 +  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
   1.271 +}
   1.272 +
   1.273 +static int start_max(int x, int y) { return (x > y) ? x : y; }
   1.274 +
   1.275 +double CalcSSIM(const uint8 *org, const uint8 *rec,
   1.276 +                const int image_width, const int image_height) {
   1.277 +  double SSIM = 0.;
   1.278 +  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
   1.279 +  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
   1.280 +  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
   1.281 +  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
   1.282 +  const int stride = image_width;
   1.283 +
   1.284 +  for (int j = 0; j < KERNEL_Y; ++j) {
   1.285 +    for (int i = 0; i < image_width; ++i) {
   1.286 +      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
   1.287 +    }
   1.288 +  }
   1.289 +
   1.290 +#ifdef _OPENMP
   1.291 +  #pragma omp parallel for reduction(+: SSIM)
   1.292 +#endif
   1.293 +  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
   1.294 +    for (int i = 0; i < KERNEL_X; ++i) {
   1.295 +      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
   1.296 +    }
   1.297 +    for (int i = KERNEL_X; i < start_x; ++i) {
   1.298 +      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
   1.299 +    }
   1.300 +    if (start_x < image_width) {
   1.301 +      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
   1.302 +      // copy the 8 rightmost pixels on a cache area, and pad this area with
   1.303 +      // zeros which won't contribute to the overall SSIM value (but we need
   1.304 +      // to pass the correct normalizing constant!). By using this cache, we can
   1.305 +      // still call GetSSIMFullKernel() instead of the slower GetSSIM().
   1.306 +      // NOTE: we could use similar method for the left-most pixels too.
   1.307 +      const int kScratchWidth = 8;
   1.308 +      const int kScratchStride = kScratchWidth + KERNEL + 1;
   1.309 +      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
   1.310 +      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };
   1.311 +
   1.312 +      for (int k = 0; k < KERNEL_SIZE; ++k) {
   1.313 +        const int offset =
   1.314 +            (j - KERNEL + k) * stride + image_width - kScratchWidth;
   1.315 +        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
   1.316 +        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
   1.317 +      }
   1.318 +      for (int k = 0;  k <= KERNEL_X + 1; ++k) {
   1.319 +        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
   1.320 +                                  KERNEL + k, KERNEL, kScratchStride, kiW[k]);
   1.321 +      }
   1.322 +    }
   1.323 +  }
   1.324 +
   1.325 +  for (int j = start_y; j < image_height; ++j) {
   1.326 +    for (int i = 0; i < image_width; ++i) {
   1.327 +      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
   1.328 +    }
   1.329 +  }
   1.330 +  return SSIM;
   1.331 +}
   1.332 +
   1.333 +double CalcLSSIM(double ssim) {
   1.334 +  return -10.0 * log10(1.0 - ssim);
   1.335 +}
   1.336 +
   1.337 +#ifdef __cplusplus
   1.338 +}  // extern "C"
   1.339 +#endif
   1.340 +

mercurial