media/libyuv/util/ssim.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

michael@0 1 /*
michael@0 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
michael@0 3 *
michael@0 4 * Use of this source code is governed by a BSD-style license
michael@0 5 * that can be found in the LICENSE file in the root of the source
michael@0 6 * tree. An additional intellectual property rights grant can be found
michael@0 7 * in the file PATENTS. All contributing project authors may
michael@0 8 * be found in the AUTHORS file in the root of the source tree.
michael@0 9 */
michael@0 10
michael@0 11 #include "../util/ssim.h" // NOLINT
michael@0 12
michael@0 13 #include <math.h>
michael@0 14 #include <string.h>
michael@0 15
michael@0 16 #ifdef __cplusplus
michael@0 17 extern "C" {
michael@0 18 #endif
michael@0 19
michael@0 20 typedef unsigned int uint32; // NOLINT
michael@0 21 typedef unsigned short uint16; // NOLINT
michael@0 22
michael@0 23 #if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
michael@0 24 (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
michael@0 25 #define __SSE2__
michael@0 26 #endif
michael@0 27 #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
michael@0 28 #include <emmintrin.h>
michael@0 29 #endif
michael@0 30
michael@0 31 #ifdef _OPENMP
michael@0 32 #include <omp.h>
michael@0 33 #endif
michael@0 34
// SSIM
// Kernel radius (KERNEL) and diameter (KERNEL_SIZE): a 7x7 window is
// evaluated around each sampled position.
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };

// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
  1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
};
// Inverse total kernel weights used to normalize the weighted sums.
// kiW[0] is the full, unclipped window (sum(K)^2 == 33^2 == 1089);
// kiW[k] for k >= 1 is used for right-edge windows whose rightmost
// max(0, k - 1) columns fall into the zero padding (see CalcSSIM()).
static const double kiW[KERNEL + 1 + 1] = {
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1056.,  // 1 / sum(i:0..5, j:0..6) K[i]*K[j]
    1. / 957.,   // 1 / sum(i:0..4, j:0..6) K[i]*K[j]
    1. / 726.,   // 1 / sum(i:0..3, j:0..6) K[i]*K[j]
};
michael@0 51
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)])  // weight product
#define MAKE_WEIGHT(L) \
  { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \
        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
// Each Wn vector holds the eight 16-bit per-pixel weights of kernel row n
// (row weight K[n] times each column weight, padded with a trailing 0 lane).
static const struct {
  union {
    uint16 i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0),
  W1 = MAKE_WEIGHT(1),
  W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
// ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif
michael@0 74
// Common final expression for SSIM once the weighted sums are known.
// iw is the inverse of the total kernel weight; xm/ym are the weighted sums
// of the two signals, xxm/yym their weighted sums of squares, and xym the
// weighted cross sum. Returns luminance * contrast * structure.
static double FinalizeSSIM(double iw, double xm, double ym,
                           double xxm, double xym, double yym) {
  const double mean_x = xm * iw;
  const double mean_y = ym * iw;
  double var_x = xxm * iw - mean_x * mean_x;
  double var_y = yym * iw - mean_y * mean_y;
  // Rounding can make the variance estimates slightly negative; clamp to 0.
  if (var_x < 0.) {
    var_x = 0.;
  }
  if (var_y < 0.) {
    var_y = 0.;
  }
  const double sigma_prod = sqrt(var_x * var_y);
  const double cov_xy = xym * iw - mean_x * mean_y;
  // Stabilizing constants for 8-bit data (dynamic range 255), following the
  // usual SSIM formulation; the third constant is C2 / 4 (i.e. C3 = C2 / 2
  // with the factor 2 folded out of the structure term).
  static const double C11 = (0.01 * 0.01) * (255 * 255);
  static const double C22 = (0.03 * 0.03) * (255 * 255);
  static const double C33 = (0.015 * 0.015) * (255 * 255);
  const double luma = (2. * mean_x * mean_y + C11) /
                      (mean_x * mean_x + mean_y * mean_y + C11);
  const double contrast = (2. * sigma_prod + C22) / (var_x + var_y + C22);
  const double structure = (cov_xy + C33) / (sigma_prod + C33);
  return luma * contrast * structure;
}
michael@0 95
michael@0 96 // GetSSIM() does clipping. GetSSIMFullKernel() does not
michael@0 97
michael@0 98 // TODO(skal): use summed tables?
michael@0 99 // Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
michael@0 100 // with a diff of 255, squared. The maximum error is thus 0x4388241,
michael@0 101 // which fits into 32 bits integers.
michael@0 102 double GetSSIM(const uint8 *org, const uint8 *rec,
michael@0 103 int xo, int yo, int W, int H, int stride) {
michael@0 104 uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
michael@0 105 org += (yo - KERNEL) * stride;
michael@0 106 org += (xo - KERNEL);
michael@0 107 rec += (yo - KERNEL) * stride;
michael@0 108 rec += (xo - KERNEL);
michael@0 109 for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
michael@0 110 if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
michael@0 111 const int Wy = K[y_];
michael@0 112 for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
michael@0 113 const int Wxy = Wy * K[x_];
michael@0 114 if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
michael@0 115 const int org_x = org[x_];
michael@0 116 const int rec_x = rec[x_];
michael@0 117 ws += Wxy;
michael@0 118 xm += Wxy * org_x;
michael@0 119 ym += Wxy * rec_x;
michael@0 120 xxm += Wxy * org_x * org_x;
michael@0 121 xym += Wxy * org_x * rec_x;
michael@0 122 yym += Wxy * rec_x * rec_x;
michael@0 123 }
michael@0 124 }
michael@0 125 }
michael@0 126 return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
michael@0 127 }
michael@0 128
// Unclipped SSIM accumulation over the full 7x7 kernel centered at (xo, yo).
// The caller guarantees all reads are in-bounds (or point into the padded
// scratch buffers built by CalcSSIM()) and passes the matching inverse
// weight sum 'area_weight' (one of the kiW[] constants) for normalization.
double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
                         int xo, int yo, int stride,
                         double area_weight) {
  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  // Plain C path. The kernel is symmetric, so the points mirrored around
  // the center share a weight and are accumulated together.
  org += yo * stride + xo;
  rec += yo * stride + xo;
  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;
    // NOTE(review): dy2 is identical to dy1 since both images share
    // 'stride'; presumably kept separate to mirror a two-stride variant.
    const int dy2 = y * stride;
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of upper-left (ul), upper-right (ur)
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // Symmetric Kernel will have same weight on those points.
      // - - - - - - -
      // - ul - - - ur -
      // - - - - - - -
      // - - - 0 - - -
      // - - - - - - -
      // - ll - - - lr -
      // - - - - - - -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];

      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of up (u), down (d), left (l) and right (r)
    // points across the main axes (see the diagram below).
    // Symmetric Kernel will have same weight on those points.
    // - - - - - - -
    // - - - u - - -
    // - - - - - - -
    // - l - 0 - r -
    // - - - - - - -
    // - - - d - - -
    // - - - - - - -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];

    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm += Wxy * (u1 + d1 + l1 + r1);
    ym += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Lastly the contribution of (x0, y0) point.
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];

  xm += Wxy * s1;
  ym += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;

#else  // __SSE2__

  // SIMD path: process each of the 7 kernel rows as one 8-pixel load,
  // weighted by the precomputed per-row vectors W0..W3.
  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

  // Read 8 pixels at line #L, and convert to 16bit, perform weighting
  // and acccumulate.
#define LOAD_LINE_PAIR(L, WEIGHT) do { \
    const __m128i v0 = \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
    const __m128i v1 = \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
    const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \
    const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \
    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \
    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \
    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \
    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \
    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \
    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \
    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \
    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \
    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \
  } while (0)

  // Horizontal-sum the four 32-bit lanes of M into the scalar OUT.
#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \
    uint32 tmp[4]; \
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
    (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
  } while (0)

  // Row weights are symmetric around the middle row (W3).
  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}
michael@0 269
michael@0 270 static int start_max(int x, int y) { return (x > y) ? x : y; }
michael@0 271
// Computes the aggregate (summed, not averaged) SSIM over every pixel of
// two same-sized 8-bit images. Interior pixels use the fast unclipped
// GetSSIMFullKernel(); border rows/columns fall back to the clipping
// GetSSIM(). Both images are assumed tightly packed (stride == width).
double CalcSSIM(const uint8 *org, const uint8 *rec,
                const int image_width, const int image_height) {
  double SSIM = 0.;
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
  // start_x: first column whose window could read past the 8-pixel SSE2
  // load; start_y: first row of the bottom border.
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
  const int stride = image_width;

  // Top border rows: clipped kernel.
  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
#pragma omp parallel for reduction(+: SSIM)
#endif
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    // Left border columns: clipped kernel.
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    // Interior: full kernel with the standard normalizer kiW[0].
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
      // copy the 8 rightmost pixels on a cache area, and pad this area with
      // zeros which won't contribute to the overall SSIM value (but we need
      // to pass the correct normalizing constant!). By using this cache, we can
      // still call GetSSIMFullKernel() instead of the slower GetSSIM().
      // NOTE: we could use similar method for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };

      // Copy the KERNEL_SIZE rows surrounding row j into the scratch area.
      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }
      // Right border: windows progressively overlap the zero padding;
      // kiW[k] supplies the matching normalization for each overlap amount.
      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
                                  KERNEL + k, KERNEL, kScratchStride, kiW[k]);
      }
    }
  }

  // Bottom border rows: clipped kernel.
  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}
michael@0 329
// Maps an SSIM value onto a logarithmic (decibel-style) scale; a perfect
// score (ssim == 1.0) maps to +infinity.
double CalcLSSIM(double ssim) {
  const double dissimilarity = 1.0 - ssim;
  return -10.0 * log10(dissimilarity);
}
michael@0 333
michael@0 334 #ifdef __cplusplus
michael@0 335 } // extern "C"
michael@0 336 #endif
michael@0 337

mercurial