michael@0: /* michael@0: * Copyright 2013 The LibYuv Project Authors. All rights reserved. michael@0: * michael@0: * Use of this source code is governed by a BSD-style license michael@0: * that can be found in the LICENSE file in the root of the source michael@0: * tree. An additional intellectual property rights grant can be found michael@0: * in the file PATENTS. All contributing project authors may michael@0: * be found in the AUTHORS file in the root of the source tree. michael@0: */ michael@0: michael@0: #include "../util/ssim.h" // NOLINT michael@0: michael@0: #include michael@0: #include michael@0: michael@0: #ifdef __cplusplus michael@0: extern "C" { michael@0: #endif michael@0: michael@0: typedef unsigned int uint32; // NOLINT michael@0: typedef unsigned short uint16; // NOLINT michael@0: michael@0: #if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \ michael@0: (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))) michael@0: #define __SSE2__ michael@0: #endif michael@0: #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) michael@0: #include michael@0: #endif michael@0: michael@0: #ifdef _OPENMP michael@0: #include michael@0: #endif michael@0: michael@0: // SSIM michael@0: enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 }; michael@0: michael@0: // Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i) michael@0: // The maximum value (11 x 11) must be less than 128 to avoid sign michael@0: // problems during the calls to _mm_mullo_epi16(). michael@0: static const int K[KERNEL_SIZE] = { michael@0: 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i) michael@0: }; michael@0: static const double kiW[KERNEL + 1 + 1] = { michael@0: 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] michael@0: 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] michael@0: 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j] michael@0: 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j] michael@0: 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j] michael@0: }; michael@0: michael@0: #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) michael@0: michael@0: #define PWEIGHT(A, B) static_cast(K[(A)] * K[(B)]) // weight product michael@0: #define MAKE_WEIGHT(L) \ michael@0: { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \ michael@0: PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } } michael@0: michael@0: // We need this union trick to be able to initialize constant static __m128i michael@0: // values. We can't call _mm_set_epi16() for static compile-time initialization. michael@0: static const struct { michael@0: union { michael@0: uint16 i16_[8]; michael@0: __m128i m_; michael@0: } values_; michael@0: } W0 = MAKE_WEIGHT(0), michael@0: W1 = MAKE_WEIGHT(1), michael@0: W2 = MAKE_WEIGHT(2), michael@0: W3 = MAKE_WEIGHT(3); michael@0: // ... the rest is symmetric. michael@0: #undef MAKE_WEIGHT michael@0: #undef PWEIGHT michael@0: #endif michael@0: michael@0: // Common final expression for SSIM, once the weighted sums are known. michael@0: static double FinalizeSSIM(double iw, double xm, double ym, michael@0: double xxm, double xym, double yym) { michael@0: const double iwx = xm * iw; michael@0: const double iwy = ym * iw; michael@0: double sxx = xxm * iw - iwx * iwx; michael@0: double syy = yym * iw - iwy * iwy; michael@0: // small errors are possible, due to rounding. Clamp to zero. michael@0: if (sxx < 0.) sxx = 0.; michael@0: if (syy < 0.) syy = 0.; michael@0: const double sxsy = sqrt(sxx * syy); michael@0: const double sxy = xym * iw - iwx * iwy; michael@0: static const double C11 = (0.01 * 0.01) * (255 * 255); michael@0: static const double C22 = (0.03 * 0.03) * (255 * 255); michael@0: static const double C33 = (0.015 * 0.015) * (255 * 255); michael@0: const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); michael@0: const double c = (2. * sxsy + C22) / (sxx + syy + C22); michael@0: const double s = (sxy + C33) / (sxsy + C33); michael@0: return l * c * s; michael@0: } michael@0: michael@0: // GetSSIM() does clipping. GetSSIMFullKernel() does not michael@0: michael@0: // TODO(skal): use summed tables? michael@0: // Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1) michael@0: // with a diff of 255, squared. The maximum error is thus 0x4388241, michael@0: // which fits into 32 bits integers. michael@0: double GetSSIM(const uint8 *org, const uint8 *rec, michael@0: int xo, int yo, int W, int H, int stride) { michael@0: uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; michael@0: org += (yo - KERNEL) * stride; michael@0: org += (xo - KERNEL); michael@0: rec += (yo - KERNEL) * stride; michael@0: rec += (xo - KERNEL); michael@0: for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) { michael@0: if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue; michael@0: const int Wy = K[y_]; michael@0: for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) { michael@0: const int Wxy = Wy * K[x_]; michael@0: if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) { michael@0: const int org_x = org[x_]; michael@0: const int rec_x = rec[x_]; michael@0: ws += Wxy; michael@0: xm += Wxy * org_x; michael@0: ym += Wxy * rec_x; michael@0: xxm += Wxy * org_x * org_x; michael@0: xym += Wxy * org_x * rec_x; michael@0: yym += Wxy * rec_x * rec_x; michael@0: } michael@0: } michael@0: } michael@0: return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym); michael@0: } michael@0: michael@0: double GetSSIMFullKernel(const uint8 *org, const uint8 *rec, michael@0: int xo, int yo, int stride, michael@0: double area_weight) { michael@0: uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; michael@0: michael@0: #if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__) michael@0: michael@0: org += yo * stride + xo; michael@0: rec += yo * stride + xo; michael@0: for (int y = 1; y <= KERNEL; y++) { michael@0: const int dy1 = y * stride; michael@0: const int dy2 = y * stride; michael@0: const int Wy = K[KERNEL + y]; michael@0: michael@0: for (int x = 1; x <= KERNEL; x++) { michael@0: // Compute the contributions of upper-left (ul), upper-right (ur) michael@0: // lower-left (ll) and lower-right (lr) points (see the diagram below). michael@0: // Symmetric Kernel will have same weight on those points. michael@0: // - - - - - - - michael@0: // - ul - - - ur - michael@0: // - - - - - - - michael@0: // - - - 0 - - - michael@0: // - - - - - - - michael@0: // - ll - - - lr - michael@0: // - - - - - - - michael@0: const int Wxy = Wy * K[KERNEL + x]; michael@0: const int ul1 = org[-dy1 - x]; michael@0: const int ur1 = org[-dy1 + x]; michael@0: const int ll1 = org[dy1 - x]; michael@0: const int lr1 = org[dy1 + x]; michael@0: michael@0: const int ul2 = rec[-dy2 - x]; michael@0: const int ur2 = rec[-dy2 + x]; michael@0: const int ll2 = rec[dy2 - x]; michael@0: const int lr2 = rec[dy2 + x]; michael@0: michael@0: xm += Wxy * (ul1 + ur1 + ll1 + lr1); michael@0: ym += Wxy * (ul2 + ur2 + ll2 + lr2); michael@0: xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1); michael@0: xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2); michael@0: yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2); michael@0: } michael@0: michael@0: // Compute the contributions of up (u), down (d), left (l) and right (r) michael@0: // points across the main axes (see the diagram below). michael@0: // Symmetric Kernel will have same weight on those points. michael@0: // - - - - - - - michael@0: // - - - u - - - michael@0: // - - - - - - - michael@0: // - l - 0 - r - michael@0: // - - - - - - - michael@0: // - - - d - - - michael@0: // - - - - - - - michael@0: const int Wxy = Wy * K[KERNEL]; michael@0: const int u1 = org[-dy1]; michael@0: const int d1 = org[dy1]; michael@0: const int l1 = org[-y]; michael@0: const int r1 = org[y]; michael@0: michael@0: const int u2 = rec[-dy2]; michael@0: const int d2 = rec[dy2]; michael@0: const int l2 = rec[-y]; michael@0: const int r2 = rec[y]; michael@0: michael@0: xm += Wxy * (u1 + d1 + l1 + r1); michael@0: ym += Wxy * (u2 + d2 + l2 + r2); michael@0: xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1); michael@0: xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2); michael@0: yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2); michael@0: } michael@0: michael@0: // Lastly the contribution of (x0, y0) point. michael@0: const int Wxy = K[KERNEL] * K[KERNEL]; michael@0: const int s1 = org[0]; michael@0: const int s2 = rec[0]; michael@0: michael@0: xm += Wxy * s1; michael@0: ym += Wxy * s2; michael@0: xxm += Wxy * s1 * s1; michael@0: xym += Wxy * s1 * s2; michael@0: yym += Wxy * s2 * s2; michael@0: michael@0: #else // __SSE2__ michael@0: michael@0: org += (yo - KERNEL) * stride + (xo - KERNEL); michael@0: rec += (yo - KERNEL) * stride + (xo - KERNEL); michael@0: michael@0: const __m128i zero = _mm_setzero_si128(); michael@0: __m128i x = zero; michael@0: __m128i y = zero; michael@0: __m128i xx = zero; michael@0: __m128i xy = zero; michael@0: __m128i yy = zero; michael@0: michael@0: // Read 8 pixels at line #L, and convert to 16bit, perform weighting michael@0: // and acccumulate. michael@0: #define LOAD_LINE_PAIR(L, WEIGHT) do { \ michael@0: const __m128i v0 = \ michael@0: _mm_loadl_epi64(reinterpret_cast(org + (L) * stride)); \ michael@0: const __m128i v1 = \ michael@0: _mm_loadl_epi64(reinterpret_cast(rec + (L) * stride)); \ michael@0: const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ michael@0: const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ michael@0: const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ michael@0: const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ michael@0: x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ michael@0: y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ michael@0: x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ michael@0: y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ michael@0: xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ michael@0: xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ michael@0: yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ michael@0: } while (0) michael@0: michael@0: #define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \ michael@0: uint32 tmp[4]; \ michael@0: _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \ michael@0: (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \ michael@0: } while (0) michael@0: michael@0: LOAD_LINE_PAIR(0, W0); michael@0: LOAD_LINE_PAIR(1, W1); michael@0: LOAD_LINE_PAIR(2, W2); michael@0: LOAD_LINE_PAIR(3, W3); michael@0: LOAD_LINE_PAIR(4, W2); michael@0: LOAD_LINE_PAIR(5, W1); michael@0: LOAD_LINE_PAIR(6, W0); michael@0: michael@0: ADD_AND_STORE_FOUR_EPI32(x, xm); michael@0: ADD_AND_STORE_FOUR_EPI32(y, ym); michael@0: ADD_AND_STORE_FOUR_EPI32(xx, xxm); michael@0: ADD_AND_STORE_FOUR_EPI32(xy, xym); michael@0: ADD_AND_STORE_FOUR_EPI32(yy, yym); michael@0: michael@0: #undef LOAD_LINE_PAIR michael@0: #undef ADD_AND_STORE_FOUR_EPI32 michael@0: #endif michael@0: michael@0: return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym); michael@0: } michael@0: michael@0: static int start_max(int x, int y) { return (x > y) ? x : y; } michael@0: michael@0: double CalcSSIM(const uint8 *org, const uint8 *rec, michael@0: const int image_width, const int image_height) { michael@0: double SSIM = 0.; michael@0: const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL; michael@0: const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL; michael@0: const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X); michael@0: const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y); michael@0: const int stride = image_width; michael@0: michael@0: for (int j = 0; j < KERNEL_Y; ++j) { michael@0: for (int i = 0; i < image_width; ++i) { michael@0: SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); michael@0: } michael@0: } michael@0: michael@0: #ifdef _OPENMP michael@0: #pragma omp parallel for reduction(+: SSIM) michael@0: #endif michael@0: for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) { michael@0: for (int i = 0; i < KERNEL_X; ++i) { michael@0: SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); michael@0: } michael@0: for (int i = KERNEL_X; i < start_x; ++i) { michael@0: SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]); michael@0: } michael@0: if (start_x < image_width) { michael@0: // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we michael@0: // copy the 8 rightmost pixels on a cache area, and pad this area with michael@0: // zeros which won't contribute to the overall SSIM value (but we need michael@0: // to pass the correct normalizing constant!). By using this cache, we can michael@0: // still call GetSSIMFullKernel() instead of the slower GetSSIM(). michael@0: // NOTE: we could use similar method for the left-most pixels too. michael@0: const int kScratchWidth = 8; michael@0: const int kScratchStride = kScratchWidth + KERNEL + 1; michael@0: uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 }; michael@0: uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 }; michael@0: michael@0: for (int k = 0; k < KERNEL_SIZE; ++k) { michael@0: const int offset = michael@0: (j - KERNEL + k) * stride + image_width - kScratchWidth; michael@0: memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth); michael@0: memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth); michael@0: } michael@0: for (int k = 0; k <= KERNEL_X + 1; ++k) { michael@0: SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, michael@0: KERNEL + k, KERNEL, kScratchStride, kiW[k]); michael@0: } michael@0: } michael@0: } michael@0: michael@0: for (int j = start_y; j < image_height; ++j) { michael@0: for (int i = 0; i < image_width; ++i) { michael@0: SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); michael@0: } michael@0: } michael@0: return SSIM; michael@0: } michael@0: michael@0: double CalcLSSIM(double ssim) { michael@0: return -10.0 * log10(1.0 - ssim); michael@0: } michael@0: michael@0: #ifdef __cplusplus michael@0: } // extern "C" michael@0: #endif michael@0: