1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libyuv/util/ssim.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,337 @@ 1.4 +/* 1.5 + * Copyright 2013 The LibYuv Project Authors. All rights reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 +#include "../util/ssim.h" // NOLINT 1.15 + 1.16 +#include <math.h> 1.17 +#include <string.h> 1.18 + 1.19 +#ifdef __cplusplus 1.20 +extern "C" { 1.21 +#endif 1.22 + 1.23 +typedef unsigned int uint32; // NOLINT 1.24 +typedef unsigned short uint16; // NOLINT 1.25 + 1.26 +#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \ 1.27 + (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))) 1.28 +#define __SSE2__ 1.29 +#endif 1.30 +#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) 1.31 +#include <emmintrin.h> 1.32 +#endif 1.33 + 1.34 +#ifdef _OPENMP 1.35 +#include <omp.h> 1.36 +#endif 1.37 + 1.38 +// SSIM 1.39 +enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 }; 1.40 + 1.41 +// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i) 1.42 +// The maximum value (11 x 11) must be less than 128 to avoid sign 1.43 +// problems during the calls to _mm_mullo_epi16(). 1.44 +static const int K[KERNEL_SIZE] = { 1.45 + 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i) 1.46 +}; 1.47 +static const double kiW[KERNEL + 1 + 1] = { 1.48 + 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] 1.49 + 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] 1.50 + 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j] 1.51 + 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j] 1.52 + 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j] 1.53 +}; 1.54 + 1.55 +#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) 1.56 + 1.57 +#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product 1.58 +#define MAKE_WEIGHT(L) \ 1.59 + { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \ 1.60 + PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } } 1.61 + 1.62 +// We need this union trick to be able to initialize constant static __m128i 1.63 +// values. We can't call _mm_set_epi16() for static compile-time initialization. 1.64 +static const struct { 1.65 + union { 1.66 + uint16 i16_[8]; 1.67 + __m128i m_; 1.68 + } values_; 1.69 +} W0 = MAKE_WEIGHT(0), 1.70 + W1 = MAKE_WEIGHT(1), 1.71 + W2 = MAKE_WEIGHT(2), 1.72 + W3 = MAKE_WEIGHT(3); 1.73 + // ... the rest is symmetric. 1.74 +#undef MAKE_WEIGHT 1.75 +#undef PWEIGHT 1.76 +#endif 1.77 + 1.78 +// Common final expression for SSIM, once the weighted sums are known. 1.79 +static double FinalizeSSIM(double iw, double xm, double ym, 1.80 + double xxm, double xym, double yym) { 1.81 + const double iwx = xm * iw; 1.82 + const double iwy = ym * iw; 1.83 + double sxx = xxm * iw - iwx * iwx; 1.84 + double syy = yym * iw - iwy * iwy; 1.85 + // small errors are possible, due to rounding. Clamp to zero. 1.86 + if (sxx < 0.) sxx = 0.; 1.87 + if (syy < 0.) syy = 0.; 1.88 + const double sxsy = sqrt(sxx * syy); 1.89 + const double sxy = xym * iw - iwx * iwy; 1.90 + static const double C11 = (0.01 * 0.01) * (255 * 255); 1.91 + static const double C22 = (0.03 * 0.03) * (255 * 255); 1.92 + static const double C33 = (0.015 * 0.015) * (255 * 255); 1.93 + const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); 1.94 + const double c = (2. * sxsy + C22) / (sxx + syy + C22); 1.95 + const double s = (sxy + C33) / (sxsy + C33); 1.96 + return l * c * s; 1.97 +} 1.98 + 1.99 +// GetSSIM() does clipping. GetSSIMFullKernel() does not 1.100 + 1.101 +// TODO(skal): use summed tables? 1.102 +// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1) 1.103 +// with a diff of 255, squared. The maximum error is thus 0x4388241, 1.104 +// which fits into 32 bits integers. 1.105 +double GetSSIM(const uint8 *org, const uint8 *rec, 1.106 + int xo, int yo, int W, int H, int stride) { 1.107 + uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; 1.108 + org += (yo - KERNEL) * stride; 1.109 + org += (xo - KERNEL); 1.110 + rec += (yo - KERNEL) * stride; 1.111 + rec += (xo - KERNEL); 1.112 + for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) { 1.113 + if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue; 1.114 + const int Wy = K[y_]; 1.115 + for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) { 1.116 + const int Wxy = Wy * K[x_]; 1.117 + if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) { 1.118 + const int org_x = org[x_]; 1.119 + const int rec_x = rec[x_]; 1.120 + ws += Wxy; 1.121 + xm += Wxy * org_x; 1.122 + ym += Wxy * rec_x; 1.123 + xxm += Wxy * org_x * org_x; 1.124 + xym += Wxy * org_x * rec_x; 1.125 + yym += Wxy * rec_x * rec_x; 1.126 + } 1.127 + } 1.128 + } 1.129 + return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym); 1.130 +} 1.131 + 1.132 +double GetSSIMFullKernel(const uint8 *org, const uint8 *rec, 1.133 + int xo, int yo, int stride, 1.134 + double area_weight) { 1.135 + uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; 1.136 + 1.137 +#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__) 1.138 + 1.139 + org += yo * stride + xo; 1.140 + rec += yo * stride + xo; 1.141 + for (int y = 1; y <= KERNEL; y++) { 1.142 + const int dy1 = y * stride; 1.143 + const int dy2 = y * stride; 1.144 + const int Wy = K[KERNEL + y]; 1.145 + 1.146 + for (int x = 1; x <= KERNEL; x++) { 1.147 + // Compute the contributions of upper-left (ul), upper-right (ur) 1.148 + // lower-left (ll) and lower-right (lr) points (see the diagram below). 1.149 + // Symmetric Kernel will have same weight on those points. 1.150 + // - - - - - - - 1.151 + // - ul - - - ur - 1.152 + // - - - - - - - 1.153 + // - - - 0 - - - 1.154 + // - - - - - - - 1.155 + // - ll - - - lr - 1.156 + // - - - - - - - 1.157 + const int Wxy = Wy * K[KERNEL + x]; 1.158 + const int ul1 = org[-dy1 - x]; 1.159 + const int ur1 = org[-dy1 + x]; 1.160 + const int ll1 = org[dy1 - x]; 1.161 + const int lr1 = org[dy1 + x]; 1.162 + 1.163 + const int ul2 = rec[-dy2 - x]; 1.164 + const int ur2 = rec[-dy2 + x]; 1.165 + const int ll2 = rec[dy2 - x]; 1.166 + const int lr2 = rec[dy2 + x]; 1.167 + 1.168 + xm += Wxy * (ul1 + ur1 + ll1 + lr1); 1.169 + ym += Wxy * (ul2 + ur2 + ll2 + lr2); 1.170 + xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1); 1.171 + xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2); 1.172 + yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2); 1.173 + } 1.174 + 1.175 + // Compute the contributions of up (u), down (d), left (l) and right (r) 1.176 + // points across the main axes (see the diagram below). 1.177 + // Symmetric Kernel will have same weight on those points. 1.178 + // - - - - - - - 1.179 + // - - - u - - - 1.180 + // - - - - - - - 1.181 + // - l - 0 - r - 1.182 + // - - - - - - - 1.183 + // - - - d - - - 1.184 + // - - - - - - - 1.185 + const int Wxy = Wy * K[KERNEL]; 1.186 + const int u1 = org[-dy1]; 1.187 + const int d1 = org[dy1]; 1.188 + const int l1 = org[-y]; 1.189 + const int r1 = org[y]; 1.190 + 1.191 + const int u2 = rec[-dy2]; 1.192 + const int d2 = rec[dy2]; 1.193 + const int l2 = rec[-y]; 1.194 + const int r2 = rec[y]; 1.195 + 1.196 + xm += Wxy * (u1 + d1 + l1 + r1); 1.197 + ym += Wxy * (u2 + d2 + l2 + r2); 1.198 + xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1); 1.199 + xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2); 1.200 + yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2); 1.201 + } 1.202 + 1.203 + // Lastly the contribution of (x0, y0) point. 1.204 + const int Wxy = K[KERNEL] * K[KERNEL]; 1.205 + const int s1 = org[0]; 1.206 + const int s2 = rec[0]; 1.207 + 1.208 + xm += Wxy * s1; 1.209 + ym += Wxy * s2; 1.210 + xxm += Wxy * s1 * s1; 1.211 + xym += Wxy * s1 * s2; 1.212 + yym += Wxy * s2 * s2; 1.213 + 1.214 +#else // __SSE2__ 1.215 + 1.216 + org += (yo - KERNEL) * stride + (xo - KERNEL); 1.217 + rec += (yo - KERNEL) * stride + (xo - KERNEL); 1.218 + 1.219 + const __m128i zero = _mm_setzero_si128(); 1.220 + __m128i x = zero; 1.221 + __m128i y = zero; 1.222 + __m128i xx = zero; 1.223 + __m128i xy = zero; 1.224 + __m128i yy = zero; 1.225 + 1.226 +// Read 8 pixels at line #L, and convert to 16bit, perform weighting 1.227 +// and acccumulate. 1.228 +#define LOAD_LINE_PAIR(L, WEIGHT) do { \ 1.229 + const __m128i v0 = \ 1.230 + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \ 1.231 + const __m128i v1 = \ 1.232 + _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \ 1.233 + const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ 1.234 + const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ 1.235 + const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ 1.236 + const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ 1.237 + x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ 1.238 + y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ 1.239 + x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ 1.240 + y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ 1.241 + xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ 1.242 + xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ 1.243 + yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ 1.244 +} while (0) 1.245 + 1.246 +#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \ 1.247 + uint32 tmp[4]; \ 1.248 + _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \ 1.249 + (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \ 1.250 +} while (0) 1.251 + 1.252 + LOAD_LINE_PAIR(0, W0); 1.253 + LOAD_LINE_PAIR(1, W1); 1.254 + LOAD_LINE_PAIR(2, W2); 1.255 + LOAD_LINE_PAIR(3, W3); 1.256 + LOAD_LINE_PAIR(4, W2); 1.257 + LOAD_LINE_PAIR(5, W1); 1.258 + LOAD_LINE_PAIR(6, W0); 1.259 + 1.260 + ADD_AND_STORE_FOUR_EPI32(x, xm); 1.261 + ADD_AND_STORE_FOUR_EPI32(y, ym); 1.262 + ADD_AND_STORE_FOUR_EPI32(xx, xxm); 1.263 + ADD_AND_STORE_FOUR_EPI32(xy, xym); 1.264 + ADD_AND_STORE_FOUR_EPI32(yy, yym); 1.265 + 1.266 +#undef LOAD_LINE_PAIR 1.267 +#undef ADD_AND_STORE_FOUR_EPI32 1.268 +#endif 1.269 + 1.270 + return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym); 1.271 +} 1.272 + 1.273 +static int start_max(int x, int y) { return (x > y) ? x : y; } 1.274 + 1.275 +double CalcSSIM(const uint8 *org, const uint8 *rec, 1.276 + const int image_width, const int image_height) { 1.277 + double SSIM = 0.; 1.278 + const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL; 1.279 + const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL; 1.280 + const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X); 1.281 + const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y); 1.282 + const int stride = image_width; 1.283 + 1.284 + for (int j = 0; j < KERNEL_Y; ++j) { 1.285 + for (int i = 0; i < image_width; ++i) { 1.286 + SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); 1.287 + } 1.288 + } 1.289 + 1.290 +#ifdef _OPENMP 1.291 + #pragma omp parallel for reduction(+: SSIM) 1.292 +#endif 1.293 + for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) { 1.294 + for (int i = 0; i < KERNEL_X; ++i) { 1.295 + SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); 1.296 + } 1.297 + for (int i = KERNEL_X; i < start_x; ++i) { 1.298 + SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]); 1.299 + } 1.300 + if (start_x < image_width) { 1.301 + // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we 1.302 + // copy the 8 rightmost pixels on a cache area, and pad this area with 1.303 + // zeros which won't contribute to the overall SSIM value (but we need 1.304 + // to pass the correct normalizing constant!). By using this cache, we can 1.305 + // still call GetSSIMFullKernel() instead of the slower GetSSIM(). 1.306 + // NOTE: we could use similar method for the left-most pixels too. 1.307 + const int kScratchWidth = 8; 1.308 + const int kScratchStride = kScratchWidth + KERNEL + 1; 1.309 + uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 }; 1.310 + uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 }; 1.311 + 1.312 + for (int k = 0; k < KERNEL_SIZE; ++k) { 1.313 + const int offset = 1.314 + (j - KERNEL + k) * stride + image_width - kScratchWidth; 1.315 + memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth); 1.316 + memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth); 1.317 + } 1.318 + for (int k = 0; k <= KERNEL_X + 1; ++k) { 1.319 + SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, 1.320 + KERNEL + k, KERNEL, kScratchStride, kiW[k]); 1.321 + } 1.322 + } 1.323 + } 1.324 + 1.325 + for (int j = start_y; j < image_height; ++j) { 1.326 + for (int i = 0; i < image_width; ++i) { 1.327 + SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); 1.328 + } 1.329 + } 1.330 + return SSIM; 1.331 +} 1.332 + 1.333 +double CalcLSSIM(double ssim) { 1.334 + return -10.0 * log10(1.0 - ssim); 1.335 +} 1.336 + 1.337 +#ifdef __cplusplus 1.338 +} // extern "C" 1.339 +#endif 1.340 +