/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "../util/ssim.h"  // NOLINT

#include <math.h>
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;    // NOLINT
typedef unsigned short uint16;  // NOLINT

#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
    (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define __SSE2__
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
#include <emmintrin.h>
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

// SSIM
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
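// KERNEL = 3 gives a centered 7x7 window (KERNEL_SIZE = 7): the kernel
// reaches KERNEL pixels on each side of the center pixel.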

// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
  1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
};
static const double kiW[KERNEL + 1 + 1] = {
  1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
  1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
  1. / 1056.,  // 1 / sum(i:0..5, j:0..6) K[i]*K[j]
  1. / 957.,   // 1 / sum(i:0..4, j:0..6) K[i]*K[j]
  1. / 726.,   // 1 / sum(i:0..3, j:0..6) K[i]*K[j]
};
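
// Sanity check for the constants above: sum(K) = 1 + 3 + 7 + 11 + 7 + 3 + 1
// = 33, so the full 2-D weight is 33 * 33 = 1089. Dropping the outermost
// column removes 33 * 1 (-> 1056), the outer two remove 33 * (1 + 3)
// (-> 957), and the outer three remove 33 * (1 + 3 + 7) (-> 726); these are
// the partial-kernel normalizers used at the right border in CalcSSIM().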

#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)])  // weight product
#define MAKE_WEIGHT(L) \
  { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \
        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
static const struct {
  union {
    uint16 i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0),
  W1 = MAKE_WEIGHT(1),
  W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
// ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif
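
// For illustration, W0 holds the row of products K[0] * K[j], i.e.
// {1, 3, 7, 11, 7, 3, 1, 0} (the trailing 0 pads the unused 8th lane), and
// W3 holds K[3] * K[j] = {11, 33, 77, 121, 77, 33, 11, 0}. Kernel rows 4..6
// reuse W2..W0 by symmetry.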

// Common final expression for SSIM, once the weighted sums are known.
static double FinalizeSSIM(double iw, double xm, double ym,
                           double xxm, double xym, double yym) {
  const double iwx = xm * iw;
  const double iwy = ym * iw;
  double sxx = xxm * iw - iwx * iwx;
  double syy = yym * iw - iwy * iwy;
  // Small errors are possible due to rounding; clamp to zero.
  if (sxx < 0.) sxx = 0.;
  if (syy < 0.) syy = 0.;
  const double sxsy = sqrt(sxx * syy);
  const double sxy = xym * iw - iwx * iwy;
  static const double C11 = (0.01 * 0.01) * (255 * 255);
  static const double C22 = (0.03 * 0.03) * (255 * 255);
  static const double C33 = (0.015 * 0.015) * (255 * 255);
  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
  const double c = (2. * sxsy + C22) / (sxx + syy + C22);
  const double s = (sxy + C33) / (sxsy + C33);
  return l * c * s;
}
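
// In the usual SSIM notation: iwx/iwy are the weighted means mu_x/mu_y,
// sxx/syy the weighted variances and sxy the covariance, so the factors
// above are the luminance, contrast and structure terms:
//   l = (2*mu_x*mu_y + C1) / (mu_x^2 + mu_y^2 + C1)
//   c = (2*sigma_x*sigma_y + C2) / (sigma_x^2 + sigma_y^2 + C2)
//   s = (sigma_xy + C3) / (sigma_x*sigma_y + C3)
// with the stabilizing constants scaled to the 8-bit dynamic range (255).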

// GetSSIM() clips the kernel against the image borders;
// GetSSIMFullKernel() does not.

// TODO(skal): use summed tables?
// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
// with a diff of 255, squared. The maximum error is thus 0x4388241,
// which fits into a 32-bit integer.
double GetSSIM(const uint8 *org, const uint8 *rec,
               int xo, int yo, int W, int H, int stride) {
  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
  org += (yo - KERNEL) * stride;
  org += (xo - KERNEL);
  rec += (yo - KERNEL) * stride;
  rec += (xo - KERNEL);
  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
    const int Wy = K[y_];
    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
      const int Wxy = Wy * K[x_];
      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
        const int org_x = org[x_];
        const int rec_x = rec[x_];
        ws += Wxy;
        xm += Wxy * org_x;
        ym += Wxy * rec_x;
        xxm += Wxy * org_x * org_x;
        xym += Wxy * org_x * rec_x;
        yym += Wxy * rec_x * rec_x;
      }
    }
  }
  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}
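
// Checking the bound above in full: the total 2-D weight is 33 * 33 = 1089,
// and 1089 * 255 * 255 = 70812225 = 0x4388241, so even a whole-kernel
// accumulation of squared 255s stays well below 2^31 and the uint32
// accumulators cannot overflow.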

double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
                         int xo, int yo, int stride,
                         double area_weight) {
  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  org += yo * stride + xo;
  rec += yo * stride + xo;
  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;
    const int dy2 = y * stride;
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of the upper-left (ul), upper-right (ur),
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // The symmetric kernel has the same weight at all four points.
      //   -  -  -  -  -  -  -
      //   -  ul -  -  -  ur -
      //   -  -  -  -  -  -  -
      //   -  -  -  0  -  -  -
      //   -  -  -  -  -  -  -
      //   -  ll -  -  -  lr -
      //   -  -  -  -  -  -  -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];

      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of the up (u), down (d), left (l) and
    // right (r) points along the main axes (see the diagram below).
    // The symmetric kernel has the same weight at all four points.
    //   -  -  -  -  -  -  -
    //   -  -  -  u  -  -  -
    //   -  -  -  -  -  -  -
    //   -  l  -  0  -  r  -
    //   -  -  -  -  -  -  -
    //   -  -  -  d  -  -  -
    //   -  -  -  -  -  -  -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];

    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm += Wxy * (u1 + d1 + l1 + r1);
    ym += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Finally, the contribution of the center point (xo, yo) itself.
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];

  xm += Wxy * s1;
  ym += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;

#else  // __SSE2__

  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

  // Read 8 pixels at line #L, convert them to 16 bit, apply the line's
  // weights and accumulate.
#define LOAD_LINE_PAIR(L, WEIGHT) do { \
  const __m128i v0 = \
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
  const __m128i v1 = \
      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
  const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \
  const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \
  const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \
  const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \
  x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \
  y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \
  x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \
  y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \
  xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \
  xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \
  yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \
} while (0)
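
// Note on the two intrinsics above: _mm_mullo_epi16() keeps the low 16 bits
// of each product, which is exact here since weight * pixel <= 121 * 255 <
// 2^15; _mm_madd_epi16() multiplies adjacent 16-bit lanes and sums each pair
// into a 32-bit lane, accumulating sum(w * pix * pix) four lanes at a time.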

#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \
  uint32 tmp[4]; \
  _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
  (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
} while (0)

  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}
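
// Note: each LOAD_LINE_PAIR reads 8 bytes per row even though the kernel is
// only 7 wide; the 8th lane carries a zero weight, so the extra pixel never
// affects the sums, but it must still be in readable memory. CalcSSIM()
// accounts for this at the right image border.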

static int start_max(int x, int y) { return (x > y) ? x : y; }

double CalcSSIM(const uint8 *org, const uint8 *rec,
                const int image_width, const int image_height) {
  double SSIM = 0.;
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
  const int stride = image_width;

  // Top band: the kernel sticks out above the image, so use the clipping
  // GetSSIM() for every column.
  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
#pragma omp parallel for reduction(+: SSIM)
#endif
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    // Left border: clipping version.
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    // Center: the full kernel fits and the 8-pixel reads stay in bounds.
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
      // copy the 8 rightmost pixels into a cache area, and pad this area with
      // zeros which won't contribute to the overall SSIM value (but we need
      // to pass the correct normalizing constant!). By using this cache, we
      // can still call GetSSIMFullKernel() instead of the slower GetSSIM().
      // NOTE: we could use a similar method for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };

      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }
      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
                                  KERNEL + k, KERNEL, kScratchStride, kiW[k]);
      }
    }
  }

  // Bottom band: clipping version again.
  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}

double CalcLSSIM(double ssim) {
  return -10.0 * log10(1.0 - ssim);
}
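
// CalcLSSIM() maps SSIM onto a log scale where higher is still better; e.g.
// CalcLSSIM(0.99) = -10.0 * log10(0.01) = 20.0, and a perfect match
// (ssim == 1.0) diverges to +infinity. A minimal usage sketch (the plane
// pointers and dimensions here are hypothetical; both planes are assumed
// 8-bit with stride == width, as CalcSSIM() requires):
//   double ssim = CalcSSIM(orig_plane, recon_plane, width, height);
//   double lssim = CalcLSSIM(ssim);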

#ifdef __cplusplus
}  // extern "C"
#endif