Thu, 22 Jan 2015 13:21:57 +0100
Incorporate the changes requested in the Mozilla review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./psnr.h"  // NOLINT

#include <math.h>

#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>  // For __cpuid()
#endif

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;  // NOLINT
#ifdef _MSC_VER
typedef unsigned __int64 uint64;
#else  // COMPILER_MSVC
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64;  // NOLINT
#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64;  // NOLINT
#endif  // __LP64__
#endif  // _MSC_VER

// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
double ComputePSNR(double sse, double size) {
  const double kMINSSE = 255.0 * 255.0 * size / pow(10., kMaxPSNR / 10.);
  if (sse <= kMINSSE)
    sse = kMINSSE;  // Produces max PSNR of 128
  return 10.0 * log10(65025.0 * size / sse);
}
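
// Worked example (illustrative, not part of the original file): for a
// 640x480 plane, size = 307200. If sse == size (an average squared error
// of 1 per pixel), ComputePSNR(sse, size) = 10 * log10(65025.0) ~= 48.13 dB.
// Identical planes (sse = 0) are clamped to kMINSSE and yield kMaxPSNR.
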
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b, int count) {
  volatile uint32 sse;
  asm volatile (  // NOLINT
    // Zero the four accumulator registers.
    "vmov.u8    q7, #0                         \n"
    "vmov.u8    q9, #0                         \n"
    "vmov.u8    q8, #0                         \n"
    "vmov.u8    q10, #0                        \n"

  "1:                                          \n"
    // Load 16 bytes from each source and compute 16 bit differences.
    "vld1.u8    {q0}, [%0]!                    \n"
    "vld1.u8    {q1}, [%1]!                    \n"
    "vsubl.u8   q2, d0, d2                     \n"
    "vsubl.u8   q3, d1, d3                     \n"
    // Multiply-accumulate the squared differences, 4 lanes per accumulator.
    "vmlal.s16  q7, d4, d4                     \n"
    "vmlal.s16  q9, d6, d6                     \n"
    "vmlal.s16  q8, d5, d5                     \n"
    "vmlal.s16  q10, d7, d7                    \n"
    "subs       %2, %2, #16                    \n"
    "bhi        1b                             \n"

    // Reduce the four accumulators to a single 32 bit sum.
    "vadd.u32   q7, q7, q8                     \n"
    "vadd.u32   q9, q9, q10                    \n"
    "vadd.u32   q10, q7, q9                    \n"
    "vpaddl.u32 q1, q10                        \n"
    "vadd.u64   d0, d2, d3                     \n"
    "vmov.32    %3, d0[0]                      \n"
    : "+r"(src_a),
      "+r"(src_b),
      "+r"(count),
      "=r"(sse)
    :
    : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
__declspec(naked)
static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
                                  const uint8* /*src_b*/, int /*count*/) {
  __asm {
    mov        eax, [esp + 4]    // src_a
    mov        edx, [esp + 8]    // src_b
    mov        ecx, [esp + 12]   // count
    pxor       xmm0, xmm0        // running sum of squared differences
    pxor       xmm5, xmm5        // zero, for unpacking bytes to words
    sub        edx, eax

  wloop:
    movdqu     xmm1, [eax]
    movdqu     xmm2, [eax + edx]
    lea        eax, [eax + 16]
    movdqu     xmm3, xmm1        // absolute difference via saturated
    psubusb    xmm1, xmm2        // subtraction in both directions,
    psubusb    xmm2, xmm3        // then or the results together
    por        xmm1, xmm2
    movdqu     xmm2, xmm1
    punpcklbw  xmm1, xmm5        // widen to 16 bit
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1        // square and add adjacent word pairs
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    ja         wloop

    pshufd     xmm1, xmm0, 0EEh  // reduce the four dword sums to one
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 01h
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}
#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
static uint32 SumSquareError_SSE2(const uint8* src_a,
                                  const uint8* src_b, int count) {
  uint32 sse;
  asm volatile (  // NOLINT
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm5,%%xmm5                   \n"
    "sub       %0,%1                           \n"

  "1:                                          \n"
    "movdqu    (%0),%%xmm1                     \n"
    "movdqu    (%0,%1,1),%%xmm2                \n"
    "lea       0x10(%0),%0                     \n"
    "movdqu    %%xmm1,%%xmm3                   \n"
    "psubusb   %%xmm2,%%xmm1                   \n"
    "psubusb   %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpckhbw %%xmm5,%%xmm2                   \n"
    "pmaddwd   %%xmm1,%%xmm1                   \n"
    "pmaddwd   %%xmm2,%%xmm2                   \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "ja        1b                              \n"

    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
    "paddd     %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0,%3                       \n"

    : "+r"(src_a),  // %0
      "+r"(src_b),  // %1
      "+r"(count),  // %2
      "=g"(sse)     // %3
    :
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );  // NOLINT
  return sse;
}
#endif  // LIBYUV_DISABLE_X86 etc

#if defined(HAS_SUMSQUAREERROR_SSE2)
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
// 32 bit PIC code reserves ebx, so save and restore it around cpuid.
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile (  // NOLINT
    "mov  %%ebx, %%edi                         \n"
    "cpuid                                     \n"
    "xchg %%edi, %%ebx                         \n"
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
}
#elif defined(__i386__) || defined(__x86_64__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile (  // NOLINT
    "cpuid                                     \n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
}
#endif

static int CpuHasSSE2() {
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  // Bit 26 of edx from CPUID leaf 1 is the SSE2 feature flag.
  if (cpu_info[3] & 0x04000000) {
    return 1;
  }
#endif
  return 0;
}
#endif  // HAS_SUMSQUAREERROR_SSE2

static uint32 SumSquareError_C(const uint8* src_a,
                               const uint8* src_b, int count) {
  uint32 sse = 0u;
  for (int x = 0; x < count; ++x) {
    int diff = src_a[x] - src_b[x];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}

double ComputeSumSquareError(const uint8* src_a,
                             const uint8* src_b, int count) {
  // Pick the fastest SumSquareError available for whole 16-byte blocks.
  uint32 (*SumSquareError)(const uint8* src_a,
                           const uint8* src_b, int count) = SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  SumSquareError = SumSquareError_NEON;
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (CpuHasSSE2()) {
    SumSquareError = SumSquareError_SSE2;
  }
#endif
  // Accumulate in 32 KB blocks: the per-block 32 bit sums cannot overflow
  // (32768 * 255 * 255 < 2^32), and OpenMP can split the blocks across
  // threads.
  const int kBlockSize = 1 << 15;
  uint64 sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+: sse)
#endif
  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
  }
  src_a += count & ~(kBlockSize - 1);
  src_b += count & ~(kBlockSize - 1);
  // Whole 16-byte groups of the remainder use the specialized version;
  // the final 0..15 bytes always use the C version.
  int remainder = count & (kBlockSize - 1) & ~15;
  if (remainder) {
    sse += SumSquareError(src_a, src_b, remainder);
    src_a += remainder;
    src_b += remainder;
  }
  remainder = count & 15;
  if (remainder) {
    sse += SumSquareError_C(src_a, src_b, remainder);
  }
  return static_cast<double>(sse);
}
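
// Usage sketch (illustrative; frame_a, frame_b, width and height are
// hypothetical, and the function declarations come from psnr.h):
//   int size = width * height;  // e.g. compare one Y plane
//   double sse = ComputeSumSquareError(frame_a, frame_b, size);
//   double psnr = ComputePSNR(sse, static_cast<double>(size));
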
#ifdef __cplusplus
}  // extern "C"
#endif