Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
michael@0 | 3 | * |
michael@0 | 4 | * Use of this source code is governed by a BSD-style license |
michael@0 | 5 | * that can be found in the LICENSE file in the root of the source |
michael@0 | 6 | * tree. An additional intellectual property rights grant can be found |
michael@0 | 7 | * in the file PATENTS. All contributing project authors may |
michael@0 | 8 | * be found in the AUTHORS file in the root of the source tree. |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "./psnr.h" // NOLINT |
michael@0 | 12 | |
michael@0 | 13 | #include <math.h> |
michael@0 | 14 | |
michael@0 | 15 | #ifdef _OPENMP |
michael@0 | 16 | #include <omp.h> |
michael@0 | 17 | #endif |
michael@0 | 18 | #ifdef _MSC_VER |
michael@0 | 19 | #include <intrin.h> // For __cpuid() |
michael@0 | 20 | #endif |
michael@0 | 21 | |
michael@0 | 22 | #ifdef __cplusplus |
michael@0 | 23 | extern "C" { |
michael@0 | 24 | #endif |
michael@0 | 25 | |
// Fixed-width helper typedefs used by the kernels below.
// uint64 selection: MSVC spells it __int64; on LP64 Unix (except
// OpenBSD/Apple) plain "unsigned long" is 64-bit; otherwise fall back
// to "unsigned long long".
typedef unsigned int uint32;  // NOLINT
#ifdef _MSC_VER
typedef unsigned __int64 uint64;
#else  // COMPILER_MSVC
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64;  // NOLINT
#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64;  // NOLINT
#endif  // __LP64__
#endif  // _MSC_VER
michael@0 | 36 | |
michael@0 | 37 | // PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse) |
michael@0 | 38 | double ComputePSNR(double sse, double size) { |
michael@0 | 39 | const double kMINSSE = 255.0 * 255.0 * size / pow(10., kMaxPSNR / 10.); |
michael@0 | 40 | if (sse <= kMINSSE) |
michael@0 | 41 | sse = kMINSSE; // Produces max PSNR of 128 |
michael@0 | 42 | return 10.0 * log10(65025.0 * size / sse); |
michael@0 | 43 | } |
michael@0 | 44 | |
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
#define HAS_SUMSQUAREERROR_NEON
// NEON kernel: returns the sum of squared byte differences over |count|
// bytes.  The loop consumes 16 bytes per iteration and exits via "bhi"
// (unsigned higher), so count is assumed to be a non-zero multiple of 16
// -- TODO confirm callers guarantee this (ComputeSumSquareError does).
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b, int count) {
  volatile uint32 sse;
  asm volatile (  // NOLINT
    // Zero four 32-bit accumulator registers.
    // NOTE(review): q9 is zeroed but never accumulated into by the loop;
    // it only contributes zero in the final reduction below.
    "vmov.u8 q7, #0 \n"
    "vmov.u8 q9, #0 \n"
    "vmov.u8 q8, #0 \n"
    "vmov.u8 q10, #0 \n"

    // Main loop: 16 bytes per iteration.
    "1: \n"
    "vld1.u8 {q0}, [%0]! \n"   // load 16 bytes from src_a, post-increment
    "vld1.u8 {q1}, [%1]! \n"   // load 16 bytes from src_b, post-increment
    "vsubl.u8 q2, d0, d2 \n"   // widening subtract, low 8 bytes
    "vsubl.u8 q3, d1, d3 \n"   // widening subtract, high 8 bytes
    // Multiply-accumulate the 16-bit differences as signed squares
    // (a signed square is identical for d and -d).
    "vmlal.s16 q7, d4, d4 \n"
    "vmlal.s16 q8, d6, d6 \n"
    "vmlal.s16 q8, d5, d5 \n"
    "vmlal.s16 q10, d7, d7 \n"
    "subs %2, %2, #16 \n"
    "bhi 1b \n"

    // Horizontal reduction: fold the accumulators, pairwise-add the
    // 32-bit lanes to 64-bit, then extract the low 32 bits.
    "vadd.u32 q7, q7, q8 \n"
    "vadd.u32 q9, q9, q10 \n"
    "vadd.u32 q10, q7, q9 \n"
    "vpaddl.u32 q1, q10 \n"
    "vadd.u64 d0, d2, d3 \n"
    "vmov.32 %3, d0[0] \n"
    : "+r"(src_a),
      "+r"(src_b),
      "+r"(count),
      "=r"(sse)
    :
    : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
// SSE2 kernel (32-bit MSVC, naked function: no prologue/epilogue, args
// read directly off the stack).  Returns the sum of squared byte
// differences over |count| bytes; count is assumed to be a non-zero
// multiple of 16 ("ja" loops while the subtraction stays above zero).
__declspec(naked)
static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
                                  const uint8* /*src_b*/, int /*count*/) {
  __asm {
    mov        eax, [esp + 4]    // src_a
    mov        edx, [esp + 8]    // src_b
    mov        ecx, [esp + 12]   // count
    pxor       xmm0, xmm0        // running sum of squared diffs
    pxor       xmm5, xmm5        // zero register for byte->word unpack
    sub        edx, eax          // edx = src_b - src_a: walk one pointer

  wloop:
    movdqu     xmm1, [eax]       // 16 bytes of src_a
    movdqu     xmm2, [eax + edx] // 16 bytes of src_b
    lea        eax, [eax + 16]
    movdqu     xmm3, xmm1
    psubusb    xmm1, xmm2        // saturating a - b
    psubusb    xmm2, xmm3        // saturating b - a
    por        xmm1, xmm2        // |a - b| per byte
    movdqu     xmm2, xmm1
    punpcklbw  xmm1, xmm5        // low 8 bytes -> 8 words
    punpckhbw  xmm2, xmm5        // high 8 bytes -> 8 words
    pmaddwd    xmm1, xmm1        // square each word, pairwise add to dwords
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    ja         wloop

    // Horizontal add of xmm0's four dword lanes into the low lane.
    pshufd     xmm1, xmm0, 0EEh
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 01h
    paddd      xmm0, xmm1
    movd       eax, xmm0         // return value in eax
    ret
  }
}
#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
// SSE2 kernel (GCC/Clang inline asm).  Returns the sum of squared byte
// differences over |count| bytes; count is assumed to be a non-zero
// multiple of 16 ("ja" loops while the subtraction stays above zero).
static uint32 SumSquareError_SSE2(const uint8* src_a,
                                  const uint8* src_b, int count) {
  uint32 sse;
  asm volatile (  // NOLINT
    "pxor %%xmm0,%%xmm0 \n"        // running sum of squared diffs
    "pxor %%xmm5,%%xmm5 \n"        // zero register for byte->word unpack
    "sub %0,%1 \n"                 // src_b -= src_a: walk one pointer

    "1: \n"
    "movdqu (%0),%%xmm1 \n"        // 16 bytes of src_a
    "movdqu (%0,%1,1),%%xmm2 \n"   // 16 bytes of src_b
    "lea 0x10(%0),%0 \n"
    "movdqu %%xmm1,%%xmm3 \n"
    "psubusb %%xmm2,%%xmm1 \n"     // saturating a - b
    "psubusb %%xmm3,%%xmm2 \n"     // saturating b - a
    "por %%xmm2,%%xmm1 \n"         // |a - b| per byte
    "movdqu %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"   // low 8 bytes -> 8 words
    "punpckhbw %%xmm5,%%xmm2 \n"   // high 8 bytes -> 8 words
    "pmaddwd %%xmm1,%%xmm1 \n"     // square, pairwise add to dwords
    "pmaddwd %%xmm2,%%xmm2 \n"
    "paddd %%xmm1,%%xmm0 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"

    // Horizontal add of xmm0's four dword lanes into the low lane.
    "pshufd $0xee,%%xmm0,%%xmm1 \n"
    "paddd %%xmm1,%%xmm0 \n"
    "pshufd $0x1,%%xmm0,%%xmm1 \n"
    "paddd %%xmm1,%%xmm0 \n"
    "movd %%xmm0,%3 \n"

    : "+r"(src_a),      // %0
      "+r"(src_b),      // %1
      "+r"(count),      // %2
      "=g"(sse)         // %3
    :
    : "memory", "cc"
// xmm registers can only be named in the clobber list when the compiler
// targets SSE2; older 32-bit targets reject them.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );  // NOLINT
  return sse;
}
#endif  // LIBYUV_DISABLE_X86 etc
michael@0 | 168 | |
#if defined(HAS_SUMSQUAREERROR_SSE2)
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
// cpuid for 32-bit PIC builds: %ebx holds the GOT pointer and may not be
// clobbered, so it is saved in %edi around the instruction (hence the
// "=D" output constraint instead of "=b").
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile (  // NOLINT
    "mov %%ebx, %%edi \n"
    "cpuid \n"
    "xchg %%edi, %%ebx \n"
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
}
#elif defined(__i386__) || defined(__x86_64__)
// cpuid for non-PIC x86/x86-64 builds: %ebx may be written directly.
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile (  // NOLINT
    "cpuid \n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
}
#endif
michael@0 | 187 | |
michael@0 | 188 | static int CpuHasSSE2() { |
michael@0 | 189 | #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) |
michael@0 | 190 | int cpu_info[4]; |
michael@0 | 191 | __cpuid(cpu_info, 1); |
michael@0 | 192 | if (cpu_info[3] & 0x04000000) { |
michael@0 | 193 | return 1; |
michael@0 | 194 | } |
michael@0 | 195 | #endif |
michael@0 | 196 | return 0; |
michael@0 | 197 | } |
michael@0 | 198 | #endif // HAS_SUMSQUAREERROR_SSE2 |
michael@0 | 199 | |
michael@0 | 200 | static uint32 SumSquareError_C(const uint8* src_a, |
michael@0 | 201 | const uint8* src_b, int count) { |
michael@0 | 202 | uint32 sse = 0u; |
michael@0 | 203 | for (int x = 0; x < count; ++x) { |
michael@0 | 204 | int diff = src_a[x] - src_b[x]; |
michael@0 | 205 | sse += static_cast<uint32>(diff * diff); |
michael@0 | 206 | } |
michael@0 | 207 | return sse; |
michael@0 | 208 | } |
michael@0 | 209 | |
// Returns the total sum of squared byte differences between src_a and
// src_b over |count| bytes, as a double (the uint64 running total can
// exceed uint32 range for large buffers).  Work is split into 32 KiB
// blocks -- optionally parallelized with OpenMP -- plus two tail passes.
double ComputeSumSquareError(const uint8* src_a,
                             const uint8* src_b, int count) {
  // Pick the fastest available kernel.  The SIMD kernels require a
  // non-zero multiple of 16 bytes, which the splitting below guarantees.
  uint32 (*SumSquareError)(const uint8* src_a,
                           const uint8* src_b, int count) = SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  SumSquareError = SumSquareError_NEON;
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (CpuHasSSE2()) {
    SumSquareError = SumSquareError_SSE2;
  }
#endif
  // 32 KiB blocks keep each kernel's uint32 partial sum well below
  // overflow (max 32768 * 255^2 < 2^32) and give OpenMP work units.
  const int kBlockSize = 1 << 15;
  uint64 sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+: sse)
#endif
  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
  }
  // Skip past the whole blocks handled above.
  src_a += count & ~(kBlockSize - 1);
  src_b += count & ~(kBlockSize - 1);
  // Remaining bytes rounded down to a multiple of 16, still eligible for
  // the SIMD kernel.
  int remainder = count & (kBlockSize - 1) & ~15;
  if (remainder) {
    sse += SumSquareError(src_a, src_b, remainder);
    src_a += remainder;
    src_b += remainder;
  }
  // Final 0..15 bytes: always the C kernel, since the SIMD kernels
  // process 16 bytes at a time.
  remainder = count & 15;
  if (remainder) {
    sse += SumSquareError_C(src_a, src_b, remainder);
  }
  return static_cast<double>(sse);
}
michael@0 | 244 | |
michael@0 | 245 | #ifdef __cplusplus |
michael@0 | 246 | } // extern "C" |
michael@0 | 247 | #endif |