--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/util/psnr.cc	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./psnr.h"  // NOLINT
+
+#include <math.h>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#ifdef _MSC_VER
+#include <intrin.h>  // For __cpuid()
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int uint32;  // NOLINT
+#ifdef _MSC_VER
+typedef unsigned __int64 uint64;
+#else   // COMPILER_MSVC
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64;  // NOLINT
+#else   // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64;  // NOLINT
+#endif  // __LP64__
+#endif  // _MSC_VER
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+double ComputePSNR(double sse, double size) {
+  const double kMINSSE = 255.0 * 255.0 * size / pow(10., kMaxPSNR / 10.);
+  if (sse <= kMINSSE)
+    sse = kMINSSE;  // Produces max PSNR of 128
+  return 10.0 * log10(65025.0 * size / sse);
+}
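+
+// Example: if every one of `size` pixels differs by exactly 1, then
+// sse == size and the result is 10 * log10(65025.0), roughly 48.13 dB.
+// Identical buffers are clamped to kMaxPSNR (128 dB) via kMINSSE above.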
\n" 1.62 + "vsubl.u8 q2, d0, d2 \n" 1.63 + "vsubl.u8 q3, d1, d3 \n" 1.64 + "vmlal.s16 q7, d4, d4 \n" 1.65 + "vmlal.s16 q8, d6, d6 \n" 1.66 + "vmlal.s16 q8, d5, d5 \n" 1.67 + "vmlal.s16 q10, d7, d7 \n" 1.68 + "subs %2, %2, #16 \n" 1.69 + "bhi 1b \n" 1.70 + 1.71 + "vadd.u32 q7, q7, q8 \n" 1.72 + "vadd.u32 q9, q9, q10 \n" 1.73 + "vadd.u32 q10, q7, q9 \n" 1.74 + "vpaddl.u32 q1, q10 \n" 1.75 + "vadd.u64 d0, d2, d3 \n" 1.76 + "vmov.32 %3, d0[0] \n" 1.77 + : "+r"(src_a), 1.78 + "+r"(src_b), 1.79 + "+r"(count), 1.80 + "=r"(sse) 1.81 + : 1.82 + : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10"); 1.83 + return sse; 1.84 +} 1.85 +#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 1.86 +#define HAS_SUMSQUAREERROR_SSE2 1.87 +__declspec(naked) 1.88 +static uint32 SumSquareError_SSE2(const uint8* /*src_a*/, 1.89 + const uint8* /*src_b*/, int /*count*/) { 1.90 + __asm { 1.91 + mov eax, [esp + 4] // src_a 1.92 + mov edx, [esp + 8] // src_b 1.93 + mov ecx, [esp + 12] // count 1.94 + pxor xmm0, xmm0 1.95 + pxor xmm5, xmm5 1.96 + sub edx, eax 1.97 + 1.98 + wloop: 1.99 + movdqu xmm1, [eax] 1.100 + movdqu xmm2, [eax + edx] 1.101 + lea eax, [eax + 16] 1.102 + movdqu xmm3, xmm1 1.103 + psubusb xmm1, xmm2 1.104 + psubusb xmm2, xmm3 1.105 + por xmm1, xmm2 1.106 + movdqu xmm2, xmm1 1.107 + punpcklbw xmm1, xmm5 1.108 + punpckhbw xmm2, xmm5 1.109 + pmaddwd xmm1, xmm1 1.110 + pmaddwd xmm2, xmm2 1.111 + paddd xmm0, xmm1 1.112 + paddd xmm0, xmm2 1.113 + sub ecx, 16 1.114 + ja wloop 1.115 + 1.116 + pshufd xmm1, xmm0, 0EEh 1.117 + paddd xmm0, xmm1 1.118 + pshufd xmm1, xmm0, 01h 1.119 + paddd xmm0, xmm1 1.120 + movd eax, xmm0 1.121 + ret 1.122 + } 1.123 +} 1.124 +#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) 1.125 +#define HAS_SUMSQUAREERROR_SSE2 1.126 +static uint32 SumSquareError_SSE2(const uint8* src_a, 1.127 + const uint8* src_b, int count) { 1.128 + uint32 sse; 1.129 + asm volatile ( // NOLINT 1.130 + "pxor %%xmm0,%%xmm0 \n" 1.131 + "pxor %%xmm5,%%xmm5 \n" 1.132 + "sub %0,%1 \n" 1.133 + 1.134 + "1: \n" 1.135 + "movdqu (%0),%%xmm1 \n" 1.136 + "movdqu (%0,%1,1),%%xmm2 \n" 1.137 + "lea 0x10(%0),%0 \n" 1.138 + "movdqu %%xmm1,%%xmm3 \n" 1.139 + "psubusb %%xmm2,%%xmm1 \n" 1.140 + "psubusb %%xmm3,%%xmm2 \n" 1.141 + "por %%xmm2,%%xmm1 \n" 1.142 + "movdqu %%xmm1,%%xmm2 \n" 1.143 + "punpcklbw %%xmm5,%%xmm1 \n" 1.144 + "punpckhbw %%xmm5,%%xmm2 \n" 1.145 + "pmaddwd %%xmm1,%%xmm1 \n" 1.146 + "pmaddwd %%xmm2,%%xmm2 \n" 1.147 + "paddd %%xmm1,%%xmm0 \n" 1.148 + "paddd %%xmm2,%%xmm0 \n" 1.149 + "sub $0x10,%2 \n" 1.150 + "ja 1b \n" 1.151 + 1.152 + "pshufd $0xee,%%xmm0,%%xmm1 \n" 1.153 + "paddd %%xmm1,%%xmm0 \n" 1.154 + "pshufd $0x1,%%xmm0,%%xmm1 \n" 1.155 + "paddd %%xmm1,%%xmm0 \n" 1.156 + "movd %%xmm0,%3 \n" 1.157 + 1.158 + : "+r"(src_a), // %0 1.159 + "+r"(src_b), // %1 1.160 + "+r"(count), // %2 1.161 + "=g"(sse) // %3 1.162 + : 1.163 + : "memory", "cc" 1.164 +#if defined(__SSE2__) 1.165 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 1.166 +#endif 1.167 + ); // NOLINT 1.168 + return sse; 1.169 +} 1.170 +#endif // LIBYUV_DISABLE_X86 etc 1.171 + 1.172 +#if defined(HAS_SUMSQUAREERROR_SSE2) 1.173 +#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) 1.174 +static __inline void __cpuid(int cpu_info[4], int info_type) { 1.175 + asm volatile ( // NOLINT 1.176 + "mov %%ebx, %%edi \n" 1.177 + "cpuid \n" 1.178 + "xchg %%edi, %%ebx \n" 1.179 + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) 1.180 + : "a"(info_type)); 1.181 +} 1.182 
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+  asm volatile (  // NOLINT
+    "mov  %%ebx, %%edi                         \n"
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+  asm volatile (  // NOLINT
+    "cpuid                                     \n"
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#endif
+
+static int CpuHasSSE2() {
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
+  int cpu_info[4];
+  __cpuid(cpu_info, 1);
+  if (cpu_info[3] & 0x04000000) {  // EDX bit 26: SSE2.
+    return 1;
+  }
+#endif
+  return 0;
+}
+#endif  // HAS_SUMSQUAREERROR_SSE2
+
+static uint32 SumSquareError_C(const uint8* src_a,
+                               const uint8* src_b, int count) {
+  uint32 sse = 0u;
+  for (int x = 0; x < count; ++x) {
+    int diff = src_a[x] - src_b[x];
+    sse += static_cast<uint32>(diff * diff);
+  }
+  return sse;
+}
+
+double ComputeSumSquareError(const uint8* src_a,
+                             const uint8* src_b, int count) {
+  uint32 (*SumSquareError)(const uint8* src_a,
+                           const uint8* src_b, int count) = SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+  SumSquareError = SumSquareError_NEON;
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+  if (CpuHasSSE2()) {
+    SumSquareError = SumSquareError_SSE2;
+  }
+#endif
+  const int kBlockSize = 1 << 15;  // 32768 byte blocks.
+  uint64 sse = 0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+  // Whole blocks, parallelizable under OpenMP.
+  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);
+  src_b += count & ~(kBlockSize - 1);
+  // Remaining whole multiples of 16 bytes, then any tail via the C path.
+  int remainder = count & (kBlockSize - 1) & ~15;
+  if (remainder) {
+    sse += SumSquareError(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & 15;
+  if (remainder) {
+    sse += SumSquareError_C(src_a, src_b, remainder);
+  }
+  return static_cast<double>(sse);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
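+
+// Usage sketch (illustrative; plane_a, plane_b, width and height are
+// placeholders for two equal-sized 8-bit planes):
+//   const int size = width * height;
+//   double sse = ComputeSumSquareError(plane_a, plane_b, size);
+//   double psnr = ComputePSNR(sse, static_cast<double>(size));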