--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libspeex_resampler/src/resample_sse.h	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,152 @@
+/* Copyright (C) 2007-2008 Jean-Marc Valin
+ * Copyright (C) 2008 Thorvald Natvig
+ */
+/**
+   @file resample_sse.h
+   @brief Resampler functions (SSE version)
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   - Neither the name of the Xiph.org Foundation nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <xmmintrin.h>
+
+#define OVERRIDE_INNER_PRODUCT_SINGLE
+static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+{
+   int i;
+   float ret;
+   if (1)
+   {
+      __m128 sum = _mm_setzero_ps();
+      for (i=0;i<len;i+=8)
+      {
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
+      }
+      sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+      sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+      _mm_store_ss(&ret, sum);
+   }
+   else
+   {
+      ret = 0;
+      for (i=0;i<len;i++) ret += a[i] * b[i];
+   }
+   return ret;
+}
+
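inner_product_single is a plain dot product over len floats, stepping eight taps per iteration, so it assumes len is a multiple of 8 (the resampler sizes its filter lengths accordingly). The `if (1)` guard reads like a placeholder for a runtime CPU-dispatch check; as written, the scalar else branch is dead code. The last three intrinsics perform a horizontal sum of the four partial sums held in the vector. A minimal standalone sketch of that reduction checked against a scalar reference (the buffers and the dot_ref() helper are illustrative, not part of the patch; compile with SSE enabled, e.g. gcc -msse):

    #include <stdio.h>
    #include <xmmintrin.h>

    /* Scalar reference for comparison (illustrative helper). */
    static float dot_ref(const float *a, const float *b, unsigned int len)
    {
       unsigned int i;
       float acc = 0.f;
       for (i = 0; i < len; i++)
          acc += a[i] * b[i];
       return acc;
    }

    int main(void)
    {
       float a[8], b[8], ret;
       unsigned int i;
       __m128 sum = _mm_setzero_ps();
       for (i = 0; i < 8; i++) { a[i] = (float)i; b[i] = 0.25f * i; }

       /* Two multiply-accumulates per iteration, as in inner_product_single. */
       for (i = 0; i < 8; i += 8)
       {
          sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
          sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
       }
       /* Horizontal sum: fold lanes 2,3 onto lanes 0,1, then add lane 1
          (0x55 broadcasts it) into lane 0. */
       sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
       sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
       _mm_store_ss(&ret, sum);

       printf("sse=%g ref=%g\n", ret, dot_ref(a, b, 8));
       return 0;
    }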
+#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+   int i;
+   float ret;
+   if (1)
+   {
+      __m128 sum = _mm_setzero_ps();
+      __m128 f = _mm_loadu_ps(frac);
+      for(i=0;i<len;i+=2)
+      {
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
+      }
+      sum = _mm_mul_ps(f, sum);
+      sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+      sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+      _mm_store_ss(&ret, sum);
+   }
+   else
+   {
+      float accum[4] = {0,0,0,0};
+      for(i=0;i<len;i++)
+      {
+         const float curr_in=a[i];
+         accum[0] += curr_in * b[i * oversample + 0];
+         accum[1] += curr_in * b[i * oversample + 1];
+         accum[2] += curr_in * b[i * oversample + 2];
+         accum[3] += curr_in * b[i * oversample + 3];
+      }
+      ret = accum[0] * frac[0] + accum[1] * frac[1] + accum[2] * frac[2] + accum[3] * frac[3];
+   }
+   return ret;
+}
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#define OVERRIDE_INNER_PRODUCT_DOUBLE
+
+static inline double inner_product_double(const float *a, const float *b, unsigned int len)
+{
+   int i;
+   double ret;
+   __m128d sum = _mm_setzero_pd();
+   __m128 t;
+   for (i=0;i<len;i+=8)
+   {
+      t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+
+      t = _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+   }
+   sum = _mm_add_sd(sum, (__m128d) _mm_movehl_ps((__m128) sum, (__m128) sum));
+   _mm_store_sd(&ret, sum);
+   return ret;
+}
+
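inner_product_double keeps the multiplies in single precision but accumulates in double: _mm_cvtps_pd widens the low two floats of each product vector, and _mm_movehl_ps brings the high two floats down so they can be widened the same way. A standalone sketch of the widening and the final two-lane reduction (values are arbitrary; _mm_unpackhi_pd stands in here for the patch's (__m128d)/(__m128) casts, which rely on a GCC/Clang vector-cast extension; compile with -msse2):

    #include <stdio.h>
    #include <emmintrin.h>

    int main(void)
    {
       float v[4] = { 1.5f, 2.5f, 3.5f, 4.5f };
       double ret;
       __m128 t = _mm_loadu_ps(v);

       /* Widen float lanes to double: low pair, then high pair via movehl. */
       __m128d lo  = _mm_cvtps_pd(t);                    /* { 1.5, 2.5 } */
       __m128d hi  = _mm_cvtps_pd(_mm_movehl_ps(t, t));  /* { 3.5, 4.5 } */
       __m128d sum = _mm_add_pd(lo, hi);                 /* { 5.0, 7.0 } */

       /* Reduce the two double lanes to a scalar. */
       sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
       _mm_store_sd(&ret, sum);
       printf("%g\n", ret);                              /* prints 12 */
       return 0;
    }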
+#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+   int i;
+   double ret;
+   __m128d sum;
+   __m128d sum1 = _mm_setzero_pd();
+   __m128d sum2 = _mm_setzero_pd();
+   __m128 f = _mm_loadu_ps(frac);
+   __m128d f1 = _mm_cvtps_pd(f);
+   __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
+   __m128 t;
+   for(i=0;i<len;i+=2)
+   {
+      t = _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample));
+      sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
+      sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+
+      t = _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample));
+      sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
+      sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+   }
+   sum1 = _mm_mul_pd(f1, sum1);
+   sum2 = _mm_mul_pd(f2, sum2);
+   sum = _mm_add_pd(sum1, sum2);
+   sum = _mm_add_sd(sum, (__m128d) _mm_movehl_ps((__m128) sum, (__m128) sum));
+   _mm_store_sd(&ret, sum);
+   return ret;
+}
+
+#endif
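Both interpolate_product variants accumulate four adjacent filter phases per tap (b[i*oversample + 0] through b[i*oversample + 3]) and blend the four running sums with the interpolation weights in frac; the SSE versions simply defer the frac multiply until after the accumulation loop, which is algebraically equivalent to the scalar else branch shown above. The OVERRIDE_* macros are what wire these routines in: the resampler's main source includes this header when built for SSE and compiles its portable fallback only when the corresponding macro is left undefined. A condensed sketch of that pattern, assuming the speex convention rather than quoting resample.c verbatim (this fragment compiles on its own when _USE_SSE is not defined):

    #ifdef _USE_SSE
    #include "resample_sse.h"
    #endif

    #ifndef OVERRIDE_INNER_PRODUCT_SINGLE
    /* Portable fallback, compiled only when no SIMD override exists. */
    static inline float inner_product_single(const float *a, const float *b,
                                             unsigned int len)
    {
       unsigned int i;
       float sum = 0.f;
       for (i = 0; i < len; i++)
          sum += a[i] * b[i];
       return sum;
    }
    #endif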