media/libspeex_resampler/src/resample_sse.h

changeset 0:6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libspeex_resampler/src/resample_sse.h	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,152 @@
+/* Copyright (C) 2007-2008 Jean-Marc Valin
+ * Copyright (C) 2008 Thorvald Natvig
+ */
+/**
+   @file resample_sse.h
+   @brief Resampler functions (SSE version)
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   - Neither the name of the Xiph.org Foundation nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <xmmintrin.h>
+
+#define OVERRIDE_INNER_PRODUCT_SINGLE
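+/* SSE dot product of a[0..len-1] and b[0..len-1]. The vector loop consumes
+ * 8 floats per iteration and handles no scalar tail, so len must be a
+ * multiple of 8. Unaligned loads are used for both inputs. */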
+static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+{
+   int i;
+   float ret;
+   if (1)
+   {
+      __m128 sum = _mm_setzero_ps();
+      for (i=0;i<len;i+=8)
+      {
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
+         sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
+      }
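+      /* Horizontal sum: fold the upper two lanes onto the lower two, then
+       * add lane 1 into lane 0 before storing the scalar result. */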
+      sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+      sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+      _mm_store_ss(&ret, sum);
+   }
+   else
+   {
+      ret = 0;
+      for (i=0;i<len;i++) ret += a[i] * b[i];
+   }
+   return ret;
+}
+
+#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
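+/* Interpolated dot product: each sample a[i] is multiplied by four
+ * consecutive filter taps b[i*oversample+0..3], and the four partial sums
+ * are then weighted by the interpolation coefficients frac[0..3]. The loop
+ * processes two samples per iteration, so len is expected to be even. */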
+static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+  int i;
+  float ret;
+  if (1)
+  {
+    __m128 sum = _mm_setzero_ps();
+    __m128 f = _mm_loadu_ps(frac);
+    for(i=0;i<len;i+=2)
+    {
+      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
+      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
+    }
+    sum = _mm_mul_ps(f, sum);
+    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+    _mm_store_ss(&ret, sum);
+  }
+  else
+  {
+    float accum[4] = {0,0,0,0};
+    for(i=0;i<len;i++)
+    {
+      const float curr_in=a[i];
+      accum[0] += curr_in * b[i * oversample + 0];
+      accum[1] += curr_in * b[i * oversample + 1];
+      accum[2] += curr_in * b[i * oversample + 2];
+      accum[3] += curr_in * b[i * oversample + 3];
+    }
+    ret = accum[0] * frac[0] + accum[1] * frac[1] + accum[2] * frac[2] + accum[3] * frac[3];
+  }
+  return ret;
+}
+
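+/* The double-precision variants below need SSE2 for the __m128d type and
+ * the float-to-double conversions. */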
+#ifdef __SSE2__
+#include <emmintrin.h>
+#define OVERRIDE_INNER_PRODUCT_DOUBLE
+
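+/* Same dot product as inner_product_single, but the float products are
+ * converted to double and accumulated in double precision. */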
+static inline double inner_product_double(const float *a, const float *b, unsigned int len)
+{
+   int i;
+   double ret;
+   __m128d sum = _mm_setzero_pd();
+   __m128 t;
+   for (i=0;i<len;i+=8)
+   {
+      t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+
+      t = _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
+      sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+   }
+   sum = _mm_add_sd(sum, (__m128d) _mm_movehl_ps((__m128) sum, (__m128) sum));
+   _mm_store_sd(&ret, sum);
+   return ret;
+}
+
+#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
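+/* Double-precision counterpart of interpolate_product_single: the four
+ * partial sums are kept in two __m128d accumulators (sum1, sum2) and
+ * weighted by frac converted to double (f1, f2). */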
+static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+  int i;
+  double ret;
+  __m128d sum;
+  __m128d sum1 = _mm_setzero_pd();
+  __m128d sum2 = _mm_setzero_pd();
+  __m128 f = _mm_loadu_ps(frac);
+  __m128d f1 = _mm_cvtps_pd(f);
+  __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
+  __m128 t;
+  for(i=0;i<len;i+=2)
+  {
+    t = _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample));
+    sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
+    sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+
+    t = _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample));
+    sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
+    sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
+  }
+  sum1 = _mm_mul_pd(f1, sum1);
+  sum2 = _mm_mul_pd(f2, sum2);
+  sum = _mm_add_pd(sum1, sum2);
+  sum = _mm_add_sd(sum, (__m128d) _mm_movehl_ps((__m128) sum, (__m128) sum));
+  _mm_store_sd(&ret, sum);
+  return ret;
+}
+
+#endif
