media/libopus/celt/x86/pitch_sse.h

changeset 6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libopus/celt/x86/pitch_sse.h	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,156 @@
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
+/**
+   @file pitch_sse.h
+   @brief Pitch analysis
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_SSE_H
+#define PITCH_SSE_H
+
+#include <xmmintrin.h>
+#include "arch.h"
+
+#define OVERRIDE_XCORR_KERNEL
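+/* SSE cross-correlation kernel: accumulates sum[k] += x[j]*y[j+k] for
+   k = 0..3 over j = 0..len-1, processing four samples of x per
+   iteration.  Note that it reads y up to y[len+2]. */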
+static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+{
+   int j;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_loadu_ps(sum);
+   xsum2 = _mm_setzero_ps();
+
+   for (j = 0; j < len-3; j += 4)
+   {
+      __m128 x0 = _mm_loadu_ps(x+j);
+      __m128 yj = _mm_loadu_ps(y+j);
+      __m128 y3 = _mm_loadu_ps(y+j+3);
+
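+      /* Broadcast each x[j+k] (masks 0x00, 0x55, 0xaa, 0xff) and
+         multiply by y[j+k .. j+k+3]; the 0x49 and 0x9e shuffles stitch
+         y[j+1..j+4] and y[j+2..j+5] together from the yj and y3 loads. */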
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
+                                          _mm_shuffle_ps(yj,y3,0x49)));
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
+                                          _mm_shuffle_ps(yj,y3,0x9e)));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
+   }
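+   /* Handle the remaining 1..3 samples when len is not a multiple of 4 */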
+   if (j < len)
+   {
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+      if (++j < len)
+      {
+         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         if (++j < len)
+         {
+            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         }
+      }
+   }
+   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
+}
+
+#define OVERRIDE_DUAL_INNER_PROD
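+/* Computes the two inner products xy1 = <x,y01> and xy2 = <x,y02> in a
+   single pass, sharing the loads of x between them. */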
+static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_setzero_ps();
+   xsum2 = _mm_setzero_ps();
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 y1i = _mm_loadu_ps(y01+i);
+      __m128 y2i = _mm_loadu_ps(y02+i);
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
+   }
+   /* Horizontal sum */
+   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
+   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
+   _mm_store_ss(xy1, xsum1);
+   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
+   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
+   _mm_store_ss(xy2, xsum2);
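+   /* Scalar tail for the last N%4 elements */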
+   for (;i<N;i++)
+   {
+      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
+      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
+   }
+}
+
+#define OVERRIDE_COMB_FILTER_CONST
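+/* Constant-gain comb filter:
+   y[i] = x[i] + g10*x[i-T] + g11*(x[i-T-1]+x[i-T+1])
+               + g12*(x[i-T-2]+x[i-T+2]) */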
+static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10);
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
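+   /* Prime the pipeline with x[-T-2 .. -T+1]; each iteration reuses it as x0v */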
+   x0v = _mm_loadu_ps(&x[-T-2]);
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2];
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4);
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
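+      /* Rebuild the three overlapping vectors x1v = x[i-T-1..],
+         x2v = x[i-T..] and x3v = x[i-T+1..] from only two loads: x0v,
+         carried over from the previous iteration, and x4v above. */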
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
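+      /* Carry this iteration's high load into the next iteration */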
+      x0v=x4v;
+      _mm_storeu_ps(y+i, yi);
+   }
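+   /* Scalar tail: only needed for custom modes, where N may not be a
+      multiple of 4 */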
+#ifdef CUSTOM_MODES
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x[i-T])
+               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+#endif
+}
+
+#endif
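
For context, here is a scalar reference for what xcorr_kernel above
computes; in a float build (where opus_val16 and opus_val32 are both
float), the SSE kernel should agree with it up to rounding order. This
harness is a hypothetical sketch, not part of the changeset:

    #include <stdio.h>

    typedef float opus_val16;   /* float-build definitions, as in arch.h */
    typedef float opus_val32;

    /* Reference: sum[k] += x[j]*y[j+k] for k = 0..3, j = 0..len-1.
       Like the SSE kernel, it reads y[0..len+2]. */
    static void xcorr_kernel_ref(const opus_val16 *x, const opus_val16 *y,
                                 opus_val32 sum[4], int len)
    {
       int j, k;
       for (j = 0; j < len; j++)
          for (k = 0; k < 4; k++)
             sum[k] += x[j] * y[j + k];
    }

    int main(void)
    {
       opus_val16 x[8]  = {1, 2, 3, 4, 5, 6, 7, 8};
       opus_val16 y[11] = {1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6};
       opus_val32 sum[4] = {0, 0, 0, 0};
       xcorr_kernel_ref(x, y, sum, 8);   /* y must have len+3 entries */
       printf("%g %g %g %g\n", sum[0], sum[1], sum[2], sum[3]);
       return 0;
    }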