--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libopus/celt/x86/pitch_sse.h	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,156 @@
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
+/**
+   @file pitch_sse.h
+   @brief Pitch analysis
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_SSE_H
+#define PITCH_SSE_H
+
+#include <xmmintrin.h>
+#include "arch.h"
+
+#define OVERRIDE_XCORR_KERNEL
+static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+{
+   int j;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_loadu_ps(sum);
+   xsum2 = _mm_setzero_ps();
+
+   for (j = 0; j < len-3; j += 4)
+   {
+      __m128 x0 = _mm_loadu_ps(x+j);
+      __m128 yj = _mm_loadu_ps(y+j);
+      __m128 y3 = _mm_loadu_ps(y+j+3);
+
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
+                                          _mm_shuffle_ps(yj,y3,0x49)));
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
+                                          _mm_shuffle_ps(yj,y3,0x9e)));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
+   }
+   if (j < len)
+   {
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+      if (++j < len)
+      {
+         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         if (++j < len)
+         {
+            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         }
+      }
+   }
+   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
+}
+
+#define OVERRIDE_DUAL_INNER_PROD
+static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_setzero_ps();
+   xsum2 = _mm_setzero_ps();
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 y1i = _mm_loadu_ps(y01+i);
+      __m128 y2i = _mm_loadu_ps(y02+i);
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
+   }
+   /* Horizontal sum */
+   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
+   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
+   _mm_store_ss(xy1, xsum1);
+   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
+   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
+   _mm_store_ss(xy2, xsum2);
+   for (;i<N;i++)
+   {
+      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
+      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
+   }
+}
+
+#define OVERRIDE_COMB_FILTER_CONST
+static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10);
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
+   x0v = _mm_loadu_ps(&x[-T-2]);
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2];
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4);
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
+      x0v=x4v;
+      _mm_storeu_ps(y+i, yi);
+   }
+#ifdef CUSTOM_MODES
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+             + MULT16_32_Q15(g10,x[i-T])
+             + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+             + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+#endif
+}
+
+#endif
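
A few notes on the three kernels in this new file follow, each with a small
illustrative snippet. None of the snippets below is part of the patch.

xcorr_kernel() computes four cross-correlations at consecutive lags in a
single pass: each x[j] is broadcast and multiplied against y[j]..y[j+3],
with the shuffle masks 0x49 and 0x9e assembling the unaligned vectors
y+j+1 and y+j+2 from the two loads yj and y3. Accumulation alternates
between the two registers xsum1 and xsum2, which breaks the dependency
chain between successive adds. A plain-C sketch of the arithmetic,
assuming a float build where opus_val16 and opus_val32 are float (like the
SSE version, it requires y to hold at least len+3 readable elements):

   /* Reference restatement of xcorr_kernel(); hypothetical helper,
      not part of the patch. */
   static void xcorr_kernel_ref(const float *x, const float *y,
                                float sum[4], int len)
   {
      int j, k;
      for (j = 0; j < len; j++)
         for (k = 0; k < 4; k++)
            sum[k] += x[j] * y[j + k];   /* correlation at lag k */
   }

Because the vector code adds the terms in a different order, float results
may differ from this reference in the last few bits.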
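
dual_inner_prod() evaluates the two inner products <x, y01> and <x, y02>
in one pass, so x is streamed through only once; each four-lane
accumulator is then folded with a movehl/shuffle horizontal sum, and the
scalar loop at the end picks up any leftover elements with MAC16_16
(simply c + a*b in a float build). A hypothetical usage sketch, assuming a
float build with the Opus headers on the include path; N = 7 exercises
both the vector loop and the scalar tail:

   /* Hypothetical test driver, not part of the patch. */
   #include <stdio.h>
   #include "pitch_sse.h"   /* needs "arch.h" from the Opus tree */

   int main(void)
   {
      float x[7]   = {1, 2, 3, 4, 5, 6, 7};
      float y01[7] = {1, 1, 1, 1, 1, 1, 1};
      float y02[7] = {2, 0, 2, 0, 2, 0, 2};
      float xy1, xy2;
      dual_inner_prod(x, y01, y02, 7, &xy1, &xy2);
      printf("xy1=%g xy2=%g\n", xy1, xy2);   /* expect xy1=28 xy2=32 */
      return 0;
   }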
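
In comb_filter_const(), the #else branch trades three of the five
unaligned loads per iteration for shuffles: only x0v (carried over from
the previous iteration) and x4v are loaded, and x1v/x2v/x3v are rebuilt
from them. Since _mm_shuffle_ps(a, b, imm) yields {a[imm&3], a[(imm>>2)&3],
b[(imm>>4)&3], b[(imm>>6)&3]}, mask 0x4e splices the upper half of x0v
onto the lower half of x4v, and 0x99 picks the two middle elements of each
source. The partial-sum branch regroups the additions, which is why it is
not bit-exact with the scalar comb filter, and the CUSTOM_MODES tail
covers frame sizes that are not multiples of four. A standalone check of
the shuffle identities (illustrative only, not part of the patch):

   #include <stdio.h>
   #include <xmmintrin.h>

   int main(void)
   {
      float buf[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
      const float *xp = buf;                        /* stands in for &x[i-T-2] */
      __m128 x0v = _mm_loadu_ps(xp);                /* {0,1,2,3} */
      __m128 x4v = _mm_loadu_ps(xp+4);              /* {4,5,6,7} */
      __m128 x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);  /* == _mm_loadu_ps(xp+2) */
      __m128 x1v = _mm_shuffle_ps(x0v, x2v, 0x99);  /* == _mm_loadu_ps(xp+1) */
      __m128 x3v = _mm_shuffle_ps(x2v, x4v, 0x99);  /* == _mm_loadu_ps(xp+3) */
      float out[4];
      _mm_storeu_ps(out, x1v);
      printf("x1v = %g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 1 2 3 4 */
      _mm_storeu_ps(out, x2v);
      printf("x2v = %g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 2 3 4 5 */
      _mm_storeu_ps(out, x3v);
      printf("x3v = %g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 3 4 5 6 */
      return 0;
   }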