/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
/**
   @file pitch_sse.h
   @brief Pitch analysis
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
michael@0: */ michael@0: michael@0: #ifndef PITCH_SSE_H michael@0: #define PITCH_SSE_H michael@0: michael@0: #include michael@0: #include "arch.h" michael@0: michael@0: #define OVERRIDE_XCORR_KERNEL michael@0: static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) michael@0: { michael@0: int j; michael@0: __m128 xsum1, xsum2; michael@0: xsum1 = _mm_loadu_ps(sum); michael@0: xsum2 = _mm_setzero_ps(); michael@0: michael@0: for (j = 0; j < len-3; j += 4) michael@0: { michael@0: __m128 x0 = _mm_loadu_ps(x+j); michael@0: __m128 yj = _mm_loadu_ps(y+j); michael@0: __m128 y3 = _mm_loadu_ps(y+j+3); michael@0: michael@0: xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); michael@0: xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), michael@0: _mm_shuffle_ps(yj,y3,0x49))); michael@0: xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), michael@0: _mm_shuffle_ps(yj,y3,0x9e))); michael@0: xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); michael@0: } michael@0: if (j < len) michael@0: { michael@0: xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); michael@0: if (++j < len) michael@0: { michael@0: xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); michael@0: if (++j < len) michael@0: { michael@0: xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); michael@0: } michael@0: } michael@0: } michael@0: _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); michael@0: } michael@0: michael@0: #define OVERRIDE_DUAL_INNER_PROD michael@0: static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, michael@0: int N, opus_val32 *xy1, opus_val32 *xy2) michael@0: { michael@0: int i; michael@0: __m128 xsum1, xsum2; michael@0: xsum1 = _mm_setzero_ps(); michael@0: xsum2 = _mm_setzero_ps(); michael@0: for (i=0;i