media/libopus/celt/x86/pitch_sse.h

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 22 Jan 2015 13:21:57 +0100
branch       TOR_BUG_9701
changeset    15:b8a032363ba2
permissions  -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
/**
   @file pitch_sse.h
   @brief Pitch analysis
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef PITCH_SSE_H
#define PITCH_SSE_H

#include <xmmintrin.h>
#include "arch.h"

#define OVERRIDE_XCORR_KERNEL
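/* SSE cross-correlation kernel: for each of the four lags k = 0..3, this
   accumulates sum[k] += x[j]*y[j+k] over j = 0..len-1, processing four
   samples of x per iteration and handling up to three leftover samples. */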
static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   __m128 xsum1, xsum2;
   xsum1 = _mm_loadu_ps(sum);
   xsum2 = _mm_setzero_ps();

   for (j = 0; j < len-3; j += 4)
   {
      __m128 x0 = _mm_loadu_ps(x+j);
      __m128 yj = _mm_loadu_ps(y+j);
      __m128 y3 = _mm_loadu_ps(y+j+3);

      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
                                          _mm_shuffle_ps(yj,y3,0x49)));
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
                                          _mm_shuffle_ps(yj,y3,0x9e)));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
   }
   if (j < len)
   {
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      if (++j < len)
      {
         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         if (++j < len)
         {
            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         }
      }
   }
   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}

#define OVERRIDE_DUAL_INNER_PROD
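/* Computes two inner products over the same N-sample vector x in a single
   pass: *xy1 = <x, y01> and *xy2 = <x, y02>. */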
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}

#define OVERRIDE_COMB_FILTER_CONST
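/* Vectorised comb filter with constant gains: for each sample,
   y[i] = x[i] + g10*x[i-T] + g11*(x[i-T+1] + x[i-T-1])
               + g12*(x[i-T+2] + x[i-T-2]),
   processed four samples at a time around the pitch lag T. */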
static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
   int i;
   __m128 x0v;
   __m128 g10v, g11v, g12v;
   g10v = _mm_load1_ps(&g10);
   g11v = _mm_load1_ps(&g11);
   g12v = _mm_load1_ps(&g12);
   x0v = _mm_loadu_ps(&x[-T-2]);
   for (i=0;i<N-3;i+=4)
   {
      __m128 yi, yi2, x1v, x2v, x3v, x4v;
      const opus_val32 *xp = &x[i-T-2];
      yi = _mm_loadu_ps(x+i);
      x4v = _mm_loadu_ps(xp+4);
#if 0
      /* Slower version with all loads */
      x1v = _mm_loadu_ps(xp+1);
      x2v = _mm_loadu_ps(xp+2);
      x3v = _mm_loadu_ps(xp+3);
#else
      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif

      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
      /* Use partial sums */
      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
      yi = _mm_add_ps(yi, yi2);
#endif
      x0v=x4v;
      _mm_storeu_ps(y+i, yi);
   }
#ifdef CUSTOM_MODES
   for (;i<N;i++)
   {
      y[i] = x[i]
             + MULT16_32_Q15(g10,x[i-T])
             + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
             + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
   }
#endif
}

#endif
