Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tag tor-browser-31.3.0esr-4.5-1-build1
(revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f) for hacking purposes.
/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
/**
   @file pitch_sse.h
   @brief Pitch analysis
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef PITCH_SSE_H
#define PITCH_SSE_H

#include <xmmintrin.h>
#include "arch.h"
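
/* This header supplies SSE replacements for three hot loops in the pitch
   analysis code. <xmmintrin.h> is SSE1 (single-precision only), so these
   versions apply to the float build, where opus_val16/opus_val32 are plain
   floats. Each OVERRIDE_* macro below signals the generic code to use the
   intrinsic version defined here instead of its portable C fallback; that
   reading of the OVERRIDE_* convention is inferred, not stated in this
   file itself. */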
#define OVERRIDE_XCORR_KERNEL
static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   __m128 xsum1, xsum2;
   xsum1 = _mm_loadu_ps(sum);
   xsum2 = _mm_setzero_ps();

   /* Main loop: consume four x samples per iteration, splatting each one
      across a vector and multiplying it by the matching 4-wide window of y.
      Two accumulators keep the adds in two independent dependency chains. */
   for (j = 0; j < len-3; j += 4)
   {
      __m128 x0 = _mm_loadu_ps(x+j);
      __m128 yj = _mm_loadu_ps(y+j);
      __m128 y3 = _mm_loadu_ps(y+j+3);

      /* The yj/y3 shuffles (0x49, 0x9e) synthesize the shifted windows
         y[j+1..j+4] and y[j+2..j+5] without extra unaligned loads. */
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
                                          _mm_shuffle_ps(yj,y3,0x49)));
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
                                          _mm_shuffle_ps(yj,y3,0x9e)));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
   }
   /* Handle the up-to-three remaining samples one at a time. */
   if (j < len)
   {
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      if (++j < len)
      {
         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         if (++j < len)
         {
            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         }
      }
   }
   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}
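
/* For reference, a minimal scalar sketch of what the kernel above computes,
   assuming the float build. This illustrates the math only; it is not the
   library's own C fallback, and the name xcorr_kernel_scalar is hypothetical. */
#if 0
static void xcorr_kernel_scalar(const opus_val16 *x, const opus_val16 *y,
                                opus_val32 sum[4], int len)
{
   int j;
   for (j=0;j<len;j++)
   {
      /* Each sample of x feeds four correlation lags at once. */
      sum[0] += x[j]*y[j];
      sum[1] += x[j]*y[j+1];
      sum[2] += x[j]*y[j+2];
      sum[3] += x[j]*y[j+3];
   }
}
#endif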

#define OVERRIDE_DUAL_INNER_PROD
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   /* Compute both dot products in one pass so each load of x is shared. */
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum: fold the upper half onto the lower half, then lane 1
      onto lane 0. */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
   /* Scalar tail for the last N%4 samples. */
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}
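
/* For reference, the scalar computation the function above vectorizes,
   again assuming the float build (where MAC16_16 reduces to a plain
   multiply-add). Illustration only; the name and guard are mine, and the
   SSE version is not bit-exact with this because it sums in a different
   order. */
#if 0
static void dual_inner_prod_scalar(const opus_val16 *x, const opus_val16 *y01,
      const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   opus_val32 s1 = 0, s2 = 0;
   for (i=0;i<N;i++)
   {
      s1 += x[i]*y01[i];   /* dot(x, y01) */
      s2 += x[i]*y02[i];   /* dot(x, y02) */
   }
   *xy1 = s1;
   *xy2 = s2;
}
#endif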

#define OVERRIDE_COMB_FILTER_CONST
static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
   int i;
   __m128 x0v;
   __m128 g10v, g11v, g12v;
   g10v = _mm_load1_ps(&g10);
   g11v = _mm_load1_ps(&g11);
   g12v = _mm_load1_ps(&g12);
   /* x0v holds the window x[i-T-2..i-T+1]; it is carried across iterations
      so only one new load (x4v) is needed per loop. */
   x0v = _mm_loadu_ps(&x[-T-2]);
   for (i=0;i<N-3;i+=4)
   {
      __m128 yi, yi2, x1v, x2v, x3v, x4v;
      const opus_val32 *xp = &x[i-T-2];
      yi = _mm_loadu_ps(x+i);
      x4v = _mm_loadu_ps(xp+4);
#if 0
      /* Slower version with all loads */
      x1v = _mm_loadu_ps(xp+1);
      x2v = _mm_loadu_ps(xp+2);
      x3v = _mm_loadu_ps(xp+3);
#else
      /* Build the three intermediate shifted windows from x0v and x4v with
         shuffles instead of unaligned loads. */
      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif

      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
      /* Use partial sums */
      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
      yi = _mm_add_ps(yi, yi2);
#endif
      x0v=x4v;
      _mm_storeu_ps(y+i, yi);
   }
#ifdef CUSTOM_MODES
   /* Scalar tail, needed only when N is not a multiple of 4 (which the
      CUSTOM_MODES guard suggests cannot happen with the standard modes). */
   for (;i<N;i++)
   {
      y[i] = x[i]
             + MULT16_32_Q15(g10,x[i-T])
             + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
             + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
   }
#endif
}
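
/* For reference: per sample, the filter above computes (in the float build,
   where MULT16_32_Q15 and ADD32 reduce to plain arithmetic)

      y[i] = x[i] + g10*x[i-T]
                  + g11*(x[i-T-1] + x[i-T+1])
                  + g12*(x[i-T-2] + x[i-T+2])

   i.e. a comb filter at pitch lag T with a small symmetric smoothing
   kernel. A scalar sketch under the same assumption (name and guard are
   mine): */
#if 0
static void comb_filter_const_scalar(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
   int i;
   for (i=0;i<N;i++)
   {
      y[i] = x[i] + g10*x[i-T]
                  + g11*(x[i-T-1]+x[i-T+1])
                  + g12*(x[i-T-2]+x[i-T+2]);
   }
}
#endif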

#endif