/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
/**
   @file pitch_sse.h
   @brief Pitch analysis
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef PITCH_SSE_H
#define PITCH_SSE_H

#include <xmmintrin.h>
#include "arch.h"

#define OVERRIDE_XCORR_KERNEL
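/* Accumulate four shifted cross-correlations at once:
   sum[k] += sum_j x[j]*y[j+k] for k = 0..3. This SSE path assumes a
   float build, where opus_val16 and opus_val32 are both float, so the
   _ps intrinsics can load these pointers directly. */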
static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   __m128 xsum1, xsum2;
   xsum1 = _mm_loadu_ps(sum);
   xsum2 = _mm_setzero_ps();

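   /* Main loop: four samples of x per iteration. Each x[j+k] is
      broadcast across a register and multiplied by the shifted vector
      (y[j+k], ..., y[j+k+3]); the shifted vectors are built from two
      loads plus shuffles rather than four unaligned loads. Two
      accumulators are used, presumably to shorten the dependency
      chain of the adds. */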
   for (j = 0; j < len-3; j += 4)
   {
      __m128 x0 = _mm_loadu_ps(x+j);
      __m128 yj = _mm_loadu_ps(y+j);
      __m128 y3 = _mm_loadu_ps(y+j+3);

      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
                                          _mm_shuffle_ps(yj,y3,0x49)));
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
                                          _mm_shuffle_ps(yj,y3,0x9e)));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
   }
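   /* Handle the final 1-3 samples when len is not a multiple of 4. */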
   if (j < len)
   {
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      if (++j < len)
      {
         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         if (++j < len)
         {
            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         }
      }
   }
   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}

#define OVERRIDE_DUAL_INNER_PROD
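/* Compute two inner products against the same x in a single pass:
   *xy1 = <x, y01> and *xy2 = <x, y02>, each over N samples. */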
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum: fold each accumulator's four lanes down to one
      scalar (high pair added onto low pair, then lane 1 onto lane 0). */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
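   /* Scalar tail: fold in the last N%4 terms (MAC16_16 is a plain
      multiply-accumulate in float builds). */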
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}

#define OVERRIDE_COMB_FILTER_CONST
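/* Comb filter with constant gains:
   y[i] = x[i] + g10*x[i-T] + g11*(x[i-T+1] + x[i-T-1])
               + g12*(x[i-T+2] + x[i-T-2])
   for 0 <= i < N; x must be readable back to x[-T-2]. */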
static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
   int i;
   __m128 x0v;
   __m128 g10v, g11v, g12v;
   g10v = _mm_load1_ps(&g10);
   g11v = _mm_load1_ps(&g11);
   g12v = _mm_load1_ps(&g12);
   x0v = _mm_loadu_ps(&x[-T-2]);
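   /* Four outputs per iteration. Only x0v (carried over from the
      previous iteration) and x4v are loaded from memory; the
      intermediate delay-line vectors are reconstructed with shuffles
      below. */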
   for (i=0;i<N-3;i+=4)
   {
      __m128 yi, yi2, x1v, x2v, x3v, x4v;
      const opus_val32 *xp = &x[i-T-2];
      yi = _mm_loadu_ps(x+i);
      x4v = _mm_loadu_ps(xp+4);
#if 0
      /* Slower version with all loads */
      x1v = _mm_loadu_ps(xp+1);
      x2v = _mm_loadu_ps(xp+2);
      x3v = _mm_loadu_ps(xp+3);
#else
      /* x2v = xp[2..5], x1v = xp[1..4], x3v = xp[3..6] */
      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif

      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
      /* Use partial sums */
      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
      yi = _mm_add_ps(yi, yi2);
#endif
      x0v=x4v;
      _mm_storeu_ps(y+i, yi);
   }
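   /* Scalar tail, only compiled for custom modes: standard Opus frame
      sizes presumably keep N a multiple of 4, so the vector loop above
      already covers the whole buffer. */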
#ifdef CUSTOM_MODES
   for (;i<N;i++)
   {
      y[i] = x[i]
             + MULT16_32_Q15(g10,x[i-T])
             + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
             + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
   }
#endif
}

#endif /* PITCH_SSE_H */