|
1 //////////////////////////////////////////////////////////////////////////////// |
|
2 /// |
|
3 /// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE |
|
4 /// optimized functions have been gathered into this single source |
|
5 /// code file, regardless to their class or original source code file, in order |
|
6 /// to ease porting the library to other compiler and processor platforms. |
|
7 /// |
|
8 /// The SSE-optimizations are programmed using SSE compiler intrinsics that |
|
9 /// are supported both by Microsoft Visual C++ and GCC compilers, so this file |
|
10 /// should compile with both toolsets. |
|
11 /// |
|
12 /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ |
|
13 /// 6.0 processor pack" update to support SSE instruction set. The update is |
|
14 /// available for download at Microsoft Developers Network, see here: |
|
15 /// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx |
|
16 /// |
|
17 /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and |
|
18 /// perform a search with keywords "processor pack". |
|
19 /// |
|
20 /// Author : Copyright (c) Olli Parviainen |
|
21 /// Author e-mail : oparviai 'at' iki.fi |
|
22 /// SoundTouch WWW: http://www.surina.net/soundtouch |
|
23 /// |
|
24 //////////////////////////////////////////////////////////////////////////////// |
|
25 // |
|
26 // Last changed : $Date: 2014-01-07 12:25:40 -0600 (Tue, 07 Jan 2014) $ |
|
27 // File revision : $Revision: 4 $ |
|
28 // |
|
29 // $Id: sse_optimized.cpp 184 2014-01-07 18:25:40Z oparviai $ |
|
30 // |
|
31 //////////////////////////////////////////////////////////////////////////////// |
|
32 // |
|
33 // License : |
|
34 // |
|
35 // SoundTouch audio processing library |
|
36 // Copyright (c) Olli Parviainen |
|
37 // |
|
38 // This library is free software; you can redistribute it and/or |
|
39 // modify it under the terms of the GNU Lesser General Public |
|
40 // License as published by the Free Software Foundation; either |
|
41 // version 2.1 of the License, or (at your option) any later version. |
|
42 // |
|
43 // This library is distributed in the hope that it will be useful, |
|
44 // but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
45 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
46 // Lesser General Public License for more details. |
|
47 // |
|
48 // You should have received a copy of the GNU Lesser General Public |
|
49 // License along with this library; if not, write to the Free Software |
|
50 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
51 // |
|
52 //////////////////////////////////////////////////////////////////////////////// |
|
53 |
|
54 #include "cpu_detect.h" |
|
55 #include "STTypes.h" |
|
56 |
|
57 using namespace soundtouch; |
|
58 |
|
59 #ifdef SOUNDTOUCH_ALLOW_SSE |
|
60 |
|
61 // SSE routines available only with float sample type |
|
62 |
|
63 ////////////////////////////////////////////////////////////////////////////// |
|
64 // |
|
65 // implementation of SSE optimized functions of class 'TDStretchSSE' |
|
66 // |
|
67 ////////////////////////////////////////////////////////////////////////////// |
|
68 |
|
69 #include "TDStretch.h" |
|
70 #include <xmmintrin.h> |
|
71 #include <math.h> |
|
72 |
|
73 // Calculates cross correlation of two buffers |
|
// Calculates cross correlation of two buffers.
//
// Computes the dot product of 'pV1' and 'pV2' over channels * overlapLength
// samples using SSE, four floats at a time, while simultaneously accumulating
// the sum of squares of 'pV1' into 'norm'. Returns the correlation value
// normalized by sqrt(norm).
//
// Parameters:
//   pV1  - first sample buffer; its alignment requirement depends on the
//          SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION setting (see below)
//   pV2  - second sample buffer; _must_ be aligned to a 16-byte boundary
//   norm - output parameter: receives the sum of squares of 'pV1' over the
//          compared range
double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &norm) const
{
    int i;
    const float *pVec1;
    const __m128 *pVec2;
    __m128 vSum, vNorm;

    // Note. It means a major slow-down if the routine needs to tolerate
    // unaligned __m128 memory accesses. It's way faster if we can skip
    // unaligned slots and use _mm_load_ps instruction instead of _mm_loadu_ps.
    // This can mean up to ~ 10-fold difference (incl. part of which is
    // due to skipping every second round for stereo sound though).
    //
    // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
    // for choosing if this little cheating is allowed.

#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
    // Little cheating allowed, return valid correlation only for
    // aligned locations, meaning every second round for stereo sound.

    #define _MM_LOAD    _mm_load_ps

    // Return a huge negative value for unaligned locations so the caller's
    // best-correlation search will never pick them.
    if (((ulongptr)pV1) & 15) return -1e50;    // skip unaligned locations

#else
    // No cheating allowed, use unaligned load & take the resulting
    // performance hit.
    #define _MM_LOAD    _mm_loadu_ps
#endif

    // ensure overlapLength is divisible by 8 (required by the 16-sample
    // loop unrolling below)
    assert((overlapLength % 8) == 0);

    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors.
    // Note: pV2 _must_ be aligned to 16-byte boundary, pV1 need not.
    pVec1 = (const float*)pV1;
    pVec2 = (const __m128*)pV2;
    vSum = vNorm = _mm_setzero_ps();

    // Unroll the loop by factor of 4 * 4 operations. Use same routine for
    // stereo & mono, for mono it just means twice the amount of unrolling.
    for (i = 0; i < channels * overlapLength / 16; i ++)
    {
        __m128 vTemp;
        // vSum  += pV1[0..3] * pV2[0..3]
        // vNorm += pV1[0..3] * pV1[0..3]
        vTemp = _MM_LOAD(pVec1);
        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp ,pVec2[0]));
        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));

        // vSum += pV1[4..7] * pV2[4..7]
        vTemp = _MM_LOAD(pVec1 + 4);
        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1]));
        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));

        // vSum += pV1[8..11] * pV2[8..11]
        vTemp = _MM_LOAD(pVec1 + 8);
        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2]));
        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));

        // vSum += pV1[12..15] * pV2[12..15]
        vTemp = _MM_LOAD(pVec1 + 12);
        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3]));
        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));

        pVec1 += 16;
        pVec2 += 4;
    }

    // Horizontal sum: norm = vNorm[0] + vNorm[1] + vNorm[2] + vNorm[3]
    float *pvNorm = (float*)&vNorm;
    norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);

    // return value = (vSum[0] + vSum[1] + vSum[2] + vSum[3]) / sqrt(norm);
    // the "norm < 1e-9" check avoids division by (near) zero for silent input
    float *pvSum = (float*)&vSum;
    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / sqrt(norm < 1e-9 ? 1.0 : norm);

    /* This is approximately corresponding routine in C-language yet without normalization:
    double corr, norm;
    uint i, j;

    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
    corr = norm = 0.0;
    for (i = 0; i < channels * overlapLength / 16; i ++)
    {
        corr += pV1[0] * pV2[0] +
                pV1[1] * pV2[1] +
                pV1[2] * pV2[2] +
                pV1[3] * pV2[3] +
                pV1[4] * pV2[4] +
                pV1[5] * pV2[5] +
                pV1[6] * pV2[6] +
                pV1[7] * pV2[7] +
                pV1[8] * pV2[8] +
                pV1[9] * pV2[9] +
                pV1[10] * pV2[10] +
                pV1[11] * pV2[11] +
                pV1[12] * pV2[12] +
                pV1[13] * pV2[13] +
                pV1[14] * pV2[14] +
                pV1[15] * pV2[15];

        for (j = 0; j < 16; j ++) norm += pV1[j] * pV1[j];

        pV1 += 16;
        pV2 += 16;
    }
    return corr / sqrt(norm);
    */
}
|
182 |
|
183 |
|
184 |
|
185 double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm) const |
|
186 { |
|
187 // call usual calcCrossCorr function because SSE does not show big benefit of |
|
188 // accumulating "norm" value, and also the "norm" rolling algorithm would get |
|
189 // complicated due to SSE-specific alignment-vs-nonexact correlation rules. |
|
190 return calcCrossCorr(pV1, pV2, norm); |
|
191 } |
|
192 |
|
193 |
|
194 ////////////////////////////////////////////////////////////////////////////// |
|
195 // |
|
196 // implementation of SSE optimized functions of class 'FIRFilter' |
|
197 // |
|
198 ////////////////////////////////////////////////////////////////////////////// |
|
199 |
|
200 #include "FIRFilter.h" |
|
201 |
|
202 FIRFilterSSE::FIRFilterSSE() : FIRFilter() |
|
203 { |
|
204 filterCoeffsAlign = NULL; |
|
205 filterCoeffsUnalign = NULL; |
|
206 } |
|
207 |
|
208 |
|
209 FIRFilterSSE::~FIRFilterSSE() |
|
210 { |
|
211 delete[] filterCoeffsUnalign; |
|
212 filterCoeffsAlign = NULL; |
|
213 filterCoeffsUnalign = NULL; |
|
214 } |
|
215 |
|
216 |
|
217 // (overloaded) Calculates filter coefficients for SSE routine |
|
218 void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor) |
|
219 { |
|
220 uint i; |
|
221 float fDivider; |
|
222 |
|
223 FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor); |
|
224 |
|
225 // Scale the filter coefficients so that it won't be necessary to scale the filtering result |
|
226 // also rearrange coefficients suitably for SSE |
|
227 // Ensure that filter coeffs array is aligned to 16-byte boundary |
|
228 delete[] filterCoeffsUnalign; |
|
229 filterCoeffsUnalign = new float[2 * newLength + 4]; |
|
230 filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign); |
|
231 |
|
232 fDivider = (float)resultDivider; |
|
233 |
|
234 // rearrange the filter coefficients for mmx routines |
|
235 for (i = 0; i < newLength; i ++) |
|
236 { |
|
237 filterCoeffsAlign[2 * i + 0] = |
|
238 filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider; |
|
239 } |
|
240 } |
|
241 |
|
242 |
|
243 |
|
244 // SSE-optimized version of the filter routine for stereo sound |
|
245 uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const |
|
246 { |
|
247 int count = (int)((numSamples - length) & (uint)-2); |
|
248 int j; |
|
249 |
|
250 assert(count % 2 == 0); |
|
251 |
|
252 if (count < 2) return 0; |
|
253 |
|
254 assert(source != NULL); |
|
255 assert(dest != NULL); |
|
256 assert((length % 8) == 0); |
|
257 assert(filterCoeffsAlign != NULL); |
|
258 assert(((ulongptr)filterCoeffsAlign) % 16 == 0); |
|
259 |
|
260 // filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2' |
|
261 for (j = 0; j < count; j += 2) |
|
262 { |
|
263 const float *pSrc; |
|
264 const __m128 *pFil; |
|
265 __m128 sum1, sum2; |
|
266 uint i; |
|
267 |
|
268 pSrc = (const float*)source; // source audio data |
|
269 pFil = (const __m128*)filterCoeffsAlign; // filter coefficients. NOTE: Assumes coefficients |
|
270 // are aligned to 16-byte boundary |
|
271 sum1 = sum2 = _mm_setzero_ps(); |
|
272 |
|
273 for (i = 0; i < length / 8; i ++) |
|
274 { |
|
275 // Unroll loop for efficiency & calculate filter for 2*2 stereo samples |
|
276 // at each pass |
|
277 |
|
278 // sum1 is accu for 2*2 filtered stereo sound data at the primary sound data offset |
|
279 // sum2 is accu for 2*2 filtered stereo sound data for the next sound sample offset. |
|
280 |
|
281 sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc) , pFil[0])); |
|
282 sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2), pFil[0])); |
|
283 |
|
284 sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4), pFil[1])); |
|
285 sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6), pFil[1])); |
|
286 |
|
287 sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8) , pFil[2])); |
|
288 sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2])); |
|
289 |
|
290 sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3])); |
|
291 sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3])); |
|
292 |
|
293 pSrc += 16; |
|
294 pFil += 4; |
|
295 } |
|
296 |
|
297 // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need |
|
298 // to sum the two hi- and lo-floats of these registers together. |
|
299 |
|
300 // post-shuffle & add the filtered values and store to dest. |
|
301 _mm_storeu_ps(dest, _mm_add_ps( |
|
302 _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)), // s2_1 s2_0 s1_3 s1_2 |
|
303 _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0)) // s2_3 s2_2 s1_1 s1_0 |
|
304 )); |
|
305 source += 4; |
|
306 dest += 4; |
|
307 } |
|
308 |
|
309 // Ideas for further improvement: |
|
310 // 1. If it could be guaranteed that 'source' were always aligned to 16-byte |
|
311 // boundary, a faster aligned '_mm_load_ps' instruction could be used. |
|
312 // 2. If it could be guaranteed that 'dest' were always aligned to 16-byte |
|
313 // boundary, a faster '_mm_store_ps' instruction could be used. |
|
314 |
|
315 return (uint)count; |
|
316 |
|
317 /* original routine in C-language. please notice the C-version has differently |
|
318 organized coefficients though. |
|
319 double suml1, suml2; |
|
320 double sumr1, sumr2; |
|
321 uint i, j; |
|
322 |
|
323 for (j = 0; j < count; j += 2) |
|
324 { |
|
325 const float *ptr; |
|
326 const float *pFil; |
|
327 |
|
328 suml1 = sumr1 = 0.0; |
|
329 suml2 = sumr2 = 0.0; |
|
330 ptr = src; |
|
331 pFil = filterCoeffs; |
|
332 for (i = 0; i < lengthLocal; i ++) |
|
333 { |
|
334 // unroll loop for efficiency. |
|
335 |
|
336 suml1 += ptr[0] * pFil[0] + |
|
337 ptr[2] * pFil[2] + |
|
338 ptr[4] * pFil[4] + |
|
339 ptr[6] * pFil[6]; |
|
340 |
|
341 sumr1 += ptr[1] * pFil[1] + |
|
342 ptr[3] * pFil[3] + |
|
343 ptr[5] * pFil[5] + |
|
344 ptr[7] * pFil[7]; |
|
345 |
|
346 suml2 += ptr[8] * pFil[0] + |
|
347 ptr[10] * pFil[2] + |
|
348 ptr[12] * pFil[4] + |
|
349 ptr[14] * pFil[6]; |
|
350 |
|
351 sumr2 += ptr[9] * pFil[1] + |
|
352 ptr[11] * pFil[3] + |
|
353 ptr[13] * pFil[5] + |
|
354 ptr[15] * pFil[7]; |
|
355 |
|
356 ptr += 16; |
|
357 pFil += 8; |
|
358 } |
|
359 dest[0] = (float)suml1; |
|
360 dest[1] = (float)sumr1; |
|
361 dest[2] = (float)suml2; |
|
362 dest[3] = (float)sumr2; |
|
363 |
|
364 src += 4; |
|
365 dest += 4; |
|
366 } |
|
367 */ |
|
368 } |
|
369 |
|
370 #endif // SOUNDTOUCH_ALLOW_SSE |