Tue, 06 Jan 2015 21:39:09 +0100
Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
michael@0 | 1 | //////////////////////////////////////////////////////////////////////////////// |
michael@0 | 2 | /// |
michael@0 | 3 | /// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE |
michael@0 | 4 | /// optimized functions have been gathered into this single source |
michael@0 | 5 | /// code file, regardless to their class or original source code file, in order |
michael@0 | 6 | /// to ease porting the library to other compiler and processor platforms. |
michael@0 | 7 | /// |
michael@0 | 8 | /// The SSE-optimizations are programmed using SSE compiler intrinsics that |
michael@0 | 9 | /// are supported both by Microsoft Visual C++ and GCC compilers, so this file |
michael@0 | 10 | /// should compile with both toolsets. |
michael@0 | 11 | /// |
michael@0 | 12 | /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ |
michael@0 | 13 | /// 6.0 processor pack" update to support SSE instruction set. The update is |
michael@0 | 14 | /// available for download at Microsoft Developers Network, see here: |
michael@0 | 15 | /// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx |
michael@0 | 16 | /// |
michael@0 | 17 | /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and |
michael@0 | 18 | /// perform a search with keywords "processor pack". |
michael@0 | 19 | /// |
michael@0 | 20 | /// Author : Copyright (c) Olli Parviainen |
michael@0 | 21 | /// Author e-mail : oparviai 'at' iki.fi |
michael@0 | 22 | /// SoundTouch WWW: http://www.surina.net/soundtouch |
michael@0 | 23 | /// |
michael@0 | 24 | //////////////////////////////////////////////////////////////////////////////// |
michael@0 | 25 | // |
michael@0 | 26 | // Last changed : $Date: 2014-01-07 12:25:40 -0600 (Tue, 07 Jan 2014) $ |
michael@0 | 27 | // File revision : $Revision: 4 $ |
michael@0 | 28 | // |
michael@0 | 29 | // $Id: sse_optimized.cpp 184 2014-01-07 18:25:40Z oparviai $ |
michael@0 | 30 | // |
michael@0 | 31 | //////////////////////////////////////////////////////////////////////////////// |
michael@0 | 32 | // |
michael@0 | 33 | // License : |
michael@0 | 34 | // |
michael@0 | 35 | // SoundTouch audio processing library |
michael@0 | 36 | // Copyright (c) Olli Parviainen |
michael@0 | 37 | // |
michael@0 | 38 | // This library is free software; you can redistribute it and/or |
michael@0 | 39 | // modify it under the terms of the GNU Lesser General Public |
michael@0 | 40 | // License as published by the Free Software Foundation; either |
michael@0 | 41 | // version 2.1 of the License, or (at your option) any later version. |
michael@0 | 42 | // |
michael@0 | 43 | // This library is distributed in the hope that it will be useful, |
michael@0 | 44 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
michael@0 | 45 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
michael@0 | 46 | // Lesser General Public License for more details. |
michael@0 | 47 | // |
michael@0 | 48 | // You should have received a copy of the GNU Lesser General Public |
michael@0 | 49 | // License along with this library; if not, write to the Free Software |
michael@0 | 50 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
michael@0 | 51 | // |
michael@0 | 52 | //////////////////////////////////////////////////////////////////////////////// |
michael@0 | 53 | |
michael@0 | 54 | #include "cpu_detect.h" |
michael@0 | 55 | #include "STTypes.h" |
michael@0 | 56 | |
michael@0 | 57 | using namespace soundtouch; |
michael@0 | 58 | |
michael@0 | 59 | #ifdef SOUNDTOUCH_ALLOW_SSE |
michael@0 | 60 | |
michael@0 | 61 | // SSE routines available only with float sample type |
michael@0 | 62 | |
michael@0 | 63 | ////////////////////////////////////////////////////////////////////////////// |
michael@0 | 64 | // |
michael@0 | 65 | // implementation of SSE optimized functions of class 'TDStretchSSE' |
michael@0 | 66 | // |
michael@0 | 67 | ////////////////////////////////////////////////////////////////////////////// |
michael@0 | 68 | |
michael@0 | 69 | #include "TDStretch.h" |
michael@0 | 70 | #include <xmmintrin.h> |
michael@0 | 71 | #include <math.h> |
michael@0 | 72 | |
michael@0 | 73 | // Calculates cross correlation of two buffers |
michael@0 | 74 | double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &norm) const |
michael@0 | 75 | { |
michael@0 | 76 | int i; |
michael@0 | 77 | const float *pVec1; |
michael@0 | 78 | const __m128 *pVec2; |
michael@0 | 79 | __m128 vSum, vNorm; |
michael@0 | 80 | |
michael@0 | 81 | // Note. It means a major slow-down if the routine needs to tolerate |
michael@0 | 82 | // unaligned __m128 memory accesses. It's way faster if we can skip |
michael@0 | 83 | // unaligned slots and use _mm_load_ps instruction instead of _mm_loadu_ps. |
michael@0 | 84 | // This can mean up to ~ 10-fold difference (incl. part of which is |
michael@0 | 85 | // due to skipping every second round for stereo sound though). |
michael@0 | 86 | // |
michael@0 | 87 | // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided |
michael@0 | 88 | // for choosing if this little cheating is allowed. |
michael@0 | 89 | |
michael@0 | 90 | #ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION |
michael@0 | 91 | // Little cheating allowed, return valid correlation only for |
michael@0 | 92 | // aligned locations, meaning every second round for stereo sound. |
michael@0 | 93 | |
michael@0 | 94 | #define _MM_LOAD _mm_load_ps |
michael@0 | 95 | |
michael@0 | 96 | if (((ulongptr)pV1) & 15) return -1e50; // skip unaligned locations |
michael@0 | 97 | |
michael@0 | 98 | #else |
michael@0 | 99 | // No cheating allowed, use unaligned load & take the resulting |
michael@0 | 100 | // performance hit. |
michael@0 | 101 | #define _MM_LOAD _mm_loadu_ps |
michael@0 | 102 | #endif |
michael@0 | 103 | |
michael@0 | 104 | // ensure overlapLength is divisible by 8 |
michael@0 | 105 | assert((overlapLength % 8) == 0); |
michael@0 | 106 | |
michael@0 | 107 | // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors |
michael@0 | 108 | // Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not. |
michael@0 | 109 | pVec1 = (const float*)pV1; |
michael@0 | 110 | pVec2 = (const __m128*)pV2; |
michael@0 | 111 | vSum = vNorm = _mm_setzero_ps(); |
michael@0 | 112 | |
michael@0 | 113 | // Unroll the loop by factor of 4 * 4 operations. Use same routine for |
michael@0 | 114 | // stereo & mono, for mono it just means twice the amount of unrolling. |
michael@0 | 115 | for (i = 0; i < channels * overlapLength / 16; i ++) |
michael@0 | 116 | { |
michael@0 | 117 | __m128 vTemp; |
michael@0 | 118 | // vSum += pV1[0..3] * pV2[0..3] |
michael@0 | 119 | vTemp = _MM_LOAD(pVec1); |
michael@0 | 120 | vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp ,pVec2[0])); |
michael@0 | 121 | vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); |
michael@0 | 122 | |
michael@0 | 123 | // vSum += pV1[4..7] * pV2[4..7] |
michael@0 | 124 | vTemp = _MM_LOAD(pVec1 + 4); |
michael@0 | 125 | vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1])); |
michael@0 | 126 | vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); |
michael@0 | 127 | |
michael@0 | 128 | // vSum += pV1[8..11] * pV2[8..11] |
michael@0 | 129 | vTemp = _MM_LOAD(pVec1 + 8); |
michael@0 | 130 | vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2])); |
michael@0 | 131 | vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); |
michael@0 | 132 | |
michael@0 | 133 | // vSum += pV1[12..15] * pV2[12..15] |
michael@0 | 134 | vTemp = _MM_LOAD(pVec1 + 12); |
michael@0 | 135 | vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3])); |
michael@0 | 136 | vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); |
michael@0 | 137 | |
michael@0 | 138 | pVec1 += 16; |
michael@0 | 139 | pVec2 += 4; |
michael@0 | 140 | } |
michael@0 | 141 | |
michael@0 | 142 | // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3] |
michael@0 | 143 | float *pvNorm = (float*)&vNorm; |
michael@0 | 144 | norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]); |
michael@0 | 145 | |
michael@0 | 146 | float *pvSum = (float*)&vSum; |
michael@0 | 147 | return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / sqrt(norm < 1e-9 ? 1.0 : norm); |
michael@0 | 148 | |
michael@0 | 149 | /* This is approximately corresponding routine in C-language yet without normalization: |
michael@0 | 150 | double corr, norm; |
michael@0 | 151 | uint i; |
michael@0 | 152 | |
michael@0 | 153 | // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors |
michael@0 | 154 | corr = norm = 0.0; |
michael@0 | 155 | for (i = 0; i < channels * overlapLength / 16; i ++) |
michael@0 | 156 | { |
michael@0 | 157 | corr += pV1[0] * pV2[0] + |
michael@0 | 158 | pV1[1] * pV2[1] + |
michael@0 | 159 | pV1[2] * pV2[2] + |
michael@0 | 160 | pV1[3] * pV2[3] + |
michael@0 | 161 | pV1[4] * pV2[4] + |
michael@0 | 162 | pV1[5] * pV2[5] + |
michael@0 | 163 | pV1[6] * pV2[6] + |
michael@0 | 164 | pV1[7] * pV2[7] + |
michael@0 | 165 | pV1[8] * pV2[8] + |
michael@0 | 166 | pV1[9] * pV2[9] + |
michael@0 | 167 | pV1[10] * pV2[10] + |
michael@0 | 168 | pV1[11] * pV2[11] + |
michael@0 | 169 | pV1[12] * pV2[12] + |
michael@0 | 170 | pV1[13] * pV2[13] + |
michael@0 | 171 | pV1[14] * pV2[14] + |
michael@0 | 172 | pV1[15] * pV2[15]; |
michael@0 | 173 | |
michael@0 | 174 | for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j]; |
michael@0 | 175 | |
michael@0 | 176 | pV1 += 16; |
michael@0 | 177 | pV2 += 16; |
michael@0 | 178 | } |
michael@0 | 179 | return corr / sqrt(norm); |
michael@0 | 180 | */ |
michael@0 | 181 | } |
michael@0 | 182 | |
michael@0 | 183 | |
michael@0 | 184 | |
michael@0 | 185 | double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm) const |
michael@0 | 186 | { |
michael@0 | 187 | // call usual calcCrossCorr function because SSE does not show big benefit of |
michael@0 | 188 | // accumulating "norm" value, and also the "norm" rolling algorithm would get |
michael@0 | 189 | // complicated due to SSE-specific alignment-vs-nonexact correlation rules. |
michael@0 | 190 | return calcCrossCorr(pV1, pV2, norm); |
michael@0 | 191 | } |
michael@0 | 192 | |
michael@0 | 193 | |
michael@0 | 194 | ////////////////////////////////////////////////////////////////////////////// |
michael@0 | 195 | // |
michael@0 | 196 | // implementation of SSE optimized functions of class 'FIRFilter' |
michael@0 | 197 | // |
michael@0 | 198 | ////////////////////////////////////////////////////////////////////////////// |
michael@0 | 199 | |
michael@0 | 200 | #include "FIRFilter.h" |
michael@0 | 201 | |
michael@0 | 202 | FIRFilterSSE::FIRFilterSSE() : FIRFilter() |
michael@0 | 203 | { |
michael@0 | 204 | filterCoeffsAlign = NULL; |
michael@0 | 205 | filterCoeffsUnalign = NULL; |
michael@0 | 206 | } |
michael@0 | 207 | |
michael@0 | 208 | |
michael@0 | 209 | FIRFilterSSE::~FIRFilterSSE() |
michael@0 | 210 | { |
michael@0 | 211 | delete[] filterCoeffsUnalign; |
michael@0 | 212 | filterCoeffsAlign = NULL; |
michael@0 | 213 | filterCoeffsUnalign = NULL; |
michael@0 | 214 | } |
michael@0 | 215 | |
michael@0 | 216 | |
michael@0 | 217 | // (overloaded) Calculates filter coefficients for SSE routine |
michael@0 | 218 | void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor) |
michael@0 | 219 | { |
michael@0 | 220 | uint i; |
michael@0 | 221 | float fDivider; |
michael@0 | 222 | |
michael@0 | 223 | FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor); |
michael@0 | 224 | |
michael@0 | 225 | // Scale the filter coefficients so that it won't be necessary to scale the filtering result |
michael@0 | 226 | // also rearrange coefficients suitably for SSE |
michael@0 | 227 | // Ensure that filter coeffs array is aligned to 16-byte boundary |
michael@0 | 228 | delete[] filterCoeffsUnalign; |
michael@0 | 229 | filterCoeffsUnalign = new float[2 * newLength + 4]; |
michael@0 | 230 | filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign); |
michael@0 | 231 | |
michael@0 | 232 | fDivider = (float)resultDivider; |
michael@0 | 233 | |
michael@0 | 234 | // rearrange the filter coefficients for mmx routines |
michael@0 | 235 | for (i = 0; i < newLength; i ++) |
michael@0 | 236 | { |
michael@0 | 237 | filterCoeffsAlign[2 * i + 0] = |
michael@0 | 238 | filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider; |
michael@0 | 239 | } |
michael@0 | 240 | } |
michael@0 | 241 | |
michael@0 | 242 | |
michael@0 | 243 | |
michael@0 | 244 | // SSE-optimized version of the filter routine for stereo sound |
michael@0 | 245 | uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const |
michael@0 | 246 | { |
michael@0 | 247 | int count = (int)((numSamples - length) & (uint)-2); |
michael@0 | 248 | int j; |
michael@0 | 249 | |
michael@0 | 250 | assert(count % 2 == 0); |
michael@0 | 251 | |
michael@0 | 252 | if (count < 2) return 0; |
michael@0 | 253 | |
michael@0 | 254 | assert(source != NULL); |
michael@0 | 255 | assert(dest != NULL); |
michael@0 | 256 | assert((length % 8) == 0); |
michael@0 | 257 | assert(filterCoeffsAlign != NULL); |
michael@0 | 258 | assert(((ulongptr)filterCoeffsAlign) % 16 == 0); |
michael@0 | 259 | |
michael@0 | 260 | // filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2' |
michael@0 | 261 | for (j = 0; j < count; j += 2) |
michael@0 | 262 | { |
michael@0 | 263 | const float *pSrc; |
michael@0 | 264 | const __m128 *pFil; |
michael@0 | 265 | __m128 sum1, sum2; |
michael@0 | 266 | uint i; |
michael@0 | 267 | |
michael@0 | 268 | pSrc = (const float*)source; // source audio data |
michael@0 | 269 | pFil = (const __m128*)filterCoeffsAlign; // filter coefficients. NOTE: Assumes coefficients |
michael@0 | 270 | // are aligned to 16-byte boundary |
michael@0 | 271 | sum1 = sum2 = _mm_setzero_ps(); |
michael@0 | 272 | |
michael@0 | 273 | for (i = 0; i < length / 8; i ++) |
michael@0 | 274 | { |
michael@0 | 275 | // Unroll loop for efficiency & calculate filter for 2*2 stereo samples |
michael@0 | 276 | // at each pass |
michael@0 | 277 | |
michael@0 | 278 | // sum1 is accu for 2*2 filtered stereo sound data at the primary sound data offset |
michael@0 | 279 | // sum2 is accu for 2*2 filtered stereo sound data for the next sound sample offset. |
michael@0 | 280 | |
michael@0 | 281 | sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc) , pFil[0])); |
michael@0 | 282 | sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2), pFil[0])); |
michael@0 | 283 | |
michael@0 | 284 | sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4), pFil[1])); |
michael@0 | 285 | sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6), pFil[1])); |
michael@0 | 286 | |
michael@0 | 287 | sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8) , pFil[2])); |
michael@0 | 288 | sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2])); |
michael@0 | 289 | |
michael@0 | 290 | sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3])); |
michael@0 | 291 | sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3])); |
michael@0 | 292 | |
michael@0 | 293 | pSrc += 16; |
michael@0 | 294 | pFil += 4; |
michael@0 | 295 | } |
michael@0 | 296 | |
michael@0 | 297 | // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need |
michael@0 | 298 | // to sum the two hi- and lo-floats of these registers together. |
michael@0 | 299 | |
michael@0 | 300 | // post-shuffle & add the filtered values and store to dest. |
michael@0 | 301 | _mm_storeu_ps(dest, _mm_add_ps( |
michael@0 | 302 | _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)), // s2_1 s2_0 s1_3 s1_2 |
michael@0 | 303 | _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0)) // s2_3 s2_2 s1_1 s1_0 |
michael@0 | 304 | )); |
michael@0 | 305 | source += 4; |
michael@0 | 306 | dest += 4; |
michael@0 | 307 | } |
michael@0 | 308 | |
michael@0 | 309 | // Ideas for further improvement: |
michael@0 | 310 | // 1. If it could be guaranteed that 'source' were always aligned to 16-byte |
michael@0 | 311 | // boundary, a faster aligned '_mm_load_ps' instruction could be used. |
michael@0 | 312 | // 2. If it could be guaranteed that 'dest' were always aligned to 16-byte |
michael@0 | 313 | // boundary, a faster '_mm_store_ps' instruction could be used. |
michael@0 | 314 | |
michael@0 | 315 | return (uint)count; |
michael@0 | 316 | |
michael@0 | 317 | /* original routine in C-language. please notice the C-version has differently |
michael@0 | 318 | organized coefficients though. |
michael@0 | 319 | double suml1, suml2; |
michael@0 | 320 | double sumr1, sumr2; |
michael@0 | 321 | uint i, j; |
michael@0 | 322 | |
michael@0 | 323 | for (j = 0; j < count; j += 2) |
michael@0 | 324 | { |
michael@0 | 325 | const float *ptr; |
michael@0 | 326 | const float *pFil; |
michael@0 | 327 | |
michael@0 | 328 | suml1 = sumr1 = 0.0; |
michael@0 | 329 | suml2 = sumr2 = 0.0; |
michael@0 | 330 | ptr = src; |
michael@0 | 331 | pFil = filterCoeffs; |
michael@0 | 332 | for (i = 0; i < lengthLocal; i ++) |
michael@0 | 333 | { |
michael@0 | 334 | // unroll loop for efficiency. |
michael@0 | 335 | |
michael@0 | 336 | suml1 += ptr[0] * pFil[0] + |
michael@0 | 337 | ptr[2] * pFil[2] + |
michael@0 | 338 | ptr[4] * pFil[4] + |
michael@0 | 339 | ptr[6] * pFil[6]; |
michael@0 | 340 | |
michael@0 | 341 | sumr1 += ptr[1] * pFil[1] + |
michael@0 | 342 | ptr[3] * pFil[3] + |
michael@0 | 343 | ptr[5] * pFil[5] + |
michael@0 | 344 | ptr[7] * pFil[7]; |
michael@0 | 345 | |
michael@0 | 346 | suml2 += ptr[8] * pFil[0] + |
michael@0 | 347 | ptr[10] * pFil[2] + |
michael@0 | 348 | ptr[12] * pFil[4] + |
michael@0 | 349 | ptr[14] * pFil[6]; |
michael@0 | 350 | |
michael@0 | 351 | sumr2 += ptr[9] * pFil[1] + |
michael@0 | 352 | ptr[11] * pFil[3] + |
michael@0 | 353 | ptr[13] * pFil[5] + |
michael@0 | 354 | ptr[15] * pFil[7]; |
michael@0 | 355 | |
michael@0 | 356 | ptr += 16; |
michael@0 | 357 | pFil += 8; |
michael@0 | 358 | } |
michael@0 | 359 | dest[0] = (float)suml1; |
michael@0 | 360 | dest[1] = (float)sumr1; |
michael@0 | 361 | dest[2] = (float)suml2; |
michael@0 | 362 | dest[3] = (float)sumr2; |
michael@0 | 363 | |
michael@0 | 364 | src += 4; |
michael@0 | 365 | dest += 4; |
michael@0 | 366 | } |
michael@0 | 367 | */ |
michael@0 | 368 | } |
michael@0 | 369 | |
michael@0 | 370 | #endif // SOUNDTOUCH_ALLOW_SSE |