media/libsoundtouch/src/sse_optimized.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libsoundtouch/src/sse_optimized.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,370 @@
     1.4 +////////////////////////////////////////////////////////////////////////////////
     1.5 +///
     1.6 +/// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE 
     1.7 +/// optimized functions have been gathered into this single source 
     1.8 +/// code file, regardless to their class or original source code file, in order 
     1.9 +/// to ease porting the library to other compiler and processor platforms.
    1.10 +///
    1.11 +/// The SSE-optimizations are programmed using SSE compiler intrinsics that
    1.12 +/// are supported both by Microsoft Visual C++ and GCC compilers, so this file
    1.13 +/// should compile with both toolsets.
    1.14 +///
    1.15 +/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
    1.16 +/// 6.0 processor pack" update to support SSE instruction set. The update is 
    1.17 +/// available for download at Microsoft Developers Network, see here:
    1.18 +/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
    1.19 +///
    1.20 +/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and 
    1.21 +/// perform a search with keywords "processor pack".
    1.22 +///
    1.23 +/// Author        : Copyright (c) Olli Parviainen
    1.24 +/// Author e-mail : oparviai 'at' iki.fi
    1.25 +/// SoundTouch WWW: http://www.surina.net/soundtouch
    1.26 +///
    1.27 +////////////////////////////////////////////////////////////////////////////////
    1.28 +//
    1.29 +// Last changed  : $Date: 2014-01-07 12:25:40 -0600 (Tue, 07 Jan 2014) $
    1.30 +// File revision : $Revision: 4 $
    1.31 +//
    1.32 +// $Id: sse_optimized.cpp 184 2014-01-07 18:25:40Z oparviai $
    1.33 +//
    1.34 +////////////////////////////////////////////////////////////////////////////////
    1.35 +//
    1.36 +// License :
    1.37 +//
    1.38 +//  SoundTouch audio processing library
    1.39 +//  Copyright (c) Olli Parviainen
    1.40 +//
    1.41 +//  This library is free software; you can redistribute it and/or
    1.42 +//  modify it under the terms of the GNU Lesser General Public
    1.43 +//  License as published by the Free Software Foundation; either
    1.44 +//  version 2.1 of the License, or (at your option) any later version.
    1.45 +//
    1.46 +//  This library is distributed in the hope that it will be useful,
    1.47 +//  but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.48 +//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.49 +//  Lesser General Public License for more details.
    1.50 +//
    1.51 +//  You should have received a copy of the GNU Lesser General Public
    1.52 +//  License along with this library; if not, write to the Free Software
    1.53 +//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    1.54 +//
    1.55 +////////////////////////////////////////////////////////////////////////////////
    1.56 +
    1.57 +#include "cpu_detect.h"
    1.58 +#include "STTypes.h"
    1.59 +
    1.60 +using namespace soundtouch;
    1.61 +
    1.62 +#ifdef SOUNDTOUCH_ALLOW_SSE
    1.63 +
    1.64 +// SSE routines available only with float sample type    
    1.65 +
    1.66 +//////////////////////////////////////////////////////////////////////////////
    1.67 +//
    1.68 +// implementation of SSE optimized functions of class 'TDStretchSSE'
    1.69 +//
    1.70 +//////////////////////////////////////////////////////////////////////////////
    1.71 +
    1.72 +#include "TDStretch.h"
    1.73 +#include <xmmintrin.h>
    1.74 +#include <math.h>
    1.75 +
    1.76 +// Calculates cross correlation of two buffers
    1.77 +double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &norm) const
    1.78 +{
    1.79 +    int i;
    1.80 +    const float *pVec1;
    1.81 +    const __m128 *pVec2;
    1.82 +    __m128 vSum, vNorm;
    1.83 +
    1.84 +    // Note. It means a major slow-down if the routine needs to tolerate 
    1.85 +    // unaligned __m128 memory accesses. It's way faster if we can skip 
    1.86 +    // unaligned slots and use _mm_load_ps instruction instead of _mm_loadu_ps.
    1.87 +    // This can mean up to ~ 10-fold difference (incl. part of which is
    1.88 +    // due to skipping every second round for stereo sound though).
    1.89 +    //
    1.90 +    // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
    1.91 +    // for choosing if this little cheating is allowed.
    1.92 +
    1.93 +#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
    1.94 +    // Little cheating allowed, return valid correlation only for 
    1.95 +    // aligned locations, meaning every second round for stereo sound.
    1.96 +
    1.97 +    #define _MM_LOAD    _mm_load_ps
    1.98 +
    1.99 +    if (((ulongptr)pV1) & 15) return -1e50;    // skip unaligned locations
   1.100 +
   1.101 +#else
   1.102 +    // No cheating allowed, use unaligned load & take the resulting
   1.103 +    // performance hit.
   1.104 +    #define _MM_LOAD    _mm_loadu_ps
   1.105 +#endif 
   1.106 +
   1.107 +    // ensure overlapLength is divisible by 8
   1.108 +    assert((overlapLength % 8) == 0);
   1.109 +
   1.110 +    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
   1.111 +    // Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not.
   1.112 +    pVec1 = (const float*)pV1;
   1.113 +    pVec2 = (const __m128*)pV2;
   1.114 +    vSum = vNorm = _mm_setzero_ps();
   1.115 +
   1.116 +    // Unroll the loop by factor of 4 * 4 operations. Use same routine for
   1.117 +    // stereo & mono, for mono it just means twice the amount of unrolling.
   1.118 +    for (i = 0; i < channels * overlapLength / 16; i ++) 
   1.119 +    {
   1.120 +        __m128 vTemp;
   1.121 +        // vSum += pV1[0..3] * pV2[0..3]
   1.122 +        vTemp = _MM_LOAD(pVec1);
   1.123 +        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp ,pVec2[0]));
   1.124 +        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
   1.125 +
   1.126 +        // vSum += pV1[4..7] * pV2[4..7]
   1.127 +        vTemp = _MM_LOAD(pVec1 + 4);
   1.128 +        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1]));
   1.129 +        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
   1.130 +
   1.131 +        // vSum += pV1[8..11] * pV2[8..11]
   1.132 +        vTemp = _MM_LOAD(pVec1 + 8);
   1.133 +        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2]));
   1.134 +        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
   1.135 +
   1.136 +        // vSum += pV1[12..15] * pV2[12..15]
   1.137 +        vTemp = _MM_LOAD(pVec1 + 12);
   1.138 +        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3]));
   1.139 +        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
   1.140 +
   1.141 +        pVec1 += 16;
   1.142 +        pVec2 += 4;
   1.143 +    }
   1.144 +
   1.145 +    // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
   1.146 +    float *pvNorm = (float*)&vNorm;
   1.147 +    norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
   1.148 +
   1.149 +    float *pvSum = (float*)&vSum;
   1.150 +    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / sqrt(norm < 1e-9 ? 1.0 : norm);
   1.151 +
   1.152 +    /* This is approximately corresponding routine in C-language yet without normalization:
   1.153 +    double corr, norm;
   1.154 +    uint i;
   1.155 +
   1.156 +    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
   1.157 +    corr = norm = 0.0;
   1.158 +    for (i = 0; i < channels * overlapLength / 16; i ++) 
   1.159 +    {
   1.160 +        corr += pV1[0] * pV2[0] +
   1.161 +                pV1[1] * pV2[1] +
   1.162 +                pV1[2] * pV2[2] +
   1.163 +                pV1[3] * pV2[3] +
   1.164 +                pV1[4] * pV2[4] +
   1.165 +                pV1[5] * pV2[5] +
   1.166 +                pV1[6] * pV2[6] +
   1.167 +                pV1[7] * pV2[7] +
   1.168 +                pV1[8] * pV2[8] +
   1.169 +                pV1[9] * pV2[9] +
   1.170 +                pV1[10] * pV2[10] +
   1.171 +                pV1[11] * pV2[11] +
   1.172 +                pV1[12] * pV2[12] +
   1.173 +                pV1[13] * pV2[13] +
   1.174 +                pV1[14] * pV2[14] +
   1.175 +                pV1[15] * pV2[15];
   1.176 +
   1.177 +    for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j];
   1.178 +
   1.179 +        pV1 += 16;
   1.180 +        pV2 += 16;
   1.181 +    }
   1.182 +    return corr / sqrt(norm);
   1.183 +    */
   1.184 +}
   1.185 +
   1.186 +
   1.187 +
   1.188 +double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm) const
   1.189 +{
   1.190 +    // call usual calcCrossCorr function because SSE does not show big benefit of 
   1.191 +    // accumulating "norm" value, and also the "norm" rolling algorithm would get 
   1.192 +    // complicated due to SSE-specific alignment-vs-nonexact correlation rules.
   1.193 +    return calcCrossCorr(pV1, pV2, norm);
   1.194 +}
   1.195 +
   1.196 +
   1.197 +//////////////////////////////////////////////////////////////////////////////
   1.198 +//
   1.199 +// implementation of SSE optimized functions of class 'FIRFilter'
   1.200 +//
   1.201 +//////////////////////////////////////////////////////////////////////////////
   1.202 +
   1.203 +#include "FIRFilter.h"
   1.204 +
   1.205 +FIRFilterSSE::FIRFilterSSE() : FIRFilter()
   1.206 +{
   1.207 +    filterCoeffsAlign = NULL;
   1.208 +    filterCoeffsUnalign = NULL;
   1.209 +}
   1.210 +
   1.211 +
   1.212 +FIRFilterSSE::~FIRFilterSSE()
   1.213 +{
   1.214 +    delete[] filterCoeffsUnalign;
   1.215 +    filterCoeffsAlign = NULL;
   1.216 +    filterCoeffsUnalign = NULL;
   1.217 +}
   1.218 +
   1.219 +
   1.220 +// (overloaded) Calculates filter coefficients for SSE routine
   1.221 +void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
   1.222 +{
   1.223 +    uint i;
   1.224 +    float fDivider;
   1.225 +
   1.226 +    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
   1.227 +
   1.228 +    // Scale the filter coefficients so that it won't be necessary to scale the filtering result
   1.229 +    // also rearrange coefficients suitably for SSE
   1.230 +    // Ensure that filter coeffs array is aligned to 16-byte boundary
   1.231 +    delete[] filterCoeffsUnalign;
   1.232 +    filterCoeffsUnalign = new float[2 * newLength + 4];
   1.233 +    filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
   1.234 +
   1.235 +    fDivider = (float)resultDivider;
   1.236 +
   1.237 +    // rearrange the filter coefficients for mmx routines 
   1.238 +    for (i = 0; i < newLength; i ++)
   1.239 +    {
   1.240 +        filterCoeffsAlign[2 * i + 0] =
   1.241 +        filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
   1.242 +    }
   1.243 +}
   1.244 +
   1.245 +
   1.246 +
   1.247 +// SSE-optimized version of the filter routine for stereo sound
   1.248 +uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const
   1.249 +{
   1.250 +    int count = (int)((numSamples - length) & (uint)-2);
   1.251 +    int j;
   1.252 +
   1.253 +    assert(count % 2 == 0);
   1.254 +
   1.255 +    if (count < 2) return 0;
   1.256 +
   1.257 +    assert(source != NULL);
   1.258 +    assert(dest != NULL);
   1.259 +    assert((length % 8) == 0);
   1.260 +    assert(filterCoeffsAlign != NULL);
   1.261 +    assert(((ulongptr)filterCoeffsAlign) % 16 == 0);
   1.262 +
   1.263 +    // filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2'
   1.264 +    for (j = 0; j < count; j += 2)
   1.265 +    {
   1.266 +        const float *pSrc;
   1.267 +        const __m128 *pFil;
   1.268 +        __m128 sum1, sum2;
   1.269 +        uint i;
   1.270 +
   1.271 +        pSrc = (const float*)source;              // source audio data
   1.272 +        pFil = (const __m128*)filterCoeffsAlign;  // filter coefficients. NOTE: Assumes coefficients 
   1.273 +                                                  // are aligned to 16-byte boundary
   1.274 +        sum1 = sum2 = _mm_setzero_ps();
   1.275 +
   1.276 +        for (i = 0; i < length / 8; i ++) 
   1.277 +        {
   1.278 +            // Unroll loop for efficiency & calculate filter for 2*2 stereo samples 
   1.279 +            // at each pass
   1.280 +
   1.281 +            // sum1 is accu for 2*2 filtered stereo sound data at the primary sound data offset
   1.282 +            // sum2 is accu for 2*2 filtered stereo sound data for the next sound sample offset.
   1.283 +
   1.284 +            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc)    , pFil[0]));
   1.285 +            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2), pFil[0]));
   1.286 +
   1.287 +            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4), pFil[1]));
   1.288 +            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6), pFil[1]));
   1.289 +
   1.290 +            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8) ,  pFil[2]));
   1.291 +            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2]));
   1.292 +
   1.293 +            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3]));
   1.294 +            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3]));
   1.295 +
   1.296 +            pSrc += 16;
   1.297 +            pFil += 4;
   1.298 +        }
   1.299 +
   1.300 +        // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need
   1.301 +        // to sum the two hi- and lo-floats of these registers together.
   1.302 +
   1.303 +        // post-shuffle & add the filtered values and store to dest.
   1.304 +        _mm_storeu_ps(dest, _mm_add_ps(
   1.305 +                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)),   // s2_1 s2_0 s1_3 s1_2
   1.306 +                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0))    // s2_3 s2_2 s1_1 s1_0
   1.307 +                    ));
   1.308 +        source += 4;
   1.309 +        dest += 4;
   1.310 +    }
   1.311 +
   1.312 +    // Ideas for further improvement:
   1.313 +    // 1. If it could be guaranteed that 'source' were always aligned to 16-byte 
   1.314 +    //    boundary, a faster aligned '_mm_load_ps' instruction could be used.
   1.315 +    // 2. If it could be guaranteed that 'dest' were always aligned to 16-byte 
   1.316 +    //    boundary, a faster '_mm_store_ps' instruction could be used.
   1.317 +
   1.318 +    return (uint)count;
   1.319 +
   1.320 +    /* original routine in C-language. please notice the C-version has differently 
   1.321 +       organized coefficients though.
   1.322 +    double suml1, suml2;
   1.323 +    double sumr1, sumr2;
   1.324 +    uint i, j;
   1.325 +
   1.326 +    for (j = 0; j < count; j += 2)
   1.327 +    {
   1.328 +        const float *ptr;
   1.329 +        const float *pFil;
   1.330 +
   1.331 +        suml1 = sumr1 = 0.0;
   1.332 +        suml2 = sumr2 = 0.0;
   1.333 +        ptr = src;
   1.334 +        pFil = filterCoeffs;
   1.335 +        for (i = 0; i < lengthLocal; i ++) 
   1.336 +        {
   1.337 +            // unroll loop for efficiency.
   1.338 +
   1.339 +            suml1 += ptr[0] * pFil[0] + 
   1.340 +                     ptr[2] * pFil[2] +
   1.341 +                     ptr[4] * pFil[4] +
   1.342 +                     ptr[6] * pFil[6];
   1.343 +
   1.344 +            sumr1 += ptr[1] * pFil[1] + 
   1.345 +                     ptr[3] * pFil[3] +
   1.346 +                     ptr[5] * pFil[5] +
   1.347 +                     ptr[7] * pFil[7];
   1.348 +
   1.349 +            suml2 += ptr[8] * pFil[0] + 
   1.350 +                     ptr[10] * pFil[2] +
   1.351 +                     ptr[12] * pFil[4] +
   1.352 +                     ptr[14] * pFil[6];
   1.353 +
   1.354 +            sumr2 += ptr[9] * pFil[1] + 
   1.355 +                     ptr[11] * pFil[3] +
   1.356 +                     ptr[13] * pFil[5] +
   1.357 +                     ptr[15] * pFil[7];
   1.358 +
   1.359 +            ptr += 16;
   1.360 +            pFil += 8;
   1.361 +        }
   1.362 +        dest[0] = (float)suml1;
   1.363 +        dest[1] = (float)sumr1;
   1.364 +        dest[2] = (float)suml2;
   1.365 +        dest[3] = (float)sumr2;
   1.366 +
   1.367 +        src += 4;
   1.368 +        dest += 4;
   1.369 +    }
   1.370 +    */
   1.371 +}
   1.372 +
   1.373 +#endif  // SOUNDTOUCH_ALLOW_SSE

mercurial