media/libsoundtouch/src/mmx_optimized.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libsoundtouch/src/mmx_optimized.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,384 @@
     1.4 +////////////////////////////////////////////////////////////////////////////////
     1.5 +///
     1.6 +/// MMX optimized routines. All MMX optimized functions have been gathered into 
     1.7 +/// this single source code file, regardless of their class or original source 
     1.8 +/// code file, in order to ease porting the library to other compilers and 
     1.9 +/// processor platforms.
    1.10 +///
    1.11 +/// The MMX-optimizations are programmed using MMX compiler intrinsics that
    1.12 +/// are supported both by Microsoft Visual C++ and GCC compilers, so this file
    1.13 +/// should compile with both toolsets.
    1.14 +///
    1.15 +/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
    1.16 +/// 6.0 processor pack" update to support compiler intrinsic syntax. The update
    1.17 +/// is available for download at Microsoft Developers Network, see here:
    1.18 +/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
    1.19 +///
    1.20 +/// Author        : Copyright (c) Olli Parviainen
    1.21 +/// Author e-mail : oparviai 'at' iki.fi
    1.22 +/// SoundTouch WWW: http://www.surina.net/soundtouch
    1.23 +///
    1.24 +////////////////////////////////////////////////////////////////////////////////
    1.25 +//
    1.26 +// Last changed  : $Date: 2014-01-07 12:25:40 -0600 (Tue, 07 Jan 2014) $
    1.27 +// File revision : $Revision: 4 $
    1.28 +//
    1.29 +// $Id: mmx_optimized.cpp 184 2014-01-07 18:25:40Z oparviai $
    1.30 +//
    1.31 +////////////////////////////////////////////////////////////////////////////////
    1.32 +//
    1.33 +// License :
    1.34 +//
    1.35 +//  SoundTouch audio processing library
    1.36 +//  Copyright (c) Olli Parviainen
    1.37 +//
    1.38 +//  This library is free software; you can redistribute it and/or
    1.39 +//  modify it under the terms of the GNU Lesser General Public
    1.40 +//  License as published by the Free Software Foundation; either
    1.41 +//  version 2.1 of the License, or (at your option) any later version.
    1.42 +//
    1.43 +//  This library is distributed in the hope that it will be useful,
    1.44 +//  but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.45 +//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.46 +//  Lesser General Public License for more details.
    1.47 +//
    1.48 +//  You should have received a copy of the GNU Lesser General Public
    1.49 +//  License along with this library; if not, write to the Free Software
    1.50 +//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    1.51 +//
    1.52 +////////////////////////////////////////////////////////////////////////////////
    1.53 +
    1.54 +#include "STTypes.h"
    1.55 +
    1.56 +#ifdef SOUNDTOUCH_ALLOW_MMX
    1.57 +// MMX routines available only with integer sample type
    1.58 +
    1.59 +using namespace soundtouch;
    1.60 +
    1.61 +//////////////////////////////////////////////////////////////////////////////
    1.62 +//
    1.63 +// implementation of MMX optimized functions of class 'TDStretchMMX'
    1.64 +//
    1.65 +//////////////////////////////////////////////////////////////////////////////
    1.66 +
    1.67 +#include "TDStretch.h"
    1.68 +#include <mmintrin.h>
    1.69 +#include <limits.h>
    1.70 +#include <math.h>
    1.71 +
    1.72 +
    1.73 +// Calculates cross correlation of two buffers
    1.74 +double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm) const
    1.75 +{
    1.76 +    const __m64 *pVec1, *pVec2;
    1.77 +    __m64 shifter;
    1.78 +    __m64 accu, normaccu;
    1.79 +    long corr, norm;
    1.80 +    int i;
    1.81 +   
    1.82 +    pVec1 = (__m64*)pV1;
    1.83 +    pVec2 = (__m64*)pV2;
    1.84 +
    1.85 +    shifter = _m_from_int(overlapDividerBits);
    1.86 +    normaccu = accu = _mm_setzero_si64();
    1.87 +
    1.88 +    // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 
    1.89 +    // during each round for improved CPU-level parallellization.
    1.90 +    for (i = 0; i < channels * overlapLength / 16; i ++)
    1.91 +    {
    1.92 +        __m64 temp, temp2;
    1.93 +
    1.94 +        // dictionary of instructions:
    1.95 +        // _m_pmaddwd   : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
    1.96 +        // _mm_add_pi32 : 2*32bit add
    1.97 +        // _m_psrad     : 32bit right-shift
    1.98 +
    1.99 +        temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter),
   1.100 +                            _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter));
   1.101 +        temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]), shifter),
   1.102 +                            _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec1[1]), shifter));
   1.103 +        accu = _mm_add_pi32(accu, temp);
   1.104 +        normaccu = _mm_add_pi32(normaccu, temp2);
   1.105 +
   1.106 +        temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter),
   1.107 +                            _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter));
   1.108 +        temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]), shifter),
   1.109 +                            _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec1[3]), shifter));
   1.110 +        accu = _mm_add_pi32(accu, temp);
   1.111 +        normaccu = _mm_add_pi32(normaccu, temp2);
   1.112 +
   1.113 +        pVec1 += 4;
   1.114 +        pVec2 += 4;
   1.115 +    }
   1.116 +
   1.117 +    // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
   1.118 +    // and finally store the result into the variable "corr"
   1.119 +
   1.120 +    accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
   1.121 +    corr = _m_to_int(accu);
   1.122 +
   1.123 +    normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32));
   1.124 +    norm = _m_to_int(normaccu);
   1.125 +
   1.126 +    // Clear MMS state
   1.127 +    _m_empty();
   1.128 +
   1.129 +    // Normalize result by dividing by sqrt(norm) - this step is easiest 
   1.130 +    // done using floating point operation
   1.131 +    dnorm = (double)norm;
   1.132 +
   1.133 +    return (double)corr / sqrt(dnorm < 1e-9 ? 1.0 : dnorm);
   1.134 +    // Note: Warning about the missing EMMS instruction is harmless
   1.135 +    // as it'll be called elsewhere.
   1.136 +}
   1.137 +
   1.138 +
   1.139 +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
   1.140 +double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm) const
   1.141 +{
   1.142 +    const __m64 *pVec1, *pVec2;
   1.143 +    __m64 shifter;
   1.144 +    __m64 accu;
   1.145 +    long corr, lnorm;
   1.146 +    int i;
   1.147 +   
   1.148 +    // cancel first normalizer tap from previous round
   1.149 +    lnorm = 0;
   1.150 +    for (i = 1; i <= channels; i ++)
   1.151 +    {
   1.152 +        lnorm -= (pV1[-i] * pV1[-i]) >> overlapDividerBits;
   1.153 +    }
   1.154 +
   1.155 +    pVec1 = (__m64*)pV1;
   1.156 +    pVec2 = (__m64*)pV2;
   1.157 +
   1.158 +    shifter = _m_from_int(overlapDividerBits);
   1.159 +    accu = _mm_setzero_si64();
   1.160 +
   1.161 +    // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 
   1.162 +    // during each round for improved CPU-level parallellization.
   1.163 +    for (i = 0; i < channels * overlapLength / 16; i ++)
   1.164 +    {
   1.165 +        __m64 temp;
   1.166 +
   1.167 +        // dictionary of instructions:
   1.168 +        // _m_pmaddwd   : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
   1.169 +        // _mm_add_pi32 : 2*32bit add
   1.170 +        // _m_psrad     : 32bit right-shift
   1.171 +
   1.172 +        temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter),
   1.173 +                            _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter));
   1.174 +        accu = _mm_add_pi32(accu, temp);
   1.175 +
   1.176 +        temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter),
   1.177 +                            _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter));
   1.178 +        accu = _mm_add_pi32(accu, temp);
   1.179 +
   1.180 +        pVec1 += 4;
   1.181 +        pVec2 += 4;
   1.182 +    }
   1.183 +
   1.184 +    // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
   1.185 +    // and finally store the result into the variable "corr"
   1.186 +
   1.187 +    accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
   1.188 +    corr = _m_to_int(accu);
   1.189 +
   1.190 +    // Clear MMS state
   1.191 +    _m_empty();
   1.192 +
   1.193 +    // update normalizer with last samples of this round
   1.194 +    pV1 = (short *)pVec1;
   1.195 +    for (int j = 1; j <= channels; j ++)
   1.196 +    {
   1.197 +        lnorm += (pV1[-j] * pV1[-j]) >> overlapDividerBits;
   1.198 +    }
   1.199 +    dnorm += (double)lnorm;
   1.200 +
   1.201 +    // Normalize result by dividing by sqrt(norm) - this step is easiest 
   1.202 +    // done using floating point operation
   1.203 +    return (double)corr / sqrt((dnorm < 1e-9) ? 1.0 : dnorm);
   1.204 +}
   1.205 +
   1.206 +
   1.207 +void TDStretchMMX::clearCrossCorrState()
   1.208 +{
   1.209 +    // Clear MMS state
   1.210 +    _m_empty();
   1.211 +    //_asm EMMS;
   1.212 +}
   1.213 +
   1.214 +
   1.215 +
   1.216 +// MMX-optimized version of the function overlapStereo
   1.217 +void TDStretchMMX::overlapStereo(short *output, const short *input) const
   1.218 +{
   1.219 +    const __m64 *pVinput, *pVMidBuf;
   1.220 +    __m64 *pVdest;
   1.221 +    __m64 mix1, mix2, adder, shifter;
   1.222 +    int i;
   1.223 +
   1.224 +    pVinput  = (const __m64*)input;
   1.225 +    pVMidBuf = (const __m64*)pMidBuffer;
   1.226 +    pVdest   = (__m64*)output;
   1.227 +
   1.228 +    // mix1  = mixer values for 1st stereo sample
   1.229 +    // mix1  = mixer values for 2nd stereo sample
   1.230 +    // adder = adder for updating mixer values after each round
   1.231 +    
   1.232 +    mix1  = _mm_set_pi16(0, overlapLength,   0, overlapLength);
   1.233 +    adder = _mm_set_pi16(1, -1, 1, -1);
   1.234 +    mix2  = _mm_add_pi16(mix1, adder);
   1.235 +    adder = _mm_add_pi16(adder, adder);
   1.236 +
   1.237 +    // Overlaplength-division by shifter. "+1" is to account for "-1" deduced in
   1.238 +    // overlapDividerBits calculation earlier.
   1.239 +    shifter = _m_from_int(overlapDividerBits + 1);
   1.240 +
   1.241 +    for (i = 0; i < overlapLength / 4; i ++)
   1.242 +    {
   1.243 +        __m64 temp1, temp2;
   1.244 +                
   1.245 +        // load & shuffle data so that input & mixbuffer data samples are paired
   1.246 +        temp1 = _mm_unpacklo_pi16(pVMidBuf[0], pVinput[0]);     // = i0l m0l i0r m0r
   1.247 +        temp2 = _mm_unpackhi_pi16(pVMidBuf[0], pVinput[0]);     // = i1l m1l i1r m1r
   1.248 +
   1.249 +        // temp = (temp .* mix) >> shifter
   1.250 +        temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter);
   1.251 +        temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter);
   1.252 +        pVdest[0] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit
   1.253 +
   1.254 +        // update mix += adder
   1.255 +        mix1 = _mm_add_pi16(mix1, adder);
   1.256 +        mix2 = _mm_add_pi16(mix2, adder);
   1.257 +
   1.258 +        // --- second round begins here ---
   1.259 +
   1.260 +        // load & shuffle data so that input & mixbuffer data samples are paired
   1.261 +        temp1 = _mm_unpacklo_pi16(pVMidBuf[1], pVinput[1]);       // = i2l m2l i2r m2r
   1.262 +        temp2 = _mm_unpackhi_pi16(pVMidBuf[1], pVinput[1]);       // = i3l m3l i3r m3r
   1.263 +
   1.264 +        // temp = (temp .* mix) >> shifter
   1.265 +        temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter);
   1.266 +        temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter);
   1.267 +        pVdest[1] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit
   1.268 +
   1.269 +        // update mix += adder
   1.270 +        mix1 = _mm_add_pi16(mix1, adder);
   1.271 +        mix2 = _mm_add_pi16(mix2, adder);
   1.272 +
   1.273 +        pVinput  += 2;
   1.274 +        pVMidBuf += 2;
   1.275 +        pVdest   += 2;
   1.276 +    }
   1.277 +
   1.278 +    _m_empty(); // clear MMS state
   1.279 +}
   1.280 +
   1.281 +
   1.282 +//////////////////////////////////////////////////////////////////////////////
   1.283 +//
   1.284 +// implementation of MMX optimized functions of class 'FIRFilter'
   1.285 +//
   1.286 +//////////////////////////////////////////////////////////////////////////////
   1.287 +
   1.288 +#include "FIRFilter.h"
   1.289 +
   1.290 +
   1.291 +FIRFilterMMX::FIRFilterMMX() : FIRFilter()
   1.292 +{
   1.293 +    filterCoeffsUnalign = NULL;
   1.294 +}
   1.295 +
   1.296 +
   1.297 +FIRFilterMMX::~FIRFilterMMX()
   1.298 +{
   1.299 +    delete[] filterCoeffsUnalign;
   1.300 +}
   1.301 +
   1.302 +
   1.303 +// (overloaded) Calculates filter coefficients for MMX routine
   1.304 +void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
   1.305 +{
   1.306 +    uint i;
   1.307 +    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
   1.308 +
   1.309 +    // Ensure that filter coeffs array is aligned to 16-byte boundary
   1.310 +    delete[] filterCoeffsUnalign;
   1.311 +    filterCoeffsUnalign = new short[2 * newLength + 8];
   1.312 +    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
   1.313 +
   1.314 +    // rearrange the filter coefficients for mmx routines 
   1.315 +    for (i = 0;i < length; i += 4) 
   1.316 +    {
   1.317 +        filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
   1.318 +        filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
   1.319 +        filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
   1.320 +        filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
   1.321 +
   1.322 +        filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
   1.323 +        filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
   1.324 +        filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
   1.325 +        filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
   1.326 +    }
   1.327 +}
   1.328 +
   1.329 +
   1.330 +
   1.331 +// mmx-optimized version of the filter routine for stereo sound
   1.332 +uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
   1.333 +{
   1.334 +    // Create stack copies of the needed member variables for asm routines :
   1.335 +    uint i, j;
   1.336 +    __m64 *pVdest = (__m64*)dest;
   1.337 +
   1.338 +    if (length < 2) return 0;
   1.339 +
   1.340 +    for (i = 0; i < (numSamples - length) / 2; i ++)
   1.341 +    {
   1.342 +        __m64 accu1;
   1.343 +        __m64 accu2;
   1.344 +        const __m64 *pVsrc = (const __m64*)src;
   1.345 +        const __m64 *pVfilter = (const __m64*)filterCoeffsAlign;
   1.346 +
   1.347 +        accu1 = accu2 = _mm_setzero_si64();
   1.348 +        for (j = 0; j < lengthDiv8 * 2; j ++)
   1.349 +        {
   1.350 +            __m64 temp1, temp2;
   1.351 +
   1.352 +            temp1 = _mm_unpacklo_pi16(pVsrc[0], pVsrc[1]);  // = l2 l0 r2 r0
   1.353 +            temp2 = _mm_unpackhi_pi16(pVsrc[0], pVsrc[1]);  // = l3 l1 r3 r1
   1.354 +
   1.355 +            accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp1, pVfilter[0]));  // += l2*f2+l0*f0 r2*f2+r0*f0
   1.356 +            accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp2, pVfilter[1]));  // += l3*f3+l1*f1 r3*f3+r1*f1
   1.357 +
   1.358 +            temp1 = _mm_unpacklo_pi16(pVsrc[1], pVsrc[2]);  // = l4 l2 r4 r2
   1.359 +
   1.360 +            accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp2, pVfilter[0]));  // += l3*f2+l1*f0 r3*f2+r1*f0
   1.361 +            accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp1, pVfilter[1]));  // += l4*f3+l2*f1 r4*f3+r2*f1
   1.362 +
   1.363 +            // accu1 += l2*f2+l0*f0 r2*f2+r0*f0
   1.364 +            //       += l3*f3+l1*f1 r3*f3+r1*f1
   1.365 +
   1.366 +            // accu2 += l3*f2+l1*f0 r3*f2+r1*f0
   1.367 +            //          l4*f3+l2*f1 r4*f3+r2*f1
   1.368 +
   1.369 +            pVfilter += 2;
   1.370 +            pVsrc += 2;
   1.371 +        }
   1.372 +        // accu >>= resultDivFactor
   1.373 +        accu1 = _mm_srai_pi32(accu1, resultDivFactor);
   1.374 +        accu2 = _mm_srai_pi32(accu2, resultDivFactor);
   1.375 +
   1.376 +        // pack 2*2*32bits => 4*16 bits
   1.377 +        pVdest[0] = _mm_packs_pi32(accu1, accu2);
   1.378 +        src += 4;
   1.379 +        pVdest ++;
   1.380 +    }
   1.381 +
   1.382 +   _m_empty();  // clear emms state
   1.383 +
   1.384 +    return (numSamples & 0xfffffffe) - length;
   1.385 +}
   1.386 +
   1.387 +#endif  // SOUNDTOUCH_ALLOW_MMX

mercurial