1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libsoundtouch/src/mmx_optimized.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,384 @@ 1.4 +//////////////////////////////////////////////////////////////////////////////// 1.5 +/// 1.6 +/// MMX optimized routines. All MMX optimized functions have been gathered into 1.7 +/// this single source code file, regardless to their class or original source 1.8 +/// code file, in order to ease porting the library to other compiler and 1.9 +/// processor platforms. 1.10 +/// 1.11 +/// The MMX-optimizations are programmed using MMX compiler intrinsics that 1.12 +/// are supported both by Microsoft Visual C++ and GCC compilers, so this file 1.13 +/// should compile with both toolsets. 1.14 +/// 1.15 +/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 1.16 +/// 6.0 processor pack" update to support compiler intrinsic syntax. The update 1.17 +/// is available for download at Microsoft Developers Network, see here: 1.18 +/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx 1.19 +/// 1.20 +/// Author : Copyright (c) Olli Parviainen 1.21 +/// Author e-mail : oparviai 'at' iki.fi 1.22 +/// SoundTouch WWW: http://www.surina.net/soundtouch 1.23 +/// 1.24 +//////////////////////////////////////////////////////////////////////////////// 1.25 +// 1.26 +// Last changed : $Date: 2014-01-07 12:25:40 -0600 (Tue, 07 Jan 2014) $ 1.27 +// File revision : $Revision: 4 $ 1.28 +// 1.29 +// $Id: mmx_optimized.cpp 184 2014-01-07 18:25:40Z oparviai $ 1.30 +// 1.31 +//////////////////////////////////////////////////////////////////////////////// 1.32 +// 1.33 +// License : 1.34 +// 1.35 +// SoundTouch audio processing library 1.36 +// Copyright (c) Olli Parviainen 1.37 +// 1.38 +// This library is free software; you can redistribute it and/or 1.39 +// modify it under the terms of the GNU Lesser General Public 1.40 +// License as published by the Free Software Foundation; either 1.41 +// version 2.1 of the License, or (at your option) any later version. 1.42 +// 1.43 +// This library is distributed in the hope that it will be useful, 1.44 +// but WITHOUT ANY WARRANTY; without even the implied warranty of 1.45 +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 1.46 +// Lesser General Public License for more details. 1.47 +// 1.48 +// You should have received a copy of the GNU Lesser General Public 1.49 +// License along with this library; if not, write to the Free Software 1.50 +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 1.51 +// 1.52 +//////////////////////////////////////////////////////////////////////////////// 1.53 + 1.54 +#include "STTypes.h" 1.55 + 1.56 +#ifdef SOUNDTOUCH_ALLOW_MMX 1.57 +// MMX routines available only with integer sample type 1.58 + 1.59 +using namespace soundtouch; 1.60 + 1.61 +////////////////////////////////////////////////////////////////////////////// 1.62 +// 1.63 +// implementation of MMX optimized functions of class 'TDStretchMMX' 1.64 +// 1.65 +////////////////////////////////////////////////////////////////////////////// 1.66 + 1.67 +#include "TDStretch.h" 1.68 +#include <mmintrin.h> 1.69 +#include <limits.h> 1.70 +#include <math.h> 1.71 + 1.72 + 1.73 +// Calculates cross correlation of two buffers 1.74 +double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm) const 1.75 +{ 1.76 + const __m64 *pVec1, *pVec2; 1.77 + __m64 shifter; 1.78 + __m64 accu, normaccu; 1.79 + long corr, norm; 1.80 + int i; 1.81 + 1.82 + pVec1 = (__m64*)pV1; 1.83 + pVec2 = (__m64*)pV2; 1.84 + 1.85 + shifter = _m_from_int(overlapDividerBits); 1.86 + normaccu = accu = _mm_setzero_si64(); 1.87 + 1.88 + // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 1.89 + // during each round for improved CPU-level parallellization. 1.90 + for (i = 0; i < channels * overlapLength / 16; i ++) 1.91 + { 1.92 + __m64 temp, temp2; 1.93 + 1.94 + // dictionary of instructions: 1.95 + // _m_pmaddwd : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3] 1.96 + // _mm_add_pi32 : 2*32bit add 1.97 + // _m_psrad : 32bit right-shift 1.98 + 1.99 + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter), 1.100 + _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter)); 1.101 + temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]), shifter), 1.102 + _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec1[1]), shifter)); 1.103 + accu = _mm_add_pi32(accu, temp); 1.104 + normaccu = _mm_add_pi32(normaccu, temp2); 1.105 + 1.106 + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter), 1.107 + _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter)); 1.108 + temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]), shifter), 1.109 + _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec1[3]), shifter)); 1.110 + accu = _mm_add_pi32(accu, temp); 1.111 + normaccu = _mm_add_pi32(normaccu, temp2); 1.112 + 1.113 + pVec1 += 4; 1.114 + pVec2 += 4; 1.115 + } 1.116 + 1.117 + // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1 1.118 + // and finally store the result into the variable "corr" 1.119 + 1.120 + accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32)); 1.121 + corr = _m_to_int(accu); 1.122 + 1.123 + normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32)); 1.124 + norm = _m_to_int(normaccu); 1.125 + 1.126 + // Clear MMS state 1.127 + _m_empty(); 1.128 + 1.129 + // Normalize result by dividing by sqrt(norm) - this step is easiest 1.130 + // done using floating point operation 1.131 + dnorm = (double)norm; 1.132 + 1.133 + return (double)corr / sqrt(dnorm < 1e-9 ? 1.0 : dnorm); 1.134 + // Note: Warning about the missing EMMS instruction is harmless 1.135 + // as it'll be called elsewhere. 1.136 +} 1.137 + 1.138 + 1.139 +/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value 1.140 +double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm) const 1.141 +{ 1.142 + const __m64 *pVec1, *pVec2; 1.143 + __m64 shifter; 1.144 + __m64 accu; 1.145 + long corr, lnorm; 1.146 + int i; 1.147 + 1.148 + // cancel first normalizer tap from previous round 1.149 + lnorm = 0; 1.150 + for (i = 1; i <= channels; i ++) 1.151 + { 1.152 + lnorm -= (pV1[-i] * pV1[-i]) >> overlapDividerBits; 1.153 + } 1.154 + 1.155 + pVec1 = (__m64*)pV1; 1.156 + pVec2 = (__m64*)pV2; 1.157 + 1.158 + shifter = _m_from_int(overlapDividerBits); 1.159 + accu = _mm_setzero_si64(); 1.160 + 1.161 + // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 1.162 + // during each round for improved CPU-level parallellization. 1.163 + for (i = 0; i < channels * overlapLength / 16; i ++) 1.164 + { 1.165 + __m64 temp; 1.166 + 1.167 + // dictionary of instructions: 1.168 + // _m_pmaddwd : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3] 1.169 + // _mm_add_pi32 : 2*32bit add 1.170 + // _m_psrad : 32bit right-shift 1.171 + 1.172 + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter), 1.173 + _mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter)); 1.174 + accu = _mm_add_pi32(accu, temp); 1.175 + 1.176 + temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter), 1.177 + _mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter)); 1.178 + accu = _mm_add_pi32(accu, temp); 1.179 + 1.180 + pVec1 += 4; 1.181 + pVec2 += 4; 1.182 + } 1.183 + 1.184 + // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1 1.185 + // and finally store the result into the variable "corr" 1.186 + 1.187 + accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32)); 1.188 + corr = _m_to_int(accu); 1.189 + 1.190 + // Clear MMS state 1.191 + _m_empty(); 1.192 + 1.193 + // update normalizer with last samples of this round 1.194 + pV1 = (short *)pVec1; 1.195 + for (int j = 1; j <= channels; j ++) 1.196 + { 1.197 + lnorm += (pV1[-j] * pV1[-j]) >> overlapDividerBits; 1.198 + } 1.199 + dnorm += (double)lnorm; 1.200 + 1.201 + // Normalize result by dividing by sqrt(norm) - this step is easiest 1.202 + // done using floating point operation 1.203 + return (double)corr / sqrt((dnorm < 1e-9) ? 1.0 : dnorm); 1.204 +} 1.205 + 1.206 + 1.207 +void TDStretchMMX::clearCrossCorrState() 1.208 +{ 1.209 + // Clear MMS state 1.210 + _m_empty(); 1.211 + //_asm EMMS; 1.212 +} 1.213 + 1.214 + 1.215 + 1.216 +// MMX-optimized version of the function overlapStereo 1.217 +void TDStretchMMX::overlapStereo(short *output, const short *input) const 1.218 +{ 1.219 + const __m64 *pVinput, *pVMidBuf; 1.220 + __m64 *pVdest; 1.221 + __m64 mix1, mix2, adder, shifter; 1.222 + int i; 1.223 + 1.224 + pVinput = (const __m64*)input; 1.225 + pVMidBuf = (const __m64*)pMidBuffer; 1.226 + pVdest = (__m64*)output; 1.227 + 1.228 + // mix1 = mixer values for 1st stereo sample 1.229 + // mix1 = mixer values for 2nd stereo sample 1.230 + // adder = adder for updating mixer values after each round 1.231 + 1.232 + mix1 = _mm_set_pi16(0, overlapLength, 0, overlapLength); 1.233 + adder = _mm_set_pi16(1, -1, 1, -1); 1.234 + mix2 = _mm_add_pi16(mix1, adder); 1.235 + adder = _mm_add_pi16(adder, adder); 1.236 + 1.237 + // Overlaplength-division by shifter. "+1" is to account for "-1" deduced in 1.238 + // overlapDividerBits calculation earlier. 1.239 + shifter = _m_from_int(overlapDividerBits + 1); 1.240 + 1.241 + for (i = 0; i < overlapLength / 4; i ++) 1.242 + { 1.243 + __m64 temp1, temp2; 1.244 + 1.245 + // load & shuffle data so that input & mixbuffer data samples are paired 1.246 + temp1 = _mm_unpacklo_pi16(pVMidBuf[0], pVinput[0]); // = i0l m0l i0r m0r 1.247 + temp2 = _mm_unpackhi_pi16(pVMidBuf[0], pVinput[0]); // = i1l m1l i1r m1r 1.248 + 1.249 + // temp = (temp .* mix) >> shifter 1.250 + temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter); 1.251 + temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter); 1.252 + pVdest[0] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit 1.253 + 1.254 + // update mix += adder 1.255 + mix1 = _mm_add_pi16(mix1, adder); 1.256 + mix2 = _mm_add_pi16(mix2, adder); 1.257 + 1.258 + // --- second round begins here --- 1.259 + 1.260 + // load & shuffle data so that input & mixbuffer data samples are paired 1.261 + temp1 = _mm_unpacklo_pi16(pVMidBuf[1], pVinput[1]); // = i2l m2l i2r m2r 1.262 + temp2 = _mm_unpackhi_pi16(pVMidBuf[1], pVinput[1]); // = i3l m3l i3r m3r 1.263 + 1.264 + // temp = (temp .* mix) >> shifter 1.265 + temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter); 1.266 + temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter); 1.267 + pVdest[1] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit 1.268 + 1.269 + // update mix += adder 1.270 + mix1 = _mm_add_pi16(mix1, adder); 1.271 + mix2 = _mm_add_pi16(mix2, adder); 1.272 + 1.273 + pVinput += 2; 1.274 + pVMidBuf += 2; 1.275 + pVdest += 2; 1.276 + } 1.277 + 1.278 + _m_empty(); // clear MMS state 1.279 +} 1.280 + 1.281 + 1.282 +////////////////////////////////////////////////////////////////////////////// 1.283 +// 1.284 +// implementation of MMX optimized functions of class 'FIRFilter' 1.285 +// 1.286 +////////////////////////////////////////////////////////////////////////////// 1.287 + 1.288 +#include "FIRFilter.h" 1.289 + 1.290 + 1.291 +FIRFilterMMX::FIRFilterMMX() : FIRFilter() 1.292 +{ 1.293 + filterCoeffsUnalign = NULL; 1.294 +} 1.295 + 1.296 + 1.297 +FIRFilterMMX::~FIRFilterMMX() 1.298 +{ 1.299 + delete[] filterCoeffsUnalign; 1.300 +} 1.301 + 1.302 + 1.303 +// (overloaded) Calculates filter coefficients for MMX routine 1.304 +void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor) 1.305 +{ 1.306 + uint i; 1.307 + FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor); 1.308 + 1.309 + // Ensure that filter coeffs array is aligned to 16-byte boundary 1.310 + delete[] filterCoeffsUnalign; 1.311 + filterCoeffsUnalign = new short[2 * newLength + 8]; 1.312 + filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign); 1.313 + 1.314 + // rearrange the filter coefficients for mmx routines 1.315 + for (i = 0;i < length; i += 4) 1.316 + { 1.317 + filterCoeffsAlign[2 * i + 0] = coeffs[i + 0]; 1.318 + filterCoeffsAlign[2 * i + 1] = coeffs[i + 2]; 1.319 + filterCoeffsAlign[2 * i + 2] = coeffs[i + 0]; 1.320 + filterCoeffsAlign[2 * i + 3] = coeffs[i + 2]; 1.321 + 1.322 + filterCoeffsAlign[2 * i + 4] = coeffs[i + 1]; 1.323 + filterCoeffsAlign[2 * i + 5] = coeffs[i + 3]; 1.324 + filterCoeffsAlign[2 * i + 6] = coeffs[i + 1]; 1.325 + filterCoeffsAlign[2 * i + 7] = coeffs[i + 3]; 1.326 + } 1.327 +} 1.328 + 1.329 + 1.330 + 1.331 +// mmx-optimized version of the filter routine for stereo sound 1.332 +uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const 1.333 +{ 1.334 + // Create stack copies of the needed member variables for asm routines : 1.335 + uint i, j; 1.336 + __m64 *pVdest = (__m64*)dest; 1.337 + 1.338 + if (length < 2) return 0; 1.339 + 1.340 + for (i = 0; i < (numSamples - length) / 2; i ++) 1.341 + { 1.342 + __m64 accu1; 1.343 + __m64 accu2; 1.344 + const __m64 *pVsrc = (const __m64*)src; 1.345 + const __m64 *pVfilter = (const __m64*)filterCoeffsAlign; 1.346 + 1.347 + accu1 = accu2 = _mm_setzero_si64(); 1.348 + for (j = 0; j < lengthDiv8 * 2; j ++) 1.349 + { 1.350 + __m64 temp1, temp2; 1.351 + 1.352 + temp1 = _mm_unpacklo_pi16(pVsrc[0], pVsrc[1]); // = l2 l0 r2 r0 1.353 + temp2 = _mm_unpackhi_pi16(pVsrc[0], pVsrc[1]); // = l3 l1 r3 r1 1.354 + 1.355 + accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp1, pVfilter[0])); // += l2*f2+l0*f0 r2*f2+r0*f0 1.356 + accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp2, pVfilter[1])); // += l3*f3+l1*f1 r3*f3+r1*f1 1.357 + 1.358 + temp1 = _mm_unpacklo_pi16(pVsrc[1], pVsrc[2]); // = l4 l2 r4 r2 1.359 + 1.360 + accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp2, pVfilter[0])); // += l3*f2+l1*f0 r3*f2+r1*f0 1.361 + accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp1, pVfilter[1])); // += l4*f3+l2*f1 r4*f3+r2*f1 1.362 + 1.363 + // accu1 += l2*f2+l0*f0 r2*f2+r0*f0 1.364 + // += l3*f3+l1*f1 r3*f3+r1*f1 1.365 + 1.366 + // accu2 += l3*f2+l1*f0 r3*f2+r1*f0 1.367 + // l4*f3+l2*f1 r4*f3+r2*f1 1.368 + 1.369 + pVfilter += 2; 1.370 + pVsrc += 2; 1.371 + } 1.372 + // accu >>= resultDivFactor 1.373 + accu1 = _mm_srai_pi32(accu1, resultDivFactor); 1.374 + accu2 = _mm_srai_pi32(accu2, resultDivFactor); 1.375 + 1.376 + // pack 2*2*32bits => 4*16 bits 1.377 + pVdest[0] = _mm_packs_pi32(accu1, accu2); 1.378 + src += 4; 1.379 + pVdest ++; 1.380 + } 1.381 + 1.382 + _m_empty(); // clear emms state 1.383 + 1.384 + return (numSamples & 0xfffffffe) - length; 1.385 +} 1.386 + 1.387 +#endif // SOUNDTOUCH_ALLOW_MMX