1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libspeex_resampler/sse-detect-runtime.patch Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,220 @@ 1.4 +--- /home/paul/workspace/repositories/opus-tools/src/resample.c 2012-11-21 11:36:59.119430163 +0100 1.5 ++++ media/libspeex_resampler/src/resample.c 2013-08-09 19:24:39.060236120 +0200 1.6 +@@ -92,18 +92,28 @@ 1.7 + 1.8 + #define IMAX(a,b) ((a) > (b) ? (a) : (b)) 1.9 + #define IMIN(a,b) ((a) < (b) ? (a) : (b)) 1.10 + 1.11 + #ifndef NULL 1.12 + #define NULL 0 1.13 + #endif 1.14 + 1.15 ++#include "sse_detect.h" 1.16 ++ 1.17 ++/* We compile SSE code on x86 all the time, but we only use it if we find at 1.18 ++ * runtime that the CPU supports it. */ 1.19 + #if defined(FLOATING_POINT) && defined(__SSE__) 1.20 ++#if defined(_MSC_VER) 1.21 ++#define inline __inline 1.22 ++#endif 1.23 + # include "resample_sse.h" 1.24 ++#ifdef _MSC_VER 1.25 ++#undef inline 1.26 ++#endif 1.27 + #endif 1.28 + 1.29 + /* Numer of elements to allocate on the stack */ 1.30 + #ifdef VAR_ARRAYS 1.31 + #define FIXED_STACK_ALLOC 8192 1.32 + #else 1.33 + #define FIXED_STACK_ALLOC 1024 1.34 + #endif 1.35 +@@ -340,35 +350,39 @@ 1.36 + const spx_uint32_t den_rate = st->den_rate; 1.37 + spx_word32_t sum; 1.38 + 1.39 + while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len)) 1.40 + { 1.41 + const spx_word16_t *sinct = & sinc_table[samp_frac_num*N]; 1.42 + const spx_word16_t *iptr = & in[last_sample]; 1.43 + 1.44 +-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE 1.45 ++#ifdef OVERRIDE_INNER_PRODUCT_SINGLE 1.46 ++ if (moz_has_sse()) { 1.47 ++ sum = inner_product_single(sinct, iptr, N); 1.48 ++ } else { 1.49 ++#endif 1.50 + int j; 1.51 + sum = 0; 1.52 + for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]); 1.53 + 1.54 + /* This code is slower on most DSPs which have only 2 accumulators. 1.55 + Plus this this forces truncation to 32 bits and you lose the HW guard bits. 
1.56 + I think we can trust the compiler and let it vectorize and/or unroll itself. 1.57 + spx_word32_t accum[4] = {0,0,0,0}; 1.58 + for(j=0;j<N;j+=4) { 1.59 + accum[0] += MULT16_16(sinct[j], iptr[j]); 1.60 + accum[1] += MULT16_16(sinct[j+1], iptr[j+1]); 1.61 + accum[2] += MULT16_16(sinct[j+2], iptr[j+2]); 1.62 + accum[3] += MULT16_16(sinct[j+3], iptr[j+3]); 1.63 + } 1.64 + sum = accum[0] + accum[1] + accum[2] + accum[3]; 1.65 + */ 1.66 +-#else 1.67 +- sum = inner_product_single(sinct, iptr, N); 1.68 ++#ifdef OVERRIDE_INNER_PRODUCT_SINGLE 1.69 ++ } 1.70 + #endif 1.71 + 1.72 + out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767); 1.73 + last_sample += int_advance; 1.74 + samp_frac_num += frac_advance; 1.75 + if (samp_frac_num >= den_rate) 1.76 + { 1.77 + samp_frac_num -= den_rate; 1.78 +@@ -397,29 +411,33 @@ 1.79 + const spx_uint32_t den_rate = st->den_rate; 1.80 + double sum; 1.81 + 1.82 + while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len)) 1.83 + { 1.84 + const spx_word16_t *sinct = & sinc_table[samp_frac_num*N]; 1.85 + const spx_word16_t *iptr = & in[last_sample]; 1.86 + 1.87 +-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE 1.88 +- int j; 1.89 +- double accum[4] = {0,0,0,0}; 1.90 +- 1.91 +- for(j=0;j<N;j+=4) { 1.92 +- accum[0] += sinct[j]*iptr[j]; 1.93 +- accum[1] += sinct[j+1]*iptr[j+1]; 1.94 +- accum[2] += sinct[j+2]*iptr[j+2]; 1.95 +- accum[3] += sinct[j+3]*iptr[j+3]; 1.96 ++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE 1.97 ++ if(moz_has_sse2()) { 1.98 ++ sum = inner_product_double(sinct, iptr, N); 1.99 ++ } else { 1.100 ++#endif 1.101 ++ int j; 1.102 ++ double accum[4] = {0,0,0,0}; 1.103 ++ 1.104 ++ for(j=0;j<N;j+=4) { 1.105 ++ accum[0] += sinct[j]*iptr[j]; 1.106 ++ accum[1] += sinct[j+1]*iptr[j+1]; 1.107 ++ accum[2] += sinct[j+2]*iptr[j+2]; 1.108 ++ accum[3] += sinct[j+3]*iptr[j+3]; 1.109 ++ } 1.110 ++ sum = accum[0] + accum[1] + accum[2] + accum[3]; 1.111 ++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE 1.112 + } 1.113 +- sum = 
accum[0] + accum[1] + accum[2] + accum[3]; 1.114 +-#else 1.115 +- sum = inner_product_double(sinct, iptr, N); 1.116 + #endif 1.117 + 1.118 + out[out_stride * out_sample++] = PSHR32(sum, 15); 1.119 + last_sample += int_advance; 1.120 + samp_frac_num += frac_advance; 1.121 + if (samp_frac_num >= den_rate) 1.122 + { 1.123 + samp_frac_num -= den_rate; 1.124 +@@ -453,35 +471,38 @@ 1.125 + #ifdef FIXED_POINT 1.126 + const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate); 1.127 + #else 1.128 + const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate; 1.129 + #endif 1.130 + spx_word16_t interp[4]; 1.131 + 1.132 + 1.133 +-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE 1.134 +- int j; 1.135 +- spx_word32_t accum[4] = {0,0,0,0}; 1.136 +- 1.137 +- for(j=0;j<N;j++) { 1.138 +- const spx_word16_t curr_in=iptr[j]; 1.139 +- accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); 1.140 +- accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); 1.141 +- accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); 1.142 +- accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); 1.143 ++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE 1.144 ++ if (moz_has_sse()) { 1.145 ++ cubic_coef(frac, interp); 1.146 ++ sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); 1.147 ++ } else { 1.148 ++#endif 1.149 ++ int j; 1.150 ++ spx_word32_t accum[4] = {0,0,0,0}; 1.151 ++ 1.152 ++ for(j=0;j<N;j++) { 1.153 ++ const spx_word16_t curr_in=iptr[j]; 1.154 ++ accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); 1.155 ++ accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); 1.156 ++ accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); 1.157 ++ accum[3] += 
MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); 1.158 ++ } 1.159 ++ cubic_coef(frac, interp); 1.160 ++ sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1)); 1.161 ++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE 1.162 + } 1.163 +- 1.164 +- cubic_coef(frac, interp); 1.165 +- sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1)); 1.166 +-#else 1.167 +- cubic_coef(frac, interp); 1.168 +- sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); 1.169 + #endif 1.170 +- 1.171 ++ 1.172 + out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767); 1.173 + last_sample += int_advance; 1.174 + samp_frac_num += frac_advance; 1.175 + if (samp_frac_num >= den_rate) 1.176 + { 1.177 + samp_frac_num -= den_rate; 1.178 + last_sample++; 1.179 + } 1.180 +@@ -515,35 +536,38 @@ 1.181 + #ifdef FIXED_POINT 1.182 + const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate); 1.183 + #else 1.184 + const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate; 1.185 + #endif 1.186 + spx_word16_t interp[4]; 1.187 + 1.188 + 1.189 +-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE 1.190 ++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE 1.191 ++ if (moz_has_sse2()) { 1.192 ++ cubic_coef(frac, interp); 1.193 ++ sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); 1.194 ++ } else { 1.195 ++#endif 1.196 + int j; 1.197 + double accum[4] = {0,0,0,0}; 1.198 + 1.199 + for(j=0;j<N;j++) { 1.200 + const double curr_in=iptr[j]; 1.201 + accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); 1.202 + accum[1] 
+= MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); 1.203 + accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); 1.204 + accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); 1.205 + } 1.206 + 1.207 + cubic_coef(frac, interp); 1.208 + sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]); 1.209 +-#else 1.210 +- cubic_coef(frac, interp); 1.211 +- sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); 1.212 ++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE 1.213 ++ } 1.214 + #endif 1.215 +- 1.216 + out[out_stride * out_sample++] = PSHR32(sum,15); 1.217 + last_sample += int_advance; 1.218 + samp_frac_num += frac_advance; 1.219 + if (samp_frac_num >= den_rate) 1.220 + { 1.221 + samp_frac_num -= den_rate; 1.222 + last_sample++; 1.223 + }