media/libspeex_resampler/sse-detect-runtime.patch

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libspeex_resampler/sse-detect-runtime.patch	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,220 @@
     1.4 +--- /home/paul/workspace/repositories/opus-tools/src/resample.c	2012-11-21 11:36:59.119430163 +0100
     1.5 ++++ media/libspeex_resampler/src/resample.c	2013-08-09 19:24:39.060236120 +0200
     1.6 +@@ -92,18 +92,28 @@
     1.7 +                
     1.8 + #define IMAX(a,b) ((a) > (b) ? (a) : (b))
     1.9 + #define IMIN(a,b) ((a) < (b) ? (a) : (b))
    1.10 + 
    1.11 + #ifndef NULL
    1.12 + #define NULL 0
    1.13 + #endif
    1.14 + 
    1.15 ++#include "sse_detect.h"
    1.16 ++
    1.17 ++/* We compile SSE code on x86 all the time, but we only use it if we find at
    1.18 ++ * runtime that the CPU supports it. */
    1.19 + #if defined(FLOATING_POINT) && defined(__SSE__)
    1.20 ++#if defined(_MSC_VER)
    1.21 ++#define inline __inline
    1.22 ++#endif
    1.23 + # include "resample_sse.h"
    1.24 ++#ifdef _MSC_VER
    1.25 ++#undef inline
    1.26 ++#endif
    1.27 + #endif
    1.28 + 
    1.29 + /* Numer of elements to allocate on the stack */
    1.30 + #ifdef VAR_ARRAYS
    1.31 + #define FIXED_STACK_ALLOC 8192
    1.32 + #else
    1.33 + #define FIXED_STACK_ALLOC 1024
    1.34 + #endif
    1.35 +@@ -340,35 +350,39 @@
    1.36 +    const spx_uint32_t den_rate = st->den_rate;
    1.37 +    spx_word32_t sum;
    1.38 + 
    1.39 +    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    1.40 +    {
    1.41 +       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
    1.42 +       const spx_word16_t *iptr = & in[last_sample];
    1.43 + 
    1.44 +-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
    1.45 ++#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
    1.46 ++    if (moz_has_sse()) {
    1.47 ++      sum = inner_product_single(sinct, iptr, N);
    1.48 ++    } else {
    1.49 ++#endif
    1.50 +       int j;
    1.51 +       sum = 0;
    1.52 +       for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
    1.53 + 
    1.54 + /*    This code is slower on most DSPs which have only 2 accumulators.
    1.55 +       Plus this this forces truncation to 32 bits and you lose the HW guard bits.
    1.56 +       I think we can trust the compiler and let it vectorize and/or unroll itself.
    1.57 +       spx_word32_t accum[4] = {0,0,0,0};
    1.58 +       for(j=0;j<N;j+=4) {
    1.59 +         accum[0] += MULT16_16(sinct[j], iptr[j]);
    1.60 +         accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
    1.61 +         accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
    1.62 +         accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
    1.63 +       }
    1.64 +       sum = accum[0] + accum[1] + accum[2] + accum[3];
    1.65 + */
    1.66 +-#else
    1.67 +-      sum = inner_product_single(sinct, iptr, N);
    1.68 ++#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
    1.69 ++    }
    1.70 + #endif
    1.71 + 
    1.72 +       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
    1.73 +       last_sample += int_advance;
    1.74 +       samp_frac_num += frac_advance;
    1.75 +       if (samp_frac_num >= den_rate)
    1.76 +       {
    1.77 +          samp_frac_num -= den_rate;
    1.78 +@@ -397,29 +411,33 @@
    1.79 +    const spx_uint32_t den_rate = st->den_rate;
    1.80 +    double sum;
    1.81 + 
    1.82 +    while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    1.83 +    {
    1.84 +       const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
    1.85 +       const spx_word16_t *iptr = & in[last_sample];
    1.86 + 
    1.87 +-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
    1.88 +-      int j;
    1.89 +-      double accum[4] = {0,0,0,0};
    1.90 +-
    1.91 +-      for(j=0;j<N;j+=4) {
    1.92 +-        accum[0] += sinct[j]*iptr[j];
    1.93 +-        accum[1] += sinct[j+1]*iptr[j+1];
    1.94 +-        accum[2] += sinct[j+2]*iptr[j+2];
    1.95 +-        accum[3] += sinct[j+3]*iptr[j+3];
    1.96 ++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
    1.97 ++      if(moz_has_sse2()) {
    1.98 ++        sum = inner_product_double(sinct, iptr, N);
    1.99 ++      } else {
   1.100 ++#endif
   1.101 ++        int j;
   1.102 ++        double accum[4] = {0,0,0,0};
   1.103 ++
   1.104 ++        for(j=0;j<N;j+=4) {
   1.105 ++          accum[0] += sinct[j]*iptr[j];
   1.106 ++          accum[1] += sinct[j+1]*iptr[j+1];
   1.107 ++          accum[2] += sinct[j+2]*iptr[j+2];
   1.108 ++          accum[3] += sinct[j+3]*iptr[j+3];
   1.109 ++        }
   1.110 ++        sum = accum[0] + accum[1] + accum[2] + accum[3];
   1.111 ++#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
   1.112 +       }
   1.113 +-      sum = accum[0] + accum[1] + accum[2] + accum[3];
   1.114 +-#else
   1.115 +-      sum = inner_product_double(sinct, iptr, N);
   1.116 + #endif
   1.117 + 
   1.118 +       out[out_stride * out_sample++] = PSHR32(sum, 15);
   1.119 +       last_sample += int_advance;
   1.120 +       samp_frac_num += frac_advance;
   1.121 +       if (samp_frac_num >= den_rate)
   1.122 +       {
   1.123 +          samp_frac_num -= den_rate;
   1.124 +@@ -453,35 +471,38 @@
   1.125 + #ifdef FIXED_POINT
   1.126 +       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
   1.127 + #else
   1.128 +       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
   1.129 + #endif
   1.130 +       spx_word16_t interp[4];
   1.131 + 
   1.132 + 
   1.133 +-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
   1.134 +-      int j;
   1.135 +-      spx_word32_t accum[4] = {0,0,0,0};
   1.136 +-
   1.137 +-      for(j=0;j<N;j++) {
   1.138 +-        const spx_word16_t curr_in=iptr[j];
   1.139 +-        accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
   1.140 +-        accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
   1.141 +-        accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
   1.142 +-        accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
   1.143 ++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
   1.144 ++      if (moz_has_sse()) {
   1.145 ++        cubic_coef(frac, interp);
   1.146 ++        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
   1.147 ++      } else {
   1.148 ++#endif
   1.149 ++        int j;
   1.150 ++        spx_word32_t accum[4] = {0,0,0,0};
   1.151 ++
   1.152 ++        for(j=0;j<N;j++) {
   1.153 ++          const spx_word16_t curr_in=iptr[j];
   1.154 ++          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
   1.155 ++          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
   1.156 ++          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
   1.157 ++          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
   1.158 ++        }
   1.159 ++        cubic_coef(frac, interp);
   1.160 ++        sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
   1.161 ++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
   1.162 +       }
   1.163 +-
   1.164 +-      cubic_coef(frac, interp);
   1.165 +-      sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
   1.166 +-#else
   1.167 +-      cubic_coef(frac, interp);
   1.168 +-      sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
   1.169 + #endif
   1.170 +-      
   1.171 ++
   1.172 +       out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
   1.173 +       last_sample += int_advance;
   1.174 +       samp_frac_num += frac_advance;
   1.175 +       if (samp_frac_num >= den_rate)
   1.176 +       {
   1.177 +          samp_frac_num -= den_rate;
   1.178 +          last_sample++;
   1.179 +       }
   1.180 +@@ -515,35 +536,38 @@
   1.181 + #ifdef FIXED_POINT
   1.182 +       const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
   1.183 + #else
   1.184 +       const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
   1.185 + #endif
   1.186 +       spx_word16_t interp[4];
   1.187 + 
   1.188 + 
   1.189 +-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
   1.190 ++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
   1.191 ++      if (moz_has_sse2()) {
   1.192 ++        cubic_coef(frac, interp);
   1.193 ++        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
   1.194 ++      } else {
   1.195 ++#endif
   1.196 +       int j;
   1.197 +       double accum[4] = {0,0,0,0};
   1.198 + 
   1.199 +       for(j=0;j<N;j++) {
   1.200 +         const double curr_in=iptr[j];
   1.201 +         accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
   1.202 +         accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
   1.203 +         accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
   1.204 +         accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
   1.205 +       }
   1.206 + 
   1.207 +       cubic_coef(frac, interp);
   1.208 +       sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
   1.209 +-#else
   1.210 +-      cubic_coef(frac, interp);
   1.211 +-      sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
   1.212 ++#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
   1.213 ++      } /* closes the non-SSE2 fallback; guard must match the one that opens it */
   1.214 + #endif
   1.215 +-      
   1.216 +       out[out_stride * out_sample++] = PSHR32(sum,15);
   1.217 +       last_sample += int_advance;
   1.218 +       samp_frac_num += frac_advance;
   1.219 +       if (samp_frac_num >= den_rate)
   1.220 +       {
   1.221 +          samp_frac_num -= den_rate;
   1.222 +          last_sample++;
   1.223 +       }

mercurial