media/libspeex_resampler/sse-detect-runtime.patch

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 --- /home/paul/workspace/repositories/opus-tools/src/resample.c	2012-11-21 11:36:59.119430163 +0100
     2 +++ media/libspeex_resampler/src/resample.c	2013-08-09 19:24:39.060236120 +0200
     3 @@ -92,18 +92,28 @@
     5  #define IMAX(a,b) ((a) > (b) ? (a) : (b))
     6  #define IMIN(a,b) ((a) < (b) ? (a) : (b))
     8  #ifndef NULL
     9  #define NULL 0
    10  #endif
    12 +#include "sse_detect.h"
    13 +
    14 +/* We compile SSE code on x86 all the time, but we only use it if we find at
    15 + * runtime that the CPU supports it. */
    16  #if defined(FLOATING_POINT) && defined(__SSE__)
    17 +#if defined(_MSC_VER)
    18 +#define inline __inline
    19 +#endif
    20  # include "resample_sse.h"
    21 +#ifdef _MSC_VER
    22 +#undef inline
    23 +#endif
    24  #endif
    26  /* Numer of elements to allocate on the stack */
    27  #ifdef VAR_ARRAYS
    28  #define FIXED_STACK_ALLOC 8192
    29  #else
    30  #define FIXED_STACK_ALLOC 1024
    31  #endif
    32 @@ -340,35 +350,39 @@
    33     const spx_uint32_t den_rate = st->den_rate;
    34     spx_word32_t sum;
    36     while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    37     {
    38        const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
    39        const spx_word16_t *iptr = & in[last_sample];
    41 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
    42 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
    43 +    if (moz_has_sse()) {
    44 +      sum = inner_product_single(sinct, iptr, N);
    45 +    } else {
    46 +#endif
    47        int j;
    48        sum = 0;
    49        for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
    51  /*    This code is slower on most DSPs which have only 2 accumulators.
    52        Plus this this forces truncation to 32 bits and you lose the HW guard bits.
    53        I think we can trust the compiler and let it vectorize and/or unroll itself.
    54        spx_word32_t accum[4] = {0,0,0,0};
    55        for(j=0;j<N;j+=4) {
    56          accum[0] += MULT16_16(sinct[j], iptr[j]);
    57          accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
    58          accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
    59          accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
    60        }
    61        sum = accum[0] + accum[1] + accum[2] + accum[3];
    62  */
    63 -#else
    64 -      sum = inner_product_single(sinct, iptr, N);
    65 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
    66 +    }
    67  #endif
    69        out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
    70        last_sample += int_advance;
    71        samp_frac_num += frac_advance;
    72        if (samp_frac_num >= den_rate)
    73        {
    74           samp_frac_num -= den_rate;
    75 @@ -397,29 +411,33 @@
    76     const spx_uint32_t den_rate = st->den_rate;
    77     double sum;
    79     while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
    80     {
    81        const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
    82        const spx_word16_t *iptr = & in[last_sample];
    84 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
    85 -      int j;
    86 -      double accum[4] = {0,0,0,0};
    87 -
    88 -      for(j=0;j<N;j+=4) {
    89 -        accum[0] += sinct[j]*iptr[j];
    90 -        accum[1] += sinct[j+1]*iptr[j+1];
    91 -        accum[2] += sinct[j+2]*iptr[j+2];
    92 -        accum[3] += sinct[j+3]*iptr[j+3];
    93 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
    94 +      if(moz_has_sse2()) {
    95 +        sum = inner_product_double(sinct, iptr, N);
    96 +      } else {
    97 +#endif
    98 +        int j;
    99 +        double accum[4] = {0,0,0,0};
   100 +
   101 +        for(j=0;j<N;j+=4) {
   102 +          accum[0] += sinct[j]*iptr[j];
   103 +          accum[1] += sinct[j+1]*iptr[j+1];
   104 +          accum[2] += sinct[j+2]*iptr[j+2];
   105 +          accum[3] += sinct[j+3]*iptr[j+3];
   106 +        }
   107 +        sum = accum[0] + accum[1] + accum[2] + accum[3];
   108 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
   109        }
   110 -      sum = accum[0] + accum[1] + accum[2] + accum[3];
   111 -#else
   112 -      sum = inner_product_double(sinct, iptr, N);
   113  #endif
   115        out[out_stride * out_sample++] = PSHR32(sum, 15);
   116        last_sample += int_advance;
   117        samp_frac_num += frac_advance;
   118        if (samp_frac_num >= den_rate)
   119        {
   120           samp_frac_num -= den_rate;
   121 @@ -453,35 +471,38 @@
   122  #ifdef FIXED_POINT
   123        const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
   124  #else
   125        const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
   126  #endif
   127        spx_word16_t interp[4];
   130 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
   131 -      int j;
   132 -      spx_word32_t accum[4] = {0,0,0,0};
   133 -
   134 -      for(j=0;j<N;j++) {
   135 -        const spx_word16_t curr_in=iptr[j];
   136 -        accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
   137 -        accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
   138 -        accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
   139 -        accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
   140 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
   141 +      if (moz_has_sse()) {
   142 +        cubic_coef(frac, interp);
   143 +        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
   144 +      } else {
   145 +#endif
   146 +        int j;
   147 +        spx_word32_t accum[4] = {0,0,0,0};
   148 +
   149 +        for(j=0;j<N;j++) {
   150 +          const spx_word16_t curr_in=iptr[j];
   151 +          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
   152 +          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
   153 +          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
   154 +          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
   155 +        }
   156 +        cubic_coef(frac, interp);
   157 +        sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
   158 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
   159        }
   160 -
   161 -      cubic_coef(frac, interp);
   162 -      sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
   163 -#else
   164 -      cubic_coef(frac, interp);
   165 -      sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
   166  #endif
   167 -      
   168 +
   169        out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
   170        last_sample += int_advance;
   171        samp_frac_num += frac_advance;
   172        if (samp_frac_num >= den_rate)
   173        {
   174           samp_frac_num -= den_rate;
   175           last_sample++;
   176        }
   177 @@ -515,35 +536,38 @@
   178  #ifdef FIXED_POINT
   179        const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
   180  #else
   181        const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
   182  #endif
   183        spx_word16_t interp[4];
   186 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
   187 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
   188 +      if (moz_has_sse2()) {
   189 +        cubic_coef(frac, interp);
   190 +        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
   191 +      } else {
   192 +#endif
   193        int j;
   194        double accum[4] = {0,0,0,0};
   196        for(j=0;j<N;j++) {
   197          const double curr_in=iptr[j];
   198          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
   199          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
   200          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
   201          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
   202        }
   204        cubic_coef(frac, interp);
   205        sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
   206 -#else
   207 -      cubic_coef(frac, interp);
   208 -      sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
   209 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
   210 +      }
   211  #endif
   212 -      
   213        out[out_stride * out_sample++] = PSHR32(sum,15);
   214        last_sample += int_advance;
   215        samp_frac_num += frac_advance;
   216        if (samp_frac_num >= den_rate)
   217        {
   218           samp_frac_num -= den_rate;
   219           last_sample++;
   220        }

mercurial