The Tor Browser: media/libspeex_resampler/sse-detect-runtime.patch@b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 --- /home/paul/workspace/repositories/opus-tools/src/resample.c	2012-11-21 11:36:59.119430163 +0100

     2 +++ media/libspeex_resampler/src/resample.c	2013-08-09 19:24:39.060236120 +0200

     3 @@ -92,18 +92,28 @@

     5  #define IMAX(a,b) ((a) > (b) ? (a) : (b))

     6  #define IMIN(a,b) ((a) < (b) ? (a) : (b))

     8  #ifndef NULL

     9  #define NULL 0

    10  #endif

    12 +#include "sse_detect.h"

    13 +

    14 +/* We compile SSE code on x86 all the time, but we only use it if we find at

    15 + * runtime that the CPU supports it. */

    16  #if defined(FLOATING_POINT) && defined(__SSE__)

    17 +#if defined(_MSC_VER)

    18 +#define inline __inline

    19 +#endif

    20  # include "resample_sse.h"

    21 +#ifdef _MSC_VER

    22 +#undef inline

    23 +#endif

    24  #endif

    26  /* Numer of elements to allocate on the stack */

    27  #ifdef VAR_ARRAYS

    28  #define FIXED_STACK_ALLOC 8192

    29  #else

    30  #define FIXED_STACK_ALLOC 1024

    31  #endif

    32 @@ -340,35 +350,39 @@

    33     const spx_uint32_t den_rate = st->den_rate;

    34     spx_word32_t sum;

    36     while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))

    37     {

    38        const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];

    39        const spx_word16_t *iptr = & in[last_sample];

    41 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE

    42 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE

    43 +    if (moz_has_sse()) {

    44 +      sum = inner_product_single(sinct, iptr, N);

    45 +    } else {

    46 +#endif

    47        int j;

    48        sum = 0;

    49        for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);

    51  /*    This code is slower on most DSPs which have only 2 accumulators.

    52        Plus this this forces truncation to 32 bits and you lose the HW guard bits.

    53        I think we can trust the compiler and let it vectorize and/or unroll itself.

    54        spx_word32_t accum[4] = {0,0,0,0};

    55        for(j=0;j<N;j+=4) {

    56          accum[0] += MULT16_16(sinct[j], iptr[j]);

    57          accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);

    58          accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);

    59          accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);

    60        }

    61        sum = accum[0] + accum[1] + accum[2] + accum[3];

    62  */

    63 -#else

    64 -      sum = inner_product_single(sinct, iptr, N);

    65 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE

    66 +    }

    67  #endif

    69        out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);

    70        last_sample += int_advance;

    71        samp_frac_num += frac_advance;

    72        if (samp_frac_num >= den_rate)

    73        {

    74           samp_frac_num -= den_rate;

    75 @@ -397,29 +411,33 @@

    76     const spx_uint32_t den_rate = st->den_rate;

    77     double sum;

    79     while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))

    80     {

    81        const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];

    82        const spx_word16_t *iptr = & in[last_sample];

    84 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE

    85 -      int j;

    86 -      double accum[4] = {0,0,0,0};

    87 -

    88 -      for(j=0;j<N;j+=4) {

    89 -        accum[0] += sinct[j]*iptr[j];

    90 -        accum[1] += sinct[j+1]*iptr[j+1];

    91 -        accum[2] += sinct[j+2]*iptr[j+2];

    92 -        accum[3] += sinct[j+3]*iptr[j+3];

    93 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE

    94 +      if(moz_has_sse2()) {

    95 +        sum = inner_product_double(sinct, iptr, N);

    96 +      } else {

    97 +#endif

    98 +        int j;

    99 +        double accum[4] = {0,0,0,0};

   100 +

   101 +        for(j=0;j<N;j+=4) {

   102 +          accum[0] += sinct[j]*iptr[j];

   103 +          accum[1] += sinct[j+1]*iptr[j+1];

   104 +          accum[2] += sinct[j+2]*iptr[j+2];

   105 +          accum[3] += sinct[j+3]*iptr[j+3];

   106 +        }

   107 +        sum = accum[0] + accum[1] + accum[2] + accum[3];

   108 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE

   109        }

   110 -      sum = accum[0] + accum[1] + accum[2] + accum[3];

   111 -#else

   112 -      sum = inner_product_double(sinct, iptr, N);

   113  #endif

   115        out[out_stride * out_sample++] = PSHR32(sum, 15);

   116        last_sample += int_advance;

   117        samp_frac_num += frac_advance;

   118        if (samp_frac_num >= den_rate)

   119        {

   120           samp_frac_num -= den_rate;

   121 @@ -453,35 +471,38 @@

   122  #ifdef FIXED_POINT

   123        const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);

   124  #else

   125        const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;

   126  #endif

   127        spx_word16_t interp[4];

   130 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE

   131 -      int j;

   132 -      spx_word32_t accum[4] = {0,0,0,0};

   133 -

   134 -      for(j=0;j<N;j++) {

   135 -        const spx_word16_t curr_in=iptr[j];

   136 -        accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);

   137 -        accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);

   138 -        accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);

   139 -        accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);

   140 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE

   141 +      if (moz_has_sse()) {

   142 +        cubic_coef(frac, interp);

   143 +        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);

   144 +      } else {

   145 +#endif

   146 +        int j;

   147 +        spx_word32_t accum[4] = {0,0,0,0};

   148 +

   149 +        for(j=0;j<N;j++) {

   150 +          const spx_word16_t curr_in=iptr[j];

   151 +          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);

   152 +          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);

   153 +          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);

   154 +          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);

   155 +        }

   156 +        cubic_coef(frac, interp);

   157 +        sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));

   158 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE

   159        }

   160 -

   161 -      cubic_coef(frac, interp);

   162 -      sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));

   163 -#else

   164 -      cubic_coef(frac, interp);

   165 -      sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);

   166  #endif

   167 -

   168 +

   169        out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);

   170        last_sample += int_advance;

   171        samp_frac_num += frac_advance;

   172        if (samp_frac_num >= den_rate)

   173        {

   174           samp_frac_num -= den_rate;

   175           last_sample++;

   176        }

   177 @@ -515,35 +536,38 @@

   178  #ifdef FIXED_POINT

   179        const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);

   180  #else

   181        const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;

   182  #endif

   183        spx_word16_t interp[4];

   186 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE

   187 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE

   188 +      if (moz_has_sse2()) {

   189 +        cubic_coef(frac, interp);

   190 +        sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);

   191 +      } else {

   192 +#endif

   193        int j;

   194        double accum[4] = {0,0,0,0};

   196        for(j=0;j<N;j++) {

   197          const double curr_in=iptr[j];

   198          accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);

   199          accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);

   200          accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);

   201          accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);

   202        }

   204        cubic_coef(frac, interp);

   205        sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);

   206 -#else

   207 -      cubic_coef(frac, interp);

   208 -      sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);

   209 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE

   210 +      }

   211  #endif

   212 -

   213        out[out_stride * out_sample++] = PSHR32(sum,15);

   214        last_sample += int_advance;

   215        samp_frac_num += frac_advance;

   216        if (samp_frac_num >= den_rate)

   217        {

   218           samp_frac_num -= den_rate;

   219           last_sample++;

   220        }

The Tor Browser / file revision

media/libspeex_resampler/sse-detect-runtime.patch@b8a032363ba2

media/libspeex_resampler/sse-detect-runtime.patch