media/libspeex_resampler/sse-detect-runtime.patch

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 --- /home/paul/workspace/repositories/opus-tools/src/resample.c 2012-11-21 11:36:59.119430163 +0100
michael@0 2 +++ media/libspeex_resampler/src/resample.c 2013-08-09 19:24:39.060236120 +0200
michael@0 3 @@ -92,18 +92,28 @@
michael@0 4
michael@0 5 #define IMAX(a,b) ((a) > (b) ? (a) : (b))
michael@0 6 #define IMIN(a,b) ((a) < (b) ? (a) : (b))
michael@0 7
michael@0 8 #ifndef NULL
michael@0 9 #define NULL 0
michael@0 10 #endif
michael@0 11
michael@0 12 +#include "sse_detect.h"
michael@0 13 +
michael@0 14 +/* We compile SSE code on x86 all the time, but we only use it if we find at
michael@0 15 + * runtime that the CPU supports it. */
michael@0 16 #if defined(FLOATING_POINT) && defined(__SSE__)
michael@0 17 +#if defined(_MSC_VER)
michael@0 18 +#define inline __inline
michael@0 19 +#endif
michael@0 20 # include "resample_sse.h"
michael@0 21 +#ifdef _MSC_VER
michael@0 22 +#undef inline
michael@0 23 +#endif
michael@0 24 #endif
michael@0 25
michael@0 26 /* Numer of elements to allocate on the stack */
michael@0 27 #ifdef VAR_ARRAYS
michael@0 28 #define FIXED_STACK_ALLOC 8192
michael@0 29 #else
michael@0 30 #define FIXED_STACK_ALLOC 1024
michael@0 31 #endif
michael@0 32 @@ -340,35 +350,39 @@
michael@0 33 const spx_uint32_t den_rate = st->den_rate;
michael@0 34 spx_word32_t sum;
michael@0 35
michael@0 36 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
michael@0 37 {
michael@0 38 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
michael@0 39 const spx_word16_t *iptr = & in[last_sample];
michael@0 40
michael@0 41 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
michael@0 42 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
michael@0 43 + if (moz_has_sse()) {
michael@0 44 + sum = inner_product_single(sinct, iptr, N);
michael@0 45 + } else {
michael@0 46 +#endif
michael@0 47 int j;
michael@0 48 sum = 0;
michael@0 49 for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
michael@0 50
michael@0 51 /* This code is slower on most DSPs which have only 2 accumulators.
michael@0 52 Plus this this forces truncation to 32 bits and you lose the HW guard bits.
michael@0 53 I think we can trust the compiler and let it vectorize and/or unroll itself.
michael@0 54 spx_word32_t accum[4] = {0,0,0,0};
michael@0 55 for(j=0;j<N;j+=4) {
michael@0 56 accum[0] += MULT16_16(sinct[j], iptr[j]);
michael@0 57 accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
michael@0 58 accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
michael@0 59 accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
michael@0 60 }
michael@0 61 sum = accum[0] + accum[1] + accum[2] + accum[3];
michael@0 62 */
michael@0 63 -#else
michael@0 64 - sum = inner_product_single(sinct, iptr, N);
michael@0 65 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
michael@0 66 + }
michael@0 67 #endif
michael@0 68
michael@0 69 out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
michael@0 70 last_sample += int_advance;
michael@0 71 samp_frac_num += frac_advance;
michael@0 72 if (samp_frac_num >= den_rate)
michael@0 73 {
michael@0 74 samp_frac_num -= den_rate;
michael@0 75 @@ -397,29 +411,33 @@
michael@0 76 const spx_uint32_t den_rate = st->den_rate;
michael@0 77 double sum;
michael@0 78
michael@0 79 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
michael@0 80 {
michael@0 81 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
michael@0 82 const spx_word16_t *iptr = & in[last_sample];
michael@0 83
michael@0 84 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
michael@0 85 - int j;
michael@0 86 - double accum[4] = {0,0,0,0};
michael@0 87 -
michael@0 88 - for(j=0;j<N;j+=4) {
michael@0 89 - accum[0] += sinct[j]*iptr[j];
michael@0 90 - accum[1] += sinct[j+1]*iptr[j+1];
michael@0 91 - accum[2] += sinct[j+2]*iptr[j+2];
michael@0 92 - accum[3] += sinct[j+3]*iptr[j+3];
michael@0 93 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
michael@0 94 + if(moz_has_sse2()) {
michael@0 95 + sum = inner_product_double(sinct, iptr, N);
michael@0 96 + } else {
michael@0 97 +#endif
michael@0 98 + int j;
michael@0 99 + double accum[4] = {0,0,0,0};
michael@0 100 +
michael@0 101 + for(j=0;j<N;j+=4) {
michael@0 102 + accum[0] += sinct[j]*iptr[j];
michael@0 103 + accum[1] += sinct[j+1]*iptr[j+1];
michael@0 104 + accum[2] += sinct[j+2]*iptr[j+2];
michael@0 105 + accum[3] += sinct[j+3]*iptr[j+3];
michael@0 106 + }
michael@0 107 + sum = accum[0] + accum[1] + accum[2] + accum[3];
michael@0 108 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
michael@0 109 }
michael@0 110 - sum = accum[0] + accum[1] + accum[2] + accum[3];
michael@0 111 -#else
michael@0 112 - sum = inner_product_double(sinct, iptr, N);
michael@0 113 #endif
michael@0 114
michael@0 115 out[out_stride * out_sample++] = PSHR32(sum, 15);
michael@0 116 last_sample += int_advance;
michael@0 117 samp_frac_num += frac_advance;
michael@0 118 if (samp_frac_num >= den_rate)
michael@0 119 {
michael@0 120 samp_frac_num -= den_rate;
michael@0 121 @@ -453,35 +471,38 @@
michael@0 122 #ifdef FIXED_POINT
michael@0 123 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
michael@0 124 #else
michael@0 125 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
michael@0 126 #endif
michael@0 127 spx_word16_t interp[4];
michael@0 128
michael@0 129
michael@0 130 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
michael@0 131 - int j;
michael@0 132 - spx_word32_t accum[4] = {0,0,0,0};
michael@0 133 -
michael@0 134 - for(j=0;j<N;j++) {
michael@0 135 - const spx_word16_t curr_in=iptr[j];
michael@0 136 - accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
michael@0 137 - accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
michael@0 138 - accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
michael@0 139 - accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
michael@0 140 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
michael@0 141 + if (moz_has_sse()) {
michael@0 142 + cubic_coef(frac, interp);
michael@0 143 + sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
michael@0 144 + } else {
michael@0 145 +#endif
michael@0 146 + int j;
michael@0 147 + spx_word32_t accum[4] = {0,0,0,0};
michael@0 148 +
michael@0 149 + for(j=0;j<N;j++) {
michael@0 150 + const spx_word16_t curr_in=iptr[j];
michael@0 151 + accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
michael@0 152 + accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
michael@0 153 + accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
michael@0 154 + accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
michael@0 155 + }
michael@0 156 + cubic_coef(frac, interp);
michael@0 157 + sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
michael@0 158 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
michael@0 159 }
michael@0 160 -
michael@0 161 - cubic_coef(frac, interp);
michael@0 162 - sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
michael@0 163 -#else
michael@0 164 - cubic_coef(frac, interp);
michael@0 165 - sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
michael@0 166 #endif
michael@0 167 -
michael@0 168 +
michael@0 169 out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
michael@0 170 last_sample += int_advance;
michael@0 171 samp_frac_num += frac_advance;
michael@0 172 if (samp_frac_num >= den_rate)
michael@0 173 {
michael@0 174 samp_frac_num -= den_rate;
michael@0 175 last_sample++;
michael@0 176 }
michael@0 177 @@ -515,35 +536,38 @@
michael@0 178 #ifdef FIXED_POINT
michael@0 179 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
michael@0 180 #else
michael@0 181 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
michael@0 182 #endif
michael@0 183 spx_word16_t interp[4];
michael@0 184
michael@0 185
michael@0 186 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
michael@0 187 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
michael@0 188 + if (moz_has_sse2()) {
michael@0 189 + cubic_coef(frac, interp);
michael@0 190 + sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
michael@0 191 + } else {
michael@0 192 +#endif
michael@0 193 int j;
michael@0 194 double accum[4] = {0,0,0,0};
michael@0 195
michael@0 196 for(j=0;j<N;j++) {
michael@0 197 const double curr_in=iptr[j];
michael@0 198 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
michael@0 199 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
michael@0 200 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
michael@0 201 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
michael@0 202 }
michael@0 203
michael@0 204 cubic_coef(frac, interp);
michael@0 205 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
michael@0 206 -#else
michael@0 207 - cubic_coef(frac, interp);
michael@0 208 - sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
michael@0 209 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
michael@0 210 + }
michael@0 211 #endif
michael@0 212 -
michael@0 213 out[out_stride * out_sample++] = PSHR32(sum,15);
michael@0 214 last_sample += int_advance;
michael@0 215 samp_frac_num += frac_advance;
michael@0 216 if (samp_frac_num >= den_rate)
michael@0 217 {
michael@0 218 samp_frac_num -= den_rate;
michael@0 219 last_sample++;
michael@0 220 }

mercurial