Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | --- /home/paul/workspace/repositories/opus-tools/src/resample.c 2012-11-21 11:36:59.119430163 +0100 |
michael@0 | 2 | +++ media/libspeex_resampler/src/resample.c 2013-08-09 19:24:39.060236120 +0200 |
michael@0 | 3 | @@ -92,18 +92,28 @@ |
michael@0 | 4 | |
michael@0 | 5 | #define IMAX(a,b) ((a) > (b) ? (a) : (b)) |
michael@0 | 6 | #define IMIN(a,b) ((a) < (b) ? (a) : (b)) |
michael@0 | 7 | |
michael@0 | 8 | #ifndef NULL |
michael@0 | 9 | #define NULL 0 |
michael@0 | 10 | #endif |
michael@0 | 11 | |
michael@0 | 12 | +#include "sse_detect.h" |
michael@0 | 13 | + |
michael@0 | 14 | +/* We compile SSE code on x86 all the time, but we only use it if we find at |
michael@0 | 15 | + * runtime that the CPU supports it. */ |
michael@0 | 16 | #if defined(FLOATING_POINT) && defined(__SSE__) |
michael@0 | 17 | +#if defined(_MSC_VER) |
michael@0 | 18 | +#define inline __inline |
michael@0 | 19 | +#endif |
michael@0 | 20 | # include "resample_sse.h" |
michael@0 | 21 | +#ifdef _MSC_VER |
michael@0 | 22 | +#undef inline |
michael@0 | 23 | +#endif |
michael@0 | 24 | #endif |
michael@0 | 25 | |
michael@0 | 26 | /* Numer of elements to allocate on the stack */ |
michael@0 | 27 | #ifdef VAR_ARRAYS |
michael@0 | 28 | #define FIXED_STACK_ALLOC 8192 |
michael@0 | 29 | #else |
michael@0 | 30 | #define FIXED_STACK_ALLOC 1024 |
michael@0 | 31 | #endif |
michael@0 | 32 | @@ -340,35 +350,39 @@ |
michael@0 | 33 | const spx_uint32_t den_rate = st->den_rate; |
michael@0 | 34 | spx_word32_t sum; |
michael@0 | 35 | |
michael@0 | 36 | while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len)) |
michael@0 | 37 | { |
michael@0 | 38 | const spx_word16_t *sinct = & sinc_table[samp_frac_num*N]; |
michael@0 | 39 | const spx_word16_t *iptr = & in[last_sample]; |
michael@0 | 40 | |
michael@0 | 41 | -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE |
michael@0 | 42 | +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE |
michael@0 | 43 | + if (moz_has_sse()) { |
michael@0 | 44 | + sum = inner_product_single(sinct, iptr, N); |
michael@0 | 45 | + } else { |
michael@0 | 46 | +#endif |
michael@0 | 47 | int j; |
michael@0 | 48 | sum = 0; |
michael@0 | 49 | for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]); |
michael@0 | 50 | |
michael@0 | 51 | /* This code is slower on most DSPs which have only 2 accumulators. |
michael@0 | 52 | Plus this this forces truncation to 32 bits and you lose the HW guard bits. |
michael@0 | 53 | I think we can trust the compiler and let it vectorize and/or unroll itself. |
michael@0 | 54 | spx_word32_t accum[4] = {0,0,0,0}; |
michael@0 | 55 | for(j=0;j<N;j+=4) { |
michael@0 | 56 | accum[0] += MULT16_16(sinct[j], iptr[j]); |
michael@0 | 57 | accum[1] += MULT16_16(sinct[j+1], iptr[j+1]); |
michael@0 | 58 | accum[2] += MULT16_16(sinct[j+2], iptr[j+2]); |
michael@0 | 59 | accum[3] += MULT16_16(sinct[j+3], iptr[j+3]); |
michael@0 | 60 | } |
michael@0 | 61 | sum = accum[0] + accum[1] + accum[2] + accum[3]; |
michael@0 | 62 | */ |
michael@0 | 63 | -#else |
michael@0 | 64 | - sum = inner_product_single(sinct, iptr, N); |
michael@0 | 65 | +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE |
michael@0 | 66 | + } |
michael@0 | 67 | #endif |
michael@0 | 68 | |
michael@0 | 69 | out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767); |
michael@0 | 70 | last_sample += int_advance; |
michael@0 | 71 | samp_frac_num += frac_advance; |
michael@0 | 72 | if (samp_frac_num >= den_rate) |
michael@0 | 73 | { |
michael@0 | 74 | samp_frac_num -= den_rate; |
michael@0 | 75 | @@ -397,29 +411,33 @@ |
michael@0 | 76 | const spx_uint32_t den_rate = st->den_rate; |
michael@0 | 77 | double sum; |
michael@0 | 78 | |
michael@0 | 79 | while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len)) |
michael@0 | 80 | { |
michael@0 | 81 | const spx_word16_t *sinct = & sinc_table[samp_frac_num*N]; |
michael@0 | 82 | const spx_word16_t *iptr = & in[last_sample]; |
michael@0 | 83 | |
michael@0 | 84 | -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE |
michael@0 | 85 | - int j; |
michael@0 | 86 | - double accum[4] = {0,0,0,0}; |
michael@0 | 87 | - |
michael@0 | 88 | - for(j=0;j<N;j+=4) { |
michael@0 | 89 | - accum[0] += sinct[j]*iptr[j]; |
michael@0 | 90 | - accum[1] += sinct[j+1]*iptr[j+1]; |
michael@0 | 91 | - accum[2] += sinct[j+2]*iptr[j+2]; |
michael@0 | 92 | - accum[3] += sinct[j+3]*iptr[j+3]; |
michael@0 | 93 | +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE |
michael@0 | 94 | + if(moz_has_sse2()) { |
michael@0 | 95 | + sum = inner_product_double(sinct, iptr, N); |
michael@0 | 96 | + } else { |
michael@0 | 97 | +#endif |
michael@0 | 98 | + int j; |
michael@0 | 99 | + double accum[4] = {0,0,0,0}; |
michael@0 | 100 | + |
michael@0 | 101 | + for(j=0;j<N;j+=4) { |
michael@0 | 102 | + accum[0] += sinct[j]*iptr[j]; |
michael@0 | 103 | + accum[1] += sinct[j+1]*iptr[j+1]; |
michael@0 | 104 | + accum[2] += sinct[j+2]*iptr[j+2]; |
michael@0 | 105 | + accum[3] += sinct[j+3]*iptr[j+3]; |
michael@0 | 106 | + } |
michael@0 | 107 | + sum = accum[0] + accum[1] + accum[2] + accum[3]; |
michael@0 | 108 | +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE |
michael@0 | 109 | } |
michael@0 | 110 | - sum = accum[0] + accum[1] + accum[2] + accum[3]; |
michael@0 | 111 | -#else |
michael@0 | 112 | - sum = inner_product_double(sinct, iptr, N); |
michael@0 | 113 | #endif |
michael@0 | 114 | |
michael@0 | 115 | out[out_stride * out_sample++] = PSHR32(sum, 15); |
michael@0 | 116 | last_sample += int_advance; |
michael@0 | 117 | samp_frac_num += frac_advance; |
michael@0 | 118 | if (samp_frac_num >= den_rate) |
michael@0 | 119 | { |
michael@0 | 120 | samp_frac_num -= den_rate; |
michael@0 | 121 | @@ -453,35 +471,38 @@ |
michael@0 | 122 | #ifdef FIXED_POINT |
michael@0 | 123 | const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate); |
michael@0 | 124 | #else |
michael@0 | 125 | const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate; |
michael@0 | 126 | #endif |
michael@0 | 127 | spx_word16_t interp[4]; |
michael@0 | 128 | |
michael@0 | 129 | |
michael@0 | 130 | -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE |
michael@0 | 131 | - int j; |
michael@0 | 132 | - spx_word32_t accum[4] = {0,0,0,0}; |
michael@0 | 133 | - |
michael@0 | 134 | - for(j=0;j<N;j++) { |
michael@0 | 135 | - const spx_word16_t curr_in=iptr[j]; |
michael@0 | 136 | - accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); |
michael@0 | 137 | - accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); |
michael@0 | 138 | - accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); |
michael@0 | 139 | - accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); |
michael@0 | 140 | +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE |
michael@0 | 141 | + if (moz_has_sse()) { |
michael@0 | 142 | + cubic_coef(frac, interp); |
michael@0 | 143 | + sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); |
michael@0 | 144 | + } else { |
michael@0 | 145 | +#endif |
michael@0 | 146 | + int j; |
michael@0 | 147 | + spx_word32_t accum[4] = {0,0,0,0}; |
michael@0 | 148 | + |
michael@0 | 149 | + for(j=0;j<N;j++) { |
michael@0 | 150 | + const spx_word16_t curr_in=iptr[j]; |
michael@0 | 151 | + accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); |
michael@0 | 152 | + accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); |
michael@0 | 153 | + accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); |
michael@0 | 154 | + accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); |
michael@0 | 155 | + } |
michael@0 | 156 | + cubic_coef(frac, interp); |
michael@0 | 157 | + sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1)); |
michael@0 | 158 | +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE |
michael@0 | 159 | } |
michael@0 | 160 | - |
michael@0 | 161 | - cubic_coef(frac, interp); |
michael@0 | 162 | - sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1)); |
michael@0 | 163 | -#else |
michael@0 | 164 | - cubic_coef(frac, interp); |
michael@0 | 165 | - sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); |
michael@0 | 166 | #endif |
michael@0 | 167 | - |
michael@0 | 168 | + |
michael@0 | 169 | out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767); |
michael@0 | 170 | last_sample += int_advance; |
michael@0 | 171 | samp_frac_num += frac_advance; |
michael@0 | 172 | if (samp_frac_num >= den_rate) |
michael@0 | 173 | { |
michael@0 | 174 | samp_frac_num -= den_rate; |
michael@0 | 175 | last_sample++; |
michael@0 | 176 | } |
michael@0 | 177 | @@ -515,35 +536,38 @@ |
michael@0 | 178 | #ifdef FIXED_POINT |
michael@0 | 179 | const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate); |
michael@0 | 180 | #else |
michael@0 | 181 | const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate; |
michael@0 | 182 | #endif |
michael@0 | 183 | spx_word16_t interp[4]; |
michael@0 | 184 | |
michael@0 | 185 | |
michael@0 | 186 | -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE |
michael@0 | 187 | +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE |
michael@0 | 188 | + if (moz_has_sse2()) { |
michael@0 | 189 | + cubic_coef(frac, interp); |
michael@0 | 190 | + sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); |
michael@0 | 191 | + } else { |
michael@0 | 192 | +#endif |
michael@0 | 193 | int j; |
michael@0 | 194 | double accum[4] = {0,0,0,0}; |
michael@0 | 195 | |
michael@0 | 196 | for(j=0;j<N;j++) { |
michael@0 | 197 | const double curr_in=iptr[j]; |
michael@0 | 198 | accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); |
michael@0 | 199 | accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); |
michael@0 | 200 | accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); |
michael@0 | 201 | accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); |
michael@0 | 202 | } |
michael@0 | 203 | |
michael@0 | 204 | cubic_coef(frac, interp); |
michael@0 | 205 | sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]); |
michael@0 | 206 | -#else |
michael@0 | 207 | - cubic_coef(frac, interp); |
michael@0 | 208 | - sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); |
michael@0 | 209 | +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE |
michael@0 | 210 | + } |
michael@0 | 211 | #endif |
michael@0 | 212 | - |
michael@0 | 213 | out[out_stride * out_sample++] = PSHR32(sum,15); |
michael@0 | 214 | last_sample += int_advance; |
michael@0 | 215 | samp_frac_num += frac_advance; |
michael@0 | 216 | if (samp_frac_num >= den_rate) |
michael@0 | 217 | { |
michael@0 | 218 | samp_frac_num -= den_rate; |
michael@0 | 219 | last_sample++; |
michael@0 | 220 | } |