Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 --- /home/paul/workspace/repositories/opus-tools/src/resample.c 2012-11-21 11:36:59.119430163 +0100
2 +++ media/libspeex_resampler/src/resample.c 2013-08-09 19:24:39.060236120 +0200
3 @@ -92,18 +92,28 @@
5 #define IMAX(a,b) ((a) > (b) ? (a) : (b))
6 #define IMIN(a,b) ((a) < (b) ? (a) : (b))
8 #ifndef NULL
9 #define NULL 0
10 #endif
12 +#include "sse_detect.h"
13 +
14 +/* We compile SSE code on x86 all the time, but we only use it if we find at
15 + * runtime that the CPU supports it. */
16 #if defined(FLOATING_POINT) && defined(__SSE__)
17 +#if defined(_MSC_VER)
18 +#define inline __inline
19 +#endif
20 # include "resample_sse.h"
21 +#ifdef _MSC_VER
22 +#undef inline
23 +#endif
24 #endif
26 /* Numer of elements to allocate on the stack */
27 #ifdef VAR_ARRAYS
28 #define FIXED_STACK_ALLOC 8192
29 #else
30 #define FIXED_STACK_ALLOC 1024
31 #endif
32 @@ -340,35 +350,39 @@
33 const spx_uint32_t den_rate = st->den_rate;
34 spx_word32_t sum;
36 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
37 {
38 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
39 const spx_word16_t *iptr = & in[last_sample];
41 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
42 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
43 + if (moz_has_sse()) {
44 + sum = inner_product_single(sinct, iptr, N);
45 + } else {
46 +#endif
47 int j;
48 sum = 0;
49 for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]);
51 /* This code is slower on most DSPs which have only 2 accumulators.
52 Plus this this forces truncation to 32 bits and you lose the HW guard bits.
53 I think we can trust the compiler and let it vectorize and/or unroll itself.
54 spx_word32_t accum[4] = {0,0,0,0};
55 for(j=0;j<N;j+=4) {
56 accum[0] += MULT16_16(sinct[j], iptr[j]);
57 accum[1] += MULT16_16(sinct[j+1], iptr[j+1]);
58 accum[2] += MULT16_16(sinct[j+2], iptr[j+2]);
59 accum[3] += MULT16_16(sinct[j+3], iptr[j+3]);
60 }
61 sum = accum[0] + accum[1] + accum[2] + accum[3];
62 */
63 -#else
64 - sum = inner_product_single(sinct, iptr, N);
65 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
66 + }
67 #endif
69 out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
70 last_sample += int_advance;
71 samp_frac_num += frac_advance;
72 if (samp_frac_num >= den_rate)
73 {
74 samp_frac_num -= den_rate;
75 @@ -397,29 +411,33 @@
76 const spx_uint32_t den_rate = st->den_rate;
77 double sum;
79 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len))
80 {
81 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N];
82 const spx_word16_t *iptr = & in[last_sample];
84 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
85 - int j;
86 - double accum[4] = {0,0,0,0};
87 -
88 - for(j=0;j<N;j+=4) {
89 - accum[0] += sinct[j]*iptr[j];
90 - accum[1] += sinct[j+1]*iptr[j+1];
91 - accum[2] += sinct[j+2]*iptr[j+2];
92 - accum[3] += sinct[j+3]*iptr[j+3];
93 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
94 + if(moz_has_sse2()) {
95 + sum = inner_product_double(sinct, iptr, N);
96 + } else {
97 +#endif
98 + int j;
99 + double accum[4] = {0,0,0,0};
100 +
101 + for(j=0;j<N;j+=4) {
102 + accum[0] += sinct[j]*iptr[j];
103 + accum[1] += sinct[j+1]*iptr[j+1];
104 + accum[2] += sinct[j+2]*iptr[j+2];
105 + accum[3] += sinct[j+3]*iptr[j+3];
106 + }
107 + sum = accum[0] + accum[1] + accum[2] + accum[3];
108 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
109 }
110 - sum = accum[0] + accum[1] + accum[2] + accum[3];
111 -#else
112 - sum = inner_product_double(sinct, iptr, N);
113 #endif
115 out[out_stride * out_sample++] = PSHR32(sum, 15);
116 last_sample += int_advance;
117 samp_frac_num += frac_advance;
118 if (samp_frac_num >= den_rate)
119 {
120 samp_frac_num -= den_rate;
121 @@ -453,35 +471,38 @@
122 #ifdef FIXED_POINT
123 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
124 #else
125 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
126 #endif
127 spx_word16_t interp[4];
130 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
131 - int j;
132 - spx_word32_t accum[4] = {0,0,0,0};
133 -
134 - for(j=0;j<N;j++) {
135 - const spx_word16_t curr_in=iptr[j];
136 - accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
137 - accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
138 - accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
139 - accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
140 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
141 + if (moz_has_sse()) {
142 + cubic_coef(frac, interp);
143 + sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
144 + } else {
145 +#endif
146 + int j;
147 + spx_word32_t accum[4] = {0,0,0,0};
148 +
149 + for(j=0;j<N;j++) {
150 + const spx_word16_t curr_in=iptr[j];
151 + accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
152 + accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
153 + accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
154 + accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
155 + }
156 + cubic_coef(frac, interp);
157 + sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
158 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
159 }
160 -
161 - cubic_coef(frac, interp);
162 - sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
163 -#else
164 - cubic_coef(frac, interp);
165 - sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
166 #endif
167 -
168 +
169 out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
170 last_sample += int_advance;
171 samp_frac_num += frac_advance;
172 if (samp_frac_num >= den_rate)
173 {
174 samp_frac_num -= den_rate;
175 last_sample++;
176 }
177 @@ -515,35 +536,38 @@
178 #ifdef FIXED_POINT
179 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate);
180 #else
181 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate;
182 #endif
183 spx_word16_t interp[4];
186 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
187 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
188 + if (moz_has_sse2()) {
189 + cubic_coef(frac, interp);
190 + sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
191 + } else {
192 +#endif
193 int j;
194 double accum[4] = {0,0,0,0};
196 for(j=0;j<N;j++) {
197 const double curr_in=iptr[j];
198 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]);
199 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]);
200 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]);
201 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]);
202 }
204 cubic_coef(frac, interp);
205 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
206 -#else
207 - cubic_coef(frac, interp);
208 - sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
209 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
210 + }
211 #endif
212 -
213 out[out_stride * out_sample++] = PSHR32(sum,15);
214 last_sample += int_advance;
215 samp_frac_num += frac_advance;
216 if (samp_frac_num >= den_rate)
217 {
218 samp_frac_num -= den_rate;
219 last_sample++;
220 }