|
1 --- /home/paul/workspace/repositories/opus-tools/src/resample.c 2012-11-21 11:36:59.119430163 +0100 |
|
2 +++ media/libspeex_resampler/src/resample.c 2013-08-09 19:24:39.060236120 +0200 |
|
3 @@ -92,18 +92,28 @@ |
|
4 |
|
5 #define IMAX(a,b) ((a) > (b) ? (a) : (b)) |
|
6 #define IMIN(a,b) ((a) < (b) ? (a) : (b)) |
|
7 |
|
8 #ifndef NULL |
|
9 #define NULL 0 |
|
10 #endif |
|
11 |
|
12 +#include "sse_detect.h" |
|
13 + |
|
14 +/* We compile SSE code on x86 all the time, but we only use it if we find at |
|
15 + * runtime that the CPU supports it. */ |
|
16 #if defined(FLOATING_POINT) && defined(__SSE__) |
|
17 +#if defined(_MSC_VER) |
|
18 +#define inline __inline |
|
19 +#endif |
|
20 # include "resample_sse.h" |
|
21 +#ifdef _MSC_VER |
|
22 +#undef inline |
|
23 +#endif |
|
24 #endif |
|
25 |
|
26 /* Numer of elements to allocate on the stack */ |
|
27 #ifdef VAR_ARRAYS |
|
28 #define FIXED_STACK_ALLOC 8192 |
|
29 #else |
|
30 #define FIXED_STACK_ALLOC 1024 |
|
31 #endif |
|
32 @@ -340,35 +350,39 @@ |
|
33 const spx_uint32_t den_rate = st->den_rate; |
|
34 spx_word32_t sum; |
|
35 |
|
36 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len)) |
|
37 { |
|
38 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N]; |
|
39 const spx_word16_t *iptr = & in[last_sample]; |
|
40 |
|
41 -#ifndef OVERRIDE_INNER_PRODUCT_SINGLE |
|
42 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE |
|
43 + if (moz_has_sse()) { |
|
44 + sum = inner_product_single(sinct, iptr, N); |
|
45 + } else { |
|
46 +#endif |
|
47 int j; |
|
48 sum = 0; |
|
49 for(j=0;j<N;j++) sum += MULT16_16(sinct[j], iptr[j]); |
|
50 |
|
51 /* This code is slower on most DSPs which have only 2 accumulators. |
|
52 Plus this this forces truncation to 32 bits and you lose the HW guard bits. |
|
53 I think we can trust the compiler and let it vectorize and/or unroll itself. |
|
54 spx_word32_t accum[4] = {0,0,0,0}; |
|
55 for(j=0;j<N;j+=4) { |
|
56 accum[0] += MULT16_16(sinct[j], iptr[j]); |
|
57 accum[1] += MULT16_16(sinct[j+1], iptr[j+1]); |
|
58 accum[2] += MULT16_16(sinct[j+2], iptr[j+2]); |
|
59 accum[3] += MULT16_16(sinct[j+3], iptr[j+3]); |
|
60 } |
|
61 sum = accum[0] + accum[1] + accum[2] + accum[3]; |
|
62 */ |
|
63 -#else |
|
64 - sum = inner_product_single(sinct, iptr, N); |
|
65 +#ifdef OVERRIDE_INNER_PRODUCT_SINGLE |
|
66 + } |
|
67 #endif |
|
68 |
|
69 out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767); |
|
70 last_sample += int_advance; |
|
71 samp_frac_num += frac_advance; |
|
72 if (samp_frac_num >= den_rate) |
|
73 { |
|
74 samp_frac_num -= den_rate; |
|
75 @@ -397,29 +411,33 @@ |
|
76 const spx_uint32_t den_rate = st->den_rate; |
|
77 double sum; |
|
78 |
|
79 while (!(last_sample >= (spx_int32_t)*in_len || out_sample >= (spx_int32_t)*out_len)) |
|
80 { |
|
81 const spx_word16_t *sinct = & sinc_table[samp_frac_num*N]; |
|
82 const spx_word16_t *iptr = & in[last_sample]; |
|
83 |
|
84 -#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE |
|
85 - int j; |
|
86 - double accum[4] = {0,0,0,0}; |
|
87 - |
|
88 - for(j=0;j<N;j+=4) { |
|
89 - accum[0] += sinct[j]*iptr[j]; |
|
90 - accum[1] += sinct[j+1]*iptr[j+1]; |
|
91 - accum[2] += sinct[j+2]*iptr[j+2]; |
|
92 - accum[3] += sinct[j+3]*iptr[j+3]; |
|
93 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE |
|
94 + if(moz_has_sse2()) { |
|
95 + sum = inner_product_double(sinct, iptr, N); |
|
96 + } else { |
|
97 +#endif |
|
98 + int j; |
|
99 + double accum[4] = {0,0,0,0}; |
|
100 + |
|
101 + for(j=0;j<N;j+=4) { |
|
102 + accum[0] += sinct[j]*iptr[j]; |
|
103 + accum[1] += sinct[j+1]*iptr[j+1]; |
|
104 + accum[2] += sinct[j+2]*iptr[j+2]; |
|
105 + accum[3] += sinct[j+3]*iptr[j+3]; |
|
106 + } |
|
107 + sum = accum[0] + accum[1] + accum[2] + accum[3]; |
|
108 +#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE |
|
109 } |
|
110 - sum = accum[0] + accum[1] + accum[2] + accum[3]; |
|
111 -#else |
|
112 - sum = inner_product_double(sinct, iptr, N); |
|
113 #endif |
|
114 |
|
115 out[out_stride * out_sample++] = PSHR32(sum, 15); |
|
116 last_sample += int_advance; |
|
117 samp_frac_num += frac_advance; |
|
118 if (samp_frac_num >= den_rate) |
|
119 { |
|
120 samp_frac_num -= den_rate; |
|
121 @@ -453,35 +471,38 @@ |
|
122 #ifdef FIXED_POINT |
|
123 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate); |
|
124 #else |
|
125 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate; |
|
126 #endif |
|
127 spx_word16_t interp[4]; |
|
128 |
|
129 |
|
130 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE |
|
131 - int j; |
|
132 - spx_word32_t accum[4] = {0,0,0,0}; |
|
133 - |
|
134 - for(j=0;j<N;j++) { |
|
135 - const spx_word16_t curr_in=iptr[j]; |
|
136 - accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); |
|
137 - accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); |
|
138 - accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); |
|
139 - accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); |
|
140 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE |
|
141 + if (moz_has_sse()) { |
|
142 + cubic_coef(frac, interp); |
|
143 + sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); |
|
144 + } else { |
|
145 +#endif |
|
146 + int j; |
|
147 + spx_word32_t accum[4] = {0,0,0,0}; |
|
148 + |
|
149 + for(j=0;j<N;j++) { |
|
150 + const spx_word16_t curr_in=iptr[j]; |
|
151 + accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); |
|
152 + accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); |
|
153 + accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); |
|
154 + accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); |
|
155 + } |
|
156 + cubic_coef(frac, interp); |
|
157 + sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1)); |
|
158 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE |
|
159 } |
|
160 - |
|
161 - cubic_coef(frac, interp); |
|
162 - sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1)); |
|
163 -#else |
|
164 - cubic_coef(frac, interp); |
|
165 - sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); |
|
166 #endif |
|
167 - |
|
168 + |
|
169 out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767); |
|
170 last_sample += int_advance; |
|
171 samp_frac_num += frac_advance; |
|
172 if (samp_frac_num >= den_rate) |
|
173 { |
|
174 samp_frac_num -= den_rate; |
|
175 last_sample++; |
|
176 } |
|
177 @@ -515,35 +536,38 @@ |
|
178 #ifdef FIXED_POINT |
|
179 const spx_word16_t frac = PDIV32(SHL32((samp_frac_num*st->oversample) % st->den_rate,15),st->den_rate); |
|
180 #else |
|
181 const spx_word16_t frac = ((float)((samp_frac_num*st->oversample) % st->den_rate))/st->den_rate; |
|
182 #endif |
|
183 spx_word16_t interp[4]; |
|
184 |
|
185 |
|
186 -#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE |
|
187 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE |
|
188 + if (moz_has_sse2()) { |
|
189 + cubic_coef(frac, interp); |
|
190 + sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); |
|
191 + } else { |
|
192 +#endif |
|
193 int j; |
|
194 double accum[4] = {0,0,0,0}; |
|
195 |
|
196 for(j=0;j<N;j++) { |
|
197 const double curr_in=iptr[j]; |
|
198 accum[0] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-2]); |
|
199 accum[1] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset-1]); |
|
200 accum[2] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset]); |
|
201 accum[3] += MULT16_16(curr_in,st->sinc_table[4+(j+1)*st->oversample-offset+1]); |
|
202 } |
|
203 |
|
204 cubic_coef(frac, interp); |
|
205 sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]); |
|
206 -#else |
|
207 - cubic_coef(frac, interp); |
|
208 - sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); |
|
209 +#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE |
|
210 + } |
|
211 #endif |
|
212 - |
|
213 out[out_stride * out_sample++] = PSHR32(sum,15); |
|
214 last_sample += int_advance; |
|
215 samp_frac_num += frac_advance; |
|
216 if (samp_frac_num >= den_rate) |
|
217 { |
|
218 samp_frac_num -= den_rate; |
|
219 last_sample++; |
|
220 } |