1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libopus/celt/mdct.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,311 @@ 1.4 +/* Copyright (c) 2007-2008 CSIRO 1.5 + Copyright (c) 2007-2008 Xiph.Org Foundation 1.6 + Written by Jean-Marc Valin */ 1.7 +/* 1.8 + Redistribution and use in source and binary forms, with or without 1.9 + modification, are permitted provided that the following conditions 1.10 + are met: 1.11 + 1.12 + - Redistributions of source code must retain the above copyright 1.13 + notice, this list of conditions and the following disclaimer. 1.14 + 1.15 + - Redistributions in binary form must reproduce the above copyright 1.16 + notice, this list of conditions and the following disclaimer in the 1.17 + documentation and/or other materials provided with the distribution. 1.18 + 1.19 + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1.20 + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 1.21 + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 1.22 + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 1.23 + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1.24 + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 1.25 + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 1.26 + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 1.27 + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 1.28 + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 1.29 + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.30 +*/ 1.31 + 1.32 +/* This is a simple MDCT implementation that uses a N/4 complex FFT 1.33 + to do most of the work. It should be relatively straightforward to 1.34 + plug in pretty much and FFT here. 1.35 + 1.36 + This replaces the Vorbis FFT (and uses the exact same API), which 1.37 + was a bit too messy and that was ending up duplicating code 1.38 + (might as well use the same FFT everywhere). 1.39 + 1.40 + The algorithm is similar to (and inspired from) Fabrice Bellard's 1.41 + MDCT implementation in FFMPEG, but has differences in signs, ordering 1.42 + and scaling in many places. 1.43 +*/ 1.44 + 1.45 +#ifndef SKIP_CONFIG_H 1.46 +#ifdef HAVE_CONFIG_H 1.47 +#include "config.h" 1.48 +#endif 1.49 +#endif 1.50 + 1.51 +#include "mdct.h" 1.52 +#include "kiss_fft.h" 1.53 +#include "_kiss_fft_guts.h" 1.54 +#include <math.h> 1.55 +#include "os_support.h" 1.56 +#include "mathops.h" 1.57 +#include "stack_alloc.h" 1.58 + 1.59 +#ifdef CUSTOM_MODES 1.60 + 1.61 +int clt_mdct_init(mdct_lookup *l,int N, int maxshift) 1.62 +{ 1.63 + int i; 1.64 + int N4; 1.65 + kiss_twiddle_scalar *trig; 1.66 +#if defined(FIXED_POINT) 1.67 + int N2=N>>1; 1.68 +#endif 1.69 + l->n = N; 1.70 + N4 = N>>2; 1.71 + l->maxshift = maxshift; 1.72 + for (i=0;i<=maxshift;i++) 1.73 + { 1.74 + if (i==0) 1.75 + l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0); 1.76 + else 1.77 + l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0]); 1.78 +#ifndef ENABLE_TI_DSPLIB55 1.79 + if (l->kfft[i]==NULL) 1.80 + return 0; 1.81 +#endif 1.82 + } 1.83 + l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar)); 1.84 + if (l->trig==NULL) 1.85 + return 0; 1.86 + /* We have enough points that sine isn't necessary */ 1.87 +#if defined(FIXED_POINT) 1.88 + for (i=0;i<=N4;i++) 1.89 + trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N)); 1.90 +#else 1.91 + for (i=0;i<=N4;i++) 1.92 + trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N); 1.93 +#endif 1.94 + return 1; 1.95 +} 1.96 + 1.97 +void clt_mdct_clear(mdct_lookup *l) 1.98 +{ 1.99 + int i; 1.100 + for (i=0;i<=l->maxshift;i++) 1.101 + opus_fft_free(l->kfft[i]); 1.102 + opus_free((kiss_twiddle_scalar*)l->trig); 1.103 +} 1.104 + 1.105 +#endif /* CUSTOM_MODES */ 1.106 + 1.107 +/* Forward MDCT trashes the input array */ 1.108 +void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, 1.109 + const opus_val16 *window, int overlap, int shift, int stride) 1.110 +{ 1.111 + int i; 1.112 + int N, N2, N4; 1.113 + kiss_twiddle_scalar sine; 1.114 + VARDECL(kiss_fft_scalar, f); 1.115 + VARDECL(kiss_fft_scalar, f2); 1.116 + SAVE_STACK; 1.117 + N = l->n; 1.118 + N >>= shift; 1.119 + N2 = N>>1; 1.120 + N4 = N>>2; 1.121 + ALLOC(f, N2, kiss_fft_scalar); 1.122 + ALLOC(f2, N2, kiss_fft_scalar); 1.123 + /* sin(x) ~= x here */ 1.124 +#ifdef FIXED_POINT 1.125 + sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; 1.126 +#else 1.127 + sine = (kiss_twiddle_scalar)2*PI*(.125f)/N; 1.128 +#endif 1.129 + 1.130 + /* Consider the input to be composed of four blocks: [a, b, c, d] */ 1.131 + /* Window, shuffle, fold */ 1.132 + { 1.133 + /* Temp pointers to make it really clear to the compiler what we're doing */ 1.134 + const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); 1.135 + const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); 1.136 + kiss_fft_scalar * OPUS_RESTRICT yp = f; 1.137 + const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); 1.138 + const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; 1.139 + for(i=0;i<((overlap+3)>>2);i++) 1.140 + { 1.141 + /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ 1.142 + *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2); 1.143 + *yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]); 1.144 + xp1+=2; 1.145 + xp2-=2; 1.146 + wp1+=2; 1.147 + wp2-=2; 1.148 + } 1.149 + wp1 = window; 1.150 + wp2 = window+overlap-1; 1.151 + for(;i<N4-((overlap+3)>>2);i++) 1.152 + { 1.153 + /* Real part arranged as a-bR, Imag part arranged as -c-dR */ 1.154 + *yp++ = *xp2; 1.155 + *yp++ = *xp1; 1.156 + xp1+=2; 1.157 + xp2-=2; 1.158 + } 1.159 + for(;i<N4;i++) 1.160 + { 1.161 + /* Real part arranged as a-bR, Imag part arranged as -c-dR */ 1.162 + *yp++ = -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2); 1.163 + *yp++ = MULT16_32_Q15(*wp2, *xp1) + MULT16_32_Q15(*wp1, xp2[N2]); 1.164 + xp1+=2; 1.165 + xp2-=2; 1.166 + wp1+=2; 1.167 + wp2-=2; 1.168 + } 1.169 + } 1.170 + /* Pre-rotation */ 1.171 + { 1.172 + kiss_fft_scalar * OPUS_RESTRICT yp = f; 1.173 + const kiss_twiddle_scalar *t = &l->trig[0]; 1.174 + for(i=0;i<N4;i++) 1.175 + { 1.176 + kiss_fft_scalar re, im, yr, yi; 1.177 + re = yp[0]; 1.178 + im = yp[1]; 1.179 + yr = -S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]); 1.180 + yi = -S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]); 1.181 + /* works because the cos is nearly one */ 1.182 + *yp++ = yr + S_MUL(yi,sine); 1.183 + *yp++ = yi - S_MUL(yr,sine); 1.184 + } 1.185 + } 1.186 + 1.187 + /* N/4 complex FFT, down-scales by 4/N */ 1.188 + opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2); 1.189 + 1.190 + /* Post-rotate */ 1.191 + { 1.192 + /* Temp pointers to make it really clear to the compiler what we're doing */ 1.193 + const kiss_fft_scalar * OPUS_RESTRICT fp = f2; 1.194 + kiss_fft_scalar * OPUS_RESTRICT yp1 = out; 1.195 + kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); 1.196 + const kiss_twiddle_scalar *t = &l->trig[0]; 1.197 + /* Temp pointers to make it really clear to the compiler what we're doing */ 1.198 + for(i=0;i<N4;i++) 1.199 + { 1.200 + kiss_fft_scalar yr, yi; 1.201 + yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]); 1.202 + yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]); 1.203 + /* works because the cos is nearly one */ 1.204 + *yp1 = yr - S_MUL(yi,sine); 1.205 + *yp2 = yi + S_MUL(yr,sine);; 1.206 + fp += 2; 1.207 + yp1 += 2*stride; 1.208 + yp2 -= 2*stride; 1.209 + } 1.210 + } 1.211 + RESTORE_STACK; 1.212 +} 1.213 + 1.214 +void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, 1.215 + const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride) 1.216 +{ 1.217 + int i; 1.218 + int N, N2, N4; 1.219 + kiss_twiddle_scalar sine; 1.220 + VARDECL(kiss_fft_scalar, f2); 1.221 + SAVE_STACK; 1.222 + N = l->n; 1.223 + N >>= shift; 1.224 + N2 = N>>1; 1.225 + N4 = N>>2; 1.226 + ALLOC(f2, N2, kiss_fft_scalar); 1.227 + /* sin(x) ~= x here */ 1.228 +#ifdef FIXED_POINT 1.229 + sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; 1.230 +#else 1.231 + sine = (kiss_twiddle_scalar)2*PI*(.125f)/N; 1.232 +#endif 1.233 + 1.234 + /* Pre-rotate */ 1.235 + { 1.236 + /* Temp pointers to make it really clear to the compiler what we're doing */ 1.237 + const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; 1.238 + const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); 1.239 + kiss_fft_scalar * OPUS_RESTRICT yp = f2; 1.240 + const kiss_twiddle_scalar *t = &l->trig[0]; 1.241 + for(i=0;i<N4;i++) 1.242 + { 1.243 + kiss_fft_scalar yr, yi; 1.244 + yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]); 1.245 + yi = -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]); 1.246 + /* works because the cos is nearly one */ 1.247 + *yp++ = yr - S_MUL(yi,sine); 1.248 + *yp++ = yi + S_MUL(yr,sine); 1.249 + xp1+=2*stride; 1.250 + xp2-=2*stride; 1.251 + } 1.252 + } 1.253 + 1.254 + /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */ 1.255 + opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1))); 1.256 + 1.257 + /* Post-rotate and de-shuffle from both ends of the buffer at once to make 1.258 + it in-place. */ 1.259 + { 1.260 + kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1); 1.261 + kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2; 1.262 + const kiss_twiddle_scalar *t = &l->trig[0]; 1.263 + /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the 1.264 + middle pair will be computed twice. */ 1.265 + for(i=0;i<(N4+1)>>1;i++) 1.266 + { 1.267 + kiss_fft_scalar re, im, yr, yi; 1.268 + kiss_twiddle_scalar t0, t1; 1.269 + re = yp0[0]; 1.270 + im = yp0[1]; 1.271 + t0 = t[i<<shift]; 1.272 + t1 = t[(N4-i)<<shift]; 1.273 + /* We'd scale up by 2 here, but instead it's done when mixing the windows */ 1.274 + yr = S_MUL(re,t0) - S_MUL(im,t1); 1.275 + yi = S_MUL(im,t0) + S_MUL(re,t1); 1.276 + re = yp1[0]; 1.277 + im = yp1[1]; 1.278 + /* works because the cos is nearly one */ 1.279 + yp0[0] = -(yr - S_MUL(yi,sine)); 1.280 + yp1[1] = yi + S_MUL(yr,sine); 1.281 + 1.282 + t0 = t[(N4-i-1)<<shift]; 1.283 + t1 = t[(i+1)<<shift]; 1.284 + /* We'd scale up by 2 here, but instead it's done when mixing the windows */ 1.285 + yr = S_MUL(re,t0) - S_MUL(im,t1); 1.286 + yi = S_MUL(im,t0) + S_MUL(re,t1); 1.287 + /* works because the cos is nearly one */ 1.288 + yp1[0] = -(yr - S_MUL(yi,sine)); 1.289 + yp0[1] = yi + S_MUL(yr,sine); 1.290 + yp0 += 2; 1.291 + yp1 -= 2; 1.292 + } 1.293 + } 1.294 + 1.295 + /* Mirror on both sides for TDAC */ 1.296 + { 1.297 + kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; 1.298 + kiss_fft_scalar * OPUS_RESTRICT yp1 = out; 1.299 + const opus_val16 * OPUS_RESTRICT wp1 = window; 1.300 + const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; 1.301 + 1.302 + for(i = 0; i < overlap/2; i++) 1.303 + { 1.304 + kiss_fft_scalar x1, x2; 1.305 + x1 = *xp1; 1.306 + x2 = *yp1; 1.307 + *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1); 1.308 + *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1); 1.309 + wp1++; 1.310 + wp2--; 1.311 + } 1.312 + } 1.313 + RESTORE_STACK; 1.314 +}