michael@0: /* Copyright (c) 2011 Xiph.Org Foundation michael@0: Written by Jean-Marc Valin */ michael@0: /* michael@0: Redistribution and use in source and binary forms, with or without michael@0: modification, are permitted provided that the following conditions michael@0: are met: michael@0: michael@0: - Redistributions of source code must retain the above copyright michael@0: notice, this list of conditions and the following disclaimer. michael@0: michael@0: - Redistributions in binary form must reproduce the above copyright michael@0: notice, this list of conditions and the following disclaimer in the michael@0: documentation and/or other materials provided with the distribution. michael@0: michael@0: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS michael@0: ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT michael@0: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR michael@0: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR michael@0: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, michael@0: EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, michael@0: PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR michael@0: PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF michael@0: LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING michael@0: NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS michael@0: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. michael@0: */ michael@0: michael@0: #ifdef HAVE_CONFIG_H michael@0: #include "config.h" michael@0: #endif michael@0: michael@0: #include "kiss_fft.h" michael@0: #include "celt.h" michael@0: #include "modes.h" michael@0: #include "arch.h" michael@0: #include "quant_bands.h" michael@0: #include michael@0: #include "analysis.h" michael@0: #include "mlp.h" michael@0: #include "stack_alloc.h" michael@0: michael@0: extern const MLP net; michael@0: michael@0: #ifndef M_PI michael@0: #define M_PI 3.141592653 michael@0: #endif michael@0: michael@0: static const float dct_table[128] = { michael@0: 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, michael@0: 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, michael@0: 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f, michael@0: -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f, michael@0: 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f, michael@0: -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f, michael@0: 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f, michael@0: 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f, michael@0: 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f, michael@0: 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f, michael@0: 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f, michael@0: -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f, michael@0: 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f, michael@0: -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f, michael@0: 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f, michael@0: 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f, michael@0: }; michael@0: michael@0: static const float analysis_window[240] = { michael@0: 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f, michael@0: 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f, michael@0: 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f, michael@0: 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f, michael@0: 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f, michael@0: 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f, michael@0: 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f, michael@0: 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f, michael@0: 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f, michael@0: 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f, michael@0: 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f, michael@0: 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f, michael@0: 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f, michael@0: 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f, michael@0: 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f, michael@0: 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f, michael@0: 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f, michael@0: 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f, michael@0: 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f, michael@0: 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f, michael@0: 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f, michael@0: 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f, michael@0: 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f, michael@0: 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f, michael@0: 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f, michael@0: 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f, michael@0: 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f, michael@0: 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f, michael@0: 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f, michael@0: 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f, michael@0: }; michael@0: michael@0: static const int tbands[NB_TBANDS+1] = { michael@0: 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120 michael@0: }; michael@0: michael@0: static const int extra_bands[NB_TOT_BANDS+1] = { michael@0: 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200 michael@0: }; michael@0: michael@0: /*static const float tweight[NB_TBANDS+1] = { michael@0: .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5 michael@0: };*/ michael@0: michael@0: #define NB_TONAL_SKIP_BANDS 9 michael@0: michael@0: #define cA 0.43157974f michael@0: #define cB 0.67848403f michael@0: #define cC 0.08595542f michael@0: #define cE ((float)M_PI/2) michael@0: static OPUS_INLINE float fast_atan2f(float y, float x) { michael@0: float x2, y2; michael@0: /* Should avoid underflow on the values we'll get */ michael@0: if (ABS16(x)+ABS16(y)<1e-9f) michael@0: { michael@0: x*=1e12f; michael@0: y*=1e12f; michael@0: } michael@0: x2 = x*x; michael@0: y2 = y*y; michael@0: if(x2read_pos; michael@0: curr_lookahead = tonal->write_pos-tonal->read_pos; michael@0: if (curr_lookahead<0) michael@0: curr_lookahead += DETECT_SIZE; michael@0: michael@0: if (len > 480 && pos != tonal->write_pos) michael@0: { michael@0: pos++; michael@0: if (pos==DETECT_SIZE) michael@0: pos=0; michael@0: } michael@0: if (pos == tonal->write_pos) michael@0: pos--; michael@0: if (pos<0) michael@0: pos = DETECT_SIZE-1; michael@0: OPUS_COPY(info_out, &tonal->info[pos], 1); michael@0: tonal->read_subframe += len/120; michael@0: while (tonal->read_subframe>=4) michael@0: { michael@0: tonal->read_subframe -= 4; michael@0: tonal->read_pos++; michael@0: } michael@0: if (tonal->read_pos>=DETECT_SIZE) michael@0: tonal->read_pos-=DETECT_SIZE; michael@0: michael@0: /* Compensate for the delay in the features themselves. michael@0: FIXME: Need a better estimate the 10 I just made up */ michael@0: curr_lookahead = IMAX(curr_lookahead-10, 0); michael@0: michael@0: psum=0; michael@0: /* Summing the probability of transition patterns that involve music at michael@0: time (DETECT_SIZE-curr_lookahead-1) */ michael@0: for (i=0;ipmusic[i]; michael@0: for (;ipspeech[i]; michael@0: psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; michael@0: /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/ michael@0: michael@0: info_out->music_prob = psum; michael@0: } michael@0: michael@0: void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix) michael@0: { michael@0: int i, b; michael@0: const kiss_fft_state *kfft; michael@0: VARDECL(kiss_fft_cpx, in); michael@0: VARDECL(kiss_fft_cpx, out); michael@0: int N = 480, N2=240; michael@0: float * OPUS_RESTRICT A = tonal->angle; michael@0: float * OPUS_RESTRICT dA = tonal->d_angle; michael@0: float * OPUS_RESTRICT d2A = tonal->d2_angle; michael@0: VARDECL(float, tonality); michael@0: VARDECL(float, noisiness); michael@0: float band_tonality[NB_TBANDS]; michael@0: float logE[NB_TBANDS]; michael@0: float BFCC[8]; michael@0: float features[25]; michael@0: float frame_tonality; michael@0: float max_frame_tonality; michael@0: /*float tw_sum=0;*/ michael@0: float frame_noisiness; michael@0: const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI); michael@0: float slope=0; michael@0: float frame_stationarity; michael@0: float relativeE; michael@0: float frame_probs[2]; michael@0: float alpha, alphaE, alphaE2; michael@0: float frame_loudness; michael@0: float bandwidth_mask; michael@0: int bandwidth=0; michael@0: float maxE = 0; michael@0: float noise_floor; michael@0: int remaining; michael@0: AnalysisInfo *info; michael@0: SAVE_STACK; michael@0: michael@0: tonal->last_transition++; michael@0: alpha = 1.f/IMIN(20, 1+tonal->count); michael@0: alphaE = 1.f/IMIN(50, 1+tonal->count); michael@0: alphaE2 = 1.f/IMIN(1000, 1+tonal->count); michael@0: michael@0: if (tonal->count<4) michael@0: tonal->music_prob = .5; michael@0: kfft = celt_mode->mdct.kfft[0]; michael@0: if (tonal->count==0) michael@0: tonal->mem_fill = 240; michael@0: downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C); michael@0: if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) michael@0: { michael@0: tonal->mem_fill += len; michael@0: /* Don't have enough to update the analysis */ michael@0: RESTORE_STACK; michael@0: return; michael@0: } michael@0: info = &tonal->info[tonal->write_pos++]; michael@0: if (tonal->write_pos>=DETECT_SIZE) michael@0: tonal->write_pos-=DETECT_SIZE; michael@0: michael@0: ALLOC(in, 480, kiss_fft_cpx); michael@0: ALLOC(out, 480, kiss_fft_cpx); michael@0: ALLOC(tonality, 240, float); michael@0: ALLOC(noisiness, 240, float); michael@0: for (i=0;iinmem[i]); michael@0: in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]); michael@0: in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]); michael@0: in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]); michael@0: } michael@0: OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); michael@0: remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); michael@0: downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C); michael@0: tonal->mem_fill = 240 + remaining; michael@0: opus_fft(kfft, in, out); michael@0: michael@0: for (i=1;iactivity = 0; michael@0: frame_noisiness = 0; michael@0: frame_stationarity = 0; michael@0: if (!tonal->count) michael@0: { michael@0: for (b=0;blowE[b] = 1e10; michael@0: tonal->highE[b] = -1e10; michael@0: } michael@0: } michael@0: relativeE = 0; michael@0: frame_loudness = 0; michael@0: for (b=0;bE[tonal->E_count][b] = E; michael@0: frame_noisiness += nE/(1e-15f+E); michael@0: michael@0: frame_loudness += (float)sqrt(E+1e-10f); michael@0: logE[b] = (float)log(E+1e-10f); michael@0: tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); michael@0: tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); michael@0: if (tonal->highE[b] < tonal->lowE[b]+1.f) michael@0: { michael@0: tonal->highE[b]+=.5f; michael@0: tonal->lowE[b]-=.5f; michael@0: } michael@0: relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]); michael@0: michael@0: L1=L2=0; michael@0: for (i=0;iE[i][b]); michael@0: L2 += tonal->E[i][b]; michael@0: } michael@0: michael@0: stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2)); michael@0: stationarity *= stationarity; michael@0: stationarity *= stationarity; michael@0: frame_stationarity += stationarity; michael@0: /*band_tonality[b] = tE/(1e-15+E)*/; michael@0: band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]); michael@0: #if 0 michael@0: if (b>=NB_TONAL_SKIP_BANDS) michael@0: { michael@0: frame_tonality += tweight[b]*band_tonality[b]; michael@0: tw_sum += tweight[b]; michael@0: } michael@0: #else michael@0: frame_tonality += band_tonality[b]; michael@0: if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) michael@0: frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; michael@0: #endif michael@0: max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality); michael@0: slope += band_tonality[b]*(b-8); michael@0: /*printf("%f %f ", band_tonality[b], stationarity);*/ michael@0: tonal->prev_band_tonality[b] = band_tonality[b]; michael@0: } michael@0: michael@0: bandwidth_mask = 0; michael@0: bandwidth = 0; michael@0: maxE = 0; michael@0: noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8))); michael@0: #ifdef FIXED_POINT michael@0: noise_floor *= 1<<(15+SIG_SHIFT); michael@0: #endif michael@0: noise_floor *= noise_floor; michael@0: for (b=0;bmeanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); michael@0: E = MAX32(E, tonal->meanE[b]); michael@0: /* Use a simple follower with 13 dB/Bark slope for spreading function */ michael@0: bandwidth_mask = MAX32(.05f*bandwidth_mask, E); michael@0: /* Consider the band "active" only if all these conditions are met: michael@0: 1) less than 10 dB below the simple follower michael@0: 2) less than 90 dB below the peak band (maximal masking possible considering michael@0: both the ATH and the loudness-dependent slope of the spreading function) michael@0: 3) above the PCM quantization noise floor michael@0: */ michael@0: if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start)) michael@0: bandwidth = b; michael@0: } michael@0: if (tonal->count<=2) michael@0: bandwidth = 20; michael@0: frame_loudness = 20*(float)log10(frame_loudness); michael@0: tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness); michael@0: tonal->lowECount *= (1-alphaE); michael@0: if (frame_loudness < tonal->Etracker-30) michael@0: tonal->lowECount += alphaE; michael@0: michael@0: for (i=0;i<8;i++) michael@0: { michael@0: float sum=0; michael@0: for (b=0;b<16;b++) michael@0: sum += dct_table[i*16+b]*logE[b]; michael@0: BFCC[i] = sum; michael@0: } michael@0: michael@0: frame_stationarity /= NB_TBANDS; michael@0: relativeE /= NB_TBANDS; michael@0: if (tonal->count<10) michael@0: relativeE = .5; michael@0: frame_noisiness /= NB_TBANDS; michael@0: #if 1 michael@0: info->activity = frame_noisiness + (1-frame_noisiness)*relativeE; michael@0: #else michael@0: info->activity = .5*(1+frame_noisiness-frame_stationarity); michael@0: #endif michael@0: frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS)); michael@0: frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f); michael@0: tonal->prev_tonality = frame_tonality; michael@0: michael@0: slope /= 8*8; michael@0: info->tonality_slope = slope; michael@0: michael@0: tonal->E_count = (tonal->E_count+1)%NB_FRAMES; michael@0: tonal->count++; michael@0: info->tonality = frame_tonality; michael@0: michael@0: for (i=0;i<4;i++) michael@0: features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i]; michael@0: michael@0: for (i=0;i<4;i++) michael@0: tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i]; michael@0: michael@0: for (i=0;i<4;i++) michael@0: features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]); michael@0: for (i=0;i<3;i++) michael@0: features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8]; michael@0: michael@0: if (tonal->count > 5) michael@0: { michael@0: for (i=0;i<9;i++) michael@0: tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i]; michael@0: } michael@0: michael@0: for (i=0;i<8;i++) michael@0: { michael@0: tonal->mem[i+24] = tonal->mem[i+16]; michael@0: tonal->mem[i+16] = tonal->mem[i+8]; michael@0: tonal->mem[i+8] = tonal->mem[i]; michael@0: tonal->mem[i] = BFCC[i]; michael@0: } michael@0: for (i=0;i<9;i++) michael@0: features[11+i] = (float)sqrt(tonal->std[i]); michael@0: features[20] = info->tonality; michael@0: features[21] = info->activity; michael@0: features[22] = frame_stationarity; michael@0: features[23] = info->tonality_slope; michael@0: features[24] = tonal->lowECount; michael@0: michael@0: #ifndef DISABLE_FLOAT_API michael@0: mlp_process(&net, features, frame_probs); michael@0: frame_probs[0] = .5f*(frame_probs[0]+1); michael@0: /* Curve fitting between the MLP probability and the actual probability */ michael@0: frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10); michael@0: /* Probability of active audio (as opposed to silence) */ michael@0: frame_probs[1] = .5f*frame_probs[1]+.5f; michael@0: /* Consider that silence has a 50-50 probability. */ michael@0: frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f; michael@0: michael@0: /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/ michael@0: { michael@0: /* Probability of state transition */ michael@0: float tau; michael@0: /* Represents independence of the MLP probabilities, where michael@0: beta=1 means fully independent. */ michael@0: float beta; michael@0: /* Denormalized probability of speech (p0) and music (p1) after update */ michael@0: float p0, p1; michael@0: /* Probabilities for "all speech" and "all music" */ michael@0: float s0, m0; michael@0: /* Probability sum for renormalisation */ michael@0: float psum; michael@0: /* Instantaneous probability of speech and music, with beta pre-applied. */ michael@0: float speech0; michael@0: float music0; michael@0: michael@0: /* One transition every 3 minutes of active audio */ michael@0: tau = .00005f*frame_probs[1]; michael@0: beta = .05f; michael@0: if (1) { michael@0: /* Adapt beta based on how "unexpected" the new prob is */ michael@0: float p, q; michael@0: p = MAX16(.05f,MIN16(.95f,frame_probs[0])); michael@0: q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); michael@0: beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); michael@0: } michael@0: /* p0 and p1 are the probabilities of speech and music at this frame michael@0: using only information from previous frame and applying the michael@0: state transition model */ michael@0: p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; michael@0: p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; michael@0: /* We apply the current probability with exponent beta to work around michael@0: the fact that the probability estimates aren't independent. */ michael@0: p0 *= (float)pow(1-frame_probs[0], beta); michael@0: p1 *= (float)pow(frame_probs[0], beta); michael@0: /* Normalise the probabilities to get the Marokv probability of music. */ michael@0: tonal->music_prob = p1/(p0+p1); michael@0: info->music_prob = tonal->music_prob; michael@0: michael@0: /* This chunk of code deals with delayed decision. */ michael@0: psum=1e-20f; michael@0: /* Instantaneous probability of speech and music, with beta pre-applied. */ michael@0: speech0 = (float)pow(1-frame_probs[0], beta); michael@0: music0 = (float)pow(frame_probs[0], beta); michael@0: if (tonal->count==1) michael@0: { michael@0: tonal->pspeech[0]=.5; michael@0: tonal->pmusic [0]=.5; michael@0: } michael@0: /* Updated probability of having only speech (s0) or only music (m0), michael@0: before considering the new observation. */ michael@0: s0 = tonal->pspeech[0] + tonal->pspeech[1]; michael@0: m0 = tonal->pmusic [0] + tonal->pmusic [1]; michael@0: /* Updates s0 and m0 with instantaneous probability. */ michael@0: tonal->pspeech[0] = s0*(1-tau)*speech0; michael@0: tonal->pmusic [0] = m0*(1-tau)*music0; michael@0: /* Propagate the transition probabilities */ michael@0: for (i=1;ipspeech[i] = tonal->pspeech[i+1]*speech0; michael@0: tonal->pmusic [i] = tonal->pmusic [i+1]*music0; michael@0: } michael@0: /* Probability that the latest frame is speech, when all the previous ones were music. */ michael@0: tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0; michael@0: /* Probability that the latest frame is music, when all the previous ones were speech. */ michael@0: tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0; michael@0: michael@0: /* Renormalise probabilities to 1 */ michael@0: for (i=0;ipspeech[i] + tonal->pmusic[i]; michael@0: psum = 1.f/psum; michael@0: for (i=0;ipspeech[i] *= psum; michael@0: tonal->pmusic [i] *= psum; michael@0: } michael@0: psum = tonal->pmusic[0]; michael@0: for (i=1;ipspeech[i]; michael@0: michael@0: /* Estimate our confidence in the speech/music decisions */ michael@0: if (frame_probs[1]>.75) michael@0: { michael@0: if (tonal->music_prob>.9) michael@0: { michael@0: float adapt; michael@0: adapt = 1.f/(++tonal->music_confidence_count); michael@0: tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500); michael@0: tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence); michael@0: } michael@0: if (tonal->music_prob<.1) michael@0: { michael@0: float adapt; michael@0: adapt = 1.f/(++tonal->speech_confidence_count); michael@0: tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500); michael@0: tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence); michael@0: } michael@0: } else { michael@0: if (tonal->music_confidence_count==0) michael@0: tonal->music_confidence = .9f; michael@0: if (tonal->speech_confidence_count==0) michael@0: tonal->speech_confidence = .1f; michael@0: } michael@0: } michael@0: if (tonal->last_music != (tonal->music_prob>.5f)) michael@0: tonal->last_transition=0; michael@0: tonal->last_music = tonal->music_prob>.5f; michael@0: #else michael@0: info->music_prob = 0; michael@0: #endif michael@0: /*for (i=0;i<25;i++) michael@0: printf("%f ", features[i]); michael@0: printf("\n");*/ michael@0: michael@0: info->bandwidth = bandwidth; michael@0: /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ michael@0: info->noisiness = frame_noisiness; michael@0: info->valid = 1; michael@0: if (info_out!=NULL) michael@0: OPUS_COPY(info_out, info, 1); michael@0: RESTORE_STACK; michael@0: } michael@0: michael@0: void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm, michael@0: int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs, michael@0: int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info) michael@0: { michael@0: int offset; michael@0: int pcm_len; michael@0: michael@0: if (analysis_pcm != NULL) michael@0: { michael@0: /* Avoid overflow/wrap-around of the analysis buffer */ michael@0: analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size); michael@0: michael@0: pcm_len = analysis_frame_size - analysis->analysis_offset; michael@0: offset = analysis->analysis_offset; michael@0: do { michael@0: tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix); michael@0: offset += 480; michael@0: pcm_len -= 480; michael@0: } while (pcm_len>0); michael@0: analysis->analysis_offset = analysis_frame_size; michael@0: michael@0: analysis->analysis_offset -= frame_size; michael@0: } michael@0: michael@0: analysis_info->valid = 0; michael@0: tonality_get_info(analysis, analysis_info, frame_size); michael@0: }