media/libopus/silk/VAD.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libopus/silk/VAD.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,357 @@
     1.4 +/***********************************************************************
     1.5 +Copyright (c) 2006-2011, Skype Limited. All rights reserved.
     1.6 +Redistribution and use in source and binary forms, with or without
     1.7 +modification, are permitted provided that the following conditions
     1.8 +are met:
     1.9 +- Redistributions of source code must retain the above copyright notice,
    1.10 +this list of conditions and the following disclaimer.
    1.11 +- Redistributions in binary form must reproduce the above copyright
    1.12 +notice, this list of conditions and the following disclaimer in the
    1.13 +documentation and/or other materials provided with the distribution.
    1.14 +- Neither the name of Internet Society, IETF or IETF Trust, nor the
    1.15 +names of specific contributors, may be used to endorse or promote
    1.16 +products derived from this software without specific prior written
    1.17 +permission.
    1.18 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    1.19 +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    1.20 +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    1.21 +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
    1.22 +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    1.23 +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    1.24 +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    1.25 +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    1.26 +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    1.27 +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    1.28 +POSSIBILITY OF SUCH DAMAGE.
    1.29 +***********************************************************************/
    1.30 +
    1.31 +#ifdef HAVE_CONFIG_H
    1.32 +#include "config.h"
    1.33 +#endif
    1.34 +
    1.35 +#include "main.h"
    1.36 +#include "stack_alloc.h"
    1.37 +
    1.38 +/* Silk VAD noise level estimation */
    1.39 +static OPUS_INLINE void silk_VAD_GetNoiseLevels(
    1.40 +    const opus_int32             pX[ VAD_N_BANDS ], /* I    subband energies                            */
    1.41 +    silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
    1.42 +);
    1.43 +
    1.44 +/**********************************/
    1.45 +/* Initialization of the Silk VAD */
    1.46 +/**********************************/
    1.47 +opus_int silk_VAD_Init(                                         /* O    Return value, 0 if success                  */
    1.48 +    silk_VAD_state              *psSilk_VAD                     /* I/O  Pointer to Silk VAD state                   */
    1.49 +)
    1.50 +{
    1.51 +    opus_int b, ret = 0;
    1.52 +
    1.53 +    /* reset state memory */
    1.54 +    silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) );
    1.55 +
    1.56 +    /* init noise levels */
    1.57 +    /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */
    1.58 +    for( b = 0; b < VAD_N_BANDS; b++ ) {
    1.59 +        psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 );
    1.60 +    }
    1.61 +
    1.62 +    /* Initialize state */
    1.63 +    for( b = 0; b < VAD_N_BANDS; b++ ) {
    1.64 +        psSilk_VAD->NL[ b ]     = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] );
    1.65 +        psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] );
    1.66 +    }
    1.67 +    psSilk_VAD->counter = 15;
    1.68 +
    1.69 +    /* init smoothed energy-to-noise ratio*/
    1.70 +    for( b = 0; b < VAD_N_BANDS; b++ ) {
    1.71 +        psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256;       /* 100 * 256 --> 20 dB SNR */
    1.72 +    }
    1.73 +
    1.74 +    return( ret );
    1.75 +}
    1.76 +
    1.77 +/* Weighting factors for tilt measure */
    1.78 +static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
    1.79 +
    1.80 +/***************************************/
    1.81 +/* Get the speech activity level in Q8 */
    1.82 +/***************************************/
    1.83 +opus_int silk_VAD_GetSA_Q8(                                     /* O    Return value, 0 if success                  */
    1.84 +    silk_encoder_state          *psEncC,                        /* I/O  Encoder state                               */
    1.85 +    const opus_int16            pIn[]                           /* I    PCM input                                   */
    1.86 +)
    1.87 +{
    1.88 +    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
    1.89 +    opus_int   decimated_framelength1, decimated_framelength2;
    1.90 +    opus_int   decimated_framelength;
    1.91 +    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
    1.92 +    opus_int32 sumSquared, smooth_coef_Q16;
    1.93 +    opus_int16 HPstateTmp;
    1.94 +    VARDECL( opus_int16, X );
    1.95 +    opus_int32 Xnrg[ VAD_N_BANDS ];
    1.96 +    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
    1.97 +    opus_int32 speech_nrg, x_tmp;
    1.98 +    opus_int   X_offset[ VAD_N_BANDS ];
    1.99 +    opus_int   ret = 0;
   1.100 +    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
   1.101 +    SAVE_STACK;
   1.102 +
   1.103 +    /* Safety checks */
   1.104 +    silk_assert( VAD_N_BANDS == 4 );
   1.105 +    silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
   1.106 +    silk_assert( psEncC->frame_length <= 512 );
   1.107 +    silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
   1.108 +
   1.109 +    /***********************/
   1.110 +    /* Filter and Decimate */
   1.111 +    /***********************/
   1.112 +    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
   1.113 +    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
   1.114 +    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
   1.115 +    /* Decimate into 4 bands:
   1.116 +       0       L      3L       L              3L                             5L
   1.117 +               -      --       -              --                             --
   1.118 +               8       8       2               4                              4
   1.119 +
   1.120 +       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
   1.121 +
   1.122 +       They're arranged to allow the minimal ( frame_length / 4 ) extra
   1.123 +       scratch space during the downsampling process */
   1.124 +    X_offset[ 0 ] = 0;
   1.125 +    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
   1.126 +    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
   1.127 +    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
   1.128 +    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
   1.129 +
   1.130 +    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
   1.131 +    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
   1.132 +        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
   1.133 +
   1.134 +    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
   1.135 +    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
   1.136 +        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
   1.137 +
   1.138 +    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
   1.139 +    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
   1.140 +        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
   1.141 +
   1.142 +    /*********************************************/
   1.143 +    /* HP filter on lowest band (differentiator) */
   1.144 +    /*********************************************/
   1.145 +    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
   1.146 +    HPstateTmp = X[ decimated_framelength - 1 ];
   1.147 +    for( i = decimated_framelength - 1; i > 0; i-- ) {
   1.148 +        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
   1.149 +        X[ i ]     -= X[ i - 1 ];
   1.150 +    }
   1.151 +    X[ 0 ] -= psSilk_VAD->HPstate;
   1.152 +    psSilk_VAD->HPstate = HPstateTmp;
   1.153 +
   1.154 +    /*************************************/
   1.155 +    /* Calculate the energy in each band */
   1.156 +    /*************************************/
   1.157 +    for( b = 0; b < VAD_N_BANDS; b++ ) {
   1.158 +        /* Find the decimated framelength in the non-uniformly divided bands */
   1.159 +        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
   1.160 +
   1.161 +        /* Split length into subframe lengths */
   1.162 +        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
   1.163 +        dec_subframe_offset = 0;
   1.164 +
   1.165 +        /* Compute energy per sub-frame */
   1.166 +        /* initialize with summed energy of last subframe */
   1.167 +        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
   1.168 +        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
   1.169 +            sumSquared = 0;
   1.170 +            for( i = 0; i < dec_subframe_length; i++ ) {
   1.171 +                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
   1.172 +                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
   1.173 +                x_tmp = silk_RSHIFT(
   1.174 +                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
   1.175 +                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
   1.176 +
   1.177 +                /* Safety check */
   1.178 +                silk_assert( sumSquared >= 0 );
   1.179 +            }
   1.180 +
   1.181 +            /* Add/saturate summed energy of current subframe */
   1.182 +            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
   1.183 +                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
   1.184 +            } else {
   1.185 +                /* Look-ahead subframe */
   1.186 +                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
   1.187 +            }
   1.188 +
   1.189 +            dec_subframe_offset += dec_subframe_length;
   1.190 +        }
   1.191 +        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
   1.192 +    }
   1.193 +
   1.194 +    /********************/
   1.195 +    /* Noise estimation */
   1.196 +    /********************/
   1.197 +    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
   1.198 +
   1.199 +    /***********************************************/
   1.200 +    /* Signal-plus-noise to noise ratio estimation */
   1.201 +    /***********************************************/
   1.202 +    sumSquared = 0;
   1.203 +    input_tilt = 0;
   1.204 +    for( b = 0; b < VAD_N_BANDS; b++ ) {
   1.205 +        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
   1.206 +        if( speech_nrg > 0 ) {
   1.207 +            /* Divide, with sufficient resolution */
   1.208 +            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
   1.209 +                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
   1.210 +            } else {
   1.211 +                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
   1.212 +            }
   1.213 +
   1.214 +            /* Convert to log domain */
   1.215 +            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
   1.216 +
   1.217 +            /* Sum-of-squares */
   1.218 +            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
   1.219 +
   1.220 +            /* Tilt measure */
   1.221 +            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
   1.222 +                /* Scale down SNR value for small subband speech energies */
   1.223 +                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
   1.224 +            }
   1.225 +            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
   1.226 +        } else {
   1.227 +            NrgToNoiseRatio_Q8[ b ] = 256;
   1.228 +        }
   1.229 +    }
   1.230 +
   1.231 +    /* Mean-of-squares */
   1.232 +    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
   1.233 +
   1.234 +    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
   1.235 +    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
   1.236 +
   1.237 +    /*********************************/
   1.238 +    /* Speech Probability Estimation */
   1.239 +    /*********************************/
   1.240 +    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
   1.241 +
   1.242 +    /**************************/
   1.243 +    /* Frequency Tilt Measure */
   1.244 +    /**************************/
   1.245 +    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
   1.246 +
   1.247 +    /**************************************************/
   1.248 +    /* Scale the sigmoid output based on power levels */
   1.249 +    /**************************************************/
   1.250 +    speech_nrg = 0;
   1.251 +    for( b = 0; b < VAD_N_BANDS; b++ ) {
   1.252 +        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
   1.253 +        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
   1.254 +    }
   1.255 +
   1.256 +    /* Power scaling */
   1.257 +    if( speech_nrg <= 0 ) {
   1.258 +        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
   1.259 +    } else if( speech_nrg < 32768 ) {
   1.260 +        if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
   1.261 +            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
   1.262 +        } else {
   1.263 +            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
   1.264 +        }
   1.265 +
   1.266 +        /* square-root */
   1.267 +        speech_nrg = silk_SQRT_APPROX( speech_nrg );
   1.268 +        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
   1.269 +    }
   1.270 +
   1.271 +    /* Copy the resulting speech activity in Q8 */
   1.272 +    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
   1.273 +
   1.274 +    /***********************************/
   1.275 +    /* Energy Level and SNR estimation */
   1.276 +    /***********************************/
   1.277 +    /* Smoothing coefficient */
   1.278 +    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
   1.279 +
   1.280 +    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
   1.281 +        smooth_coef_Q16 >>= 1;
   1.282 +    }
   1.283 +
   1.284 +    for( b = 0; b < VAD_N_BANDS; b++ ) {
   1.285 +        /* compute smoothed energy-to-noise ratio per band */
   1.286 +        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
   1.287 +            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
   1.288 +
   1.289 +        /* signal to noise ratio in dB per band */
   1.290 +        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
   1.291 +        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
   1.292 +        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
   1.293 +    }
   1.294 +
   1.295 +    RESTORE_STACK;
   1.296 +    return( ret );
   1.297 +}
   1.298 +
   1.299 +/**************************/
   1.300 +/* Noise level estimation */
   1.301 +/**************************/
   1.302 +static OPUS_INLINE void silk_VAD_GetNoiseLevels(
   1.303 +    const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
   1.304 +    silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
   1.305 +)
   1.306 +{
   1.307 +    opus_int   k;
   1.308 +    opus_int32 nl, nrg, inv_nrg;
   1.309 +    opus_int   coef, min_coef;
   1.310 +
   1.311 +    /* Initially faster smoothing */
   1.312 +    if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
   1.313 +        min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
   1.314 +    } else {
   1.315 +        min_coef = 0;
   1.316 +    }
   1.317 +
   1.318 +    for( k = 0; k < VAD_N_BANDS; k++ ) {
   1.319 +        /* Get old noise level estimate for current band */
   1.320 +        nl = psSilk_VAD->NL[ k ];
   1.321 +        silk_assert( nl >= 0 );
   1.322 +
   1.323 +        /* Add bias */
   1.324 +        nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
   1.325 +        silk_assert( nrg > 0 );
   1.326 +
   1.327 +        /* Invert energies */
   1.328 +        inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
   1.329 +        silk_assert( inv_nrg >= 0 );
   1.330 +
   1.331 +        /* Less update when subband energy is high */
   1.332 +        if( nrg > silk_LSHIFT( nl, 3 ) ) {
   1.333 +            coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
   1.334 +        } else if( nrg < nl ) {
   1.335 +            coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
   1.336 +        } else {
   1.337 +            coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
   1.338 +        }
   1.339 +
   1.340 +        /* Initially faster smoothing */
   1.341 +        coef = silk_max_int( coef, min_coef );
   1.342 +
   1.343 +        /* Smooth inverse energies */
   1.344 +        psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
   1.345 +        silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
   1.346 +
   1.347 +        /* Compute noise level by inverting again */
   1.348 +        nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
   1.349 +        silk_assert( nl >= 0 );
   1.350 +
   1.351 +        /* Limit noise levels (guarantee 7 bits of head room) */
   1.352 +        nl = silk_min( nl, 0x00FFFFFF );
   1.353 +
   1.354 +        /* Store as part of state */
   1.355 +        psSilk_VAD->NL[ k ] = nl;
   1.356 +    }
   1.357 +
   1.358 +    /* Increment frame counter */
   1.359 +    psSilk_VAD->counter++;
   1.360 +}

mercurial