|
1 /*********************************************************************** |
|
2 Copyright (c) 2006-2011, Skype Limited. All rights reserved. |
|
3 Redistribution and use in source and binary forms, with or without |
|
4 modification, are permitted provided that the following conditions |
|
5 are met: |
|
6 - Redistributions of source code must retain the above copyright notice, |
|
7 this list of conditions and the following disclaimer. |
|
8 - Redistributions in binary form must reproduce the above copyright |
|
9 notice, this list of conditions and the following disclaimer in the |
|
10 documentation and/or other materials provided with the distribution. |
|
11 - Neither the name of Internet Society, IETF or IETF Trust, nor the |
|
12 names of specific contributors, may be used to endorse or promote |
|
13 products derived from this software without specific prior written |
|
14 permission. |
|
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
25 POSSIBILITY OF SUCH DAMAGE. |
|
26 ***********************************************************************/ |
|
27 |
|
28 #ifdef HAVE_CONFIG_H |
|
29 #include "config.h" |
|
30 #endif |
|
31 #include "API.h" |
|
32 #include "main.h" |
|
33 #include "stack_alloc.h" |
|
34 |
|
35 /************************/ |
|
36 /* Decoder Super Struct */ |
|
37 /************************/ |
|
38 typedef struct { |
|
39 silk_decoder_state channel_state[ DECODER_NUM_CHANNELS ]; |
|
40 stereo_dec_state sStereo; |
|
41 opus_int nChannelsAPI; |
|
42 opus_int nChannelsInternal; |
|
43 opus_int prev_decode_only_middle; |
|
44 } silk_decoder; |
|
45 |
|
46 /*********************/ |
|
47 /* Decoder functions */ |
|
48 /*********************/ |
|
49 |
|
50 opus_int silk_Get_Decoder_Size( /* O Returns error code */ |
|
51 opus_int *decSizeBytes /* O Number of bytes in SILK decoder state */ |
|
52 ) |
|
53 { |
|
54 opus_int ret = SILK_NO_ERROR; |
|
55 |
|
56 *decSizeBytes = sizeof( silk_decoder ); |
|
57 |
|
58 return ret; |
|
59 } |
|
60 |
|
61 /* Reset decoder state */ |
|
62 opus_int silk_InitDecoder( /* O Returns error code */ |
|
63 void *decState /* I/O State */ |
|
64 ) |
|
65 { |
|
66 opus_int n, ret = SILK_NO_ERROR; |
|
67 silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state; |
|
68 |
|
69 for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) { |
|
70 ret = silk_init_decoder( &channel_state[ n ] ); |
|
71 } |
|
72 silk_memset(&((silk_decoder *)decState)->sStereo, 0, sizeof(((silk_decoder *)decState)->sStereo)); |
|
73 /* Not strictly needed, but it's cleaner that way */ |
|
74 ((silk_decoder *)decState)->prev_decode_only_middle = 0; |
|
75 |
|
76 return ret; |
|
77 } |
|
78 |
|
79 /* Decode a frame */ |
|
80 opus_int silk_Decode( /* O Returns error code */ |
|
81 void* decState, /* I/O State */ |
|
82 silk_DecControlStruct* decControl, /* I/O Control Structure */ |
|
83 opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */ |
|
84 opus_int newPacketFlag, /* I Indicates first decoder call for this packet */ |
|
85 ec_dec *psRangeDec, /* I/O Compressor data structure */ |
|
86 opus_int16 *samplesOut, /* O Decoded output speech vector */ |
|
87 opus_int32 *nSamplesOut /* O Number of samples decoded */ |
|
88 ) |
|
89 { |
|
90 opus_int i, n, decode_only_middle = 0, ret = SILK_NO_ERROR; |
|
91 opus_int32 nSamplesOutDec, LBRR_symbol; |
|
92 opus_int16 *samplesOut1_tmp[ 2 ]; |
|
93 VARDECL( opus_int16, samplesOut1_tmp_storage ); |
|
94 VARDECL( opus_int16, samplesOut2_tmp ); |
|
95 opus_int32 MS_pred_Q13[ 2 ] = { 0 }; |
|
96 opus_int16 *resample_out_ptr; |
|
97 silk_decoder *psDec = ( silk_decoder * )decState; |
|
98 silk_decoder_state *channel_state = psDec->channel_state; |
|
99 opus_int has_side; |
|
100 opus_int stereo_to_mono; |
|
101 SAVE_STACK; |
|
102 |
|
103 silk_assert( decControl->nChannelsInternal == 1 || decControl->nChannelsInternal == 2 ); |
|
104 |
|
105 /**********************************/ |
|
106 /* Test if first frame in payload */ |
|
107 /**********************************/ |
|
108 if( newPacketFlag ) { |
|
109 for( n = 0; n < decControl->nChannelsInternal; n++ ) { |
|
110 channel_state[ n ].nFramesDecoded = 0; /* Used to count frames in packet */ |
|
111 } |
|
112 } |
|
113 |
|
114 /* If Mono -> Stereo transition in bitstream: init state of second channel */ |
|
115 if( decControl->nChannelsInternal > psDec->nChannelsInternal ) { |
|
116 ret += silk_init_decoder( &channel_state[ 1 ] ); |
|
117 } |
|
118 |
|
119 stereo_to_mono = decControl->nChannelsInternal == 1 && psDec->nChannelsInternal == 2 && |
|
120 ( decControl->internalSampleRate == 1000*channel_state[ 0 ].fs_kHz ); |
|
121 |
|
122 if( channel_state[ 0 ].nFramesDecoded == 0 ) { |
|
123 for( n = 0; n < decControl->nChannelsInternal; n++ ) { |
|
124 opus_int fs_kHz_dec; |
|
125 if( decControl->payloadSize_ms == 0 ) { |
|
126 /* Assuming packet loss, use 10 ms */ |
|
127 channel_state[ n ].nFramesPerPacket = 1; |
|
128 channel_state[ n ].nb_subfr = 2; |
|
129 } else if( decControl->payloadSize_ms == 10 ) { |
|
130 channel_state[ n ].nFramesPerPacket = 1; |
|
131 channel_state[ n ].nb_subfr = 2; |
|
132 } else if( decControl->payloadSize_ms == 20 ) { |
|
133 channel_state[ n ].nFramesPerPacket = 1; |
|
134 channel_state[ n ].nb_subfr = 4; |
|
135 } else if( decControl->payloadSize_ms == 40 ) { |
|
136 channel_state[ n ].nFramesPerPacket = 2; |
|
137 channel_state[ n ].nb_subfr = 4; |
|
138 } else if( decControl->payloadSize_ms == 60 ) { |
|
139 channel_state[ n ].nFramesPerPacket = 3; |
|
140 channel_state[ n ].nb_subfr = 4; |
|
141 } else { |
|
142 silk_assert( 0 ); |
|
143 RESTORE_STACK; |
|
144 return SILK_DEC_INVALID_FRAME_SIZE; |
|
145 } |
|
146 fs_kHz_dec = ( decControl->internalSampleRate >> 10 ) + 1; |
|
147 if( fs_kHz_dec != 8 && fs_kHz_dec != 12 && fs_kHz_dec != 16 ) { |
|
148 silk_assert( 0 ); |
|
149 RESTORE_STACK; |
|
150 return SILK_DEC_INVALID_SAMPLING_FREQUENCY; |
|
151 } |
|
152 ret += silk_decoder_set_fs( &channel_state[ n ], fs_kHz_dec, decControl->API_sampleRate ); |
|
153 } |
|
154 } |
|
155 |
|
156 if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 && ( psDec->nChannelsAPI == 1 || psDec->nChannelsInternal == 1 ) ) { |
|
157 silk_memset( psDec->sStereo.pred_prev_Q13, 0, sizeof( psDec->sStereo.pred_prev_Q13 ) ); |
|
158 silk_memset( psDec->sStereo.sSide, 0, sizeof( psDec->sStereo.sSide ) ); |
|
159 silk_memcpy( &channel_state[ 1 ].resampler_state, &channel_state[ 0 ].resampler_state, sizeof( silk_resampler_state_struct ) ); |
|
160 } |
|
161 psDec->nChannelsAPI = decControl->nChannelsAPI; |
|
162 psDec->nChannelsInternal = decControl->nChannelsInternal; |
|
163 |
|
164 if( decControl->API_sampleRate > (opus_int32)MAX_API_FS_KHZ * 1000 || decControl->API_sampleRate < 8000 ) { |
|
165 ret = SILK_DEC_INVALID_SAMPLING_FREQUENCY; |
|
166 RESTORE_STACK; |
|
167 return( ret ); |
|
168 } |
|
169 |
|
170 if( lostFlag != FLAG_PACKET_LOST && channel_state[ 0 ].nFramesDecoded == 0 ) { |
|
171 /* First decoder call for this payload */ |
|
172 /* Decode VAD flags and LBRR flag */ |
|
173 for( n = 0; n < decControl->nChannelsInternal; n++ ) { |
|
174 for( i = 0; i < channel_state[ n ].nFramesPerPacket; i++ ) { |
|
175 channel_state[ n ].VAD_flags[ i ] = ec_dec_bit_logp(psRangeDec, 1); |
|
176 } |
|
177 channel_state[ n ].LBRR_flag = ec_dec_bit_logp(psRangeDec, 1); |
|
178 } |
|
179 /* Decode LBRR flags */ |
|
180 for( n = 0; n < decControl->nChannelsInternal; n++ ) { |
|
181 silk_memset( channel_state[ n ].LBRR_flags, 0, sizeof( channel_state[ n ].LBRR_flags ) ); |
|
182 if( channel_state[ n ].LBRR_flag ) { |
|
183 if( channel_state[ n ].nFramesPerPacket == 1 ) { |
|
184 channel_state[ n ].LBRR_flags[ 0 ] = 1; |
|
185 } else { |
|
186 LBRR_symbol = ec_dec_icdf( psRangeDec, silk_LBRR_flags_iCDF_ptr[ channel_state[ n ].nFramesPerPacket - 2 ], 8 ) + 1; |
|
187 for( i = 0; i < channel_state[ n ].nFramesPerPacket; i++ ) { |
|
188 channel_state[ n ].LBRR_flags[ i ] = silk_RSHIFT( LBRR_symbol, i ) & 1; |
|
189 } |
|
190 } |
|
191 } |
|
192 } |
|
193 |
|
194 if( lostFlag == FLAG_DECODE_NORMAL ) { |
|
195 /* Regular decoding: skip all LBRR data */ |
|
196 for( i = 0; i < channel_state[ 0 ].nFramesPerPacket; i++ ) { |
|
197 for( n = 0; n < decControl->nChannelsInternal; n++ ) { |
|
198 if( channel_state[ n ].LBRR_flags[ i ] ) { |
|
199 opus_int pulses[ MAX_FRAME_LENGTH ]; |
|
200 opus_int condCoding; |
|
201 |
|
202 if( decControl->nChannelsInternal == 2 && n == 0 ) { |
|
203 silk_stereo_decode_pred( psRangeDec, MS_pred_Q13 ); |
|
204 if( channel_state[ 1 ].LBRR_flags[ i ] == 0 ) { |
|
205 silk_stereo_decode_mid_only( psRangeDec, &decode_only_middle ); |
|
206 } |
|
207 } |
|
208 /* Use conditional coding if previous frame available */ |
|
209 if( i > 0 && channel_state[ n ].LBRR_flags[ i - 1 ] ) { |
|
210 condCoding = CODE_CONDITIONALLY; |
|
211 } else { |
|
212 condCoding = CODE_INDEPENDENTLY; |
|
213 } |
|
214 silk_decode_indices( &channel_state[ n ], psRangeDec, i, 1, condCoding ); |
|
215 silk_decode_pulses( psRangeDec, pulses, channel_state[ n ].indices.signalType, |
|
216 channel_state[ n ].indices.quantOffsetType, channel_state[ n ].frame_length ); |
|
217 } |
|
218 } |
|
219 } |
|
220 } |
|
221 } |
|
222 |
|
223 /* Get MS predictor index */ |
|
224 if( decControl->nChannelsInternal == 2 ) { |
|
225 if( lostFlag == FLAG_DECODE_NORMAL || |
|
226 ( lostFlag == FLAG_DECODE_LBRR && channel_state[ 0 ].LBRR_flags[ channel_state[ 0 ].nFramesDecoded ] == 1 ) ) |
|
227 { |
|
228 silk_stereo_decode_pred( psRangeDec, MS_pred_Q13 ); |
|
229 /* For LBRR data, decode mid-only flag only if side-channel's LBRR flag is false */ |
|
230 if( ( lostFlag == FLAG_DECODE_NORMAL && channel_state[ 1 ].VAD_flags[ channel_state[ 0 ].nFramesDecoded ] == 0 ) || |
|
231 ( lostFlag == FLAG_DECODE_LBRR && channel_state[ 1 ].LBRR_flags[ channel_state[ 0 ].nFramesDecoded ] == 0 ) ) |
|
232 { |
|
233 silk_stereo_decode_mid_only( psRangeDec, &decode_only_middle ); |
|
234 } else { |
|
235 decode_only_middle = 0; |
|
236 } |
|
237 } else { |
|
238 for( n = 0; n < 2; n++ ) { |
|
239 MS_pred_Q13[ n ] = psDec->sStereo.pred_prev_Q13[ n ]; |
|
240 } |
|
241 } |
|
242 } |
|
243 |
|
244 /* Reset side channel decoder prediction memory for first frame with side coding */ |
|
245 if( decControl->nChannelsInternal == 2 && decode_only_middle == 0 && psDec->prev_decode_only_middle == 1 ) { |
|
246 silk_memset( psDec->channel_state[ 1 ].outBuf, 0, sizeof(psDec->channel_state[ 1 ].outBuf) ); |
|
247 silk_memset( psDec->channel_state[ 1 ].sLPC_Q14_buf, 0, sizeof(psDec->channel_state[ 1 ].sLPC_Q14_buf) ); |
|
248 psDec->channel_state[ 1 ].lagPrev = 100; |
|
249 psDec->channel_state[ 1 ].LastGainIndex = 10; |
|
250 psDec->channel_state[ 1 ].prevSignalType = TYPE_NO_VOICE_ACTIVITY; |
|
251 psDec->channel_state[ 1 ].first_frame_after_reset = 1; |
|
252 } |
|
253 |
|
254 ALLOC( samplesOut1_tmp_storage, |
|
255 decControl->nChannelsInternal*( |
|
256 channel_state[ 0 ].frame_length + 2 ), |
|
257 opus_int16 ); |
|
258 samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage; |
|
259 samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage |
|
260 + channel_state[ 0 ].frame_length + 2; |
|
261 |
|
262 if( lostFlag == FLAG_DECODE_NORMAL ) { |
|
263 has_side = !decode_only_middle; |
|
264 } else { |
|
265 has_side = !psDec->prev_decode_only_middle |
|
266 || (decControl->nChannelsInternal == 2 && lostFlag == FLAG_DECODE_LBRR && channel_state[1].LBRR_flags[ channel_state[1].nFramesDecoded ] == 1 ); |
|
267 } |
|
268 /* Call decoder for one frame */ |
|
269 for( n = 0; n < decControl->nChannelsInternal; n++ ) { |
|
270 if( n == 0 || has_side ) { |
|
271 opus_int FrameIndex; |
|
272 opus_int condCoding; |
|
273 |
|
274 FrameIndex = channel_state[ 0 ].nFramesDecoded - n; |
|
275 /* Use independent coding if no previous frame available */ |
|
276 if( FrameIndex <= 0 ) { |
|
277 condCoding = CODE_INDEPENDENTLY; |
|
278 } else if( lostFlag == FLAG_DECODE_LBRR ) { |
|
279 condCoding = channel_state[ n ].LBRR_flags[ FrameIndex - 1 ] ? CODE_CONDITIONALLY : CODE_INDEPENDENTLY; |
|
280 } else if( n > 0 && psDec->prev_decode_only_middle ) { |
|
281 /* If we skipped a side frame in this packet, we don't |
|
282 need LTP scaling; the LTP state is well-defined. */ |
|
283 condCoding = CODE_INDEPENDENTLY_NO_LTP_SCALING; |
|
284 } else { |
|
285 condCoding = CODE_CONDITIONALLY; |
|
286 } |
|
287 ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding); |
|
288 } else { |
|
289 silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) ); |
|
290 } |
|
291 channel_state[ n ].nFramesDecoded++; |
|
292 } |
|
293 |
|
294 if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 ) { |
|
295 /* Convert Mid/Side to Left/Right */ |
|
296 silk_stereo_MS_to_LR( &psDec->sStereo, samplesOut1_tmp[ 0 ], samplesOut1_tmp[ 1 ], MS_pred_Q13, channel_state[ 0 ].fs_kHz, nSamplesOutDec ); |
|
297 } else { |
|
298 /* Buffering */ |
|
299 silk_memcpy( samplesOut1_tmp[ 0 ], psDec->sStereo.sMid, 2 * sizeof( opus_int16 ) ); |
|
300 silk_memcpy( psDec->sStereo.sMid, &samplesOut1_tmp[ 0 ][ nSamplesOutDec ], 2 * sizeof( opus_int16 ) ); |
|
301 } |
|
302 |
|
303 /* Number of output samples */ |
|
304 *nSamplesOut = silk_DIV32( nSamplesOutDec * decControl->API_sampleRate, silk_SMULBB( channel_state[ 0 ].fs_kHz, 1000 ) ); |
|
305 |
|
306 /* Set up pointers to temp buffers */ |
|
307 ALLOC( samplesOut2_tmp, |
|
308 decControl->nChannelsAPI == 2 ? *nSamplesOut : ALLOC_NONE, opus_int16 ); |
|
309 if( decControl->nChannelsAPI == 2 ) { |
|
310 resample_out_ptr = samplesOut2_tmp; |
|
311 } else { |
|
312 resample_out_ptr = samplesOut; |
|
313 } |
|
314 |
|
315 for( n = 0; n < silk_min( decControl->nChannelsAPI, decControl->nChannelsInternal ); n++ ) { |
|
316 |
|
317 /* Resample decoded signal to API_sampleRate */ |
|
318 ret += silk_resampler( &channel_state[ n ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ n ][ 1 ], nSamplesOutDec ); |
|
319 |
|
320 /* Interleave if stereo output and stereo stream */ |
|
321 if( decControl->nChannelsAPI == 2 ) { |
|
322 for( i = 0; i < *nSamplesOut; i++ ) { |
|
323 samplesOut[ n + 2 * i ] = resample_out_ptr[ i ]; |
|
324 } |
|
325 } |
|
326 } |
|
327 |
|
328 /* Create two channel output from mono stream */ |
|
329 if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 1 ) { |
|
330 if ( stereo_to_mono ){ |
|
331 /* Resample right channel for newly collapsed stereo just in case |
|
332 we weren't doing collapsing when switching to mono */ |
|
333 ret += silk_resampler( &channel_state[ 1 ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ 0 ][ 1 ], nSamplesOutDec ); |
|
334 |
|
335 for( i = 0; i < *nSamplesOut; i++ ) { |
|
336 samplesOut[ 1 + 2 * i ] = resample_out_ptr[ i ]; |
|
337 } |
|
338 } else { |
|
339 for( i = 0; i < *nSamplesOut; i++ ) { |
|
340 samplesOut[ 1 + 2 * i ] = samplesOut[ 0 + 2 * i ]; |
|
341 } |
|
342 } |
|
343 } |
|
344 |
|
345 /* Export pitch lag, measured at 48 kHz sampling rate */ |
|
346 if( channel_state[ 0 ].prevSignalType == TYPE_VOICED ) { |
|
347 int mult_tab[ 3 ] = { 6, 4, 3 }; |
|
348 decControl->prevPitchLag = channel_state[ 0 ].lagPrev * mult_tab[ ( channel_state[ 0 ].fs_kHz - 8 ) >> 2 ]; |
|
349 } else { |
|
350 decControl->prevPitchLag = 0; |
|
351 } |
|
352 |
|
353 if( lostFlag == FLAG_PACKET_LOST ) { |
|
354 /* On packet loss, remove the gain clamping to prevent having the energy "bounce back" |
|
355 if we lose packets when the energy is going down */ |
|
356 for ( i = 0; i < psDec->nChannelsInternal; i++ ) |
|
357 psDec->channel_state[ i ].LastGainIndex = 10; |
|
358 } else { |
|
359 psDec->prev_decode_only_middle = decode_only_middle; |
|
360 } |
|
361 RESTORE_STACK; |
|
362 return ret; |
|
363 } |
|
364 |
|
#if 0
/* Getting table of contents for a packet                                          */
/* Reads the leading bits of the first payload byte: nFramesPerPayload VAD flags   */
/* followed by one in-band FEC flag. Returns -1 when nBytesIn < 1 or               */
/* nFramesPerPayload is outside 0..3, SILK_NO_ERROR otherwise.                     */
opus_int silk_get_TOC(
    const opus_uint8                *payload,           /* I    Payload data                                */
    const opus_int                  nBytesIn,           /* I    Number of input bytes                       */
    const opus_int                  nFramesPerPayload,  /* I    Number of SILK frames per payload           */
    silk_TOC_struct                 *Silk_TOC           /* O    Type of content                             */
)
{
    opus_int frame, bits;

    /* Need at least the first byte and a frame count whose flags fit inside it */
    if( nBytesIn < 1 || nFramesPerPayload < 0 || nFramesPerPayload > 3 ) {
        return -1;
    }

    silk_memset( Silk_TOC, 0, sizeof( *Silk_TOC ) );

    /* For stereo, extract the flags for the mid channel */
    bits = silk_RSHIFT( payload[ 0 ], 7 - nFramesPerPayload ) & ( silk_LSHIFT( 1, nFramesPerPayload + 1 ) - 1 );

    /* Lowest extracted bit is the FEC flag; the VAD flags follow, last frame first */
    Silk_TOC->inbandFECFlag = bits & 1;
    frame = nFramesPerPayload;
    while( frame-- > 0 ) {
        bits = silk_RSHIFT( bits, 1 );
        Silk_TOC->VADFlags[ frame ] = bits & 1;
        Silk_TOC->VADFlag |= bits & 1;
    }

    return SILK_NO_ERROR;
}
#endif