|
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-*/ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this file, |
|
4 * You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 #include "OpusTrackEncoder.h" |
|
6 #include "nsString.h" |
|
7 |
|
8 #include <opus/opus.h> |
|
9 |
|
// Logging helper for this file.
//
// On Gonk builds the message is forwarded to the Android log at INFO level
// under the "MediaEncoder" tag; on every other platform LOG() expands to
// nothing.
//
// The macro intentionally carries no trailing semicolon: the caller writes
// `LOG(...);`, which then forms exactly one statement and stays safe inside an
// unbraced if/else. Both variants take the same standard variadic parameter
// list so a call that compiles on one platform compiles on the other.
#undef LOG
#ifdef MOZ_WIDGET_GONK
#include <android/log.h>
#define LOG(...) __android_log_print(ANDROID_LOG_INFO, "MediaEncoder", ## __VA_ARGS__)
#else
#define LOG(...)
#endif
|
17 |
|
18 namespace mozilla { |
|
19 |
|
// Largest channel count accepted from the input track. The Opus format itself
// supports up to 8 channels (and multitrack audio up to 255 channels), but
// this implementation only encodes mono or stereo; inputs with more channels
// are downmixed.
static const int MAX_SUPPORTED_AUDIO_CHANNELS = 8;

// Channel limit of the encoder instance itself. See "opus_encoder_init" in
// http://www.opus-codec.org/docs/html_api-1.0.2/group__opus__encoder.html:
// the input signal must have 1 or 2 channels.
static const int MAX_CHANNELS = 2;

// Size, in bytes, of the output buffer handed to the Opus encoder for a single
// packet.
static const int MAX_DATA_BYTES = 4096;

// http://tools.ietf.org/html/draft-ietf-codec-oggopus-00#section-4, second
// paragraph: "The granule position of an audio data page is in units of PCM
// audio samples at a fixed rate of 48 kHz."
static const int kOpusSamplingRate = 48000;

// Duration of one Opus frame in milliseconds. Must be one of 2.5, 5, 10, 20,
// 40 or 60 ms.
static const int kFrameDurationMs = 20;

// Input sampling rates (Hz) that can be fed to the Opus encoder directly. Any
// other input rate is resampled to 48 kHz first.
static const int kOpusSupportedInputSamplingRates[] = {
  8000, 12000, 16000, 24000, 48000
};
|
44 |
|
45 namespace { |
|
46 |
|
47 // An endian-neutral serialization of integers. Serializing T in little endian |
|
48 // format to aOutput, where T is a 16 bits or 32 bits integer. |
|
49 template<typename T> |
|
50 static void |
|
51 SerializeToBuffer(T aValue, nsTArray<uint8_t>* aOutput) |
|
52 { |
|
53 for (uint32_t i = 0; i < sizeof(T); i++) { |
|
54 aOutput->AppendElement((uint8_t)(0x000000ff & (aValue >> (i * 8)))); |
|
55 } |
|
56 } |
|
57 |
|
58 static inline void |
|
59 SerializeToBuffer(const nsCString& aComment, nsTArray<uint8_t>* aOutput) |
|
60 { |
|
61 // Format of serializing a string to buffer is, the length of string (32 bits, |
|
62 // little endian), and the string. |
|
63 SerializeToBuffer((uint32_t)(aComment.Length()), aOutput); |
|
64 aOutput->AppendElements(aComment.get(), aComment.Length()); |
|
65 } |
|
66 |
|
67 |
|
68 static void |
|
69 SerializeOpusIdHeader(uint8_t aChannelCount, uint16_t aPreskip, |
|
70 uint32_t aInputSampleRate, nsTArray<uint8_t>* aOutput) |
|
71 { |
|
72 // The magic signature, null terminator has to be stripped off from strings. |
|
73 static const uint8_t magic[] = "OpusHead"; |
|
74 aOutput->AppendElements(magic, sizeof(magic) - 1); |
|
75 |
|
76 // The version must always be 1 (8 bits, unsigned). |
|
77 aOutput->AppendElement(1); |
|
78 |
|
79 // Number of output channels (8 bits, unsigned). |
|
80 aOutput->AppendElement(aChannelCount); |
|
81 |
|
82 // Number of samples (at 48 kHz) to discard from the decoder output when |
|
83 // starting playback (16 bits, unsigned, little endian). |
|
84 SerializeToBuffer(aPreskip, aOutput); |
|
85 |
|
86 // The sampling rate of input source (32 bits, unsigned, little endian). |
|
87 SerializeToBuffer(aInputSampleRate, aOutput); |
|
88 |
|
89 // Output gain, an encoder should set this field to zero (16 bits, signed, |
|
90 // little endian). |
|
91 SerializeToBuffer((int16_t)0, aOutput); |
|
92 |
|
93 // Channel mapping family. Family 0 allows only 1 or 2 channels (8 bits, |
|
94 // unsigned). |
|
95 aOutput->AppendElement(0); |
|
96 } |
|
97 |
|
98 static void |
|
99 SerializeOpusCommentHeader(const nsCString& aVendor, |
|
100 const nsTArray<nsCString>& aComments, |
|
101 nsTArray<uint8_t>* aOutput) |
|
102 { |
|
103 // The magic signature, null terminator has to be stripped off. |
|
104 static const uint8_t magic[] = "OpusTags"; |
|
105 aOutput->AppendElements(magic, sizeof(magic) - 1); |
|
106 |
|
107 // The vendor; Should append in the following order: |
|
108 // vendor string length (32 bits, unsigned, little endian) |
|
109 // vendor string. |
|
110 SerializeToBuffer(aVendor, aOutput); |
|
111 |
|
112 // Add comments; Should append in the following order: |
|
113 // comment list length (32 bits, unsigned, little endian) |
|
114 // comment #0 string length (32 bits, unsigned, little endian) |
|
115 // comment #0 string |
|
116 // comment #1 string length (32 bits, unsigned, little endian) |
|
117 // comment #1 string ... |
|
118 SerializeToBuffer((uint32_t)aComments.Length(), aOutput); |
|
119 for (uint32_t i = 0; i < aComments.Length(); ++i) { |
|
120 SerializeToBuffer(aComments[i], aOutput); |
|
121 } |
|
122 } |
|
123 |
|
124 } // Anonymous namespace. |
|
125 |
|
126 OpusTrackEncoder::OpusTrackEncoder() |
|
127 : AudioTrackEncoder() |
|
128 , mEncoder(nullptr) |
|
129 , mLookahead(0) |
|
130 , mResampler(nullptr) |
|
131 { |
|
132 } |
|
133 |
|
134 OpusTrackEncoder::~OpusTrackEncoder() |
|
135 { |
|
136 if (mEncoder) { |
|
137 opus_encoder_destroy(mEncoder); |
|
138 } |
|
139 if (mResampler) { |
|
140 speex_resampler_destroy(mResampler); |
|
141 mResampler = nullptr; |
|
142 } |
|
143 } |
|
144 |
|
145 nsresult |
|
146 OpusTrackEncoder::Init(int aChannels, int aSamplingRate) |
|
147 { |
|
148 // This monitor is used to wake up other methods that are waiting for encoder |
|
149 // to be completely initialized. |
|
150 ReentrantMonitorAutoEnter mon(mReentrantMonitor); |
|
151 |
|
152 NS_ENSURE_TRUE((aChannels <= MAX_SUPPORTED_AUDIO_CHANNELS) && (aChannels > 0), |
|
153 NS_ERROR_FAILURE); |
|
154 |
|
155 // This version of encoder API only support 1 or 2 channels, |
|
156 // So set the mChannels less or equal 2 and |
|
157 // let InterleaveTrackData downmix pcm data. |
|
158 mChannels = aChannels > MAX_CHANNELS ? MAX_CHANNELS : aChannels; |
|
159 |
|
160 // According to www.opus-codec.org, creating an opus encoder requires the |
|
161 // sampling rate of source signal be one of 8000, 12000, 16000, 24000, or |
|
162 // 48000. If this constraint is not satisfied, we resample the input to 48kHz. |
|
163 nsTArray<int> supportedSamplingRates; |
|
164 supportedSamplingRates.AppendElements(kOpusSupportedInputSamplingRates, |
|
165 ArrayLength(kOpusSupportedInputSamplingRates)); |
|
166 if (!supportedSamplingRates.Contains(aSamplingRate)) { |
|
167 int error; |
|
168 mResampler = speex_resampler_init(mChannels, |
|
169 aSamplingRate, |
|
170 kOpusSamplingRate, |
|
171 SPEEX_RESAMPLER_QUALITY_DEFAULT, |
|
172 &error); |
|
173 |
|
174 if (error != RESAMPLER_ERR_SUCCESS) { |
|
175 return NS_ERROR_FAILURE; |
|
176 } |
|
177 } |
|
178 mSamplingRate = aSamplingRate; |
|
179 NS_ENSURE_TRUE(mSamplingRate > 0, NS_ERROR_FAILURE); |
|
180 |
|
181 int error = 0; |
|
182 mEncoder = opus_encoder_create(GetOutputSampleRate(), mChannels, |
|
183 OPUS_APPLICATION_AUDIO, &error); |
|
184 |
|
185 mInitialized = (error == OPUS_OK); |
|
186 |
|
187 mReentrantMonitor.NotifyAll(); |
|
188 |
|
189 return error == OPUS_OK ? NS_OK : NS_ERROR_FAILURE; |
|
190 } |
|
191 |
|
192 int |
|
193 OpusTrackEncoder::GetOutputSampleRate() |
|
194 { |
|
195 return mResampler ? kOpusSamplingRate : mSamplingRate; |
|
196 } |
|
197 |
|
198 int |
|
199 OpusTrackEncoder::GetPacketDuration() |
|
200 { |
|
201 return GetOutputSampleRate() * kFrameDurationMs / 1000; |
|
202 } |
|
203 |
|
204 already_AddRefed<TrackMetadataBase> |
|
205 OpusTrackEncoder::GetMetadata() |
|
206 { |
|
207 { |
|
208 // Wait if mEncoder is not initialized. |
|
209 ReentrantMonitorAutoEnter mon(mReentrantMonitor); |
|
210 while (!mCanceled && !mInitialized) { |
|
211 mReentrantMonitor.Wait(); |
|
212 } |
|
213 } |
|
214 |
|
215 if (mCanceled || mEncodingComplete) { |
|
216 return nullptr; |
|
217 } |
|
218 |
|
219 nsRefPtr<OpusMetadata> meta = new OpusMetadata(); |
|
220 |
|
221 mLookahead = 0; |
|
222 int error = opus_encoder_ctl(mEncoder, OPUS_GET_LOOKAHEAD(&mLookahead)); |
|
223 if (error != OPUS_OK) { |
|
224 mLookahead = 0; |
|
225 } |
|
226 |
|
227 // The ogg time stamping and pre-skip is always timed at 48000. |
|
228 SerializeOpusIdHeader(mChannels, mLookahead * (kOpusSamplingRate / |
|
229 GetOutputSampleRate()), mSamplingRate, |
|
230 &meta->mIdHeader); |
|
231 |
|
232 nsCString vendor; |
|
233 vendor.AppendASCII(opus_get_version_string()); |
|
234 |
|
235 nsTArray<nsCString> comments; |
|
236 comments.AppendElement(NS_LITERAL_CSTRING("ENCODER=Mozilla" MOZ_APP_UA_VERSION)); |
|
237 |
|
238 SerializeOpusCommentHeader(vendor, comments, |
|
239 &meta->mCommentHeader); |
|
240 |
|
241 return meta.forget(); |
|
242 } |
|
243 |
|
244 nsresult |
|
245 OpusTrackEncoder::GetEncodedTrack(EncodedFrameContainer& aData) |
|
246 { |
|
247 { |
|
248 ReentrantMonitorAutoEnter mon(mReentrantMonitor); |
|
249 // Wait until initialized or cancelled. |
|
250 while (!mCanceled && !mInitialized) { |
|
251 mReentrantMonitor.Wait(); |
|
252 } |
|
253 if (mCanceled || mEncodingComplete) { |
|
254 return NS_ERROR_FAILURE; |
|
255 } |
|
256 } |
|
257 |
|
258 // calculation below depends on the truth that mInitialized is true. |
|
259 MOZ_ASSERT(mInitialized); |
|
260 |
|
261 // re-sampled frames left last time which didn't fit into an Opus packet duration. |
|
262 const int framesLeft = mResampledLeftover.Length() / mChannels; |
|
263 // When framesLeft is 0, (GetPacketDuration() - framesLeft) is a multiple |
|
264 // of kOpusSamplingRate. There is not precision loss in the integer division |
|
265 // in computing framesToFetch. If frameLeft > 0, we need to add 1 to |
|
266 // framesToFetch to ensure there will be at least n frames after re-sampling. |
|
267 const int frameRoundUp = framesLeft ? 1 : 0; |
|
268 |
|
269 MOZ_ASSERT(GetPacketDuration() >= framesLeft); |
|
270 // Try to fetch m frames such that there will be n frames |
|
271 // where (n + frameLeft) >= GetPacketDuration() after re-sampling. |
|
272 const int framesToFetch = !mResampler ? GetPacketDuration() |
|
273 : (GetPacketDuration() - framesLeft) * mSamplingRate / kOpusSamplingRate |
|
274 + frameRoundUp; |
|
275 { |
|
276 // Move all the samples from mRawSegment to mSourceSegment. We only hold |
|
277 // the monitor in this block. |
|
278 ReentrantMonitorAutoEnter mon(mReentrantMonitor); |
|
279 |
|
280 // Wait until enough raw data, end of stream or cancelled. |
|
281 while (!mCanceled && mRawSegment.GetDuration() + |
|
282 mSourceSegment.GetDuration() < framesToFetch && |
|
283 !mEndOfStream) { |
|
284 mReentrantMonitor.Wait(); |
|
285 } |
|
286 |
|
287 if (mCanceled || mEncodingComplete) { |
|
288 return NS_ERROR_FAILURE; |
|
289 } |
|
290 |
|
291 mSourceSegment.AppendFrom(&mRawSegment); |
|
292 |
|
293 // Pad |mLookahead| samples to the end of source stream to prevent lost of |
|
294 // original data, the pcm duration will be calculated at rate 48K later. |
|
295 if (mEndOfStream && !mEosSetInEncoder) { |
|
296 mEosSetInEncoder = true; |
|
297 mSourceSegment.AppendNullData(mLookahead); |
|
298 } |
|
299 } |
|
300 |
|
301 // Start encoding data. |
|
302 nsAutoTArray<AudioDataValue, 9600> pcm; |
|
303 pcm.SetLength(GetPacketDuration() * mChannels); |
|
304 AudioSegment::ChunkIterator iter(mSourceSegment); |
|
305 int frameCopied = 0; |
|
306 |
|
307 while (!iter.IsEnded() && frameCopied < framesToFetch) { |
|
308 AudioChunk chunk = *iter; |
|
309 |
|
310 // Chunk to the required frame size. |
|
311 int frameToCopy = chunk.GetDuration(); |
|
312 if (frameCopied + frameToCopy > framesToFetch) { |
|
313 frameToCopy = framesToFetch - frameCopied; |
|
314 } |
|
315 |
|
316 if (!chunk.IsNull()) { |
|
317 // Append the interleaved data to the end of pcm buffer. |
|
318 AudioTrackEncoder::InterleaveTrackData(chunk, frameToCopy, mChannels, |
|
319 pcm.Elements() + frameCopied * mChannels); |
|
320 } else { |
|
321 memset(pcm.Elements() + frameCopied * mChannels, 0, |
|
322 frameToCopy * mChannels * sizeof(AudioDataValue)); |
|
323 } |
|
324 |
|
325 frameCopied += frameToCopy; |
|
326 iter.Next(); |
|
327 } |
|
328 |
|
329 nsRefPtr<EncodedFrame> audiodata = new EncodedFrame(); |
|
330 audiodata->SetFrameType(EncodedFrame::OPUS_AUDIO_FRAME); |
|
331 int framesInPCM = frameCopied; |
|
332 if (mResampler) { |
|
333 nsAutoTArray<AudioDataValue, 9600> resamplingDest; |
|
334 // We want to consume all the input data, so we slightly oversize the |
|
335 // resampled data buffer so we can fit the output data in. We cannot really |
|
336 // predict the output frame count at each call. |
|
337 uint32_t outframes = frameCopied * kOpusSamplingRate / mSamplingRate + 1; |
|
338 uint32_t inframes = frameCopied; |
|
339 |
|
340 resamplingDest.SetLength(outframes * mChannels); |
|
341 |
|
342 #if MOZ_SAMPLE_TYPE_S16 |
|
343 short* in = reinterpret_cast<short*>(pcm.Elements()); |
|
344 short* out = reinterpret_cast<short*>(resamplingDest.Elements()); |
|
345 speex_resampler_process_interleaved_int(mResampler, in, &inframes, |
|
346 out, &outframes); |
|
347 #else |
|
348 float* in = reinterpret_cast<float*>(pcm.Elements()); |
|
349 float* out = reinterpret_cast<float*>(resamplingDest.Elements()); |
|
350 speex_resampler_process_interleaved_float(mResampler, in, &inframes, |
|
351 out, &outframes); |
|
352 #endif |
|
353 |
|
354 MOZ_ASSERT(pcm.Length() >= mResampledLeftover.Length()); |
|
355 PodCopy(pcm.Elements(), mResampledLeftover.Elements(), |
|
356 mResampledLeftover.Length()); |
|
357 |
|
358 uint32_t outframesToCopy = std::min(outframes, |
|
359 static_cast<uint32_t>(GetPacketDuration() - framesLeft)); |
|
360 |
|
361 MOZ_ASSERT(pcm.Length() - mResampledLeftover.Length() >= |
|
362 outframesToCopy * mChannels); |
|
363 PodCopy(pcm.Elements() + mResampledLeftover.Length(), |
|
364 resamplingDest.Elements(), outframesToCopy * mChannels); |
|
365 int frameLeftover = outframes - outframesToCopy; |
|
366 mResampledLeftover.SetLength(frameLeftover * mChannels); |
|
367 PodCopy(mResampledLeftover.Elements(), |
|
368 resamplingDest.Elements() + outframesToCopy * mChannels, |
|
369 mResampledLeftover.Length()); |
|
370 // This is always at 48000Hz. |
|
371 framesInPCM = framesLeft + outframesToCopy; |
|
372 audiodata->SetDuration(framesInPCM); |
|
373 } else { |
|
374 // The ogg time stamping and pre-skip is always timed at 48000. |
|
375 audiodata->SetDuration(frameCopied * (kOpusSamplingRate / mSamplingRate)); |
|
376 } |
|
377 |
|
378 // Remove the raw data which has been pulled to pcm buffer. |
|
379 // The value of frameCopied should equal to (or smaller than, if eos) |
|
380 // GetPacketDuration(). |
|
381 mSourceSegment.RemoveLeading(frameCopied); |
|
382 |
|
383 // Has reached the end of input stream and all queued data has pulled for |
|
384 // encoding. |
|
385 if (mSourceSegment.GetDuration() == 0 && mEndOfStream) { |
|
386 mEncodingComplete = true; |
|
387 LOG("[Opus] Done encoding."); |
|
388 } |
|
389 |
|
390 MOZ_ASSERT(mEndOfStream || framesInPCM == GetPacketDuration()); |
|
391 |
|
392 // Append null data to pcm buffer if the leftover data is not enough for |
|
393 // opus encoder. |
|
394 if (framesInPCM < GetPacketDuration() && mEndOfStream) { |
|
395 PodZero(pcm.Elements() + framesInPCM * mChannels, |
|
396 (GetPacketDuration() - framesInPCM) * mChannels); |
|
397 } |
|
398 nsTArray<uint8_t> frameData; |
|
399 // Encode the data with Opus Encoder. |
|
400 frameData.SetLength(MAX_DATA_BYTES); |
|
401 // result is returned as opus error code if it is negative. |
|
402 int result = 0; |
|
403 #ifdef MOZ_SAMPLE_TYPE_S16 |
|
404 const opus_int16* pcmBuf = static_cast<opus_int16*>(pcm.Elements()); |
|
405 result = opus_encode(mEncoder, pcmBuf, GetPacketDuration(), |
|
406 frameData.Elements(), MAX_DATA_BYTES); |
|
407 #else |
|
408 const float* pcmBuf = static_cast<float*>(pcm.Elements()); |
|
409 result = opus_encode_float(mEncoder, pcmBuf, GetPacketDuration(), |
|
410 frameData.Elements(), MAX_DATA_BYTES); |
|
411 #endif |
|
412 frameData.SetLength(result >= 0 ? result : 0); |
|
413 |
|
414 if (result < 0) { |
|
415 LOG("[Opus] Fail to encode data! Result: %s.", opus_strerror(result)); |
|
416 } |
|
417 if (mEncodingComplete) { |
|
418 if (mResampler) { |
|
419 speex_resampler_destroy(mResampler); |
|
420 mResampler = nullptr; |
|
421 } |
|
422 mResampledLeftover.SetLength(0); |
|
423 } |
|
424 |
|
425 audiodata->SwapInFrameData(frameData); |
|
426 aData.AppendEncodedFrame(audiodata); |
|
427 return result >= 0 ? NS_OK : NS_ERROR_FAILURE; |
|
428 } |
|
429 |
|
430 } |