1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/content/media/encoder/OpusTrackEncoder.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,430 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-*/ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this file, 1.7 + * You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 +#include "OpusTrackEncoder.h" 1.9 +#include "nsString.h" 1.10 + 1.11 +#include <opus/opus.h> 1.12 + 1.13 +#undef LOG 1.14 +#ifdef MOZ_WIDGET_GONK 1.15 +#include <android/log.h> 1.16 +#define LOG(args...) __android_log_print(ANDROID_LOG_INFO, "MediaEncoder", ## args); 1.17 +#else 1.18 +#define LOG(args, ...) 1.19 +#endif 1.20 + 1.21 +namespace mozilla { 1.22 + 1.23 +// The Opus format supports up to 8 channels, and supports multitrack audio up 1.24 +// to 255 channels, but the current implementation supports only mono and 1.25 +// stereo, and downmixes any more than that. 1.26 +static const int MAX_SUPPORTED_AUDIO_CHANNELS = 8; 1.27 + 1.28 +// http://www.opus-codec.org/docs/html_api-1.0.2/group__opus__encoder.html 1.29 +// In section "opus_encoder_init", channels must be 1 or 2 of input signal. 1.30 +static const int MAX_CHANNELS = 2; 1.31 + 1.32 +// A maximum data bytes for Opus to encode. 1.33 +static const int MAX_DATA_BYTES = 4096; 1.34 + 1.35 +// http://tools.ietf.org/html/draft-ietf-codec-oggopus-00#section-4 1.36 +// Second paragraph, " The granule position of an audio data page is in units 1.37 +// of PCM audio samples at a fixed rate of 48 kHz." 1.38 +static const int kOpusSamplingRate = 48000; 1.39 + 1.40 +// The duration of an Opus frame, and it must be 2.5, 5, 10, 20, 40 or 60 ms. 1.41 +static const int kFrameDurationMs = 20; 1.42 + 1.43 +// The supported sampling rate of input signal (Hz), 1.44 +// must be one of the following. Will resampled to 48kHz otherwise. 1.45 +static const int kOpusSupportedInputSamplingRates[] = 1.46 + {8000, 12000, 16000, 24000, 48000}; 1.47 + 1.48 +namespace { 1.49 + 1.50 +// An endian-neutral serialization of integers. Serializing T in little endian 1.51 +// format to aOutput, where T is a 16 bits or 32 bits integer. 1.52 +template<typename T> 1.53 +static void 1.54 +SerializeToBuffer(T aValue, nsTArray<uint8_t>* aOutput) 1.55 +{ 1.56 + for (uint32_t i = 0; i < sizeof(T); i++) { 1.57 + aOutput->AppendElement((uint8_t)(0x000000ff & (aValue >> (i * 8)))); 1.58 + } 1.59 +} 1.60 + 1.61 +static inline void 1.62 +SerializeToBuffer(const nsCString& aComment, nsTArray<uint8_t>* aOutput) 1.63 +{ 1.64 + // Format of serializing a string to buffer is, the length of string (32 bits, 1.65 + // little endian), and the string. 1.66 + SerializeToBuffer((uint32_t)(aComment.Length()), aOutput); 1.67 + aOutput->AppendElements(aComment.get(), aComment.Length()); 1.68 +} 1.69 + 1.70 + 1.71 +static void 1.72 +SerializeOpusIdHeader(uint8_t aChannelCount, uint16_t aPreskip, 1.73 + uint32_t aInputSampleRate, nsTArray<uint8_t>* aOutput) 1.74 +{ 1.75 + // The magic signature, null terminator has to be stripped off from strings. 1.76 + static const uint8_t magic[] = "OpusHead"; 1.77 + aOutput->AppendElements(magic, sizeof(magic) - 1); 1.78 + 1.79 + // The version must always be 1 (8 bits, unsigned). 1.80 + aOutput->AppendElement(1); 1.81 + 1.82 + // Number of output channels (8 bits, unsigned). 1.83 + aOutput->AppendElement(aChannelCount); 1.84 + 1.85 + // Number of samples (at 48 kHz) to discard from the decoder output when 1.86 + // starting playback (16 bits, unsigned, little endian). 1.87 + SerializeToBuffer(aPreskip, aOutput); 1.88 + 1.89 + // The sampling rate of input source (32 bits, unsigned, little endian). 1.90 + SerializeToBuffer(aInputSampleRate, aOutput); 1.91 + 1.92 + // Output gain, an encoder should set this field to zero (16 bits, signed, 1.93 + // little endian). 1.94 + SerializeToBuffer((int16_t)0, aOutput); 1.95 + 1.96 + // Channel mapping family. Family 0 allows only 1 or 2 channels (8 bits, 1.97 + // unsigned). 1.98 + aOutput->AppendElement(0); 1.99 +} 1.100 + 1.101 +static void 1.102 +SerializeOpusCommentHeader(const nsCString& aVendor, 1.103 + const nsTArray<nsCString>& aComments, 1.104 + nsTArray<uint8_t>* aOutput) 1.105 +{ 1.106 + // The magic signature, null terminator has to be stripped off. 1.107 + static const uint8_t magic[] = "OpusTags"; 1.108 + aOutput->AppendElements(magic, sizeof(magic) - 1); 1.109 + 1.110 + // The vendor; Should append in the following order: 1.111 + // vendor string length (32 bits, unsigned, little endian) 1.112 + // vendor string. 1.113 + SerializeToBuffer(aVendor, aOutput); 1.114 + 1.115 + // Add comments; Should append in the following order: 1.116 + // comment list length (32 bits, unsigned, little endian) 1.117 + // comment #0 string length (32 bits, unsigned, little endian) 1.118 + // comment #0 string 1.119 + // comment #1 string length (32 bits, unsigned, little endian) 1.120 + // comment #1 string ... 1.121 + SerializeToBuffer((uint32_t)aComments.Length(), aOutput); 1.122 + for (uint32_t i = 0; i < aComments.Length(); ++i) { 1.123 + SerializeToBuffer(aComments[i], aOutput); 1.124 + } 1.125 +} 1.126 + 1.127 +} // Anonymous namespace. 1.128 + 1.129 +OpusTrackEncoder::OpusTrackEncoder() 1.130 + : AudioTrackEncoder() 1.131 + , mEncoder(nullptr) 1.132 + , mLookahead(0) 1.133 + , mResampler(nullptr) 1.134 +{ 1.135 +} 1.136 + 1.137 +OpusTrackEncoder::~OpusTrackEncoder() 1.138 +{ 1.139 + if (mEncoder) { 1.140 + opus_encoder_destroy(mEncoder); 1.141 + } 1.142 + if (mResampler) { 1.143 + speex_resampler_destroy(mResampler); 1.144 + mResampler = nullptr; 1.145 + } 1.146 +} 1.147 + 1.148 +nsresult 1.149 +OpusTrackEncoder::Init(int aChannels, int aSamplingRate) 1.150 +{ 1.151 + // This monitor is used to wake up other methods that are waiting for encoder 1.152 + // to be completely initialized. 1.153 + ReentrantMonitorAutoEnter mon(mReentrantMonitor); 1.154 + 1.155 + NS_ENSURE_TRUE((aChannels <= MAX_SUPPORTED_AUDIO_CHANNELS) && (aChannels > 0), 1.156 + NS_ERROR_FAILURE); 1.157 + 1.158 + // This version of encoder API only support 1 or 2 channels, 1.159 + // So set the mChannels less or equal 2 and 1.160 + // let InterleaveTrackData downmix pcm data. 1.161 + mChannels = aChannels > MAX_CHANNELS ? MAX_CHANNELS : aChannels; 1.162 + 1.163 + // According to www.opus-codec.org, creating an opus encoder requires the 1.164 + // sampling rate of source signal be one of 8000, 12000, 16000, 24000, or 1.165 + // 48000. If this constraint is not satisfied, we resample the input to 48kHz. 1.166 + nsTArray<int> supportedSamplingRates; 1.167 + supportedSamplingRates.AppendElements(kOpusSupportedInputSamplingRates, 1.168 + ArrayLength(kOpusSupportedInputSamplingRates)); 1.169 + if (!supportedSamplingRates.Contains(aSamplingRate)) { 1.170 + int error; 1.171 + mResampler = speex_resampler_init(mChannels, 1.172 + aSamplingRate, 1.173 + kOpusSamplingRate, 1.174 + SPEEX_RESAMPLER_QUALITY_DEFAULT, 1.175 + &error); 1.176 + 1.177 + if (error != RESAMPLER_ERR_SUCCESS) { 1.178 + return NS_ERROR_FAILURE; 1.179 + } 1.180 + } 1.181 + mSamplingRate = aSamplingRate; 1.182 + NS_ENSURE_TRUE(mSamplingRate > 0, NS_ERROR_FAILURE); 1.183 + 1.184 + int error = 0; 1.185 + mEncoder = opus_encoder_create(GetOutputSampleRate(), mChannels, 1.186 + OPUS_APPLICATION_AUDIO, &error); 1.187 + 1.188 + mInitialized = (error == OPUS_OK); 1.189 + 1.190 + mReentrantMonitor.NotifyAll(); 1.191 + 1.192 + return error == OPUS_OK ? NS_OK : NS_ERROR_FAILURE; 1.193 +} 1.194 + 1.195 +int 1.196 +OpusTrackEncoder::GetOutputSampleRate() 1.197 +{ 1.198 + return mResampler ? kOpusSamplingRate : mSamplingRate; 1.199 +} 1.200 + 1.201 +int 1.202 +OpusTrackEncoder::GetPacketDuration() 1.203 +{ 1.204 + return GetOutputSampleRate() * kFrameDurationMs / 1000; 1.205 +} 1.206 + 1.207 +already_AddRefed<TrackMetadataBase> 1.208 +OpusTrackEncoder::GetMetadata() 1.209 +{ 1.210 + { 1.211 + // Wait if mEncoder is not initialized. 1.212 + ReentrantMonitorAutoEnter mon(mReentrantMonitor); 1.213 + while (!mCanceled && !mInitialized) { 1.214 + mReentrantMonitor.Wait(); 1.215 + } 1.216 + } 1.217 + 1.218 + if (mCanceled || mEncodingComplete) { 1.219 + return nullptr; 1.220 + } 1.221 + 1.222 + nsRefPtr<OpusMetadata> meta = new OpusMetadata(); 1.223 + 1.224 + mLookahead = 0; 1.225 + int error = opus_encoder_ctl(mEncoder, OPUS_GET_LOOKAHEAD(&mLookahead)); 1.226 + if (error != OPUS_OK) { 1.227 + mLookahead = 0; 1.228 + } 1.229 + 1.230 + // The ogg time stamping and pre-skip is always timed at 48000. 1.231 + SerializeOpusIdHeader(mChannels, mLookahead * (kOpusSamplingRate / 1.232 + GetOutputSampleRate()), mSamplingRate, 1.233 + &meta->mIdHeader); 1.234 + 1.235 + nsCString vendor; 1.236 + vendor.AppendASCII(opus_get_version_string()); 1.237 + 1.238 + nsTArray<nsCString> comments; 1.239 + comments.AppendElement(NS_LITERAL_CSTRING("ENCODER=Mozilla" MOZ_APP_UA_VERSION)); 1.240 + 1.241 + SerializeOpusCommentHeader(vendor, comments, 1.242 + &meta->mCommentHeader); 1.243 + 1.244 + return meta.forget(); 1.245 +} 1.246 + 1.247 +nsresult 1.248 +OpusTrackEncoder::GetEncodedTrack(EncodedFrameContainer& aData) 1.249 +{ 1.250 + { 1.251 + ReentrantMonitorAutoEnter mon(mReentrantMonitor); 1.252 + // Wait until initialized or cancelled. 1.253 + while (!mCanceled && !mInitialized) { 1.254 + mReentrantMonitor.Wait(); 1.255 + } 1.256 + if (mCanceled || mEncodingComplete) { 1.257 + return NS_ERROR_FAILURE; 1.258 + } 1.259 + } 1.260 + 1.261 + // calculation below depends on the truth that mInitialized is true. 1.262 + MOZ_ASSERT(mInitialized); 1.263 + 1.264 + // re-sampled frames left last time which didn't fit into an Opus packet duration. 1.265 + const int framesLeft = mResampledLeftover.Length() / mChannels; 1.266 + // When framesLeft is 0, (GetPacketDuration() - framesLeft) is a multiple 1.267 + // of kOpusSamplingRate. There is not precision loss in the integer division 1.268 + // in computing framesToFetch. If frameLeft > 0, we need to add 1 to 1.269 + // framesToFetch to ensure there will be at least n frames after re-sampling. 1.270 + const int frameRoundUp = framesLeft ? 1 : 0; 1.271 + 1.272 + MOZ_ASSERT(GetPacketDuration() >= framesLeft); 1.273 + // Try to fetch m frames such that there will be n frames 1.274 + // where (n + frameLeft) >= GetPacketDuration() after re-sampling. 1.275 + const int framesToFetch = !mResampler ? GetPacketDuration() 1.276 + : (GetPacketDuration() - framesLeft) * mSamplingRate / kOpusSamplingRate 1.277 + + frameRoundUp; 1.278 + { 1.279 + // Move all the samples from mRawSegment to mSourceSegment. We only hold 1.280 + // the monitor in this block. 1.281 + ReentrantMonitorAutoEnter mon(mReentrantMonitor); 1.282 + 1.283 + // Wait until enough raw data, end of stream or cancelled. 1.284 + while (!mCanceled && mRawSegment.GetDuration() + 1.285 + mSourceSegment.GetDuration() < framesToFetch && 1.286 + !mEndOfStream) { 1.287 + mReentrantMonitor.Wait(); 1.288 + } 1.289 + 1.290 + if (mCanceled || mEncodingComplete) { 1.291 + return NS_ERROR_FAILURE; 1.292 + } 1.293 + 1.294 + mSourceSegment.AppendFrom(&mRawSegment); 1.295 + 1.296 + // Pad |mLookahead| samples to the end of source stream to prevent lost of 1.297 + // original data, the pcm duration will be calculated at rate 48K later. 1.298 + if (mEndOfStream && !mEosSetInEncoder) { 1.299 + mEosSetInEncoder = true; 1.300 + mSourceSegment.AppendNullData(mLookahead); 1.301 + } 1.302 + } 1.303 + 1.304 + // Start encoding data. 1.305 + nsAutoTArray<AudioDataValue, 9600> pcm; 1.306 + pcm.SetLength(GetPacketDuration() * mChannels); 1.307 + AudioSegment::ChunkIterator iter(mSourceSegment); 1.308 + int frameCopied = 0; 1.309 + 1.310 + while (!iter.IsEnded() && frameCopied < framesToFetch) { 1.311 + AudioChunk chunk = *iter; 1.312 + 1.313 + // Chunk to the required frame size. 1.314 + int frameToCopy = chunk.GetDuration(); 1.315 + if (frameCopied + frameToCopy > framesToFetch) { 1.316 + frameToCopy = framesToFetch - frameCopied; 1.317 + } 1.318 + 1.319 + if (!chunk.IsNull()) { 1.320 + // Append the interleaved data to the end of pcm buffer. 1.321 + AudioTrackEncoder::InterleaveTrackData(chunk, frameToCopy, mChannels, 1.322 + pcm.Elements() + frameCopied * mChannels); 1.323 + } else { 1.324 + memset(pcm.Elements() + frameCopied * mChannels, 0, 1.325 + frameToCopy * mChannels * sizeof(AudioDataValue)); 1.326 + } 1.327 + 1.328 + frameCopied += frameToCopy; 1.329 + iter.Next(); 1.330 + } 1.331 + 1.332 + nsRefPtr<EncodedFrame> audiodata = new EncodedFrame(); 1.333 + audiodata->SetFrameType(EncodedFrame::OPUS_AUDIO_FRAME); 1.334 + int framesInPCM = frameCopied; 1.335 + if (mResampler) { 1.336 + nsAutoTArray<AudioDataValue, 9600> resamplingDest; 1.337 + // We want to consume all the input data, so we slightly oversize the 1.338 + // resampled data buffer so we can fit the output data in. We cannot really 1.339 + // predict the output frame count at each call. 1.340 + uint32_t outframes = frameCopied * kOpusSamplingRate / mSamplingRate + 1; 1.341 + uint32_t inframes = frameCopied; 1.342 + 1.343 + resamplingDest.SetLength(outframes * mChannels); 1.344 + 1.345 +#if MOZ_SAMPLE_TYPE_S16 1.346 + short* in = reinterpret_cast<short*>(pcm.Elements()); 1.347 + short* out = reinterpret_cast<short*>(resamplingDest.Elements()); 1.348 + speex_resampler_process_interleaved_int(mResampler, in, &inframes, 1.349 + out, &outframes); 1.350 +#else 1.351 + float* in = reinterpret_cast<float*>(pcm.Elements()); 1.352 + float* out = reinterpret_cast<float*>(resamplingDest.Elements()); 1.353 + speex_resampler_process_interleaved_float(mResampler, in, &inframes, 1.354 + out, &outframes); 1.355 +#endif 1.356 + 1.357 + MOZ_ASSERT(pcm.Length() >= mResampledLeftover.Length()); 1.358 + PodCopy(pcm.Elements(), mResampledLeftover.Elements(), 1.359 + mResampledLeftover.Length()); 1.360 + 1.361 + uint32_t outframesToCopy = std::min(outframes, 1.362 + static_cast<uint32_t>(GetPacketDuration() - framesLeft)); 1.363 + 1.364 + MOZ_ASSERT(pcm.Length() - mResampledLeftover.Length() >= 1.365 + outframesToCopy * mChannels); 1.366 + PodCopy(pcm.Elements() + mResampledLeftover.Length(), 1.367 + resamplingDest.Elements(), outframesToCopy * mChannels); 1.368 + int frameLeftover = outframes - outframesToCopy; 1.369 + mResampledLeftover.SetLength(frameLeftover * mChannels); 1.370 + PodCopy(mResampledLeftover.Elements(), 1.371 + resamplingDest.Elements() + outframesToCopy * mChannels, 1.372 + mResampledLeftover.Length()); 1.373 + // This is always at 48000Hz. 1.374 + framesInPCM = framesLeft + outframesToCopy; 1.375 + audiodata->SetDuration(framesInPCM); 1.376 + } else { 1.377 + // The ogg time stamping and pre-skip is always timed at 48000. 1.378 + audiodata->SetDuration(frameCopied * (kOpusSamplingRate / mSamplingRate)); 1.379 + } 1.380 + 1.381 + // Remove the raw data which has been pulled to pcm buffer. 1.382 + // The value of frameCopied should equal to (or smaller than, if eos) 1.383 + // GetPacketDuration(). 1.384 + mSourceSegment.RemoveLeading(frameCopied); 1.385 + 1.386 + // Has reached the end of input stream and all queued data has pulled for 1.387 + // encoding. 1.388 + if (mSourceSegment.GetDuration() == 0 && mEndOfStream) { 1.389 + mEncodingComplete = true; 1.390 + LOG("[Opus] Done encoding."); 1.391 + } 1.392 + 1.393 + MOZ_ASSERT(mEndOfStream || framesInPCM == GetPacketDuration()); 1.394 + 1.395 + // Append null data to pcm buffer if the leftover data is not enough for 1.396 + // opus encoder. 1.397 + if (framesInPCM < GetPacketDuration() && mEndOfStream) { 1.398 + PodZero(pcm.Elements() + framesInPCM * mChannels, 1.399 + (GetPacketDuration() - framesInPCM) * mChannels); 1.400 + } 1.401 + nsTArray<uint8_t> frameData; 1.402 + // Encode the data with Opus Encoder. 1.403 + frameData.SetLength(MAX_DATA_BYTES); 1.404 + // result is returned as opus error code if it is negative. 1.405 + int result = 0; 1.406 +#ifdef MOZ_SAMPLE_TYPE_S16 1.407 + const opus_int16* pcmBuf = static_cast<opus_int16*>(pcm.Elements()); 1.408 + result = opus_encode(mEncoder, pcmBuf, GetPacketDuration(), 1.409 + frameData.Elements(), MAX_DATA_BYTES); 1.410 +#else 1.411 + const float* pcmBuf = static_cast<float*>(pcm.Elements()); 1.412 + result = opus_encode_float(mEncoder, pcmBuf, GetPacketDuration(), 1.413 + frameData.Elements(), MAX_DATA_BYTES); 1.414 +#endif 1.415 + frameData.SetLength(result >= 0 ? result : 0); 1.416 + 1.417 + if (result < 0) { 1.418 + LOG("[Opus] Fail to encode data! Result: %s.", opus_strerror(result)); 1.419 + } 1.420 + if (mEncodingComplete) { 1.421 + if (mResampler) { 1.422 + speex_resampler_destroy(mResampler); 1.423 + mResampler = nullptr; 1.424 + } 1.425 + mResampledLeftover.SetLength(0); 1.426 + } 1.427 + 1.428 + audiodata->SwapInFrameData(frameData); 1.429 + aData.AppendEncodedFrame(audiodata); 1.430 + return result >= 0 ? NS_OK : NS_ERROR_FAILURE; 1.431 +} 1.432 + 1.433 +}