content/media/encoder/OpusTrackEncoder.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/content/media/encoder/OpusTrackEncoder.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,430 @@
     1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-*/
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this file,
     1.7 + * You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +#include "OpusTrackEncoder.h"
     1.9 +#include "nsString.h"
    1.10 +
    1.11 +#include <opus/opus.h>
    1.12 +
    1.13 +#undef LOG  // Discard any earlier LOG definition before redefining it.
    1.14 +#ifdef MOZ_WIDGET_GONK
    1.15 +#include <android/log.h>
    1.16 +#define LOG(args...) __android_log_print(ANDROID_LOG_INFO, "MediaEncoder", ## args);
    1.17 +#else  // Not MOZ_WIDGET_GONK: LOG() expands to nothing.
    1.18 +#define LOG(args, ...)
    1.19 +#endif  // MOZ_WIDGET_GONK
    1.20 +
    1.21 +namespace mozilla {
    1.22 +
    1.23 +// The Opus format supports up to 8 channels, and supports multitrack audio up
    1.24 +// to 255 channels, but the current implementation supports only mono and
    1.25 +// stereo, and downmixes any more than that.
    1.26 +static const int MAX_SUPPORTED_AUDIO_CHANNELS = 8;
    1.27 +
    1.28 +// http://www.opus-codec.org/docs/html_api-1.0.2/group__opus__encoder.html
    1.29 +// Per the "opus_encoder_init" section, the input must have 1 or 2 channels.
    1.30 +static const int MAX_CHANNELS = 2;
    1.31 +
    1.32 +// The size, in bytes, of the output buffer given to Opus for each packet.
    1.33 +static const int MAX_DATA_BYTES = 4096;
    1.34 +
    1.35 +// http://tools.ietf.org/html/draft-ietf-codec-oggopus-00#section-4
    1.36 +// Second paragraph: "The granule position of an audio data page is in units
    1.37 +// of PCM audio samples at a fixed rate of 48 kHz."
    1.38 +static const int kOpusSamplingRate = 48000;
    1.39 +
    1.40 +// The duration of an Opus frame in ms; it must be 2.5, 5, 10, 20, 40 or 60.
    1.41 +static const int kFrameDurationMs  = 20;
    1.42 +
    1.43 +// The input sampling rates (Hz) that are passed to the encoder as they are;
    1.44 +// input at any other rate is resampled to 48kHz.
    1.45 +static const int kOpusSupportedInputSamplingRates[] =
    1.46 +                   {8000, 12000, 16000, 24000, 48000};
    1.47 +
    1.48 +namespace {
    1.49 +
    1.50 +// Appends aValue to aOutput one byte at a time, least significant byte first,
    1.51 +// so the output is little endian on any host. T is a 16 or 32 bit integer.
    1.52 +template<typename T>
    1.53 +static void
    1.54 +SerializeToBuffer(T aValue, nsTArray<uint8_t>* aOutput)
    1.55 +{
    1.56 +  for (uint32_t shift = 0; shift < sizeof(T) * 8; shift += 8) {
    1.57 +    aOutput->AppendElement(static_cast<uint8_t>((aValue >> shift) & 0xff));
    1.58 +  }
    1.59 +}
    1.60 +
    1.61 +static inline void
    1.62 +SerializeToBuffer(const nsCString& aComment, nsTArray<uint8_t>* aOutput)
    1.63 +{
    1.64 +  // Layout: string length (32 bits, little endian), then the string bytes.
    1.65 +  const uint32_t byteCount = aComment.Length();
    1.66 +  SerializeToBuffer(byteCount, aOutput);
    1.67 +  aOutput->AppendElements(aComment.get(), byteCount);
    1.68 +}
    1.69 +
    1.70 +
    1.71 +static void
    1.72 +SerializeOpusIdHeader(uint8_t aChannelCount, uint16_t aPreskip,
    1.73 +                      uint32_t aInputSampleRate, nsTArray<uint8_t>* aOutput)
    1.74 +{
    1.75 +  // Writes the "OpusHead" packet into aOutput, field by field.
    1.76 +  // Signature: the 8 characters only, so the trailing '\0' is left out.
    1.77 +  static const uint8_t kSignature[] = "OpusHead";
    1.78 +  const size_t signatureLength = sizeof(kSignature) - 1;
    1.79 +  aOutput->AppendElements(kSignature, signatureLength);
    1.80 +
    1.81 +  // Version, always 1 (8 bits, unsigned).
    1.82 +  aOutput->AppendElement(1);
    1.83 +
    1.84 +  // Output channel count (8 bits, unsigned).
    1.85 +  aOutput->AppendElement(aChannelCount);
    1.86 +
    1.87 +  // Pre-skip: samples (at 48 kHz) the decoder drops from the start of its
    1.88 +  // output (16 bits, unsigned, little endian).
    1.89 +  SerializeToBuffer(aPreskip, aOutput);
    1.90 +
    1.91 +  // Sampling rate of the input source (32 bits, unsigned, little endian).
    1.92 +  SerializeToBuffer(aInputSampleRate, aOutput);
    1.93 +
    1.94 +  // Output gain, always written as zero (16 bits, signed, little endian).
    1.95 +  SerializeToBuffer(static_cast<int16_t>(0), aOutput);
    1.96 +
    1.97 +  // Channel mapping family 0: 1 or 2 channels only (8 bits, unsigned).
    1.98 +  aOutput->AppendElement(0);
    1.99 +}
   1.100 +
   1.101 +static void
   1.102 +SerializeOpusCommentHeader(const nsCString& aVendor,
   1.103 +                           const nsTArray<nsCString>& aComments,
   1.104 +                           nsTArray<uint8_t>* aOutput)
   1.105 +{
   1.106 +  // Writes the "OpusTags" packet into aOutput. Layout, in order:
   1.107 +  //   signature "OpusTags" (8 bytes, no trailing '\0')
   1.108 +  //   vendor string length (32 bits, unsigned, little endian)
   1.109 +  //   vendor string
   1.110 +  //   comment list length (32 bits, unsigned, little endian)
   1.111 +  //   then for each comment: its length (32 bits, unsigned, little endian)
   1.112 +  //   followed by the comment string
   1.113 +  static const uint8_t kSignature[] = "OpusTags";
   1.114 +  const size_t signatureLength = sizeof(kSignature) - 1;
   1.115 +  aOutput->AppendElements(kSignature, signatureLength);
   1.116 +
   1.117 +  SerializeToBuffer(aVendor, aOutput);
   1.118 +
   1.119 +  // Emit the list length first, then every comment in order.
   1.120 +  const uint32_t commentCount = aComments.Length();
   1.121 +  SerializeToBuffer(commentCount, aOutput);
   1.122 +  for (uint32_t index = 0; index != commentCount; ++index) {
   1.123 +    SerializeToBuffer(aComments[index], aOutput);
   1.124 +  }
   1.125 +}
   1.126 +
   1.127 +}  // Anonymous namespace.
   1.128 +
   1.129 +OpusTrackEncoder::OpusTrackEncoder()
   1.130 +  : AudioTrackEncoder()
   1.131 +  , mEncoder(nullptr)    // Created by Init().
   1.132 +  , mLookahead(0)        // Filled in from the encoder by GetMetadata().
   1.133 +  , mResampler(nullptr)  // Created by Init() only if resampling is needed.
   1.134 +{
   1.135 +}
   1.136 +
   1.137 +OpusTrackEncoder::~OpusTrackEncoder()
   1.138 +{
   1.139 +  if (mResampler) {  // Null when never created or already released.
   1.140 +    speex_resampler_destroy(mResampler);
   1.141 +    mResampler = nullptr;
   1.142 +  }
   1.143 +  if (mEncoder) {  // Null when Init() never created an encoder.
   1.144 +    opus_encoder_destroy(mEncoder);
   1.145 +  }
   1.146 +}
   1.147 +
   1.148 +nsresult
   1.149 +OpusTrackEncoder::Init(int aChannels, int aSamplingRate)
   1.150 +{
   1.151 +  // Initializes the encoder for the given input format. Returns NS_OK on
   1.152 +  // success, or NS_ERROR_FAILURE if an argument is out of range or the
   1.153 +  // resampler or the Opus encoder cannot be created. The monitor is held
   1.154 +  // throughout; once encoder creation has been attempted it is notified to
   1.155 +  // wake up methods that are waiting for initialization.
   1.156 +  ReentrantMonitorAutoEnter mon(mReentrantMonitor);
   1.157 +
   1.158 +  // Reject bad arguments before touching any state or calling any library.
   1.159 +  NS_ENSURE_TRUE((aChannels <= MAX_SUPPORTED_AUDIO_CHANNELS) && (aChannels > 0),
   1.160 +                 NS_ERROR_FAILURE);
   1.161 +  NS_ENSURE_TRUE(aSamplingRate > 0, NS_ERROR_FAILURE);
   1.162 +
   1.163 +  // This version of the encoder API only supports 1 or 2 channels, so clamp
   1.164 +  // mChannels to 2 and let InterleaveTrackData downmix the pcm data.
   1.165 +  mChannels = aChannels > MAX_CHANNELS ? MAX_CHANNELS : aChannels;
   1.166 +
   1.167 +  // According to www.opus-codec.org, the encoder only accepts input sampled
   1.168 +  // at 8000, 12000, 16000, 24000 or 48000 Hz; resample anything else to 48kHz.
   1.169 +  bool rateSupported = false;
   1.170 +  for (size_t i = 0; i < ArrayLength(kOpusSupportedInputSamplingRates); ++i) {
   1.171 +    rateSupported |= (kOpusSupportedInputSamplingRates[i] == aSamplingRate);
   1.172 +  }
   1.173 +  if (!rateSupported) {
   1.174 +    int resamplerError = RESAMPLER_ERR_SUCCESS;
   1.175 +    mResampler = speex_resampler_init(mChannels,
   1.176 +                                      aSamplingRate,
   1.177 +                                      kOpusSamplingRate,
   1.178 +                                      SPEEX_RESAMPLER_QUALITY_DEFAULT,
   1.179 +                                      &resamplerError);
   1.180 +    if (!mResampler || resamplerError != RESAMPLER_ERR_SUCCESS) {
   1.181 +      return NS_ERROR_FAILURE;
   1.182 +    }
   1.183 +  }
   1.184 +  mSamplingRate = aSamplingRate;
   1.185 +
   1.186 +  // GetOutputSampleRate() depends on mResampler and mSamplingRate set above.
   1.187 +  int error = 0;
   1.188 +  mEncoder = opus_encoder_create(GetOutputSampleRate(), mChannels,
   1.189 +                                 OPUS_APPLICATION_AUDIO, &error);
   1.190 +  mInitialized = (error == OPUS_OK);
   1.191 +  mReentrantMonitor.NotifyAll();
   1.192 +  return error == OPUS_OK ? NS_OK : NS_ERROR_FAILURE;
   1.193 +}
   1.194 +
   1.195 +int
   1.196 +OpusTrackEncoder::GetOutputSampleRate()
   1.197 +{
   1.198 +  return !mResampler ? mSamplingRate : kOpusSamplingRate;  // 48kHz if resampled
   1.199 +}
   1.200 +
   1.201 +int
   1.202 +OpusTrackEncoder::GetPacketDuration()
   1.203 +{
   1.204 +  return kFrameDurationMs * GetOutputSampleRate() / 1000;  // frames per packet
   1.205 +}
   1.206 +
   1.207 +already_AddRefed<TrackMetadataBase>
   1.208 +OpusTrackEncoder::GetMetadata()
   1.209 +{
   1.210 +  // Builds the Opus id and comment headers once the encoder is initialized.
   1.211 +  // Returns nullptr if encoding was canceled or has already completed.
   1.212 +  {
   1.213 +    ReentrantMonitorAutoEnter mon(mReentrantMonitor);
   1.214 +    while (!mCanceled && !mInitialized) {
   1.215 +      mReentrantMonitor.Wait();
   1.216 +    }
   1.217 +    // Read the state flags under the monitor, as GetEncodedTrack() does.
   1.218 +    if (mCanceled || mEncodingComplete) {
   1.219 +      return nullptr;
   1.220 +    }
   1.221 +  }
   1.222 +
   1.223 +  nsRefPtr<OpusMetadata> meta = new OpusMetadata();
   1.224 +
   1.225 +  // Fall back to a lookahead of 0 if the encoder cannot report one.
   1.226 +  if (opus_encoder_ctl(mEncoder, OPUS_GET_LOOKAHEAD(&mLookahead)) != OPUS_OK) {
   1.227 +    mLookahead = 0;
   1.228 +  }
   1.229 +
   1.230 +  // The ogg time stamping and pre-skip is always timed at 48000.
   1.231 +  SerializeOpusIdHeader(mChannels, mLookahead * (kOpusSamplingRate /
   1.232 +                        GetOutputSampleRate()), mSamplingRate,
   1.233 +                        &meta->mIdHeader);
   1.234 +
   1.235 +  nsCString vendor;
   1.236 +  vendor.AppendASCII(opus_get_version_string());
   1.237 +
   1.238 +  nsTArray<nsCString> comments;
   1.239 +  comments.AppendElement(NS_LITERAL_CSTRING("ENCODER=Mozilla" MOZ_APP_UA_VERSION));
   1.240 +
   1.241 +  SerializeOpusCommentHeader(vendor, comments,
   1.242 +                             &meta->mCommentHeader);
   1.243 +
   1.244 +  return meta.forget();
   1.245 +}
   1.246 +
   1.247 +nsresult
   1.248 +OpusTrackEncoder::GetEncodedTrack(EncodedFrameContainer& aData)
   1.249 +{
   1.250 +  // Encodes one Opus packet from the queued raw audio and appends it to
   1.251 +  // aData. Blocks until the encoder is initialized and enough input is
   1.252 +  // queued (or the stream has ended). Returns NS_ERROR_FAILURE if encoding
   1.253 +  // was canceled, has already completed, or the Opus encoder reports an error.
   1.254 +  {
   1.255 +    ReentrantMonitorAutoEnter mon(mReentrantMonitor);
   1.256 +    // Wait until initialized or cancelled.
   1.257 +    while (!mCanceled && !mInitialized) {
   1.258 +      mReentrantMonitor.Wait();
   1.259 +    }
   1.260 +    if (mCanceled || mEncodingComplete) {
   1.261 +      return NS_ERROR_FAILURE;
   1.262 +    }
   1.263 +  }
   1.264 +
   1.265 +  // The calculations below rely on mInitialized being true.
   1.266 +  MOZ_ASSERT(mInitialized);
   1.267 +
   1.268 +  // Resampled frames left over from last time that didn't fit into a packet.
   1.269 +  const int framesLeft = mResampledLeftover.Length() / mChannels;
   1.270 +  // When framesLeft is 0, framesToFetch is GetPacketDuration() scaled to the
   1.271 +  // input rate by integer division. When framesLeft > 0, add 1 to
   1.272 +  // framesToFetch so that truncation in that division cannot leave us short
   1.273 +  // of a full packet after resampling.
   1.274 +  const int frameRoundUp = framesLeft ? 1 : 0;
   1.275 +
   1.276 +  MOZ_ASSERT(GetPacketDuration() >= framesLeft);
   1.277 +  // Try to fetch m input frames that resample to n frames such that
   1.278 +  // (n + framesLeft) >= GetPacketDuration().
   1.279 +  const int framesToFetch = !mResampler ? GetPacketDuration()
   1.280 +    : (GetPacketDuration() - framesLeft) * mSamplingRate / kOpusSamplingRate
   1.281 +      + frameRoundUp;
   1.282 +  {
   1.283 +    // Move all the samples from mRawSegment to mSourceSegment. We only hold
   1.284 +    // the monitor in this block.
   1.285 +    ReentrantMonitorAutoEnter mon(mReentrantMonitor);
   1.286 +    // Wait until enough raw data, end of stream or cancelled.
   1.287 +    while (!mCanceled && mRawSegment.GetDuration() +
   1.288 +        mSourceSegment.GetDuration() < framesToFetch &&
   1.289 +        !mEndOfStream) {
   1.290 +      mReentrantMonitor.Wait();
   1.291 +    }
   1.292 +    if (mCanceled || mEncodingComplete) {
   1.293 +      return NS_ERROR_FAILURE;
   1.294 +    }
   1.295 +
   1.296 +    mSourceSegment.AppendFrom(&mRawSegment);
   1.297 +
   1.298 +    // Pad |mLookahead| samples to the end of source stream to prevent lost of
   1.299 +    // original data, the pcm duration will be calculated at rate 48K later.
   1.300 +    if (mEndOfStream && !mEosSetInEncoder) {
   1.301 +      mEosSetInEncoder = true;
   1.302 +      mSourceSegment.AppendNullData(mLookahead);
   1.303 +    }
   1.304 +  }
   1.305 +
   1.306 +  // Start encoding data. When resampling from a rate above 48kHz, framesToFetch
   1.307 +  // exceeds GetPacketDuration(), so size pcm for whichever is larger.
   1.308 +  nsAutoTArray<AudioDataValue, 9600> pcm;
   1.309 +  pcm.SetLength(std::max(GetPacketDuration(), framesToFetch) * mChannels);
   1.310 +  AudioSegment::ChunkIterator iter(mSourceSegment);
   1.311 +  int frameCopied = 0;
   1.312 +
   1.313 +  while (!iter.IsEnded() && frameCopied < framesToFetch) {
   1.314 +    AudioChunk chunk = *iter;
   1.315 +
   1.316 +    // Chunk to the required frame size.
   1.317 +    int frameToCopy = chunk.GetDuration();
   1.318 +    if (frameCopied + frameToCopy > framesToFetch) {
   1.319 +      frameToCopy = framesToFetch - frameCopied;
   1.320 +    }
   1.321 +
   1.322 +    if (!chunk.IsNull()) {
   1.323 +      // Append the interleaved data to the end of pcm buffer.
   1.324 +      AudioTrackEncoder::InterleaveTrackData(chunk, frameToCopy, mChannels,
   1.325 +        pcm.Elements() + frameCopied * mChannels);
   1.326 +    } else {
   1.327 +      memset(pcm.Elements() + frameCopied * mChannels, 0,
   1.328 +             frameToCopy * mChannels * sizeof(AudioDataValue));
   1.329 +    }
   1.330 +
   1.331 +    frameCopied += frameToCopy;
   1.332 +    iter.Next();
   1.333 +  }
   1.334 +
   1.335 +  nsRefPtr<EncodedFrame> audiodata = new EncodedFrame();
   1.336 +  audiodata->SetFrameType(EncodedFrame::OPUS_AUDIO_FRAME);
   1.337 +  int framesInPCM = frameCopied;
   1.338 +  if (mResampler) {
   1.339 +    nsAutoTArray<AudioDataValue, 9600> resamplingDest;
   1.340 +    // We want to consume all the input data, so we slightly oversize the
   1.341 +    // resampled data buffer so we can fit the output data in. We cannot really
   1.342 +    // predict the output frame count at each call.
   1.343 +    uint32_t outframes = frameCopied * kOpusSamplingRate / mSamplingRate + 1;
   1.344 +    uint32_t inframes = frameCopied;
   1.345 +    resamplingDest.SetLength(outframes * mChannels);
   1.346 +
   1.347 +#ifdef MOZ_SAMPLE_TYPE_S16
   1.348 +    short* in = reinterpret_cast<short*>(pcm.Elements());
   1.349 +    short* out = reinterpret_cast<short*>(resamplingDest.Elements());
   1.350 +    speex_resampler_process_interleaved_int(mResampler, in, &inframes,
   1.351 +                                                        out, &outframes);
   1.352 +#else
   1.353 +    float* in = reinterpret_cast<float*>(pcm.Elements());
   1.354 +    float* out = reinterpret_cast<float*>(resamplingDest.Elements());
   1.355 +    speex_resampler_process_interleaved_float(mResampler, in, &inframes,
   1.356 +                                                          out, &outframes);
   1.357 +#endif
   1.358 +
   1.359 +    MOZ_ASSERT(pcm.Length() >= mResampledLeftover.Length());
   1.360 +    PodCopy(pcm.Elements(), mResampledLeftover.Elements(),
   1.361 +        mResampledLeftover.Length());
   1.362 +
   1.363 +    uint32_t outframesToCopy = std::min(outframes,
   1.364 +        static_cast<uint32_t>(GetPacketDuration() - framesLeft));
   1.365 +
   1.366 +    MOZ_ASSERT(pcm.Length() - mResampledLeftover.Length() >=
   1.367 +        outframesToCopy * mChannels);
   1.368 +    PodCopy(pcm.Elements() + mResampledLeftover.Length(),
   1.369 +        resamplingDest.Elements(), outframesToCopy * mChannels);
   1.370 +    int frameLeftover = outframes - outframesToCopy;
   1.371 +    mResampledLeftover.SetLength(frameLeftover * mChannels);
   1.372 +    PodCopy(mResampledLeftover.Elements(),
   1.373 +        resamplingDest.Elements() + outframesToCopy * mChannels,
   1.374 +        mResampledLeftover.Length());
   1.375 +    // This is always at 48000Hz.
   1.376 +    framesInPCM = framesLeft + outframesToCopy;
   1.377 +    audiodata->SetDuration(framesInPCM);
   1.378 +  } else {
   1.379 +    // The ogg time stamping and pre-skip is always timed at 48000.
   1.380 +    audiodata->SetDuration(frameCopied * (kOpusSamplingRate / mSamplingRate));
   1.381 +  }
   1.382 +
   1.383 +  // Remove the raw data which has been pulled to pcm buffer. frameCopied is
   1.384 +  // framesToFetch, or less if the source segment ran out first.
   1.385 +  mSourceSegment.RemoveLeading(frameCopied);
   1.386 +
   1.387 +  // Has reached the end of input stream and all queued data has pulled for
   1.388 +  // encoding.
   1.389 +  if (mSourceSegment.GetDuration() == 0 && mEndOfStream) {
   1.390 +    mEncodingComplete = true;
   1.391 +    LOG("[Opus] Done encoding.");
   1.392 +  }
   1.393 +
   1.394 +  MOZ_ASSERT(mEndOfStream || framesInPCM == GetPacketDuration());
   1.395 +
   1.396 +  // Append null data to pcm buffer if the leftover data is not enough for
   1.397 +  // opus encoder.
   1.398 +  if (framesInPCM < GetPacketDuration() && mEndOfStream) {
   1.399 +    PodZero(pcm.Elements() + framesInPCM * mChannels,
   1.400 +        (GetPacketDuration() - framesInPCM) * mChannels);
   1.401 +  }
   1.402 +  nsTArray<uint8_t> frameData;
   1.403 +  // Encode the data with Opus Encoder.
   1.404 +  frameData.SetLength(MAX_DATA_BYTES);
   1.405 +  // result is returned as opus error code if it is negative.
   1.406 +  int result = 0;
   1.407 +#ifdef MOZ_SAMPLE_TYPE_S16
   1.408 +  const opus_int16* pcmBuf = static_cast<opus_int16*>(pcm.Elements());
   1.409 +  result = opus_encode(mEncoder, pcmBuf, GetPacketDuration(),
   1.410 +                       frameData.Elements(), MAX_DATA_BYTES);
   1.411 +#else
   1.412 +  const float* pcmBuf = static_cast<float*>(pcm.Elements());
   1.413 +  result = opus_encode_float(mEncoder, pcmBuf, GetPacketDuration(),
   1.414 +                             frameData.Elements(), MAX_DATA_BYTES);
   1.415 +#endif
   1.416 +  frameData.SetLength(result >= 0 ? result : 0);
   1.417 +  if (result < 0) {
   1.418 +    LOG("[Opus] Fail to encode data! Result: %s.", opus_strerror(result));
   1.419 +  }
   1.420 +  if (mEncodingComplete) {
   1.421 +    if (mResampler) {
   1.422 +      speex_resampler_destroy(mResampler);
   1.423 +      mResampler = nullptr;
   1.424 +    }
   1.425 +    mResampledLeftover.SetLength(0);
   1.426 +  }
   1.427 +
   1.428 +  audiodata->SwapInFrameData(frameData);
   1.429 +  aData.AppendEncodedFrame(audiodata);
   1.430 +  return result >= 0 ? NS_OK : NS_ERROR_FAILURE;
   1.431 +}
   1.432 +
   1.433 +}

mercurial