content/media/webspeech/recognition/energy_endpointer.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/content/media/webspeech/recognition/energy_endpointer.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,393 @@
     1.4 +// Copyright (c) 2013 The Chromium Authors. All rights reserved.
     1.5 +//
     1.6 +// Redistribution and use in source and binary forms, with or without
     1.7 +// modification, are permitted provided that the following conditions are
     1.8 +// met:
     1.9 +//
    1.10 +//    * Redistributions of source code must retain the above copyright
    1.11 +// notice, this list of conditions and the following disclaimer.
    1.12 +//    * Redistributions in binary form must reproduce the above
    1.13 +// copyright notice, this list of conditions and the following disclaimer
    1.14 +// in the documentation and/or other materials provided with the
    1.15 +// distribution.
    1.16 +//    * Neither the name of Google Inc. nor the names of its
    1.17 +// contributors may be used to endorse or promote products derived from
    1.18 +// this software without specific prior written permission.
    1.19 +//
    1.20 +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    1.21 +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    1.22 +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    1.23 +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    1.24 +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    1.25 +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    1.26 +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    1.27 +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    1.28 +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    1.29 +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    1.30 +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.31 +
    1.32 +#include "energy_endpointer.h"
    1.33 +
    1.34 +#include <math.h>
    1.35 +
    1.36 +namespace {
    1.37 +
    1.38 +// Returns the RMS (quadratic mean) of the input signal.
    1.39 +float RMS(const int16_t* samples, int num_samples) {
    1.40 +  int64_t ssq_int64_t = 0;
    1.41 +  int64_t sum_int64_t = 0;
    1.42 +  for (int i = 0; i < num_samples; ++i) {
    1.43 +    sum_int64_t += samples[i];
    1.44 +    ssq_int64_t += samples[i] * samples[i];
    1.45 +  }
    1.46 +  // now convert to floats.
    1.47 +  double sum = static_cast<double>(sum_int64_t);
    1.48 +  sum /= num_samples;
    1.49 +  double ssq = static_cast<double>(ssq_int64_t);
    1.50 +  return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum)));
    1.51 +}
    1.52 +
    1.53 +int64_t Secs2Usecs(float seconds) {
    1.54 +  return static_cast<int64_t>(0.5 + (1.0e6 * seconds));
    1.55 +}
    1.56 +
    1.57 +float GetDecibel(float value) {
    1.58 +  if (value > 1.0e-100)
    1.59 +    return 20 * log10(value);
    1.60 +  return -2000.0;
    1.61 +}
    1.62 +
    1.63 +}  // namespace
    1.64 +
    1.65 +namespace mozilla {
    1.66 +
    1.67 +// Stores threshold-crossing histories for making decisions about the speech
    1.68 +// state.
    1.69 +class EnergyEndpointer::HistoryRing {
    1.70 + public:
    1.71 +  HistoryRing() : insertion_index_(0) {}
    1.72 +
    1.73 +  // Resets the ring to |size| elements each with state |initial_state|
    1.74 +  void SetRing(int size, bool initial_state);
    1.75 +
    1.76 +  // Inserts a new entry into the ring and drops the oldest entry.
    1.77 +  void Insert(int64_t time_us, bool decision);
    1.78 +
    1.79 +  // Returns the time in microseconds of the most recently added entry.
    1.80 +  int64_t EndTime() const;
    1.81 +
    1.82 +  // Returns the sum of all intervals during which 'decision' is true within
    1.83 +  // the time in seconds specified by 'duration'. The returned interval is
    1.84 +  // in seconds.
    1.85 +  float RingSum(float duration_sec);
    1.86 +
    1.87 + private:
    1.88 +  struct DecisionPoint {
    1.89 +    int64_t time_us;
    1.90 +    bool decision;
    1.91 +  };
    1.92 +
    1.93 +  std::vector<DecisionPoint> decision_points_;
    1.94 +  int insertion_index_;  // Index at which the next item gets added/inserted.
    1.95 +
    1.96 +  HistoryRing(const HistoryRing&);
    1.97 +  void operator=(const HistoryRing&);
    1.98 +};
    1.99 +
   1.100 +void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
   1.101 +  insertion_index_ = 0;
   1.102 +  decision_points_.clear();
   1.103 +  DecisionPoint init = { -1, initial_state };
   1.104 +  decision_points_.resize(size, init);
   1.105 +}
   1.106 +
   1.107 +void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {
   1.108 +  decision_points_[insertion_index_].time_us = time_us;
   1.109 +  decision_points_[insertion_index_].decision = decision;
   1.110 +  insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
   1.111 +}
   1.112 +
   1.113 +int64_t EnergyEndpointer::HistoryRing::EndTime() const {
   1.114 +  int ind = insertion_index_ - 1;
   1.115 +  if (ind < 0)
   1.116 +    ind = decision_points_.size() - 1;
   1.117 +  return decision_points_[ind].time_us;
   1.118 +}
   1.119 +
   1.120 +float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
   1.121 +  if (!decision_points_.size())
   1.122 +    return 0.0;
   1.123 +
   1.124 +  int64_t sum_us = 0;
   1.125 +  int ind = insertion_index_ - 1;
   1.126 +  if (ind < 0)
   1.127 +    ind = decision_points_.size() - 1;
   1.128 +  int64_t end_us = decision_points_[ind].time_us;
   1.129 +  bool is_on = decision_points_[ind].decision;
   1.130 +  int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));
   1.131 +  if (start_us < 0)
   1.132 +    start_us = 0;
   1.133 +  size_t n_summed = 1;  // n points ==> (n-1) intervals
   1.134 +  while ((decision_points_[ind].time_us > start_us) &&
   1.135 +         (n_summed < decision_points_.size())) {
   1.136 +    --ind;
   1.137 +    if (ind < 0)
   1.138 +      ind = decision_points_.size() - 1;
   1.139 +    if (is_on)
   1.140 +      sum_us += end_us - decision_points_[ind].time_us;
   1.141 +    is_on = decision_points_[ind].decision;
   1.142 +    end_us = decision_points_[ind].time_us;
   1.143 +    n_summed++;
   1.144 +  }
   1.145 +
   1.146 +  return 1.0e-6f * sum_us;  //  Returns total time that was super threshold.
   1.147 +}
   1.148 +
   1.149 +EnergyEndpointer::EnergyEndpointer()
   1.150 +    : status_(EP_PRE_SPEECH),
   1.151 +      offset_confirm_dur_sec_(0),
   1.152 +      endpointer_time_us_(0),
   1.153 +      fast_update_frames_(0),
   1.154 +      frame_counter_(0),
   1.155 +      max_window_dur_(4.0),
   1.156 +      sample_rate_(0),
   1.157 +      history_(new HistoryRing()),
   1.158 +      decision_threshold_(0),
   1.159 +      estimating_environment_(false),
   1.160 +      noise_level_(0),
   1.161 +      rms_adapt_(0),
   1.162 +      start_lag_(0),
   1.163 +      end_lag_(0),
   1.164 +      user_input_start_time_us_(0) {
   1.165 +}
   1.166 +
   1.167 +EnergyEndpointer::~EnergyEndpointer() {
   1.168 +}
   1.169 +
   1.170 +int EnergyEndpointer::TimeToFrame(float time) const {
   1.171 +  return static_cast<int32_t>(0.5 + (time / params_.frame_period()));
   1.172 +}
   1.173 +
   1.174 +void EnergyEndpointer::Restart(bool reset_threshold) {
   1.175 +  status_ = EP_PRE_SPEECH;
   1.176 +  user_input_start_time_us_ = 0;
   1.177 +
   1.178 +  if (reset_threshold) {
   1.179 +    decision_threshold_ = params_.decision_threshold();
   1.180 +    rms_adapt_ = decision_threshold_;
   1.181 +    noise_level_ = params_.decision_threshold() / 2.0f;
   1.182 +    frame_counter_ = 0;  // Used for rapid initial update of levels.
   1.183 +  }
   1.184 +
   1.185 +  // Set up the memories to hold the history windows.
   1.186 +  history_->SetRing(TimeToFrame(max_window_dur_), false);
   1.187 +
   1.188 +  // Flag that indicates that current input should be used for
   1.189 +  // estimating the environment. The user has not yet started input
   1.190 +  // by e.g. pressed the push-to-talk button. By default, this is
   1.191 +  // false for backward compatibility.
   1.192 +  estimating_environment_ = false;
   1.193 +}
   1.194 +
   1.195 +void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
   1.196 +  params_ = params;
   1.197 +
   1.198 +  // Find the longest history interval to be used, and make the ring
   1.199 +  // large enough to accommodate that number of frames.  NOTE: This
   1.200 +  // depends upon ep_frame_period being set correctly in the factory
   1.201 +  // that did this instantiation.
   1.202 +  max_window_dur_ = params_.onset_window();
   1.203 +  if (params_.speech_on_window() > max_window_dur_)
   1.204 +    max_window_dur_ = params_.speech_on_window();
   1.205 +  if (params_.offset_window() > max_window_dur_)
   1.206 +    max_window_dur_ = params_.offset_window();
   1.207 +  Restart(true);
   1.208 +
   1.209 +  offset_confirm_dur_sec_ = params_.offset_window() -
   1.210 +                            params_.offset_confirm_dur();
   1.211 +  if (offset_confirm_dur_sec_ < 0.0)
   1.212 +    offset_confirm_dur_sec_ = 0.0;
   1.213 +
   1.214 +  user_input_start_time_us_ = 0;
   1.215 +
   1.216 +  // Flag that indicates that  current input should be used for
   1.217 +  // estimating the environment. The user has not yet started input
   1.218 +  // by e.g. pressed the push-to-talk button. By default, this is
   1.219 +  // false for backward compatibility.
   1.220 +  estimating_environment_ = false;
   1.221 +  // The initial value of the noise and speech levels is inconsequential.
   1.222 +  // The level of the first frame will overwrite these values.
   1.223 +  noise_level_ = params_.decision_threshold() / 2.0f;
   1.224 +  fast_update_frames_ =
   1.225 +      static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());
   1.226 +
   1.227 +  frame_counter_ = 0;  // Used for rapid initial update of levels.
   1.228 +
   1.229 +  sample_rate_ = params_.sample_rate();
   1.230 +  start_lag_ = static_cast<int>(sample_rate_ /
   1.231 +                                params_.max_fundamental_frequency());
   1.232 +  end_lag_ = static_cast<int>(sample_rate_ /
   1.233 +                              params_.min_fundamental_frequency());
   1.234 +}
   1.235 +
   1.236 +void EnergyEndpointer::StartSession() {
   1.237 +  Restart(true);
   1.238 +}
   1.239 +
   1.240 +void EnergyEndpointer::EndSession() {
   1.241 +  status_ = EP_POST_SPEECH;
   1.242 +}
   1.243 +
   1.244 +void EnergyEndpointer::SetEnvironmentEstimationMode() {
   1.245 +  Restart(true);
   1.246 +  estimating_environment_ = true;
   1.247 +}
   1.248 +
   1.249 +void EnergyEndpointer::SetUserInputMode() {
   1.250 +  estimating_environment_ = false;
   1.251 +  user_input_start_time_us_ = endpointer_time_us_;
   1.252 +}
   1.253 +
   1.254 +void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
   1.255 +                                         const int16_t* samples,
   1.256 +                                         int num_samples,
   1.257 +                                         float* rms_out) {
   1.258 +  endpointer_time_us_ = time_us;
   1.259 +  float rms = RMS(samples, num_samples);
   1.260 +
   1.261 +  // Check that this is user input audio vs. pre-input adaptation audio.
   1.262 +  // Input audio starts when the user indicates start of input, by e.g.
   1.263 +  // pressing push-to-talk. Audio recieved prior to that is used to update
   1.264 +  // noise and speech level estimates.
   1.265 +  if (!estimating_environment_) {
   1.266 +    bool decision = false;
   1.267 +    if ((endpointer_time_us_ - user_input_start_time_us_) <
   1.268 +        Secs2Usecs(params_.contamination_rejection_period())) {
   1.269 +      decision = false;
   1.270 +      //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
   1.271 +    } else {
   1.272 +      decision = (rms > decision_threshold_);
   1.273 +    }
   1.274 +
   1.275 +    history_->Insert(endpointer_time_us_, decision);
   1.276 +
   1.277 +    switch (status_) {
   1.278 +      case EP_PRE_SPEECH:
   1.279 +        if (history_->RingSum(params_.onset_window()) >
   1.280 +            params_.onset_detect_dur()) {
   1.281 +          status_ = EP_POSSIBLE_ONSET;
   1.282 +        }
   1.283 +        break;
   1.284 +
   1.285 +      case EP_POSSIBLE_ONSET: {
   1.286 +        float tsum = history_->RingSum(params_.onset_window());
   1.287 +        if (tsum > params_.onset_confirm_dur()) {
   1.288 +          status_ = EP_SPEECH_PRESENT;
   1.289 +        } else {  // If signal is not maintained, drop back to pre-speech.
   1.290 +          if (tsum <= params_.onset_detect_dur())
   1.291 +            status_ = EP_PRE_SPEECH;
   1.292 +        }
   1.293 +        break;
   1.294 +      }
   1.295 +
   1.296 +      case EP_SPEECH_PRESENT: {
   1.297 +        // To induce hysteresis in the state residency, we allow a
   1.298 +        // smaller residency time in the on_ring, than was required to
   1.299 +        // enter the SPEECH_PERSENT state.
   1.300 +        float on_time = history_->RingSum(params_.speech_on_window());
   1.301 +        if (on_time < params_.on_maintain_dur())
   1.302 +          status_ = EP_POSSIBLE_OFFSET;
   1.303 +        break;
   1.304 +      }
   1.305 +
   1.306 +      case EP_POSSIBLE_OFFSET:
   1.307 +        if (history_->RingSum(params_.offset_window()) <=
   1.308 +            offset_confirm_dur_sec_) {
   1.309 +          // Note that this offset time may be beyond the end
   1.310 +          // of the input buffer in a real-time system.  It will be up
   1.311 +          // to the RecognizerSession to decide what to do.
   1.312 +          status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
   1.313 +        } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
   1.314 +          if (history_->RingSum(params_.speech_on_window()) >=
   1.315 +              params_.on_maintain_dur())
   1.316 +            status_ = EP_SPEECH_PRESENT;
   1.317 +        }
   1.318 +        break;
   1.319 +
   1.320 +      default:
   1.321 +        break;
   1.322 +    }
   1.323 +
   1.324 +    // If this is a quiet, non-speech region, slowly adapt the detection
   1.325 +    // threshold to be about 6dB above the average RMS.
   1.326 +    if ((!decision) && (status_ == EP_PRE_SPEECH)) {
   1.327 +      decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
   1.328 +      rms_adapt_ = decision_threshold_;
   1.329 +    } else {
   1.330 +      // If this is in a speech region, adapt the decision threshold to
   1.331 +      // be about 10dB below the average RMS. If the noise level is high,
   1.332 +      // the threshold is pushed up.
   1.333 +      // Adaptation up to a higher level is 5 times faster than decay to
   1.334 +      // a lower level.
   1.335 +      if ((status_ == EP_SPEECH_PRESENT) && decision) {
   1.336 +        if (rms_adapt_ > rms) {
   1.337 +          rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
   1.338 +        } else {
   1.339 +          rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
   1.340 +        }
   1.341 +        float target_threshold = 0.3f * rms_adapt_ +  noise_level_;
   1.342 +        decision_threshold_ = (.90f * decision_threshold_) +
   1.343 +                              (0.10f * target_threshold);
   1.344 +      }
   1.345 +    }
   1.346 +
   1.347 +    // Set a floor
   1.348 +    if (decision_threshold_ < params_.min_decision_threshold())
   1.349 +      decision_threshold_ = params_.min_decision_threshold();
   1.350 +  }
   1.351 +
   1.352 +  // Update speech and noise levels.
   1.353 +  UpdateLevels(rms);
   1.354 +  ++frame_counter_;
   1.355 +
   1.356 +  if (rms_out)
   1.357 +    *rms_out = GetDecibel(rms);
   1.358 +}
   1.359 +
   1.360 +float EnergyEndpointer::GetNoiseLevelDb() const {
   1.361 +  return GetDecibel(noise_level_);
   1.362 +}
   1.363 +
   1.364 +void EnergyEndpointer::UpdateLevels(float rms) {
   1.365 +  // Update quickly initially. We assume this is noise and that
   1.366 +  // speech is 6dB above the noise.
   1.367 +  if (frame_counter_ < fast_update_frames_) {
   1.368 +    // Alpha increases from 0 to (k-1)/k where k is the number of time
   1.369 +    // steps in the initial adaptation period.
   1.370 +    float alpha = static_cast<float>(frame_counter_) /
   1.371 +        static_cast<float>(fast_update_frames_);
   1.372 +    noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
   1.373 +    //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
   1.374 +  } else {
   1.375 +    // Update Noise level. The noise level adapts quickly downward, but
   1.376 +    // slowly upward. The noise_level_ parameter is not currently used
   1.377 +    // for threshold adaptation. It is used for UI feedback.
   1.378 +    if (noise_level_ < rms)
   1.379 +      noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
   1.380 +    else
   1.381 +      noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
   1.382 +  }
   1.383 +  if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
   1.384 +    decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
   1.385 +    // Set a floor
   1.386 +    if (decision_threshold_ < params_.min_decision_threshold())
   1.387 +      decision_threshold_ = params_.min_decision_threshold();
   1.388 +  }
   1.389 +}
   1.390 +
   1.391 +EpStatus EnergyEndpointer::Status(int64_t* status_time)  const {
   1.392 +  *status_time = history_->EndTime();
   1.393 +  return status_;
   1.394 +}
   1.395 +
   1.396 +}  // namespace mozilla

mercurial