content/media/webspeech/recognition/energy_endpointer.cc

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
     2 //
     3 // Redistribution and use in source and binary forms, with or without
     4 // modification, are permitted provided that the following conditions are
     5 // met:
     6 //
     7 //    * Redistributions of source code must retain the above copyright
     8 // notice, this list of conditions and the following disclaimer.
     9 //    * Redistributions in binary form must reproduce the above
    10 // copyright notice, this list of conditions and the following disclaimer
    11 // in the documentation and/or other materials provided with the
    12 // distribution.
    13 //    * Neither the name of Google Inc. nor the names of its
    14 // contributors may be used to endorse or promote products derived from
    15 // this software without specific prior written permission.
    16 //
    17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    29 #include "energy_endpointer.h"
    31 #include <math.h>
    33 namespace {
    35 // Returns the RMS (quadratic mean) of the input signal.
    36 float RMS(const int16_t* samples, int num_samples) {
    37   int64_t ssq_int64_t = 0;
    38   int64_t sum_int64_t = 0;
    39   for (int i = 0; i < num_samples; ++i) {
    40     sum_int64_t += samples[i];
    41     ssq_int64_t += samples[i] * samples[i];
    42   }
    43   // now convert to floats.
    44   double sum = static_cast<double>(sum_int64_t);
    45   sum /= num_samples;
    46   double ssq = static_cast<double>(ssq_int64_t);
    47   return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum)));
    48 }
    50 int64_t Secs2Usecs(float seconds) {
    51   return static_cast<int64_t>(0.5 + (1.0e6 * seconds));
    52 }
    54 float GetDecibel(float value) {
    55   if (value > 1.0e-100)
    56     return 20 * log10(value);
    57   return -2000.0;
    58 }
    60 }  // namespace
    62 namespace mozilla {
    64 // Stores threshold-crossing histories for making decisions about the speech
    65 // state.
    66 class EnergyEndpointer::HistoryRing {
    67  public:
    68   HistoryRing() : insertion_index_(0) {}
    70   // Resets the ring to |size| elements each with state |initial_state|
    71   void SetRing(int size, bool initial_state);
    73   // Inserts a new entry into the ring and drops the oldest entry.
    74   void Insert(int64_t time_us, bool decision);
    76   // Returns the time in microseconds of the most recently added entry.
    77   int64_t EndTime() const;
    79   // Returns the sum of all intervals during which 'decision' is true within
    80   // the time in seconds specified by 'duration'. The returned interval is
    81   // in seconds.
    82   float RingSum(float duration_sec);
    84  private:
    85   struct DecisionPoint {
    86     int64_t time_us;
    87     bool decision;
    88   };
    90   std::vector<DecisionPoint> decision_points_;
    91   int insertion_index_;  // Index at which the next item gets added/inserted.
    93   HistoryRing(const HistoryRing&);
    94   void operator=(const HistoryRing&);
    95 };
    97 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
    98   insertion_index_ = 0;
    99   decision_points_.clear();
   100   DecisionPoint init = { -1, initial_state };
   101   decision_points_.resize(size, init);
   102 }
   104 void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {
   105   decision_points_[insertion_index_].time_us = time_us;
   106   decision_points_[insertion_index_].decision = decision;
   107   insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
   108 }
   110 int64_t EnergyEndpointer::HistoryRing::EndTime() const {
   111   int ind = insertion_index_ - 1;
   112   if (ind < 0)
   113     ind = decision_points_.size() - 1;
   114   return decision_points_[ind].time_us;
   115 }
   117 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
   118   if (!decision_points_.size())
   119     return 0.0;
   121   int64_t sum_us = 0;
   122   int ind = insertion_index_ - 1;
   123   if (ind < 0)
   124     ind = decision_points_.size() - 1;
   125   int64_t end_us = decision_points_[ind].time_us;
   126   bool is_on = decision_points_[ind].decision;
   127   int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));
   128   if (start_us < 0)
   129     start_us = 0;
   130   size_t n_summed = 1;  // n points ==> (n-1) intervals
   131   while ((decision_points_[ind].time_us > start_us) &&
   132          (n_summed < decision_points_.size())) {
   133     --ind;
   134     if (ind < 0)
   135       ind = decision_points_.size() - 1;
   136     if (is_on)
   137       sum_us += end_us - decision_points_[ind].time_us;
   138     is_on = decision_points_[ind].decision;
   139     end_us = decision_points_[ind].time_us;
   140     n_summed++;
   141   }
   143   return 1.0e-6f * sum_us;  //  Returns total time that was super threshold.
   144 }
   146 EnergyEndpointer::EnergyEndpointer()
   147     : status_(EP_PRE_SPEECH),
   148       offset_confirm_dur_sec_(0),
   149       endpointer_time_us_(0),
   150       fast_update_frames_(0),
   151       frame_counter_(0),
   152       max_window_dur_(4.0),
   153       sample_rate_(0),
   154       history_(new HistoryRing()),
   155       decision_threshold_(0),
   156       estimating_environment_(false),
   157       noise_level_(0),
   158       rms_adapt_(0),
   159       start_lag_(0),
   160       end_lag_(0),
   161       user_input_start_time_us_(0) {
   162 }
   164 EnergyEndpointer::~EnergyEndpointer() {
   165 }
   167 int EnergyEndpointer::TimeToFrame(float time) const {
   168   return static_cast<int32_t>(0.5 + (time / params_.frame_period()));
   169 }
   171 void EnergyEndpointer::Restart(bool reset_threshold) {
   172   status_ = EP_PRE_SPEECH;
   173   user_input_start_time_us_ = 0;
   175   if (reset_threshold) {
   176     decision_threshold_ = params_.decision_threshold();
   177     rms_adapt_ = decision_threshold_;
   178     noise_level_ = params_.decision_threshold() / 2.0f;
   179     frame_counter_ = 0;  // Used for rapid initial update of levels.
   180   }
   182   // Set up the memories to hold the history windows.
   183   history_->SetRing(TimeToFrame(max_window_dur_), false);
   185   // Flag that indicates that current input should be used for
   186   // estimating the environment. The user has not yet started input
   187   // by e.g. pressed the push-to-talk button. By default, this is
   188   // false for backward compatibility.
   189   estimating_environment_ = false;
   190 }
   192 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
   193   params_ = params;
   195   // Find the longest history interval to be used, and make the ring
   196   // large enough to accommodate that number of frames.  NOTE: This
   197   // depends upon ep_frame_period being set correctly in the factory
   198   // that did this instantiation.
   199   max_window_dur_ = params_.onset_window();
   200   if (params_.speech_on_window() > max_window_dur_)
   201     max_window_dur_ = params_.speech_on_window();
   202   if (params_.offset_window() > max_window_dur_)
   203     max_window_dur_ = params_.offset_window();
   204   Restart(true);
   206   offset_confirm_dur_sec_ = params_.offset_window() -
   207                             params_.offset_confirm_dur();
   208   if (offset_confirm_dur_sec_ < 0.0)
   209     offset_confirm_dur_sec_ = 0.0;
   211   user_input_start_time_us_ = 0;
   213   // Flag that indicates that  current input should be used for
   214   // estimating the environment. The user has not yet started input
   215   // by e.g. pressed the push-to-talk button. By default, this is
   216   // false for backward compatibility.
   217   estimating_environment_ = false;
   218   // The initial value of the noise and speech levels is inconsequential.
   219   // The level of the first frame will overwrite these values.
   220   noise_level_ = params_.decision_threshold() / 2.0f;
   221   fast_update_frames_ =
   222       static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());
   224   frame_counter_ = 0;  // Used for rapid initial update of levels.
   226   sample_rate_ = params_.sample_rate();
   227   start_lag_ = static_cast<int>(sample_rate_ /
   228                                 params_.max_fundamental_frequency());
   229   end_lag_ = static_cast<int>(sample_rate_ /
   230                               params_.min_fundamental_frequency());
   231 }
   233 void EnergyEndpointer::StartSession() {
   234   Restart(true);
   235 }
   237 void EnergyEndpointer::EndSession() {
   238   status_ = EP_POST_SPEECH;
   239 }
   241 void EnergyEndpointer::SetEnvironmentEstimationMode() {
   242   Restart(true);
   243   estimating_environment_ = true;
   244 }
   246 void EnergyEndpointer::SetUserInputMode() {
   247   estimating_environment_ = false;
   248   user_input_start_time_us_ = endpointer_time_us_;
   249 }
   251 void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
   252                                          const int16_t* samples,
   253                                          int num_samples,
   254                                          float* rms_out) {
   255   endpointer_time_us_ = time_us;
   256   float rms = RMS(samples, num_samples);
   258   // Check that this is user input audio vs. pre-input adaptation audio.
   259   // Input audio starts when the user indicates start of input, by e.g.
   260   // pressing push-to-talk. Audio recieved prior to that is used to update
   261   // noise and speech level estimates.
   262   if (!estimating_environment_) {
   263     bool decision = false;
   264     if ((endpointer_time_us_ - user_input_start_time_us_) <
   265         Secs2Usecs(params_.contamination_rejection_period())) {
   266       decision = false;
   267       //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
   268     } else {
   269       decision = (rms > decision_threshold_);
   270     }
   272     history_->Insert(endpointer_time_us_, decision);
   274     switch (status_) {
   275       case EP_PRE_SPEECH:
   276         if (history_->RingSum(params_.onset_window()) >
   277             params_.onset_detect_dur()) {
   278           status_ = EP_POSSIBLE_ONSET;
   279         }
   280         break;
   282       case EP_POSSIBLE_ONSET: {
   283         float tsum = history_->RingSum(params_.onset_window());
   284         if (tsum > params_.onset_confirm_dur()) {
   285           status_ = EP_SPEECH_PRESENT;
   286         } else {  // If signal is not maintained, drop back to pre-speech.
   287           if (tsum <= params_.onset_detect_dur())
   288             status_ = EP_PRE_SPEECH;
   289         }
   290         break;
   291       }
   293       case EP_SPEECH_PRESENT: {
   294         // To induce hysteresis in the state residency, we allow a
   295         // smaller residency time in the on_ring, than was required to
   296         // enter the SPEECH_PERSENT state.
   297         float on_time = history_->RingSum(params_.speech_on_window());
   298         if (on_time < params_.on_maintain_dur())
   299           status_ = EP_POSSIBLE_OFFSET;
   300         break;
   301       }
   303       case EP_POSSIBLE_OFFSET:
   304         if (history_->RingSum(params_.offset_window()) <=
   305             offset_confirm_dur_sec_) {
   306           // Note that this offset time may be beyond the end
   307           // of the input buffer in a real-time system.  It will be up
   308           // to the RecognizerSession to decide what to do.
   309           status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
   310         } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
   311           if (history_->RingSum(params_.speech_on_window()) >=
   312               params_.on_maintain_dur())
   313             status_ = EP_SPEECH_PRESENT;
   314         }
   315         break;
   317       default:
   318         break;
   319     }
   321     // If this is a quiet, non-speech region, slowly adapt the detection
   322     // threshold to be about 6dB above the average RMS.
   323     if ((!decision) && (status_ == EP_PRE_SPEECH)) {
   324       decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
   325       rms_adapt_ = decision_threshold_;
   326     } else {
   327       // If this is in a speech region, adapt the decision threshold to
   328       // be about 10dB below the average RMS. If the noise level is high,
   329       // the threshold is pushed up.
   330       // Adaptation up to a higher level is 5 times faster than decay to
   331       // a lower level.
   332       if ((status_ == EP_SPEECH_PRESENT) && decision) {
   333         if (rms_adapt_ > rms) {
   334           rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
   335         } else {
   336           rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
   337         }
   338         float target_threshold = 0.3f * rms_adapt_ +  noise_level_;
   339         decision_threshold_ = (.90f * decision_threshold_) +
   340                               (0.10f * target_threshold);
   341       }
   342     }
   344     // Set a floor
   345     if (decision_threshold_ < params_.min_decision_threshold())
   346       decision_threshold_ = params_.min_decision_threshold();
   347   }
   349   // Update speech and noise levels.
   350   UpdateLevels(rms);
   351   ++frame_counter_;
   353   if (rms_out)
   354     *rms_out = GetDecibel(rms);
   355 }
   357 float EnergyEndpointer::GetNoiseLevelDb() const {
   358   return GetDecibel(noise_level_);
   359 }
   361 void EnergyEndpointer::UpdateLevels(float rms) {
   362   // Update quickly initially. We assume this is noise and that
   363   // speech is 6dB above the noise.
   364   if (frame_counter_ < fast_update_frames_) {
   365     // Alpha increases from 0 to (k-1)/k where k is the number of time
   366     // steps in the initial adaptation period.
   367     float alpha = static_cast<float>(frame_counter_) /
   368         static_cast<float>(fast_update_frames_);
   369     noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
   370     //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
   371   } else {
   372     // Update Noise level. The noise level adapts quickly downward, but
   373     // slowly upward. The noise_level_ parameter is not currently used
   374     // for threshold adaptation. It is used for UI feedback.
   375     if (noise_level_ < rms)
   376       noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
   377     else
   378       noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
   379   }
   380   if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
   381     decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
   382     // Set a floor
   383     if (decision_threshold_ < params_.min_decision_threshold())
   384       decision_threshold_ = params_.min_decision_threshold();
   385   }
   386 }
   388 EpStatus EnergyEndpointer::Status(int64_t* status_time)  const {
   389   *status_time = history_->EndTime();
   390   return status_;
   391 }
   393 }  // namespace mozilla

mercurial