The Tor Browser: content/media/webspeech/recognition/energy

Replace accessor implementation with direct member state manipulation, by
request https://trac.torproject.org/projects/tor/ticket/9701#comment:32

     1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.

     2 //

     3 // Redistribution and use in source and binary forms, with or without

     4 // modification, are permitted provided that the following conditions are

     5 // met:

     6 //

     7 //    * Redistributions of source code must retain the above copyright

     8 // notice, this list of conditions and the following disclaimer.

     9 //    * Redistributions in binary form must reproduce the above

    10 // copyright notice, this list of conditions and the following disclaimer

    11 // in the documentation and/or other materials provided with the

    12 // distribution.

    13 //    * Neither the name of Google Inc. nor the names of its

    14 // contributors may be used to endorse or promote products derived from

    15 // this software without specific prior written permission.

    16 //

    17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

    18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

    19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

    20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

    21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

    22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

    23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

    24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

    25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

    26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

    27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

    29 #include "energy_endpointer.h"

    31 #include <math.h>

    33 namespace {

    // Returns the mean-removed RMS of the input signal, i.e. the square root of
    // (mean of squares - square of the mean) over |num_samples| samples.
    // Returns 0 when there is nothing to measure (null |samples| or a
    // non-positive |num_samples|) instead of propagating the NaN that 0/0 would
    // produce.
    float RMS(const int16_t* samples, int num_samples) {
      if (!samples || num_samples <= 0)
        return 0.0f;

      // Accumulate in 64 bits so long frames cannot overflow.
      int64_t sum_int64_t = 0;
      int64_t ssq_int64_t = 0;
      for (int i = 0; i < num_samples; ++i) {
        const int64_t sample = samples[i];
        sum_int64_t += sample;
        ssq_int64_t += sample * sample;
      }

      // Now convert to floating point.
      const double mean = static_cast<double>(sum_int64_t) / num_samples;
      const double variance =
          (static_cast<double>(ssq_int64_t) / num_samples) - (mean * mean);

      // Rounding can push a (near-)constant signal marginally below zero;
      // sqrt() of that would be NaN.
      if (variance <= 0.0)
        return 0.0f;
      return static_cast<float>(sqrt(variance));
    }

    // Converts a duration in seconds to microseconds. 0.5 is added before the
    // truncating cast, which rounds non-negative durations to the nearest
    // microsecond.
    int64_t Secs2Usecs(float seconds) {
      const double usecs = 1.0e6 * seconds;
      return static_cast<int64_t>(0.5 + usecs);
    }

    // Converts a linear amplitude to decibels (20 * log10(value)).
    // Anything that is not above the cut-off (zero, negative values, NaN) maps
    // to a floor of -2000 dB. The cut-off of 1.0e-100 is below the smallest
    // positive IEEE-754 single-precision value, so for a float argument the
    // test amounts to "value is positive".
    float GetDecibel(float value) {
      if (!(value > 1.0e-100))
        return -2000.0;
      return 20 * log10(value);
    }

    60 }  // namespace

    62 namespace mozilla {

    64 // Stores threshold-crossing histories for making decisions about the speech

    65 // state.

    66 class EnergyEndpointer::HistoryRing {

    67  public:

    68   HistoryRing() : insertion_index_(0) {}

    70   // Resets the ring to |size| elements each with state |initial_state|

    71   void SetRing(int size, bool initial_state);

    73   // Inserts a new entry into the ring and drops the oldest entry.

    74   void Insert(int64_t time_us, bool decision);

    76   // Returns the time in microseconds of the most recently added entry.

    77   int64_t EndTime() const;

    79   // Returns the sum of all intervals during which 'decision' is true within

    80   // the time in seconds specified by 'duration'. The returned interval is

    81   // in seconds.

    82   float RingSum(float duration_sec);

    84  private:

    85   struct DecisionPoint {

    86     int64_t time_us;

    87     bool decision;

    88   };

    90   std::vector<DecisionPoint> decision_points_;

    91   int insertion_index_;  // Index at which the next item gets added/inserted.

    93   HistoryRing(const HistoryRing&);

    94   void operator=(const HistoryRing&);

    95 };

    97 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {

    98   insertion_index_ = 0;

    99   decision_points_.clear();

   100   DecisionPoint init = { -1, initial_state };

   101   decision_points_.resize(size, init);

   102 }

   104 void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {

   105   decision_points_[insertion_index_].time_us = time_us;

   106   decision_points_[insertion_index_].decision = decision;

   107   insertion_index_ = (insertion_index_ + 1) % decision_points_.size();

   108 }

   110 int64_t EnergyEndpointer::HistoryRing::EndTime() const {

   111   int ind = insertion_index_ - 1;

   112   if (ind < 0)

   113     ind = decision_points_.size() - 1;

   114   return decision_points_[ind].time_us;

   115 }

   117 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {

   118   if (!decision_points_.size())

   119     return 0.0;

   121   int64_t sum_us = 0;

   122   int ind = insertion_index_ - 1;

   123   if (ind < 0)

   124     ind = decision_points_.size() - 1;

   125   int64_t end_us = decision_points_[ind].time_us;

   126   bool is_on = decision_points_[ind].decision;

   127   int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));

   128   if (start_us < 0)

   129     start_us = 0;

   130   size_t n_summed = 1;  // n points ==> (n-1) intervals

   131   while ((decision_points_[ind].time_us > start_us) &&

   132          (n_summed < decision_points_.size())) {

   133     --ind;

   134     if (ind < 0)

   135       ind = decision_points_.size() - 1;

   136     if (is_on)

   137       sum_us += end_us - decision_points_[ind].time_us;

   138     is_on = decision_points_[ind].decision;

   139     end_us = decision_points_[ind].time_us;

   140     n_summed++;

   141   }

   143   return 1.0e-6f * sum_us;  //  Returns total time that was super threshold.

   144 }

   146 EnergyEndpointer::EnergyEndpointer()

   147     : status_(EP_PRE_SPEECH),

   148       offset_confirm_dur_sec_(0),

   149       endpointer_time_us_(0),

   150       fast_update_frames_(0),

   151       frame_counter_(0),

   152       max_window_dur_(4.0),

   153       sample_rate_(0),

   154       history_(new HistoryRing()),

   155       decision_threshold_(0),

   156       estimating_environment_(false),

   157       noise_level_(0),

   158       rms_adapt_(0),

   159       start_lag_(0),

   160       end_lag_(0),

   161       user_input_start_time_us_(0) {

   162 }

   164 EnergyEndpointer::~EnergyEndpointer() {

   165 }

   167 int EnergyEndpointer::TimeToFrame(float time) const {

   168   return static_cast<int32_t>(0.5 + (time / params_.frame_period()));

   169 }

   171 void EnergyEndpointer::Restart(bool reset_threshold) {

   172   status_ = EP_PRE_SPEECH;

   173   user_input_start_time_us_ = 0;

   175   if (reset_threshold) {

   176     decision_threshold_ = params_.decision_threshold();

   177     rms_adapt_ = decision_threshold_;

   178     noise_level_ = params_.decision_threshold() / 2.0f;

   179     frame_counter_ = 0;  // Used for rapid initial update of levels.

   180   }

   182   // Set up the memories to hold the history windows.

   183   history_->SetRing(TimeToFrame(max_window_dur_), false);

   185   // Flag that indicates that current input should be used for

   186   // estimating the environment. The user has not yet started input

   187   // by e.g. pressed the push-to-talk button. By default, this is

   188   // false for backward compatibility.

   189   estimating_environment_ = false;

   190 }

   192 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {

   193   params_ = params;

   195   // Find the longest history interval to be used, and make the ring

   196   // large enough to accommodate that number of frames.  NOTE: This

   197   // depends upon ep_frame_period being set correctly in the factory

   198   // that did this instantiation.

   199   max_window_dur_ = params_.onset_window();

   200   if (params_.speech_on_window() > max_window_dur_)

   201     max_window_dur_ = params_.speech_on_window();

   202   if (params_.offset_window() > max_window_dur_)

   203     max_window_dur_ = params_.offset_window();

   204   Restart(true);

   206   offset_confirm_dur_sec_ = params_.offset_window() -

   207                             params_.offset_confirm_dur();

   208   if (offset_confirm_dur_sec_ < 0.0)

   209     offset_confirm_dur_sec_ = 0.0;

   211   user_input_start_time_us_ = 0;

   213   // Flag that indicates that  current input should be used for

   214   // estimating the environment. The user has not yet started input

   215   // by e.g. pressed the push-to-talk button. By default, this is

   216   // false for backward compatibility.

   217   estimating_environment_ = false;

   218   // The initial value of the noise and speech levels is inconsequential.

   219   // The level of the first frame will overwrite these values.

   220   noise_level_ = params_.decision_threshold() / 2.0f;

   221   fast_update_frames_ =

   222       static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());

   224   frame_counter_ = 0;  // Used for rapid initial update of levels.

   226   sample_rate_ = params_.sample_rate();

   227   start_lag_ = static_cast<int>(sample_rate_ /

   228                                 params_.max_fundamental_frequency());

   229   end_lag_ = static_cast<int>(sample_rate_ /

   230                               params_.min_fundamental_frequency());

   231 }

   233 void EnergyEndpointer::StartSession() {

   234   Restart(true);

   235 }

   237 void EnergyEndpointer::EndSession() {

   238   status_ = EP_POST_SPEECH;

   239 }

   241 void EnergyEndpointer::SetEnvironmentEstimationMode() {

   242   Restart(true);

   243   estimating_environment_ = true;

   244 }

   246 void EnergyEndpointer::SetUserInputMode() {

   247   estimating_environment_ = false;

   248   user_input_start_time_us_ = endpointer_time_us_;

   249 }

   251 void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,

   252                                          const int16_t* samples,

   253                                          int num_samples,

   254                                          float* rms_out) {

   255   endpointer_time_us_ = time_us;

   256   float rms = RMS(samples, num_samples);

   258   // Check that this is user input audio vs. pre-input adaptation audio.

   259   // Input audio starts when the user indicates start of input, by e.g.

   260   // pressing push-to-talk. Audio recieved prior to that is used to update

   261   // noise and speech level estimates.

   262   if (!estimating_environment_) {

   263     bool decision = false;

   264     if ((endpointer_time_us_ - user_input_start_time_us_) <

   265         Secs2Usecs(params_.contamination_rejection_period())) {

   266       decision = false;

   267       //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));

   268     } else {

   269       decision = (rms > decision_threshold_);

   270     }

   272     history_->Insert(endpointer_time_us_, decision);

   274     switch (status_) {

   275       case EP_PRE_SPEECH:

   276         if (history_->RingSum(params_.onset_window()) >

   277             params_.onset_detect_dur()) {

   278           status_ = EP_POSSIBLE_ONSET;

   279         }

   280         break;

   282       case EP_POSSIBLE_ONSET: {

   283         float tsum = history_->RingSum(params_.onset_window());

   284         if (tsum > params_.onset_confirm_dur()) {

   285           status_ = EP_SPEECH_PRESENT;

   286         } else {  // If signal is not maintained, drop back to pre-speech.

   287           if (tsum <= params_.onset_detect_dur())

   288             status_ = EP_PRE_SPEECH;

   289         }

   290         break;

   291       }

   293       case EP_SPEECH_PRESENT: {

   294         // To induce hysteresis in the state residency, we allow a

   295         // smaller residency time in the on_ring, than was required to

   296         // enter the SPEECH_PERSENT state.

   297         float on_time = history_->RingSum(params_.speech_on_window());

   298         if (on_time < params_.on_maintain_dur())

   299           status_ = EP_POSSIBLE_OFFSET;

   300         break;

   301       }

   303       case EP_POSSIBLE_OFFSET:

   304         if (history_->RingSum(params_.offset_window()) <=

   305             offset_confirm_dur_sec_) {

   306           // Note that this offset time may be beyond the end

   307           // of the input buffer in a real-time system.  It will be up

   308           // to the RecognizerSession to decide what to do.

   309           status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.

   310         } else {  // If speech picks up again we allow return to SPEECH_PRESENT.

   311           if (history_->RingSum(params_.speech_on_window()) >=

   312               params_.on_maintain_dur())

   313             status_ = EP_SPEECH_PRESENT;

   314         }

   315         break;

   317       default:

   318         break;

   319     }

   321     // If this is a quiet, non-speech region, slowly adapt the detection

   322     // threshold to be about 6dB above the average RMS.

   323     if ((!decision) && (status_ == EP_PRE_SPEECH)) {

   324       decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);

   325       rms_adapt_ = decision_threshold_;

   326     } else {

   327       // If this is in a speech region, adapt the decision threshold to

   328       // be about 10dB below the average RMS. If the noise level is high,

   329       // the threshold is pushed up.

   330       // Adaptation up to a higher level is 5 times faster than decay to

   331       // a lower level.

   332       if ((status_ == EP_SPEECH_PRESENT) && decision) {

   333         if (rms_adapt_ > rms) {

   334           rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);

   335         } else {

   336           rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);

   337         }

   338         float target_threshold = 0.3f * rms_adapt_ +  noise_level_;

   339         decision_threshold_ = (.90f * decision_threshold_) +

   340                               (0.10f * target_threshold);

   341       }

   342     }

   344     // Set a floor

   345     if (decision_threshold_ < params_.min_decision_threshold())

   346       decision_threshold_ = params_.min_decision_threshold();

   347   }

   349   // Update speech and noise levels.

   350   UpdateLevels(rms);

   351   ++frame_counter_;

   353   if (rms_out)

   354     *rms_out = GetDecibel(rms);

   355 }

   357 float EnergyEndpointer::GetNoiseLevelDb() const {

   358   return GetDecibel(noise_level_);

   359 }

   361 void EnergyEndpointer::UpdateLevels(float rms) {

   362   // Update quickly initially. We assume this is noise and that

   363   // speech is 6dB above the noise.

   364   if (frame_counter_ < fast_update_frames_) {

   365     // Alpha increases from 0 to (k-1)/k where k is the number of time

   366     // steps in the initial adaptation period.

   367     float alpha = static_cast<float>(frame_counter_) /

   368         static_cast<float>(fast_update_frames_);

   369     noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);

   370     //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));

   371   } else {

   372     // Update Noise level. The noise level adapts quickly downward, but

   373     // slowly upward. The noise_level_ parameter is not currently used

   374     // for threshold adaptation. It is used for UI feedback.

   375     if (noise_level_ < rms)

   376       noise_level_ = (0.999f * noise_level_) + (0.001f * rms);

   377     else

   378       noise_level_ = (0.95f * noise_level_) + (0.05f * rms);

   379   }

   380   if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {

   381     decision_threshold_ = noise_level_ * 2; // 6dB above noise level.

   382     // Set a floor

   383     if (decision_threshold_ < params_.min_decision_threshold())

   384       decision_threshold_ = params_.min_decision_threshold();

   385   }

   386 }

   388 EpStatus EnergyEndpointer::Status(int64_t* status_time)  const {

   389   *status_time = history_->EndTime();

   390   return status_;

   391 }

   393 }  // namespace mozilla

The Tor Browser / file revision

content/media/webspeech/recognition/energy_endpointer.cc@44a2da4a2ab2

content/media/webspeech/recognition/energy_endpointer.cc