1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/content/media/webspeech/recognition/energy_endpointer.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,393 @@ 1.4 +// Copyright (c) 2013 The Chromium Authors. All rights reserved. 1.5 +// 1.6 +// Redistribution and use in source and binary forms, with or without 1.7 +// modification, are permitted provided that the following conditions are 1.8 +// met: 1.9 +// 1.10 +// * Redistributions of source code must retain the above copyright 1.11 +// notice, this list of conditions and the following disclaimer. 1.12 +// * Redistributions in binary form must reproduce the above 1.13 +// copyright notice, this list of conditions and the following disclaimer 1.14 +// in the documentation and/or other materials provided with the 1.15 +// distribution. 1.16 +// * Neither the name of Google Inc. nor the names of its 1.17 +// contributors may be used to endorse or promote products derived from 1.18 +// this software without specific prior written permission. 1.19 +// 1.20 +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1.21 +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 1.22 +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 1.23 +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 1.24 +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 1.25 +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 1.26 +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 1.27 +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 1.28 +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 1.29 +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 1.30 +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.31 + 1.32 +#include "energy_endpointer.h" 1.33 + 1.34 +#include <math.h> 1.35 + 1.36 +namespace { 1.37 + 1.38 +// Returns the RMS (quadratic mean) of the input signal. 1.39 +float RMS(const int16_t* samples, int num_samples) { 1.40 + int64_t ssq_int64_t = 0; 1.41 + int64_t sum_int64_t = 0; 1.42 + for (int i = 0; i < num_samples; ++i) { 1.43 + sum_int64_t += samples[i]; 1.44 + ssq_int64_t += samples[i] * samples[i]; 1.45 + } 1.46 + // now convert to floats. 1.47 + double sum = static_cast<double>(sum_int64_t); 1.48 + sum /= num_samples; 1.49 + double ssq = static_cast<double>(ssq_int64_t); 1.50 + return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum))); 1.51 +} 1.52 + 1.53 +int64_t Secs2Usecs(float seconds) { 1.54 + return static_cast<int64_t>(0.5 + (1.0e6 * seconds)); 1.55 +} 1.56 + 1.57 +float GetDecibel(float value) { 1.58 + if (value > 1.0e-100) 1.59 + return 20 * log10(value); 1.60 + return -2000.0; 1.61 +} 1.62 + 1.63 +} // namespace 1.64 + 1.65 +namespace mozilla { 1.66 + 1.67 +// Stores threshold-crossing histories for making decisions about the speech 1.68 +// state. 1.69 +class EnergyEndpointer::HistoryRing { 1.70 + public: 1.71 + HistoryRing() : insertion_index_(0) {} 1.72 + 1.73 + // Resets the ring to |size| elements each with state |initial_state| 1.74 + void SetRing(int size, bool initial_state); 1.75 + 1.76 + // Inserts a new entry into the ring and drops the oldest entry. 1.77 + void Insert(int64_t time_us, bool decision); 1.78 + 1.79 + // Returns the time in microseconds of the most recently added entry. 1.80 + int64_t EndTime() const; 1.81 + 1.82 + // Returns the sum of all intervals during which 'decision' is true within 1.83 + // the time in seconds specified by 'duration'. The returned interval is 1.84 + // in seconds. 1.85 + float RingSum(float duration_sec); 1.86 + 1.87 + private: 1.88 + struct DecisionPoint { 1.89 + int64_t time_us; 1.90 + bool decision; 1.91 + }; 1.92 + 1.93 + std::vector<DecisionPoint> decision_points_; 1.94 + int insertion_index_; // Index at which the next item gets added/inserted. 1.95 + 1.96 + HistoryRing(const HistoryRing&); 1.97 + void operator=(const HistoryRing&); 1.98 +}; 1.99 + 1.100 +void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { 1.101 + insertion_index_ = 0; 1.102 + decision_points_.clear(); 1.103 + DecisionPoint init = { -1, initial_state }; 1.104 + decision_points_.resize(size, init); 1.105 +} 1.106 + 1.107 +void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) { 1.108 + decision_points_[insertion_index_].time_us = time_us; 1.109 + decision_points_[insertion_index_].decision = decision; 1.110 + insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); 1.111 +} 1.112 + 1.113 +int64_t EnergyEndpointer::HistoryRing::EndTime() const { 1.114 + int ind = insertion_index_ - 1; 1.115 + if (ind < 0) 1.116 + ind = decision_points_.size() - 1; 1.117 + return decision_points_[ind].time_us; 1.118 +} 1.119 + 1.120 +float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { 1.121 + if (!decision_points_.size()) 1.122 + return 0.0; 1.123 + 1.124 + int64_t sum_us = 0; 1.125 + int ind = insertion_index_ - 1; 1.126 + if (ind < 0) 1.127 + ind = decision_points_.size() - 1; 1.128 + int64_t end_us = decision_points_[ind].time_us; 1.129 + bool is_on = decision_points_[ind].decision; 1.130 + int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec)); 1.131 + if (start_us < 0) 1.132 + start_us = 0; 1.133 + size_t n_summed = 1; // n points ==> (n-1) intervals 1.134 + while ((decision_points_[ind].time_us > start_us) && 1.135 + (n_summed < decision_points_.size())) { 1.136 + --ind; 1.137 + if (ind < 0) 1.138 + ind = decision_points_.size() - 1; 1.139 + if (is_on) 1.140 + sum_us += end_us - decision_points_[ind].time_us; 1.141 + is_on = decision_points_[ind].decision; 1.142 + end_us = decision_points_[ind].time_us; 1.143 + n_summed++; 1.144 + } 1.145 + 1.146 + return 1.0e-6f * sum_us; // Returns total time that was super threshold. 1.147 +} 1.148 + 1.149 +EnergyEndpointer::EnergyEndpointer() 1.150 + : status_(EP_PRE_SPEECH), 1.151 + offset_confirm_dur_sec_(0), 1.152 + endpointer_time_us_(0), 1.153 + fast_update_frames_(0), 1.154 + frame_counter_(0), 1.155 + max_window_dur_(4.0), 1.156 + sample_rate_(0), 1.157 + history_(new HistoryRing()), 1.158 + decision_threshold_(0), 1.159 + estimating_environment_(false), 1.160 + noise_level_(0), 1.161 + rms_adapt_(0), 1.162 + start_lag_(0), 1.163 + end_lag_(0), 1.164 + user_input_start_time_us_(0) { 1.165 +} 1.166 + 1.167 +EnergyEndpointer::~EnergyEndpointer() { 1.168 +} 1.169 + 1.170 +int EnergyEndpointer::TimeToFrame(float time) const { 1.171 + return static_cast<int32_t>(0.5 + (time / params_.frame_period())); 1.172 +} 1.173 + 1.174 +void EnergyEndpointer::Restart(bool reset_threshold) { 1.175 + status_ = EP_PRE_SPEECH; 1.176 + user_input_start_time_us_ = 0; 1.177 + 1.178 + if (reset_threshold) { 1.179 + decision_threshold_ = params_.decision_threshold(); 1.180 + rms_adapt_ = decision_threshold_; 1.181 + noise_level_ = params_.decision_threshold() / 2.0f; 1.182 + frame_counter_ = 0; // Used for rapid initial update of levels. 1.183 + } 1.184 + 1.185 + // Set up the memories to hold the history windows. 1.186 + history_->SetRing(TimeToFrame(max_window_dur_), false); 1.187 + 1.188 + // Flag that indicates that current input should be used for 1.189 + // estimating the environment. The user has not yet started input 1.190 + // by e.g. pressed the push-to-talk button. By default, this is 1.191 + // false for backward compatibility. 1.192 + estimating_environment_ = false; 1.193 +} 1.194 + 1.195 +void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { 1.196 + params_ = params; 1.197 + 1.198 + // Find the longest history interval to be used, and make the ring 1.199 + // large enough to accommodate that number of frames. NOTE: This 1.200 + // depends upon ep_frame_period being set correctly in the factory 1.201 + // that did this instantiation. 1.202 + max_window_dur_ = params_.onset_window(); 1.203 + if (params_.speech_on_window() > max_window_dur_) 1.204 + max_window_dur_ = params_.speech_on_window(); 1.205 + if (params_.offset_window() > max_window_dur_) 1.206 + max_window_dur_ = params_.offset_window(); 1.207 + Restart(true); 1.208 + 1.209 + offset_confirm_dur_sec_ = params_.offset_window() - 1.210 + params_.offset_confirm_dur(); 1.211 + if (offset_confirm_dur_sec_ < 0.0) 1.212 + offset_confirm_dur_sec_ = 0.0; 1.213 + 1.214 + user_input_start_time_us_ = 0; 1.215 + 1.216 + // Flag that indicates that current input should be used for 1.217 + // estimating the environment. The user has not yet started input 1.218 + // by e.g. pressed the push-to-talk button. By default, this is 1.219 + // false for backward compatibility. 1.220 + estimating_environment_ = false; 1.221 + // The initial value of the noise and speech levels is inconsequential. 1.222 + // The level of the first frame will overwrite these values. 1.223 + noise_level_ = params_.decision_threshold() / 2.0f; 1.224 + fast_update_frames_ = 1.225 + static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period()); 1.226 + 1.227 + frame_counter_ = 0; // Used for rapid initial update of levels. 1.228 + 1.229 + sample_rate_ = params_.sample_rate(); 1.230 + start_lag_ = static_cast<int>(sample_rate_ / 1.231 + params_.max_fundamental_frequency()); 1.232 + end_lag_ = static_cast<int>(sample_rate_ / 1.233 + params_.min_fundamental_frequency()); 1.234 +} 1.235 + 1.236 +void EnergyEndpointer::StartSession() { 1.237 + Restart(true); 1.238 +} 1.239 + 1.240 +void EnergyEndpointer::EndSession() { 1.241 + status_ = EP_POST_SPEECH; 1.242 +} 1.243 + 1.244 +void EnergyEndpointer::SetEnvironmentEstimationMode() { 1.245 + Restart(true); 1.246 + estimating_environment_ = true; 1.247 +} 1.248 + 1.249 +void EnergyEndpointer::SetUserInputMode() { 1.250 + estimating_environment_ = false; 1.251 + user_input_start_time_us_ = endpointer_time_us_; 1.252 +} 1.253 + 1.254 +void EnergyEndpointer::ProcessAudioFrame(int64_t time_us, 1.255 + const int16_t* samples, 1.256 + int num_samples, 1.257 + float* rms_out) { 1.258 + endpointer_time_us_ = time_us; 1.259 + float rms = RMS(samples, num_samples); 1.260 + 1.261 + // Check that this is user input audio vs. pre-input adaptation audio. 1.262 + // Input audio starts when the user indicates start of input, by e.g. 1.263 + // pressing push-to-talk. Audio recieved prior to that is used to update 1.264 + // noise and speech level estimates. 1.265 + if (!estimating_environment_) { 1.266 + bool decision = false; 1.267 + if ((endpointer_time_us_ - user_input_start_time_us_) < 1.268 + Secs2Usecs(params_.contamination_rejection_period())) { 1.269 + decision = false; 1.270 + //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_)); 1.271 + } else { 1.272 + decision = (rms > decision_threshold_); 1.273 + } 1.274 + 1.275 + history_->Insert(endpointer_time_us_, decision); 1.276 + 1.277 + switch (status_) { 1.278 + case EP_PRE_SPEECH: 1.279 + if (history_->RingSum(params_.onset_window()) > 1.280 + params_.onset_detect_dur()) { 1.281 + status_ = EP_POSSIBLE_ONSET; 1.282 + } 1.283 + break; 1.284 + 1.285 + case EP_POSSIBLE_ONSET: { 1.286 + float tsum = history_->RingSum(params_.onset_window()); 1.287 + if (tsum > params_.onset_confirm_dur()) { 1.288 + status_ = EP_SPEECH_PRESENT; 1.289 + } else { // If signal is not maintained, drop back to pre-speech. 1.290 + if (tsum <= params_.onset_detect_dur()) 1.291 + status_ = EP_PRE_SPEECH; 1.292 + } 1.293 + break; 1.294 + } 1.295 + 1.296 + case EP_SPEECH_PRESENT: { 1.297 + // To induce hysteresis in the state residency, we allow a 1.298 + // smaller residency time in the on_ring, than was required to 1.299 + // enter the SPEECH_PERSENT state. 1.300 + float on_time = history_->RingSum(params_.speech_on_window()); 1.301 + if (on_time < params_.on_maintain_dur()) 1.302 + status_ = EP_POSSIBLE_OFFSET; 1.303 + break; 1.304 + } 1.305 + 1.306 + case EP_POSSIBLE_OFFSET: 1.307 + if (history_->RingSum(params_.offset_window()) <= 1.308 + offset_confirm_dur_sec_) { 1.309 + // Note that this offset time may be beyond the end 1.310 + // of the input buffer in a real-time system. It will be up 1.311 + // to the RecognizerSession to decide what to do. 1.312 + status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. 1.313 + } else { // If speech picks up again we allow return to SPEECH_PRESENT. 1.314 + if (history_->RingSum(params_.speech_on_window()) >= 1.315 + params_.on_maintain_dur()) 1.316 + status_ = EP_SPEECH_PRESENT; 1.317 + } 1.318 + break; 1.319 + 1.320 + default: 1.321 + break; 1.322 + } 1.323 + 1.324 + // If this is a quiet, non-speech region, slowly adapt the detection 1.325 + // threshold to be about 6dB above the average RMS. 1.326 + if ((!decision) && (status_ == EP_PRE_SPEECH)) { 1.327 + decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); 1.328 + rms_adapt_ = decision_threshold_; 1.329 + } else { 1.330 + // If this is in a speech region, adapt the decision threshold to 1.331 + // be about 10dB below the average RMS. If the noise level is high, 1.332 + // the threshold is pushed up. 1.333 + // Adaptation up to a higher level is 5 times faster than decay to 1.334 + // a lower level. 1.335 + if ((status_ == EP_SPEECH_PRESENT) && decision) { 1.336 + if (rms_adapt_ > rms) { 1.337 + rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); 1.338 + } else { 1.339 + rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); 1.340 + } 1.341 + float target_threshold = 0.3f * rms_adapt_ + noise_level_; 1.342 + decision_threshold_ = (.90f * decision_threshold_) + 1.343 + (0.10f * target_threshold); 1.344 + } 1.345 + } 1.346 + 1.347 + // Set a floor 1.348 + if (decision_threshold_ < params_.min_decision_threshold()) 1.349 + decision_threshold_ = params_.min_decision_threshold(); 1.350 + } 1.351 + 1.352 + // Update speech and noise levels. 1.353 + UpdateLevels(rms); 1.354 + ++frame_counter_; 1.355 + 1.356 + if (rms_out) 1.357 + *rms_out = GetDecibel(rms); 1.358 +} 1.359 + 1.360 +float EnergyEndpointer::GetNoiseLevelDb() const { 1.361 + return GetDecibel(noise_level_); 1.362 +} 1.363 + 1.364 +void EnergyEndpointer::UpdateLevels(float rms) { 1.365 + // Update quickly initially. We assume this is noise and that 1.366 + // speech is 6dB above the noise. 1.367 + if (frame_counter_ < fast_update_frames_) { 1.368 + // Alpha increases from 0 to (k-1)/k where k is the number of time 1.369 + // steps in the initial adaptation period. 1.370 + float alpha = static_cast<float>(frame_counter_) / 1.371 + static_cast<float>(fast_update_frames_); 1.372 + noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); 1.373 + //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_)); 1.374 + } else { 1.375 + // Update Noise level. The noise level adapts quickly downward, but 1.376 + // slowly upward. The noise_level_ parameter is not currently used 1.377 + // for threshold adaptation. It is used for UI feedback. 1.378 + if (noise_level_ < rms) 1.379 + noise_level_ = (0.999f * noise_level_) + (0.001f * rms); 1.380 + else 1.381 + noise_level_ = (0.95f * noise_level_) + (0.05f * rms); 1.382 + } 1.383 + if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { 1.384 + decision_threshold_ = noise_level_ * 2; // 6dB above noise level. 1.385 + // Set a floor 1.386 + if (decision_threshold_ < params_.min_decision_threshold()) 1.387 + decision_threshold_ = params_.min_decision_threshold(); 1.388 + } 1.389 +} 1.390 + 1.391 +EpStatus EnergyEndpointer::Status(int64_t* status_time) const { 1.392 + *status_time = history_->EndTime(); 1.393 + return status_; 1.394 +} 1.395 + 1.396 +} // namespace mozilla