// Copyright (c) 2013 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//    * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//    * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//    * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
michael@0: michael@0: #include "energy_endpointer.h" michael@0: michael@0: #include michael@0: michael@0: namespace { michael@0: michael@0: // Returns the RMS (quadratic mean) of the input signal. michael@0: float RMS(const int16_t* samples, int num_samples) { michael@0: int64_t ssq_int64_t = 0; michael@0: int64_t sum_int64_t = 0; michael@0: for (int i = 0; i < num_samples; ++i) { michael@0: sum_int64_t += samples[i]; michael@0: ssq_int64_t += samples[i] * samples[i]; michael@0: } michael@0: // now convert to floats. michael@0: double sum = static_cast(sum_int64_t); michael@0: sum /= num_samples; michael@0: double ssq = static_cast(ssq_int64_t); michael@0: return static_cast(sqrt((ssq / num_samples) - (sum * sum))); michael@0: } michael@0: michael@0: int64_t Secs2Usecs(float seconds) { michael@0: return static_cast(0.5 + (1.0e6 * seconds)); michael@0: } michael@0: michael@0: float GetDecibel(float value) { michael@0: if (value > 1.0e-100) michael@0: return 20 * log10(value); michael@0: return -2000.0; michael@0: } michael@0: michael@0: } // namespace michael@0: michael@0: namespace mozilla { michael@0: michael@0: // Stores threshold-crossing histories for making decisions about the speech michael@0: // state. michael@0: class EnergyEndpointer::HistoryRing { michael@0: public: michael@0: HistoryRing() : insertion_index_(0) {} michael@0: michael@0: // Resets the ring to |size| elements each with state |initial_state| michael@0: void SetRing(int size, bool initial_state); michael@0: michael@0: // Inserts a new entry into the ring and drops the oldest entry. michael@0: void Insert(int64_t time_us, bool decision); michael@0: michael@0: // Returns the time in microseconds of the most recently added entry. michael@0: int64_t EndTime() const; michael@0: michael@0: // Returns the sum of all intervals during which 'decision' is true within michael@0: // the time in seconds specified by 'duration'. The returned interval is michael@0: // in seconds. 
michael@0: float RingSum(float duration_sec); michael@0: michael@0: private: michael@0: struct DecisionPoint { michael@0: int64_t time_us; michael@0: bool decision; michael@0: }; michael@0: michael@0: std::vector decision_points_; michael@0: int insertion_index_; // Index at which the next item gets added/inserted. michael@0: michael@0: HistoryRing(const HistoryRing&); michael@0: void operator=(const HistoryRing&); michael@0: }; michael@0: michael@0: void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { michael@0: insertion_index_ = 0; michael@0: decision_points_.clear(); michael@0: DecisionPoint init = { -1, initial_state }; michael@0: decision_points_.resize(size, init); michael@0: } michael@0: michael@0: void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) { michael@0: decision_points_[insertion_index_].time_us = time_us; michael@0: decision_points_[insertion_index_].decision = decision; michael@0: insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); michael@0: } michael@0: michael@0: int64_t EnergyEndpointer::HistoryRing::EndTime() const { michael@0: int ind = insertion_index_ - 1; michael@0: if (ind < 0) michael@0: ind = decision_points_.size() - 1; michael@0: return decision_points_[ind].time_us; michael@0: } michael@0: michael@0: float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { michael@0: if (!decision_points_.size()) michael@0: return 0.0; michael@0: michael@0: int64_t sum_us = 0; michael@0: int ind = insertion_index_ - 1; michael@0: if (ind < 0) michael@0: ind = decision_points_.size() - 1; michael@0: int64_t end_us = decision_points_[ind].time_us; michael@0: bool is_on = decision_points_[ind].decision; michael@0: int64_t start_us = end_us - static_cast(0.5 + (1.0e6 * duration_sec)); michael@0: if (start_us < 0) michael@0: start_us = 0; michael@0: size_t n_summed = 1; // n points ==> (n-1) intervals michael@0: while ((decision_points_[ind].time_us > start_us) && michael@0: 
(n_summed < decision_points_.size())) { michael@0: --ind; michael@0: if (ind < 0) michael@0: ind = decision_points_.size() - 1; michael@0: if (is_on) michael@0: sum_us += end_us - decision_points_[ind].time_us; michael@0: is_on = decision_points_[ind].decision; michael@0: end_us = decision_points_[ind].time_us; michael@0: n_summed++; michael@0: } michael@0: michael@0: return 1.0e-6f * sum_us; // Returns total time that was super threshold. michael@0: } michael@0: michael@0: EnergyEndpointer::EnergyEndpointer() michael@0: : status_(EP_PRE_SPEECH), michael@0: offset_confirm_dur_sec_(0), michael@0: endpointer_time_us_(0), michael@0: fast_update_frames_(0), michael@0: frame_counter_(0), michael@0: max_window_dur_(4.0), michael@0: sample_rate_(0), michael@0: history_(new HistoryRing()), michael@0: decision_threshold_(0), michael@0: estimating_environment_(false), michael@0: noise_level_(0), michael@0: rms_adapt_(0), michael@0: start_lag_(0), michael@0: end_lag_(0), michael@0: user_input_start_time_us_(0) { michael@0: } michael@0: michael@0: EnergyEndpointer::~EnergyEndpointer() { michael@0: } michael@0: michael@0: int EnergyEndpointer::TimeToFrame(float time) const { michael@0: return static_cast(0.5 + (time / params_.frame_period())); michael@0: } michael@0: michael@0: void EnergyEndpointer::Restart(bool reset_threshold) { michael@0: status_ = EP_PRE_SPEECH; michael@0: user_input_start_time_us_ = 0; michael@0: michael@0: if (reset_threshold) { michael@0: decision_threshold_ = params_.decision_threshold(); michael@0: rms_adapt_ = decision_threshold_; michael@0: noise_level_ = params_.decision_threshold() / 2.0f; michael@0: frame_counter_ = 0; // Used for rapid initial update of levels. michael@0: } michael@0: michael@0: // Set up the memories to hold the history windows. michael@0: history_->SetRing(TimeToFrame(max_window_dur_), false); michael@0: michael@0: // Flag that indicates that current input should be used for michael@0: // estimating the environment. 
The user has not yet started input michael@0: // by e.g. pressed the push-to-talk button. By default, this is michael@0: // false for backward compatibility. michael@0: estimating_environment_ = false; michael@0: } michael@0: michael@0: void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { michael@0: params_ = params; michael@0: michael@0: // Find the longest history interval to be used, and make the ring michael@0: // large enough to accommodate that number of frames. NOTE: This michael@0: // depends upon ep_frame_period being set correctly in the factory michael@0: // that did this instantiation. michael@0: max_window_dur_ = params_.onset_window(); michael@0: if (params_.speech_on_window() > max_window_dur_) michael@0: max_window_dur_ = params_.speech_on_window(); michael@0: if (params_.offset_window() > max_window_dur_) michael@0: max_window_dur_ = params_.offset_window(); michael@0: Restart(true); michael@0: michael@0: offset_confirm_dur_sec_ = params_.offset_window() - michael@0: params_.offset_confirm_dur(); michael@0: if (offset_confirm_dur_sec_ < 0.0) michael@0: offset_confirm_dur_sec_ = 0.0; michael@0: michael@0: user_input_start_time_us_ = 0; michael@0: michael@0: // Flag that indicates that current input should be used for michael@0: // estimating the environment. The user has not yet started input michael@0: // by e.g. pressed the push-to-talk button. By default, this is michael@0: // false for backward compatibility. michael@0: estimating_environment_ = false; michael@0: // The initial value of the noise and speech levels is inconsequential. michael@0: // The level of the first frame will overwrite these values. michael@0: noise_level_ = params_.decision_threshold() / 2.0f; michael@0: fast_update_frames_ = michael@0: static_cast(params_.fast_update_dur() / params_.frame_period()); michael@0: michael@0: frame_counter_ = 0; // Used for rapid initial update of levels. 
michael@0: michael@0: sample_rate_ = params_.sample_rate(); michael@0: start_lag_ = static_cast(sample_rate_ / michael@0: params_.max_fundamental_frequency()); michael@0: end_lag_ = static_cast(sample_rate_ / michael@0: params_.min_fundamental_frequency()); michael@0: } michael@0: michael@0: void EnergyEndpointer::StartSession() { michael@0: Restart(true); michael@0: } michael@0: michael@0: void EnergyEndpointer::EndSession() { michael@0: status_ = EP_POST_SPEECH; michael@0: } michael@0: michael@0: void EnergyEndpointer::SetEnvironmentEstimationMode() { michael@0: Restart(true); michael@0: estimating_environment_ = true; michael@0: } michael@0: michael@0: void EnergyEndpointer::SetUserInputMode() { michael@0: estimating_environment_ = false; michael@0: user_input_start_time_us_ = endpointer_time_us_; michael@0: } michael@0: michael@0: void EnergyEndpointer::ProcessAudioFrame(int64_t time_us, michael@0: const int16_t* samples, michael@0: int num_samples, michael@0: float* rms_out) { michael@0: endpointer_time_us_ = time_us; michael@0: float rms = RMS(samples, num_samples); michael@0: michael@0: // Check that this is user input audio vs. pre-input adaptation audio. michael@0: // Input audio starts when the user indicates start of input, by e.g. michael@0: // pressing push-to-talk. Audio recieved prior to that is used to update michael@0: // noise and speech level estimates. 
michael@0: if (!estimating_environment_) { michael@0: bool decision = false; michael@0: if ((endpointer_time_us_ - user_input_start_time_us_) < michael@0: Secs2Usecs(params_.contamination_rejection_period())) { michael@0: decision = false; michael@0: //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_)); michael@0: } else { michael@0: decision = (rms > decision_threshold_); michael@0: } michael@0: michael@0: history_->Insert(endpointer_time_us_, decision); michael@0: michael@0: switch (status_) { michael@0: case EP_PRE_SPEECH: michael@0: if (history_->RingSum(params_.onset_window()) > michael@0: params_.onset_detect_dur()) { michael@0: status_ = EP_POSSIBLE_ONSET; michael@0: } michael@0: break; michael@0: michael@0: case EP_POSSIBLE_ONSET: { michael@0: float tsum = history_->RingSum(params_.onset_window()); michael@0: if (tsum > params_.onset_confirm_dur()) { michael@0: status_ = EP_SPEECH_PRESENT; michael@0: } else { // If signal is not maintained, drop back to pre-speech. michael@0: if (tsum <= params_.onset_detect_dur()) michael@0: status_ = EP_PRE_SPEECH; michael@0: } michael@0: break; michael@0: } michael@0: michael@0: case EP_SPEECH_PRESENT: { michael@0: // To induce hysteresis in the state residency, we allow a michael@0: // smaller residency time in the on_ring, than was required to michael@0: // enter the SPEECH_PERSENT state. michael@0: float on_time = history_->RingSum(params_.speech_on_window()); michael@0: if (on_time < params_.on_maintain_dur()) michael@0: status_ = EP_POSSIBLE_OFFSET; michael@0: break; michael@0: } michael@0: michael@0: case EP_POSSIBLE_OFFSET: michael@0: if (history_->RingSum(params_.offset_window()) <= michael@0: offset_confirm_dur_sec_) { michael@0: // Note that this offset time may be beyond the end michael@0: // of the input buffer in a real-time system. It will be up michael@0: // to the RecognizerSession to decide what to do. 
michael@0: status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. michael@0: } else { // If speech picks up again we allow return to SPEECH_PRESENT. michael@0: if (history_->RingSum(params_.speech_on_window()) >= michael@0: params_.on_maintain_dur()) michael@0: status_ = EP_SPEECH_PRESENT; michael@0: } michael@0: break; michael@0: michael@0: default: michael@0: break; michael@0: } michael@0: michael@0: // If this is a quiet, non-speech region, slowly adapt the detection michael@0: // threshold to be about 6dB above the average RMS. michael@0: if ((!decision) && (status_ == EP_PRE_SPEECH)) { michael@0: decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); michael@0: rms_adapt_ = decision_threshold_; michael@0: } else { michael@0: // If this is in a speech region, adapt the decision threshold to michael@0: // be about 10dB below the average RMS. If the noise level is high, michael@0: // the threshold is pushed up. michael@0: // Adaptation up to a higher level is 5 times faster than decay to michael@0: // a lower level. michael@0: if ((status_ == EP_SPEECH_PRESENT) && decision) { michael@0: if (rms_adapt_ > rms) { michael@0: rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); michael@0: } else { michael@0: rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); michael@0: } michael@0: float target_threshold = 0.3f * rms_adapt_ + noise_level_; michael@0: decision_threshold_ = (.90f * decision_threshold_) + michael@0: (0.10f * target_threshold); michael@0: } michael@0: } michael@0: michael@0: // Set a floor michael@0: if (decision_threshold_ < params_.min_decision_threshold()) michael@0: decision_threshold_ = params_.min_decision_threshold(); michael@0: } michael@0: michael@0: // Update speech and noise levels. 
michael@0: UpdateLevels(rms); michael@0: ++frame_counter_; michael@0: michael@0: if (rms_out) michael@0: *rms_out = GetDecibel(rms); michael@0: } michael@0: michael@0: float EnergyEndpointer::GetNoiseLevelDb() const { michael@0: return GetDecibel(noise_level_); michael@0: } michael@0: michael@0: void EnergyEndpointer::UpdateLevels(float rms) { michael@0: // Update quickly initially. We assume this is noise and that michael@0: // speech is 6dB above the noise. michael@0: if (frame_counter_ < fast_update_frames_) { michael@0: // Alpha increases from 0 to (k-1)/k where k is the number of time michael@0: // steps in the initial adaptation period. michael@0: float alpha = static_cast(frame_counter_) / michael@0: static_cast(fast_update_frames_); michael@0: noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); michael@0: //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_)); michael@0: } else { michael@0: // Update Noise level. The noise level adapts quickly downward, but michael@0: // slowly upward. The noise_level_ parameter is not currently used michael@0: // for threshold adaptation. It is used for UI feedback. michael@0: if (noise_level_ < rms) michael@0: noise_level_ = (0.999f * noise_level_) + (0.001f * rms); michael@0: else michael@0: noise_level_ = (0.95f * noise_level_) + (0.05f * rms); michael@0: } michael@0: if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { michael@0: decision_threshold_ = noise_level_ * 2; // 6dB above noise level. michael@0: // Set a floor michael@0: if (decision_threshold_ < params_.min_decision_threshold()) michael@0: decision_threshold_ = params_.min_decision_threshold(); michael@0: } michael@0: } michael@0: michael@0: EpStatus EnergyEndpointer::Status(int64_t* status_time) const { michael@0: *status_time = history_->EndTime(); michael@0: return status_; michael@0: } michael@0: michael@0: } // namespace mozilla