// Copyright (c) 2013 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The EnergyEndpointer class finds likely speech onset and offset points.
//
// The implementation described here is about the simplest possible.
// It is based on timings of threshold crossings for overall signal
// RMS. It is suitable for light weight applications.
//
// As written, the basic idea is that one specifies intervals that
// must be occupied by super- and sub-threshold energy levels, and
// defers decisions re onset and offset times until these
// specifications have been met. Three basic intervals are tested: an
// onset window, a speech-on window, and an offset window. We require
// super-threshold to exceed some minimum total durations in the onset
// and speech-on windows before declaring the speech onset time, and
// we specify a required sub-threshold residency in the offset window
// before declaring speech offset. As the various residency requirements are
// met, the EnergyEndpointer instance assumes various states, and can return the
// ID of these states to the client (see EpStatus below).
//
// The levels of the speech and background noise are continuously updated. It is
// important that the background noise level be estimated initially for
// robustness in noisy conditions. The first frames are assumed to be background
// noise and a fast update rate is used for the noise level. The duration for
// fast update is controlled by the fast_update_dur_ parameter.
//
// If used in noisy conditions, the endpointer should be started and run in the
// EnvironmentEstimation mode, for at least 200ms, before switching to
// UserInputMode.
// Audio feedback contamination can appear in the input audio, if not cut
// out or handled by echo cancellation. Audio feedback can trigger a false
// accept.
The false accepts can be ignored by setting michael@0: // ep_contamination_rejection_period. michael@0: michael@0: #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ michael@0: #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ michael@0: michael@0: #include michael@0: michael@0: #include "nsAutoPtr.h" michael@0: michael@0: #include "energy_endpointer_params.h" michael@0: michael@0: namespace mozilla { michael@0: michael@0: // Endpointer status codes michael@0: enum EpStatus { michael@0: EP_PRE_SPEECH = 10, michael@0: EP_POSSIBLE_ONSET, michael@0: EP_SPEECH_PRESENT, michael@0: EP_POSSIBLE_OFFSET, michael@0: EP_POST_SPEECH, michael@0: }; michael@0: michael@0: class EnergyEndpointer { michael@0: public: michael@0: // The default construction MUST be followed by Init(), before any michael@0: // other use can be made of the instance. michael@0: EnergyEndpointer(); michael@0: virtual ~EnergyEndpointer(); michael@0: michael@0: void Init(const EnergyEndpointerParams& params); michael@0: michael@0: // Start the endpointer. This should be called at the beginning of a session. michael@0: void StartSession(); michael@0: michael@0: // Stop the endpointer. michael@0: void EndSession(); michael@0: michael@0: // Start environment estimation. Audio will be used for environment estimation michael@0: // i.e. noise level estimation. michael@0: void SetEnvironmentEstimationMode(); michael@0: michael@0: // Start user input. This should be called when the user indicates start of michael@0: // input, e.g. by pressing a button. michael@0: void SetUserInputMode(); michael@0: michael@0: // Computes the next input frame and modifies EnergyEndpointer status as michael@0: // appropriate based on the computation. 
michael@0: void ProcessAudioFrame(int64_t time_us, michael@0: const int16_t* samples, int num_samples, michael@0: float* rms_out); michael@0: michael@0: // Returns the current state of the EnergyEndpointer and the time michael@0: // corresponding to the most recently computed frame. michael@0: EpStatus Status(int64_t* status_time_us) const; michael@0: michael@0: bool estimating_environment() const { michael@0: return estimating_environment_; michael@0: } michael@0: michael@0: // Returns estimated noise level in dB. michael@0: float GetNoiseLevelDb() const; michael@0: michael@0: private: michael@0: class HistoryRing; michael@0: michael@0: // Resets the endpointer internal state. If reset_threshold is true, the michael@0: // state will be reset completely, including adaptive thresholds and the michael@0: // removal of all history information. michael@0: void Restart(bool reset_threshold); michael@0: michael@0: // Update internal speech and noise levels. michael@0: void UpdateLevels(float rms); michael@0: michael@0: // Returns the number of frames (or frame number) corresponding to michael@0: // the 'time' (in seconds). michael@0: int TimeToFrame(float time) const; michael@0: michael@0: EpStatus status_; // The current state of this instance. michael@0: float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH michael@0: int64_t endpointer_time_us_; // Time of the most recently received audio frame. michael@0: int64_t fast_update_frames_; // Number of frames for initial level adaptation. michael@0: int64_t frame_counter_; // Number of frames seen. Used for initial adaptation. michael@0: float max_window_dur_; // Largest search window size (seconds) michael@0: float sample_rate_; // Sampling rate. michael@0: michael@0: // Ring buffers to hold the speech activity history. michael@0: nsAutoPtr history_; michael@0: michael@0: // Configuration parameters. 
michael@0: EnergyEndpointerParams params_; michael@0: michael@0: // RMS which must be exceeded to conclude frame is speech. michael@0: float decision_threshold_; michael@0: michael@0: // Flag to indicate that audio should be used to estimate environment, prior michael@0: // to receiving user input. michael@0: bool estimating_environment_; michael@0: michael@0: // Estimate of the background noise level. Used externally for UI feedback. michael@0: float noise_level_; michael@0: michael@0: // An adaptive threshold used to update decision_threshold_ when appropriate. michael@0: float rms_adapt_; michael@0: michael@0: // Start lag corresponds to the highest fundamental frequency. michael@0: int start_lag_; michael@0: michael@0: // End lag corresponds to the lowest fundamental frequency. michael@0: int end_lag_; michael@0: michael@0: // Time when mode switched from environment estimation to user input. This michael@0: // is used to time forced rejection of audio feedback contamination. michael@0: int64_t user_input_start_time_us_; michael@0: michael@0: // prevent copy constructor and assignment michael@0: EnergyEndpointer(const EnergyEndpointer&); michael@0: void operator=(const EnergyEndpointer&); michael@0: }; michael@0: michael@0: } // namespace mozilla michael@0: michael@0: #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_