Thu, 15 Jan 2015 15:55:04 +0100
Back out 97036ab72558 which inappropriately compared turds to third parties.
michael@0 | 1 | // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Redistribution and use in source and binary forms, with or without |
michael@0 | 4 | // modification, are permitted provided that the following conditions are |
michael@0 | 5 | // met: |
michael@0 | 6 | // |
michael@0 | 7 | // * Redistributions of source code must retain the above copyright |
michael@0 | 8 | // notice, this list of conditions and the following disclaimer. |
michael@0 | 9 | // * Redistributions in binary form must reproduce the above |
michael@0 | 10 | // copyright notice, this list of conditions and the following disclaimer |
michael@0 | 11 | // in the documentation and/or other materials provided with the |
michael@0 | 12 | // distribution. |
michael@0 | 13 | // * Neither the name of Google Inc. nor the names of its |
michael@0 | 14 | // contributors may be used to endorse or promote products derived from |
michael@0 | 15 | // this software without specific prior written permission. |
michael@0 | 16 | // |
michael@0 | 17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
michael@0 | 18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
michael@0 | 19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
michael@0 | 20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
michael@0 | 21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
michael@0 | 22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
michael@0 | 23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
michael@0 | 24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
michael@0 | 25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
michael@0 | 26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
michael@0 | 27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
michael@0 | 28 | |
michael@0 | 29 | // The EnergyEndpointer class finds likely speech onset and offset points. |
michael@0 | 30 | // |
michael@0 | 31 | // The implementation described here is about the simplest possible. |
michael@0 | 32 | // It is based on timings of threshold crossings for overall signal |
michael@0 | 33 | // RMS. It is suitable for lightweight applications. |
michael@0 | 34 | // |
michael@0 | 35 | // As written, the basic idea is that one specifies intervals that |
michael@0 | 36 | // must be occupied by super- and sub-threshold energy levels, and |
michael@0 | 37 | // defers decisions re onset and offset times until these |
michael@0 | 38 | // specifications have been met. Three basic intervals are tested: an |
michael@0 | 39 | // onset window, a speech-on window, and an offset window. We require |
michael@0 | 40 | // super-threshold to exceed some minimum total durations in the onset |
michael@0 | 41 | // and speech-on windows before declaring the speech onset time, and |
michael@0 | 42 | // we specify a required sub-threshold residency in the offset window |
michael@0 | 43 | // before declaring speech offset. As the various residency requirements are |
michael@0 | 44 | // met, the EnergyEndpointer instance assumes various states, and can return the |
michael@0 | 45 | // ID of these states to the client (see EpStatus below). |
michael@0 | 46 | // |
michael@0 | 47 | // The levels of the speech and background noise are continuously updated. It is |
michael@0 | 48 | // important that the background noise level be estimated initially for |
michael@0 | 49 | // robustness in noisy conditions. The first frames are assumed to be background |
michael@0 | 50 | // noise and a fast update rate is used for the noise level. The duration for |
michael@0 | 51 | // fast update is controlled by the fast_update_dur_ parameter. |
michael@0 | 52 | // |
michael@0 | 53 | // If used in noisy conditions, the endpointer should be started and run in the |
michael@0 | 54 | // EnvironmentEstimation mode, for at least 200ms, before switching to |
michael@0 | 55 | // UserInputMode. |
michael@0 | 56 | // Audio feedback contamination can appear in the input audio, if not cut |
michael@0 | 57 | // out or handled by echo cancellation. Audio feedback can trigger a false |
michael@0 | 58 | // accept. The false accepts can be ignored by setting |
michael@0 | 59 | // ep_contamination_rejection_period. |
michael@0 | 60 | |
michael@0 | 61 | #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
michael@0 | 62 | #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
michael@0 | 63 | |
michael@0 | 64 | #include <vector> |
michael@0 | 65 | |
michael@0 | 66 | #include "nsAutoPtr.h" |
michael@0 | 67 | |
michael@0 | 68 | #include "energy_endpointer_params.h" |
michael@0 | 69 | |
michael@0 | 70 | namespace mozilla { |
michael@0 | 71 | |
michael@0 | 72 | // Endpointer status codes. Numbering starts at 10; each later enumerator is one greater than the one before it. |
michael@0 | 73 | enum EpStatus { |
michael@0 | 74 | EP_PRE_SPEECH = 10, |
michael@0 | 75 | EP_POSSIBLE_ONSET, |
michael@0 | 76 | EP_SPEECH_PRESENT, |
michael@0 | 77 | EP_POSSIBLE_OFFSET, |
michael@0 | 78 | EP_POST_SPEECH, |
michael@0 | 79 | }; |
michael@0 | 80 | |
michael@0 | 81 | class EnergyEndpointer { |
michael@0 | 82 | public: |
michael@0 | 83 | // Default construction MUST be followed by a call to Init() before the |
michael@0 | 84 | // instance is used in any other way. |
michael@0 | 85 | EnergyEndpointer(); |
michael@0 | 86 | virtual ~EnergyEndpointer(); |
michael@0 | 87 | |
michael@0 | 88 | void Init(const EnergyEndpointerParams& params); |
michael@0 | 89 | |
michael@0 | 90 | // Start the endpointer. This should be called at the beginning of a session. |
michael@0 | 91 | void StartSession(); |
michael@0 | 92 | |
michael@0 | 93 | // Stop the endpointer. |
michael@0 | 94 | void EndSession(); |
michael@0 | 95 | |
michael@0 | 96 | // Start environment estimation. Audio will be used for environment estimation |
michael@0 | 97 | // i.e. noise level estimation. |
michael@0 | 98 | void SetEnvironmentEstimationMode(); |
michael@0 | 99 | |
michael@0 | 100 | // Start user input. This should be called when the user indicates start of |
michael@0 | 101 | // input, e.g. by pressing a button. |
michael@0 | 102 | void SetUserInputMode(); |
michael@0 | 103 | |
michael@0 | 104 | // Processes the next input frame (num_samples 16-bit samples at time_us) and |
michael@0 | 105 | // updates the EnergyEndpointer status as appropriate. NOTE(review): rms_out presumably receives the frame RMS -- confirm in the implementation. |
michael@0 | 106 | void ProcessAudioFrame(int64_t time_us, |
michael@0 | 107 | const int16_t* samples, int num_samples, |
michael@0 | 108 | float* rms_out); |
michael@0 | 109 | |
michael@0 | 110 | // Returns the current state of the EnergyEndpointer; the time corresponding |
michael@0 | 111 | // to the most recently computed frame is reported through status_time_us. |
michael@0 | 112 | EpStatus Status(int64_t* status_time_us) const; |
michael@0 | 113 | |
michael@0 | 114 | bool estimating_environment() const { |
michael@0 | 115 | return estimating_environment_; |
michael@0 | 116 | } |
michael@0 | 117 | |
michael@0 | 118 | // Returns estimated noise level in dB. |
michael@0 | 119 | float GetNoiseLevelDb() const; |
michael@0 | 120 | |
michael@0 | 121 | private: |
michael@0 | 122 | class HistoryRing; |
michael@0 | 123 | |
michael@0 | 124 | // Resets the endpointer internal state. If reset_threshold is true, the |
michael@0 | 125 | // state will be reset completely, including adaptive thresholds and the |
michael@0 | 126 | // removal of all history information. |
michael@0 | 127 | void Restart(bool reset_threshold); |
michael@0 | 128 | |
michael@0 | 129 | // Update internal speech and noise levels. |
michael@0 | 130 | void UpdateLevels(float rms); |
michael@0 | 131 | |
michael@0 | 132 | // Returns the number of frames (or frame number) corresponding to |
michael@0 | 133 | // the 'time' (in seconds). |
michael@0 | 134 | int TimeToFrame(float time) const; |
michael@0 | 135 | |
michael@0 | 136 | EpStatus status_; // The current state of this instance. |
michael@0 | 137 | float offset_confirm_dur_sec_; // Max on-time allowed to confirm EP_POST_SPEECH. |
michael@0 | 138 | int64_t endpointer_time_us_; // Time of the most recently received audio frame. |
michael@0 | 139 | int64_t fast_update_frames_; // Number of frames for initial level adaptation. |
michael@0 | 140 | int64_t frame_counter_; // Number of frames seen. Used for initial adaptation. |
michael@0 | 141 | float max_window_dur_; // Largest search window size (seconds) |
michael@0 | 142 | float sample_rate_; // Sampling rate. |
michael@0 | 143 | |
michael@0 | 144 | // Ring buffers to hold the speech activity history. |
michael@0 | 145 | nsAutoPtr<HistoryRing> history_; |
michael@0 | 146 | |
michael@0 | 147 | // Configuration parameters. |
michael@0 | 148 | EnergyEndpointerParams params_; |
michael@0 | 149 | |
michael@0 | 150 | // RMS which must be exceeded to conclude frame is speech. |
michael@0 | 151 | float decision_threshold_; |
michael@0 | 152 | |
michael@0 | 153 | // Flag to indicate that audio should be used to estimate environment, prior |
michael@0 | 154 | // to receiving user input. |
michael@0 | 155 | bool estimating_environment_; |
michael@0 | 156 | |
michael@0 | 157 | // Estimate of the background noise level. Used externally for UI feedback. |
michael@0 | 158 | float noise_level_; |
michael@0 | 159 | |
michael@0 | 160 | // An adaptive threshold used to update decision_threshold_ when appropriate. |
michael@0 | 161 | float rms_adapt_; |
michael@0 | 162 | |
michael@0 | 163 | // Start lag corresponds to the highest fundamental frequency. |
michael@0 | 164 | int start_lag_; |
michael@0 | 165 | |
michael@0 | 166 | // End lag corresponds to the lowest fundamental frequency. |
michael@0 | 167 | int end_lag_; |
michael@0 | 168 | |
michael@0 | 169 | // Time when mode switched from environment estimation to user input. This |
michael@0 | 170 | // is used to time forced rejection of audio feedback contamination. |
michael@0 | 171 | int64_t user_input_start_time_us_; |
michael@0 | 172 | |
michael@0 | 173 | // Copy constructor and assignment operator are declared private (declaration only here) to prevent copying. |
michael@0 | 174 | EnergyEndpointer(const EnergyEndpointer&); |
michael@0 | 175 | void operator=(const EnergyEndpointer&); |
michael@0 | 176 | }; |
michael@0 | 177 | |
michael@0 | 178 | } // namespace mozilla |
michael@0 | 179 | |
michael@0 | 180 | #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |