|
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
|
2 // |
|
3 // Redistribution and use in source and binary forms, with or without |
|
4 // modification, are permitted provided that the following conditions are |
|
5 // met: |
|
6 // |
|
7 // * Redistributions of source code must retain the above copyright |
|
8 // notice, this list of conditions and the following disclaimer. |
|
9 // * Redistributions in binary form must reproduce the above |
|
10 // copyright notice, this list of conditions and the following disclaimer |
|
11 // in the documentation and/or other materials provided with the |
|
12 // distribution. |
|
13 // * Neither the name of Google Inc. nor the names of its |
|
14 // contributors may be used to endorse or promote products derived from |
|
15 // this software without specific prior written permission. |
|
16 // |
|
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
|
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
28 |
|
29 // The EnergyEndpointer class finds likely speech onset and offset points. |
|
30 // |
|
31 // The implementation described here is about the simplest possible. |
|
32 // It is based on timings of threshold crossings for overall signal |
|
33 // RMS. It is suitable for light weight applications. |
|
34 // |
|
35 // As written, the basic idea is that one specifies intervals that |
|
36 // must be occupied by super- and sub-threshold energy levels, and |
|
37 // defers decisions re onset and offset times until these |
|
38 // specifications have been met. Three basic intervals are tested: an |
|
39 // onset window, a speech-on window, and an offset window. We require |
|
40 // super-threshold to exceed some minimum total durations in the onset |
|
41 // and speech-on windows before declaring the speech onset time, and |
|
42 // we specify a required sub-threshold residency in the offset window |
|
43 // before declaring speech offset. As the various residency requirements are |
|
44 // met, the EnergyEndpointer instance assumes various states, and can return the |
|
45 // ID of these states to the client (see EpStatus below). |
|
46 // |
|
47 // The levels of the speech and background noise are continuously updated. It is |
|
48 // important that the background noise level be estimated initially for |
|
49 // robustness in noisy conditions. The first frames are assumed to be background |
|
50 // noise and a fast update rate is used for the noise level. The duration for |
|
51 // fast update is controlled by the fast_update_dur_ parameter. |
|
52 // |
|
53 // If used in noisy conditions, the endpointer should be started and run in the |
|
54 // EnvironmentEstimation mode, for at least 200ms, before switching to |
|
55 // UserInputMode. |
|
56 // Audio feedback contamination can appear in the input audio, if not cut |
|
57 // out or handled by echo cancellation. Audio feedback can trigger a false |
|
58 // accept. The false accepts can be ignored by setting |
|
59 // ep_contamination_rejection_period. |
|
60 |
|
61 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
|
62 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |
|
63 |
|
64 #include <vector> |
|
65 |
|
66 #include "nsAutoPtr.h" |
|
67 |
|
68 #include "energy_endpointer_params.h" |
|
69 |
|
70 namespace mozilla { |
|
71 |
|
// Status codes reported by the endpointer. The numbering starts at 10 and
// increases by one per enumerator; the values are written out explicitly so
// that the numeric codes are visible at a glance.
// NOTE(review): the enumerators appear to be listed in the order a session
// moves through them (before speech -> onset -> speech -> offset -> after
// speech) -- confirm against the state machine in the implementation.
enum EpStatus {
  EP_PRE_SPEECH      = 10,
  EP_POSSIBLE_ONSET  = 11,
  EP_SPEECH_PRESENT  = 12,
  EP_POSSIBLE_OFFSET = 13,
  EP_POST_SPEECH     = 14,
};
|
80 |
|
81 class EnergyEndpointer { |
|
82 public: |
|
83 // The default construction MUST be followed by Init(), before any |
|
84 // other use can be made of the instance. |
|
85 EnergyEndpointer(); |
|
86 virtual ~EnergyEndpointer(); |
|
87 |
|
88 void Init(const EnergyEndpointerParams& params); |
|
89 |
|
90 // Start the endpointer. This should be called at the beginning of a session. |
|
91 void StartSession(); |
|
92 |
|
93 // Stop the endpointer. |
|
94 void EndSession(); |
|
95 |
|
96 // Start environment estimation. Audio will be used for environment estimation |
|
97 // i.e. noise level estimation. |
|
98 void SetEnvironmentEstimationMode(); |
|
99 |
|
100 // Start user input. This should be called when the user indicates start of |
|
101 // input, e.g. by pressing a button. |
|
102 void SetUserInputMode(); |
|
103 |
|
104 // Computes the next input frame and modifies EnergyEndpointer status as |
|
105 // appropriate based on the computation. |
|
106 void ProcessAudioFrame(int64_t time_us, |
|
107 const int16_t* samples, int num_samples, |
|
108 float* rms_out); |
|
109 |
|
110 // Returns the current state of the EnergyEndpointer and the time |
|
111 // corresponding to the most recently computed frame. |
|
112 EpStatus Status(int64_t* status_time_us) const; |
|
113 |
|
114 bool estimating_environment() const { |
|
115 return estimating_environment_; |
|
116 } |
|
117 |
|
118 // Returns estimated noise level in dB. |
|
119 float GetNoiseLevelDb() const; |
|
120 |
|
121 private: |
|
122 class HistoryRing; |
|
123 |
|
124 // Resets the endpointer internal state. If reset_threshold is true, the |
|
125 // state will be reset completely, including adaptive thresholds and the |
|
126 // removal of all history information. |
|
127 void Restart(bool reset_threshold); |
|
128 |
|
129 // Update internal speech and noise levels. |
|
130 void UpdateLevels(float rms); |
|
131 |
|
132 // Returns the number of frames (or frame number) corresponding to |
|
133 // the 'time' (in seconds). |
|
134 int TimeToFrame(float time) const; |
|
135 |
|
136 EpStatus status_; // The current state of this instance. |
|
137 float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH |
|
138 int64_t endpointer_time_us_; // Time of the most recently received audio frame. |
|
139 int64_t fast_update_frames_; // Number of frames for initial level adaptation. |
|
140 int64_t frame_counter_; // Number of frames seen. Used for initial adaptation. |
|
141 float max_window_dur_; // Largest search window size (seconds) |
|
142 float sample_rate_; // Sampling rate. |
|
143 |
|
144 // Ring buffers to hold the speech activity history. |
|
145 nsAutoPtr<HistoryRing> history_; |
|
146 |
|
147 // Configuration parameters. |
|
148 EnergyEndpointerParams params_; |
|
149 |
|
150 // RMS which must be exceeded to conclude frame is speech. |
|
151 float decision_threshold_; |
|
152 |
|
153 // Flag to indicate that audio should be used to estimate environment, prior |
|
154 // to receiving user input. |
|
155 bool estimating_environment_; |
|
156 |
|
157 // Estimate of the background noise level. Used externally for UI feedback. |
|
158 float noise_level_; |
|
159 |
|
160 // An adaptive threshold used to update decision_threshold_ when appropriate. |
|
161 float rms_adapt_; |
|
162 |
|
163 // Start lag corresponds to the highest fundamental frequency. |
|
164 int start_lag_; |
|
165 |
|
166 // End lag corresponds to the lowest fundamental frequency. |
|
167 int end_lag_; |
|
168 |
|
169 // Time when mode switched from environment estimation to user input. This |
|
170 // is used to time forced rejection of audio feedback contamination. |
|
171 int64_t user_input_start_time_us_; |
|
172 |
|
173 // prevent copy constructor and assignment |
|
174 EnergyEndpointer(const EnergyEndpointer&); |
|
175 void operator=(const EnergyEndpointer&); |
|
176 }; |
|
177 |
|
178 } // namespace mozilla |
|
179 |
|
180 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ |