|
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
|
2 // |
|
3 // Redistribution and use in source and binary forms, with or without |
|
4 // modification, are permitted provided that the following conditions are |
|
5 // met: |
|
6 // |
|
7 // * Redistributions of source code must retain the above copyright |
|
8 // notice, this list of conditions and the following disclaimer. |
|
9 // * Redistributions in binary form must reproduce the above |
|
10 // copyright notice, this list of conditions and the following disclaimer |
|
11 // in the documentation and/or other materials provided with the |
|
12 // distribution. |
|
13 // * Neither the name of Google Inc. nor the names of its |
|
14 // contributors may be used to endorse or promote products derived from |
|
15 // this software without specific prior written permission. |
|
16 // |
|
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
|
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
28 |
|
29 #include "energy_endpointer.h" |
|
30 |
|
31 #include <math.h> |
|
32 |
|
33 namespace { |
|
34 |
|
// Returns the mean-removed RMS of |samples|, i.e. sqrt(E[x^2] - E[x]^2).
// Because the mean is subtracted, a constant (pure DC) signal yields 0.
//
// |samples|      Pointer to |num_samples| 16-bit PCM values.
// |num_samples|  Number of values to read.
//
// Returns 0 when there is nothing to measure (null pointer or a non-positive
// count) instead of dividing by zero, so that a NaN can never leak into the
// caller's adaptive level estimates.
float RMS(const int16_t* samples, int num_samples) {
  if (!samples || num_samples <= 0)
    return 0.0f;

  // Accumulate exactly in 64-bit integers; a single squared sample is at most
  // 2^30, so the running totals cannot overflow for any int-sized count.
  int64_t ssq_int64_t = 0;
  int64_t sum_int64_t = 0;
  for (int i = 0; i < num_samples; ++i) {
    const int64_t sample = samples[i];
    sum_int64_t += sample;
    ssq_int64_t += sample * sample;
  }

  // Now convert to floating point.
  const double mean = static_cast<double>(sum_int64_t) / num_samples;
  const double mean_square = static_cast<double>(ssq_int64_t) / num_samples;
  const double variance = mean_square - (mean * mean);

  // Rounding can push the difference marginally below zero; sqrt() of a
  // negative value would be NaN, so clamp at zero.
  if (variance <= 0.0)
    return 0.0f;
  return static_cast<float>(sqrt(variance));
}
|
49 |
|
// Converts a duration in seconds to whole microseconds. The value is scaled
// in double precision, 0.5 is added and the result is truncated toward zero,
// which rounds non-negative inputs to the nearest microsecond.
int64_t Secs2Usecs(float seconds) {
  const double microseconds = 1.0e6 * seconds;
  return static_cast<int64_t>(microseconds + 0.5);
}
|
53 |
|
// Converts a linear amplitude to decibels (20 * log10(value)).
// Any input that is not greater than 1.0e-100 (zero, negative values, NaN)
// has no usable logarithm and is reported as the sentinel -2000.
float GetDecibel(float value) {
  if (!(value > 1.0e-100))
    return -2000.0f;
  return 20 * log10(value);
}
|
59 |
|
60 } // namespace |
|
61 |
|
62 namespace mozilla { |
|
63 |
|
64 // Stores threshold-crossing histories for making decisions about the speech |
|
65 // state. |
|
66 class EnergyEndpointer::HistoryRing { |
|
67 public: |
|
68 HistoryRing() : insertion_index_(0) {} |
|
69 |
|
70 // Resets the ring to |size| elements each with state |initial_state| |
|
71 void SetRing(int size, bool initial_state); |
|
72 |
|
73 // Inserts a new entry into the ring and drops the oldest entry. |
|
74 void Insert(int64_t time_us, bool decision); |
|
75 |
|
76 // Returns the time in microseconds of the most recently added entry. |
|
77 int64_t EndTime() const; |
|
78 |
|
79 // Returns the sum of all intervals during which 'decision' is true within |
|
80 // the time in seconds specified by 'duration'. The returned interval is |
|
81 // in seconds. |
|
82 float RingSum(float duration_sec); |
|
83 |
|
84 private: |
|
85 struct DecisionPoint { |
|
86 int64_t time_us; |
|
87 bool decision; |
|
88 }; |
|
89 |
|
90 std::vector<DecisionPoint> decision_points_; |
|
91 int insertion_index_; // Index at which the next item gets added/inserted. |
|
92 |
|
93 HistoryRing(const HistoryRing&); |
|
94 void operator=(const HistoryRing&); |
|
95 }; |
|
96 |
|
97 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { |
|
98 insertion_index_ = 0; |
|
99 decision_points_.clear(); |
|
100 DecisionPoint init = { -1, initial_state }; |
|
101 decision_points_.resize(size, init); |
|
102 } |
|
103 |
|
104 void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) { |
|
105 decision_points_[insertion_index_].time_us = time_us; |
|
106 decision_points_[insertion_index_].decision = decision; |
|
107 insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); |
|
108 } |
|
109 |
|
110 int64_t EnergyEndpointer::HistoryRing::EndTime() const { |
|
111 int ind = insertion_index_ - 1; |
|
112 if (ind < 0) |
|
113 ind = decision_points_.size() - 1; |
|
114 return decision_points_[ind].time_us; |
|
115 } |
|
116 |
|
117 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { |
|
118 if (!decision_points_.size()) |
|
119 return 0.0; |
|
120 |
|
121 int64_t sum_us = 0; |
|
122 int ind = insertion_index_ - 1; |
|
123 if (ind < 0) |
|
124 ind = decision_points_.size() - 1; |
|
125 int64_t end_us = decision_points_[ind].time_us; |
|
126 bool is_on = decision_points_[ind].decision; |
|
127 int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec)); |
|
128 if (start_us < 0) |
|
129 start_us = 0; |
|
130 size_t n_summed = 1; // n points ==> (n-1) intervals |
|
131 while ((decision_points_[ind].time_us > start_us) && |
|
132 (n_summed < decision_points_.size())) { |
|
133 --ind; |
|
134 if (ind < 0) |
|
135 ind = decision_points_.size() - 1; |
|
136 if (is_on) |
|
137 sum_us += end_us - decision_points_[ind].time_us; |
|
138 is_on = decision_points_[ind].decision; |
|
139 end_us = decision_points_[ind].time_us; |
|
140 n_summed++; |
|
141 } |
|
142 |
|
143 return 1.0e-6f * sum_us; // Returns total time that was super threshold. |
|
144 } |
|
145 |
|
146 EnergyEndpointer::EnergyEndpointer() |
|
147 : status_(EP_PRE_SPEECH), |
|
148 offset_confirm_dur_sec_(0), |
|
149 endpointer_time_us_(0), |
|
150 fast_update_frames_(0), |
|
151 frame_counter_(0), |
|
152 max_window_dur_(4.0), |
|
153 sample_rate_(0), |
|
154 history_(new HistoryRing()), |
|
155 decision_threshold_(0), |
|
156 estimating_environment_(false), |
|
157 noise_level_(0), |
|
158 rms_adapt_(0), |
|
159 start_lag_(0), |
|
160 end_lag_(0), |
|
161 user_input_start_time_us_(0) { |
|
162 } |
|
163 |
|
164 EnergyEndpointer::~EnergyEndpointer() { |
|
165 } |
|
166 |
|
167 int EnergyEndpointer::TimeToFrame(float time) const { |
|
168 return static_cast<int32_t>(0.5 + (time / params_.frame_period())); |
|
169 } |
|
170 |
|
171 void EnergyEndpointer::Restart(bool reset_threshold) { |
|
172 status_ = EP_PRE_SPEECH; |
|
173 user_input_start_time_us_ = 0; |
|
174 |
|
175 if (reset_threshold) { |
|
176 decision_threshold_ = params_.decision_threshold(); |
|
177 rms_adapt_ = decision_threshold_; |
|
178 noise_level_ = params_.decision_threshold() / 2.0f; |
|
179 frame_counter_ = 0; // Used for rapid initial update of levels. |
|
180 } |
|
181 |
|
182 // Set up the memories to hold the history windows. |
|
183 history_->SetRing(TimeToFrame(max_window_dur_), false); |
|
184 |
|
185 // Flag that indicates that current input should be used for |
|
186 // estimating the environment. The user has not yet started input |
|
187 // by e.g. pressed the push-to-talk button. By default, this is |
|
188 // false for backward compatibility. |
|
189 estimating_environment_ = false; |
|
190 } |
|
191 |
|
192 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { |
|
193 params_ = params; |
|
194 |
|
195 // Find the longest history interval to be used, and make the ring |
|
196 // large enough to accommodate that number of frames. NOTE: This |
|
197 // depends upon ep_frame_period being set correctly in the factory |
|
198 // that did this instantiation. |
|
199 max_window_dur_ = params_.onset_window(); |
|
200 if (params_.speech_on_window() > max_window_dur_) |
|
201 max_window_dur_ = params_.speech_on_window(); |
|
202 if (params_.offset_window() > max_window_dur_) |
|
203 max_window_dur_ = params_.offset_window(); |
|
204 Restart(true); |
|
205 |
|
206 offset_confirm_dur_sec_ = params_.offset_window() - |
|
207 params_.offset_confirm_dur(); |
|
208 if (offset_confirm_dur_sec_ < 0.0) |
|
209 offset_confirm_dur_sec_ = 0.0; |
|
210 |
|
211 user_input_start_time_us_ = 0; |
|
212 |
|
213 // Flag that indicates that current input should be used for |
|
214 // estimating the environment. The user has not yet started input |
|
215 // by e.g. pressed the push-to-talk button. By default, this is |
|
216 // false for backward compatibility. |
|
217 estimating_environment_ = false; |
|
218 // The initial value of the noise and speech levels is inconsequential. |
|
219 // The level of the first frame will overwrite these values. |
|
220 noise_level_ = params_.decision_threshold() / 2.0f; |
|
221 fast_update_frames_ = |
|
222 static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period()); |
|
223 |
|
224 frame_counter_ = 0; // Used for rapid initial update of levels. |
|
225 |
|
226 sample_rate_ = params_.sample_rate(); |
|
227 start_lag_ = static_cast<int>(sample_rate_ / |
|
228 params_.max_fundamental_frequency()); |
|
229 end_lag_ = static_cast<int>(sample_rate_ / |
|
230 params_.min_fundamental_frequency()); |
|
231 } |
|
232 |
|
233 void EnergyEndpointer::StartSession() { |
|
234 Restart(true); |
|
235 } |
|
236 |
|
237 void EnergyEndpointer::EndSession() { |
|
238 status_ = EP_POST_SPEECH; |
|
239 } |
|
240 |
|
241 void EnergyEndpointer::SetEnvironmentEstimationMode() { |
|
242 Restart(true); |
|
243 estimating_environment_ = true; |
|
244 } |
|
245 |
|
246 void EnergyEndpointer::SetUserInputMode() { |
|
247 estimating_environment_ = false; |
|
248 user_input_start_time_us_ = endpointer_time_us_; |
|
249 } |
|
250 |
|
251 void EnergyEndpointer::ProcessAudioFrame(int64_t time_us, |
|
252 const int16_t* samples, |
|
253 int num_samples, |
|
254 float* rms_out) { |
|
255 endpointer_time_us_ = time_us; |
|
256 float rms = RMS(samples, num_samples); |
|
257 |
|
258 // Check that this is user input audio vs. pre-input adaptation audio. |
|
259 // Input audio starts when the user indicates start of input, by e.g. |
|
260 // pressing push-to-talk. Audio recieved prior to that is used to update |
|
261 // noise and speech level estimates. |
|
262 if (!estimating_environment_) { |
|
263 bool decision = false; |
|
264 if ((endpointer_time_us_ - user_input_start_time_us_) < |
|
265 Secs2Usecs(params_.contamination_rejection_period())) { |
|
266 decision = false; |
|
267 //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_)); |
|
268 } else { |
|
269 decision = (rms > decision_threshold_); |
|
270 } |
|
271 |
|
272 history_->Insert(endpointer_time_us_, decision); |
|
273 |
|
274 switch (status_) { |
|
275 case EP_PRE_SPEECH: |
|
276 if (history_->RingSum(params_.onset_window()) > |
|
277 params_.onset_detect_dur()) { |
|
278 status_ = EP_POSSIBLE_ONSET; |
|
279 } |
|
280 break; |
|
281 |
|
282 case EP_POSSIBLE_ONSET: { |
|
283 float tsum = history_->RingSum(params_.onset_window()); |
|
284 if (tsum > params_.onset_confirm_dur()) { |
|
285 status_ = EP_SPEECH_PRESENT; |
|
286 } else { // If signal is not maintained, drop back to pre-speech. |
|
287 if (tsum <= params_.onset_detect_dur()) |
|
288 status_ = EP_PRE_SPEECH; |
|
289 } |
|
290 break; |
|
291 } |
|
292 |
|
293 case EP_SPEECH_PRESENT: { |
|
294 // To induce hysteresis in the state residency, we allow a |
|
295 // smaller residency time in the on_ring, than was required to |
|
296 // enter the SPEECH_PERSENT state. |
|
297 float on_time = history_->RingSum(params_.speech_on_window()); |
|
298 if (on_time < params_.on_maintain_dur()) |
|
299 status_ = EP_POSSIBLE_OFFSET; |
|
300 break; |
|
301 } |
|
302 |
|
303 case EP_POSSIBLE_OFFSET: |
|
304 if (history_->RingSum(params_.offset_window()) <= |
|
305 offset_confirm_dur_sec_) { |
|
306 // Note that this offset time may be beyond the end |
|
307 // of the input buffer in a real-time system. It will be up |
|
308 // to the RecognizerSession to decide what to do. |
|
309 status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. |
|
310 } else { // If speech picks up again we allow return to SPEECH_PRESENT. |
|
311 if (history_->RingSum(params_.speech_on_window()) >= |
|
312 params_.on_maintain_dur()) |
|
313 status_ = EP_SPEECH_PRESENT; |
|
314 } |
|
315 break; |
|
316 |
|
317 default: |
|
318 break; |
|
319 } |
|
320 |
|
321 // If this is a quiet, non-speech region, slowly adapt the detection |
|
322 // threshold to be about 6dB above the average RMS. |
|
323 if ((!decision) && (status_ == EP_PRE_SPEECH)) { |
|
324 decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); |
|
325 rms_adapt_ = decision_threshold_; |
|
326 } else { |
|
327 // If this is in a speech region, adapt the decision threshold to |
|
328 // be about 10dB below the average RMS. If the noise level is high, |
|
329 // the threshold is pushed up. |
|
330 // Adaptation up to a higher level is 5 times faster than decay to |
|
331 // a lower level. |
|
332 if ((status_ == EP_SPEECH_PRESENT) && decision) { |
|
333 if (rms_adapt_ > rms) { |
|
334 rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); |
|
335 } else { |
|
336 rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); |
|
337 } |
|
338 float target_threshold = 0.3f * rms_adapt_ + noise_level_; |
|
339 decision_threshold_ = (.90f * decision_threshold_) + |
|
340 (0.10f * target_threshold); |
|
341 } |
|
342 } |
|
343 |
|
344 // Set a floor |
|
345 if (decision_threshold_ < params_.min_decision_threshold()) |
|
346 decision_threshold_ = params_.min_decision_threshold(); |
|
347 } |
|
348 |
|
349 // Update speech and noise levels. |
|
350 UpdateLevels(rms); |
|
351 ++frame_counter_; |
|
352 |
|
353 if (rms_out) |
|
354 *rms_out = GetDecibel(rms); |
|
355 } |
|
356 |
|
357 float EnergyEndpointer::GetNoiseLevelDb() const { |
|
358 return GetDecibel(noise_level_); |
|
359 } |
|
360 |
|
361 void EnergyEndpointer::UpdateLevels(float rms) { |
|
362 // Update quickly initially. We assume this is noise and that |
|
363 // speech is 6dB above the noise. |
|
364 if (frame_counter_ < fast_update_frames_) { |
|
365 // Alpha increases from 0 to (k-1)/k where k is the number of time |
|
366 // steps in the initial adaptation period. |
|
367 float alpha = static_cast<float>(frame_counter_) / |
|
368 static_cast<float>(fast_update_frames_); |
|
369 noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); |
|
370 //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_)); |
|
371 } else { |
|
372 // Update Noise level. The noise level adapts quickly downward, but |
|
373 // slowly upward. The noise_level_ parameter is not currently used |
|
374 // for threshold adaptation. It is used for UI feedback. |
|
375 if (noise_level_ < rms) |
|
376 noise_level_ = (0.999f * noise_level_) + (0.001f * rms); |
|
377 else |
|
378 noise_level_ = (0.95f * noise_level_) + (0.05f * rms); |
|
379 } |
|
380 if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { |
|
381 decision_threshold_ = noise_level_ * 2; // 6dB above noise level. |
|
382 // Set a floor |
|
383 if (decision_threshold_ < params_.min_decision_threshold()) |
|
384 decision_threshold_ = params_.min_decision_threshold(); |
|
385 } |
|
386 } |
|
387 |
|
388 EpStatus EnergyEndpointer::Status(int64_t* status_time) const { |
|
389 *status_time = history_->EndTime(); |
|
390 return status_; |
|
391 } |
|
392 |
|
393 } // namespace mozilla |