Fri, 16 Jan 2015 04:50:19 +0100
Replace accessor implementation with direct member state manipulation, by
request https://trac.torproject.org/projects/tor/ticket/9701#comment:32
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 //
3 // Redistribution and use in source and binary forms, with or without
4 // modification, are permitted provided that the following conditions are
5 // met:
6 //
7 // * Redistributions of source code must retain the above copyright
8 // notice, this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above
10 // copyright notice, this list of conditions and the following disclaimer
11 // in the documentation and/or other materials provided with the
12 // distribution.
13 // * Neither the name of Google Inc. nor the names of its
14 // contributors may be used to endorse or promote products derived from
15 // this software without specific prior written permission.
16 //
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include "energy_endpointer.h"
31 #include <math.h>
33 namespace {
// Returns the RMS level of |samples| measured about their mean, i.e. the
// standard deviation sqrt(E[x^2] - E[x]^2), so a constant (DC) offset in the
// signal contributes nothing to the result.
//
// |samples|     - pointer to |num_samples| 16-bit PCM values.
// |num_samples| - number of values to read.
//
// Returns 0 when there is nothing to measure (null pointer or a non-positive
// count) instead of dividing by zero and producing NaN.
float RMS(const int16_t* samples, int num_samples) {
  if (!samples || num_samples <= 0)
    return 0.0f;

  int64_t sum = 0;
  int64_t sum_of_squares = 0;
  for (int i = 0; i < num_samples; ++i) {
    // Widen first; the largest square, (-32768)^2 = 2^30, fits in 32 bits.
    const int32_t sample = samples[i];
    sum += sample;
    sum_of_squares += sample * sample;
  }

  // Switch to floating point only for the final statistics.
  const double mean = static_cast<double>(sum) / num_samples;
  const double variance =
      (static_cast<double>(sum_of_squares) / num_samples) - (mean * mean);

  // Rounding can leave a mathematically non-negative variance slightly below
  // zero; clamp so that sqrt() never yields NaN.
  if (variance <= 0.0)
    return 0.0f;
  return static_cast<float>(sqrt(variance));
}
// Converts a duration in seconds to whole microseconds. The 0.5 added before
// the truncating cast rounds non-negative inputs to the nearest microsecond.
int64_t Secs2Usecs(float seconds) {
  const double microseconds = 1.0e6 * seconds;
  return static_cast<int64_t>(0.5 + microseconds);
}
// Converts a linear amplitude to decibels (20 * log10).
//
// Inputs that fail the positivity test (zero, negative or NaN) yield the
// sentinel -2000 rather than a non-finite logarithm. Note that the 1.0e-100
// cut-off is smaller than any positive float, so in practice the test is
// simply "value > 0".
float GetDecibel(float value) {
  if (!(value > 1.0e-100))
    return -2000.0f;
  return 20 * log10(value);
}
60 } // namespace
62 namespace mozilla {
64 // Stores threshold-crossing histories for making decisions about the speech
65 // state.
66 class EnergyEndpointer::HistoryRing {
67 public:
68 HistoryRing() : insertion_index_(0) {}
70 // Resets the ring to |size| elements each with state |initial_state|
71 void SetRing(int size, bool initial_state);
73 // Inserts a new entry into the ring and drops the oldest entry.
74 void Insert(int64_t time_us, bool decision);
76 // Returns the time in microseconds of the most recently added entry.
77 int64_t EndTime() const;
79 // Returns the sum of all intervals during which 'decision' is true within
80 // the time in seconds specified by 'duration'. The returned interval is
81 // in seconds.
82 float RingSum(float duration_sec);
84 private:
85 struct DecisionPoint {
86 int64_t time_us;
87 bool decision;
88 };
90 std::vector<DecisionPoint> decision_points_;
91 int insertion_index_; // Index at which the next item gets added/inserted.
93 HistoryRing(const HistoryRing&);
94 void operator=(const HistoryRing&);
95 };
97 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
98 insertion_index_ = 0;
99 decision_points_.clear();
100 DecisionPoint init = { -1, initial_state };
101 decision_points_.resize(size, init);
102 }
104 void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {
105 decision_points_[insertion_index_].time_us = time_us;
106 decision_points_[insertion_index_].decision = decision;
107 insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
108 }
110 int64_t EnergyEndpointer::HistoryRing::EndTime() const {
111 int ind = insertion_index_ - 1;
112 if (ind < 0)
113 ind = decision_points_.size() - 1;
114 return decision_points_[ind].time_us;
115 }
117 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
118 if (!decision_points_.size())
119 return 0.0;
121 int64_t sum_us = 0;
122 int ind = insertion_index_ - 1;
123 if (ind < 0)
124 ind = decision_points_.size() - 1;
125 int64_t end_us = decision_points_[ind].time_us;
126 bool is_on = decision_points_[ind].decision;
127 int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));
128 if (start_us < 0)
129 start_us = 0;
130 size_t n_summed = 1; // n points ==> (n-1) intervals
131 while ((decision_points_[ind].time_us > start_us) &&
132 (n_summed < decision_points_.size())) {
133 --ind;
134 if (ind < 0)
135 ind = decision_points_.size() - 1;
136 if (is_on)
137 sum_us += end_us - decision_points_[ind].time_us;
138 is_on = decision_points_[ind].decision;
139 end_us = decision_points_[ind].time_us;
140 n_summed++;
141 }
143 return 1.0e-6f * sum_us; // Returns total time that was super threshold.
144 }
146 EnergyEndpointer::EnergyEndpointer()
147 : status_(EP_PRE_SPEECH),
148 offset_confirm_dur_sec_(0),
149 endpointer_time_us_(0),
150 fast_update_frames_(0),
151 frame_counter_(0),
152 max_window_dur_(4.0),
153 sample_rate_(0),
154 history_(new HistoryRing()),
155 decision_threshold_(0),
156 estimating_environment_(false),
157 noise_level_(0),
158 rms_adapt_(0),
159 start_lag_(0),
160 end_lag_(0),
161 user_input_start_time_us_(0) {
162 }
164 EnergyEndpointer::~EnergyEndpointer() {
165 }
167 int EnergyEndpointer::TimeToFrame(float time) const {
168 return static_cast<int32_t>(0.5 + (time / params_.frame_period()));
169 }
171 void EnergyEndpointer::Restart(bool reset_threshold) {
172 status_ = EP_PRE_SPEECH;
173 user_input_start_time_us_ = 0;
175 if (reset_threshold) {
176 decision_threshold_ = params_.decision_threshold();
177 rms_adapt_ = decision_threshold_;
178 noise_level_ = params_.decision_threshold() / 2.0f;
179 frame_counter_ = 0; // Used for rapid initial update of levels.
180 }
182 // Set up the memories to hold the history windows.
183 history_->SetRing(TimeToFrame(max_window_dur_), false);
185 // Flag that indicates that current input should be used for
186 // estimating the environment. The user has not yet started input
187 // by e.g. pressed the push-to-talk button. By default, this is
188 // false for backward compatibility.
189 estimating_environment_ = false;
190 }
192 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
193 params_ = params;
195 // Find the longest history interval to be used, and make the ring
196 // large enough to accommodate that number of frames. NOTE: This
197 // depends upon ep_frame_period being set correctly in the factory
198 // that did this instantiation.
199 max_window_dur_ = params_.onset_window();
200 if (params_.speech_on_window() > max_window_dur_)
201 max_window_dur_ = params_.speech_on_window();
202 if (params_.offset_window() > max_window_dur_)
203 max_window_dur_ = params_.offset_window();
204 Restart(true);
206 offset_confirm_dur_sec_ = params_.offset_window() -
207 params_.offset_confirm_dur();
208 if (offset_confirm_dur_sec_ < 0.0)
209 offset_confirm_dur_sec_ = 0.0;
211 user_input_start_time_us_ = 0;
213 // Flag that indicates that current input should be used for
214 // estimating the environment. The user has not yet started input
215 // by e.g. pressed the push-to-talk button. By default, this is
216 // false for backward compatibility.
217 estimating_environment_ = false;
218 // The initial value of the noise and speech levels is inconsequential.
219 // The level of the first frame will overwrite these values.
220 noise_level_ = params_.decision_threshold() / 2.0f;
221 fast_update_frames_ =
222 static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());
224 frame_counter_ = 0; // Used for rapid initial update of levels.
226 sample_rate_ = params_.sample_rate();
227 start_lag_ = static_cast<int>(sample_rate_ /
228 params_.max_fundamental_frequency());
229 end_lag_ = static_cast<int>(sample_rate_ /
230 params_.min_fundamental_frequency());
231 }
233 void EnergyEndpointer::StartSession() {
234 Restart(true);
235 }
237 void EnergyEndpointer::EndSession() {
238 status_ = EP_POST_SPEECH;
239 }
241 void EnergyEndpointer::SetEnvironmentEstimationMode() {
242 Restart(true);
243 estimating_environment_ = true;
244 }
246 void EnergyEndpointer::SetUserInputMode() {
247 estimating_environment_ = false;
248 user_input_start_time_us_ = endpointer_time_us_;
249 }
251 void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
252 const int16_t* samples,
253 int num_samples,
254 float* rms_out) {
255 endpointer_time_us_ = time_us;
256 float rms = RMS(samples, num_samples);
258 // Check that this is user input audio vs. pre-input adaptation audio.
259 // Input audio starts when the user indicates start of input, by e.g.
260 // pressing push-to-talk. Audio recieved prior to that is used to update
261 // noise and speech level estimates.
262 if (!estimating_environment_) {
263 bool decision = false;
264 if ((endpointer_time_us_ - user_input_start_time_us_) <
265 Secs2Usecs(params_.contamination_rejection_period())) {
266 decision = false;
267 //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
268 } else {
269 decision = (rms > decision_threshold_);
270 }
272 history_->Insert(endpointer_time_us_, decision);
274 switch (status_) {
275 case EP_PRE_SPEECH:
276 if (history_->RingSum(params_.onset_window()) >
277 params_.onset_detect_dur()) {
278 status_ = EP_POSSIBLE_ONSET;
279 }
280 break;
282 case EP_POSSIBLE_ONSET: {
283 float tsum = history_->RingSum(params_.onset_window());
284 if (tsum > params_.onset_confirm_dur()) {
285 status_ = EP_SPEECH_PRESENT;
286 } else { // If signal is not maintained, drop back to pre-speech.
287 if (tsum <= params_.onset_detect_dur())
288 status_ = EP_PRE_SPEECH;
289 }
290 break;
291 }
293 case EP_SPEECH_PRESENT: {
294 // To induce hysteresis in the state residency, we allow a
295 // smaller residency time in the on_ring, than was required to
296 // enter the SPEECH_PERSENT state.
297 float on_time = history_->RingSum(params_.speech_on_window());
298 if (on_time < params_.on_maintain_dur())
299 status_ = EP_POSSIBLE_OFFSET;
300 break;
301 }
303 case EP_POSSIBLE_OFFSET:
304 if (history_->RingSum(params_.offset_window()) <=
305 offset_confirm_dur_sec_) {
306 // Note that this offset time may be beyond the end
307 // of the input buffer in a real-time system. It will be up
308 // to the RecognizerSession to decide what to do.
309 status_ = EP_PRE_SPEECH; // Automatically reset for next utterance.
310 } else { // If speech picks up again we allow return to SPEECH_PRESENT.
311 if (history_->RingSum(params_.speech_on_window()) >=
312 params_.on_maintain_dur())
313 status_ = EP_SPEECH_PRESENT;
314 }
315 break;
317 default:
318 break;
319 }
321 // If this is a quiet, non-speech region, slowly adapt the detection
322 // threshold to be about 6dB above the average RMS.
323 if ((!decision) && (status_ == EP_PRE_SPEECH)) {
324 decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
325 rms_adapt_ = decision_threshold_;
326 } else {
327 // If this is in a speech region, adapt the decision threshold to
328 // be about 10dB below the average RMS. If the noise level is high,
329 // the threshold is pushed up.
330 // Adaptation up to a higher level is 5 times faster than decay to
331 // a lower level.
332 if ((status_ == EP_SPEECH_PRESENT) && decision) {
333 if (rms_adapt_ > rms) {
334 rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
335 } else {
336 rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
337 }
338 float target_threshold = 0.3f * rms_adapt_ + noise_level_;
339 decision_threshold_ = (.90f * decision_threshold_) +
340 (0.10f * target_threshold);
341 }
342 }
344 // Set a floor
345 if (decision_threshold_ < params_.min_decision_threshold())
346 decision_threshold_ = params_.min_decision_threshold();
347 }
349 // Update speech and noise levels.
350 UpdateLevels(rms);
351 ++frame_counter_;
353 if (rms_out)
354 *rms_out = GetDecibel(rms);
355 }
357 float EnergyEndpointer::GetNoiseLevelDb() const {
358 return GetDecibel(noise_level_);
359 }
361 void EnergyEndpointer::UpdateLevels(float rms) {
362 // Update quickly initially. We assume this is noise and that
363 // speech is 6dB above the noise.
364 if (frame_counter_ < fast_update_frames_) {
365 // Alpha increases from 0 to (k-1)/k where k is the number of time
366 // steps in the initial adaptation period.
367 float alpha = static_cast<float>(frame_counter_) /
368 static_cast<float>(fast_update_frames_);
369 noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
370 //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
371 } else {
372 // Update Noise level. The noise level adapts quickly downward, but
373 // slowly upward. The noise_level_ parameter is not currently used
374 // for threshold adaptation. It is used for UI feedback.
375 if (noise_level_ < rms)
376 noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
377 else
378 noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
379 }
380 if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
381 decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
382 // Set a floor
383 if (decision_threshold_ < params_.min_decision_threshold())
384 decision_threshold_ = params_.min_decision_threshold();
385 }
386 }
388 EpStatus EnergyEndpointer::Status(int64_t* status_time) const {
389 *status_time = history_->EndTime();
390 return status_;
391 }
393 } // namespace mozilla