content/media/webspeech/recognition/energy_endpointer.cc

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
michael@0 2 //
michael@0 3 // Redistribution and use in source and binary forms, with or without
michael@0 4 // modification, are permitted provided that the following conditions are
michael@0 5 // met:
michael@0 6 //
michael@0 7 // * Redistributions of source code must retain the above copyright
michael@0 8 // notice, this list of conditions and the following disclaimer.
michael@0 9 // * Redistributions in binary form must reproduce the above
michael@0 10 // copyright notice, this list of conditions and the following disclaimer
michael@0 11 // in the documentation and/or other materials provided with the
michael@0 12 // distribution.
michael@0 13 // * Neither the name of Google Inc. nor the names of its
michael@0 14 // contributors may be used to endorse or promote products derived from
michael@0 15 // this software without specific prior written permission.
michael@0 16 //
michael@0 17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
michael@0 18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
michael@0 19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
michael@0 20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
michael@0 21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
michael@0 22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
michael@0 23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
michael@0 24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
michael@0 25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
michael@0 26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
michael@0 27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
michael@0 28
michael@0 29 #include "energy_endpointer.h"
michael@0 30
michael@0 31 #include <math.h>
michael@0 32
michael@0 33 namespace {
michael@0 34
// Returns the root-mean-square of |samples| measured about their mean, i.e.
// sqrt(E[x^2] - E[x]^2). Any constant (DC) offset in the frame therefore
// does not contribute to the result.
//
// |samples| points at |num_samples| 16-bit PCM values. A null pointer or a
// non-positive count yields 0 instead of the 0/0 = NaN the division would
// otherwise produce; a NaN here would be fed into the caller's adaptive
// threshold arithmetic.
float RMS(const int16_t* samples, int num_samples) {
  if (!samples || num_samples <= 0)
    return 0.0f;

  // Accumulate in 64 bits so that long frames cannot overflow.
  int64_t sum = 0;
  int64_t sum_of_squares = 0;
  for (int i = 0; i < num_samples; ++i) {
    const int64_t sample = samples[i];
    sum += sample;
    sum_of_squares += sample * sample;
  }

  const double mean = static_cast<double>(sum) / num_samples;
  const double mean_square = static_cast<double>(sum_of_squares) / num_samples;

  // Floating-point rounding may leave a tiny negative difference; treat it as
  // zero rather than taking the square root of a negative number.
  const double variance = mean_square - (mean * mean);
  if (variance <= 0.0)
    return 0.0f;
  return static_cast<float>(sqrt(variance));
}
michael@0 49
// Converts a duration in seconds to microseconds. 0.5 is added before the
// truncating cast, which rounds non-negative inputs to the nearest
// microsecond.
int64_t Secs2Usecs(float seconds) {
  const double microseconds = 1.0e6 * seconds;
  return static_cast<int64_t>(microseconds + 0.5);
}
michael@0 53
// Converts a linear amplitude to decibels as 20 * log10(value).
// Inputs that are not greater than 1e-100 (which covers zero, negative
// values and NaN) cannot be converted and are reported as -2000 dB.
float GetDecibel(float value) {
  const bool convertible = value > 1.0e-100;
  if (!convertible)
    return -2000.0;
  return 20 * log10(value);
}
michael@0 59
michael@0 60 } // namespace
michael@0 61
michael@0 62 namespace mozilla {
michael@0 63
michael@0 64 // Stores threshold-crossing histories for making decisions about the speech
michael@0 65 // state.
michael@0 66 class EnergyEndpointer::HistoryRing {
michael@0 67 public:
michael@0 68 HistoryRing() : insertion_index_(0) {}
michael@0 69
michael@0 70 // Resets the ring to |size| elements each with state |initial_state|
michael@0 71 void SetRing(int size, bool initial_state);
michael@0 72
michael@0 73 // Inserts a new entry into the ring and drops the oldest entry.
michael@0 74 void Insert(int64_t time_us, bool decision);
michael@0 75
michael@0 76 // Returns the time in microseconds of the most recently added entry.
michael@0 77 int64_t EndTime() const;
michael@0 78
michael@0 79 // Returns the sum of all intervals during which 'decision' is true within
michael@0 80 // the time in seconds specified by 'duration'. The returned interval is
michael@0 81 // in seconds.
michael@0 82 float RingSum(float duration_sec);
michael@0 83
michael@0 84 private:
michael@0 85 struct DecisionPoint {
michael@0 86 int64_t time_us;
michael@0 87 bool decision;
michael@0 88 };
michael@0 89
michael@0 90 std::vector<DecisionPoint> decision_points_;
michael@0 91 int insertion_index_; // Index at which the next item gets added/inserted.
michael@0 92
michael@0 93 HistoryRing(const HistoryRing&);
michael@0 94 void operator=(const HistoryRing&);
michael@0 95 };
michael@0 96
michael@0 97 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
michael@0 98 insertion_index_ = 0;
michael@0 99 decision_points_.clear();
michael@0 100 DecisionPoint init = { -1, initial_state };
michael@0 101 decision_points_.resize(size, init);
michael@0 102 }
michael@0 103
michael@0 104 void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {
michael@0 105 decision_points_[insertion_index_].time_us = time_us;
michael@0 106 decision_points_[insertion_index_].decision = decision;
michael@0 107 insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
michael@0 108 }
michael@0 109
michael@0 110 int64_t EnergyEndpointer::HistoryRing::EndTime() const {
michael@0 111 int ind = insertion_index_ - 1;
michael@0 112 if (ind < 0)
michael@0 113 ind = decision_points_.size() - 1;
michael@0 114 return decision_points_[ind].time_us;
michael@0 115 }
michael@0 116
michael@0 117 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
michael@0 118 if (!decision_points_.size())
michael@0 119 return 0.0;
michael@0 120
michael@0 121 int64_t sum_us = 0;
michael@0 122 int ind = insertion_index_ - 1;
michael@0 123 if (ind < 0)
michael@0 124 ind = decision_points_.size() - 1;
michael@0 125 int64_t end_us = decision_points_[ind].time_us;
michael@0 126 bool is_on = decision_points_[ind].decision;
michael@0 127 int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));
michael@0 128 if (start_us < 0)
michael@0 129 start_us = 0;
michael@0 130 size_t n_summed = 1; // n points ==> (n-1) intervals
michael@0 131 while ((decision_points_[ind].time_us > start_us) &&
michael@0 132 (n_summed < decision_points_.size())) {
michael@0 133 --ind;
michael@0 134 if (ind < 0)
michael@0 135 ind = decision_points_.size() - 1;
michael@0 136 if (is_on)
michael@0 137 sum_us += end_us - decision_points_[ind].time_us;
michael@0 138 is_on = decision_points_[ind].decision;
michael@0 139 end_us = decision_points_[ind].time_us;
michael@0 140 n_summed++;
michael@0 141 }
michael@0 142
michael@0 143 return 1.0e-6f * sum_us; // Returns total time that was super threshold.
michael@0 144 }
michael@0 145
michael@0 146 EnergyEndpointer::EnergyEndpointer()
michael@0 147 : status_(EP_PRE_SPEECH),
michael@0 148 offset_confirm_dur_sec_(0),
michael@0 149 endpointer_time_us_(0),
michael@0 150 fast_update_frames_(0),
michael@0 151 frame_counter_(0),
michael@0 152 max_window_dur_(4.0),
michael@0 153 sample_rate_(0),
michael@0 154 history_(new HistoryRing()),
michael@0 155 decision_threshold_(0),
michael@0 156 estimating_environment_(false),
michael@0 157 noise_level_(0),
michael@0 158 rms_adapt_(0),
michael@0 159 start_lag_(0),
michael@0 160 end_lag_(0),
michael@0 161 user_input_start_time_us_(0) {
michael@0 162 }
michael@0 163
michael@0 164 EnergyEndpointer::~EnergyEndpointer() {
michael@0 165 }
michael@0 166
michael@0 167 int EnergyEndpointer::TimeToFrame(float time) const {
michael@0 168 return static_cast<int32_t>(0.5 + (time / params_.frame_period()));
michael@0 169 }
michael@0 170
michael@0 171 void EnergyEndpointer::Restart(bool reset_threshold) {
michael@0 172 status_ = EP_PRE_SPEECH;
michael@0 173 user_input_start_time_us_ = 0;
michael@0 174
michael@0 175 if (reset_threshold) {
michael@0 176 decision_threshold_ = params_.decision_threshold();
michael@0 177 rms_adapt_ = decision_threshold_;
michael@0 178 noise_level_ = params_.decision_threshold() / 2.0f;
michael@0 179 frame_counter_ = 0; // Used for rapid initial update of levels.
michael@0 180 }
michael@0 181
michael@0 182 // Set up the memories to hold the history windows.
michael@0 183 history_->SetRing(TimeToFrame(max_window_dur_), false);
michael@0 184
michael@0 185 // Flag that indicates that current input should be used for
michael@0 186 // estimating the environment. The user has not yet started input
michael@0 187 // by e.g. pressed the push-to-talk button. By default, this is
michael@0 188 // false for backward compatibility.
michael@0 189 estimating_environment_ = false;
michael@0 190 }
michael@0 191
michael@0 192 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
michael@0 193 params_ = params;
michael@0 194
michael@0 195 // Find the longest history interval to be used, and make the ring
michael@0 196 // large enough to accommodate that number of frames. NOTE: This
michael@0 197 // depends upon ep_frame_period being set correctly in the factory
michael@0 198 // that did this instantiation.
michael@0 199 max_window_dur_ = params_.onset_window();
michael@0 200 if (params_.speech_on_window() > max_window_dur_)
michael@0 201 max_window_dur_ = params_.speech_on_window();
michael@0 202 if (params_.offset_window() > max_window_dur_)
michael@0 203 max_window_dur_ = params_.offset_window();
michael@0 204 Restart(true);
michael@0 205
michael@0 206 offset_confirm_dur_sec_ = params_.offset_window() -
michael@0 207 params_.offset_confirm_dur();
michael@0 208 if (offset_confirm_dur_sec_ < 0.0)
michael@0 209 offset_confirm_dur_sec_ = 0.0;
michael@0 210
michael@0 211 user_input_start_time_us_ = 0;
michael@0 212
michael@0 213 // Flag that indicates that current input should be used for
michael@0 214 // estimating the environment. The user has not yet started input
michael@0 215 // by e.g. pressed the push-to-talk button. By default, this is
michael@0 216 // false for backward compatibility.
michael@0 217 estimating_environment_ = false;
michael@0 218 // The initial value of the noise and speech levels is inconsequential.
michael@0 219 // The level of the first frame will overwrite these values.
michael@0 220 noise_level_ = params_.decision_threshold() / 2.0f;
michael@0 221 fast_update_frames_ =
michael@0 222 static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());
michael@0 223
michael@0 224 frame_counter_ = 0; // Used for rapid initial update of levels.
michael@0 225
michael@0 226 sample_rate_ = params_.sample_rate();
michael@0 227 start_lag_ = static_cast<int>(sample_rate_ /
michael@0 228 params_.max_fundamental_frequency());
michael@0 229 end_lag_ = static_cast<int>(sample_rate_ /
michael@0 230 params_.min_fundamental_frequency());
michael@0 231 }
michael@0 232
michael@0 233 void EnergyEndpointer::StartSession() {
michael@0 234 Restart(true);
michael@0 235 }
michael@0 236
michael@0 237 void EnergyEndpointer::EndSession() {
michael@0 238 status_ = EP_POST_SPEECH;
michael@0 239 }
michael@0 240
michael@0 241 void EnergyEndpointer::SetEnvironmentEstimationMode() {
michael@0 242 Restart(true);
michael@0 243 estimating_environment_ = true;
michael@0 244 }
michael@0 245
michael@0 246 void EnergyEndpointer::SetUserInputMode() {
michael@0 247 estimating_environment_ = false;
michael@0 248 user_input_start_time_us_ = endpointer_time_us_;
michael@0 249 }
michael@0 250
michael@0 251 void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
michael@0 252 const int16_t* samples,
michael@0 253 int num_samples,
michael@0 254 float* rms_out) {
michael@0 255 endpointer_time_us_ = time_us;
michael@0 256 float rms = RMS(samples, num_samples);
michael@0 257
michael@0 258 // Check that this is user input audio vs. pre-input adaptation audio.
michael@0 259 // Input audio starts when the user indicates start of input, by e.g.
michael@0 260 // pressing push-to-talk. Audio recieved prior to that is used to update
michael@0 261 // noise and speech level estimates.
michael@0 262 if (!estimating_environment_) {
michael@0 263 bool decision = false;
michael@0 264 if ((endpointer_time_us_ - user_input_start_time_us_) <
michael@0 265 Secs2Usecs(params_.contamination_rejection_period())) {
michael@0 266 decision = false;
michael@0 267 //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
michael@0 268 } else {
michael@0 269 decision = (rms > decision_threshold_);
michael@0 270 }
michael@0 271
michael@0 272 history_->Insert(endpointer_time_us_, decision);
michael@0 273
michael@0 274 switch (status_) {
michael@0 275 case EP_PRE_SPEECH:
michael@0 276 if (history_->RingSum(params_.onset_window()) >
michael@0 277 params_.onset_detect_dur()) {
michael@0 278 status_ = EP_POSSIBLE_ONSET;
michael@0 279 }
michael@0 280 break;
michael@0 281
michael@0 282 case EP_POSSIBLE_ONSET: {
michael@0 283 float tsum = history_->RingSum(params_.onset_window());
michael@0 284 if (tsum > params_.onset_confirm_dur()) {
michael@0 285 status_ = EP_SPEECH_PRESENT;
michael@0 286 } else { // If signal is not maintained, drop back to pre-speech.
michael@0 287 if (tsum <= params_.onset_detect_dur())
michael@0 288 status_ = EP_PRE_SPEECH;
michael@0 289 }
michael@0 290 break;
michael@0 291 }
michael@0 292
michael@0 293 case EP_SPEECH_PRESENT: {
michael@0 294 // To induce hysteresis in the state residency, we allow a
michael@0 295 // smaller residency time in the on_ring, than was required to
michael@0 296 // enter the SPEECH_PERSENT state.
michael@0 297 float on_time = history_->RingSum(params_.speech_on_window());
michael@0 298 if (on_time < params_.on_maintain_dur())
michael@0 299 status_ = EP_POSSIBLE_OFFSET;
michael@0 300 break;
michael@0 301 }
michael@0 302
michael@0 303 case EP_POSSIBLE_OFFSET:
michael@0 304 if (history_->RingSum(params_.offset_window()) <=
michael@0 305 offset_confirm_dur_sec_) {
michael@0 306 // Note that this offset time may be beyond the end
michael@0 307 // of the input buffer in a real-time system. It will be up
michael@0 308 // to the RecognizerSession to decide what to do.
michael@0 309 status_ = EP_PRE_SPEECH; // Automatically reset for next utterance.
michael@0 310 } else { // If speech picks up again we allow return to SPEECH_PRESENT.
michael@0 311 if (history_->RingSum(params_.speech_on_window()) >=
michael@0 312 params_.on_maintain_dur())
michael@0 313 status_ = EP_SPEECH_PRESENT;
michael@0 314 }
michael@0 315 break;
michael@0 316
michael@0 317 default:
michael@0 318 break;
michael@0 319 }
michael@0 320
michael@0 321 // If this is a quiet, non-speech region, slowly adapt the detection
michael@0 322 // threshold to be about 6dB above the average RMS.
michael@0 323 if ((!decision) && (status_ == EP_PRE_SPEECH)) {
michael@0 324 decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
michael@0 325 rms_adapt_ = decision_threshold_;
michael@0 326 } else {
michael@0 327 // If this is in a speech region, adapt the decision threshold to
michael@0 328 // be about 10dB below the average RMS. If the noise level is high,
michael@0 329 // the threshold is pushed up.
michael@0 330 // Adaptation up to a higher level is 5 times faster than decay to
michael@0 331 // a lower level.
michael@0 332 if ((status_ == EP_SPEECH_PRESENT) && decision) {
michael@0 333 if (rms_adapt_ > rms) {
michael@0 334 rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
michael@0 335 } else {
michael@0 336 rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
michael@0 337 }
michael@0 338 float target_threshold = 0.3f * rms_adapt_ + noise_level_;
michael@0 339 decision_threshold_ = (.90f * decision_threshold_) +
michael@0 340 (0.10f * target_threshold);
michael@0 341 }
michael@0 342 }
michael@0 343
michael@0 344 // Set a floor
michael@0 345 if (decision_threshold_ < params_.min_decision_threshold())
michael@0 346 decision_threshold_ = params_.min_decision_threshold();
michael@0 347 }
michael@0 348
michael@0 349 // Update speech and noise levels.
michael@0 350 UpdateLevels(rms);
michael@0 351 ++frame_counter_;
michael@0 352
michael@0 353 if (rms_out)
michael@0 354 *rms_out = GetDecibel(rms);
michael@0 355 }
michael@0 356
michael@0 357 float EnergyEndpointer::GetNoiseLevelDb() const {
michael@0 358 return GetDecibel(noise_level_);
michael@0 359 }
michael@0 360
michael@0 361 void EnergyEndpointer::UpdateLevels(float rms) {
michael@0 362 // Update quickly initially. We assume this is noise and that
michael@0 363 // speech is 6dB above the noise.
michael@0 364 if (frame_counter_ < fast_update_frames_) {
michael@0 365 // Alpha increases from 0 to (k-1)/k where k is the number of time
michael@0 366 // steps in the initial adaptation period.
michael@0 367 float alpha = static_cast<float>(frame_counter_) /
michael@0 368 static_cast<float>(fast_update_frames_);
michael@0 369 noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
michael@0 370 //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
michael@0 371 } else {
michael@0 372 // Update Noise level. The noise level adapts quickly downward, but
michael@0 373 // slowly upward. The noise_level_ parameter is not currently used
michael@0 374 // for threshold adaptation. It is used for UI feedback.
michael@0 375 if (noise_level_ < rms)
michael@0 376 noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
michael@0 377 else
michael@0 378 noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
michael@0 379 }
michael@0 380 if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
michael@0 381 decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
michael@0 382 // Set a floor
michael@0 383 if (decision_threshold_ < params_.min_decision_threshold())
michael@0 384 decision_threshold_ = params_.min_decision_threshold();
michael@0 385 }
michael@0 386 }
michael@0 387
michael@0 388 EpStatus EnergyEndpointer::Status(int64_t* status_time) const {
michael@0 389 *status_time = history_->EndTime();
michael@0 390 return status_;
michael@0 391 }
michael@0 392
michael@0 393 } // namespace mozilla

mercurial