1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/content/media/webspeech/recognition/energy_endpointer.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,180 @@ 1.4 +// Copyright (c) 2013 The Chromium Authors. All rights reserved. 1.5 +// 1.6 +// Redistribution and use in source and binary forms, with or without 1.7 +// modification, are permitted provided that the following conditions are 1.8 +// met: 1.9 +// 1.10 +// * Redistributions of source code must retain the above copyright 1.11 +// notice, this list of conditions and the following disclaimer. 1.12 +// * Redistributions in binary form must reproduce the above 1.13 +// copyright notice, this list of conditions and the following disclaimer 1.14 +// in the documentation and/or other materials provided with the 1.15 +// distribution. 1.16 +// * Neither the name of Google Inc. nor the names of its 1.17 +// contributors may be used to endorse or promote products derived from 1.18 +// this software without specific prior written permission. 1.19 +// 1.20 +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1.21 +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 1.22 +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 1.23 +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 1.24 +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 1.25 +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 1.26 +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 1.27 +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 1.28 +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 1.29 +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 1.30 +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.31 + 1.32 +// The EnergyEndpointer class finds likely speech onset and offset points. 
1.33 +// 1.34 +// The implementation described here is about the simplest possible. 1.35 +// It is based on timings of threshold crossings for overall signal 1.36 +// RMS. It is suitable for lightweight applications. 1.37 +// 1.38 +// As written, the basic idea is that one specifies intervals that 1.39 +// must be occupied by super- and sub-threshold energy levels, and 1.40 +// defers decisions re onset and offset times until these 1.41 +// specifications have been met. Three basic intervals are tested: an 1.42 +// onset window, a speech-on window, and an offset window. We require 1.43 +// super-threshold to exceed some minimum total durations in the onset 1.44 +// and speech-on windows before declaring the speech onset time, and 1.45 +// we specify a required sub-threshold residency in the offset window 1.46 +// before declaring speech offset. As the various residency requirements are 1.47 +// met, the EnergyEndpointer instance assumes various states, and can return the 1.48 +// ID of these states to the client (see EpStatus below). 1.49 +// 1.50 +// The levels of the speech and background noise are continuously updated. It is 1.51 +// important that the background noise level be estimated initially for 1.52 +// robustness in noisy conditions. The first frames are assumed to be background 1.53 +// noise and a fast update rate is used for the noise level. The duration for 1.54 +// fast update is controlled by the fast_update_dur_ parameter. 1.55 +// 1.56 +// If used in noisy conditions, the endpointer should be started and run in the 1.57 +// EnvironmentEstimation mode, for at least 200ms, before switching to 1.58 +// UserInputMode. 1.59 +// Audio feedback contamination can appear in the input audio, if not cut 1.60 +// out or handled by echo cancellation. Audio feedback can trigger a false 1.61 +// accept. The false accepts can be ignored by setting 1.62 +// ep_contamination_rejection_period. 
1.63 + 1.64 +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ 1.65 +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ 1.66 + 1.67 +#include <vector> 1.68 + 1.69 +#include "nsAutoPtr.h" 1.70 + 1.71 +#include "energy_endpointer_params.h" 1.72 + 1.73 +namespace mozilla { 1.74 + 1.75 +// Endpointer status codes 1.76 +enum EpStatus { 1.77 + EP_PRE_SPEECH = 10, 1.78 + EP_POSSIBLE_ONSET, 1.79 + EP_SPEECH_PRESENT, 1.80 + EP_POSSIBLE_OFFSET, 1.81 + EP_POST_SPEECH, 1.82 +}; 1.83 + 1.84 +class EnergyEndpointer { 1.85 + public: 1.86 + // The default construction MUST be followed by Init(), before any 1.87 + // other use can be made of the instance. 1.88 + EnergyEndpointer(); 1.89 + virtual ~EnergyEndpointer(); 1.90 + 1.91 + void Init(const EnergyEndpointerParams& params); 1.92 + 1.93 + // Start the endpointer. This should be called at the beginning of a session. 1.94 + void StartSession(); 1.95 + 1.96 + // Stop the endpointer. 1.97 + void EndSession(); 1.98 + 1.99 + // Start environment estimation. Audio will be used for environment estimation 1.100 + // i.e. noise level estimation. 1.101 + void SetEnvironmentEstimationMode(); 1.102 + 1.103 + // Start user input. This should be called when the user indicates start of 1.104 + // input, e.g. by pressing a button. 1.105 + void SetUserInputMode(); 1.106 + 1.107 + // Computes the next input frame and modifies EnergyEndpointer status as 1.108 + // appropriate based on the computation. 1.109 + void ProcessAudioFrame(int64_t time_us, 1.110 + const int16_t* samples, int num_samples, 1.111 + float* rms_out); 1.112 + 1.113 + // Returns the current state of the EnergyEndpointer and the time 1.114 + // corresponding to the most recently computed frame. 1.115 + EpStatus Status(int64_t* status_time_us) const; 1.116 + 1.117 + bool estimating_environment() const { 1.118 + return estimating_environment_; 1.119 + } 1.120 + 1.121 + // Returns estimated noise level in dB. 
1.122 + float GetNoiseLevelDb() const; 1.123 + 1.124 + private: 1.125 + class HistoryRing; 1.126 + 1.127 + // Resets the endpointer internal state. If reset_threshold is true, the 1.128 + // state will be reset completely, including adaptive thresholds and the 1.129 + // removal of all history information. 1.130 + void Restart(bool reset_threshold); 1.131 + 1.132 + // Update internal speech and noise levels. 1.133 + void UpdateLevels(float rms); 1.134 + 1.135 + // Returns the number of frames (or frame number) corresponding to 1.136 + // the 'time' (in seconds). 1.137 + int TimeToFrame(float time) const; 1.138 + 1.139 + EpStatus status_; // The current state of this instance. 1.140 + float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH 1.141 + int64_t endpointer_time_us_; // Time of the most recently received audio frame. 1.142 + int64_t fast_update_frames_; // Number of frames for initial level adaptation. 1.143 + int64_t frame_counter_; // Number of frames seen. Used for initial adaptation. 1.144 + float max_window_dur_; // Largest search window size (seconds) 1.145 + float sample_rate_; // Sampling rate. 1.146 + 1.147 + // Ring buffers to hold the speech activity history. 1.148 + nsAutoPtr<HistoryRing> history_; 1.149 + 1.150 + // Configuration parameters. 1.151 + EnergyEndpointerParams params_; 1.152 + 1.153 + // RMS which must be exceeded to conclude frame is speech. 1.154 + float decision_threshold_; 1.155 + 1.156 + // Flag to indicate that audio should be used to estimate environment, prior 1.157 + // to receiving user input. 1.158 + bool estimating_environment_; 1.159 + 1.160 + // Estimate of the background noise level. Used externally for UI feedback. 1.161 + float noise_level_; 1.162 + 1.163 + // An adaptive threshold used to update decision_threshold_ when appropriate. 1.164 + float rms_adapt_; 1.165 + 1.166 + // Start lag corresponds to the highest fundamental frequency. 
1.167 + int start_lag_; 1.168 + 1.169 + // End lag corresponds to the lowest fundamental frequency. 1.170 + int end_lag_; 1.171 + 1.172 + // Time when mode switched from environment estimation to user input. This 1.173 + // is used to time forced rejection of audio feedback contamination. 1.174 + int64_t user_input_start_time_us_; 1.175 + 1.176 + // prevent copy constructor and assignment 1.177 + EnergyEndpointer(const EnergyEndpointer&); 1.178 + void operator=(const EnergyEndpointer&); 1.179 +}; 1.180 + 1.181 +} // namespace mozilla 1.182 + 1.183 +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_