content/media/webspeech/recognition/energy_endpointer.h

Thu, 15 Jan 2015 15:55:04 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:55:04 +0100
branch
TOR_BUG_9701
changeset 9
a63d609f5ebe
permissions
-rw-r--r--

Back out 97036ab72558 which inappropriately compared turds to third parties.

michael@0 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
michael@0 2 //
michael@0 3 // Redistribution and use in source and binary forms, with or without
michael@0 4 // modification, are permitted provided that the following conditions are
michael@0 5 // met:
michael@0 6 //
michael@0 7 // * Redistributions of source code must retain the above copyright
michael@0 8 // notice, this list of conditions and the following disclaimer.
michael@0 9 // * Redistributions in binary form must reproduce the above
michael@0 10 // copyright notice, this list of conditions and the following disclaimer
michael@0 11 // in the documentation and/or other materials provided with the
michael@0 12 // distribution.
michael@0 13 // * Neither the name of Google Inc. nor the names of its
michael@0 14 // contributors may be used to endorse or promote products derived from
michael@0 15 // this software without specific prior written permission.
michael@0 16 //
michael@0 17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
michael@0 18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
michael@0 19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
michael@0 20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
michael@0 21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
michael@0 22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
michael@0 23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
michael@0 24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
michael@0 25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
michael@0 26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
michael@0 27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
michael@0 28
michael@0 29 // The EnergyEndpointer class finds likely speech onset and offset points.
michael@0 30 //
michael@0 31 // The implementation described here is about the simplest possible.
michael@0 32 // It is based on timings of threshold crossings for overall signal
michael@0 33 // RMS. It is suitable for light weight applications.
michael@0 34 //
michael@0 35 // As written, the basic idea is that one specifies intervals that
michael@0 36 // must be occupied by super- and sub-threshold energy levels, and
michael@0 37 // defers decisions re onset and offset times until these
michael@0 38 // specifications have been met. Three basic intervals are tested: an
michael@0 39 // onset window, a speech-on window, and an offset window. We require
michael@0 40 // super-threshold to exceed some minimum total durations in the onset
michael@0 41 // and speech-on windows before declaring the speech onset time, and
michael@0 42 // we specify a required sub-threshold residency in the offset window
michael@0 43 // before declaring speech offset. As the various residency requirements are
michael@0 44 // met, the EnergyEndpointer instance assumes various states, and can return the
michael@0 45 // ID of these states to the client (see EpStatus below).
michael@0 46 //
michael@0 47 // The levels of the speech and background noise are continuously updated. It is
michael@0 48 // important that the background noise level be estimated initially for
michael@0 49 // robustness in noisy conditions. The first frames are assumed to be background
michael@0 50 // noise and a fast update rate is used for the noise level. The duration for
michael@0 51 // fast update is controlled by the fast_update_dur_ parameter.
michael@0 52 //
michael@0 53 // If used in noisy conditions, the endpointer should be started and run in the
michael@0 54 // EnvironmentEstimation mode, for at least 200ms, before switching to
michael@0 55 // UserInputMode.
michael@0 56 // Audio feedback contamination can appear in the input audio, if not cut
michael@0 57 // out or handled by echo cancellation. Audio feedback can trigger a false
michael@0 58 // accept. The false accepts can be ignored by setting
michael@0 59 // ep_contamination_rejection_period.
michael@0 60
michael@0 61 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
michael@0 62 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
michael@0 63
michael@0 64 #include <vector>
michael@0 65
michael@0 66 #include "nsAutoPtr.h"
michael@0 67
michael@0 68 #include "energy_endpointer_params.h"
michael@0 69
michael@0 70 namespace mozilla {
michael@0 71
michael@0 72 // Endpointer status codes
michael@0 73 enum EpStatus {  // Unscoped enum: the enumerators are visible directly in namespace mozilla.
michael@0 74 EP_PRE_SPEECH = 10,  // Explicitly 10; the enumerators below take 11..14 implicitly.
michael@0 75 EP_POSSIBLE_ONSET,  // 11. NOTE(review): presumably an onset candidate awaiting confirmation - verify in the .cc.
michael@0 76 EP_SPEECH_PRESENT,  // 12. NOTE(review): presumably speech onset has been declared - verify in the .cc.
michael@0 77 EP_POSSIBLE_OFFSET,  // 13. NOTE(review): presumably an offset candidate awaiting confirmation - verify in the .cc.
michael@0 78 EP_POST_SPEECH,  // 14. NOTE(review): presumably speech offset has been declared - verify in the .cc.
michael@0 79 };
michael@0 80
michael@0 81 class EnergyEndpointer {  // Finds likely speech onset and offset points; only declarations live in this header.
michael@0 82 public:
michael@0 83 // The default construction MUST be followed by Init(), before any
michael@0 84 // other use can be made of the instance.
michael@0 85 EnergyEndpointer();
michael@0 86 virtual ~EnergyEndpointer();  // Virtual, so the class can be destroyed through a base-class pointer.
michael@0 87
michael@0 88 void Init(const EnergyEndpointerParams& params);  // NOTE(review): presumably stores params in params_ - confirm in the .cc.
michael@0 89
michael@0 90 // Start the endpointer. This should be called at the beginning of a session.
michael@0 91 void StartSession();
michael@0 92
michael@0 93 // Stop the endpointer.
michael@0 94 void EndSession();
michael@0 95
michael@0 96 // Start environment estimation. Audio will be used for environment estimation
michael@0 97 // i.e. noise level estimation.
michael@0 98 void SetEnvironmentEstimationMode();
michael@0 99
michael@0 100 // Start user input. This should be called when the user indicates start of
michael@0 101 // input, e.g. by pressing a button.
michael@0 102 void SetUserInputMode();
michael@0 103
michael@0 104 // Computes the next input frame and modifies EnergyEndpointer status as
michael@0 105 // appropriate based on the computation.
michael@0 106 void ProcessAudioFrame(int64_t time_us,  // NOTE(review): presumably the frame timestamp in microseconds (per the _us suffix) - confirm.
michael@0 107 const int16_t* samples, int num_samples,  // Read-only 16-bit samples and their count.
michael@0 108 float* rms_out);  // Out-parameter. NOTE(review): presumably receives the frame RMS; whether null is accepted is not visible here.
michael@0 109
michael@0 110 // Returns the current state of the EnergyEndpointer and the time
michael@0 111 // corresponding to the most recently computed frame.
michael@0 112 EpStatus Status(int64_t* status_time_us) const;  // The time is delivered via status_time_us; null handling is not visible here.
michael@0 113
michael@0 114 bool estimating_environment() const {  // Inline accessor for the estimating_environment_ flag.
michael@0 115 return estimating_environment_;
michael@0 116 }
michael@0 117
michael@0 118 // Returns estimated noise level in dB.
michael@0 119 float GetNoiseLevelDb() const;
michael@0 120
michael@0 121 private:
michael@0 122 class HistoryRing;  // Forward declaration only; the definition is not in this header.
michael@0 123
michael@0 124 // Resets the endpointer internal state. If reset_threshold is true, the
michael@0 125 // state will be reset completely, including adaptive thresholds and the
michael@0 126 // removal of all history information.
michael@0 127 void Restart(bool reset_threshold);
michael@0 128
michael@0 129 // Update internal speech and noise levels.
michael@0 130 void UpdateLevels(float rms);
michael@0 131
michael@0 132 // Returns the number of frames (or frame number) corresponding to
michael@0 133 // the 'time' (in seconds).
michael@0 134 int TimeToFrame(float time) const;
michael@0 135
michael@0 136 EpStatus status_; // The current state of this instance.
michael@0 137 float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH
michael@0 138 int64_t endpointer_time_us_; // Time of the most recently received audio frame.
michael@0 139 int64_t fast_update_frames_; // Number of frames for initial level adaptation.
michael@0 140 int64_t frame_counter_; // Number of frames seen. Used for initial adaptation.
michael@0 141 float max_window_dur_; // Largest search window size (seconds)
michael@0 142 float sample_rate_; // Sampling rate.
michael@0 143
michael@0 144 // Ring buffers to hold the speech activity history.
michael@0 145 nsAutoPtr<HistoryRing> history_;
michael@0 146
michael@0 147 // Configuration parameters.
michael@0 148 EnergyEndpointerParams params_;  // Held by value, not by reference or pointer.
michael@0 149
michael@0 150 // RMS which must be exceeded to conclude frame is speech.
michael@0 151 float decision_threshold_;
michael@0 152
michael@0 153 // Flag to indicate that audio should be used to estimate environment, prior
michael@0 154 // to receiving user input.
michael@0 155 bool estimating_environment_;
michael@0 156
michael@0 157 // Estimate of the background noise level. Used externally for UI feedback.
michael@0 158 float noise_level_;
michael@0 159
michael@0 160 // An adaptive threshold used to update decision_threshold_ when appropriate.
michael@0 161 float rms_adapt_;
michael@0 162
michael@0 163 // Start lag corresponds to the highest fundamental frequency.
michael@0 164 int start_lag_;  // NOTE(review): unit (samples vs. frames) is not stated in this header - confirm.
michael@0 165
michael@0 166 // End lag corresponds to the lowest fundamental frequency.
michael@0 167 int end_lag_;  // NOTE(review): unit (samples vs. frames) is not stated in this header - confirm.
michael@0 168
michael@0 169 // Time when mode switched from environment estimation to user input. This
michael@0 170 // is used to time forced rejection of audio feedback contamination.
michael@0 171 int64_t user_input_start_time_us_;
michael@0 172
michael@0 173 // prevent copy constructor and assignment
michael@0 174 EnergyEndpointer(const EnergyEndpointer&);  // Declared private, so code outside the class cannot copy-construct.
michael@0 175 void operator=(const EnergyEndpointer&);  // Declared private, so code outside the class cannot copy-assign.
michael@0 176 };
michael@0 177
michael@0 178 } // namespace mozilla
michael@0 179
michael@0 180 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

mercurial