content/media/webspeech/recognition/energy_endpointer.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/content/media/webspeech/recognition/energy_endpointer.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,180 @@
     1.4 +// Copyright (c) 2013 The Chromium Authors. All rights reserved.
     1.5 +//
     1.6 +// Redistribution and use in source and binary forms, with or without
     1.7 +// modification, are permitted provided that the following conditions are
     1.8 +// met:
     1.9 +//
    1.10 +//    * Redistributions of source code must retain the above copyright
    1.11 +// notice, this list of conditions and the following disclaimer.
    1.12 +//    * Redistributions in binary form must reproduce the above
    1.13 +// copyright notice, this list of conditions and the following disclaimer
    1.14 +// in the documentation and/or other materials provided with the
    1.15 +// distribution.
    1.16 +//    * Neither the name of Google Inc. nor the names of its
    1.17 +// contributors may be used to endorse or promote products derived from
    1.18 +// this software without specific prior written permission.
    1.19 +//
    1.20 +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    1.21 +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    1.22 +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    1.23 +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    1.24 +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    1.25 +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    1.26 +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    1.27 +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    1.28 +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    1.29 +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    1.30 +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.31 +
    1.32 +// The EnergyEndpointer class finds likely speech onset and offset points.
    1.33 +//
    1.34 +// The implementation described here is about the simplest possible.
    1.35 +// It is based on timings of threshold crossings for overall signal
    1.36 +// RMS. It is suitable for light weight applications.
    1.37 +//
    1.38 +// As written, the basic idea is that one specifies intervals that
    1.39 +// must be occupied by super- and sub-threshold energy levels, and
    1.40 +// defers decisions re onset and offset times until these
    1.41 +// specifications have been met.  Three basic intervals are tested: an
    1.42 +// onset window, a speech-on window, and an offset window.  We require
    1.43 +// super-threshold to exceed some minimum total durations in the onset
    1.44 +// and speech-on windows before declaring the speech onset time, and
    1.45 +// we specify a required sub-threshold residency in the offset window
    1.46 +// before declaring speech offset. As the various residency requirements are
    1.47 +// met, the EnergyEndpointer instance assumes various states, and can return the
    1.48 +// ID of these states to the client (see EpStatus below).
    1.49 +//
    1.50 +// The levels of the speech and background noise are continuously updated. It is
    1.51 +// important that the background noise level be estimated initially for
    1.52 +// robustness in noisy conditions. The first frames are assumed to be background
    1.53 +// noise and a fast update rate is used for the noise level. The duration for
    1.54 +// fast update is controlled by the fast_update_dur_ parameter.
    1.55 +//
    1.56 +// If used in noisy conditions, the endpointer should be started and run in the
    1.57 +// EnvironmentEstimation mode, for at least 200ms, before switching to
    1.58 +// UserInputMode.
    1.59 +// Audio feedback contamination can appear in the input audio, if not cut
    1.60 +// out or handled by echo cancellation. Audio feedback can trigger a false
    1.61 +// accept. The false accepts can be ignored by setting
    1.62 +// ep_contamination_rejection_period.
    1.63 +
    1.64 +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
    1.65 +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
    1.66 +
    1.67 +#include <vector>
    1.68 +
    1.69 +#include "nsAutoPtr.h"
    1.70 +
    1.71 +#include "energy_endpointer_params.h"
    1.72 +
    1.73 +namespace mozilla {
    1.74 +
    1.75 +// Endpointer status codes, as returned by EnergyEndpointer::Status().
    1.76 +enum EpStatus {
    1.77 +  EP_PRE_SPEECH = 10,  // Explicit start value; the enumerators below take 11..14.
    1.78 +  EP_POSSIBLE_ONSET,
    1.79 +  EP_SPEECH_PRESENT,
    1.80 +  EP_POSSIBLE_OFFSET,
    1.81 +  EP_POST_SPEECH,
    1.82 +};
    1.83 +
    1.84 +class EnergyEndpointer {  // Finds likely speech onset/offset points via RMS threshold crossings.
    1.85 + public:
    1.86 +  // The default construction MUST be followed by Init(), before any
    1.87 +  // other use can be made of the instance.
    1.88 +  EnergyEndpointer();
    1.89 +  virtual ~EnergyEndpointer();
    1.90 +
    1.91 +  void Init(const EnergyEndpointerParams& params);
    1.92 +
    1.93 +  // Start the endpointer. This should be called at the beginning of a session.
    1.94 +  void StartSession();
    1.95 +
   1.96 +  // Stop the endpointer.
   1.97 +  void EndSession();
   1.98 +
   1.99 +  // Start environment estimation. Audio will be used for environment estimation
   1.100 +  // i.e. noise level estimation.
   1.101 +  void SetEnvironmentEstimationMode();
   1.102 +
   1.103 +  // Start user input. This should be called when the user indicates start of
   1.104 +  // input, e.g. by pressing a button.
   1.105 +  void SetUserInputMode();
   1.106 +
   1.107 +  // Processes the next input frame and modifies EnergyEndpointer status as
   1.108 +  // appropriate based on the computation.
   1.109 +  void ProcessAudioFrame(int64_t time_us,
   1.110 +                         const int16_t* samples, int num_samples,
   1.111 +                         float* rms_out);  // NOTE(review): rms_out presumably receives the frame RMS - confirm in the implementation.
   1.112 +
   1.113 +  // Returns the current state of the EnergyEndpointer and the time
   1.114 +  // corresponding to the most recently computed frame (via status_time_us).
   1.115 +  EpStatus Status(int64_t* status_time_us) const;
   1.116 +
   1.117 +  bool estimating_environment() const {  // Accessor for estimating_environment_.
   1.118 +    return estimating_environment_;
   1.119 +  }
   1.120 +
   1.121 +  // Returns estimated noise level in dB.
   1.122 +  float GetNoiseLevelDb() const;
   1.123 +
   1.124 + private:
   1.125 +  class HistoryRing;  // Forward declaration only; the definition is not in this header.
   1.126 +
   1.127 +  // Resets the endpointer internal state.  If reset_threshold is true, the
   1.128 +  // state will be reset completely, including adaptive thresholds and the
   1.129 +  // removal of all history information.
   1.130 +  void Restart(bool reset_threshold);
   1.131 +
   1.132 +  // Update internal speech and noise levels.
   1.133 +  void UpdateLevels(float rms);
   1.134 +
   1.135 +  // Returns the number of frames (or frame number) corresponding to
   1.136 +  // the 'time' (in seconds).
   1.137 +  int TimeToFrame(float time) const;
   1.138 +
   1.139 +  EpStatus status_;  // The current state of this instance.
   1.140 +  float offset_confirm_dur_sec_;  // Max on-time allowed to confirm POST_SPEECH (presumably seconds, per the name - confirm).
   1.141 +  int64_t endpointer_time_us_;  // Time of the most recently received audio frame.
   1.142 +  int64_t fast_update_frames_; // Number of frames for initial level adaptation.
   1.143 +  int64_t frame_counter_;  // Number of frames seen. Used for initial adaptation.
   1.144 +  float max_window_dur_;  // Largest search window size (seconds).
   1.145 +  float sample_rate_;  // Sampling rate (presumably in Hz - confirm against Init()).
   1.146 +
   1.147 +  // Ring buffer holding the speech activity history; owned by this instance.
   1.148 +  nsAutoPtr<HistoryRing> history_;
   1.149 +
   1.150 +  // Configuration parameters.
   1.151 +  EnergyEndpointerParams params_;
   1.152 +
   1.153 +  // RMS which must be exceeded to conclude frame is speech.
   1.154 +  float decision_threshold_;
   1.155 +
   1.156 +  // Flag to indicate that audio should be used to estimate environment, prior
   1.157 +  // to receiving user input.
   1.158 +  bool estimating_environment_;
   1.159 +
   1.160 +  // Estimate of the background noise level. Used externally for UI feedback.
   1.161 +  float noise_level_;
   1.162 +
   1.163 +  // An adaptive threshold used to update decision_threshold_ when appropriate.
   1.164 +  float rms_adapt_;
   1.165 +
   1.166 +  // Start lag corresponds to the highest fundamental frequency.
   1.167 +  int start_lag_;
   1.168 +
   1.169 +  // End lag corresponds to the lowest fundamental frequency.
   1.170 +  int end_lag_;
   1.171 +
   1.172 +  // Time when mode switched from environment estimation to user input. This
   1.173 +  // is used to time forced rejection of audio feedback contamination.
   1.174 +  int64_t user_input_start_time_us_;
   1.175 +
   1.176 +  // Copy constructor and assignment are declared private with no body here, to prevent copying.
   1.177 +  EnergyEndpointer(const EnergyEndpointer&);
   1.178 +  void operator=(const EnergyEndpointer&);
   1.179 +};
   1.180 +
   1.181 +}  // namespace mozilla
   1.182 +
   1.183 +#endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

mercurial