content/media/webspeech/recognition/energy_endpointer.h

Thu, 15 Jan 2015 15:55:04 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:55:04 +0100
branch
TOR_BUG_9701
changeset 9
a63d609f5ebe
permissions
-rw-r--r--

Back out 97036ab72558 which inappropriately compared turds to third parties.

     1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
     2 //
     3 // Redistribution and use in source and binary forms, with or without
     4 // modification, are permitted provided that the following conditions are
     5 // met:
     6 //
     7 //    * Redistributions of source code must retain the above copyright
     8 // notice, this list of conditions and the following disclaimer.
     9 //    * Redistributions in binary form must reproduce the above
    10 // copyright notice, this list of conditions and the following disclaimer
    11 // in the documentation and/or other materials provided with the
    12 // distribution.
    13 //    * Neither the name of Google Inc. nor the names of its
    14 // contributors may be used to endorse or promote products derived from
    15 // this software without specific prior written permission.
    16 //
    17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    29 // The EnergyEndpointer class finds likely speech onset and offset points.
    30 //
    31 // The implementation described here is about the simplest possible.
    32 // It is based on timings of threshold crossings for overall signal
    33 // RMS. It is suitable for light weight applications.
    34 //
    35 // As written, the basic idea is that one specifies intervals that
    36 // must be occupied by super- and sub-threshold energy levels, and
    37 // defers decisions re onset and offset times until these
    38 // specifications have been met.  Three basic intervals are tested: an
    39 // onset window, a speech-on window, and an offset window.  We require
    40 // super-threshold to exceed some minimum total durations in the onset
    41 // and speech-on windows before declaring the speech onset time, and
    42 // we specify a required sub-threshold residency in the offset window
    43 // before declaring speech offset. As the various residency requirements are
    44 // met, the EnergyEndpointer instance assumes various states, and can return the
    45 // ID of these states to the client (see EpStatus below).
    46 //
    47 // The levels of the speech and background noise are continuously updated. It is
    48 // important that the background noise level be estimated initially for
    49 // robustness in noisy conditions. The first frames are assumed to be background
    50 // noise and a fast update rate is used for the noise level. The duration for
    51 // fast update is controlled by the fast_update_dur_ parameter.
    52 //
    53 // If used in noisy conditions, the endpointer should be started and run in
    54 // environment estimation mode (SetEnvironmentEstimationMode()) for at least
    55 // 200ms before switching to user input mode (SetUserInputMode()).
    56 // Audio feedback contamination can appear in the input audio, if not cut
    57 // out or handled by echo cancellation. Audio feedback can trigger a false
    58 // accept. The false accepts can be ignored by setting
    59 // ep_contamination_rejection_period.
    61 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
    62 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
    64 #include <vector>
    66 #include "nsAutoPtr.h"
    68 #include "energy_endpointer_params.h"
    70 namespace mozilla {
    72 // Endpointer status codes, as returned by EnergyEndpointer::Status().
       // Only the first enumerator has an explicit value (10); each following
       // enumerator is one greater than its predecessor, so the values run from
       // 10 to 14 in declaration order.
       // NOTE(review): the names suggest the progression pre-speech -> possible
       // onset -> speech present -> possible offset -> post-speech; the actual
       // transitions are implemented outside this header -- confirm there.
    73 enum EpStatus {
    74   EP_PRE_SPEECH = 10,   // 10
    75   EP_POSSIBLE_ONSET,    // 11
    76   EP_SPEECH_PRESENT,    // 12
    77   EP_POSSIBLE_OFFSET,   // 13
    78   EP_POST_SPEECH,       // 14
    79 };
    81 class EnergyEndpointer {
       // Energy-based speech endpointer. Audio is fed in frame by frame through
       // ProcessAudioFrame(), and the resulting EpStatus is read back through
       // Status(). Instances cannot be copied from outside the class: the copy
       // constructor and assignment operator are declared private at the end of
       // the class.
    82  public:
    83   // The default construction MUST be followed by Init(), before any
    84   // other use can be made of the instance.
    85   EnergyEndpointer();
    86   virtual ~EnergyEndpointer();
         // Configures the instance from |params|.
         // NOTE(review): presumably copies |params| into params_ and derives the
         // other members from it; the implementation is not visible in this
         // header -- confirm.
    88   void Init(const EnergyEndpointerParams& params);
    90   // Start the endpointer. This should be called at the beginning of a session.
    91   void StartSession();
    93   // Stop the endpointer.
    94   void EndSession();
    96   // Start environment estimation. Audio will be used for environment estimation
    97   // i.e. noise level estimation.
    98   void SetEnvironmentEstimationMode();
   100   // Start user input. This should be called when the user indicates start of
   101   // input, e.g. by pressing a button.
   102   void SetUserInputMode();
   104   // Computes the next input frame and modifies EnergyEndpointer status as
   105   // appropriate based on the computation.
         //   time_us     - NOTE(review): presumably the frame timestamp in
         //                 microseconds (going by the _us suffix) -- confirm.
         //   samples     - pointer to the frame's 16-bit samples.
         //   num_samples - NOTE(review): presumably the number of entries readable
         //                 through |samples| -- confirm.
         //   rms_out     - output pointer. NOTE(review): presumably receives the
         //                 frame's RMS; whether it may be null is not visible
         //                 here -- confirm.
   106   void ProcessAudioFrame(int64_t time_us,
   107                          const int16_t* samples, int num_samples,
   108                          float* rms_out);
   110   // Returns the current state of the EnergyEndpointer and the time
   111   // corresponding to the most recently computed frame.
         // The state is the return value; the time is passed back through the
         // |status_time_us| output pointer.
   112   EpStatus Status(int64_t* status_time_us) const;
         // Returns the estimating_environment_ flag (see its declaration below).
   114   bool estimating_environment() const {
   115     return estimating_environment_;
   116   }
   118   // Returns estimated noise level in dB.
   119   float GetNoiseLevelDb() const;
   121  private:
         // Forward declaration only; the type is defined elsewhere. It is the
         // element type of history_ below.
   122   class HistoryRing;
   124   // Resets the endpointer internal state.  If reset_threshold is true, the
   125   // state will be reset completely, including adaptive thresholds and the
   126   // removal of all history information.
   127   void Restart(bool reset_threshold);
   129   // Update internal speech and noise levels.
   130   void UpdateLevels(float rms);
   132   // Returns the number of frames (or frame number) corresponding to
   133   // the 'time' (in seconds).
   134   int TimeToFrame(float time) const;
   136   EpStatus status_;  // The current state of this instance.
   137   float offset_confirm_dur_sec_;  // max on time allowed to confirm POST_SPEECH
   138   int64_t endpointer_time_us_;  // Time of the most recently received audio frame.
   139   int64_t fast_update_frames_; // Number of frames for initial level adaptation.
   140   int64_t frame_counter_;  // Number of frames seen. Used for initial adaptation.
   141   float max_window_dur_;  // Largest search window size (seconds)
   142   float sample_rate_;  // Sampling rate.
   144   // Ring buffers to hold the speech activity history.
         // Held through nsAutoPtr because HistoryRing is only forward-declared
         // in this header.
   145   nsAutoPtr<HistoryRing> history_;
   147   // Configuration parameters.
   148   EnergyEndpointerParams params_;
   150   // RMS which must be exceeded to conclude frame is speech.
   151   float decision_threshold_;
   153   // Flag to indicate that audio should be used to estimate environment, prior
   154   // to receiving user input.
         // Exposed read-only through estimating_environment().
   155   bool estimating_environment_;
   157   // Estimate of the background noise level. Used externally for UI feedback.
   158   float noise_level_;
   160   // An adaptive threshold used to update decision_threshold_ when appropriate.
   161   float rms_adapt_;
   163   // Start lag corresponds to the highest fundamental frequency.
   164   int start_lag_;
   166   // End lag corresponds to the lowest fundamental frequency.
   167   int end_lag_;
   169   // Time when mode switched from environment estimation to user input. This
   170   // is used to time forced rejection of audio feedback contamination.
   171   int64_t user_input_start_time_us_;
   173   // prevent copy constructor and assignment
         // Both are declared private so that copying from outside the class does
         // not compile.
         // NOTE(review): no definitions are visible in this header; presumably
         // none exist anywhere, so internal use would fail at link time --
         // confirm.
   174   EnergyEndpointer(const EnergyEndpointer&);
   175   void operator=(const EnergyEndpointer&);
   176 };
   178 }  // namespace mozilla
   180 #endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

mercurial