content/media/webspeech/recognition/endpointer.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/content/media/webspeech/recognition/endpointer.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,180 @@
     1.4 +// Copyright (c) 2013 The Chromium Authors. All rights reserved.
     1.5 +//
     1.6 +// Redistribution and use in source and binary forms, with or without
     1.7 +// modification, are permitted provided that the following conditions are
     1.8 +// met:
     1.9 +//
    1.10 +//    * Redistributions of source code must retain the above copyright
    1.11 +// notice, this list of conditions and the following disclaimer.
    1.12 +//    * Redistributions in binary form must reproduce the above
    1.13 +// copyright notice, this list of conditions and the following disclaimer
    1.14 +// in the documentation and/or other materials provided with the
    1.15 +// distribution.
    1.16 +//    * Neither the name of Google Inc. nor the names of its
    1.17 +// contributors may be used to endorse or promote products derived from
    1.18 +// this software without specific prior written permission.
    1.19 +//
    1.20 +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    1.21 +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    1.22 +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    1.23 +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    1.24 +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    1.25 +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    1.26 +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    1.27 +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    1.28 +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    1.29 +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    1.30 +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.31 +
    1.32 +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
    1.33 +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
    1.34 +
    1.35 +#include "energy_endpointer.h"
    1.36 +
    1.37 +namespace mozilla {
    1.38 +
    1.39 +struct AudioChunk;
    1.40 +
    1.41 +// A simple interface to the underlying energy-endpointer implementation. This
    1.42 +// class lets callers provide audio as it is being recorded and lets them poll
    1.43 +// to find out when the user has stopped speaking.
    1.44 +//
    1.45 +// There are two events that may trigger the end of speech:
    1.46 +//
    1.47 +// speechInputPossiblyComplete event:
    1.48 +//
    1.49 +// Signals that silence/noise has been detected for a *short* amount of
    1.50 +// time after some speech has been detected. It can be used for low latency
    1.51 +// UI feedback. To disable it, set its silence length to a large value.
    1.52 +//
    1.53 +// speechInputComplete event:
    1.54 +//
    1.55 +// This event is intended to signal end of input and to stop recording.
    1.56 +// The amount of time to wait after speech is set by
    1.57 +// speech_input_complete_silence_length_us_ and optionally two other
    1.58 +// parameters (see below).
    1.59 +// This time can be held constant, or can change as more speech is detected.
    1.60 +// In the latter case, the time changes after a set amount of time from the
    1.61 +// *beginning* of speech.  This is motivated by the expectation that there
    1.62 +// will be two distinct types of inputs: short search queries and longer
    1.63 +// dictation style input.
    1.64 +//
    1.65 +// Three parameters are used to define the piecewise constant timeout function.
    1.66 +// The timeout length is speech_input_complete_silence_length until
    1.67 +// long_speech_length, when it changes to
    1.68 +// long_speech_input_complete_silence_length.
    1.69 +class Endpointer {
    1.70 + public:
    1.71 +  explicit Endpointer(int sample_rate);
    1.72 +
    1.73 +  // Start the endpointer. This should be called at the beginning of a session.
    1.74 +  void StartSession();
    1.75 +
    1.76 +  // Stop the endpointer.
    1.77 +  void EndSession();
    1.78 +
    1.79 +  // Start environment estimation. Audio will be used for environment estimation
    1.80 +  // (i.e. noise level estimation).
    1.81 +  void SetEnvironmentEstimationMode();
    1.82 +
    1.83 +  // Start user input. This should be called when the user indicates start of
    1.84 +  // input, e.g. by pressing a button.
    1.85 +  void SetUserInputMode();
    1.86 +
    1.87 +  // Process a segment of audio, which may be more than one frame.
    1.88 +  // The status of the last frame will be returned.
    1.89 +  EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
    1.90 +
    1.91 +  // Get the endpointer status. NOTE(review): time_us looks like an out-param.
    1.92 +  EpStatus Status(int64_t *time_us);
    1.93 +
    1.94 +  // Get the expected frame size for audio chunks. Audio chunks are expected
    1.95 +  // to contain a number of samples that is a multiple of this number, and extra
    1.96 +  // samples will be dropped.
    1.97 +  int32_t FrameSize() const {
    1.98 +    return frame_size_;
    1.99 +  }
   1.100 +
   1.101 +  // Returns true if the endpointer detected reasonable audio levels above
   1.102 +  // background noise which could be user speech, false if not.
   1.103 +  bool DidStartReceivingSpeech() const {
   1.104 +    return speech_previously_detected_;
   1.105 +  }
   1.106 +
   1.107 +  bool IsEstimatingEnvironment() const {
   1.108 +    return energy_endpointer_.estimating_environment();
   1.109 +  }
   1.110 +
   1.111 +  void set_speech_input_complete_silence_length(int64_t time_us) {
   1.112 +    speech_input_complete_silence_length_us_ = time_us;
   1.113 +  }
   1.114 +
   1.115 +  void set_long_speech_input_complete_silence_length(int64_t time_us) {
   1.116 +    long_speech_input_complete_silence_length_us_ = time_us;
   1.117 +  }
   1.118 +
   1.119 +  void set_speech_input_possibly_complete_silence_length(int64_t time_us) {
   1.120 +    speech_input_possibly_complete_silence_length_us_ = time_us;
   1.121 +  }
   1.122 +
   1.123 +  void set_long_speech_length(int64_t time_us) {
   1.124 +    long_speech_length_us_ = time_us;
   1.125 +  }
   1.126 +
   1.127 +  bool speech_input_complete() const {
   1.128 +    return speech_input_complete_;
   1.129 +  }
   1.130 +
   1.131 +  // RMS background noise level in dB.
   1.132 +  float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
   1.133 +
   1.134 + private:
   1.135 +  // Reset internal states. Helper method common to initial input utterance
   1.136 +  // and following input utterances.
   1.137 +  void Reset();
   1.138 +
   1.139 +  // Minimum allowable length of speech input.
   1.140 +  int64_t speech_input_minimum_length_us_;
   1.141 +
   1.142 +  // The speechInputPossiblyComplete event signals that silence/noise has been
   1.143 +  // detected for a *short* amount of time after some speech has been detected.
   1.144 +  // This property specifies the time period.
   1.145 +  int64_t speech_input_possibly_complete_silence_length_us_;
   1.146 +
   1.147 +  // The speechInputComplete event signals that silence/noise has been
   1.148 +  // detected for a *long* amount of time after some speech has been detected.
   1.149 +  // This property specifies the time period.
   1.150 +  int64_t speech_input_complete_silence_length_us_;
   1.151 +
   1.152 +  // Same as above, this specifies the required silence period after speech
   1.153 +  // detection. This period is used instead of
   1.154 +  // speech_input_complete_silence_length_us_ when the utterance is longer than
   1.155 +  // long_speech_length_us_. This parameter is optional.
   1.156 +  int64_t long_speech_input_complete_silence_length_us_;
   1.157 +
   1.158 +  // The period of time after which the endpointer should consider
   1.159 +  // long_speech_input_complete_silence_length_us_ as a valid silence period
   1.160 +  // instead of speech_input_complete_silence_length_us_. This parameter is
   1.161 +  // optional.
   1.162 +  int64_t long_speech_length_us_;
   1.163 +
   1.164 +  // First speech onset time, used in determination of speech complete timeout.
   1.165 +  int64_t speech_start_time_us_;
   1.166 +
   1.167 +  // Most recent end time, used in determination of speech complete timeout.
   1.168 +  int64_t speech_end_time_us_;
   1.169 +
   1.170 +  int64_t audio_frame_time_us_;
   1.171 +  EpStatus old_ep_status_;
   1.172 +  bool waiting_for_speech_possibly_complete_timeout_;
   1.173 +  bool waiting_for_speech_complete_timeout_;
   1.174 +  bool speech_previously_detected_;
   1.175 +  bool speech_input_complete_;
   1.176 +  EnergyEndpointer energy_endpointer_;
   1.177 +  int sample_rate_;
   1.178 +  int32_t frame_size_;
   1.179 +};
   1.180 +
   1.181 +}  // namespace mozilla
   1.182 +
   1.183 +#endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_

mercurial