1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/content/media/webspeech/recognition/endpointer.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,180 @@ 1.4 +// Copyright (c) 2013 The Chromium Authors. All rights reserved. 1.5 +// 1.6 +// Redistribution and use in source and binary forms, with or without 1.7 +// modification, are permitted provided that the following conditions are 1.8 +// met: 1.9 +// 1.10 +// * Redistributions of source code must retain the above copyright 1.11 +// notice, this list of conditions and the following disclaimer. 1.12 +// * Redistributions in binary form must reproduce the above 1.13 +// copyright notice, this list of conditions and the following disclaimer 1.14 +// in the documentation and/or other materials provided with the 1.15 +// distribution. 1.16 +// * Neither the name of Google Inc. nor the names of its 1.17 +// contributors may be used to endorse or promote products derived from 1.18 +// this software without specific prior written permission. 1.19 +// 1.20 +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 1.21 +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 1.22 +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 1.23 +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 1.24 +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 1.25 +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 1.26 +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 1.27 +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 1.28 +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 1.29 +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 1.30 +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.31 + 1.32 +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ 1.33 +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ 1.34 + 1.35 +#include "energy_endpointer.h" 1.36 + 1.37 +namespace mozilla { 1.38 + 1.39 +struct AudioChunk; 1.40 + 1.41 +// A simple interface to the underlying energy-endpointer implementation, this 1.42 +// class lets callers provide audio as being recorded and let them poll to find 1.43 +// when the user has stopped speaking. 1.44 +// 1.45 +// There are two events that may trigger the end of speech: 1.46 +// 1.47 +// speechInputPossiblyComplete event: 1.48 +// 1.49 +// Signals that silence/noise has been detected for a *short* amount of 1.50 +// time after some speech has been detected. It can be used for low latency 1.51 +// UI feedback. To disable it, set it to a large amount. 1.52 +// 1.53 +// speechInputComplete event: 1.54 +// 1.55 +// This event is intended to signal end of input and to stop recording. 1.56 +// The amount of time to wait after speech is set by 1.57 +// speech_input_complete_silence_length_ and optionally two other 1.58 +// parameters (see below). 1.59 +// This time can be held constant, or can change as more speech is detected. 1.60 +// In the latter case, the time changes after a set amount of time from the 1.61 +// *beginning* of speech. This is motivated by the expectation that there 1.62 +// will be two distinct types of inputs: short search queries and longer 1.63 +// dictation style input. 1.64 +// 1.65 +// Three parameters are used to define the piecewise constant timeout function. 1.66 +// The timeout length is speech_input_complete_silence_length until 1.67 +// long_speech_length, when it changes to 1.68 +// long_speech_input_complete_silence_length. 1.69 +class Endpointer { 1.70 + public: 1.71 + explicit Endpointer(int sample_rate); 1.72 + 1.73 + // Start the endpointer. This should be called at the beginning of a session. 1.74 + void StartSession(); 1.75 + 1.76 + // Stop the endpointer. 1.77 + void EndSession(); 1.78 + 1.79 + // Start environment estimation. Audio will be used for environment estimation 1.80 + // i.e. noise level estimation. 1.81 + void SetEnvironmentEstimationMode(); 1.82 + 1.83 + // Start user input. This should be called when the user indicates start of 1.84 + // input, e.g. by pressing a button. 1.85 + void SetUserInputMode(); 1.86 + 1.87 + // Process a segment of audio, which may be more than one frame. 1.88 + // The status of the last frame will be returned. 1.89 + EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out); 1.90 + 1.91 + // Get the status of the endpointer. 1.92 + EpStatus Status(int64_t *time_us); 1.93 + 1.94 + // Get the expected frame size for audio chunks. Audio chunks are expected 1.95 + // to contain a number of samples that is a multiple of this number, and extra 1.96 + // samples will be dropped. 1.97 + int32_t FrameSize() const { 1.98 + return frame_size_; 1.99 + } 1.100 + 1.101 + // Returns true if the endpointer detected reasonable audio levels above 1.102 + // background noise which could be user speech, false if not. 1.103 + bool DidStartReceivingSpeech() const { 1.104 + return speech_previously_detected_; 1.105 + } 1.106 + 1.107 + bool IsEstimatingEnvironment() const { 1.108 + return energy_endpointer_.estimating_environment(); 1.109 + } 1.110 + 1.111 + void set_speech_input_complete_silence_length(int64_t time_us) { 1.112 + speech_input_complete_silence_length_us_ = time_us; 1.113 + } 1.114 + 1.115 + void set_long_speech_input_complete_silence_length(int64_t time_us) { 1.116 + long_speech_input_complete_silence_length_us_ = time_us; 1.117 + } 1.118 + 1.119 + void set_speech_input_possibly_complete_silence_length(int64_t time_us) { 1.120 + speech_input_possibly_complete_silence_length_us_ = time_us; 1.121 + } 1.122 + 1.123 + void set_long_speech_length(int64_t time_us) { 1.124 + long_speech_length_us_ = time_us; 1.125 + } 1.126 + 1.127 + bool speech_input_complete() const { 1.128 + return speech_input_complete_; 1.129 + } 1.130 + 1.131 + // RMS background noise level in dB. 1.132 + float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); } 1.133 + 1.134 + private: 1.135 + // Reset internal states. Helper method common to initial input utterance 1.136 + // and following input utternaces. 1.137 + void Reset(); 1.138 + 1.139 + // Minimum allowable length of speech input. 1.140 + int64_t speech_input_minimum_length_us_; 1.141 + 1.142 + // The speechInputPossiblyComplete event signals that silence/noise has been 1.143 + // detected for a *short* amount of time after some speech has been detected. 1.144 + // This proporty specifies the time period. 1.145 + int64_t speech_input_possibly_complete_silence_length_us_; 1.146 + 1.147 + // The speechInputComplete event signals that silence/noise has been 1.148 + // detected for a *long* amount of time after some speech has been detected. 1.149 + // This property specifies the time period. 1.150 + int64_t speech_input_complete_silence_length_us_; 1.151 + 1.152 + // Same as above, this specifies the required silence period after speech 1.153 + // detection. This period is used instead of 1.154 + // speech_input_complete_silence_length_ when the utterance is longer than 1.155 + // long_speech_length_. This parameter is optional. 1.156 + int64_t long_speech_input_complete_silence_length_us_; 1.157 + 1.158 + // The period of time after which the endpointer should consider 1.159 + // long_speech_input_complete_silence_length_ as a valid silence period 1.160 + // instead of speech_input_complete_silence_length_. This parameter is 1.161 + // optional. 1.162 + int64_t long_speech_length_us_; 1.163 + 1.164 + // First speech onset time, used in determination of speech complete timeout. 1.165 + int64_t speech_start_time_us_; 1.166 + 1.167 + // Most recent end time, used in determination of speech complete timeout. 1.168 + int64_t speech_end_time_us_; 1.169 + 1.170 + int64_t audio_frame_time_us_; 1.171 + EpStatus old_ep_status_; 1.172 + bool waiting_for_speech_possibly_complete_timeout_; 1.173 + bool waiting_for_speech_complete_timeout_; 1.174 + bool speech_previously_detected_; 1.175 + bool speech_input_complete_; 1.176 + EnergyEndpointer energy_endpointer_; 1.177 + int sample_rate_; 1.178 + int32_t frame_size_; 1.179 +}; 1.180 + 1.181 +} // namespace mozilla 1.182 + 1.183 +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_