// Copyright (c) 2013 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//    * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//    * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//    * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
michael@0: michael@0: #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ michael@0: #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ michael@0: michael@0: #include "energy_endpointer.h" michael@0: michael@0: namespace mozilla { michael@0: michael@0: struct AudioChunk; michael@0: michael@0: // A simple interface to the underlying energy-endpointer implementation, this michael@0: // class lets callers provide audio as being recorded and let them poll to find michael@0: // when the user has stopped speaking. michael@0: // michael@0: // There are two events that may trigger the end of speech: michael@0: // michael@0: // speechInputPossiblyComplete event: michael@0: // michael@0: // Signals that silence/noise has been detected for a *short* amount of michael@0: // time after some speech has been detected. It can be used for low latency michael@0: // UI feedback. To disable it, set it to a large amount. michael@0: // michael@0: // speechInputComplete event: michael@0: // michael@0: // This event is intended to signal end of input and to stop recording. michael@0: // The amount of time to wait after speech is set by michael@0: // speech_input_complete_silence_length_ and optionally two other michael@0: // parameters (see below). michael@0: // This time can be held constant, or can change as more speech is detected. michael@0: // In the latter case, the time changes after a set amount of time from the michael@0: // *beginning* of speech. This is motivated by the expectation that there michael@0: // will be two distinct types of inputs: short search queries and longer michael@0: // dictation style input. michael@0: // michael@0: // Three parameters are used to define the piecewise constant timeout function. michael@0: // The timeout length is speech_input_complete_silence_length until michael@0: // long_speech_length, when it changes to michael@0: // long_speech_input_complete_silence_length. 
michael@0: class Endpointer { michael@0: public: michael@0: explicit Endpointer(int sample_rate); michael@0: michael@0: // Start the endpointer. This should be called at the beginning of a session. michael@0: void StartSession(); michael@0: michael@0: // Stop the endpointer. michael@0: void EndSession(); michael@0: michael@0: // Start environment estimation. Audio will be used for environment estimation michael@0: // i.e. noise level estimation. michael@0: void SetEnvironmentEstimationMode(); michael@0: michael@0: // Start user input. This should be called when the user indicates start of michael@0: // input, e.g. by pressing a button. michael@0: void SetUserInputMode(); michael@0: michael@0: // Process a segment of audio, which may be more than one frame. michael@0: // The status of the last frame will be returned. michael@0: EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out); michael@0: michael@0: // Get the status of the endpointer. michael@0: EpStatus Status(int64_t *time_us); michael@0: michael@0: // Get the expected frame size for audio chunks. Audio chunks are expected michael@0: // to contain a number of samples that is a multiple of this number, and extra michael@0: // samples will be dropped. michael@0: int32_t FrameSize() const { michael@0: return frame_size_; michael@0: } michael@0: michael@0: // Returns true if the endpointer detected reasonable audio levels above michael@0: // background noise which could be user speech, false if not. 
michael@0: bool DidStartReceivingSpeech() const { michael@0: return speech_previously_detected_; michael@0: } michael@0: michael@0: bool IsEstimatingEnvironment() const { michael@0: return energy_endpointer_.estimating_environment(); michael@0: } michael@0: michael@0: void set_speech_input_complete_silence_length(int64_t time_us) { michael@0: speech_input_complete_silence_length_us_ = time_us; michael@0: } michael@0: michael@0: void set_long_speech_input_complete_silence_length(int64_t time_us) { michael@0: long_speech_input_complete_silence_length_us_ = time_us; michael@0: } michael@0: michael@0: void set_speech_input_possibly_complete_silence_length(int64_t time_us) { michael@0: speech_input_possibly_complete_silence_length_us_ = time_us; michael@0: } michael@0: michael@0: void set_long_speech_length(int64_t time_us) { michael@0: long_speech_length_us_ = time_us; michael@0: } michael@0: michael@0: bool speech_input_complete() const { michael@0: return speech_input_complete_; michael@0: } michael@0: michael@0: // RMS background noise level in dB. michael@0: float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); } michael@0: michael@0: private: michael@0: // Reset internal states. Helper method common to initial input utterance michael@0: // and following input utternaces. michael@0: void Reset(); michael@0: michael@0: // Minimum allowable length of speech input. michael@0: int64_t speech_input_minimum_length_us_; michael@0: michael@0: // The speechInputPossiblyComplete event signals that silence/noise has been michael@0: // detected for a *short* amount of time after some speech has been detected. michael@0: // This proporty specifies the time period. michael@0: int64_t speech_input_possibly_complete_silence_length_us_; michael@0: michael@0: // The speechInputComplete event signals that silence/noise has been michael@0: // detected for a *long* amount of time after some speech has been detected. 
michael@0: // This property specifies the time period. michael@0: int64_t speech_input_complete_silence_length_us_; michael@0: michael@0: // Same as above, this specifies the required silence period after speech michael@0: // detection. This period is used instead of michael@0: // speech_input_complete_silence_length_ when the utterance is longer than michael@0: // long_speech_length_. This parameter is optional. michael@0: int64_t long_speech_input_complete_silence_length_us_; michael@0: michael@0: // The period of time after which the endpointer should consider michael@0: // long_speech_input_complete_silence_length_ as a valid silence period michael@0: // instead of speech_input_complete_silence_length_. This parameter is michael@0: // optional. michael@0: int64_t long_speech_length_us_; michael@0: michael@0: // First speech onset time, used in determination of speech complete timeout. michael@0: int64_t speech_start_time_us_; michael@0: michael@0: // Most recent end time, used in determination of speech complete timeout. michael@0: int64_t speech_end_time_us_; michael@0: michael@0: int64_t audio_frame_time_us_; michael@0: EpStatus old_ep_status_; michael@0: bool waiting_for_speech_possibly_complete_timeout_; michael@0: bool waiting_for_speech_complete_timeout_; michael@0: bool speech_previously_detected_; michael@0: bool speech_input_complete_; michael@0: EnergyEndpointer energy_endpointer_; michael@0: int sample_rate_; michael@0: int32_t frame_size_; michael@0: }; michael@0: michael@0: } // namespace mozilla michael@0: michael@0: #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_