// Copyright (c) 2013 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

29 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ |
|
30 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ |
|
31 |
|
32 #include "energy_endpointer.h" |
|
33 |
|
34 namespace mozilla { |
|
35 |
|
36 struct AudioChunk; |
|
37 |
|
38 // A simple interface to the underlying energy-endpointer implementation, this |
|
39 // class lets callers provide audio as being recorded and let them poll to find |
|
40 // when the user has stopped speaking. |
|
41 // |
|
42 // There are two events that may trigger the end of speech: |
|
43 // |
|
44 // speechInputPossiblyComplete event: |
|
45 // |
|
46 // Signals that silence/noise has been detected for a *short* amount of |
|
47 // time after some speech has been detected. It can be used for low latency |
|
48 // UI feedback. To disable it, set it to a large amount. |
|
49 // |
|
50 // speechInputComplete event: |
|
51 // |
|
52 // This event is intended to signal end of input and to stop recording. |
|
53 // The amount of time to wait after speech is set by |
|
54 // speech_input_complete_silence_length_ and optionally two other |
|
55 // parameters (see below). |
|
56 // This time can be held constant, or can change as more speech is detected. |
|
57 // In the latter case, the time changes after a set amount of time from the |
|
58 // *beginning* of speech. This is motivated by the expectation that there |
|
59 // will be two distinct types of inputs: short search queries and longer |
|
60 // dictation style input. |
|
61 // |
|
62 // Three parameters are used to define the piecewise constant timeout function. |
|
63 // The timeout length is speech_input_complete_silence_length until |
|
64 // long_speech_length, when it changes to |
|
65 // long_speech_input_complete_silence_length. |
|
66 class Endpointer { |
|
67 public: |
|
68 explicit Endpointer(int sample_rate); |
|
69 |
|
70 // Start the endpointer. This should be called at the beginning of a session. |
|
71 void StartSession(); |
|
72 |
|
73 // Stop the endpointer. |
|
74 void EndSession(); |
|
75 |
|
76 // Start environment estimation. Audio will be used for environment estimation |
|
77 // i.e. noise level estimation. |
|
78 void SetEnvironmentEstimationMode(); |
|
79 |
|
80 // Start user input. This should be called when the user indicates start of |
|
81 // input, e.g. by pressing a button. |
|
82 void SetUserInputMode(); |
|
83 |
|
84 // Process a segment of audio, which may be more than one frame. |
|
85 // The status of the last frame will be returned. |
|
86 EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out); |
|
87 |
|
88 // Get the status of the endpointer. |
|
89 EpStatus Status(int64_t *time_us); |
|
90 |
|
91 // Get the expected frame size for audio chunks. Audio chunks are expected |
|
92 // to contain a number of samples that is a multiple of this number, and extra |
|
93 // samples will be dropped. |
|
94 int32_t FrameSize() const { |
|
95 return frame_size_; |
|
96 } |
|
97 |
|
98 // Returns true if the endpointer detected reasonable audio levels above |
|
99 // background noise which could be user speech, false if not. |
|
100 bool DidStartReceivingSpeech() const { |
|
101 return speech_previously_detected_; |
|
102 } |
|
103 |
|
104 bool IsEstimatingEnvironment() const { |
|
105 return energy_endpointer_.estimating_environment(); |
|
106 } |
|
107 |
|
108 void set_speech_input_complete_silence_length(int64_t time_us) { |
|
109 speech_input_complete_silence_length_us_ = time_us; |
|
110 } |
|
111 |
|
112 void set_long_speech_input_complete_silence_length(int64_t time_us) { |
|
113 long_speech_input_complete_silence_length_us_ = time_us; |
|
114 } |
|
115 |
|
116 void set_speech_input_possibly_complete_silence_length(int64_t time_us) { |
|
117 speech_input_possibly_complete_silence_length_us_ = time_us; |
|
118 } |
|
119 |
|
120 void set_long_speech_length(int64_t time_us) { |
|
121 long_speech_length_us_ = time_us; |
|
122 } |
|
123 |
|
124 bool speech_input_complete() const { |
|
125 return speech_input_complete_; |
|
126 } |
|
127 |
|
128 // RMS background noise level in dB. |
|
129 float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); } |
|
130 |
|
131 private: |
|
132 // Reset internal states. Helper method common to initial input utterance |
|
133 // and following input utternaces. |
|
134 void Reset(); |
|
135 |
|
136 // Minimum allowable length of speech input. |
|
137 int64_t speech_input_minimum_length_us_; |
|
138 |
|
139 // The speechInputPossiblyComplete event signals that silence/noise has been |
|
140 // detected for a *short* amount of time after some speech has been detected. |
|
141 // This proporty specifies the time period. |
|
142 int64_t speech_input_possibly_complete_silence_length_us_; |
|
143 |
|
144 // The speechInputComplete event signals that silence/noise has been |
|
145 // detected for a *long* amount of time after some speech has been detected. |
|
146 // This property specifies the time period. |
|
147 int64_t speech_input_complete_silence_length_us_; |
|
148 |
|
149 // Same as above, this specifies the required silence period after speech |
|
150 // detection. This period is used instead of |
|
151 // speech_input_complete_silence_length_ when the utterance is longer than |
|
152 // long_speech_length_. This parameter is optional. |
|
153 int64_t long_speech_input_complete_silence_length_us_; |
|
154 |
|
155 // The period of time after which the endpointer should consider |
|
156 // long_speech_input_complete_silence_length_ as a valid silence period |
|
157 // instead of speech_input_complete_silence_length_. This parameter is |
|
158 // optional. |
|
159 int64_t long_speech_length_us_; |
|
160 |
|
161 // First speech onset time, used in determination of speech complete timeout. |
|
162 int64_t speech_start_time_us_; |
|
163 |
|
164 // Most recent end time, used in determination of speech complete timeout. |
|
165 int64_t speech_end_time_us_; |
|
166 |
|
167 int64_t audio_frame_time_us_; |
|
168 EpStatus old_ep_status_; |
|
169 bool waiting_for_speech_possibly_complete_timeout_; |
|
170 bool waiting_for_speech_complete_timeout_; |
|
171 bool speech_previously_detected_; |
|
172 bool speech_input_complete_; |
|
173 EnergyEndpointer energy_endpointer_; |
|
174 int sample_rate_; |
|
175 int32_t frame_size_; |
|
176 }; |
|
177 |
|
178 } // namespace mozilla |
|
179 |
|
180 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ |