|
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
|
2 // |
|
3 // Redistribution and use in source and binary forms, with or without |
|
4 // modification, are permitted provided that the following conditions are |
|
5 // met: |
|
6 // |
|
7 // * Redistributions of source code must retain the above copyright |
|
8 // notice, this list of conditions and the following disclaimer. |
|
9 // * Redistributions in binary form must reproduce the above |
|
10 // copyright notice, this list of conditions and the following disclaimer |
|
11 // in the documentation and/or other materials provided with the |
|
12 // distribution. |
|
13 // * Neither the name of Google Inc. nor the names of its |
|
14 // contributors may be used to endorse or promote products derived from |
|
15 // this software without specific prior written permission. |
|
16 // |
|
17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
|
20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|
21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|
24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|
25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
28 |
|
29 #include "endpointer.h" |
|
30 |
|
31 #include "AudioSegment.h" |
|
32 |
|
namespace {
// Number of analysis frames per second of audio. Each frame covers 5 ms, so
// there are 1000 ms / 5 ms = 200 frames per second.
const int kFrameRate = 1000 / 5;
}  // namespace
|
36 |
|
37 namespace mozilla { |
|
38 |
|
39 Endpointer::Endpointer(int sample_rate) |
|
40 : speech_input_possibly_complete_silence_length_us_(-1), |
|
41 speech_input_complete_silence_length_us_(-1), |
|
42 audio_frame_time_us_(0), |
|
43 sample_rate_(sample_rate), |
|
44 frame_size_(0) { |
|
45 Reset(); |
|
46 |
|
47 frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); |
|
48 |
|
49 speech_input_minimum_length_us_ = |
|
50 static_cast<int64_t>(1.7 * 1000000); |
|
51 speech_input_complete_silence_length_us_ = |
|
52 static_cast<int64_t>(0.5 * 1000000); |
|
53 long_speech_input_complete_silence_length_us_ = -1; |
|
54 long_speech_length_us_ = -1; |
|
55 speech_input_possibly_complete_silence_length_us_ = |
|
56 1 * 1000000; |
|
57 |
|
58 // Set the default configuration for Push To Talk mode. |
|
59 EnergyEndpointerParams ep_config; |
|
60 ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); |
|
61 ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); |
|
62 ep_config.set_endpoint_margin(0.2f); |
|
63 ep_config.set_onset_window(0.15f); |
|
64 ep_config.set_speech_on_window(0.4f); |
|
65 ep_config.set_offset_window(0.15f); |
|
66 ep_config.set_onset_detect_dur(0.09f); |
|
67 ep_config.set_onset_confirm_dur(0.075f); |
|
68 ep_config.set_on_maintain_dur(0.10f); |
|
69 ep_config.set_offset_confirm_dur(0.12f); |
|
70 ep_config.set_decision_threshold(1000.0f); |
|
71 ep_config.set_min_decision_threshold(50.0f); |
|
72 ep_config.set_fast_update_dur(0.2f); |
|
73 ep_config.set_sample_rate(static_cast<float>(sample_rate)); |
|
74 ep_config.set_min_fundamental_frequency(57.143f); |
|
75 ep_config.set_max_fundamental_frequency(400.0f); |
|
76 ep_config.set_contamination_rejection_period(0.25f); |
|
77 energy_endpointer_.Init(ep_config); |
|
78 } |
|
79 |
|
80 void Endpointer::Reset() { |
|
81 old_ep_status_ = EP_PRE_SPEECH; |
|
82 waiting_for_speech_possibly_complete_timeout_ = false; |
|
83 waiting_for_speech_complete_timeout_ = false; |
|
84 speech_previously_detected_ = false; |
|
85 speech_input_complete_ = false; |
|
86 audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer. |
|
87 speech_end_time_us_ = -1; |
|
88 speech_start_time_us_ = -1; |
|
89 } |
|
90 |
|
91 void Endpointer::StartSession() { |
|
92 Reset(); |
|
93 energy_endpointer_.StartSession(); |
|
94 } |
|
95 |
|
96 void Endpointer::EndSession() { |
|
97 energy_endpointer_.EndSession(); |
|
98 } |
|
99 |
|
100 void Endpointer::SetEnvironmentEstimationMode() { |
|
101 Reset(); |
|
102 energy_endpointer_.SetEnvironmentEstimationMode(); |
|
103 } |
|
104 |
|
105 void Endpointer::SetUserInputMode() { |
|
106 energy_endpointer_.SetUserInputMode(); |
|
107 } |
|
108 |
|
109 EpStatus Endpointer::Status(int64_t *time) { |
|
110 return energy_endpointer_.Status(time); |
|
111 } |
|
112 |
|
113 EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) { |
|
114 MOZ_ASSERT(raw_audio.mBufferFormat == AUDIO_FORMAT_S16, "Audio is not in 16 bit format"); |
|
115 const int16_t* audio_data = static_cast<const int16_t*>(raw_audio.mChannelData[0]); |
|
116 const int num_samples = raw_audio.mDuration; |
|
117 EpStatus ep_status = EP_PRE_SPEECH; |
|
118 |
|
119 // Process the input data in blocks of frame_size_, dropping any incomplete |
|
120 // frames at the end (which is ok since typically the caller will be recording |
|
121 // audio in multiples of our frame size). |
|
122 int sample_index = 0; |
|
123 while (sample_index + frame_size_ <= num_samples) { |
|
124 // Have the endpointer process the frame. |
|
125 energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, |
|
126 audio_data + sample_index, |
|
127 frame_size_, |
|
128 rms_out); |
|
129 sample_index += frame_size_; |
|
130 audio_frame_time_us_ += (frame_size_ * 1000000) / |
|
131 sample_rate_; |
|
132 |
|
133 // Get the status of the endpointer. |
|
134 int64_t ep_time; |
|
135 ep_status = energy_endpointer_.Status(&ep_time); |
|
136 if (old_ep_status_ != ep_status) |
|
137 fprintf(stderr, "Status changed old= %d, new= %d\n", old_ep_status_, ep_status); |
|
138 |
|
139 // Handle state changes. |
|
140 if ((EP_SPEECH_PRESENT == ep_status) && |
|
141 (EP_POSSIBLE_ONSET == old_ep_status_)) { |
|
142 speech_end_time_us_ = -1; |
|
143 waiting_for_speech_possibly_complete_timeout_ = false; |
|
144 waiting_for_speech_complete_timeout_ = false; |
|
145 // Trigger SpeechInputDidStart event on first detection. |
|
146 if (false == speech_previously_detected_) { |
|
147 speech_previously_detected_ = true; |
|
148 speech_start_time_us_ = ep_time; |
|
149 } |
|
150 } |
|
151 if ((EP_PRE_SPEECH == ep_status) && |
|
152 (EP_POSSIBLE_OFFSET == old_ep_status_)) { |
|
153 speech_end_time_us_ = ep_time; |
|
154 waiting_for_speech_possibly_complete_timeout_ = true; |
|
155 waiting_for_speech_complete_timeout_ = true; |
|
156 } |
|
157 if (ep_time > speech_input_minimum_length_us_) { |
|
158 // Speech possibly complete timeout. |
|
159 if ((waiting_for_speech_possibly_complete_timeout_) && |
|
160 (ep_time - speech_end_time_us_ > |
|
161 speech_input_possibly_complete_silence_length_us_)) { |
|
162 waiting_for_speech_possibly_complete_timeout_ = false; |
|
163 } |
|
164 if (waiting_for_speech_complete_timeout_) { |
|
165 // The length of the silence timeout period can be held constant, or it |
|
166 // can be changed after a fixed amount of time from the beginning of |
|
167 // speech. |
|
168 bool has_stepped_silence = |
|
169 (long_speech_length_us_ > 0) && |
|
170 (long_speech_input_complete_silence_length_us_ > 0); |
|
171 int64_t requested_silence_length; |
|
172 if (has_stepped_silence && |
|
173 (ep_time - speech_start_time_us_) > long_speech_length_us_) { |
|
174 requested_silence_length = |
|
175 long_speech_input_complete_silence_length_us_; |
|
176 } else { |
|
177 requested_silence_length = |
|
178 speech_input_complete_silence_length_us_; |
|
179 } |
|
180 |
|
181 // Speech complete timeout. |
|
182 if ((ep_time - speech_end_time_us_) > requested_silence_length) { |
|
183 waiting_for_speech_complete_timeout_ = false; |
|
184 speech_input_complete_ = true; |
|
185 } |
|
186 } |
|
187 } |
|
188 old_ep_status_ = ep_status; |
|
189 } |
|
190 return ep_status; |
|
191 } |
|
192 |
|
193 } // namespace mozilla |