content/media/webspeech/recognition/endpointer.cc

Thu, 15 Jan 2015 15:55:04 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:55:04 +0100
branch
TOR_BUG_9701
changeset 9
a63d609f5ebe
permissions
-rw-r--r--

Back out 97036ab72558 which inappropriately compared turds to third parties.

michael@0 1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
michael@0 2 //
michael@0 3 // Redistribution and use in source and binary forms, with or without
michael@0 4 // modification, are permitted provided that the following conditions are
michael@0 5 // met:
michael@0 6 //
michael@0 7 // * Redistributions of source code must retain the above copyright
michael@0 8 // notice, this list of conditions and the following disclaimer.
michael@0 9 // * Redistributions in binary form must reproduce the above
michael@0 10 // copyright notice, this list of conditions and the following disclaimer
michael@0 11 // in the documentation and/or other materials provided with the
michael@0 12 // distribution.
michael@0 13 // * Neither the name of Google Inc. nor the names of its
michael@0 14 // contributors may be used to endorse or promote products derived from
michael@0 15 // this software without specific prior written permission.
michael@0 16 //
michael@0 17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
michael@0 18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
michael@0 19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
michael@0 20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
michael@0 21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
michael@0 22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
michael@0 23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
michael@0 24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
michael@0 25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
michael@0 26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
michael@0 27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
michael@0 28
michael@0 29 #include "endpointer.h"
michael@0 30
michael@0 31 #include "AudioSegment.h"
michael@0 32
namespace {
// Number of endpointer frames per second of audio. At 200 frames per second
// each frame covers 1/200 s, i.e. 5 ms of audio.
const int kFrameRate = 200;
}  // namespace
michael@0 36
michael@0 37 namespace mozilla {
michael@0 38
michael@0 39 Endpointer::Endpointer(int sample_rate)
michael@0 40 : speech_input_possibly_complete_silence_length_us_(-1),
michael@0 41 speech_input_complete_silence_length_us_(-1),
michael@0 42 audio_frame_time_us_(0),
michael@0 43 sample_rate_(sample_rate),
michael@0 44 frame_size_(0) {
michael@0 45 Reset();
michael@0 46
michael@0 47 frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate));
michael@0 48
michael@0 49 speech_input_minimum_length_us_ =
michael@0 50 static_cast<int64_t>(1.7 * 1000000);
michael@0 51 speech_input_complete_silence_length_us_ =
michael@0 52 static_cast<int64_t>(0.5 * 1000000);
michael@0 53 long_speech_input_complete_silence_length_us_ = -1;
michael@0 54 long_speech_length_us_ = -1;
michael@0 55 speech_input_possibly_complete_silence_length_us_ =
michael@0 56 1 * 1000000;
michael@0 57
michael@0 58 // Set the default configuration for Push To Talk mode.
michael@0 59 EnergyEndpointerParams ep_config;
michael@0 60 ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
michael@0 61 ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
michael@0 62 ep_config.set_endpoint_margin(0.2f);
michael@0 63 ep_config.set_onset_window(0.15f);
michael@0 64 ep_config.set_speech_on_window(0.4f);
michael@0 65 ep_config.set_offset_window(0.15f);
michael@0 66 ep_config.set_onset_detect_dur(0.09f);
michael@0 67 ep_config.set_onset_confirm_dur(0.075f);
michael@0 68 ep_config.set_on_maintain_dur(0.10f);
michael@0 69 ep_config.set_offset_confirm_dur(0.12f);
michael@0 70 ep_config.set_decision_threshold(1000.0f);
michael@0 71 ep_config.set_min_decision_threshold(50.0f);
michael@0 72 ep_config.set_fast_update_dur(0.2f);
michael@0 73 ep_config.set_sample_rate(static_cast<float>(sample_rate));
michael@0 74 ep_config.set_min_fundamental_frequency(57.143f);
michael@0 75 ep_config.set_max_fundamental_frequency(400.0f);
michael@0 76 ep_config.set_contamination_rejection_period(0.25f);
michael@0 77 energy_endpointer_.Init(ep_config);
michael@0 78 }
michael@0 79
michael@0 80 void Endpointer::Reset() {
michael@0 81 old_ep_status_ = EP_PRE_SPEECH;
michael@0 82 waiting_for_speech_possibly_complete_timeout_ = false;
michael@0 83 waiting_for_speech_complete_timeout_ = false;
michael@0 84 speech_previously_detected_ = false;
michael@0 85 speech_input_complete_ = false;
michael@0 86 audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer.
michael@0 87 speech_end_time_us_ = -1;
michael@0 88 speech_start_time_us_ = -1;
michael@0 89 }
michael@0 90
michael@0 91 void Endpointer::StartSession() {
michael@0 92 Reset();
michael@0 93 energy_endpointer_.StartSession();
michael@0 94 }
michael@0 95
michael@0 96 void Endpointer::EndSession() {
michael@0 97 energy_endpointer_.EndSession();
michael@0 98 }
michael@0 99
michael@0 100 void Endpointer::SetEnvironmentEstimationMode() {
michael@0 101 Reset();
michael@0 102 energy_endpointer_.SetEnvironmentEstimationMode();
michael@0 103 }
michael@0 104
michael@0 105 void Endpointer::SetUserInputMode() {
michael@0 106 energy_endpointer_.SetUserInputMode();
michael@0 107 }
michael@0 108
michael@0 109 EpStatus Endpointer::Status(int64_t *time) {
michael@0 110 return energy_endpointer_.Status(time);
michael@0 111 }
michael@0 112
michael@0 113 EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) {
michael@0 114 MOZ_ASSERT(raw_audio.mBufferFormat == AUDIO_FORMAT_S16, "Audio is not in 16 bit format");
michael@0 115 const int16_t* audio_data = static_cast<const int16_t*>(raw_audio.mChannelData[0]);
michael@0 116 const int num_samples = raw_audio.mDuration;
michael@0 117 EpStatus ep_status = EP_PRE_SPEECH;
michael@0 118
michael@0 119 // Process the input data in blocks of frame_size_, dropping any incomplete
michael@0 120 // frames at the end (which is ok since typically the caller will be recording
michael@0 121 // audio in multiples of our frame size).
michael@0 122 int sample_index = 0;
michael@0 123 while (sample_index + frame_size_ <= num_samples) {
michael@0 124 // Have the endpointer process the frame.
michael@0 125 energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_,
michael@0 126 audio_data + sample_index,
michael@0 127 frame_size_,
michael@0 128 rms_out);
michael@0 129 sample_index += frame_size_;
michael@0 130 audio_frame_time_us_ += (frame_size_ * 1000000) /
michael@0 131 sample_rate_;
michael@0 132
michael@0 133 // Get the status of the endpointer.
michael@0 134 int64_t ep_time;
michael@0 135 ep_status = energy_endpointer_.Status(&ep_time);
michael@0 136 if (old_ep_status_ != ep_status)
michael@0 137 fprintf(stderr, "Status changed old= %d, new= %d\n", old_ep_status_, ep_status);
michael@0 138
michael@0 139 // Handle state changes.
michael@0 140 if ((EP_SPEECH_PRESENT == ep_status) &&
michael@0 141 (EP_POSSIBLE_ONSET == old_ep_status_)) {
michael@0 142 speech_end_time_us_ = -1;
michael@0 143 waiting_for_speech_possibly_complete_timeout_ = false;
michael@0 144 waiting_for_speech_complete_timeout_ = false;
michael@0 145 // Trigger SpeechInputDidStart event on first detection.
michael@0 146 if (false == speech_previously_detected_) {
michael@0 147 speech_previously_detected_ = true;
michael@0 148 speech_start_time_us_ = ep_time;
michael@0 149 }
michael@0 150 }
michael@0 151 if ((EP_PRE_SPEECH == ep_status) &&
michael@0 152 (EP_POSSIBLE_OFFSET == old_ep_status_)) {
michael@0 153 speech_end_time_us_ = ep_time;
michael@0 154 waiting_for_speech_possibly_complete_timeout_ = true;
michael@0 155 waiting_for_speech_complete_timeout_ = true;
michael@0 156 }
michael@0 157 if (ep_time > speech_input_minimum_length_us_) {
michael@0 158 // Speech possibly complete timeout.
michael@0 159 if ((waiting_for_speech_possibly_complete_timeout_) &&
michael@0 160 (ep_time - speech_end_time_us_ >
michael@0 161 speech_input_possibly_complete_silence_length_us_)) {
michael@0 162 waiting_for_speech_possibly_complete_timeout_ = false;
michael@0 163 }
michael@0 164 if (waiting_for_speech_complete_timeout_) {
michael@0 165 // The length of the silence timeout period can be held constant, or it
michael@0 166 // can be changed after a fixed amount of time from the beginning of
michael@0 167 // speech.
michael@0 168 bool has_stepped_silence =
michael@0 169 (long_speech_length_us_ > 0) &&
michael@0 170 (long_speech_input_complete_silence_length_us_ > 0);
michael@0 171 int64_t requested_silence_length;
michael@0 172 if (has_stepped_silence &&
michael@0 173 (ep_time - speech_start_time_us_) > long_speech_length_us_) {
michael@0 174 requested_silence_length =
michael@0 175 long_speech_input_complete_silence_length_us_;
michael@0 176 } else {
michael@0 177 requested_silence_length =
michael@0 178 speech_input_complete_silence_length_us_;
michael@0 179 }
michael@0 180
michael@0 181 // Speech complete timeout.
michael@0 182 if ((ep_time - speech_end_time_us_) > requested_silence_length) {
michael@0 183 waiting_for_speech_complete_timeout_ = false;
michael@0 184 speech_input_complete_ = true;
michael@0 185 }
michael@0 186 }
michael@0 187 }
michael@0 188 old_ep_status_ = ep_status;
michael@0 189 }
michael@0 190 return ep_status;
michael@0 191 }
michael@0 192
michael@0 193 } // namespace mozilla

mercurial