content/media/webspeech/recognition/endpointer.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/content/media/webspeech/recognition/endpointer.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,193 @@
     1.4 +// Copyright (c) 2013 The Chromium Authors. All rights reserved.
     1.5 +//
     1.6 +// Redistribution and use in source and binary forms, with or without
     1.7 +// modification, are permitted provided that the following conditions are
     1.8 +// met:
     1.9 +//
    1.10 +//    * Redistributions of source code must retain the above copyright
    1.11 +// notice, this list of conditions and the following disclaimer.
    1.12 +//    * Redistributions in binary form must reproduce the above
    1.13 +// copyright notice, this list of conditions and the following disclaimer
    1.14 +// in the documentation and/or other materials provided with the
    1.15 +// distribution.
    1.16 +//    * Neither the name of Google Inc. nor the names of its
    1.17 +// contributors may be used to endorse or promote products derived from
    1.18 +// this software without specific prior written permission.
    1.19 +//
    1.20 +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    1.21 +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    1.22 +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    1.23 +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    1.24 +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    1.25 +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    1.26 +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    1.27 +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    1.28 +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    1.29 +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    1.30 +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.31 +
    1.32 +#include "endpointer.h"
    1.33 +
    1.34 +#include "AudioSegment.h"
    1.35 +
    1.36 +namespace {
    1.37 +const int kFrameRate = 200;  // 1 frame = 5ms of audio.
    1.38 +}
    1.39 +
    1.40 +namespace mozilla {
    1.41 +
    1.42 +Endpointer::Endpointer(int sample_rate)
    1.43 +    : speech_input_possibly_complete_silence_length_us_(-1),
    1.44 +      speech_input_complete_silence_length_us_(-1),
    1.45 +      audio_frame_time_us_(0),
    1.46 +      sample_rate_(sample_rate),
    1.47 +      frame_size_(0) {
    1.48 +  Reset();
    1.49 +
    1.50 +  frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate));
    1.51 +
    1.52 +  speech_input_minimum_length_us_ =
    1.53 +      static_cast<int64_t>(1.7 * 1000000);
    1.54 +  speech_input_complete_silence_length_us_ =
    1.55 +      static_cast<int64_t>(0.5 * 1000000);
    1.56 +  long_speech_input_complete_silence_length_us_ = -1;
    1.57 +  long_speech_length_us_ = -1;
    1.58 +  speech_input_possibly_complete_silence_length_us_ =
    1.59 +      1 * 1000000;
    1.60 +
    1.61 +  // Set the default configuration for Push To Talk mode.
    1.62 +  EnergyEndpointerParams ep_config;
    1.63 +  ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
    1.64 +  ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
    1.65 +  ep_config.set_endpoint_margin(0.2f);
    1.66 +  ep_config.set_onset_window(0.15f);
    1.67 +  ep_config.set_speech_on_window(0.4f);
    1.68 +  ep_config.set_offset_window(0.15f);
    1.69 +  ep_config.set_onset_detect_dur(0.09f);
    1.70 +  ep_config.set_onset_confirm_dur(0.075f);
    1.71 +  ep_config.set_on_maintain_dur(0.10f);
    1.72 +  ep_config.set_offset_confirm_dur(0.12f);
    1.73 +  ep_config.set_decision_threshold(1000.0f);
    1.74 +  ep_config.set_min_decision_threshold(50.0f);
    1.75 +  ep_config.set_fast_update_dur(0.2f);
    1.76 +  ep_config.set_sample_rate(static_cast<float>(sample_rate));
    1.77 +  ep_config.set_min_fundamental_frequency(57.143f);
    1.78 +  ep_config.set_max_fundamental_frequency(400.0f);
    1.79 +  ep_config.set_contamination_rejection_period(0.25f);
    1.80 +  energy_endpointer_.Init(ep_config);
    1.81 +}
    1.82 +
    1.83 +void Endpointer::Reset() {
    1.84 +  old_ep_status_ = EP_PRE_SPEECH;
    1.85 +  waiting_for_speech_possibly_complete_timeout_ = false;
    1.86 +  waiting_for_speech_complete_timeout_ = false;
    1.87 +  speech_previously_detected_ = false;
    1.88 +  speech_input_complete_ = false;
    1.89 +  audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer.
    1.90 +  speech_end_time_us_ = -1;
    1.91 +  speech_start_time_us_ = -1;
    1.92 +}
    1.93 +
    1.94 +void Endpointer::StartSession() {
    1.95 +  Reset();
    1.96 +  energy_endpointer_.StartSession();
    1.97 +}
    1.98 +
    1.99 +void Endpointer::EndSession() {
   1.100 +  energy_endpointer_.EndSession();
   1.101 +}
   1.102 +
   1.103 +void Endpointer::SetEnvironmentEstimationMode() {
   1.104 +  Reset();
   1.105 +  energy_endpointer_.SetEnvironmentEstimationMode();
   1.106 +}
   1.107 +
   1.108 +void Endpointer::SetUserInputMode() {
   1.109 +  energy_endpointer_.SetUserInputMode();
   1.110 +}
   1.111 +
   1.112 +EpStatus Endpointer::Status(int64_t *time) {
   1.113 +  return energy_endpointer_.Status(time);
   1.114 +}
   1.115 +
   1.116 +EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) {
   1.117 +  MOZ_ASSERT(raw_audio.mBufferFormat == AUDIO_FORMAT_S16, "Audio is not in 16 bit format");
   1.118 +  const int16_t* audio_data = static_cast<const int16_t*>(raw_audio.mChannelData[0]);
   1.119 +  const int num_samples = raw_audio.mDuration;
   1.120 +  EpStatus ep_status = EP_PRE_SPEECH;
   1.121 +
   1.122 +  // Process the input data in blocks of frame_size_, dropping any incomplete
   1.123 +  // frames at the end (which is ok since typically the caller will be recording
   1.124 +  // audio in multiples of our frame size).
   1.125 +  int sample_index = 0;
   1.126 +  while (sample_index + frame_size_ <= num_samples) {
   1.127 +    // Have the endpointer process the frame.
   1.128 +    energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_,
   1.129 +                                         audio_data + sample_index,
   1.130 +                                         frame_size_,
   1.131 +                                         rms_out);
   1.132 +    sample_index += frame_size_;
   1.133 +    audio_frame_time_us_ += (frame_size_ * 1000000) /
   1.134 +                         sample_rate_;
   1.135 +
   1.136 +    // Get the status of the endpointer.
   1.137 +    int64_t ep_time;
   1.138 +    ep_status = energy_endpointer_.Status(&ep_time);
   1.139 +    if (old_ep_status_ != ep_status)
   1.140 +        fprintf(stderr, "Status changed old= %d, new= %d\n", old_ep_status_, ep_status);
   1.141 +
   1.142 +    // Handle state changes.
   1.143 +    if ((EP_SPEECH_PRESENT == ep_status) &&
   1.144 +        (EP_POSSIBLE_ONSET == old_ep_status_)) {
   1.145 +      speech_end_time_us_ = -1;
   1.146 +      waiting_for_speech_possibly_complete_timeout_ = false;
   1.147 +      waiting_for_speech_complete_timeout_ = false;
   1.148 +      // Trigger SpeechInputDidStart event on first detection.
   1.149 +      if (false == speech_previously_detected_) {
   1.150 +        speech_previously_detected_ = true;
   1.151 +        speech_start_time_us_ = ep_time;
   1.152 +      }
   1.153 +    }
   1.154 +    if ((EP_PRE_SPEECH == ep_status) &&
   1.155 +        (EP_POSSIBLE_OFFSET == old_ep_status_)) {
   1.156 +      speech_end_time_us_ = ep_time;
   1.157 +      waiting_for_speech_possibly_complete_timeout_ = true;
   1.158 +      waiting_for_speech_complete_timeout_ = true;
   1.159 +    }
   1.160 +    if (ep_time > speech_input_minimum_length_us_) {
   1.161 +      // Speech possibly complete timeout.
   1.162 +      if ((waiting_for_speech_possibly_complete_timeout_) &&
   1.163 +          (ep_time - speech_end_time_us_ >
   1.164 +              speech_input_possibly_complete_silence_length_us_)) {
   1.165 +        waiting_for_speech_possibly_complete_timeout_ = false;
   1.166 +      }
   1.167 +      if (waiting_for_speech_complete_timeout_) {
   1.168 +        // The length of the silence timeout period can be held constant, or it
   1.169 +        // can be changed after a fixed amount of time from the beginning of
   1.170 +        // speech.
   1.171 +        bool has_stepped_silence =
   1.172 +            (long_speech_length_us_ > 0) &&
   1.173 +            (long_speech_input_complete_silence_length_us_ > 0);
   1.174 +        int64_t requested_silence_length;
   1.175 +        if (has_stepped_silence &&
   1.176 +            (ep_time - speech_start_time_us_) > long_speech_length_us_) {
   1.177 +          requested_silence_length =
   1.178 +              long_speech_input_complete_silence_length_us_;
   1.179 +        } else {
   1.180 +          requested_silence_length =
   1.181 +              speech_input_complete_silence_length_us_;
   1.182 +        }
   1.183 +
   1.184 +        // Speech complete timeout.
   1.185 +        if ((ep_time - speech_end_time_us_) > requested_silence_length) {
   1.186 +          waiting_for_speech_complete_timeout_ = false;
   1.187 +          speech_input_complete_ = true;
   1.188 +        }
   1.189 +      }
   1.190 +    }
   1.191 +    old_ep_status_ = ep_status;
   1.192 +  }
   1.193 +  return ep_status;
   1.194 +}
   1.195 +
   1.196 +}  // namespace mozilla

mercurial