--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/content/media/webspeech/recognition/SpeechRecognition.cpp	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,971 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Services.h"

#include "AudioSegment.h"
#include "endpointer.h"

#include "GeneratedEvents.h"
#include "nsIDOMSpeechRecognitionEvent.h"
#include "nsIObserverService.h"
#include "nsServiceManagerUtils.h"

#include <algorithm>

namespace mozilla {
namespace dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "google"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

static const uint32_t kSAMPLE_RATE = 16000;
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// Number of samples corresponding to 300ms of audio to send to the endpointer
// while it's in environment estimation mode.
// kSAMPLE_RATE samples = 1s, kESTIMATION_SAMPLES samples = 300ms.
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;

#ifdef PR_LOGGING
PRLogModuleInfo*
GetSpeechRecognitionLog()
{
  static PRLogModuleInfo* sLog;
  if (!sLog) {
    sLog = PR_NewLogModule("SpeechRecognition");
  }

  return sLog;
}
#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
#else
#define SR_LOG(...)
#endif

NS_INTERFACE_MAP_BEGIN(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

struct SpeechRecognition::TestConfig SpeechRecognition::mTestConfig;

SpeechRecognition::SpeechRecognition(nsPIDOMWindow* aOwnerWindow)
  : DOMEventTargetHelper(aOwnerWindow)
  , mEndpointer(kSAMPLE_RATE)
  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
  , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
{
  SR_LOG("created SpeechRecognition");

  mTestConfig.Init();
  if (mTestConfig.mEnableTests) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

  mEndpointer.set_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 500000));
  mEndpointer.set_long_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 1000000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
  Reset();
}

bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
  return mCurrentState >= begin && mCurrentState <= end;
}

void
SpeechRecognition::SetState(FSMState state)
{
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
  return;
}

JSObject*
SpeechRecognition::WrapObject(JSContext* aCx)
{
  return SpeechRecognitionBinding::Wrap(aCx, this);
}

already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
                               ErrorResult& aRv)
{
  nsCOMPtr<nsPIDOMWindow> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  MOZ_ASSERT(win->IsInnerWindow());
  nsRefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

nsISupports*
SpeechRecognition::GetParentObject() const
{
  return GetOwner();
}

void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
  SR_LOG("Processing %s, current state is %s",
         GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // ignore all events while aborting
    return;
  }

  Transition(aEvent);
}

void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          Reset();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType);
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_COUNT:
      MOZ_CRASH("Invalid state STATE_COUNT");
  }

  return;
}

/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
{
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  mRecognitionService->ProcessAudioSegment(aSegment);
  return samples;
}

void
SpeechRecognition::GetRecognitionServiceCID(nsACString& aResultCID)
{
  if (mTestConfig.mFakeRecognitionService) {
    aResultCID =
      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";

    return;
  }

  nsAdoptingCString prefValue =
    Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);

  nsAutoCString speechRecognitionService;
  if (!prefValue.get() || prefValue.IsEmpty()) {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  } else {
    speechRecognitionService = prefValue;
  }

  aResultCID =
    NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
    speechRecognitionService;

  return;
}

/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 *    DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 *    the state is still what the method expected it to be.
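 *
 * For example, StartedAudioCapture() below follows this pattern: it calls
 * SetState(STATE_ESTIMATING) first, dispatches "audiostart" last, and only
 * dispatches "start" after re-checking that mCurrentState is still
 * STATE_ESTIMATING, since the "audiostart" handler may have stopped or
 * aborted recognition in the meantime.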
 ****************************************************************************/

void
SpeechRecognition::Reset()
{
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}

void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}

void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);

  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}

void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  mRecognitionService->SoundEnd();

  StopRecording();
}

void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}

void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for
      // single-shot services; for continuous recognition we should just
      // inform the service.
      StopRecordingAndRecognize(aEvent);
    }
  }
}

void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
  ResetAndEnd();

  nsCOMPtr<nsIDOMEvent> domEvent;
  NS_NewDOMSpeechRecognitionEvent(getter_AddRefs(domEvent), nullptr, nullptr, nullptr);

  nsCOMPtr<nsIDOMSpeechRecognitionEvent> srEvent = do_QueryInterface(domEvent);
  nsRefPtr<SpeechRecognitionResultList> rlist = aEvent->mRecognitionResultList;
  nsCOMPtr<nsISupports> ilist = do_QueryInterface(rlist);
  srEvent->InitSpeechRecognitionEvent(NS_LITERAL_STRING("result"),
                                      true, false, 0, ilist,
                                      NS_LITERAL_STRING("NOT_IMPLEMENTED"),
                                      nullptr);
  domEvent->SetTrusted(true);

  bool defaultActionEnabled;
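  // Dispatching the "result" event may spin the event loop and re-enter the
  // FSM (see the transition-function rules above), which is why ResetAndEnd()
  // was already called at the top of this method.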
  this->DispatchEvent(domEvent, &defaultActionEnabled);
}

void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}

void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  bool stopRecording = StateBetween(STATE_ESTIMATING, STATE_RECOGNIZING);

  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (stopRecording) {
    StopRecording();
  }

  ResetAndEnd();
}

void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
  aEvent->mError->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(aEvent->mError, &defaultActionEnabled);

  return;
}

/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  NS_ENSURE_STATE(mDOMStream->GetStream());
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}

NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // we only really need to remove the listener explicitly when testing,
  // as our JS code still holds a reference to mDOMStream and only assigning
  // it to nullptr isn't guaranteed to free the stream and the listener.
  mDOMStream->GetStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}

NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (mTestConfig.mFakeFSMEvents &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}

void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_START")) {
    ErrorResult err;
    Start(err);
  } else if (aEventName.EqualsLiteral("EVENT_STOP")) {
    Stop();
  } else if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
                  NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_DATA")) {
    StartRecording(static_cast<DOMMediaStream*>(aSubject));
  } else {
    NS_ASSERTION(mTestConfig.mFakeRecognitionService,
                 "Got request for fake recognition service event, but "
                 TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");

    // let the fake recognition service handle the request
  }

  return;
}

already_AddRefed<SpeechGrammarList>
SpeechRecognition::GetGrammars(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return nullptr;
}

void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

void
SpeechRecognition::GetLang(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

void
SpeechRecognition::SetLang(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

bool
SpeechRecognition::GetInterimResults(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetInterimResults(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

uint32_t
SpeechRecognition::GetMaxAlternatives(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return 0;
}

void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

void
SpeechRecognition::Start(ErrorResult& aRv)
{
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  nsAutoCString speechRecognitionServiceCID;
  GetRecognitionServiceCID(speechRecognitionServiceCID);

  nsresult rv;
  mRecognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
  NS_ENSURE_SUCCESS_VOID(rv);

  rv = mRecognitionService->Initialize(this->asWeakPtr());
  NS_ENSURE_SUCCESS_VOID(rv);

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;

  if (!mTestConfig.mFakeFSMEvents) {
    MediaManager* manager = MediaManager::Get();
    manager->GetUserMedia(false,
                          GetOwner(),
                          constraints,
                          new GetUserMediaSuccessCallback(this),
                          new GetUserMediaErrorCallback(this));
  }

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Stop()
{
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Abort()
{
  if (mAborted) {
    return;
  }

  mAborted = true;
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::DispatchError(EventType aErrorType,
                                 SpeechRecognitionErrorCode aErrorCode,
                                 const nsAString& aMessage)
{
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");

  nsRefPtr<SpeechRecognitionError> srError =
    new SpeechRecognitionError(nullptr, nullptr, nullptr);

  ErrorResult err;
  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
                                      aErrorCode, aMessage, err);

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples into mAudioSamplesBuffer until it holds
 * mAudioSamplesPerChunk samples.
 * Updates mBufferedSamples and returns the number of samples that were buffered.
 */
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                     uint32_t aSampleCount)
{
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer.get());

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy = std::min(aSampleCount,
                                  mAudioSamplesPerChunk - mBufferedSamples);

  memcpy(samplesBuffer + mBufferedSamples, aSamples,
         samplesToCopy * sizeof(int16_t));

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}

/*
 * Split a buffer of samples of a given size into
 * chunks of equal size. The chunks are stored in the array
 * received as argument.
 * Returns the offset of the end of the last chunk that was
 * created.
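 *
 * For example (illustrative numbers only): if mAudioSamplesPerChunk were 160,
 * a 500-sample buffer would produce three 160-sample chunks and return 480;
 * the remaining 20 samples are left for the caller to buffer.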
 */
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                                      uint32_t aSampleCount,
                                      nsTArray<nsRefPtr<SharedBuffer>>& aResult)
{
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    nsRefPtr<SharedBuffer> chunk =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
           mAudioSamplesPerChunk * sizeof(int16_t));

    aResult.AppendElement(chunk);
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}

AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
{
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    nsRefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    nsAutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk);
  }

  return segment;
}

void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.
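  //
  // For example (illustrative numbers only): with mAudioSamplesPerChunk == 160,
  // a delivery of 300 samples while 40 samples are already buffered first
  // completes the buffered 160-sample chunk (consuming 120 samples), then
  // splits one more 160-sample chunk, and buffers the remaining 20 samples.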

  // ensure aSamples is deleted
  nsRefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
  nsAutoTArray<nsRefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer);
      mAudioSamplesBuffer = nullptr;
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  NS_DispatchToMainThread(event);

  return;
}

const char*
SpeechRecognition::GetName(FSMState aId)
{
  static const char* names[] = {
    "STATE_IDLE",
    "STATE_STARTING",
    "STATE_ESTIMATING",
    "STATE_WAITING_FOR_SPEECH",
    "STATE_RECOGNIZING",
    "STATE_WAITING_FOR_RESULT",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
  static const char* names[] = {
    "EVENT_START",
    "EVENT_STOP",
    "EVENT_ABORT",
    "EVENT_AUDIO_DATA",
    "EVENT_AUDIO_ERROR",
    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
    "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
    "EVENT_RECOGNITIONSERVICE_ERROR"
  };

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

SpeechEvent::~SpeechEvent()
{
  delete mAudioSegment;
}

NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
  nsCOMPtr<nsIDOMLocalMediaStream> localStream = do_QueryInterface(aStream);
  mRecognition->StartRecording(static_cast<DOMLocalMediaStream*>(localStream.get()));
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(const nsAString& aError)
{
  SpeechRecognitionErrorCode errorCode;
  if (aError.Equals(NS_LITERAL_STRING("PERMISSION_DENIED"))) {
    errorCode = SpeechRecognitionErrorCode::Not_allowed;
  } else {
    errorCode = SpeechRecognitionErrorCode::Audio_capture;
  }

  mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                              aError);

  return NS_OK;
}

} // namespace dom
} // namespace mozilla