/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Services.h"

#include "AudioSegment.h"
#include "endpointer.h"

#include "GeneratedEvents.h"
#include "nsIDOMSpeechRecognitionEvent.h"
#include "nsIObserverService.h"
#include "nsServiceManagerUtils.h"

#include <algorithm>

namespace mozilla {
namespace dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "google"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

static const uint32_t kSAMPLE_RATE = 16000;
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// Number of frames corresponding to 300ms of audio to send to the endpointer
// while it's in environment estimation mode.
// kSAMPLE_RATE frames = 1s, so kESTIMATION_FRAMES frames = 300ms.
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;
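// Worked example of the constant above: at kSAMPLE_RATE = 16000 samples/s,
// 300ms of audio is 300 * 16000 / 1000 = 4800 samples (9600 bytes of
// signed 16-bit mono PCM).
static_assert(kESTIMATION_SAMPLES == 4800,
              "300ms at 16kHz should be 4800 samples");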
#ifdef PR_LOGGING
PRLogModuleInfo*
GetSpeechRecognitionLog()
{
  static PRLogModuleInfo* sLog;
  if (!sLog) {
    sLog = PR_NewLogModule("SpeechRecognition");
  }

  return sLog;
}
#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
#else
#define SR_LOG(...)
#endif

NS_INTERFACE_MAP_BEGIN(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

struct SpeechRecognition::TestConfig SpeechRecognition::mTestConfig;

SpeechRecognition::SpeechRecognition(nsPIDOMWindow* aOwnerWindow)
  : DOMEventTargetHelper(aOwnerWindow)
  , mEndpointer(kSAMPLE_RATE)
  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
  , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
{
  SR_LOG("created SpeechRecognition");

  mTestConfig.Init();
  if (mTestConfig.mEnableTests) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

  mEndpointer.set_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 500000));
  mEndpointer.set_long_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 1000000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
  Reset();
}

bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
  return mCurrentState >= begin && mCurrentState <= end;
}

void
SpeechRecognition::SetState(FSMState state)
{
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
}

JSObject*
SpeechRecognition::WrapObject(JSContext* aCx)
{
  return SpeechRecognitionBinding::Wrap(aCx, this);
}

already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
                               ErrorResult& aRv)
{
  nsCOMPtr<nsPIDOMWindow> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  MOZ_ASSERT(win->IsInnerWindow());
  nsRefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

nsISupports*
SpeechRecognition::GetParentObject() const
{
  return GetOwner();
}

void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
  SR_LOG("Processing %s, current state is %s",
         GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // ignore all events while aborting
    return;
  }

  Transition(aEvent);
}
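// Overview of the state machine implemented by Transition() below, derived
// from its per-state switches (the entries name the action taken for the
// event that advances the machine):
//
//   STATE_IDLE               EVENT_START      -> WaitForAudioData
//   STATE_STARTING           EVENT_AUDIO_DATA -> StartedAudioCapture
//   STATE_ESTIMATING         EVENT_AUDIO_DATA -> WaitForEstimation
//   STATE_WAITING_FOR_SPEECH EVENT_AUDIO_DATA -> DetectSpeech
//   STATE_RECOGNIZING        EVENT_AUDIO_DATA -> WaitForSpeechEnd
//   STATE_WAITING_FOR_RESULT EVENT_RECOGNITIONSERVICE_FINAL_RESULT
//                                             -> NotifyFinalResult
//
// In every active state, EVENT_ABORT is handled via AbortSilently and
// EVENT_AUDIO_ERROR via AbortError; the remaining events either do
// nothing or are treated as impossible (MOZ_CRASH).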
void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          Reset();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType);
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_COUNT:
      MOZ_CRASH("Invalid state STATE_COUNT");
  }
}
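// Typical happy-path sequence through the machine above, assuming the user
// grants microphone access and speaks once (a sketch, not a test):
//
//   EVENT_START        STATE_IDLE               -> STATE_STARTING
//   EVENT_AUDIO_DATA   STATE_STARTING           -> STATE_ESTIMATING
//   EVENT_AUDIO_DATA*  STATE_ESTIMATING         -> STATE_WAITING_FOR_SPEECH
//   EVENT_AUDIO_DATA*  STATE_WAITING_FOR_SPEECH -> STATE_RECOGNIZING
//   EVENT_AUDIO_DATA*  STATE_RECOGNIZING        -> STATE_WAITING_FOR_RESULT
//   EVENT_RECOGNITIONSERVICE_FINAL_RESULT       -> "result" event, STATE_IDLE
//
// (* repeated until the endpointer reaches the corresponding milestone)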
/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
{
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  mRecognitionService->ProcessAudioSegment(aSegment);
  return samples;
}

void
SpeechRecognition::GetRecognitionServiceCID(nsACString& aResultCID)
{
  if (mTestConfig.mFakeRecognitionService) {
    aResultCID =
      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";

    return;
  }

  nsAdoptingCString prefValue =
    Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);

  nsAutoCString speechRecognitionService;
  if (!prefValue.get() || prefValue.IsEmpty()) {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  } else {
    speechRecognitionService = prefValue;
  }

  aResultCID =
    NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
    speechRecognitionService;
}

/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 *    DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 *    the state is still what the method expected it to be.
 ****************************************************************************/
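// Illustration of the three rules above, following the pattern that
// StartedAudioCapture uses below: set the state first, dispatch events
// last, and re-check mCurrentState after any dispatch, because the event
// handler may have re-entered the FSM and moved it elsewhere:
//
//   SetState(STATE_ESTIMATING);                             // rule 1
//   DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));  // rule 2
//   if (mCurrentState == STATE_ESTIMATING) {                // rule 3
//     DispatchTrustedEvent(NS_LITERAL_STRING("start"));
//   }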
void
SpeechRecognition::Reset()
{
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}

void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}

void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);

  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}

void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  mRecognitionService->SoundEnd();

  StopRecording();
}

void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}

void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for
      // single-shot services; for continuous ones we should just
      // inform the service
      StopRecordingAndRecognize(aEvent);
    }
  }
}
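// How the endpointer drives the DOM events fired above: the first
// kESTIMATION_SAMPLES samples are spent estimating the noise environment
// ("audiostart" and "start" fire at the beginning of that phase), then the
// endpointer switches to user input mode and we wait for
// DidStartReceivingSpeech() ("speechstart"), and finally for
// speech_input_complete() ("speechend"), which stops recording and signals
// SoundEnd() to the recognition service.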
void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
  ResetAndEnd();

  nsCOMPtr<nsIDOMEvent> domEvent;
  NS_NewDOMSpeechRecognitionEvent(getter_AddRefs(domEvent), nullptr, nullptr, nullptr);

  nsCOMPtr<nsIDOMSpeechRecognitionEvent> srEvent = do_QueryInterface(domEvent);
  nsRefPtr<SpeechRecognitionResultList> rlist = aEvent->mRecognitionResultList;
  nsCOMPtr<nsISupports> ilist = do_QueryInterface(rlist);
  srEvent->InitSpeechRecognitionEvent(NS_LITERAL_STRING("result"),
                                      true, false, 0, ilist,
                                      NS_LITERAL_STRING("NOT_IMPLEMENTED"),
                                      nullptr);
  domEvent->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(domEvent, &defaultActionEnabled);
}

void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}

void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  bool stopRecording = StateBetween(STATE_ESTIMATING, STATE_RECOGNIZING);

  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (stopRecording) {
    StopRecording();
  }

  ResetAndEnd();
}

void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
  aEvent->mError->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
}
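// Note on ordering in the error path: AbortError first runs AbortSilently,
// which ends the session and fires "end" via ResetAndEnd, and only then
// fires the "error" event through NotifyError, so listeners observe "end"
// before "error".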
/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  NS_ENSURE_STATE(mDOMStream->GetStream());
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}

NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // we only really need to remove the listener explicitly when testing,
  // as our JS code still holds a reference to mDOMStream and only assigning
  // it to nullptr isn't guaranteed to free the stream and the listener.
  mDOMStream->GetStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}

NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (mTestConfig.mFakeFSMEvents &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}
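// Maps observer-notification payloads (the event-name strings sent by the
// test harness) onto the corresponding public API calls or fake FSM events;
// anything unrecognized is assumed to be addressed to the fake recognition
// service.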
void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_START")) {
    ErrorResult err;
    Start(err);
  } else if (aEventName.EqualsLiteral("EVENT_STOP")) {
    Stop();
  } else if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
                  NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_DATA")) {
    StartRecording(static_cast<DOMMediaStream*>(aSubject));
  } else {
    NS_ASSERTION(mTestConfig.mFakeRecognitionService,
                 "Got request for fake recognition service event, but "
                 TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");

    // let the fake recognition service handle the request
  }
}

already_AddRefed<SpeechGrammarList>
SpeechRecognition::GetGrammars(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return nullptr;
}

void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetLang(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetLang(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetInterimResults(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetInterimResults(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

uint32_t
SpeechRecognition::GetMaxAlternatives(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return 0;
}

void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}
void
SpeechRecognition::Start(ErrorResult& aRv)
{
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  nsAutoCString speechRecognitionServiceCID;
  GetRecognitionServiceCID(speechRecognitionServiceCID);

  nsresult rv;
  mRecognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
  NS_ENSURE_SUCCESS_VOID(rv);

  rv = mRecognitionService->Initialize(this->asWeakPtr());
  NS_ENSURE_SUCCESS_VOID(rv);

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;

  if (!mTestConfig.mFakeFSMEvents) {
    MediaManager* manager = MediaManager::Get();
    manager->GetUserMedia(false,
                          GetOwner(),
                          constraints,
                          new GetUserMediaSuccessCallback(this),
                          new GetUserMediaErrorCallback(this));
  }

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Stop()
{
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Abort()
{
  if (mAborted) {
    return;
  }

  mAborted = true;
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::DispatchError(EventType aErrorType,
                                 SpeechRecognitionErrorCode aErrorCode,
                                 const nsAString& aMessage)
{
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");

  nsRefPtr<SpeechRecognitionError> srError =
    new SpeechRecognitionError(nullptr, nullptr, nullptr);

  ErrorResult err;
  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
                                      aErrorCode, aMessage, err);

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples in mAudioSamplesBuffer until a whole chunk of
 * mAudioSamplesPerChunk samples has accumulated.
 * Updates mBufferedSamples and returns the number of samples that were
 * buffered.
 */
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                     uint32_t aSampleCount)
{
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer.get());

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy = std::min(aSampleCount,
                                  mAudioSamplesPerChunk - mBufferedSamples);

  memcpy(samplesBuffer + mBufferedSamples, aSamples,
         samplesToCopy * sizeof(int16_t));

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}
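// Worked example for FillSamplesBuffer, assuming the endpointer frame size
// (and hence mAudioSamplesPerChunk) is 160 samples, i.e. 10ms at 16kHz --
// the actual value comes from mEndpointer.FrameSize() and may differ:
// with mBufferedSamples == 100, a call with aSampleCount == 250 copies
// min(250, 160 - 100) == 60 samples, returns 60, and leaves a full chunk
// of 160 buffered samples for the caller to flush.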
/*
 * Split a buffer of samples into chunks of mAudioSamplesPerChunk samples
 * each. The chunks are appended to the array received as argument.
 * Returns the offset of the end of the last chunk that was created.
 */
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                                      uint32_t aSampleCount,
                                      nsTArray<nsRefPtr<SharedBuffer>>& aResult)
{
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    nsRefPtr<SharedBuffer> chunk =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
           mAudioSamplesPerChunk * sizeof(int16_t));

    aResult.AppendElement(chunk);
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}

AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
{
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    nsRefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    nsAutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk);
  }

  return segment;
}

void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.

  // ensure aSamples is deleted
  nsRefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<const int16_t*>(refSamples->Data());
  nsAutoTArray<nsRefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer);
      mAudioSamplesBuffer = nullptr;
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  NS_DispatchToMainThread(event);
}
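// End-to-end example of the FeedAudioData flow above, again assuming a
// chunk size of 160 samples (the real value is mEndpointer.FrameSize()):
// with 100 samples already buffered, a callback delivering aDuration == 500
// samples proceeds in three phases:
//   1. FillSamplesBuffer copies 60 samples, completing the buffered chunk,
//      which is moved into chunksToSend            (samplesIndex == 60)
//   2. SplitSamplesBuffer cuts samples 60..379 into two whole chunks
//      of 160 samples                              (samplesIndex == 380)
//   3. the remaining 120 samples are stashed in a fresh mAudioSamplesBuffer
//      for the next callback                       (mBufferedSamples == 120)
// The three complete chunks then travel to the main thread as a single
// EVENT_AUDIO_DATA SpeechEvent.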
const char*
SpeechRecognition::GetName(FSMState aId)
{
  static const char* names[] = {
    "STATE_IDLE",
    "STATE_STARTING",
    "STATE_ESTIMATING",
    "STATE_WAITING_FOR_SPEECH",
    "STATE_RECOGNIZING",
    "STATE_WAITING_FOR_RESULT",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
  static const char* names[] = {
    "EVENT_START",
    "EVENT_STOP",
    "EVENT_ABORT",
    "EVENT_AUDIO_DATA",
    "EVENT_AUDIO_ERROR",
    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
    "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
    "EVENT_RECOGNITIONSERVICE_ERROR"
  };

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

SpeechEvent::~SpeechEvent()
{
  delete mAudioSegment;
}

NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
  nsCOMPtr<nsIDOMLocalMediaStream> localStream = do_QueryInterface(aStream);
  mRecognition->StartRecording(static_cast<DOMLocalMediaStream*>(localStream.get()));
  return NS_OK;
}
NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(const nsAString& aError)
{
  SpeechRecognitionErrorCode errorCode;

  if (aError.Equals(NS_LITERAL_STRING("PERMISSION_DENIED"))) {
    errorCode = SpeechRecognitionErrorCode::Not_allowed;
  } else {
    errorCode = SpeechRecognitionErrorCode::Audio_capture;
  }

  mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                              aError);

  return NS_OK;
}

} // namespace dom
} // namespace mozilla