diff -r 000000000000 -r 6474c204b198 content/media/webspeech/recognition/SpeechRecognition.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/content/media/webspeech/recognition/SpeechRecognition.cpp	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,971 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Services.h"

#include "AudioSegment.h"
#include "endpointer.h"

#include "GeneratedEvents.h"
#include "nsIDOMSpeechRecognitionEvent.h"
#include "nsIObserverService.h"
#include "nsServiceManagerUtils.h"

#include <algorithm>

namespace mozilla {
namespace dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "google"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

static const uint32_t kSAMPLE_RATE = 16000;
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// Number of samples corresponding to 300ms of audio, sent to the endpointer
// while it's in environment estimation mode.
// kSAMPLE_RATE samples = 1s, so kESTIMATION_SAMPLES samples = 300ms.
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;
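// Sanity check on the arithmetic above: 300 * 16000 / 1000 = 4800, i.e.
// 300ms of 16kHz mono audio is 4800 samples. Pinning this down at compile
// time makes the intent explicit:
static_assert(kESTIMATION_SAMPLES == 4800,
              "300ms of 16kHz audio should be 4800 samples");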
#ifdef PR_LOGGING
PRLogModuleInfo*
GetSpeechRecognitionLog()
{
  static PRLogModuleInfo* sLog;
  if (!sLog) {
    sLog = PR_NewLogModule("SpeechRecognition");
  }

  return sLog;
}
#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
#else
#define SR_LOG(...)
#endif

NS_INTERFACE_MAP_BEGIN(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

struct SpeechRecognition::TestConfig SpeechRecognition::mTestConfig;

SpeechRecognition::SpeechRecognition(nsPIDOMWindow* aOwnerWindow)
  : DOMEventTargetHelper(aOwnerWindow)
  , mEndpointer(kSAMPLE_RATE)
  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
  , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
{
  SR_LOG("created SpeechRecognition");

  mTestConfig.Init();
  if (mTestConfig.mEnableTests) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

  mEndpointer.set_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 500000));
  mEndpointer.set_long_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 1000000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
  Reset();
}

bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
  return mCurrentState >= begin && mCurrentState <= end;
}

void
SpeechRecognition::SetState(FSMState state)
{
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
}

JSObject*
SpeechRecognition::WrapObject(JSContext* aCx)
{
  return SpeechRecognitionBinding::Wrap(aCx, this);
}

already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
                               ErrorResult& aRv)
{
  nsCOMPtr<nsPIDOMWindow> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  MOZ_ASSERT(win->IsInnerWindow());
  nsRefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

nsISupports*
SpeechRecognition::GetParentObject() const
{
  return GetOwner();
}

void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
  SR_LOG("Processing %s, current state is %s",
         GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // ignore all events while aborting
    return;
  }

  Transition(aEvent);
}

void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          Reset();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_COUNT:
      MOZ_CRASH("Invalid state STATE_COUNT");
  }
}
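/*
 * For reference, the happy path through the FSM above is:
 *
 *   STATE_IDLE
 *     --EVENT_START----------------------------> STATE_STARTING
 *     --EVENT_AUDIO_DATA (StartedAudioCapture)--> STATE_ESTIMATING
 *     --EVENT_AUDIO_DATA (WaitForEstimation)----> STATE_WAITING_FOR_SPEECH
 *     --EVENT_AUDIO_DATA (DetectSpeech)---------> STATE_RECOGNIZING
 *     --EVENT_STOP or end of speech-------------> STATE_WAITING_FOR_RESULT
 *     --EVENT_RECOGNITIONSERVICE_FINAL_RESULT---> STATE_IDLE ("end" fired)
 *
 * In every state, EVENT_ABORT tears down silently (AbortSilently) and
 * EVENT_AUDIO_ERROR / EVENT_RECOGNITIONSERVICE_ERROR tear down with an
 * "error" event (AbortError), except where the switch above explicitly
 * ignores them. This is only a summary; the switch remains authoritative.
 */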
/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
{
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  mRecognitionService->ProcessAudioSegment(aSegment);
  return samples;
}

void
SpeechRecognition::GetRecognitionServiceCID(nsACString& aResultCID)
{
  if (mTestConfig.mFakeRecognitionService) {
    aResultCID =
      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";

    return;
  }

  nsAdoptingCString prefValue =
    Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);

  nsAutoCString speechRecognitionService;
  if (!prefValue.get() || prefValue.IsEmpty()) {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  } else {
    speechRecognitionService = prefValue;
  }

  aResultCID =
    NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
    speechRecognitionService;
}
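// The contract ID built here is NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX
// followed by the service name from the pref, e.g. for the default "google"
// service something like "@mozilla.org/webspeech/service;1?name=google"
// (assuming the prefix value defined alongside nsISpeechRecognitionService;
// the exact string comes from that header).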
/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 *    DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 *    the state is still what the method expected it to be.
 ****************************************************************************/
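// Rule 3 in practice, as a minimal sketch (this is the pattern
// StartedAudioCapture uses below): a DOM dispatch may spin the event loop
// and re-enter the FSM, so state-dependent work after a dispatch must
// re-check mCurrentState:
//
//   SetState(STATE_ESTIMATING);                             // rule 1: early
//   DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));  // rule 2: late
//   if (mCurrentState == STATE_ESTIMATING) {                // rule 3: re-check
//     DispatchTrustedEvent(NS_LITERAL_STRING("start"));
//   }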
void
SpeechRecognition::Reset()
{
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}

void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}

void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);

  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}

void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  mRecognitionService->SoundEnd();

  StopRecording();
}

void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}

void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for
      // single-shot services; for continuous ones we should just inform
      // the service
      StopRecordingAndRecognize(aEvent);
    }
  }
}

void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
  ResetAndEnd();

  nsCOMPtr<nsIDOMEvent> domEvent;
  NS_NewDOMSpeechRecognitionEvent(getter_AddRefs(domEvent), nullptr, nullptr, nullptr);

  nsCOMPtr<nsIDOMSpeechRecognitionEvent> srEvent = do_QueryInterface(domEvent);
  nsRefPtr<SpeechRecognitionResultList> rlist = aEvent->mRecognitionResultList;
  nsCOMPtr<nsISupports> ilist = do_QueryInterface(rlist);
  srEvent->InitSpeechRecognitionEvent(NS_LITERAL_STRING("result"),
                                      true, false, 0, ilist,
                                      NS_LITERAL_STRING("NOT_IMPLEMENTED"),
                                      nullptr);
  domEvent->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(domEvent, &defaultActionEnabled);
}

void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}

void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  bool stopRecording = StateBetween(STATE_ESTIMATING, STATE_RECOGNIZING);

  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (stopRecording) {
    StopRecording();
  }

  ResetAndEnd();
}

void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
  aEvent->mError->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
}

/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  NS_ENSURE_STATE(mDOMStream->GetStream());
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}

NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // we only really need to remove the listener explicitly when testing,
  // as our JS code still holds a reference to mDOMStream and merely
  // assigning it to nullptr isn't guaranteed to free the stream and the
  // listener.
  mDOMStream->GetStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}
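// Note: the one-shot timer armed in StartRecording() above fires as an
// NS_TIMER_CALLBACK_TOPIC notification handled in Observe() below; if no
// speech has been detected by then, it becomes an EVENT_AUDIO_ERROR with
// the No_speech error code.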
NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (mTestConfig.mFakeFSMEvents &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}

void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_START")) {
    ErrorResult err;
    Start(err);
  } else if (aEventName.EqualsLiteral("EVENT_STOP")) {
    Stop();
  } else if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
                  NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_DATA")) {
    StartRecording(static_cast<DOMMediaStream*>(aSubject));
  } else {
    NS_ASSERTION(mTestConfig.mFakeRecognitionService,
                 "Got request for fake recognition service event, but "
                 TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");

    // let the fake recognition service handle the request
  }
}
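// Tests drive the FSM through the observer service; the C++ equivalent of
// what a test does is roughly the following sketch (assuming
// mTestConfig.mFakeFSMEvents is set and the topic constants observed above):
//
//   nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
//   obs->NotifyObservers(nullptr, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC,
//                        MOZ_UTF16("EVENT_START"));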
already_AddRefed<SpeechGrammarList>
SpeechRecognition::GetGrammars(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return nullptr;
}

void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetLang(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetLang(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetInterimResults(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetInterimResults(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

uint32_t
SpeechRecognition::GetMaxAlternatives(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return 0;
}

void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::Start(ErrorResult& aRv)
{
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  nsAutoCString speechRecognitionServiceCID;
  GetRecognitionServiceCID(speechRecognitionServiceCID);

  nsresult rv;
  mRecognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
  NS_ENSURE_SUCCESS_VOID(rv);

  rv = mRecognitionService->Initialize(this->asWeakPtr());
  NS_ENSURE_SUCCESS_VOID(rv);

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;

  if (!mTestConfig.mFakeFSMEvents) {
    MediaManager* manager = MediaManager::Get();
    manager->GetUserMedia(false,
                          GetOwner(),
                          constraints,
                          new GetUserMediaSuccessCallback(this),
                          new GetUserMediaErrorCallback(this));
  }

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Stop()
{
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Abort()
{
  if (mAborted) {
    return;
  }

  mAborted = true;
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::DispatchError(EventType aErrorType,
                                 SpeechRecognitionErrorCode aErrorCode,
                                 const nsAString& aMessage)
{
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");

  nsRefPtr<SpeechRecognitionError> srError =
    new SpeechRecognitionError(nullptr, nullptr, nullptr);

  ErrorResult err;
  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
                                      aErrorCode, aMessage, err);

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples into mAudioSamplesBuffer until it holds
 * mAudioSamplesPerChunk samples.
 * Updates mBufferedSamples and returns the number of samples that were
 * buffered.
 */
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                     uint32_t aSampleCount)
{
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer.get());

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy = std::min(aSampleCount,
                                  mAudioSamplesPerChunk - mBufferedSamples);

  memcpy(samplesBuffer + mBufferedSamples, aSamples,
         samplesToCopy * sizeof(int16_t));

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}
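// Worked example, assuming mAudioSamplesPerChunk is 160 samples (10ms at
// 16kHz; the real value is whatever mEndpointer.FrameSize() returns): with
// mBufferedSamples == 100 and aSampleCount == 500, samplesToCopy is
// min(500, 160 - 100) == 60, so the buffer is topped up to one full chunk
// and 60 is returned; the caller (FeedAudioData below) handles the
// remaining 440 samples.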
/*
 * Split a buffer of samples into chunks of equal size. The chunks are
 * stored in the array received as argument.
 * Returns the offset of the end of the last chunk that was created, i.e.
 * the number of samples consumed.
 */
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                                      uint32_t aSampleCount,
                                      nsTArray<nsRefPtr<SharedBuffer>>& aResult)
{
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    nsRefPtr<SharedBuffer> chunk =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
           mAudioSamplesPerChunk * sizeof(int16_t));

    aResult.AppendElement(chunk);
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}

AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
{
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    nsRefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    nsAutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk);
  }

  return segment;
}

void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.

  // ensure aSamples is deleted
  nsRefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<const int16_t*>(refSamples->Data());
  nsAutoTArray<nsRefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer);
      mAudioSamplesBuffer = nullptr;
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  NS_DispatchToMainThread(event);
}
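// End-to-end sketch of the chunking above, under the same assumption of a
// 160-sample chunk (the real size is mEndpointer.FrameSize()): suppose
// mBufferedSamples == 100 when aDuration == 500 samples arrive.
//   1. FillSamplesBuffer copies 60 samples, completing the buffered chunk,
//      which is queued; samplesIndex == 60.
//   2. SplitSamplesBuffer carves the remaining 440 samples into two whole
//      160-sample chunks (320 samples); samplesIndex == 380.
//   3. The 120 leftover samples are buffered for the next call.
// The three queued chunks become one AudioSegment, wrapped in an
// EVENT_AUDIO_DATA SpeechEvent and dispatched to the main thread.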
const char*
SpeechRecognition::GetName(FSMState aId)
{
  static const char* names[] = {
    "STATE_IDLE",
    "STATE_STARTING",
    "STATE_ESTIMATING",
    "STATE_WAITING_FOR_SPEECH",
    "STATE_RECOGNIZING",
    "STATE_WAITING_FOR_RESULT",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
  static const char* names[] = {
    "EVENT_START",
    "EVENT_STOP",
    "EVENT_ABORT",
    "EVENT_AUDIO_DATA",
    "EVENT_AUDIO_ERROR",
    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
    "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
    "EVENT_RECOGNITIONSERVICE_ERROR"
  };

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

SpeechEvent::~SpeechEvent()
{
  delete mAudioSegment;
}

NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
  nsCOMPtr<nsIDOMLocalMediaStream> localStream = do_QueryInterface(aStream);
  mRecognition->StartRecording(static_cast<DOMLocalMediaStream*>(localStream.get()));
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(const nsAString& aError)
{
  SpeechRecognitionErrorCode errorCode;

  if (aError.Equals(NS_LITERAL_STRING("PERMISSION_DENIED"))) {
    errorCode = SpeechRecognitionErrorCode::Not_allowed;
  } else {
    errorCode = SpeechRecognitionErrorCode::Audio_capture;
  }

  mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                              aError);

  return NS_OK;
}

} // namespace dom
} // namespace mozilla