/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Services.h"

#include "AudioSegment.h"
#include "endpointer.h"

#include "GeneratedEvents.h"
#include "nsIDOMSpeechRecognitionEvent.h"
#include "nsIObserverService.h"
#include "nsServiceManagerUtils.h"

#include <algorithm>

namespace mozilla {
namespace dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "google"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

static const uint32_t kSAMPLE_RATE = 16000;
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// Number of frames corresponding to 300ms of audio to send to the endpointer
// while it's in environment estimation mode.
// kSAMPLE_RATE frames = 1s, so kESTIMATION_FRAMES frames = 300ms.
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;
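// Worked example of the constant above: at kSAMPLE_RATE = 16000 samples/s,
// 300ms of audio is 300 * 16000 / 1000 = 4800 samples (9600 bytes of
// signed 16-bit mono PCM).
static_assert(kESTIMATION_SAMPLES == 4800,
              "300ms at 16kHz should be 4800 samples");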
#ifdef PR_LOGGING
PRLogModuleInfo*
GetSpeechRecognitionLog()
{
  static PRLogModuleInfo* sLog;
  if (!sLog) {
    sLog = PR_NewLogModule("SpeechRecognition");
  }

  return sLog;
}
#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
#else
#define SR_LOG(...)
#endif

NS_INTERFACE_MAP_BEGIN(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

struct SpeechRecognition::TestConfig SpeechRecognition::mTestConfig;

SpeechRecognition::SpeechRecognition(nsPIDOMWindow* aOwnerWindow)
  : DOMEventTargetHelper(aOwnerWindow)
  , mEndpointer(kSAMPLE_RATE)
  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
  , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
{
  SR_LOG("created SpeechRecognition");

  mTestConfig.Init();
  if (mTestConfig.mEnableTests) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

  mEndpointer.set_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 500000));
  mEndpointer.set_long_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 1000000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
  Reset();
}

bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
  return mCurrentState >= begin && mCurrentState <= end;
}

void
SpeechRecognition::SetState(FSMState state)
{
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
}

JSObject*
SpeechRecognition::WrapObject(JSContext* aCx)
{
  return SpeechRecognitionBinding::Wrap(aCx, this);
}

already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
                               ErrorResult& aRv)
{
  nsCOMPtr<nsPIDOMWindow> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  MOZ_ASSERT(win->IsInnerWindow());
  nsRefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

nsISupports*
SpeechRecognition::GetParentObject() const
{
  return GetOwner();
}

void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
  SR_LOG("Processing %s, current state is %s",
         GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // ignore all events while aborting
    return;
  }

  Transition(aEvent);
}
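// Overview of the state machine implemented by Transition() below, derived
// from its per-state switches (the entries name the action taken for the
// event that advances the machine):
//
//   STATE_IDLE               EVENT_START      -> WaitForAudioData
//   STATE_STARTING           EVENT_AUDIO_DATA -> StartedAudioCapture
//   STATE_ESTIMATING         EVENT_AUDIO_DATA -> WaitForEstimation
//   STATE_WAITING_FOR_SPEECH EVENT_AUDIO_DATA -> DetectSpeech
//   STATE_RECOGNIZING        EVENT_AUDIO_DATA -> WaitForSpeechEnd
//   STATE_WAITING_FOR_RESULT EVENT_RECOGNITIONSERVICE_FINAL_RESULT
//                                             -> NotifyFinalResult
//
// In every active state, EVENT_ABORT is handled via AbortSilently and
// EVENT_AUDIO_ERROR via AbortError; the remaining events either do
// nothing or are treated as impossible (MOZ_CRASH).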
void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          Reset();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType);
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_COUNT:
      MOZ_CRASH("Invalid state STATE_COUNT");
  }
}
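// Typical happy-path sequence through the machine above, assuming the user
// grants microphone access and speaks once (a sketch, not a test):
//
//   EVENT_START        STATE_IDLE               -> STATE_STARTING
//   EVENT_AUDIO_DATA   STATE_STARTING           -> STATE_ESTIMATING
//   EVENT_AUDIO_DATA*  STATE_ESTIMATING         -> STATE_WAITING_FOR_SPEECH
//   EVENT_AUDIO_DATA*  STATE_WAITING_FOR_SPEECH -> STATE_RECOGNIZING
//   EVENT_AUDIO_DATA*  STATE_RECOGNIZING        -> STATE_WAITING_FOR_RESULT
//   EVENT_RECOGNITIONSERVICE_FINAL_RESULT       -> "result" event, STATE_IDLE
//
// (* repeated until the endpointer reaches the corresponding milestone)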
/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
{
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  mRecognitionService->ProcessAudioSegment(aSegment);
  return samples;
}

void
SpeechRecognition::GetRecognitionServiceCID(nsACString& aResultCID)
{
  if (mTestConfig.mFakeRecognitionService) {
    aResultCID =
      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";

    return;
  }

  nsAdoptingCString prefValue =
    Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);

  nsAutoCString speechRecognitionService;
  if (!prefValue.get() || prefValue.IsEmpty()) {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  } else {
    speechRecognitionService = prefValue;
  }

  aResultCID =
    NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
    speechRecognitionService;
}

/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 *    DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 *    the state is still what the method expected it to be.
 ****************************************************************************/
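// Illustration of the three rules above, following the pattern that
// StartedAudioCapture uses below: set the state first, dispatch events
// last, and re-check mCurrentState after any dispatch, because the event
// handler may have re-entered the FSM and moved it elsewhere:
//
//   SetState(STATE_ESTIMATING);                             // rule 1
//   DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));  // rule 2
//   if (mCurrentState == STATE_ESTIMATING) {                // rule 3
//     DispatchTrustedEvent(NS_LITERAL_STRING("start"));
//   }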
void
SpeechRecognition::Reset()
{
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}

void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}

void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);

  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}

void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  mRecognitionService->SoundEnd();

  StopRecording();
}

void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}

void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for
      // single-shot services; for continuous ones we should just
      // inform the service
      StopRecordingAndRecognize(aEvent);
    }
  }
}
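// How the endpointer drives the DOM events fired above: the first
// kESTIMATION_SAMPLES samples are spent estimating the noise environment
// ("audiostart" and "start" fire at the beginning of that phase), then the
// endpointer switches to user input mode and we wait for
// DidStartReceivingSpeech() ("speechstart"), and finally for
// speech_input_complete() ("speechend"), which stops recording and signals
// SoundEnd() to the recognition service.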
void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
  ResetAndEnd();

  nsCOMPtr<nsIDOMEvent> domEvent;
  NS_NewDOMSpeechRecognitionEvent(getter_AddRefs(domEvent), nullptr, nullptr, nullptr);

  nsCOMPtr<nsIDOMSpeechRecognitionEvent> srEvent = do_QueryInterface(domEvent);
  nsRefPtr<SpeechRecognitionResultList> rlist = aEvent->mRecognitionResultList;
  nsCOMPtr<nsISupports> ilist = do_QueryInterface(rlist);
  srEvent->InitSpeechRecognitionEvent(NS_LITERAL_STRING("result"),
                                      true, false, 0, ilist,
                                      NS_LITERAL_STRING("NOT_IMPLEMENTED"),
                                      nullptr);
  domEvent->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(domEvent, &defaultActionEnabled);
}

void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}

void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  bool stopRecording = StateBetween(STATE_ESTIMATING, STATE_RECOGNIZING);

  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (stopRecording) {
    StopRecording();
  }

  ResetAndEnd();
}

void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
  aEvent->mError->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
}
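// Note on ordering in the error path: AbortError first runs AbortSilently,
// which ends the session and fires "end" via ResetAndEnd, and only then
// fires the "error" event through NotifyError, so listeners observe "end"
// before "error".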
/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  NS_ENSURE_STATE(mDOMStream->GetStream());
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}

NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // we only really need to remove the listener explicitly when testing,
  // as our JS code still holds a reference to mDOMStream and only assigning
  // it to nullptr isn't guaranteed to free the stream and the listener.
  mDOMStream->GetStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}

NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (mTestConfig.mFakeFSMEvents &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}
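// Maps observer-notification payloads (the event-name strings sent by the
// test harness) onto the corresponding public API calls or fake FSM events;
// anything unrecognized is assumed to be addressed to the fake recognition
// service.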
void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_START")) {
    ErrorResult err;
    Start(err);
  } else if (aEventName.EqualsLiteral("EVENT_STOP")) {
    Stop();
  } else if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
                  NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_DATA")) {
    StartRecording(static_cast<DOMMediaStream*>(aSubject));
  } else {
    NS_ASSERTION(mTestConfig.mFakeRecognitionService,
                 "Got request for fake recognition service event, but "
                 TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");

    // let the fake recognition service handle the request
  }
}

already_AddRefed<SpeechGrammarList>
SpeechRecognition::GetGrammars(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return nullptr;
}

void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetLang(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetLang(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetInterimResults(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetInterimResults(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

uint32_t
SpeechRecognition::GetMaxAlternatives(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return 0;
}

void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}
void
SpeechRecognition::Start(ErrorResult& aRv)
{
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  nsAutoCString speechRecognitionServiceCID;
  GetRecognitionServiceCID(speechRecognitionServiceCID);

  nsresult rv;
  mRecognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
  NS_ENSURE_SUCCESS_VOID(rv);

  rv = mRecognitionService->Initialize(this->asWeakPtr());
  NS_ENSURE_SUCCESS_VOID(rv);

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;

  if (!mTestConfig.mFakeFSMEvents) {
    MediaManager* manager = MediaManager::Get();
    manager->GetUserMedia(false,
                          GetOwner(),
                          constraints,
                          new GetUserMediaSuccessCallback(this),
                          new GetUserMediaErrorCallback(this));
  }

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Stop()
{
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Abort()
{
  if (mAborted) {
    return;
  }

  mAborted = true;
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::DispatchError(EventType aErrorType,
                                 SpeechRecognitionErrorCode aErrorCode,
                                 const nsAString& aMessage)
{
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");

  nsRefPtr<SpeechRecognitionError> srError =
    new SpeechRecognitionError(nullptr, nullptr, nullptr);

  ErrorResult err;
  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
                                      aErrorCode, aMessage, err);

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples in mAudioSamplesBuffer until a whole chunk of
 * mAudioSamplesPerChunk samples has accumulated.
 * Updates mBufferedSamples and returns the number of samples that were
 * buffered.
 */
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                     uint32_t aSampleCount)
{
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer.get());

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy = std::min(aSampleCount,
                                  mAudioSamplesPerChunk - mBufferedSamples);

  memcpy(samplesBuffer + mBufferedSamples, aSamples,
         samplesToCopy * sizeof(int16_t));

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}
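// Worked example for FillSamplesBuffer, assuming the endpointer frame size
// (and hence mAudioSamplesPerChunk) is 160 samples, i.e. 10ms at 16kHz --
// the actual value comes from mEndpointer.FrameSize() and may differ:
// with mBufferedSamples == 100, a call with aSampleCount == 250 copies
// min(250, 160 - 100) == 60 samples, returns 60, and leaves a full chunk
// of 160 buffered samples for the caller to flush.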
/*
 * Split a buffer of samples into chunks of mAudioSamplesPerChunk samples
 * each. The chunks are appended to the array received as argument.
 * Returns the offset of the end of the last chunk that was created.
 */
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                                      uint32_t aSampleCount,
                                      nsTArray<nsRefPtr<SharedBuffer>>& aResult)
{
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    nsRefPtr<SharedBuffer> chunk =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
           mAudioSamplesPerChunk * sizeof(int16_t));

    aResult.AppendElement(chunk);
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}

AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
{
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    nsRefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    nsAutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk);
  }

  return segment;
}

void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.

  // ensure aSamples is deleted
  nsRefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<const int16_t*>(refSamples->Data());
  nsAutoTArray<nsRefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer);
      mAudioSamplesBuffer = nullptr;
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  NS_DispatchToMainThread(event);
}
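// End-to-end example of the FeedAudioData flow above, again assuming a
// chunk size of 160 samples (the real value is mEndpointer.FrameSize()):
// with 100 samples already buffered, a callback delivering aDuration == 500
// samples proceeds in three phases:
//   1. FillSamplesBuffer copies 60 samples, completing the buffered chunk,
//      which is moved into chunksToSend            (samplesIndex == 60)
//   2. SplitSamplesBuffer cuts samples 60..379 into two whole chunks
//      of 160 samples                              (samplesIndex == 380)
//   3. the remaining 120 samples are stashed in a fresh mAudioSamplesBuffer
//      for the next callback                       (mBufferedSamples == 120)
// The three complete chunks then travel to the main thread as a single
// EVENT_AUDIO_DATA SpeechEvent.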
const char*
SpeechRecognition::GetName(FSMState aId)
{
  static const char* names[] = {
    "STATE_IDLE",
    "STATE_STARTING",
    "STATE_ESTIMATING",
    "STATE_WAITING_FOR_SPEECH",
    "STATE_RECOGNIZING",
    "STATE_WAITING_FOR_RESULT",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
  static const char* names[] = {
    "EVENT_START",
    "EVENT_STOP",
    "EVENT_ABORT",
    "EVENT_AUDIO_DATA",
    "EVENT_AUDIO_ERROR",
    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
    "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
    "EVENT_RECOGNITIONSERVICE_ERROR"
  };

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

SpeechEvent::~SpeechEvent()
{
  delete mAudioSegment;
}

NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
  nsCOMPtr<nsIDOMLocalMediaStream> localStream = do_QueryInterface(aStream);
  mRecognition->StartRecording(static_cast<DOMLocalMediaStream*>(localStream.get()));
  return NS_OK;
}
NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(const nsAString& aError)
{
  SpeechRecognitionErrorCode errorCode;

  if (aError.Equals(NS_LITERAL_STRING("PERMISSION_DENIED"))) {
    errorCode = SpeechRecognitionErrorCode::Not_allowed;
  } else {
    errorCode = SpeechRecognitionErrorCode::Audio_capture;
  }

  mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                              aError);

  return NS_OK;
}

} // namespace dom
} // namespace mozilla