content/media/webspeech/recognition/SpeechRecognition.cpp

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 15 Jan 2015 15:55:04 +0100
branch:      TOR_BUG_9701
changeset:   9:a63d609f5ebe
permissions: -rw-r--r--

Back out 97036ab72558 which inappropriately compared turds to third parties.

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Preferences.h" // for Preferences::GetInt/GetCString below
#include "mozilla/Services.h"

#include "AudioSegment.h"
#include "endpointer.h"

#include "GeneratedEvents.h"
#include "nsIDOMSpeechRecognitionEvent.h"
#include "nsIObserverService.h"
#include "nsServiceManagerUtils.h"

#include <algorithm>

namespace mozilla {
namespace dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "google"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

static const uint32_t kSAMPLE_RATE = 16000;
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// number of samples corresponding to 300ms of audio to send to the
// endpointer while it's in environment estimation mode
// kSAMPLE_RATE samples = 1s, kESTIMATION_SAMPLES samples = 300ms
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;
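// (At kSAMPLE_RATE == 16000, that works out to 4800 samples.)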

#ifdef PR_LOGGING
PRLogModuleInfo*
GetSpeechRecognitionLog()
{
  static PRLogModuleInfo* sLog;
  if (!sLog) {
    sLog = PR_NewLogModule("SpeechRecognition");
  }

  return sLog;
}
#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
#else
#define SR_LOG(...)
#endif

NS_INTERFACE_MAP_BEGIN(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

struct SpeechRecognition::TestConfig SpeechRecognition::mTestConfig;

SpeechRecognition::SpeechRecognition(nsPIDOMWindow* aOwnerWindow)
  : DOMEventTargetHelper(aOwnerWindow)
  , mEndpointer(kSAMPLE_RATE)
  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
  , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
{
  SR_LOG("created SpeechRecognition");

  mTestConfig.Init();
  if (mTestConfig.mEnableTests) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

  mEndpointer.set_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 500000));
  mEndpointer.set_long_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 1000000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
  Reset();
}
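
// StateBetween assumes the FSMState enumerators are declared in pipeline
// order (see the FSMState enum in SpeechRecognition.h), so a simple range
// check can identify a contiguous span of the state machine.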
bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
  return mCurrentState >= begin && mCurrentState <= end;
}

void
SpeechRecognition::SetState(FSMState state)
{
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
}

JSObject*
SpeechRecognition::WrapObject(JSContext* aCx)
{
  return SpeechRecognitionBinding::Wrap(aCx, this);
}

already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
                               ErrorResult& aRv)
{
  nsCOMPtr<nsPIDOMWindow> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  MOZ_ASSERT(win->IsInnerWindow());
  nsRefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

nsISupports*
SpeechRecognition::GetParentObject() const
{
  return GetOwner();
}

void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
  SR_LOG("Processing %s, current state is %s",
         GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // ignore all events while aborting
    return;
  }

  Transition(aEvent);
}

void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          Reset();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_COUNT:
      MOZ_CRASH("Invalid state STATE_COUNT");
  }
}

/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
{
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  mRecognitionService->ProcessAudioSegment(aSegment);
  return samples;
}

void
SpeechRecognition::GetRecognitionServiceCID(nsACString& aResultCID)
{
  if (mTestConfig.mFakeRecognitionService) {
    aResultCID =
      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";

    return;
  }

  nsAdoptingCString prefValue =
    Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);

  nsAutoCString speechRecognitionService;
  if (!prefValue.get() || prefValue.IsEmpty()) {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  } else {
    speechRecognitionService = prefValue;
  }

  aResultCID =
    NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
    speechRecognitionService;
}

/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 * DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 * the state is still what the method expected it to be.
 ****************************************************************************/
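
/*
 * A minimal sketch of the pattern (hypothetical state and method names,
 * for illustration only; StartedAudioCapture below is a real instance):
 *
 *   void SpeechRecognition::ExampleTransition(SpeechEvent* aEvent)
 *   {
 *     SetState(STATE_EXAMPLE);                 // rule 1: set state first
 *     DispatchTrustedEvent(NS_LITERAL_STRING("example")); // rule 2: last
 *     if (mCurrentState == STATE_EXAMPLE) {    // rule 3: re-check, the
 *       // safe to continue                    // handler may have spun the
 *     }                                        // event loop and moved us
 *   }
 */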

void
SpeechRecognition::Reset()
{
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}

void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}

void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);

  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}

void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  mRecognitionService->SoundEnd();

  StopRecording();
}

void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}

void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for
      // single-shot services; for continuous ones we should just
      // inform the service
      StopRecordingAndRecognize(aEvent);
    }
  }
}
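
// Per the rules above, NotifyFinalResult resets the FSM before building
// and dispatching the "result" DOM event, since the event handler may
// re-enter the state machine.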
void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
  ResetAndEnd();

  nsCOMPtr<nsIDOMEvent> domEvent;
  NS_NewDOMSpeechRecognitionEvent(getter_AddRefs(domEvent), nullptr, nullptr, nullptr);

  nsCOMPtr<nsIDOMSpeechRecognitionEvent> srEvent = do_QueryInterface(domEvent);
  nsRefPtr<SpeechRecognitionResultList> rlist = aEvent->mRecognitionResultList;
  nsCOMPtr<nsISupports> ilist = do_QueryInterface(rlist);
  srEvent->InitSpeechRecognitionEvent(NS_LITERAL_STRING("result"),
                                      true, false, 0, ilist,
                                      NS_LITERAL_STRING("NOT_IMPLEMENTED"),
                                      nullptr);
  domEvent->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(domEvent, &defaultActionEnabled);
}

void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}

void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  bool stopRecording = StateBetween(STATE_ESTIMATING, STATE_RECOGNIZING);

  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (stopRecording) {
    StopRecording();
  }

  ResetAndEnd();
}

void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
  aEvent->mError->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
}

/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  NS_ENSURE_STATE(mDOMStream->GetStream());
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();
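
  // The one-shot timer fires NS_TIMER_CALLBACK_TOPIC, which Observe()
  // handles by dispatching a no-speech error if we are still in a
  // pre-speech state (STATE_IDLE through STATE_WAITING_FOR_SPEECH).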
  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}

NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // we only really need to remove the listener explicitly when testing,
  // as our JS code still holds a reference to mDOMStream and only assigning
  // it to nullptr isn't guaranteed to free the stream and the listener.
  mDOMStream->GetStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}

NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (mTestConfig.mFakeFSMEvents &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}

void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_START")) {
    ErrorResult err;
    Start(err);
  } else if (aEventName.EqualsLiteral("EVENT_STOP")) {
    Stop();
  } else if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
                  NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_DATA")) {
    StartRecording(static_cast<DOMMediaStream*>(aSubject));
  } else {
    NS_ASSERTION(mTestConfig.mFakeRecognitionService,
                 "Got request for fake recognition service event, but "
                 TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");

    // let the fake recognition service handle the request
  }
}

already_AddRefed<SpeechGrammarList>
SpeechRecognition::GetGrammars(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return nullptr;
}

void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetLang(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetLang(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetInterimResults(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetInterimResults(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

uint32_t
SpeechRecognition::GetMaxAlternatives(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return 0;
}

void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::Start(ErrorResult& aRv)
{
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  nsAutoCString speechRecognitionServiceCID;
  GetRecognitionServiceCID(speechRecognitionServiceCID);

  nsresult rv;
  mRecognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
  NS_ENSURE_SUCCESS_VOID(rv);

  rv = mRecognitionService->Initialize(this->asWeakPtr());
  NS_ENSURE_SUCCESS_VOID(rv);

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;

  if (!mTestConfig.mFakeFSMEvents) {
    MediaManager* manager = MediaManager::Get();
    manager->GetUserMedia(false,
                          GetOwner(),
                          constraints,
                          new GetUserMediaSuccessCallback(this),
                          new GetUserMediaErrorCallback(this));
  }
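
  // When mFakeFSMEvents is set, GetUserMedia is skipped above; tests inject
  // audio via SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC instead (see
  // ProcessTestEventRequest, whose EVENT_AUDIO_DATA case calls
  // StartRecording directly).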

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Stop()
{
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Abort()
{
  if (mAborted) {
    return;
  }

  mAborted = true;
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::DispatchError(EventType aErrorType,
                                 SpeechRecognitionErrorCode aErrorCode,
                                 const nsAString& aMessage)
{
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");

  nsRefPtr<SpeechRecognitionError> srError =
    new SpeechRecognitionError(nullptr, nullptr, nullptr);

  ErrorResult err;
  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
                                      aErrorCode, aMessage, err);

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples into mAudioSamplesBuffer until it holds
 * mAudioSamplesPerChunk samples. Updates mBufferedSamples and returns the
 * number of samples that were buffered.
 */
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                     uint32_t aSampleCount)
{
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer.get());

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy = std::min(aSampleCount,
                                  mAudioSamplesPerChunk - mBufferedSamples);

  memcpy(samplesBuffer + mBufferedSamples, aSamples,
         samplesToCopy * sizeof(int16_t));

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}

/*
 * Split a samples buffer of a given size into chunks of equal size. The
 * chunks are stored in the array received as argument.
 * Returns the offset of the end of the last chunk that was created.
 */
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                                      uint32_t aSampleCount,
                                      nsTArray<nsRefPtr<SharedBuffer>>& aResult)
{
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    nsRefPtr<SharedBuffer> chunk =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
           mAudioSamplesPerChunk * sizeof(int16_t));

    aResult.AppendElement(chunk);
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}
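
// CreateAudioSegment packages each chunk as a single (mono) channel of
// mAudioSamplesPerChunk samples; ownership of each buffer is transferred
// to the segment via buffer.forget().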
AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
{
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    nsRefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    nsAutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk);
  }

  return segment;
}

void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called on the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.
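  //
  // A worked example with illustrative numbers only: if
  // mAudioSamplesPerChunk were 400 and 150 samples were already buffered,
  // a call with aDuration == 1000 would complete the pending chunk with
  // 250 samples, split off one full 400-sample chunk, and buffer the
  // remaining 350 samples for the next call.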

  // ensure aSamples is deleted
  nsRefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
  nsAutoTArray<nsRefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer);
      mAudioSamplesBuffer = nullptr;
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  NS_DispatchToMainThread(event);
}

const char*
SpeechRecognition::GetName(FSMState aId)
{
  static const char* names[] = {
    "STATE_IDLE",
    "STATE_STARTING",
    "STATE_ESTIMATING",
    "STATE_WAITING_FOR_SPEECH",
    "STATE_RECOGNIZING",
    "STATE_WAITING_FOR_RESULT",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
  static const char* names[] = {
    "EVENT_START",
    "EVENT_STOP",
    "EVENT_ABORT",
    "EVENT_AUDIO_DATA",
    "EVENT_AUDIO_ERROR",
    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
    "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
    "EVENT_RECOGNITIONSERVICE_ERROR"
  };

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

SpeechEvent::~SpeechEvent()
{
  delete mAudioSegment;
}

NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
  nsCOMPtr<nsIDOMLocalMediaStream> localStream = do_QueryInterface(aStream);
  mRecognition->StartRecording(static_cast<DOMLocalMediaStream*>(localStream.get()));
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(const nsAString& aError)
{
  SpeechRecognitionErrorCode errorCode;

  if (aError.Equals(NS_LITERAL_STRING("PERMISSION_DENIED"))) {
    errorCode = SpeechRecognitionErrorCode::Not_allowed;
  } else {
    errorCode = SpeechRecognitionErrorCode::Audio_capture;
  }

  mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                              aError);

  return NS_OK;
}

} // namespace dom
} // namespace mozilla
