Thu, 15 Jan 2015 15:55:04 +0100
Back out 97036ab72558 which inappropriately compared turds to third parties.
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Services.h"

#include "AudioSegment.h"
#include "endpointer.h"

#include "GeneratedEvents.h"
#include "nsIDOMSpeechRecognitionEvent.h"
#include "nsIObserverService.h"
#include "nsServiceManagerUtils.h"

#include <algorithm>

namespace mozilla {
namespace dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "google"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

static const uint32_t kSAMPLE_RATE = 16000;
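// How long to wait for speech to be detected before dispatching a
// "no-speech" error (the timer is armed in StartRecording() and handled
// in Observe()).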
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// Number of samples corresponding to 300ms of audio to send to the endpointer
// while it's in environment estimation mode.
// kSAMPLE_RATE samples = 1s, kESTIMATION_SAMPLES samples = 300ms
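// (at 16 kHz this works out to 300 * 16000 / 1000 = 4800 samples)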
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;

#ifdef PR_LOGGING
PRLogModuleInfo*
GetSpeechRecognitionLog()
{
  static PRLogModuleInfo* sLog;
  if (!sLog) {
    sLog = PR_NewLogModule("SpeechRecognition");
  }

  return sLog;
}
#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
#else
#define SR_LOG(...)
#endif

NS_INTERFACE_MAP_BEGIN(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)
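
// Test-only configuration (fake recognition service, fake FSM events),
// initialized from preferences in the constructor.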
struct SpeechRecognition::TestConfig SpeechRecognition::mTestConfig;

SpeechRecognition::SpeechRecognition(nsPIDOMWindow* aOwnerWindow)
  : DOMEventTargetHelper(aOwnerWindow)
  , mEndpointer(kSAMPLE_RATE)
  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
  , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
{
  SR_LOG("created SpeechRecognition");

  mTestConfig.Init();
  if (mTestConfig.mEnableTests) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

  mEndpointer.set_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 500000));
  mEndpointer.set_long_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 1000000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
  Reset();
}

bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
  return mCurrentState >= begin && mCurrentState <= end;
}

void
SpeechRecognition::SetState(FSMState state)
{
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
}

JSObject*
SpeechRecognition::WrapObject(JSContext* aCx)
{
  return SpeechRecognitionBinding::Wrap(aCx, this);
}

already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
                               ErrorResult& aRv)
{
  nsCOMPtr<nsPIDOMWindow> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  MOZ_ASSERT(win->IsInnerWindow());
  nsRefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

nsISupports*
SpeechRecognition::GetParentObject() const
{
  return GetOwner();
}

void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
  SR_LOG("Processing %s, current state is %s",
         GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // ignore all events except EVENT_ABORT while aborting
    return;
  }

  Transition(aEvent);
}

void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          Reset();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_COUNT:
      MOZ_CRASH("Invalid state STATE_COUNT");
  }
}

/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
{
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
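    // |out| is required by ProcessAudio's signature, but its value is
    // unused here.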
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  mRecognitionService->ProcessAudioSegment(aSegment);
  return samples;
}

void
SpeechRecognition::GetRecognitionServiceCID(nsACString& aResultCID)
{
  if (mTestConfig.mFakeRecognitionService) {
    aResultCID =
      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";

    return;
  }

  nsAdoptingCString prefValue =
    Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);

  nsAutoCString speechRecognitionService;
  if (!prefValue.get() || prefValue.IsEmpty()) {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  } else {
    speechRecognitionService = prefValue;
  }

  aResultCID =
    NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
    speechRecognitionService;
}

/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 * DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 * the state is still what the method expected it to be.
 ****************************************************************************/
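//
// e.g. StartedAudioCapture below follows all three rules:
//   SetState(STATE_ESTIMATING);               // (1) state first
//   ...
//   DispatchTrustedEvent(...);                // (2) events as late as possible
//   if (mCurrentState == STATE_ESTIMATING) {  // (3) re-check the state, since
//     ...                                     // the handler may have re-entered
//   }                                         // the FSM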

void
SpeechRecognition::Reset()
{
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}

void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}

void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);

  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}

void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  mRecognitionService->SoundEnd();

  StopRecording();
}

void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}

void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for
      // single-shot services; for continuous ones we should just inform
      // the service
      StopRecordingAndRecognize(aEvent);
    }
  }
}

void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
  ResetAndEnd();

  nsCOMPtr<nsIDOMEvent> domEvent;
  NS_NewDOMSpeechRecognitionEvent(getter_AddRefs(domEvent), nullptr, nullptr, nullptr);

  nsCOMPtr<nsIDOMSpeechRecognitionEvent> srEvent = do_QueryInterface(domEvent);
  nsRefPtr<SpeechRecognitionResultList> rlist = aEvent->mRecognitionResultList;
  nsCOMPtr<nsISupports> ilist = do_QueryInterface(rlist);
  srEvent->InitSpeechRecognitionEvent(NS_LITERAL_STRING("result"),
                                      true, false, 0, ilist,
                                      NS_LITERAL_STRING("NOT_IMPLEMENTED"),
                                      nullptr);
  domEvent->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(domEvent, &defaultActionEnabled);
}

void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}

void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  bool stopRecording = StateBetween(STATE_ESTIMATING, STATE_RECOGNIZING);

  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (stopRecording) {
    StopRecording();
  }

  ResetAndEnd();
}

void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
  aEvent->mError->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
}

/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  NS_ENSURE_STATE(mDOMStream->GetStream());
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}

NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // we only really need to remove the listener explicitly when testing,
  // as our JS code still holds a reference to mDOMStream and only assigning
  // it to nullptr isn't guaranteed to free the stream and the listener.
  mDOMStream->GetStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}

NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (mTestConfig.mFakeFSMEvents &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}

void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_START")) {
    ErrorResult err;
    Start(err);
  } else if (aEventName.EqualsLiteral("EVENT_STOP")) {
    Stop();
  } else if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
                  NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_DATA")) {
    StartRecording(static_cast<DOMMediaStream*>(aSubject));
  } else {
    NS_ASSERTION(mTestConfig.mFakeRecognitionService,
                 "Got request for fake recognition service event, but "
                 TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");

    // let the fake recognition service handle the request
  }
}

already_AddRefed<SpeechGrammarList>
SpeechRecognition::GetGrammars(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return nullptr;
}

void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetLang(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetLang(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetInterimResults(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetInterimResults(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

uint32_t
SpeechRecognition::GetMaxAlternatives(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return 0;
}

void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::Start(ErrorResult& aRv)
{
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  nsAutoCString speechRecognitionServiceCID;
  GetRecognitionServiceCID(speechRecognitionServiceCID);

  nsresult rv;
  mRecognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
  NS_ENSURE_SUCCESS_VOID(rv);

  rv = mRecognitionService->Initialize(this->asWeakPtr());
  NS_ENSURE_SUCCESS_VOID(rv);

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;
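
  // when faking FSM events in tests, audio is fed in directly via an
  // EVENT_AUDIO_DATA test request (see ProcessTestEventRequest) rather
  // than through getUserMedia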
  if (!mTestConfig.mFakeFSMEvents) {
    MediaManager* manager = MediaManager::Get();
    manager->GetUserMedia(false,
                          GetOwner(),
                          constraints,
                          new GetUserMediaSuccessCallback(this),
                          new GetUserMediaErrorCallback(this));
  }

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Stop()
{
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Abort()
{
  if (mAborted) {
    return;
  }

  mAborted = true;
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::DispatchError(EventType aErrorType,
                                 SpeechRecognitionErrorCode aErrorCode,
                                 const nsAString& aMessage)
{
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");

  nsRefPtr<SpeechRecognitionError> srError =
    new SpeechRecognitionError(nullptr, nullptr, nullptr);

  ErrorResult err;
  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
                                      aErrorCode, aMessage, err);

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples into mAudioSamplesBuffer until the buffer holds
 * a full chunk of mAudioSamplesPerChunk samples.
 * Updates mBufferedSamples and returns the number of samples that were buffered.
 */
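// e.g. (illustrative values) with mAudioSamplesPerChunk == 4800 and
// mBufferedSamples == 4000, a call with aSampleCount == 1000 copies
// min(1000, 4800 - 4000) == 800 samples and returns 800.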
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                     uint32_t aSampleCount)
{
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer.get());

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy = std::min(aSampleCount,
                                  mAudioSamplesPerChunk - mBufferedSamples);

  memcpy(samplesBuffer + mBufferedSamples, aSamples,
         samplesToCopy * sizeof(int16_t));

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}

/*
 * Split a buffer of samples into chunks of mAudioSamplesPerChunk samples,
 * storing the chunks in the array passed as argument.
 * Returns the offset of the end of the last chunk that was created, i.e.
 * the number of samples consumed from the input buffer.
 */
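// e.g. (illustrative values) with mAudioSamplesPerChunk == 4800, splitting
// a 10000-sample buffer produces two chunks, returns 9600, and leaves the
// remaining 400 samples for the caller to buffer.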
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                                      uint32_t aSampleCount,
                                      nsTArray<nsRefPtr<SharedBuffer>>& aResult)
{
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    nsRefPtr<SharedBuffer> chunk =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
           mAudioSamplesPerChunk * sizeof(int16_t));

    aResult.AppendElement(chunk);
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}

AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
{
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    nsRefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    nsAutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk);
  }

  return segment;
}

void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.
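  //
  // e.g. (illustrative values) with mAudioSamplesPerChunk == 4800 and 4000
  // samples already buffered, a 10000-sample call first completes the
  // buffered chunk (800 samples), then splits off one more full chunk
  // (4800 samples), and buffers the remaining 4400 for the next call.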

  // ensure aSamples is deleted
  nsRefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
  nsAutoTArray<nsRefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer);
      mAudioSamplesBuffer = nullptr;
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  NS_DispatchToMainThread(event);
}

const char*
SpeechRecognition::GetName(FSMState aId)
{
  static const char* names[] = {
    "STATE_IDLE",
    "STATE_STARTING",
    "STATE_ESTIMATING",
    "STATE_WAITING_FOR_SPEECH",
    "STATE_RECOGNIZING",
    "STATE_WAITING_FOR_RESULT",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
  static const char* names[] = {
    "EVENT_START",
    "EVENT_STOP",
    "EVENT_ABORT",
    "EVENT_AUDIO_DATA",
    "EVENT_AUDIO_ERROR",
    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
    "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
    "EVENT_RECOGNITIONSERVICE_ERROR"
  };

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

SpeechEvent::~SpeechEvent()
{
  delete mAudioSegment;
}

NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
  nsCOMPtr<nsIDOMLocalMediaStream> localStream = do_QueryInterface(aStream);
  mRecognition->StartRecording(static_cast<DOMLocalMediaStream*>(localStream.get()));
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(const nsAString& aError)
{
  SpeechRecognitionErrorCode errorCode;

  if (aError.Equals(NS_LITERAL_STRING("PERMISSION_DENIED"))) {
    errorCode = SpeechRecognitionErrorCode::Not_allowed;
  } else {
    errorCode = SpeechRecognitionErrorCode::Audio_capture;
  }

  mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                              aError);

  return NS_OK;
}

} // namespace dom
} // namespace mozilla