Thu, 15 Jan 2015 15:55:04 +0100
Back out 97036ab72558 which inappropriately compared turds to third parties.
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Services.h"

#include "AudioSegment.h"
#include "endpointer.h"

#include "GeneratedEvents.h"
#include "nsIDOMSpeechRecognitionEvent.h"
#include "nsIObserverService.h"
#include "nsServiceManagerUtils.h"

#include <algorithm>

namespace mozilla {
namespace dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "google"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

static const uint32_t kSAMPLE_RATE = 16000;
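// How long to wait for speech to be detected before dispatching a
// "no-speech" error (the timer is armed in StartRecording() and handled
// in Observe()).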
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// Number of samples corresponding to 300ms of audio to send to the endpointer
// while it's in environment estimation mode.
// kSAMPLE_RATE samples = 1s, kESTIMATION_SAMPLES samples = 300ms
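// (at 16 kHz this works out to 300 * 16000 / 1000 = 4800 samples)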
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;

#ifdef PR_LOGGING
PRLogModuleInfo*
GetSpeechRecognitionLog()
{
  static PRLogModuleInfo* sLog;
  if (!sLog) {
    sLog = PR_NewLogModule("SpeechRecognition");
  }

  return sLog;
}
#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
#else
#define SR_LOG(...)
#endif

NS_INTERFACE_MAP_BEGIN(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)
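
// Test-only configuration (fake recognition service, fake FSM events),
// initialized from preferences in the constructor.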
struct SpeechRecognition::TestConfig SpeechRecognition::mTestConfig;

SpeechRecognition::SpeechRecognition(nsPIDOMWindow* aOwnerWindow)
  : DOMEventTargetHelper(aOwnerWindow)
  , mEndpointer(kSAMPLE_RATE)
  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
  , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
{
  SR_LOG("created SpeechRecognition");

  mTestConfig.Init();
  if (mTestConfig.mEnableTests) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

  mEndpointer.set_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 500000));
  mEndpointer.set_long_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 1000000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
  Reset();
}

bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
  return mCurrentState >= begin && mCurrentState <= end;
}

void
SpeechRecognition::SetState(FSMState state)
{
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
}

JSObject*
SpeechRecognition::WrapObject(JSContext* aCx)
{
  return SpeechRecognitionBinding::Wrap(aCx, this);
}

already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
                               ErrorResult& aRv)
{
  nsCOMPtr<nsPIDOMWindow> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  MOZ_ASSERT(win->IsInnerWindow());
  nsRefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

nsISupports*
SpeechRecognition::GetParentObject() const
{
  return GetOwner();
}

void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
  SR_LOG("Processing %s, current state is %s",
         GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // ignore all events except EVENT_ABORT while aborting
    return;
  }

  Transition(aEvent);
}

void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          Reset();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_COUNT:
      MOZ_CRASH("Invalid state STATE_COUNT");
  }
}

/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
{
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
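    // |out| is required by ProcessAudio's signature, but its value is
    // unused here.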
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  mRecognitionService->ProcessAudioSegment(aSegment);
  return samples;
}

void
SpeechRecognition::GetRecognitionServiceCID(nsACString& aResultCID)
{
  if (mTestConfig.mFakeRecognitionService) {
    aResultCID =
      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";

    return;
  }

  nsAdoptingCString prefValue =
    Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);

  nsAutoCString speechRecognitionService;
  if (!prefValue.get() || prefValue.IsEmpty()) {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  } else {
    speechRecognitionService = prefValue;
  }

  aResultCID =
    NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
    speechRecognitionService;
}

/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 * DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 * the state is still what the method expected it to be.
 ****************************************************************************/
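//
// e.g. StartedAudioCapture below follows all three rules:
//   SetState(STATE_ESTIMATING);               // (1) state first
//   ...
//   DispatchTrustedEvent(...);                // (2) events as late as possible
//   if (mCurrentState == STATE_ESTIMATING) {  // (3) re-check the state, since
//     ...                                     // the handler may have re-entered
//   }                                         // the FSM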

void
SpeechRecognition::Reset()
{
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}

void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}

void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);

  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}

void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  mRecognitionService->SoundEnd();

  StopRecording();
}

void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}

void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for
      // single-shot services; for continuous ones we should just inform
      // the service
      StopRecordingAndRecognize(aEvent);
    }
  }
}

void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
  ResetAndEnd();

  nsCOMPtr<nsIDOMEvent> domEvent;
  NS_NewDOMSpeechRecognitionEvent(getter_AddRefs(domEvent), nullptr, nullptr, nullptr);

  nsCOMPtr<nsIDOMSpeechRecognitionEvent> srEvent = do_QueryInterface(domEvent);
  nsRefPtr<SpeechRecognitionResultList> rlist = aEvent->mRecognitionResultList;
  nsCOMPtr<nsISupports> ilist = do_QueryInterface(rlist);
  srEvent->InitSpeechRecognitionEvent(NS_LITERAL_STRING("result"),
                                      true, false, 0, ilist,
                                      NS_LITERAL_STRING("NOT_IMPLEMENTED"),
                                      nullptr);
  domEvent->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(domEvent, &defaultActionEnabled);
}

void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}

void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  bool stopRecording = StateBetween(STATE_ESTIMATING, STATE_RECOGNIZING);

  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (stopRecording) {
    StopRecording();
  }

  ResetAndEnd();
}

void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
  aEvent->mError->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
}

/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  NS_ENSURE_STATE(mDOMStream->GetStream());
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}

NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // we only really need to remove the listener explicitly when testing,
  // as our JS code still holds a reference to mDOMStream and only assigning
  // it to nullptr isn't guaranteed to free the stream and the listener.
  mDOMStream->GetStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}

NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (mTestConfig.mFakeFSMEvents &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}

void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_START")) {
    ErrorResult err;
    Start(err);
  } else if (aEventName.EqualsLiteral("EVENT_STOP")) {
    Stop();
  } else if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
                  NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_DATA")) {
    StartRecording(static_cast<DOMMediaStream*>(aSubject));
  } else {
    NS_ASSERTION(mTestConfig.mFakeRecognitionService,
                 "Got request for fake recognition service event, but "
                 TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");

    // let the fake recognition service handle the request
  }
}

already_AddRefed<SpeechGrammarList>
SpeechRecognition::GetGrammars(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return nullptr;
}

void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetLang(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetLang(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

bool
SpeechRecognition::GetInterimResults(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetInterimResults(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

uint32_t
SpeechRecognition::GetMaxAlternatives(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return 0;
}

void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
}

void
SpeechRecognition::Start(ErrorResult& aRv)
{
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  nsAutoCString speechRecognitionServiceCID;
  GetRecognitionServiceCID(speechRecognitionServiceCID);

  nsresult rv;
  mRecognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
  NS_ENSURE_SUCCESS_VOID(rv);

  rv = mRecognitionService->Initialize(this->asWeakPtr());
  NS_ENSURE_SUCCESS_VOID(rv);

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;
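
  // when faking FSM events in tests, audio is fed in directly via an
  // EVENT_AUDIO_DATA test request (see ProcessTestEventRequest) rather
  // than through getUserMedia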
  if (!mTestConfig.mFakeFSMEvents) {
    MediaManager* manager = MediaManager::Get();
    manager->GetUserMedia(false,
                          GetOwner(),
                          constraints,
                          new GetUserMediaSuccessCallback(this),
                          new GetUserMediaErrorCallback(this));
  }

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Stop()
{
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Abort()
{
  if (mAborted) {
    return;
  }

  mAborted = true;
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::DispatchError(EventType aErrorType,
                                 SpeechRecognitionErrorCode aErrorCode,
                                 const nsAString& aMessage)
{
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");

  nsRefPtr<SpeechRecognitionError> srError =
    new SpeechRecognitionError(nullptr, nullptr, nullptr);

  ErrorResult err;
  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
                                      aErrorCode, aMessage, err);

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples into mAudioSamplesBuffer until the buffer holds
 * a full chunk of mAudioSamplesPerChunk samples.
 * Updates mBufferedSamples and returns the number of samples that were buffered.
 */
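// e.g. (illustrative values) with mAudioSamplesPerChunk == 4800 and
// mBufferedSamples == 4000, a call with aSampleCount == 1000 copies
// min(1000, 4800 - 4000) == 800 samples and returns 800.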
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                     uint32_t aSampleCount)
{
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer.get());

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy = std::min(aSampleCount,
                                  mAudioSamplesPerChunk - mBufferedSamples);

  memcpy(samplesBuffer + mBufferedSamples, aSamples,
         samplesToCopy * sizeof(int16_t));

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}

/*
 * Split a buffer of samples into chunks of mAudioSamplesPerChunk samples,
 * storing the chunks in the array passed as argument.
 * Returns the offset of the end of the last chunk that was created, i.e.
 * the number of samples consumed from the input buffer.
 */
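// e.g. (illustrative values) with mAudioSamplesPerChunk == 4800, splitting
// a 10000-sample buffer produces two chunks, returns 9600, and leaves the
// remaining 400 samples for the caller to buffer.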
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                                      uint32_t aSampleCount,
                                      nsTArray<nsRefPtr<SharedBuffer>>& aResult)
{
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    nsRefPtr<SharedBuffer> chunk =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
           mAudioSamplesPerChunk * sizeof(int16_t));

    aResult.AppendElement(chunk);
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}

AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
{
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    nsRefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    nsAutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk);
  }

  return segment;
}

void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.
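  //
  // e.g. (illustrative values) with mAudioSamplesPerChunk == 4800 and 4000
  // samples already buffered, a 10000-sample call first completes the
  // buffered chunk (800 samples), then splits off one more full chunk
  // (4800 samples), and buffers the remaining 4400 for the next call.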

  // ensure aSamples is deleted
  nsRefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
  nsAutoTArray<nsRefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer);
      mAudioSamplesBuffer = nullptr;
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  NS_DispatchToMainThread(event);
}

const char*
SpeechRecognition::GetName(FSMState aId)
{
  static const char* names[] = {
    "STATE_IDLE",
    "STATE_STARTING",
    "STATE_ESTIMATING",
    "STATE_WAITING_FOR_SPEECH",
    "STATE_RECOGNIZING",
    "STATE_WAITING_FOR_RESULT",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
  static const char* names[] = {
    "EVENT_START",
    "EVENT_STOP",
    "EVENT_ABORT",
    "EVENT_AUDIO_DATA",
    "EVENT_AUDIO_ERROR",
    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
    "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
    "EVENT_RECOGNITIONSERVICE_ERROR"
  };

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

SpeechEvent::~SpeechEvent()
{
  delete mAudioSegment;
}

NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
  nsCOMPtr<nsIDOMLocalMediaStream> localStream = do_QueryInterface(aStream);
  mRecognition->StartRecording(static_cast<DOMLocalMediaStream*>(localStream.get()));
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(const nsAString& aError)
{
  SpeechRecognitionErrorCode errorCode;

  if (aError.Equals(NS_LITERAL_STRING("PERMISSION_DENIED"))) {
    errorCode = SpeechRecognitionErrorCode::Not_allowed;
  } else {
    errorCode = SpeechRecognitionErrorCode::Audio_capture;
  }

  mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                              aError);

  return NS_OK;
}

} // namespace dom
} // namespace mozilla