/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "SpeechRecognition.h"

#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"

#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Services.h"

#include "AudioSegment.h"
#include "endpointer.h"

#include "GeneratedEvents.h"
#include "nsIDOMSpeechRecognitionEvent.h"
#include "nsIObserverService.h"
#include "nsServiceManagerUtils.h"

#include <algorithm>

namespace mozilla {
namespace dom {

#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "google"

#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

static const uint32_t kSAMPLE_RATE = 16000;
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// number of samples corresponding to 300ms of audio to send to the endpointer
// while it's in environment estimation mode
// kSAMPLE_RATE samples = 1s, kESTIMATION_SAMPLES samples = 300ms
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;

#ifdef PR_LOGGING
PRLogModuleInfo*
GetSpeechRecognitionLog()
{
  static PRLogModuleInfo* sLog;
  if (!sLog) {
    sLog = PR_NewLogModule("SpeechRecognition");
  }

  return sLog;
}
#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
#else
#define SR_LOG(...)
#endif

NS_INTERFACE_MAP_BEGIN(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

struct SpeechRecognition::TestConfig SpeechRecognition::mTestConfig;

SpeechRecognition::SpeechRecognition(nsPIDOMWindow* aOwnerWindow)
  : DOMEventTargetHelper(aOwnerWindow)
  , mEndpointer(kSAMPLE_RATE)
  , mAudioSamplesPerChunk(mEndpointer.FrameSize())
  , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
{
  SR_LOG("created SpeechRecognition");

  mTestConfig.Init();
  if (mTestConfig.mEnableTests) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
    obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
  }

  mEndpointer.set_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 500000));
  mEndpointer.set_long_speech_input_complete_silence_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 1000000));
  mEndpointer.set_long_speech_length(
      Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
  Reset();
}

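// Helpers for FSM bookkeeping: StateBetween() tests whether the current state
// lies in an inclusive range (relying on the declaration order of FSMState),
// and SetState() records and logs every transition.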
bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
  return mCurrentState >= begin && mCurrentState <= end;
}

void
SpeechRecognition::SetState(FSMState state)
{
  mCurrentState = state;
  SR_LOG("Transitioned to state %s", GetName(mCurrentState));
  return;
}

JSObject*
SpeechRecognition::WrapObject(JSContext* aCx)
{
  return SpeechRecognitionBinding::Wrap(aCx, this);
}

already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
                               ErrorResult& aRv)
{
  nsCOMPtr<nsPIDOMWindow> win = do_QueryInterface(aGlobal.GetAsSupports());
  if (!win) {
    aRv.Throw(NS_ERROR_FAILURE);
    return nullptr;
  }

  MOZ_ASSERT(win->IsInnerWindow());
  nsRefPtr<SpeechRecognition> object = new SpeechRecognition(win);
  return object.forget();
}

nsISupports*
SpeechRecognition::GetParentObject() const
{
  return GetOwner();
}

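// Entry point for all FSM events: SpeechEvent::Run() calls this on the main
// thread. While an abort is pending, every event except EVENT_ABORT itself is
// ignored.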
void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
  SR_LOG("Processing %s, current state is %s",
         GetName(aEvent),
         GetName(mCurrentState));

  if (mAborted && aEvent->mType != EVENT_ABORT) {
    // ignore all events while aborting
    return;
  }

  Transition(aEvent);
}

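// Dispatch an event to the handler that matches the current FSM state.
// Unexpected event/state combinations are either ignored (DoNothing) or
// treated as fatal (MOZ_CRASH).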
void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
  switch (mCurrentState) {
    case STATE_IDLE:
      switch (aEvent->mType) {
        case EVENT_START:
          // TODO: may want to time out if we wait too long
          // for user to approve
          WaitForAudioData(aEvent);
          break;
        case EVENT_STOP:
        case EVENT_ABORT:
        case EVENT_AUDIO_DATA:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_STARTING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          StartedAudioCapture(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_STOP:
          Reset();
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_ESTIMATING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForEstimation(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType);
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          DetectSpeech(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_WAITING_FOR_SPEECH: Unhandled event %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_RECOGNIZING:
      switch (aEvent->mType) {
        case EVENT_AUDIO_DATA:
          WaitForSpeechEnd(aEvent);
          break;
        case EVENT_STOP:
          StopRecordingAndRecognize(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          DoNothing(aEvent);
          break;
        case EVENT_START:
          SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_WAITING_FOR_RESULT:
      switch (aEvent->mType) {
        case EVENT_STOP:
          DoNothing(aEvent);
          break;
        case EVENT_AUDIO_ERROR:
        case EVENT_RECOGNITIONSERVICE_ERROR:
          AbortError(aEvent);
          break;
        case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
          NotifyFinalResult(aEvent);
          break;
        case EVENT_AUDIO_DATA:
          DoNothing(aEvent);
          break;
        case EVENT_ABORT:
          AbortSilently(aEvent);
          break;
        case EVENT_START:
        case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
          SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s", GetName(aEvent));
          MOZ_CRASH();
        case EVENT_COUNT:
          MOZ_CRASH("Invalid event EVENT_COUNT");
      }
      break;
    case STATE_COUNT:
      MOZ_CRASH("Invalid state STATE_COUNT");
  }

  return;
}

/*
 * Handle a segment of recorded audio data.
 * Returns the number of samples that were processed.
 */
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
{
  AudioSegment::ChunkIterator iterator(*aSegment);
  uint32_t samples = 0;
  while (!iterator.IsEnded()) {
    float out;
    mEndpointer.ProcessAudio(*iterator, &out);
    samples += iterator->GetDuration();
    iterator.Next();
  }

  mRecognitionService->ProcessAudioSegment(aSegment);
  return samples;
}

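// Build the XPCOM contract ID of the speech recognition service to
// instantiate: the fake test service when faking is enabled, otherwise the
// service named by the default-service preference (falling back to
// DEFAULT_RECOGNITION_SERVICE).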
void
SpeechRecognition::GetRecognitionServiceCID(nsACString& aResultCID)
{
  if (mTestConfig.mFakeRecognitionService) {
    aResultCID =
      NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";

    return;
  }

  nsAdoptingCString prefValue =
    Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);

  nsAutoCString speechRecognitionService;
  if (!prefValue.get() || prefValue.IsEmpty()) {
    speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
  } else {
    speechRecognitionService = prefValue;
  }

  aResultCID =
    NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
    speechRecognitionService;

  return;
}

/****************************************************************************
 * FSM Transition functions
 *
 * If a transition function may cause a DOM event to be fired,
 * it may also be re-entered, since the event handler may cause the
 * event loop to spin and new SpeechEvents to be processed.
 *
 * Rules:
 * 1) These methods should call SetState as soon as possible.
 * 2) If these methods dispatch DOM events, or call methods that dispatch
 *    DOM events, that should be done as late as possible.
 * 3) If anything must happen after dispatching a DOM event, make sure
 *    the state is still what the method expected it to be.
 ****************************************************************************/

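// Return the FSM to STATE_IDLE and drop all per-session state: the service
// reference, the estimation and buffering counters, the pending no-speech
// timer and the aborted flag.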
void
SpeechRecognition::Reset()
{
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}

void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}

void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}

void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);

  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}

void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  mRecognitionService->SoundEnd();

  StopRecording();
}

void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}

void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}

void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
  SetState(STATE_RECOGNIZING);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.speech_input_complete()) {
    DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));

    if (mCurrentState == STATE_RECOGNIZING) {
      // FIXME: StopRecordingAndRecognize should only be called for single-shot
      // services; for continuous services we should just inform the service.
      StopRecordingAndRecognize(aEvent);
    }
  }
}

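// Fire the "result" DOM event carrying the SpeechRecognitionResultList
// produced by the recognition service. ResetAndEnd() runs first, so the FSM
// is already back in STATE_IDLE and "end" has been dispatched by the time
// "result" fires.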
void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
  ResetAndEnd();

  nsCOMPtr<nsIDOMEvent> domEvent;
  NS_NewDOMSpeechRecognitionEvent(getter_AddRefs(domEvent), nullptr, nullptr, nullptr);

  nsCOMPtr<nsIDOMSpeechRecognitionEvent> srEvent = do_QueryInterface(domEvent);
  nsRefPtr<SpeechRecognitionResultList> rlist = aEvent->mRecognitionResultList;
  nsCOMPtr<nsISupports> ilist = do_QueryInterface(rlist);
  srEvent->InitSpeechRecognitionEvent(NS_LITERAL_STRING("result"),
                                      true, false, 0, ilist,
                                      NS_LITERAL_STRING("NOT_IMPLEMENTED"),
                                      nullptr);
  domEvent->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(domEvent, &defaultActionEnabled);
}

void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}

void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  bool stopRecording = StateBetween(STATE_ESTIMATING, STATE_RECOGNIZING);

  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (stopRecording) {
    StopRecording();
  }

  ResetAndEnd();
}

void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}

void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
  aEvent->mError->SetTrusted(true);

  bool defaultActionEnabled;
  this->DispatchEvent(aEvent->mError, &defaultActionEnabled);

  return;
}

/**************************************
 * Event triggers and other functions *
 **************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  NS_ENSURE_STATE(mDOMStream->GetStream());
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}

NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // we only really need to remove the listener explicitly when testing,
  // as our JS code still holds a reference to mDOMStream and only assigning
  // it to nullptr isn't guaranteed to free the stream and the listener.
  mDOMStream->GetStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}

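// nsIObserver entry point. Handles the one-shot no-speech timeout (reported
// as an EVENT_AUDIO_ERROR if speech was never detected) and the observer
// topics used by tests to inject fake FSM events.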
NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (mTestConfig.mFakeFSMEvents &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}

void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_START")) {
    ErrorResult err;
    Start(err);
  } else if (aEventName.EqualsLiteral("EVENT_STOP")) {
    Stop();
  } else if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
                  NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_DATA")) {
    StartRecording(static_cast<DOMMediaStream*>(aSubject));
  } else {
    NS_ASSERTION(mTestConfig.mFakeRecognitionService,
                 "Got request for fake recognition service event, but "
                 TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");

    // let the fake recognition service handle the request
  }

  return;
}

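// WebIDL attribute plumbing. None of these attributes (grammars, lang,
// continuous, interimResults, maxAlternatives, serviceURI) are implemented
// yet; every accessor throws NS_ERROR_NOT_IMPLEMENTED.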
already_AddRefed<SpeechGrammarList>
SpeechRecognition::GetGrammars(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return nullptr;
}

void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

void
SpeechRecognition::GetLang(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

void
SpeechRecognition::SetLang(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

bool
SpeechRecognition::GetInterimResults(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return false;
}

void
SpeechRecognition::SetInterimResults(bool aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

uint32_t
SpeechRecognition::GetMaxAlternatives(ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return 0;
}

void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
  aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
  return;
}

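// start(): resolve the recognition service, hand it a weak reference to this
// object, request microphone access through MediaManager::GetUserMedia()
// (skipped when tests inject fake FSM events), and queue EVENT_START.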
void
SpeechRecognition::Start(ErrorResult& aRv)
{
  if (mCurrentState != STATE_IDLE) {
    aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
    return;
  }

  nsAutoCString speechRecognitionServiceCID;
  GetRecognitionServiceCID(speechRecognitionServiceCID);

  nsresult rv;
  mRecognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
  NS_ENSURE_SUCCESS_VOID(rv);

  rv = mRecognitionService->Initialize(this->asWeakPtr());
  NS_ENSURE_SUCCESS_VOID(rv);

  MediaStreamConstraints constraints;
  constraints.mAudio.SetAsBoolean() = true;

  if (!mTestConfig.mFakeFSMEvents) {
    MediaManager* manager = MediaManager::Get();
    manager->GetUserMedia(false,
                          GetOwner(),
                          constraints,
                          new GetUserMediaSuccessCallback(this),
                          new GetUserMediaErrorCallback(this));
  }

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Stop()
{
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
  NS_DispatchToMainThread(event);
}

void
SpeechRecognition::Abort()
{
  if (mAborted) {
    return;
  }

  mAborted = true;
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
  NS_DispatchToMainThread(event);
}

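// Build a SpeechRecognitionError event with the given code and message and
// queue it through the FSM as an audio or recognition-service error.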
void
SpeechRecognition::DispatchError(EventType aErrorType,
                                 SpeechRecognitionErrorCode aErrorCode,
                                 const nsAString& aMessage)
{
  MOZ_ASSERT(NS_IsMainThread());
  MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
             aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");

  nsRefPtr<SpeechRecognitionError> srError =
    new SpeechRecognitionError(nullptr, nullptr, nullptr);

  ErrorResult err;
  srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
                                      aErrorCode, aMessage, err);

  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
  event->mError = srError;
  NS_DispatchToMainThread(event);
}

/*
 * Buffer audio samples into mAudioSamplesBuffer until it holds a full chunk
 * of mAudioSamplesPerChunk samples. Updates mBufferedSamples and returns the
 * number of samples that were buffered.
 */
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
                                     uint32_t aSampleCount)
{
  MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
  MOZ_ASSERT(mAudioSamplesBuffer.get());

  int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
  size_t samplesToCopy = std::min(aSampleCount,
                                  mAudioSamplesPerChunk - mBufferedSamples);

  memcpy(samplesBuffer + mBufferedSamples, aSamples,
         samplesToCopy * sizeof(int16_t));

  mBufferedSamples += samplesToCopy;
  return samplesToCopy;
}

/*
 * Split a buffer of samples of a given size into chunks of
 * mAudioSamplesPerChunk samples. The chunks are appended to the array
 * received as argument.
 * Returns the offset of the end of the last chunk that was created.
 */
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                                      uint32_t aSampleCount,
                                      nsTArray<nsRefPtr<SharedBuffer>>& aResult)
{
  uint32_t chunkStart = 0;

  while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
    nsRefPtr<SharedBuffer> chunk =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
           mAudioSamplesPerChunk * sizeof(int16_t));

    aResult.AppendElement(chunk);
    chunkStart += mAudioSamplesPerChunk;
  }

  return chunkStart;
}

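// Wrap the fixed-size chunks into a single-channel AudioSegment that can be
// handed to the endpointer and the recognition service; each SharedBuffer
// becomes one block of mAudioSamplesPerChunk frames.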
AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
{
  AudioSegment* segment = new AudioSegment();
  for (uint32_t i = 0; i < aChunks.Length(); ++i) {
    nsRefPtr<SharedBuffer> buffer = aChunks[i];
    const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());

    nsAutoTArray<const int16_t*, 1> channels;
    channels.AppendElement(chunkData);
    segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk);
  }

  return segment;
}

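// Called off the main thread with raw 16-bit samples from the stream
// listener. Buffers and re-chunks them into mAudioSamplesPerChunk-sized
// pieces, then dispatches an EVENT_AUDIO_DATA carrying the resulting
// AudioSegment to the main thread.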
void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.

  // ensure aSamples is deleted
  nsRefPtr<SharedBuffer> refSamples = aSamples;

  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
  nsAutoTArray<nsRefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      chunksToSend.AppendElement(mAudioSamplesBuffer);
      mAudioSamplesBuffer = nullptr;
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  NS_DispatchToMainThread(event);

  return;
}

const char*
SpeechRecognition::GetName(FSMState aId)
{
  static const char* names[] = {
    "STATE_IDLE",
    "STATE_STARTING",
    "STATE_ESTIMATING",
    "STATE_WAITING_FOR_SPEECH",
    "STATE_RECOGNIZING",
    "STATE_WAITING_FOR_RESULT",
  };

  MOZ_ASSERT(aId < STATE_COUNT);
  MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
  return names[aId];
}

const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
  static const char* names[] = {
    "EVENT_START",
    "EVENT_STOP",
    "EVENT_ABORT",
    "EVENT_AUDIO_DATA",
    "EVENT_AUDIO_ERROR",
    "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
    "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
    "EVENT_RECOGNITIONSERVICE_ERROR"
  };

  MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
  MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
  return names[aEvent->mType];
}

SpeechEvent::~SpeechEvent()
{
  delete mAudioSegment;
}

NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
  nsCOMPtr<nsIDOMLocalMediaStream> localStream = do_QueryInterface(aStream);
  mRecognition->StartRecording(static_cast<DOMLocalMediaStream*>(localStream.get()));
  return NS_OK;
}

NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)

NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(const nsAString& aError)
{
  SpeechRecognitionErrorCode errorCode;

  if (aError.Equals(NS_LITERAL_STRING("PERMISSION_DENIED"))) {
    errorCode = SpeechRecognitionErrorCode::Not_allowed;
  } else {
    errorCode = SpeechRecognitionErrorCode::Audio_capture;
  }

  mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
                              aError);

  return NS_OK;
}

} // namespace dom
} // namespace mozilla