content/media/webspeech/recognition/SpeechRecognition.cpp

Thu, 15 Jan 2015 15:55:04 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:55:04 +0100
branch
TOR_BUG_9701
changeset 9
a63d609f5ebe
permissions
-rw-r--r--

Back out 97036ab72558 which inappropriately compared turds to third parties.

michael@0 1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* vim:set ts=2 sw=2 sts=2 et cindent: */
michael@0 3 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 4 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 6
michael@0 7 #include "SpeechRecognition.h"
michael@0 8
michael@0 9 #include "nsCOMPtr.h"
michael@0 10 #include "nsCycleCollectionParticipant.h"
michael@0 11
michael@0 12 #include "mozilla/dom/SpeechRecognitionBinding.h"
michael@0 13 #include "mozilla/dom/MediaStreamTrackBinding.h"
michael@0 14 #include "mozilla/MediaManager.h"
michael@0 15 #include "mozilla/Services.h"
michael@0 16
michael@0 17 #include "AudioSegment.h"
michael@0 18 #include "endpointer.h"
michael@0 19
michael@0 20 #include "GeneratedEvents.h"
michael@0 21 #include "nsIDOMSpeechRecognitionEvent.h"
michael@0 22 #include "nsIObserverService.h"
michael@0 23 #include "nsServiceManagerUtils.h"
michael@0 24
michael@0 25 #include <algorithm>
michael@0 26
michael@0 27 namespace mozilla {
michael@0 28 namespace dom {
michael@0 29
// Pref naming the recognition service implementation to instantiate, and the
// fallback used when the pref is unset/empty (see GetRecognitionServiceCID()).
#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "google"

// Endpointer tuning prefs, read once in the constructor. Units appear to be
// microseconds judging by the defaults there -- TODO confirm against endpointer.h.
#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"

// Sample rate the Endpointer is constructed with (see the constructor).
static const uint32_t kSAMPLE_RATE = 16000;
// Delay before the no-speech timeout fires (handled in Observe()).
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;

// number of frames corresponding to 300ms of audio to send to endpointer while
// it's in environment estimation mode
// kSAMPLE_RATE frames = 1s, kESTIMATION_FRAMES frames = 300ms
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;
michael@0 44
#ifdef PR_LOGGING
// Lazily creates (once) and returns the NSPR log module backing SR_LOG.
PRLogModuleInfo*
GetSpeechRecognitionLog()
{
  static PRLogModuleInfo* sLog;
  if (!sLog) {
    sLog = PR_NewLogModule("SpeechRecognition");
  }

  return sLog;
}
#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
#else
// Logging compiled out: SR_LOG expands to nothing.
#define SR_LOG(...)
#endif
michael@0 60
// QueryInterface map: adds nsIObserver (timer callback + test-event topics)
// on top of the interfaces inherited from DOMEventTargetHelper.
NS_INTERFACE_MAP_BEGIN(SpeechRecognition)
  NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)

NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)

// Out-of-class definition of the class-static test configuration; it is
// populated via mTestConfig.Init() in the constructor.
struct SpeechRecognition::TestConfig SpeechRecognition::mTestConfig;
michael@0 69
michael@0 70 SpeechRecognition::SpeechRecognition(nsPIDOMWindow* aOwnerWindow)
michael@0 71 : DOMEventTargetHelper(aOwnerWindow)
michael@0 72 , mEndpointer(kSAMPLE_RATE)
michael@0 73 , mAudioSamplesPerChunk(mEndpointer.FrameSize())
michael@0 74 , mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
michael@0 75 {
michael@0 76 SR_LOG("created SpeechRecognition");
michael@0 77
michael@0 78 mTestConfig.Init();
michael@0 79 if (mTestConfig.mEnableTests) {
michael@0 80 nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
michael@0 81 obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
michael@0 82 obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
michael@0 83 }
michael@0 84
michael@0 85 mEndpointer.set_speech_input_complete_silence_length(
michael@0 86 Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 500000));
michael@0 87 mEndpointer.set_long_speech_input_complete_silence_length(
michael@0 88 Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 1000000));
michael@0 89 mEndpointer.set_long_speech_length(
michael@0 90 Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 3 * 1000000));
michael@0 91 Reset();
michael@0 92 }
michael@0 93
michael@0 94 bool
michael@0 95 SpeechRecognition::StateBetween(FSMState begin, FSMState end)
michael@0 96 {
michael@0 97 return mCurrentState >= begin && mCurrentState <= end;
michael@0 98 }
michael@0 99
michael@0 100 void
michael@0 101 SpeechRecognition::SetState(FSMState state)
michael@0 102 {
michael@0 103 mCurrentState = state;
michael@0 104 SR_LOG("Transitioned to state %s", GetName(mCurrentState));
michael@0 105 return;
michael@0 106 }
michael@0 107
michael@0 108 JSObject*
michael@0 109 SpeechRecognition::WrapObject(JSContext* aCx)
michael@0 110 {
michael@0 111 return SpeechRecognitionBinding::Wrap(aCx, this);
michael@0 112 }
michael@0 113
michael@0 114 already_AddRefed<SpeechRecognition>
michael@0 115 SpeechRecognition::Constructor(const GlobalObject& aGlobal,
michael@0 116 ErrorResult& aRv)
michael@0 117 {
michael@0 118 nsCOMPtr<nsPIDOMWindow> win = do_QueryInterface(aGlobal.GetAsSupports());
michael@0 119 if (!win) {
michael@0 120 aRv.Throw(NS_ERROR_FAILURE);
michael@0 121 }
michael@0 122
michael@0 123 MOZ_ASSERT(win->IsInnerWindow());
michael@0 124 nsRefPtr<SpeechRecognition> object = new SpeechRecognition(win);
michael@0 125 return object.forget();
michael@0 126 }
michael@0 127
michael@0 128 nsISupports*
michael@0 129 SpeechRecognition::GetParentObject() const
michael@0 130 {
michael@0 131 return GetOwner();
michael@0 132 }
michael@0 133
michael@0 134 void
michael@0 135 SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
michael@0 136 {
michael@0 137 SR_LOG("Processing %s, current state is %s",
michael@0 138 GetName(aEvent),
michael@0 139 GetName(mCurrentState));
michael@0 140
michael@0 141 if (mAborted && aEvent->mType != EVENT_ABORT) {
michael@0 142 // ignore all events while aborting
michael@0 143 return;
michael@0 144 }
michael@0 145
michael@0 146 Transition(aEvent);
michael@0 147 }
michael@0 148
michael@0 149 void
michael@0 150 SpeechRecognition::Transition(SpeechEvent* aEvent)
michael@0 151 {
michael@0 152 switch (mCurrentState) {
michael@0 153 case STATE_IDLE:
michael@0 154 switch (aEvent->mType) {
michael@0 155 case EVENT_START:
michael@0 156 // TODO: may want to time out if we wait too long
michael@0 157 // for user to approve
michael@0 158 WaitForAudioData(aEvent);
michael@0 159 break;
michael@0 160 case EVENT_STOP:
michael@0 161 case EVENT_ABORT:
michael@0 162 case EVENT_AUDIO_DATA:
michael@0 163 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
michael@0 164 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
michael@0 165 DoNothing(aEvent);
michael@0 166 break;
michael@0 167 case EVENT_AUDIO_ERROR:
michael@0 168 case EVENT_RECOGNITIONSERVICE_ERROR:
michael@0 169 AbortError(aEvent);
michael@0 170 break;
michael@0 171 case EVENT_COUNT:
michael@0 172 MOZ_CRASH("Invalid event EVENT_COUNT");
michael@0 173 }
michael@0 174 break;
michael@0 175 case STATE_STARTING:
michael@0 176 switch (aEvent->mType) {
michael@0 177 case EVENT_AUDIO_DATA:
michael@0 178 StartedAudioCapture(aEvent);
michael@0 179 break;
michael@0 180 case EVENT_AUDIO_ERROR:
michael@0 181 case EVENT_RECOGNITIONSERVICE_ERROR:
michael@0 182 AbortError(aEvent);
michael@0 183 break;
michael@0 184 case EVENT_ABORT:
michael@0 185 AbortSilently(aEvent);
michael@0 186 break;
michael@0 187 case EVENT_STOP:
michael@0 188 Reset();
michael@0 189 break;
michael@0 190 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
michael@0 191 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
michael@0 192 DoNothing(aEvent);
michael@0 193 break;
michael@0 194 case EVENT_START:
michael@0 195 SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
michael@0 196 MOZ_CRASH();
michael@0 197 case EVENT_COUNT:
michael@0 198 MOZ_CRASH("Invalid event EVENT_COUNT");
michael@0 199 }
michael@0 200 break;
michael@0 201 case STATE_ESTIMATING:
michael@0 202 switch (aEvent->mType) {
michael@0 203 case EVENT_AUDIO_DATA:
michael@0 204 WaitForEstimation(aEvent);
michael@0 205 break;
michael@0 206 case EVENT_STOP:
michael@0 207 StopRecordingAndRecognize(aEvent);
michael@0 208 break;
michael@0 209 case EVENT_ABORT:
michael@0 210 AbortSilently(aEvent);
michael@0 211 break;
michael@0 212 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
michael@0 213 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
michael@0 214 case EVENT_RECOGNITIONSERVICE_ERROR:
michael@0 215 DoNothing(aEvent);
michael@0 216 break;
michael@0 217 case EVENT_AUDIO_ERROR:
michael@0 218 AbortError(aEvent);
michael@0 219 break;
michael@0 220 case EVENT_START:
michael@0 221 SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType);
michael@0 222 MOZ_CRASH();
michael@0 223 case EVENT_COUNT:
michael@0 224 MOZ_CRASH("Invalid event EVENT_COUNT");
michael@0 225 }
michael@0 226 break;
michael@0 227 case STATE_WAITING_FOR_SPEECH:
michael@0 228 switch (aEvent->mType) {
michael@0 229 case EVENT_AUDIO_DATA:
michael@0 230 DetectSpeech(aEvent);
michael@0 231 break;
michael@0 232 case EVENT_STOP:
michael@0 233 StopRecordingAndRecognize(aEvent);
michael@0 234 break;
michael@0 235 case EVENT_ABORT:
michael@0 236 AbortSilently(aEvent);
michael@0 237 break;
michael@0 238 case EVENT_AUDIO_ERROR:
michael@0 239 AbortError(aEvent);
michael@0 240 break;
michael@0 241 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
michael@0 242 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
michael@0 243 case EVENT_RECOGNITIONSERVICE_ERROR:
michael@0 244 DoNothing(aEvent);
michael@0 245 break;
michael@0 246 case EVENT_START:
michael@0 247 SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
michael@0 248 MOZ_CRASH();
michael@0 249 case EVENT_COUNT:
michael@0 250 MOZ_CRASH("Invalid event EVENT_COUNT");
michael@0 251 }
michael@0 252 break;
michael@0 253 case STATE_RECOGNIZING:
michael@0 254 switch (aEvent->mType) {
michael@0 255 case EVENT_AUDIO_DATA:
michael@0 256 WaitForSpeechEnd(aEvent);
michael@0 257 break;
michael@0 258 case EVENT_STOP:
michael@0 259 StopRecordingAndRecognize(aEvent);
michael@0 260 break;
michael@0 261 case EVENT_AUDIO_ERROR:
michael@0 262 case EVENT_RECOGNITIONSERVICE_ERROR:
michael@0 263 AbortError(aEvent);
michael@0 264 break;
michael@0 265 case EVENT_ABORT:
michael@0 266 AbortSilently(aEvent);
michael@0 267 break;
michael@0 268 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
michael@0 269 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
michael@0 270 DoNothing(aEvent);
michael@0 271 break;
michael@0 272 case EVENT_START:
michael@0 273 SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent));
michael@0 274 MOZ_CRASH();
michael@0 275 case EVENT_COUNT:
michael@0 276 MOZ_CRASH("Invalid event EVENT_COUNT");
michael@0 277 }
michael@0 278 break;
michael@0 279 case STATE_WAITING_FOR_RESULT:
michael@0 280 switch (aEvent->mType) {
michael@0 281 case EVENT_STOP:
michael@0 282 DoNothing(aEvent);
michael@0 283 break;
michael@0 284 case EVENT_AUDIO_ERROR:
michael@0 285 case EVENT_RECOGNITIONSERVICE_ERROR:
michael@0 286 AbortError(aEvent);
michael@0 287 break;
michael@0 288 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
michael@0 289 NotifyFinalResult(aEvent);
michael@0 290 break;
michael@0 291 case EVENT_AUDIO_DATA:
michael@0 292 DoNothing(aEvent);
michael@0 293 break;
michael@0 294 case EVENT_ABORT:
michael@0 295 AbortSilently(aEvent);
michael@0 296 break;
michael@0 297 case EVENT_START:
michael@0 298 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
michael@0 299 SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s", GetName(aEvent));
michael@0 300 MOZ_CRASH();
michael@0 301 case EVENT_COUNT:
michael@0 302 MOZ_CRASH("Invalid event EVENT_COUNT");
michael@0 303 }
michael@0 304 break;
michael@0 305 case STATE_COUNT:
michael@0 306 MOZ_CRASH("Invalid state STATE_COUNT");
michael@0 307 }
michael@0 308
michael@0 309 return;
michael@0 310 }
michael@0 311
michael@0 312 /*
michael@0 313 * Handle a segment of recorded audio data.
michael@0 314 * Returns the number of samples that were processed.
michael@0 315 */
michael@0 316 uint32_t
michael@0 317 SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
michael@0 318 {
michael@0 319 AudioSegment::ChunkIterator iterator(*aSegment);
michael@0 320 uint32_t samples = 0;
michael@0 321 while (!iterator.IsEnded()) {
michael@0 322 float out;
michael@0 323 mEndpointer.ProcessAudio(*iterator, &out);
michael@0 324 samples += iterator->GetDuration();
michael@0 325 iterator.Next();
michael@0 326 }
michael@0 327
michael@0 328 mRecognitionService->ProcessAudioSegment(aSegment);
michael@0 329 return samples;
michael@0 330 }
michael@0 331
michael@0 332 void
michael@0 333 SpeechRecognition::GetRecognitionServiceCID(nsACString& aResultCID)
michael@0 334 {
michael@0 335 if (mTestConfig.mFakeRecognitionService) {
michael@0 336 aResultCID =
michael@0 337 NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";
michael@0 338
michael@0 339 return;
michael@0 340 }
michael@0 341
michael@0 342 nsAdoptingCString prefValue =
michael@0 343 Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);
michael@0 344
michael@0 345 nsAutoCString speechRecognitionService;
michael@0 346 if (!prefValue.get() || prefValue.IsEmpty()) {
michael@0 347 speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
michael@0 348 } else {
michael@0 349 speechRecognitionService = prefValue;
michael@0 350 }
michael@0 351
michael@0 352 aResultCID =
michael@0 353 NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
michael@0 354 speechRecognitionService;
michael@0 355
michael@0 356 return;
michael@0 357 }
michael@0 358
michael@0 359 /****************************************************************************
michael@0 360 * FSM Transition functions
michael@0 361 *
michael@0 362 * If a transition function may cause a DOM event to be fired,
michael@0 363 * it may also be re-entered, since the event handler may cause the
michael@0 364 * event loop to spin and new SpeechEvents to be processed.
michael@0 365 *
michael@0 366 * Rules:
michael@0 367 * 1) These methods should call SetState as soon as possible.
michael@0 368 * 2) If these methods dispatch DOM events, or call methods that dispatch
michael@0 369 * DOM events, that should be done as late as possible.
michael@0 370 * 3) If anything must happen after dispatching a DOM event, make sure
michael@0 371 * the state is still what the method expected it to be.
michael@0 372 ****************************************************************************/
michael@0 373
// Returns the recognizer to its idle configuration: back to STATE_IDLE,
// service released, sample counters cleared, no-speech timer cancelled,
// pending-abort flag cleared. Fires no DOM event (see ResetAndEnd()).
void
SpeechRecognition::Reset()
{
  // Per the FSM rules above, change state first.
  SetState(STATE_IDLE);
  mRecognitionService = nullptr;
  mEstimationSamples = 0;
  mBufferedSamples = 0;
  mSpeechDetectionTimer->Cancel();
  mAborted = false;
}
michael@0 384
// Reset() plus the "end" DOM event. Reset happens first so that the "end"
// handler -- which may spin the event loop (FSM rules above) -- observes
// the recognizer already in STATE_IDLE.
void
SpeechRecognition::ResetAndEnd()
{
  Reset();
  DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}
michael@0 391
// EVENT_START while idle: capture has been requested but no samples have
// arrived yet; just advance the FSM.
void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
  SetState(STATE_STARTING);
}
michael@0 397
// First audio data arrived while STATE_STARTING: switch the endpointer to
// environment (noise) estimation mode and feed it the first segment.
void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEndpointer.SetEnvironmentEstimationMode();
  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);

  // "audiostart" may run script that re-enters the FSM; only fire "start"
  // if we are still estimating afterwards (FSM rule 3 above).
  DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
  if (mCurrentState == STATE_ESTIMATING) {
    DispatchTrustedEvent(NS_LITERAL_STRING("start"));
  }
}
michael@0 411
// Recording is finished; wait for the service to produce the final result.
void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_RESULT);

  MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
  // Tell the service no more audio is coming before tearing down capture.
  mRecognitionService->SoundEnd();

  StopRecording();
}
michael@0 422
// Keep feeding audio to the endpointer until enough samples
// (kESTIMATION_SAMPLES, ~300ms) have been used for environment estimation,
// then switch it to user-input mode and start listening for speech.
void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
  SetState(STATE_ESTIMATING);

  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEstimationSamples > kESTIMATION_SAMPLES) {
    mEndpointer.SetUserInputMode();
    SetState(STATE_WAITING_FOR_SPEECH);
  }
}
michael@0 434
// Feed audio until the endpointer reports the user started speaking; then
// cancel the no-speech timeout, move to RECOGNIZING and fire "speechstart".
void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
  SetState(STATE_WAITING_FOR_SPEECH);

  ProcessAudioSegment(aEvent->mAudioSegment);
  if (mEndpointer.DidStartReceivingSpeech()) {
    mSpeechDetectionTimer->Cancel();
    SetState(STATE_RECOGNIZING);
    DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
  }
}
michael@0 447
michael@0 448 void
michael@0 449 SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
michael@0 450 {
michael@0 451 SetState(STATE_RECOGNIZING);
michael@0 452
michael@0 453 ProcessAudioSegment(aEvent->mAudioSegment);
michael@0 454 if (mEndpointer.speech_input_complete()) {
michael@0 455 DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));
michael@0 456
michael@0 457 if (mCurrentState == STATE_RECOGNIZING) {
michael@0 458 // FIXME: StopRecordingAndRecognize should only be called for single
michael@0 459 // shot services for continuous we should just inform the service
michael@0 460 StopRecordingAndRecognize(aEvent);
michael@0 461 }
michael@0 462 }
michael@0 463 }
michael@0 464
michael@0 465 void
michael@0 466 SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
michael@0 467 {
michael@0 468 ResetAndEnd();
michael@0 469
michael@0 470 nsCOMPtr<nsIDOMEvent> domEvent;
michael@0 471 NS_NewDOMSpeechRecognitionEvent(getter_AddRefs(domEvent), nullptr, nullptr, nullptr);
michael@0 472
michael@0 473 nsCOMPtr<nsIDOMSpeechRecognitionEvent> srEvent = do_QueryInterface(domEvent);
michael@0 474 nsRefPtr<SpeechRecognitionResultList> rlist = aEvent->mRecognitionResultList;
michael@0 475 nsCOMPtr<nsISupports> ilist = do_QueryInterface(rlist);
michael@0 476 srEvent->InitSpeechRecognitionEvent(NS_LITERAL_STRING("result"),
michael@0 477 true, false, 0, ilist,
michael@0 478 NS_LITERAL_STRING("NOT_IMPLEMENTED"),
michael@0 479 nullptr);
michael@0 480 domEvent->SetTrusted(true);
michael@0 481
michael@0 482 bool defaultActionEnabled;
michael@0 483 this->DispatchEvent(domEvent, &defaultActionEnabled);
michael@0 484 }
michael@0 485
michael@0 486 void
michael@0 487 SpeechRecognition::DoNothing(SpeechEvent* aEvent)
michael@0 488 {
michael@0 489 }
michael@0 490
// Abort without dispatching an error event (only "end" via ResetAndEnd()).
// See AbortError() for the error-reporting variant.
void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
  // Capture is only active between ESTIMATING and RECOGNIZING; decide now,
  // before ResetAndEnd() clobbers mCurrentState.
  bool stopRecording = StateBetween(STATE_ESTIMATING, STATE_RECOGNIZING);

  if (mRecognitionService) {
    mRecognitionService->Abort();
  }

  if (stopRecording) {
    StopRecording();
  }

  ResetAndEnd();
}
michael@0 506
// Abort (which fires "end"), then dispatch the error event carried by
// aEvent (prepared earlier in DispatchError()).
void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
  AbortSilently(aEvent);
  NotifyError(aEvent);
}
michael@0 513
michael@0 514 void
michael@0 515 SpeechRecognition::NotifyError(SpeechEvent* aEvent)
michael@0 516 {
michael@0 517 aEvent->mError->SetTrusted(true);
michael@0 518
michael@0 519 bool defaultActionEnabled;
michael@0 520 this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
michael@0 521
michael@0 522 return;
michael@0 523 }
michael@0 524
michael@0 525 /**************************************
michael@0 526 * Event triggers and other functions *
michael@0 527 **************************************/
// Attaches a listener to the media stream, opens the endpointer session and
// arms the one-shot no-speech timeout (delivered via Observe()).
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
  // hold a reference so that the underlying stream
  // doesn't get Destroy()'ed
  mDOMStream = aDOMStream;

  NS_ENSURE_STATE(mDOMStream->GetStream());
  mSpeechListener = new SpeechStreamListener(this);
  mDOMStream->GetStream()->AddListener(mSpeechListener);

  mEndpointer.StartSession();

  return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
                                     nsITimer::TYPE_ONE_SHOT);
}
michael@0 544
// Detaches the stream listener, releases the stream, closes the endpointer
// session and fires "audioend".
NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
  // we only really need to remove the listener explicitly when testing,
  // as our JS code still holds a reference to mDOMStream and only assigning
  // it to nullptr isn't guaranteed to free the stream and the listener.
  mDOMStream->GetStream()->RemoveListener(mSpeechListener);
  mSpeechListener = nullptr;
  mDOMStream = nullptr;

  mEndpointer.EndSession();
  DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));

  return NS_OK;
}
michael@0 560
// nsIObserver entry point. Handles:
//  - the one-shot no-speech timer (only relevant before speech was detected,
//    i.e. up to STATE_WAITING_FOR_SPEECH);
//  - the test-end topic, which unregisters the test observers;
//  - fake FSM test events, forwarded to ProcessTestEventRequest().
NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
                           const char16_t* aData)
{
  MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");

  if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
      StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {

    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::No_speech,
                  NS_LITERAL_STRING("No speech detected (timeout)"));
  } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
    nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
    obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
  } else if (mTestConfig.mFakeFSMEvents &&
             !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
    ProcessTestEventRequest(aSubject, nsDependentString(aData));
  }

  return NS_OK;
}
michael@0 584
// Translates a fake-FSM test request (named after the EventType it should
// simulate) into the corresponding public API call or dispatched event.
// Unknown names are assumed to target the fake recognition service.
void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
  if (aEventName.EqualsLiteral("EVENT_START")) {
    ErrorResult err;
    Start(err);
  } else if (aEventName.EqualsLiteral("EVENT_STOP")) {
    Stop();
  } else if (aEventName.EqualsLiteral("EVENT_ABORT")) {
    Abort();
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
    DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
                  SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
                  NS_LITERAL_STRING("AUDIO_ERROR test event"));
  } else if (aEventName.EqualsLiteral("EVENT_AUDIO_DATA")) {
    // aSubject carries the stream to record from in this case.
    StartRecording(static_cast<DOMMediaStream*>(aSubject));
  } else {
    NS_ASSERTION(mTestConfig.mFakeRecognitionService,
                 "Got request for fake recognition service event, but "
                 TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");

    // let the fake recognition service handle the request
  }

  return;
}
michael@0 611
michael@0 612 already_AddRefed<SpeechGrammarList>
michael@0 613 SpeechRecognition::GetGrammars(ErrorResult& aRv) const
michael@0 614 {
michael@0 615 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 616 return nullptr;
michael@0 617 }
michael@0 618
michael@0 619 void
michael@0 620 SpeechRecognition::SetGrammars(SpeechGrammarList& aArg, ErrorResult& aRv)
michael@0 621 {
michael@0 622 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 623 return;
michael@0 624 }
michael@0 625
michael@0 626 void
michael@0 627 SpeechRecognition::GetLang(nsString& aRetVal, ErrorResult& aRv) const
michael@0 628 {
michael@0 629 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 630 return;
michael@0 631 }
michael@0 632
michael@0 633 void
michael@0 634 SpeechRecognition::SetLang(const nsAString& aArg, ErrorResult& aRv)
michael@0 635 {
michael@0 636 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 637 return;
michael@0 638 }
michael@0 639
michael@0 640 bool
michael@0 641 SpeechRecognition::GetContinuous(ErrorResult& aRv) const
michael@0 642 {
michael@0 643 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 644 return false;
michael@0 645 }
michael@0 646
michael@0 647 void
michael@0 648 SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
michael@0 649 {
michael@0 650 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 651 return;
michael@0 652 }
michael@0 653
michael@0 654 bool
michael@0 655 SpeechRecognition::GetInterimResults(ErrorResult& aRv) const
michael@0 656 {
michael@0 657 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 658 return false;
michael@0 659 }
michael@0 660
michael@0 661 void
michael@0 662 SpeechRecognition::SetInterimResults(bool aArg, ErrorResult& aRv)
michael@0 663 {
michael@0 664 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 665 return;
michael@0 666 }
michael@0 667
michael@0 668 uint32_t
michael@0 669 SpeechRecognition::GetMaxAlternatives(ErrorResult& aRv) const
michael@0 670 {
michael@0 671 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 672 return 0;
michael@0 673 }
michael@0 674
michael@0 675 void
michael@0 676 SpeechRecognition::SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv)
michael@0 677 {
michael@0 678 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 679 return;
michael@0 680 }
michael@0 681
michael@0 682 void
michael@0 683 SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
michael@0 684 {
michael@0 685 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 686 return;
michael@0 687 }
michael@0 688
michael@0 689 void
michael@0 690 SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
michael@0 691 {
michael@0 692 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
michael@0 693 return;
michael@0 694 }
michael@0 695
michael@0 696 void
michael@0 697 SpeechRecognition::Start(ErrorResult& aRv)
michael@0 698 {
michael@0 699 if (mCurrentState != STATE_IDLE) {
michael@0 700 aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
michael@0 701 return;
michael@0 702 }
michael@0 703
michael@0 704 nsAutoCString speechRecognitionServiceCID;
michael@0 705 GetRecognitionServiceCID(speechRecognitionServiceCID);
michael@0 706
michael@0 707 nsresult rv;
michael@0 708 mRecognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
michael@0 709 NS_ENSURE_SUCCESS_VOID(rv);
michael@0 710
michael@0 711 rv = mRecognitionService->Initialize(this->asWeakPtr());
michael@0 712 NS_ENSURE_SUCCESS_VOID(rv);
michael@0 713
michael@0 714 MediaStreamConstraints constraints;
michael@0 715 constraints.mAudio.SetAsBoolean() = true;
michael@0 716
michael@0 717 if (!mTestConfig.mFakeFSMEvents) {
michael@0 718 MediaManager* manager = MediaManager::Get();
michael@0 719 manager->GetUserMedia(false,
michael@0 720 GetOwner(),
michael@0 721 constraints,
michael@0 722 new GetUserMediaSuccessCallback(this),
michael@0 723 new GetUserMediaErrorCallback(this));
michael@0 724 }
michael@0 725
michael@0 726 nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
michael@0 727 NS_DispatchToMainThread(event);
michael@0 728 }
michael@0 729
michael@0 730 void
michael@0 731 SpeechRecognition::Stop()
michael@0 732 {
michael@0 733 nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
michael@0 734 NS_DispatchToMainThread(event);
michael@0 735 }
michael@0 736
michael@0 737 void
michael@0 738 SpeechRecognition::Abort()
michael@0 739 {
michael@0 740 if (mAborted) {
michael@0 741 return;
michael@0 742 }
michael@0 743
michael@0 744 mAborted = true;
michael@0 745 nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
michael@0 746 NS_DispatchToMainThread(event);
michael@0 747 }
michael@0 748
michael@0 749 void
michael@0 750 SpeechRecognition::DispatchError(EventType aErrorType,
michael@0 751 SpeechRecognitionErrorCode aErrorCode,
michael@0 752 const nsAString& aMessage)
michael@0 753 {
michael@0 754 MOZ_ASSERT(NS_IsMainThread());
michael@0 755 MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
michael@0 756 aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");
michael@0 757
michael@0 758 nsRefPtr<SpeechRecognitionError> srError =
michael@0 759 new SpeechRecognitionError(nullptr, nullptr, nullptr);
michael@0 760
michael@0 761 ErrorResult err;
michael@0 762 srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
michael@0 763 aErrorCode, aMessage, err);
michael@0 764
michael@0 765 nsRefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
michael@0 766 event->mError = srError;
michael@0 767 NS_DispatchToMainThread(event);
michael@0 768 }
michael@0 769
michael@0 770 /*
michael@0 771 * Buffer audio samples into mAudioSamplesBuffer until aBufferSize.
michael@0 772 * Updates mBufferedSamples and returns the number of samples that were buffered.
michael@0 773 */
michael@0 774 uint32_t
michael@0 775 SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
michael@0 776 uint32_t aSampleCount)
michael@0 777 {
michael@0 778 MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
michael@0 779 MOZ_ASSERT(mAudioSamplesBuffer.get());
michael@0 780
michael@0 781 int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
michael@0 782 size_t samplesToCopy = std::min(aSampleCount,
michael@0 783 mAudioSamplesPerChunk - mBufferedSamples);
michael@0 784
michael@0 785 memcpy(samplesBuffer + mBufferedSamples, aSamples,
michael@0 786 samplesToCopy * sizeof(int16_t));
michael@0 787
michael@0 788 mBufferedSamples += samplesToCopy;
michael@0 789 return samplesToCopy;
michael@0 790 }
michael@0 791
michael@0 792 /*
michael@0 793 * Split a samples buffer starting of a given size into
michael@0 794 * chunks of equal size. The chunks are stored in the array
michael@0 795 * received as argument.
michael@0 796 * Returns the offset of the end of the last chunk that was
michael@0 797 * created.
michael@0 798 */
michael@0 799 uint32_t
michael@0 800 SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
michael@0 801 uint32_t aSampleCount,
michael@0 802 nsTArray<nsRefPtr<SharedBuffer>>& aResult)
michael@0 803 {
michael@0 804 uint32_t chunkStart = 0;
michael@0 805
michael@0 806 while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
michael@0 807 nsRefPtr<SharedBuffer> chunk =
michael@0 808 SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));
michael@0 809
michael@0 810 memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
michael@0 811 mAudioSamplesPerChunk * sizeof(int16_t));
michael@0 812
michael@0 813 aResult.AppendElement(chunk);
michael@0 814 chunkStart += mAudioSamplesPerChunk;
michael@0 815 }
michael@0 816
michael@0 817 return chunkStart;
michael@0 818 }
michael@0 819
michael@0 820 AudioSegment*
michael@0 821 SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
michael@0 822 {
michael@0 823 AudioSegment* segment = new AudioSegment();
michael@0 824 for (uint32_t i = 0; i < aChunks.Length(); ++i) {
michael@0 825 nsRefPtr<SharedBuffer> buffer = aChunks[i];
michael@0 826 const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());
michael@0 827
michael@0 828 nsAutoTArray<const int16_t*, 1> channels;
michael@0 829 channels.AppendElement(chunkData);
michael@0 830 segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk);
michael@0 831 }
michael@0 832
michael@0 833 return segment;
michael@0 834 }
michael@0 835
// Called off-main-thread with a buffer of aDuration int16_t samples.
// Re-chunks the incoming audio into mAudioSamplesPerChunk-sized pieces
// (buffering any leftover tail across calls) and dispatches them to the
// main thread as an EVENT_AUDIO_DATA SpeechEvent.
void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                 uint32_t aDuration,
                                 MediaStreamListener* aProvider)
{
  NS_ASSERTION(!NS_IsMainThread(),
               "FeedAudioData should not be called in the main thread");

  // Endpointer expects to receive samples in chunks whose size is a
  // multiple of its frame size.
  // Since we can't assume we will receive the frames in appropriate-sized
  // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
  // (a multiple of Endpointer's frame size) before feeding to Endpointer.

  // ensure aSamples is deleted
  nsRefPtr<SharedBuffer> refSamples = aSamples;

  // samplesIndex tracks how many of the aDuration incoming samples have
  // been consumed so far.
  uint32_t samplesIndex = 0;
  const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
  nsAutoTArray<nsRefPtr<SharedBuffer>, 5> chunksToSend;

  // fill up our buffer and make a chunk out of it, if possible
  if (mBufferedSamples > 0) {
    // FillSamplesBuffer copies at most the space left in the pending
    // chunk and advances mBufferedSamples accordingly.
    samplesIndex += FillSamplesBuffer(samples, aDuration);

    if (mBufferedSamples == mAudioSamplesPerChunk) {
      // Pending chunk is complete: queue it and start a fresh one.
      chunksToSend.AppendElement(mAudioSamplesBuffer);
      mAudioSamplesBuffer = nullptr;
      mBufferedSamples = 0;
    }
  }

  // create sample chunks of correct size
  if (samplesIndex < aDuration) {
    samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
                                       aDuration - samplesIndex,
                                       chunksToSend);
  }

  // buffer remaining samples
  if (samplesIndex < aDuration) {
    // Leftover tail shorter than one chunk: stash it until the next
    // call. mBufferedSamples should already be 0 on this path (if the
    // pending chunk was not completed above, all input was consumed by
    // FillSamplesBuffer); the reset is defensive.
    mBufferedSamples = 0;
    mAudioSamplesBuffer =
      SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));

    FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
  }

  // Hand the completed chunks to the main thread; the SpeechEvent takes
  // ownership of the heap-allocated segment (freed in ~SpeechEvent).
  AudioSegment* segment = CreateAudioSegment(chunksToSend);
  nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
  event->mAudioSegment = segment;
  event->mProvider = aProvider;
  NS_DispatchToMainThread(event);

  return;
}
michael@0 892
michael@0 893 const char*
michael@0 894 SpeechRecognition::GetName(FSMState aId)
michael@0 895 {
michael@0 896 static const char* names[] = {
michael@0 897 "STATE_IDLE",
michael@0 898 "STATE_STARTING",
michael@0 899 "STATE_ESTIMATING",
michael@0 900 "STATE_WAITING_FOR_SPEECH",
michael@0 901 "STATE_RECOGNIZING",
michael@0 902 "STATE_WAITING_FOR_RESULT",
michael@0 903 };
michael@0 904
michael@0 905 MOZ_ASSERT(aId < STATE_COUNT);
michael@0 906 MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
michael@0 907 return names[aId];
michael@0 908 }
michael@0 909
michael@0 910 const char*
michael@0 911 SpeechRecognition::GetName(SpeechEvent* aEvent)
michael@0 912 {
michael@0 913 static const char* names[] = {
michael@0 914 "EVENT_START",
michael@0 915 "EVENT_STOP",
michael@0 916 "EVENT_ABORT",
michael@0 917 "EVENT_AUDIO_DATA",
michael@0 918 "EVENT_AUDIO_ERROR",
michael@0 919 "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
michael@0 920 "EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
michael@0 921 "EVENT_RECOGNITIONSERVICE_ERROR"
michael@0 922 };
michael@0 923
michael@0 924 MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
michael@0 925 MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
michael@0 926 return names[aEvent->mType];
michael@0 927 }
michael@0 928
SpeechEvent::~SpeechEvent()
{
  // The event owns the segment assigned in FeedAudioData; release it
  // here (delete on null is a safe no-op for events without audio).
  delete mAudioSegment;
}
michael@0 933
// Runnable entry point: executed on the main thread after
// NS_DispatchToMainThread, forwarding the event into the
// recognition state machine.
NS_IMETHODIMP
SpeechEvent::Run()
{
  mRecognition->ProcessEvent(this);
  return NS_OK;
}
michael@0 940
michael@0 941 NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)
michael@0 942
michael@0 943 NS_IMETHODIMP
michael@0 944 SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
michael@0 945 {
michael@0 946 nsCOMPtr<nsIDOMLocalMediaStream> localStream = do_QueryInterface(aStream);
michael@0 947 mRecognition->StartRecording(static_cast<DOMLocalMediaStream*>(localStream.get()));
michael@0 948 return NS_OK;
michael@0 949 }
michael@0 950
michael@0 951 NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)
michael@0 952
michael@0 953 NS_IMETHODIMP
michael@0 954 SpeechRecognition::GetUserMediaErrorCallback::OnError(const nsAString& aError)
michael@0 955 {
michael@0 956 SpeechRecognitionErrorCode errorCode;
michael@0 957
michael@0 958 if (aError.Equals(NS_LITERAL_STRING("PERMISSION_DENIED"))) {
michael@0 959 errorCode = SpeechRecognitionErrorCode::Not_allowed;
michael@0 960 } else {
michael@0 961 errorCode = SpeechRecognitionErrorCode::Audio_capture;
michael@0 962 }
michael@0 963
michael@0 964 mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
michael@0 965 aError);
michael@0 966
michael@0 967 return NS_OK;
michael@0 968 }
michael@0 969
michael@0 970 } // namespace dom
michael@0 971 } // namespace mozilla

mercurial