michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0: /* vim:set ts=2 sw=2 sts=2 et cindent: */
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: #ifndef mozilla_dom_SpeechRecognition_h
michael@0: #define mozilla_dom_SpeechRecognition_h
michael@0: 
michael@0: #include "mozilla/Attributes.h"
michael@0: #include "mozilla/DOMEventTargetHelper.h"
michael@0: #include "nsCOMPtr.h"
michael@0: #include "nsString.h"
michael@0: #include "nsWrapperCache.h"
michael@0: #include "nsTArray.h"
michael@0: #include "js/TypeDecls.h"
michael@0: 
michael@0: #include "nsIDOMNavigatorUserMedia.h"
michael@0: #include "nsITimer.h"
michael@0: #include "MediaEngine.h"
michael@0: #include "MediaStreamGraph.h"
michael@0: #include "AudioSegment.h"
michael@0: #include "mozilla/WeakPtr.h"
michael@0: #include "mozilla/Preferences.h"
michael@0: 
michael@0: #include "SpeechGrammarList.h"
michael@0: #include "SpeechRecognitionResultList.h"
michael@0: #include "SpeechStreamListener.h"
michael@0: #include "nsISpeechRecognitionService.h"
michael@0: #include "endpointer.h"
michael@0: 
michael@0: #include "mozilla/dom/SpeechRecognitionError.h"
michael@0: 
michael@0: class nsIDOMWindow;
michael@0: 
michael@0: namespace mozilla {
michael@0: 
michael@0: namespace dom {
michael@0: 
michael@0: #define TEST_PREFERENCE_ENABLE "media.webspeech.test.enable" // the three TEST_PREFERENCE_* names are the prefs cached as bools by SpeechRecognition::TestConfig::Init()
michael@0: #define TEST_PREFERENCE_FAKE_FSM_EVENTS "media.webspeech.test.fake_fsm_events"
michael@0: #define TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE "media.webspeech.test.fake_recognition_service"
michael@0: #define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC "SpeechRecognitionTest:RequestEvent" // presumably an observer topic through which tests request events; not used in this header -- TODO confirm
michael@0: #define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End" // presumably the observer topic marking the end of a test; not used in this header -- TODO confirm
michael@0: // Forward declarations: the class below refers to these only by pointer, by reference or in a friend declaration.
michael@0: class GlobalObject;
michael@0: class SpeechEvent;
michael@0: // SR_LOG(fmt, ...) forwards to PR_LOG at PR_LOG_DEBUG on the module returned by GetSpeechRecognitionLog(); without PR_LOGGING it expands to nothing.
michael@0: #ifdef PR_LOGGING
michael@0: PRLogModuleInfo* GetSpeechRecognitionLog();
michael@0: #define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
michael@0: #else
michael@0: #define SR_LOG(...)
michael@0: #endif
michael@0: 
michael@0: class SpeechRecognition MOZ_FINAL : public DOMEventTargetHelper,
michael@0:                                     public nsIObserver,
michael@0:                                     public SupportsWeakPtr<SpeechRecognition>
michael@0: {
michael@0: public:
michael@0:   MOZ_DECLARE_REFCOUNTED_TYPENAME(SpeechRecognition)
michael@0:   SpeechRecognition(nsPIDOMWindow* aOwnerWindow);
michael@0:   virtual ~SpeechRecognition() {};
michael@0: 
michael@0:   NS_DECL_ISUPPORTS_INHERITED
michael@0: 
michael@0:   NS_DECL_NSIOBSERVER
michael@0: 
michael@0:   nsISupports* GetParentObject() const;
michael@0: 
michael@0:   virtual JSObject* WrapObject(JSContext* aCx) MOZ_OVERRIDE;
michael@0: 
michael@0:   static already_AddRefed<SpeechRecognition>
michael@0:   Constructor(const GlobalObject& aGlobal, ErrorResult& aRv);
michael@0: 
michael@0:   already_AddRefed<SpeechGrammarList> GetGrammars(ErrorResult& aRv) const;
michael@0: 
michael@0:   void SetGrammars(mozilla::dom::SpeechGrammarList& aArg, ErrorResult& aRv);
michael@0: 
michael@0:   void GetLang(nsString& aRetVal, ErrorResult& aRv) const;
michael@0: 
michael@0:   void SetLang(const nsAString& aArg, ErrorResult& aRv);
michael@0: 
michael@0:   bool GetContinuous(ErrorResult& aRv) const;
michael@0: 
michael@0:   void SetContinuous(bool aArg, ErrorResult& aRv);
michael@0: 
michael@0:   bool GetInterimResults(ErrorResult& aRv) const;
michael@0: 
michael@0:   void SetInterimResults(bool aArg, ErrorResult& aRv);
michael@0: 
michael@0:   uint32_t GetMaxAlternatives(ErrorResult& aRv) const;
michael@0: 
michael@0:   void SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv);
michael@0: 
michael@0:   void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;
michael@0: 
michael@0:   void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);
michael@0: 
michael@0:   void Start(ErrorResult& aRv);
michael@0: 
michael@0:   void Stop();
michael@0: 
michael@0:   void Abort();
michael@0: 
michael@0:   IMPL_EVENT_HANDLER(audiostart)
michael@0:   IMPL_EVENT_HANDLER(soundstart)
michael@0:   IMPL_EVENT_HANDLER(speechstart)
michael@0:   IMPL_EVENT_HANDLER(speechend)
michael@0:   IMPL_EVENT_HANDLER(soundend)
michael@0:   IMPL_EVENT_HANDLER(audioend)
michael@0:   IMPL_EVENT_HANDLER(result)
michael@0:   IMPL_EVENT_HANDLER(nomatch)
michael@0:   IMPL_EVENT_HANDLER(error)
michael@0:   IMPL_EVENT_HANDLER(start)
michael@0:   IMPL_EVENT_HANDLER(end)
michael@0: 
michael@0:   enum EventType {
michael@0:     EVENT_START,
michael@0:     EVENT_STOP,
michael@0:     EVENT_ABORT,
michael@0:     EVENT_AUDIO_DATA,
michael@0:     EVENT_AUDIO_ERROR,
michael@0:     EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
michael@0:     EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
michael@0:     EVENT_RECOGNITIONSERVICE_ERROR,
michael@0:     EVENT_COUNT
michael@0:   };
michael@0: 
michael@0:   void DispatchError(EventType aErrorType, SpeechRecognitionErrorCode aErrorCode, const nsAString& aMessage);
michael@0:   uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
michael@0:   uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount, nsTArray<nsRefPtr<SharedBuffer>>& aResult);
michael@0:   AudioSegment* CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks);
michael@0:   void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider);
michael@0: 
michael@0:   static struct TestConfig
michael@0:   {
michael@0:   public:
michael@0:     bool mEnableTests;
michael@0:     bool mFakeFSMEvents;
michael@0:     bool mFakeRecognitionService;
michael@0: 
michael@0:     void Init()
michael@0:     {
michael@0:       if (mInitialized) {
michael@0:         return;
michael@0:       }
michael@0: 
michael@0:       Preferences::AddBoolVarCache(&mEnableTests, TEST_PREFERENCE_ENABLE);
michael@0: 
michael@0:       if (mEnableTests) {
michael@0:         Preferences::AddBoolVarCache(&mFakeFSMEvents, TEST_PREFERENCE_FAKE_FSM_EVENTS);
michael@0:         Preferences::AddBoolVarCache(&mFakeRecognitionService, TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE);
michael@0:       }
michael@0: 
michael@0:       mInitialized = true;
michael@0:     }
michael@0:   private:
michael@0:     bool mInitialized;
michael@0:   } mTestConfig;
michael@0: 
michael@0: 
michael@0:   friend class SpeechEvent;
michael@0: private:
michael@0:   enum FSMState {
michael@0:     STATE_IDLE,
michael@0:     STATE_STARTING,
michael@0:     STATE_ESTIMATING,
michael@0:     STATE_WAITING_FOR_SPEECH,
michael@0:     STATE_RECOGNIZING,
michael@0:     STATE_WAITING_FOR_RESULT,
michael@0:     STATE_COUNT
michael@0:   };
michael@0: 
michael@0:   void SetState(FSMState state);
michael@0:   bool StateBetween(FSMState begin, FSMState end);
michael@0: 
michael@0:   class GetUserMediaSuccessCallback : public nsIDOMGetUserMediaSuccessCallback
michael@0:   {
michael@0:   public:
michael@0:     NS_DECL_ISUPPORTS
michael@0:     NS_DECL_NSIDOMGETUSERMEDIASUCCESSCALLBACK
michael@0: 
michael@0:     GetUserMediaSuccessCallback(SpeechRecognition* aRecognition)
michael@0:       : mRecognition(aRecognition)
michael@0:     {}
michael@0: 
michael@0:     virtual ~GetUserMediaSuccessCallback() {}
michael@0: 
michael@0:   private:
michael@0:     nsRefPtr<SpeechRecognition> mRecognition;
michael@0:   };
michael@0: 
michael@0:   class GetUserMediaErrorCallback : public nsIDOMGetUserMediaErrorCallback
michael@0:   {
michael@0:   public:
michael@0:     NS_DECL_ISUPPORTS
michael@0:     NS_DECL_NSIDOMGETUSERMEDIAERRORCALLBACK
michael@0: 
michael@0:     GetUserMediaErrorCallback(SpeechRecognition* aRecognition)
michael@0:       : mRecognition(aRecognition)
michael@0:     {}
michael@0: 
michael@0:     virtual ~GetUserMediaErrorCallback() {}
michael@0: 
michael@0:   private:
michael@0:     nsRefPtr<SpeechRecognition> mRecognition;
michael@0:   };
michael@0: 
michael@0:   NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
michael@0:   NS_IMETHOD StopRecording();
michael@0: 
michael@0:   uint32_t ProcessAudioSegment(AudioSegment* aSegment);
michael@0:   void NotifyError(SpeechEvent* aEvent);
michael@0: 
michael@0:   void ProcessEvent(SpeechEvent* aEvent);
michael@0:   void Transition(SpeechEvent* aEvent);
michael@0: 
michael@0:   void Reset();
michael@0:   void ResetAndEnd();
michael@0:   void WaitForAudioData(SpeechEvent* aEvent);
michael@0:   void StartedAudioCapture(SpeechEvent* aEvent);
michael@0:   void StopRecordingAndRecognize(SpeechEvent* aEvent);
michael@0:   void WaitForEstimation(SpeechEvent* aEvent);
michael@0:   void DetectSpeech(SpeechEvent* aEvent);
michael@0:   void WaitForSpeechEnd(SpeechEvent* aEvent);
michael@0:   void NotifyFinalResult(SpeechEvent* aEvent);
michael@0:   void DoNothing(SpeechEvent* aEvent);
michael@0:   void AbortSilently(SpeechEvent* aEvent);
michael@0:   void AbortError(SpeechEvent* aEvent);
michael@0: 
michael@0:   nsRefPtr<DOMMediaStream> mDOMStream;
michael@0:   nsRefPtr<SpeechStreamListener> mSpeechListener;
michael@0:   nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;
michael@0: 
michael@0:   void GetRecognitionServiceCID(nsACString& aResultCID);
michael@0: 
michael@0:   FSMState mCurrentState;
michael@0: 
michael@0:   Endpointer mEndpointer;
michael@0:   uint32_t mEstimationSamples;
michael@0: 
michael@0:   uint32_t mAudioSamplesPerChunk;
michael@0: 
michael@0:   // buffer holds one chunk of mAudioSamplesPerChunk
michael@0:   // samples before feeding it to mEndpointer
michael@0:   nsRefPtr<SharedBuffer> mAudioSamplesBuffer;
michael@0:   uint32_t mBufferedSamples;
michael@0: 
michael@0:   nsCOMPtr<nsITimer> mSpeechDetectionTimer;
michael@0:   bool mAborted;
michael@0: 
michael@0:   void ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName);
michael@0: 
michael@0:   const char* GetName(FSMState aId);
michael@0:   const char* GetName(SpeechEvent* aId);
michael@0: };
michael@0: 
michael@0: /**
michael@0:  * Runnable describing one event for a SpeechRecognition object: an
michael@0:  * EventType plus an optional payload (audio segment, result list or error).
michael@0:  */
michael@0: class SpeechEvent : public nsRunnable
michael@0: {
michael@0: public:
michael@0:   // Every payload pointer starts out null. mProvider is default-constructed.
michael@0:   SpeechEvent(SpeechRecognition* aRecognition, SpeechRecognition::EventType aType)
michael@0:   : mAudioSegment(nullptr)
michael@0:   , mRecognitionResultList(nullptr)
michael@0:   , mError(nullptr)
michael@0:   , mRecognition(aRecognition)
michael@0:   , mType(aType)
michael@0:   {
michael@0:   }
michael@0: 
michael@0:   ~SpeechEvent();
michael@0: 
michael@0:   NS_IMETHOD Run() MOZ_OVERRIDE;
michael@0:   // Raw pointer. NOTE(review): ownership is not visible in this header;
michael@0:   // presumably released by ~SpeechEvent() -- confirm in the implementation.
michael@0:   AudioSegment* mAudioSegment;
michael@0:   nsRefPtr<SpeechRecognitionResultList> mRecognitionResultList; // TODO: make this a session being passed which also has index and stuff
michael@0:   nsRefPtr<SpeechRecognitionError> mError;
michael@0: 
michael@0:   friend class SpeechRecognition;
michael@0: private:
michael@0:   // Not a strong reference. NOTE(review): for AUDIO_DATA events mProvider
michael@0:   // (below) is documented to keep the SpeechRecognition alive; what
michael@0:   // guarantees its lifetime for the other event types is not visible here.
michael@0:   SpeechRecognition* mRecognition;
michael@0: 
michael@0:   // For AUDIO_DATA events, keep a reference to the provider
michael@0:   // of the data (i.e., the SpeechStreamListener) to ensure it
michael@0:   // is kept alive (and keeps SpeechRecognition alive) until this
michael@0:   // event gets processed.
michael@0:   nsRefPtr<MediaStreamListener> mProvider;
michael@0:   SpeechRecognition::EventType mType;
michael@0: };
michael@0: 
michael@0: } // namespace dom
michael@0: 
michael@0: // Converts a SpeechRecognition to nsISupports by first narrowing it to its
michael@0: // DOMEventTargetHelper base and then delegating to the ToSupports overload
michael@0: // for that type.
michael@0: inline nsISupports*
michael@0: ToSupports(dom::SpeechRecognition* aRec)
michael@0: {
michael@0:   DOMEventTargetHelper* asEventTarget = aRec;
michael@0:   return ToSupports(asEventTarget);
michael@0: }
michael@0: } // namespace mozilla
michael@0: 
michael@0: #endif