1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/content/media/webspeech/recognition/SpeechRecognition.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,299 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* vim:set ts=2 sw=2 sts=2 et cindent: */ 1.6 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.9 + 1.10 +#ifndef mozilla_dom_SpeechRecognition_h 1.11 +#define mozilla_dom_SpeechRecognition_h 1.12 + 1.13 +#include "mozilla/Attributes.h" 1.14 +#include "mozilla/DOMEventTargetHelper.h" 1.15 +#include "nsCOMPtr.h" 1.16 +#include "nsString.h" 1.17 +#include "nsWrapperCache.h" 1.18 +#include "nsTArray.h" 1.19 +#include "js/TypeDecls.h" 1.20 + 1.21 +#include "nsIDOMNavigatorUserMedia.h" 1.22 +#include "nsITimer.h" 1.23 +#include "MediaEngine.h" 1.24 +#include "MediaStreamGraph.h" 1.25 +#include "AudioSegment.h" 1.26 +#include "mozilla/WeakPtr.h" 1.27 +#include "mozilla/Preferences.h" 1.28 + 1.29 +#include "SpeechGrammarList.h" 1.30 +#include "SpeechRecognitionResultList.h" 1.31 +#include "SpeechStreamListener.h" 1.32 +#include "nsISpeechRecognitionService.h" 1.33 +#include "endpointer.h" 1.34 + 1.35 +#include "mozilla/dom/SpeechRecognitionError.h" 1.36 + 1.37 +class nsIDOMWindow; 1.38 + 1.39 +namespace mozilla { 1.40 + 1.41 +namespace dom { 1.42 + 1.43 +#define TEST_PREFERENCE_ENABLE "media.webspeech.test.enable" 1.44 +#define TEST_PREFERENCE_FAKE_FSM_EVENTS "media.webspeech.test.fake_fsm_events" 1.45 +#define TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE "media.webspeech.test.fake_recognition_service" 1.46 +#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC "SpeechRecognitionTest:RequestEvent" 1.47 +#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End" 1.48 + 1.49 +class GlobalObject; 1.50 +class SpeechEvent; 1.51 + 1.52 +#ifdef PR_LOGGING 1.53 +PRLogModuleInfo* GetSpeechRecognitionLog(); 1.54 +#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__)) 1.55 +#else 1.56 +#define SR_LOG(...) 1.57 +#endif 1.58 + 1.59 +class SpeechRecognition MOZ_FINAL : public DOMEventTargetHelper, 1.60 + public nsIObserver, 1.61 + public SupportsWeakPtr<SpeechRecognition> 1.62 +{ 1.63 +public: 1.64 + MOZ_DECLARE_REFCOUNTED_TYPENAME(SpeechRecognition) 1.65 + SpeechRecognition(nsPIDOMWindow* aOwnerWindow); 1.66 + virtual ~SpeechRecognition() {}; 1.67 + 1.68 + NS_DECL_ISUPPORTS_INHERITED 1.69 + 1.70 + NS_DECL_NSIOBSERVER 1.71 + 1.72 + nsISupports* GetParentObject() const; 1.73 + 1.74 + virtual JSObject* WrapObject(JSContext* aCx) MOZ_OVERRIDE; 1.75 + 1.76 + static already_AddRefed<SpeechRecognition> 1.77 + Constructor(const GlobalObject& aGlobal, ErrorResult& aRv); 1.78 + 1.79 + already_AddRefed<SpeechGrammarList> GetGrammars(ErrorResult& aRv) const; 1.80 + 1.81 + void SetGrammars(mozilla::dom::SpeechGrammarList& aArg, ErrorResult& aRv); 1.82 + 1.83 + void GetLang(nsString& aRetVal, ErrorResult& aRv) const; 1.84 + 1.85 + void SetLang(const nsAString& aArg, ErrorResult& aRv); 1.86 + 1.87 + bool GetContinuous(ErrorResult& aRv) const; 1.88 + 1.89 + void SetContinuous(bool aArg, ErrorResult& aRv); 1.90 + 1.91 + bool GetInterimResults(ErrorResult& aRv) const; 1.92 + 1.93 + void SetInterimResults(bool aArg, ErrorResult& aRv); 1.94 + 1.95 + uint32_t GetMaxAlternatives(ErrorResult& aRv) const; 1.96 + 1.97 + void SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv); 1.98 + 1.99 + void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const; 1.100 + 1.101 + void SetServiceURI(const nsAString& aArg, ErrorResult& aRv); 1.102 + 1.103 + void Start(ErrorResult& aRv); 1.104 + 1.105 + void Stop(); 1.106 + 1.107 + void Abort(); 1.108 + 1.109 + IMPL_EVENT_HANDLER(audiostart) 1.110 + IMPL_EVENT_HANDLER(soundstart) 1.111 + IMPL_EVENT_HANDLER(speechstart) 1.112 + IMPL_EVENT_HANDLER(speechend) 1.113 + IMPL_EVENT_HANDLER(soundend) 1.114 + IMPL_EVENT_HANDLER(audioend) 1.115 + IMPL_EVENT_HANDLER(result) 1.116 + IMPL_EVENT_HANDLER(nomatch) 1.117 + IMPL_EVENT_HANDLER(error) 1.118 + IMPL_EVENT_HANDLER(start) 1.119 + IMPL_EVENT_HANDLER(end) 1.120 + 1.121 + enum EventType { 1.122 + EVENT_START, 1.123 + EVENT_STOP, 1.124 + EVENT_ABORT, 1.125 + EVENT_AUDIO_DATA, 1.126 + EVENT_AUDIO_ERROR, 1.127 + EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT, 1.128 + EVENT_RECOGNITIONSERVICE_FINAL_RESULT, 1.129 + EVENT_RECOGNITIONSERVICE_ERROR, 1.130 + EVENT_COUNT 1.131 + }; 1.132 + 1.133 + void DispatchError(EventType aErrorType, SpeechRecognitionErrorCode aErrorCode, const nsAString& aMessage); 1.134 + uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount); 1.135 + uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount, nsTArray<nsRefPtr<SharedBuffer>>& aResult); 1.136 + AudioSegment* CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks); 1.137 + void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider); 1.138 + 1.139 + static struct TestConfig 1.140 + { 1.141 + public: 1.142 + bool mEnableTests; 1.143 + bool mFakeFSMEvents; 1.144 + bool mFakeRecognitionService; 1.145 + 1.146 + void Init() 1.147 + { 1.148 + if (mInitialized) { 1.149 + return; 1.150 + } 1.151 + 1.152 + Preferences::AddBoolVarCache(&mEnableTests, TEST_PREFERENCE_ENABLE); 1.153 + 1.154 + if (mEnableTests) { 1.155 + Preferences::AddBoolVarCache(&mFakeFSMEvents, TEST_PREFERENCE_FAKE_FSM_EVENTS); 1.156 + Preferences::AddBoolVarCache(&mFakeRecognitionService, TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE); 1.157 + } 1.158 + 1.159 + mInitialized = true; 1.160 + } 1.161 + private: 1.162 + bool mInitialized; 1.163 + } mTestConfig; 1.164 + 1.165 + 1.166 + friend class SpeechEvent; 1.167 +private: 1.168 + enum FSMState { 1.169 + STATE_IDLE, 1.170 + STATE_STARTING, 1.171 + STATE_ESTIMATING, 1.172 + STATE_WAITING_FOR_SPEECH, 1.173 + STATE_RECOGNIZING, 1.174 + STATE_WAITING_FOR_RESULT, 1.175 + STATE_COUNT 1.176 + }; 1.177 + 1.178 + void SetState(FSMState state); 1.179 + bool StateBetween(FSMState begin, FSMState end); 1.180 + 1.181 + class GetUserMediaSuccessCallback : public nsIDOMGetUserMediaSuccessCallback 1.182 + { 1.183 + public: 1.184 + NS_DECL_ISUPPORTS 1.185 + NS_DECL_NSIDOMGETUSERMEDIASUCCESSCALLBACK 1.186 + 1.187 + GetUserMediaSuccessCallback(SpeechRecognition* aRecognition) 1.188 + : mRecognition(aRecognition) 1.189 + {} 1.190 + 1.191 + virtual ~GetUserMediaSuccessCallback() {} 1.192 + 1.193 + private: 1.194 + nsRefPtr<SpeechRecognition> mRecognition; 1.195 + }; 1.196 + 1.197 + class GetUserMediaErrorCallback : public nsIDOMGetUserMediaErrorCallback 1.198 + { 1.199 + public: 1.200 + NS_DECL_ISUPPORTS 1.201 + NS_DECL_NSIDOMGETUSERMEDIAERRORCALLBACK 1.202 + 1.203 + GetUserMediaErrorCallback(SpeechRecognition* aRecognition) 1.204 + : mRecognition(aRecognition) 1.205 + {} 1.206 + 1.207 + virtual ~GetUserMediaErrorCallback() {} 1.208 + 1.209 + private: 1.210 + nsRefPtr<SpeechRecognition> mRecognition; 1.211 + }; 1.212 + 1.213 + NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream); 1.214 + NS_IMETHOD StopRecording(); 1.215 + 1.216 + uint32_t ProcessAudioSegment(AudioSegment* aSegment); 1.217 + void NotifyError(SpeechEvent* aEvent); 1.218 + 1.219 + void ProcessEvent(SpeechEvent* aEvent); 1.220 + void Transition(SpeechEvent* aEvent); 1.221 + 1.222 + void Reset(); 1.223 + void ResetAndEnd(); 1.224 + void WaitForAudioData(SpeechEvent* aEvent); 1.225 + void StartedAudioCapture(SpeechEvent* aEvent); 1.226 + void StopRecordingAndRecognize(SpeechEvent* aEvent); 1.227 + void WaitForEstimation(SpeechEvent* aEvent); 1.228 + void DetectSpeech(SpeechEvent* aEvent); 1.229 + void WaitForSpeechEnd(SpeechEvent* aEvent); 1.230 + void NotifyFinalResult(SpeechEvent* aEvent); 1.231 + void DoNothing(SpeechEvent* aEvent); 1.232 + void AbortSilently(SpeechEvent* aEvent); 1.233 + void AbortError(SpeechEvent* aEvent); 1.234 + 1.235 + nsRefPtr<DOMMediaStream> mDOMStream; 1.236 + nsRefPtr<SpeechStreamListener> mSpeechListener; 1.237 + nsCOMPtr<nsISpeechRecognitionService> mRecognitionService; 1.238 + 1.239 + void GetRecognitionServiceCID(nsACString& aResultCID); 1.240 + 1.241 + FSMState mCurrentState; 1.242 + 1.243 + Endpointer mEndpointer; 1.244 + uint32_t mEstimationSamples; 1.245 + 1.246 + uint32_t mAudioSamplesPerChunk; 1.247 + 1.248 + // buffer holds one chunk of mAudioSamplesPerChunk 1.249 + // samples before feeding it to mEndpointer 1.250 + nsRefPtr<SharedBuffer> mAudioSamplesBuffer; 1.251 + uint32_t mBufferedSamples; 1.252 + 1.253 + nsCOMPtr<nsITimer> mSpeechDetectionTimer; 1.254 + bool mAborted; 1.255 + 1.256 + void ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName); 1.257 + 1.258 + const char* GetName(FSMState aId); 1.259 + const char* GetName(SpeechEvent* aId); 1.260 +}; 1.261 + 1.262 +class SpeechEvent : public nsRunnable 1.263 +{ 1.264 +public: 1.265 + SpeechEvent(SpeechRecognition* aRecognition, SpeechRecognition::EventType aType) 1.266 + : mAudioSegment(0) 1.267 + , mRecognitionResultList(0) 1.268 + , mError(0) 1.269 + , mRecognition(aRecognition) 1.270 + , mType(aType) 1.271 + { 1.272 + } 1.273 + 1.274 + ~SpeechEvent(); 1.275 + 1.276 + NS_IMETHOD Run() MOZ_OVERRIDE; 1.277 + AudioSegment* mAudioSegment; 1.278 + nsRefPtr<SpeechRecognitionResultList> mRecognitionResultList; // TODO: make this a session being passed which also has index and stuff 1.279 + nsRefPtr<SpeechRecognitionError> mError; 1.280 + 1.281 + friend class SpeechRecognition; 1.282 +private: 1.283 + SpeechRecognition* mRecognition; 1.284 + 1.285 + // for AUDIO_DATA events, keep a reference to the provider 1.286 + // of the data (i.e., the SpeechStreamListener) to ensure it 1.287 + // is kept alive (and keeps SpeechRecognition alive) until this 1.288 + // event gets processed. 1.289 + nsRefPtr<MediaStreamListener> mProvider; 1.290 + SpeechRecognition::EventType mType; 1.291 +}; 1.292 + 1.293 +} // namespace dom 1.294 + 1.295 +inline nsISupports* 1.296 +ToSupports(dom::SpeechRecognition* aRec) 1.297 +{ 1.298 + return ToSupports(static_cast<DOMEventTargetHelper*>(aRec)); 1.299 +} 1.300 +} // namespace mozilla 1.301 + 1.302 +#endif