|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #include "mozilla/HangMonitor.h" |
|
7 #include "mozilla/BackgroundHangMonitor.h" |
|
8 #include "mozilla/Monitor.h" |
|
9 #include "mozilla/Preferences.h" |
|
10 #include "mozilla/Telemetry.h" |
|
11 #include "mozilla/ProcessedStack.h" |
|
12 #include "mozilla/Atomics.h" |
|
13 #include "nsXULAppAPI.h" |
|
14 #include "nsThreadUtils.h" |
|
15 #include "nsStackWalk.h" |
|
16 |
|
17 #ifdef MOZ_CRASHREPORTER |
|
18 #include "nsExceptionHandler.h" |
|
19 #endif |
|
20 |
|
21 #ifdef XP_WIN |
|
22 #include <windows.h> |
|
23 #endif |
|
24 |
|
25 #if defined(MOZ_ENABLE_PROFILER_SPS) && defined(MOZ_PROFILING) && defined(XP_WIN) |
|
26 #define REPORT_CHROME_HANGS |
|
27 #endif |
|
28 |
|
29 namespace mozilla { namespace HangMonitor { |
|
30 |
|
31 /** |
|
32 * A flag which may be set from within a debugger to disable the hang |
|
33 * monitor. |
|
34 */ |
|
35 volatile bool gDebugDisableHangMonitor = false; |
|
36 |
|
37 const char kHangMonitorPrefName[] = "hangmonitor.timeout"; |
|
38 |
|
39 const char kTelemetryPrefName[] = "toolkit.telemetry.enabled"; |
|
40 |
|
41 // Monitor protects gShutdown and gTimeout, but not gTimestamp which rely on |
|
42 // being atomically set by the processor; synchronization doesn't really matter |
|
43 // in this use case. |
|
44 Monitor* gMonitor; |
|
45 |
|
46 // The timeout preference, in seconds. |
|
47 int32_t gTimeout; |
|
48 |
|
49 PRThread* gThread; |
|
50 |
|
51 // Set when shutdown begins to signal the thread to exit immediately. |
|
52 bool gShutdown; |
|
53 |
|
54 // The timestamp of the last event notification, or PR_INTERVAL_NO_WAIT if |
|
55 // we're currently not processing events. |
|
56 Atomic<PRIntervalTime> gTimestamp(PR_INTERVAL_NO_WAIT); |
|
57 |
|
58 #ifdef REPORT_CHROME_HANGS |
|
59 // Main thread ID used in reporting chrome hangs under Windows |
|
60 static HANDLE winMainThreadHandle = nullptr; |
|
61 |
|
62 // Default timeout for reporting chrome hangs to Telemetry (5 seconds) |
|
63 static const int32_t DEFAULT_CHROME_HANG_INTERVAL = 5; |
|
64 |
|
65 // Maximum number of PCs to gather from the stack |
|
66 static const int32_t MAX_CALL_STACK_PCS = 400; |
|
67 #endif |
|
68 |
|
69 // PrefChangedFunc |
|
70 void |
|
71 PrefChanged(const char*, void*) |
|
72 { |
|
73 int32_t newval = Preferences::GetInt(kHangMonitorPrefName); |
|
74 #ifdef REPORT_CHROME_HANGS |
|
75 // Monitor chrome hangs on the profiling branch if Telemetry enabled |
|
76 if (newval == 0) { |
|
77 bool telemetryEnabled = Preferences::GetBool(kTelemetryPrefName); |
|
78 if (telemetryEnabled) { |
|
79 newval = DEFAULT_CHROME_HANG_INTERVAL; |
|
80 } |
|
81 } |
|
82 #endif |
|
83 MonitorAutoLock lock(*gMonitor); |
|
84 if (newval != gTimeout) { |
|
85 gTimeout = newval; |
|
86 lock.Notify(); |
|
87 } |
|
88 } |
|
89 |
|
90 void |
|
91 Crash() |
|
92 { |
|
93 if (gDebugDisableHangMonitor) { |
|
94 return; |
|
95 } |
|
96 |
|
97 #ifdef XP_WIN |
|
98 if (::IsDebuggerPresent()) { |
|
99 return; |
|
100 } |
|
101 #endif |
|
102 |
|
103 #ifdef MOZ_CRASHREPORTER |
|
104 CrashReporter::AnnotateCrashReport(NS_LITERAL_CSTRING("Hang"), |
|
105 NS_LITERAL_CSTRING("1")); |
|
106 #endif |
|
107 |
|
108 NS_RUNTIMEABORT("HangMonitor triggered"); |
|
109 } |
|
110 |
|
111 #ifdef REPORT_CHROME_HANGS |
|
112 static void |
|
113 ChromeStackWalker(void *aPC, void *aSP, void *aClosure) |
|
114 { |
|
115 MOZ_ASSERT(aClosure); |
|
116 std::vector<uintptr_t> *stack = |
|
117 static_cast<std::vector<uintptr_t>*>(aClosure); |
|
118 if (stack->size() == MAX_CALL_STACK_PCS) |
|
119 return; |
|
120 MOZ_ASSERT(stack->size() < MAX_CALL_STACK_PCS); |
|
121 stack->push_back(reinterpret_cast<uintptr_t>(aPC)); |
|
122 } |
|
123 |
|
124 static void |
|
125 GetChromeHangReport(Telemetry::ProcessedStack &aStack, |
|
126 int32_t &aSystemUptime, |
|
127 int32_t &aFirefoxUptime) |
|
128 { |
|
129 MOZ_ASSERT(winMainThreadHandle); |
|
130 |
|
131 // The thread we're about to suspend might have the alloc lock |
|
132 // so allocate ahead of time |
|
133 std::vector<uintptr_t> rawStack; |
|
134 rawStack.reserve(MAX_CALL_STACK_PCS); |
|
135 DWORD ret = ::SuspendThread(winMainThreadHandle); |
|
136 if (ret == -1) |
|
137 return; |
|
138 NS_StackWalk(ChromeStackWalker, /* skipFrames */ 0, /* maxFrames */ 0, |
|
139 reinterpret_cast<void*>(&rawStack), |
|
140 reinterpret_cast<uintptr_t>(winMainThreadHandle), nullptr); |
|
141 ret = ::ResumeThread(winMainThreadHandle); |
|
142 if (ret == -1) |
|
143 return; |
|
144 aStack = Telemetry::GetStackAndModules(rawStack); |
|
145 |
|
146 // Record system uptime (in minutes) at the time of the hang |
|
147 aSystemUptime = ((GetTickCount() / 1000) - (gTimeout * 2)) / 60; |
|
148 |
|
149 // Record Firefox uptime (in minutes) at the time of the hang |
|
150 bool error; |
|
151 TimeStamp processCreation = TimeStamp::ProcessCreation(error); |
|
152 if (!error) { |
|
153 TimeDuration td = TimeStamp::Now() - processCreation; |
|
154 aFirefoxUptime = (static_cast<int32_t>(td.ToSeconds()) - (gTimeout * 2)) / 60; |
|
155 } else { |
|
156 aFirefoxUptime = -1; |
|
157 } |
|
158 } |
|
159 #endif |
|
160 |
|
161 void |
|
162 ThreadMain(void*) |
|
163 { |
|
164 PR_SetCurrentThreadName("Hang Monitor"); |
|
165 |
|
166 MonitorAutoLock lock(*gMonitor); |
|
167 |
|
168 // In order to avoid issues with the hang monitor incorrectly triggering |
|
169 // during a general system stop such as sleeping, the monitor thread must |
|
170 // run twice to trigger hang protection. |
|
171 PRIntervalTime lastTimestamp = 0; |
|
172 int waitCount = 0; |
|
173 |
|
174 #ifdef REPORT_CHROME_HANGS |
|
175 Telemetry::ProcessedStack stack; |
|
176 int32_t systemUptime = -1; |
|
177 int32_t firefoxUptime = -1; |
|
178 #endif |
|
179 |
|
180 while (true) { |
|
181 if (gShutdown) { |
|
182 return; // Exit the thread |
|
183 } |
|
184 |
|
185 // avoid rereading the volatile value in this loop |
|
186 PRIntervalTime timestamp = gTimestamp; |
|
187 |
|
188 PRIntervalTime now = PR_IntervalNow(); |
|
189 |
|
190 if (timestamp != PR_INTERVAL_NO_WAIT && |
|
191 now < timestamp) { |
|
192 // 32-bit overflow, reset for another waiting period |
|
193 timestamp = 1; // lowest legal PRInterval value |
|
194 } |
|
195 |
|
196 if (timestamp != PR_INTERVAL_NO_WAIT && |
|
197 timestamp == lastTimestamp && |
|
198 gTimeout > 0) { |
|
199 ++waitCount; |
|
200 #ifdef REPORT_CHROME_HANGS |
|
201 // Capture the chrome-hang stack + Firefox & system uptimes after |
|
202 // the minimum hang duration has been reached (not when the hang ends) |
|
203 if (waitCount == 2) { |
|
204 GetChromeHangReport(stack, systemUptime, firefoxUptime); |
|
205 } |
|
206 #else |
|
207 // This is the crash-on-hang feature. |
|
208 // See bug 867313 for the quirk in the waitCount comparison |
|
209 if (waitCount >= 2) { |
|
210 int32_t delay = |
|
211 int32_t(PR_IntervalToSeconds(now - timestamp)); |
|
212 if (delay >= gTimeout) { |
|
213 MonitorAutoUnlock unlock(*gMonitor); |
|
214 Crash(); |
|
215 } |
|
216 } |
|
217 #endif |
|
218 } |
|
219 else { |
|
220 #ifdef REPORT_CHROME_HANGS |
|
221 if (waitCount >= 2) { |
|
222 uint32_t hangDuration = PR_IntervalToSeconds(now - lastTimestamp); |
|
223 Telemetry::RecordChromeHang(hangDuration, stack, |
|
224 systemUptime, firefoxUptime); |
|
225 stack.Clear(); |
|
226 } |
|
227 #endif |
|
228 lastTimestamp = timestamp; |
|
229 waitCount = 0; |
|
230 } |
|
231 |
|
232 PRIntervalTime timeout; |
|
233 if (gTimeout <= 0) { |
|
234 timeout = PR_INTERVAL_NO_TIMEOUT; |
|
235 } |
|
236 else { |
|
237 timeout = PR_MillisecondsToInterval(gTimeout * 500); |
|
238 } |
|
239 lock.Wait(timeout); |
|
240 } |
|
241 } |
|
242 |
|
243 void |
|
244 Startup() |
|
245 { |
|
246 // The hang detector only runs in chrome processes. If you change this, |
|
247 // you must also deal with the threadsafety of AnnotateCrashReport in |
|
248 // non-chrome processes! |
|
249 if (GeckoProcessType_Default != XRE_GetProcessType()) |
|
250 return; |
|
251 |
|
252 MOZ_ASSERT(!gMonitor, "Hang monitor already initialized"); |
|
253 gMonitor = new Monitor("HangMonitor"); |
|
254 |
|
255 Preferences::RegisterCallback(PrefChanged, kHangMonitorPrefName, nullptr); |
|
256 PrefChanged(nullptr, nullptr); |
|
257 |
|
258 #ifdef REPORT_CHROME_HANGS |
|
259 Preferences::RegisterCallback(PrefChanged, kTelemetryPrefName, nullptr); |
|
260 winMainThreadHandle = |
|
261 OpenThread(THREAD_ALL_ACCESS, FALSE, GetCurrentThreadId()); |
|
262 if (!winMainThreadHandle) |
|
263 return; |
|
264 #endif |
|
265 |
|
266 // Don't actually start measuring hangs until we hit the main event loop. |
|
267 // This potentially misses a small class of really early startup hangs, |
|
268 // but avoids dealing with some xpcshell tests and other situations which |
|
269 // start XPCOM but don't ever start the event loop. |
|
270 Suspend(); |
|
271 |
|
272 gThread = PR_CreateThread(PR_USER_THREAD, |
|
273 ThreadMain, |
|
274 nullptr, PR_PRIORITY_LOW, PR_GLOBAL_THREAD, |
|
275 PR_JOINABLE_THREAD, 0); |
|
276 } |
|
277 |
|
278 void |
|
279 Shutdown() |
|
280 { |
|
281 if (GeckoProcessType_Default != XRE_GetProcessType()) |
|
282 return; |
|
283 |
|
284 MOZ_ASSERT(gMonitor, "Hang monitor not started"); |
|
285 |
|
286 { // Scope the lock we're going to delete later |
|
287 MonitorAutoLock lock(*gMonitor); |
|
288 gShutdown = true; |
|
289 lock.Notify(); |
|
290 } |
|
291 |
|
292 // thread creation could theoretically fail |
|
293 if (gThread) { |
|
294 PR_JoinThread(gThread); |
|
295 gThread = nullptr; |
|
296 } |
|
297 |
|
298 delete gMonitor; |
|
299 gMonitor = nullptr; |
|
300 } |
|
301 |
|
302 static bool |
|
303 IsUIMessageWaiting() |
|
304 { |
|
305 #ifndef XP_WIN |
|
306 return false; |
|
307 #else |
|
308 #define NS_WM_IMEFIRST WM_IME_SETCONTEXT |
|
309 #define NS_WM_IMELAST WM_IME_KEYUP |
|
310 BOOL haveUIMessageWaiting = FALSE; |
|
311 MSG msg; |
|
312 haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_KEYFIRST, |
|
313 WM_IME_KEYLAST, PM_NOREMOVE); |
|
314 haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, NS_WM_IMEFIRST, |
|
315 NS_WM_IMELAST, PM_NOREMOVE); |
|
316 haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_MOUSEFIRST, |
|
317 WM_MOUSELAST, PM_NOREMOVE); |
|
318 return haveUIMessageWaiting; |
|
319 #endif |
|
320 } |
|
321 |
|
322 void |
|
323 NotifyActivity(ActivityType activityType) |
|
324 { |
|
325 MOZ_ASSERT(NS_IsMainThread(), |
|
326 "HangMonitor::Notify called from off the main thread."); |
|
327 |
|
328 // Determine the activity type more specifically |
|
329 if (activityType == kGeneralActivity) { |
|
330 activityType = IsUIMessageWaiting() ? kActivityUIAVail : |
|
331 kActivityNoUIAVail; |
|
332 } |
|
333 |
|
334 // Calculate the cumulative amount of lag time since the last UI message |
|
335 static uint32_t cumulativeUILagMS = 0; |
|
336 switch(activityType) { |
|
337 case kActivityNoUIAVail: |
|
338 cumulativeUILagMS = 0; |
|
339 break; |
|
340 case kActivityUIAVail: |
|
341 case kUIActivity: |
|
342 if (gTimestamp != PR_INTERVAL_NO_WAIT) { |
|
343 cumulativeUILagMS += PR_IntervalToMilliseconds(PR_IntervalNow() - |
|
344 gTimestamp); |
|
345 } |
|
346 break; |
|
347 default: |
|
348 break; |
|
349 } |
|
350 |
|
351 // This is not a locked activity because PRTimeStamp is a 32-bit quantity |
|
352 // which can be read/written atomically, and we don't want to pay locking |
|
353 // penalties here. |
|
354 gTimestamp = PR_IntervalNow(); |
|
355 |
|
356 // If we have UI activity we should reset the timer and report it if it is |
|
357 // significant enough. |
|
358 if (activityType == kUIActivity) { |
|
359 // The minimum amount of lag time that we should report for telemetry data. |
|
360 // Mozilla's UI responsiveness goal is 50ms |
|
361 static const uint32_t kUIResponsivenessThresholdMS = 50; |
|
362 if (cumulativeUILagMS > kUIResponsivenessThresholdMS) { |
|
363 mozilla::Telemetry::Accumulate(mozilla::Telemetry::EVENTLOOP_UI_LAG_EXP_MS, |
|
364 cumulativeUILagMS); |
|
365 } |
|
366 cumulativeUILagMS = 0; |
|
367 } |
|
368 |
|
369 if (gThread && !gShutdown) { |
|
370 mozilla::BackgroundHangMonitor().NotifyActivity(); |
|
371 } |
|
372 } |
|
373 |
|
374 void |
|
375 Suspend() |
|
376 { |
|
377 MOZ_ASSERT(NS_IsMainThread(), |
|
378 "HangMonitor::Suspend called from off the main thread."); |
|
379 |
|
380 // Because gTimestamp changes this resets the wait count. |
|
381 gTimestamp = PR_INTERVAL_NO_WAIT; |
|
382 |
|
383 if (gThread && !gShutdown) { |
|
384 mozilla::BackgroundHangMonitor().NotifyWait(); |
|
385 } |
|
386 } |
|
387 |
|
388 } } // namespace mozilla::HangMonitor |