Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "mozilla/HangMonitor.h"
7 #include "mozilla/BackgroundHangMonitor.h"
8 #include "mozilla/Monitor.h"
9 #include "mozilla/Preferences.h"
10 #include "mozilla/Telemetry.h"
11 #include "mozilla/ProcessedStack.h"
12 #include "mozilla/Atomics.h"
13 #include "nsXULAppAPI.h"
14 #include "nsThreadUtils.h"
15 #include "nsStackWalk.h"
17 #ifdef MOZ_CRASHREPORTER
18 #include "nsExceptionHandler.h"
19 #endif
21 #ifdef XP_WIN
22 #include <windows.h>
23 #endif
25 #if defined(MOZ_ENABLE_PROFILER_SPS) && defined(MOZ_PROFILING) && defined(XP_WIN)
26 #define REPORT_CHROME_HANGS
27 #endif
29 namespace mozilla { namespace HangMonitor {
31 /**
32 * A flag which may be set from within a debugger to disable the hang
33 * monitor.
34 */
35 volatile bool gDebugDisableHangMonitor = false;
37 const char kHangMonitorPrefName[] = "hangmonitor.timeout";
39 const char kTelemetryPrefName[] = "toolkit.telemetry.enabled";
41 // Monitor protects gShutdown and gTimeout, but not gTimestamp which rely on
42 // being atomically set by the processor; synchronization doesn't really matter
43 // in this use case.
44 Monitor* gMonitor;
46 // The timeout preference, in seconds.
47 int32_t gTimeout;
49 PRThread* gThread;
51 // Set when shutdown begins to signal the thread to exit immediately.
52 bool gShutdown;
54 // The timestamp of the last event notification, or PR_INTERVAL_NO_WAIT if
55 // we're currently not processing events.
56 Atomic<PRIntervalTime> gTimestamp(PR_INTERVAL_NO_WAIT);
58 #ifdef REPORT_CHROME_HANGS
59 // Main thread ID used in reporting chrome hangs under Windows
60 static HANDLE winMainThreadHandle = nullptr;
62 // Default timeout for reporting chrome hangs to Telemetry (5 seconds)
63 static const int32_t DEFAULT_CHROME_HANG_INTERVAL = 5;
65 // Maximum number of PCs to gather from the stack
66 static const int32_t MAX_CALL_STACK_PCS = 400;
67 #endif
69 // PrefChangedFunc
70 void
71 PrefChanged(const char*, void*)
72 {
73 int32_t newval = Preferences::GetInt(kHangMonitorPrefName);
74 #ifdef REPORT_CHROME_HANGS
75 // Monitor chrome hangs on the profiling branch if Telemetry enabled
76 if (newval == 0) {
77 bool telemetryEnabled = Preferences::GetBool(kTelemetryPrefName);
78 if (telemetryEnabled) {
79 newval = DEFAULT_CHROME_HANG_INTERVAL;
80 }
81 }
82 #endif
83 MonitorAutoLock lock(*gMonitor);
84 if (newval != gTimeout) {
85 gTimeout = newval;
86 lock.Notify();
87 }
88 }
90 void
91 Crash()
92 {
93 if (gDebugDisableHangMonitor) {
94 return;
95 }
97 #ifdef XP_WIN
98 if (::IsDebuggerPresent()) {
99 return;
100 }
101 #endif
103 #ifdef MOZ_CRASHREPORTER
104 CrashReporter::AnnotateCrashReport(NS_LITERAL_CSTRING("Hang"),
105 NS_LITERAL_CSTRING("1"));
106 #endif
108 NS_RUNTIMEABORT("HangMonitor triggered");
109 }
111 #ifdef REPORT_CHROME_HANGS
112 static void
113 ChromeStackWalker(void *aPC, void *aSP, void *aClosure)
114 {
115 MOZ_ASSERT(aClosure);
116 std::vector<uintptr_t> *stack =
117 static_cast<std::vector<uintptr_t>*>(aClosure);
118 if (stack->size() == MAX_CALL_STACK_PCS)
119 return;
120 MOZ_ASSERT(stack->size() < MAX_CALL_STACK_PCS);
121 stack->push_back(reinterpret_cast<uintptr_t>(aPC));
122 }
124 static void
125 GetChromeHangReport(Telemetry::ProcessedStack &aStack,
126 int32_t &aSystemUptime,
127 int32_t &aFirefoxUptime)
128 {
129 MOZ_ASSERT(winMainThreadHandle);
131 // The thread we're about to suspend might have the alloc lock
132 // so allocate ahead of time
133 std::vector<uintptr_t> rawStack;
134 rawStack.reserve(MAX_CALL_STACK_PCS);
135 DWORD ret = ::SuspendThread(winMainThreadHandle);
136 if (ret == -1)
137 return;
138 NS_StackWalk(ChromeStackWalker, /* skipFrames */ 0, /* maxFrames */ 0,
139 reinterpret_cast<void*>(&rawStack),
140 reinterpret_cast<uintptr_t>(winMainThreadHandle), nullptr);
141 ret = ::ResumeThread(winMainThreadHandle);
142 if (ret == -1)
143 return;
144 aStack = Telemetry::GetStackAndModules(rawStack);
146 // Record system uptime (in minutes) at the time of the hang
147 aSystemUptime = ((GetTickCount() / 1000) - (gTimeout * 2)) / 60;
149 // Record Firefox uptime (in minutes) at the time of the hang
150 bool error;
151 TimeStamp processCreation = TimeStamp::ProcessCreation(error);
152 if (!error) {
153 TimeDuration td = TimeStamp::Now() - processCreation;
154 aFirefoxUptime = (static_cast<int32_t>(td.ToSeconds()) - (gTimeout * 2)) / 60;
155 } else {
156 aFirefoxUptime = -1;
157 }
158 }
159 #endif
161 void
162 ThreadMain(void*)
163 {
164 PR_SetCurrentThreadName("Hang Monitor");
166 MonitorAutoLock lock(*gMonitor);
168 // In order to avoid issues with the hang monitor incorrectly triggering
169 // during a general system stop such as sleeping, the monitor thread must
170 // run twice to trigger hang protection.
171 PRIntervalTime lastTimestamp = 0;
172 int waitCount = 0;
174 #ifdef REPORT_CHROME_HANGS
175 Telemetry::ProcessedStack stack;
176 int32_t systemUptime = -1;
177 int32_t firefoxUptime = -1;
178 #endif
180 while (true) {
181 if (gShutdown) {
182 return; // Exit the thread
183 }
185 // avoid rereading the volatile value in this loop
186 PRIntervalTime timestamp = gTimestamp;
188 PRIntervalTime now = PR_IntervalNow();
190 if (timestamp != PR_INTERVAL_NO_WAIT &&
191 now < timestamp) {
192 // 32-bit overflow, reset for another waiting period
193 timestamp = 1; // lowest legal PRInterval value
194 }
196 if (timestamp != PR_INTERVAL_NO_WAIT &&
197 timestamp == lastTimestamp &&
198 gTimeout > 0) {
199 ++waitCount;
200 #ifdef REPORT_CHROME_HANGS
201 // Capture the chrome-hang stack + Firefox & system uptimes after
202 // the minimum hang duration has been reached (not when the hang ends)
203 if (waitCount == 2) {
204 GetChromeHangReport(stack, systemUptime, firefoxUptime);
205 }
206 #else
207 // This is the crash-on-hang feature.
208 // See bug 867313 for the quirk in the waitCount comparison
209 if (waitCount >= 2) {
210 int32_t delay =
211 int32_t(PR_IntervalToSeconds(now - timestamp));
212 if (delay >= gTimeout) {
213 MonitorAutoUnlock unlock(*gMonitor);
214 Crash();
215 }
216 }
217 #endif
218 }
219 else {
220 #ifdef REPORT_CHROME_HANGS
221 if (waitCount >= 2) {
222 uint32_t hangDuration = PR_IntervalToSeconds(now - lastTimestamp);
223 Telemetry::RecordChromeHang(hangDuration, stack,
224 systemUptime, firefoxUptime);
225 stack.Clear();
226 }
227 #endif
228 lastTimestamp = timestamp;
229 waitCount = 0;
230 }
232 PRIntervalTime timeout;
233 if (gTimeout <= 0) {
234 timeout = PR_INTERVAL_NO_TIMEOUT;
235 }
236 else {
237 timeout = PR_MillisecondsToInterval(gTimeout * 500);
238 }
239 lock.Wait(timeout);
240 }
241 }
243 void
244 Startup()
245 {
246 // The hang detector only runs in chrome processes. If you change this,
247 // you must also deal with the threadsafety of AnnotateCrashReport in
248 // non-chrome processes!
249 if (GeckoProcessType_Default != XRE_GetProcessType())
250 return;
252 MOZ_ASSERT(!gMonitor, "Hang monitor already initialized");
253 gMonitor = new Monitor("HangMonitor");
255 Preferences::RegisterCallback(PrefChanged, kHangMonitorPrefName, nullptr);
256 PrefChanged(nullptr, nullptr);
258 #ifdef REPORT_CHROME_HANGS
259 Preferences::RegisterCallback(PrefChanged, kTelemetryPrefName, nullptr);
260 winMainThreadHandle =
261 OpenThread(THREAD_ALL_ACCESS, FALSE, GetCurrentThreadId());
262 if (!winMainThreadHandle)
263 return;
264 #endif
266 // Don't actually start measuring hangs until we hit the main event loop.
267 // This potentially misses a small class of really early startup hangs,
268 // but avoids dealing with some xpcshell tests and other situations which
269 // start XPCOM but don't ever start the event loop.
270 Suspend();
272 gThread = PR_CreateThread(PR_USER_THREAD,
273 ThreadMain,
274 nullptr, PR_PRIORITY_LOW, PR_GLOBAL_THREAD,
275 PR_JOINABLE_THREAD, 0);
276 }
278 void
279 Shutdown()
280 {
281 if (GeckoProcessType_Default != XRE_GetProcessType())
282 return;
284 MOZ_ASSERT(gMonitor, "Hang monitor not started");
286 { // Scope the lock we're going to delete later
287 MonitorAutoLock lock(*gMonitor);
288 gShutdown = true;
289 lock.Notify();
290 }
292 // thread creation could theoretically fail
293 if (gThread) {
294 PR_JoinThread(gThread);
295 gThread = nullptr;
296 }
298 delete gMonitor;
299 gMonitor = nullptr;
300 }
302 static bool
303 IsUIMessageWaiting()
304 {
305 #ifndef XP_WIN
306 return false;
307 #else
308 #define NS_WM_IMEFIRST WM_IME_SETCONTEXT
309 #define NS_WM_IMELAST WM_IME_KEYUP
310 BOOL haveUIMessageWaiting = FALSE;
311 MSG msg;
312 haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_KEYFIRST,
313 WM_IME_KEYLAST, PM_NOREMOVE);
314 haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, NS_WM_IMEFIRST,
315 NS_WM_IMELAST, PM_NOREMOVE);
316 haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_MOUSEFIRST,
317 WM_MOUSELAST, PM_NOREMOVE);
318 return haveUIMessageWaiting;
319 #endif
320 }
322 void
323 NotifyActivity(ActivityType activityType)
324 {
325 MOZ_ASSERT(NS_IsMainThread(),
326 "HangMonitor::Notify called from off the main thread.");
328 // Determine the activity type more specifically
329 if (activityType == kGeneralActivity) {
330 activityType = IsUIMessageWaiting() ? kActivityUIAVail :
331 kActivityNoUIAVail;
332 }
334 // Calculate the cumulative amount of lag time since the last UI message
335 static uint32_t cumulativeUILagMS = 0;
336 switch(activityType) {
337 case kActivityNoUIAVail:
338 cumulativeUILagMS = 0;
339 break;
340 case kActivityUIAVail:
341 case kUIActivity:
342 if (gTimestamp != PR_INTERVAL_NO_WAIT) {
343 cumulativeUILagMS += PR_IntervalToMilliseconds(PR_IntervalNow() -
344 gTimestamp);
345 }
346 break;
347 default:
348 break;
349 }
351 // This is not a locked activity because PRTimeStamp is a 32-bit quantity
352 // which can be read/written atomically, and we don't want to pay locking
353 // penalties here.
354 gTimestamp = PR_IntervalNow();
356 // If we have UI activity we should reset the timer and report it if it is
357 // significant enough.
358 if (activityType == kUIActivity) {
359 // The minimum amount of lag time that we should report for telemetry data.
360 // Mozilla's UI responsiveness goal is 50ms
361 static const uint32_t kUIResponsivenessThresholdMS = 50;
362 if (cumulativeUILagMS > kUIResponsivenessThresholdMS) {
363 mozilla::Telemetry::Accumulate(mozilla::Telemetry::EVENTLOOP_UI_LAG_EXP_MS,
364 cumulativeUILagMS);
365 }
366 cumulativeUILagMS = 0;
367 }
369 if (gThread && !gShutdown) {
370 mozilla::BackgroundHangMonitor().NotifyActivity();
371 }
372 }
374 void
375 Suspend()
376 {
377 MOZ_ASSERT(NS_IsMainThread(),
378 "HangMonitor::Suspend called from off the main thread.");
380 // Because gTimestamp changes this resets the wait count.
381 gTimestamp = PR_INTERVAL_NO_WAIT;
383 if (gThread && !gShutdown) {
384 mozilla::BackgroundHangMonitor().NotifyWait();
385 }
386 }
388 } } // namespace mozilla::HangMonitor