xpcom/threads/HangMonitor.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #include "mozilla/HangMonitor.h"
michael@0 7 #include "mozilla/BackgroundHangMonitor.h"
michael@0 8 #include "mozilla/Monitor.h"
michael@0 9 #include "mozilla/Preferences.h"
michael@0 10 #include "mozilla/Telemetry.h"
michael@0 11 #include "mozilla/ProcessedStack.h"
michael@0 12 #include "mozilla/Atomics.h"
michael@0 13 #include "nsXULAppAPI.h"
michael@0 14 #include "nsThreadUtils.h"
michael@0 15 #include "nsStackWalk.h"
michael@0 16
michael@0 17 #ifdef MOZ_CRASHREPORTER
michael@0 18 #include "nsExceptionHandler.h"
michael@0 19 #endif
michael@0 20
michael@0 21 #ifdef XP_WIN
michael@0 22 #include <windows.h>
michael@0 23 #endif
michael@0 24
michael@0 25 #if defined(MOZ_ENABLE_PROFILER_SPS) && defined(MOZ_PROFILING) && defined(XP_WIN)
michael@0 26 #define REPORT_CHROME_HANGS
michael@0 27 #endif
michael@0 28
michael@0 29 namespace mozilla { namespace HangMonitor {
michael@0 30
michael@0 31 /**
michael@0 32 * A flag which may be set from within a debugger to disable the hang
michael@0 33 * monitor.
michael@0 34 */
michael@0 35 volatile bool gDebugDisableHangMonitor = false;
michael@0 36
michael@0 37 const char kHangMonitorPrefName[] = "hangmonitor.timeout";
michael@0 38
michael@0 39 const char kTelemetryPrefName[] = "toolkit.telemetry.enabled";
michael@0 40
michael@0 41 // Monitor protects gShutdown and gTimeout, but not gTimestamp which rely on
michael@0 42 // being atomically set by the processor; synchronization doesn't really matter
michael@0 43 // in this use case.
michael@0 44 Monitor* gMonitor;
michael@0 45
michael@0 46 // The timeout preference, in seconds.
michael@0 47 int32_t gTimeout;
michael@0 48
michael@0 49 PRThread* gThread;
michael@0 50
michael@0 51 // Set when shutdown begins to signal the thread to exit immediately.
michael@0 52 bool gShutdown;
michael@0 53
michael@0 54 // The timestamp of the last event notification, or PR_INTERVAL_NO_WAIT if
michael@0 55 // we're currently not processing events.
michael@0 56 Atomic<PRIntervalTime> gTimestamp(PR_INTERVAL_NO_WAIT);
michael@0 57
michael@0 58 #ifdef REPORT_CHROME_HANGS
michael@0 59 // Main thread ID used in reporting chrome hangs under Windows
michael@0 60 static HANDLE winMainThreadHandle = nullptr;
michael@0 61
michael@0 62 // Default timeout for reporting chrome hangs to Telemetry (5 seconds)
michael@0 63 static const int32_t DEFAULT_CHROME_HANG_INTERVAL = 5;
michael@0 64
michael@0 65 // Maximum number of PCs to gather from the stack
michael@0 66 static const int32_t MAX_CALL_STACK_PCS = 400;
michael@0 67 #endif
michael@0 68
michael@0 69 // PrefChangedFunc
michael@0 70 void
michael@0 71 PrefChanged(const char*, void*)
michael@0 72 {
michael@0 73 int32_t newval = Preferences::GetInt(kHangMonitorPrefName);
michael@0 74 #ifdef REPORT_CHROME_HANGS
michael@0 75 // Monitor chrome hangs on the profiling branch if Telemetry enabled
michael@0 76 if (newval == 0) {
michael@0 77 bool telemetryEnabled = Preferences::GetBool(kTelemetryPrefName);
michael@0 78 if (telemetryEnabled) {
michael@0 79 newval = DEFAULT_CHROME_HANG_INTERVAL;
michael@0 80 }
michael@0 81 }
michael@0 82 #endif
michael@0 83 MonitorAutoLock lock(*gMonitor);
michael@0 84 if (newval != gTimeout) {
michael@0 85 gTimeout = newval;
michael@0 86 lock.Notify();
michael@0 87 }
michael@0 88 }
michael@0 89
michael@0 90 void
michael@0 91 Crash()
michael@0 92 {
michael@0 93 if (gDebugDisableHangMonitor) {
michael@0 94 return;
michael@0 95 }
michael@0 96
michael@0 97 #ifdef XP_WIN
michael@0 98 if (::IsDebuggerPresent()) {
michael@0 99 return;
michael@0 100 }
michael@0 101 #endif
michael@0 102
michael@0 103 #ifdef MOZ_CRASHREPORTER
michael@0 104 CrashReporter::AnnotateCrashReport(NS_LITERAL_CSTRING("Hang"),
michael@0 105 NS_LITERAL_CSTRING("1"));
michael@0 106 #endif
michael@0 107
michael@0 108 NS_RUNTIMEABORT("HangMonitor triggered");
michael@0 109 }
michael@0 110
michael@0 111 #ifdef REPORT_CHROME_HANGS
michael@0 112 static void
michael@0 113 ChromeStackWalker(void *aPC, void *aSP, void *aClosure)
michael@0 114 {
michael@0 115 MOZ_ASSERT(aClosure);
michael@0 116 std::vector<uintptr_t> *stack =
michael@0 117 static_cast<std::vector<uintptr_t>*>(aClosure);
michael@0 118 if (stack->size() == MAX_CALL_STACK_PCS)
michael@0 119 return;
michael@0 120 MOZ_ASSERT(stack->size() < MAX_CALL_STACK_PCS);
michael@0 121 stack->push_back(reinterpret_cast<uintptr_t>(aPC));
michael@0 122 }
michael@0 123
michael@0 124 static void
michael@0 125 GetChromeHangReport(Telemetry::ProcessedStack &aStack,
michael@0 126 int32_t &aSystemUptime,
michael@0 127 int32_t &aFirefoxUptime)
michael@0 128 {
michael@0 129 MOZ_ASSERT(winMainThreadHandle);
michael@0 130
michael@0 131 // The thread we're about to suspend might have the alloc lock
michael@0 132 // so allocate ahead of time
michael@0 133 std::vector<uintptr_t> rawStack;
michael@0 134 rawStack.reserve(MAX_CALL_STACK_PCS);
michael@0 135 DWORD ret = ::SuspendThread(winMainThreadHandle);
michael@0 136 if (ret == -1)
michael@0 137 return;
michael@0 138 NS_StackWalk(ChromeStackWalker, /* skipFrames */ 0, /* maxFrames */ 0,
michael@0 139 reinterpret_cast<void*>(&rawStack),
michael@0 140 reinterpret_cast<uintptr_t>(winMainThreadHandle), nullptr);
michael@0 141 ret = ::ResumeThread(winMainThreadHandle);
michael@0 142 if (ret == -1)
michael@0 143 return;
michael@0 144 aStack = Telemetry::GetStackAndModules(rawStack);
michael@0 145
michael@0 146 // Record system uptime (in minutes) at the time of the hang
michael@0 147 aSystemUptime = ((GetTickCount() / 1000) - (gTimeout * 2)) / 60;
michael@0 148
michael@0 149 // Record Firefox uptime (in minutes) at the time of the hang
michael@0 150 bool error;
michael@0 151 TimeStamp processCreation = TimeStamp::ProcessCreation(error);
michael@0 152 if (!error) {
michael@0 153 TimeDuration td = TimeStamp::Now() - processCreation;
michael@0 154 aFirefoxUptime = (static_cast<int32_t>(td.ToSeconds()) - (gTimeout * 2)) / 60;
michael@0 155 } else {
michael@0 156 aFirefoxUptime = -1;
michael@0 157 }
michael@0 158 }
michael@0 159 #endif
michael@0 160
michael@0 161 void
michael@0 162 ThreadMain(void*)
michael@0 163 {
michael@0 164 PR_SetCurrentThreadName("Hang Monitor");
michael@0 165
michael@0 166 MonitorAutoLock lock(*gMonitor);
michael@0 167
michael@0 168 // In order to avoid issues with the hang monitor incorrectly triggering
michael@0 169 // during a general system stop such as sleeping, the monitor thread must
michael@0 170 // run twice to trigger hang protection.
michael@0 171 PRIntervalTime lastTimestamp = 0;
michael@0 172 int waitCount = 0;
michael@0 173
michael@0 174 #ifdef REPORT_CHROME_HANGS
michael@0 175 Telemetry::ProcessedStack stack;
michael@0 176 int32_t systemUptime = -1;
michael@0 177 int32_t firefoxUptime = -1;
michael@0 178 #endif
michael@0 179
michael@0 180 while (true) {
michael@0 181 if (gShutdown) {
michael@0 182 return; // Exit the thread
michael@0 183 }
michael@0 184
michael@0 185 // avoid rereading the volatile value in this loop
michael@0 186 PRIntervalTime timestamp = gTimestamp;
michael@0 187
michael@0 188 PRIntervalTime now = PR_IntervalNow();
michael@0 189
michael@0 190 if (timestamp != PR_INTERVAL_NO_WAIT &&
michael@0 191 now < timestamp) {
michael@0 192 // 32-bit overflow, reset for another waiting period
michael@0 193 timestamp = 1; // lowest legal PRInterval value
michael@0 194 }
michael@0 195
michael@0 196 if (timestamp != PR_INTERVAL_NO_WAIT &&
michael@0 197 timestamp == lastTimestamp &&
michael@0 198 gTimeout > 0) {
michael@0 199 ++waitCount;
michael@0 200 #ifdef REPORT_CHROME_HANGS
michael@0 201 // Capture the chrome-hang stack + Firefox & system uptimes after
michael@0 202 // the minimum hang duration has been reached (not when the hang ends)
michael@0 203 if (waitCount == 2) {
michael@0 204 GetChromeHangReport(stack, systemUptime, firefoxUptime);
michael@0 205 }
michael@0 206 #else
michael@0 207 // This is the crash-on-hang feature.
michael@0 208 // See bug 867313 for the quirk in the waitCount comparison
michael@0 209 if (waitCount >= 2) {
michael@0 210 int32_t delay =
michael@0 211 int32_t(PR_IntervalToSeconds(now - timestamp));
michael@0 212 if (delay >= gTimeout) {
michael@0 213 MonitorAutoUnlock unlock(*gMonitor);
michael@0 214 Crash();
michael@0 215 }
michael@0 216 }
michael@0 217 #endif
michael@0 218 }
michael@0 219 else {
michael@0 220 #ifdef REPORT_CHROME_HANGS
michael@0 221 if (waitCount >= 2) {
michael@0 222 uint32_t hangDuration = PR_IntervalToSeconds(now - lastTimestamp);
michael@0 223 Telemetry::RecordChromeHang(hangDuration, stack,
michael@0 224 systemUptime, firefoxUptime);
michael@0 225 stack.Clear();
michael@0 226 }
michael@0 227 #endif
michael@0 228 lastTimestamp = timestamp;
michael@0 229 waitCount = 0;
michael@0 230 }
michael@0 231
michael@0 232 PRIntervalTime timeout;
michael@0 233 if (gTimeout <= 0) {
michael@0 234 timeout = PR_INTERVAL_NO_TIMEOUT;
michael@0 235 }
michael@0 236 else {
michael@0 237 timeout = PR_MillisecondsToInterval(gTimeout * 500);
michael@0 238 }
michael@0 239 lock.Wait(timeout);
michael@0 240 }
michael@0 241 }
michael@0 242
michael@0 243 void
michael@0 244 Startup()
michael@0 245 {
michael@0 246 // The hang detector only runs in chrome processes. If you change this,
michael@0 247 // you must also deal with the threadsafety of AnnotateCrashReport in
michael@0 248 // non-chrome processes!
michael@0 249 if (GeckoProcessType_Default != XRE_GetProcessType())
michael@0 250 return;
michael@0 251
michael@0 252 MOZ_ASSERT(!gMonitor, "Hang monitor already initialized");
michael@0 253 gMonitor = new Monitor("HangMonitor");
michael@0 254
michael@0 255 Preferences::RegisterCallback(PrefChanged, kHangMonitorPrefName, nullptr);
michael@0 256 PrefChanged(nullptr, nullptr);
michael@0 257
michael@0 258 #ifdef REPORT_CHROME_HANGS
michael@0 259 Preferences::RegisterCallback(PrefChanged, kTelemetryPrefName, nullptr);
michael@0 260 winMainThreadHandle =
michael@0 261 OpenThread(THREAD_ALL_ACCESS, FALSE, GetCurrentThreadId());
michael@0 262 if (!winMainThreadHandle)
michael@0 263 return;
michael@0 264 #endif
michael@0 265
michael@0 266 // Don't actually start measuring hangs until we hit the main event loop.
michael@0 267 // This potentially misses a small class of really early startup hangs,
michael@0 268 // but avoids dealing with some xpcshell tests and other situations which
michael@0 269 // start XPCOM but don't ever start the event loop.
michael@0 270 Suspend();
michael@0 271
michael@0 272 gThread = PR_CreateThread(PR_USER_THREAD,
michael@0 273 ThreadMain,
michael@0 274 nullptr, PR_PRIORITY_LOW, PR_GLOBAL_THREAD,
michael@0 275 PR_JOINABLE_THREAD, 0);
michael@0 276 }
michael@0 277
michael@0 278 void
michael@0 279 Shutdown()
michael@0 280 {
michael@0 281 if (GeckoProcessType_Default != XRE_GetProcessType())
michael@0 282 return;
michael@0 283
michael@0 284 MOZ_ASSERT(gMonitor, "Hang monitor not started");
michael@0 285
michael@0 286 { // Scope the lock we're going to delete later
michael@0 287 MonitorAutoLock lock(*gMonitor);
michael@0 288 gShutdown = true;
michael@0 289 lock.Notify();
michael@0 290 }
michael@0 291
michael@0 292 // thread creation could theoretically fail
michael@0 293 if (gThread) {
michael@0 294 PR_JoinThread(gThread);
michael@0 295 gThread = nullptr;
michael@0 296 }
michael@0 297
michael@0 298 delete gMonitor;
michael@0 299 gMonitor = nullptr;
michael@0 300 }
michael@0 301
michael@0 302 static bool
michael@0 303 IsUIMessageWaiting()
michael@0 304 {
michael@0 305 #ifndef XP_WIN
michael@0 306 return false;
michael@0 307 #else
michael@0 308 #define NS_WM_IMEFIRST WM_IME_SETCONTEXT
michael@0 309 #define NS_WM_IMELAST WM_IME_KEYUP
michael@0 310 BOOL haveUIMessageWaiting = FALSE;
michael@0 311 MSG msg;
michael@0 312 haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_KEYFIRST,
michael@0 313 WM_IME_KEYLAST, PM_NOREMOVE);
michael@0 314 haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, NS_WM_IMEFIRST,
michael@0 315 NS_WM_IMELAST, PM_NOREMOVE);
michael@0 316 haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_MOUSEFIRST,
michael@0 317 WM_MOUSELAST, PM_NOREMOVE);
michael@0 318 return haveUIMessageWaiting;
michael@0 319 #endif
michael@0 320 }
michael@0 321
michael@0 322 void
michael@0 323 NotifyActivity(ActivityType activityType)
michael@0 324 {
michael@0 325 MOZ_ASSERT(NS_IsMainThread(),
michael@0 326 "HangMonitor::Notify called from off the main thread.");
michael@0 327
michael@0 328 // Determine the activity type more specifically
michael@0 329 if (activityType == kGeneralActivity) {
michael@0 330 activityType = IsUIMessageWaiting() ? kActivityUIAVail :
michael@0 331 kActivityNoUIAVail;
michael@0 332 }
michael@0 333
michael@0 334 // Calculate the cumulative amount of lag time since the last UI message
michael@0 335 static uint32_t cumulativeUILagMS = 0;
michael@0 336 switch(activityType) {
michael@0 337 case kActivityNoUIAVail:
michael@0 338 cumulativeUILagMS = 0;
michael@0 339 break;
michael@0 340 case kActivityUIAVail:
michael@0 341 case kUIActivity:
michael@0 342 if (gTimestamp != PR_INTERVAL_NO_WAIT) {
michael@0 343 cumulativeUILagMS += PR_IntervalToMilliseconds(PR_IntervalNow() -
michael@0 344 gTimestamp);
michael@0 345 }
michael@0 346 break;
michael@0 347 default:
michael@0 348 break;
michael@0 349 }
michael@0 350
michael@0 351 // This is not a locked activity because PRTimeStamp is a 32-bit quantity
michael@0 352 // which can be read/written atomically, and we don't want to pay locking
michael@0 353 // penalties here.
michael@0 354 gTimestamp = PR_IntervalNow();
michael@0 355
michael@0 356 // If we have UI activity we should reset the timer and report it if it is
michael@0 357 // significant enough.
michael@0 358 if (activityType == kUIActivity) {
michael@0 359 // The minimum amount of lag time that we should report for telemetry data.
michael@0 360 // Mozilla's UI responsiveness goal is 50ms
michael@0 361 static const uint32_t kUIResponsivenessThresholdMS = 50;
michael@0 362 if (cumulativeUILagMS > kUIResponsivenessThresholdMS) {
michael@0 363 mozilla::Telemetry::Accumulate(mozilla::Telemetry::EVENTLOOP_UI_LAG_EXP_MS,
michael@0 364 cumulativeUILagMS);
michael@0 365 }
michael@0 366 cumulativeUILagMS = 0;
michael@0 367 }
michael@0 368
michael@0 369 if (gThread && !gShutdown) {
michael@0 370 mozilla::BackgroundHangMonitor().NotifyActivity();
michael@0 371 }
michael@0 372 }
michael@0 373
michael@0 374 void
michael@0 375 Suspend()
michael@0 376 {
michael@0 377 MOZ_ASSERT(NS_IsMainThread(),
michael@0 378 "HangMonitor::Suspend called from off the main thread.");
michael@0 379
michael@0 380 // Because gTimestamp changes this resets the wait count.
michael@0 381 gTimestamp = PR_INTERVAL_NO_WAIT;
michael@0 382
michael@0 383 if (gThread && !gShutdown) {
michael@0 384 mozilla::BackgroundHangMonitor().NotifyWait();
michael@0 385 }
michael@0 386 }
michael@0 387
michael@0 388 } } // namespace mozilla::HangMonitor

mercurial