1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/xpcom/threads/HangMonitor.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,388 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "mozilla/HangMonitor.h" 1.10 +#include "mozilla/BackgroundHangMonitor.h" 1.11 +#include "mozilla/Monitor.h" 1.12 +#include "mozilla/Preferences.h" 1.13 +#include "mozilla/Telemetry.h" 1.14 +#include "mozilla/ProcessedStack.h" 1.15 +#include "mozilla/Atomics.h" 1.16 +#include "nsXULAppAPI.h" 1.17 +#include "nsThreadUtils.h" 1.18 +#include "nsStackWalk.h" 1.19 + 1.20 +#ifdef MOZ_CRASHREPORTER 1.21 +#include "nsExceptionHandler.h" 1.22 +#endif 1.23 + 1.24 +#ifdef XP_WIN 1.25 +#include <windows.h> 1.26 +#endif 1.27 + 1.28 +#if defined(MOZ_ENABLE_PROFILER_SPS) && defined(MOZ_PROFILING) && defined(XP_WIN) 1.29 + #define REPORT_CHROME_HANGS 1.30 +#endif 1.31 + 1.32 +namespace mozilla { namespace HangMonitor { 1.33 + 1.34 +/** 1.35 + * A flag which may be set from within a debugger to disable the hang 1.36 + * monitor. 1.37 + */ 1.38 +volatile bool gDebugDisableHangMonitor = false; 1.39 + 1.40 +const char kHangMonitorPrefName[] = "hangmonitor.timeout"; 1.41 + 1.42 +const char kTelemetryPrefName[] = "toolkit.telemetry.enabled"; 1.43 + 1.44 +// Monitor protects gShutdown and gTimeout, but not gTimestamp which rely on 1.45 +// being atomically set by the processor; synchronization doesn't really matter 1.46 +// in this use case. 1.47 +Monitor* gMonitor; 1.48 + 1.49 +// The timeout preference, in seconds. 1.50 +int32_t gTimeout; 1.51 + 1.52 +PRThread* gThread; 1.53 + 1.54 +// Set when shutdown begins to signal the thread to exit immediately. 1.55 +bool gShutdown; 1.56 + 1.57 +// The timestamp of the last event notification, or PR_INTERVAL_NO_WAIT if 1.58 +// we're currently not processing events. 1.59 +Atomic<PRIntervalTime> gTimestamp(PR_INTERVAL_NO_WAIT); 1.60 + 1.61 +#ifdef REPORT_CHROME_HANGS 1.62 +// Main thread ID used in reporting chrome hangs under Windows 1.63 +static HANDLE winMainThreadHandle = nullptr; 1.64 + 1.65 +// Default timeout for reporting chrome hangs to Telemetry (5 seconds) 1.66 +static const int32_t DEFAULT_CHROME_HANG_INTERVAL = 5; 1.67 + 1.68 +// Maximum number of PCs to gather from the stack 1.69 +static const int32_t MAX_CALL_STACK_PCS = 400; 1.70 +#endif 1.71 + 1.72 +// PrefChangedFunc 1.73 +void 1.74 +PrefChanged(const char*, void*) 1.75 +{ 1.76 + int32_t newval = Preferences::GetInt(kHangMonitorPrefName); 1.77 +#ifdef REPORT_CHROME_HANGS 1.78 + // Monitor chrome hangs on the profiling branch if Telemetry enabled 1.79 + if (newval == 0) { 1.80 + bool telemetryEnabled = Preferences::GetBool(kTelemetryPrefName); 1.81 + if (telemetryEnabled) { 1.82 + newval = DEFAULT_CHROME_HANG_INTERVAL; 1.83 + } 1.84 + } 1.85 +#endif 1.86 + MonitorAutoLock lock(*gMonitor); 1.87 + if (newval != gTimeout) { 1.88 + gTimeout = newval; 1.89 + lock.Notify(); 1.90 + } 1.91 +} 1.92 + 1.93 +void 1.94 +Crash() 1.95 +{ 1.96 + if (gDebugDisableHangMonitor) { 1.97 + return; 1.98 + } 1.99 + 1.100 +#ifdef XP_WIN 1.101 + if (::IsDebuggerPresent()) { 1.102 + return; 1.103 + } 1.104 +#endif 1.105 + 1.106 +#ifdef MOZ_CRASHREPORTER 1.107 + CrashReporter::AnnotateCrashReport(NS_LITERAL_CSTRING("Hang"), 1.108 + NS_LITERAL_CSTRING("1")); 1.109 +#endif 1.110 + 1.111 + NS_RUNTIMEABORT("HangMonitor triggered"); 1.112 +} 1.113 + 1.114 +#ifdef REPORT_CHROME_HANGS 1.115 +static void 1.116 +ChromeStackWalker(void *aPC, void *aSP, void *aClosure) 1.117 +{ 1.118 + MOZ_ASSERT(aClosure); 1.119 + std::vector<uintptr_t> *stack = 1.120 + static_cast<std::vector<uintptr_t>*>(aClosure); 1.121 + if (stack->size() == MAX_CALL_STACK_PCS) 1.122 + return; 1.123 + MOZ_ASSERT(stack->size() < MAX_CALL_STACK_PCS); 1.124 + stack->push_back(reinterpret_cast<uintptr_t>(aPC)); 1.125 +} 1.126 + 1.127 +static void 1.128 +GetChromeHangReport(Telemetry::ProcessedStack &aStack, 1.129 + int32_t &aSystemUptime, 1.130 + int32_t &aFirefoxUptime) 1.131 +{ 1.132 + MOZ_ASSERT(winMainThreadHandle); 1.133 + 1.134 + // The thread we're about to suspend might have the alloc lock 1.135 + // so allocate ahead of time 1.136 + std::vector<uintptr_t> rawStack; 1.137 + rawStack.reserve(MAX_CALL_STACK_PCS); 1.138 + DWORD ret = ::SuspendThread(winMainThreadHandle); 1.139 + if (ret == -1) 1.140 + return; 1.141 + NS_StackWalk(ChromeStackWalker, /* skipFrames */ 0, /* maxFrames */ 0, 1.142 + reinterpret_cast<void*>(&rawStack), 1.143 + reinterpret_cast<uintptr_t>(winMainThreadHandle), nullptr); 1.144 + ret = ::ResumeThread(winMainThreadHandle); 1.145 + if (ret == -1) 1.146 + return; 1.147 + aStack = Telemetry::GetStackAndModules(rawStack); 1.148 + 1.149 + // Record system uptime (in minutes) at the time of the hang 1.150 + aSystemUptime = ((GetTickCount() / 1000) - (gTimeout * 2)) / 60; 1.151 + 1.152 + // Record Firefox uptime (in minutes) at the time of the hang 1.153 + bool error; 1.154 + TimeStamp processCreation = TimeStamp::ProcessCreation(error); 1.155 + if (!error) { 1.156 + TimeDuration td = TimeStamp::Now() - processCreation; 1.157 + aFirefoxUptime = (static_cast<int32_t>(td.ToSeconds()) - (gTimeout * 2)) / 60; 1.158 + } else { 1.159 + aFirefoxUptime = -1; 1.160 + } 1.161 +} 1.162 +#endif 1.163 + 1.164 +void 1.165 +ThreadMain(void*) 1.166 +{ 1.167 + PR_SetCurrentThreadName("Hang Monitor"); 1.168 + 1.169 + MonitorAutoLock lock(*gMonitor); 1.170 + 1.171 + // In order to avoid issues with the hang monitor incorrectly triggering 1.172 + // during a general system stop such as sleeping, the monitor thread must 1.173 + // run twice to trigger hang protection. 1.174 + PRIntervalTime lastTimestamp = 0; 1.175 + int waitCount = 0; 1.176 + 1.177 +#ifdef REPORT_CHROME_HANGS 1.178 + Telemetry::ProcessedStack stack; 1.179 + int32_t systemUptime = -1; 1.180 + int32_t firefoxUptime = -1; 1.181 +#endif 1.182 + 1.183 + while (true) { 1.184 + if (gShutdown) { 1.185 + return; // Exit the thread 1.186 + } 1.187 + 1.188 + // avoid rereading the volatile value in this loop 1.189 + PRIntervalTime timestamp = gTimestamp; 1.190 + 1.191 + PRIntervalTime now = PR_IntervalNow(); 1.192 + 1.193 + if (timestamp != PR_INTERVAL_NO_WAIT && 1.194 + now < timestamp) { 1.195 + // 32-bit overflow, reset for another waiting period 1.196 + timestamp = 1; // lowest legal PRInterval value 1.197 + } 1.198 + 1.199 + if (timestamp != PR_INTERVAL_NO_WAIT && 1.200 + timestamp == lastTimestamp && 1.201 + gTimeout > 0) { 1.202 + ++waitCount; 1.203 +#ifdef REPORT_CHROME_HANGS 1.204 + // Capture the chrome-hang stack + Firefox & system uptimes after 1.205 + // the minimum hang duration has been reached (not when the hang ends) 1.206 + if (waitCount == 2) { 1.207 + GetChromeHangReport(stack, systemUptime, firefoxUptime); 1.208 + } 1.209 +#else 1.210 + // This is the crash-on-hang feature. 1.211 + // See bug 867313 for the quirk in the waitCount comparison 1.212 + if (waitCount >= 2) { 1.213 + int32_t delay = 1.214 + int32_t(PR_IntervalToSeconds(now - timestamp)); 1.215 + if (delay >= gTimeout) { 1.216 + MonitorAutoUnlock unlock(*gMonitor); 1.217 + Crash(); 1.218 + } 1.219 + } 1.220 +#endif 1.221 + } 1.222 + else { 1.223 +#ifdef REPORT_CHROME_HANGS 1.224 + if (waitCount >= 2) { 1.225 + uint32_t hangDuration = PR_IntervalToSeconds(now - lastTimestamp); 1.226 + Telemetry::RecordChromeHang(hangDuration, stack, 1.227 + systemUptime, firefoxUptime); 1.228 + stack.Clear(); 1.229 + } 1.230 +#endif 1.231 + lastTimestamp = timestamp; 1.232 + waitCount = 0; 1.233 + } 1.234 + 1.235 + PRIntervalTime timeout; 1.236 + if (gTimeout <= 0) { 1.237 + timeout = PR_INTERVAL_NO_TIMEOUT; 1.238 + } 1.239 + else { 1.240 + timeout = PR_MillisecondsToInterval(gTimeout * 500); 1.241 + } 1.242 + lock.Wait(timeout); 1.243 + } 1.244 +} 1.245 + 1.246 +void 1.247 +Startup() 1.248 +{ 1.249 + // The hang detector only runs in chrome processes. If you change this, 1.250 + // you must also deal with the threadsafety of AnnotateCrashReport in 1.251 + // non-chrome processes! 1.252 + if (GeckoProcessType_Default != XRE_GetProcessType()) 1.253 + return; 1.254 + 1.255 + MOZ_ASSERT(!gMonitor, "Hang monitor already initialized"); 1.256 + gMonitor = new Monitor("HangMonitor"); 1.257 + 1.258 + Preferences::RegisterCallback(PrefChanged, kHangMonitorPrefName, nullptr); 1.259 + PrefChanged(nullptr, nullptr); 1.260 + 1.261 +#ifdef REPORT_CHROME_HANGS 1.262 + Preferences::RegisterCallback(PrefChanged, kTelemetryPrefName, nullptr); 1.263 + winMainThreadHandle = 1.264 + OpenThread(THREAD_ALL_ACCESS, FALSE, GetCurrentThreadId()); 1.265 + if (!winMainThreadHandle) 1.266 + return; 1.267 +#endif 1.268 + 1.269 + // Don't actually start measuring hangs until we hit the main event loop. 1.270 + // This potentially misses a small class of really early startup hangs, 1.271 + // but avoids dealing with some xpcshell tests and other situations which 1.272 + // start XPCOM but don't ever start the event loop. 1.273 + Suspend(); 1.274 + 1.275 + gThread = PR_CreateThread(PR_USER_THREAD, 1.276 + ThreadMain, 1.277 + nullptr, PR_PRIORITY_LOW, PR_GLOBAL_THREAD, 1.278 + PR_JOINABLE_THREAD, 0); 1.279 +} 1.280 + 1.281 +void 1.282 +Shutdown() 1.283 +{ 1.284 + if (GeckoProcessType_Default != XRE_GetProcessType()) 1.285 + return; 1.286 + 1.287 + MOZ_ASSERT(gMonitor, "Hang monitor not started"); 1.288 + 1.289 + { // Scope the lock we're going to delete later 1.290 + MonitorAutoLock lock(*gMonitor); 1.291 + gShutdown = true; 1.292 + lock.Notify(); 1.293 + } 1.294 + 1.295 + // thread creation could theoretically fail 1.296 + if (gThread) { 1.297 + PR_JoinThread(gThread); 1.298 + gThread = nullptr; 1.299 + } 1.300 + 1.301 + delete gMonitor; 1.302 + gMonitor = nullptr; 1.303 +} 1.304 + 1.305 +static bool 1.306 +IsUIMessageWaiting() 1.307 +{ 1.308 +#ifndef XP_WIN 1.309 + return false; 1.310 +#else 1.311 + #define NS_WM_IMEFIRST WM_IME_SETCONTEXT 1.312 + #define NS_WM_IMELAST WM_IME_KEYUP 1.313 + BOOL haveUIMessageWaiting = FALSE; 1.314 + MSG msg; 1.315 + haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_KEYFIRST, 1.316 + WM_IME_KEYLAST, PM_NOREMOVE); 1.317 + haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, NS_WM_IMEFIRST, 1.318 + NS_WM_IMELAST, PM_NOREMOVE); 1.319 + haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_MOUSEFIRST, 1.320 + WM_MOUSELAST, PM_NOREMOVE); 1.321 + return haveUIMessageWaiting; 1.322 +#endif 1.323 +} 1.324 + 1.325 +void 1.326 +NotifyActivity(ActivityType activityType) 1.327 +{ 1.328 + MOZ_ASSERT(NS_IsMainThread(), 1.329 + "HangMonitor::Notify called from off the main thread."); 1.330 + 1.331 + // Determine the activity type more specifically 1.332 + if (activityType == kGeneralActivity) { 1.333 + activityType = IsUIMessageWaiting() ? kActivityUIAVail : 1.334 + kActivityNoUIAVail; 1.335 + } 1.336 + 1.337 + // Calculate the cumulative amount of lag time since the last UI message 1.338 + static uint32_t cumulativeUILagMS = 0; 1.339 + switch(activityType) { 1.340 + case kActivityNoUIAVail: 1.341 + cumulativeUILagMS = 0; 1.342 + break; 1.343 + case kActivityUIAVail: 1.344 + case kUIActivity: 1.345 + if (gTimestamp != PR_INTERVAL_NO_WAIT) { 1.346 + cumulativeUILagMS += PR_IntervalToMilliseconds(PR_IntervalNow() - 1.347 + gTimestamp); 1.348 + } 1.349 + break; 1.350 + default: 1.351 + break; 1.352 + } 1.353 + 1.354 + // This is not a locked activity because PRTimeStamp is a 32-bit quantity 1.355 + // which can be read/written atomically, and we don't want to pay locking 1.356 + // penalties here. 1.357 + gTimestamp = PR_IntervalNow(); 1.358 + 1.359 + // If we have UI activity we should reset the timer and report it if it is 1.360 + // significant enough. 1.361 + if (activityType == kUIActivity) { 1.362 + // The minimum amount of lag time that we should report for telemetry data. 1.363 + // Mozilla's UI responsiveness goal is 50ms 1.364 + static const uint32_t kUIResponsivenessThresholdMS = 50; 1.365 + if (cumulativeUILagMS > kUIResponsivenessThresholdMS) { 1.366 + mozilla::Telemetry::Accumulate(mozilla::Telemetry::EVENTLOOP_UI_LAG_EXP_MS, 1.367 + cumulativeUILagMS); 1.368 + } 1.369 + cumulativeUILagMS = 0; 1.370 + } 1.371 + 1.372 + if (gThread && !gShutdown) { 1.373 + mozilla::BackgroundHangMonitor().NotifyActivity(); 1.374 + } 1.375 +} 1.376 + 1.377 +void 1.378 +Suspend() 1.379 +{ 1.380 + MOZ_ASSERT(NS_IsMainThread(), 1.381 + "HangMonitor::Suspend called from off the main thread."); 1.382 + 1.383 + // Because gTimestamp changes this resets the wait count. 1.384 + gTimestamp = PR_INTERVAL_NO_WAIT; 1.385 + 1.386 + if (gThread && !gShutdown) { 1.387 + mozilla::BackgroundHangMonitor().NotifyWait(); 1.388 + } 1.389 +} 1.390 + 1.391 +} } // namespace mozilla::HangMonitor