michael@0: /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "mozilla/HangMonitor.h" michael@0: #include "mozilla/BackgroundHangMonitor.h" michael@0: #include "mozilla/Monitor.h" michael@0: #include "mozilla/Preferences.h" michael@0: #include "mozilla/Telemetry.h" michael@0: #include "mozilla/ProcessedStack.h" michael@0: #include "mozilla/Atomics.h" michael@0: #include "nsXULAppAPI.h" michael@0: #include "nsThreadUtils.h" michael@0: #include "nsStackWalk.h" michael@0: michael@0: #ifdef MOZ_CRASHREPORTER michael@0: #include "nsExceptionHandler.h" michael@0: #endif michael@0: michael@0: #ifdef XP_WIN michael@0: #include michael@0: #endif michael@0: michael@0: #if defined(MOZ_ENABLE_PROFILER_SPS) && defined(MOZ_PROFILING) && defined(XP_WIN) michael@0: #define REPORT_CHROME_HANGS michael@0: #endif michael@0: michael@0: namespace mozilla { namespace HangMonitor { michael@0: michael@0: /** michael@0: * A flag which may be set from within a debugger to disable the hang michael@0: * monitor. michael@0: */ michael@0: volatile bool gDebugDisableHangMonitor = false; michael@0: michael@0: const char kHangMonitorPrefName[] = "hangmonitor.timeout"; michael@0: michael@0: const char kTelemetryPrefName[] = "toolkit.telemetry.enabled"; michael@0: michael@0: // Monitor protects gShutdown and gTimeout, but not gTimestamp which rely on michael@0: // being atomically set by the processor; synchronization doesn't really matter michael@0: // in this use case. michael@0: Monitor* gMonitor; michael@0: michael@0: // The timeout preference, in seconds. michael@0: int32_t gTimeout; michael@0: michael@0: PRThread* gThread; michael@0: michael@0: // Set when shutdown begins to signal the thread to exit immediately. michael@0: bool gShutdown; michael@0: michael@0: // The timestamp of the last event notification, or PR_INTERVAL_NO_WAIT if michael@0: // we're currently not processing events. michael@0: Atomic gTimestamp(PR_INTERVAL_NO_WAIT); michael@0: michael@0: #ifdef REPORT_CHROME_HANGS michael@0: // Main thread ID used in reporting chrome hangs under Windows michael@0: static HANDLE winMainThreadHandle = nullptr; michael@0: michael@0: // Default timeout for reporting chrome hangs to Telemetry (5 seconds) michael@0: static const int32_t DEFAULT_CHROME_HANG_INTERVAL = 5; michael@0: michael@0: // Maximum number of PCs to gather from the stack michael@0: static const int32_t MAX_CALL_STACK_PCS = 400; michael@0: #endif michael@0: michael@0: // PrefChangedFunc michael@0: void michael@0: PrefChanged(const char*, void*) michael@0: { michael@0: int32_t newval = Preferences::GetInt(kHangMonitorPrefName); michael@0: #ifdef REPORT_CHROME_HANGS michael@0: // Monitor chrome hangs on the profiling branch if Telemetry enabled michael@0: if (newval == 0) { michael@0: bool telemetryEnabled = Preferences::GetBool(kTelemetryPrefName); michael@0: if (telemetryEnabled) { michael@0: newval = DEFAULT_CHROME_HANG_INTERVAL; michael@0: } michael@0: } michael@0: #endif michael@0: MonitorAutoLock lock(*gMonitor); michael@0: if (newval != gTimeout) { michael@0: gTimeout = newval; michael@0: lock.Notify(); michael@0: } michael@0: } michael@0: michael@0: void michael@0: Crash() michael@0: { michael@0: if (gDebugDisableHangMonitor) { michael@0: return; michael@0: } michael@0: michael@0: #ifdef XP_WIN michael@0: if (::IsDebuggerPresent()) { michael@0: return; michael@0: } michael@0: #endif michael@0: michael@0: #ifdef MOZ_CRASHREPORTER michael@0: CrashReporter::AnnotateCrashReport(NS_LITERAL_CSTRING("Hang"), michael@0: NS_LITERAL_CSTRING("1")); michael@0: #endif michael@0: michael@0: NS_RUNTIMEABORT("HangMonitor triggered"); michael@0: } michael@0: michael@0: #ifdef REPORT_CHROME_HANGS michael@0: static void michael@0: ChromeStackWalker(void *aPC, void *aSP, void *aClosure) michael@0: { michael@0: MOZ_ASSERT(aClosure); michael@0: std::vector *stack = michael@0: static_cast*>(aClosure); michael@0: if (stack->size() == MAX_CALL_STACK_PCS) michael@0: return; michael@0: MOZ_ASSERT(stack->size() < MAX_CALL_STACK_PCS); michael@0: stack->push_back(reinterpret_cast(aPC)); michael@0: } michael@0: michael@0: static void michael@0: GetChromeHangReport(Telemetry::ProcessedStack &aStack, michael@0: int32_t &aSystemUptime, michael@0: int32_t &aFirefoxUptime) michael@0: { michael@0: MOZ_ASSERT(winMainThreadHandle); michael@0: michael@0: // The thread we're about to suspend might have the alloc lock michael@0: // so allocate ahead of time michael@0: std::vector rawStack; michael@0: rawStack.reserve(MAX_CALL_STACK_PCS); michael@0: DWORD ret = ::SuspendThread(winMainThreadHandle); michael@0: if (ret == -1) michael@0: return; michael@0: NS_StackWalk(ChromeStackWalker, /* skipFrames */ 0, /* maxFrames */ 0, michael@0: reinterpret_cast(&rawStack), michael@0: reinterpret_cast(winMainThreadHandle), nullptr); michael@0: ret = ::ResumeThread(winMainThreadHandle); michael@0: if (ret == -1) michael@0: return; michael@0: aStack = Telemetry::GetStackAndModules(rawStack); michael@0: michael@0: // Record system uptime (in minutes) at the time of the hang michael@0: aSystemUptime = ((GetTickCount() / 1000) - (gTimeout * 2)) / 60; michael@0: michael@0: // Record Firefox uptime (in minutes) at the time of the hang michael@0: bool error; michael@0: TimeStamp processCreation = TimeStamp::ProcessCreation(error); michael@0: if (!error) { michael@0: TimeDuration td = TimeStamp::Now() - processCreation; michael@0: aFirefoxUptime = (static_cast(td.ToSeconds()) - (gTimeout * 2)) / 60; michael@0: } else { michael@0: aFirefoxUptime = -1; michael@0: } michael@0: } michael@0: #endif michael@0: michael@0: void michael@0: ThreadMain(void*) michael@0: { michael@0: PR_SetCurrentThreadName("Hang Monitor"); michael@0: michael@0: MonitorAutoLock lock(*gMonitor); michael@0: michael@0: // In order to avoid issues with the hang monitor incorrectly triggering michael@0: // during a general system stop such as sleeping, the monitor thread must michael@0: // run twice to trigger hang protection. michael@0: PRIntervalTime lastTimestamp = 0; michael@0: int waitCount = 0; michael@0: michael@0: #ifdef REPORT_CHROME_HANGS michael@0: Telemetry::ProcessedStack stack; michael@0: int32_t systemUptime = -1; michael@0: int32_t firefoxUptime = -1; michael@0: #endif michael@0: michael@0: while (true) { michael@0: if (gShutdown) { michael@0: return; // Exit the thread michael@0: } michael@0: michael@0: // avoid rereading the volatile value in this loop michael@0: PRIntervalTime timestamp = gTimestamp; michael@0: michael@0: PRIntervalTime now = PR_IntervalNow(); michael@0: michael@0: if (timestamp != PR_INTERVAL_NO_WAIT && michael@0: now < timestamp) { michael@0: // 32-bit overflow, reset for another waiting period michael@0: timestamp = 1; // lowest legal PRInterval value michael@0: } michael@0: michael@0: if (timestamp != PR_INTERVAL_NO_WAIT && michael@0: timestamp == lastTimestamp && michael@0: gTimeout > 0) { michael@0: ++waitCount; michael@0: #ifdef REPORT_CHROME_HANGS michael@0: // Capture the chrome-hang stack + Firefox & system uptimes after michael@0: // the minimum hang duration has been reached (not when the hang ends) michael@0: if (waitCount == 2) { michael@0: GetChromeHangReport(stack, systemUptime, firefoxUptime); michael@0: } michael@0: #else michael@0: // This is the crash-on-hang feature. michael@0: // See bug 867313 for the quirk in the waitCount comparison michael@0: if (waitCount >= 2) { michael@0: int32_t delay = michael@0: int32_t(PR_IntervalToSeconds(now - timestamp)); michael@0: if (delay >= gTimeout) { michael@0: MonitorAutoUnlock unlock(*gMonitor); michael@0: Crash(); michael@0: } michael@0: } michael@0: #endif michael@0: } michael@0: else { michael@0: #ifdef REPORT_CHROME_HANGS michael@0: if (waitCount >= 2) { michael@0: uint32_t hangDuration = PR_IntervalToSeconds(now - lastTimestamp); michael@0: Telemetry::RecordChromeHang(hangDuration, stack, michael@0: systemUptime, firefoxUptime); michael@0: stack.Clear(); michael@0: } michael@0: #endif michael@0: lastTimestamp = timestamp; michael@0: waitCount = 0; michael@0: } michael@0: michael@0: PRIntervalTime timeout; michael@0: if (gTimeout <= 0) { michael@0: timeout = PR_INTERVAL_NO_TIMEOUT; michael@0: } michael@0: else { michael@0: timeout = PR_MillisecondsToInterval(gTimeout * 500); michael@0: } michael@0: lock.Wait(timeout); michael@0: } michael@0: } michael@0: michael@0: void michael@0: Startup() michael@0: { michael@0: // The hang detector only runs in chrome processes. If you change this, michael@0: // you must also deal with the threadsafety of AnnotateCrashReport in michael@0: // non-chrome processes! michael@0: if (GeckoProcessType_Default != XRE_GetProcessType()) michael@0: return; michael@0: michael@0: MOZ_ASSERT(!gMonitor, "Hang monitor already initialized"); michael@0: gMonitor = new Monitor("HangMonitor"); michael@0: michael@0: Preferences::RegisterCallback(PrefChanged, kHangMonitorPrefName, nullptr); michael@0: PrefChanged(nullptr, nullptr); michael@0: michael@0: #ifdef REPORT_CHROME_HANGS michael@0: Preferences::RegisterCallback(PrefChanged, kTelemetryPrefName, nullptr); michael@0: winMainThreadHandle = michael@0: OpenThread(THREAD_ALL_ACCESS, FALSE, GetCurrentThreadId()); michael@0: if (!winMainThreadHandle) michael@0: return; michael@0: #endif michael@0: michael@0: // Don't actually start measuring hangs until we hit the main event loop. michael@0: // This potentially misses a small class of really early startup hangs, michael@0: // but avoids dealing with some xpcshell tests and other situations which michael@0: // start XPCOM but don't ever start the event loop. michael@0: Suspend(); michael@0: michael@0: gThread = PR_CreateThread(PR_USER_THREAD, michael@0: ThreadMain, michael@0: nullptr, PR_PRIORITY_LOW, PR_GLOBAL_THREAD, michael@0: PR_JOINABLE_THREAD, 0); michael@0: } michael@0: michael@0: void michael@0: Shutdown() michael@0: { michael@0: if (GeckoProcessType_Default != XRE_GetProcessType()) michael@0: return; michael@0: michael@0: MOZ_ASSERT(gMonitor, "Hang monitor not started"); michael@0: michael@0: { // Scope the lock we're going to delete later michael@0: MonitorAutoLock lock(*gMonitor); michael@0: gShutdown = true; michael@0: lock.Notify(); michael@0: } michael@0: michael@0: // thread creation could theoretically fail michael@0: if (gThread) { michael@0: PR_JoinThread(gThread); michael@0: gThread = nullptr; michael@0: } michael@0: michael@0: delete gMonitor; michael@0: gMonitor = nullptr; michael@0: } michael@0: michael@0: static bool michael@0: IsUIMessageWaiting() michael@0: { michael@0: #ifndef XP_WIN michael@0: return false; michael@0: #else michael@0: #define NS_WM_IMEFIRST WM_IME_SETCONTEXT michael@0: #define NS_WM_IMELAST WM_IME_KEYUP michael@0: BOOL haveUIMessageWaiting = FALSE; michael@0: MSG msg; michael@0: haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_KEYFIRST, michael@0: WM_IME_KEYLAST, PM_NOREMOVE); michael@0: haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, NS_WM_IMEFIRST, michael@0: NS_WM_IMELAST, PM_NOREMOVE); michael@0: haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_MOUSEFIRST, michael@0: WM_MOUSELAST, PM_NOREMOVE); michael@0: return haveUIMessageWaiting; michael@0: #endif michael@0: } michael@0: michael@0: void michael@0: NotifyActivity(ActivityType activityType) michael@0: { michael@0: MOZ_ASSERT(NS_IsMainThread(), michael@0: "HangMonitor::Notify called from off the main thread."); michael@0: michael@0: // Determine the activity type more specifically michael@0: if (activityType == kGeneralActivity) { michael@0: activityType = IsUIMessageWaiting() ? kActivityUIAVail : michael@0: kActivityNoUIAVail; michael@0: } michael@0: michael@0: // Calculate the cumulative amount of lag time since the last UI message michael@0: static uint32_t cumulativeUILagMS = 0; michael@0: switch(activityType) { michael@0: case kActivityNoUIAVail: michael@0: cumulativeUILagMS = 0; michael@0: break; michael@0: case kActivityUIAVail: michael@0: case kUIActivity: michael@0: if (gTimestamp != PR_INTERVAL_NO_WAIT) { michael@0: cumulativeUILagMS += PR_IntervalToMilliseconds(PR_IntervalNow() - michael@0: gTimestamp); michael@0: } michael@0: break; michael@0: default: michael@0: break; michael@0: } michael@0: michael@0: // This is not a locked activity because PRTimeStamp is a 32-bit quantity michael@0: // which can be read/written atomically, and we don't want to pay locking michael@0: // penalties here. michael@0: gTimestamp = PR_IntervalNow(); michael@0: michael@0: // If we have UI activity we should reset the timer and report it if it is michael@0: // significant enough. michael@0: if (activityType == kUIActivity) { michael@0: // The minimum amount of lag time that we should report for telemetry data. michael@0: // Mozilla's UI responsiveness goal is 50ms michael@0: static const uint32_t kUIResponsivenessThresholdMS = 50; michael@0: if (cumulativeUILagMS > kUIResponsivenessThresholdMS) { michael@0: mozilla::Telemetry::Accumulate(mozilla::Telemetry::EVENTLOOP_UI_LAG_EXP_MS, michael@0: cumulativeUILagMS); michael@0: } michael@0: cumulativeUILagMS = 0; michael@0: } michael@0: michael@0: if (gThread && !gShutdown) { michael@0: mozilla::BackgroundHangMonitor().NotifyActivity(); michael@0: } michael@0: } michael@0: michael@0: void michael@0: Suspend() michael@0: { michael@0: MOZ_ASSERT(NS_IsMainThread(), michael@0: "HangMonitor::Suspend called from off the main thread."); michael@0: michael@0: // Because gTimestamp changes this resets the wait count. michael@0: gTimestamp = PR_INTERVAL_NO_WAIT; michael@0: michael@0: if (gThread && !gShutdown) { michael@0: mozilla::BackgroundHangMonitor().NotifyWait(); michael@0: } michael@0: } michael@0: michael@0: } } // namespace mozilla::HangMonitor