xpcom/threads/HangMonitor.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/xpcom/threads/HangMonitor.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,388 @@
     1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#include "mozilla/HangMonitor.h"
    1.10 +#include "mozilla/BackgroundHangMonitor.h"
    1.11 +#include "mozilla/Monitor.h"
    1.12 +#include "mozilla/Preferences.h"
    1.13 +#include "mozilla/Telemetry.h"
    1.14 +#include "mozilla/ProcessedStack.h"
    1.15 +#include "mozilla/Atomics.h"
    1.16 +#include "nsXULAppAPI.h"
    1.17 +#include "nsThreadUtils.h"
    1.18 +#include "nsStackWalk.h"
    1.19 +
    1.20 +#ifdef MOZ_CRASHREPORTER
    1.21 +#include "nsExceptionHandler.h"
    1.22 +#endif
    1.23 +
    1.24 +#ifdef XP_WIN
    1.25 +#include <windows.h>
    1.26 +#endif
    1.27 +
    1.28 +#if defined(MOZ_ENABLE_PROFILER_SPS) && defined(MOZ_PROFILING) && defined(XP_WIN)
    1.29 +  #define REPORT_CHROME_HANGS
    1.30 +#endif
    1.31 +
    1.32 +namespace mozilla { namespace HangMonitor {
    1.33 +
    1.34 +/**
    1.35 + * A flag which may be set from within a debugger to disable the hang
    1.36 + * monitor.
    1.37 + */
    1.38 +volatile bool gDebugDisableHangMonitor = false;
    1.39 +
    1.40 +const char kHangMonitorPrefName[] = "hangmonitor.timeout";
    1.41 +
    1.42 +const char kTelemetryPrefName[] = "toolkit.telemetry.enabled";
    1.43 +
    1.44 +// Monitor protects gShutdown and gTimeout, but not gTimestamp which rely on
    1.45 +// being atomically set by the processor; synchronization doesn't really matter
    1.46 +// in this use case.
    1.47 +Monitor* gMonitor;
    1.48 +
    1.49 +// The timeout preference, in seconds.
    1.50 +int32_t gTimeout;
    1.51 +
    1.52 +PRThread* gThread;
    1.53 +
    1.54 +// Set when shutdown begins to signal the thread to exit immediately.
    1.55 +bool gShutdown;
    1.56 +
    1.57 +// The timestamp of the last event notification, or PR_INTERVAL_NO_WAIT if
    1.58 +// we're currently not processing events.
    1.59 +Atomic<PRIntervalTime> gTimestamp(PR_INTERVAL_NO_WAIT);
    1.60 +
    1.61 +#ifdef REPORT_CHROME_HANGS
    1.62 +// Main thread ID used in reporting chrome hangs under Windows
    1.63 +static HANDLE winMainThreadHandle = nullptr;
    1.64 +
    1.65 +// Default timeout for reporting chrome hangs to Telemetry (5 seconds)
    1.66 +static const int32_t DEFAULT_CHROME_HANG_INTERVAL = 5;
    1.67 +
    1.68 +// Maximum number of PCs to gather from the stack
    1.69 +static const int32_t MAX_CALL_STACK_PCS = 400;
    1.70 +#endif
    1.71 +
    1.72 +// PrefChangedFunc
    1.73 +void
    1.74 +PrefChanged(const char*, void*)
    1.75 +{
    1.76 +  int32_t newval = Preferences::GetInt(kHangMonitorPrefName);
    1.77 +#ifdef REPORT_CHROME_HANGS
    1.78 +  // Monitor chrome hangs on the profiling branch if Telemetry enabled
    1.79 +  if (newval == 0) {
    1.80 +    bool telemetryEnabled = Preferences::GetBool(kTelemetryPrefName);
    1.81 +    if (telemetryEnabled) {
    1.82 +      newval = DEFAULT_CHROME_HANG_INTERVAL;
    1.83 +    }
    1.84 +  }
    1.85 +#endif
    1.86 +  MonitorAutoLock lock(*gMonitor);
    1.87 +  if (newval != gTimeout) {
    1.88 +    gTimeout = newval;
    1.89 +    lock.Notify();
    1.90 +  }
    1.91 +}
    1.92 +
    1.93 +void
    1.94 +Crash()
    1.95 +{
    1.96 +  if (gDebugDisableHangMonitor) {
    1.97 +    return;
    1.98 +  }
    1.99 +
   1.100 +#ifdef XP_WIN
   1.101 +  if (::IsDebuggerPresent()) {
   1.102 +    return;
   1.103 +  }
   1.104 +#endif
   1.105 +
   1.106 +#ifdef MOZ_CRASHREPORTER
   1.107 +  CrashReporter::AnnotateCrashReport(NS_LITERAL_CSTRING("Hang"),
   1.108 +                                     NS_LITERAL_CSTRING("1"));
   1.109 +#endif
   1.110 +
   1.111 +  NS_RUNTIMEABORT("HangMonitor triggered");
   1.112 +}
   1.113 +
   1.114 +#ifdef REPORT_CHROME_HANGS
   1.115 +static void
   1.116 +ChromeStackWalker(void *aPC, void *aSP, void *aClosure)
   1.117 +{
   1.118 +  MOZ_ASSERT(aClosure);
   1.119 +  std::vector<uintptr_t> *stack =
   1.120 +    static_cast<std::vector<uintptr_t>*>(aClosure);
   1.121 +  if (stack->size() == MAX_CALL_STACK_PCS)
   1.122 +    return;
   1.123 +  MOZ_ASSERT(stack->size() < MAX_CALL_STACK_PCS);
   1.124 +  stack->push_back(reinterpret_cast<uintptr_t>(aPC));
   1.125 +}
   1.126 +
   1.127 +static void
   1.128 +GetChromeHangReport(Telemetry::ProcessedStack &aStack,
   1.129 +                    int32_t &aSystemUptime,
   1.130 +                    int32_t &aFirefoxUptime)
   1.131 +{
   1.132 +  MOZ_ASSERT(winMainThreadHandle);
   1.133 +
   1.134 +  // The thread we're about to suspend might have the alloc lock
   1.135 +  // so allocate ahead of time
   1.136 +  std::vector<uintptr_t> rawStack;
   1.137 +  rawStack.reserve(MAX_CALL_STACK_PCS);
   1.138 +  DWORD ret = ::SuspendThread(winMainThreadHandle);
   1.139 +  if (ret == -1)
   1.140 +    return;
   1.141 +  NS_StackWalk(ChromeStackWalker, /* skipFrames */ 0, /* maxFrames */ 0,
   1.142 +               reinterpret_cast<void*>(&rawStack),
   1.143 +               reinterpret_cast<uintptr_t>(winMainThreadHandle), nullptr);
   1.144 +  ret = ::ResumeThread(winMainThreadHandle);
   1.145 +  if (ret == -1)
   1.146 +    return;
   1.147 +  aStack = Telemetry::GetStackAndModules(rawStack);
   1.148 +
   1.149 +  // Record system uptime (in minutes) at the time of the hang
   1.150 +  aSystemUptime = ((GetTickCount() / 1000) - (gTimeout * 2)) / 60;
   1.151 +
   1.152 +  // Record Firefox uptime (in minutes) at the time of the hang
   1.153 +  bool error;
   1.154 +  TimeStamp processCreation = TimeStamp::ProcessCreation(error);
   1.155 +  if (!error) {
   1.156 +    TimeDuration td = TimeStamp::Now() - processCreation;
   1.157 +    aFirefoxUptime = (static_cast<int32_t>(td.ToSeconds()) - (gTimeout * 2)) / 60;
   1.158 +  } else {
   1.159 +    aFirefoxUptime = -1;
   1.160 +  }
   1.161 +}
   1.162 +#endif
   1.163 +
   1.164 +void
   1.165 +ThreadMain(void*)
   1.166 +{
   1.167 +  PR_SetCurrentThreadName("Hang Monitor");
   1.168 +
   1.169 +  MonitorAutoLock lock(*gMonitor);
   1.170 +
   1.171 +  // In order to avoid issues with the hang monitor incorrectly triggering
   1.172 +  // during a general system stop such as sleeping, the monitor thread must
   1.173 +  // run twice to trigger hang protection.
   1.174 +  PRIntervalTime lastTimestamp = 0;
   1.175 +  int waitCount = 0;
   1.176 +
   1.177 +#ifdef REPORT_CHROME_HANGS
   1.178 +  Telemetry::ProcessedStack stack;
   1.179 +  int32_t systemUptime = -1;
   1.180 +  int32_t firefoxUptime = -1;
   1.181 +#endif
   1.182 +
   1.183 +  while (true) {
   1.184 +    if (gShutdown) {
   1.185 +      return; // Exit the thread
   1.186 +    }
   1.187 +
   1.188 +    // avoid rereading the volatile value in this loop
   1.189 +    PRIntervalTime timestamp = gTimestamp;
   1.190 +
   1.191 +    PRIntervalTime now = PR_IntervalNow();
   1.192 +
   1.193 +    if (timestamp != PR_INTERVAL_NO_WAIT &&
   1.194 +        now < timestamp) {
   1.195 +      // 32-bit overflow, reset for another waiting period
   1.196 +      timestamp = 1; // lowest legal PRInterval value
   1.197 +    }
   1.198 +
   1.199 +    if (timestamp != PR_INTERVAL_NO_WAIT &&
   1.200 +        timestamp == lastTimestamp &&
   1.201 +        gTimeout > 0) {
   1.202 +      ++waitCount;
   1.203 +#ifdef REPORT_CHROME_HANGS
   1.204 +      // Capture the chrome-hang stack + Firefox & system uptimes after
   1.205 +      // the minimum hang duration has been reached (not when the hang ends)
   1.206 +      if (waitCount == 2) {
   1.207 +        GetChromeHangReport(stack, systemUptime, firefoxUptime);
   1.208 +      }
   1.209 +#else
   1.210 +      // This is the crash-on-hang feature.
   1.211 +      // See bug 867313 for the quirk in the waitCount comparison
   1.212 +      if (waitCount >= 2) {
   1.213 +        int32_t delay =
   1.214 +          int32_t(PR_IntervalToSeconds(now - timestamp));
   1.215 +        if (delay >= gTimeout) {
   1.216 +          MonitorAutoUnlock unlock(*gMonitor);
   1.217 +          Crash();
   1.218 +        }
   1.219 +      }
   1.220 +#endif
   1.221 +    }
   1.222 +    else {
   1.223 +#ifdef REPORT_CHROME_HANGS
   1.224 +      if (waitCount >= 2) {
   1.225 +        uint32_t hangDuration = PR_IntervalToSeconds(now - lastTimestamp);
   1.226 +        Telemetry::RecordChromeHang(hangDuration, stack,
   1.227 +                                    systemUptime, firefoxUptime);
   1.228 +        stack.Clear();
   1.229 +      }
   1.230 +#endif
   1.231 +      lastTimestamp = timestamp;
   1.232 +      waitCount = 0;
   1.233 +    }
   1.234 +
   1.235 +    PRIntervalTime timeout;
   1.236 +    if (gTimeout <= 0) {
   1.237 +      timeout = PR_INTERVAL_NO_TIMEOUT;
   1.238 +    }
   1.239 +    else {
   1.240 +      timeout = PR_MillisecondsToInterval(gTimeout * 500);
   1.241 +    }
   1.242 +    lock.Wait(timeout);
   1.243 +  }
   1.244 +}
   1.245 +
   1.246 +void
   1.247 +Startup()
   1.248 +{
   1.249 +  // The hang detector only runs in chrome processes. If you change this,
   1.250 +  // you must also deal with the threadsafety of AnnotateCrashReport in
   1.251 +  // non-chrome processes!
   1.252 +  if (GeckoProcessType_Default != XRE_GetProcessType())
   1.253 +    return;
   1.254 +
   1.255 +  MOZ_ASSERT(!gMonitor, "Hang monitor already initialized");
   1.256 +  gMonitor = new Monitor("HangMonitor");
   1.257 +
   1.258 +  Preferences::RegisterCallback(PrefChanged, kHangMonitorPrefName, nullptr);
   1.259 +  PrefChanged(nullptr, nullptr);
   1.260 +
   1.261 +#ifdef REPORT_CHROME_HANGS
   1.262 +  Preferences::RegisterCallback(PrefChanged, kTelemetryPrefName, nullptr);
   1.263 +  winMainThreadHandle =
   1.264 +    OpenThread(THREAD_ALL_ACCESS, FALSE, GetCurrentThreadId());
   1.265 +  if (!winMainThreadHandle)
   1.266 +    return;
   1.267 +#endif
   1.268 +
   1.269 +  // Don't actually start measuring hangs until we hit the main event loop.
   1.270 +  // This potentially misses a small class of really early startup hangs,
   1.271 +  // but avoids dealing with some xpcshell tests and other situations which
   1.272 +  // start XPCOM but don't ever start the event loop.
   1.273 +  Suspend();
   1.274 +
   1.275 +  gThread = PR_CreateThread(PR_USER_THREAD,
   1.276 +                            ThreadMain,
   1.277 +                            nullptr, PR_PRIORITY_LOW, PR_GLOBAL_THREAD,
   1.278 +                            PR_JOINABLE_THREAD, 0);
   1.279 +}
   1.280 +
   1.281 +void
   1.282 +Shutdown()
   1.283 +{
   1.284 +  if (GeckoProcessType_Default != XRE_GetProcessType())
   1.285 +    return;
   1.286 +
   1.287 +  MOZ_ASSERT(gMonitor, "Hang monitor not started");
   1.288 +
   1.289 +  { // Scope the lock we're going to delete later
   1.290 +    MonitorAutoLock lock(*gMonitor);
   1.291 +    gShutdown = true;
   1.292 +    lock.Notify();
   1.293 +  }
   1.294 +
   1.295 +  // thread creation could theoretically fail
   1.296 +  if (gThread) {
   1.297 +    PR_JoinThread(gThread);
   1.298 +    gThread = nullptr;
   1.299 +  }
   1.300 +
   1.301 +  delete gMonitor;
   1.302 +  gMonitor = nullptr;
   1.303 +}
   1.304 +
   1.305 +static bool
   1.306 +IsUIMessageWaiting()
   1.307 +{
   1.308 +#ifndef XP_WIN
   1.309 +  return false;
   1.310 +#else
   1.311 +  #define NS_WM_IMEFIRST WM_IME_SETCONTEXT
   1.312 +  #define NS_WM_IMELAST  WM_IME_KEYUP
   1.313 +  BOOL haveUIMessageWaiting = FALSE;
   1.314 +  MSG msg;
   1.315 +  haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_KEYFIRST, 
   1.316 +                                         WM_IME_KEYLAST, PM_NOREMOVE);
   1.317 +  haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, NS_WM_IMEFIRST,
   1.318 +                                         NS_WM_IMELAST, PM_NOREMOVE);
   1.319 +  haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_MOUSEFIRST,
   1.320 +                                         WM_MOUSELAST, PM_NOREMOVE);
   1.321 +  return haveUIMessageWaiting;
   1.322 +#endif
   1.323 +}
   1.324 +
   1.325 +void
   1.326 +NotifyActivity(ActivityType activityType)
   1.327 +{
   1.328 +  MOZ_ASSERT(NS_IsMainThread(),
   1.329 +             "HangMonitor::Notify called from off the main thread.");
   1.330 +
   1.331 +  // Determine the activity type more specifically
   1.332 +  if (activityType == kGeneralActivity) {
   1.333 +    activityType = IsUIMessageWaiting() ? kActivityUIAVail : 
   1.334 +                                          kActivityNoUIAVail;
   1.335 +  }
   1.336 +
   1.337 +  // Calculate the cumulative amount of lag time since the last UI message
   1.338 +  static uint32_t cumulativeUILagMS = 0;
   1.339 +  switch(activityType) {
   1.340 +  case kActivityNoUIAVail:
   1.341 +    cumulativeUILagMS = 0;
   1.342 +    break;
   1.343 +  case kActivityUIAVail:
   1.344 +  case kUIActivity:
   1.345 +    if (gTimestamp != PR_INTERVAL_NO_WAIT) {
   1.346 +      cumulativeUILagMS += PR_IntervalToMilliseconds(PR_IntervalNow() -
   1.347 +                                                     gTimestamp);
   1.348 +    }
   1.349 +    break;
   1.350 +  default:
   1.351 +    break;
   1.352 +  }
   1.353 +
   1.354 +  // This is not a locked activity because PRTimeStamp is a 32-bit quantity
   1.355 +  // which can be read/written atomically, and we don't want to pay locking
   1.356 +  // penalties here.
   1.357 +  gTimestamp = PR_IntervalNow();
   1.358 +
   1.359 +  // If we have UI activity we should reset the timer and report it if it is
   1.360 +  // significant enough.
   1.361 +  if (activityType == kUIActivity) {
   1.362 +    // The minimum amount of lag time that we should report for telemetry data.
   1.363 +    // Mozilla's UI responsiveness goal is 50ms
   1.364 +    static const uint32_t kUIResponsivenessThresholdMS = 50;
   1.365 +    if (cumulativeUILagMS > kUIResponsivenessThresholdMS) {
   1.366 +      mozilla::Telemetry::Accumulate(mozilla::Telemetry::EVENTLOOP_UI_LAG_EXP_MS,
   1.367 +                                     cumulativeUILagMS);
   1.368 +    }
   1.369 +    cumulativeUILagMS = 0;
   1.370 +  }
   1.371 +
   1.372 +  if (gThread && !gShutdown) {
   1.373 +    mozilla::BackgroundHangMonitor().NotifyActivity();
   1.374 +  }
   1.375 +}
   1.376 +
   1.377 +void
   1.378 +Suspend()
   1.379 +{
   1.380 +  MOZ_ASSERT(NS_IsMainThread(),
   1.381 +             "HangMonitor::Suspend called from off the main thread.");
   1.382 +
   1.383 +  // Because gTimestamp changes this resets the wait count.
   1.384 +  gTimestamp = PR_INTERVAL_NO_WAIT;
   1.385 +
   1.386 +  if (gThread && !gShutdown) {
   1.387 +    mozilla::BackgroundHangMonitor().NotifyWait();
   1.388 +  }
   1.389 +}
   1.390 +
   1.391 +} } // namespace mozilla::HangMonitor

mercurial