xpcom/threads/HangMonitor.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include "mozilla/HangMonitor.h"
     7 #include "mozilla/BackgroundHangMonitor.h"
     8 #include "mozilla/Monitor.h"
     9 #include "mozilla/Preferences.h"
    10 #include "mozilla/Telemetry.h"
    11 #include "mozilla/ProcessedStack.h"
    12 #include "mozilla/Atomics.h"
    13 #include "nsXULAppAPI.h"
    14 #include "nsThreadUtils.h"
    15 #include "nsStackWalk.h"
    17 #ifdef MOZ_CRASHREPORTER
    18 #include "nsExceptionHandler.h"
    19 #endif
    21 #ifdef XP_WIN
    22 #include <windows.h>
    23 #endif
    25 #if defined(MOZ_ENABLE_PROFILER_SPS) && defined(MOZ_PROFILING) && defined(XP_WIN)
    26   #define REPORT_CHROME_HANGS
    27 #endif
    29 namespace mozilla { namespace HangMonitor {
    31 /**
    32  * A flag which may be set from within a debugger to disable the hang
    33  * monitor.
    34  */
    35 volatile bool gDebugDisableHangMonitor = false;
    37 const char kHangMonitorPrefName[] = "hangmonitor.timeout";
    39 const char kTelemetryPrefName[] = "toolkit.telemetry.enabled";
    41 // Monitor protects gShutdown and gTimeout, but not gTimestamp which rely on
    42 // being atomically set by the processor; synchronization doesn't really matter
    43 // in this use case.
    44 Monitor* gMonitor;
    46 // The timeout preference, in seconds.
    47 int32_t gTimeout;
    49 PRThread* gThread;
    51 // Set when shutdown begins to signal the thread to exit immediately.
    52 bool gShutdown;
    54 // The timestamp of the last event notification, or PR_INTERVAL_NO_WAIT if
    55 // we're currently not processing events.
    56 Atomic<PRIntervalTime> gTimestamp(PR_INTERVAL_NO_WAIT);
    58 #ifdef REPORT_CHROME_HANGS
    59 // Main thread ID used in reporting chrome hangs under Windows
    60 static HANDLE winMainThreadHandle = nullptr;
    62 // Default timeout for reporting chrome hangs to Telemetry (5 seconds)
    63 static const int32_t DEFAULT_CHROME_HANG_INTERVAL = 5;
    65 // Maximum number of PCs to gather from the stack
    66 static const int32_t MAX_CALL_STACK_PCS = 400;
    67 #endif
    69 // PrefChangedFunc
    70 void
    71 PrefChanged(const char*, void*)
    72 {
    73   int32_t newval = Preferences::GetInt(kHangMonitorPrefName);
    74 #ifdef REPORT_CHROME_HANGS
    75   // Monitor chrome hangs on the profiling branch if Telemetry enabled
    76   if (newval == 0) {
    77     bool telemetryEnabled = Preferences::GetBool(kTelemetryPrefName);
    78     if (telemetryEnabled) {
    79       newval = DEFAULT_CHROME_HANG_INTERVAL;
    80     }
    81   }
    82 #endif
    83   MonitorAutoLock lock(*gMonitor);
    84   if (newval != gTimeout) {
    85     gTimeout = newval;
    86     lock.Notify();
    87   }
    88 }
    90 void
    91 Crash()
    92 {
    93   if (gDebugDisableHangMonitor) {
    94     return;
    95   }
    97 #ifdef XP_WIN
    98   if (::IsDebuggerPresent()) {
    99     return;
   100   }
   101 #endif
   103 #ifdef MOZ_CRASHREPORTER
   104   CrashReporter::AnnotateCrashReport(NS_LITERAL_CSTRING("Hang"),
   105                                      NS_LITERAL_CSTRING("1"));
   106 #endif
   108   NS_RUNTIMEABORT("HangMonitor triggered");
   109 }
   111 #ifdef REPORT_CHROME_HANGS
   112 static void
   113 ChromeStackWalker(void *aPC, void *aSP, void *aClosure)
   114 {
   115   MOZ_ASSERT(aClosure);
   116   std::vector<uintptr_t> *stack =
   117     static_cast<std::vector<uintptr_t>*>(aClosure);
   118   if (stack->size() == MAX_CALL_STACK_PCS)
   119     return;
   120   MOZ_ASSERT(stack->size() < MAX_CALL_STACK_PCS);
   121   stack->push_back(reinterpret_cast<uintptr_t>(aPC));
   122 }
   124 static void
   125 GetChromeHangReport(Telemetry::ProcessedStack &aStack,
   126                     int32_t &aSystemUptime,
   127                     int32_t &aFirefoxUptime)
   128 {
   129   MOZ_ASSERT(winMainThreadHandle);
   131   // The thread we're about to suspend might have the alloc lock
   132   // so allocate ahead of time
   133   std::vector<uintptr_t> rawStack;
   134   rawStack.reserve(MAX_CALL_STACK_PCS);
   135   DWORD ret = ::SuspendThread(winMainThreadHandle);
   136   if (ret == -1)
   137     return;
   138   NS_StackWalk(ChromeStackWalker, /* skipFrames */ 0, /* maxFrames */ 0,
   139                reinterpret_cast<void*>(&rawStack),
   140                reinterpret_cast<uintptr_t>(winMainThreadHandle), nullptr);
   141   ret = ::ResumeThread(winMainThreadHandle);
   142   if (ret == -1)
   143     return;
   144   aStack = Telemetry::GetStackAndModules(rawStack);
   146   // Record system uptime (in minutes) at the time of the hang
   147   aSystemUptime = ((GetTickCount() / 1000) - (gTimeout * 2)) / 60;
   149   // Record Firefox uptime (in minutes) at the time of the hang
   150   bool error;
   151   TimeStamp processCreation = TimeStamp::ProcessCreation(error);
   152   if (!error) {
   153     TimeDuration td = TimeStamp::Now() - processCreation;
   154     aFirefoxUptime = (static_cast<int32_t>(td.ToSeconds()) - (gTimeout * 2)) / 60;
   155   } else {
   156     aFirefoxUptime = -1;
   157   }
   158 }
   159 #endif
   161 void
   162 ThreadMain(void*)
   163 {
   164   PR_SetCurrentThreadName("Hang Monitor");
   166   MonitorAutoLock lock(*gMonitor);
   168   // In order to avoid issues with the hang monitor incorrectly triggering
   169   // during a general system stop such as sleeping, the monitor thread must
   170   // run twice to trigger hang protection.
   171   PRIntervalTime lastTimestamp = 0;
   172   int waitCount = 0;
   174 #ifdef REPORT_CHROME_HANGS
   175   Telemetry::ProcessedStack stack;
   176   int32_t systemUptime = -1;
   177   int32_t firefoxUptime = -1;
   178 #endif
   180   while (true) {
   181     if (gShutdown) {
   182       return; // Exit the thread
   183     }
   185     // avoid rereading the volatile value in this loop
   186     PRIntervalTime timestamp = gTimestamp;
   188     PRIntervalTime now = PR_IntervalNow();
   190     if (timestamp != PR_INTERVAL_NO_WAIT &&
   191         now < timestamp) {
   192       // 32-bit overflow, reset for another waiting period
   193       timestamp = 1; // lowest legal PRInterval value
   194     }
   196     if (timestamp != PR_INTERVAL_NO_WAIT &&
   197         timestamp == lastTimestamp &&
   198         gTimeout > 0) {
   199       ++waitCount;
   200 #ifdef REPORT_CHROME_HANGS
   201       // Capture the chrome-hang stack + Firefox & system uptimes after
   202       // the minimum hang duration has been reached (not when the hang ends)
   203       if (waitCount == 2) {
   204         GetChromeHangReport(stack, systemUptime, firefoxUptime);
   205       }
   206 #else
   207       // This is the crash-on-hang feature.
   208       // See bug 867313 for the quirk in the waitCount comparison
   209       if (waitCount >= 2) {
   210         int32_t delay =
   211           int32_t(PR_IntervalToSeconds(now - timestamp));
   212         if (delay >= gTimeout) {
   213           MonitorAutoUnlock unlock(*gMonitor);
   214           Crash();
   215         }
   216       }
   217 #endif
   218     }
   219     else {
   220 #ifdef REPORT_CHROME_HANGS
   221       if (waitCount >= 2) {
   222         uint32_t hangDuration = PR_IntervalToSeconds(now - lastTimestamp);
   223         Telemetry::RecordChromeHang(hangDuration, stack,
   224                                     systemUptime, firefoxUptime);
   225         stack.Clear();
   226       }
   227 #endif
   228       lastTimestamp = timestamp;
   229       waitCount = 0;
   230     }
   232     PRIntervalTime timeout;
   233     if (gTimeout <= 0) {
   234       timeout = PR_INTERVAL_NO_TIMEOUT;
   235     }
   236     else {
   237       timeout = PR_MillisecondsToInterval(gTimeout * 500);
   238     }
   239     lock.Wait(timeout);
   240   }
   241 }
   243 void
   244 Startup()
   245 {
   246   // The hang detector only runs in chrome processes. If you change this,
   247   // you must also deal with the threadsafety of AnnotateCrashReport in
   248   // non-chrome processes!
   249   if (GeckoProcessType_Default != XRE_GetProcessType())
   250     return;
   252   MOZ_ASSERT(!gMonitor, "Hang monitor already initialized");
   253   gMonitor = new Monitor("HangMonitor");
   255   Preferences::RegisterCallback(PrefChanged, kHangMonitorPrefName, nullptr);
   256   PrefChanged(nullptr, nullptr);
   258 #ifdef REPORT_CHROME_HANGS
   259   Preferences::RegisterCallback(PrefChanged, kTelemetryPrefName, nullptr);
   260   winMainThreadHandle =
   261     OpenThread(THREAD_ALL_ACCESS, FALSE, GetCurrentThreadId());
   262   if (!winMainThreadHandle)
   263     return;
   264 #endif
   266   // Don't actually start measuring hangs until we hit the main event loop.
   267   // This potentially misses a small class of really early startup hangs,
   268   // but avoids dealing with some xpcshell tests and other situations which
   269   // start XPCOM but don't ever start the event loop.
   270   Suspend();
   272   gThread = PR_CreateThread(PR_USER_THREAD,
   273                             ThreadMain,
   274                             nullptr, PR_PRIORITY_LOW, PR_GLOBAL_THREAD,
   275                             PR_JOINABLE_THREAD, 0);
   276 }
   278 void
   279 Shutdown()
   280 {
   281   if (GeckoProcessType_Default != XRE_GetProcessType())
   282     return;
   284   MOZ_ASSERT(gMonitor, "Hang monitor not started");
   286   { // Scope the lock we're going to delete later
   287     MonitorAutoLock lock(*gMonitor);
   288     gShutdown = true;
   289     lock.Notify();
   290   }
   292   // thread creation could theoretically fail
   293   if (gThread) {
   294     PR_JoinThread(gThread);
   295     gThread = nullptr;
   296   }
   298   delete gMonitor;
   299   gMonitor = nullptr;
   300 }
   302 static bool
   303 IsUIMessageWaiting()
   304 {
   305 #ifndef XP_WIN
   306   return false;
   307 #else
   308   #define NS_WM_IMEFIRST WM_IME_SETCONTEXT
   309   #define NS_WM_IMELAST  WM_IME_KEYUP
   310   BOOL haveUIMessageWaiting = FALSE;
   311   MSG msg;
   312   haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_KEYFIRST, 
   313                                          WM_IME_KEYLAST, PM_NOREMOVE);
   314   haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, NS_WM_IMEFIRST,
   315                                          NS_WM_IMELAST, PM_NOREMOVE);
   316   haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_MOUSEFIRST,
   317                                          WM_MOUSELAST, PM_NOREMOVE);
   318   return haveUIMessageWaiting;
   319 #endif
   320 }
   322 void
   323 NotifyActivity(ActivityType activityType)
   324 {
   325   MOZ_ASSERT(NS_IsMainThread(),
   326              "HangMonitor::Notify called from off the main thread.");
   328   // Determine the activity type more specifically
   329   if (activityType == kGeneralActivity) {
   330     activityType = IsUIMessageWaiting() ? kActivityUIAVail : 
   331                                           kActivityNoUIAVail;
   332   }
   334   // Calculate the cumulative amount of lag time since the last UI message
   335   static uint32_t cumulativeUILagMS = 0;
   336   switch(activityType) {
   337   case kActivityNoUIAVail:
   338     cumulativeUILagMS = 0;
   339     break;
   340   case kActivityUIAVail:
   341   case kUIActivity:
   342     if (gTimestamp != PR_INTERVAL_NO_WAIT) {
   343       cumulativeUILagMS += PR_IntervalToMilliseconds(PR_IntervalNow() -
   344                                                      gTimestamp);
   345     }
   346     break;
   347   default:
   348     break;
   349   }
   351   // This is not a locked activity because PRTimeStamp is a 32-bit quantity
   352   // which can be read/written atomically, and we don't want to pay locking
   353   // penalties here.
   354   gTimestamp = PR_IntervalNow();
   356   // If we have UI activity we should reset the timer and report it if it is
   357   // significant enough.
   358   if (activityType == kUIActivity) {
   359     // The minimum amount of lag time that we should report for telemetry data.
   360     // Mozilla's UI responsiveness goal is 50ms
   361     static const uint32_t kUIResponsivenessThresholdMS = 50;
   362     if (cumulativeUILagMS > kUIResponsivenessThresholdMS) {
   363       mozilla::Telemetry::Accumulate(mozilla::Telemetry::EVENTLOOP_UI_LAG_EXP_MS,
   364                                      cumulativeUILagMS);
   365     }
   366     cumulativeUILagMS = 0;
   367   }
   369   if (gThread && !gShutdown) {
   370     mozilla::BackgroundHangMonitor().NotifyActivity();
   371   }
   372 }
   374 void
   375 Suspend()
   376 {
   377   MOZ_ASSERT(NS_IsMainThread(),
   378              "HangMonitor::Suspend called from off the main thread.");
   380   // Because gTimestamp changes this resets the wait count.
   381   gTimestamp = PR_INTERVAL_NO_WAIT;
   383   if (gThread && !gShutdown) {
   384     mozilla::BackgroundHangMonitor().NotifyWait();
   385   }
   386 }
   388 } } // namespace mozilla::HangMonitor

mercurial