1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/xpcom/threads/BackgroundHangMonitor.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,524 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "mozilla/ArrayUtils.h" 1.10 +#include "mozilla/BackgroundHangMonitor.h" 1.11 +#include "mozilla/LinkedList.h" 1.12 +#include "mozilla/Monitor.h" 1.13 +#include "mozilla/Move.h" 1.14 +#include "mozilla/StaticPtr.h" 1.15 +#include "mozilla/Telemetry.h" 1.16 +#include "mozilla/ThreadHangStats.h" 1.17 +#include "mozilla/ThreadLocal.h" 1.18 +#ifdef MOZ_NUWA_PROCESS 1.19 +#include "ipc/Nuwa.h" 1.20 +#endif 1.21 + 1.22 +#include "prinrval.h" 1.23 +#include "prthread.h" 1.24 +#include "ThreadStackHelper.h" 1.25 + 1.26 +#include <algorithm> 1.27 + 1.28 +namespace mozilla { 1.29 + 1.30 +/** 1.31 + * BackgroundHangManager is the global object that 1.32 + * manages all instances of BackgroundHangThread. 1.33 + */ 1.34 +class BackgroundHangManager 1.35 +{ 1.36 +private: 1.37 + // Background hang monitor thread function 1.38 + static void MonitorThread(void* aData) 1.39 + { 1.40 + PR_SetCurrentThreadName("BgHangManager"); 1.41 + 1.42 +#ifdef MOZ_NUWA_PROCESS 1.43 + if (IsNuwaProcess()) { 1.44 + NS_ASSERTION(NuwaMarkCurrentThread != nullptr, 1.45 + "NuwaMarkCurrentThread is undefined!"); 1.46 + NuwaMarkCurrentThread(nullptr, nullptr); 1.47 + } 1.48 +#endif 1.49 + 1.50 + /* We do not hold a reference to BackgroundHangManager here 1.51 + because the monitor thread only exists as long as the 1.52 + BackgroundHangManager instance exists. We stop the monitor 1.53 + thread in the BackgroundHangManager destructor, and we can 1.54 + only get to the destructor if we don't hold a reference here. */ 1.55 + static_cast<BackgroundHangManager*>(aData)->RunMonitorThread(); 1.56 + } 1.57 + 1.58 + // Hang monitor thread 1.59 + PRThread* mHangMonitorThread; 1.60 + // Stop hang monitoring 1.61 + bool mShutdown; 1.62 + 1.63 + BackgroundHangManager(const BackgroundHangManager&); 1.64 + BackgroundHangManager& operator=(const BackgroundHangManager&); 1.65 + void RunMonitorThread(); 1.66 + 1.67 +public: 1.68 + NS_INLINE_DECL_THREADSAFE_REFCOUNTING(BackgroundHangManager) 1.69 + static StaticRefPtr<BackgroundHangManager> sInstance; 1.70 + 1.71 + // Lock for access to members of this class 1.72 + Monitor mLock; 1.73 + // Current time as seen by hang monitors 1.74 + PRIntervalTime mIntervalNow; 1.75 + // List of BackgroundHangThread instances associated with each thread 1.76 + LinkedList<BackgroundHangThread> mHangThreads; 1.77 + 1.78 + void Shutdown() 1.79 + { 1.80 + MonitorAutoLock autoLock(mLock); 1.81 + mShutdown = true; 1.82 + autoLock.Notify(); 1.83 + } 1.84 + 1.85 + void Wakeup() 1.86 + { 1.87 + // PR_CreateThread could have failed earlier 1.88 + if (mHangMonitorThread) { 1.89 + // Use PR_Interrupt to avoid potentially taking a lock 1.90 + PR_Interrupt(mHangMonitorThread); 1.91 + } 1.92 + } 1.93 + 1.94 + BackgroundHangManager(); 1.95 + ~BackgroundHangManager(); 1.96 +}; 1.97 + 1.98 +/** 1.99 + * BackgroundHangThread is a per-thread object that is used 1.100 + * by all instances of BackgroundHangMonitor to monitor hangs. 1.101 + */ 1.102 +class BackgroundHangThread : public LinkedListElement<BackgroundHangThread> 1.103 +{ 1.104 +private: 1.105 + static ThreadLocal<BackgroundHangThread*> sTlsKey; 1.106 + 1.107 + BackgroundHangThread(const BackgroundHangThread&); 1.108 + BackgroundHangThread& operator=(const BackgroundHangThread&); 1.109 + ~BackgroundHangThread(); 1.110 + 1.111 + /* Keep a reference to the manager, so we can keep going even 1.112 + after BackgroundHangManager::Shutdown is called. */ 1.113 + const RefPtr<BackgroundHangManager> mManager; 1.114 + // Unique thread ID for identification 1.115 + const PRThread* mThreadID; 1.116 + 1.117 +public: 1.118 + NS_INLINE_DECL_REFCOUNTING(BackgroundHangThread) 1.119 + static BackgroundHangThread* FindThread(); 1.120 + 1.121 + static void Startup() 1.122 + { 1.123 + /* We can tolerate init() failing. 1.124 + The if block turns off warn_unused_result. */ 1.125 + if (!sTlsKey.init()) {} 1.126 + } 1.127 + 1.128 + // Hang timeout in ticks 1.129 + const PRIntervalTime mTimeout; 1.130 + // PermaHang timeout in ticks 1.131 + const PRIntervalTime mMaxTimeout; 1.132 + // Time at last activity 1.133 + PRIntervalTime mInterval; 1.134 + // Time when a hang started 1.135 + PRIntervalTime mHangStart; 1.136 + // Is the thread in a hang 1.137 + bool mHanging; 1.138 + // Is the thread in a waiting state 1.139 + bool mWaiting; 1.140 + // Platform-specific helper to get hang stacks 1.141 + ThreadStackHelper mStackHelper; 1.142 + // Stack of current hang 1.143 + Telemetry::HangHistogram::Stack mHangStack; 1.144 + // Statistics for telemetry 1.145 + Telemetry::ThreadHangStats mStats; 1.146 + 1.147 + BackgroundHangThread(const char* aName, 1.148 + uint32_t aTimeoutMs, 1.149 + uint32_t aMaxTimeoutMs); 1.150 + 1.151 + // Report a hang; aManager->mLock IS locked 1.152 + void ReportHang(PRIntervalTime aHangTime); 1.153 + // Report a permanent hang; aManager->mLock IS locked 1.154 + void ReportPermaHang(); 1.155 + // Called by BackgroundHangMonitor::NotifyActivity 1.156 + void NotifyActivity(); 1.157 + // Called by BackgroundHangMonitor::NotifyWait 1.158 + void NotifyWait() 1.159 + { 1.160 + NotifyActivity(); 1.161 + mWaiting = true; 1.162 + } 1.163 +}; 1.164 + 1.165 + 1.166 +StaticRefPtr<BackgroundHangManager> BackgroundHangManager::sInstance; 1.167 + 1.168 +ThreadLocal<BackgroundHangThread*> BackgroundHangThread::sTlsKey; 1.169 + 1.170 + 1.171 +BackgroundHangManager::BackgroundHangManager() 1.172 + : mShutdown(false) 1.173 + , mLock("BackgroundHangManager") 1.174 + , mIntervalNow(0) 1.175 +{ 1.176 + // Lock so we don't race against the new monitor thread 1.177 + MonitorAutoLock autoLock(mLock); 1.178 + mHangMonitorThread = PR_CreateThread( 1.179 + PR_USER_THREAD, MonitorThread, this, 1.180 + PR_PRIORITY_LOW, PR_GLOBAL_THREAD, PR_JOINABLE_THREAD, 0); 1.181 + 1.182 + MOZ_ASSERT(mHangMonitorThread, 1.183 + "Failed to create monitor thread"); 1.184 +} 1.185 + 1.186 +BackgroundHangManager::~BackgroundHangManager() 1.187 +{ 1.188 + MOZ_ASSERT(mShutdown, 1.189 + "Destruction without Shutdown call"); 1.190 + MOZ_ASSERT(mHangThreads.isEmpty(), 1.191 + "Destruction with outstanding monitors"); 1.192 + MOZ_ASSERT(mHangMonitorThread, 1.193 + "No monitor thread"); 1.194 + 1.195 + // PR_CreateThread could have failed above due to resource limitation 1.196 + if (mHangMonitorThread) { 1.197 + // The monitor thread can only live as long as the instance lives 1.198 + PR_JoinThread(mHangMonitorThread); 1.199 + } 1.200 +} 1.201 + 1.202 +void 1.203 +BackgroundHangManager::RunMonitorThread() 1.204 +{ 1.205 + // Keep us locked except when waiting 1.206 + MonitorAutoLock autoLock(mLock); 1.207 + 1.208 + /* mIntervalNow is updated at various intervals determined by waitTime. 1.209 + However, if an update latency is too long (due to CPU scheduling, system 1.210 + sleep, etc.), we don't update mIntervalNow at all. This is done so that 1.211 + long latencies in our timing are not detected as hangs. systemTime is 1.212 + used to track PR_IntervalNow() and determine our latency. */ 1.213 + 1.214 + PRIntervalTime systemTime = PR_IntervalNow(); 1.215 + // Default values for the first iteration of thread loop 1.216 + PRIntervalTime waitTime = PR_INTERVAL_NO_WAIT; 1.217 + PRIntervalTime recheckTimeout = PR_INTERVAL_NO_WAIT; 1.218 + 1.219 + while (!mShutdown) { 1.220 + 1.221 + PR_ClearInterrupt(); 1.222 + nsresult rv = autoLock.Wait(waitTime); 1.223 + 1.224 + PRIntervalTime newTime = PR_IntervalNow(); 1.225 + PRIntervalTime systemInterval = newTime - systemTime; 1.226 + systemTime = newTime; 1.227 + 1.228 + /* waitTime is a quarter of the shortest timeout value; If our timing 1.229 + latency is low enough (less than half the shortest timeout value), 1.230 + we can update mIntervalNow. */ 1.231 + if (MOZ_LIKELY(waitTime != PR_INTERVAL_NO_TIMEOUT && 1.232 + systemInterval < 2 * waitTime)) { 1.233 + mIntervalNow += systemInterval; 1.234 + } 1.235 + 1.236 + /* If it's before the next recheck timeout, and our wait did not 1.237 + get interrupted (either through Notify or PR_Interrupt), we can 1.238 + keep the current waitTime and skip iterating through hang monitors. */ 1.239 + if (MOZ_LIKELY(systemInterval < recheckTimeout && 1.240 + systemInterval >= waitTime && 1.241 + rv == NS_OK)) { 1.242 + recheckTimeout -= systemInterval; 1.243 + continue; 1.244 + } 1.245 + 1.246 + /* We are in one of the following scenarios, 1.247 + - Hang or permahang recheck timeout 1.248 + - Thread added/removed 1.249 + - Thread wait or hang ended 1.250 + In all cases, we want to go through our list of hang 1.251 + monitors and update waitTime and recheckTimeout. */ 1.252 + waitTime = PR_INTERVAL_NO_TIMEOUT; 1.253 + recheckTimeout = PR_INTERVAL_NO_TIMEOUT; 1.254 + 1.255 + // Locally hold mIntervalNow 1.256 + PRIntervalTime intervalNow = mIntervalNow; 1.257 + 1.258 + // iterate through hang monitors 1.259 + for (BackgroundHangThread* currentThread = mHangThreads.getFirst(); 1.260 + currentThread; currentThread = currentThread->getNext()) { 1.261 + 1.262 + if (currentThread->mWaiting) { 1.263 + // Thread is waiting, not hanging 1.264 + continue; 1.265 + } 1.266 + PRIntervalTime interval = currentThread->mInterval; 1.267 + PRIntervalTime hangTime = intervalNow - interval; 1.268 + if (MOZ_UNLIKELY(hangTime >= currentThread->mMaxTimeout)) { 1.269 + // A permahang started 1.270 + // Skip subsequent iterations and tolerate a race on mWaiting here 1.271 + currentThread->mWaiting = true; 1.272 + currentThread->mHanging = false; 1.273 + currentThread->ReportPermaHang(); 1.274 + continue; 1.275 + } 1.276 + 1.277 + if (MOZ_LIKELY(!currentThread->mHanging)) { 1.278 + if (MOZ_UNLIKELY(hangTime >= currentThread->mTimeout)) { 1.279 + // A hang started 1.280 + currentThread->mStackHelper.GetStack(currentThread->mHangStack); 1.281 + currentThread->mHangStart = interval; 1.282 + currentThread->mHanging = true; 1.283 + } 1.284 + } else { 1.285 + if (MOZ_LIKELY(interval != currentThread->mHangStart)) { 1.286 + // A hang ended 1.287 + currentThread->ReportHang(intervalNow - currentThread->mHangStart); 1.288 + currentThread->mHanging = false; 1.289 + } 1.290 + } 1.291 + 1.292 + /* If we are hanging, the next time we check for hang status is when 1.293 + the hang turns into a permahang. If we're not hanging, the next 1.294 + recheck timeout is when we may be entering a hang. */ 1.295 + PRIntervalTime nextRecheck; 1.296 + if (currentThread->mHanging) { 1.297 + nextRecheck = currentThread->mMaxTimeout; 1.298 + } else { 1.299 + nextRecheck = currentThread->mTimeout; 1.300 + } 1.301 + recheckTimeout = std::min(recheckTimeout, nextRecheck - hangTime); 1.302 + 1.303 + /* We wait for a quarter of the shortest timeout 1.304 + value to give mIntervalNow enough granularity. */ 1.305 + waitTime = std::min(waitTime, currentThread->mTimeout / 4); 1.306 + } 1.307 + } 1.308 + 1.309 + /* We are shutting down now. 1.310 + Wait for all outstanding monitors to unregister. */ 1.311 + while (!mHangThreads.isEmpty()) { 1.312 + autoLock.Wait(PR_INTERVAL_NO_TIMEOUT); 1.313 + } 1.314 +} 1.315 + 1.316 + 1.317 +BackgroundHangThread::BackgroundHangThread(const char* aName, 1.318 + uint32_t aTimeoutMs, 1.319 + uint32_t aMaxTimeoutMs) 1.320 + : mManager(BackgroundHangManager::sInstance) 1.321 + , mThreadID(PR_GetCurrentThread()) 1.322 + , mTimeout(aTimeoutMs == BackgroundHangMonitor::kNoTimeout 1.323 + ? PR_INTERVAL_NO_TIMEOUT 1.324 + : PR_MillisecondsToInterval(aTimeoutMs)) 1.325 + , mMaxTimeout(aMaxTimeoutMs == BackgroundHangMonitor::kNoTimeout 1.326 + ? PR_INTERVAL_NO_TIMEOUT 1.327 + : PR_MillisecondsToInterval(aMaxTimeoutMs)) 1.328 + , mInterval(mManager->mIntervalNow) 1.329 + , mHangStart(mInterval) 1.330 + , mHanging(false) 1.331 + , mWaiting(true) 1.332 + , mStats(aName) 1.333 +{ 1.334 + if (sTlsKey.initialized()) { 1.335 + sTlsKey.set(this); 1.336 + } 1.337 + // Lock here because LinkedList is not thread-safe 1.338 + MonitorAutoLock autoLock(mManager->mLock); 1.339 + // Add to thread list 1.340 + mManager->mHangThreads.insertBack(this); 1.341 + // Wake up monitor thread to process new thread 1.342 + autoLock.Notify(); 1.343 +} 1.344 + 1.345 +BackgroundHangThread::~BackgroundHangThread() 1.346 +{ 1.347 + // Lock here because LinkedList is not thread-safe 1.348 + MonitorAutoLock autoLock(mManager->mLock); 1.349 + // Remove from thread list 1.350 + remove(); 1.351 + // Wake up monitor thread to process removed thread 1.352 + autoLock.Notify(); 1.353 + 1.354 + // We no longer have a thread 1.355 + if (sTlsKey.initialized()) { 1.356 + sTlsKey.set(nullptr); 1.357 + } 1.358 + 1.359 + // Move our copy of ThreadHangStats to Telemetry storage 1.360 + Telemetry::RecordThreadHangStats(mStats); 1.361 +} 1.362 + 1.363 +void 1.364 +BackgroundHangThread::ReportHang(PRIntervalTime aHangTime) 1.365 +{ 1.366 + // Recovered from a hang; called on the monitor thread 1.367 + // mManager->mLock IS locked 1.368 + 1.369 + Telemetry::HangHistogram newHistogram(Move(mHangStack)); 1.370 + for (Telemetry::HangHistogram* oldHistogram = mStats.mHangs.begin(); 1.371 + oldHistogram != mStats.mHangs.end(); oldHistogram++) { 1.372 + if (newHistogram == *oldHistogram) { 1.373 + // New histogram matches old one 1.374 + oldHistogram->Add(aHangTime); 1.375 + return; 1.376 + } 1.377 + } 1.378 + // Add new histogram 1.379 + newHistogram.Add(aHangTime); 1.380 + mStats.mHangs.append(Move(newHistogram)); 1.381 +} 1.382 + 1.383 +void 1.384 +BackgroundHangThread::ReportPermaHang() 1.385 +{ 1.386 + // Permanently hanged; called on the monitor thread 1.387 + // mManager->mLock IS locked 1.388 + 1.389 + // TODO: Add more detailed analysis for perma-hangs 1.390 + ReportHang(mMaxTimeout); 1.391 +} 1.392 + 1.393 +MOZ_ALWAYS_INLINE void 1.394 +BackgroundHangThread::NotifyActivity() 1.395 +{ 1.396 + PRIntervalTime intervalNow = mManager->mIntervalNow; 1.397 + if (mWaiting) { 1.398 + mInterval = intervalNow; 1.399 + mWaiting = false; 1.400 + /* We have to wake up the manager thread because when all threads 1.401 + are waiting, the manager thread waits indefinitely as well. */ 1.402 + mManager->Wakeup(); 1.403 + } else { 1.404 + PRIntervalTime duration = intervalNow - mInterval; 1.405 + mStats.mActivity.Add(duration); 1.406 + if (MOZ_UNLIKELY(duration >= mTimeout)) { 1.407 + /* Wake up the manager thread to tell it that a hang ended */ 1.408 + mManager->Wakeup(); 1.409 + } 1.410 + mInterval = intervalNow; 1.411 + } 1.412 +} 1.413 + 1.414 +BackgroundHangThread* 1.415 +BackgroundHangThread::FindThread() 1.416 +{ 1.417 +#ifdef MOZ_ENABLE_BACKGROUND_HANG_MONITOR 1.418 + if (sTlsKey.initialized()) { 1.419 + // Use TLS if available 1.420 + return sTlsKey.get(); 1.421 + } 1.422 + // If TLS is unavailable, we can search through the thread list 1.423 + RefPtr<BackgroundHangManager> manager(BackgroundHangManager::sInstance); 1.424 + MOZ_ASSERT(manager, "Creating BackgroundHangMonitor after shutdown"); 1.425 + 1.426 + PRThread* threadID = PR_GetCurrentThread(); 1.427 + // Lock thread list for traversal 1.428 + MonitorAutoLock autoLock(manager->mLock); 1.429 + for (BackgroundHangThread* thread = manager->mHangThreads.getFirst(); 1.430 + thread; thread = thread->getNext()) { 1.431 + if (thread->mThreadID == threadID) { 1.432 + return thread; 1.433 + } 1.434 + } 1.435 +#endif 1.436 + // Current thread is not initialized 1.437 + return nullptr; 1.438 +} 1.439 + 1.440 + 1.441 +void 1.442 +BackgroundHangMonitor::Startup() 1.443 +{ 1.444 +#ifdef MOZ_ENABLE_BACKGROUND_HANG_MONITOR 1.445 + MOZ_ASSERT(!BackgroundHangManager::sInstance, "Already initialized"); 1.446 + ThreadStackHelper::Startup(); 1.447 + BackgroundHangThread::Startup(); 1.448 + BackgroundHangManager::sInstance = new BackgroundHangManager(); 1.449 +#endif 1.450 +} 1.451 + 1.452 +void 1.453 +BackgroundHangMonitor::Shutdown() 1.454 +{ 1.455 +#ifdef MOZ_ENABLE_BACKGROUND_HANG_MONITOR 1.456 + MOZ_ASSERT(BackgroundHangManager::sInstance, "Not initialized"); 1.457 + /* Scope our lock inside Shutdown() because the sInstance object can 1.458 + be destroyed as soon as we set sInstance to nullptr below, and 1.459 + we don't want to hold the lock when it's being destroyed. */ 1.460 + BackgroundHangManager::sInstance->Shutdown(); 1.461 + BackgroundHangManager::sInstance = nullptr; 1.462 + ThreadStackHelper::Shutdown(); 1.463 +#endif 1.464 +} 1.465 + 1.466 +BackgroundHangMonitor::BackgroundHangMonitor(const char* aName, 1.467 + uint32_t aTimeoutMs, 1.468 + uint32_t aMaxTimeoutMs) 1.469 + : mThread(BackgroundHangThread::FindThread()) 1.470 +{ 1.471 +#ifdef MOZ_ENABLE_BACKGROUND_HANG_MONITOR 1.472 + if (!mThread) { 1.473 + mThread = new BackgroundHangThread(aName, aTimeoutMs, aMaxTimeoutMs); 1.474 + } 1.475 +#endif 1.476 +} 1.477 + 1.478 +BackgroundHangMonitor::BackgroundHangMonitor() 1.479 + : mThread(BackgroundHangThread::FindThread()) 1.480 +{ 1.481 +#ifdef MOZ_ENABLE_BACKGROUND_HANG_MONITOR 1.482 + MOZ_ASSERT(mThread, "Thread not initialized for hang monitoring"); 1.483 +#endif 1.484 +} 1.485 + 1.486 +BackgroundHangMonitor::~BackgroundHangMonitor() 1.487 +{ 1.488 +} 1.489 + 1.490 +void 1.491 +BackgroundHangMonitor::NotifyActivity() 1.492 +{ 1.493 +#ifdef MOZ_ENABLE_BACKGROUND_HANG_MONITOR 1.494 + mThread->NotifyActivity(); 1.495 +#endif 1.496 +} 1.497 + 1.498 +void 1.499 +BackgroundHangMonitor::NotifyWait() 1.500 +{ 1.501 +#ifdef MOZ_ENABLE_BACKGROUND_HANG_MONITOR 1.502 + mThread->NotifyWait(); 1.503 +#endif 1.504 +} 1.505 + 1.506 + 1.507 +/* Because we are iterating through the BackgroundHangThread linked list, 1.508 + we need to take a lock. Using MonitorAutoLock as a base class makes 1.509 + sure all of that is taken care of for us. */ 1.510 +BackgroundHangMonitor::ThreadHangStatsIterator::ThreadHangStatsIterator() 1.511 + : MonitorAutoLock(BackgroundHangManager::sInstance->mLock) 1.512 + , mThread(BackgroundHangManager::sInstance->mHangThreads.getFirst()) 1.513 +{ 1.514 +} 1.515 + 1.516 +Telemetry::ThreadHangStats* 1.517 +BackgroundHangMonitor::ThreadHangStatsIterator::GetNext() 1.518 +{ 1.519 + if (!mThread) { 1.520 + return nullptr; 1.521 + } 1.522 + Telemetry::ThreadHangStats* stats = &mThread->mStats; 1.523 + mThread = mThread->getNext(); 1.524 + return stats; 1.525 +} 1.526 + 1.527 +} // namespace mozilla