1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/tools/profiler/UnwinderThread2.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1884 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include <stdio.h> 1.10 +#include <signal.h> 1.11 +#include <string.h> 1.12 +#include <stdlib.h> 1.13 +#include <time.h> 1.14 + 1.15 +#ifdef MOZ_VALGRIND 1.16 +# include <valgrind/helgrind.h> 1.17 +# include <valgrind/memcheck.h> 1.18 +#else 1.19 +# define VALGRIND_HG_MUTEX_LOCK_PRE(_mx,_istry) /* */ 1.20 +# define VALGRIND_HG_MUTEX_LOCK_POST(_mx) /* */ 1.21 +# define VALGRIND_HG_MUTEX_UNLOCK_PRE(_mx) /* */ 1.22 +# define VALGRIND_HG_MUTEX_UNLOCK_POST(_mx) /* */ 1.23 +# define VALGRIND_MAKE_MEM_DEFINED(_addr,_len) ((void)0) 1.24 +# define VALGRIND_MAKE_MEM_UNDEFINED(_addr,_len) ((void)0) 1.25 +#endif 1.26 + 1.27 +#include "prenv.h" 1.28 +#include "mozilla/arm.h" 1.29 +#include "mozilla/DebugOnly.h" 1.30 +#include <stdint.h> 1.31 +#include "PlatformMacros.h" 1.32 + 1.33 +#include "platform.h" 1.34 +#include <ostream> 1.35 +#include <string> 1.36 + 1.37 +#include "ProfileEntry.h" 1.38 +#include "SyncProfile.h" 1.39 +#include "AutoObjectMapper.h" 1.40 +#include "UnwinderThread2.h" 1.41 + 1.42 +#if !defined(SPS_OS_windows) 1.43 +# include <sys/mman.h> 1.44 +#endif 1.45 + 1.46 +#if defined(SPS_OS_android) || defined(SPS_OS_linux) 1.47 +# include <ucontext.h> 1.48 +# include "LulMain.h" 1.49 +#endif 1.50 + 1.51 +#include "shared-libraries.h" 1.52 + 1.53 + 1.54 +// Verbosity of this module, for debugging: 1.55 +// 0 silent 1.56 +// 1 adds info about debuginfo load success/failure 1.57 +// 2 adds slow-summary stats for buffer fills/misses (RECOMMENDED) 1.58 +// 3 adds per-sample summary lines 1.59 +// 4 adds per-sample frame listing 1.60 +// Note that levels 3 and above carry a risk of deadlock, and 1.61 +// are not recommended for extended use. 1.62 +#define LOGLEVEL 2 1.63 + 1.64 +// The maximum number of frames that the native unwinder will 1.65 +// produce. Setting it too high risks wasting a lot of time 1.66 +// looping on corrupted stacks. 1.67 +#define MAX_NATIVE_FRAMES 256 1.68 + 1.69 + 1.70 +// The 'else' of this covers the entire rest of the file 1.71 +#if defined(SPS_OS_windows) || defined(SPS_OS_darwin) 1.72 + 1.73 +////////////////////////////////////////////////////////// 1.74 +//// BEGIN externally visible functions (WINDOWS and OSX STUBS) 1.75 + 1.76 +// On Windows and OSX this will all need reworking. 1.77 +// GeckoProfilerImpl.h will ensure these functions are never actually 1.78 +// called, so just provide no-op stubs for now.
1.79 + 1.80 +void uwt__init() 1.81 +{ 1.82 +} 1.83 + 1.84 +void uwt__stop() 1.85 +{ 1.86 +} 1.87 + 1.88 +void uwt__deinit() 1.89 +{ 1.90 +} 1.91 + 1.92 +void uwt__register_thread_for_profiling ( void* stackTop ) 1.93 +{ 1.94 +} 1.95 + 1.96 +void uwt__unregister_thread_for_profiling() 1.97 +{ 1.98 +} 1.99 + 1.100 +LinkedUWTBuffer* utb__acquire_sync_buffer(void* stackTop) 1.101 +{ 1.102 + return nullptr; 1.103 +} 1.104 + 1.105 +// RUNS IN SIGHANDLER CONTEXT 1.106 +UnwinderThreadBuffer* uwt__acquire_empty_buffer() 1.107 +{ 1.108 + return nullptr; 1.109 +} 1.110 + 1.111 +void 1.112 +utb__finish_sync_buffer(ThreadProfile* aProfile, 1.113 + UnwinderThreadBuffer* utb, 1.114 + void* /* ucontext_t*, really */ ucV) 1.115 +{ 1.116 +} 1.117 + 1.118 +void 1.119 +utb__release_sync_buffer(LinkedUWTBuffer* utb) 1.120 +{ 1.121 +} 1.122 + 1.123 +// RUNS IN SIGHANDLER CONTEXT 1.124 +void 1.125 +uwt__release_full_buffer(ThreadProfile* aProfile, 1.126 + UnwinderThreadBuffer* utb, 1.127 + void* /* ucontext_t*, really */ ucV ) 1.128 +{ 1.129 +} 1.130 + 1.131 +// RUNS IN SIGHANDLER CONTEXT 1.132 +void 1.133 +utb__addEntry(/*MODIFIED*/UnwinderThreadBuffer* utb, ProfileEntry ent) 1.134 +{ 1.135 +} 1.136 + 1.137 +//// END externally visible functions (WINDOWS and OSX STUBS) 1.138 +////////////////////////////////////////////////////////// 1.139 + 1.140 +#else // a supported target 1.141 + 1.142 +////////////////////////////////////////////////////////// 1.143 +//// BEGIN externally visible functions 1.144 + 1.145 +// Forward references 1.146 +// the unwinder thread ID, its fn, and a stop-now flag 1.147 +static void* unwind_thr_fn ( void* exit_nowV ); 1.148 +static pthread_t unwind_thr; 1.149 +static int unwind_thr_exit_now = 0; // RACED ON 1.150 + 1.151 +// Threads must be registered with this file before they can be 1.152 +// sampled. So that we know the max safe stack address for each 1.153 +// registered thread. 1.154 +static void thread_register_for_profiling ( void* stackTop ); 1.155 + 1.156 +// Unregister a thread. 1.157 +static void thread_unregister_for_profiling(); 1.158 + 1.159 +// Empties out the buffer queue. Used when the unwinder thread is 1.160 +// shut down. 1.161 +static void empty_buffer_queue(); 1.162 + 1.163 +// Allocate a buffer for synchronous unwinding 1.164 +static LinkedUWTBuffer* acquire_sync_buffer(void* stackTop); 1.165 + 1.166 +// RUNS IN SIGHANDLER CONTEXT 1.167 +// Acquire an empty buffer and mark it as FILLING 1.168 +static UnwinderThreadBuffer* acquire_empty_buffer(); 1.169 + 1.170 +static void finish_sync_buffer(ThreadProfile* aProfile, 1.171 + UnwinderThreadBuffer* utb, 1.172 + void* /* ucontext_t*, really */ ucV); 1.173 + 1.174 +// Release an empty synchronous unwind buffer. 1.175 +static void release_sync_buffer(LinkedUWTBuffer* utb); 1.176 + 1.177 +// RUNS IN SIGHANDLER CONTEXT 1.178 +// Put this buffer in the queue of stuff going to the unwinder 1.179 +// thread, and mark it as FULL. Before doing that, fill in stack 1.180 +// chunk and register fields if a native unwind is requested. 1.181 +// APROFILE is where the profile data should be added to. UTB 1.182 +// is the partially-filled-in buffer, containing ProfileEntries. 1.183 +// UCV is the ucontext_t* from the signal handler. If non-nullptr, 1.184 +// is taken as a cue to request native unwind. 
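Taken together, the comments above describe the sampler-side protocol: grab an empty buffer, append ProfileEntries to it, then hand it back, optionally with the signal's ucontext_t to request a native unwind. A minimal sketch of that call sequence (illustrative only, not part of the patch; the entry-construction details are elided and the name sample_in_sighandler is invented):

// Illustrative sketch only -- not part of this patch.
// RUNS IN SIGHANDLER CONTEXT: no heap allocation, no syscalls.
static void sample_in_sighandler(ThreadProfile* aProfile,
                                 void* ucV /* ucontext_t*, really */)
{
  UnwinderThreadBuffer* utb = uwt__acquire_empty_buffer();
  if (!utb) {
    return;  // thread not registered, or no free buffer: drop this sample
  }
  // ... append pseudostack entries and hint entries here with
  // utb__addEntry(utb, ...); details elided ...
  // A non-null ucV is the cue to also capture registers and a stack
  // snapshot for a native unwind.
  uwt__release_full_buffer(aProfile, utb, ucV);
}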
1.185 +static void release_full_buffer(ThreadProfile* aProfile, 1.186 + UnwinderThreadBuffer* utb, 1.187 + void* /* ucontext_t*, really */ ucV ); 1.188 + 1.189 +// RUNS IN SIGHANDLER CONTEXT 1.190 +static void utb_add_prof_ent(UnwinderThreadBuffer* utb, ProfileEntry ent); 1.191 + 1.192 +// Do a store memory barrier. 1.193 +static void do_MBAR(); 1.194 + 1.195 + 1.196 +// This is the single instance of the LUL unwind library that we will 1.197 +// use. Currently the library is operated with multiple sampling 1.198 +// threads but only one unwinder thread. It should also be possible 1.199 +// to use the library with multiple unwinder threads, to improve 1.200 +// throughput. The setup here makes it possible to use multiple 1.201 +// unwinder threads, although that is as-yet untested. 1.202 +// 1.203 +// |sLULmutex| protects |sLUL| and |sLULcount| and also is used to 1.204 +// ensure that only the first unwinder thread requests |sLUL| to read 1.205 +// debug info. |sLUL| may only be assigned to (and the object it 1.206 +// points at may only be created/destroyed) when |sLULcount| is zero. 1.207 +// |sLULcount| holds the number of unwinder threads currently in 1.208 +// existence. 1.209 +static pthread_mutex_t sLULmutex = PTHREAD_MUTEX_INITIALIZER; 1.210 +static lul::LUL* sLUL = nullptr; 1.211 +static int sLULcount = 0; 1.212 + 1.213 + 1.214 +void uwt__init() 1.215 +{ 1.216 + // Create the unwinder thread. 1.217 + MOZ_ASSERT(unwind_thr_exit_now == 0); 1.218 + int r = pthread_create( &unwind_thr, nullptr, 1.219 + unwind_thr_fn, (void*)&unwind_thr_exit_now ); 1.220 + MOZ_ALWAYS_TRUE(r == 0); 1.221 +} 1.222 + 1.223 +void uwt__stop() 1.224 +{ 1.225 + // Shut down the unwinder thread. 1.226 + MOZ_ASSERT(unwind_thr_exit_now == 0); 1.227 + unwind_thr_exit_now = 1; 1.228 + do_MBAR(); 1.229 + int r = pthread_join(unwind_thr, nullptr); 1.230 + MOZ_ALWAYS_TRUE(r == 0); 1.231 +} 1.232 + 1.233 +void uwt__deinit() 1.234 +{ 1.235 + empty_buffer_queue(); 1.236 +} 1.237 + 1.238 +void uwt__register_thread_for_profiling(void* stackTop) 1.239 +{ 1.240 + thread_register_for_profiling(stackTop); 1.241 +} 1.242 + 1.243 +void uwt__unregister_thread_for_profiling() 1.244 +{ 1.245 + thread_unregister_for_profiling(); 1.246 +} 1.247 + 1.248 +LinkedUWTBuffer* utb__acquire_sync_buffer(void* stackTop) 1.249 +{ 1.250 + return acquire_sync_buffer(stackTop); 1.251 +} 1.252 + 1.253 +void utb__finish_sync_buffer(ThreadProfile* profile, 1.254 + UnwinderThreadBuffer* buff, 1.255 + void* /* ucontext_t*, really */ ucV) 1.256 +{ 1.257 + finish_sync_buffer(profile, buff, ucV); 1.258 +} 1.259 + 1.260 +void utb__release_sync_buffer(LinkedUWTBuffer* buff) 1.261 +{ 1.262 + release_sync_buffer(buff); 1.263 +} 1.264 + 1.265 +// RUNS IN SIGHANDLER CONTEXT 1.266 +UnwinderThreadBuffer* uwt__acquire_empty_buffer() 1.267 +{ 1.268 + return acquire_empty_buffer(); 1.269 +} 1.270 + 1.271 +// RUNS IN SIGHANDLER CONTEXT 1.272 +void 1.273 +uwt__release_full_buffer(ThreadProfile* aProfile, 1.274 + UnwinderThreadBuffer* utb, 1.275 + void* /* ucontext_t*, really */ ucV ) 1.276 +{ 1.277 + release_full_buffer( aProfile, utb, ucV ); 1.278 +} 1.279 + 1.280 +// RUNS IN SIGHANDLER CONTEXT 1.281 +void 1.282 +utb__addEntry(/*MODIFIED*/UnwinderThreadBuffer* utb, ProfileEntry ent) 1.283 +{ 1.284 + utb_add_prof_ent(utb, ent); 1.285 +} 1.286 + 1.287 +//// END externally visible functions 1.288 +////////////////////////////////////////////////////////// 1.289 + 1.290 + 1.291 +////////////////////////////////////////////////////////// 1.292 +//// BEGIN type 
UnwindThreadBuffer 1.293 + 1.294 +static_assert(sizeof(uint32_t) == 4, "uint32_t size incorrect"); 1.295 +static_assert(sizeof(uint64_t) == 8, "uint64_t size incorrect"); 1.296 +static_assert(sizeof(uintptr_t) == sizeof(void*), 1.297 + "uintptr_t size incorrect"); 1.298 + 1.299 +typedef 1.300 + struct { 1.301 + uint64_t rsp; 1.302 + uint64_t rbp; 1.303 + uint64_t rip; 1.304 + } 1.305 + AMD64Regs; 1.306 + 1.307 +typedef 1.308 + struct { 1.309 + uint32_t r15; 1.310 + uint32_t r14; 1.311 + uint32_t r13; 1.312 + uint32_t r12; 1.313 + uint32_t r11; 1.314 + uint32_t r7; 1.315 + } 1.316 + ARMRegs; 1.317 + 1.318 +typedef 1.319 + struct { 1.320 + uint32_t esp; 1.321 + uint32_t ebp; 1.322 + uint32_t eip; 1.323 + } 1.324 + X86Regs; 1.325 + 1.326 +#if defined(SPS_ARCH_amd64) 1.327 +typedef AMD64Regs ArchRegs; 1.328 +#elif defined(SPS_ARCH_arm) 1.329 +typedef ARMRegs ArchRegs; 1.330 +#elif defined(SPS_ARCH_x86) 1.331 +typedef X86Regs ArchRegs; 1.332 +#else 1.333 +# error "Unknown plat" 1.334 +#endif 1.335 + 1.336 +#if defined(SPS_ARCH_amd64) || defined(SPS_ARCH_arm) || defined(SPS_ARCH_x86) 1.337 +# define SPS_PAGE_SIZE 4096 1.338 +#else 1.339 +# error "Unknown plat" 1.340 +#endif 1.341 + 1.342 +typedef enum { S_EMPTY, S_FILLING, S_EMPTYING, S_FULL } State; 1.343 + 1.344 +typedef struct { uintptr_t val; } SpinLock; 1.345 + 1.346 +/* CONFIGURABLE */ 1.347 +/* The number of fixed ProfileEntry slots. If more are required, they 1.348 + are placed in mmap'd pages. */ 1.349 +#define N_FIXED_PROF_ENTS 20 1.350 + 1.351 +/* CONFIGURABLE */ 1.352 +/* The number of extra pages of ProfileEntries. If (on arm) each 1.353 + ProfileEntry is 8 bytes, then a page holds 512, and so 100 pages 1.354 + is enough to hold 51200. */ 1.355 +#define N_PROF_ENT_PAGES 100 1.356 + 1.357 +/* DERIVATIVE */ 1.358 +#define N_PROF_ENTS_PER_PAGE (SPS_PAGE_SIZE / sizeof(ProfileEntry)) 1.359 + 1.360 +/* A page of ProfileEntrys. This might actually be slightly smaller 1.361 + than a page if SPS_PAGE_SIZE is not an exact multiple of 1.362 + sizeof(ProfileEntry). */ 1.363 +typedef 1.364 + struct { ProfileEntry ents[N_PROF_ENTS_PER_PAGE]; } 1.365 + ProfEntsPage; 1.366 + 1.367 +#define ProfEntsPage_INVALID ((ProfEntsPage*)1) 1.368 + 1.369 + 1.370 +/* Fields protected by the spinlock are marked SL */ 1.371 + 1.372 +struct _UnwinderThreadBuffer { 1.373 + /*SL*/ State state; 1.374 + /* The rest of these are protected, in some sense, by ::state. If 1.375 + ::state is S_FILLING, they are 'owned' by the sampler thread 1.376 + that set the state to S_FILLING. If ::state is S_EMPTYING, 1.377 + they are 'owned' by the unwinder thread that set the state to 1.378 + S_EMPTYING. If ::state is S_EMPTY or S_FULL, the buffer isn't 1.379 + owned by any thread, and so no thread may access these 1.380 + fields. */ 1.381 + /* Sample number, needed to process samples in order */ 1.382 + uint64_t seqNo; 1.383 + /* The ThreadProfile into which the results are eventually to be 1.384 + dumped. */ 1.385 + ThreadProfile* aProfile; 1.386 + /* Pseudostack and other info, always present */ 1.387 + ProfileEntry entsFixed[N_FIXED_PROF_ENTS]; 1.388 + ProfEntsPage* entsPages[N_PROF_ENT_PAGES]; 1.389 + uintptr_t entsUsed; 1.390 + /* Do we also have data to do a native unwind? */ 1.391 + bool haveNativeInfo; 1.392 + /* If so, here is the register state and stack. Unset if 1.393 + .haveNativeInfo is false. */ 1.394 + lul::UnwindRegs startRegs; 1.395 + lul::StackImage stackImg; 1.396 + void* stackMaxSafe; /* Address for max safe stack reading. 
*/ 1.397 +}; 1.398 +/* Indexing scheme for ents: 1.399 + 0 <= i < N_FIXED_PROF_ENTS 1.400 + is at entsFixed[i] 1.401 + 1.402 + i >= N_FIXED_PROF_ENTS 1.403 + is at let j = i - N_FIXED_PROF_ENTS 1.404 + in entsPages[j / N_PROF_ENTS_PER_PAGE] 1.405 + ->ents[j % N_PROF_ENTS_PER_PAGE] 1.406 + 1.407 + entsPages[] are allocated on demand. Because zero can 1.408 + theoretically be a valid page pointer, use 1.409 + ProfEntsPage_INVALID == (ProfEntsPage*)1 to mark invalid pages. 1.410 + 1.411 + It follows that the max entsUsed value is N_FIXED_PROF_ENTS + 1.412 + N_PROF_ENTS_PER_PAGE * N_PROF_ENT_PAGES, and at that point no more 1.413 + ProfileEntries can be stored. 1.414 +*/ 1.415 + 1.416 + 1.417 +typedef 1.418 + struct { 1.419 + pthread_t thrId; 1.420 + void* stackTop; 1.421 + uint64_t nSamples; 1.422 + } 1.423 + StackLimit; 1.424 + 1.425 +/* Globals -- the buffer array */ 1.426 +#define N_UNW_THR_BUFFERS 10 1.427 +/*SL*/ static UnwinderThreadBuffer** g_buffers = nullptr; 1.428 +/*SL*/ static uint64_t g_seqNo = 0; 1.429 +/*SL*/ static SpinLock g_spinLock = { 0 }; 1.430 + 1.431 +/* Globals -- the thread array. The array is dynamically expanded on 1.432 + demand. The spinlock must be held when accessing g_stackLimits, 1.433 + g_stackLimits[some index], g_stackLimitsUsed and g_stackLimitsSize. 1.434 + However, the spinlock must not be held when calling malloc to 1.435 + allocate or expand the array, as that would risk deadlock against a 1.436 + sampling thread that holds the malloc lock and is trying to acquire 1.437 + the spinlock. */ 1.438 +/*SL*/ static StackLimit* g_stackLimits = nullptr; 1.439 +/*SL*/ static size_t g_stackLimitsUsed = 0; 1.440 +/*SL*/ static size_t g_stackLimitsSize = 0; 1.441 + 1.442 +/* Stats -- atomically incremented, no lock needed */ 1.443 +static uintptr_t g_stats_totalSamples = 0; // total # sample attempts 1.444 +static uintptr_t g_stats_noBuffAvail = 0; // # failed due to no buffer avail 1.445 +static uintptr_t g_stats_thrUnregd = 0; // # failed due to unregistered thr 1.446 + 1.447 +/* We must be VERY CAREFUL what we do with the spinlock held. The 1.448 + only thing it is safe to do with it held is access (viz, read or 1.449 + write) g_buffers, g_buffers[], g_seqNo, g_buffers[]->state, 1.450 + g_stackLimits, g_stackLimits[], g_stackLimitsUsed and 1.451 + g_stackLimitsSize. No arbitrary computations, no syscalls, no 1.452 + printfs, no file IO, and absolutely no dynamic memory allocation 1.453 + (else we WILL eventually deadlock). 1.454 + 1.455 + This applies both to the signal handler and to the unwinder thread. 1.456 +*/ 1.457 + 1.458 +//// END type UnwindThreadBuffer 1.459 +////////////////////////////////////////////////////////// 1.460 + 1.461 +// This is the interface to LUL. 1.462 +typedef struct { u_int64_t pc; u_int64_t sp; } PCandSP; 1.463 + 1.464 +// Forward declaration. Implementation is below. 1.465 +static 1.466 +void do_lul_unwind_Buffer(/*OUT*/PCandSP** pairs, 1.467 + /*OUT*/unsigned int* nPairs, 1.468 + UnwinderThreadBuffer* buff, 1.469 + int buffNo /* for debug printing only */); 1.470 + 1.471 +static bool is_page_aligned(void* v) 1.472 +{ 1.473 + uintptr_t w = (uintptr_t) v; 1.474 + return (w & (SPS_PAGE_SIZE-1)) == 0 ? true : false; 1.475 +} 1.476 + 1.477 + 1.478 +/* Implement machine-word sized atomic compare-and-swap. Returns true 1.479 + if success, false if failure.
*/ 1.480 +static bool do_CASW(uintptr_t* addr, uintptr_t expected, uintptr_t nyu) 1.481 +{ 1.482 +#if defined(__GNUC__) 1.483 + return __sync_bool_compare_and_swap(addr, expected, nyu); 1.484 +#else 1.485 +# error "Unhandled compiler" 1.486 +#endif 1.487 +} 1.488 + 1.489 +/* Hint to the CPU core that we are in a spin-wait loop, and that 1.490 + other processors/cores/threads-running-on-the-same-core should be 1.491 + given priority on execute resources, if that is possible. Not 1.492 + critical if this is a no-op on some targets. */ 1.493 +static void do_SPINLOOP_RELAX() 1.494 +{ 1.495 +#if (defined(SPS_ARCH_amd64) || defined(SPS_ARCH_x86)) && defined(__GNUC__) 1.496 + __asm__ __volatile__("rep; nop"); 1.497 +#elif defined(SPS_PLAT_arm_android) && MOZILLA_ARM_ARCH >= 7 1.498 + __asm__ __volatile__("wfe"); 1.499 +#endif 1.500 +} 1.501 + 1.502 +/* Tell any cores snoozing in spin loops to wake up. */ 1.503 +static void do_SPINLOOP_NUDGE() 1.504 +{ 1.505 +#if (defined(SPS_ARCH_amd64) || defined(SPS_ARCH_x86)) && defined(__GNUC__) 1.506 + /* this is a no-op */ 1.507 +#elif defined(SPS_PLAT_arm_android) && MOZILLA_ARM_ARCH >= 7 1.508 + __asm__ __volatile__("sev"); 1.509 +#endif 1.510 +} 1.511 + 1.512 +/* Perform a full memory barrier. */ 1.513 +static void do_MBAR() 1.514 +{ 1.515 +#if defined(__GNUC__) 1.516 + __sync_synchronize(); 1.517 +#else 1.518 +# error "Unhandled compiler" 1.519 +#endif 1.520 +} 1.521 + 1.522 +static void spinLock_acquire(SpinLock* sl) 1.523 +{ 1.524 + uintptr_t* val = &sl->val; 1.525 + VALGRIND_HG_MUTEX_LOCK_PRE(sl, 0/*!isTryLock*/); 1.526 + while (1) { 1.527 + bool ok = do_CASW( val, 0, 1 ); 1.528 + if (ok) break; 1.529 + do_SPINLOOP_RELAX(); 1.530 + } 1.531 + do_MBAR(); 1.532 + VALGRIND_HG_MUTEX_LOCK_POST(sl); 1.533 +} 1.534 + 1.535 +static void spinLock_release(SpinLock* sl) 1.536 +{ 1.537 + uintptr_t* val = &sl->val; 1.538 + VALGRIND_HG_MUTEX_UNLOCK_PRE(sl); 1.539 + do_MBAR(); 1.540 + bool ok = do_CASW( val, 1, 0 ); 1.541 + /* This must succeed at the first try. To fail would imply that 1.542 + the lock was unheld. */ 1.543 + MOZ_ALWAYS_TRUE(ok); 1.544 + do_SPINLOOP_NUDGE(); 1.545 + VALGRIND_HG_MUTEX_UNLOCK_POST(sl); 1.546 +} 1.547 + 1.548 +static void sleep_ms(unsigned int ms) 1.549 +{ 1.550 + struct timespec req; 1.551 + req.tv_sec = ((time_t)ms) / 1000; 1.552 + req.tv_nsec = 1000 * 1000 * (((unsigned long)ms) % 1000); 1.553 + nanosleep(&req, nullptr); 1.554 +} 1.555 + 1.556 +/* Use CAS to implement standalone atomic increment. */ 1.557 +static void atomic_INC(uintptr_t* loc) 1.558 +{ 1.559 + while (1) { 1.560 + uintptr_t old = *loc; 1.561 + uintptr_t nyu = old + 1; 1.562 + bool ok = do_CASW( loc, old, nyu ); 1.563 + if (ok) break; 1.564 + } 1.565 +} 1.566 + 1.567 +// Empties out the buffer queue. 1.568 +static void empty_buffer_queue() 1.569 +{ 1.570 + spinLock_acquire(&g_spinLock); 1.571 + 1.572 + UnwinderThreadBuffer** tmp_g_buffers = g_buffers; 1.573 + g_stackLimitsUsed = 0; 1.574 + g_seqNo = 0; 1.575 + g_buffers = nullptr; 1.576 + 1.577 + spinLock_release(&g_spinLock); 1.578 + 1.579 + // Can't do any malloc/free when holding the spinlock. 1.580 + free(tmp_g_buffers); 1.581 + 1.582 + // We could potentially free up g_stackLimits; but given the 1.583 + // complications above involved in resizing it, it's probably 1.584 + // safer just to leave it in place. 1.585 +} 1.586 + 1.587 + 1.588 +// Registers a thread for profiling. Detects and ignores duplicate 1.589 +// registration. 
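The functions above supply the lock-free plumbing: do_CASW is a machine-word compare-and-swap, do_MBAR a full barrier, and spinLock_acquire/release a spinlock built on them. Everything that follows, including thread_register_for_profiling below, uses the narrow discipline spelled out in the VERY CAREFUL comment earlier: take g_spinLock, touch only the /*SL*/ globals, and release before doing anything else. A minimal sketch of that pattern (the helper count_full_buffers is invented, not part of the patch):

// Illustrative sketch only -- not part of this patch.
// The only safe shape for code that holds g_spinLock: no malloc, no
// I/O, no syscalls; just read/write the spinlock-protected globals.
static size_t count_full_buffers()
{
  size_t nFull = 0;
  spinLock_acquire(&g_spinLock);
  if (g_buffers != nullptr) {
    for (int i = 0; i < N_UNW_THR_BUFFERS; i++) {
      if (g_buffers[i]->state == S_FULL)
        nFull++;
    }
  }
  spinLock_release(&g_spinLock);
  // Any logging or allocation based on nFull must happen only now,
  // after the lock has been dropped.
  return nFull;
}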
1.590 +static void thread_register_for_profiling(void* stackTop) 1.591 +{ 1.592 + pthread_t me = pthread_self(); 1.593 + 1.594 + spinLock_acquire(&g_spinLock); 1.595 + 1.596 + // tmp copy of g_stackLimitsUsed, to avoid racing in message printing 1.597 + int n_used; 1.598 + 1.599 + // Ignore spurious calls which aren't really registering anything. 1.600 + if (stackTop == nullptr) { 1.601 + n_used = g_stackLimitsUsed; 1.602 + spinLock_release(&g_spinLock); 1.603 + LOGF("BPUnw: [%d total] thread_register_for_profiling" 1.604 + "(me=%p, stacktop=NULL) (IGNORED)", n_used, (void*)me); 1.605 + return; 1.606 + } 1.607 + 1.608 + /* Minimal sanity check on stackTop */ 1.609 + MOZ_ASSERT((void*)&n_used/*any auto var will do*/ < stackTop); 1.610 + 1.611 + bool is_dup = false; 1.612 + for (size_t i = 0; i < g_stackLimitsUsed; i++) { 1.613 + if (g_stackLimits[i].thrId == me) { 1.614 + is_dup = true; 1.615 + break; 1.616 + } 1.617 + } 1.618 + 1.619 + if (is_dup) { 1.620 + /* It's a duplicate registration. Ignore it: drop the lock and 1.621 + return. */ 1.622 + n_used = g_stackLimitsUsed; 1.623 + spinLock_release(&g_spinLock); 1.624 + 1.625 + LOGF("BPUnw: [%d total] thread_register_for_profiling" 1.626 + "(me=%p, stacktop=%p) (DUPLICATE)", n_used, (void*)me, stackTop); 1.627 + return; 1.628 + } 1.629 + 1.630 + /* Make sure the g_stackLimits array is large enough to accommodate 1.631 + this new entry. This is tricky. If it isn't large enough, we 1.632 + can malloc a larger version, but we have to do that without 1.633 + holding the spinlock, else we risk deadlock. The deadlock 1.634 + scenario is: 1.635 + 1.636 + Some other thread that is being sampled 1.637 + This thread 1.638 + 1.639 + call malloc call this function 1.640 + acquire malloc lock acquire the spinlock 1.641 + (sampling signal) discover thread array not big enough, 1.642 + call uwt__acquire_empty_buffer call malloc to make it larger 1.643 + acquire the spinlock acquire malloc lock 1.644 + 1.645 + This gives an inconsistent lock acquisition order on the malloc 1.646 + lock and spinlock, hence risk of deadlock. 1.647 + 1.648 + Allocating more space for the array without holding the spinlock 1.649 + implies tolerating races against other thread(s) who are also 1.650 + trying to expand the array. How can we detect if we have been 1.651 + out-raced? Every successful expansion of g_stackLimits[] results 1.652 + in an increase in g_stackLimitsSize. Hence we can detect if we 1.653 + got out-raced by remembering g_stackLimitsSize before we dropped 1.654 + the spinlock and checking if it has changed after the spinlock is 1.655 + reacquired. */ 1.656 + 1.657 + MOZ_ASSERT(g_stackLimitsUsed <= g_stackLimitsSize); 1.658 + 1.659 + if (g_stackLimitsUsed == g_stackLimitsSize) { 1.660 + /* g_stackLimits[] is full; resize it. */ 1.661 + 1.662 + size_t old_size = g_stackLimitsSize; 1.663 + size_t new_size = old_size == 0 ? 4 : (2 * old_size); 1.664 + 1.665 + spinLock_release(&g_spinLock); 1.666 + StackLimit* new_arr = (StackLimit*)malloc(new_size * sizeof(StackLimit)); 1.667 + if (!new_arr) 1.668 + return; 1.669 + 1.670 + spinLock_acquire(&g_spinLock); 1.671 + 1.672 + if (old_size != g_stackLimitsSize) { 1.673 + /* We've been outraced. Instead of trying to deal in-line with 1.674 + this extremely rare case, just start all over again by 1.675 + tail-calling this routine. 
*/ 1.676 + spinLock_release(&g_spinLock); 1.677 + free(new_arr); 1.678 + thread_register_for_profiling(stackTop); 1.679 + return; 1.680 + } 1.681 + 1.682 + memcpy(new_arr, g_stackLimits, old_size * sizeof(StackLimit)); 1.683 + if (g_stackLimits) 1.684 + free(g_stackLimits); 1.685 + 1.686 + g_stackLimits = new_arr; 1.687 + 1.688 + MOZ_ASSERT(g_stackLimitsSize < new_size); 1.689 + g_stackLimitsSize = new_size; 1.690 + } 1.691 + 1.692 + MOZ_ASSERT(g_stackLimitsUsed < g_stackLimitsSize); 1.693 + 1.694 + /* Finally, we have a safe place to put the new entry. */ 1.695 + 1.696 + // Round |stackTop| up to the end of the containing page. We may 1.697 + // as well do this -- there's no danger of a fault, and we might 1.698 + // get a few more base-of-the-stack frames as a result. This 1.699 + // assumes that no target has a page size smaller than 4096. 1.700 + uintptr_t stackTopR = (uintptr_t)stackTop; 1.701 + stackTopR = (stackTopR & ~(uintptr_t)4095) + (uintptr_t)4095; 1.702 + 1.703 + g_stackLimits[g_stackLimitsUsed].thrId = me; 1.704 + g_stackLimits[g_stackLimitsUsed].stackTop = (void*)stackTopR; 1.705 + g_stackLimits[g_stackLimitsUsed].nSamples = 0; 1.706 + g_stackLimitsUsed++; 1.707 + 1.708 + n_used = g_stackLimitsUsed; 1.709 + spinLock_release(&g_spinLock); 1.710 + 1.711 + LOGF("BPUnw: [%d total] thread_register_for_profiling" 1.712 + "(me=%p, stacktop=%p)", n_used, (void*)me, stackTop); 1.713 +} 1.714 + 1.715 +// Deregisters a thread from profiling. Detects and ignores attempts 1.716 +// to deregister a not-registered thread. 1.717 +static void thread_unregister_for_profiling() 1.718 +{ 1.719 + spinLock_acquire(&g_spinLock); 1.720 + 1.721 + // tmp copy of g_stackLimitsUsed, to avoid racing in message printing 1.722 + size_t n_used; 1.723 + 1.724 + size_t i; 1.725 + bool found = false; 1.726 + pthread_t me = pthread_self(); 1.727 + for (i = 0; i < g_stackLimitsUsed; i++) { 1.728 + if (g_stackLimits[i].thrId == me) 1.729 + break; 1.730 + } 1.731 + if (i < g_stackLimitsUsed) { 1.732 + // found this entry. Slide the remaining ones down one place. 1.733 + for (; i+1 < g_stackLimitsUsed; i++) { 1.734 + g_stackLimits[i] = g_stackLimits[i+1]; 1.735 + } 1.736 + g_stackLimitsUsed--; 1.737 + found = true; 1.738 + } 1.739 + 1.740 + n_used = g_stackLimitsUsed; 1.741 + 1.742 + spinLock_release(&g_spinLock); 1.743 + LOGF("BPUnw: [%d total] thread_unregister_for_profiling(me=%p) %s", 1.744 + (int)n_used, (void*)me, found ? "" : " (NOT REGISTERED) "); 1.745 +} 1.746 + 1.747 + 1.748 +__attribute__((unused)) 1.749 +static void show_registered_threads() 1.750 +{ 1.751 + size_t i; 1.752 + spinLock_acquire(&g_spinLock); 1.753 + for (i = 0; i < g_stackLimitsUsed; i++) { 1.754 + LOGF("[%d] pthread_t=%p nSamples=%lld", 1.755 + (int)i, (void*)g_stackLimits[i].thrId, 1.756 + (unsigned long long int)g_stackLimits[i].nSamples); 1.757 + } 1.758 + spinLock_release(&g_spinLock); 1.759 +} 1.760 + 1.761 +// RUNS IN SIGHANDLER CONTEXT 1.762 +/* The calling thread owns the buffer, as denoted by its state being 1.763 + S_FILLING. So we can mess with it without further locking. */ 1.764 +static void init_empty_buffer(UnwinderThreadBuffer* buff, void* stackTop) 1.765 +{ 1.766 + /* Now we own the buffer, initialise it. 
*/ 1.767 + buff->aProfile = nullptr; 1.768 + buff->entsUsed = 0; 1.769 + buff->haveNativeInfo = false; 1.770 + buff->stackImg.mLen = 0; 1.771 + buff->stackImg.mStartAvma = 0; 1.772 + buff->stackMaxSafe = stackTop; /* We will need this in 1.773 + release_full_buffer() */ 1.774 + for (size_t i = 0; i < N_PROF_ENT_PAGES; i++) 1.775 + buff->entsPages[i] = ProfEntsPage_INVALID; 1.776 +} 1.777 + 1.778 +struct SyncUnwinderThreadBuffer : public LinkedUWTBuffer 1.779 +{ 1.780 + UnwinderThreadBuffer* GetBuffer() 1.781 + { 1.782 + return &mBuff; 1.783 + } 1.784 + 1.785 + UnwinderThreadBuffer mBuff; 1.786 +}; 1.787 + 1.788 +static LinkedUWTBuffer* acquire_sync_buffer(void* stackTop) 1.789 +{ 1.790 + MOZ_ASSERT(stackTop); 1.791 + SyncUnwinderThreadBuffer* buff = new SyncUnwinderThreadBuffer(); 1.792 + // We can set state without locking here because this thread owns the buffer 1.793 + // and it is going to fill it itself. 1.794 + buff->GetBuffer()->state = S_FILLING; 1.795 + init_empty_buffer(buff->GetBuffer(), stackTop); 1.796 + return buff; 1.797 +} 1.798 + 1.799 +// RUNS IN SIGHANDLER CONTEXT 1.800 +static UnwinderThreadBuffer* acquire_empty_buffer() 1.801 +{ 1.802 + /* acq lock 1.803 + if buffers == nullptr { rel lock; exit } 1.804 + scan to find a free buff; if none { rel lock; exit } 1.805 + set buff state to S_FILLING 1.806 + fillseqno++; and remember it 1.807 + rel lock 1.808 + */ 1.809 + size_t i; 1.810 + 1.811 + atomic_INC( &g_stats_totalSamples ); 1.812 + 1.813 + /* This code is critical. We are in a signal handler and possibly 1.814 + with the malloc lock held. So we can't allocate any heap, and 1.815 + can't safely call any C library functions, not even the pthread_ 1.816 + functions. And we certainly can't do any syscalls. In short, 1.817 + this function needs to be self contained, not do any allocation, 1.818 + and not hold on to the spinlock for any significant length of 1.819 + time. */ 1.820 + 1.821 + spinLock_acquire(&g_spinLock); 1.822 + 1.823 + /* First of all, look for this thread's entry in g_stackLimits[]. 1.824 + We need to find it in order to figure out how much stack we can 1.825 + safely copy into the sample. This assumes that pthread_self() 1.826 + is safe to call in a signal handler, which strikes me as highly 1.827 + likely. */ 1.828 + pthread_t me = pthread_self(); 1.829 + MOZ_ASSERT(g_stackLimitsUsed <= g_stackLimitsSize); 1.830 + for (i = 0; i < g_stackLimitsUsed; i++) { 1.831 + if (g_stackLimits[i].thrId == me) 1.832 + break; 1.833 + } 1.834 + 1.835 + /* If the thread isn't registered for profiling, just ignore the call 1.836 + and return nullptr. */ 1.837 + if (i == g_stackLimitsUsed) { 1.838 + spinLock_release(&g_spinLock); 1.839 + atomic_INC( &g_stats_thrUnregd ); 1.840 + return nullptr; 1.841 + } 1.842 + 1.843 + /* "this thread is registered for profiling" */ 1.844 + MOZ_ASSERT(i < g_stackLimitsUsed); 1.845 + 1.846 + /* The furthest point that we can safely scan back up the stack. */ 1.847 + void* myStackTop = g_stackLimits[i].stackTop; 1.848 + g_stackLimits[i].nSamples++; 1.849 + 1.850 + /* Try to find a free buffer to use. */ 1.851 + if (g_buffers == nullptr) { 1.852 + /* The unwinder thread hasn't allocated any buffers yet. 1.853 + Nothing we can do. 
*/ 1.854 + spinLock_release(&g_spinLock); 1.855 + atomic_INC( &g_stats_noBuffAvail ); 1.856 + return nullptr; 1.857 + } 1.858 + 1.859 + for (i = 0; i < N_UNW_THR_BUFFERS; i++) { 1.860 + if (g_buffers[i]->state == S_EMPTY) 1.861 + break; 1.862 + } 1.863 + MOZ_ASSERT(i <= N_UNW_THR_BUFFERS); 1.864 + 1.865 + if (i == N_UNW_THR_BUFFERS) { 1.866 + /* Again, no free buffers .. give up. */ 1.867 + spinLock_release(&g_spinLock); 1.868 + atomic_INC( &g_stats_noBuffAvail ); 1.869 + if (LOGLEVEL >= 3) 1.870 + LOG("BPUnw: handler: no free buffers"); 1.871 + return nullptr; 1.872 + } 1.873 + 1.874 + /* So we can use this one safely. Whilst still holding the lock, 1.875 + mark the buffer as belonging to us, and increment the sequence 1.876 + number. */ 1.877 + UnwinderThreadBuffer* buff = g_buffers[i]; 1.878 + MOZ_ASSERT(buff->state == S_EMPTY); 1.879 + buff->state = S_FILLING; 1.880 + buff->seqNo = g_seqNo; 1.881 + g_seqNo++; 1.882 + 1.883 + /* And drop the lock. We own the buffer, so go on and fill it. */ 1.884 + spinLock_release(&g_spinLock); 1.885 + 1.886 + /* Now we own the buffer, initialise it. */ 1.887 + init_empty_buffer(buff, myStackTop); 1.888 + return buff; 1.889 +} 1.890 + 1.891 +// RUNS IN SIGHANDLER CONTEXT 1.892 +/* The calling thread owns the buffer, as denoted by its state being 1.893 + S_FILLING. So we can mess with it without further locking. */ 1.894 +static void fill_buffer(ThreadProfile* aProfile, 1.895 + UnwinderThreadBuffer* buff, 1.896 + void* /* ucontext_t*, really */ ucV) 1.897 +{ 1.898 + MOZ_ASSERT(buff->state == S_FILLING); 1.899 + 1.900 + //////////////////////////////////////////////////// 1.901 + // BEGIN fill 1.902 + 1.903 + /* The buffer already will have some of its ProfileEntries filled 1.904 + in, but everything else needs to be filled in at this point. 
*/ 1.905 + //LOGF("Release full buffer: %lu ents", buff->entsUsed); 1.906 + /* Where the resulting info is to be dumped */ 1.907 + buff->aProfile = aProfile; 1.908 + 1.909 + /* And, if we have register state, that and the stack top */ 1.910 + buff->haveNativeInfo = ucV != nullptr; 1.911 + if (buff->haveNativeInfo) { 1.912 +# if defined(SPS_PLAT_amd64_linux) 1.913 + ucontext_t* uc = (ucontext_t*)ucV; 1.914 + mcontext_t* mc = &(uc->uc_mcontext); 1.915 + buff->startRegs.xip = lul::TaggedUWord(mc->gregs[REG_RIP]); 1.916 + buff->startRegs.xsp = lul::TaggedUWord(mc->gregs[REG_RSP]); 1.917 + buff->startRegs.xbp = lul::TaggedUWord(mc->gregs[REG_RBP]); 1.918 +# elif defined(SPS_PLAT_amd64_darwin) 1.919 + ucontext_t* uc = (ucontext_t*)ucV; 1.920 + struct __darwin_mcontext64* mc = uc->uc_mcontext; 1.921 + struct __darwin_x86_thread_state64* ss = &mc->__ss; 1.922 + buff->regs.rip = ss->__rip; 1.923 + buff->regs.rsp = ss->__rsp; 1.924 + buff->regs.rbp = ss->__rbp; 1.925 +# elif defined(SPS_PLAT_arm_android) 1.926 + ucontext_t* uc = (ucontext_t*)ucV; 1.927 + mcontext_t* mc = &(uc->uc_mcontext); 1.928 + buff->startRegs.r15 = lul::TaggedUWord(mc->arm_pc); 1.929 + buff->startRegs.r14 = lul::TaggedUWord(mc->arm_lr); 1.930 + buff->startRegs.r13 = lul::TaggedUWord(mc->arm_sp); 1.931 + buff->startRegs.r12 = lul::TaggedUWord(mc->arm_ip); 1.932 + buff->startRegs.r11 = lul::TaggedUWord(mc->arm_fp); 1.933 + buff->startRegs.r7 = lul::TaggedUWord(mc->arm_r7); 1.934 +# elif defined(SPS_PLAT_x86_linux) || defined(SPS_PLAT_x86_android) 1.935 + ucontext_t* uc = (ucontext_t*)ucV; 1.936 + mcontext_t* mc = &(uc->uc_mcontext); 1.937 + buff->startRegs.xip = lul::TaggedUWord(mc->gregs[REG_EIP]); 1.938 + buff->startRegs.xsp = lul::TaggedUWord(mc->gregs[REG_ESP]); 1.939 + buff->startRegs.xbp = lul::TaggedUWord(mc->gregs[REG_EBP]); 1.940 +# elif defined(SPS_PLAT_x86_darwin) 1.941 + ucontext_t* uc = (ucontext_t*)ucV; 1.942 + struct __darwin_mcontext32* mc = uc->uc_mcontext; 1.943 + struct __darwin_i386_thread_state* ss = &mc->__ss; 1.944 + buff->regs.eip = ss->__eip; 1.945 + buff->regs.esp = ss->__esp; 1.946 + buff->regs.ebp = ss->__ebp; 1.947 +# else 1.948 +# error "Unknown plat" 1.949 +# endif 1.950 + 1.951 + /* Copy up to N_STACK_BYTES from rsp-REDZONE upwards, but not 1.952 + going past the stack's registered top point. Do some basic 1.953 + sanity checks too. This assumes that the TaggedUWord holding 1.954 + the stack pointer value is valid, but it should be, since it 1.955 + was constructed that way in the code just above. 
*/ 1.956 + { 1.957 +# if defined(SPS_PLAT_amd64_linux) || defined(SPS_PLAT_amd64_darwin) 1.958 + uintptr_t rEDZONE_SIZE = 128; 1.959 + uintptr_t start = buff->startRegs.xsp.Value() - rEDZONE_SIZE; 1.960 +# elif defined(SPS_PLAT_arm_android) 1.961 + uintptr_t rEDZONE_SIZE = 0; 1.962 + uintptr_t start = buff->startRegs.r13.Value() - rEDZONE_SIZE; 1.963 +# elif defined(SPS_PLAT_x86_linux) || defined(SPS_PLAT_x86_darwin) \ 1.964 + || defined(SPS_PLAT_x86_android) 1.965 + uintptr_t rEDZONE_SIZE = 0; 1.966 + uintptr_t start = buff->startRegs.xsp.Value() - rEDZONE_SIZE; 1.967 +# else 1.968 +# error "Unknown plat" 1.969 +# endif 1.970 + uintptr_t end = (uintptr_t)buff->stackMaxSafe; 1.971 + uintptr_t ws = sizeof(void*); 1.972 + start &= ~(ws-1); 1.973 + end &= ~(ws-1); 1.974 + uintptr_t nToCopy = 0; 1.975 + if (start < end) { 1.976 + nToCopy = end - start; 1.977 + if (nToCopy > lul::N_STACK_BYTES) 1.978 + nToCopy = lul::N_STACK_BYTES; 1.979 + } 1.980 + MOZ_ASSERT(nToCopy <= lul::N_STACK_BYTES); 1.981 + buff->stackImg.mLen = nToCopy; 1.982 + buff->stackImg.mStartAvma = start; 1.983 + if (nToCopy > 0) { 1.984 + memcpy(&buff->stackImg.mContents[0], (void*)start, nToCopy); 1.985 + (void)VALGRIND_MAKE_MEM_DEFINED(&buff->stackImg.mContents[0], nToCopy); 1.986 + } 1.987 + } 1.988 + } /* if (buff->haveNativeInfo) */ 1.989 + // END fill 1.990 + //////////////////////////////////////////////////// 1.991 +} 1.992 + 1.993 +// RUNS IN SIGHANDLER CONTEXT 1.994 +/* The calling thread owns the buffer, as denoted by its state being 1.995 + S_FILLING. So we can mess with it without further locking. */ 1.996 +static void release_full_buffer(ThreadProfile* aProfile, 1.997 + UnwinderThreadBuffer* buff, 1.998 + void* /* ucontext_t*, really */ ucV ) 1.999 +{ 1.1000 + fill_buffer(aProfile, buff, ucV); 1.1001 + /* And now relinquish ownership of the buff, so that an unwinder 1.1002 + thread can pick it up. */ 1.1003 + spinLock_acquire(&g_spinLock); 1.1004 + buff->state = S_FULL; 1.1005 + spinLock_release(&g_spinLock); 1.1006 +} 1.1007 + 1.1008 +// RUNS IN SIGHANDLER CONTEXT 1.1009 +// Allocate a ProfEntsPage, without using malloc, or return 1.1010 +// ProfEntsPage_INVALID if we can't for some reason. 1.1011 +static ProfEntsPage* mmap_anon_ProfEntsPage() 1.1012 +{ 1.1013 +# if defined(SPS_OS_darwin) 1.1014 + void* v = ::mmap(nullptr, sizeof(ProfEntsPage), PROT_READ | PROT_WRITE, 1.1015 + MAP_PRIVATE | MAP_ANON, -1, 0); 1.1016 +# else 1.1017 + void* v = ::mmap(nullptr, sizeof(ProfEntsPage), PROT_READ | PROT_WRITE, 1.1018 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 1.1019 +# endif 1.1020 + if (v == MAP_FAILED) { 1.1021 + return ProfEntsPage_INVALID; 1.1022 + } else { 1.1023 + return (ProfEntsPage*)v; 1.1024 + } 1.1025 +} 1.1026 + 1.1027 +// Runs in the unwinder thread 1.1028 +// Free a ProfEntsPage as allocated by mmap_anon_ProfEntsPage 1.1029 +static void munmap_ProfEntsPage(ProfEntsPage* pep) 1.1030 +{ 1.1031 + MOZ_ALWAYS_TRUE(is_page_aligned(pep)); 1.1032 + ::munmap(pep, sizeof(ProfEntsPage)); 1.1033 +} 1.1034 + 1.1035 + 1.1036 +// RUNS IN SIGHANDLER CONTEXT 1.1037 +void 1.1038 +utb_add_prof_ent(/*MODIFIED*/UnwinderThreadBuffer* utb, ProfileEntry ent) 1.1039 +{ 1.1040 + uintptr_t limit 1.1041 + = N_FIXED_PROF_ENTS + (N_PROF_ENTS_PER_PAGE * N_PROF_ENT_PAGES); 1.1042 + if (utb->entsUsed == limit) { 1.1043 + /* We're full. Now what? 
*/ 1.1044 + LOG("BPUnw: utb__addEntry: NO SPACE for ProfileEntry; ignoring."); 1.1045 + return; 1.1046 + } 1.1047 + MOZ_ASSERT(utb->entsUsed < limit); 1.1048 + 1.1049 + /* Will it fit in the fixed array? */ 1.1050 + if (utb->entsUsed < N_FIXED_PROF_ENTS) { 1.1051 + utb->entsFixed[utb->entsUsed] = ent; 1.1052 + utb->entsUsed++; 1.1053 + return; 1.1054 + } 1.1055 + 1.1056 + /* No. Put it in the extras. */ 1.1057 + uintptr_t i = utb->entsUsed; 1.1058 + uintptr_t j = i - N_FIXED_PROF_ENTS; 1.1059 + uintptr_t j_div = j / N_PROF_ENTS_PER_PAGE; /* page number */ 1.1060 + uintptr_t j_mod = j % N_PROF_ENTS_PER_PAGE; /* page offset */ 1.1061 + ProfEntsPage* pep = utb->entsPages[j_div]; 1.1062 + if (pep == ProfEntsPage_INVALID) { 1.1063 + pep = mmap_anon_ProfEntsPage(); 1.1064 + if (pep == ProfEntsPage_INVALID) { 1.1065 + /* Urr, we ran out of memory. Now what? */ 1.1066 + LOG("BPUnw: utb__addEntry: MMAP FAILED for ProfileEntry; ignoring."); 1.1067 + return; 1.1068 + } 1.1069 + utb->entsPages[j_div] = pep; 1.1070 + } 1.1071 + pep->ents[j_mod] = ent; 1.1072 + utb->entsUsed++; 1.1073 +} 1.1074 + 1.1075 + 1.1076 +// misc helper 1.1077 +static ProfileEntry utb_get_profent(UnwinderThreadBuffer* buff, uintptr_t i) 1.1078 +{ 1.1079 + MOZ_ASSERT(i < buff->entsUsed); 1.1080 + if (i < N_FIXED_PROF_ENTS) { 1.1081 + return buff->entsFixed[i]; 1.1082 + } else { 1.1083 + uintptr_t j = i - N_FIXED_PROF_ENTS; 1.1084 + uintptr_t j_div = j / N_PROF_ENTS_PER_PAGE; /* page number */ 1.1085 + uintptr_t j_mod = j % N_PROF_ENTS_PER_PAGE; /* page offset */ 1.1086 + MOZ_ASSERT(buff->entsPages[j_div] != ProfEntsPage_INVALID); 1.1087 + return buff->entsPages[j_div]->ents[j_mod]; 1.1088 + } 1.1089 +} 1.1090 + 1.1091 +/* Copy ProfileEntries presented to us by the sampling thread. 1.1092 + Most of them are copied verbatim into |buff->aProfile|, 1.1093 + except for 'hint' tags, which direct us to do something 1.1094 + different. */ 1.1095 +static void process_buffer(UnwinderThreadBuffer* buff, int oldest_ix) 1.1096 +{ 1.1097 + /* Need to lock |aProfile| so nobody tries to copy out entries 1.1098 + whilst we are putting them in. */ 1.1099 + buff->aProfile->BeginUnwind(); 1.1100 + 1.1101 + /* The buff is a sequence of ProfileEntries (ents). It has 1.1102 + this grammar: 1.1103 + 1.1104 + | --pre-tags-- | (h 'P' .. h 'Q')* | --post-tags-- | 1.1105 + ^ ^ 1.1106 + ix_first_hP ix_last_hQ 1.1107 + 1.1108 + Each (h 'P' .. h 'Q') subsequence represents one pseudostack 1.1109 + entry. These, if present, are in the order 1.1110 + outermost-frame-first, and that is the order that they should 1.1111 + be copied into aProfile. The --pre-tags-- and --post-tags-- 1.1112 + are to be copied into the aProfile verbatim, except that they 1.1113 + may contain the hints "h 'F'" for a flush and "h 'N'" to 1.1114 + indicate that a native unwind is also required, and must be 1.1115 + interleaved with the pseudostack entries. 1.1116 + 1.1117 + The hint tags that bound each pseudostack entry, "h 'P'" and "h 1.1118 + 'Q'", are not to be copied into the aProfile -- they are 1.1119 + present only to make parsing easy here. Also, the pseudostack 1.1120 + entries may contain an "'S' (void*)" entry, which is the stack 1.1121 + pointer value for that entry, and these are also not to be 1.1122 + copied. 1.1123 + */ 1.1124 + /* The first thing to do is therefore to find the pseudostack 1.1125 + entries, if any, and to find out also whether a native unwind 1.1126 + has been requested. 
*/ 1.1127 + const uintptr_t infUW = ~(uintptr_t)0; // infinity 1.1128 + bool need_native_unw = false; 1.1129 + uintptr_t ix_first_hP = infUW; // "not found" 1.1130 + uintptr_t ix_last_hQ = infUW; // "not found" 1.1131 + 1.1132 + uintptr_t k; 1.1133 + for (k = 0; k < buff->entsUsed; k++) { 1.1134 + ProfileEntry ent = utb_get_profent(buff, k); 1.1135 + if (ent.is_ent_hint('N')) { 1.1136 + need_native_unw = true; 1.1137 + } 1.1138 + else if (ent.is_ent_hint('P') && ix_first_hP == ~(uintptr_t)0) { 1.1139 + ix_first_hP = k; 1.1140 + } 1.1141 + else if (ent.is_ent_hint('Q')) { 1.1142 + ix_last_hQ = k; 1.1143 + } 1.1144 + } 1.1145 + 1.1146 + if (0) LOGF("BPUnw: ix_first_hP %llu ix_last_hQ %llu need_native_unw %llu", 1.1147 + (unsigned long long int)ix_first_hP, 1.1148 + (unsigned long long int)ix_last_hQ, 1.1149 + (unsigned long long int)need_native_unw); 1.1150 + 1.1151 + /* There are four possibilities: native-only, pseudostack-only, 1.1152 + combined (both), and neither. We handle all four cases. */ 1.1153 + 1.1154 + MOZ_ASSERT( (ix_first_hP == infUW && ix_last_hQ == infUW) || 1.1155 + (ix_first_hP != infUW && ix_last_hQ != infUW) ); 1.1156 + bool have_P = ix_first_hP != infUW; 1.1157 + if (have_P) { 1.1158 + MOZ_ASSERT(ix_first_hP < ix_last_hQ); 1.1159 + MOZ_ASSERT(ix_last_hQ <= buff->entsUsed); 1.1160 + } 1.1161 + 1.1162 + /* Neither N nor P. This is very unusual but has been observed to happen. 1.1163 + Just copy to the output. */ 1.1164 + if (!need_native_unw && !have_P) { 1.1165 + for (k = 0; k < buff->entsUsed; k++) { 1.1166 + ProfileEntry ent = utb_get_profent(buff, k); 1.1167 + // action flush-hints 1.1168 + if (ent.is_ent_hint('F')) { buff->aProfile->flush(); continue; } 1.1169 + // skip ones we can't copy 1.1170 + if (ent.is_ent_hint() || ent.is_ent('S')) { continue; } 1.1171 + // handle GetBacktrace() 1.1172 + if (ent.is_ent('B')) { 1.1173 + UnwinderThreadBuffer* buff = (UnwinderThreadBuffer*)ent.get_tagPtr(); 1.1174 + process_buffer(buff, -1); 1.1175 + continue; 1.1176 + } 1.1177 + // and copy everything else 1.1178 + buff->aProfile->addTag( ent ); 1.1179 + } 1.1180 + } 1.1181 + else /* Native only-case. */ 1.1182 + if (need_native_unw && !have_P) { 1.1183 + for (k = 0; k < buff->entsUsed; k++) { 1.1184 + ProfileEntry ent = utb_get_profent(buff, k); 1.1185 + // action a native-unwind-now hint 1.1186 + if (ent.is_ent_hint('N')) { 1.1187 + MOZ_ASSERT(buff->haveNativeInfo); 1.1188 + PCandSP* pairs = nullptr; 1.1189 + unsigned int nPairs = 0; 1.1190 + do_lul_unwind_Buffer(&pairs, &nPairs, buff, oldest_ix); 1.1191 + buff->aProfile->addTag( ProfileEntry('s', "(root)") ); 1.1192 + for (unsigned int i = 0; i < nPairs; i++) { 1.1193 + /* Skip any outermost frames that 1.1194 + do_lul_unwind_Buffer didn't give us. See comments 1.1195 + on that function for details. 
*/ 1.1196 + if (pairs[i].pc == 0 && pairs[i].sp == 0) 1.1197 + continue; 1.1198 + buff->aProfile 1.1199 + ->addTag( ProfileEntry('l', reinterpret_cast<void*>(pairs[i].pc)) ); 1.1200 + } 1.1201 + if (pairs) 1.1202 + free(pairs); 1.1203 + continue; 1.1204 + } 1.1205 + // action flush-hints 1.1206 + if (ent.is_ent_hint('F')) { buff->aProfile->flush(); continue; } 1.1207 + // skip ones we can't copy 1.1208 + if (ent.is_ent_hint() || ent.is_ent('S')) { continue; } 1.1209 + // handle GetBacktrace() 1.1210 + if (ent.is_ent('B')) { 1.1211 + UnwinderThreadBuffer* buff = (UnwinderThreadBuffer*)ent.get_tagPtr(); 1.1212 + process_buffer(buff, -1); 1.1213 + continue; 1.1214 + } 1.1215 + // and copy everything else 1.1216 + buff->aProfile->addTag( ent ); 1.1217 + } 1.1218 + } 1.1219 + else /* Pseudostack-only case */ 1.1220 + if (!need_native_unw && have_P) { 1.1221 + /* If there's no request for a native stack, it's easy: just 1.1222 + copy the tags verbatim into aProfile, skipping the ones that 1.1223 + can't be copied -- 'h' (hint) tags, and "'S' (void*)" 1.1224 + stack-pointer tags. Except, insert a sample-start tag when 1.1225 + we see the start of the first pseudostack frame. */ 1.1226 + for (k = 0; k < buff->entsUsed; k++) { 1.1227 + ProfileEntry ent = utb_get_profent(buff, k); 1.1228 + // We need to insert a sample-start tag before the first frame 1.1229 + if (k == ix_first_hP) { 1.1230 + buff->aProfile->addTag( ProfileEntry('s', "(root)") ); 1.1231 + } 1.1232 + // action flush-hints 1.1233 + if (ent.is_ent_hint('F')) { buff->aProfile->flush(); continue; } 1.1234 + // skip ones we can't copy 1.1235 + if (ent.is_ent_hint() || ent.is_ent('S')) { continue; } 1.1236 + // handle GetBacktrace() 1.1237 + if (ent.is_ent('B')) { 1.1238 + UnwinderThreadBuffer* buff = (UnwinderThreadBuffer*)ent.get_tagPtr(); 1.1239 + process_buffer(buff, -1); 1.1240 + continue; 1.1241 + } 1.1242 + // and copy everything else 1.1243 + buff->aProfile->addTag( ent ); 1.1244 + } 1.1245 + } 1.1246 + else /* Combined case */ 1.1247 + if (need_native_unw && have_P) 1.1248 + { 1.1249 + /* We need to get a native stacktrace and merge it with the 1.1250 + pseudostack entries. This isn't too simple. First, copy all 1.1251 + the tags up to the start of the pseudostack tags. Then 1.1252 + generate a combined set of tags by native unwind and 1.1253 + pseudostack. Then, copy all the stuff after the pseudostack 1.1254 + tags. 
*/ 1.1255 + MOZ_ASSERT(buff->haveNativeInfo); 1.1256 + 1.1257 + // Get native unwind info 1.1258 + PCandSP* pairs = nullptr; 1.1259 + unsigned int n_pairs = 0; 1.1260 + do_lul_unwind_Buffer(&pairs, &n_pairs, buff, oldest_ix); 1.1261 + 1.1262 + // Entries before the pseudostack frames 1.1263 + for (k = 0; k < ix_first_hP; k++) { 1.1264 + ProfileEntry ent = utb_get_profent(buff, k); 1.1265 + // action flush-hints 1.1266 + if (ent.is_ent_hint('F')) { buff->aProfile->flush(); continue; } 1.1267 + // skip ones we can't copy 1.1268 + if (ent.is_ent_hint() || ent.is_ent('S')) { continue; } 1.1269 + // handle GetBacktrace() 1.1270 + if (ent.is_ent('B')) { 1.1271 + UnwinderThreadBuffer* buff = (UnwinderThreadBuffer*)ent.get_tagPtr(); 1.1272 + process_buffer(buff, -1); 1.1273 + continue; 1.1274 + } 1.1275 + // and copy everything else 1.1276 + buff->aProfile->addTag( ent ); 1.1277 + } 1.1278 + 1.1279 + // BEGIN merge 1.1280 + buff->aProfile->addTag( ProfileEntry('s', "(root)") ); 1.1281 + unsigned int next_N = 0; // index in pairs[] 1.1282 + unsigned int next_P = ix_first_hP; // index in buff profent array 1.1283 + bool last_was_P = false; 1.1284 + if (0) LOGF("at mergeloop: n_pairs %llu ix_last_hQ %llu", 1.1285 + (unsigned long long int)n_pairs, 1.1286 + (unsigned long long int)ix_last_hQ); 1.1287 + /* Skip any outermost frames that do_lul_unwind_Buffer 1.1288 + didn't give us. See comments on that function for 1.1289 + details. */ 1.1290 + while (next_N < n_pairs && pairs[next_N].pc == 0 && pairs[next_N].sp == 0) 1.1291 + next_N++; 1.1292 + 1.1293 + while (true) { 1.1294 + if (next_P <= ix_last_hQ) { 1.1295 + // Assert that next_P points at the start of an P entry 1.1296 + MOZ_ASSERT(utb_get_profent(buff, next_P).is_ent_hint('P')); 1.1297 + } 1.1298 + if (next_N >= n_pairs && next_P > ix_last_hQ) { 1.1299 + // both stacks empty 1.1300 + break; 1.1301 + } 1.1302 + /* Decide which entry to use next: 1.1303 + If N is empty, must use P, and vice versa 1.1304 + else 1.1305 + If the last was P and current P has zero SP, use P 1.1306 + else 1.1307 + we assume that both P and N have valid SP, in which case 1.1308 + use the one with the larger value 1.1309 + */ 1.1310 + bool use_P = true; 1.1311 + if (next_N >= n_pairs) { 1.1312 + // N empty, use P 1.1313 + use_P = true; 1.1314 + if (0) LOG(" P <= no remaining N entries"); 1.1315 + } 1.1316 + else if (next_P > ix_last_hQ) { 1.1317 + // P empty, use N 1.1318 + use_P = false; 1.1319 + if (0) LOG(" N <= no remaining P entries"); 1.1320 + } 1.1321 + else { 1.1322 + // We have at least one N and one P entry available. 1.1323 + // Scan forwards to find the SP of the current P entry 1.1324 + u_int64_t sp_cur_P = 0; 1.1325 + unsigned int m = next_P + 1; 1.1326 + while (1) { 1.1327 + /* This assertion should hold because in a well formed 1.1328 + input, we must eventually find the hint-Q that marks 1.1329 + the end of this frame's entries. */ 1.1330 + MOZ_ASSERT(m < buff->entsUsed); 1.1331 + ProfileEntry ent = utb_get_profent(buff, m); 1.1332 + if (ent.is_ent_hint('Q')) 1.1333 + break; 1.1334 + if (ent.is_ent('S')) { 1.1335 + sp_cur_P = reinterpret_cast<u_int64_t>(ent.get_tagPtr()); 1.1336 + break; 1.1337 + } 1.1338 + m++; 1.1339 + } 1.1340 + if (last_was_P && sp_cur_P == 0) { 1.1341 + if (0) LOG(" P <= last_was_P && sp_cur_P == 0"); 1.1342 + use_P = true; 1.1343 + } else { 1.1344 + u_int64_t sp_cur_N = pairs[next_N].sp; 1.1345 + use_P = (sp_cur_P > sp_cur_N); 1.1346 + if (0) LOGF(" %s <= sps P %p N %p", 1.1347 + use_P ? 
"P" : "N", (void*)(intptr_t)sp_cur_P, 1.1348 + (void*)(intptr_t)sp_cur_N); 1.1349 + } 1.1350 + } 1.1351 + /* So, we know which we are going to use. */ 1.1352 + if (use_P) { 1.1353 + unsigned int m = next_P + 1; 1.1354 + while (true) { 1.1355 + MOZ_ASSERT(m < buff->entsUsed); 1.1356 + ProfileEntry ent = utb_get_profent(buff, m); 1.1357 + if (ent.is_ent_hint('Q')) { 1.1358 + next_P = m + 1; 1.1359 + break; 1.1360 + } 1.1361 + // we don't expect a flush-hint here 1.1362 + MOZ_ASSERT(!ent.is_ent_hint('F')); 1.1363 + // skip ones we can't copy 1.1364 + if (ent.is_ent_hint() || ent.is_ent('S')) { m++; continue; } 1.1365 + // and copy everything else 1.1366 + buff->aProfile->addTag( ent ); 1.1367 + m++; 1.1368 + } 1.1369 + } else { 1.1370 + buff->aProfile 1.1371 + ->addTag( ProfileEntry('l', reinterpret_cast<void*>(pairs[next_N].pc)) ); 1.1372 + next_N++; 1.1373 + } 1.1374 + /* Remember what we chose, for next time. */ 1.1375 + last_was_P = use_P; 1.1376 + } 1.1377 + 1.1378 + MOZ_ASSERT(next_P == ix_last_hQ + 1); 1.1379 + MOZ_ASSERT(next_N == n_pairs); 1.1380 + // END merge 1.1381 + 1.1382 + // Entries after the pseudostack frames 1.1383 + for (k = ix_last_hQ+1; k < buff->entsUsed; k++) { 1.1384 + ProfileEntry ent = utb_get_profent(buff, k); 1.1385 + // action flush-hints 1.1386 + if (ent.is_ent_hint('F')) { buff->aProfile->flush(); continue; } 1.1387 + // skip ones we can't copy 1.1388 + if (ent.is_ent_hint() || ent.is_ent('S')) { continue; } 1.1389 + // and copy everything else 1.1390 + buff->aProfile->addTag( ent ); 1.1391 + } 1.1392 + 1.1393 + // free native unwind info 1.1394 + if (pairs) 1.1395 + free(pairs); 1.1396 + } 1.1397 + 1.1398 +#if 0 1.1399 + bool show = true; 1.1400 + if (show) LOG("----------------"); 1.1401 + for (k = 0; k < buff->entsUsed; k++) { 1.1402 + ProfileEntry ent = utb_get_profent(buff, k); 1.1403 + if (show) ent.log(); 1.1404 + if (ent.is_ent_hint('F')) { 1.1405 + /* This is a flush-hint */ 1.1406 + buff->aProfile->flush(); 1.1407 + } 1.1408 + else if (ent.is_ent_hint('N')) { 1.1409 + /* This is a do-a-native-unwind-right-now hint */ 1.1410 + MOZ_ASSERT(buff->haveNativeInfo); 1.1411 + PCandSP* pairs = nullptr; 1.1412 + unsigned int nPairs = 0; 1.1413 + do_lul_unwind_Buffer(&pairs, &nPairs, buff, oldest_ix); 1.1414 + buff->aProfile->addTag( ProfileEntry('s', "(root)") ); 1.1415 + for (unsigned int i = 0; i < nPairs; i++) { 1.1416 + buff->aProfile 1.1417 + ->addTag( ProfileEntry('l', reinterpret_cast<void*>(pairs[i].pc)) ); 1.1418 + } 1.1419 + if (pairs) 1.1420 + free(pairs); 1.1421 + } else { 1.1422 + /* Copy in verbatim */ 1.1423 + buff->aProfile->addTag( ent ); 1.1424 + } 1.1425 + } 1.1426 +#endif 1.1427 + 1.1428 + buff->aProfile->EndUnwind(); 1.1429 +} 1.1430 + 1.1431 + 1.1432 +// Find out, in a platform-dependent way, where the code modules got 1.1433 +// mapped in the process' virtual address space, and get |aLUL| to 1.1434 +// load unwind info for them. 1.1435 +void 1.1436 +read_procmaps(lul::LUL* aLUL) 1.1437 +{ 1.1438 + MOZ_ASSERT(aLUL->CountMappings() == 0); 1.1439 + 1.1440 +# if defined(SPS_OS_linux) || defined(SPS_OS_android) || defined(SPS_OS_darwin) 1.1441 + SharedLibraryInfo info = SharedLibraryInfo::GetInfoForSelf(); 1.1442 + 1.1443 + for (size_t i = 0; i < info.GetSize(); i++) { 1.1444 + const SharedLibrary& lib = info.GetEntry(i); 1.1445 + 1.1446 +#if defined(SPS_OS_android) && !defined(MOZ_WIDGET_GONK) 1.1447 + // We're using faulty.lib. Use a special-case object mapper. 
1.1448 + AutoObjectMapperFaultyLib mapper(aLUL->mLog); 1.1449 +#else 1.1450 + // We can use the standard POSIX-based mapper. 1.1451 + AutoObjectMapperPOSIX mapper(aLUL->mLog); 1.1452 +#endif 1.1453 + 1.1454 + // Ask |mapper| to map the object. Then hand its mapped address 1.1455 + // to NotifyAfterMap(). 1.1456 + void* image = nullptr; 1.1457 + size_t size = 0; 1.1458 + bool ok = mapper.Map(&image, &size, lib.GetName()); 1.1459 + if (ok && image && size > 0) { 1.1460 + aLUL->NotifyAfterMap(lib.GetStart(), lib.GetEnd()-lib.GetStart(), 1.1461 + lib.GetName().c_str(), image); 1.1462 + } else if (!ok && lib.GetName() == "") { 1.1463 + // The object has no name and (as a consequence) the mapper 1.1464 + // failed to map it. This happens on Linux, where 1.1465 + // GetInfoForSelf() produces two such mappings: one for the 1.1466 + // executable and one for the VDSO. The executable one isn't a 1.1467 + // big deal since there's not much interesting code in there, 1.1468 + // but the VDSO one is a problem on x86-{linux,android} because 1.1469 + // lack of knowledge about the mapped area inhibits LUL's 1.1470 + // special __kernel_syscall handling. Hence notify |aLUL| at 1.1471 + // least of the mapping, even though it can't read any unwind 1.1472 + // information for the area. 1.1473 + aLUL->NotifyExecutableArea(lib.GetStart(), lib.GetEnd()-lib.GetStart()); 1.1474 + } 1.1475 + 1.1476 + // |mapper| goes out of scope at this point and so its destructor 1.1477 + // unmaps the object. 1.1478 + } 1.1479 + 1.1480 +# else 1.1481 +# error "Unknown platform" 1.1482 +# endif 1.1483 +} 1.1484 + 1.1485 +// LUL needs a callback for its logging sink. 1.1486 +static void 1.1487 +logging_sink_for_LUL(const char* str) { 1.1488 + // Ignore any trailing \n, since LOG will add one anyway. 1.1489 + size_t n = strlen(str); 1.1490 + if (n > 0 && str[n-1] == '\n') { 1.1491 + char* tmp = strdup(str); 1.1492 + tmp[n-1] = 0; 1.1493 + LOG(tmp); 1.1494 + free(tmp); 1.1495 + } else { 1.1496 + LOG(str); 1.1497 + } 1.1498 +} 1.1499 + 1.1500 +// Runs in the unwinder thread -- well, this _is_ the unwinder thread. 1.1501 +static void* unwind_thr_fn(void* exit_nowV) 1.1502 +{ 1.1503 + // This is the unwinder thread function. The first thread in must 1.1504 + // create the unwinder library and request it to read the debug 1.1505 + // info. The last thread out must deallocate the library. These 1.1506 + // three tasks (create library, read debuginfo, destroy library) are 1.1507 + // sequentialised by |sLULmutex|. |sLUL| and |sLULcount| may only 1.1508 + // be modified whilst |sLULmutex| is held. 1.1509 + // 1.1510 + // Once the threads are up and running, |sLUL| (the pointer itself, 1.1511 + // that is) stays constant, and the multiple threads may make 1.1512 + // concurrent calls into |sLUL| to do concurrent unwinding. 1.1513 + LOG("unwind_thr_fn: START"); 1.1514 + 1.1515 + // A hook for testing LUL: at the first entrance here, check env var 1.1516 + // MOZ_PROFILER_LUL_TEST, and if set, run tests on LUL. Note that 1.1517 + // it is preferable to run the LUL tests via gtest, but gtest is not 1.1518 + // currently supported on all targets that LUL runs on. Hence the 1.1519 + // auxiliary mechanism here is also needed. 1.1520 + bool doLulTest = false; 1.1521 + 1.1522 + mozilla::DebugOnly<int> r = pthread_mutex_lock(&sLULmutex); 1.1523 + MOZ_ASSERT(!r); 1.1524 + 1.1525 + if (!sLUL) { 1.1526 + // sLUL hasn't been allocated, so we must be the first thread in. 
1.1527 + sLUL = new lul::LUL(logging_sink_for_LUL); 1.1528 + MOZ_ASSERT(sLUL); 1.1529 + MOZ_ASSERT(sLULcount == 0); 1.1530 + // Register this thread so it can read unwind info and do unwinding. 1.1531 + sLUL->RegisterUnwinderThread(); 1.1532 + // Read all the unwind info currently available. 1.1533 + read_procmaps(sLUL); 1.1534 + // Has a test been requested? 1.1535 + if (PR_GetEnv("MOZ_PROFILER_LUL_TEST")) { 1.1536 + doLulTest = true; 1.1537 + } 1.1538 + } else { 1.1539 + // sLUL has already been allocated, so we can't be the first 1.1540 + // thread in. 1.1541 + MOZ_ASSERT(sLULcount > 0); 1.1542 + // Register this thread so it can do unwinding. 1.1543 + sLUL->RegisterUnwinderThread(); 1.1544 + } 1.1545 + 1.1546 + sLULcount++; 1.1547 + 1.1548 + r = pthread_mutex_unlock(&sLULmutex); 1.1549 + MOZ_ASSERT(!r); 1.1550 + 1.1551 + // If a test has been requested for LUL, run it. Summary results 1.1552 + // are sent to sLUL's logging sink. Note that this happens after 1.1553 + // read_procmaps has read unwind information into sLUL, so that the 1.1554 + // tests have something to unwind against. Without that they'd be 1.1555 + // pretty meaningless. 1.1556 + if (doLulTest) { 1.1557 + int nTests = 0, nTestsPassed = 0; 1.1558 + RunLulUnitTests(&nTests, &nTestsPassed, sLUL); 1.1559 + } 1.1560 + 1.1561 + // At this point, sLUL -- the single instance of the library -- is 1.1562 + // allocated and has read the required unwind info. All running 1.1563 + // threads can now make Unwind() requests of it concurrently, if 1.1564 + // they wish. 1.1565 + 1.1566 + // Now go on to allocate the array of buffers used for communication 1.1567 + // between the sampling threads and the unwinder threads. 1.1568 + 1.1569 + // If we're the first thread in, we'll need to allocate the buffer 1.1570 + // array g_buffers plus the Buffer structs that it points at. 1.1571 + spinLock_acquire(&g_spinLock); 1.1572 + if (g_buffers == nullptr) { 1.1573 + // Drop the lock, build the complete array in memory, reacquire the 1.1574 + // lock, and try to install it -- which might fail if someone 1.1575 + // else beat us to it. 1.1576 + spinLock_release(&g_spinLock); 1.1577 + UnwinderThreadBuffer** buffers 1.1578 + = (UnwinderThreadBuffer**)malloc(N_UNW_THR_BUFFERS 1.1579 + * sizeof(UnwinderThreadBuffer*)); 1.1580 + MOZ_ASSERT(buffers); 1.1581 + int i; 1.1582 + for (i = 0; i < N_UNW_THR_BUFFERS; i++) { 1.1583 + /* These calloc-ations are shared between the sampling and 1.1584 + unwinding threads. They must be freed after all such threads 1.1585 + have terminated. */ 1.1586 + buffers[i] = (UnwinderThreadBuffer*) 1.1587 + calloc(sizeof(UnwinderThreadBuffer), 1); 1.1588 + MOZ_ASSERT(buffers[i]); 1.1589 + buffers[i]->state = S_EMPTY; 1.1590 + } 1.1591 + /* Try to install it */ 1.1592 + spinLock_acquire(&g_spinLock); 1.1593 + if (g_buffers == nullptr) { 1.1594 + g_buffers = buffers; 1.1595 + spinLock_release(&g_spinLock); 1.1596 + } else { 1.1597 + /* Someone else beat us to it. Release what we just allocated 1.1598 + so as to avoid a leak. */ 1.1599 + spinLock_release(&g_spinLock); 1.1600 + for (i = 0; i < N_UNW_THR_BUFFERS; i++) { 1.1601 + free(buffers[i]); 1.1602 + } 1.1603 + free(buffers); 1.1604 + } 1.1605 + } else { 1.1606 + /* They are already allocated, so just drop the lock and continue.
1.1610 +  /*
1.1611 +  while (1) {
1.1612 +    acq lock
1.1613 +    scan to find oldest full
1.1614 +    if none { rel lock; sleep; continue }
1.1615 +    set buff state to emptying
1.1616 +    rel lock
1.1617 +    acq MLock // implicitly
1.1618 +    process buffer
1.1619 +    rel MLock // implicitly
1.1620 +    acq lock
1.1621 +    set buff state to S_EMPTY
1.1622 +    rel lock
1.1623 +  }
1.1624 +  */
1.1625 +  int* exit_now = (int*)exit_nowV;
1.1626 +  int ms_to_sleep_if_empty = 1;
1.1627 +
1.1628 +  const int longest_sleep_ms = 1000;
1.1629 +  bool show_sleep_message = true;
1.1630 +
1.1631 +  while (1) {
1.1632 +
1.1633 +    if (*exit_now != 0) {
1.1634 +      *exit_now = 0;
1.1635 +      break;
1.1636 +    }
1.1637 +
1.1638 +    spinLock_acquire(&g_spinLock);
1.1639 +
1.1640 +    /* Find the oldest filled buffer, if any. */
1.1641 +    uint64_t oldest_seqNo = ~0ULL; /* infinity */
1.1642 +    int oldest_ix = -1;
1.1643 +    int i;
1.1644 +    for (i = 0; i < N_UNW_THR_BUFFERS; i++) {
1.1645 +      UnwinderThreadBuffer* buff = g_buffers[i];
1.1646 +      if (buff->state != S_FULL) continue;
1.1647 +      if (buff->seqNo < oldest_seqNo) {
1.1648 +        oldest_seqNo = buff->seqNo;
1.1649 +        oldest_ix = i;
1.1650 +      }
1.1651 +    }
1.1652 +    if (oldest_ix == -1) {
1.1653 +      /* We didn't find a full buffer. Snooze and try again later. */
1.1654 +      MOZ_ASSERT(oldest_seqNo == ~0ULL);
1.1655 +      spinLock_release(&g_spinLock);
1.1656 +      if (ms_to_sleep_if_empty > 100 && LOGLEVEL >= 2) {
1.1657 +        if (show_sleep_message)
1.1658 +          LOGF("BPUnw: unwinder: sleep for %d ms", ms_to_sleep_if_empty);
1.1659 +        /* If we've already shown the message for the longest sleep,
1.1660 +           don't show it again until the next round of sleeping
1.1661 +           starts. */
1.1662 +        if (ms_to_sleep_if_empty == longest_sleep_ms)
1.1663 +          show_sleep_message = false;
1.1664 +      }
1.1665 +      sleep_ms(ms_to_sleep_if_empty);
1.1666 +      if (ms_to_sleep_if_empty < 20) {
1.1667 +        ms_to_sleep_if_empty += 2;
1.1668 +      } else {
1.1669 +        ms_to_sleep_if_empty = (15 * ms_to_sleep_if_empty) / 10;
1.1670 +        if (ms_to_sleep_if_empty > longest_sleep_ms)
1.1671 +          ms_to_sleep_if_empty = longest_sleep_ms;
1.1672 +      }
1.1673 +      continue;
1.1674 +    }
1.1675 +
1.1676 +    /* We found a full buffer. Mark it as 'ours' and drop the
1.1677 +       lock; then we can safely hand it to LUL for unwinding. */
1.1678 +    UnwinderThreadBuffer* buff = g_buffers[oldest_ix];
1.1679 +    MOZ_ASSERT(buff->state == S_FULL);
1.1680 +    buff->state = S_EMPTYING;
1.1681 +    spinLock_release(&g_spinLock);
1.1682 +
1.1683 +    /* unwind .. in which we can do anything we like, since any
1.1684 +       resource stalls that we may encounter (e.g. malloc locks) in
1.1685 +       competition with signal handler instances will be short-lived,
1.1686 +       since the signal handler is guaranteed nonblocking. */
1.1687 +    if (0) LOGF("BPUnw: unwinder: seqNo %llu: emptying buf %d\n",
1.1688 +                (unsigned long long int)oldest_seqNo, oldest_ix);
1.1689 +
1.1690 +    process_buffer(buff, oldest_ix);
1.1691 +
1.1692 +    /* And .. we're done. Mark the buffer as empty so it can be
1.1693 +       reused. First though, unmap any of the entsPages that got
1.1694 +       mapped during filling. */
1.1695 +    for (i = 0; i < N_PROF_ENT_PAGES; i++) {
1.1696 +      if (buff->entsPages[i] == ProfEntsPage_INVALID)
1.1697 +        continue;
1.1698 +      munmap_ProfEntsPage(buff->entsPages[i]);
1.1699 +      buff->entsPages[i] = ProfEntsPage_INVALID;
1.1700 +    }
1.1701 +
1.1702 +    (void)VALGRIND_MAKE_MEM_UNDEFINED(&buff->stackImg.mContents[0],
1.1703 +                                      lul::N_STACK_BYTES);
1.1704 +    spinLock_acquire(&g_spinLock);
1.1705 +    MOZ_ASSERT(buff->state == S_EMPTYING);
1.1706 +    buff->state = S_EMPTY;
1.1707 +    spinLock_release(&g_spinLock);
1.1708 +    ms_to_sleep_if_empty = 1;
1.1709 +    show_sleep_message = true;
1.1710 +  }
1.1711 +
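When no full buffer is found, the loop above backs off before polling again: the sleep starts at 1 ms, grows by 2 ms per idle pass until it reaches about 20 ms, then grows by roughly 1.5x per pass and is capped at 1000 ms; it drops back to 1 ms as soon as a buffer is processed. Below is a small stand-alone sketch of that schedule only; the names (backoff_next, kLongestSleepMs) are hypothetical and not part of the profiler.

#include <cstdio>

// Illustrative only: reproduces the sleep schedule of the idle unwinder
// loop above (start at 1 ms, +2 ms while below 20 ms, then *1.5, capped).
static const int kLongestSleepMs = 1000;

static int backoff_next(int currentMs)
{
  if (currentMs < 20) {
    return currentMs + 2;
  }
  int next = (15 * currentMs) / 10;          // grow by roughly 1.5x
  return next > kLongestSleepMs ? kLongestSleepMs : next;
}

int main()
{
  // Print the sequence of sleeps an idle unwinder thread would perform.
  for (int ms = 1; ; ms = backoff_next(ms)) {
    printf("sleep %d ms\n", ms);
    if (ms == kLongestSleepMs) break;        // schedule has reached its cap
  }
  return 0;
}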
1.1712 +  // This unwinder thread is exiting. If it's the last one out,
1.1713 +  // shut down and deallocate the unwinder library.
1.1714 +  r = pthread_mutex_lock(&sLULmutex);
1.1715 +  MOZ_ASSERT(!r);
1.1716 +
1.1717 +  MOZ_ASSERT(sLULcount > 0);
1.1718 +  if (sLULcount == 1) {
1.1719 +    // Tell the library to discard unwind info for the entire address
1.1720 +    // space.
1.1721 +    sLUL->NotifyBeforeUnmapAll();
1.1722 +
1.1723 +    delete sLUL;
1.1724 +    sLUL = nullptr;
1.1725 +  }
1.1726 +
1.1727 +  sLULcount--;
1.1728 +
1.1729 +  r = pthread_mutex_unlock(&sLULmutex);
1.1730 +  MOZ_ASSERT(!r);
1.1731 +
1.1732 +  LOG("unwind_thr_fn: STOP");
1.1733 +  return nullptr;
1.1734 +}
1.1735 +
1.1736 +static void finish_sync_buffer(ThreadProfile* profile,
1.1737 +                               UnwinderThreadBuffer* buff,
1.1738 +                               void* /* ucontext_t*, really */ ucV)
1.1739 +{
1.1740 +  SyncProfile* syncProfile = profile->AsSyncProfile();
1.1741 +  MOZ_ASSERT(syncProfile);
1.1742 +  SyncUnwinderThreadBuffer* utb = static_cast<SyncUnwinderThreadBuffer*>(
1.1743 +    syncProfile->GetUWTBuffer());
1.1744 +  fill_buffer(profile, utb->GetBuffer(), ucV);
1.1745 +  utb->GetBuffer()->state = S_FULL;
1.1746 +  PseudoStack* stack = profile->GetPseudoStack();
1.1747 +  stack->addLinkedUWTBuffer(utb);
1.1748 +}
1.1749 +
1.1750 +static void release_sync_buffer(LinkedUWTBuffer* buff)
1.1751 +{
1.1752 +  SyncUnwinderThreadBuffer* data = static_cast<SyncUnwinderThreadBuffer*>(buff);
1.1753 +  MOZ_ASSERT(data->GetBuffer()->state == S_EMPTY);
1.1754 +  delete data;
1.1755 +}
1.1756 +
1.1757 +////////////////////////////////////////////////////////////////
1.1758 +////////////////////////////////////////////////////////////////
1.1759 +////////////////////////////////////////////////////////////////
1.1760 +////////////////////////////////////////////////////////////////
1.1761 +////////////////////////////////////////////////////////////////
1.1762 +////////////////////////////////////////////////////////////////
1.1763 +
1.1764 +// Keeps count of how frames are recovered -- from the CPU context, from
1.1765 +// CFI data, or by stack scanning -- which is useful for diagnostic purposes.
1.1766 +static void stats_notify_frame(int n_context, int n_cfi, int n_scanned)
1.1767 +{
1.1768 +  // Gather stats in intervals.
1.1769 + static unsigned int nf_total = 0; // total frames since last printout 1.1770 + static unsigned int nf_CONTEXT = 0; 1.1771 + static unsigned int nf_CFI = 0; 1.1772 + static unsigned int nf_SCANNED = 0; 1.1773 + 1.1774 + nf_CONTEXT += n_context; 1.1775 + nf_CFI += n_cfi; 1.1776 + nf_SCANNED += n_scanned; 1.1777 + nf_total += (n_context + n_cfi + n_scanned); 1.1778 + 1.1779 + if (nf_total >= 5000) { 1.1780 + LOGF("BPUnw frame stats: TOTAL %5u" 1.1781 + " CTX %4u CFI %4u SCAN %4u", 1.1782 + nf_total, nf_CONTEXT, nf_CFI, nf_SCANNED); 1.1783 + nf_total = 0; 1.1784 + nf_CONTEXT = 0; 1.1785 + nf_CFI = 0; 1.1786 + nf_SCANNED = 0; 1.1787 + } 1.1788 +} 1.1789 + 1.1790 +static 1.1791 +void do_lul_unwind_Buffer(/*OUT*/PCandSP** pairs, 1.1792 + /*OUT*/unsigned int* nPairs, 1.1793 + UnwinderThreadBuffer* buff, 1.1794 + int buffNo /* for debug printing only */) 1.1795 +{ 1.1796 +# if defined(SPS_ARCH_amd64) || defined(SPS_ARCH_x86) 1.1797 + lul::UnwindRegs startRegs = buff->startRegs; 1.1798 + if (0) { 1.1799 + LOGF("Initial RIP = 0x%llx", (unsigned long long int)startRegs.xip.Value()); 1.1800 + LOGF("Initial RSP = 0x%llx", (unsigned long long int)startRegs.xsp.Value()); 1.1801 + LOGF("Initial RBP = 0x%llx", (unsigned long long int)startRegs.xbp.Value()); 1.1802 + } 1.1803 + 1.1804 +# elif defined(SPS_ARCH_arm) 1.1805 + lul::UnwindRegs startRegs = buff->startRegs; 1.1806 + if (0) { 1.1807 + LOGF("Initial R15 = 0x%llx", (unsigned long long int)startRegs.r15.Value()); 1.1808 + LOGF("Initial R13 = 0x%llx", (unsigned long long int)startRegs.r13.Value()); 1.1809 + } 1.1810 + 1.1811 +# else 1.1812 +# error "Unknown plat" 1.1813 +# endif 1.1814 + 1.1815 + // FIXME: should we reinstate the ability to use separate debug objects? 1.1816 + // /* Make up a list of places where the debug objects might be. */ 1.1817 + // std::vector<std::string> debug_dirs; 1.1818 +# if defined(SPS_OS_linux) 1.1819 + // debug_dirs.push_back("/usr/lib/debug/lib"); 1.1820 + // debug_dirs.push_back("/usr/lib/debug/usr/lib"); 1.1821 + // debug_dirs.push_back("/usr/lib/debug/lib/x86_64-linux-gnu"); 1.1822 + // debug_dirs.push_back("/usr/lib/debug/usr/lib/x86_64-linux-gnu"); 1.1823 +# elif defined(SPS_OS_android) 1.1824 + // debug_dirs.push_back("/sdcard/symbols/system/lib"); 1.1825 + // debug_dirs.push_back("/sdcard/symbols/system/bin"); 1.1826 +# elif defined(SPS_OS_darwin) 1.1827 + // /* Nothing */ 1.1828 +# else 1.1829 +# error "Unknown plat" 1.1830 +# endif 1.1831 + 1.1832 + // Set the max number of scanned or otherwise dubious frames 1.1833 + // to the user specified limit 1.1834 + size_t scannedFramesAllowed 1.1835 + = std::min(std::max(0, sUnwindStackScan), MAX_NATIVE_FRAMES); 1.1836 + 1.1837 + // The max number of frames is MAX_NATIVE_FRAMES, so as to avoid 1.1838 + // the unwinder wasting a lot of time looping on corrupted stacks. 1.1839 + uintptr_t framePCs[MAX_NATIVE_FRAMES]; 1.1840 + uintptr_t frameSPs[MAX_NATIVE_FRAMES]; 1.1841 + size_t framesAvail = mozilla::ArrayLength(framePCs); 1.1842 + size_t framesUsed = 0; 1.1843 + size_t scannedFramesAcquired = 0; 1.1844 + sLUL->Unwind( &framePCs[0], &frameSPs[0], 1.1845 + &framesUsed, &scannedFramesAcquired, 1.1846 + framesAvail, scannedFramesAllowed, 1.1847 + &startRegs, &buff->stackImg ); 1.1848 + 1.1849 + if (LOGLEVEL >= 2) 1.1850 + stats_notify_frame(/* context */ 1, 1.1851 + /* cfi */ framesUsed - 1 - scannedFramesAcquired, 1.1852 + /* scanned */ scannedFramesAcquired); 1.1853 + 1.1854 + // PC values are now in framePCs[0 .. 
framesUsed-1], with [0] being 1.1855 + // the innermost frame. SP values are likewise in frameSPs[]. 1.1856 + *pairs = (PCandSP*)calloc(framesUsed, sizeof(PCandSP)); 1.1857 + *nPairs = framesUsed; 1.1858 + if (*pairs == nullptr) { 1.1859 + *nPairs = 0; 1.1860 + return; 1.1861 + } 1.1862 + 1.1863 + if (framesUsed > 0) { 1.1864 + for (unsigned int frame_index = 0; 1.1865 + frame_index < framesUsed; ++frame_index) { 1.1866 + (*pairs)[framesUsed-1-frame_index].pc = framePCs[frame_index]; 1.1867 + (*pairs)[framesUsed-1-frame_index].sp = frameSPs[frame_index]; 1.1868 + } 1.1869 + } 1.1870 + 1.1871 + if (LOGLEVEL >= 3) { 1.1872 + LOGF("BPUnw: unwinder: seqNo %llu, buf %d: got %u frames", 1.1873 + (unsigned long long int)buff->seqNo, buffNo, 1.1874 + (unsigned int)framesUsed); 1.1875 + } 1.1876 + 1.1877 + if (LOGLEVEL >= 2) { 1.1878 + if (0 == (g_stats_totalSamples % 1000)) 1.1879 + LOGF("BPUnw: %llu total samples, %llu failed (buffer unavail), " 1.1880 + "%llu failed (thread unreg'd), ", 1.1881 + (unsigned long long int)g_stats_totalSamples, 1.1882 + (unsigned long long int)g_stats_noBuffAvail, 1.1883 + (unsigned long long int)g_stats_thrUnregd); 1.1884 + } 1.1885 +} 1.1886 + 1.1887 +#endif /* defined(SPS_OS_windows) */
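One detail worth spelling out from do_lul_unwind_Buffer above: sLUL->Unwind() reports frames innermost-first, while the PCandSP array handed back to the caller is filled outermost-first, hence the reversed indexing. The sketch below shows just that reversal in isolation; the names (PcSp, reverse_frames) are stand-ins for illustration, not the profiler's own types.

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Stand-in for the profiler's PCandSP struct.
struct PcSp { uintptr_t pc; uintptr_t sp; };

// Convert innermost-first PC/SP arrays (as produced by the unwinder) into
// an outermost-first array of pairs, mirroring the loop above.
static PcSp* reverse_frames(const uintptr_t* pcs, const uintptr_t* sps,
                            size_t nUsed, size_t* nOut)
{
  PcSp* pairs = (PcSp*)calloc(nUsed, sizeof(PcSp));
  if (!pairs) { *nOut = 0; return nullptr; }
  *nOut = nUsed;
  for (size_t i = 0; i < nUsed; i++) {
    pairs[nUsed - 1 - i].pc = pcs[i];   // frame 0 (innermost) goes last
    pairs[nUsed - 1 - i].sp = sps[i];
  }
  return pairs;
}

int main()
{
  uintptr_t pcs[] = { 0x111, 0x222, 0x333 };  // innermost first
  uintptr_t sps[] = { 0xa00, 0xb00, 0xc00 };
  size_t n = 0;
  PcSp* pairs = reverse_frames(pcs, sps, 3, &n);
  for (size_t i = 0; i < n; i++)
    printf("frame %zu: pc=0x%" PRIxPTR " sp=0x%" PRIxPTR "\n",
           i, pairs[i].pc, pairs[i].sp);
  free(pairs);
  return 0;
}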