//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

// Serializes all accesses to the collector's global state (the thread-data
// array, the profile buffers, and their backing storage below).
SpinMutex GlobalMutex;

// Pairs a thread id with raw storage for that thread's FunctionCallTrie.
// NOTE(review): this type is not referenced in the visible code below —
// presumably retained for ABI/layout or historical reasons; confirm before
// removing.
struct ThreadTrie {
  tid_t TId;
  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};

// One serialized per-thread profile block: a heap buffer (owned by the
// collector, released in serialize()/reset()) and its size in bytes.
struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files 'xrayprof' in hex.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

// On-disk/file-stream header handed out verbatim as the first buffer by
// nextBuffer(); Timestamp and PID are filled in at read time.
struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};

// Header prepended to each per-thread block produced by serialize():
// total block payload size, the block's ordinal, and the owning thread.
// nextBuffer() reads BlockNum back out of emitted blocks to find the next one.
struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};

// Everything posted by a dying/flushing thread: the queue its buffers came
// from (needed to release them later), the trie's buffers and allocators,
// the trie itself, and the thread id.
struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue for the backing store for the allocator used
// by the ThreadData array. This lets us host the buffers, allocators, and tries
// associated with a thread by moving the data into the array instead of
// attempting to copy the data to a separately backed set of tries.
//
// All of the aligned_storage globals below exist so objects can be
// placement-new'ed into them in reset() without dynamic initialization at
// program startup; the corresponding pointers are null until reset() runs.
static typename std::aligned_storage<
    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
static typename std::aligned_storage<sizeof(ThreadDataAllocator),
                                     alignof(ThreadDataAllocator)>::type
    ThreadDataAllocatorStorage;
static typename std::aligned_storage<sizeof(ThreadDataArray),
                                     alignof(ThreadDataArray)>::type
    ThreadDataArrayStorage;

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
    ProfileBuffersStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
    ProfileBufferArrayAllocatorStorage;

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;

// Use a global flag to determine whether the collector implementation has been
// initialized. Set to 1 at the end of reset() (release) and checked with
// acquire loads in post() and serialize().
static atomic_uint8_t CollectorInitialized{0};

} // namespace

// Takes ownership of a thread's FunctionCallTrie (and the allocators/buffers
// backing it) by moving everything into the global TDArray under GlobalMutex.
//
// Q   - the BufferQueue the thread's buffers in B were obtained from; used to
//       release them if ownership cannot be transferred.
// T/A/B - the trie, its allocators, and the raw buffers, passed by rvalue
//       reference; on the success path they are moved into TDArray, otherwise
//       they are destroyed/released here.
// TId - the posting thread's id, recorded alongside the data.
void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    // Nothing can take ownership, so destroy the trie and allocators and give
    // the raw buffers back to the queue they came from. The destruction order
    // (trie, allocators, then buffers) mirrors the failure path below.
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the objects
      // handed us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}

// A PathArray represents the function id's representing a stack trace. In this
// context a path is almost always represented from the leaf function in a call
// stack to a root of the call trie.
using PathArray = Array<int32_t>;

// A flattened view of one node in a FunctionCallTrie: the leaf-to-root
// function-id path and a pointer to the trie node carrying the counters.
struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the function id's from the leaf to the root of
  // the function call stack as represented from a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Walk a depth-first traversal of each root of the FunctionCallTrie to generate
// the path(s) and the data associated with the path. Appends one ProfileRecord
// per visited node into PRs; path storage is drawn from PA. Returns early
// (silently truncating the output) if a record cannot be appended.
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  // Explicit DFS stack instead of recursion; its capacity is bounded by the
  // stack_allocator_max flag.
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

// Writes Header followed by every record into Buffer->Data, which the caller
// must have sized exactly (see the size accounting in serialize()). Each
// record is laid out as: path FIds (4 bytes each), a zero sentinel FId
// (4 bytes), CallCount (8 bytes), CumulativeLocalTime (8 bytes).
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  // Each memcpy/memset returns the destination, so advancing by the written
  // size keeps NextPtr at the current write cursor.
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // List of IDs follow:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  // The cursor must land exactly at the end of the pre-sized buffer.
  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace

// Converts every posted per-thread trie in TDArray into a serialized block in
// ProfileBuffers (one block per thread), discarding any previously serialized
// blocks first. No-op if the collector is uninitialized; silently returns if
// the scratch arenas cannot be allocated.
void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  // Two temporary arenas, both freed at scope exit: one backs the record
  // array, the other backs the per-record path arrays.
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    // Allocators are recreated per thread so each iteration reuses the arenas
    // from the start rather than accumulating across threads.
    ProfileRecordAllocator PRAlloc(ProfileArena,
                                   profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        PathArena, profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We'll
    // use a local allocator and an __xray::Array<...> to store the intermediary
    // data, then compute the size as we're going along. Then we'll allocate the
    // contiguous space to contain the thread buffer data.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record, to compute the sizes.
    //
    // header size = block size (4 bytes)
    //   + block number (4 bytes)
    //   + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
    //   + call count (8 bytes)
    //   + local time (8 bytes)
    // i.e. 20 fixed bytes per record plus 4 bytes per path id, which is
    // exactly what serializeRecords() writes (there is no separate
    // end-of-record marker).
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    // BlockSize = sizeof(BlockHeader) (16) + payload; BlockNum is the
    // per-call ordinal that nextBuffer() uses to chain blocks.
    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

// Tears down all collector state and re-initializes it in place: frees
// serialized buffers, returns thread buffers to their queues, and
// placement-news fresh allocator/array/queue objects into the static storage.
// CollectorInitialized is dropped to 0 for the duration and restored to 1 only
// if every (re)initialization step succeeds.
void reset() XRAY_NEVER_INSTRUMENT {
  // Publish "uninitialized" before taking the lock so concurrent post()/
  // serialize() calls bail out instead of racing with the teardown.
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because we've already
    // potentially freed the backing store for the array. Instead we're going to
    // reset the pointer to nullptr, and re-use the storage later instead
    // (placement-new'ing into the storage as-is).
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    // First-ever reset: construct the queue into its static storage.
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    // Subsequent resets: finalize the existing queue and re-init it in place.
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  // Grab the single buffer that backs the ThreadData allocator below.
  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  // Only now is the collector usable again.
  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

// Iterator protocol over the serialized profile: called with {nullptr, 0} it
// yields the file header; called with the header it yields block 0; called
// with block N (identified by the BlockNum embedded in its BlockHeader) it
// yields block N+1; returns {nullptr, 0} when exhausted or nothing was
// serialized. The returned pointers reference collector-owned memory that is
// invalidated by the next serialize()/reset().
XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  // Lazily construct the file header once per process; aligned_storage +
  // placement-new avoids a dynamically-initialized static.
  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
      FileHeaderStorage;
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  // The caller handed back a serialized block; read its BlockNum to find the
  // next one.
  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray