//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};

struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files 'xrayprof' in hex.
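// ('x' = 0x78, 'r' = 0x72, 'a' = 0x61, 'y' = 0x79, 'p' = 0x70, 'r' = 0x72,
// 'o' = 0x6f, 'f' = 0x66.)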
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};

struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};
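// A minimal sanity check of the on-disk layout the code below assumes:
// serialize() hard-codes 16 bytes for the BlockHeader portion of BlockSize,
// which only matches sizeof(BlockHeader) when the struct carries no padding.
static_assert(sizeof(BlockHeader) == 16,
              "BlockHeader is expected to be 16 bytes");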

struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue for the backing store for the allocator used
// by the ThreadData array. This lets us host the buffers, allocators, and tries
// associated with a thread by moving the data into the array instead of
// attempting to copy the data to a separately backed set of tries.
static typename std::aligned_storage<
    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
static typename std::aligned_storage<sizeof(ThreadDataAllocator),
                                     alignof(ThreadDataAllocator)>::type
    ThreadDataAllocatorStorage;
static typename std::aligned_storage<sizeof(ThreadDataArray),
                                     alignof(ThreadDataArray)>::type
    ThreadDataArrayStorage;

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
    ProfileBuffersStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
    ProfileBufferArrayAllocatorStorage;

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;
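// All of the pointer/storage pairs above are initialized in reset() with the
// following pattern (a sketch; SomeType/SomeStorage/SomePtr are placeholders):
//
//   new (&SomeStorage) SomeType(...); // placement-new, no global constructor
//   SomePtr = reinterpret_cast<SomeType *>(&SomeStorage);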

// Use a global flag to determine whether the collector implementation has been
// initialized.
static atomic_uint8_t CollectorInitialized{0};

} // namespace

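// post() takes ownership of a thread's trie, allocators, and buffers. A sketch
// of the expected hand-off from the per-thread profiling runtime (the TLD
// names here are illustrative):
//
//   profileCollectorService::post(TLD.BQ, std::move(*TLD.FCT),
//                                 std::move(*TLD.Allocators),
//                                 std::move(*TLD.Buffers), GetTid());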
void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the objects
      // handed us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}

// A PathArray represents the function IDs forming a stack trace. In this
// context a path is almost always represented from the leaf function in a call
// stack to a root of the call trie.
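// For example, for a call chain main() -> f() -> g() with g() currently
// executing, the stored path is the function IDs of g, f, then main (leaf
// first).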
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the sequence of function IDs from the leaf to
  // the root of the function call stack, as represented in a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Walk a depth-first traversal of each root of the FunctionCallTrie to generate
// the path(s) and the data associated with the path.
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

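// Serialized layout of one block, as written below: the BlockHeader, followed
// by one entry per ProfileRecord consisting of the path's function IDs
// (int32_t each, leaf first), a zero int32_t sentinel, the node's CallCount
// (u64), and its CumulativeLocalTime (u64).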
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // The list of function IDs follows:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace

void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena,
                                   profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        PathArena, profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We'll
    // use a local allocator and an __xray::Array<...> to store the intermediary
    // data, then compute the size as we're going along. Then we'll allocate the
    // contiguous space to contain the thread buffer data.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record, to compute the sizes.
    //
    // header size = block size (4 bytes)
    //   + block number (4 bytes)
    //   + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids)
    //   + sentinel (4 bytes)
    //   + call count (8 bytes)
    //   + local time (8 bytes)
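    //
    // For example, a record whose path holds three function IDs contributes
    // 4 * 3 + 4 + 8 + 8 = 32 bytes.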
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because we've already
    // potentially freed the backing store for the array. Instead we're going
    // to reset the pointer to nullptr, and re-use the storage later
    // (placement-new'ing into the storage as-is).
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

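// A consumer drains the collector by seeding nextBuffer() with an empty
// XRayBuffer and feeding each returned buffer back in, e.g. (a sketch; the
// persist() callback is hypothetical):
//
//   auto B = profileCollectorService::nextBuffer({nullptr, 0}); // file header
//   while (B.Data != nullptr) {
//     persist(B.Data, B.Size);
//     B = profileCollectorService::nextBuffer(B);
//   }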
XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
      FileHeaderStorage;
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray