//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
15#include "xray_profile_collector.h"
16#include "sanitizer_common/sanitizer_common.h"
17#include "xray_allocator.h"
18#include "xray_defs.h"
19#include "xray_profiling_flags.h"
20#include "xray_segmented_array.h"
21#include <memory>
22#include <pthread.h>
23#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};

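// A contiguous block of memory holding the serialized profile data collected
// from a single thread.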
struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files: the string 'xrayprof' encoded as a
// 64-bit integer.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;
struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};

struct BlockHeader {
  u32 BlockSize; // Size of the block in bytes, including this header.
  u32 BlockNum;  // Index of this block in the serialized output.
  u64 ThreadId;  // Thread from which the block's data was collected.
};

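// Everything required to own a single thread's profiling data: the queue the
// buffers were obtained from, the buffers themselves, the allocators backed
// by those buffers, the function call trie, and the id of the originating
// thread.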
struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue as the backing store for the allocator used
// by the ThreadData array. This lets us take ownership of the buffers,
// allocators, and tries associated with a thread by moving the data into the
// array instead of copying it into a separately backed set of tries.
static typename std::aligned_storage<
    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
static typename std::aligned_storage<sizeof(ThreadDataAllocator),
                                     alignof(ThreadDataAllocator)>::type
    ThreadDataAllocatorStorage;
static typename std::aligned_storage<sizeof(ThreadDataArray),
                                     alignof(ThreadDataArray)>::type
    ThreadDataArrayStorage;

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need them to be aligned so that we can placement-new objects into the
// storage, and so that pointers to those objects are appropriately aligned.
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
    ProfileBuffersStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
    ProfileBufferArrayAllocatorStorage;

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;

// Use a global flag to determine whether the collector implementation has been
// initialized.
static atomic_uint8_t CollectorInitialized{0};

} // namespace

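// Takes ownership of a thread's FunctionCallTrie along with the allocators
// and buffers backing it, and appends everything to the global ThreadData
// array for later serialization. If the collector is not initialized, or if
// appending fails, we destroy the trie and the allocators and return the
// buffers to the queue instead.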
void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the
      // objects handed to us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}

// A PathArray represents the function IDs of a stack trace. In this context a
// path is almost always represented from the leaf function in a call stack to
// a root of the call trie.
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the list of function IDs from the leaf to the
  // root of the function call stack, as represented in a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Walk each root of the FunctionCallTrie depth-first to generate the path(s)
// and the data associated with each path.
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;

      // Traverse the Node's parents, collecting the FIds in the order they
      // appear (leaf to root).
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

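// Writes the block header followed by each profile record into Buffer. Each
// record is laid out as:
//
//   <path: 4 bytes per function id, leaf to root> <sentinel: 4 zero bytes>
//   <call count: 8 bytes> <cumulative local time: 8 bytes>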
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // The list of function IDs in the path comes first.
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace

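// Flattens the posted FunctionCallTries into the global ProfileBuffers array,
// one serialized block (header plus records) per thread. Any previously
// serialized buffers are released first.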
void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers if they are not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(MaxSize);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &TD : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena, MaxSize);
    ProfileRecord::PathAllocator PathAlloc(PathArena, MaxSize);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We'll
    // use a local allocator and an __xray::Array<...> to store the intermediary
    // data, then compute the size as we're going along. Then we'll allocate the
    // contiguous space to contain the thread buffer data.
    if (TD.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, TD.FCT);
    DCHECK(!TD.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record, to compute the sizes.
    //
    // header size = block size (4 bytes)
    //   + block number (4 bytes)
    //   + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
    //   + call count (8 bytes)
    //   + local time (8 bytes)
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    // The BlockSize accounts for the 16-byte header plus the records.
    BlockHeader Header{16 + CumulativeSizes, I++, TD.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

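// Tears down all collector state and then re-initializes it: the backing
// buffer queue, the ProfileBuffers array, and the ThreadData array are all
// placement-new'ed into their static storage. Marks the collector initialized
// only if every step succeeds.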
void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because the backing store may
    // already have been freed. Instead we reset the pointer to nullptr and
    // re-use the storage later (placement-new'ing into the storage as-is).
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr)
    BQ->releaseBuffer(Buffer);

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

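// Provides sequential access to the serialized profile: passing a null
// XRayBuffer yields the file header; passing the file header yields block 0;
// passing block N yields block N+1, until the blocks are exhausted, at which
// point a null buffer is returned.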
XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
      FileHeaderStorage;
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  // The buffer after the file header is the first profile block.
  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  // Otherwise, read the block header of the last-returned buffer and hand out
  // the next block, if there is one.
  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray