1//===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The data structures defined in this file are based on the reference
10// implementation which is available at
11// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16#include "llvm/DebugInfo/CodeView/RecordName.h"
17#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
18#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
19#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
20#include "llvm/DebugInfo/MSF/MSFBuilder.h"
21#include "llvm/DebugInfo/MSF/MSFCommon.h"
22#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
23#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
24#include "llvm/DebugInfo/PDB/Native/Hash.h"
25#include "llvm/Support/BinaryItemStream.h"
26#include "llvm/Support/BinaryStreamWriter.h"
27#include "llvm/Support/Parallel.h"
28#include "llvm/Support/xxhash.h"
29#include <algorithm>
30#include <vector>
31
32using namespace llvm;
33using namespace llvm::msf;
34using namespace llvm::pdb;
35using namespace llvm::codeview;
36
37// Helper class for building the public and global PDB hash table buckets.
38struct llvm::pdb::GSIHashStreamBuilder {
39  // Sum of the size of all public or global records.
40  uint32_t RecordByteSize = 0;
41
42  std::vector<PSHashRecord> HashRecords;
43
44  // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
45  // reference implementation builds a hash table with IPHR_HASH buckets in it.
46  // The last bucket is used to link together free hash table cells in a linked
47  // list, but it is always empty in the compressed, on-disk format. However,
48  // the bitmap must have a bit for it.
49  std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
50
51  std::vector<support::ulittle32_t> HashBuckets;
52
53  uint32_t calculateSerializedLength() const;
54  Error commit(BinaryStreamWriter &Writer);
55
56  void finalizePublicBuckets();
57  void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
58
59  // Assign public and global symbol records into hash table buckets.
60  // Modifies the list of records to store the bucket index, but does not
61  // change the order.
62  void finalizeBuckets(uint32_t RecordZeroOffset,
63                       MutableArrayRef<BulkPublic> Globals);
64};
65
66// DenseMapInfo implementation for deduplicating symbol records.
67struct llvm::pdb::SymbolDenseMapInfo {
68  static inline CVSymbol getEmptyKey() {
69    static CVSymbol Empty;
70    return Empty;
71  }
72  static inline CVSymbol getTombstoneKey() {
73    static CVSymbol Tombstone(
74        DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
75    return Tombstone;
76  }
77  static unsigned getHashValue(const CVSymbol &Val) {
78    return xxHash64(Val.RecordData);
79  }
80  static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
81    return LHS.RecordData == RHS.RecordData;
82  }
83};
84
85namespace {
86LLVM_PACKED_START
87struct PublicSym32Layout {
88  RecordPrefix Prefix;
89  PublicSym32Header Pub;
90  // char Name[];
91};
92LLVM_PACKED_END
93} // namespace
94
95// Calculate how much memory this public needs when serialized.
96static uint32_t sizeOfPublic(const BulkPublic &Pub) {
97  uint32_t NameLen = Pub.NameLen;
98  NameLen = std::min(NameLen,
99                     uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
100  return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
101}
102
103static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
104  // Assume the caller has allocated sizeOfPublic bytes.
105  uint32_t NameLen = std::min(
106      Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
107  size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
108  assert(Size == sizeOfPublic(Pub));
109  auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
110  FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
111  FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
112  FixedMem->Pub.Flags = Pub.Flags;
113  FixedMem->Pub.Offset = Pub.Offset;
114  FixedMem->Pub.Segment = Pub.Segment;
115  char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
116  memcpy(NameMem, Pub.Name, NameLen);
117  // Zero the null terminator and remaining bytes.
118  memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
119  return CVSymbol(makeArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
120}
121
122uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
123  uint32_t Size = sizeof(GSIHashHeader);
124  Size += HashRecords.size() * sizeof(PSHashRecord);
125  Size += HashBitmap.size() * sizeof(uint32_t);
126  Size += HashBuckets.size() * sizeof(uint32_t);
127  return Size;
128}
129
130Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
131  GSIHashHeader Header;
132  Header.VerSignature = GSIHashHeader::HdrSignature;
133  Header.VerHdr = GSIHashHeader::HdrVersion;
134  Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
135  Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
136
137  if (auto EC = Writer.writeObject(Header))
138    return EC;
139
140  if (auto EC = Writer.writeArray(makeArrayRef(HashRecords)))
141    return EC;
142  if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap)))
143    return EC;
144  if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets)))
145    return EC;
146  return Error::success();
147}
148
149static bool isAsciiString(StringRef S) {
150  return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
151}
152
153// See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
154static int gsiRecordCmp(StringRef S1, StringRef S2) {
155  size_t LS = S1.size();
156  size_t RS = S2.size();
157  // Shorter strings always compare less than longer strings.
158  if (LS != RS)
159    return LS - RS;
160
161  // If either string contains non ascii characters, memcmp them.
162  if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
163    return memcmp(S1.data(), S2.data(), LS);
164
165  // Both strings are ascii, perform a case-insenstive comparison.
166  return S1.compare_lower(S2.data());
167}
168
169void GSIStreamBuilder::finalizePublicBuckets() {
170  PSH->finalizeBuckets(0, Publics);
171}
172
173void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
174  // Build up a list of globals to be bucketed. Use the BulkPublic data
175  // structure for this purpose, even though these are global records, not
176  // public records. Most of the same fields are required:
177  // - Name
178  // - NameLen
179  // - SymOffset
180  // - BucketIdx
181  // The dead fields are Offset, Segment, and Flags.
182  std::vector<BulkPublic> Records;
183  Records.resize(Globals.size());
184  uint32_t SymOffset = RecordZeroOffset;
185  for (size_t I = 0, E = Globals.size(); I < E; ++I) {
186    StringRef Name = getSymbolName(Globals[I]);
187    Records[I].Name = Name.data();
188    Records[I].NameLen = Name.size();
189    Records[I].SymOffset = SymOffset;
190    SymOffset += Globals[I].length();
191  }
192
193  GSH->finalizeBuckets(RecordZeroOffset, Records);
194}
195
196void GSIHashStreamBuilder::finalizeBuckets(
197    uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
198  // Hash every name in parallel.
199  parallelForEachN(0, Records.size(), [&](size_t I) {
200    Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH);
201  });
202
203  // Count up the size of each bucket. Then, use an exclusive prefix sum to
204  // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
205  // we can't use it yet.
206  uint32_t BucketStarts[IPHR_HASH] = {0};
207  for (const BulkPublic &P : Records)
208    ++BucketStarts[P.BucketIdx];
209  uint32_t Sum = 0;
210  for (uint32_t &B : BucketStarts) {
211    uint32_t Size = B;
212    B = Sum;
213    Sum += Size;
214  }
215
216  // Place globals into the hash table in bucket order. When placing a global,
217  // update the bucket start. Every hash table slot should be filled. Always use
218  // a refcount of one for now.
219  HashRecords.resize(Records.size());
220  uint32_t BucketCursors[IPHR_HASH];
221  memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors));
222  for (int I = 0, E = Records.size(); I < E; ++I) {
223    uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;
224    HashRecords[HashIdx].Off = I;
225    HashRecords[HashIdx].CRef = 1;
226  }
227
228  // Within the buckets, sort each bucket by memcmp of the symbol's name.  It's
229  // important that we use the same sorting algorithm as is used by the
230  // reference implementation to ensure that the search for a record within a
231  // bucket can properly early-out when it detects the record won't be found.
232  // The algorithm used here corresponds to the function
233  // caseInsensitiveComparePchPchCchCch in the reference implementation.
234  parallelForEachN(0, IPHR_HASH, [&](size_t I) {
235    auto B = HashRecords.begin() + BucketStarts[I];
236    auto E = HashRecords.begin() + BucketCursors[I];
237    if (B == E)
238      return;
239    auto BucketCmp = [Records](const PSHashRecord &LHash,
240                               const PSHashRecord &RHash) {
241      const BulkPublic &L = Records[uint32_t(LHash.Off)];
242      const BulkPublic &R = Records[uint32_t(RHash.Off)];
243      assert(L.BucketIdx == R.BucketIdx);
244      int Cmp = gsiRecordCmp(L.getName(), R.getName());
245      if (Cmp != 0)
246        return Cmp < 0;
247      // This comparison is necessary to make the sorting stable in the presence
248      // of two static globals with the same name. The easiest way to observe
249      // this is with S_LDATA32 records.
250      return L.SymOffset < R.SymOffset;
251    };
252    llvm::sort(B, E, BucketCmp);
253
254    // After we are done sorting, replace the global indices with the stream
255    // offsets of each global. Add one when writing symbol offsets to disk.
256    // See GSI1::fixSymRecs.
257    for (PSHashRecord &HRec : make_range(B, E))
258      HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;
259  });
260
261  // For each non-empty bucket, push the bucket start offset into HashBuckets
262  // and set a bit in the hash bitmap.
263  for (uint32_t I = 0; I < HashBitmap.size(); ++I) {
264    uint32_t Word = 0;
265    for (uint32_t J = 0; J < 32; ++J) {
266      // Skip empty buckets.
267      uint32_t BucketIdx = I * 32 + J;
268      if (BucketIdx >= IPHR_HASH ||
269          BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
270        continue;
271      Word |= (1U << J);
272
273      // Calculate what the offset of the first hash record in the chain would
274      // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
275      // each record would be 12 bytes. See HROffsetCalc in gsi.h.
276      const int SizeOfHROffsetCalc = 12;
277      ulittle32_t ChainStartOff =
278          ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
279      HashBuckets.push_back(ChainStartOff);
280    }
281    HashBitmap[I] = Word;
282  }
283}
284
285GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
286    : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
287      GSH(std::make_unique<GSIHashStreamBuilder>()) {}
288
289GSIStreamBuilder::~GSIStreamBuilder() {}
290
291uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
292  uint32_t Size = 0;
293  Size += sizeof(PublicsStreamHeader);
294  Size += PSH->calculateSerializedLength();
295  Size += Publics.size() * sizeof(uint32_t); // AddrMap
296  // FIXME: Add thunk map and section offsets for incremental linking.
297
298  return Size;
299}
300
301uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
302  return GSH->calculateSerializedLength();
303}
304
305Error GSIStreamBuilder::finalizeMsfLayout() {
306  // First we write public symbol records, then we write global symbol records.
307  finalizePublicBuckets();
308  finalizeGlobalBuckets(PSH->RecordByteSize);
309
310  Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
311  if (!Idx)
312    return Idx.takeError();
313  GlobalsStreamIndex = *Idx;
314
315  Idx = Msf.addStream(calculatePublicsHashStreamSize());
316  if (!Idx)
317    return Idx.takeError();
318  PublicsStreamIndex = *Idx;
319
320  uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;
321
322  Idx = Msf.addStream(RecordBytes);
323  if (!Idx)
324    return Idx.takeError();
325  RecordStreamIndex = *Idx;
326  return Error::success();
327}
328
329void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
330  assert(Publics.empty() && PSH->RecordByteSize == 0 &&
331         "publics can only be added once");
332  Publics = std::move(PublicsIn);
333
334  // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
335  parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
336    return L.getName() < R.getName();
337  });
338
339  // Assign offsets and calculate the length of the public symbol records.
340  uint32_t SymOffset = 0;
341  for (BulkPublic &Pub : Publics) {
342    Pub.SymOffset = SymOffset;
343    SymOffset += sizeOfPublic(Pub);
344  }
345
346  // Remember the length of the public stream records.
347  PSH->RecordByteSize = SymOffset;
348}
349
350void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
351  serializeAndAddGlobal(Sym);
352}
353
354void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
355  serializeAndAddGlobal(Sym);
356}
357
358void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
359  serializeAndAddGlobal(Sym);
360}
361
362template <typename T>
363void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
364  T Copy(Symbol);
365  addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
366                                                   CodeViewContainer::Pdb));
367}
368
369void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
370  // Ignore duplicate typedefs and constants.
371  if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
372    auto Iter = GlobalsSeen.insert(Symbol);
373    if (!Iter.second)
374      return;
375  }
376  GSH->RecordByteSize += Symbol.length();
377  Globals.push_back(Symbol);
378}
379
380// Serialize each public and write it.
381static Error writePublics(BinaryStreamWriter &Writer,
382                          ArrayRef<BulkPublic> Publics) {
383  std::vector<uint8_t> Storage;
384  for (const BulkPublic &Pub : Publics) {
385    Storage.resize(sizeOfPublic(Pub));
386    serializePublic(Storage.data(), Pub);
387    if (Error E = Writer.writeBytes(Storage))
388      return E;
389  }
390  return Error::success();
391}
392
393static Error writeRecords(BinaryStreamWriter &Writer,
394                          ArrayRef<CVSymbol> Records) {
395  BinaryItemStream<CVSymbol> ItemStream(support::endianness::little);
396  ItemStream.setItems(Records);
397  BinaryStreamRef RecordsRef(ItemStream);
398  return Writer.writeStreamRef(RecordsRef);
399}
400
401Error GSIStreamBuilder::commitSymbolRecordStream(
402    WritableBinaryStreamRef Stream) {
403  BinaryStreamWriter Writer(Stream);
404
405  // Write public symbol records first, followed by global symbol records.  This
406  // must match the order that we assume in finalizeMsfLayout when computing
407  // PSHZero and GSHZero.
408  if (auto EC = writePublics(Writer, Publics))
409    return EC;
410  if (auto EC = writeRecords(Writer, Globals))
411    return EC;
412
413  return Error::success();
414}
415
416static std::vector<support::ulittle32_t>
417computeAddrMap(ArrayRef<BulkPublic> Publics) {
418  // Build a parallel vector of indices into the Publics vector, and sort it by
419  // address.
420  std::vector<ulittle32_t> PubAddrMap;
421  PubAddrMap.reserve(Publics.size());
422  for (int I = 0, E = Publics.size(); I < E; ++I)
423    PubAddrMap.push_back(ulittle32_t(I));
424
425  auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
426    const BulkPublic &L = Publics[LIdx];
427    const BulkPublic &R = Publics[RIdx];
428    if (L.Segment != R.Segment)
429      return L.Segment < R.Segment;
430    if (L.Offset != R.Offset)
431      return L.Offset < R.Offset;
432    // parallelSort is unstable, so we have to do name comparison to ensure
433    // that two names for the same location come out in a deterministic order.
434    return L.getName() < R.getName();
435  };
436  parallelSort(PubAddrMap, AddrCmp);
437
438  // Rewrite the public symbol indices into symbol offsets.
439  for (ulittle32_t &Entry : PubAddrMap)
440    Entry = Publics[Entry].SymOffset;
441  return PubAddrMap;
442}
443
444Error GSIStreamBuilder::commitPublicsHashStream(
445    WritableBinaryStreamRef Stream) {
446  BinaryStreamWriter Writer(Stream);
447  PublicsStreamHeader Header;
448
449  // FIXME: Fill these in. They are for incremental linking.
450  Header.SymHash = PSH->calculateSerializedLength();
451  Header.AddrMap = Publics.size() * 4;
452  Header.NumThunks = 0;
453  Header.SizeOfThunk = 0;
454  Header.ISectThunkTable = 0;
455  memset(Header.Padding, 0, sizeof(Header.Padding));
456  Header.OffThunkTable = 0;
457  Header.NumSections = 0;
458  if (auto EC = Writer.writeObject(Header))
459    return EC;
460
461  if (auto EC = PSH->commit(Writer))
462    return EC;
463
464  std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
465  assert(PubAddrMap.size() == Publics.size());
466  if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap)))
467    return EC;
468
469  return Error::success();
470}
471
472Error GSIStreamBuilder::commitGlobalsHashStream(
473    WritableBinaryStreamRef Stream) {
474  BinaryStreamWriter Writer(Stream);
475  return GSH->commit(Writer);
476}
477
478Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
479                               WritableBinaryStreamRef Buffer) {
480  auto GS = WritableMappedBlockStream::createIndexedStream(
481      Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
482  auto PS = WritableMappedBlockStream::createIndexedStream(
483      Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
484  auto PRS = WritableMappedBlockStream::createIndexedStream(
485      Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());
486
487  if (auto EC = commitSymbolRecordStream(*PRS))
488    return EC;
489  if (auto EC = commitGlobalsHashStream(*GS))
490    return EC;
491  if (auto EC = commitPublicsHashStream(*PS))
492    return EC;
493  return Error::success();
494}
495