1//===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The data structures defined in this file are based on the reference
10// implementation which is available at
11// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16#include "llvm/DebugInfo/CodeView/RecordName.h"
17#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
18#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
19#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
20#include "llvm/DebugInfo/MSF/MSFBuilder.h"
21#include "llvm/DebugInfo/MSF/MSFCommon.h"
22#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
23#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
24#include "llvm/DebugInfo/PDB/Native/Hash.h"
25#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
26#include "llvm/Support/BinaryItemStream.h"
27#include "llvm/Support/BinaryStreamWriter.h"
28#include "llvm/Support/Parallel.h"
29#include "llvm/Support/xxhash.h"
30#include <algorithm>
31#include <vector>
32
33using namespace llvm;
34using namespace llvm::msf;
35using namespace llvm::pdb;
36using namespace llvm::codeview;
37
38// Helper class for building the public and global PDB hash table buckets.
39struct llvm::pdb::GSIHashStreamBuilder {
40  // Sum of the size of all public or global records.
41  uint32_t RecordByteSize = 0;
42
43  std::vector<PSHashRecord> HashRecords;
44
45  // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
46  // reference implementation builds a hash table with IPHR_HASH buckets in it.
47  // The last bucket is used to link together free hash table cells in a linked
48  // list, but it is always empty in the compressed, on-disk format. However,
49  // the bitmap must have a bit for it.
50  std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
51
52  std::vector<support::ulittle32_t> HashBuckets;
53
54  uint32_t calculateSerializedLength() const;
55  Error commit(BinaryStreamWriter &Writer);
56
57  void finalizePublicBuckets();
58  void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
59
60  // Assign public and global symbol records into hash table buckets.
61  // Modifies the list of records to store the bucket index, but does not
62  // change the order.
63  void finalizeBuckets(uint32_t RecordZeroOffset,
64                       MutableArrayRef<BulkPublic> Globals);
65};
66
67// DenseMapInfo implementation for deduplicating symbol records.
68struct llvm::pdb::SymbolDenseMapInfo {
69  static inline CVSymbol getEmptyKey() {
70    static CVSymbol Empty;
71    return Empty;
72  }
73  static inline CVSymbol getTombstoneKey() {
74    static CVSymbol Tombstone(
75        DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
76    return Tombstone;
77  }
78  static unsigned getHashValue(const CVSymbol &Val) {
79    return xxHash64(Val.RecordData);
80  }
81  static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
82    return LHS.RecordData == RHS.RecordData;
83  }
84};
85
86namespace {
87LLVM_PACKED_START
88struct PublicSym32Layout {
89  RecordPrefix Prefix;
90  PublicSym32Header Pub;
91  // char Name[];
92};
93LLVM_PACKED_END
94} // namespace
95
96// Calculate how much memory this public needs when serialized.
97static uint32_t sizeOfPublic(const BulkPublic &Pub) {
98  uint32_t NameLen = Pub.NameLen;
99  NameLen = std::min(NameLen,
100                     uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
101  return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
102}
103
104static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
105  // Assume the caller has allocated sizeOfPublic bytes.
106  uint32_t NameLen = std::min(
107      Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
108  size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
109  assert(Size == sizeOfPublic(Pub));
110  auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
111  FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
112  FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
113  FixedMem->Pub.Flags = Pub.Flags;
114  FixedMem->Pub.Offset = Pub.Offset;
115  FixedMem->Pub.Segment = Pub.Segment;
116  char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
117  memcpy(NameMem, Pub.Name, NameLen);
118  // Zero the null terminator and remaining bytes.
119  memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
120  return CVSymbol(ArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
121}
122
123uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
124  uint32_t Size = sizeof(GSIHashHeader);
125  Size += HashRecords.size() * sizeof(PSHashRecord);
126  Size += HashBitmap.size() * sizeof(uint32_t);
127  Size += HashBuckets.size() * sizeof(uint32_t);
128  return Size;
129}
130
131Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
132  GSIHashHeader Header;
133  Header.VerSignature = GSIHashHeader::HdrSignature;
134  Header.VerHdr = GSIHashHeader::HdrVersion;
135  Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
136  Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
137
138  if (auto EC = Writer.writeObject(Header))
139    return EC;
140
141  if (auto EC = Writer.writeArray(ArrayRef(HashRecords)))
142    return EC;
143  if (auto EC = Writer.writeArray(ArrayRef(HashBitmap)))
144    return EC;
145  if (auto EC = Writer.writeArray(ArrayRef(HashBuckets)))
146    return EC;
147  return Error::success();
148}
149
150static bool isAsciiString(StringRef S) {
151  return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
152}
153
154// See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
155static int gsiRecordCmp(StringRef S1, StringRef S2) {
156  size_t LS = S1.size();
157  size_t RS = S2.size();
158  // Shorter strings always compare less than longer strings.
159  if (LS != RS)
160    return (LS > RS) - (LS < RS);
161
162  // If either string contains non ascii characters, memcmp them.
163  if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
164    return memcmp(S1.data(), S2.data(), LS);
165
166  // Both strings are ascii, perform a case-insensitive comparison.
167  return S1.compare_insensitive(S2.data());
168}
169
170void GSIStreamBuilder::finalizePublicBuckets() {
171  PSH->finalizeBuckets(0, Publics);
172}
173
174void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
175  // Build up a list of globals to be bucketed. Use the BulkPublic data
176  // structure for this purpose, even though these are global records, not
177  // public records. Most of the same fields are required:
178  // - Name
179  // - NameLen
180  // - SymOffset
181  // - BucketIdx
182  // The dead fields are Offset, Segment, and Flags.
183  std::vector<BulkPublic> Records;
184  Records.resize(Globals.size());
185  uint32_t SymOffset = RecordZeroOffset;
186  for (size_t I = 0, E = Globals.size(); I < E; ++I) {
187    StringRef Name = getSymbolName(Globals[I]);
188    Records[I].Name = Name.data();
189    Records[I].NameLen = Name.size();
190    Records[I].SymOffset = SymOffset;
191    SymOffset += Globals[I].length();
192  }
193
194  GSH->finalizeBuckets(RecordZeroOffset, Records);
195}
196
197void GSIHashStreamBuilder::finalizeBuckets(
198    uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
199  // Hash every name in parallel.
200  parallelFor(0, Records.size(), [&](size_t I) {
201    Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH);
202  });
203
204  // Count up the size of each bucket. Then, use an exclusive prefix sum to
205  // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
206  // we can't use it yet.
207  uint32_t BucketStarts[IPHR_HASH] = {0};
208  for (const BulkPublic &P : Records)
209    ++BucketStarts[P.BucketIdx];
210  uint32_t Sum = 0;
211  for (uint32_t &B : BucketStarts) {
212    uint32_t Size = B;
213    B = Sum;
214    Sum += Size;
215  }
216
217  // Place globals into the hash table in bucket order. When placing a global,
218  // update the bucket start. Every hash table slot should be filled. Always use
219  // a refcount of one for now.
220  HashRecords.resize(Records.size());
221  uint32_t BucketCursors[IPHR_HASH];
222  memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors));
223  for (int I = 0, E = Records.size(); I < E; ++I) {
224    uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;
225    HashRecords[HashIdx].Off = I;
226    HashRecords[HashIdx].CRef = 1;
227  }
228
229  // Within the buckets, sort each bucket by memcmp of the symbol's name.  It's
230  // important that we use the same sorting algorithm as is used by the
231  // reference implementation to ensure that the search for a record within a
232  // bucket can properly early-out when it detects the record won't be found.
233  // The algorithm used here corresponds to the function
234  // caseInsensitiveComparePchPchCchCch in the reference implementation.
235  parallelFor(0, IPHR_HASH, [&](size_t I) {
236    auto B = HashRecords.begin() + BucketStarts[I];
237    auto E = HashRecords.begin() + BucketCursors[I];
238    if (B == E)
239      return;
240    auto BucketCmp = [Records](const PSHashRecord &LHash,
241                               const PSHashRecord &RHash) {
242      const BulkPublic &L = Records[uint32_t(LHash.Off)];
243      const BulkPublic &R = Records[uint32_t(RHash.Off)];
244      assert(L.BucketIdx == R.BucketIdx);
245      int Cmp = gsiRecordCmp(L.getName(), R.getName());
246      if (Cmp != 0)
247        return Cmp < 0;
248      // This comparison is necessary to make the sorting stable in the presence
249      // of two static globals with the same name. The easiest way to observe
250      // this is with S_LDATA32 records.
251      return L.SymOffset < R.SymOffset;
252    };
253    llvm::sort(B, E, BucketCmp);
254
255    // After we are done sorting, replace the global indices with the stream
256    // offsets of each global. Add one when writing symbol offsets to disk.
257    // See GSI1::fixSymRecs.
258    for (PSHashRecord &HRec : make_range(B, E))
259      HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;
260  });
261
262  // For each non-empty bucket, push the bucket start offset into HashBuckets
263  // and set a bit in the hash bitmap.
264  for (uint32_t I = 0; I < HashBitmap.size(); ++I) {
265    uint32_t Word = 0;
266    for (uint32_t J = 0; J < 32; ++J) {
267      // Skip empty buckets.
268      uint32_t BucketIdx = I * 32 + J;
269      if (BucketIdx >= IPHR_HASH ||
270          BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
271        continue;
272      Word |= (1U << J);
273
274      // Calculate what the offset of the first hash record in the chain would
275      // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
276      // each record would be 12 bytes. See HROffsetCalc in gsi.h.
277      const int SizeOfHROffsetCalc = 12;
278      ulittle32_t ChainStartOff =
279          ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
280      HashBuckets.push_back(ChainStartOff);
281    }
282    HashBitmap[I] = Word;
283  }
284}
285
286GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
287    : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
288      GSH(std::make_unique<GSIHashStreamBuilder>()) {}
289
290GSIStreamBuilder::~GSIStreamBuilder() = default;
291
292uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
293  uint32_t Size = 0;
294  Size += sizeof(PublicsStreamHeader);
295  Size += PSH->calculateSerializedLength();
296  Size += Publics.size() * sizeof(uint32_t); // AddrMap
297  // FIXME: Add thunk map and section offsets for incremental linking.
298
299  return Size;
300}
301
302uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
303  return GSH->calculateSerializedLength();
304}
305
306Error GSIStreamBuilder::finalizeMsfLayout() {
307  // First we write public symbol records, then we write global symbol records.
308  finalizePublicBuckets();
309  finalizeGlobalBuckets(PSH->RecordByteSize);
310
311  Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
312  if (!Idx)
313    return Idx.takeError();
314  GlobalsStreamIndex = *Idx;
315
316  Idx = Msf.addStream(calculatePublicsHashStreamSize());
317  if (!Idx)
318    return Idx.takeError();
319  PublicsStreamIndex = *Idx;
320
321  uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;
322
323  Idx = Msf.addStream(RecordBytes);
324  if (!Idx)
325    return Idx.takeError();
326  RecordStreamIndex = *Idx;
327  return Error::success();
328}
329
330void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
331  assert(Publics.empty() && PSH->RecordByteSize == 0 &&
332         "publics can only be added once");
333  Publics = std::move(PublicsIn);
334
335  // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
336  parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
337    return L.getName() < R.getName();
338  });
339
340  // Assign offsets and calculate the length of the public symbol records.
341  uint32_t SymOffset = 0;
342  for (BulkPublic &Pub : Publics) {
343    Pub.SymOffset = SymOffset;
344    SymOffset += sizeOfPublic(Pub);
345  }
346
347  // Remember the length of the public stream records.
348  PSH->RecordByteSize = SymOffset;
349}
350
351void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
352  serializeAndAddGlobal(Sym);
353}
354
355void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
356  serializeAndAddGlobal(Sym);
357}
358
359void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
360  serializeAndAddGlobal(Sym);
361}
362
363template <typename T>
364void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
365  T Copy(Symbol);
366  addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
367                                                   CodeViewContainer::Pdb));
368}
369
370void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
371  // Ignore duplicate typedefs and constants.
372  if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
373    auto Iter = GlobalsSeen.insert(Symbol);
374    if (!Iter.second)
375      return;
376  }
377  GSH->RecordByteSize += Symbol.length();
378  Globals.push_back(Symbol);
379}
380
381// Serialize each public and write it.
382static Error writePublics(BinaryStreamWriter &Writer,
383                          ArrayRef<BulkPublic> Publics) {
384  std::vector<uint8_t> Storage;
385  for (const BulkPublic &Pub : Publics) {
386    Storage.resize(sizeOfPublic(Pub));
387    serializePublic(Storage.data(), Pub);
388    if (Error E = Writer.writeBytes(Storage))
389      return E;
390  }
391  return Error::success();
392}
393
394static Error writeRecords(BinaryStreamWriter &Writer,
395                          ArrayRef<CVSymbol> Records) {
396  BinaryItemStream<CVSymbol> ItemStream(support::endianness::little);
397  ItemStream.setItems(Records);
398  BinaryStreamRef RecordsRef(ItemStream);
399  return Writer.writeStreamRef(RecordsRef);
400}
401
402Error GSIStreamBuilder::commitSymbolRecordStream(
403    WritableBinaryStreamRef Stream) {
404  BinaryStreamWriter Writer(Stream);
405
406  // Write public symbol records first, followed by global symbol records.  This
407  // must match the order that we assume in finalizeMsfLayout when computing
408  // PSHZero and GSHZero.
409  if (auto EC = writePublics(Writer, Publics))
410    return EC;
411  if (auto EC = writeRecords(Writer, Globals))
412    return EC;
413
414  return Error::success();
415}
416
417static std::vector<support::ulittle32_t>
418computeAddrMap(ArrayRef<BulkPublic> Publics) {
419  // Build a parallel vector of indices into the Publics vector, and sort it by
420  // address.
421  std::vector<ulittle32_t> PubAddrMap;
422  PubAddrMap.reserve(Publics.size());
423  for (int I = 0, E = Publics.size(); I < E; ++I)
424    PubAddrMap.push_back(ulittle32_t(I));
425
426  auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
427    const BulkPublic &L = Publics[LIdx];
428    const BulkPublic &R = Publics[RIdx];
429    if (L.Segment != R.Segment)
430      return L.Segment < R.Segment;
431    if (L.Offset != R.Offset)
432      return L.Offset < R.Offset;
433    // parallelSort is unstable, so we have to do name comparison to ensure
434    // that two names for the same location come out in a deterministic order.
435    return L.getName() < R.getName();
436  };
437  parallelSort(PubAddrMap, AddrCmp);
438
439  // Rewrite the public symbol indices into symbol offsets.
440  for (ulittle32_t &Entry : PubAddrMap)
441    Entry = Publics[Entry].SymOffset;
442  return PubAddrMap;
443}
444
445Error GSIStreamBuilder::commitPublicsHashStream(
446    WritableBinaryStreamRef Stream) {
447  BinaryStreamWriter Writer(Stream);
448  PublicsStreamHeader Header;
449
450  // FIXME: Fill these in. They are for incremental linking.
451  Header.SymHash = PSH->calculateSerializedLength();
452  Header.AddrMap = Publics.size() * 4;
453  Header.NumThunks = 0;
454  Header.SizeOfThunk = 0;
455  Header.ISectThunkTable = 0;
456  memset(Header.Padding, 0, sizeof(Header.Padding));
457  Header.OffThunkTable = 0;
458  Header.NumSections = 0;
459  if (auto EC = Writer.writeObject(Header))
460    return EC;
461
462  if (auto EC = PSH->commit(Writer))
463    return EC;
464
465  std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
466  assert(PubAddrMap.size() == Publics.size());
467  if (auto EC = Writer.writeArray(ArrayRef(PubAddrMap)))
468    return EC;
469
470  return Error::success();
471}
472
473Error GSIStreamBuilder::commitGlobalsHashStream(
474    WritableBinaryStreamRef Stream) {
475  BinaryStreamWriter Writer(Stream);
476  return GSH->commit(Writer);
477}
478
479Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
480                               WritableBinaryStreamRef Buffer) {
481  auto GS = WritableMappedBlockStream::createIndexedStream(
482      Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
483  auto PS = WritableMappedBlockStream::createIndexedStream(
484      Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
485  auto PRS = WritableMappedBlockStream::createIndexedStream(
486      Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());
487
488  if (auto EC = commitSymbolRecordStream(*PRS))
489    return EC;
490  if (auto EC = commitGlobalsHashStream(*GS))
491    return EC;
492  if (auto EC = commitPublicsHashStream(*PS))
493    return EC;
494  return Error::success();
495}
496