1//===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9//  This file provides a hash table data structure suitable for incremental and
10//  distributed storage across a set of files.
11//
12//  Multiple hash tables from different files are implicitly merged to improve
13//  performance, and on reload the merged table will override those from other
14//  files.
15//
16//===----------------------------------------------------------------------===//
17
18#ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
19#define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
20
21#include "llvm/ADT/DenseMap.h"
22#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/PointerUnion.h"
24#include "llvm/ADT/STLExtras.h"
25#include "llvm/ADT/SmallVector.h"
26#include "llvm/ADT/TinyPtrVector.h"
27#include "llvm/ADT/iterator_range.h"
28#include "llvm/Support/Endian.h"
29#include "llvm/Support/EndianStream.h"
30#include "llvm/Support/OnDiskHashTable.h"
31#include "llvm/Support/raw_ostream.h"
32#include <algorithm>
33#include <cstdint>
34#include <vector>
35
36namespace clang {
37namespace serialization {
38
39/// A collection of on-disk hash tables, merged when relevant for performance.
40template<typename Info> class MultiOnDiskHashTable {
41public:
42  /// A handle to a file, used when overriding tables.
43  using file_type = typename Info::file_type;
44
45  /// A pointer to an on-disk representation of the hash table.
46  using storage_type = const unsigned char *;
47
48  using external_key_type = typename Info::external_key_type;
49  using internal_key_type = typename Info::internal_key_type;
50  using data_type = typename Info::data_type;
51  using data_type_builder = typename Info::data_type_builder;
52  using hash_value_type = unsigned;
53
54private:
55  /// The generator is permitted to read our merged table.
56  template<typename ReaderInfo, typename WriterInfo>
57  friend class MultiOnDiskHashTableGenerator;
58
59  /// A hash table stored on disk.
60  struct OnDiskTable {
61    using HashTable = llvm::OnDiskIterableChainedHashTable<Info>;
62
63    file_type File;
64    HashTable Table;
65
66    OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries,
67                storage_type Buckets, storage_type Payload, storage_type Base,
68                const Info &InfoObj)
69        : File(File),
70          Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {}
71  };
72
73  struct MergedTable {
74    std::vector<file_type> Files;
75    llvm::DenseMap<internal_key_type, data_type> Data;
76  };
77
78  using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>;
79  using TableVector = llvm::TinyPtrVector<void *>;
80
81  /// The current set of on-disk and merged tables.
82  /// We manually store the opaque value of the Table because TinyPtrVector
83  /// can't cope with holding a PointerUnion directly.
84  /// There can be at most one MergedTable in this vector, and if present,
85  /// it is the first table.
86  TableVector Tables;
87
88  /// Files corresponding to overridden tables that we've not yet
89  /// discarded.
90  llvm::TinyPtrVector<file_type> PendingOverrides;
91
92  struct AsOnDiskTable {
93    using result_type = OnDiskTable *;
94
95    result_type operator()(void *P) const {
96      return Table::getFromOpaqueValue(P).template get<OnDiskTable *>();
97    }
98  };
99
100  using table_iterator =
101      llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>;
102  using table_range = llvm::iterator_range<table_iterator>;
103
104  /// The current set of on-disk tables.
105  table_range tables() {
106    auto Begin = Tables.begin(), End = Tables.end();
107    if (getMergedTable())
108      ++Begin;
109    return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()),
110                            llvm::map_iterator(End, AsOnDiskTable()));
111  }
112
113  MergedTable *getMergedTable() const {
114    // If we already have a merged table, it's the first one.
115    return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin())
116                                          .template dyn_cast<MergedTable*>();
117  }
118
119  /// Delete all our current on-disk tables.
120  void clear() {
121    for (auto *T : tables())
122      delete T;
123    if (auto *M = getMergedTable())
124      delete M;
125    Tables.clear();
126  }
127
128  void removeOverriddenTables() {
129    llvm::DenseSet<file_type> Files;
130    Files.insert(PendingOverrides.begin(), PendingOverrides.end());
131    // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug.
132    auto ShouldRemove = [&Files](void *T) -> bool {
133      auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>();
134      bool Remove = Files.count(ODT->File);
135      if (Remove)
136        delete ODT;
137      return Remove;
138    };
139    Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(),
140                                ShouldRemove),
141                 Tables.end());
142    PendingOverrides.clear();
143  }
144
145  void condense() {
146    MergedTable *Merged = getMergedTable();
147    if (!Merged)
148      Merged = new MergedTable;
149
150    // Read in all the tables and merge them together.
151    // FIXME: Be smarter about which tables we merge.
152    for (auto *ODT : tables()) {
153      auto &HT = ODT->Table;
154      Info &InfoObj = HT.getInfoObj();
155
156      for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
157        auto *LocalPtr = I.getItem();
158
159        // FIXME: Don't rely on the OnDiskHashTable format here.
160        auto L = InfoObj.ReadKeyDataLength(LocalPtr);
161        const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
162        data_type_builder ValueBuilder(Merged->Data[Key]);
163        InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
164                             ValueBuilder);
165      }
166
167      Merged->Files.push_back(ODT->File);
168      delete ODT;
169    }
170
171    Tables.clear();
172    Tables.push_back(Table(Merged).getOpaqueValue());
173  }
174
175public:
176  MultiOnDiskHashTable() = default;
177
178  MultiOnDiskHashTable(MultiOnDiskHashTable &&O)
179      : Tables(std::move(O.Tables)),
180        PendingOverrides(std::move(O.PendingOverrides)) {
181    O.Tables.clear();
182  }
183
184  MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) {
185    if (&O == this)
186      return *this;
187    clear();
188    Tables = std::move(O.Tables);
189    O.Tables.clear();
190    PendingOverrides = std::move(O.PendingOverrides);
191    return *this;
192  }
193
194  ~MultiOnDiskHashTable() { clear(); }
195
196  /// Add the table \p Data loaded from file \p File.
197  void add(file_type File, storage_type Data, Info InfoObj = Info()) {
198    using namespace llvm::support;
199
200    storage_type Ptr = Data;
201
202    uint32_t BucketOffset = endian::readNext<uint32_t, little, unaligned>(Ptr);
203
204    // Read the list of overridden files.
205    uint32_t NumFiles = endian::readNext<uint32_t, little, unaligned>(Ptr);
206    // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make
207    // an additional copy.
208    llvm::SmallVector<file_type, 16> OverriddenFiles;
209    OverriddenFiles.reserve(NumFiles);
210    for (/**/; NumFiles != 0; --NumFiles)
211      OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr));
212    PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(),
213                            OverriddenFiles.end());
214
215    // Read the OnDiskChainedHashTable header.
216    storage_type Buckets = Data + BucketOffset;
217    auto NumBucketsAndEntries =
218        OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets);
219
220    // Register the table.
221    Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first,
222                                     NumBucketsAndEntries.second,
223                                     Buckets, Ptr, Data, std::move(InfoObj));
224    Tables.push_back(NewTable.getOpaqueValue());
225  }
226
227  /// Find and read the lookup results for \p EKey.
228  data_type find(const external_key_type &EKey) {
229    data_type Result;
230
231    if (!PendingOverrides.empty())
232      removeOverriddenTables();
233
234    if (Tables.size() > static_cast<unsigned>(Info::MaxTables))
235      condense();
236
237    internal_key_type Key = Info::GetInternalKey(EKey);
238    auto KeyHash = Info::ComputeHash(Key);
239
240    if (MergedTable *M = getMergedTable()) {
241      auto It = M->Data.find(Key);
242      if (It != M->Data.end())
243        Result = It->second;
244    }
245
246    data_type_builder ResultBuilder(Result);
247
248    for (auto *ODT : tables()) {
249      auto &HT = ODT->Table;
250      auto It = HT.find_hashed(Key, KeyHash);
251      if (It != HT.end())
252        HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(),
253                                     ResultBuilder);
254    }
255
256    return Result;
257  }
258
259  /// Read all the lookup results into a single value. This only makes
260  /// sense if merging values across keys is meaningful.
261  data_type findAll() {
262    data_type Result;
263    data_type_builder ResultBuilder(Result);
264
265    if (!PendingOverrides.empty())
266      removeOverriddenTables();
267
268    if (MergedTable *M = getMergedTable()) {
269      for (auto &KV : M->Data)
270        Info::MergeDataInto(KV.second, ResultBuilder);
271    }
272
273    for (auto *ODT : tables()) {
274      auto &HT = ODT->Table;
275      Info &InfoObj = HT.getInfoObj();
276      for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
277        auto *LocalPtr = I.getItem();
278
279        // FIXME: Don't rely on the OnDiskHashTable format here.
280        auto L = InfoObj.ReadKeyDataLength(LocalPtr);
281        const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
282        InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder);
283      }
284    }
285
286    return Result;
287  }
288};
289
290/// Writer for the on-disk hash table.
291template<typename ReaderInfo, typename WriterInfo>
292class MultiOnDiskHashTableGenerator {
293  using BaseTable = MultiOnDiskHashTable<ReaderInfo>;
294  using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>;
295
296  Generator Gen;
297
298public:
299  MultiOnDiskHashTableGenerator() : Gen() {}
300
301  void insert(typename WriterInfo::key_type_ref Key,
302              typename WriterInfo::data_type_ref Data, WriterInfo &Info) {
303    Gen.insert(Key, Data, Info);
304  }
305
306  void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info,
307            const BaseTable *Base) {
308    using namespace llvm::support;
309
310    llvm::raw_svector_ostream OutStream(Out);
311
312    // Write our header information.
313    {
314      endian::Writer Writer(OutStream, little);
315
316      // Reserve four bytes for the bucket offset.
317      Writer.write<uint32_t>(0);
318
319      if (auto *Merged = Base ? Base->getMergedTable() : nullptr) {
320        // Write list of overridden files.
321        Writer.write<uint32_t>(Merged->Files.size());
322        for (const auto &F : Merged->Files)
323          Info.EmitFileRef(OutStream, F);
324
325        // Add all merged entries from Base to the generator.
326        for (auto &KV : Merged->Data) {
327          if (!Gen.contains(KV.first, Info))
328            Gen.insert(KV.first, Info.ImportData(KV.second), Info);
329        }
330      } else {
331        Writer.write<uint32_t>(0);
332      }
333    }
334
335    // Write the table itself.
336    uint32_t BucketOffset = Gen.Emit(OutStream, Info);
337
338    // Replace the first four bytes with the bucket offset.
339    endian::write32le(Out.data(), BucketOffset);
340  }
341};
342
343} // namespace serialization
344} // namespace clang
345
346#endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
347