1//===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// \file
11/// \brief Defines facilities for reading and writing on-disk hash tables.
12///
13//===----------------------------------------------------------------------===//
14#ifndef LLVM_SUPPORT_ONDISKHASHTABLE_H
15#define LLVM_SUPPORT_ONDISKHASHTABLE_H
16
17#include "llvm/Support/AlignOf.h"
18#include "llvm/Support/Allocator.h"
19#include "llvm/Support/DataTypes.h"
20#include "llvm/Support/EndianStream.h"
21#include "llvm/Support/Host.h"
22#include "llvm/Support/MathExtras.h"
23#include "llvm/Support/raw_ostream.h"
24#include <cassert>
25#include <cstdlib>
26
27namespace llvm {
28
29/// \brief Generates an on disk hash table.
30///
31/// This needs an \c Info that handles storing values into the hash table's
32/// payload and computes the hash for a given key. This should provide the
33/// following interface:
34///
35/// \code
36/// class ExampleInfo {
37/// public:
38///   typedef ExampleKey key_type;   // Must be copy constructible
39///   typedef ExampleKey &key_type_ref;
40///   typedef ExampleData data_type; // Must be copy constructible
41///   typedef ExampleData &data_type_ref;
42///   typedef uint32_t hash_value_type; // The type the hash function returns.
43///   typedef uint32_t offset_type; // The type for offsets into the table.
44///
45///   /// Calculate the hash for Key
46///   static hash_value_type ComputeHash(key_type_ref Key);
47///   /// Return the lengths, in bytes, of the given Key/Data pair.
48///   static std::pair<offset_type, offset_type>
49///   EmitKeyDataLength(raw_ostream &Out, key_type_ref Key, data_type_ref Data);
50///   /// Write Key to Out.  KeyLen is the length from EmitKeyDataLength.
51///   static void EmitKey(raw_ostream &Out, key_type_ref Key,
52///                       offset_type KeyLen);
53///   /// Write Data to Out.  DataLen is the length from EmitKeyDataLength.
54///   static void EmitData(raw_ostream &Out, key_type_ref Key,
55///                        data_type_ref Data, offset_type DataLen);
56///   /// Determine if two keys are equal. Optional, only needed by contains.
57///   static bool EqualKey(key_type_ref Key1, key_type_ref Key2);
58/// };
59/// \endcode
60template <typename Info> class OnDiskChainedHashTableGenerator {
61  /// \brief A single item in the hash table.
62  class Item {
63  public:
64    typename Info::key_type Key;
65    typename Info::data_type Data;
66    Item *Next;
67    const typename Info::hash_value_type Hash;
68
69    Item(typename Info::key_type_ref Key, typename Info::data_type_ref Data,
70         Info &InfoObj)
71        : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {}
72  };
73
74  typedef typename Info::offset_type offset_type;
75  offset_type NumBuckets;
76  offset_type NumEntries;
77  llvm::SpecificBumpPtrAllocator<Item> BA;
78
79  /// \brief A linked list of values in a particular hash bucket.
80  struct Bucket {
81    offset_type Off;
82    unsigned Length;
83    Item *Head;
84  };
85
86  Bucket *Buckets;
87
88private:
89  /// \brief Insert an item into the appropriate hash bucket.
90  void insert(Bucket *Buckets, size_t Size, Item *E) {
91    Bucket &B = Buckets[E->Hash & (Size - 1)];
92    E->Next = B.Head;
93    ++B.Length;
94    B.Head = E;
95  }
96
97  /// \brief Resize the hash table, moving the old entries into the new buckets.
98  void resize(size_t NewSize) {
99    Bucket *NewBuckets = (Bucket *)std::calloc(NewSize, sizeof(Bucket));
100    // Populate NewBuckets with the old entries.
101    for (size_t I = 0; I < NumBuckets; ++I)
102      for (Item *E = Buckets[I].Head; E;) {
103        Item *N = E->Next;
104        E->Next = nullptr;
105        insert(NewBuckets, NewSize, E);
106        E = N;
107      }
108
109    free(Buckets);
110    NumBuckets = NewSize;
111    Buckets = NewBuckets;
112  }
113
114public:
115  /// \brief Insert an entry into the table.
116  void insert(typename Info::key_type_ref Key,
117              typename Info::data_type_ref Data) {
118    Info InfoObj;
119    insert(Key, Data, InfoObj);
120  }
121
122  /// \brief Insert an entry into the table.
123  ///
124  /// Uses the provided Info instead of a stack allocated one.
125  void insert(typename Info::key_type_ref Key,
126              typename Info::data_type_ref Data, Info &InfoObj) {
127    ++NumEntries;
128    if (4 * NumEntries >= 3 * NumBuckets)
129      resize(NumBuckets * 2);
130    insert(Buckets, NumBuckets, new (BA.Allocate()) Item(Key, Data, InfoObj));
131  }
132
133  /// \brief Determine whether an entry has been inserted.
134  bool contains(typename Info::key_type_ref Key, Info &InfoObj) {
135    unsigned Hash = InfoObj.ComputeHash(Key);
136    for (Item *I = Buckets[Hash & (NumBuckets - 1)].Head; I; I = I->Next)
137      if (I->Hash == Hash && InfoObj.EqualKey(I->Key, Key))
138        return true;
139    return false;
140  }
141
142  /// \brief Emit the table to Out, which must not be at offset 0.
143  offset_type Emit(raw_ostream &Out) {
144    Info InfoObj;
145    return Emit(Out, InfoObj);
146  }
147
148  /// \brief Emit the table to Out, which must not be at offset 0.
149  ///
150  /// Uses the provided Info instead of a stack allocated one.
151  offset_type Emit(raw_ostream &Out, Info &InfoObj) {
152    using namespace llvm::support;
153    endian::Writer<little> LE(Out);
154
155    // Emit the payload of the table.
156    for (offset_type I = 0; I < NumBuckets; ++I) {
157      Bucket &B = Buckets[I];
158      if (!B.Head)
159        continue;
160
161      // Store the offset for the data of this bucket.
162      B.Off = Out.tell();
163      assert(B.Off && "Cannot write a bucket at offset 0. Please add padding.");
164
165      // Write out the number of items in the bucket.
166      LE.write<uint16_t>(B.Length);
167      assert(B.Length != 0 && "Bucket has a head but zero length?");
168
169      // Write out the entries in the bucket.
170      for (Item *I = B.Head; I; I = I->Next) {
171        LE.write<typename Info::hash_value_type>(I->Hash);
172        const std::pair<offset_type, offset_type> &Len =
173            InfoObj.EmitKeyDataLength(Out, I->Key, I->Data);
174#ifdef NDEBUG
175        InfoObj.EmitKey(Out, I->Key, Len.first);
176        InfoObj.EmitData(Out, I->Key, I->Data, Len.second);
177#else
178        // In asserts mode, check that the users length matches the data they
179        // wrote.
180        uint64_t KeyStart = Out.tell();
181        InfoObj.EmitKey(Out, I->Key, Len.first);
182        uint64_t DataStart = Out.tell();
183        InfoObj.EmitData(Out, I->Key, I->Data, Len.second);
184        uint64_t End = Out.tell();
185        assert(offset_type(DataStart - KeyStart) == Len.first &&
186               "key length does not match bytes written");
187        assert(offset_type(End - DataStart) == Len.second &&
188               "data length does not match bytes written");
189#endif
190      }
191    }
192
193    // Pad with zeros so that we can start the hashtable at an aligned address.
194    offset_type TableOff = Out.tell();
195    uint64_t N = llvm::OffsetToAlignment(TableOff, alignOf<offset_type>());
196    TableOff += N;
197    while (N--)
198      LE.write<uint8_t>(0);
199
200    // Emit the hashtable itself.
201    LE.write<offset_type>(NumBuckets);
202    LE.write<offset_type>(NumEntries);
203    for (offset_type I = 0; I < NumBuckets; ++I)
204      LE.write<offset_type>(Buckets[I].Off);
205
206    return TableOff;
207  }
208
209  OnDiskChainedHashTableGenerator() {
210    NumEntries = 0;
211    NumBuckets = 64;
212    // Note that we do not need to run the constructors of the individual
213    // Bucket objects since 'calloc' returns bytes that are all 0.
214    Buckets = (Bucket *)std::calloc(NumBuckets, sizeof(Bucket));
215  }
216
217  ~OnDiskChainedHashTableGenerator() { std::free(Buckets); }
218};
219
220/// \brief Provides lookup on an on disk hash table.
221///
222/// This needs an \c Info that handles reading values from the hash table's
223/// payload and computes the hash for a given key. This should provide the
224/// following interface:
225///
226/// \code
227/// class ExampleLookupInfo {
228/// public:
229///   typedef ExampleData data_type;
230///   typedef ExampleInternalKey internal_key_type; // The stored key type.
231///   typedef ExampleKey external_key_type; // The type to pass to find().
232///   typedef uint32_t hash_value_type; // The type the hash function returns.
233///   typedef uint32_t offset_type; // The type for offsets into the table.
234///
235///   /// Compare two keys for equality.
236///   static bool EqualKey(internal_key_type &Key1, internal_key_type &Key2);
237///   /// Calculate the hash for the given key.
238///   static hash_value_type ComputeHash(internal_key_type &IKey);
239///   /// Translate from the semantic type of a key in the hash table to the
240///   /// type that is actually stored and used for hashing and comparisons.
241///   /// The internal and external types are often the same, in which case this
242///   /// can simply return the passed in value.
243///   static const internal_key_type &GetInternalKey(external_key_type &EKey);
244///   /// Read the key and data length from Buffer, leaving it pointing at the
245///   /// following byte.
246///   static std::pair<offset_type, offset_type>
247///   ReadKeyDataLength(const unsigned char *&Buffer);
248///   /// Read the key from Buffer, given the KeyLen as reported from
249///   /// ReadKeyDataLength.
250///   const internal_key_type &ReadKey(const unsigned char *Buffer,
251///                                    offset_type KeyLen);
252///   /// Read the data for Key from Buffer, given the DataLen as reported from
253///   /// ReadKeyDataLength.
///   data_type ReadData(const internal_key_type &Key,
///                      const unsigned char *Buffer, offset_type DataLen);
256/// };
257/// \endcode
258template <typename Info> class OnDiskChainedHashTable {
259  const typename Info::offset_type NumBuckets;
260  const typename Info::offset_type NumEntries;
261  const unsigned char *const Buckets;
262  const unsigned char *const Base;
263  Info InfoObj;
264
265public:
266  typedef Info InfoType;
267  typedef typename Info::internal_key_type internal_key_type;
268  typedef typename Info::external_key_type external_key_type;
269  typedef typename Info::data_type data_type;
270  typedef typename Info::hash_value_type hash_value_type;
271  typedef typename Info::offset_type offset_type;
272
273  OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
274                         const unsigned char *Buckets,
275                         const unsigned char *Base,
276                         const Info &InfoObj = Info())
277      : NumBuckets(NumBuckets), NumEntries(NumEntries), Buckets(Buckets),
278        Base(Base), InfoObj(InfoObj) {
279    assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
280           "'buckets' must have a 4-byte alignment");
281  }
282
283  /// Read the number of buckets and the number of entries from a hash table
284  /// produced by OnDiskHashTableGenerator::Emit, and advance the Buckets
285  /// pointer past them.
286  static std::pair<offset_type, offset_type>
287  readNumBucketsAndEntries(const unsigned char *&Buckets) {
288    assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
289           "buckets should be 4-byte aligned.");
290    using namespace llvm::support;
291    offset_type NumBuckets =
292        endian::readNext<offset_type, little, aligned>(Buckets);
293    offset_type NumEntries =
294        endian::readNext<offset_type, little, aligned>(Buckets);
295    return std::make_pair(NumBuckets, NumEntries);
296  }
297
298  offset_type getNumBuckets() const { return NumBuckets; }
299  offset_type getNumEntries() const { return NumEntries; }
300  const unsigned char *getBase() const { return Base; }
301  const unsigned char *getBuckets() const { return Buckets; }
302
303  bool isEmpty() const { return NumEntries == 0; }
304
305  class iterator {
306    internal_key_type Key;
307    const unsigned char *const Data;
308    const offset_type Len;
309    Info *InfoObj;
310
311  public:
312    iterator() : Data(nullptr), Len(0) {}
313    iterator(const internal_key_type K, const unsigned char *D, offset_type L,
314             Info *InfoObj)
315        : Key(K), Data(D), Len(L), InfoObj(InfoObj) {}
316
317    data_type operator*() const { return InfoObj->ReadData(Key, Data, Len); }
318
319    const unsigned char *getDataPtr() const { return Data; }
320    offset_type getDataLen() const { return Len; }
321
322    bool operator==(const iterator &X) const { return X.Data == Data; }
323    bool operator!=(const iterator &X) const { return X.Data != Data; }
324  };
325
326  /// \brief Look up the stored data for a particular key.
327  iterator find(const external_key_type &EKey, Info *InfoPtr = nullptr) {
328    const internal_key_type &IKey = InfoObj.GetInternalKey(EKey);
329    hash_value_type KeyHash = InfoObj.ComputeHash(IKey);
330    return find_hashed(IKey, KeyHash, InfoPtr);
331  }
332
333  /// \brief Look up the stored data for a particular key with a known hash.
334  iterator find_hashed(const internal_key_type &IKey, hash_value_type KeyHash,
335                       Info *InfoPtr = nullptr) {
336    using namespace llvm::support;
337
338    if (!InfoPtr)
339      InfoPtr = &InfoObj;
340
341    // Each bucket is just an offset into the hash table file.
342    offset_type Idx = KeyHash & (NumBuckets - 1);
343    const unsigned char *Bucket = Buckets + sizeof(offset_type) * Idx;
344
345    offset_type Offset = endian::readNext<offset_type, little, aligned>(Bucket);
346    if (Offset == 0)
347      return iterator(); // Empty bucket.
348    const unsigned char *Items = Base + Offset;
349
350    // 'Items' starts with a 16-bit unsigned integer representing the
351    // number of items in this bucket.
352    unsigned Len = endian::readNext<uint16_t, little, unaligned>(Items);
353
354    for (unsigned i = 0; i < Len; ++i) {
355      // Read the hash.
356      hash_value_type ItemHash =
357          endian::readNext<hash_value_type, little, unaligned>(Items);
358
359      // Determine the length of the key and the data.
360      const std::pair<offset_type, offset_type> &L =
361          Info::ReadKeyDataLength(Items);
362      offset_type ItemLen = L.first + L.second;
363
364      // Compare the hashes.  If they are not the same, skip the entry entirely.
365      if (ItemHash != KeyHash) {
366        Items += ItemLen;
367        continue;
368      }
369
370      // Read the key.
371      const internal_key_type &X =
372          InfoPtr->ReadKey((const unsigned char *const)Items, L.first);
373
374      // If the key doesn't match just skip reading the value.
375      if (!InfoPtr->EqualKey(X, IKey)) {
376        Items += ItemLen;
377        continue;
378      }
379
380      // The key matches!
381      return iterator(X, Items + L.first, L.second, InfoPtr);
382    }
383
384    return iterator();
385  }
386
387  iterator end() const { return iterator(); }
388
389  Info &getInfoObj() { return InfoObj; }
390
391  /// \brief Create the hash table.
392  ///
393  /// \param Buckets is the beginning of the hash table itself, which follows
394  /// the payload of entire structure. This is the value returned by
395  /// OnDiskHashTableGenerator::Emit.
396  ///
397  /// \param Base is the point from which all offsets into the structure are
398  /// based. This is offset 0 in the stream that was used when Emitting the
399  /// table.
400  static OnDiskChainedHashTable *Create(const unsigned char *Buckets,
401                                        const unsigned char *const Base,
402                                        const Info &InfoObj = Info()) {
403    assert(Buckets > Base);
404    auto NumBucketsAndEntries = readNumBucketsAndEntries(Buckets);
405    return new OnDiskChainedHashTable<Info>(NumBucketsAndEntries.first,
406                                            NumBucketsAndEntries.second,
407                                            Buckets, Base, InfoObj);
408  }
409};
410
411/// \brief Provides lookup and iteration over an on disk hash table.
412///
413/// \copydetails llvm::OnDiskChainedHashTable
414template <typename Info>
415class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> {
416  const unsigned char *Payload;
417
418public:
419  typedef OnDiskChainedHashTable<Info>          base_type;
420  typedef typename base_type::internal_key_type internal_key_type;
421  typedef typename base_type::external_key_type external_key_type;
422  typedef typename base_type::data_type         data_type;
423  typedef typename base_type::hash_value_type   hash_value_type;
424  typedef typename base_type::offset_type       offset_type;
425
426private:
427  /// \brief Iterates over all of the keys in the table.
428  class iterator_base {
429    const unsigned char *Ptr;
430    offset_type NumItemsInBucketLeft;
431    offset_type NumEntriesLeft;
432
433  public:
434    typedef external_key_type value_type;
435
436    iterator_base(const unsigned char *const Ptr, offset_type NumEntries)
437        : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries) {}
438    iterator_base()
439        : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0) {}
440
441    friend bool operator==(const iterator_base &X, const iterator_base &Y) {
442      return X.NumEntriesLeft == Y.NumEntriesLeft;
443    }
444    friend bool operator!=(const iterator_base &X, const iterator_base &Y) {
445      return X.NumEntriesLeft != Y.NumEntriesLeft;
446    }
447
448    /// Move to the next item.
449    void advance() {
450      using namespace llvm::support;
451      if (!NumItemsInBucketLeft) {
452        // 'Items' starts with a 16-bit unsigned integer representing the
453        // number of items in this bucket.
454        NumItemsInBucketLeft =
455            endian::readNext<uint16_t, little, unaligned>(Ptr);
456      }
457      Ptr += sizeof(hash_value_type); // Skip the hash.
458      // Determine the length of the key and the data.
459      const std::pair<offset_type, offset_type> &L =
460          Info::ReadKeyDataLength(Ptr);
461      Ptr += L.first + L.second;
462      assert(NumItemsInBucketLeft);
463      --NumItemsInBucketLeft;
464      assert(NumEntriesLeft);
465      --NumEntriesLeft;
466    }
467
468    /// Get the start of the item as written by the trait (after the hash and
469    /// immediately before the key and value length).
470    const unsigned char *getItem() const {
471      return Ptr + (NumItemsInBucketLeft ? 0 : 2) + sizeof(hash_value_type);
472    }
473  };
474
475public:
476  OnDiskIterableChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
477                                 const unsigned char *Buckets,
478                                 const unsigned char *Payload,
479                                 const unsigned char *Base,
480                                 const Info &InfoObj = Info())
481      : base_type(NumBuckets, NumEntries, Buckets, Base, InfoObj),
482        Payload(Payload) {}
483
484  /// \brief Iterates over all of the keys in the table.
485  class key_iterator : public iterator_base {
486    Info *InfoObj;
487
488  public:
489    typedef external_key_type value_type;
490
491    key_iterator(const unsigned char *const Ptr, offset_type NumEntries,
492                 Info *InfoObj)
493        : iterator_base(Ptr, NumEntries), InfoObj(InfoObj) {}
494    key_iterator() : iterator_base(), InfoObj() {}
495
496    key_iterator &operator++() {
497      this->advance();
498      return *this;
499    }
500    key_iterator operator++(int) { // Postincrement
501      key_iterator tmp = *this;
502      ++*this;
503      return tmp;
504    }
505
506    internal_key_type getInternalKey() const {
507      auto *LocalPtr = this->getItem();
508
509      // Determine the length of the key and the data.
510      auto L = Info::ReadKeyDataLength(LocalPtr);
511
512      // Read the key.
513      return InfoObj->ReadKey(LocalPtr, L.first);
514    }
515
516    value_type operator*() const {
517      return InfoObj->GetExternalKey(getInternalKey());
518    }
519  };
520
521  key_iterator key_begin() {
522    return key_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
523  }
524  key_iterator key_end() { return key_iterator(); }
525
526  iterator_range<key_iterator> keys() {
527    return make_range(key_begin(), key_end());
528  }
529
530  /// \brief Iterates over all the entries in the table, returning the data.
531  class data_iterator : public iterator_base {
532    Info *InfoObj;
533
534  public:
535    typedef data_type value_type;
536
537    data_iterator(const unsigned char *const Ptr, offset_type NumEntries,
538                  Info *InfoObj)
539        : iterator_base(Ptr, NumEntries), InfoObj(InfoObj) {}
540    data_iterator() : iterator_base(), InfoObj() {}
541
542    data_iterator &operator++() { // Preincrement
543      this->advance();
544      return *this;
545    }
546    data_iterator operator++(int) { // Postincrement
547      data_iterator tmp = *this;
548      ++*this;
549      return tmp;
550    }
551
552    value_type operator*() const {
553      auto *LocalPtr = this->getItem();
554
555      // Determine the length of the key and the data.
556      auto L = Info::ReadKeyDataLength(LocalPtr);
557
558      // Read the key.
559      const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
560      return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
561    }
562  };
563
564  data_iterator data_begin() {
565    return data_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
566  }
567  data_iterator data_end() { return data_iterator(); }
568
569  iterator_range<data_iterator> data() {
570    return make_range(data_begin(), data_end());
571  }
572
573  /// \brief Create the hash table.
574  ///
575  /// \param Buckets is the beginning of the hash table itself, which follows
576  /// the payload of entire structure. This is the value returned by
577  /// OnDiskHashTableGenerator::Emit.
578  ///
579  /// \param Payload is the beginning of the data contained in the table.  This
580  /// is Base plus any padding or header data that was stored, ie, the offset
581  /// that the stream was at when calling Emit.
582  ///
583  /// \param Base is the point from which all offsets into the structure are
584  /// based. This is offset 0 in the stream that was used when Emitting the
585  /// table.
586  static OnDiskIterableChainedHashTable *
587  Create(const unsigned char *Buckets, const unsigned char *const Payload,
588         const unsigned char *const Base, const Info &InfoObj = Info()) {
589    assert(Buckets > Base);
590    auto NumBucketsAndEntries =
591        OnDiskIterableChainedHashTable<Info>::readNumBucketsAndEntries(Buckets);
592    return new OnDiskIterableChainedHashTable<Info>(
593        NumBucketsAndEntries.first, NumBucketsAndEntries.second,
594        Buckets, Payload, Base, InfoObj);
595  }
596};
597
598} // end namespace llvm
599
600#endif
601