1//===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8/// 9/// \file 10/// Defines facilities for reading and writing on-disk hash tables. 11/// 12//===----------------------------------------------------------------------===// 13#ifndef LLVM_SUPPORT_ONDISKHASHTABLE_H 14#define LLVM_SUPPORT_ONDISKHASHTABLE_H 15 16#include "llvm/Support/Alignment.h" 17#include "llvm/Support/Allocator.h" 18#include "llvm/Support/DataTypes.h" 19#include "llvm/Support/EndianStream.h" 20#include "llvm/Support/MathExtras.h" 21#include "llvm/Support/raw_ostream.h" 22#include <cassert> 23#include <cstdlib> 24 25namespace llvm { 26 27/// Generates an on disk hash table. 28/// 29/// This needs an \c Info that handles storing values into the hash table's 30/// payload and computes the hash for a given key. This should provide the 31/// following interface: 32/// 33/// \code 34/// class ExampleInfo { 35/// public: 36/// typedef ExampleKey key_type; // Must be copy constructible 37/// typedef ExampleKey &key_type_ref; 38/// typedef ExampleData data_type; // Must be copy constructible 39/// typedef ExampleData &data_type_ref; 40/// typedef uint32_t hash_value_type; // The type the hash function returns. 41/// typedef uint32_t offset_type; // The type for offsets into the table. 42/// 43/// /// Calculate the hash for Key 44/// static hash_value_type ComputeHash(key_type_ref Key); 45/// /// Return the lengths, in bytes, of the given Key/Data pair. 46/// static std::pair<offset_type, offset_type> 47/// EmitKeyDataLength(raw_ostream &Out, key_type_ref Key, data_type_ref Data); 48/// /// Write Key to Out. KeyLen is the length from EmitKeyDataLength. 49/// static void EmitKey(raw_ostream &Out, key_type_ref Key, 50/// offset_type KeyLen); 51/// /// Write Data to Out. DataLen is the length from EmitKeyDataLength. 52/// static void EmitData(raw_ostream &Out, key_type_ref Key, 53/// data_type_ref Data, offset_type DataLen); 54/// /// Determine if two keys are equal. Optional, only needed by contains. 55/// static bool EqualKey(key_type_ref Key1, key_type_ref Key2); 56/// }; 57/// \endcode 58template <typename Info> class OnDiskChainedHashTableGenerator { 59 /// A single item in the hash table. 60 class Item { 61 public: 62 typename Info::key_type Key; 63 typename Info::data_type Data; 64 Item *Next; 65 const typename Info::hash_value_type Hash; 66 67 Item(typename Info::key_type_ref Key, typename Info::data_type_ref Data, 68 Info &InfoObj) 69 : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {} 70 }; 71 72 typedef typename Info::offset_type offset_type; 73 offset_type NumBuckets; 74 offset_type NumEntries; 75 llvm::SpecificBumpPtrAllocator<Item> BA; 76 77 /// A linked list of values in a particular hash bucket. 78 struct Bucket { 79 offset_type Off; 80 unsigned Length; 81 Item *Head; 82 }; 83 84 Bucket *Buckets; 85 86private: 87 /// Insert an item into the appropriate hash bucket. 88 void insert(Bucket *Buckets, size_t Size, Item *E) { 89 Bucket &B = Buckets[E->Hash & (Size - 1)]; 90 E->Next = B.Head; 91 ++B.Length; 92 B.Head = E; 93 } 94 95 /// Resize the hash table, moving the old entries into the new buckets. 96 void resize(size_t NewSize) { 97 Bucket *NewBuckets = static_cast<Bucket *>( 98 safe_calloc(NewSize, sizeof(Bucket))); 99 // Populate NewBuckets with the old entries. 100 for (size_t I = 0; I < NumBuckets; ++I) 101 for (Item *E = Buckets[I].Head; E;) { 102 Item *N = E->Next; 103 E->Next = nullptr; 104 insert(NewBuckets, NewSize, E); 105 E = N; 106 } 107 108 free(Buckets); 109 NumBuckets = NewSize; 110 Buckets = NewBuckets; 111 } 112 113public: 114 /// Insert an entry into the table. 115 void insert(typename Info::key_type_ref Key, 116 typename Info::data_type_ref Data) { 117 Info InfoObj; 118 insert(Key, Data, InfoObj); 119 } 120 121 /// Insert an entry into the table. 122 /// 123 /// Uses the provided Info instead of a stack allocated one. 124 void insert(typename Info::key_type_ref Key, 125 typename Info::data_type_ref Data, Info &InfoObj) { 126 ++NumEntries; 127 if (4 * NumEntries >= 3 * NumBuckets) 128 resize(NumBuckets * 2); 129 insert(Buckets, NumBuckets, new (BA.Allocate()) Item(Key, Data, InfoObj)); 130 } 131 132 /// Determine whether an entry has been inserted. 133 bool contains(typename Info::key_type_ref Key, Info &InfoObj) { 134 unsigned Hash = InfoObj.ComputeHash(Key); 135 for (Item *I = Buckets[Hash & (NumBuckets - 1)].Head; I; I = I->Next) 136 if (I->Hash == Hash && InfoObj.EqualKey(I->Key, Key)) 137 return true; 138 return false; 139 } 140 141 /// Emit the table to Out, which must not be at offset 0. 142 offset_type Emit(raw_ostream &Out) { 143 Info InfoObj; 144 return Emit(Out, InfoObj); 145 } 146 147 /// Emit the table to Out, which must not be at offset 0. 148 /// 149 /// Uses the provided Info instead of a stack allocated one. 150 offset_type Emit(raw_ostream &Out, Info &InfoObj) { 151 using namespace llvm::support; 152 endian::Writer LE(Out, llvm::endianness::little); 153 154 // Now we're done adding entries, resize the bucket list if it's 155 // significantly too large. (This only happens if the number of 156 // entries is small and we're within our initial allocation of 157 // 64 buckets.) We aim for an occupancy ratio in [3/8, 3/4). 158 // 159 // As a special case, if there are two or fewer entries, just 160 // form a single bucket. A linear scan is fine in that case, and 161 // this is very common in C++ class lookup tables. This also 162 // guarantees we produce at least one bucket for an empty table. 163 // 164 // FIXME: Try computing a perfect hash function at this point. 165 unsigned TargetNumBuckets = 166 NumEntries <= 2 ? 1 : llvm::bit_ceil(NumEntries * 4 / 3 + 1); 167 if (TargetNumBuckets != NumBuckets) 168 resize(TargetNumBuckets); 169 170 // Emit the payload of the table. 171 for (offset_type I = 0; I < NumBuckets; ++I) { 172 Bucket &B = Buckets[I]; 173 if (!B.Head) 174 continue; 175 176 // Store the offset for the data of this bucket. 177 B.Off = Out.tell(); 178 assert(B.Off && "Cannot write a bucket at offset 0. Please add padding."); 179 180 // Write out the number of items in the bucket. 181 LE.write<uint16_t>(B.Length); 182 assert(B.Length != 0 && "Bucket has a head but zero length?"); 183 184 // Write out the entries in the bucket. 185 for (Item *I = B.Head; I; I = I->Next) { 186 LE.write<typename Info::hash_value_type>(I->Hash); 187 const std::pair<offset_type, offset_type> &Len = 188 InfoObj.EmitKeyDataLength(Out, I->Key, I->Data); 189#ifdef NDEBUG 190 InfoObj.EmitKey(Out, I->Key, Len.first); 191 InfoObj.EmitData(Out, I->Key, I->Data, Len.second); 192#else 193 // In asserts mode, check that the users length matches the data they 194 // wrote. 195 uint64_t KeyStart = Out.tell(); 196 InfoObj.EmitKey(Out, I->Key, Len.first); 197 uint64_t DataStart = Out.tell(); 198 InfoObj.EmitData(Out, I->Key, I->Data, Len.second); 199 uint64_t End = Out.tell(); 200 assert(offset_type(DataStart - KeyStart) == Len.first && 201 "key length does not match bytes written"); 202 assert(offset_type(End - DataStart) == Len.second && 203 "data length does not match bytes written"); 204#endif 205 } 206 } 207 208 // Pad with zeros so that we can start the hashtable at an aligned address. 209 offset_type TableOff = Out.tell(); 210 uint64_t N = offsetToAlignment(TableOff, Align(alignof(offset_type))); 211 TableOff += N; 212 while (N--) 213 LE.write<uint8_t>(0); 214 215 // Emit the hashtable itself. 216 LE.write<offset_type>(NumBuckets); 217 LE.write<offset_type>(NumEntries); 218 for (offset_type I = 0; I < NumBuckets; ++I) 219 LE.write<offset_type>(Buckets[I].Off); 220 221 return TableOff; 222 } 223 224 OnDiskChainedHashTableGenerator() { 225 NumEntries = 0; 226 NumBuckets = 64; 227 // Note that we do not need to run the constructors of the individual 228 // Bucket objects since 'calloc' returns bytes that are all 0. 229 Buckets = static_cast<Bucket *>(safe_calloc(NumBuckets, sizeof(Bucket))); 230 } 231 232 ~OnDiskChainedHashTableGenerator() { std::free(Buckets); } 233}; 234 235/// Provides lookup on an on disk hash table. 236/// 237/// This needs an \c Info that handles reading values from the hash table's 238/// payload and computes the hash for a given key. This should provide the 239/// following interface: 240/// 241/// \code 242/// class ExampleLookupInfo { 243/// public: 244/// typedef ExampleData data_type; 245/// typedef ExampleInternalKey internal_key_type; // The stored key type. 246/// typedef ExampleKey external_key_type; // The type to pass to find(). 247/// typedef uint32_t hash_value_type; // The type the hash function returns. 248/// typedef uint32_t offset_type; // The type for offsets into the table. 249/// 250/// /// Compare two keys for equality. 251/// static bool EqualKey(internal_key_type &Key1, internal_key_type &Key2); 252/// /// Calculate the hash for the given key. 253/// static hash_value_type ComputeHash(internal_key_type &IKey); 254/// /// Translate from the semantic type of a key in the hash table to the 255/// /// type that is actually stored and used for hashing and comparisons. 256/// /// The internal and external types are often the same, in which case this 257/// /// can simply return the passed in value. 258/// static const internal_key_type &GetInternalKey(external_key_type &EKey); 259/// /// Read the key and data length from Buffer, leaving it pointing at the 260/// /// following byte. 261/// static std::pair<offset_type, offset_type> 262/// ReadKeyDataLength(const unsigned char *&Buffer); 263/// /// Read the key from Buffer, given the KeyLen as reported from 264/// /// ReadKeyDataLength. 265/// const internal_key_type &ReadKey(const unsigned char *Buffer, 266/// offset_type KeyLen); 267/// /// Read the data for Key from Buffer, given the DataLen as reported from 268/// /// ReadKeyDataLength. 269/// data_type ReadData(StringRef Key, const unsigned char *Buffer, 270/// offset_type DataLen); 271/// }; 272/// \endcode 273template <typename Info> class OnDiskChainedHashTable { 274 const typename Info::offset_type NumBuckets; 275 const typename Info::offset_type NumEntries; 276 const unsigned char *const Buckets; 277 const unsigned char *const Base; 278 Info InfoObj; 279 280public: 281 typedef Info InfoType; 282 typedef typename Info::internal_key_type internal_key_type; 283 typedef typename Info::external_key_type external_key_type; 284 typedef typename Info::data_type data_type; 285 typedef typename Info::hash_value_type hash_value_type; 286 typedef typename Info::offset_type offset_type; 287 288 OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, 289 const unsigned char *Buckets, 290 const unsigned char *Base, 291 const Info &InfoObj = Info()) 292 : NumBuckets(NumBuckets), NumEntries(NumEntries), Buckets(Buckets), 293 Base(Base), InfoObj(InfoObj) { 294 assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 && 295 "'buckets' must have a 4-byte alignment"); 296 } 297 298 /// Read the number of buckets and the number of entries from a hash table 299 /// produced by OnDiskHashTableGenerator::Emit, and advance the Buckets 300 /// pointer past them. 301 static std::pair<offset_type, offset_type> 302 readNumBucketsAndEntries(const unsigned char *&Buckets) { 303 assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 && 304 "buckets should be 4-byte aligned."); 305 using namespace llvm::support; 306 offset_type NumBuckets = 307 endian::readNext<offset_type, llvm::endianness::little, aligned>( 308 Buckets); 309 offset_type NumEntries = 310 endian::readNext<offset_type, llvm::endianness::little, aligned>( 311 Buckets); 312 return std::make_pair(NumBuckets, NumEntries); 313 } 314 315 offset_type getNumBuckets() const { return NumBuckets; } 316 offset_type getNumEntries() const { return NumEntries; } 317 const unsigned char *getBase() const { return Base; } 318 const unsigned char *getBuckets() const { return Buckets; } 319 320 bool isEmpty() const { return NumEntries == 0; } 321 322 class iterator { 323 internal_key_type Key; 324 const unsigned char *const Data; 325 const offset_type Len; 326 Info *InfoObj; 327 328 public: 329 iterator() : Key(), Data(nullptr), Len(0), InfoObj(nullptr) {} 330 iterator(const internal_key_type K, const unsigned char *D, offset_type L, 331 Info *InfoObj) 332 : Key(K), Data(D), Len(L), InfoObj(InfoObj) {} 333 334 data_type operator*() const { return InfoObj->ReadData(Key, Data, Len); } 335 336 const unsigned char *getDataPtr() const { return Data; } 337 offset_type getDataLen() const { return Len; } 338 339 bool operator==(const iterator &X) const { return X.Data == Data; } 340 bool operator!=(const iterator &X) const { return X.Data != Data; } 341 }; 342 343 /// Look up the stored data for a particular key. 344 iterator find(const external_key_type &EKey, Info *InfoPtr = nullptr) { 345 const internal_key_type &IKey = InfoObj.GetInternalKey(EKey); 346 hash_value_type KeyHash = InfoObj.ComputeHash(IKey); 347 return find_hashed(IKey, KeyHash, InfoPtr); 348 } 349 350 /// Look up the stored data for a particular key with a known hash. 351 iterator find_hashed(const internal_key_type &IKey, hash_value_type KeyHash, 352 Info *InfoPtr = nullptr) { 353 using namespace llvm::support; 354 355 if (!InfoPtr) 356 InfoPtr = &InfoObj; 357 358 // Each bucket is just an offset into the hash table file. 359 offset_type Idx = KeyHash & (NumBuckets - 1); 360 const unsigned char *Bucket = Buckets + sizeof(offset_type) * Idx; 361 362 offset_type Offset = 363 endian::readNext<offset_type, llvm::endianness::little, aligned>( 364 Bucket); 365 if (Offset == 0) 366 return iterator(); // Empty bucket. 367 const unsigned char *Items = Base + Offset; 368 369 // 'Items' starts with a 16-bit unsigned integer representing the 370 // number of items in this bucket. 371 unsigned Len = 372 endian::readNext<uint16_t, llvm::endianness::little, unaligned>(Items); 373 374 for (unsigned i = 0; i < Len; ++i) { 375 // Read the hash. 376 hash_value_type ItemHash = 377 endian::readNext<hash_value_type, llvm::endianness::little, 378 unaligned>(Items); 379 380 // Determine the length of the key and the data. 381 const std::pair<offset_type, offset_type> &L = 382 Info::ReadKeyDataLength(Items); 383 offset_type ItemLen = L.first + L.second; 384 385 // Compare the hashes. If they are not the same, skip the entry entirely. 386 if (ItemHash != KeyHash) { 387 Items += ItemLen; 388 continue; 389 } 390 391 // Read the key. 392 const internal_key_type &X = 393 InfoPtr->ReadKey((const unsigned char *const)Items, L.first); 394 395 // If the key doesn't match just skip reading the value. 396 if (!InfoPtr->EqualKey(X, IKey)) { 397 Items += ItemLen; 398 continue; 399 } 400 401 // The key matches! 402 return iterator(X, Items + L.first, L.second, InfoPtr); 403 } 404 405 return iterator(); 406 } 407 408 iterator end() const { return iterator(); } 409 410 Info &getInfoObj() { return InfoObj; } 411 412 /// Create the hash table. 413 /// 414 /// \param Buckets is the beginning of the hash table itself, which follows 415 /// the payload of entire structure. This is the value returned by 416 /// OnDiskHashTableGenerator::Emit. 417 /// 418 /// \param Base is the point from which all offsets into the structure are 419 /// based. This is offset 0 in the stream that was used when Emitting the 420 /// table. 421 static OnDiskChainedHashTable *Create(const unsigned char *Buckets, 422 const unsigned char *const Base, 423 const Info &InfoObj = Info()) { 424 assert(Buckets > Base); 425 auto NumBucketsAndEntries = readNumBucketsAndEntries(Buckets); 426 return new OnDiskChainedHashTable<Info>(NumBucketsAndEntries.first, 427 NumBucketsAndEntries.second, 428 Buckets, Base, InfoObj); 429 } 430}; 431 432/// Provides lookup and iteration over an on disk hash table. 433/// 434/// \copydetails llvm::OnDiskChainedHashTable 435template <typename Info> 436class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { 437 const unsigned char *Payload; 438 439public: 440 typedef OnDiskChainedHashTable<Info> base_type; 441 typedef typename base_type::internal_key_type internal_key_type; 442 typedef typename base_type::external_key_type external_key_type; 443 typedef typename base_type::data_type data_type; 444 typedef typename base_type::hash_value_type hash_value_type; 445 typedef typename base_type::offset_type offset_type; 446 447private: 448 /// Iterates over all of the keys in the table. 449 class iterator_base { 450 const unsigned char *Ptr; 451 offset_type NumItemsInBucketLeft; 452 offset_type NumEntriesLeft; 453 454 public: 455 typedef external_key_type value_type; 456 457 iterator_base(const unsigned char *const Ptr, offset_type NumEntries) 458 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries) {} 459 iterator_base() 460 : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0) {} 461 462 friend bool operator==(const iterator_base &X, const iterator_base &Y) { 463 return X.NumEntriesLeft == Y.NumEntriesLeft; 464 } 465 friend bool operator!=(const iterator_base &X, const iterator_base &Y) { 466 return X.NumEntriesLeft != Y.NumEntriesLeft; 467 } 468 469 /// Move to the next item. 470 void advance() { 471 using namespace llvm::support; 472 if (!NumItemsInBucketLeft) { 473 // 'Items' starts with a 16-bit unsigned integer representing the 474 // number of items in this bucket. 475 NumItemsInBucketLeft = 476 endian::readNext<uint16_t, llvm::endianness::little, unaligned>( 477 Ptr); 478 } 479 Ptr += sizeof(hash_value_type); // Skip the hash. 480 // Determine the length of the key and the data. 481 const std::pair<offset_type, offset_type> &L = 482 Info::ReadKeyDataLength(Ptr); 483 Ptr += L.first + L.second; 484 assert(NumItemsInBucketLeft); 485 --NumItemsInBucketLeft; 486 assert(NumEntriesLeft); 487 --NumEntriesLeft; 488 } 489 490 /// Get the start of the item as written by the trait (after the hash and 491 /// immediately before the key and value length). 492 const unsigned char *getItem() const { 493 return Ptr + (NumItemsInBucketLeft ? 0 : 2) + sizeof(hash_value_type); 494 } 495 }; 496 497public: 498 OnDiskIterableChainedHashTable(offset_type NumBuckets, offset_type NumEntries, 499 const unsigned char *Buckets, 500 const unsigned char *Payload, 501 const unsigned char *Base, 502 const Info &InfoObj = Info()) 503 : base_type(NumBuckets, NumEntries, Buckets, Base, InfoObj), 504 Payload(Payload) {} 505 506 /// Iterates over all of the keys in the table. 507 class key_iterator : public iterator_base { 508 Info *InfoObj; 509 510 public: 511 typedef external_key_type value_type; 512 513 key_iterator(const unsigned char *const Ptr, offset_type NumEntries, 514 Info *InfoObj) 515 : iterator_base(Ptr, NumEntries), InfoObj(InfoObj) {} 516 key_iterator() : iterator_base(), InfoObj() {} 517 518 key_iterator &operator++() { 519 this->advance(); 520 return *this; 521 } 522 key_iterator operator++(int) { // Postincrement 523 key_iterator tmp = *this; 524 ++*this; 525 return tmp; 526 } 527 528 internal_key_type getInternalKey() const { 529 auto *LocalPtr = this->getItem(); 530 531 // Determine the length of the key and the data. 532 auto L = Info::ReadKeyDataLength(LocalPtr); 533 534 // Read the key. 535 return InfoObj->ReadKey(LocalPtr, L.first); 536 } 537 538 value_type operator*() const { 539 return InfoObj->GetExternalKey(getInternalKey()); 540 } 541 }; 542 543 key_iterator key_begin() { 544 return key_iterator(Payload, this->getNumEntries(), &this->getInfoObj()); 545 } 546 key_iterator key_end() { return key_iterator(); } 547 548 iterator_range<key_iterator> keys() { 549 return make_range(key_begin(), key_end()); 550 } 551 552 /// Iterates over all the entries in the table, returning the data. 553 class data_iterator : public iterator_base { 554 Info *InfoObj; 555 556 public: 557 typedef data_type value_type; 558 559 data_iterator(const unsigned char *const Ptr, offset_type NumEntries, 560 Info *InfoObj) 561 : iterator_base(Ptr, NumEntries), InfoObj(InfoObj) {} 562 data_iterator() : iterator_base(), InfoObj() {} 563 564 data_iterator &operator++() { // Preincrement 565 this->advance(); 566 return *this; 567 } 568 data_iterator operator++(int) { // Postincrement 569 data_iterator tmp = *this; 570 ++*this; 571 return tmp; 572 } 573 574 value_type operator*() const { 575 auto *LocalPtr = this->getItem(); 576 577 // Determine the length of the key and the data. 578 auto L = Info::ReadKeyDataLength(LocalPtr); 579 580 // Read the key. 581 const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first); 582 return InfoObj->ReadData(Key, LocalPtr + L.first, L.second); 583 } 584 }; 585 586 data_iterator data_begin() { 587 return data_iterator(Payload, this->getNumEntries(), &this->getInfoObj()); 588 } 589 data_iterator data_end() { return data_iterator(); } 590 591 iterator_range<data_iterator> data() { 592 return make_range(data_begin(), data_end()); 593 } 594 595 /// Create the hash table. 596 /// 597 /// \param Buckets is the beginning of the hash table itself, which follows 598 /// the payload of entire structure. This is the value returned by 599 /// OnDiskHashTableGenerator::Emit. 600 /// 601 /// \param Payload is the beginning of the data contained in the table. This 602 /// is Base plus any padding or header data that was stored, ie, the offset 603 /// that the stream was at when calling Emit. 604 /// 605 /// \param Base is the point from which all offsets into the structure are 606 /// based. This is offset 0 in the stream that was used when Emitting the 607 /// table. 608 static OnDiskIterableChainedHashTable * 609 Create(const unsigned char *Buckets, const unsigned char *const Payload, 610 const unsigned char *const Base, const Info &InfoObj = Info()) { 611 assert(Buckets > Base); 612 auto NumBucketsAndEntries = 613 OnDiskIterableChainedHashTable<Info>::readNumBucketsAndEntries(Buckets); 614 return new OnDiskIterableChainedHashTable<Info>( 615 NumBucketsAndEntries.first, NumBucketsAndEntries.second, 616 Buckets, Payload, Base, InfoObj); 617 } 618}; 619 620} // end namespace llvm 621 622#endif 623