1*e5dd7070Spatrick //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===// 2*e5dd7070Spatrick // 3*e5dd7070Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*e5dd7070Spatrick // See https://llvm.org/LICENSE.txt for license information. 5*e5dd7070Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*e5dd7070Spatrick // 7*e5dd7070Spatrick //===----------------------------------------------------------------------===// 8*e5dd7070Spatrick // 9*e5dd7070Spatrick // This file provides a hash table data structure suitable for incremental and 10*e5dd7070Spatrick // distributed storage across a set of files. 11*e5dd7070Spatrick // 12*e5dd7070Spatrick // Multiple hash tables from different files are implicitly merged to improve 13*e5dd7070Spatrick // performance, and on reload the merged table will override those from other 14*e5dd7070Spatrick // files. 15*e5dd7070Spatrick // 16*e5dd7070Spatrick //===----------------------------------------------------------------------===// 17*e5dd7070Spatrick 18*e5dd7070Spatrick #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 19*e5dd7070Spatrick #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 20*e5dd7070Spatrick 21*e5dd7070Spatrick #include "llvm/ADT/DenseMap.h" 22*e5dd7070Spatrick #include "llvm/ADT/DenseSet.h" 23*e5dd7070Spatrick #include "llvm/ADT/PointerUnion.h" 24*e5dd7070Spatrick #include "llvm/ADT/STLExtras.h" 25*e5dd7070Spatrick #include "llvm/ADT/SmallVector.h" 26*e5dd7070Spatrick #include "llvm/ADT/TinyPtrVector.h" 27*e5dd7070Spatrick #include "llvm/ADT/iterator_range.h" 28*e5dd7070Spatrick #include "llvm/Support/Endian.h" 29*e5dd7070Spatrick #include "llvm/Support/EndianStream.h" 30*e5dd7070Spatrick #include "llvm/Support/OnDiskHashTable.h" 31*e5dd7070Spatrick #include "llvm/Support/raw_ostream.h" 32*e5dd7070Spatrick #include <algorithm> 33*e5dd7070Spatrick #include <cstdint> 34*e5dd7070Spatrick #include <vector> 35*e5dd7070Spatrick 36*e5dd7070Spatrick namespace clang { 37*e5dd7070Spatrick namespace serialization { 38*e5dd7070Spatrick 39*e5dd7070Spatrick /// A collection of on-disk hash tables, merged when relevant for performance. 40*e5dd7070Spatrick template<typename Info> class MultiOnDiskHashTable { 41*e5dd7070Spatrick public: 42*e5dd7070Spatrick /// A handle to a file, used when overriding tables. 43*e5dd7070Spatrick using file_type = typename Info::file_type; 44*e5dd7070Spatrick 45*e5dd7070Spatrick /// A pointer to an on-disk representation of the hash table. 46*e5dd7070Spatrick using storage_type = const unsigned char *; 47*e5dd7070Spatrick 48*e5dd7070Spatrick using external_key_type = typename Info::external_key_type; 49*e5dd7070Spatrick using internal_key_type = typename Info::internal_key_type; 50*e5dd7070Spatrick using data_type = typename Info::data_type; 51*e5dd7070Spatrick using data_type_builder = typename Info::data_type_builder; 52*e5dd7070Spatrick using hash_value_type = unsigned; 53*e5dd7070Spatrick 54*e5dd7070Spatrick private: 55*e5dd7070Spatrick /// The generator is permitted to read our merged table. 56*e5dd7070Spatrick template<typename ReaderInfo, typename WriterInfo> 57*e5dd7070Spatrick friend class MultiOnDiskHashTableGenerator; 58*e5dd7070Spatrick 59*e5dd7070Spatrick /// A hash table stored on disk. 60*e5dd7070Spatrick struct OnDiskTable { 61*e5dd7070Spatrick using HashTable = llvm::OnDiskIterableChainedHashTable<Info>; 62*e5dd7070Spatrick 63*e5dd7070Spatrick file_type File; 64*e5dd7070Spatrick HashTable Table; 65*e5dd7070Spatrick OnDiskTableOnDiskTable66*e5dd7070Spatrick OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries, 67*e5dd7070Spatrick storage_type Buckets, storage_type Payload, storage_type Base, 68*e5dd7070Spatrick const Info &InfoObj) 69*e5dd7070Spatrick : File(File), 70*e5dd7070Spatrick Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {} 71*e5dd7070Spatrick }; 72*e5dd7070Spatrick 73*e5dd7070Spatrick struct MergedTable { 74*e5dd7070Spatrick std::vector<file_type> Files; 75*e5dd7070Spatrick llvm::DenseMap<internal_key_type, data_type> Data; 76*e5dd7070Spatrick }; 77*e5dd7070Spatrick 78*e5dd7070Spatrick using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>; 79*e5dd7070Spatrick using TableVector = llvm::TinyPtrVector<void *>; 80*e5dd7070Spatrick 81*e5dd7070Spatrick /// The current set of on-disk and merged tables. 82*e5dd7070Spatrick /// We manually store the opaque value of the Table because TinyPtrVector 83*e5dd7070Spatrick /// can't cope with holding a PointerUnion directly. 84*e5dd7070Spatrick /// There can be at most one MergedTable in this vector, and if present, 85*e5dd7070Spatrick /// it is the first table. 86*e5dd7070Spatrick TableVector Tables; 87*e5dd7070Spatrick 88*e5dd7070Spatrick /// Files corresponding to overridden tables that we've not yet 89*e5dd7070Spatrick /// discarded. 90*e5dd7070Spatrick llvm::TinyPtrVector<file_type> PendingOverrides; 91*e5dd7070Spatrick 92*e5dd7070Spatrick struct AsOnDiskTable { 93*e5dd7070Spatrick using result_type = OnDiskTable *; 94*e5dd7070Spatrick operatorAsOnDiskTable95*e5dd7070Spatrick result_type operator()(void *P) const { 96*e5dd7070Spatrick return Table::getFromOpaqueValue(P).template get<OnDiskTable *>(); 97*e5dd7070Spatrick } 98*e5dd7070Spatrick }; 99*e5dd7070Spatrick 100*e5dd7070Spatrick using table_iterator = 101*e5dd7070Spatrick llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>; 102*e5dd7070Spatrick using table_range = llvm::iterator_range<table_iterator>; 103*e5dd7070Spatrick 104*e5dd7070Spatrick /// The current set of on-disk tables. tables()105*e5dd7070Spatrick table_range tables() { 106*e5dd7070Spatrick auto Begin = Tables.begin(), End = Tables.end(); 107*e5dd7070Spatrick if (getMergedTable()) 108*e5dd7070Spatrick ++Begin; 109*e5dd7070Spatrick return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()), 110*e5dd7070Spatrick llvm::map_iterator(End, AsOnDiskTable())); 111*e5dd7070Spatrick } 112*e5dd7070Spatrick getMergedTable()113*e5dd7070Spatrick MergedTable *getMergedTable() const { 114*e5dd7070Spatrick // If we already have a merged table, it's the first one. 115*e5dd7070Spatrick return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin()) 116*e5dd7070Spatrick .template dyn_cast<MergedTable*>(); 117*e5dd7070Spatrick } 118*e5dd7070Spatrick 119*e5dd7070Spatrick /// Delete all our current on-disk tables. clear()120*e5dd7070Spatrick void clear() { 121*e5dd7070Spatrick for (auto *T : tables()) 122*e5dd7070Spatrick delete T; 123*e5dd7070Spatrick if (auto *M = getMergedTable()) 124*e5dd7070Spatrick delete M; 125*e5dd7070Spatrick Tables.clear(); 126*e5dd7070Spatrick } 127*e5dd7070Spatrick removeOverriddenTables()128*e5dd7070Spatrick void removeOverriddenTables() { 129*e5dd7070Spatrick llvm::DenseSet<file_type> Files; 130*e5dd7070Spatrick Files.insert(PendingOverrides.begin(), PendingOverrides.end()); 131*e5dd7070Spatrick // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug. 132*e5dd7070Spatrick auto ShouldRemove = [&Files](void *T) -> bool { 133*e5dd7070Spatrick auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>(); 134*e5dd7070Spatrick bool Remove = Files.count(ODT->File); 135*e5dd7070Spatrick if (Remove) 136*e5dd7070Spatrick delete ODT; 137*e5dd7070Spatrick return Remove; 138*e5dd7070Spatrick }; 139*e5dd7070Spatrick Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(), 140*e5dd7070Spatrick ShouldRemove), 141*e5dd7070Spatrick Tables.end()); 142*e5dd7070Spatrick PendingOverrides.clear(); 143*e5dd7070Spatrick } 144*e5dd7070Spatrick condense()145*e5dd7070Spatrick void condense() { 146*e5dd7070Spatrick MergedTable *Merged = getMergedTable(); 147*e5dd7070Spatrick if (!Merged) 148*e5dd7070Spatrick Merged = new MergedTable; 149*e5dd7070Spatrick 150*e5dd7070Spatrick // Read in all the tables and merge them together. 151*e5dd7070Spatrick // FIXME: Be smarter about which tables we merge. 152*e5dd7070Spatrick for (auto *ODT : tables()) { 153*e5dd7070Spatrick auto &HT = ODT->Table; 154*e5dd7070Spatrick Info &InfoObj = HT.getInfoObj(); 155*e5dd7070Spatrick 156*e5dd7070Spatrick for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { 157*e5dd7070Spatrick auto *LocalPtr = I.getItem(); 158*e5dd7070Spatrick 159*e5dd7070Spatrick // FIXME: Don't rely on the OnDiskHashTable format here. 160*e5dd7070Spatrick auto L = InfoObj.ReadKeyDataLength(LocalPtr); 161*e5dd7070Spatrick const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); 162*e5dd7070Spatrick data_type_builder ValueBuilder(Merged->Data[Key]); 163*e5dd7070Spatrick InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, 164*e5dd7070Spatrick ValueBuilder); 165*e5dd7070Spatrick } 166*e5dd7070Spatrick 167*e5dd7070Spatrick Merged->Files.push_back(ODT->File); 168*e5dd7070Spatrick delete ODT; 169*e5dd7070Spatrick } 170*e5dd7070Spatrick 171*e5dd7070Spatrick Tables.clear(); 172*e5dd7070Spatrick Tables.push_back(Table(Merged).getOpaqueValue()); 173*e5dd7070Spatrick } 174*e5dd7070Spatrick 175*e5dd7070Spatrick public: 176*e5dd7070Spatrick MultiOnDiskHashTable() = default; 177*e5dd7070Spatrick MultiOnDiskHashTable(MultiOnDiskHashTable && O)178*e5dd7070Spatrick MultiOnDiskHashTable(MultiOnDiskHashTable &&O) 179*e5dd7070Spatrick : Tables(std::move(O.Tables)), 180*e5dd7070Spatrick PendingOverrides(std::move(O.PendingOverrides)) { 181*e5dd7070Spatrick O.Tables.clear(); 182*e5dd7070Spatrick } 183*e5dd7070Spatrick 184*e5dd7070Spatrick MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) { 185*e5dd7070Spatrick if (&O == this) 186*e5dd7070Spatrick return *this; 187*e5dd7070Spatrick clear(); 188*e5dd7070Spatrick Tables = std::move(O.Tables); 189*e5dd7070Spatrick O.Tables.clear(); 190*e5dd7070Spatrick PendingOverrides = std::move(O.PendingOverrides); 191*e5dd7070Spatrick return *this; 192*e5dd7070Spatrick } 193*e5dd7070Spatrick ~MultiOnDiskHashTable()194*e5dd7070Spatrick ~MultiOnDiskHashTable() { clear(); } 195*e5dd7070Spatrick 196*e5dd7070Spatrick /// Add the table \p Data loaded from file \p File. 197*e5dd7070Spatrick void add(file_type File, storage_type Data, Info InfoObj = Info()) { 198*e5dd7070Spatrick using namespace llvm::support; 199*e5dd7070Spatrick 200*e5dd7070Spatrick storage_type Ptr = Data; 201*e5dd7070Spatrick 202*e5dd7070Spatrick uint32_t BucketOffset = endian::readNext<uint32_t, little, unaligned>(Ptr); 203*e5dd7070Spatrick 204*e5dd7070Spatrick // Read the list of overridden files. 205*e5dd7070Spatrick uint32_t NumFiles = endian::readNext<uint32_t, little, unaligned>(Ptr); 206*e5dd7070Spatrick // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make 207*e5dd7070Spatrick // an additional copy. 208*e5dd7070Spatrick llvm::SmallVector<file_type, 16> OverriddenFiles; 209*e5dd7070Spatrick OverriddenFiles.reserve(NumFiles); 210*e5dd7070Spatrick for (/**/; NumFiles != 0; --NumFiles) 211*e5dd7070Spatrick OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr)); 212*e5dd7070Spatrick PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(), 213*e5dd7070Spatrick OverriddenFiles.end()); 214*e5dd7070Spatrick 215*e5dd7070Spatrick // Read the OnDiskChainedHashTable header. 216*e5dd7070Spatrick storage_type Buckets = Data + BucketOffset; 217*e5dd7070Spatrick auto NumBucketsAndEntries = 218*e5dd7070Spatrick OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets); 219*e5dd7070Spatrick 220*e5dd7070Spatrick // Register the table. 221*e5dd7070Spatrick Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first, 222*e5dd7070Spatrick NumBucketsAndEntries.second, 223*e5dd7070Spatrick Buckets, Ptr, Data, std::move(InfoObj)); 224*e5dd7070Spatrick Tables.push_back(NewTable.getOpaqueValue()); 225*e5dd7070Spatrick } 226*e5dd7070Spatrick 227*e5dd7070Spatrick /// Find and read the lookup results for \p EKey. find(const external_key_type & EKey)228*e5dd7070Spatrick data_type find(const external_key_type &EKey) { 229*e5dd7070Spatrick data_type Result; 230*e5dd7070Spatrick 231*e5dd7070Spatrick if (!PendingOverrides.empty()) 232*e5dd7070Spatrick removeOverriddenTables(); 233*e5dd7070Spatrick 234*e5dd7070Spatrick if (Tables.size() > static_cast<unsigned>(Info::MaxTables)) 235*e5dd7070Spatrick condense(); 236*e5dd7070Spatrick 237*e5dd7070Spatrick internal_key_type Key = Info::GetInternalKey(EKey); 238*e5dd7070Spatrick auto KeyHash = Info::ComputeHash(Key); 239*e5dd7070Spatrick 240*e5dd7070Spatrick if (MergedTable *M = getMergedTable()) { 241*e5dd7070Spatrick auto It = M->Data.find(Key); 242*e5dd7070Spatrick if (It != M->Data.end()) 243*e5dd7070Spatrick Result = It->second; 244*e5dd7070Spatrick } 245*e5dd7070Spatrick 246*e5dd7070Spatrick data_type_builder ResultBuilder(Result); 247*e5dd7070Spatrick 248*e5dd7070Spatrick for (auto *ODT : tables()) { 249*e5dd7070Spatrick auto &HT = ODT->Table; 250*e5dd7070Spatrick auto It = HT.find_hashed(Key, KeyHash); 251*e5dd7070Spatrick if (It != HT.end()) 252*e5dd7070Spatrick HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(), 253*e5dd7070Spatrick ResultBuilder); 254*e5dd7070Spatrick } 255*e5dd7070Spatrick 256*e5dd7070Spatrick return Result; 257*e5dd7070Spatrick } 258*e5dd7070Spatrick 259*e5dd7070Spatrick /// Read all the lookup results into a single value. This only makes 260*e5dd7070Spatrick /// sense if merging values across keys is meaningful. findAll()261*e5dd7070Spatrick data_type findAll() { 262*e5dd7070Spatrick data_type Result; 263*e5dd7070Spatrick data_type_builder ResultBuilder(Result); 264*e5dd7070Spatrick 265*e5dd7070Spatrick if (!PendingOverrides.empty()) 266*e5dd7070Spatrick removeOverriddenTables(); 267*e5dd7070Spatrick 268*e5dd7070Spatrick if (MergedTable *M = getMergedTable()) { 269*e5dd7070Spatrick for (auto &KV : M->Data) 270*e5dd7070Spatrick Info::MergeDataInto(KV.second, ResultBuilder); 271*e5dd7070Spatrick } 272*e5dd7070Spatrick 273*e5dd7070Spatrick for (auto *ODT : tables()) { 274*e5dd7070Spatrick auto &HT = ODT->Table; 275*e5dd7070Spatrick Info &InfoObj = HT.getInfoObj(); 276*e5dd7070Spatrick for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { 277*e5dd7070Spatrick auto *LocalPtr = I.getItem(); 278*e5dd7070Spatrick 279*e5dd7070Spatrick // FIXME: Don't rely on the OnDiskHashTable format here. 280*e5dd7070Spatrick auto L = InfoObj.ReadKeyDataLength(LocalPtr); 281*e5dd7070Spatrick const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); 282*e5dd7070Spatrick InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder); 283*e5dd7070Spatrick } 284*e5dd7070Spatrick } 285*e5dd7070Spatrick 286*e5dd7070Spatrick return Result; 287*e5dd7070Spatrick } 288*e5dd7070Spatrick }; 289*e5dd7070Spatrick 290*e5dd7070Spatrick /// Writer for the on-disk hash table. 291*e5dd7070Spatrick template<typename ReaderInfo, typename WriterInfo> 292*e5dd7070Spatrick class MultiOnDiskHashTableGenerator { 293*e5dd7070Spatrick using BaseTable = MultiOnDiskHashTable<ReaderInfo>; 294*e5dd7070Spatrick using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>; 295*e5dd7070Spatrick 296*e5dd7070Spatrick Generator Gen; 297*e5dd7070Spatrick 298*e5dd7070Spatrick public: MultiOnDiskHashTableGenerator()299*e5dd7070Spatrick MultiOnDiskHashTableGenerator() : Gen() {} 300*e5dd7070Spatrick insert(typename WriterInfo::key_type_ref Key,typename WriterInfo::data_type_ref Data,WriterInfo & Info)301*e5dd7070Spatrick void insert(typename WriterInfo::key_type_ref Key, 302*e5dd7070Spatrick typename WriterInfo::data_type_ref Data, WriterInfo &Info) { 303*e5dd7070Spatrick Gen.insert(Key, Data, Info); 304*e5dd7070Spatrick } 305*e5dd7070Spatrick emit(llvm::SmallVectorImpl<char> & Out,WriterInfo & Info,const BaseTable * Base)306*e5dd7070Spatrick void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info, 307*e5dd7070Spatrick const BaseTable *Base) { 308*e5dd7070Spatrick using namespace llvm::support; 309*e5dd7070Spatrick 310*e5dd7070Spatrick llvm::raw_svector_ostream OutStream(Out); 311*e5dd7070Spatrick 312*e5dd7070Spatrick // Write our header information. 313*e5dd7070Spatrick { 314*e5dd7070Spatrick endian::Writer Writer(OutStream, little); 315*e5dd7070Spatrick 316*e5dd7070Spatrick // Reserve four bytes for the bucket offset. 317*e5dd7070Spatrick Writer.write<uint32_t>(0); 318*e5dd7070Spatrick 319*e5dd7070Spatrick if (auto *Merged = Base ? Base->getMergedTable() : nullptr) { 320*e5dd7070Spatrick // Write list of overridden files. 321*e5dd7070Spatrick Writer.write<uint32_t>(Merged->Files.size()); 322*e5dd7070Spatrick for (const auto &F : Merged->Files) 323*e5dd7070Spatrick Info.EmitFileRef(OutStream, F); 324*e5dd7070Spatrick 325*e5dd7070Spatrick // Add all merged entries from Base to the generator. 326*e5dd7070Spatrick for (auto &KV : Merged->Data) { 327*e5dd7070Spatrick if (!Gen.contains(KV.first, Info)) 328*e5dd7070Spatrick Gen.insert(KV.first, Info.ImportData(KV.second), Info); 329*e5dd7070Spatrick } 330*e5dd7070Spatrick } else { 331*e5dd7070Spatrick Writer.write<uint32_t>(0); 332*e5dd7070Spatrick } 333*e5dd7070Spatrick } 334*e5dd7070Spatrick 335*e5dd7070Spatrick // Write the table itself. 336*e5dd7070Spatrick uint32_t BucketOffset = Gen.Emit(OutStream, Info); 337*e5dd7070Spatrick 338*e5dd7070Spatrick // Replace the first four bytes with the bucket offset. 339*e5dd7070Spatrick endian::write32le(Out.data(), BucketOffset); 340*e5dd7070Spatrick } 341*e5dd7070Spatrick }; 342*e5dd7070Spatrick 343*e5dd7070Spatrick } // namespace serialization 344*e5dd7070Spatrick } // namespace clang 345*e5dd7070Spatrick 346*e5dd7070Spatrick #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 347