xref: /openbsd-src/gnu/llvm/clang/lib/Serialization/MultiOnDiskHashTable.h (revision e5dd70708596ae51455a0ffa086a00c5b29f8583)
1*e5dd7070Spatrick //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===//
2*e5dd7070Spatrick //
3*e5dd7070Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*e5dd7070Spatrick // See https://llvm.org/LICENSE.txt for license information.
5*e5dd7070Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*e5dd7070Spatrick //
7*e5dd7070Spatrick //===----------------------------------------------------------------------===//
8*e5dd7070Spatrick //
9*e5dd7070Spatrick //  This file provides a hash table data structure suitable for incremental and
10*e5dd7070Spatrick //  distributed storage across a set of files.
11*e5dd7070Spatrick //
12*e5dd7070Spatrick //  Multiple hash tables from different files are implicitly merged to improve
13*e5dd7070Spatrick //  performance, and on reload the merged table will override those from other
14*e5dd7070Spatrick //  files.
15*e5dd7070Spatrick //
16*e5dd7070Spatrick //===----------------------------------------------------------------------===//
17*e5dd7070Spatrick 
18*e5dd7070Spatrick #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
19*e5dd7070Spatrick #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
20*e5dd7070Spatrick 
21*e5dd7070Spatrick #include "llvm/ADT/DenseMap.h"
22*e5dd7070Spatrick #include "llvm/ADT/DenseSet.h"
23*e5dd7070Spatrick #include "llvm/ADT/PointerUnion.h"
24*e5dd7070Spatrick #include "llvm/ADT/STLExtras.h"
25*e5dd7070Spatrick #include "llvm/ADT/SmallVector.h"
26*e5dd7070Spatrick #include "llvm/ADT/TinyPtrVector.h"
27*e5dd7070Spatrick #include "llvm/ADT/iterator_range.h"
28*e5dd7070Spatrick #include "llvm/Support/Endian.h"
29*e5dd7070Spatrick #include "llvm/Support/EndianStream.h"
30*e5dd7070Spatrick #include "llvm/Support/OnDiskHashTable.h"
31*e5dd7070Spatrick #include "llvm/Support/raw_ostream.h"
32*e5dd7070Spatrick #include <algorithm>
33*e5dd7070Spatrick #include <cstdint>
34*e5dd7070Spatrick #include <vector>
35*e5dd7070Spatrick 
36*e5dd7070Spatrick namespace clang {
37*e5dd7070Spatrick namespace serialization {
38*e5dd7070Spatrick 
39*e5dd7070Spatrick /// A collection of on-disk hash tables, merged when relevant for performance.
40*e5dd7070Spatrick template<typename Info> class MultiOnDiskHashTable {
41*e5dd7070Spatrick public:
42*e5dd7070Spatrick   /// A handle to a file, used when overriding tables.
43*e5dd7070Spatrick   using file_type = typename Info::file_type;
44*e5dd7070Spatrick 
45*e5dd7070Spatrick   /// A pointer to an on-disk representation of the hash table.
46*e5dd7070Spatrick   using storage_type = const unsigned char *;
47*e5dd7070Spatrick 
48*e5dd7070Spatrick   using external_key_type = typename Info::external_key_type;
49*e5dd7070Spatrick   using internal_key_type = typename Info::internal_key_type;
50*e5dd7070Spatrick   using data_type = typename Info::data_type;
51*e5dd7070Spatrick   using data_type_builder = typename Info::data_type_builder;
52*e5dd7070Spatrick   using hash_value_type = unsigned;
53*e5dd7070Spatrick 
54*e5dd7070Spatrick private:
55*e5dd7070Spatrick   /// The generator is permitted to read our merged table.
56*e5dd7070Spatrick   template<typename ReaderInfo, typename WriterInfo>
57*e5dd7070Spatrick   friend class MultiOnDiskHashTableGenerator;
58*e5dd7070Spatrick 
59*e5dd7070Spatrick   /// A hash table stored on disk.
60*e5dd7070Spatrick   struct OnDiskTable {
61*e5dd7070Spatrick     using HashTable = llvm::OnDiskIterableChainedHashTable<Info>;
62*e5dd7070Spatrick 
63*e5dd7070Spatrick     file_type File;
64*e5dd7070Spatrick     HashTable Table;
65*e5dd7070Spatrick 
OnDiskTableOnDiskTable66*e5dd7070Spatrick     OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries,
67*e5dd7070Spatrick                 storage_type Buckets, storage_type Payload, storage_type Base,
68*e5dd7070Spatrick                 const Info &InfoObj)
69*e5dd7070Spatrick         : File(File),
70*e5dd7070Spatrick           Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {}
71*e5dd7070Spatrick   };
72*e5dd7070Spatrick 
73*e5dd7070Spatrick   struct MergedTable {
74*e5dd7070Spatrick     std::vector<file_type> Files;
75*e5dd7070Spatrick     llvm::DenseMap<internal_key_type, data_type> Data;
76*e5dd7070Spatrick   };
77*e5dd7070Spatrick 
78*e5dd7070Spatrick   using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>;
79*e5dd7070Spatrick   using TableVector = llvm::TinyPtrVector<void *>;
80*e5dd7070Spatrick 
81*e5dd7070Spatrick   /// The current set of on-disk and merged tables.
82*e5dd7070Spatrick   /// We manually store the opaque value of the Table because TinyPtrVector
83*e5dd7070Spatrick   /// can't cope with holding a PointerUnion directly.
84*e5dd7070Spatrick   /// There can be at most one MergedTable in this vector, and if present,
85*e5dd7070Spatrick   /// it is the first table.
86*e5dd7070Spatrick   TableVector Tables;
87*e5dd7070Spatrick 
88*e5dd7070Spatrick   /// Files corresponding to overridden tables that we've not yet
89*e5dd7070Spatrick   /// discarded.
90*e5dd7070Spatrick   llvm::TinyPtrVector<file_type> PendingOverrides;
91*e5dd7070Spatrick 
92*e5dd7070Spatrick   struct AsOnDiskTable {
93*e5dd7070Spatrick     using result_type = OnDiskTable *;
94*e5dd7070Spatrick 
operatorAsOnDiskTable95*e5dd7070Spatrick     result_type operator()(void *P) const {
96*e5dd7070Spatrick       return Table::getFromOpaqueValue(P).template get<OnDiskTable *>();
97*e5dd7070Spatrick     }
98*e5dd7070Spatrick   };
99*e5dd7070Spatrick 
100*e5dd7070Spatrick   using table_iterator =
101*e5dd7070Spatrick       llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>;
102*e5dd7070Spatrick   using table_range = llvm::iterator_range<table_iterator>;
103*e5dd7070Spatrick 
104*e5dd7070Spatrick   /// The current set of on-disk tables.
tables()105*e5dd7070Spatrick   table_range tables() {
106*e5dd7070Spatrick     auto Begin = Tables.begin(), End = Tables.end();
107*e5dd7070Spatrick     if (getMergedTable())
108*e5dd7070Spatrick       ++Begin;
109*e5dd7070Spatrick     return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()),
110*e5dd7070Spatrick                             llvm::map_iterator(End, AsOnDiskTable()));
111*e5dd7070Spatrick   }
112*e5dd7070Spatrick 
getMergedTable()113*e5dd7070Spatrick   MergedTable *getMergedTable() const {
114*e5dd7070Spatrick     // If we already have a merged table, it's the first one.
115*e5dd7070Spatrick     return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin())
116*e5dd7070Spatrick                                           .template dyn_cast<MergedTable*>();
117*e5dd7070Spatrick   }
118*e5dd7070Spatrick 
119*e5dd7070Spatrick   /// Delete all our current on-disk tables.
clear()120*e5dd7070Spatrick   void clear() {
121*e5dd7070Spatrick     for (auto *T : tables())
122*e5dd7070Spatrick       delete T;
123*e5dd7070Spatrick     if (auto *M = getMergedTable())
124*e5dd7070Spatrick       delete M;
125*e5dd7070Spatrick     Tables.clear();
126*e5dd7070Spatrick   }
127*e5dd7070Spatrick 
removeOverriddenTables()128*e5dd7070Spatrick   void removeOverriddenTables() {
129*e5dd7070Spatrick     llvm::DenseSet<file_type> Files;
130*e5dd7070Spatrick     Files.insert(PendingOverrides.begin(), PendingOverrides.end());
131*e5dd7070Spatrick     // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug.
132*e5dd7070Spatrick     auto ShouldRemove = [&Files](void *T) -> bool {
133*e5dd7070Spatrick       auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>();
134*e5dd7070Spatrick       bool Remove = Files.count(ODT->File);
135*e5dd7070Spatrick       if (Remove)
136*e5dd7070Spatrick         delete ODT;
137*e5dd7070Spatrick       return Remove;
138*e5dd7070Spatrick     };
139*e5dd7070Spatrick     Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(),
140*e5dd7070Spatrick                                 ShouldRemove),
141*e5dd7070Spatrick                  Tables.end());
142*e5dd7070Spatrick     PendingOverrides.clear();
143*e5dd7070Spatrick   }
144*e5dd7070Spatrick 
condense()145*e5dd7070Spatrick   void condense() {
146*e5dd7070Spatrick     MergedTable *Merged = getMergedTable();
147*e5dd7070Spatrick     if (!Merged)
148*e5dd7070Spatrick       Merged = new MergedTable;
149*e5dd7070Spatrick 
150*e5dd7070Spatrick     // Read in all the tables and merge them together.
151*e5dd7070Spatrick     // FIXME: Be smarter about which tables we merge.
152*e5dd7070Spatrick     for (auto *ODT : tables()) {
153*e5dd7070Spatrick       auto &HT = ODT->Table;
154*e5dd7070Spatrick       Info &InfoObj = HT.getInfoObj();
155*e5dd7070Spatrick 
156*e5dd7070Spatrick       for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
157*e5dd7070Spatrick         auto *LocalPtr = I.getItem();
158*e5dd7070Spatrick 
159*e5dd7070Spatrick         // FIXME: Don't rely on the OnDiskHashTable format here.
160*e5dd7070Spatrick         auto L = InfoObj.ReadKeyDataLength(LocalPtr);
161*e5dd7070Spatrick         const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
162*e5dd7070Spatrick         data_type_builder ValueBuilder(Merged->Data[Key]);
163*e5dd7070Spatrick         InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
164*e5dd7070Spatrick                              ValueBuilder);
165*e5dd7070Spatrick       }
166*e5dd7070Spatrick 
167*e5dd7070Spatrick       Merged->Files.push_back(ODT->File);
168*e5dd7070Spatrick       delete ODT;
169*e5dd7070Spatrick     }
170*e5dd7070Spatrick 
171*e5dd7070Spatrick     Tables.clear();
172*e5dd7070Spatrick     Tables.push_back(Table(Merged).getOpaqueValue());
173*e5dd7070Spatrick   }
174*e5dd7070Spatrick 
175*e5dd7070Spatrick public:
176*e5dd7070Spatrick   MultiOnDiskHashTable() = default;
177*e5dd7070Spatrick 
MultiOnDiskHashTable(MultiOnDiskHashTable && O)178*e5dd7070Spatrick   MultiOnDiskHashTable(MultiOnDiskHashTable &&O)
179*e5dd7070Spatrick       : Tables(std::move(O.Tables)),
180*e5dd7070Spatrick         PendingOverrides(std::move(O.PendingOverrides)) {
181*e5dd7070Spatrick     O.Tables.clear();
182*e5dd7070Spatrick   }
183*e5dd7070Spatrick 
184*e5dd7070Spatrick   MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) {
185*e5dd7070Spatrick     if (&O == this)
186*e5dd7070Spatrick       return *this;
187*e5dd7070Spatrick     clear();
188*e5dd7070Spatrick     Tables = std::move(O.Tables);
189*e5dd7070Spatrick     O.Tables.clear();
190*e5dd7070Spatrick     PendingOverrides = std::move(O.PendingOverrides);
191*e5dd7070Spatrick     return *this;
192*e5dd7070Spatrick   }
193*e5dd7070Spatrick 
~MultiOnDiskHashTable()194*e5dd7070Spatrick   ~MultiOnDiskHashTable() { clear(); }
195*e5dd7070Spatrick 
196*e5dd7070Spatrick   /// Add the table \p Data loaded from file \p File.
197*e5dd7070Spatrick   void add(file_type File, storage_type Data, Info InfoObj = Info()) {
198*e5dd7070Spatrick     using namespace llvm::support;
199*e5dd7070Spatrick 
200*e5dd7070Spatrick     storage_type Ptr = Data;
201*e5dd7070Spatrick 
202*e5dd7070Spatrick     uint32_t BucketOffset = endian::readNext<uint32_t, little, unaligned>(Ptr);
203*e5dd7070Spatrick 
204*e5dd7070Spatrick     // Read the list of overridden files.
205*e5dd7070Spatrick     uint32_t NumFiles = endian::readNext<uint32_t, little, unaligned>(Ptr);
206*e5dd7070Spatrick     // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make
207*e5dd7070Spatrick     // an additional copy.
208*e5dd7070Spatrick     llvm::SmallVector<file_type, 16> OverriddenFiles;
209*e5dd7070Spatrick     OverriddenFiles.reserve(NumFiles);
210*e5dd7070Spatrick     for (/**/; NumFiles != 0; --NumFiles)
211*e5dd7070Spatrick       OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr));
212*e5dd7070Spatrick     PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(),
213*e5dd7070Spatrick                             OverriddenFiles.end());
214*e5dd7070Spatrick 
215*e5dd7070Spatrick     // Read the OnDiskChainedHashTable header.
216*e5dd7070Spatrick     storage_type Buckets = Data + BucketOffset;
217*e5dd7070Spatrick     auto NumBucketsAndEntries =
218*e5dd7070Spatrick         OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets);
219*e5dd7070Spatrick 
220*e5dd7070Spatrick     // Register the table.
221*e5dd7070Spatrick     Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first,
222*e5dd7070Spatrick                                      NumBucketsAndEntries.second,
223*e5dd7070Spatrick                                      Buckets, Ptr, Data, std::move(InfoObj));
224*e5dd7070Spatrick     Tables.push_back(NewTable.getOpaqueValue());
225*e5dd7070Spatrick   }
226*e5dd7070Spatrick 
227*e5dd7070Spatrick   /// Find and read the lookup results for \p EKey.
find(const external_key_type & EKey)228*e5dd7070Spatrick   data_type find(const external_key_type &EKey) {
229*e5dd7070Spatrick     data_type Result;
230*e5dd7070Spatrick 
231*e5dd7070Spatrick     if (!PendingOverrides.empty())
232*e5dd7070Spatrick       removeOverriddenTables();
233*e5dd7070Spatrick 
234*e5dd7070Spatrick     if (Tables.size() > static_cast<unsigned>(Info::MaxTables))
235*e5dd7070Spatrick       condense();
236*e5dd7070Spatrick 
237*e5dd7070Spatrick     internal_key_type Key = Info::GetInternalKey(EKey);
238*e5dd7070Spatrick     auto KeyHash = Info::ComputeHash(Key);
239*e5dd7070Spatrick 
240*e5dd7070Spatrick     if (MergedTable *M = getMergedTable()) {
241*e5dd7070Spatrick       auto It = M->Data.find(Key);
242*e5dd7070Spatrick       if (It != M->Data.end())
243*e5dd7070Spatrick         Result = It->second;
244*e5dd7070Spatrick     }
245*e5dd7070Spatrick 
246*e5dd7070Spatrick     data_type_builder ResultBuilder(Result);
247*e5dd7070Spatrick 
248*e5dd7070Spatrick     for (auto *ODT : tables()) {
249*e5dd7070Spatrick       auto &HT = ODT->Table;
250*e5dd7070Spatrick       auto It = HT.find_hashed(Key, KeyHash);
251*e5dd7070Spatrick       if (It != HT.end())
252*e5dd7070Spatrick         HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(),
253*e5dd7070Spatrick                                      ResultBuilder);
254*e5dd7070Spatrick     }
255*e5dd7070Spatrick 
256*e5dd7070Spatrick     return Result;
257*e5dd7070Spatrick   }
258*e5dd7070Spatrick 
259*e5dd7070Spatrick   /// Read all the lookup results into a single value. This only makes
260*e5dd7070Spatrick   /// sense if merging values across keys is meaningful.
findAll()261*e5dd7070Spatrick   data_type findAll() {
262*e5dd7070Spatrick     data_type Result;
263*e5dd7070Spatrick     data_type_builder ResultBuilder(Result);
264*e5dd7070Spatrick 
265*e5dd7070Spatrick     if (!PendingOverrides.empty())
266*e5dd7070Spatrick       removeOverriddenTables();
267*e5dd7070Spatrick 
268*e5dd7070Spatrick     if (MergedTable *M = getMergedTable()) {
269*e5dd7070Spatrick       for (auto &KV : M->Data)
270*e5dd7070Spatrick         Info::MergeDataInto(KV.second, ResultBuilder);
271*e5dd7070Spatrick     }
272*e5dd7070Spatrick 
273*e5dd7070Spatrick     for (auto *ODT : tables()) {
274*e5dd7070Spatrick       auto &HT = ODT->Table;
275*e5dd7070Spatrick       Info &InfoObj = HT.getInfoObj();
276*e5dd7070Spatrick       for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
277*e5dd7070Spatrick         auto *LocalPtr = I.getItem();
278*e5dd7070Spatrick 
279*e5dd7070Spatrick         // FIXME: Don't rely on the OnDiskHashTable format here.
280*e5dd7070Spatrick         auto L = InfoObj.ReadKeyDataLength(LocalPtr);
281*e5dd7070Spatrick         const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
282*e5dd7070Spatrick         InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder);
283*e5dd7070Spatrick       }
284*e5dd7070Spatrick     }
285*e5dd7070Spatrick 
286*e5dd7070Spatrick     return Result;
287*e5dd7070Spatrick   }
288*e5dd7070Spatrick };
289*e5dd7070Spatrick 
290*e5dd7070Spatrick /// Writer for the on-disk hash table.
291*e5dd7070Spatrick template<typename ReaderInfo, typename WriterInfo>
292*e5dd7070Spatrick class MultiOnDiskHashTableGenerator {
293*e5dd7070Spatrick   using BaseTable = MultiOnDiskHashTable<ReaderInfo>;
294*e5dd7070Spatrick   using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>;
295*e5dd7070Spatrick 
296*e5dd7070Spatrick   Generator Gen;
297*e5dd7070Spatrick 
298*e5dd7070Spatrick public:
MultiOnDiskHashTableGenerator()299*e5dd7070Spatrick   MultiOnDiskHashTableGenerator() : Gen() {}
300*e5dd7070Spatrick 
insert(typename WriterInfo::key_type_ref Key,typename WriterInfo::data_type_ref Data,WriterInfo & Info)301*e5dd7070Spatrick   void insert(typename WriterInfo::key_type_ref Key,
302*e5dd7070Spatrick               typename WriterInfo::data_type_ref Data, WriterInfo &Info) {
303*e5dd7070Spatrick     Gen.insert(Key, Data, Info);
304*e5dd7070Spatrick   }
305*e5dd7070Spatrick 
emit(llvm::SmallVectorImpl<char> & Out,WriterInfo & Info,const BaseTable * Base)306*e5dd7070Spatrick   void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info,
307*e5dd7070Spatrick             const BaseTable *Base) {
308*e5dd7070Spatrick     using namespace llvm::support;
309*e5dd7070Spatrick 
310*e5dd7070Spatrick     llvm::raw_svector_ostream OutStream(Out);
311*e5dd7070Spatrick 
312*e5dd7070Spatrick     // Write our header information.
313*e5dd7070Spatrick     {
314*e5dd7070Spatrick       endian::Writer Writer(OutStream, little);
315*e5dd7070Spatrick 
316*e5dd7070Spatrick       // Reserve four bytes for the bucket offset.
317*e5dd7070Spatrick       Writer.write<uint32_t>(0);
318*e5dd7070Spatrick 
319*e5dd7070Spatrick       if (auto *Merged = Base ? Base->getMergedTable() : nullptr) {
320*e5dd7070Spatrick         // Write list of overridden files.
321*e5dd7070Spatrick         Writer.write<uint32_t>(Merged->Files.size());
322*e5dd7070Spatrick         for (const auto &F : Merged->Files)
323*e5dd7070Spatrick           Info.EmitFileRef(OutStream, F);
324*e5dd7070Spatrick 
325*e5dd7070Spatrick         // Add all merged entries from Base to the generator.
326*e5dd7070Spatrick         for (auto &KV : Merged->Data) {
327*e5dd7070Spatrick           if (!Gen.contains(KV.first, Info))
328*e5dd7070Spatrick             Gen.insert(KV.first, Info.ImportData(KV.second), Info);
329*e5dd7070Spatrick         }
330*e5dd7070Spatrick       } else {
331*e5dd7070Spatrick         Writer.write<uint32_t>(0);
332*e5dd7070Spatrick       }
333*e5dd7070Spatrick     }
334*e5dd7070Spatrick 
335*e5dd7070Spatrick     // Write the table itself.
336*e5dd7070Spatrick     uint32_t BucketOffset = Gen.Emit(OutStream, Info);
337*e5dd7070Spatrick 
338*e5dd7070Spatrick     // Replace the first four bytes with the bucket offset.
339*e5dd7070Spatrick     endian::write32le(Out.data(), BucketOffset);
340*e5dd7070Spatrick   }
341*e5dd7070Spatrick };
342*e5dd7070Spatrick 
343*e5dd7070Spatrick } // namespace serialization
344*e5dd7070Spatrick } // namespace clang
345*e5dd7070Spatrick 
346*e5dd7070Spatrick #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
347