xref: /llvm-project/clang-tools-extra/clangd/index/dex/PostingList.cpp (revision a5cd202e21e52c1e00963ad5bdfeb83b4fec820e)
1 //===--- PostingList.cpp - Symbol identifiers storage interface -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "PostingList.h"
10 #include "index/dex/Iterator.h"
11 #include "index/dex/Token.h"
12 #include "llvm/Support/MathExtras.h"
13 #include <optional>
14 
15 namespace clang {
16 namespace clangd {
17 namespace dex {
18 namespace {
19 
20 /// Implements iterator of PostingList chunks. This requires iterating over two
21 /// levels: the first level iterator iterates over the chunks and decompresses
22 /// them on-the-fly when the contents of chunk are to be seen.
23 class ChunkIterator : public Iterator {
24 public:
ChunkIterator(const Token * Tok,llvm::ArrayRef<Chunk> Chunks)25   explicit ChunkIterator(const Token *Tok, llvm::ArrayRef<Chunk> Chunks)
26       : Tok(Tok), Chunks(Chunks), CurrentChunk(Chunks.begin()) {
27     if (!Chunks.empty()) {
28       DecompressedChunk = CurrentChunk->decompress();
29       CurrentID = DecompressedChunk.begin();
30     }
31   }
32 
reachedEnd() const33   bool reachedEnd() const override { return CurrentChunk == Chunks.end(); }
34 
35   /// Advances cursor to the next item.
advance()36   void advance() override {
37     assert(!reachedEnd() &&
38            "Posting List iterator can't advance() at the end.");
39     ++CurrentID;
40     normalizeCursor();
41   }
42 
43   /// Applies binary search to advance cursor to the next item with DocID
44   /// equal or higher than the given one.
advanceTo(DocID ID)45   void advanceTo(DocID ID) override {
46     assert(!reachedEnd() &&
47            "Posting List iterator can't advance() at the end.");
48     if (ID <= peek())
49       return;
50     advanceToChunk(ID);
51     // Try to find ID within current chunk.
52     CurrentID = std::partition_point(CurrentID, DecompressedChunk.end(),
53                                      [&](const DocID D) { return D < ID; });
54     normalizeCursor();
55   }
56 
peek() const57   DocID peek() const override {
58     assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
59     return *CurrentID;
60   }
61 
consume()62   float consume() override {
63     assert(!reachedEnd() &&
64            "Posting List iterator can't consume() at the end.");
65     return 1;
66   }
67 
estimateSize() const68   size_t estimateSize() const override {
69     return Chunks.size() * ApproxEntriesPerChunk;
70   }
71 
72 private:
dump(llvm::raw_ostream & OS) const73   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
74     if (Tok != nullptr)
75       return OS << *Tok;
76     OS << '[';
77     const char *Sep = "";
78     for (const Chunk &C : Chunks)
79       for (const DocID Doc : C.decompress()) {
80         OS << Sep << Doc;
81         Sep = " ";
82       }
83     return OS << ']';
84   }
85 
86   /// If the cursor is at the end of a chunk, place it at the start of the next
87   /// chunk.
normalizeCursor()88   void normalizeCursor() {
89     // Invariant is already established if examined chunk is not exhausted.
90     if (CurrentID != std::end(DecompressedChunk))
91       return;
92     // Advance to next chunk if current one is exhausted.
93     ++CurrentChunk;
94     if (CurrentChunk == Chunks.end()) // Reached the end of PostingList.
95       return;
96     DecompressedChunk = CurrentChunk->decompress();
97     CurrentID = DecompressedChunk.begin();
98   }
99 
100   /// Advances CurrentChunk to the chunk which might contain ID.
advanceToChunk(DocID ID)101   void advanceToChunk(DocID ID) {
102     if ((CurrentChunk != Chunks.end() - 1) &&
103         ((CurrentChunk + 1)->Head <= ID)) {
104       CurrentChunk =
105           std::partition_point(CurrentChunk + 1, Chunks.end(),
106                                [&](const Chunk &C) { return C.Head < ID; });
107       --CurrentChunk;
108       DecompressedChunk = CurrentChunk->decompress();
109       CurrentID = DecompressedChunk.begin();
110     }
111   }
112 
113   const Token *Tok;
114   llvm::ArrayRef<Chunk> Chunks;
115   /// Iterator over chunks.
116   /// If CurrentChunk is valid, then DecompressedChunk is
117   /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
118   /// into it.
119   decltype(Chunks)::const_iterator CurrentChunk;
120   llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
121   /// Iterator over DecompressedChunk.
122   decltype(DecompressedChunk)::iterator CurrentID;
123 
124   static constexpr size_t ApproxEntriesPerChunk = 15;
125 };
126 
127 static constexpr size_t BitsPerEncodingByte = 7;
128 
129 /// Writes a variable length DocID into the buffer and updates the buffer size.
130 /// If it doesn't fit, returns false and doesn't write to the buffer.
encodeVByte(DocID Delta,llvm::MutableArrayRef<uint8_t> & Payload)131 bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) {
132   assert(Delta != 0 && "0 is not a valid PostingList delta.");
133   // Calculate number of bytes Delta encoding would take by examining the
134   // meaningful bits.
135   unsigned Width = 1 + llvm::Log2_64(Delta) / BitsPerEncodingByte;
136   if (Width > Payload.size())
137     return false;
138 
139   do {
140     uint8_t Encoding = Delta & 0x7f;
141     Delta >>= 7;
142     Payload.front() = Delta ? Encoding | 0x80 : Encoding;
143     Payload = Payload.drop_front();
144   } while (Delta != 0);
145   return true;
146 }
147 
148 /// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
149 /// DocIDs. The compression stores deltas (differences) between subsequent
150 /// DocIDs and encodes these deltas utilizing the least possible number of
151 /// bytes.
152 ///
153 /// Each encoding byte consists of two parts: the first bit (continuation bit)
154 /// indicates whether this is the last byte (0 if this byte is the last) of
155 /// current encoding and seven bytes a piece of DocID (payload). DocID contains
156 /// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit
157 /// payloads and one 4-bit payload), but in practice it is expected that gaps
158 /// (deltas) between subsequent DocIDs are not large enough to require 5 bytes.
159 /// In very dense posting lists (with average gaps less than 128) this
160 /// representation would be 4 times more efficient than raw DocID array.
161 ///
162 /// PostingList encoding example:
163 ///
164 /// DocIDs    42            47        7000
165 /// gaps                    5         6958
166 /// Encoding  (raw number)  00000101  10110110 00101110
encodeStream(llvm::ArrayRef<DocID> Documents)167 std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
168   assert(!Documents.empty() && "Can't encode empty sequence.");
169   std::vector<Chunk> Result;
170   Result.emplace_back();
171   DocID Last = Result.back().Head = Documents.front();
172   llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
173   for (DocID Doc : Documents.drop_front()) {
174     if (!encodeVByte(Doc - Last, RemainingPayload)) { // didn't fit, flush chunk
175       Result.emplace_back();
176       Result.back().Head = Doc;
177       RemainingPayload = Result.back().Payload;
178     }
179     Last = Doc;
180   }
181   return std::vector<Chunk>(Result); // no move, shrink-to-fit
182 }
183 
184 /// Reads variable length DocID from the buffer and updates the buffer size. If
185 /// the stream is terminated, return std::nullopt.
readVByte(llvm::ArrayRef<uint8_t> & Bytes)186 std::optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
187   if (Bytes.front() == 0 || Bytes.empty())
188     return std::nullopt;
189   DocID Result = 0;
190   bool HasNextByte = true;
191   for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) {
192     assert(Length <= 5 && "Malformed VByte encoding sequence.");
193     // Write meaningful bits to the correct place in the document decoding.
194     Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte * Length);
195     if ((Bytes.front() & 0x80) == 0)
196       HasNextByte = false;
197     Bytes = Bytes.drop_front();
198   }
199   return Result;
200 }
201 
202 } // namespace
203 
decompress() const204 llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const {
205   llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{Head};
206   llvm::ArrayRef<uint8_t> Bytes(Payload);
207   DocID Delta;
208   for (DocID Current = Head; !Bytes.empty(); Current += Delta) {
209     auto MaybeDelta = readVByte(Bytes);
210     if (!MaybeDelta)
211       break;
212     Delta = *MaybeDelta;
213     Result.push_back(Current + Delta);
214   }
215   return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result};
216 }
217 
PostingList(llvm::ArrayRef<DocID> Documents)218 PostingList::PostingList(llvm::ArrayRef<DocID> Documents)
219     : Chunks(encodeStream(Documents)) {}
220 
iterator(const Token * Tok) const221 std::unique_ptr<Iterator> PostingList::iterator(const Token *Tok) const {
222   return std::make_unique<ChunkIterator>(Tok, Chunks);
223 }
224 
225 } // namespace dex
226 } // namespace clangd
227 } // namespace clang
228