xref: /llvm-project/clang-tools-extra/clangd/index/dex/Dex.cpp (revision 61fe67a4017375fd675f75652e857e837f77fa51)
1 //===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Dex.h"
10 #include "FileDistance.h"
11 #include "FuzzyMatch.h"
12 #include "Quality.h"
13 #include "URI.h"
14 #include "index/Index.h"
15 #include "index/dex/Iterator.h"
16 #include "index/dex/Token.h"
17 #include "index/dex/Trigram.h"
18 #include "support/Logger.h"
19 #include "support/Trace.h"
20 #include "llvm/ADT/DenseMap.h"
21 #include "llvm/ADT/StringMap.h"
22 #include "llvm/ADT/StringSet.h"
23 #include "llvm/Support/Path.h"
24 #include "llvm/Support/ScopedPrinter.h"
25 #include <algorithm>
26 #include <optional>
27 #include <queue>
28 #include <utility>
29 #include <vector>
30 
31 namespace clang {
32 namespace clangd {
33 namespace dex {
34 
35 std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs,
36                                         RelationSlab Rels,
37                                         bool SupportContainedRefs) {
38   auto Size = Symbols.bytes() + Refs.bytes();
39   // There is no need to include "Rels" in Data because the relations are self-
40   // contained, without references into a backing store.
41   auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
42   return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
43                                Size, SupportContainedRefs);
44 }
45 
46 namespace {
47 
48 // Mark symbols which are can be used for code completion.
49 const Token RestrictedForCodeCompletion =
50     Token(Token::Kind::Sentinel, "Restricted For Code Completion");
51 
52 // Helper to efficiently assemble the inverse index (token -> matching docs).
53 // The output is a nice uniform structure keyed on Token, but constructing
54 // the Token object every time we want to insert into the map is wasteful.
55 // Instead we have various maps keyed on things that are cheap to compute,
56 // and produce the Token keys once at the end.
57 class IndexBuilder {
58   llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
59   std::vector<DocID> RestrictedCCDocs;
60   llvm::StringMap<std::vector<DocID>> TypeDocs;
61   llvm::StringMap<std::vector<DocID>> ScopeDocs;
62   llvm::StringMap<std::vector<DocID>> ProximityDocs;
63   std::vector<Trigram> TrigramScratch;
64 
65 public:
66   // Add the tokens which are given symbol's characteristics.
67   // This includes fuzzy matching trigrams, symbol's scope, etc.
68   // FIXME(kbobyrev): Support more token types:
69   // * Namespace proximity
70   void add(const Symbol &Sym, DocID D) {
71     generateIdentifierTrigrams(Sym.Name, TrigramScratch);
72     for (Trigram T : TrigramScratch)
73       TrigramDocs[T].push_back(D);
74     ScopeDocs[Sym.Scope].push_back(D);
75     if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
76       for (const auto &ProximityURI :
77            generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
78         ProximityDocs[ProximityURI].push_back(D);
79     if (Sym.Flags & Symbol::IndexedForCodeCompletion)
80       RestrictedCCDocs.push_back(D);
81     if (!Sym.Type.empty())
82       TypeDocs[Sym.Type].push_back(D);
83   }
84 
85   // Assemble the final compressed posting lists for the added symbols.
86   llvm::DenseMap<Token, PostingList> build() && {
87     llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
88                                               TrigramDocs.size() +
89                                               RestrictedCCDocs.size() +
90                                               TypeDocs.size() +
91                                               ScopeDocs.size() +
92                                               ProximityDocs.size());
93     // Tear down intermediate structs as we go to reduce memory usage.
94     // Since we're trying to get rid of underlying allocations, clearing the
95     // containers is not enough.
96     auto CreatePostingList =
97         [&Result](Token::Kind TK, llvm::StringMap<std::vector<DocID>> &Docs) {
98           for (auto &E : Docs) {
99             Result.try_emplace(Token(TK, E.first()), E.second);
100             E.second = {};
101           }
102           Docs = {};
103         };
104     CreatePostingList(Token::Kind::Type, TypeDocs);
105     CreatePostingList(Token::Kind::Scope, ScopeDocs);
106     CreatePostingList(Token::Kind::ProximityURI, ProximityDocs);
107 
108     // TrigramDocs are stored in a DenseMap and RestrictedCCDocs is not even a
109     // map, treat them specially.
110     for (auto &E : TrigramDocs) {
111       Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second);
112       E.second = {};
113     }
114     TrigramDocs = llvm::DenseMap<Trigram, std::vector<DocID>>{};
115     if (!RestrictedCCDocs.empty())
116       Result.try_emplace(RestrictedForCodeCompletion,
117                          std::move(RestrictedCCDocs));
118     return Result;
119   }
120 };
121 
122 } // namespace
123 
124 void Dex::buildIndex(bool SupportContainedRefs) {
125   this->Corpus = dex::Corpus(Symbols.size());
126   std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
127 
128   for (size_t I = 0; I < Symbols.size(); ++I) {
129     const Symbol *Sym = Symbols[I];
130     LookupTable[Sym->ID] = Sym;
131     ScoredSymbols[I] = {quality(*Sym), Sym};
132   }
133 
134   // Symbols are sorted by symbol qualities so that items in the posting lists
135   // are stored in the descending order of symbol quality.
136   llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());
137 
138   // SymbolQuality was empty up until now.
139   SymbolQuality.resize(Symbols.size());
140   // Populate internal storage using Symbol + Score pairs.
141   for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
142     SymbolQuality[I] = ScoredSymbols[I].first;
143     Symbols[I] = ScoredSymbols[I].second;
144   }
145 
146   // Build posting lists for symbols.
147   IndexBuilder Builder;
148   for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
149     Builder.add(*Symbols[SymbolRank], SymbolRank);
150   InvertedIndex = std::move(Builder).build();
151 
152   // If the containedRefs() operation is supported, build the RevRefs
153   // data structure used to implement it.
154   if (!SupportContainedRefs)
155     return;
156   for (const auto &[ID, RefList] : Refs)
157     for (const auto &R : RefList)
158       if ((R.Kind & ContainedRefsRequest::SupportedRefKinds) !=
159           RefKind::Unknown)
160         RevRefs.emplace_back(R, ID);
161   // Sort by container ID so we can use binary search for lookup.
162   llvm::sort(RevRefs, [](const RevRef &A, const RevRef &B) {
163     return A.ref().Container < B.ref().Container;
164   });
165 }
166 
167 std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
168   auto It = InvertedIndex.find(Tok);
169   return It == InvertedIndex.end() ? Corpus.none()
170                                    : It->second.iterator(&It->first);
171 }
172 
173 // Constructs BOOST iterators for Path Proximities.
174 std::unique_ptr<Iterator> Dex::createFileProximityIterator(
175     llvm::ArrayRef<std::string> ProximityPaths) const {
176   std::vector<std::unique_ptr<Iterator>> BoostingIterators;
177   // Deduplicate parent URIs extracted from the ProximityPaths.
178   llvm::StringSet<> ParentURIs;
179   llvm::StringMap<SourceParams> Sources;
180   for (const auto &Path : ProximityPaths) {
181     Sources[Path] = SourceParams();
182     auto PathURI = URI::create(Path).toString();
183     const auto PathProximityURIs = generateProximityURIs(PathURI.c_str());
184     for (const auto &ProximityURI : PathProximityURIs)
185       ParentURIs.insert(ProximityURI);
186   }
187   // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
188   // for all parameters except for Proximity Path distance signal.
189   SymbolRelevanceSignals PathProximitySignals;
190   // DistanceCalculator will find the shortest distance from ProximityPaths to
191   // any URI extracted from the ProximityPaths.
192   URIDistance DistanceCalculator(Sources);
193   PathProximitySignals.FileProximityMatch = &DistanceCalculator;
194   // Try to build BOOST iterator for each Proximity Path provided by
195   // ProximityPaths. Boosting factor should depend on the distance to the
196   // Proximity Path: the closer processed path is, the higher boosting factor.
197   for (const auto &ParentURI : ParentURIs.keys()) {
198     // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
199     auto It = iterator(Token(Token::Kind::ProximityURI, ParentURI));
200     if (It->kind() != Iterator::Kind::False) {
201       PathProximitySignals.SymbolURI = ParentURI;
202       BoostingIterators.push_back(Corpus.boost(
203           std::move(It), PathProximitySignals.evaluateHeuristics()));
204     }
205   }
206   BoostingIterators.push_back(Corpus.all());
207   return Corpus.unionOf(std::move(BoostingIterators));
208 }
209 
210 // Constructs BOOST iterators for preferred types.
211 std::unique_ptr<Iterator>
212 Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const {
213   std::vector<std::unique_ptr<Iterator>> BoostingIterators;
214   SymbolRelevanceSignals PreferredTypeSignals;
215   PreferredTypeSignals.TypeMatchesPreferred = true;
216   auto Boost = PreferredTypeSignals.evaluateHeuristics();
217   for (const auto &T : Types)
218     BoostingIterators.push_back(
219         Corpus.boost(iterator(Token(Token::Kind::Type, T)), Boost));
220   BoostingIterators.push_back(Corpus.all());
221   return Corpus.unionOf(std::move(BoostingIterators));
222 }
223 
224 /// Constructs iterators over tokens extracted from the query and exhausts it
225 /// while applying Callback to each symbol in the order of decreasing quality
226 /// of the matched symbols.
227 bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
228                     llvm::function_ref<void(const Symbol &)> Callback) const {
229   assert(!StringRef(Req.Query).contains("::") &&
230          "There must be no :: in query.");
231   trace::Span Tracer("Dex fuzzyFind");
232   FuzzyMatcher Filter(Req.Query);
233   // For short queries we use specialized trigrams that don't yield all results.
234   // Prevent clients from postfiltering them for longer queries.
235   bool More = !Req.Query.empty() && Req.Query.size() < 3;
236 
237   std::vector<std::unique_ptr<Iterator>> Criteria;
238   const auto TrigramTokens = generateQueryTrigrams(Req.Query);
239 
240   // Generate query trigrams and construct AND iterator over all query
241   // trigrams.
242   std::vector<std::unique_ptr<Iterator>> TrigramIterators;
243   for (const auto &Trigram : TrigramTokens)
244     TrigramIterators.push_back(iterator(Trigram));
245   Criteria.push_back(Corpus.intersect(std::move(TrigramIterators)));
246 
247   // Generate scope tokens for search query.
248   std::vector<std::unique_ptr<Iterator>> ScopeIterators;
249   for (const auto &Scope : Req.Scopes)
250     ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope)));
251   if (Req.AnyScope)
252     ScopeIterators.push_back(
253         Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
254   Criteria.push_back(Corpus.unionOf(std::move(ScopeIterators)));
255 
256   // Add proximity paths boosting (all symbols, some boosted).
257   Criteria.push_back(createFileProximityIterator(Req.ProximityPaths));
258   // Add boosting for preferred types.
259   Criteria.push_back(createTypeBoostingIterator(Req.PreferredTypes));
260 
261   if (Req.RestrictForCodeCompletion)
262     Criteria.push_back(iterator(RestrictedForCodeCompletion));
263 
264   // Use TRUE iterator if both trigrams and scopes from the query are not
265   // present in the symbol index.
266   auto Root = Corpus.intersect(std::move(Criteria));
267   // Retrieve more items than it was requested: some of  the items with high
268   // final score might not be retrieved otherwise.
269   // FIXME(kbobyrev): Tune this ratio.
270   if (Req.Limit)
271     Root = Corpus.limit(std::move(Root), *Req.Limit * 100);
272   SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
273   vlog("Dex query tree: {0}", *Root);
274 
275   using IDAndScore = std::pair<DocID, float>;
276   std::vector<IDAndScore> IDAndScores = consume(*Root);
277 
278   auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) {
279     return LHS.second > RHS.second;
280   };
281   TopN<IDAndScore, decltype(Compare)> Top(
282       Req.Limit.value_or(std::numeric_limits<size_t>::max()), Compare);
283   for (const auto &IDAndScore : IDAndScores) {
284     const DocID SymbolDocID = IDAndScore.first;
285     const auto *Sym = Symbols[SymbolDocID];
286     const std::optional<float> Score = Filter.match(Sym->Name);
287     if (!Score)
288       continue;
289     // Combine Fuzzy Matching score, precomputed symbol quality and boosting
290     // score for a cumulative final symbol score.
291     const float FinalScore =
292         (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
293     // If Top.push(...) returns true, it means that it had to pop an item. In
294     // this case, it is possible to retrieve more symbols.
295     if (Top.push({SymbolDocID, FinalScore}))
296       More = true;
297   }
298 
299   // Apply callback to the top Req.Limit items in the descending
300   // order of cumulative score.
301   for (const auto &Item : std::move(Top).items())
302     Callback(*Symbols[Item.first]);
303   return More;
304 }
305 
306 void Dex::lookup(const LookupRequest &Req,
307                  llvm::function_ref<void(const Symbol &)> Callback) const {
308   trace::Span Tracer("Dex lookup");
309   for (const auto &ID : Req.IDs) {
310     auto I = LookupTable.find(ID);
311     if (I != LookupTable.end())
312       Callback(*I->second);
313   }
314 }
315 
316 bool Dex::refs(const RefsRequest &Req,
317                llvm::function_ref<void(const Ref &)> Callback) const {
318   trace::Span Tracer("Dex refs");
319   uint32_t Remaining = Req.Limit.value_or(std::numeric_limits<uint32_t>::max());
320   for (const auto &ID : Req.IDs)
321     for (const auto &Ref : Refs.lookup(ID)) {
322       if (!static_cast<int>(Req.Filter & Ref.Kind))
323         continue;
324       if (Remaining == 0)
325         return true; // More refs were available.
326       --Remaining;
327       Callback(Ref);
328     }
329   return false; // We reported all refs.
330 }
331 
332 llvm::iterator_range<std::vector<Dex::RevRef>::const_iterator>
333 Dex::lookupRevRefs(const SymbolID &Container) const {
334   // equal_range() requires an element of the same type as the elements of the
335   // range, so construct a dummy RevRef with the container of interest.
336   Ref QueryRef;
337   QueryRef.Container = Container;
338   RevRef Query(QueryRef, SymbolID{});
339 
340   auto ItPair = std::equal_range(RevRefs.cbegin(), RevRefs.cend(), Query,
341                                  [](const RevRef &A, const RevRef &B) {
342                                    return A.ref().Container < B.ref().Container;
343                                  });
344   return {ItPair.first, ItPair.second};
345 }
346 
347 bool Dex::containedRefs(
348     const ContainedRefsRequest &Req,
349     llvm::function_ref<void(const ContainedRefsResult &)> Callback) const {
350   trace::Span Tracer("Dex reversed refs");
351   uint32_t Remaining = Req.Limit.value_or(std::numeric_limits<uint32_t>::max());
352   for (const auto &Rev : lookupRevRefs(Req.ID)) {
353     // RevRefs are already filtered to ContainedRefsRequest::SupportedRefKinds
354     if (Remaining == 0)
355       return true; // More refs were available.
356     --Remaining;
357     Callback(Rev.containedRefsResult());
358   }
359   return false; // We reported all refs.
360 }
361 
362 void Dex::relations(
363     const RelationsRequest &Req,
364     llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const {
365   trace::Span Tracer("Dex relations");
366   uint32_t Remaining = Req.Limit.value_or(std::numeric_limits<uint32_t>::max());
367   for (const SymbolID &Subject : Req.Subjects) {
368     LookupRequest LookupReq;
369     auto It = Relations.find(
370         std::make_pair(Subject, static_cast<uint8_t>(Req.Predicate)));
371     if (It != Relations.end()) {
372       for (const auto &Object : It->second) {
373         if (Remaining > 0) {
374           --Remaining;
375           LookupReq.IDs.insert(Object);
376         }
377       }
378     }
379     lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); });
380   }
381 }
382 
383 llvm::unique_function<IndexContents(llvm::StringRef) const>
384 Dex::indexedFiles() const {
385   return [this](llvm::StringRef FileURI) {
386     return Files.contains(FileURI) ? IdxContents : IndexContents::None;
387   };
388 }
389 
390 size_t Dex::estimateMemoryUsage() const {
391   size_t Bytes = Symbols.size() * sizeof(const Symbol *);
392   Bytes += SymbolQuality.size() * sizeof(float);
393   Bytes += LookupTable.getMemorySize();
394   Bytes += InvertedIndex.getMemorySize();
395   for (const auto &TokenToPostingList : InvertedIndex)
396     Bytes += TokenToPostingList.second.bytes();
397   Bytes += Refs.getMemorySize();
398   Bytes += RevRefs.size() * sizeof(RevRef);
399   Bytes += Relations.getMemorySize();
400   return Bytes + BackingDataSize;
401 }
402 
403 // Given foo://bar/one/two
404 // Returns        ~~~~~~~~  (or empty for bad URI)
405 llvm::StringRef findPathInURI(llvm::StringRef S) {
406   S = S.split(':').second;   // Eat scheme.
407   if (S.consume_front("//")) // Eat authority.
408     S = S.drop_until([](char C) { return C == '/'; });
409   return S;
410 }
411 
412 // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
413 // size of resulting vector. Some projects might want to have higher limit if
414 // the file hierarchy is deeper. For the generic case, it would be useful to
415 // calculate Limit in the index build stage by calculating the maximum depth
416 // of the project source tree at runtime.
417 constexpr unsigned ProximityURILimit = 5;
418 
419 llvm::SmallVector<llvm::StringRef, ProximityURILimit>
420 generateProximityURIs(llvm::StringRef URI) {
421   // This function is hot when indexing, so don't parse/reserialize URIPath,
422   // just emit substrings of it instead.
423   //
424   // foo://bar/one/two
425   //          ~~~~~~~~
426   //            Path
427   llvm::StringRef Path = findPathInURI(URI);
428   if (Path.empty())
429     return {}; // Bad URI.
430   assert(Path.begin() >= URI.begin() && Path.begin() < URI.end() &&
431          Path.end() == URI.end());
432 
433   // The original is a proximity URI.
434   llvm::SmallVector<llvm::StringRef, ProximityURILimit> Result = {URI};
435   // Form proximity URIs by chopping before each slash in the path part.
436   for (auto Slash = Path.rfind('/'); Slash > 0 && Slash != StringRef::npos;
437        Slash = Path.rfind('/')) {
438     Path = Path.substr(0, Slash);
439     Result.push_back(URI.substr(0, Path.end() - URI.data()));
440     if (Result.size() == ProximityURILimit)
441       return Result;
442   }
443   // The root foo://bar/ is a proximity URI.
444   if (Path.starts_with("/"))
445     Result.push_back(URI.substr(0, Path.begin() + 1 - URI.data()));
446   return Result;
447 }
448 
449 } // namespace dex
450 } // namespace clangd
451 } // namespace clang
452