1 //===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "Dex.h" 10 #include "FileDistance.h" 11 #include "FuzzyMatch.h" 12 #include "Quality.h" 13 #include "URI.h" 14 #include "index/Index.h" 15 #include "index/dex/Iterator.h" 16 #include "index/dex/Token.h" 17 #include "index/dex/Trigram.h" 18 #include "support/Logger.h" 19 #include "support/Trace.h" 20 #include "llvm/ADT/DenseMap.h" 21 #include "llvm/ADT/StringMap.h" 22 #include "llvm/ADT/StringSet.h" 23 #include "llvm/Support/Path.h" 24 #include "llvm/Support/ScopedPrinter.h" 25 #include <algorithm> 26 #include <optional> 27 #include <queue> 28 #include <utility> 29 #include <vector> 30 31 namespace clang { 32 namespace clangd { 33 namespace dex { 34 35 std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs, 36 RelationSlab Rels, 37 bool SupportContainedRefs) { 38 auto Size = Symbols.bytes() + Refs.bytes(); 39 // There is no need to include "Rels" in Data because the relations are self- 40 // contained, without references into a backing store. 41 auto Data = std::make_pair(std::move(Symbols), std::move(Refs)); 42 return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data), 43 Size, SupportContainedRefs); 44 } 45 46 namespace { 47 48 // Mark symbols which are can be used for code completion. 49 const Token RestrictedForCodeCompletion = 50 Token(Token::Kind::Sentinel, "Restricted For Code Completion"); 51 52 // Helper to efficiently assemble the inverse index (token -> matching docs). 53 // The output is a nice uniform structure keyed on Token, but constructing 54 // the Token object every time we want to insert into the map is wasteful. 55 // Instead we have various maps keyed on things that are cheap to compute, 56 // and produce the Token keys once at the end. 57 class IndexBuilder { 58 llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs; 59 std::vector<DocID> RestrictedCCDocs; 60 llvm::StringMap<std::vector<DocID>> TypeDocs; 61 llvm::StringMap<std::vector<DocID>> ScopeDocs; 62 llvm::StringMap<std::vector<DocID>> ProximityDocs; 63 std::vector<Trigram> TrigramScratch; 64 65 public: 66 // Add the tokens which are given symbol's characteristics. 67 // This includes fuzzy matching trigrams, symbol's scope, etc. 68 // FIXME(kbobyrev): Support more token types: 69 // * Namespace proximity 70 void add(const Symbol &Sym, DocID D) { 71 generateIdentifierTrigrams(Sym.Name, TrigramScratch); 72 for (Trigram T : TrigramScratch) 73 TrigramDocs[T].push_back(D); 74 ScopeDocs[Sym.Scope].push_back(D); 75 if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty()) 76 for (const auto &ProximityURI : 77 generateProximityURIs(Sym.CanonicalDeclaration.FileURI)) 78 ProximityDocs[ProximityURI].push_back(D); 79 if (Sym.Flags & Symbol::IndexedForCodeCompletion) 80 RestrictedCCDocs.push_back(D); 81 if (!Sym.Type.empty()) 82 TypeDocs[Sym.Type].push_back(D); 83 } 84 85 // Assemble the final compressed posting lists for the added symbols. 86 llvm::DenseMap<Token, PostingList> build() && { 87 llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/ 88 TrigramDocs.size() + 89 RestrictedCCDocs.size() + 90 TypeDocs.size() + 91 ScopeDocs.size() + 92 ProximityDocs.size()); 93 // Tear down intermediate structs as we go to reduce memory usage. 94 // Since we're trying to get rid of underlying allocations, clearing the 95 // containers is not enough. 96 auto CreatePostingList = 97 [&Result](Token::Kind TK, llvm::StringMap<std::vector<DocID>> &Docs) { 98 for (auto &E : Docs) { 99 Result.try_emplace(Token(TK, E.first()), E.second); 100 E.second = {}; 101 } 102 Docs = {}; 103 }; 104 CreatePostingList(Token::Kind::Type, TypeDocs); 105 CreatePostingList(Token::Kind::Scope, ScopeDocs); 106 CreatePostingList(Token::Kind::ProximityURI, ProximityDocs); 107 108 // TrigramDocs are stored in a DenseMap and RestrictedCCDocs is not even a 109 // map, treat them specially. 110 for (auto &E : TrigramDocs) { 111 Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second); 112 E.second = {}; 113 } 114 TrigramDocs = llvm::DenseMap<Trigram, std::vector<DocID>>{}; 115 if (!RestrictedCCDocs.empty()) 116 Result.try_emplace(RestrictedForCodeCompletion, 117 std::move(RestrictedCCDocs)); 118 return Result; 119 } 120 }; 121 122 } // namespace 123 124 void Dex::buildIndex(bool SupportContainedRefs) { 125 this->Corpus = dex::Corpus(Symbols.size()); 126 std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size()); 127 128 for (size_t I = 0; I < Symbols.size(); ++I) { 129 const Symbol *Sym = Symbols[I]; 130 LookupTable[Sym->ID] = Sym; 131 ScoredSymbols[I] = {quality(*Sym), Sym}; 132 } 133 134 // Symbols are sorted by symbol qualities so that items in the posting lists 135 // are stored in the descending order of symbol quality. 136 llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>()); 137 138 // SymbolQuality was empty up until now. 139 SymbolQuality.resize(Symbols.size()); 140 // Populate internal storage using Symbol + Score pairs. 141 for (size_t I = 0; I < ScoredSymbols.size(); ++I) { 142 SymbolQuality[I] = ScoredSymbols[I].first; 143 Symbols[I] = ScoredSymbols[I].second; 144 } 145 146 // Build posting lists for symbols. 147 IndexBuilder Builder; 148 for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) 149 Builder.add(*Symbols[SymbolRank], SymbolRank); 150 InvertedIndex = std::move(Builder).build(); 151 152 // If the containedRefs() operation is supported, build the RevRefs 153 // data structure used to implement it. 154 if (!SupportContainedRefs) 155 return; 156 for (const auto &[ID, RefList] : Refs) 157 for (const auto &R : RefList) 158 if ((R.Kind & ContainedRefsRequest::SupportedRefKinds) != 159 RefKind::Unknown) 160 RevRefs.emplace_back(R, ID); 161 // Sort by container ID so we can use binary search for lookup. 162 llvm::sort(RevRefs, [](const RevRef &A, const RevRef &B) { 163 return A.ref().Container < B.ref().Container; 164 }); 165 } 166 167 std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const { 168 auto It = InvertedIndex.find(Tok); 169 return It == InvertedIndex.end() ? Corpus.none() 170 : It->second.iterator(&It->first); 171 } 172 173 // Constructs BOOST iterators for Path Proximities. 174 std::unique_ptr<Iterator> Dex::createFileProximityIterator( 175 llvm::ArrayRef<std::string> ProximityPaths) const { 176 std::vector<std::unique_ptr<Iterator>> BoostingIterators; 177 // Deduplicate parent URIs extracted from the ProximityPaths. 178 llvm::StringSet<> ParentURIs; 179 llvm::StringMap<SourceParams> Sources; 180 for (const auto &Path : ProximityPaths) { 181 Sources[Path] = SourceParams(); 182 auto PathURI = URI::create(Path).toString(); 183 const auto PathProximityURIs = generateProximityURIs(PathURI.c_str()); 184 for (const auto &ProximityURI : PathProximityURIs) 185 ParentURIs.insert(ProximityURI); 186 } 187 // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults 188 // for all parameters except for Proximity Path distance signal. 189 SymbolRelevanceSignals PathProximitySignals; 190 // DistanceCalculator will find the shortest distance from ProximityPaths to 191 // any URI extracted from the ProximityPaths. 192 URIDistance DistanceCalculator(Sources); 193 PathProximitySignals.FileProximityMatch = &DistanceCalculator; 194 // Try to build BOOST iterator for each Proximity Path provided by 195 // ProximityPaths. Boosting factor should depend on the distance to the 196 // Proximity Path: the closer processed path is, the higher boosting factor. 197 for (const auto &ParentURI : ParentURIs.keys()) { 198 // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. 199 auto It = iterator(Token(Token::Kind::ProximityURI, ParentURI)); 200 if (It->kind() != Iterator::Kind::False) { 201 PathProximitySignals.SymbolURI = ParentURI; 202 BoostingIterators.push_back(Corpus.boost( 203 std::move(It), PathProximitySignals.evaluateHeuristics())); 204 } 205 } 206 BoostingIterators.push_back(Corpus.all()); 207 return Corpus.unionOf(std::move(BoostingIterators)); 208 } 209 210 // Constructs BOOST iterators for preferred types. 211 std::unique_ptr<Iterator> 212 Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const { 213 std::vector<std::unique_ptr<Iterator>> BoostingIterators; 214 SymbolRelevanceSignals PreferredTypeSignals; 215 PreferredTypeSignals.TypeMatchesPreferred = true; 216 auto Boost = PreferredTypeSignals.evaluateHeuristics(); 217 for (const auto &T : Types) 218 BoostingIterators.push_back( 219 Corpus.boost(iterator(Token(Token::Kind::Type, T)), Boost)); 220 BoostingIterators.push_back(Corpus.all()); 221 return Corpus.unionOf(std::move(BoostingIterators)); 222 } 223 224 /// Constructs iterators over tokens extracted from the query and exhausts it 225 /// while applying Callback to each symbol in the order of decreasing quality 226 /// of the matched symbols. 227 bool Dex::fuzzyFind(const FuzzyFindRequest &Req, 228 llvm::function_ref<void(const Symbol &)> Callback) const { 229 assert(!StringRef(Req.Query).contains("::") && 230 "There must be no :: in query."); 231 trace::Span Tracer("Dex fuzzyFind"); 232 FuzzyMatcher Filter(Req.Query); 233 // For short queries we use specialized trigrams that don't yield all results. 234 // Prevent clients from postfiltering them for longer queries. 235 bool More = !Req.Query.empty() && Req.Query.size() < 3; 236 237 std::vector<std::unique_ptr<Iterator>> Criteria; 238 const auto TrigramTokens = generateQueryTrigrams(Req.Query); 239 240 // Generate query trigrams and construct AND iterator over all query 241 // trigrams. 242 std::vector<std::unique_ptr<Iterator>> TrigramIterators; 243 for (const auto &Trigram : TrigramTokens) 244 TrigramIterators.push_back(iterator(Trigram)); 245 Criteria.push_back(Corpus.intersect(std::move(TrigramIterators))); 246 247 // Generate scope tokens for search query. 248 std::vector<std::unique_ptr<Iterator>> ScopeIterators; 249 for (const auto &Scope : Req.Scopes) 250 ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope))); 251 if (Req.AnyScope) 252 ScopeIterators.push_back( 253 Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2)); 254 Criteria.push_back(Corpus.unionOf(std::move(ScopeIterators))); 255 256 // Add proximity paths boosting (all symbols, some boosted). 257 Criteria.push_back(createFileProximityIterator(Req.ProximityPaths)); 258 // Add boosting for preferred types. 259 Criteria.push_back(createTypeBoostingIterator(Req.PreferredTypes)); 260 261 if (Req.RestrictForCodeCompletion) 262 Criteria.push_back(iterator(RestrictedForCodeCompletion)); 263 264 // Use TRUE iterator if both trigrams and scopes from the query are not 265 // present in the symbol index. 266 auto Root = Corpus.intersect(std::move(Criteria)); 267 // Retrieve more items than it was requested: some of the items with high 268 // final score might not be retrieved otherwise. 269 // FIXME(kbobyrev): Tune this ratio. 270 if (Req.Limit) 271 Root = Corpus.limit(std::move(Root), *Req.Limit * 100); 272 SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root)); 273 vlog("Dex query tree: {0}", *Root); 274 275 using IDAndScore = std::pair<DocID, float>; 276 std::vector<IDAndScore> IDAndScores = consume(*Root); 277 278 auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) { 279 return LHS.second > RHS.second; 280 }; 281 TopN<IDAndScore, decltype(Compare)> Top( 282 Req.Limit.value_or(std::numeric_limits<size_t>::max()), Compare); 283 for (const auto &IDAndScore : IDAndScores) { 284 const DocID SymbolDocID = IDAndScore.first; 285 const auto *Sym = Symbols[SymbolDocID]; 286 const std::optional<float> Score = Filter.match(Sym->Name); 287 if (!Score) 288 continue; 289 // Combine Fuzzy Matching score, precomputed symbol quality and boosting 290 // score for a cumulative final symbol score. 291 const float FinalScore = 292 (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second; 293 // If Top.push(...) returns true, it means that it had to pop an item. In 294 // this case, it is possible to retrieve more symbols. 295 if (Top.push({SymbolDocID, FinalScore})) 296 More = true; 297 } 298 299 // Apply callback to the top Req.Limit items in the descending 300 // order of cumulative score. 301 for (const auto &Item : std::move(Top).items()) 302 Callback(*Symbols[Item.first]); 303 return More; 304 } 305 306 void Dex::lookup(const LookupRequest &Req, 307 llvm::function_ref<void(const Symbol &)> Callback) const { 308 trace::Span Tracer("Dex lookup"); 309 for (const auto &ID : Req.IDs) { 310 auto I = LookupTable.find(ID); 311 if (I != LookupTable.end()) 312 Callback(*I->second); 313 } 314 } 315 316 bool Dex::refs(const RefsRequest &Req, 317 llvm::function_ref<void(const Ref &)> Callback) const { 318 trace::Span Tracer("Dex refs"); 319 uint32_t Remaining = Req.Limit.value_or(std::numeric_limits<uint32_t>::max()); 320 for (const auto &ID : Req.IDs) 321 for (const auto &Ref : Refs.lookup(ID)) { 322 if (!static_cast<int>(Req.Filter & Ref.Kind)) 323 continue; 324 if (Remaining == 0) 325 return true; // More refs were available. 326 --Remaining; 327 Callback(Ref); 328 } 329 return false; // We reported all refs. 330 } 331 332 llvm::iterator_range<std::vector<Dex::RevRef>::const_iterator> 333 Dex::lookupRevRefs(const SymbolID &Container) const { 334 // equal_range() requires an element of the same type as the elements of the 335 // range, so construct a dummy RevRef with the container of interest. 336 Ref QueryRef; 337 QueryRef.Container = Container; 338 RevRef Query(QueryRef, SymbolID{}); 339 340 auto ItPair = std::equal_range(RevRefs.cbegin(), RevRefs.cend(), Query, 341 [](const RevRef &A, const RevRef &B) { 342 return A.ref().Container < B.ref().Container; 343 }); 344 return {ItPair.first, ItPair.second}; 345 } 346 347 bool Dex::containedRefs( 348 const ContainedRefsRequest &Req, 349 llvm::function_ref<void(const ContainedRefsResult &)> Callback) const { 350 trace::Span Tracer("Dex reversed refs"); 351 uint32_t Remaining = Req.Limit.value_or(std::numeric_limits<uint32_t>::max()); 352 for (const auto &Rev : lookupRevRefs(Req.ID)) { 353 // RevRefs are already filtered to ContainedRefsRequest::SupportedRefKinds 354 if (Remaining == 0) 355 return true; // More refs were available. 356 --Remaining; 357 Callback(Rev.containedRefsResult()); 358 } 359 return false; // We reported all refs. 360 } 361 362 void Dex::relations( 363 const RelationsRequest &Req, 364 llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const { 365 trace::Span Tracer("Dex relations"); 366 uint32_t Remaining = Req.Limit.value_or(std::numeric_limits<uint32_t>::max()); 367 for (const SymbolID &Subject : Req.Subjects) { 368 LookupRequest LookupReq; 369 auto It = Relations.find( 370 std::make_pair(Subject, static_cast<uint8_t>(Req.Predicate))); 371 if (It != Relations.end()) { 372 for (const auto &Object : It->second) { 373 if (Remaining > 0) { 374 --Remaining; 375 LookupReq.IDs.insert(Object); 376 } 377 } 378 } 379 lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); }); 380 } 381 } 382 383 llvm::unique_function<IndexContents(llvm::StringRef) const> 384 Dex::indexedFiles() const { 385 return [this](llvm::StringRef FileURI) { 386 return Files.contains(FileURI) ? IdxContents : IndexContents::None; 387 }; 388 } 389 390 size_t Dex::estimateMemoryUsage() const { 391 size_t Bytes = Symbols.size() * sizeof(const Symbol *); 392 Bytes += SymbolQuality.size() * sizeof(float); 393 Bytes += LookupTable.getMemorySize(); 394 Bytes += InvertedIndex.getMemorySize(); 395 for (const auto &TokenToPostingList : InvertedIndex) 396 Bytes += TokenToPostingList.second.bytes(); 397 Bytes += Refs.getMemorySize(); 398 Bytes += RevRefs.size() * sizeof(RevRef); 399 Bytes += Relations.getMemorySize(); 400 return Bytes + BackingDataSize; 401 } 402 403 // Given foo://bar/one/two 404 // Returns ~~~~~~~~ (or empty for bad URI) 405 llvm::StringRef findPathInURI(llvm::StringRef S) { 406 S = S.split(':').second; // Eat scheme. 407 if (S.consume_front("//")) // Eat authority. 408 S = S.drop_until([](char C) { return C == '/'; }); 409 return S; 410 } 411 412 // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum 413 // size of resulting vector. Some projects might want to have higher limit if 414 // the file hierarchy is deeper. For the generic case, it would be useful to 415 // calculate Limit in the index build stage by calculating the maximum depth 416 // of the project source tree at runtime. 417 constexpr unsigned ProximityURILimit = 5; 418 419 llvm::SmallVector<llvm::StringRef, ProximityURILimit> 420 generateProximityURIs(llvm::StringRef URI) { 421 // This function is hot when indexing, so don't parse/reserialize URIPath, 422 // just emit substrings of it instead. 423 // 424 // foo://bar/one/two 425 // ~~~~~~~~ 426 // Path 427 llvm::StringRef Path = findPathInURI(URI); 428 if (Path.empty()) 429 return {}; // Bad URI. 430 assert(Path.begin() >= URI.begin() && Path.begin() < URI.end() && 431 Path.end() == URI.end()); 432 433 // The original is a proximity URI. 434 llvm::SmallVector<llvm::StringRef, ProximityURILimit> Result = {URI}; 435 // Form proximity URIs by chopping before each slash in the path part. 436 for (auto Slash = Path.rfind('/'); Slash > 0 && Slash != StringRef::npos; 437 Slash = Path.rfind('/')) { 438 Path = Path.substr(0, Slash); 439 Result.push_back(URI.substr(0, Path.end() - URI.data())); 440 if (Result.size() == ProximityURILimit) 441 return Result; 442 } 443 // The root foo://bar/ is a proximity URI. 444 if (Path.starts_with("/")) 445 Result.push_back(URI.substr(0, Path.begin() + 1 - URI.data())); 446 return Result; 447 } 448 449 } // namespace dex 450 } // namespace clangd 451 } // namespace clang 452