xref: /llvm-project/clang-tools-extra/clangd/FileDistance.cpp (revision d5953e3e3092f7142a07aa012fc9665ede09e53b)
13f0243fdSSam McCall //===--- FileDistance.cpp - File contents container -------------*- C++ -*-===//
23f0243fdSSam McCall //
32946cd70SChandler Carruth // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
42946cd70SChandler Carruth // See https://llvm.org/LICENSE.txt for license information.
52946cd70SChandler Carruth // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
63f0243fdSSam McCall //
73f0243fdSSam McCall //===----------------------------------------------------------------------===//
83f0243fdSSam McCall //
93f0243fdSSam McCall // The FileDistance structure allows calculating the minimum distance to paths
103f0243fdSSam McCall // in a single tree.
113f0243fdSSam McCall // We simply walk up the path's ancestors until we find a node whose cost is
123f0243fdSSam McCall // known, and add the cost of walking back down. Initialization ensures this
133f0243fdSSam McCall // gives the correct path to the roots.
143f0243fdSSam McCall // We cache the results, so that the runtime is O(|A|), where A is the set of
153f0243fdSSam McCall // all distinct ancestors of visited paths.
163f0243fdSSam McCall //
173f0243fdSSam McCall // Example after initialization with /=2, /bar=0, DownCost = 1:
183f0243fdSSam McCall //  / = 2
193f0243fdSSam McCall //    /bar = 0
203f0243fdSSam McCall //
213f0243fdSSam McCall // After querying /foo/bar and /bar/foo:
223f0243fdSSam McCall //  / = 2
233f0243fdSSam McCall //    /bar = 0
243f0243fdSSam McCall //      /bar/foo = 1
253f0243fdSSam McCall //    /foo = 3
263f0243fdSSam McCall //      /foo/bar = 4
273f0243fdSSam McCall //
283f0243fdSSam McCall // URIDistance creates FileDistance lazily for each URI scheme encountered. In
293f0243fdSSam McCall // practice this is a small constant factor.
303f0243fdSSam McCall //
313f0243fdSSam McCall //===-------------------------------------------------------------------------//
323f0243fdSSam McCall 
333f0243fdSSam McCall #include "FileDistance.h"
344d006520SSam McCall #include "URI.h"
35ad97ccf6SSam McCall #include "support/Logger.h"
363f0243fdSSam McCall #include "llvm/ADT/STLExtras.h"
37b0abd489SElliot Goodrich #include "llvm/ADT/StringExtras.h"
384d006520SSam McCall #include "llvm/Support/Path.h"
393f0243fdSSam McCall #include <queue>
403f0243fdSSam McCall 
413f0243fdSSam McCall namespace clang {
423f0243fdSSam McCall namespace clangd {
433f0243fdSSam McCall 
443f0243fdSSam McCall // Convert a path into the canonical form.
453f0243fdSSam McCall // Canonical form is either "/", or "/segment" * N:
463f0243fdSSam McCall //   C:\foo\bar --> /c:/foo/bar
473f0243fdSSam McCall //   /foo/      --> /foo
483f0243fdSSam McCall //   a/b/c      --> /a/b/c
canonicalize(llvm::StringRef Path)49f2001aa7SIlya Biryukov static llvm::SmallString<128> canonicalize(llvm::StringRef Path) {
50f2001aa7SIlya Biryukov   llvm::SmallString<128> Result = Path.rtrim('/');
51f2001aa7SIlya Biryukov   native(Result, llvm::sys::path::Style::posix);
523f0243fdSSam McCall   if (Result.empty() || Result.front() != '/')
533f0243fdSSam McCall     Result.insert(Result.begin(), '/');
543f0243fdSSam McCall   return Result;
553f0243fdSSam McCall }
563f0243fdSSam McCall 
57c38b3f03SKirill Bobyrev constexpr const unsigned FileDistance::Unreachable;
58f2001aa7SIlya Biryukov const llvm::hash_code FileDistance::RootHash =
59f2001aa7SIlya Biryukov     llvm::hash_value(llvm::StringRef("/"));
603f0243fdSSam McCall 
FileDistance(llvm::StringMap<SourceParams> Sources,const FileDistanceOptions & Opts)61f2001aa7SIlya Biryukov FileDistance::FileDistance(llvm::StringMap<SourceParams> Sources,
623f0243fdSSam McCall                            const FileDistanceOptions &Opts)
633f0243fdSSam McCall     : Opts(Opts) {
64ee02e20cSKirill Bobyrev   llvm::DenseMap<llvm::hash_code, llvm::SmallVector<llvm::hash_code>> DownEdges;
653f0243fdSSam McCall   // Compute the best distance following only up edges.
663f0243fdSSam McCall   // Keep track of down edges, in case we can use them to improve on this.
673f0243fdSSam McCall   for (const auto &S : Sources) {
683f0243fdSSam McCall     auto Canonical = canonicalize(S.getKey());
69bed5885dSSam McCall     dlog("Source {0} = {1}, MaxUp = {2}", Canonical, S.second.Cost,
70bed5885dSSam McCall          S.second.MaxUpTraversals);
713f0243fdSSam McCall     // Walk up to ancestors of this source, assigning cost.
72f2001aa7SIlya Biryukov     llvm::StringRef Rest = Canonical;
73f2001aa7SIlya Biryukov     llvm::hash_code Hash = llvm::hash_value(Rest);
743f0243fdSSam McCall     for (unsigned I = 0; !Rest.empty(); ++I) {
75f2001aa7SIlya Biryukov       Rest = parent_path(Rest, llvm::sys::path::Style::posix);
76f2001aa7SIlya Biryukov       auto NextHash = llvm::hash_value(Rest);
777c96bb6eSSam McCall       auto &Down = DownEdges[NextHash];
78f2001aa7SIlya Biryukov       if (!llvm::is_contained(Down, Hash))
798380c9e9SFangrui Song         Down.push_back(Hash);
803f0243fdSSam McCall       // We can't just break after MaxUpTraversals, must still set DownEdges.
813f0243fdSSam McCall       if (I > S.getValue().MaxUpTraversals) {
8215aa9653SKazu Hirata         if (Cache.contains(Hash))
833f0243fdSSam McCall           break;
843f0243fdSSam McCall       } else {
853f0243fdSSam McCall         unsigned Cost = S.getValue().Cost + I * Opts.UpCost;
863f0243fdSSam McCall         auto R = Cache.try_emplace(Hash, Cost);
873f0243fdSSam McCall         if (!R.second) {
883f0243fdSSam McCall           if (Cost < R.first->second) {
893f0243fdSSam McCall             R.first->second = Cost;
903f0243fdSSam McCall           } else {
913f0243fdSSam McCall             // If we're not the best way to get to this path, stop assigning.
923f0243fdSSam McCall             break;
933f0243fdSSam McCall           }
943f0243fdSSam McCall         }
953f0243fdSSam McCall       }
963f0243fdSSam McCall       Hash = NextHash;
973f0243fdSSam McCall     }
983f0243fdSSam McCall   }
993f0243fdSSam McCall   // Now propagate scores parent -> child if that's an improvement.
1003f0243fdSSam McCall   // BFS ensures we propagate down chains (must visit parents before children).
101f2001aa7SIlya Biryukov   std::queue<llvm::hash_code> Next;
102f2001aa7SIlya Biryukov   for (auto Child : DownEdges.lookup(llvm::hash_value(llvm::StringRef(""))))
1033f0243fdSSam McCall     Next.push(Child);
1043f0243fdSSam McCall   while (!Next.empty()) {
1050c29722cSEric Liu     auto Parent = Next.front();
1060c29722cSEric Liu     Next.pop();
1070c29722cSEric Liu     auto ParentCost = Cache.lookup(Parent);
1080c29722cSEric Liu     for (auto Child : DownEdges.lookup(Parent)) {
1090c29722cSEric Liu       if (Parent != RootHash || Opts.AllowDownTraversalFromRoot) {
1103f0243fdSSam McCall         auto &ChildCost =
111c1bb7b9eSKirill Bobyrev             Cache.try_emplace(Child, Unreachable).first->getSecond();
1123f0243fdSSam McCall         if (ParentCost + Opts.DownCost < ChildCost)
1133f0243fdSSam McCall           ChildCost = ParentCost + Opts.DownCost;
1140c29722cSEric Liu       }
1153f0243fdSSam McCall       Next.push(Child);
1163f0243fdSSam McCall     }
1173f0243fdSSam McCall   }
1183f0243fdSSam McCall }
1193f0243fdSSam McCall 
distance(llvm::StringRef Path)120f2001aa7SIlya Biryukov unsigned FileDistance::distance(llvm::StringRef Path) {
1213f0243fdSSam McCall   auto Canonical = canonicalize(Path);
122c1bb7b9eSKirill Bobyrev   unsigned Cost = Unreachable;
123ee02e20cSKirill Bobyrev   llvm::SmallVector<llvm::hash_code> Ancestors;
1243f0243fdSSam McCall   // Walk up ancestors until we find a path we know the distance for.
125f2001aa7SIlya Biryukov   for (llvm::StringRef Rest = Canonical; !Rest.empty();
126f2001aa7SIlya Biryukov        Rest = parent_path(Rest, llvm::sys::path::Style::posix)) {
127f2001aa7SIlya Biryukov     auto Hash = llvm::hash_value(Rest);
1280c29722cSEric Liu     if (Hash == RootHash && !Ancestors.empty() &&
1290c29722cSEric Liu         !Opts.AllowDownTraversalFromRoot) {
1300c29722cSEric Liu       Cost = Unreachable;
1310c29722cSEric Liu       break;
1320c29722cSEric Liu     }
1333f0243fdSSam McCall     auto It = Cache.find(Hash);
1343f0243fdSSam McCall     if (It != Cache.end()) {
1353f0243fdSSam McCall       Cost = It->second;
1363f0243fdSSam McCall       break;
1373f0243fdSSam McCall     }
1383f0243fdSSam McCall     Ancestors.push_back(Hash);
1393f0243fdSSam McCall   }
1403f0243fdSSam McCall   // Now we know the costs for (known node, queried node].
1413f0243fdSSam McCall   // Fill these in, walking down the directory tree.
142f2001aa7SIlya Biryukov   for (llvm::hash_code Hash : llvm::reverse(Ancestors)) {
143c1bb7b9eSKirill Bobyrev     if (Cost != Unreachable)
1443f0243fdSSam McCall       Cost += Opts.DownCost;
1453f0243fdSSam McCall     Cache.try_emplace(Hash, Cost);
1463f0243fdSSam McCall   }
147bed5885dSSam McCall   dlog("distance({0} = {1})", Path, Cost);
1483f0243fdSSam McCall   return Cost;
1493f0243fdSSam McCall }
1503f0243fdSSam McCall 
distance(llvm::StringRef URI)151f2001aa7SIlya Biryukov unsigned URIDistance::distance(llvm::StringRef URI) {
152f2001aa7SIlya Biryukov   auto R = Cache.try_emplace(llvm::hash_value(URI), FileDistance::Unreachable);
1533f0243fdSSam McCall   if (!R.second)
1543f0243fdSSam McCall     return R.first->getSecond();
1553f0243fdSSam McCall   if (auto U = clangd::URI::parse(URI)) {
156bed5885dSSam McCall     dlog("distance({0} = {1})", URI, U->body());
1573f0243fdSSam McCall     R.first->second = forScheme(U->scheme()).distance(U->body());
1583f0243fdSSam McCall   } else {
159bed5885dSSam McCall     log("URIDistance::distance() of unparseable {0}: {1}", URI, U.takeError());
1603f0243fdSSam McCall   }
1613f0243fdSSam McCall   return R.first->second;
1623f0243fdSSam McCall }
1633f0243fdSSam McCall 
forScheme(llvm::StringRef Scheme)164f2001aa7SIlya Biryukov FileDistance &URIDistance::forScheme(llvm::StringRef Scheme) {
1653f0243fdSSam McCall   auto &Delegate = ByScheme[Scheme];
1663f0243fdSSam McCall   if (!Delegate) {
167f2001aa7SIlya Biryukov     llvm::StringMap<SourceParams> SchemeSources;
1683f0243fdSSam McCall     for (const auto &Source : Sources) {
1693f0243fdSSam McCall       if (auto U = clangd::URI::create(Source.getKey(), Scheme))
1703f0243fdSSam McCall         SchemeSources.try_emplace(U->body(), Source.getValue());
1713f0243fdSSam McCall       else
172f2001aa7SIlya Biryukov         llvm::consumeError(U.takeError());
1733f0243fdSSam McCall     }
174bed5885dSSam McCall     dlog("FileDistance for scheme {0}: {1}/{2} sources", Scheme,
175bed5885dSSam McCall          SchemeSources.size(), Sources.size());
1763f0243fdSSam McCall     Delegate.reset(new FileDistance(std::move(SchemeSources), Opts));
1773f0243fdSSam McCall   }
1783f0243fdSSam McCall   return *Delegate;
1793f0243fdSSam McCall }
1803f0243fdSSam McCall 
scopeToPath(llvm::StringRef Scope)181f2001aa7SIlya Biryukov static std::pair<std::string, int> scopeToPath(llvm::StringRef Scope) {
182ee02e20cSKirill Bobyrev   llvm::SmallVector<llvm::StringRef> Split;
1833fac4ef1SEric Liu   Scope.split(Split, "::", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
184f2001aa7SIlya Biryukov   return {"/" + llvm::join(Split, "/"), Split.size()};
1853fac4ef1SEric Liu }
1863fac4ef1SEric Liu 
187f2001aa7SIlya Biryukov static FileDistance
createScopeFileDistance(llvm::ArrayRef<std::string> QueryScopes)188f2001aa7SIlya Biryukov createScopeFileDistance(llvm::ArrayRef<std::string> QueryScopes) {
1893fac4ef1SEric Liu   FileDistanceOptions Opts;
1903fac4ef1SEric Liu   Opts.UpCost = 2;
1913fac4ef1SEric Liu   Opts.DownCost = 4;
1923fac4ef1SEric Liu   Opts.AllowDownTraversalFromRoot = false;
1933fac4ef1SEric Liu 
194f2001aa7SIlya Biryukov   llvm::StringMap<SourceParams> Sources;
195f2001aa7SIlya Biryukov   llvm::StringRef Preferred =
196f2001aa7SIlya Biryukov       QueryScopes.empty() ? "" : QueryScopes.front().c_str();
197f2001aa7SIlya Biryukov   for (llvm::StringRef S : QueryScopes) {
1983fac4ef1SEric Liu     SourceParams Param;
1993fac4ef1SEric Liu     // Penalize the global scope even it's preferred, as all projects can define
2003fac4ef1SEric Liu     // symbols in it, and there is pattern where using-namespace is used in
2013fac4ef1SEric Liu     // place of enclosing namespaces (e.g. in implementation files).
2023fac4ef1SEric Liu     if (S == Preferred)
2034fac50e7SEric Liu       Param.Cost = S == "" ? 4 : 0;
204*d5953e3eSKazu Hirata     else if (Preferred.starts_with(S) && !S.empty())
2053fac4ef1SEric Liu       continue; // just rely on up-traversals.
2063fac4ef1SEric Liu     else
2074fac50e7SEric Liu       Param.Cost = S == "" ? 6 : 2;
2083fac4ef1SEric Liu     auto Path = scopeToPath(S);
2093fac4ef1SEric Liu     // The global namespace is not 'near' its children.
2103fac4ef1SEric Liu     Param.MaxUpTraversals = std::max(Path.second - 1, 0);
2113fac4ef1SEric Liu     Sources[Path.first] = std::move(Param);
2123fac4ef1SEric Liu   }
2133a415c20SHaojian Wu   return FileDistance(std::move(Sources), Opts);
2143fac4ef1SEric Liu }
2153fac4ef1SEric Liu 
ScopeDistance(llvm::ArrayRef<std::string> QueryScopes)216f2001aa7SIlya Biryukov ScopeDistance::ScopeDistance(llvm::ArrayRef<std::string> QueryScopes)
2173fac4ef1SEric Liu     : Distance(createScopeFileDistance(QueryScopes)) {}
2183fac4ef1SEric Liu 
distance(llvm::StringRef SymbolScope)219f2001aa7SIlya Biryukov unsigned ScopeDistance::distance(llvm::StringRef SymbolScope) {
2203fac4ef1SEric Liu   return Distance.distance(scopeToPath(SymbolScope).first);
2213fac4ef1SEric Liu }
2223fac4ef1SEric Liu 
2233f0243fdSSam McCall } // namespace clangd
2243f0243fdSSam McCall } // namespace clang
225