xref: /llvm-project/clang-tools-extra/clangd/FileDistance.cpp (revision d5953e3e3092f7142a07aa012fc9665ede09e53b)
1 //===--- FileDistance.cpp - File contents container -------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The FileDistance structure allows calculating the minimum distance to paths
10 // in a single tree.
11 // We simply walk up the path's ancestors until we find a node whose cost is
12 // known, and add the cost of walking back down. Initialization ensures this
13 // gives the correct path to the roots.
14 // We cache the results, so that the runtime is O(|A|), where A is the set of
15 // all distinct ancestors of visited paths.
16 //
17 // Example after initialization with /=2, /bar=0, DownCost = 1:
18 //  / = 2
19 //    /bar = 0
20 //
21 // After querying /foo/bar and /bar/foo:
22 //  / = 2
23 //    /bar = 0
24 //      /bar/foo = 1
25 //    /foo = 3
26 //      /foo/bar = 4
27 //
28 // URIDistance creates FileDistance lazily for each URI scheme encountered. In
29 // practice this is a small constant factor.
30 //
31 //===-------------------------------------------------------------------------//
32 
33 #include "FileDistance.h"
34 #include "URI.h"
35 #include "support/Logger.h"
36 #include "llvm/ADT/STLExtras.h"
37 #include "llvm/ADT/StringExtras.h"
38 #include "llvm/Support/Path.h"
39 #include <queue>
40 
41 namespace clang {
42 namespace clangd {
43 
44 // Convert a path into the canonical form.
45 // Canonical form is either "/", or "/segment" * N:
46 //   C:\foo\bar --> /c:/foo/bar
47 //   /foo/      --> /foo
48 //   a/b/c      --> /a/b/c
canonicalize(llvm::StringRef Path)49 static llvm::SmallString<128> canonicalize(llvm::StringRef Path) {
50   llvm::SmallString<128> Result = Path.rtrim('/');
51   native(Result, llvm::sys::path::Style::posix);
52   if (Result.empty() || Result.front() != '/')
53     Result.insert(Result.begin(), '/');
54   return Result;
55 }
56 
57 constexpr const unsigned FileDistance::Unreachable;
58 const llvm::hash_code FileDistance::RootHash =
59     llvm::hash_value(llvm::StringRef("/"));
60 
FileDistance(llvm::StringMap<SourceParams> Sources,const FileDistanceOptions & Opts)61 FileDistance::FileDistance(llvm::StringMap<SourceParams> Sources,
62                            const FileDistanceOptions &Opts)
63     : Opts(Opts) {
64   llvm::DenseMap<llvm::hash_code, llvm::SmallVector<llvm::hash_code>> DownEdges;
65   // Compute the best distance following only up edges.
66   // Keep track of down edges, in case we can use them to improve on this.
67   for (const auto &S : Sources) {
68     auto Canonical = canonicalize(S.getKey());
69     dlog("Source {0} = {1}, MaxUp = {2}", Canonical, S.second.Cost,
70          S.second.MaxUpTraversals);
71     // Walk up to ancestors of this source, assigning cost.
72     llvm::StringRef Rest = Canonical;
73     llvm::hash_code Hash = llvm::hash_value(Rest);
74     for (unsigned I = 0; !Rest.empty(); ++I) {
75       Rest = parent_path(Rest, llvm::sys::path::Style::posix);
76       auto NextHash = llvm::hash_value(Rest);
77       auto &Down = DownEdges[NextHash];
78       if (!llvm::is_contained(Down, Hash))
79         Down.push_back(Hash);
80       // We can't just break after MaxUpTraversals, must still set DownEdges.
81       if (I > S.getValue().MaxUpTraversals) {
82         if (Cache.contains(Hash))
83           break;
84       } else {
85         unsigned Cost = S.getValue().Cost + I * Opts.UpCost;
86         auto R = Cache.try_emplace(Hash, Cost);
87         if (!R.second) {
88           if (Cost < R.first->second) {
89             R.first->second = Cost;
90           } else {
91             // If we're not the best way to get to this path, stop assigning.
92             break;
93           }
94         }
95       }
96       Hash = NextHash;
97     }
98   }
99   // Now propagate scores parent -> child if that's an improvement.
100   // BFS ensures we propagate down chains (must visit parents before children).
101   std::queue<llvm::hash_code> Next;
102   for (auto Child : DownEdges.lookup(llvm::hash_value(llvm::StringRef(""))))
103     Next.push(Child);
104   while (!Next.empty()) {
105     auto Parent = Next.front();
106     Next.pop();
107     auto ParentCost = Cache.lookup(Parent);
108     for (auto Child : DownEdges.lookup(Parent)) {
109       if (Parent != RootHash || Opts.AllowDownTraversalFromRoot) {
110         auto &ChildCost =
111             Cache.try_emplace(Child, Unreachable).first->getSecond();
112         if (ParentCost + Opts.DownCost < ChildCost)
113           ChildCost = ParentCost + Opts.DownCost;
114       }
115       Next.push(Child);
116     }
117   }
118 }
119 
distance(llvm::StringRef Path)120 unsigned FileDistance::distance(llvm::StringRef Path) {
121   auto Canonical = canonicalize(Path);
122   unsigned Cost = Unreachable;
123   llvm::SmallVector<llvm::hash_code> Ancestors;
124   // Walk up ancestors until we find a path we know the distance for.
125   for (llvm::StringRef Rest = Canonical; !Rest.empty();
126        Rest = parent_path(Rest, llvm::sys::path::Style::posix)) {
127     auto Hash = llvm::hash_value(Rest);
128     if (Hash == RootHash && !Ancestors.empty() &&
129         !Opts.AllowDownTraversalFromRoot) {
130       Cost = Unreachable;
131       break;
132     }
133     auto It = Cache.find(Hash);
134     if (It != Cache.end()) {
135       Cost = It->second;
136       break;
137     }
138     Ancestors.push_back(Hash);
139   }
140   // Now we know the costs for (known node, queried node].
141   // Fill these in, walking down the directory tree.
142   for (llvm::hash_code Hash : llvm::reverse(Ancestors)) {
143     if (Cost != Unreachable)
144       Cost += Opts.DownCost;
145     Cache.try_emplace(Hash, Cost);
146   }
147   dlog("distance({0} = {1})", Path, Cost);
148   return Cost;
149 }
150 
distance(llvm::StringRef URI)151 unsigned URIDistance::distance(llvm::StringRef URI) {
152   auto R = Cache.try_emplace(llvm::hash_value(URI), FileDistance::Unreachable);
153   if (!R.second)
154     return R.first->getSecond();
155   if (auto U = clangd::URI::parse(URI)) {
156     dlog("distance({0} = {1})", URI, U->body());
157     R.first->second = forScheme(U->scheme()).distance(U->body());
158   } else {
159     log("URIDistance::distance() of unparseable {0}: {1}", URI, U.takeError());
160   }
161   return R.first->second;
162 }
163 
forScheme(llvm::StringRef Scheme)164 FileDistance &URIDistance::forScheme(llvm::StringRef Scheme) {
165   auto &Delegate = ByScheme[Scheme];
166   if (!Delegate) {
167     llvm::StringMap<SourceParams> SchemeSources;
168     for (const auto &Source : Sources) {
169       if (auto U = clangd::URI::create(Source.getKey(), Scheme))
170         SchemeSources.try_emplace(U->body(), Source.getValue());
171       else
172         llvm::consumeError(U.takeError());
173     }
174     dlog("FileDistance for scheme {0}: {1}/{2} sources", Scheme,
175          SchemeSources.size(), Sources.size());
176     Delegate.reset(new FileDistance(std::move(SchemeSources), Opts));
177   }
178   return *Delegate;
179 }
180 
scopeToPath(llvm::StringRef Scope)181 static std::pair<std::string, int> scopeToPath(llvm::StringRef Scope) {
182   llvm::SmallVector<llvm::StringRef> Split;
183   Scope.split(Split, "::", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
184   return {"/" + llvm::join(Split, "/"), Split.size()};
185 }
186 
187 static FileDistance
createScopeFileDistance(llvm::ArrayRef<std::string> QueryScopes)188 createScopeFileDistance(llvm::ArrayRef<std::string> QueryScopes) {
189   FileDistanceOptions Opts;
190   Opts.UpCost = 2;
191   Opts.DownCost = 4;
192   Opts.AllowDownTraversalFromRoot = false;
193 
194   llvm::StringMap<SourceParams> Sources;
195   llvm::StringRef Preferred =
196       QueryScopes.empty() ? "" : QueryScopes.front().c_str();
197   for (llvm::StringRef S : QueryScopes) {
198     SourceParams Param;
199     // Penalize the global scope even it's preferred, as all projects can define
200     // symbols in it, and there is pattern where using-namespace is used in
201     // place of enclosing namespaces (e.g. in implementation files).
202     if (S == Preferred)
203       Param.Cost = S == "" ? 4 : 0;
204     else if (Preferred.starts_with(S) && !S.empty())
205       continue; // just rely on up-traversals.
206     else
207       Param.Cost = S == "" ? 6 : 2;
208     auto Path = scopeToPath(S);
209     // The global namespace is not 'near' its children.
210     Param.MaxUpTraversals = std::max(Path.second - 1, 0);
211     Sources[Path.first] = std::move(Param);
212   }
213   return FileDistance(std::move(Sources), Opts);
214 }
215 
ScopeDistance(llvm::ArrayRef<std::string> QueryScopes)216 ScopeDistance::ScopeDistance(llvm::ArrayRef<std::string> QueryScopes)
217     : Distance(createScopeFileDistance(QueryScopes)) {}
218 
distance(llvm::StringRef SymbolScope)219 unsigned ScopeDistance::distance(llvm::StringRef SymbolScope) {
220   return Distance.distance(scopeToPath(SymbolScope).first);
221 }
222 
223 } // namespace clangd
224 } // namespace clang
225