xref: /llvm-project/clang-tools-extra/clangd/support/DirectiveTree.cpp (revision f59b600c21add076d6a876f29f94990b24b8e321)
1 //===--- DirectiveTree.cpp - Find and strip preprocessor directives -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "DirectiveTree.h"
10 #include "clang/Basic/IdentifierTable.h"
11 #include "clang/Basic/TokenKinds.h"
12 #include "llvm/Support/FormatVariadic.h"
13 #include <optional>
14 #include <variant>
15 
16 namespace clang {
17 namespace clangd {
18 namespace {
19 
20 class DirectiveParser {
21 public:
22   explicit DirectiveParser(const TokenStream &Code)
23       : Code(Code), Tok(&Code.front()) {}
24   void parse(DirectiveTree *Result) { parse(Result, /*TopLevel=*/true); }
25 
26 private:
27   // Roles that a directive might take within a conditional block.
28   enum class Cond { None, If, Else, End };
29   static Cond classifyDirective(tok::PPKeywordKind K) {
30     switch (K) {
31     case clang::tok::pp_if:
32     case clang::tok::pp_ifdef:
33     case clang::tok::pp_ifndef:
34       return Cond::If;
35     case clang::tok::pp_elif:
36     case clang::tok::pp_elifdef:
37     case clang::tok::pp_elifndef:
38     case clang::tok::pp_else:
39       return Cond::Else;
40     case clang::tok::pp_endif:
41       return Cond::End;
42     default:
43       return Cond::None;
44     }
45   }
46 
47   // Parses tokens starting at Tok into Tree.
48   // If we reach an End or Else directive that ends Tree, returns it.
49   // If TopLevel is true, then we do not expect End and always return
50   // std::nullopt.
51   std::optional<DirectiveTree::Directive> parse(DirectiveTree *Tree,
52                                                 bool TopLevel) {
53     auto StartsDirective =
54         [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
55           if (Tok->flag(LexFlags::StartsPPLine)) {
56             // If we considered a comment at the start of a PP-line, it doesn't
57             // start a directive but the directive can still start after it.
58             if (Tok->Kind == tok::comment)
59               AllowDirectiveAt = Tok + 1;
60             return Tok->Kind == tok::hash;
61           }
62           return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
63         };
64     // Each iteration adds one chunk (or returns, if we see #endif).
65     while (Tok->Kind != tok::eof) {
66       // If there's no directive here, we have a code chunk.
67       if (!StartsDirective()) {
68         const Token *Start = Tok;
69         do
70           ++Tok;
71         while (Tok->Kind != tok::eof && !StartsDirective());
72         Tree->Chunks.push_back(DirectiveTree::Code{
73             Token::Range{Code.index(*Start), Code.index(*Tok)}});
74         continue;
75       }
76 
77       // We have some kind of directive.
78       DirectiveTree::Directive Directive;
79       parseDirective(&Directive);
80       Cond Kind = classifyDirective(Directive.Kind);
81       if (Kind == Cond::If) {
82         // #if or similar, starting a nested conditional block.
83         DirectiveTree::Conditional Conditional;
84         Conditional.Branches.emplace_back();
85         Conditional.Branches.back().first = std::move(Directive);
86         parseConditional(&Conditional);
87         Tree->Chunks.push_back(std::move(Conditional));
88       } else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
89         // #endif or similar, ending this PStructure scope.
90         // (#endif is unexpected at the top level, treat as simple directive).
91         return std::move(Directive);
92       } else {
93         // #define or similar, a simple directive at the current scope.
94         Tree->Chunks.push_back(std::move(Directive));
95       }
96     }
97     return std::nullopt;
98   }
99 
100   // Parse the rest of a conditional section, after seeing the If directive.
101   // Returns after consuming the End directive.
102   void parseConditional(DirectiveTree::Conditional *C) {
103     assert(C->Branches.size() == 1 &&
104            C->Branches.front().second.Chunks.empty() &&
105            "Should be ready to parse first branch body");
106     while (Tok->Kind != tok::eof) {
107       auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
108       if (!Terminator) {
109         assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
110         C->End.Tokens = Token::Range::emptyAt(Code.index(*Tok));
111         return;
112       }
113       if (classifyDirective(Terminator->Kind) == Cond::End) {
114         C->End = std::move(*Terminator);
115         return;
116       }
117       assert(classifyDirective(Terminator->Kind) == Cond::Else &&
118              "ended branch unexpectedly");
119       C->Branches.emplace_back();
120       C->Branches.back().first = std::move(*Terminator);
121     }
122   }
123 
124   // Parse a directive. Tok is the hash.
125   void parseDirective(DirectiveTree::Directive *D) {
126     assert(Tok->Kind == tok::hash);
127 
128     // Directive spans from the hash until the end of line or file.
129     const Token *Begin = Tok++;
130     while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
131       ++Tok;
132     ArrayRef<Token> Tokens{Begin, Tok};
133     D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())};
134 
135     // Directive name is the first non-comment token after the hash.
136     Tokens = Tokens.drop_front().drop_while(
137         [](const Token &T) { return T.Kind == tok::comment; });
138     if (!Tokens.empty())
139       D->Kind = PPKeywords.get(Tokens.front().text()).getPPKeywordID();
140   }
141 
142   const TokenStream &Code;
143   const Token *Tok;
144   clang::IdentifierTable PPKeywords;
145 };
146 
147 struct Dumper {
148   llvm::raw_ostream &OS;
149   unsigned Indent = 0;
150 
151   Dumper(llvm::raw_ostream& OS) : OS(OS) {}
152   void operator()(const DirectiveTree& Tree) {
153     for (const auto& Chunk : Tree.Chunks)
154       std::visit(*this, Chunk);
155   }
156   void operator()(const DirectiveTree::Conditional &Conditional) {
157     for (unsigned I = 0; I < Conditional.Branches.size(); ++I) {
158       const auto &Branch = Conditional.Branches[I];
159       (*this)(Branch.first, Conditional.Taken == I);
160       Indent += 2;
161       (*this)(Branch.second);
162       Indent -= 2;
163     }
164     (*this)(Conditional.End);
165   }
166   void operator()(const DirectiveTree::Directive &Directive,
167                   bool Taken = false) {
168     OS.indent(Indent) << llvm::formatv(
169         "#{0} ({1} tokens){2}\n", tok::getPPKeywordSpelling(Directive.Kind),
170         Directive.Tokens.size(), Taken ? " TAKEN" : "");
171   }
172   void operator()(const DirectiveTree::Code &Code) {
173     OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n",
174                                        Code.Tokens.size());
175   }
176 };
177 } // namespace
178 
179 DirectiveTree DirectiveTree::parse(const TokenStream &Code) {
180   DirectiveTree Result;
181   DirectiveParser(Code).parse(&Result);
182   return Result;
183 }
184 
185 // Define operator<< in terms of dump() functions above.
186 #define OSTREAM_DUMP(Type)                                                     \
187   llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) {        \
188     Dumper{OS}(T);                                                         \
189     return OS;                                                                 \
190   }
191 OSTREAM_DUMP(DirectiveTree)
192 OSTREAM_DUMP(DirectiveTree::Directive)
193 OSTREAM_DUMP(DirectiveTree::Conditional)
194 OSTREAM_DUMP(DirectiveTree::Code)
195 #undef OSTREAM_DUMP
196 
197 namespace {
198 // Makes choices about conditional branches.
199 //
200 // Generally it tries to maximize the amount of useful code we see.
201 //
202 // Caveat: each conditional is evaluated independently. Consider this code:
203 //   #ifdef WINDOWS
204 //     bool isWindows = true;
205 //   #endif
206 //   #ifndef WINDOWS
207 //     bool isWindows = false;
208 //   #endif
209 // We take both branches and define isWindows twice. We could track more state
210 // in order to produce a consistent view, but this is complex.
211 class BranchChooser {
212 public:
213   BranchChooser(const TokenStream &Code) : Code(Code) {}
214 
215   // Describes code seen by making particular branch choices. Higher is better.
216   struct Score {
217     int Tokens = 0; // excluding comments and directives
218     int Directives = 0;
219     int Errors = 0; // #error directives
220 
221     bool operator>(const Score &Other) const {
222       // Seeing errors is bad, other things are good.
223       return std::make_tuple(-Errors, Tokens, Directives) >
224              std::make_tuple(-Other.Errors, Other.Tokens, Other.Directives);
225     }
226 
227     Score &operator+=(const Score &Other) {
228       Tokens += Other.Tokens;
229       Directives += Other.Directives;
230       Errors += Other.Errors;
231       return *this;
232     }
233   };
234 
235   Score operator()(DirectiveTree::Code &C) {
236     Score S;
237     for (const Token &T : Code.tokens(C.Tokens))
238       if (T.Kind != tok::comment)
239         ++S.Tokens;
240     return S;
241   }
242 
243   Score operator()(DirectiveTree::Directive &D) {
244     Score S;
245     S.Directives = 1;
246     S.Errors = D.Kind == tok::pp_error;
247     return S;
248   }
249 
250   Score operator()(DirectiveTree::Conditional &C) {
251     Score Best;
252     bool MayTakeTrivial = true;
253     bool TookTrivial = false;
254 
255     for (unsigned I = 0; I < C.Branches.size(); ++I) {
256       // Walk the branch to make its nested choices in any case.
257       Score BranchScore = walk(C.Branches[I].second);
258       // If we already took an #if 1, don't consider any other branches.
259       if (TookTrivial)
260         continue;
261       // Is this a trivial #if 0 or #if 1?
262       if (auto TriviallyTaken = isTakenWhenReached(C.Branches[I].first)) {
263         if (!*TriviallyTaken)
264           continue; // Don't consider #if 0 even if it scores well.
265         if (MayTakeTrivial)
266           TookTrivial = true;
267       } else {
268         // After a nontrivial condition, #elif 1 isn't guaranteed taken.
269         MayTakeTrivial = false;
270       }
271       // Is this the best branch so far? (Including if it's #if 1).
272       if (TookTrivial || !C.Taken || BranchScore > Best) {
273         Best = BranchScore;
274         C.Taken = I;
275       }
276     }
277     return Best;
278   }
279   Score walk(DirectiveTree &M) {
280     Score S;
281     for (auto &C : M.Chunks)
282       S += std::visit(*this, C);
283     return S;
284   }
285 
286 private:
287   // Return true if the directive starts an always-taken conditional branch,
288   // false if the branch is never taken, and std::nullopt otherwise.
289   std::optional<bool> isTakenWhenReached(const DirectiveTree::Directive &Dir) {
290     switch (Dir.Kind) {
291     case clang::tok::pp_if:
292     case clang::tok::pp_elif:
293       break; // handled below
294     case clang::tok::pp_else:
295       return true;
296     default: // #ifdef etc
297       return std::nullopt;
298     }
299 
300     const auto &Tokens = Code.tokens(Dir.Tokens);
301     assert(!Tokens.empty() && Tokens.front().Kind == tok::hash);
302     const Token &Name = Tokens.front().nextNC();
303     const Token &Value = Name.nextNC();
304     // Does the condition consist of exactly one token?
305     if (&Value >= Tokens.end() || &Value.nextNC() < Tokens.end())
306       return std::nullopt;
307     return llvm::StringSwitch<std::optional<bool>>(Value.text())
308         .Cases("true", "1", true)
309         .Cases("false", "0", false)
310         .Default(std::nullopt);
311   }
312 
313   const TokenStream &Code;
314 };
315 
316 } // namespace
317 
318 void chooseConditionalBranches(DirectiveTree &Tree, const TokenStream &Code) {
319   BranchChooser{Code}.walk(Tree);
320 }
321 
322 namespace {
323 class Preprocessor {
324   const TokenStream &In;
325   TokenStream &Out;
326 
327 public:
328   Preprocessor(const TokenStream &In, TokenStream &Out) : In(In), Out(Out) {}
329   ~Preprocessor() { Out.finalize(); }
330 
331   Preprocessor(const Preprocessor &other) = delete;
332   Preprocessor &operator=(const Preprocessor &other) = delete;
333 
334   void walk(const DirectiveTree &T) {
335     for (const auto &C : T.Chunks)
336       std::visit(*this, C);
337   }
338 
339   void operator()(const DirectiveTree::Code &C) {
340     for (const auto &Tok : In.tokens(C.Tokens))
341       Out.push(Tok);
342   }
343 
344   void operator()(const DirectiveTree::Directive &) {}
345 
346   void operator()(const DirectiveTree::Conditional &C) {
347     if (C.Taken)
348       walk(C.Branches[*C.Taken].second);
349   }
350 };
351 } // namespace
352 
353 TokenStream DirectiveTree::stripDirectives(const TokenStream &In) const {
354   TokenStream Out;
355   Preprocessor(In, Out).walk(*this);
356   return Out;
357 }
358 
359 } // namespace clangd
360 } // namespace clang
361