xref: /llvm-project/clang/lib/Tooling/Transformer/SourceCode.cpp (revision 86565c13094236e022d2238f5653641aaca7d31f)
1 //===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //  This file provides functions that simplify extraction of source code.
10 //
11 //===----------------------------------------------------------------------===//
12 #include "clang/Tooling/Transformer/SourceCode.h"
13 #include "clang/AST/ASTContext.h"
14 #include "clang/AST/Attr.h"
15 #include "clang/AST/Comment.h"
16 #include "clang/AST/Decl.h"
17 #include "clang/AST/DeclCXX.h"
18 #include "clang/AST/DeclTemplate.h"
19 #include "clang/AST/Expr.h"
20 #include "clang/Basic/SourceManager.h"
21 #include "clang/Lex/Lexer.h"
22 #include "llvm/Support/Errc.h"
23 
24 using namespace clang;
25 
26 using llvm::errc;
27 using llvm::StringError;
28 
29 StringRef clang::tooling::getText(CharSourceRange Range,
30                                   const ASTContext &Context) {
31   return Lexer::getSourceText(Range, Context.getSourceManager(),
32                               Context.getLangOpts());
33 }
34 
35 CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range,
36                                                  tok::TokenKind Next,
37                                                  ASTContext &Context) {
38   Optional<Token> Tok = Lexer::findNextToken(
39       Range.getEnd(), Context.getSourceManager(), Context.getLangOpts());
40   if (!Tok || !Tok->is(Next))
41     return Range;
42   return CharSourceRange::getTokenRange(Range.getBegin(), Tok->getLocation());
43 }
44 
45 llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range,
46                                               const SourceManager &SM) {
47   if (Range.isInvalid())
48     return llvm::make_error<StringError>(errc::invalid_argument,
49                                          "Invalid range");
50 
51   if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
52     return llvm::make_error<StringError>(
53         errc::invalid_argument, "Range starts or ends in a macro expansion");
54 
55   if (SM.isInSystemHeader(Range.getBegin()) ||
56       SM.isInSystemHeader(Range.getEnd()))
57     return llvm::make_error<StringError>(errc::invalid_argument,
58                                          "Range is in system header");
59 
60   std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
61   std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
62   if (BeginInfo.first != EndInfo.first)
63     return llvm::make_error<StringError>(
64         errc::invalid_argument, "Range begins and ends in different files");
65 
66   if (BeginInfo.second > EndInfo.second)
67     return llvm::make_error<StringError>(
68         errc::invalid_argument, "Range's begin is past its end");
69 
70   return llvm::Error::success();
71 }
72 
73 llvm::Optional<CharSourceRange>
74 clang::tooling::getRangeForEdit(const CharSourceRange &EditRange,
75                                 const SourceManager &SM,
76                                 const LangOptions &LangOpts) {
77   // FIXME: makeFileCharRange() has the disadvantage of stripping off "identity"
78   // macros. For example, if we're looking to rewrite the int literal 3 to 6,
79   // and we have the following definition:
80   //    #define DO_NOTHING(x) x
81   // then
82   //    foo(DO_NOTHING(3))
83   // will be rewritten to
84   //    foo(6)
85   // rather than the arguably better
86   //    foo(DO_NOTHING(6))
87   // Decide whether the current behavior is desirable and modify if not.
88   CharSourceRange Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);
89   bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));
90   if (IsInvalid)
91     return llvm::None;
92   return Range;
93 
94 }
95 
96 static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
97   return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);
98 }
99 
100 static bool contains(const std::set<tok::TokenKind> &Terminators,
101                      const Token &Tok) {
102   return Terminators.count(Tok.getKind()) > 0;
103 }
104 
// Returns the exclusive, *file* end location of the entity whose last token is
// at location 'EntityLast'. That is, it returns the location one past the last
// relevant character.
//
// Associated tokens include comments, horizontal whitespace and 'Terminators'
// -- optional tokens, which, if any are found, will be included; if
// 'Terminators' is empty, we will not include any extra tokens beyond comments
// and horizontal whitespace.
static SourceLocation
getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
                const std::set<tok::TokenKind> &Terminators,
                const LangOptions &LangOpts) {
  assert(EntityLast.isValid() && "Invalid end location found.");

  // We remember the last location of a non-horizontal-whitespace token we have
  // lexed; this is the location up to which we will want to delete.
  // FIXME: Support using the spelling loc here for cases where we want to
  // analyze the macro text.

  CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
  // FIXME: Should check isTokenRange(), for the (rare) case that
  // `ExpansionRange` is a character range.
  //
  // Build a raw lexer over the file containing the expansion end, positioned
  // at that end location, so we can scan the text that follows the entity.
  std::unique_ptr<Lexer> Lexer = [&]() {
    bool Invalid = false;
    auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
    llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
    assert(!Invalid && "Cannot get file/offset");
    return std::make_unique<clang::Lexer>(
        SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
        File.data() + FileOffset.second, File.end());
  }();

  // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
  Lexer->SetKeepWhitespaceMode(true);

  // Generally, the code we want to include looks like this ([] are optional),
  // If Terminators is empty:
  //   [ <comment> ] [ <newline> ]
  // Otherwise:
  //   ... <terminator> [ <comment> ] [ <newline> ]

  Token Tok;
  bool Terminated = false;

  // First, lex to the current token (which is the last token of the range that
  // is definitely associated with the decl). Then, we process the first token
  // separately from the rest based on conditions that hold specifically for
  // that first token.
  //
  // We do not search for a terminator if none is required or we've already
  // encountered it. Otherwise, if the original `EntityLast` location was in a
  // macro expansion, we don't have visibility into the text, so we assume we've
  // already terminated. However, we note this assumption with
  // `TerminatedByMacro`, because we'll want to handle it somewhat differently
  // for the terminators semicolon and comma. These terminators can be safely
  // associated with the entity when they appear after the macro -- extra
  // semicolons have no effect on the program and a well-formed program won't
  // have multiple commas in a row, so we're guaranteed that there is only one.
  //
  // FIXME: This handling of macros is more conservative than necessary. When
  // the end of the expansion coincides with the end of the node, we can still
  // safely analyze the code. But, it is more complicated, because we need to
  // start by lexing the spelling loc for the first token and then switch to the
  // expansion loc.
  bool TerminatedByMacro = false;
  Lexer->LexFromRawLexer(Tok);
  if (Terminators.empty() || contains(Terminators, Tok))
    Terminated = true;
  else if (EntityLast.isMacroID()) {
    Terminated = true;
    TerminatedByMacro = true;
  }

  // We save the most recent candidate for the exclusive end location.
  SourceLocation End = Tok.getEndLoc();

  // Phase 1: scan forward until a terminator is found. Structural tokens that
  // cannot belong to the entity (braces, comma, EOF) end the search early,
  // returning the best candidate found so far.
  while (!Terminated) {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::eof:
    // Unexpected separators.
    case tok::l_brace:
    case tok::r_brace:
    case tok::comma:
      return End;
    // Whitespace pseudo-tokens.
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // Include at least until the end of the line.
        End = Tok.getEndLoc();
      break;
    default:
      if (contains(Terminators, Tok))
        Terminated = true;
      End = Tok.getEndLoc();
      break;
    }
  }

  // Phase 2: after the terminator, absorb trailing comments and horizontal
  // whitespace, stopping at (and including) the first newline, or stopping
  // just before the first unrelated token.
  do {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // We're done, but include this newline.
        return Tok.getEndLoc();
      break;
    case tok::comment:
      // Include any comments we find on the way.
      End = Tok.getEndLoc();
      break;
    case tok::semi:
    case tok::comma:
      if (TerminatedByMacro && contains(Terminators, Tok)) {
        End = Tok.getEndLoc();
        // We've found a real terminator.
        TerminatedByMacro = false;
        break;
      }
      // Found an unrelated token; stop and don't include it.
      return End;
    default:
      // Found an unrelated token; stop and don't include it.
      return End;
    }
  } while (true);
}
236 
237 // Returns the expected terminator tokens for the given declaration.
238 //
239 // If we do not know the correct terminator token, returns an empty set.
240 //
241 // There are cases where we have more than one possible terminator (for example,
242 // we find either a comma or a semicolon after a VarDecl).
243 static std::set<tok::TokenKind> getTerminators(const Decl &D) {
244   if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
245     return {tok::semi};
246 
247   if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
248     return {tok::r_brace, tok::semi};
249 
250   if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
251     return {tok::comma, tok::semi};
252 
253   return {};
254 }
255 
256 // Starting from `Loc`, skips whitespace up to, and including, a single
257 // newline. Returns the (exclusive) end of any skipped whitespace (that is, the
258 // location immediately after the whitespace).
259 static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,
260                                                SourceLocation Loc,
261                                                const LangOptions &LangOpts) {
262   const char *LocChars = SM.getCharacterData(Loc);
263   int i = 0;
264   while (isHorizontalWhitespace(LocChars[i]))
265     ++i;
266   if (isVerticalWhitespace(LocChars[i]))
267     ++i;
268   return Loc.getLocWithOffset(i);
269 }
270 
271 // Is `Loc` separated from any following decl by something meaningful (e.g. an
272 // empty line, a comment), ignoring horizontal whitespace?  Since this is a
273 // heuristic, we return false when in doubt.  `Loc` cannot be the first location
274 // in the file.
275 static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,
276                                  const LangOptions &LangOpts) {
277   // If the preceding character is a newline, we'll check for an empty line as a
278   // separator. However, we can't identify an empty line using tokens, so we
279   // analyse the characters. If we try to use tokens, we'll just end up with a
280   // whitespace token, whose characters we'd have to analyse anyhow.
281   bool Invalid = false;
282   const char *LocChars =
283       SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
284   assert(!Invalid &&
285          "Loc must be a valid character and not the first of the source file.");
286   if (isVerticalWhitespace(LocChars[0])) {
287     for (int i = 1; isWhitespace(LocChars[i]); ++i)
288       if (isVerticalWhitespace(LocChars[i]))
289         return true;
290   }
291   // We didn't find an empty line, so lex the next token, skipping past any
292   // whitespace we just scanned.
293   Token Tok;
294   bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
295                                    /*IgnoreWhiteSpace=*/true);
296   if (Failed)
297     // Any text that confuses the lexer seems fair to consider a separation.
298     return true;
299 
300   switch (Tok.getKind()) {
301   case tok::comment:
302   case tok::l_brace:
303   case tok::r_brace:
304   case tok::eof:
305     return true;
306   default:
307     return false;
308   }
309 }
310 
// Returns the source range "associated" with `Decl`: the decl's own range
// expanded to cover an enclosing template<> header, the trailing terminator
// plus end-of-line comments/newline, leading doc comments (heuristically),
// and leading attributes -- mapped to a file character range for editing.
CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
                                            ASTContext &Context) {
  const SourceManager &SM = Context.getSourceManager();
  const LangOptions &LangOpts = Context.getLangOpts();
  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());

  // First, expand to the start of the template<> declaration if necessary.
  if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
    if (const auto *T = Record->getDescribedClassTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
    if (const auto *T = F->getDescribedFunctionTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  }

  // Next, expand the end location past trailing comments to include a potential
  // newline at the end of the decl's line.
  Range.setEnd(
      getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
  // The end is now a character location, not a token location.
  Range.setTokenRange(false);

  // Expand to include preceding associated comments. We ignore any comments
  // that are not preceding the decl, since we've already skipped trailing
  // comments with getEntityEndLoc.
  if (const RawComment *Comment =
          Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
    // Only include a preceding comment if:
    // * it is *not* separate from the declaration (not including any newline
    //   that immediately follows the comment),
    // * the decl *is* separate from any following entity (so, there are no
    //   other entities the comment could refer to), and
    // * it is not a IfThisThenThat lint check.
    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
                                     Range.getBegin()) &&
        !atOrBeforeSeparation(
            SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
            LangOpts) &&
        atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
      const StringRef CommentText = Comment->getRawText(SM);
      // LINT.IfChange / LINT.ThenChange markers refer to other files, not to
      // this decl, so they are never pulled into the range.
      if (!CommentText.contains("LINT.IfChange") &&
          !CommentText.contains("LINT.ThenChange"))
        Range.setBegin(Comment->getBeginLoc());
    }
  // Add leading attributes.
  for (auto *Attr : Decl.attrs()) {
    // Only attributes that precede the current range start can extend it.
    if (Attr->getLocation().isInvalid() ||
        !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
      continue;
    Range.setBegin(Attr->getLocation());

    // Extend to the left '[[' or '__attribute((' if we saw the attribute,
    // unless it is not a valid location.
    bool Invalid;
    StringRef Source =
        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
    if (Invalid)
      continue;
    // Text of the buffer up to (but not including) the attribute itself.
    llvm::StringRef BeforeAttr =
        Source.substr(0, SM.getFileOffset(Range.getBegin()));
    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();

    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
      // Handle whitespace between attribute prefix and attribute value.
      if (BeforeAttrStripped.endswith(Prefix)) {
        // Move start to start position of prefix, which is
        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
        // positions to the left.
        Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
            -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
        break;
        // If we didn't see '[[' or '__attribute' it's probably coming from a
        // macro expansion which is already handled by makeFileCharRange(),
        // below.
      }
    }
  }

  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
  // Range.getBegin() may be inside an expansion.
  return Lexer::makeFileCharRange(Range, SM, LangOpts);
}
394