xref: /llvm-project/clang/lib/Tooling/Transformer/SourceCode.cpp (revision 38b4516de8a4a791d17085d37f95e3cc15c359f9)
//===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  This file provides functions that simplify extraction of source code.
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Transformer/SourceCode.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Attr.h"
#include "clang/AST/Comment.h"
#include "clang/AST/Decl.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/DeclTemplate.h"
#include "clang/AST/Expr.h"
#include "clang/Lex/Lexer.h"
#include "llvm/Support/Errc.h"

using namespace clang;

using llvm::errc;
using llvm::StringError;

StringRef clang::tooling::getText(CharSourceRange Range,
                                  const ASTContext &Context) {
  return Lexer::getSourceText(Range, Context.getSourceManager(),
                              Context.getLangOpts());
}
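// Usage sketch (illustrative, not part of this file's API surface): given an
// expression `E` and an `ASTContext` `Ctx` obtained elsewhere (e.g. in a
// matcher callback), the spelled source of `E` can be fetched with:
//
//   StringRef Text = clang::tooling::getText(
//       CharSourceRange::getTokenRange(E->getSourceRange()), Ctx);
//
// Lexer::getSourceText returns an empty string if the range cannot be
// resolved to contiguous file text.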

CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range,
                                                 tok::TokenKind Next,
                                                 ASTContext &Context) {
  Optional<Token> Tok = Lexer::findNextToken(
      Range.getEnd(), Context.getSourceManager(), Context.getLangOpts());
  if (!Tok || !Tok->is(Next))
    return Range;
  return CharSourceRange::getTokenRange(Range.getBegin(), Tok->getLocation());
}
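// Usage sketch (illustrative; `S` and `Ctx` are assumed to come from the
// caller): extend a statement's range to also cover a trailing semicolon, if
// one immediately follows the statement.
//
//   CharSourceRange R = CharSourceRange::getTokenRange(S->getSourceRange());
//   R = clang::tooling::maybeExtendRange(R, tok::semi, Ctx);
//
// If the next token is not `;`, the original range is returned unchanged.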

llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range,
                                              const SourceManager &SM) {
  if (Range.isInvalid())
    return llvm::make_error<StringError>(errc::invalid_argument,
                                         "Invalid range");

  if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
    return llvm::make_error<StringError>(
        errc::invalid_argument, "Range starts or ends in a macro expansion");

  if (SM.isInSystemHeader(Range.getBegin()) ||
      SM.isInSystemHeader(Range.getEnd()))
    return llvm::make_error<StringError>(errc::invalid_argument,
                                         "Range is in system header");

  std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
  std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
  if (BeginInfo.first != EndInfo.first)
    return llvm::make_error<StringError>(
        errc::invalid_argument, "Range begins and ends in different files");

  if (BeginInfo.second > EndInfo.second)
    return llvm::make_error<StringError>(
        errc::invalid_argument, "Range's begin is past its end");

  return llvm::Error::success();
}
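// Usage sketch (illustrative): callers typically treat any returned error as
// "this range is not safely editable" and skip the edit.
//
//   if (llvm::Error Err = clang::tooling::validateEditRange(R, SM)) {
//     llvm::consumeError(std::move(Err));
//     return; // e.g. skip rewriting this node.
//   }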

llvm::Optional<CharSourceRange>
clang::tooling::getRangeForEdit(const CharSourceRange &EditRange,
                                const SourceManager &SM,
                                const LangOptions &LangOpts) {
  // FIXME: makeFileCharRange() has the disadvantage of stripping off "identity"
  // macros. For example, if we're looking to rewrite the int literal 3 to 6,
  // and we have the following definition:
  //    #define DO_NOTHING(x) x
  // then
  //    foo(DO_NOTHING(3))
  // will be rewritten to
  //    foo(6)
  // rather than the arguably better
  //    foo(DO_NOTHING(6))
  // Decide whether the current behavior is desirable and modify if not.
  CharSourceRange Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);
  bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));
  if (IsInvalid)
    return llvm::None;
  return Range;
}
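// Usage sketch (illustrative; `E` and `Ctx` come from the caller, and the
// tooling::Replacement step is just one way to consume the result): map a
// node's range to an editable file range before building a replacement.
//
//   const SourceManager &SM = Ctx.getSourceManager();
//   if (llvm::Optional<CharSourceRange> R = clang::tooling::getRangeForEdit(
//           CharSourceRange::getTokenRange(E->getSourceRange()), SM,
//           Ctx.getLangOpts()))
//     tooling::Replacement Replace(SM, *R, "newText", Ctx.getLangOpts());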

static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
  return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);
}

static bool contains(const std::set<tok::TokenKind> &Terminators,
                     const Token &Tok) {
  return Terminators.count(Tok.getKind()) > 0;
}

// Returns the exclusive, *file* end location of the entity whose last token is
// at location 'EntityLast'. That is, it returns the location one past the last
// relevant character.
//
// Associated tokens include comments, horizontal whitespace and 'Terminators'
// -- optional tokens, which, if any are found, will be included; if
// 'Terminators' is empty, we will not include any extra tokens beyond comments
// and horizontal whitespace.
static SourceLocation
getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
                const std::set<tok::TokenKind> &Terminators,
                const LangOptions &LangOpts) {
  assert(EntityLast.isValid() && "Invalid end location found.");

  // We remember the last location of a non-horizontal-whitespace token we have
  // lexed; this is the location up to which we will want to delete.
  // FIXME: Support using the spelling loc here for cases where we want to
  // analyze the macro text.

  CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
  // FIXME: Should check isTokenRange(), for the (rare) case that
  // `ExpansionRange` is a character range.
  std::unique_ptr<Lexer> Lexer = [&]() {
    bool Invalid = false;
    auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
    llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
    assert(!Invalid && "Cannot get file/offset");
    return std::make_unique<clang::Lexer>(
        SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
        File.data() + FileOffset.second, File.end());
  }();

  // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
  Lexer->SetKeepWhitespaceMode(true);

  // Generally, the code we want to include looks like this ([] are optional).
  // If Terminators is empty:
  //   [ <comment> ] [ <newline> ]
  // Otherwise:
  //   ... <terminator> [ <comment> ] [ <newline> ]
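  // For example, with Terminators = {tok::semi, tok::comma} and an entity
  // whose last token is the `1` in
  //   int x = 1;  // note
  // the returned location is just past the end of that line, so the range
  // ends up covering the `;`, the trailing comment and the newline.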

  Token Tok;
  bool Terminated = false;

  // First, lex to the current token (which is the last token of the range that
  // is definitely associated with the decl). Then, we process the first token
  // separately from the rest based on conditions that hold specifically for
  // that first token.
  //
  // We do not search for a terminator if none is required or we've already
  // encountered it. Otherwise, if the original `EntityLast` location was in a
  // macro expansion, we don't have visibility into the text, so we assume we've
  // already terminated. However, we note this assumption with
  // `TerminatedByMacro`, because we'll want to handle it somewhat differently
  // for the terminators semicolon and comma. These terminators can be safely
  // associated with the entity when they appear after the macro -- extra
  // semicolons have no effect on the program and a well-formed program won't
  // have multiple commas in a row, so we're guaranteed that there is only one.
  //
  // FIXME: This handling of macros is more conservative than necessary. When
  // the end of the expansion coincides with the end of the node, we can still
  // safely analyze the code. But, it is more complicated, because we need to
  // start by lexing the spelling loc for the first token and then switch to the
  // expansion loc.
  bool TerminatedByMacro = false;
  Lexer->LexFromRawLexer(Tok);
  if (Terminators.empty() || contains(Terminators, Tok))
    Terminated = true;
  else if (EntityLast.isMacroID()) {
    Terminated = true;
    TerminatedByMacro = true;
  }

  // We save the most recent candidate for the exclusive end location.
  SourceLocation End = Tok.getEndLoc();

  while (!Terminated) {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::eof:
    // Unexpected separators.
    case tok::l_brace:
    case tok::r_brace:
    case tok::comma:
      return End;
    // Whitespace pseudo-tokens.
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // Include at least until the end of the line.
        End = Tok.getEndLoc();
      break;
    default:
      if (contains(Terminators, Tok))
        Terminated = true;
      End = Tok.getEndLoc();
      break;
    }
  }

  do {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // We're done, but include this newline.
        return Tok.getEndLoc();
      break;
    case tok::comment:
      // Include any comments we find on the way.
      End = Tok.getEndLoc();
      break;
    case tok::semi:
    case tok::comma:
      if (TerminatedByMacro && contains(Terminators, Tok)) {
        End = Tok.getEndLoc();
        // We've found a real terminator.
        TerminatedByMacro = false;
        break;
      }
      // Found an unrelated token; stop and don't include it.
      return End;
    default:
      // Found an unrelated token; stop and don't include it.
      return End;
    }
  } while (true);
}

// Returns the expected terminator tokens for the given declaration.
//
// If we do not know the correct terminator token, returns an empty set.
//
// There are cases where we have more than one possible terminator (for example,
// we find either a comma or a semicolon after a VarDecl).
static std::set<tok::TokenKind> getTerminators(const Decl &D) {
  if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
    return {tok::semi};

  if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
    return {tok::r_brace, tok::semi};

  if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
    return {tok::comma, tok::semi};

  return {};
}

// Starting from `Loc`, skips whitespace up to, and including, a single
// newline. Returns the (exclusive) end of any skipped whitespace (that is, the
// location immediately after the whitespace).
static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,
                                               SourceLocation Loc,
                                               const LangOptions &LangOpts) {
  const char *LocChars = SM.getCharacterData(Loc);
  int i = 0;
  while (isHorizontalWhitespace(LocChars[i]))
    ++i;
  if (isVerticalWhitespace(LocChars[i]))
    ++i;
  return Loc.getLocWithOffset(i);
}

// Is `Loc` separated from any following decl by something meaningful (e.g. an
// empty line, a comment), ignoring horizontal whitespace?  Since this is a
// heuristic, we return false when in doubt.  `Loc` cannot be the first location
// in the file.
static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,
                                 const LangOptions &LangOpts) {
  // If the preceding character is a newline, we'll check for an empty line as a
  // separator. However, we can't identify an empty line using tokens, so we
  // analyse the characters. If we try to use tokens, we'll just end up with a
  // whitespace token, whose characters we'd have to analyse anyhow.
  bool Invalid = false;
  const char *LocChars =
      SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
  assert(!Invalid &&
         "Loc must be a valid character and not the first of the source file.");
  if (isVerticalWhitespace(LocChars[0])) {
    for (int i = 1; isWhitespace(LocChars[i]); ++i)
      if (isVerticalWhitespace(LocChars[i]))
        return true;
  }
  // We didn't find an empty line, so lex the next token, skipping past any
  // whitespace we just scanned.
  Token Tok;
  bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
                                   /*IgnoreWhiteSpace=*/true);
  if (Failed)
    // Any text that confuses the lexer seems fair to consider a separation.
    return true;

  switch (Tok.getKind()) {
  case tok::comment:
  case tok::l_brace:
  case tok::r_brace:
  case tok::eof:
    return true;
  default:
    return false;
  }
}

CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
                                            ASTContext &Context) {
  const SourceManager &SM = Context.getSourceManager();
  const LangOptions &LangOpts = Context.getLangOpts();
  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());

  // First, expand to the start of the template<> declaration if necessary.
  if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
    if (const auto *T = Record->getDescribedClassTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
    if (const auto *T = F->getDescribedFunctionTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  }

  // Next, expand the end location past trailing comments to include a potential
  // newline at the end of the decl's line.
  Range.setEnd(
      getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
  Range.setTokenRange(false);

  // Expand to include preceding associated comments. We ignore any comments
  // that do not precede the decl, since we've already skipped trailing
  // comments with getEntityEndLoc.
  if (const RawComment *Comment =
          Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
    // Only include a preceding comment if:
    // * it is *not* separate from the declaration (not including any newline
    //   that immediately follows the comment),
    // * the decl *is* separate from any following entity (so, there are no
    //   other entities the comment could refer to), and
    // * it is not an IfThisThenThat lint check.
    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
                                     Range.getBegin()) &&
        !atOrBeforeSeparation(
            SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
            LangOpts) &&
        atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
      const StringRef CommentText = Comment->getRawText(SM);
      if (!CommentText.contains("LINT.IfChange") &&
          !CommentText.contains("LINT.ThenChange"))
        Range.setBegin(Comment->getBeginLoc());
    }
  // Add leading attributes.
  for (auto *Attr : Decl.attrs()) {
    if (Attr->getLocation().isInvalid() ||
        !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
      continue;
    Range.setBegin(Attr->getLocation());

    // Extend to the left over '[[' or '__attribute__((' if we saw the
    // attribute, unless the begin location cannot be resolved to file text.
    bool Invalid;
    StringRef Source =
        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
    if (Invalid)
      continue;
    llvm::StringRef BeforeAttr =
        Source.substr(0, SM.getFileOffset(Range.getBegin()));
    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();

    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
      // Handle whitespace between attribute prefix and attribute value.
      if (BeforeAttrStripped.endswith(Prefix)) {
        // Move start to start position of prefix, which is
        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
        // positions to the left.
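        // Worked example (illustrative): if the text before the attribute ends
        // in "[[ " (the prefix plus one space), then
        // BeforeAttr.size() - BeforeAttrStripped.size() == 1, so the offset is
        // -(1 + Prefix.size()) == -3 and the new begin lands on the first '['.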
        Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
            -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
        break;
      }
    }
    // If we didn't see '[[' or '__attribute' it's probably coming from a
    // macro expansion which is already handled by makeFileCharRange(),
    // below.
  }

  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
  // Range.getBegin() may be inside an expansion.
  return Lexer::makeFileCharRange(Range, SM, LangOpts);
}
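// Usage sketch (illustrative; `Field` and `Ctx` come from the caller, and the
// tooling::Replacement step is just one way to consume the range): erase a
// declaration together with its trailing semicolon, any same-line trailing
// comment, and (when the heuristics above allow it) a preceding doc comment.
//
//   CharSourceRange R = clang::tooling::getAssociatedRange(*Field, Ctx);
//   tooling::Replacement Erase(Ctx.getSourceManager(), R, "",
//                              Ctx.getLangOpts());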