1 //===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file provides functions that simplify extraction of source code. 10 // 11 //===----------------------------------------------------------------------===// 12 #include "clang/Tooling/Transformer/SourceCode.h" 13 #include "clang/AST/ASTContext.h" 14 #include "clang/AST/Attr.h" 15 #include "clang/AST/Comment.h" 16 #include "clang/AST/Decl.h" 17 #include "clang/AST/DeclCXX.h" 18 #include "clang/AST/DeclTemplate.h" 19 #include "clang/AST/Expr.h" 20 #include "clang/Basic/SourceManager.h" 21 #include "clang/Lex/Lexer.h" 22 #include "llvm/Support/Errc.h" 23 #include "llvm/Support/Error.h" 24 #include <set> 25 26 using namespace clang; 27 28 using llvm::errc; 29 using llvm::StringError; 30 31 StringRef clang::tooling::getText(CharSourceRange Range, 32 const ASTContext &Context) { 33 return Lexer::getSourceText(Range, Context.getSourceManager(), 34 Context.getLangOpts()); 35 } 36 37 CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range, 38 tok::TokenKind Next, 39 ASTContext &Context) { 40 CharSourceRange R = Lexer::getAsCharRange(Range, Context.getSourceManager(), 41 Context.getLangOpts()); 42 if (R.isInvalid()) 43 return Range; 44 Token Tok; 45 bool Err = 46 Lexer::getRawToken(R.getEnd(), Tok, Context.getSourceManager(), 47 Context.getLangOpts(), /*IgnoreWhiteSpace=*/true); 48 if (Err || !Tok.is(Next)) 49 return Range; 50 return CharSourceRange::getTokenRange(Range.getBegin(), Tok.getLocation()); 51 } 52 53 llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range, 54 const SourceManager &SM) { 55 if (Range.isInvalid()) 56 return 
llvm::make_error<StringError>(errc::invalid_argument, 57 "Invalid range"); 58 59 if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID()) 60 return llvm::make_error<StringError>( 61 errc::invalid_argument, "Range starts or ends in a macro expansion"); 62 63 if (SM.isInSystemHeader(Range.getBegin()) || 64 SM.isInSystemHeader(Range.getEnd())) 65 return llvm::make_error<StringError>(errc::invalid_argument, 66 "Range is in system header"); 67 68 std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin()); 69 std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd()); 70 if (BeginInfo.first != EndInfo.first) 71 return llvm::make_error<StringError>( 72 errc::invalid_argument, "Range begins and ends in different files"); 73 74 if (BeginInfo.second > EndInfo.second) 75 return llvm::make_error<StringError>( 76 errc::invalid_argument, "Range's begin is past its end"); 77 78 return llvm::Error::success(); 79 } 80 81 static bool SpelledInMacroDefinition(SourceLocation Loc, 82 const SourceManager &SM) { 83 while (Loc.isMacroID()) { 84 const auto &Expansion = SM.getSLocEntry(SM.getFileID(Loc)).getExpansion(); 85 if (Expansion.isMacroArgExpansion()) { 86 // Check the spelling location of the macro arg, in case the arg itself is 87 // in a macro expansion. 
88 Loc = Expansion.getSpellingLoc(); 89 } else { 90 return true; 91 } 92 } 93 return false; 94 } 95 96 llvm::Optional<CharSourceRange> clang::tooling::getRangeForEdit( 97 const CharSourceRange &EditRange, const SourceManager &SM, 98 const LangOptions &LangOpts, bool IncludeMacroExpansion) { 99 CharSourceRange Range; 100 if (IncludeMacroExpansion) { 101 Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts); 102 } else { 103 if (SpelledInMacroDefinition(EditRange.getBegin(), SM) || 104 SpelledInMacroDefinition(EditRange.getEnd(), SM)) 105 return std::nullopt; 106 107 auto B = SM.getSpellingLoc(EditRange.getBegin()); 108 auto E = SM.getSpellingLoc(EditRange.getEnd()); 109 if (EditRange.isTokenRange()) 110 E = Lexer::getLocForEndOfToken(E, 0, SM, LangOpts); 111 Range = CharSourceRange::getCharRange(B, E); 112 } 113 114 bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM)); 115 if (IsInvalid) 116 return std::nullopt; 117 return Range; 118 } 119 120 static bool startsWithNewline(const SourceManager &SM, const Token &Tok) { 121 return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]); 122 } 123 124 static bool contains(const std::set<tok::TokenKind> &Terminators, 125 const Token &Tok) { 126 return Terminators.count(Tok.getKind()) > 0; 127 } 128 129 // Returns the exclusive, *file* end location of the entity whose last token is 130 // at location 'EntityLast'. That is, it returns the location one past the last 131 // relevant character. 132 // 133 // Associated tokens include comments, horizontal whitespace and 'Terminators' 134 // -- optional tokens, which, if any are found, will be included; if 135 // 'Terminators' is empty, we will not include any extra tokens beyond comments 136 // and horizontal whitespace. 
static SourceLocation
getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
                const std::set<tok::TokenKind> &Terminators,
                const LangOptions &LangOpts) {
  assert(EntityLast.isValid() && "Invalid end location found.");

  // We remember the last location of a non-horizontal-whitespace token we have
  // lexed; this is the location up to which we will want to delete.
  // FIXME: Support using the spelling loc here for cases where we want to
  // analyze the macro text.

  CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
  // FIXME: Should check isTokenRange(), for the (rare) case that
  // `ExpansionRange` is a character range.
  //
  // Build a raw lexer over the buffer containing the (expansion-level) end of
  // the entity, positioned at that end so that the first token lexed below is
  // the entity's own last token.
  std::unique_ptr<Lexer> Lexer = [&]() {
    bool Invalid = false;
    auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
    llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
    assert(!Invalid && "Cannot get file/offset");
    return std::make_unique<clang::Lexer>(
        SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
        File.data() + FileOffset.second, File.end());
  }();

  // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
  Lexer->SetKeepWhitespaceMode(true);

  // Generally, the code we want to include looks like this ([] are optional),
  // If Terminators is empty:
  //   [ <comment> ] [ <newline> ]
  // Otherwise:
  //   ... <terminator> [ <comment> ] [ <newline> ]

  Token Tok;
  bool Terminated = false;

  // First, lex to the current token (which is the last token of the range that
  // is definitely associated with the decl).  Then, we process the first token
  // separately from the rest based on conditions that hold specifically for
  // that first token.
  //
  // We do not search for a terminator if none is required or we've already
  // encountered it. Otherwise, if the original `EntityLast` location was in a
  // macro expansion, we don't have visibility into the text, so we assume we've
  // already terminated. However, we note this assumption with
  // `TerminatedByMacro`, because we'll want to handle it somewhat differently
  // for the terminators semicolon and comma. These terminators can be safely
  // associated with the entity when they appear after the macro -- extra
  // semicolons have no effect on the program and a well-formed program won't
  // have multiple commas in a row, so we're guaranteed that there is only one.
  //
  // FIXME: This handling of macros is more conservative than necessary. When
  // the end of the expansion coincides with the end of the node, we can still
  // safely analyze the code. But, it is more complicated, because we need to
  // start by lexing the spelling loc for the first token and then switch to the
  // expansion loc.
  bool TerminatedByMacro = false;
  Lexer->LexFromRawLexer(Tok);
  if (Terminators.empty() || contains(Terminators, Tok))
    Terminated = true;
  else if (EntityLast.isMacroID()) {
    Terminated = true;
    TerminatedByMacro = true;
  }

  // We save the most recent candidate for the exclusive end location.
  SourceLocation End = Tok.getEndLoc();

  // Phase 1: scan forward until we find the required terminator (or a token
  // that proves no terminator is coming), extending `End` as we go.
  while (!Terminated) {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::eof:
    // Unexpected separators.
    case tok::l_brace:
    case tok::r_brace:
    case tok::comma:
      return End;
    // Whitespace pseudo-tokens.
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // Include at least until the end of the line.
        End = Tok.getEndLoc();
      break;
    default:
      if (contains(Terminators, Tok))
        Terminated = true;
      End = Tok.getEndLoc();
      break;
    }
  }

  // Phase 2: past the terminator. Include trailing comments and -- when the
  // terminator was only assumed because of a macro expansion -- one real
  // semicolon/comma; stop at (and include) the first newline, or stop (and
  // exclude) at any unrelated token.
  do {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // We're done, but include this newline.
        return Tok.getEndLoc();
      break;
    case tok::comment:
      // Include any comments we find on the way.
      End = Tok.getEndLoc();
      break;
    case tok::semi:
    case tok::comma:
      if (TerminatedByMacro && contains(Terminators, Tok)) {
        End = Tok.getEndLoc();
        // We've found a real terminator.
        TerminatedByMacro = false;
        break;
      }
      // Found an unrelated token; stop and don't include it.
      return End;
    default:
      // Found an unrelated token; stop and don't include it.
      return End;
    }
  } while (true);
}

// Returns the expected terminator tokens for the given declaration.
//
// If we do not know the correct terminator token, returns an empty set.
//
// There are cases where we have more than one possible terminator (for example,
// we find either a comma or a semicolon after a VarDecl).
static std::set<tok::TokenKind> getTerminators(const Decl &D) {
  // Records and using-declarations always end with a semicolon.
  if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
    return {tok::semi};

  // Functions may end with a body (`}`) or, for declarations, a semicolon;
  // likewise linkage specs (`extern "C" { ... }` vs. `extern "C" int f();`).
  if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
    return {tok::r_brace, tok::semi};

  // Variables and fields may be one declarator in a comma-separated list.
  if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
    return {tok::comma, tok::semi};

  return {};
}

// Starting from `Loc`, skips whitespace up to, and including, a single
// newline. Returns the (exclusive) end of any skipped whitespace (that is, the
// location immediately after the whitespace).
static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,
                                               SourceLocation Loc,
                                               const LangOptions &LangOpts) {
  // NOTE(review): `LangOpts` is unused here; the scan is purely
  // character-based. Kept for signature symmetry with the nearby helpers.
  const char *LocChars = SM.getCharacterData(Loc);
  int i = 0;
  while (isHorizontalWhitespace(LocChars[i]))
    ++i;
  // Consume at most one vertical-whitespace character.
  if (isVerticalWhitespace(LocChars[i]))
    ++i;
  return Loc.getLocWithOffset(i);
}

// Is `Loc` separated from any following decl by something meaningful (e.g. an
// empty line, a comment), ignoring horizontal whitespace? Since this is a
// heuristic, we return false when in doubt. `Loc` cannot be the first location
// in the file.
static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,
                                 const LangOptions &LangOpts) {
  // If the preceding character is a newline, we'll check for an empty line as a
  // separator. However, we can't identify an empty line using tokens, so we
  // analyse the characters. If we try to use tokens, we'll just end up with a
  // whitespace token, whose characters we'd have to analyse anyhow.
  bool Invalid = false;
  const char *LocChars =
      SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
  assert(!Invalid &&
         "Loc must be a valid character and not the first of the source file.");
  if (isVerticalWhitespace(LocChars[0])) {
    // Scan forward from `Loc`: a second newline before any non-whitespace
    // character means there is an empty line, i.e. a separation.
    for (int i = 1; isWhitespace(LocChars[i]); ++i)
      if (isVerticalWhitespace(LocChars[i]))
        return true;
  }
  // We didn't find an empty line, so lex the next token, skipping past any
  // whitespace we just scanned.
  Token Tok;
  bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
                                   /*IgnoreWhiteSpace=*/true);
  if (Failed)
    // Any text that confuses the lexer seems fair to consider a separation.
    return true;

  // Comments, braces, and end-of-file all count as separations; any other
  // token is assumed to belong to a following declaration.
  switch (Tok.getKind()) {
  case tok::comment:
  case tok::l_brace:
  case tok::r_brace:
  case tok::eof:
    return true;
  default:
    return false;
  }
}

CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
                                            ASTContext &Context) {
  const SourceManager &SM = Context.getSourceManager();
  const LangOptions &LangOpts = Context.getLangOpts();
  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());

  // First, expand to the start of the template<> declaration if necessary.
  if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
    if (const auto *T = Record->getDescribedClassTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
    if (const auto *T = F->getDescribedFunctionTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  }

  // Next, expand the end location past trailing comments to include a potential
  // newline at the end of the decl's line.
  Range.setEnd(
      getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
  Range.setTokenRange(false);

  // Expand to include preceding associated comments. We ignore any comments
  // that are not preceding the decl, since we've already skipped trailing
  // comments with getEntityEndLoc.
  if (const RawComment *Comment =
          Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
    // Only include a preceding comment if:
    // * it is *not* separate from the declaration (not including any newline
    //   that immediately follows the comment),
    // * the decl *is* separate from any following entity (so, there are no
    //   other entities the comment could refer to), and
    // * it is not a IfThisThenThat lint check.
    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
                                     Range.getBegin()) &&
        !atOrBeforeSeparation(
            SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
            LangOpts) &&
        atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
      const StringRef CommentText = Comment->getRawText(SM);
      if (!CommentText.contains("LINT.IfChange") &&
          !CommentText.contains("LINT.ThenChange"))
        Range.setBegin(Comment->getBeginLoc());
    }
  // Add leading attributes.
  for (auto *Attr : Decl.attrs()) {
    if (Attr->getLocation().isInvalid() ||
        !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
      continue;
    Range.setBegin(Attr->getLocation());

    // Extend to the left '[[' or '__attribute((' if we saw the attribute,
    // unless it is not a valid location.
    bool Invalid;
    StringRef Source =
        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
    if (Invalid)
      continue;
    llvm::StringRef BeforeAttr =
        Source.substr(0, SM.getFileOffset(Range.getBegin()));
    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();

    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
      // Handle whitespace between attribute prefix and attribute value.
      if (BeforeAttrStripped.endswith(Prefix)) {
        // Move start to start position of prefix, which is
        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
        // positions to the left. (The offset below is negative: it subtracts
        // both the trailing whitespace that rtrim removed and the prefix.)
        Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
            -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
        break;
      }
    }
    // If we didn't see '[[' or '__attribute' it's probably coming from a
    // macro expansion which is already handled by makeFileCharRange(),
    // below.
  }

  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
  // Range.getBegin() may be inside an expansion.
  return Lexer::makeFileCharRange(Range, SM, LangOpts);
}