1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file implements FormatTokenLexer, which tokenizes a source file 11 /// into a FormatToken stream suitable for ClangFormat. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include "FormatTokenLexer.h" 16 #include "FormatToken.h" 17 #include "clang/Basic/SourceLocation.h" 18 #include "clang/Basic/SourceManager.h" 19 #include "clang/Format/Format.h" 20 #include "llvm/Support/Regex.h" 21 22 namespace clang { 23 namespace format { 24 25 FormatTokenLexer::FormatTokenLexer( 26 const SourceManager &SourceMgr, FileID ID, unsigned Column, 27 const FormatStyle &Style, encoding::Encoding Encoding, 28 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, 29 IdentifierTable &IdentTable) 30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), 31 Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), 32 Style(Style), IdentTable(IdentTable), Keywords(IdentTable), 33 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0), 34 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), 35 MacroBlockEndRegex(Style.MacroBlockEnd) { 36 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, 37 getFormattingLangOpts(Style))); 38 Lex->SetKeepWhitespaceMode(true); 39 40 for (const std::string &ForEachMacro : Style.ForEachMacros) 41 Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro}); 42 for (const std::string &AttributeMacro : Style.AttributeMacros) 43 Macros.insert({&IdentTable.get(AttributeMacro), TT_AttributeMacro}); 44 for (const std::string 
&StatementMacro : Style.StatementMacros) 45 Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro}); 46 for (const std::string &TypenameMacro : Style.TypenameMacros) 47 Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro}); 48 for (const std::string &NamespaceMacro : Style.NamespaceMacros) 49 Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro}); 50 for (const std::string &WhitespaceSensitiveMacro : 51 Style.WhitespaceSensitiveMacros) { 52 Macros.insert( 53 {&IdentTable.get(WhitespaceSensitiveMacro), TT_UntouchableMacroFunc}); 54 } 55 for (const std::string &StatementAttributeLikeMacro : 56 Style.StatementAttributeLikeMacros) 57 Macros.insert({&IdentTable.get(StatementAttributeLikeMacro), 58 TT_StatementAttributeLikeMacro}); 59 } 60 61 ArrayRef<FormatToken *> FormatTokenLexer::lex() { 62 assert(Tokens.empty()); 63 assert(FirstInLineIndex == 0); 64 do { 65 Tokens.push_back(getNextToken()); 66 if (Style.Language == FormatStyle::LK_JavaScript) { 67 tryParseJSRegexLiteral(); 68 handleTemplateStrings(); 69 } 70 if (Style.Language == FormatStyle::LK_TextProto) 71 tryParsePythonComment(); 72 tryMergePreviousTokens(); 73 if (Style.isCSharp()) 74 // This needs to come after tokens have been merged so that C# 75 // string literals are correctly identified. 
76 handleCSharpVerbatimAndInterpolatedStrings(); 77 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) 78 FirstInLineIndex = Tokens.size() - 1; 79 } while (Tokens.back()->Tok.isNot(tok::eof)); 80 return Tokens; 81 } 82 83 void FormatTokenLexer::tryMergePreviousTokens() { 84 if (tryMerge_TMacro()) 85 return; 86 if (tryMergeConflictMarkers()) 87 return; 88 if (tryMergeLessLess()) 89 return; 90 if (tryMergeForEach()) 91 return; 92 if (Style.isCpp() && tryTransformTryUsageForC()) 93 return; 94 95 if (Style.Language == FormatStyle::LK_JavaScript || Style.isCSharp()) { 96 static const tok::TokenKind NullishCoalescingOperator[] = {tok::question, 97 tok::question}; 98 static const tok::TokenKind NullPropagatingOperator[] = {tok::question, 99 tok::period}; 100 static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater}; 101 102 if (tryMergeTokens(FatArrow, TT_FatArrow)) 103 return; 104 if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) { 105 // Treat like the "||" operator (as opposed to the ternary ?). 106 Tokens.back()->Tok.setKind(tok::pipepipe); 107 return; 108 } 109 if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) { 110 // Treat like a regular "." access. 111 Tokens.back()->Tok.setKind(tok::period); 112 return; 113 } 114 if (tryMergeNullishCoalescingEqual()) { 115 return; 116 } 117 } 118 119 if (Style.isCSharp()) { 120 static const tok::TokenKind CSharpNullConditionalLSquare[] = { 121 tok::question, tok::l_square}; 122 123 if (tryMergeCSharpKeywordVariables()) 124 return; 125 if (tryMergeCSharpStringLiteral()) 126 return; 127 if (tryTransformCSharpForEach()) 128 return; 129 if (tryMergeTokens(CSharpNullConditionalLSquare, 130 TT_CSharpNullConditionalLSquare)) { 131 // Treat like a regular "[" operator. 
132 Tokens.back()->Tok.setKind(tok::l_square); 133 return; 134 } 135 } 136 137 if (tryMergeNSStringLiteral()) 138 return; 139 140 if (Style.Language == FormatStyle::LK_JavaScript) { 141 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal}; 142 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal, 143 tok::equal}; 144 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater, 145 tok::greaterequal}; 146 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star}; 147 static const tok::TokenKind JSExponentiationEqual[] = {tok::star, 148 tok::starequal}; 149 static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal}; 150 static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal}; 151 152 // FIXME: Investigate what token type gives the correct operator priority. 153 if (tryMergeTokens(JSIdentity, TT_BinaryOperator)) 154 return; 155 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator)) 156 return; 157 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator)) 158 return; 159 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation)) 160 return; 161 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) { 162 Tokens.back()->Tok.setKind(tok::starequal); 163 return; 164 } 165 if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) || 166 tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) { 167 // Treat like the "=" assignment operator. 
168 Tokens.back()->Tok.setKind(tok::equal); 169 return; 170 } 171 if (tryMergeJSPrivateIdentifier()) 172 return; 173 } 174 175 if (Style.Language == FormatStyle::LK_Java) { 176 static const tok::TokenKind JavaRightLogicalShiftAssign[] = { 177 tok::greater, tok::greater, tok::greaterequal}; 178 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator)) 179 return; 180 } 181 } 182 183 bool FormatTokenLexer::tryMergeNSStringLiteral() { 184 if (Tokens.size() < 2) 185 return false; 186 auto &At = *(Tokens.end() - 2); 187 auto &String = *(Tokens.end() - 1); 188 if (!At->is(tok::at) || !String->is(tok::string_literal)) 189 return false; 190 At->Tok.setKind(tok::string_literal); 191 At->TokenText = StringRef(At->TokenText.begin(), 192 String->TokenText.end() - At->TokenText.begin()); 193 At->ColumnWidth += String->ColumnWidth; 194 At->setType(TT_ObjCStringLiteral); 195 Tokens.erase(Tokens.end() - 1); 196 return true; 197 } 198 199 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() { 200 // Merges #idenfier into a single identifier with the text #identifier 201 // but the token tok::identifier. 202 if (Tokens.size() < 2) 203 return false; 204 auto &Hash = *(Tokens.end() - 2); 205 auto &Identifier = *(Tokens.end() - 1); 206 if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier)) 207 return false; 208 Hash->Tok.setKind(tok::identifier); 209 Hash->TokenText = 210 StringRef(Hash->TokenText.begin(), 211 Identifier->TokenText.end() - Hash->TokenText.begin()); 212 Hash->ColumnWidth += Identifier->ColumnWidth; 213 Hash->setType(TT_JsPrivateIdentifier); 214 Tokens.erase(Tokens.end() - 1); 215 return true; 216 } 217 218 // Search for verbatim or interpolated string literals @"ABC" or 219 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to 220 // prevent splitting of @, $ and ". 221 // Merging of multiline verbatim strings with embedded '"' is handled in 222 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing. 
223 bool FormatTokenLexer::tryMergeCSharpStringLiteral() { 224 if (Tokens.size() < 2) 225 return false; 226 227 // Interpolated strings could contain { } with " characters inside. 228 // $"{x ?? "null"}" 229 // should not be split into $"{x ?? ", null, "}" but should treated as a 230 // single string-literal. 231 // 232 // We opt not to try and format expressions inside {} within a C# 233 // interpolated string. Formatting expressions within an interpolated string 234 // would require similar work as that done for JavaScript template strings 235 // in `handleTemplateStrings()`. 236 auto &CSharpInterpolatedString = *(Tokens.end() - 2); 237 if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral && 238 (CSharpInterpolatedString->TokenText.startswith(R"($")") || 239 CSharpInterpolatedString->TokenText.startswith(R"($@")"))) { 240 int UnmatchedOpeningBraceCount = 0; 241 242 auto TokenTextSize = CSharpInterpolatedString->TokenText.size(); 243 for (size_t Index = 0; Index < TokenTextSize; ++Index) { 244 char C = CSharpInterpolatedString->TokenText[Index]; 245 if (C == '{') { 246 // "{{" inside an interpolated string is an escaped '{' so skip it. 247 if (Index + 1 < TokenTextSize && 248 CSharpInterpolatedString->TokenText[Index + 1] == '{') { 249 ++Index; 250 continue; 251 } 252 ++UnmatchedOpeningBraceCount; 253 } else if (C == '}') { 254 // "}}" inside an interpolated string is an escaped '}' so skip it. 
255 if (Index + 1 < TokenTextSize && 256 CSharpInterpolatedString->TokenText[Index + 1] == '}') { 257 ++Index; 258 continue; 259 } 260 --UnmatchedOpeningBraceCount; 261 } 262 } 263 264 if (UnmatchedOpeningBraceCount > 0) { 265 auto &NextToken = *(Tokens.end() - 1); 266 CSharpInterpolatedString->TokenText = 267 StringRef(CSharpInterpolatedString->TokenText.begin(), 268 NextToken->TokenText.end() - 269 CSharpInterpolatedString->TokenText.begin()); 270 CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth; 271 Tokens.erase(Tokens.end() - 1); 272 return true; 273 } 274 } 275 276 // Look for @"aaaaaa" or $"aaaaaa". 277 auto &String = *(Tokens.end() - 1); 278 if (!String->is(tok::string_literal)) 279 return false; 280 281 auto &At = *(Tokens.end() - 2); 282 if (!(At->is(tok::at) || At->TokenText == "$")) 283 return false; 284 285 if (Tokens.size() > 2 && At->is(tok::at)) { 286 auto &Dollar = *(Tokens.end() - 3); 287 if (Dollar->TokenText == "$") { 288 // This looks like $@"aaaaa" so we need to combine all 3 tokens. 289 Dollar->Tok.setKind(tok::string_literal); 290 Dollar->TokenText = 291 StringRef(Dollar->TokenText.begin(), 292 String->TokenText.end() - Dollar->TokenText.begin()); 293 Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth); 294 Dollar->setType(TT_CSharpStringLiteral); 295 Tokens.erase(Tokens.end() - 2); 296 Tokens.erase(Tokens.end() - 1); 297 return true; 298 } 299 } 300 301 // Convert back into just a string_literal. 
302 At->Tok.setKind(tok::string_literal); 303 At->TokenText = StringRef(At->TokenText.begin(), 304 String->TokenText.end() - At->TokenText.begin()); 305 At->ColumnWidth += String->ColumnWidth; 306 At->setType(TT_CSharpStringLiteral); 307 Tokens.erase(Tokens.end() - 1); 308 return true; 309 } 310 311 // Valid C# attribute targets: 312 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets 313 const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = { 314 "assembly", "module", "field", "event", "method", 315 "param", "property", "return", "type", 316 }; 317 318 bool FormatTokenLexer::tryMergeNullishCoalescingEqual() { 319 if (Tokens.size() < 2) 320 return false; 321 auto &NullishCoalescing = *(Tokens.end() - 2); 322 auto &Equal = *(Tokens.end() - 1); 323 if (NullishCoalescing->getType() != TT_NullCoalescingOperator || 324 !Equal->is(tok::equal)) 325 return false; 326 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens. 
327 NullishCoalescing->TokenText = 328 StringRef(NullishCoalescing->TokenText.begin(), 329 Equal->TokenText.end() - NullishCoalescing->TokenText.begin()); 330 NullishCoalescing->ColumnWidth += Equal->ColumnWidth; 331 NullishCoalescing->setType(TT_NullCoalescingEqual); 332 Tokens.erase(Tokens.end() - 1); 333 return true; 334 } 335 336 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() { 337 if (Tokens.size() < 2) 338 return false; 339 auto &At = *(Tokens.end() - 2); 340 auto &Keyword = *(Tokens.end() - 1); 341 if (!At->is(tok::at)) 342 return false; 343 if (!Keywords.isCSharpKeyword(*Keyword)) 344 return false; 345 346 At->Tok.setKind(tok::identifier); 347 At->TokenText = StringRef(At->TokenText.begin(), 348 Keyword->TokenText.end() - At->TokenText.begin()); 349 At->ColumnWidth += Keyword->ColumnWidth; 350 At->setType(Keyword->getType()); 351 Tokens.erase(Tokens.end() - 1); 352 return true; 353 } 354 355 // In C# transform identifier foreach into kw_foreach 356 bool FormatTokenLexer::tryTransformCSharpForEach() { 357 if (Tokens.size() < 1) 358 return false; 359 auto &Identifier = *(Tokens.end() - 1); 360 if (!Identifier->is(tok::identifier)) 361 return false; 362 if (Identifier->TokenText != "foreach") 363 return false; 364 365 Identifier->setType(TT_ForEachMacro); 366 Identifier->Tok.setKind(tok::kw_for); 367 return true; 368 } 369 370 bool FormatTokenLexer::tryMergeForEach() { 371 if (Tokens.size() < 2) 372 return false; 373 auto &For = *(Tokens.end() - 2); 374 auto &Each = *(Tokens.end() - 1); 375 if (!For->is(tok::kw_for)) 376 return false; 377 if (!Each->is(tok::identifier)) 378 return false; 379 if (Each->TokenText != "each") 380 return false; 381 382 For->setType(TT_ForEachMacro); 383 For->Tok.setKind(tok::kw_for); 384 385 For->TokenText = StringRef(For->TokenText.begin(), 386 Each->TokenText.end() - For->TokenText.begin()); 387 For->ColumnWidth += Each->ColumnWidth; 388 Tokens.erase(Tokens.end() - 1); 389 return true; 390 } 391 392 bool 
FormatTokenLexer::tryTransformTryUsageForC() { 393 if (Tokens.size() < 2) 394 return false; 395 auto &Try = *(Tokens.end() - 2); 396 if (!Try->is(tok::kw_try)) 397 return false; 398 auto &Next = *(Tokens.end() - 1); 399 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment)) 400 return false; 401 402 if (Tokens.size() > 2) { 403 auto &At = *(Tokens.end() - 3); 404 if (At->is(tok::at)) 405 return false; 406 } 407 408 Try->Tok.setKind(tok::identifier); 409 return true; 410 } 411 412 bool FormatTokenLexer::tryMergeLessLess() { 413 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less. 414 if (Tokens.size() < 3) 415 return false; 416 417 bool FourthTokenIsLess = false; 418 if (Tokens.size() > 3) 419 FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less); 420 421 auto First = Tokens.end() - 3; 422 if (First[2]->is(tok::less) || First[1]->isNot(tok::less) || 423 First[0]->isNot(tok::less) || FourthTokenIsLess) 424 return false; 425 426 // Only merge if there currently is no whitespace between the two "<". 
427 if (First[1]->WhitespaceRange.getBegin() != 428 First[1]->WhitespaceRange.getEnd()) 429 return false; 430 431 First[0]->Tok.setKind(tok::lessless); 432 First[0]->TokenText = "<<"; 433 First[0]->ColumnWidth += 1; 434 Tokens.erase(Tokens.end() - 2); 435 return true; 436 } 437 438 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, 439 TokenType NewType) { 440 if (Tokens.size() < Kinds.size()) 441 return false; 442 443 SmallVectorImpl<FormatToken *>::const_iterator First = 444 Tokens.end() - Kinds.size(); 445 if (!First[0]->is(Kinds[0])) 446 return false; 447 unsigned AddLength = 0; 448 for (unsigned i = 1; i < Kinds.size(); ++i) { 449 if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() != 450 First[i]->WhitespaceRange.getEnd()) 451 return false; 452 AddLength += First[i]->TokenText.size(); 453 } 454 Tokens.resize(Tokens.size() - Kinds.size() + 1); 455 First[0]->TokenText = StringRef(First[0]->TokenText.data(), 456 First[0]->TokenText.size() + AddLength); 457 First[0]->ColumnWidth += AddLength; 458 First[0]->setType(NewType); 459 return true; 460 } 461 462 // Returns \c true if \p Tok can only be followed by an operand in JavaScript. 463 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) { 464 // NB: This is not entirely correct, as an r_paren can introduce an operand 465 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough 466 // corner case to not matter in practice, though. 
467 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace, 468 tok::r_brace, tok::l_square, tok::semi, tok::exclaim, 469 tok::colon, tok::question, tok::tilde) || 470 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw, 471 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void, 472 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) || 473 Tok->isBinaryOperator(); 474 } 475 476 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) { 477 if (!Prev) 478 return true; 479 480 // Regex literals can only follow after prefix unary operators, not after 481 // postfix unary operators. If the '++' is followed by a non-operand 482 // introducing token, the slash here is the operand and not the start of a 483 // regex. 484 // `!` is an unary prefix operator, but also a post-fix operator that casts 485 // away nullability, so the same check applies. 486 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) 487 return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3])); 488 489 // The previous token must introduce an operand location where regex 490 // literals can occur. 491 if (!precedesOperand(Prev)) 492 return false; 493 494 return true; 495 } 496 497 // Tries to parse a JavaScript Regex literal starting at the current token, 498 // if that begins with a slash and is in a location where JavaScript allows 499 // regex literals. Changes the current token to a regex literal and updates 500 // its text if successful. 501 void FormatTokenLexer::tryParseJSRegexLiteral() { 502 FormatToken *RegexToken = Tokens.back(); 503 if (!RegexToken->isOneOf(tok::slash, tok::slashequal)) 504 return; 505 506 FormatToken *Prev = nullptr; 507 for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) { 508 // NB: Because previous pointers are not initialized yet, this cannot use 509 // Token.getPreviousNonComment. 
510 if ((*I)->isNot(tok::comment)) { 511 Prev = *I; 512 break; 513 } 514 } 515 516 if (!canPrecedeRegexLiteral(Prev)) 517 return; 518 519 // 'Manually' lex ahead in the current file buffer. 520 const char *Offset = Lex->getBufferLocation(); 521 const char *RegexBegin = Offset - RegexToken->TokenText.size(); 522 StringRef Buffer = Lex->getBuffer(); 523 bool InCharacterClass = false; 524 bool HaveClosingSlash = false; 525 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) { 526 // Regular expressions are terminated with a '/', which can only be 527 // escaped using '\' or a character class between '[' and ']'. 528 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5. 529 switch (*Offset) { 530 case '\\': 531 // Skip the escaped character. 532 ++Offset; 533 break; 534 case '[': 535 InCharacterClass = true; 536 break; 537 case ']': 538 InCharacterClass = false; 539 break; 540 case '/': 541 if (!InCharacterClass) 542 HaveClosingSlash = true; 543 break; 544 } 545 } 546 547 RegexToken->setType(TT_RegexLiteral); 548 // Treat regex literals like other string_literals. 549 RegexToken->Tok.setKind(tok::string_literal); 550 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin); 551 RegexToken->ColumnWidth = RegexToken->TokenText.size(); 552 553 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); 554 } 555 556 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() { 557 FormatToken *CSharpStringLiteral = Tokens.back(); 558 559 if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral) 560 return; 561 562 // Deal with multiline strings. 
563 if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") || 564 CSharpStringLiteral->TokenText.startswith(R"($@")"))) 565 return; 566 567 const char *StrBegin = 568 Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size(); 569 const char *Offset = StrBegin; 570 if (CSharpStringLiteral->TokenText.startswith(R"(@")")) 571 Offset += 2; 572 else // CSharpStringLiteral->TokenText.startswith(R"($@")") 573 Offset += 3; 574 575 // Look for a terminating '"' in the current file buffer. 576 // Make no effort to format code within an interpolated or verbatim string. 577 for (; Offset != Lex->getBuffer().end(); ++Offset) { 578 if (Offset[0] == '"') { 579 // "" within a verbatim string is an escaped double quote: skip it. 580 if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"') 581 ++Offset; 582 else 583 break; 584 } 585 } 586 587 // Make no attempt to format code properly if a verbatim string is 588 // unterminated. 589 if (Offset == Lex->getBuffer().end()) 590 return; 591 592 StringRef LiteralText(StrBegin, Offset - StrBegin + 1); 593 CSharpStringLiteral->TokenText = LiteralText; 594 595 // Adjust width for potentially multiline string literals. 596 size_t FirstBreak = LiteralText.find('\n'); 597 StringRef FirstLineText = FirstBreak == StringRef::npos 598 ? LiteralText 599 : LiteralText.substr(0, FirstBreak); 600 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs( 601 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth, 602 Encoding); 603 size_t LastBreak = LiteralText.rfind('\n'); 604 if (LastBreak != StringRef::npos) { 605 CSharpStringLiteral->IsMultiline = true; 606 unsigned StartColumn = 0; 607 CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs( 608 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn, 609 Style.TabWidth, Encoding); 610 } 611 612 SourceLocation loc = Offset < Lex->getBuffer().end() 613 ? 
Lex->getSourceLocation(Offset + 1) 614 : SourceMgr.getLocForEndOfFile(ID); 615 resetLexer(SourceMgr.getFileOffset(loc)); 616 } 617 618 void FormatTokenLexer::handleTemplateStrings() { 619 FormatToken *BacktickToken = Tokens.back(); 620 621 if (BacktickToken->is(tok::l_brace)) { 622 StateStack.push(LexerState::NORMAL); 623 return; 624 } 625 if (BacktickToken->is(tok::r_brace)) { 626 if (StateStack.size() == 1) 627 return; 628 StateStack.pop(); 629 if (StateStack.top() != LexerState::TEMPLATE_STRING) 630 return; 631 // If back in TEMPLATE_STRING, fallthrough and continue parsing the 632 } else if (BacktickToken->is(tok::unknown) && 633 BacktickToken->TokenText == "`") { 634 StateStack.push(LexerState::TEMPLATE_STRING); 635 } else { 636 return; // Not actually a template 637 } 638 639 // 'Manually' lex ahead in the current file buffer. 640 const char *Offset = Lex->getBufferLocation(); 641 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`" 642 for (; Offset != Lex->getBuffer().end(); ++Offset) { 643 if (Offset[0] == '`') { 644 StateStack.pop(); 645 break; 646 } 647 if (Offset[0] == '\\') { 648 ++Offset; // Skip the escaped character. 649 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' && 650 Offset[1] == '{') { 651 // '${' introduces an expression interpolation in the template string. 652 StateStack.push(LexerState::NORMAL); 653 ++Offset; 654 break; 655 } 656 } 657 658 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1); 659 BacktickToken->setType(TT_TemplateString); 660 BacktickToken->Tok.setKind(tok::string_literal); 661 BacktickToken->TokenText = LiteralText; 662 663 // Adjust width for potentially multiline string literals. 664 size_t FirstBreak = LiteralText.find('\n'); 665 StringRef FirstLineText = FirstBreak == StringRef::npos 666 ? 
LiteralText 667 : LiteralText.substr(0, FirstBreak); 668 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs( 669 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding); 670 size_t LastBreak = LiteralText.rfind('\n'); 671 if (LastBreak != StringRef::npos) { 672 BacktickToken->IsMultiline = true; 673 unsigned StartColumn = 0; // The template tail spans the entire line. 674 BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs( 675 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn, 676 Style.TabWidth, Encoding); 677 } 678 679 SourceLocation loc = Offset < Lex->getBuffer().end() 680 ? Lex->getSourceLocation(Offset + 1) 681 : SourceMgr.getLocForEndOfFile(ID); 682 resetLexer(SourceMgr.getFileOffset(loc)); 683 } 684 685 void FormatTokenLexer::tryParsePythonComment() { 686 FormatToken *HashToken = Tokens.back(); 687 if (!HashToken->isOneOf(tok::hash, tok::hashhash)) 688 return; 689 // Turn the remainder of this line into a comment. 690 const char *CommentBegin = 691 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#" 692 size_t From = CommentBegin - Lex->getBuffer().begin(); 693 size_t To = Lex->getBuffer().find_first_of('\n', From); 694 if (To == StringRef::npos) 695 To = Lex->getBuffer().size(); 696 size_t Len = To - From; 697 HashToken->setType(TT_LineComment); 698 HashToken->Tok.setKind(tok::comment); 699 HashToken->TokenText = Lex->getBuffer().substr(From, Len); 700 SourceLocation Loc = To < Lex->getBuffer().size() 701 ? 
Lex->getSourceLocation(CommentBegin + Len) 702 : SourceMgr.getLocForEndOfFile(ID); 703 resetLexer(SourceMgr.getFileOffset(Loc)); 704 } 705 706 bool FormatTokenLexer::tryMerge_TMacro() { 707 if (Tokens.size() < 4) 708 return false; 709 FormatToken *Last = Tokens.back(); 710 if (!Last->is(tok::r_paren)) 711 return false; 712 713 FormatToken *String = Tokens[Tokens.size() - 2]; 714 if (!String->is(tok::string_literal) || String->IsMultiline) 715 return false; 716 717 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren)) 718 return false; 719 720 FormatToken *Macro = Tokens[Tokens.size() - 4]; 721 if (Macro->TokenText != "_T") 722 return false; 723 724 const char *Start = Macro->TokenText.data(); 725 const char *End = Last->TokenText.data() + Last->TokenText.size(); 726 String->TokenText = StringRef(Start, End - Start); 727 String->IsFirst = Macro->IsFirst; 728 String->LastNewlineOffset = Macro->LastNewlineOffset; 729 String->WhitespaceRange = Macro->WhitespaceRange; 730 String->OriginalColumn = Macro->OriginalColumn; 731 String->ColumnWidth = encoding::columnWidthWithTabs( 732 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding); 733 String->NewlinesBefore = Macro->NewlinesBefore; 734 String->HasUnescapedNewline = Macro->HasUnescapedNewline; 735 736 Tokens.pop_back(); 737 Tokens.pop_back(); 738 Tokens.pop_back(); 739 Tokens.back() = String; 740 return true; 741 } 742 743 bool FormatTokenLexer::tryMergeConflictMarkers() { 744 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof)) 745 return false; 746 747 // Conflict lines look like: 748 // <marker> <text from the vcs> 749 // For example: 750 // >>>>>>> /file/in/file/system at revision 1234 751 // 752 // We merge all tokens in a line that starts with a conflict marker 753 // into a single token with a special token type that the unwrapped line 754 // parser will use to correctly rebuild the underlying code. 755 756 FileID ID; 757 // Get the position of the first token in the line. 
758 unsigned FirstInLineOffset; 759 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc( 760 Tokens[FirstInLineIndex]->getStartOfNonWhitespace()); 761 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer(); 762 // Calculate the offset of the start of the current line. 763 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset); 764 if (LineOffset == StringRef::npos) { 765 LineOffset = 0; 766 } else { 767 ++LineOffset; 768 } 769 770 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset); 771 StringRef LineStart; 772 if (FirstSpace == StringRef::npos) { 773 LineStart = Buffer.substr(LineOffset); 774 } else { 775 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset); 776 } 777 778 TokenType Type = TT_Unknown; 779 if (LineStart == "<<<<<<<" || LineStart == ">>>>") { 780 Type = TT_ConflictStart; 781 } else if (LineStart == "|||||||" || LineStart == "=======" || 782 LineStart == "====") { 783 Type = TT_ConflictAlternative; 784 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") { 785 Type = TT_ConflictEnd; 786 } 787 788 if (Type != TT_Unknown) { 789 FormatToken *Next = Tokens.back(); 790 791 Tokens.resize(FirstInLineIndex + 1); 792 // We do not need to build a complete token here, as we will skip it 793 // during parsing anyway (as we must not touch whitespace around conflict 794 // markers). 795 Tokens.back()->setType(Type); 796 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype); 797 798 Tokens.push_back(Next); 799 return true; 800 } 801 802 return false; 803 } 804 805 FormatToken *FormatTokenLexer::getStashedToken() { 806 // Create a synthesized second '>' or '<' token. 
// Allocate a fresh FormatToken that reuses the current token's Tok and text
// but is located on the current token's last character, one column further.
  Token Tok = FormatTok->Tok;
  StringRef TokenText = FormatTok->TokenText;

  unsigned OriginalColumn = FormatTok->OriginalColumn;
  FormatTok = new (Allocator.Allocate()) FormatToken;
  FormatTok->Tok = Tok;
  // Offset by (length - 1): the new location is the last character of the
  // copied token.
  SourceLocation TokLocation =
      FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
  FormatTok->Tok.setLocation(TokLocation);
  // The whitespace range begins and ends at that same location.
  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
  FormatTok->TokenText = TokenText;
  FormatTok->ColumnWidth = 1;
  FormatTok->OriginalColumn = OriginalColumn + 1;

  return FormatTok;
}

// Produces the next FormatToken.
//
// If the top of StateStack is TOKEN_STASHED, the state is popped and the
// stashed token is returned. Otherwise a raw token is read; leading
// tok::unknown tokens are folded into the whitespace bookkeeping of the
// returned token (NewlinesBefore, HasUnescapedNewline, LastNewlineOffset,
// WhitespaceRange) and into Column; then the token's kind, text and column
// widths are fixed up as described inline.
FormatToken *FormatTokenLexer::getNextToken() {
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  // TrailingWhitespace is set further down when trailing whitespace is
  // trimmed off a comment token; the whitespace of this token starts that
  // many characters before the raw token's location.
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->Tok.is(tok::unknown)) {
    StringRef Text = FormatTok->TokenText;
    // Returns a non-zero value when the run of consecutive '\' characters
    // ending at index pos (after skipping one '\r') has odd length.
    auto EscapesNewline = [&](int pos) {
      // A '\r' here is just part of '\r\n'. Skip it.
      if (pos >= 0 && Text[pos] == '\r')
        --pos;
      // See whether there is an odd number of '\' before this.
      // FIXME: This is wrong. A '\' followed by a newline is always removed,
      // regardless of whether there is another '\' before it.
      // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
      unsigned count = 0;
      for (; pos >= 0; --pos, ++count)
        if (Text[pos] != '\\')
          break;
      return count & 1;
    };
    // FIXME: This miscounts tok:unknown tokens that are not just
    // whitespace, e.g. a '`' character.
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      case '\n':
        ++FormatTok->NewlinesBefore;
        // Assigned rather than accumulated, so the flag reflects the most
        // recently processed '\n'.
        FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
        // Offset just past this newline, counted from WhitespaceStart.
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\r':
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next multiple of TabWidth. The conditional guards
        // the modulo against TabWidth == 0, in which case Column is unchanged.
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
        // A backslash that is not directly followed by '\r' or '\n' is not
        // whitespace; the token is kept as an implicit string literal.
        if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
          FormatTok->setType(TT_ImplicitStringLiteral);
        break;
      default:
        // Any other character: the token is not pure whitespace.
        FormatTok->setType(TT_ImplicitStringLiteral);
        break;
      }
      // Stop scanning at the first non-whitespace character.
      if (FormatTok->getType() == TT_ImplicitStringLiteral)
        break;
    }

    // A non-whitespace unknown token is itself the token to return.
    if (FormatTok->is(TT_ImplicitStringLiteral))
      break;
    WhitespaceLength += FormatTok->Tok.getLength();

    // Read the following raw token into the same FormatToken object.
    readRawToken(*FormatTok);
  }

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if ((Style.Language == FormatStyle::LK_JavaScript ||
       Style.Language == FormatStyle::LK_Java) &&
      FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
    size_t BackslashPos = FormatTok->TokenText.find('\\');
    while (BackslashPos != StringRef::npos) {
      if (BackslashPos + 1 < FormatTok->TokenText.size() &&
          FormatTok->TokenText[BackslashPos + 1] == '\n') {
        // Step back over the whole comment text from the lexer's buffer
        // location, then forward to the character after the backslash.
        const char *Offset = Lex->getBufferLocation();
        Offset -= FormatTok->TokenText.size();
        Offset += BackslashPos + 1;
        resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
        // Keep the comment up to and including the backslash and recompute
        // its width for the shortened text.
        FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
        FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
            FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
            Encoding);
        break;
      }
      BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
    }
  }

  // In case the token starts with escaped newlines, we want to
  // take them into account as whitespace - this pattern is quite frequent
  // in macro definitions.
  // FIXME: Add a more explicit test.
  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
    // Number of leading characters to strip: 3 for "\\\r\n", 2 for "\\\n".
    unsigned SkippedWhitespace = 0;
    if (FormatTok->TokenText.size() > 2 &&
        (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
      SkippedWhitespace = 3;
    else if (FormatTok->TokenText[1] == '\n')
      SkippedWhitespace = 2;
    else
      break;

    ++FormatTok->NewlinesBefore;
    WhitespaceLength += SkippedWhitespace;
    FormatTok->LastNewlineOffset = SkippedWhitespace;
    Column = 0;
    FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->Tok.is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    // Strip trailing blanks from the comment and remember how many were
    // removed; the next call uses this count to position its whitespace.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
    // Resolve the raw identifier through the identifier table and adopt the
    // token kind recorded there.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    // For Java (struct, union, delete, operator) and JavaScript (struct,
    // union, operator) these keyword kinds are turned back into plain
    // identifiers without identifier info.
    if (Style.Language == FormatStyle::LK_Java &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    } else if (Style.Language == FormatStyle::LK_JavaScript &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->Tok.is(tok::greatergreater)) {
    // Split '>>': return a single '>' now and push TOKEN_STASHED so that the
    // next call returns the stashed token. Column is bumped once here in
    // addition to the ColumnWidth added below.
    FormatTok->Tok.setKind(tok::greater);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (FormatTok->Tok.is(tok::lessless)) {
    // Split '<<' the same way: return '<' now, stash the remainder.
    FormatTok->Tok.setKind(tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    // For a multiline token, ColumnWidth covers only the first line.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (Style.isCpp()) {
    // A name registered in Macros gets its configured token type, except when
    // the previously lexed token is the 'define' preprocessor keyword.
    // Otherwise identifiers are checked against the macro-block begin/end
    // regexes.
    auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
    if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
          Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
              tok::pp_define) &&
        it != Macros.end()) {
      FormatTok->setType(it->second);
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text)) {
        FormatTok->setType(TT_MacroBlockBegin);
      } else if (MacroBlockEndRegex.match(Text)) {
        FormatTok->setType(TT_MacroBlockEnd);
      }
    }
  }

  return FormatTok;
}

// Lexes one raw token into Tok, sets Tok.TokenText to the token's characters
// in the source buffer, and applies formatting-specific token kind fixups.
// Also maintains FormattingDisabled from "clang-format on/off" comments and
// records it in Tok.Finalized.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  Lex->LexFromRawLexer(Tok.Tok);
  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
                            Tok.Tok.getLength());
  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (Style.Language == FormatStyle::LK_JavaScript &&
               Tok.TokenText == "''") {
      // In JavaScript, an unknown token spelled '' is treated as a string
      // literal.
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  // For JavaScript, Proto and TextProto, character constants are treated as
  // string literals.
  if ((Style.Language == FormatStyle::LK_JavaScript ||
       Style.Language == FormatStyle::LK_Proto ||
       Style.Language == FormatStyle::LK_TextProto) &&
      Tok.is(tok::char_constant)) {
    Tok.Tok.setKind(tok::string_literal);
  }

  // The "on" check runs before Finalized is assigned and the "off" check
  // after it, so an "on" comment is never finalized and an "off" comment
  // takes the state that was in effect before it.
  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
                               Tok.TokenText == "/* clang-format on */")) {
    FormattingDisabled = false;
  }

  Tok.Finalized = FormattingDisabled;

  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
                               Tok.TokenText == "/* clang-format off */")) {
    FormattingDisabled = true;
  }
}

// Replaces Lex with a new Lexer over this file's buffer, positioned Offset
// characters past the start of the buffer, re-enables keep-whitespace mode
// and clears TrailingWhitespace.
void FormatTokenLexer::resetLexer(unsigned Offset) {
  StringRef Buffer = SourceMgr.getBufferData(ID);
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
                      getFormattingLangOpts(Style), Buffer.begin(),
                      Buffer.begin() + Offset, Buffer.end()));
  Lex->SetKeepWhitespaceMode(true);
  TrailingWhitespace = 0;
}

} // namespace format
} // namespace clang