xref: /llvm-project/clang/lib/Format/FormatTokenLexer.cpp (revision 688bc958bd4167512f0d45e1fd008c9551de1c75)
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
21 
22 namespace clang {
23 namespace format {
24 
25 FormatTokenLexer::FormatTokenLexer(
26     const SourceManager &SourceMgr, FileID ID, unsigned Column,
27     const FormatStyle &Style, encoding::Encoding Encoding,
28     llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29     IdentifierTable &IdentTable)
30     : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31       Column(Column), TrailingWhitespace(0),
32       LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
33       Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
34       Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
35       FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
36       MacroBlockEndRegex(Style.MacroBlockEnd) {
37   Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
38   Lex->SetKeepWhitespaceMode(true);
39 
40   for (const std::string &ForEachMacro : Style.ForEachMacros) {
41     auto Identifier = &IdentTable.get(ForEachMacro);
42     Macros.insert({Identifier, TT_ForEachMacro});
43   }
44   for (const std::string &IfMacro : Style.IfMacros) {
45     auto Identifier = &IdentTable.get(IfMacro);
46     Macros.insert({Identifier, TT_IfMacro});
47   }
48   for (const std::string &AttributeMacro : Style.AttributeMacros) {
49     auto Identifier = &IdentTable.get(AttributeMacro);
50     Macros.insert({Identifier, TT_AttributeMacro});
51   }
52   for (const std::string &StatementMacro : Style.StatementMacros) {
53     auto Identifier = &IdentTable.get(StatementMacro);
54     Macros.insert({Identifier, TT_StatementMacro});
55   }
56   for (const std::string &TypenameMacro : Style.TypenameMacros) {
57     auto Identifier = &IdentTable.get(TypenameMacro);
58     Macros.insert({Identifier, TT_TypenameMacro});
59   }
60   for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61     auto Identifier = &IdentTable.get(NamespaceMacro);
62     Macros.insert({Identifier, TT_NamespaceMacro});
63   }
64   for (const std::string &WhitespaceSensitiveMacro :
65        Style.WhitespaceSensitiveMacros) {
66     auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67     Macros.insert({Identifier, TT_UntouchableMacroFunc});
68   }
69   for (const std::string &StatementAttributeLikeMacro :
70        Style.StatementAttributeLikeMacros) {
71     auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72     Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73   }
74 
75   for (const auto &TemplateName : Style.TemplateNames)
76     TemplateNames.insert(&IdentTable.get(TemplateName));
77   for (const auto &TypeName : Style.TypeNames)
78     TypeNames.insert(&IdentTable.get(TypeName));
79 }
80 
/// Lexes the whole buffer into a stream of FormatTokens.
///
/// After each raw token, language-specific hooks get a chance to re-lex or
/// merge tokens before the next one is fetched; the order of these hooks is
/// significant (see the C# comment below).
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
  assert(Tokens.empty());
  assert(FirstInLineIndex == 0);
  do {
    Tokens.push_back(getNextToken());
    if (Style.isJavaScript()) {
      tryParseJSRegexLiteral();
      handleTemplateStrings();
    }
    if (Style.Language == FormatStyle::LK_TextProto)
      tryParsePythonComment();
    tryMergePreviousTokens();
    if (Style.isCSharp()) {
      // This needs to come after tokens have been merged so that C#
      // string literals are correctly identified.
      handleCSharpVerbatimAndInterpolatedStrings();
    }
    if (Style.isTableGen()) {
      handleTableGenMultilineString();
      handleTableGenNumericLikeIdentifier();
    }
    // Remember the first token of the current line; a multiline token also
    // starts a new "line" for this bookkeeping.
    if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
      FirstInLineIndex = Tokens.size() - 1;
  } while (Tokens.back()->isNot(tok::eof));
  if (Style.InsertNewlineAtEOF) {
    auto &TokEOF = *Tokens.back();
    // Force a trailing newline by making the EOF token begin its own line.
    if (TokEOF.NewlinesBefore == 0) {
      TokEOF.NewlinesBefore = 1;
      TokEOF.OriginalColumn = 0;
    }
  }
  return Tokens;
}
114 
/// Inspects the most recently lexed tokens and merges multi-token constructs
/// (e.g. "<<", "??=", Verilog and TableGen operators) into a single
/// FormatToken where the languages require it. Called once per lexed token;
/// each successful merge returns early so at most one merge happens per call.
void FormatTokenLexer::tryMergePreviousTokens() {
  if (tryMerge_TMacro())
    return;
  if (tryMergeConflictMarkers())
    return;
  if (tryMergeLessLess())
    return;
  if (tryMergeGreaterGreater())
    return;
  if (tryMergeForEach())
    return;
  if (Style.isCpp() && tryTransformTryUsageForC())
    return;

  // Operators shared by JavaScript and C#.
  if (Style.isJavaScript() || Style.isCSharp()) {
    static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
                                                               tok::question};
    static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
                                                             tok::period};
    static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};

    if (tryMergeTokens(FatArrow, TT_FatArrow))
      return;
    if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
      // Treat like the "||" operator (as opposed to the ternary ?).
      Tokens.back()->Tok.setKind(tok::pipepipe);
      return;
    }
    if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
      // Treat like a regular "." access.
      Tokens.back()->Tok.setKind(tok::period);
      return;
    }
    if (tryMergeNullishCoalescingEqual())
      return;
  }

  if (Style.isCSharp()) {
    static const tok::TokenKind CSharpNullConditionalLSquare[] = {
        tok::question, tok::l_square};

    if (tryMergeCSharpKeywordVariables())
      return;
    if (tryMergeCSharpStringLiteral())
      return;
    if (tryTransformCSharpForEach())
      return;
    if (tryMergeTokens(CSharpNullConditionalLSquare,
                       TT_CSharpNullConditionalLSquare)) {
      // Treat like a regular "[" operator.
      Tokens.back()->Tok.setKind(tok::l_square);
      return;
    }
  }

  if (tryMergeNSStringLiteral())
    return;

  if (Style.isJavaScript()) {
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
                                                   tok::equal};
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
                                                  tok::greaterequal};
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
                                                           tok::starequal};
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};

    // FIXME: Investigate what token type gives the correct operator priority.
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
      return;
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
      Tokens.back()->Tok.setKind(tok::starequal);
      return;
    }
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
        tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
      // Treat like the "=" assignment operator.
      Tokens.back()->Tok.setKind(tok::equal);
      return;
    }
    if (tryMergeJSPrivateIdentifier())
      return;
  }

  if (Style.Language == FormatStyle::LK_Java) {
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
        tok::greater, tok::greater, tok::greaterequal};
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
      return;
  }

  if (Style.isVerilog()) {
    // Merge the number following a base like `'h?a0`.
    if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
        Tokens.end()[-2]->is(tok::numeric_constant) &&
        Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
                               tok::question) &&
        tryMergeTokens(2, TT_Unknown)) {
      return;
    }
    // Part select.
    if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
                          TT_BitFieldColon)) {
      return;
    }
    // Xnor. The combined token is treated as a caret which can also be either a
    // unary or binary operator. The actual type is determined in
    // TokenAnnotator. We also check the token length so we know it is not
    // already a merged token.
    if (Tokens.back()->TokenText.size() == 1 &&
        tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
                          TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::caret);
      return;
    }
    // Signed shift and distribution weight.
    if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::lessless);
      return;
    }
    if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::greatergreater);
      return;
    }
    if (tryMergeTokensAny({{tok::lessless, tok::equal},
                           {tok::lessless, tok::lessequal},
                           {tok::greatergreater, tok::equal},
                           {tok::greatergreater, tok::greaterequal},
                           {tok::colon, tok::equal},
                           {tok::colon, tok::slash}},
                          TT_BinaryOperator)) {
      Tokens.back()->ForcedPrecedence = prec::Assignment;
      return;
    }
    // Exponentiation, signed shift, case equality, and wildcard equality.
    if (tryMergeTokensAny({{tok::star, tok::star},
                           {tok::lessless, tok::less},
                           {tok::greatergreater, tok::greater},
                           {tok::exclaimequal, tok::equal},
                           {tok::exclaimequal, tok::question},
                           {tok::equalequal, tok::equal},
                           {tok::equalequal, tok::question}},
                          TT_BinaryOperator)) {
      return;
    }
    // Module paths in specify blocks and the implication and boolean equality
    // operators.
    if (tryMergeTokensAny({{tok::plusequal, tok::greater},
                           {tok::plus, tok::star, tok::greater},
                           {tok::minusequal, tok::greater},
                           {tok::minus, tok::star, tok::greater},
                           {tok::less, tok::arrow},
                           {tok::equal, tok::greater},
                           {tok::star, tok::greater},
                           {tok::pipeequal, tok::greater},
                           {tok::pipe, tok::arrow},
                           {tok::hash, tok::minus, tok::hash},
                           {tok::hash, tok::equal, tok::hash}},
                          TT_BinaryOperator) ||
        Tokens.back()->is(tok::arrow)) {
      Tokens.back()->ForcedPrecedence = prec::Comma;
      return;
    }
  }
  if (Style.isTableGen()) {
    // TableGen's Multi line string starts with [{
    if (tryMergeTokens({tok::l_square, tok::l_brace},
                       TT_TableGenMultiLineString)) {
      // Set again with finalizing. This must never be annotated as other types.
      Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
      Tokens.back()->Tok.setKind(tok::string_literal);
      return;
    }
    // TableGen's bang operator is the form !<name>.
    // !cond is a special case with specific syntax.
    if (tryMergeTokens({tok::exclaim, tok::identifier},
                       TT_TableGenBangOperator)) {
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      if (Tokens.back()->TokenText == "!cond")
        Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
      else
        Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
      // Here, "! if" becomes "!if".  That is, ! captures if even when the space
      // exists. That is only one possibility in TableGen's syntax.
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    // +, - with numbers are literals. Not unary operators.
    if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
    if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
  }
}
328 
329 bool FormatTokenLexer::tryMergeNSStringLiteral() {
330   if (Tokens.size() < 2)
331     return false;
332   auto &At = *(Tokens.end() - 2);
333   auto &String = *(Tokens.end() - 1);
334   if (At->isNot(tok::at) || String->isNot(tok::string_literal))
335     return false;
336   At->Tok.setKind(tok::string_literal);
337   At->TokenText = StringRef(At->TokenText.begin(),
338                             String->TokenText.end() - At->TokenText.begin());
339   At->ColumnWidth += String->ColumnWidth;
340   At->setType(TT_ObjCStringLiteral);
341   Tokens.erase(Tokens.end() - 1);
342   return true;
343 }
344 
345 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
346   // Merges #idenfier into a single identifier with the text #identifier
347   // but the token tok::identifier.
348   if (Tokens.size() < 2)
349     return false;
350   auto &Hash = *(Tokens.end() - 2);
351   auto &Identifier = *(Tokens.end() - 1);
352   if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
353     return false;
354   Hash->Tok.setKind(tok::identifier);
355   Hash->TokenText =
356       StringRef(Hash->TokenText.begin(),
357                 Identifier->TokenText.end() - Hash->TokenText.begin());
358   Hash->ColumnWidth += Identifier->ColumnWidth;
359   Hash->setType(TT_JsPrivateIdentifier);
360   Tokens.erase(Tokens.end() - 1);
361   return true;
362 }
363 
// Searches for verbatim or interpolated string literals such as @"ABC" or
// $"aaaaa{abc}aaaaa" and marks the merged token as TT_CSharpStringLiteral,
// preventing the @, $ and " from being split apart.
// Merging of multiline verbatim strings with embedded '"' is handled in
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
  if (Tokens.size() < 2)
    return false;

  // Look for @"aaaaaa" or $"aaaaaa".
  const auto String = *(Tokens.end() - 1);
  if (String->isNot(tok::string_literal))
    return false;

  auto Prefix = *(Tokens.end() - 2);
  if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
    return false;

  if (Tokens.size() > 2) {
    const auto Tok = *(Tokens.end() - 3);
    if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
        (Tok->is(tok::at) && Prefix->TokenText == "$")) {
      // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
      Tok->ColumnWidth += Prefix->ColumnWidth;
      Tokens.erase(Tokens.end() - 2);
      Prefix = Tok;
    }
  }

  // Convert back into just a string_literal.
  Prefix->Tok.setKind(tok::string_literal);
  Prefix->TokenText =
      StringRef(Prefix->TokenText.begin(),
                String->TokenText.end() - Prefix->TokenText.begin());
  Prefix->ColumnWidth += String->ColumnWidth;
  Prefix->setType(TT_CSharpStringLiteral);
  Tokens.erase(Tokens.end() - 1);
  return true;
}
403 
// Valid C# attribute targets:
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
// NOTE(review): presumably consulted when classifying "[target: ...]"
// attribute sections — confirm at the use site.
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
    "assembly", "module",   "field",  "event", "method",
    "param",    "property", "return", "type",
};
410 
411 bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
412   if (Tokens.size() < 2)
413     return false;
414   auto &NullishCoalescing = *(Tokens.end() - 2);
415   auto &Equal = *(Tokens.end() - 1);
416   if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
417       Equal->isNot(tok::equal)) {
418     return false;
419   }
420   NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
421   NullishCoalescing->TokenText =
422       StringRef(NullishCoalescing->TokenText.begin(),
423                 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
424   NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
425   NullishCoalescing->setType(TT_NullCoalescingEqual);
426   Tokens.erase(Tokens.end() - 1);
427   return true;
428 }
429 
430 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
431   if (Tokens.size() < 2)
432     return false;
433   const auto At = *(Tokens.end() - 2);
434   if (At->isNot(tok::at))
435     return false;
436   const auto Keyword = *(Tokens.end() - 1);
437   if (Keyword->TokenText == "$")
438     return false;
439   if (!Keywords.isCSharpKeyword(*Keyword))
440     return false;
441 
442   At->Tok.setKind(tok::identifier);
443   At->TokenText = StringRef(At->TokenText.begin(),
444                             Keyword->TokenText.end() - At->TokenText.begin());
445   At->ColumnWidth += Keyword->ColumnWidth;
446   At->setType(Keyword->getType());
447   Tokens.erase(Tokens.end() - 1);
448   return true;
449 }
450 
451 // In C# transform identifier foreach into kw_foreach
452 bool FormatTokenLexer::tryTransformCSharpForEach() {
453   if (Tokens.size() < 1)
454     return false;
455   auto &Identifier = *(Tokens.end() - 1);
456   if (Identifier->isNot(tok::identifier))
457     return false;
458   if (Identifier->TokenText != "foreach")
459     return false;
460 
461   Identifier->setType(TT_ForEachMacro);
462   Identifier->Tok.setKind(tok::kw_for);
463   return true;
464 }
465 
466 bool FormatTokenLexer::tryMergeForEach() {
467   if (Tokens.size() < 2)
468     return false;
469   auto &For = *(Tokens.end() - 2);
470   auto &Each = *(Tokens.end() - 1);
471   if (For->isNot(tok::kw_for))
472     return false;
473   if (Each->isNot(tok::identifier))
474     return false;
475   if (Each->TokenText != "each")
476     return false;
477 
478   For->setType(TT_ForEachMacro);
479   For->Tok.setKind(tok::kw_for);
480 
481   For->TokenText = StringRef(For->TokenText.begin(),
482                              Each->TokenText.end() - For->TokenText.begin());
483   For->ColumnWidth += Each->ColumnWidth;
484   Tokens.erase(Tokens.end() - 1);
485   return true;
486 }
487 
488 bool FormatTokenLexer::tryTransformTryUsageForC() {
489   if (Tokens.size() < 2)
490     return false;
491   auto &Try = *(Tokens.end() - 2);
492   if (Try->isNot(tok::kw_try))
493     return false;
494   auto &Next = *(Tokens.end() - 1);
495   if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
496     return false;
497 
498   if (Tokens.size() > 2) {
499     auto &At = *(Tokens.end() - 3);
500     if (At->is(tok::at))
501       return false;
502   }
503 
504   Try->Tok.setKind(tok::identifier);
505   return true;
506 }
507 
bool FormatTokenLexer::tryMergeLessLess() {
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
  if (Tokens.size() < 3)
    return false;

  auto First = Tokens.end() - 3;
  if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
    return false;

  // Only merge if there currently is no whitespace between the two "<".
  if (First[1]->hasWhitespaceBefore())
    return false;

  // X is the token preceding the "<<" candidate, if any. "< <<" must not
  // become "<< <".
  auto X = Tokens.size() > 3 ? First[-1] : nullptr;
  if (X && X->is(tok::less))
    return false;

  // Do not merge when another "<" follows (e.g. "<<<") — except after
  // "operator", where "operator<<" is clearly intended.
  auto Y = First[2];
  if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
    return false;

  First[0]->Tok.setKind(tok::lessless);
  First[0]->TokenText = "<<";
  First[0]->ColumnWidth += 1;
  // Drop the second "<"; Y is kept as the following token.
  Tokens.erase(Tokens.end() - 2);
  return true;
}
535 
bool FormatTokenLexer::tryMergeGreaterGreater() {
  // Merge kw_operator,greater,greater into kw_operator,greatergreater.
  if (Tokens.size() < 2)
    return false;

  auto First = Tokens.end() - 2;
  if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
    return false;

  // Only merge if there currently is no whitespace between the first two ">".
  if (First[1]->hasWhitespaceBefore())
    return false;

  // Unlike "<<", ">>" is only merged directly after "operator" — elsewhere
  // two adjacent ">" may close nested template argument lists.
  auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
  if (Tok && Tok->isNot(tok::kw_operator))
    return false;

  First[0]->Tok.setKind(tok::greatergreater);
  First[0]->TokenText = ">>";
  First[0]->ColumnWidth += 1;
  Tokens.erase(Tokens.end() - 1);
  return true;
}
559 
560 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
561                                       TokenType NewType) {
562   if (Tokens.size() < Kinds.size())
563     return false;
564 
565   SmallVectorImpl<FormatToken *>::const_iterator First =
566       Tokens.end() - Kinds.size();
567   for (unsigned i = 0; i < Kinds.size(); ++i)
568     if (First[i]->isNot(Kinds[i]))
569       return false;
570 
571   return tryMergeTokens(Kinds.size(), NewType);
572 }
573 
// Merges the last Count tokens into a single token of type NewType, provided
// no whitespace separates them (they must form one contiguous source range).
bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
  if (Tokens.size() < Count)
    return false;

  SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
  unsigned AddLength = 0;
  for (size_t i = 1; i < Count; ++i) {
    // If there is whitespace separating the token and the previous one,
    // they should not be merged.
    if (First[i]->hasWhitespaceBefore())
      return false;
    AddLength += First[i]->TokenText.size();
  }

  // Drop all but the first token. Shrinking never reallocates, so First stays
  // valid; the kept token's text is extended over the dropped (adjacent)
  // tokens.
  Tokens.resize(Tokens.size() - Count + 1);
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
                                  First[0]->TokenText.size() + AddLength);
  First[0]->ColumnWidth += AddLength;
  First[0]->setType(NewType);
  return true;
}
595 
596 bool FormatTokenLexer::tryMergeTokensAny(
597     ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
598   return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
599     return tryMergeTokens(Kinds, NewType);
600   });
601 }
602 
603 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
604 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
605   // NB: This is not entirely correct, as an r_paren can introduce an operand
606   // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
607   // corner case to not matter in practice, though.
608   return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
609                       tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
610                       tok::colon, tok::question, tok::tilde) ||
611          Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
612                       tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
613                       tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
614          Tok->isBinaryOperator();
615 }
616 
617 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
618   if (!Prev)
619     return true;
620 
621   // Regex literals can only follow after prefix unary operators, not after
622   // postfix unary operators. If the '++' is followed by a non-operand
623   // introducing token, the slash here is the operand and not the start of a
624   // regex.
625   // `!` is an unary prefix operator, but also a post-fix operator that casts
626   // away nullability, so the same check applies.
627   if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
628     return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
629 
630   // The previous token must introduce an operand location where regex
631   // literals can occur.
632   if (!precedesOperand(Prev))
633     return false;
634 
635   return true;
636 }
637 
// Tries to parse a JavaScript Regex literal starting at the current token,
// if that begins with a slash and is in a location where JavaScript allows
// regex literals. Changes the current token to a regex literal and updates
// its text if successful.
void FormatTokenLexer::tryParseJSRegexLiteral() {
  FormatToken *RegexToken = Tokens.back();
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
    return;

  // Find the last non-comment token lexed so far.
  FormatToken *Prev = nullptr;
  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
    // NB: Because previous pointers are not initialized yet, this cannot use
    // Token.getPreviousNonComment.
    if (FT->isNot(tok::comment)) {
      Prev = FT;
      break;
    }
  }

  if (!canPrecedeRegexLiteral(Prev))
    return;

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
  StringRef Buffer = Lex->getBuffer();
  bool InCharacterClass = false;
  bool HaveClosingSlash = false;
  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
    // Regular expressions are terminated with a '/', which can only be
    // escaped using '\' or a character class between '[' and ']'.
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
    switch (*Offset) {
    case '\\':
      // Skip the escaped character.
      ++Offset;
      break;
    case '[':
      InCharacterClass = true;
      break;
    case ']':
      InCharacterClass = false;
      break;
    case '/':
      if (!InCharacterClass)
        HaveClosingSlash = true;
      break;
    }
  }

  RegexToken->setType(TT_RegexLiteral);
  // Treat regex literals like other string_literals.
  RegexToken->Tok.setKind(tok::string_literal);
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
  RegexToken->ColumnWidth = RegexToken->TokenText.size();

  // Resume normal lexing right after the regex literal.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
}
696 
// Scans [Begin, End) for the '"' that terminates a C# string literal whose
// opening quote has already been consumed. Returns a pointer to that closing
// quote, or End if the literal is unterminated.
//
// No attempt is made to format code inside an interpolated or verbatim
// string: an interpolation hole such as $"{x ?? "null"}" may itself contain
// '"', so the whole literal is treated as one opaque token. Formatting the
// embedded expressions would require machinery like the JavaScript template
// string handling in `handleTemplateStrings()`.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True when the current character is immediately doubled ("" or {{ or }}).
  auto IsDoubled = [&Begin, End] {
    return Begin + 1 < End && Begin[1] == Begin[0];
  };

  int OpenBraces = 0; // Unmatched '{' of interpolation holes.
  while (Begin < End) {
    const char C = *Begin;
    if (C == '\\') {
      // Backslash escapes only exist outside verbatim strings.
      if (!Verbatim)
        ++Begin;
    } else if (C == '{') {
      if (Interpolated) {
        if (IsDoubled())
          ++Begin; // "{{" is an escaped brace.
        else
          ++OpenBraces;
      }
    } else if (C == '}') {
      if (Interpolated) {
        if (IsDoubled())
          ++Begin; // "}}" is an escaped brace.
        else if (OpenBraces > 0)
          --OpenBraces;
        else
          return End; // Stray '}': give up, treat as unterminated.
      }
    } else if (C == '"') {
      if (OpenBraces == 0) {
        if (Verbatim && IsDoubled())
          ++Begin; // "" inside a verbatim string is an escaped quote.
        else
          return Begin; // Found the terminating quote.
      }
      // Quotes inside an interpolation hole are skipped entirely.
    }
    ++Begin;
  }

  return End;
}
755 
// Re-lexes a C# verbatim/interpolated string literal at a lower level so the
// whole literal (possibly spanning multiple lines and containing escaped
// quotes or interpolation holes) becomes one token.
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
  FormatToken *CSharpStringLiteral = Tokens.back();

  if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
    return;

  auto &TokenText = CSharpStringLiteral->TokenText;

  // Classify the literal by its prefix: @" (verbatim), $" (interpolated) or
  // the combined $@" / @$".
  bool Verbatim = false;
  bool Interpolated = false;
  if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
    Verbatim = true;
    Interpolated = true;
  } else if (TokenText.starts_with(R"(@")")) {
    Verbatim = true;
  } else if (TokenText.starts_with(R"($")")) {
    Interpolated = true;
  }

  // Deal with multiline strings.
  if (!Verbatim && !Interpolated)
    return;

  // Skip the prefix and the opening quote, then scan for the real terminator.
  const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
  const char *Offset = StrBegin;
  if (Verbatim && Interpolated)
    Offset += 3;
  else
    Offset += 2;

  const auto End = Lex->getBuffer().end();
  Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);

  // Make no attempt to format code properly if a verbatim string is
  // unterminated.
  if (Offset >= End)
    return;

  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
  TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
      Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    CSharpStringLiteral->IsMultiline = true;
    unsigned StartColumn = 0;
    // The last line starts at column 0, after the final newline.
    CSharpStringLiteral->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  // Resume lexing just past the closing quote.
  assert(Offset < End);
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
}
817 
// Extends the current TT_TableGenMultiLineString token to cover an entire
// TableGen code-block literal, which is delimited by "[{" and "}]".
void FormatTokenLexer::handleTableGenMultilineString() {
  FormatToken *MultiLineString = Tokens.back();
  if (MultiLineString->isNot(TT_TableGenMultiLineString))
    return;

  // The lexer has already consumed the "[{" opener; back up to include it.
  auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
  // "}]" is the end of multi line string.
  auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
  // Leave the token untouched if the literal is unterminated.
  if (CloseOffset == StringRef::npos)
    return;
  // The token text spans from "[{" through "}]" inclusive.
  auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
  MultiLineString->TokenText = Text;
  // Resume normal lexing directly after the closing "}]".
  resetLexer(SourceMgr.getFileOffset(
      Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
  auto FirstLineText = Text;
  auto FirstBreak = Text.find('\n');
  // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
  if (FirstBreak != StringRef::npos) {
    MultiLineString->IsMultiline = true;
    FirstLineText = Text.substr(0, FirstBreak + 1);
    // LastLineColumnWidth holds the width of the last line.
    auto LastBreak = Text.rfind('\n');
    MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
        Style.TabWidth, Encoding);
  }
  // ColumnWidth holds only the width of the first line.
  MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
}
848 
849 void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
850   FormatToken *Tok = Tokens.back();
851   // TableGen identifiers can begin with digits. Such tokens are lexed as
852   // numeric_constant now.
853   if (Tok->isNot(tok::numeric_constant))
854     return;
855   StringRef Text = Tok->TokenText;
856   // The following check is based on llvm::TGLexer::LexToken.
857   // That lexes the token as a number if any of the following holds:
858   // 1. It starts with '+', '-'.
859   // 2. All the characters are digits.
860   // 3. The first non-digit character is 'b', and the next is '0' or '1'.
861   // 4. The first non-digit character is 'x', and the next is a hex digit.
862   // Note that in the case 3 and 4, if the next character does not exists in
863   // this token, the token is an identifier.
864   if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
865     return;
866   const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
867   // All the characters are digits
868   if (NonDigitPos == StringRef::npos)
869     return;
870   char FirstNonDigit = Text[NonDigitPos];
871   if (NonDigitPos < Text.size() - 1) {
872     char TheNext = Text[NonDigitPos + 1];
873     // Regarded as a binary number.
874     if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
875       return;
876     // Regarded as hex number.
877     if (FirstNonDigit == 'x' && isxdigit(TheNext))
878       return;
879   }
880   if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
881     // This is actually an identifier in TableGen.
882     Tok->Tok.setKind(tok::identifier);
883     Tok->Tok.setIdentifierInfo(nullptr);
884   }
885 }
886 
// Handles JavaScript template strings (`...${expr}...`). Tracks nesting of
// template strings and their embedded expressions via StateStack, and merges
// each (possibly multiline) template segment into a single string token.
void FormatTokenLexer::handleTemplateStrings() {
  FormatToken *BacktickToken = Tokens.back();

  if (BacktickToken->is(tok::l_brace)) {
    // An opening brace inside an interpolated expression: ordinary nesting.
    StateStack.push(LexerState::NORMAL);
    return;
  }
  if (BacktickToken->is(tok::r_brace)) {
    if (StateStack.size() == 1)
      return;
    StateStack.pop();
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
      return;
    // If back in TEMPLATE_STRING, fall through and continue lexing the
    // remainder of the template string below.
  } else if (BacktickToken->is(tok::unknown) &&
             BacktickToken->TokenText == "`") {
    StateStack.push(LexerState::TEMPLATE_STRING);
  } else {
    return; // Not actually a template
  }

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
  for (; Offset != Lex->getBuffer().end(); ++Offset) {
    if (Offset[0] == '`') {
      // Closing backtick: the template string ends here.
      StateStack.pop();
      ++Offset;
      break;
    }
    if (Offset[0] == '\\') {
      ++Offset; // Skip the escaped character.
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
               Offset[1] == '{') {
      // '${' introduces an expression interpolation in the template string.
      StateStack.push(LexerState::NORMAL);
      Offset += 2;
      break;
    }
  }

  // Re-type the token to cover the whole lexed segment as one string literal.
  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
  BacktickToken->setType(TT_TemplateString);
  BacktickToken->Tok.setKind(tok::string_literal);
  BacktickToken->TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    BacktickToken->IsMultiline = true;
    unsigned StartColumn = 0; // The template tail spans the entire line.
    BacktickToken->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  // Resume the normal lexer behind the lexed segment.
  SourceLocation loc = Lex->getSourceLocation(Offset);
  resetLexer(SourceMgr.getFileOffset(loc));
}
952 
953 void FormatTokenLexer::tryParsePythonComment() {
954   FormatToken *HashToken = Tokens.back();
955   if (!HashToken->isOneOf(tok::hash, tok::hashhash))
956     return;
957   // Turn the remainder of this line into a comment.
958   const char *CommentBegin =
959       Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
960   size_t From = CommentBegin - Lex->getBuffer().begin();
961   size_t To = Lex->getBuffer().find_first_of('\n', From);
962   if (To == StringRef::npos)
963     To = Lex->getBuffer().size();
964   size_t Len = To - From;
965   HashToken->setType(TT_LineComment);
966   HashToken->Tok.setKind(tok::comment);
967   HashToken->TokenText = Lex->getBuffer().substr(From, Len);
968   SourceLocation Loc = To < Lex->getBuffer().size()
969                            ? Lex->getSourceLocation(CommentBegin + Len)
970                            : SourceMgr.getLocForEndOfFile(ID);
971   resetLexer(SourceMgr.getFileOffset(Loc));
972 }
973 
974 bool FormatTokenLexer::tryMerge_TMacro() {
975   if (Tokens.size() < 4)
976     return false;
977   FormatToken *Last = Tokens.back();
978   if (Last->isNot(tok::r_paren))
979     return false;
980 
981   FormatToken *String = Tokens[Tokens.size() - 2];
982   if (String->isNot(tok::string_literal) || String->IsMultiline)
983     return false;
984 
985   if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
986     return false;
987 
988   FormatToken *Macro = Tokens[Tokens.size() - 4];
989   if (Macro->TokenText != "_T")
990     return false;
991 
992   const char *Start = Macro->TokenText.data();
993   const char *End = Last->TokenText.data() + Last->TokenText.size();
994   String->TokenText = StringRef(Start, End - Start);
995   String->IsFirst = Macro->IsFirst;
996   String->LastNewlineOffset = Macro->LastNewlineOffset;
997   String->WhitespaceRange = Macro->WhitespaceRange;
998   String->OriginalColumn = Macro->OriginalColumn;
999   String->ColumnWidth = encoding::columnWidthWithTabs(
1000       String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
1001   String->NewlinesBefore = Macro->NewlinesBefore;
1002   String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1003 
1004   Tokens.pop_back();
1005   Tokens.pop_back();
1006   Tokens.pop_back();
1007   Tokens.back() = String;
1008   if (FirstInLineIndex >= Tokens.size())
1009     FirstInLineIndex = Tokens.size() - 1;
1010   return true;
1011 }
1012 
1013 bool FormatTokenLexer::tryMergeConflictMarkers() {
1014   if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1015     return false;
1016 
1017   // Conflict lines look like:
1018   // <marker> <text from the vcs>
1019   // For example:
1020   // >>>>>>> /file/in/file/system at revision 1234
1021   //
1022   // We merge all tokens in a line that starts with a conflict marker
1023   // into a single token with a special token type that the unwrapped line
1024   // parser will use to correctly rebuild the underlying code.
1025 
1026   FileID ID;
1027   // Get the position of the first token in the line.
1028   unsigned FirstInLineOffset;
1029   std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1030       Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1031   StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1032   // Calculate the offset of the start of the current line.
1033   auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1034   if (LineOffset == StringRef::npos)
1035     LineOffset = 0;
1036   else
1037     ++LineOffset;
1038 
1039   auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1040   StringRef LineStart;
1041   if (FirstSpace == StringRef::npos)
1042     LineStart = Buffer.substr(LineOffset);
1043   else
1044     LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1045 
1046   TokenType Type = TT_Unknown;
1047   if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1048     Type = TT_ConflictStart;
1049   } else if (LineStart == "|||||||" || LineStart == "=======" ||
1050              LineStart == "====") {
1051     Type = TT_ConflictAlternative;
1052   } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1053     Type = TT_ConflictEnd;
1054   }
1055 
1056   if (Type != TT_Unknown) {
1057     FormatToken *Next = Tokens.back();
1058 
1059     Tokens.resize(FirstInLineIndex + 1);
1060     // We do not need to build a complete token here, as we will skip it
1061     // during parsing anyway (as we must not touch whitespace around conflict
1062     // markers).
1063     Tokens.back()->setType(Type);
1064     Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1065 
1066     Tokens.push_back(Next);
1067     return true;
1068   }
1069 
1070   return false;
1071 }
1072 
1073 FormatToken *FormatTokenLexer::getStashedToken() {
1074   // Create a synthesized second '>' or '<' token.
1075   Token Tok = FormatTok->Tok;
1076   StringRef TokenText = FormatTok->TokenText;
1077 
1078   unsigned OriginalColumn = FormatTok->OriginalColumn;
1079   FormatTok = new (Allocator.Allocate()) FormatToken;
1080   FormatTok->Tok = Tok;
1081   SourceLocation TokLocation =
1082       FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1083   FormatTok->Tok.setLocation(TokLocation);
1084   FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1085   FormatTok->TokenText = TokenText;
1086   FormatTok->ColumnWidth = 1;
1087   FormatTok->OriginalColumn = OriginalColumn + 1;
1088 
1089   return FormatTok;
1090 }
1091 
1092 /// Truncate the current token to the new length and make the lexer continue
1093 /// from the end of the truncated token. Used for other languages that have
1094 /// different token boundaries, like JavaScript in which a comment ends at a
1095 /// line break regardless of whether the line break follows a backslash. Also
1096 /// used to set the lexer to the end of whitespace if the lexer regards
1097 /// whitespace and an unrecognized symbol as one token.
1098 void FormatTokenLexer::truncateToken(size_t NewLen) {
1099   assert(NewLen <= FormatTok->TokenText.size());
1100   resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1101       Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1102   FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1103   FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1104       FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1105       Encoding);
1106   FormatTok->Tok.setLength(NewLen);
1107 }
1108 
1109 /// Count the length of leading whitespace in a token.
1110 static size_t countLeadingWhitespace(StringRef Text) {
1111   // Basically counting the length matched by this regex.
1112   // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
1113   // Directly using the regex turned out to be slow. With the regex
1114   // version formatting all files in this directory took about 1.25
1115   // seconds. This version took about 0.5 seconds.
1116   const unsigned char *const Begin = Text.bytes_begin();
1117   const unsigned char *const End = Text.bytes_end();
1118   const unsigned char *Cur = Begin;
1119   while (Cur < End) {
1120     if (isspace(Cur[0])) {
1121       ++Cur;
1122     } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
1123       // A '\' followed by a newline always escapes the newline, regardless
1124       // of whether there is another '\' before it.
1125       // The source has a null byte at the end. So the end of the entire input
1126       // isn't reached yet. Also the lexer doesn't break apart an escaped
1127       // newline.
1128       assert(End - Cur >= 2);
1129       Cur += 2;
1130     } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
1131                (Cur[3] == '\n' || Cur[3] == '\r')) {
1132       // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1133       // characters are quoted individually in this comment because if we write
1134       // them together some compilers warn that we have a trigraph in the code.
1135       assert(End - Cur >= 4);
1136       Cur += 4;
1137     } else {
1138       break;
1139     }
1140   }
1141   return Cur - Begin;
1142 }
1143 
// Lexes and returns the next significant token: folds leading whitespace
// runs into the token's metadata (NewlinesBefore, WhitespaceRange, Column),
// applies language-specific token truncation/retyping (JavaScript/Java
// comments, Verilog, TableGen handled elsewhere), splits '>>'/'<<', and
// computes the token's column widths.
FormatToken *FormatTokenLexer::getNextToken() {
  // A '>>' or '<<' was split earlier; emit the stashed second half now.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);
    StringRef Text = FormatTok->TokenText;
    bool InEscape = false;
    // Walk the whitespace character by character to keep Column and the
    // newline bookkeeping accurate.
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (i + 1 < e && Text[i + 1] == '\n')
          break;
        [[fallthrough]];
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (!InEscape)
          FormatTok->HasUnescapedNewline = true;
        else
          InEscape = false;
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop.
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
      case '?':
      case '/':
        // The text was entirely whitespace when this loop was entered. Thus
        // this has to be an escape sequence.
        assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
               Text.substr(i, 4) == "\?\?/\r" ||
               Text.substr(i, 4) == "\?\?/\n" ||
               (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
                           Text.substr(i - 1, 4) == "\?\?/\n")) ||
               (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
                           Text.substr(i - 2, 4) == "\?\?/\n")));
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Text.size();
    readRawToken(*FormatTok);
  }

  // Leftover unknown tokens are treated as implicit string literals so they
  // are passed through untouched.
  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
      FormatTok->is(tok::comment) && FormatTok->TokenText.starts_with("//")) {
    size_t BackslashPos = FormatTok->TokenText.find('\\');
    while (BackslashPos != StringRef::npos) {
      if (BackslashPos + 1 < FormatTok->TokenText.size() &&
          FormatTok->TokenText[BackslashPos + 1] == '\n') {
        truncateToken(BackslashPos + 1);
        break;
      }
      BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
    }
  }

  if (Style.isVerilog()) {
    static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
    SmallVector<StringRef, 1> Matches;
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here.  And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->is(tok::numeric_constant)) {
      // In Verilog the quote is not part of a number.
      auto Quote = FormatTok->TokenText.find('\'');
      if (Quote != StringRef::npos)
        truncateToken(Quote);
    } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      } else if (Tokens.size() > 0 &&
                 Tokens.back()->is(Keywords.kw_apostrophe) &&
                 NumberBase.match(FormatTok->TokenText, &Matches)) {
        // In Verilog in a based number literal like `'b10`, there may be
        // whitespace between `'b` and `10`. Therefore we handle the base and
        // the rest of the number literal as two tokens. But if there is no
        // space in the input code, we need to manually separate the two parts.
        truncateToken(Matches[0].size());
        FormatTok->setFinalizedType(TT_VerilogNumberBase);
      }
    }
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    // Resolve the identifier; per-language keyword adjustments follow.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    if (Style.Language == FormatStyle::LK_Java &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      // These C++ keywords are plain identifiers in Java.
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    } else if (Style.isJavaScript() &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      // These C++ keywords are plain identifiers in JavaScript.
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->is(tok::greatergreater)) {
    // Split '>>' into two '>' so template closers can be told apart from
    // shifts; the second half is stashed and emitted on the next call.
    FormatTok->Tok.setKind(tok::greater);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (FormatTok->is(tok::lessless)) {
    // Split '<<' symmetrically to '>>' above.
    FormatTok->Tok.setKind(tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  }

  if (Style.isVerilog() && Tokens.size() > 0 &&
      Tokens.back()->is(TT_VerilogNumberBase) &&
      FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
    // Mark the number following a base like `'h?a0` as a number.
    FormatTok->Tok.setKind(tok::numeric_constant);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (Style.isCpp()) {
    // Apply user-configured macro token types, unless this token is being
    // #define'd right now.
    auto *Identifier = FormatTok->Tok.getIdentifierInfo();
    auto it = Macros.find(Identifier);
    if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
          Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
              tok::pp_define) &&
        it != Macros.end()) {
      FormatTok->setType(it->second);
      if (it->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
      else if (TemplateNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TemplateName);
      else if (TypeNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TypeName);
    }
  }

  return FormatTok;
}
1382 
// Attempts to lex a Verilog-specific token (quote, backtick, double backtick,
// or an escaped identifier) at the current buffer position. Returns true and
// fills \p Tok if one was lexed; returns false so the caller falls back to
// the normal raw lexer.
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  // In Verilog the quote is not a character literal.
  //
  // Make the backtick and double backtick identifiers to match against them
  // more easily.
  //
  // In Verilog an escaped identifier starts with backslash and ends with
  // whitespace. Unless that whitespace is an escaped newline. A backslash can
  // also begin an escaped newline outside of an escaped identifier. We check
  // for that outside of the Regex since we can't use negative lookhead
  // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
  // identifier may have a length of 0 according to Section A.9.3.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together, But escaped
  // identifiers usually occur only in generated code anyway.
  static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
                                        "(\r?\n|\r)|[^[:space:]])*)");

  SmallVector<StringRef, 4> Matches;
  const char *Start = Lex->getBufferLocation();
  if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
                          &Matches)) {
    return false;
  }
  // A backslash directly followed by a newline is an escaped newline, not an
  // escaped identifier; let the normal lexer handle it.
  // There is a null byte at the end of the buffer, so we don't have to check
  // Start[1] is within the buffer.
  if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
    return false;
  size_t Len = Matches[0].size();

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  // Advance the raw lexer past the token we just produced.
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}
1423 
1424 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1425   // For Verilog, first see if there is a special token, and fall back to the
1426   // normal lexer if there isn't one.
1427   if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1428     Lex->LexFromRawLexer(Tok.Tok);
1429   Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1430                             Tok.Tok.getLength());
1431   // For formatting, treat unterminated string literals like normal string
1432   // literals.
1433   if (Tok.is(tok::unknown)) {
1434     if (Tok.TokenText.starts_with("\"")) {
1435       Tok.Tok.setKind(tok::string_literal);
1436       Tok.IsUnterminatedLiteral = true;
1437     } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1438       Tok.Tok.setKind(tok::string_literal);
1439     }
1440   }
1441 
1442   if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1443     Tok.Tok.setKind(tok::string_literal);
1444 
1445   if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1446     FormattingDisabled = false;
1447 
1448   Tok.Finalized = FormattingDisabled;
1449 
1450   if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1451     FormattingDisabled = true;
1452 }
1453 
1454 void FormatTokenLexer::resetLexer(unsigned Offset) {
1455   StringRef Buffer = SourceMgr.getBufferData(ID);
1456   Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1457                       Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1458   Lex->SetKeepWhitespaceMode(true);
1459   TrailingWhitespace = 0;
1460 }
1461 
1462 } // namespace format
1463 } // namespace clang
1464