xref: /openbsd-src/gnu/llvm/clang/lib/Format/FormatTokenLexer.cpp (revision 12c855180aad702bbcca06e0398d774beeafb155)
1e5dd7070Spatrick //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2e5dd7070Spatrick //
3e5dd7070Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e5dd7070Spatrick // See https://llvm.org/LICENSE.txt for license information.
5e5dd7070Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6e5dd7070Spatrick //
7e5dd7070Spatrick //===----------------------------------------------------------------------===//
8e5dd7070Spatrick ///
9e5dd7070Spatrick /// \file
10e5dd7070Spatrick /// This file implements FormatTokenLexer, which tokenizes a source file
11e5dd7070Spatrick /// into a FormatToken stream suitable for ClangFormat.
12e5dd7070Spatrick ///
13e5dd7070Spatrick //===----------------------------------------------------------------------===//
14e5dd7070Spatrick 
15e5dd7070Spatrick #include "FormatTokenLexer.h"
16e5dd7070Spatrick #include "FormatToken.h"
17e5dd7070Spatrick #include "clang/Basic/SourceLocation.h"
18e5dd7070Spatrick #include "clang/Basic/SourceManager.h"
19e5dd7070Spatrick #include "clang/Format/Format.h"
20e5dd7070Spatrick #include "llvm/Support/Regex.h"
21e5dd7070Spatrick 
22e5dd7070Spatrick namespace clang {
23e5dd7070Spatrick namespace format {
24e5dd7070Spatrick 
FormatTokenLexer(const SourceManager & SourceMgr,FileID ID,unsigned Column,const FormatStyle & Style,encoding::Encoding Encoding,llvm::SpecificBumpPtrAllocator<FormatToken> & Allocator,IdentifierTable & IdentTable)25ec727ea7Spatrick FormatTokenLexer::FormatTokenLexer(
26ec727ea7Spatrick     const SourceManager &SourceMgr, FileID ID, unsigned Column,
27ec727ea7Spatrick     const FormatStyle &Style, encoding::Encoding Encoding,
28ec727ea7Spatrick     llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29ec727ea7Spatrick     IdentifierTable &IdentTable)
30e5dd7070Spatrick     : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31*12c85518Srobert       Column(Column), TrailingWhitespace(0),
32*12c85518Srobert       LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
33ec727ea7Spatrick       Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
34ec727ea7Spatrick       Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
35e5dd7070Spatrick       FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
36e5dd7070Spatrick       MacroBlockEndRegex(Style.MacroBlockEnd) {
37*12c85518Srobert   Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
38e5dd7070Spatrick   Lex->SetKeepWhitespaceMode(true);
39e5dd7070Spatrick 
40*12c85518Srobert   for (const std::string &ForEachMacro : Style.ForEachMacros) {
41*12c85518Srobert     auto Identifier = &IdentTable.get(ForEachMacro);
42*12c85518Srobert     Macros.insert({Identifier, TT_ForEachMacro});
43*12c85518Srobert   }
44*12c85518Srobert   for (const std::string &IfMacro : Style.IfMacros) {
45*12c85518Srobert     auto Identifier = &IdentTable.get(IfMacro);
46*12c85518Srobert     Macros.insert({Identifier, TT_IfMacro});
47*12c85518Srobert   }
48*12c85518Srobert   for (const std::string &AttributeMacro : Style.AttributeMacros) {
49*12c85518Srobert     auto Identifier = &IdentTable.get(AttributeMacro);
50*12c85518Srobert     Macros.insert({Identifier, TT_AttributeMacro});
51*12c85518Srobert   }
52*12c85518Srobert   for (const std::string &StatementMacro : Style.StatementMacros) {
53*12c85518Srobert     auto Identifier = &IdentTable.get(StatementMacro);
54*12c85518Srobert     Macros.insert({Identifier, TT_StatementMacro});
55*12c85518Srobert   }
56*12c85518Srobert   for (const std::string &TypenameMacro : Style.TypenameMacros) {
57*12c85518Srobert     auto Identifier = &IdentTable.get(TypenameMacro);
58*12c85518Srobert     Macros.insert({Identifier, TT_TypenameMacro});
59*12c85518Srobert   }
60*12c85518Srobert   for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61*12c85518Srobert     auto Identifier = &IdentTable.get(NamespaceMacro);
62*12c85518Srobert     Macros.insert({Identifier, TT_NamespaceMacro});
63*12c85518Srobert   }
64ec727ea7Spatrick   for (const std::string &WhitespaceSensitiveMacro :
65ec727ea7Spatrick        Style.WhitespaceSensitiveMacros) {
66*12c85518Srobert     auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67*12c85518Srobert     Macros.insert({Identifier, TT_UntouchableMacroFunc});
68ec727ea7Spatrick   }
69a9ac8606Spatrick   for (const std::string &StatementAttributeLikeMacro :
70*12c85518Srobert        Style.StatementAttributeLikeMacros) {
71*12c85518Srobert     auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72*12c85518Srobert     Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73*12c85518Srobert   }
74e5dd7070Spatrick }
75e5dd7070Spatrick 
lex()76e5dd7070Spatrick ArrayRef<FormatToken *> FormatTokenLexer::lex() {
77e5dd7070Spatrick   assert(Tokens.empty());
78e5dd7070Spatrick   assert(FirstInLineIndex == 0);
79e5dd7070Spatrick   do {
80e5dd7070Spatrick     Tokens.push_back(getNextToken());
81*12c85518Srobert     if (Style.isJavaScript()) {
82e5dd7070Spatrick       tryParseJSRegexLiteral();
83e5dd7070Spatrick       handleTemplateStrings();
84e5dd7070Spatrick     }
85e5dd7070Spatrick     if (Style.Language == FormatStyle::LK_TextProto)
86e5dd7070Spatrick       tryParsePythonComment();
87e5dd7070Spatrick     tryMergePreviousTokens();
88*12c85518Srobert     if (Style.isCSharp()) {
89ec727ea7Spatrick       // This needs to come after tokens have been merged so that C#
90ec727ea7Spatrick       // string literals are correctly identified.
91ec727ea7Spatrick       handleCSharpVerbatimAndInterpolatedStrings();
92*12c85518Srobert     }
93e5dd7070Spatrick     if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
94e5dd7070Spatrick       FirstInLineIndex = Tokens.size() - 1;
95*12c85518Srobert   } while (Tokens.back()->isNot(tok::eof));
96e5dd7070Spatrick   return Tokens;
97e5dd7070Spatrick }
98e5dd7070Spatrick 
tryMergePreviousTokens()99e5dd7070Spatrick void FormatTokenLexer::tryMergePreviousTokens() {
100e5dd7070Spatrick   if (tryMerge_TMacro())
101e5dd7070Spatrick     return;
102e5dd7070Spatrick   if (tryMergeConflictMarkers())
103e5dd7070Spatrick     return;
104e5dd7070Spatrick   if (tryMergeLessLess())
105e5dd7070Spatrick     return;
106ec727ea7Spatrick   if (tryMergeForEach())
107ec727ea7Spatrick     return;
108ec727ea7Spatrick   if (Style.isCpp() && tryTransformTryUsageForC())
109ec727ea7Spatrick     return;
110e5dd7070Spatrick 
111*12c85518Srobert   if (Style.isJavaScript() || Style.isCSharp()) {
112a9ac8606Spatrick     static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
113a9ac8606Spatrick                                                                tok::question};
114a9ac8606Spatrick     static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
115a9ac8606Spatrick                                                              tok::period};
116a9ac8606Spatrick     static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
117a9ac8606Spatrick 
118a9ac8606Spatrick     if (tryMergeTokens(FatArrow, TT_FatArrow))
119a9ac8606Spatrick       return;
120a9ac8606Spatrick     if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
121a9ac8606Spatrick       // Treat like the "||" operator (as opposed to the ternary ?).
122a9ac8606Spatrick       Tokens.back()->Tok.setKind(tok::pipepipe);
123a9ac8606Spatrick       return;
124a9ac8606Spatrick     }
125a9ac8606Spatrick     if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
126a9ac8606Spatrick       // Treat like a regular "." access.
127a9ac8606Spatrick       Tokens.back()->Tok.setKind(tok::period);
128a9ac8606Spatrick       return;
129a9ac8606Spatrick     }
130*12c85518Srobert     if (tryMergeNullishCoalescingEqual())
131a9ac8606Spatrick       return;
132a9ac8606Spatrick   }
133a9ac8606Spatrick 
134e5dd7070Spatrick   if (Style.isCSharp()) {
135a9ac8606Spatrick     static const tok::TokenKind CSharpNullConditionalLSquare[] = {
136a9ac8606Spatrick         tok::question, tok::l_square};
137a9ac8606Spatrick 
138e5dd7070Spatrick     if (tryMergeCSharpKeywordVariables())
139e5dd7070Spatrick       return;
140ec727ea7Spatrick     if (tryMergeCSharpStringLiteral())
141e5dd7070Spatrick       return;
142e5dd7070Spatrick     if (tryTransformCSharpForEach())
143e5dd7070Spatrick       return;
144a9ac8606Spatrick     if (tryMergeTokens(CSharpNullConditionalLSquare,
145a9ac8606Spatrick                        TT_CSharpNullConditionalLSquare)) {
146a9ac8606Spatrick       // Treat like a regular "[" operator.
147a9ac8606Spatrick       Tokens.back()->Tok.setKind(tok::l_square);
148e5dd7070Spatrick       return;
149e5dd7070Spatrick     }
150a9ac8606Spatrick   }
151e5dd7070Spatrick 
152e5dd7070Spatrick   if (tryMergeNSStringLiteral())
153e5dd7070Spatrick     return;
154e5dd7070Spatrick 
155*12c85518Srobert   if (Style.isJavaScript()) {
156e5dd7070Spatrick     static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
157e5dd7070Spatrick     static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
158e5dd7070Spatrick                                                    tok::equal};
159e5dd7070Spatrick     static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
160e5dd7070Spatrick                                                   tok::greaterequal};
161e5dd7070Spatrick     static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
162e5dd7070Spatrick     static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
163e5dd7070Spatrick                                                            tok::starequal};
164a9ac8606Spatrick     static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
165a9ac8606Spatrick     static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
166e5dd7070Spatrick 
167e5dd7070Spatrick     // FIXME: Investigate what token type gives the correct operator priority.
168e5dd7070Spatrick     if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
169e5dd7070Spatrick       return;
170e5dd7070Spatrick     if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
171e5dd7070Spatrick       return;
172e5dd7070Spatrick     if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
173e5dd7070Spatrick       return;
174e5dd7070Spatrick     if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
175e5dd7070Spatrick       return;
176e5dd7070Spatrick     if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
177e5dd7070Spatrick       Tokens.back()->Tok.setKind(tok::starequal);
178e5dd7070Spatrick       return;
179e5dd7070Spatrick     }
180a9ac8606Spatrick     if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
181a9ac8606Spatrick         tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
182a9ac8606Spatrick       // Treat like the "=" assignment operator.
183a9ac8606Spatrick       Tokens.back()->Tok.setKind(tok::equal);
184e5dd7070Spatrick       return;
185e5dd7070Spatrick     }
186e5dd7070Spatrick     if (tryMergeJSPrivateIdentifier())
187e5dd7070Spatrick       return;
188e5dd7070Spatrick   }
189e5dd7070Spatrick 
190e5dd7070Spatrick   if (Style.Language == FormatStyle::LK_Java) {
191e5dd7070Spatrick     static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
192e5dd7070Spatrick         tok::greater, tok::greater, tok::greaterequal};
193e5dd7070Spatrick     if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
194e5dd7070Spatrick       return;
195e5dd7070Spatrick   }
196*12c85518Srobert 
197*12c85518Srobert   if (Style.isVerilog()) {
198*12c85518Srobert     // Merge the number following a base like `'h?a0`.
199*12c85518Srobert     if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
200*12c85518Srobert         Tokens.end()[-2]->is(tok::numeric_constant) &&
201*12c85518Srobert         Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
202*12c85518Srobert                                tok::question) &&
203*12c85518Srobert         tryMergeTokens(2, TT_Unknown)) {
204*12c85518Srobert       return;
205*12c85518Srobert     }
206*12c85518Srobert     // Part select.
207*12c85518Srobert     if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
208*12c85518Srobert                           TT_BitFieldColon)) {
209*12c85518Srobert       return;
210*12c85518Srobert     }
211*12c85518Srobert     // Xnor. The combined token is treated as a caret which can also be either a
212*12c85518Srobert     // unary or binary operator. The actual type is determined in
213*12c85518Srobert     // TokenAnnotator. We also check the token length so we know it is not
214*12c85518Srobert     // already a merged token.
215*12c85518Srobert     if (Tokens.back()->TokenText.size() == 1 &&
216*12c85518Srobert         tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
217*12c85518Srobert                           TT_BinaryOperator)) {
218*12c85518Srobert       Tokens.back()->Tok.setKind(tok::caret);
219*12c85518Srobert       return;
220*12c85518Srobert     }
221*12c85518Srobert     // Signed shift and distribution weight.
222*12c85518Srobert     if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
223*12c85518Srobert       Tokens.back()->Tok.setKind(tok::lessless);
224*12c85518Srobert       return;
225*12c85518Srobert     }
226*12c85518Srobert     if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
227*12c85518Srobert       Tokens.back()->Tok.setKind(tok::greatergreater);
228*12c85518Srobert       return;
229*12c85518Srobert     }
230*12c85518Srobert     if (tryMergeTokensAny({{tok::lessless, tok::equal},
231*12c85518Srobert                            {tok::lessless, tok::lessequal},
232*12c85518Srobert                            {tok::greatergreater, tok::equal},
233*12c85518Srobert                            {tok::greatergreater, tok::greaterequal},
234*12c85518Srobert                            {tok::colon, tok::equal},
235*12c85518Srobert                            {tok::colon, tok::slash}},
236*12c85518Srobert                           TT_BinaryOperator)) {
237*12c85518Srobert       Tokens.back()->ForcedPrecedence = prec::Assignment;
238*12c85518Srobert       return;
239*12c85518Srobert     }
240*12c85518Srobert     // Exponentiation, signed shift, case equality, and wildcard equality.
241*12c85518Srobert     if (tryMergeTokensAny({{tok::star, tok::star},
242*12c85518Srobert                            {tok::lessless, tok::less},
243*12c85518Srobert                            {tok::greatergreater, tok::greater},
244*12c85518Srobert                            {tok::exclaimequal, tok::equal},
245*12c85518Srobert                            {tok::exclaimequal, tok::question},
246*12c85518Srobert                            {tok::equalequal, tok::equal},
247*12c85518Srobert                            {tok::equalequal, tok::question}},
248*12c85518Srobert                           TT_BinaryOperator)) {
249*12c85518Srobert       return;
250*12c85518Srobert     }
251*12c85518Srobert     // Module paths in specify blocks and implications in properties.
252*12c85518Srobert     if (tryMergeTokensAny({{tok::plusequal, tok::greater},
253*12c85518Srobert                            {tok::plus, tok::star, tok::greater},
254*12c85518Srobert                            {tok::minusequal, tok::greater},
255*12c85518Srobert                            {tok::minus, tok::star, tok::greater},
256*12c85518Srobert                            {tok::less, tok::arrow},
257*12c85518Srobert                            {tok::equal, tok::greater},
258*12c85518Srobert                            {tok::star, tok::greater},
259*12c85518Srobert                            {tok::pipeequal, tok::greater},
260*12c85518Srobert                            {tok::pipe, tok::arrow},
261*12c85518Srobert                            {tok::hash, tok::minus, tok::hash},
262*12c85518Srobert                            {tok::hash, tok::equal, tok::hash}},
263*12c85518Srobert                           TT_BinaryOperator)) {
264*12c85518Srobert       Tokens.back()->ForcedPrecedence = prec::Comma;
265*12c85518Srobert       return;
266*12c85518Srobert     }
267*12c85518Srobert   }
268e5dd7070Spatrick }
269e5dd7070Spatrick 
tryMergeNSStringLiteral()270e5dd7070Spatrick bool FormatTokenLexer::tryMergeNSStringLiteral() {
271e5dd7070Spatrick   if (Tokens.size() < 2)
272e5dd7070Spatrick     return false;
273e5dd7070Spatrick   auto &At = *(Tokens.end() - 2);
274e5dd7070Spatrick   auto &String = *(Tokens.end() - 1);
275e5dd7070Spatrick   if (!At->is(tok::at) || !String->is(tok::string_literal))
276e5dd7070Spatrick     return false;
277e5dd7070Spatrick   At->Tok.setKind(tok::string_literal);
278e5dd7070Spatrick   At->TokenText = StringRef(At->TokenText.begin(),
279e5dd7070Spatrick                             String->TokenText.end() - At->TokenText.begin());
280e5dd7070Spatrick   At->ColumnWidth += String->ColumnWidth;
281ec727ea7Spatrick   At->setType(TT_ObjCStringLiteral);
282e5dd7070Spatrick   Tokens.erase(Tokens.end() - 1);
283e5dd7070Spatrick   return true;
284e5dd7070Spatrick }
285e5dd7070Spatrick 
tryMergeJSPrivateIdentifier()286e5dd7070Spatrick bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
287e5dd7070Spatrick   // Merges #idenfier into a single identifier with the text #identifier
288e5dd7070Spatrick   // but the token tok::identifier.
289e5dd7070Spatrick   if (Tokens.size() < 2)
290e5dd7070Spatrick     return false;
291e5dd7070Spatrick   auto &Hash = *(Tokens.end() - 2);
292e5dd7070Spatrick   auto &Identifier = *(Tokens.end() - 1);
293e5dd7070Spatrick   if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
294e5dd7070Spatrick     return false;
295e5dd7070Spatrick   Hash->Tok.setKind(tok::identifier);
296e5dd7070Spatrick   Hash->TokenText =
297e5dd7070Spatrick       StringRef(Hash->TokenText.begin(),
298e5dd7070Spatrick                 Identifier->TokenText.end() - Hash->TokenText.begin());
299e5dd7070Spatrick   Hash->ColumnWidth += Identifier->ColumnWidth;
300ec727ea7Spatrick   Hash->setType(TT_JsPrivateIdentifier);
301e5dd7070Spatrick   Tokens.erase(Tokens.end() - 1);
302e5dd7070Spatrick   return true;
303e5dd7070Spatrick }
304e5dd7070Spatrick 
305e5dd7070Spatrick // Search for verbatim or interpolated string literals @"ABC" or
306e5dd7070Spatrick // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
307e5dd7070Spatrick // prevent splitting of @, $ and ".
308ec727ea7Spatrick // Merging of multiline verbatim strings with embedded '"' is handled in
309ec727ea7Spatrick // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
tryMergeCSharpStringLiteral()310ec727ea7Spatrick bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
311e5dd7070Spatrick   if (Tokens.size() < 2)
312e5dd7070Spatrick     return false;
313e5dd7070Spatrick 
314ec727ea7Spatrick   // Look for @"aaaaaa" or $"aaaaaa".
315*12c85518Srobert   const auto String = *(Tokens.end() - 1);
316*12c85518Srobert   if (String->isNot(tok::string_literal))
317e5dd7070Spatrick     return false;
318e5dd7070Spatrick 
319*12c85518Srobert   auto Prefix = *(Tokens.end() - 2);
320*12c85518Srobert   if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
321ec727ea7Spatrick     return false;
322ec727ea7Spatrick 
323*12c85518Srobert   if (Tokens.size() > 2) {
324*12c85518Srobert     const auto Tok = *(Tokens.end() - 3);
325*12c85518Srobert     if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
326*12c85518Srobert         (Tok->is(tok::at) && Prefix->TokenText == "$")) {
327*12c85518Srobert       // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
328*12c85518Srobert       Tok->ColumnWidth += Prefix->ColumnWidth;
329e5dd7070Spatrick       Tokens.erase(Tokens.end() - 2);
330*12c85518Srobert       Prefix = Tok;
331e5dd7070Spatrick     }
332e5dd7070Spatrick   }
333e5dd7070Spatrick 
334e5dd7070Spatrick   // Convert back into just a string_literal.
335*12c85518Srobert   Prefix->Tok.setKind(tok::string_literal);
336*12c85518Srobert   Prefix->TokenText =
337*12c85518Srobert       StringRef(Prefix->TokenText.begin(),
338*12c85518Srobert                 String->TokenText.end() - Prefix->TokenText.begin());
339*12c85518Srobert   Prefix->ColumnWidth += String->ColumnWidth;
340*12c85518Srobert   Prefix->setType(TT_CSharpStringLiteral);
341e5dd7070Spatrick   Tokens.erase(Tokens.end() - 1);
342e5dd7070Spatrick   return true;
343e5dd7070Spatrick }
344e5dd7070Spatrick 
345ec727ea7Spatrick // Valid C# attribute targets:
346ec727ea7Spatrick // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
347ec727ea7Spatrick const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
348ec727ea7Spatrick     "assembly", "module",   "field",  "event", "method",
349ec727ea7Spatrick     "param",    "property", "return", "type",
350ec727ea7Spatrick };
351ec727ea7Spatrick 
tryMergeNullishCoalescingEqual()352a9ac8606Spatrick bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
353e5dd7070Spatrick   if (Tokens.size() < 2)
354e5dd7070Spatrick     return false;
355a9ac8606Spatrick   auto &NullishCoalescing = *(Tokens.end() - 2);
356a9ac8606Spatrick   auto &Equal = *(Tokens.end() - 1);
357a9ac8606Spatrick   if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
358*12c85518Srobert       !Equal->is(tok::equal)) {
359e5dd7070Spatrick     return false;
360*12c85518Srobert   }
361a9ac8606Spatrick   NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
362a9ac8606Spatrick   NullishCoalescing->TokenText =
363a9ac8606Spatrick       StringRef(NullishCoalescing->TokenText.begin(),
364a9ac8606Spatrick                 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
365a9ac8606Spatrick   NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
366a9ac8606Spatrick   NullishCoalescing->setType(TT_NullCoalescingEqual);
367e5dd7070Spatrick   Tokens.erase(Tokens.end() - 1);
368e5dd7070Spatrick   return true;
369e5dd7070Spatrick }
370e5dd7070Spatrick 
tryMergeCSharpKeywordVariables()371e5dd7070Spatrick bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
372e5dd7070Spatrick   if (Tokens.size() < 2)
373e5dd7070Spatrick     return false;
374*12c85518Srobert   const auto At = *(Tokens.end() - 2);
375*12c85518Srobert   if (At->isNot(tok::at))
376*12c85518Srobert     return false;
377*12c85518Srobert   const auto Keyword = *(Tokens.end() - 1);
378*12c85518Srobert   if (Keyword->TokenText == "$")
379e5dd7070Spatrick     return false;
380e5dd7070Spatrick   if (!Keywords.isCSharpKeyword(*Keyword))
381e5dd7070Spatrick     return false;
382e5dd7070Spatrick 
383e5dd7070Spatrick   At->Tok.setKind(tok::identifier);
384e5dd7070Spatrick   At->TokenText = StringRef(At->TokenText.begin(),
385e5dd7070Spatrick                             Keyword->TokenText.end() - At->TokenText.begin());
386e5dd7070Spatrick   At->ColumnWidth += Keyword->ColumnWidth;
387ec727ea7Spatrick   At->setType(Keyword->getType());
388e5dd7070Spatrick   Tokens.erase(Tokens.end() - 1);
389e5dd7070Spatrick   return true;
390e5dd7070Spatrick }
391e5dd7070Spatrick 
392e5dd7070Spatrick // In C# transform identifier foreach into kw_foreach
tryTransformCSharpForEach()393e5dd7070Spatrick bool FormatTokenLexer::tryTransformCSharpForEach() {
394e5dd7070Spatrick   if (Tokens.size() < 1)
395e5dd7070Spatrick     return false;
396e5dd7070Spatrick   auto &Identifier = *(Tokens.end() - 1);
397e5dd7070Spatrick   if (!Identifier->is(tok::identifier))
398e5dd7070Spatrick     return false;
399e5dd7070Spatrick   if (Identifier->TokenText != "foreach")
400e5dd7070Spatrick     return false;
401e5dd7070Spatrick 
402ec727ea7Spatrick   Identifier->setType(TT_ForEachMacro);
403e5dd7070Spatrick   Identifier->Tok.setKind(tok::kw_for);
404e5dd7070Spatrick   return true;
405e5dd7070Spatrick }
406e5dd7070Spatrick 
tryMergeForEach()407ec727ea7Spatrick bool FormatTokenLexer::tryMergeForEach() {
408ec727ea7Spatrick   if (Tokens.size() < 2)
409ec727ea7Spatrick     return false;
410ec727ea7Spatrick   auto &For = *(Tokens.end() - 2);
411ec727ea7Spatrick   auto &Each = *(Tokens.end() - 1);
412ec727ea7Spatrick   if (!For->is(tok::kw_for))
413ec727ea7Spatrick     return false;
414ec727ea7Spatrick   if (!Each->is(tok::identifier))
415ec727ea7Spatrick     return false;
416ec727ea7Spatrick   if (Each->TokenText != "each")
417ec727ea7Spatrick     return false;
418ec727ea7Spatrick 
419ec727ea7Spatrick   For->setType(TT_ForEachMacro);
420ec727ea7Spatrick   For->Tok.setKind(tok::kw_for);
421ec727ea7Spatrick 
422ec727ea7Spatrick   For->TokenText = StringRef(For->TokenText.begin(),
423ec727ea7Spatrick                              Each->TokenText.end() - For->TokenText.begin());
424ec727ea7Spatrick   For->ColumnWidth += Each->ColumnWidth;
425ec727ea7Spatrick   Tokens.erase(Tokens.end() - 1);
426ec727ea7Spatrick   return true;
427ec727ea7Spatrick }
428ec727ea7Spatrick 
tryTransformTryUsageForC()429ec727ea7Spatrick bool FormatTokenLexer::tryTransformTryUsageForC() {
430ec727ea7Spatrick   if (Tokens.size() < 2)
431ec727ea7Spatrick     return false;
432ec727ea7Spatrick   auto &Try = *(Tokens.end() - 2);
433ec727ea7Spatrick   if (!Try->is(tok::kw_try))
434ec727ea7Spatrick     return false;
435ec727ea7Spatrick   auto &Next = *(Tokens.end() - 1);
436a9ac8606Spatrick   if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
437ec727ea7Spatrick     return false;
438ec727ea7Spatrick 
439ec727ea7Spatrick   if (Tokens.size() > 2) {
440ec727ea7Spatrick     auto &At = *(Tokens.end() - 3);
441ec727ea7Spatrick     if (At->is(tok::at))
442ec727ea7Spatrick       return false;
443ec727ea7Spatrick   }
444ec727ea7Spatrick 
445ec727ea7Spatrick   Try->Tok.setKind(tok::identifier);
446ec727ea7Spatrick   return true;
447ec727ea7Spatrick }
448ec727ea7Spatrick 
tryMergeLessLess()449e5dd7070Spatrick bool FormatTokenLexer::tryMergeLessLess() {
450e5dd7070Spatrick   // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
451e5dd7070Spatrick   if (Tokens.size() < 3)
452e5dd7070Spatrick     return false;
453e5dd7070Spatrick 
454e5dd7070Spatrick   auto First = Tokens.end() - 3;
455*12c85518Srobert   if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
456e5dd7070Spatrick     return false;
457e5dd7070Spatrick 
458e5dd7070Spatrick   // Only merge if there currently is no whitespace between the two "<".
459*12c85518Srobert   if (First[1]->hasWhitespaceBefore())
460*12c85518Srobert     return false;
461*12c85518Srobert 
462*12c85518Srobert   auto X = Tokens.size() > 3 ? First[-1] : nullptr;
463*12c85518Srobert   auto Y = First[2];
464*12c85518Srobert   if ((X && X->is(tok::less)) || Y->is(tok::less))
465*12c85518Srobert     return false;
466*12c85518Srobert 
467*12c85518Srobert   // Do not remove a whitespace between the two "<" e.g. "operator< <>".
468*12c85518Srobert   if (X && X->is(tok::kw_operator) && Y->is(tok::greater))
469e5dd7070Spatrick     return false;
470e5dd7070Spatrick 
471e5dd7070Spatrick   First[0]->Tok.setKind(tok::lessless);
472e5dd7070Spatrick   First[0]->TokenText = "<<";
473e5dd7070Spatrick   First[0]->ColumnWidth += 1;
474e5dd7070Spatrick   Tokens.erase(Tokens.end() - 2);
475e5dd7070Spatrick   return true;
476e5dd7070Spatrick }
477e5dd7070Spatrick 
tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,TokenType NewType)478e5dd7070Spatrick bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
479e5dd7070Spatrick                                       TokenType NewType) {
480e5dd7070Spatrick   if (Tokens.size() < Kinds.size())
481e5dd7070Spatrick     return false;
482e5dd7070Spatrick 
483e5dd7070Spatrick   SmallVectorImpl<FormatToken *>::const_iterator First =
484e5dd7070Spatrick       Tokens.end() - Kinds.size();
485*12c85518Srobert   for (unsigned i = 0; i < Kinds.size(); ++i)
486*12c85518Srobert     if (!First[i]->is(Kinds[i]))
487e5dd7070Spatrick       return false;
488*12c85518Srobert 
489*12c85518Srobert   return tryMergeTokens(Kinds.size(), NewType);
490*12c85518Srobert }
491*12c85518Srobert 
tryMergeTokens(size_t Count,TokenType NewType)492*12c85518Srobert bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
493*12c85518Srobert   if (Tokens.size() < Count)
494*12c85518Srobert     return false;
495*12c85518Srobert 
496*12c85518Srobert   SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
497e5dd7070Spatrick   unsigned AddLength = 0;
498*12c85518Srobert   for (size_t i = 1; i < Count; ++i) {
499*12c85518Srobert     // If there is whitespace separating the token and the previous one,
500*12c85518Srobert     // they should not be merged.
501*12c85518Srobert     if (First[i]->hasWhitespaceBefore())
502e5dd7070Spatrick       return false;
503e5dd7070Spatrick     AddLength += First[i]->TokenText.size();
504e5dd7070Spatrick   }
505*12c85518Srobert 
506*12c85518Srobert   Tokens.resize(Tokens.size() - Count + 1);
507e5dd7070Spatrick   First[0]->TokenText = StringRef(First[0]->TokenText.data(),
508e5dd7070Spatrick                                   First[0]->TokenText.size() + AddLength);
509e5dd7070Spatrick   First[0]->ColumnWidth += AddLength;
510ec727ea7Spatrick   First[0]->setType(NewType);
511e5dd7070Spatrick   return true;
512e5dd7070Spatrick }
513e5dd7070Spatrick 
tryMergeTokensAny(ArrayRef<ArrayRef<tok::TokenKind>> Kinds,TokenType NewType)514*12c85518Srobert bool FormatTokenLexer::tryMergeTokensAny(
515*12c85518Srobert     ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
516*12c85518Srobert   return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
517*12c85518Srobert     return tryMergeTokens(Kinds, NewType);
518*12c85518Srobert   });
519*12c85518Srobert }
520*12c85518Srobert 
521e5dd7070Spatrick // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
precedesOperand(FormatToken * Tok)522e5dd7070Spatrick bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
523e5dd7070Spatrick   // NB: This is not entirely correct, as an r_paren can introduce an operand
524e5dd7070Spatrick   // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
525e5dd7070Spatrick   // corner case to not matter in practice, though.
526e5dd7070Spatrick   return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
527e5dd7070Spatrick                       tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
528e5dd7070Spatrick                       tok::colon, tok::question, tok::tilde) ||
529e5dd7070Spatrick          Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
530e5dd7070Spatrick                       tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
531e5dd7070Spatrick                       tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
532e5dd7070Spatrick          Tok->isBinaryOperator();
533e5dd7070Spatrick }
534e5dd7070Spatrick 
canPrecedeRegexLiteral(FormatToken * Prev)535e5dd7070Spatrick bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
536e5dd7070Spatrick   if (!Prev)
537e5dd7070Spatrick     return true;
538e5dd7070Spatrick 
539e5dd7070Spatrick   // Regex literals can only follow after prefix unary operators, not after
540e5dd7070Spatrick   // postfix unary operators. If the '++' is followed by a non-operand
541e5dd7070Spatrick   // introducing token, the slash here is the operand and not the start of a
542e5dd7070Spatrick   // regex.
543e5dd7070Spatrick   // `!` is an unary prefix operator, but also a post-fix operator that casts
544e5dd7070Spatrick   // away nullability, so the same check applies.
545e5dd7070Spatrick   if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
546*12c85518Srobert     return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
547e5dd7070Spatrick 
548e5dd7070Spatrick   // The previous token must introduce an operand location where regex
549e5dd7070Spatrick   // literals can occur.
550e5dd7070Spatrick   if (!precedesOperand(Prev))
551e5dd7070Spatrick     return false;
552e5dd7070Spatrick 
553e5dd7070Spatrick   return true;
554e5dd7070Spatrick }
555e5dd7070Spatrick 
556e5dd7070Spatrick // Tries to parse a JavaScript Regex literal starting at the current token,
557e5dd7070Spatrick // if that begins with a slash and is in a location where JavaScript allows
558e5dd7070Spatrick // regex literals. Changes the current token to a regex literal and updates
559e5dd7070Spatrick // its text if successful.
tryParseJSRegexLiteral()560e5dd7070Spatrick void FormatTokenLexer::tryParseJSRegexLiteral() {
561e5dd7070Spatrick   FormatToken *RegexToken = Tokens.back();
562e5dd7070Spatrick   if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
563e5dd7070Spatrick     return;
564e5dd7070Spatrick 
565e5dd7070Spatrick   FormatToken *Prev = nullptr;
566*12c85518Srobert   for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
567e5dd7070Spatrick     // NB: Because previous pointers are not initialized yet, this cannot use
568e5dd7070Spatrick     // Token.getPreviousNonComment.
569*12c85518Srobert     if (FT->isNot(tok::comment)) {
570*12c85518Srobert       Prev = FT;
571e5dd7070Spatrick       break;
572e5dd7070Spatrick     }
573e5dd7070Spatrick   }
574e5dd7070Spatrick 
575e5dd7070Spatrick   if (!canPrecedeRegexLiteral(Prev))
576e5dd7070Spatrick     return;
577e5dd7070Spatrick 
578e5dd7070Spatrick   // 'Manually' lex ahead in the current file buffer.
579e5dd7070Spatrick   const char *Offset = Lex->getBufferLocation();
580e5dd7070Spatrick   const char *RegexBegin = Offset - RegexToken->TokenText.size();
581e5dd7070Spatrick   StringRef Buffer = Lex->getBuffer();
582e5dd7070Spatrick   bool InCharacterClass = false;
583e5dd7070Spatrick   bool HaveClosingSlash = false;
584e5dd7070Spatrick   for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
585e5dd7070Spatrick     // Regular expressions are terminated with a '/', which can only be
586e5dd7070Spatrick     // escaped using '\' or a character class between '[' and ']'.
587e5dd7070Spatrick     // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
588e5dd7070Spatrick     switch (*Offset) {
589e5dd7070Spatrick     case '\\':
590e5dd7070Spatrick       // Skip the escaped character.
591e5dd7070Spatrick       ++Offset;
592e5dd7070Spatrick       break;
593e5dd7070Spatrick     case '[':
594e5dd7070Spatrick       InCharacterClass = true;
595e5dd7070Spatrick       break;
596e5dd7070Spatrick     case ']':
597e5dd7070Spatrick       InCharacterClass = false;
598e5dd7070Spatrick       break;
599e5dd7070Spatrick     case '/':
600e5dd7070Spatrick       if (!InCharacterClass)
601e5dd7070Spatrick         HaveClosingSlash = true;
602e5dd7070Spatrick       break;
603e5dd7070Spatrick     }
604e5dd7070Spatrick   }
605e5dd7070Spatrick 
606ec727ea7Spatrick   RegexToken->setType(TT_RegexLiteral);
607e5dd7070Spatrick   // Treat regex literals like other string_literals.
608e5dd7070Spatrick   RegexToken->Tok.setKind(tok::string_literal);
609e5dd7070Spatrick   RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
610e5dd7070Spatrick   RegexToken->ColumnWidth = RegexToken->TokenText.size();
611e5dd7070Spatrick 
612e5dd7070Spatrick   resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
613e5dd7070Spatrick }
614e5dd7070Spatrick 
// Scans the body of a C# string literal in [Begin, End) and returns a pointer
// to its terminating '"', or End if no terminator is found.
//
// \p Verbatim:     backslashes are not escapes, and "" is an escaped quote.
// \p Interpolated: {{ and }} are escaped braces, and quotes nested inside an
//                  open { ... } do not end the literal, so that e.g.
//                  $"{x ?? "null"}" is kept as one string literal instead of
//                  being split into $"{x ?? ", null, "}".
//
// No attempt is made to format the code inside a verbatim string or inside
// the {} of an interpolated string; doing so would need work similar to what
// `handleTemplateStrings()` does for JavaScript template strings.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True if the character at Pos is immediately followed by an identical one.
  const auto IsDoubled = [End](const char *Pos) {
    return Pos + 1 < End && Pos[1] == Pos[0];
  };

  int OpenBraces = 0;
  const char *Cur = Begin;
  while (Cur < End) {
    const char Ch = *Cur;
    if (Ch == '\\') {
      // In a non-verbatim string the backslash escapes the next character.
      if (!Verbatim)
        ++Cur;
    } else if (Interpolated && Ch == '{') {
      // {{ is an escaped brace; a single { opens an interpolation hole.
      if (IsDoubled(Cur))
        ++Cur;
      else
        ++OpenBraces;
    } else if (Interpolated && Ch == '}') {
      // }} is an escaped brace; a single } closes a hole. A } without a
      // matching { makes the scan give up.
      if (IsDoubled(Cur))
        ++Cur;
      else if (OpenBraces > 0)
        --OpenBraces;
      else
        return End;
    } else if (Ch == '"' && OpenBraces == 0) {
      // Outside of any hole a quote ends the literal, unless it is the first
      // half of a "" escape in a verbatim string.
      if (!(Verbatim && IsDoubled(Cur)))
        return Cur;
      ++Cur;
    }
    ++Cur;
  }

  return End;
}
673*12c85518Srobert 
handleCSharpVerbatimAndInterpolatedStrings()674*12c85518Srobert void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
675*12c85518Srobert   FormatToken *CSharpStringLiteral = Tokens.back();
676*12c85518Srobert 
677*12c85518Srobert   if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
678*12c85518Srobert     return;
679*12c85518Srobert 
680*12c85518Srobert   auto &TokenText = CSharpStringLiteral->TokenText;
681*12c85518Srobert 
682*12c85518Srobert   bool Verbatim = false;
683*12c85518Srobert   bool Interpolated = false;
684*12c85518Srobert   if (TokenText.startswith(R"($@")") || TokenText.startswith(R"(@$")")) {
685*12c85518Srobert     Verbatim = true;
686*12c85518Srobert     Interpolated = true;
687*12c85518Srobert   } else if (TokenText.startswith(R"(@")")) {
688*12c85518Srobert     Verbatim = true;
689*12c85518Srobert   } else if (TokenText.startswith(R"($")")) {
690*12c85518Srobert     Interpolated = true;
691*12c85518Srobert   }
692*12c85518Srobert 
693*12c85518Srobert   // Deal with multiline strings.
694*12c85518Srobert   if (!Verbatim && !Interpolated)
695*12c85518Srobert     return;
696*12c85518Srobert 
697*12c85518Srobert   const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
698*12c85518Srobert   const char *Offset = StrBegin;
699*12c85518Srobert   if (Verbatim && Interpolated)
700*12c85518Srobert     Offset += 3;
701*12c85518Srobert   else
702*12c85518Srobert     Offset += 2;
703*12c85518Srobert 
704*12c85518Srobert   const auto End = Lex->getBuffer().end();
705*12c85518Srobert   Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
706ec727ea7Spatrick 
707ec727ea7Spatrick   // Make no attempt to format code properly if a verbatim string is
708ec727ea7Spatrick   // unterminated.
709*12c85518Srobert   if (Offset >= End)
710ec727ea7Spatrick     return;
711ec727ea7Spatrick 
712ec727ea7Spatrick   StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
713*12c85518Srobert   TokenText = LiteralText;
714ec727ea7Spatrick 
715ec727ea7Spatrick   // Adjust width for potentially multiline string literals.
716ec727ea7Spatrick   size_t FirstBreak = LiteralText.find('\n');
717ec727ea7Spatrick   StringRef FirstLineText = FirstBreak == StringRef::npos
718ec727ea7Spatrick                                 ? LiteralText
719ec727ea7Spatrick                                 : LiteralText.substr(0, FirstBreak);
720ec727ea7Spatrick   CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
721ec727ea7Spatrick       FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
722ec727ea7Spatrick       Encoding);
723ec727ea7Spatrick   size_t LastBreak = LiteralText.rfind('\n');
724ec727ea7Spatrick   if (LastBreak != StringRef::npos) {
725ec727ea7Spatrick     CSharpStringLiteral->IsMultiline = true;
726ec727ea7Spatrick     unsigned StartColumn = 0;
727*12c85518Srobert     CSharpStringLiteral->LastLineColumnWidth =
728*12c85518Srobert         encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
729*12c85518Srobert                                       StartColumn, Style.TabWidth, Encoding);
730ec727ea7Spatrick   }
731ec727ea7Spatrick 
732*12c85518Srobert   assert(Offset < End);
733*12c85518Srobert   resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
734ec727ea7Spatrick }
735ec727ea7Spatrick 
handleTemplateStrings()736e5dd7070Spatrick void FormatTokenLexer::handleTemplateStrings() {
737e5dd7070Spatrick   FormatToken *BacktickToken = Tokens.back();
738e5dd7070Spatrick 
739e5dd7070Spatrick   if (BacktickToken->is(tok::l_brace)) {
740e5dd7070Spatrick     StateStack.push(LexerState::NORMAL);
741e5dd7070Spatrick     return;
742e5dd7070Spatrick   }
743e5dd7070Spatrick   if (BacktickToken->is(tok::r_brace)) {
744e5dd7070Spatrick     if (StateStack.size() == 1)
745e5dd7070Spatrick       return;
746e5dd7070Spatrick     StateStack.pop();
747e5dd7070Spatrick     if (StateStack.top() != LexerState::TEMPLATE_STRING)
748e5dd7070Spatrick       return;
749e5dd7070Spatrick     // If back in TEMPLATE_STRING, fallthrough and continue parsing the
750e5dd7070Spatrick   } else if (BacktickToken->is(tok::unknown) &&
751e5dd7070Spatrick              BacktickToken->TokenText == "`") {
752e5dd7070Spatrick     StateStack.push(LexerState::TEMPLATE_STRING);
753e5dd7070Spatrick   } else {
754e5dd7070Spatrick     return; // Not actually a template
755e5dd7070Spatrick   }
756e5dd7070Spatrick 
757e5dd7070Spatrick   // 'Manually' lex ahead in the current file buffer.
758e5dd7070Spatrick   const char *Offset = Lex->getBufferLocation();
759e5dd7070Spatrick   const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
760e5dd7070Spatrick   for (; Offset != Lex->getBuffer().end(); ++Offset) {
761e5dd7070Spatrick     if (Offset[0] == '`') {
762e5dd7070Spatrick       StateStack.pop();
763*12c85518Srobert       ++Offset;
764e5dd7070Spatrick       break;
765e5dd7070Spatrick     }
766e5dd7070Spatrick     if (Offset[0] == '\\') {
767e5dd7070Spatrick       ++Offset; // Skip the escaped character.
768e5dd7070Spatrick     } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
769e5dd7070Spatrick                Offset[1] == '{') {
770e5dd7070Spatrick       // '${' introduces an expression interpolation in the template string.
771e5dd7070Spatrick       StateStack.push(LexerState::NORMAL);
772*12c85518Srobert       Offset += 2;
773e5dd7070Spatrick       break;
774e5dd7070Spatrick     }
775e5dd7070Spatrick   }
776e5dd7070Spatrick 
777*12c85518Srobert   StringRef LiteralText(TmplBegin, Offset - TmplBegin);
778ec727ea7Spatrick   BacktickToken->setType(TT_TemplateString);
779e5dd7070Spatrick   BacktickToken->Tok.setKind(tok::string_literal);
780e5dd7070Spatrick   BacktickToken->TokenText = LiteralText;
781e5dd7070Spatrick 
782e5dd7070Spatrick   // Adjust width for potentially multiline string literals.
783e5dd7070Spatrick   size_t FirstBreak = LiteralText.find('\n');
784e5dd7070Spatrick   StringRef FirstLineText = FirstBreak == StringRef::npos
785e5dd7070Spatrick                                 ? LiteralText
786e5dd7070Spatrick                                 : LiteralText.substr(0, FirstBreak);
787e5dd7070Spatrick   BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
788e5dd7070Spatrick       FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
789e5dd7070Spatrick   size_t LastBreak = LiteralText.rfind('\n');
790e5dd7070Spatrick   if (LastBreak != StringRef::npos) {
791e5dd7070Spatrick     BacktickToken->IsMultiline = true;
792e5dd7070Spatrick     unsigned StartColumn = 0; // The template tail spans the entire line.
793*12c85518Srobert     BacktickToken->LastLineColumnWidth =
794*12c85518Srobert         encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
795*12c85518Srobert                                       StartColumn, Style.TabWidth, Encoding);
796e5dd7070Spatrick   }
797e5dd7070Spatrick 
798*12c85518Srobert   SourceLocation loc = Lex->getSourceLocation(Offset);
799e5dd7070Spatrick   resetLexer(SourceMgr.getFileOffset(loc));
800e5dd7070Spatrick }
801e5dd7070Spatrick 
tryParsePythonComment()802e5dd7070Spatrick void FormatTokenLexer::tryParsePythonComment() {
803e5dd7070Spatrick   FormatToken *HashToken = Tokens.back();
804e5dd7070Spatrick   if (!HashToken->isOneOf(tok::hash, tok::hashhash))
805e5dd7070Spatrick     return;
806e5dd7070Spatrick   // Turn the remainder of this line into a comment.
807e5dd7070Spatrick   const char *CommentBegin =
808e5dd7070Spatrick       Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
809e5dd7070Spatrick   size_t From = CommentBegin - Lex->getBuffer().begin();
810e5dd7070Spatrick   size_t To = Lex->getBuffer().find_first_of('\n', From);
811e5dd7070Spatrick   if (To == StringRef::npos)
812e5dd7070Spatrick     To = Lex->getBuffer().size();
813e5dd7070Spatrick   size_t Len = To - From;
814ec727ea7Spatrick   HashToken->setType(TT_LineComment);
815e5dd7070Spatrick   HashToken->Tok.setKind(tok::comment);
816e5dd7070Spatrick   HashToken->TokenText = Lex->getBuffer().substr(From, Len);
817e5dd7070Spatrick   SourceLocation Loc = To < Lex->getBuffer().size()
818e5dd7070Spatrick                            ? Lex->getSourceLocation(CommentBegin + Len)
819e5dd7070Spatrick                            : SourceMgr.getLocForEndOfFile(ID);
820e5dd7070Spatrick   resetLexer(SourceMgr.getFileOffset(Loc));
821e5dd7070Spatrick }
822e5dd7070Spatrick 
tryMerge_TMacro()823e5dd7070Spatrick bool FormatTokenLexer::tryMerge_TMacro() {
824e5dd7070Spatrick   if (Tokens.size() < 4)
825e5dd7070Spatrick     return false;
826e5dd7070Spatrick   FormatToken *Last = Tokens.back();
827e5dd7070Spatrick   if (!Last->is(tok::r_paren))
828e5dd7070Spatrick     return false;
829e5dd7070Spatrick 
830e5dd7070Spatrick   FormatToken *String = Tokens[Tokens.size() - 2];
831e5dd7070Spatrick   if (!String->is(tok::string_literal) || String->IsMultiline)
832e5dd7070Spatrick     return false;
833e5dd7070Spatrick 
834e5dd7070Spatrick   if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
835e5dd7070Spatrick     return false;
836e5dd7070Spatrick 
837e5dd7070Spatrick   FormatToken *Macro = Tokens[Tokens.size() - 4];
838e5dd7070Spatrick   if (Macro->TokenText != "_T")
839e5dd7070Spatrick     return false;
840e5dd7070Spatrick 
841e5dd7070Spatrick   const char *Start = Macro->TokenText.data();
842e5dd7070Spatrick   const char *End = Last->TokenText.data() + Last->TokenText.size();
843e5dd7070Spatrick   String->TokenText = StringRef(Start, End - Start);
844e5dd7070Spatrick   String->IsFirst = Macro->IsFirst;
845e5dd7070Spatrick   String->LastNewlineOffset = Macro->LastNewlineOffset;
846e5dd7070Spatrick   String->WhitespaceRange = Macro->WhitespaceRange;
847e5dd7070Spatrick   String->OriginalColumn = Macro->OriginalColumn;
848e5dd7070Spatrick   String->ColumnWidth = encoding::columnWidthWithTabs(
849e5dd7070Spatrick       String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
850e5dd7070Spatrick   String->NewlinesBefore = Macro->NewlinesBefore;
851e5dd7070Spatrick   String->HasUnescapedNewline = Macro->HasUnescapedNewline;
852e5dd7070Spatrick 
853e5dd7070Spatrick   Tokens.pop_back();
854e5dd7070Spatrick   Tokens.pop_back();
855e5dd7070Spatrick   Tokens.pop_back();
856e5dd7070Spatrick   Tokens.back() = String;
857*12c85518Srobert   if (FirstInLineIndex >= Tokens.size())
858*12c85518Srobert     FirstInLineIndex = Tokens.size() - 1;
859e5dd7070Spatrick   return true;
860e5dd7070Spatrick }
861e5dd7070Spatrick 
tryMergeConflictMarkers()862e5dd7070Spatrick bool FormatTokenLexer::tryMergeConflictMarkers() {
863e5dd7070Spatrick   if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
864e5dd7070Spatrick     return false;
865e5dd7070Spatrick 
866e5dd7070Spatrick   // Conflict lines look like:
867e5dd7070Spatrick   // <marker> <text from the vcs>
868e5dd7070Spatrick   // For example:
869e5dd7070Spatrick   // >>>>>>> /file/in/file/system at revision 1234
870e5dd7070Spatrick   //
871e5dd7070Spatrick   // We merge all tokens in a line that starts with a conflict marker
872e5dd7070Spatrick   // into a single token with a special token type that the unwrapped line
873e5dd7070Spatrick   // parser will use to correctly rebuild the underlying code.
874e5dd7070Spatrick 
875e5dd7070Spatrick   FileID ID;
876e5dd7070Spatrick   // Get the position of the first token in the line.
877e5dd7070Spatrick   unsigned FirstInLineOffset;
878e5dd7070Spatrick   std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
879e5dd7070Spatrick       Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
880a9ac8606Spatrick   StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
881e5dd7070Spatrick   // Calculate the offset of the start of the current line.
882e5dd7070Spatrick   auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
883*12c85518Srobert   if (LineOffset == StringRef::npos)
884e5dd7070Spatrick     LineOffset = 0;
885*12c85518Srobert   else
886e5dd7070Spatrick     ++LineOffset;
887e5dd7070Spatrick 
888e5dd7070Spatrick   auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
889e5dd7070Spatrick   StringRef LineStart;
890*12c85518Srobert   if (FirstSpace == StringRef::npos)
891e5dd7070Spatrick     LineStart = Buffer.substr(LineOffset);
892*12c85518Srobert   else
893e5dd7070Spatrick     LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
894e5dd7070Spatrick 
895e5dd7070Spatrick   TokenType Type = TT_Unknown;
896e5dd7070Spatrick   if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
897e5dd7070Spatrick     Type = TT_ConflictStart;
898e5dd7070Spatrick   } else if (LineStart == "|||||||" || LineStart == "=======" ||
899e5dd7070Spatrick              LineStart == "====") {
900e5dd7070Spatrick     Type = TT_ConflictAlternative;
901e5dd7070Spatrick   } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
902e5dd7070Spatrick     Type = TT_ConflictEnd;
903e5dd7070Spatrick   }
904e5dd7070Spatrick 
905e5dd7070Spatrick   if (Type != TT_Unknown) {
906e5dd7070Spatrick     FormatToken *Next = Tokens.back();
907e5dd7070Spatrick 
908e5dd7070Spatrick     Tokens.resize(FirstInLineIndex + 1);
909e5dd7070Spatrick     // We do not need to build a complete token here, as we will skip it
910e5dd7070Spatrick     // during parsing anyway (as we must not touch whitespace around conflict
911e5dd7070Spatrick     // markers).
912ec727ea7Spatrick     Tokens.back()->setType(Type);
913e5dd7070Spatrick     Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
914e5dd7070Spatrick 
915e5dd7070Spatrick     Tokens.push_back(Next);
916e5dd7070Spatrick     return true;
917e5dd7070Spatrick   }
918e5dd7070Spatrick 
919e5dd7070Spatrick   return false;
920e5dd7070Spatrick }
921e5dd7070Spatrick 
getStashedToken()922e5dd7070Spatrick FormatToken *FormatTokenLexer::getStashedToken() {
923e5dd7070Spatrick   // Create a synthesized second '>' or '<' token.
924e5dd7070Spatrick   Token Tok = FormatTok->Tok;
925e5dd7070Spatrick   StringRef TokenText = FormatTok->TokenText;
926e5dd7070Spatrick 
927e5dd7070Spatrick   unsigned OriginalColumn = FormatTok->OriginalColumn;
928e5dd7070Spatrick   FormatTok = new (Allocator.Allocate()) FormatToken;
929e5dd7070Spatrick   FormatTok->Tok = Tok;
930e5dd7070Spatrick   SourceLocation TokLocation =
931e5dd7070Spatrick       FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
932e5dd7070Spatrick   FormatTok->Tok.setLocation(TokLocation);
933e5dd7070Spatrick   FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
934e5dd7070Spatrick   FormatTok->TokenText = TokenText;
935e5dd7070Spatrick   FormatTok->ColumnWidth = 1;
936e5dd7070Spatrick   FormatTok->OriginalColumn = OriginalColumn + 1;
937e5dd7070Spatrick 
938e5dd7070Spatrick   return FormatTok;
939e5dd7070Spatrick }
940e5dd7070Spatrick 
941*12c85518Srobert /// Truncate the current token to the new length and make the lexer continue
942*12c85518Srobert /// from the end of the truncated token. Used for other languages that have
943*12c85518Srobert /// different token boundaries, like JavaScript in which a comment ends at a
944*12c85518Srobert /// line break regardless of whether the line break follows a backslash. Also
945*12c85518Srobert /// used to set the lexer to the end of whitespace if the lexer regards
946*12c85518Srobert /// whitespace and an unrecognized symbol as one token.
truncateToken(size_t NewLen)947*12c85518Srobert void FormatTokenLexer::truncateToken(size_t NewLen) {
948*12c85518Srobert   assert(NewLen <= FormatTok->TokenText.size());
949*12c85518Srobert   resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
950*12c85518Srobert       Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
951*12c85518Srobert   FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
952*12c85518Srobert   FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
953*12c85518Srobert       FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
954*12c85518Srobert       Encoding);
955*12c85518Srobert   FormatTok->Tok.setLength(NewLen);
956*12c85518Srobert }
957*12c85518Srobert 
958*12c85518Srobert /// Count the length of leading whitespace in a token.
countLeadingWhitespace(StringRef Text)959*12c85518Srobert static size_t countLeadingWhitespace(StringRef Text) {
960*12c85518Srobert   // Basically counting the length matched by this regex.
961*12c85518Srobert   // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
962*12c85518Srobert   // Directly using the regex turned out to be slow. With the regex
963*12c85518Srobert   // version formatting all files in this directory took about 1.25
964*12c85518Srobert   // seconds. This version took about 0.5 seconds.
965*12c85518Srobert   const unsigned char *const Begin = Text.bytes_begin();
966*12c85518Srobert   const unsigned char *const End = Text.bytes_end();
967*12c85518Srobert   const unsigned char *Cur = Begin;
968*12c85518Srobert   while (Cur < End) {
969*12c85518Srobert     if (isspace(Cur[0])) {
970*12c85518Srobert       ++Cur;
971*12c85518Srobert     } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
972*12c85518Srobert       // A '\' followed by a newline always escapes the newline, regardless
973*12c85518Srobert       // of whether there is another '\' before it.
974*12c85518Srobert       // The source has a null byte at the end. So the end of the entire input
975*12c85518Srobert       // isn't reached yet. Also the lexer doesn't break apart an escaped
976*12c85518Srobert       // newline.
977*12c85518Srobert       assert(End - Cur >= 2);
978*12c85518Srobert       Cur += 2;
979*12c85518Srobert     } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
980*12c85518Srobert                (Cur[3] == '\n' || Cur[3] == '\r')) {
981*12c85518Srobert       // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
982*12c85518Srobert       // characters are quoted individually in this comment because if we write
983*12c85518Srobert       // them together some compilers warn that we have a trigraph in the code.
984*12c85518Srobert       assert(End - Cur >= 4);
985*12c85518Srobert       Cur += 4;
986*12c85518Srobert     } else {
987*12c85518Srobert       break;
988*12c85518Srobert     }
989*12c85518Srobert   }
990*12c85518Srobert   return Cur - Begin;
991*12c85518Srobert }
992*12c85518Srobert 
getNextToken()993e5dd7070Spatrick FormatToken *FormatTokenLexer::getNextToken() {
994e5dd7070Spatrick   if (StateStack.top() == LexerState::TOKEN_STASHED) {
995e5dd7070Spatrick     StateStack.pop();
996e5dd7070Spatrick     return getStashedToken();
997e5dd7070Spatrick   }
998e5dd7070Spatrick 
999e5dd7070Spatrick   FormatTok = new (Allocator.Allocate()) FormatToken;
1000e5dd7070Spatrick   readRawToken(*FormatTok);
1001e5dd7070Spatrick   SourceLocation WhitespaceStart =
1002e5dd7070Spatrick       FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1003e5dd7070Spatrick   FormatTok->IsFirst = IsFirstToken;
1004e5dd7070Spatrick   IsFirstToken = false;
1005e5dd7070Spatrick 
1006e5dd7070Spatrick   // Consume and record whitespace until we find a significant token.
1007*12c85518Srobert   // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1008*12c85518Srobert   // followed by a symbol such as backtick. Those symbols may be
1009*12c85518Srobert   // significant in other languages.
1010e5dd7070Spatrick   unsigned WhitespaceLength = TrailingWhitespace;
1011*12c85518Srobert   while (FormatTok->isNot(tok::eof)) {
1012*12c85518Srobert     auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1013*12c85518Srobert     if (LeadingWhitespace == 0)
1014e5dd7070Spatrick       break;
1015*12c85518Srobert     if (LeadingWhitespace < FormatTok->TokenText.size())
1016*12c85518Srobert       truncateToken(LeadingWhitespace);
1017*12c85518Srobert     StringRef Text = FormatTok->TokenText;
1018*12c85518Srobert     bool InEscape = false;
1019e5dd7070Spatrick     for (int i = 0, e = Text.size(); i != e; ++i) {
1020e5dd7070Spatrick       switch (Text[i]) {
1021*12c85518Srobert       case '\r':
1022*12c85518Srobert         // If this is a CRLF sequence, break here and the LF will be handled on
1023*12c85518Srobert         // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1024*12c85518Srobert         // the same as a single LF.
1025*12c85518Srobert         if (i + 1 < e && Text[i + 1] == '\n')
1026*12c85518Srobert           break;
1027*12c85518Srobert         [[fallthrough]];
1028e5dd7070Spatrick       case '\n':
1029e5dd7070Spatrick         ++FormatTok->NewlinesBefore;
1030*12c85518Srobert         if (!InEscape)
1031*12c85518Srobert           FormatTok->HasUnescapedNewline = true;
1032*12c85518Srobert         else
1033*12c85518Srobert           InEscape = false;
1034e5dd7070Spatrick         FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1035e5dd7070Spatrick         Column = 0;
1036e5dd7070Spatrick         break;
1037e5dd7070Spatrick       case '\f':
1038e5dd7070Spatrick       case '\v':
1039e5dd7070Spatrick         Column = 0;
1040e5dd7070Spatrick         break;
1041e5dd7070Spatrick       case ' ':
1042e5dd7070Spatrick         ++Column;
1043e5dd7070Spatrick         break;
1044e5dd7070Spatrick       case '\t':
1045e5dd7070Spatrick         Column +=
1046e5dd7070Spatrick             Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1047e5dd7070Spatrick         break;
1048e5dd7070Spatrick       case '\\':
1049*12c85518Srobert       case '?':
1050*12c85518Srobert       case '/':
1051*12c85518Srobert         // The text was entirely whitespace when this loop was entered. Thus
1052*12c85518Srobert         // this has to be an escape sequence.
1053*12c85518Srobert         assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1054*12c85518Srobert                Text.substr(i, 4) == "\?\?/\r" ||
1055*12c85518Srobert                Text.substr(i, 4) == "\?\?/\n" ||
1056*12c85518Srobert                (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1057*12c85518Srobert                            Text.substr(i - 1, 4) == "\?\?/\n")) ||
1058*12c85518Srobert                (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1059*12c85518Srobert                            Text.substr(i - 2, 4) == "\?\?/\n")));
1060*12c85518Srobert         InEscape = true;
1061e5dd7070Spatrick         break;
1062e5dd7070Spatrick       default:
1063*12c85518Srobert         // This shouldn't happen.
1064*12c85518Srobert         assert(false);
1065e5dd7070Spatrick         break;
1066e5dd7070Spatrick       }
1067e5dd7070Spatrick     }
1068*12c85518Srobert     WhitespaceLength += Text.size();
1069e5dd7070Spatrick     readRawToken(*FormatTok);
1070e5dd7070Spatrick   }
1071e5dd7070Spatrick 
1072*12c85518Srobert   if (FormatTok->is(tok::unknown))
1073*12c85518Srobert     FormatTok->setType(TT_ImplicitStringLiteral);
1074*12c85518Srobert 
1075e5dd7070Spatrick   // JavaScript and Java do not allow to escape the end of the line with a
1076e5dd7070Spatrick   // backslash. Backslashes are syntax errors in plain source, but can occur in
1077e5dd7070Spatrick   // comments. When a single line comment ends with a \, it'll cause the next
1078e5dd7070Spatrick   // line of code to be lexed as a comment, breaking formatting. The code below
1079e5dd7070Spatrick   // finds comments that contain a backslash followed by a line break, truncates
1080e5dd7070Spatrick   // the comment token at the backslash, and resets the lexer to restart behind
1081e5dd7070Spatrick   // the backslash.
1082*12c85518Srobert   if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1083e5dd7070Spatrick       FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
1084e5dd7070Spatrick     size_t BackslashPos = FormatTok->TokenText.find('\\');
1085e5dd7070Spatrick     while (BackslashPos != StringRef::npos) {
1086e5dd7070Spatrick       if (BackslashPos + 1 < FormatTok->TokenText.size() &&
1087e5dd7070Spatrick           FormatTok->TokenText[BackslashPos + 1] == '\n') {
1088*12c85518Srobert         truncateToken(BackslashPos + 1);
1089e5dd7070Spatrick         break;
1090e5dd7070Spatrick       }
1091e5dd7070Spatrick       BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
1092e5dd7070Spatrick     }
1093e5dd7070Spatrick   }
1094e5dd7070Spatrick 
1095*12c85518Srobert   if (Style.isVerilog()) {
1096*12c85518Srobert     static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1097*12c85518Srobert     SmallVector<StringRef, 1> Matches;
1098*12c85518Srobert     // Verilog uses the backtick instead of the hash for preprocessor stuff.
1099*12c85518Srobert     // And it uses the hash for delays and parameter lists. In order to continue
1100*12c85518Srobert     // using `tok::hash` in other places, the backtick gets marked as the hash
1101*12c85518Srobert     // here.  And in order to tell the backtick and hash apart for
1102*12c85518Srobert     // Verilog-specific stuff, the hash becomes an identifier.
1103*12c85518Srobert     if (FormatTok->is(tok::numeric_constant)) {
1104*12c85518Srobert       // In Verilog the quote is not part of a number.
1105*12c85518Srobert       auto Quote = FormatTok->TokenText.find('\'');
1106*12c85518Srobert       if (Quote != StringRef::npos)
1107*12c85518Srobert         truncateToken(Quote);
1108*12c85518Srobert     } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1109*12c85518Srobert       FormatTok->Tok.setKind(tok::raw_identifier);
1110*12c85518Srobert     } else if (FormatTok->is(tok::raw_identifier)) {
1111*12c85518Srobert       if (FormatTok->TokenText == "`") {
1112*12c85518Srobert         FormatTok->Tok.setIdentifierInfo(nullptr);
1113*12c85518Srobert         FormatTok->Tok.setKind(tok::hash);
1114*12c85518Srobert       } else if (FormatTok->TokenText == "``") {
1115*12c85518Srobert         FormatTok->Tok.setIdentifierInfo(nullptr);
1116*12c85518Srobert         FormatTok->Tok.setKind(tok::hashhash);
1117*12c85518Srobert       } else if (Tokens.size() > 0 &&
1118*12c85518Srobert                  Tokens.back()->is(Keywords.kw_apostrophe) &&
1119*12c85518Srobert                  NumberBase.match(FormatTok->TokenText, &Matches)) {
1120*12c85518Srobert         // In Verilog in a based number literal like `'b10`, there may be
1121*12c85518Srobert         // whitespace between `'b` and `10`. Therefore we handle the base and
1122*12c85518Srobert         // the rest of the number literal as two tokens. But if there is no
1123*12c85518Srobert         // space in the input code, we need to manually separate the two parts.
1124*12c85518Srobert         truncateToken(Matches[0].size());
1125*12c85518Srobert         FormatTok->setFinalizedType(TT_VerilogNumberBase);
1126*12c85518Srobert       }
1127*12c85518Srobert     }
1128e5dd7070Spatrick   }
1129e5dd7070Spatrick 
1130e5dd7070Spatrick   FormatTok->WhitespaceRange = SourceRange(
1131e5dd7070Spatrick       WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1132e5dd7070Spatrick 
1133e5dd7070Spatrick   FormatTok->OriginalColumn = Column;
1134e5dd7070Spatrick 
1135e5dd7070Spatrick   TrailingWhitespace = 0;
1136*12c85518Srobert   if (FormatTok->is(tok::comment)) {
1137e5dd7070Spatrick     // FIXME: Add the trimmed whitespace to Column.
1138e5dd7070Spatrick     StringRef UntrimmedText = FormatTok->TokenText;
1139e5dd7070Spatrick     FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1140e5dd7070Spatrick     TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1141*12c85518Srobert   } else if (FormatTok->is(tok::raw_identifier)) {
1142e5dd7070Spatrick     IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1143e5dd7070Spatrick     FormatTok->Tok.setIdentifierInfo(&Info);
1144e5dd7070Spatrick     FormatTok->Tok.setKind(Info.getTokenID());
1145e5dd7070Spatrick     if (Style.Language == FormatStyle::LK_Java &&
1146e5dd7070Spatrick         FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1147e5dd7070Spatrick                            tok::kw_operator)) {
1148e5dd7070Spatrick       FormatTok->Tok.setKind(tok::identifier);
1149e5dd7070Spatrick       FormatTok->Tok.setIdentifierInfo(nullptr);
1150*12c85518Srobert     } else if (Style.isJavaScript() &&
1151e5dd7070Spatrick                FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1152e5dd7070Spatrick                                   tok::kw_operator)) {
1153e5dd7070Spatrick       FormatTok->Tok.setKind(tok::identifier);
1154e5dd7070Spatrick       FormatTok->Tok.setIdentifierInfo(nullptr);
1155e5dd7070Spatrick     }
1156*12c85518Srobert   } else if (FormatTok->is(tok::greatergreater)) {
1157e5dd7070Spatrick     FormatTok->Tok.setKind(tok::greater);
1158e5dd7070Spatrick     FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1159e5dd7070Spatrick     ++Column;
1160e5dd7070Spatrick     StateStack.push(LexerState::TOKEN_STASHED);
1161*12c85518Srobert   } else if (FormatTok->is(tok::lessless)) {
1162e5dd7070Spatrick     FormatTok->Tok.setKind(tok::less);
1163e5dd7070Spatrick     FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1164e5dd7070Spatrick     ++Column;
1165e5dd7070Spatrick     StateStack.push(LexerState::TOKEN_STASHED);
1166e5dd7070Spatrick   }
1167e5dd7070Spatrick 
1168*12c85518Srobert   if (Style.isVerilog() && Tokens.size() > 0 &&
1169*12c85518Srobert       Tokens.back()->is(TT_VerilogNumberBase) &&
1170*12c85518Srobert       FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1171*12c85518Srobert     // Mark the number following a base like `'h?a0` as a number.
1172*12c85518Srobert     FormatTok->Tok.setKind(tok::numeric_constant);
1173*12c85518Srobert   }
1174*12c85518Srobert 
1175e5dd7070Spatrick   // Now FormatTok is the next non-whitespace token.
1176e5dd7070Spatrick 
1177e5dd7070Spatrick   StringRef Text = FormatTok->TokenText;
1178e5dd7070Spatrick   size_t FirstNewlinePos = Text.find('\n');
1179e5dd7070Spatrick   if (FirstNewlinePos == StringRef::npos) {
1180e5dd7070Spatrick     // FIXME: ColumnWidth actually depends on the start column, we need to
1181e5dd7070Spatrick     // take this into account when the token is moved.
1182e5dd7070Spatrick     FormatTok->ColumnWidth =
1183e5dd7070Spatrick         encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1184e5dd7070Spatrick     Column += FormatTok->ColumnWidth;
1185e5dd7070Spatrick   } else {
1186e5dd7070Spatrick     FormatTok->IsMultiline = true;
1187e5dd7070Spatrick     // FIXME: ColumnWidth actually depends on the start column, we need to
1188e5dd7070Spatrick     // take this into account when the token is moved.
1189e5dd7070Spatrick     FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1190e5dd7070Spatrick         Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1191e5dd7070Spatrick 
1192e5dd7070Spatrick     // The last line of the token always starts in column 0.
1193e5dd7070Spatrick     // Thus, the length can be precomputed even in the presence of tabs.
1194e5dd7070Spatrick     FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1195e5dd7070Spatrick         Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1196e5dd7070Spatrick     Column = FormatTok->LastLineColumnWidth;
1197e5dd7070Spatrick   }
1198e5dd7070Spatrick 
1199e5dd7070Spatrick   if (Style.isCpp()) {
1200e5dd7070Spatrick     auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
1201e5dd7070Spatrick     if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1202e5dd7070Spatrick           Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1203e5dd7070Spatrick               tok::pp_define) &&
1204e5dd7070Spatrick         it != Macros.end()) {
1205ec727ea7Spatrick       FormatTok->setType(it->second);
1206a9ac8606Spatrick       if (it->second == TT_IfMacro) {
1207a9ac8606Spatrick         // The lexer token currently has type tok::kw_unknown. However, for this
1208a9ac8606Spatrick         // substitution to be treated correctly in the TokenAnnotator, faking
1209a9ac8606Spatrick         // the tok value seems to be needed. Not sure if there's a more elegant
1210a9ac8606Spatrick         // way.
1211a9ac8606Spatrick         FormatTok->Tok.setKind(tok::kw_if);
1212a9ac8606Spatrick       }
1213e5dd7070Spatrick     } else if (FormatTok->is(tok::identifier)) {
1214*12c85518Srobert       if (MacroBlockBeginRegex.match(Text))
1215ec727ea7Spatrick         FormatTok->setType(TT_MacroBlockBegin);
1216*12c85518Srobert       else if (MacroBlockEndRegex.match(Text))
1217ec727ea7Spatrick         FormatTok->setType(TT_MacroBlockEnd);
1218e5dd7070Spatrick     }
1219e5dd7070Spatrick   }
1220e5dd7070Spatrick 
1221e5dd7070Spatrick   return FormatTok;
1222e5dd7070Spatrick }
1223e5dd7070Spatrick 
readRawTokenVerilogSpecific(Token & Tok)1224*12c85518Srobert bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
1225*12c85518Srobert   // In Verilog the quote is not a character literal.
1226*12c85518Srobert   //
1227*12c85518Srobert   // Make the backtick and double backtick identifiers to match against them
1228*12c85518Srobert   // more easily.
1229*12c85518Srobert   //
1230*12c85518Srobert   // In Verilog an escaped identifier starts with backslash and ends with
1231*12c85518Srobert   // whitespace. Unless that whitespace is an escaped newline. A backslash can
1232*12c85518Srobert   // also begin an escaped newline outside of an escaped identifier. We check
1233*12c85518Srobert   // for that outside of the Regex since we can't use negative lookhead
1234*12c85518Srobert   // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1235*12c85518Srobert   // identifier may have a length of 0 according to Section A.9.3.
1236*12c85518Srobert   // FIXME: If there is an escaped newline in the middle of an escaped
1237*12c85518Srobert   // identifier, allow for pasting the two lines together, But escaped
1238*12c85518Srobert   // identifiers usually occur only in generated code anyway.
1239*12c85518Srobert   static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
1240*12c85518Srobert                                         "(\r?\n|\r)|[^[:space:]])*)");
1241*12c85518Srobert 
1242*12c85518Srobert   SmallVector<StringRef, 4> Matches;
1243*12c85518Srobert   const char *Start = Lex->getBufferLocation();
1244*12c85518Srobert   if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
1245*12c85518Srobert                           &Matches)) {
1246*12c85518Srobert     return false;
1247*12c85518Srobert   }
1248*12c85518Srobert   // There is a null byte at the end of the buffer, so we don't have to check
1249*12c85518Srobert   // Start[1] is within the buffer.
1250*12c85518Srobert   if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
1251*12c85518Srobert     return false;
1252*12c85518Srobert   size_t Len = Matches[0].size();
1253*12c85518Srobert 
1254*12c85518Srobert   // The kind has to be an identifier so we can match it against those defined
1255*12c85518Srobert   // in Keywords. The kind has to be set before the length because the setLength
1256*12c85518Srobert   // function checks that the kind is not an annotation.
1257*12c85518Srobert   Tok.setKind(tok::raw_identifier);
1258*12c85518Srobert   Tok.setLength(Len);
1259*12c85518Srobert   Tok.setLocation(Lex->getSourceLocation(Start, Len));
1260*12c85518Srobert   Tok.setRawIdentifierData(Start);
1261*12c85518Srobert   Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1262*12c85518Srobert   return true;
1263*12c85518Srobert }
1264*12c85518Srobert 
readRawToken(FormatToken & Tok)1265e5dd7070Spatrick void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1266*12c85518Srobert   // For Verilog, first see if there is a special token, and fall back to the
1267*12c85518Srobert   // normal lexer if there isn't one.
1268*12c85518Srobert   if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1269e5dd7070Spatrick     Lex->LexFromRawLexer(Tok.Tok);
1270e5dd7070Spatrick   Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1271e5dd7070Spatrick                             Tok.Tok.getLength());
1272e5dd7070Spatrick   // For formatting, treat unterminated string literals like normal string
1273e5dd7070Spatrick   // literals.
1274e5dd7070Spatrick   if (Tok.is(tok::unknown)) {
1275e5dd7070Spatrick     if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
1276e5dd7070Spatrick       Tok.Tok.setKind(tok::string_literal);
1277e5dd7070Spatrick       Tok.IsUnterminatedLiteral = true;
1278*12c85518Srobert     } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1279e5dd7070Spatrick       Tok.Tok.setKind(tok::string_literal);
1280e5dd7070Spatrick     }
1281e5dd7070Spatrick   }
1282e5dd7070Spatrick 
1283*12c85518Srobert   if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Proto ||
1284e5dd7070Spatrick        Style.Language == FormatStyle::LK_TextProto) &&
1285e5dd7070Spatrick       Tok.is(tok::char_constant)) {
1286e5dd7070Spatrick     Tok.Tok.setKind(tok::string_literal);
1287e5dd7070Spatrick   }
1288e5dd7070Spatrick 
1289e5dd7070Spatrick   if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
1290e5dd7070Spatrick                                Tok.TokenText == "/* clang-format on */")) {
1291e5dd7070Spatrick     FormattingDisabled = false;
1292e5dd7070Spatrick   }
1293e5dd7070Spatrick 
1294e5dd7070Spatrick   Tok.Finalized = FormattingDisabled;
1295e5dd7070Spatrick 
1296e5dd7070Spatrick   if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
1297e5dd7070Spatrick                                Tok.TokenText == "/* clang-format off */")) {
1298e5dd7070Spatrick     FormattingDisabled = true;
1299e5dd7070Spatrick   }
1300e5dd7070Spatrick }
1301e5dd7070Spatrick 
resetLexer(unsigned Offset)1302e5dd7070Spatrick void FormatTokenLexer::resetLexer(unsigned Offset) {
1303e5dd7070Spatrick   StringRef Buffer = SourceMgr.getBufferData(ID);
1304*12c85518Srobert   LangOpts = getFormattingLangOpts(Style);
1305*12c85518Srobert   Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1306*12c85518Srobert                       Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1307e5dd7070Spatrick   Lex->SetKeepWhitespaceMode(true);
1308e5dd7070Spatrick   TrailingWhitespace = 0;
1309e5dd7070Spatrick }
1310e5dd7070Spatrick 
1311e5dd7070Spatrick } // namespace format
1312e5dd7070Spatrick } // namespace clang
1313