1e5dd7070Spatrick //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2e5dd7070Spatrick //
3e5dd7070Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e5dd7070Spatrick // See https://llvm.org/LICENSE.txt for license information.
5e5dd7070Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6e5dd7070Spatrick //
7e5dd7070Spatrick //===----------------------------------------------------------------------===//
8e5dd7070Spatrick ///
9e5dd7070Spatrick /// \file
10e5dd7070Spatrick /// This file implements FormatTokenLexer, which tokenizes a source file
11e5dd7070Spatrick /// into a FormatToken stream suitable for ClangFormat.
12e5dd7070Spatrick ///
13e5dd7070Spatrick //===----------------------------------------------------------------------===//
14e5dd7070Spatrick
15e5dd7070Spatrick #include "FormatTokenLexer.h"
16e5dd7070Spatrick #include "FormatToken.h"
17e5dd7070Spatrick #include "clang/Basic/SourceLocation.h"
18e5dd7070Spatrick #include "clang/Basic/SourceManager.h"
19e5dd7070Spatrick #include "clang/Format/Format.h"
20e5dd7070Spatrick #include "llvm/Support/Regex.h"
21e5dd7070Spatrick
22e5dd7070Spatrick namespace clang {
23e5dd7070Spatrick namespace format {
24e5dd7070Spatrick
FormatTokenLexer(const SourceManager & SourceMgr,FileID ID,unsigned Column,const FormatStyle & Style,encoding::Encoding Encoding,llvm::SpecificBumpPtrAllocator<FormatToken> & Allocator,IdentifierTable & IdentTable)25ec727ea7Spatrick FormatTokenLexer::FormatTokenLexer(
26ec727ea7Spatrick const SourceManager &SourceMgr, FileID ID, unsigned Column,
27ec727ea7Spatrick const FormatStyle &Style, encoding::Encoding Encoding,
28ec727ea7Spatrick llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29ec727ea7Spatrick IdentifierTable &IdentTable)
30e5dd7070Spatrick : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31*12c85518Srobert Column(Column), TrailingWhitespace(0),
32*12c85518Srobert LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
33ec727ea7Spatrick Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
34ec727ea7Spatrick Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
35e5dd7070Spatrick FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
36e5dd7070Spatrick MacroBlockEndRegex(Style.MacroBlockEnd) {
37*12c85518Srobert Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
38e5dd7070Spatrick Lex->SetKeepWhitespaceMode(true);
39e5dd7070Spatrick
40*12c85518Srobert for (const std::string &ForEachMacro : Style.ForEachMacros) {
41*12c85518Srobert auto Identifier = &IdentTable.get(ForEachMacro);
42*12c85518Srobert Macros.insert({Identifier, TT_ForEachMacro});
43*12c85518Srobert }
44*12c85518Srobert for (const std::string &IfMacro : Style.IfMacros) {
45*12c85518Srobert auto Identifier = &IdentTable.get(IfMacro);
46*12c85518Srobert Macros.insert({Identifier, TT_IfMacro});
47*12c85518Srobert }
48*12c85518Srobert for (const std::string &AttributeMacro : Style.AttributeMacros) {
49*12c85518Srobert auto Identifier = &IdentTable.get(AttributeMacro);
50*12c85518Srobert Macros.insert({Identifier, TT_AttributeMacro});
51*12c85518Srobert }
52*12c85518Srobert for (const std::string &StatementMacro : Style.StatementMacros) {
53*12c85518Srobert auto Identifier = &IdentTable.get(StatementMacro);
54*12c85518Srobert Macros.insert({Identifier, TT_StatementMacro});
55*12c85518Srobert }
56*12c85518Srobert for (const std::string &TypenameMacro : Style.TypenameMacros) {
57*12c85518Srobert auto Identifier = &IdentTable.get(TypenameMacro);
58*12c85518Srobert Macros.insert({Identifier, TT_TypenameMacro});
59*12c85518Srobert }
60*12c85518Srobert for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61*12c85518Srobert auto Identifier = &IdentTable.get(NamespaceMacro);
62*12c85518Srobert Macros.insert({Identifier, TT_NamespaceMacro});
63*12c85518Srobert }
64ec727ea7Spatrick for (const std::string &WhitespaceSensitiveMacro :
65ec727ea7Spatrick Style.WhitespaceSensitiveMacros) {
66*12c85518Srobert auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67*12c85518Srobert Macros.insert({Identifier, TT_UntouchableMacroFunc});
68ec727ea7Spatrick }
69a9ac8606Spatrick for (const std::string &StatementAttributeLikeMacro :
70*12c85518Srobert Style.StatementAttributeLikeMacros) {
71*12c85518Srobert auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72*12c85518Srobert Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73*12c85518Srobert }
74e5dd7070Spatrick }
75e5dd7070Spatrick
lex()76e5dd7070Spatrick ArrayRef<FormatToken *> FormatTokenLexer::lex() {
77e5dd7070Spatrick assert(Tokens.empty());
78e5dd7070Spatrick assert(FirstInLineIndex == 0);
79e5dd7070Spatrick do {
80e5dd7070Spatrick Tokens.push_back(getNextToken());
81*12c85518Srobert if (Style.isJavaScript()) {
82e5dd7070Spatrick tryParseJSRegexLiteral();
83e5dd7070Spatrick handleTemplateStrings();
84e5dd7070Spatrick }
85e5dd7070Spatrick if (Style.Language == FormatStyle::LK_TextProto)
86e5dd7070Spatrick tryParsePythonComment();
87e5dd7070Spatrick tryMergePreviousTokens();
88*12c85518Srobert if (Style.isCSharp()) {
89ec727ea7Spatrick // This needs to come after tokens have been merged so that C#
90ec727ea7Spatrick // string literals are correctly identified.
91ec727ea7Spatrick handleCSharpVerbatimAndInterpolatedStrings();
92*12c85518Srobert }
93e5dd7070Spatrick if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
94e5dd7070Spatrick FirstInLineIndex = Tokens.size() - 1;
95*12c85518Srobert } while (Tokens.back()->isNot(tok::eof));
96e5dd7070Spatrick return Tokens;
97e5dd7070Spatrick }
98e5dd7070Spatrick
tryMergePreviousTokens()99e5dd7070Spatrick void FormatTokenLexer::tryMergePreviousTokens() {
100e5dd7070Spatrick if (tryMerge_TMacro())
101e5dd7070Spatrick return;
102e5dd7070Spatrick if (tryMergeConflictMarkers())
103e5dd7070Spatrick return;
104e5dd7070Spatrick if (tryMergeLessLess())
105e5dd7070Spatrick return;
106ec727ea7Spatrick if (tryMergeForEach())
107ec727ea7Spatrick return;
108ec727ea7Spatrick if (Style.isCpp() && tryTransformTryUsageForC())
109ec727ea7Spatrick return;
110e5dd7070Spatrick
111*12c85518Srobert if (Style.isJavaScript() || Style.isCSharp()) {
112a9ac8606Spatrick static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
113a9ac8606Spatrick tok::question};
114a9ac8606Spatrick static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
115a9ac8606Spatrick tok::period};
116a9ac8606Spatrick static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
117a9ac8606Spatrick
118a9ac8606Spatrick if (tryMergeTokens(FatArrow, TT_FatArrow))
119a9ac8606Spatrick return;
120a9ac8606Spatrick if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
121a9ac8606Spatrick // Treat like the "||" operator (as opposed to the ternary ?).
122a9ac8606Spatrick Tokens.back()->Tok.setKind(tok::pipepipe);
123a9ac8606Spatrick return;
124a9ac8606Spatrick }
125a9ac8606Spatrick if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
126a9ac8606Spatrick // Treat like a regular "." access.
127a9ac8606Spatrick Tokens.back()->Tok.setKind(tok::period);
128a9ac8606Spatrick return;
129a9ac8606Spatrick }
130*12c85518Srobert if (tryMergeNullishCoalescingEqual())
131a9ac8606Spatrick return;
132a9ac8606Spatrick }
133a9ac8606Spatrick
134e5dd7070Spatrick if (Style.isCSharp()) {
135a9ac8606Spatrick static const tok::TokenKind CSharpNullConditionalLSquare[] = {
136a9ac8606Spatrick tok::question, tok::l_square};
137a9ac8606Spatrick
138e5dd7070Spatrick if (tryMergeCSharpKeywordVariables())
139e5dd7070Spatrick return;
140ec727ea7Spatrick if (tryMergeCSharpStringLiteral())
141e5dd7070Spatrick return;
142e5dd7070Spatrick if (tryTransformCSharpForEach())
143e5dd7070Spatrick return;
144a9ac8606Spatrick if (tryMergeTokens(CSharpNullConditionalLSquare,
145a9ac8606Spatrick TT_CSharpNullConditionalLSquare)) {
146a9ac8606Spatrick // Treat like a regular "[" operator.
147a9ac8606Spatrick Tokens.back()->Tok.setKind(tok::l_square);
148e5dd7070Spatrick return;
149e5dd7070Spatrick }
150a9ac8606Spatrick }
151e5dd7070Spatrick
152e5dd7070Spatrick if (tryMergeNSStringLiteral())
153e5dd7070Spatrick return;
154e5dd7070Spatrick
155*12c85518Srobert if (Style.isJavaScript()) {
156e5dd7070Spatrick static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
157e5dd7070Spatrick static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
158e5dd7070Spatrick tok::equal};
159e5dd7070Spatrick static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
160e5dd7070Spatrick tok::greaterequal};
161e5dd7070Spatrick static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
162e5dd7070Spatrick static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
163e5dd7070Spatrick tok::starequal};
164a9ac8606Spatrick static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
165a9ac8606Spatrick static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
166e5dd7070Spatrick
167e5dd7070Spatrick // FIXME: Investigate what token type gives the correct operator priority.
168e5dd7070Spatrick if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
169e5dd7070Spatrick return;
170e5dd7070Spatrick if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
171e5dd7070Spatrick return;
172e5dd7070Spatrick if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
173e5dd7070Spatrick return;
174e5dd7070Spatrick if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
175e5dd7070Spatrick return;
176e5dd7070Spatrick if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
177e5dd7070Spatrick Tokens.back()->Tok.setKind(tok::starequal);
178e5dd7070Spatrick return;
179e5dd7070Spatrick }
180a9ac8606Spatrick if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
181a9ac8606Spatrick tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
182a9ac8606Spatrick // Treat like the "=" assignment operator.
183a9ac8606Spatrick Tokens.back()->Tok.setKind(tok::equal);
184e5dd7070Spatrick return;
185e5dd7070Spatrick }
186e5dd7070Spatrick if (tryMergeJSPrivateIdentifier())
187e5dd7070Spatrick return;
188e5dd7070Spatrick }
189e5dd7070Spatrick
190e5dd7070Spatrick if (Style.Language == FormatStyle::LK_Java) {
191e5dd7070Spatrick static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
192e5dd7070Spatrick tok::greater, tok::greater, tok::greaterequal};
193e5dd7070Spatrick if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
194e5dd7070Spatrick return;
195e5dd7070Spatrick }
196*12c85518Srobert
197*12c85518Srobert if (Style.isVerilog()) {
198*12c85518Srobert // Merge the number following a base like `'h?a0`.
199*12c85518Srobert if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
200*12c85518Srobert Tokens.end()[-2]->is(tok::numeric_constant) &&
201*12c85518Srobert Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
202*12c85518Srobert tok::question) &&
203*12c85518Srobert tryMergeTokens(2, TT_Unknown)) {
204*12c85518Srobert return;
205*12c85518Srobert }
206*12c85518Srobert // Part select.
207*12c85518Srobert if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
208*12c85518Srobert TT_BitFieldColon)) {
209*12c85518Srobert return;
210*12c85518Srobert }
211*12c85518Srobert // Xnor. The combined token is treated as a caret which can also be either a
212*12c85518Srobert // unary or binary operator. The actual type is determined in
213*12c85518Srobert // TokenAnnotator. We also check the token length so we know it is not
214*12c85518Srobert // already a merged token.
215*12c85518Srobert if (Tokens.back()->TokenText.size() == 1 &&
216*12c85518Srobert tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
217*12c85518Srobert TT_BinaryOperator)) {
218*12c85518Srobert Tokens.back()->Tok.setKind(tok::caret);
219*12c85518Srobert return;
220*12c85518Srobert }
221*12c85518Srobert // Signed shift and distribution weight.
222*12c85518Srobert if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
223*12c85518Srobert Tokens.back()->Tok.setKind(tok::lessless);
224*12c85518Srobert return;
225*12c85518Srobert }
226*12c85518Srobert if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
227*12c85518Srobert Tokens.back()->Tok.setKind(tok::greatergreater);
228*12c85518Srobert return;
229*12c85518Srobert }
230*12c85518Srobert if (tryMergeTokensAny({{tok::lessless, tok::equal},
231*12c85518Srobert {tok::lessless, tok::lessequal},
232*12c85518Srobert {tok::greatergreater, tok::equal},
233*12c85518Srobert {tok::greatergreater, tok::greaterequal},
234*12c85518Srobert {tok::colon, tok::equal},
235*12c85518Srobert {tok::colon, tok::slash}},
236*12c85518Srobert TT_BinaryOperator)) {
237*12c85518Srobert Tokens.back()->ForcedPrecedence = prec::Assignment;
238*12c85518Srobert return;
239*12c85518Srobert }
240*12c85518Srobert // Exponentiation, signed shift, case equality, and wildcard equality.
241*12c85518Srobert if (tryMergeTokensAny({{tok::star, tok::star},
242*12c85518Srobert {tok::lessless, tok::less},
243*12c85518Srobert {tok::greatergreater, tok::greater},
244*12c85518Srobert {tok::exclaimequal, tok::equal},
245*12c85518Srobert {tok::exclaimequal, tok::question},
246*12c85518Srobert {tok::equalequal, tok::equal},
247*12c85518Srobert {tok::equalequal, tok::question}},
248*12c85518Srobert TT_BinaryOperator)) {
249*12c85518Srobert return;
250*12c85518Srobert }
251*12c85518Srobert // Module paths in specify blocks and implications in properties.
252*12c85518Srobert if (tryMergeTokensAny({{tok::plusequal, tok::greater},
253*12c85518Srobert {tok::plus, tok::star, tok::greater},
254*12c85518Srobert {tok::minusequal, tok::greater},
255*12c85518Srobert {tok::minus, tok::star, tok::greater},
256*12c85518Srobert {tok::less, tok::arrow},
257*12c85518Srobert {tok::equal, tok::greater},
258*12c85518Srobert {tok::star, tok::greater},
259*12c85518Srobert {tok::pipeequal, tok::greater},
260*12c85518Srobert {tok::pipe, tok::arrow},
261*12c85518Srobert {tok::hash, tok::minus, tok::hash},
262*12c85518Srobert {tok::hash, tok::equal, tok::hash}},
263*12c85518Srobert TT_BinaryOperator)) {
264*12c85518Srobert Tokens.back()->ForcedPrecedence = prec::Comma;
265*12c85518Srobert return;
266*12c85518Srobert }
267*12c85518Srobert }
268e5dd7070Spatrick }
269e5dd7070Spatrick
tryMergeNSStringLiteral()270e5dd7070Spatrick bool FormatTokenLexer::tryMergeNSStringLiteral() {
271e5dd7070Spatrick if (Tokens.size() < 2)
272e5dd7070Spatrick return false;
273e5dd7070Spatrick auto &At = *(Tokens.end() - 2);
274e5dd7070Spatrick auto &String = *(Tokens.end() - 1);
275e5dd7070Spatrick if (!At->is(tok::at) || !String->is(tok::string_literal))
276e5dd7070Spatrick return false;
277e5dd7070Spatrick At->Tok.setKind(tok::string_literal);
278e5dd7070Spatrick At->TokenText = StringRef(At->TokenText.begin(),
279e5dd7070Spatrick String->TokenText.end() - At->TokenText.begin());
280e5dd7070Spatrick At->ColumnWidth += String->ColumnWidth;
281ec727ea7Spatrick At->setType(TT_ObjCStringLiteral);
282e5dd7070Spatrick Tokens.erase(Tokens.end() - 1);
283e5dd7070Spatrick return true;
284e5dd7070Spatrick }
285e5dd7070Spatrick
tryMergeJSPrivateIdentifier()286e5dd7070Spatrick bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
287e5dd7070Spatrick // Merges #idenfier into a single identifier with the text #identifier
288e5dd7070Spatrick // but the token tok::identifier.
289e5dd7070Spatrick if (Tokens.size() < 2)
290e5dd7070Spatrick return false;
291e5dd7070Spatrick auto &Hash = *(Tokens.end() - 2);
292e5dd7070Spatrick auto &Identifier = *(Tokens.end() - 1);
293e5dd7070Spatrick if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
294e5dd7070Spatrick return false;
295e5dd7070Spatrick Hash->Tok.setKind(tok::identifier);
296e5dd7070Spatrick Hash->TokenText =
297e5dd7070Spatrick StringRef(Hash->TokenText.begin(),
298e5dd7070Spatrick Identifier->TokenText.end() - Hash->TokenText.begin());
299e5dd7070Spatrick Hash->ColumnWidth += Identifier->ColumnWidth;
300ec727ea7Spatrick Hash->setType(TT_JsPrivateIdentifier);
301e5dd7070Spatrick Tokens.erase(Tokens.end() - 1);
302e5dd7070Spatrick return true;
303e5dd7070Spatrick }
304e5dd7070Spatrick
305e5dd7070Spatrick // Search for verbatim or interpolated string literals @"ABC" or
306e5dd7070Spatrick // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
307e5dd7070Spatrick // prevent splitting of @, $ and ".
308ec727ea7Spatrick // Merging of multiline verbatim strings with embedded '"' is handled in
309ec727ea7Spatrick // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
tryMergeCSharpStringLiteral()310ec727ea7Spatrick bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
311e5dd7070Spatrick if (Tokens.size() < 2)
312e5dd7070Spatrick return false;
313e5dd7070Spatrick
314ec727ea7Spatrick // Look for @"aaaaaa" or $"aaaaaa".
315*12c85518Srobert const auto String = *(Tokens.end() - 1);
316*12c85518Srobert if (String->isNot(tok::string_literal))
317e5dd7070Spatrick return false;
318e5dd7070Spatrick
319*12c85518Srobert auto Prefix = *(Tokens.end() - 2);
320*12c85518Srobert if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
321ec727ea7Spatrick return false;
322ec727ea7Spatrick
323*12c85518Srobert if (Tokens.size() > 2) {
324*12c85518Srobert const auto Tok = *(Tokens.end() - 3);
325*12c85518Srobert if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
326*12c85518Srobert (Tok->is(tok::at) && Prefix->TokenText == "$")) {
327*12c85518Srobert // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
328*12c85518Srobert Tok->ColumnWidth += Prefix->ColumnWidth;
329e5dd7070Spatrick Tokens.erase(Tokens.end() - 2);
330*12c85518Srobert Prefix = Tok;
331e5dd7070Spatrick }
332e5dd7070Spatrick }
333e5dd7070Spatrick
334e5dd7070Spatrick // Convert back into just a string_literal.
335*12c85518Srobert Prefix->Tok.setKind(tok::string_literal);
336*12c85518Srobert Prefix->TokenText =
337*12c85518Srobert StringRef(Prefix->TokenText.begin(),
338*12c85518Srobert String->TokenText.end() - Prefix->TokenText.begin());
339*12c85518Srobert Prefix->ColumnWidth += String->ColumnWidth;
340*12c85518Srobert Prefix->setType(TT_CSharpStringLiteral);
341e5dd7070Spatrick Tokens.erase(Tokens.end() - 1);
342e5dd7070Spatrick return true;
343e5dd7070Spatrick }
344e5dd7070Spatrick
345ec727ea7Spatrick // Valid C# attribute targets:
346ec727ea7Spatrick // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
347ec727ea7Spatrick const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
348ec727ea7Spatrick "assembly", "module", "field", "event", "method",
349ec727ea7Spatrick "param", "property", "return", "type",
350ec727ea7Spatrick };
351ec727ea7Spatrick
tryMergeNullishCoalescingEqual()352a9ac8606Spatrick bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
353e5dd7070Spatrick if (Tokens.size() < 2)
354e5dd7070Spatrick return false;
355a9ac8606Spatrick auto &NullishCoalescing = *(Tokens.end() - 2);
356a9ac8606Spatrick auto &Equal = *(Tokens.end() - 1);
357a9ac8606Spatrick if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
358*12c85518Srobert !Equal->is(tok::equal)) {
359e5dd7070Spatrick return false;
360*12c85518Srobert }
361a9ac8606Spatrick NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
362a9ac8606Spatrick NullishCoalescing->TokenText =
363a9ac8606Spatrick StringRef(NullishCoalescing->TokenText.begin(),
364a9ac8606Spatrick Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
365a9ac8606Spatrick NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
366a9ac8606Spatrick NullishCoalescing->setType(TT_NullCoalescingEqual);
367e5dd7070Spatrick Tokens.erase(Tokens.end() - 1);
368e5dd7070Spatrick return true;
369e5dd7070Spatrick }
370e5dd7070Spatrick
tryMergeCSharpKeywordVariables()371e5dd7070Spatrick bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
372e5dd7070Spatrick if (Tokens.size() < 2)
373e5dd7070Spatrick return false;
374*12c85518Srobert const auto At = *(Tokens.end() - 2);
375*12c85518Srobert if (At->isNot(tok::at))
376*12c85518Srobert return false;
377*12c85518Srobert const auto Keyword = *(Tokens.end() - 1);
378*12c85518Srobert if (Keyword->TokenText == "$")
379e5dd7070Spatrick return false;
380e5dd7070Spatrick if (!Keywords.isCSharpKeyword(*Keyword))
381e5dd7070Spatrick return false;
382e5dd7070Spatrick
383e5dd7070Spatrick At->Tok.setKind(tok::identifier);
384e5dd7070Spatrick At->TokenText = StringRef(At->TokenText.begin(),
385e5dd7070Spatrick Keyword->TokenText.end() - At->TokenText.begin());
386e5dd7070Spatrick At->ColumnWidth += Keyword->ColumnWidth;
387ec727ea7Spatrick At->setType(Keyword->getType());
388e5dd7070Spatrick Tokens.erase(Tokens.end() - 1);
389e5dd7070Spatrick return true;
390e5dd7070Spatrick }
391e5dd7070Spatrick
392e5dd7070Spatrick // In C# transform identifier foreach into kw_foreach
tryTransformCSharpForEach()393e5dd7070Spatrick bool FormatTokenLexer::tryTransformCSharpForEach() {
394e5dd7070Spatrick if (Tokens.size() < 1)
395e5dd7070Spatrick return false;
396e5dd7070Spatrick auto &Identifier = *(Tokens.end() - 1);
397e5dd7070Spatrick if (!Identifier->is(tok::identifier))
398e5dd7070Spatrick return false;
399e5dd7070Spatrick if (Identifier->TokenText != "foreach")
400e5dd7070Spatrick return false;
401e5dd7070Spatrick
402ec727ea7Spatrick Identifier->setType(TT_ForEachMacro);
403e5dd7070Spatrick Identifier->Tok.setKind(tok::kw_for);
404e5dd7070Spatrick return true;
405e5dd7070Spatrick }
406e5dd7070Spatrick
tryMergeForEach()407ec727ea7Spatrick bool FormatTokenLexer::tryMergeForEach() {
408ec727ea7Spatrick if (Tokens.size() < 2)
409ec727ea7Spatrick return false;
410ec727ea7Spatrick auto &For = *(Tokens.end() - 2);
411ec727ea7Spatrick auto &Each = *(Tokens.end() - 1);
412ec727ea7Spatrick if (!For->is(tok::kw_for))
413ec727ea7Spatrick return false;
414ec727ea7Spatrick if (!Each->is(tok::identifier))
415ec727ea7Spatrick return false;
416ec727ea7Spatrick if (Each->TokenText != "each")
417ec727ea7Spatrick return false;
418ec727ea7Spatrick
419ec727ea7Spatrick For->setType(TT_ForEachMacro);
420ec727ea7Spatrick For->Tok.setKind(tok::kw_for);
421ec727ea7Spatrick
422ec727ea7Spatrick For->TokenText = StringRef(For->TokenText.begin(),
423ec727ea7Spatrick Each->TokenText.end() - For->TokenText.begin());
424ec727ea7Spatrick For->ColumnWidth += Each->ColumnWidth;
425ec727ea7Spatrick Tokens.erase(Tokens.end() - 1);
426ec727ea7Spatrick return true;
427ec727ea7Spatrick }
428ec727ea7Spatrick
tryTransformTryUsageForC()429ec727ea7Spatrick bool FormatTokenLexer::tryTransformTryUsageForC() {
430ec727ea7Spatrick if (Tokens.size() < 2)
431ec727ea7Spatrick return false;
432ec727ea7Spatrick auto &Try = *(Tokens.end() - 2);
433ec727ea7Spatrick if (!Try->is(tok::kw_try))
434ec727ea7Spatrick return false;
435ec727ea7Spatrick auto &Next = *(Tokens.end() - 1);
436a9ac8606Spatrick if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
437ec727ea7Spatrick return false;
438ec727ea7Spatrick
439ec727ea7Spatrick if (Tokens.size() > 2) {
440ec727ea7Spatrick auto &At = *(Tokens.end() - 3);
441ec727ea7Spatrick if (At->is(tok::at))
442ec727ea7Spatrick return false;
443ec727ea7Spatrick }
444ec727ea7Spatrick
445ec727ea7Spatrick Try->Tok.setKind(tok::identifier);
446ec727ea7Spatrick return true;
447ec727ea7Spatrick }
448ec727ea7Spatrick
tryMergeLessLess()449e5dd7070Spatrick bool FormatTokenLexer::tryMergeLessLess() {
450e5dd7070Spatrick // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
451e5dd7070Spatrick if (Tokens.size() < 3)
452e5dd7070Spatrick return false;
453e5dd7070Spatrick
454e5dd7070Spatrick auto First = Tokens.end() - 3;
455*12c85518Srobert if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
456e5dd7070Spatrick return false;
457e5dd7070Spatrick
458e5dd7070Spatrick // Only merge if there currently is no whitespace between the two "<".
459*12c85518Srobert if (First[1]->hasWhitespaceBefore())
460*12c85518Srobert return false;
461*12c85518Srobert
462*12c85518Srobert auto X = Tokens.size() > 3 ? First[-1] : nullptr;
463*12c85518Srobert auto Y = First[2];
464*12c85518Srobert if ((X && X->is(tok::less)) || Y->is(tok::less))
465*12c85518Srobert return false;
466*12c85518Srobert
467*12c85518Srobert // Do not remove a whitespace between the two "<" e.g. "operator< <>".
468*12c85518Srobert if (X && X->is(tok::kw_operator) && Y->is(tok::greater))
469e5dd7070Spatrick return false;
470e5dd7070Spatrick
471e5dd7070Spatrick First[0]->Tok.setKind(tok::lessless);
472e5dd7070Spatrick First[0]->TokenText = "<<";
473e5dd7070Spatrick First[0]->ColumnWidth += 1;
474e5dd7070Spatrick Tokens.erase(Tokens.end() - 2);
475e5dd7070Spatrick return true;
476e5dd7070Spatrick }
477e5dd7070Spatrick
tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,TokenType NewType)478e5dd7070Spatrick bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
479e5dd7070Spatrick TokenType NewType) {
480e5dd7070Spatrick if (Tokens.size() < Kinds.size())
481e5dd7070Spatrick return false;
482e5dd7070Spatrick
483e5dd7070Spatrick SmallVectorImpl<FormatToken *>::const_iterator First =
484e5dd7070Spatrick Tokens.end() - Kinds.size();
485*12c85518Srobert for (unsigned i = 0; i < Kinds.size(); ++i)
486*12c85518Srobert if (!First[i]->is(Kinds[i]))
487e5dd7070Spatrick return false;
488*12c85518Srobert
489*12c85518Srobert return tryMergeTokens(Kinds.size(), NewType);
490*12c85518Srobert }
491*12c85518Srobert
tryMergeTokens(size_t Count,TokenType NewType)492*12c85518Srobert bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
493*12c85518Srobert if (Tokens.size() < Count)
494*12c85518Srobert return false;
495*12c85518Srobert
496*12c85518Srobert SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
497e5dd7070Spatrick unsigned AddLength = 0;
498*12c85518Srobert for (size_t i = 1; i < Count; ++i) {
499*12c85518Srobert // If there is whitespace separating the token and the previous one,
500*12c85518Srobert // they should not be merged.
501*12c85518Srobert if (First[i]->hasWhitespaceBefore())
502e5dd7070Spatrick return false;
503e5dd7070Spatrick AddLength += First[i]->TokenText.size();
504e5dd7070Spatrick }
505*12c85518Srobert
506*12c85518Srobert Tokens.resize(Tokens.size() - Count + 1);
507e5dd7070Spatrick First[0]->TokenText = StringRef(First[0]->TokenText.data(),
508e5dd7070Spatrick First[0]->TokenText.size() + AddLength);
509e5dd7070Spatrick First[0]->ColumnWidth += AddLength;
510ec727ea7Spatrick First[0]->setType(NewType);
511e5dd7070Spatrick return true;
512e5dd7070Spatrick }
513e5dd7070Spatrick
tryMergeTokensAny(ArrayRef<ArrayRef<tok::TokenKind>> Kinds,TokenType NewType)514*12c85518Srobert bool FormatTokenLexer::tryMergeTokensAny(
515*12c85518Srobert ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
516*12c85518Srobert return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
517*12c85518Srobert return tryMergeTokens(Kinds, NewType);
518*12c85518Srobert });
519*12c85518Srobert }
520*12c85518Srobert
521e5dd7070Spatrick // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
precedesOperand(FormatToken * Tok)522e5dd7070Spatrick bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
523e5dd7070Spatrick // NB: This is not entirely correct, as an r_paren can introduce an operand
524e5dd7070Spatrick // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
525e5dd7070Spatrick // corner case to not matter in practice, though.
526e5dd7070Spatrick return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
527e5dd7070Spatrick tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
528e5dd7070Spatrick tok::colon, tok::question, tok::tilde) ||
529e5dd7070Spatrick Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
530e5dd7070Spatrick tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
531e5dd7070Spatrick tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
532e5dd7070Spatrick Tok->isBinaryOperator();
533e5dd7070Spatrick }
534e5dd7070Spatrick
canPrecedeRegexLiteral(FormatToken * Prev)535e5dd7070Spatrick bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
536e5dd7070Spatrick if (!Prev)
537e5dd7070Spatrick return true;
538e5dd7070Spatrick
539e5dd7070Spatrick // Regex literals can only follow after prefix unary operators, not after
540e5dd7070Spatrick // postfix unary operators. If the '++' is followed by a non-operand
541e5dd7070Spatrick // introducing token, the slash here is the operand and not the start of a
542e5dd7070Spatrick // regex.
543e5dd7070Spatrick // `!` is an unary prefix operator, but also a post-fix operator that casts
544e5dd7070Spatrick // away nullability, so the same check applies.
545e5dd7070Spatrick if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
546*12c85518Srobert return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
547e5dd7070Spatrick
548e5dd7070Spatrick // The previous token must introduce an operand location where regex
549e5dd7070Spatrick // literals can occur.
550e5dd7070Spatrick if (!precedesOperand(Prev))
551e5dd7070Spatrick return false;
552e5dd7070Spatrick
553e5dd7070Spatrick return true;
554e5dd7070Spatrick }
555e5dd7070Spatrick
556e5dd7070Spatrick // Tries to parse a JavaScript Regex literal starting at the current token,
557e5dd7070Spatrick // if that begins with a slash and is in a location where JavaScript allows
558e5dd7070Spatrick // regex literals. Changes the current token to a regex literal and updates
559e5dd7070Spatrick // its text if successful.
tryParseJSRegexLiteral()560e5dd7070Spatrick void FormatTokenLexer::tryParseJSRegexLiteral() {
561e5dd7070Spatrick FormatToken *RegexToken = Tokens.back();
562e5dd7070Spatrick if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
563e5dd7070Spatrick return;
564e5dd7070Spatrick
565e5dd7070Spatrick FormatToken *Prev = nullptr;
566*12c85518Srobert for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
567e5dd7070Spatrick // NB: Because previous pointers are not initialized yet, this cannot use
568e5dd7070Spatrick // Token.getPreviousNonComment.
569*12c85518Srobert if (FT->isNot(tok::comment)) {
570*12c85518Srobert Prev = FT;
571e5dd7070Spatrick break;
572e5dd7070Spatrick }
573e5dd7070Spatrick }
574e5dd7070Spatrick
575e5dd7070Spatrick if (!canPrecedeRegexLiteral(Prev))
576e5dd7070Spatrick return;
577e5dd7070Spatrick
578e5dd7070Spatrick // 'Manually' lex ahead in the current file buffer.
579e5dd7070Spatrick const char *Offset = Lex->getBufferLocation();
580e5dd7070Spatrick const char *RegexBegin = Offset - RegexToken->TokenText.size();
581e5dd7070Spatrick StringRef Buffer = Lex->getBuffer();
582e5dd7070Spatrick bool InCharacterClass = false;
583e5dd7070Spatrick bool HaveClosingSlash = false;
584e5dd7070Spatrick for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
585e5dd7070Spatrick // Regular expressions are terminated with a '/', which can only be
586e5dd7070Spatrick // escaped using '\' or a character class between '[' and ']'.
587e5dd7070Spatrick // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
588e5dd7070Spatrick switch (*Offset) {
589e5dd7070Spatrick case '\\':
590e5dd7070Spatrick // Skip the escaped character.
591e5dd7070Spatrick ++Offset;
592e5dd7070Spatrick break;
593e5dd7070Spatrick case '[':
594e5dd7070Spatrick InCharacterClass = true;
595e5dd7070Spatrick break;
596e5dd7070Spatrick case ']':
597e5dd7070Spatrick InCharacterClass = false;
598e5dd7070Spatrick break;
599e5dd7070Spatrick case '/':
600e5dd7070Spatrick if (!InCharacterClass)
601e5dd7070Spatrick HaveClosingSlash = true;
602e5dd7070Spatrick break;
603e5dd7070Spatrick }
604e5dd7070Spatrick }
605e5dd7070Spatrick
606ec727ea7Spatrick RegexToken->setType(TT_RegexLiteral);
607e5dd7070Spatrick // Treat regex literals like other string_literals.
608e5dd7070Spatrick RegexToken->Tok.setKind(tok::string_literal);
609e5dd7070Spatrick RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
610e5dd7070Spatrick RegexToken->ColumnWidth = RegexToken->TokenText.size();
611e5dd7070Spatrick
612e5dd7070Spatrick resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
613e5dd7070Spatrick }
614e5dd7070Spatrick
/// Scans the body of a C# string literal for its terminating '"'.
///
/// Nothing inside an interpolated or verbatim string is formatted. In
/// particular, expressions inside the { } holes of an interpolated string are
/// skipped as a whole, so that
///   $"{x ?? "null"}"
/// stays a single string literal instead of being split at the inner quotes.
/// Formatting such expressions would need work similar to what
/// `handleTemplateStrings()` does for JavaScript template strings.
///
/// \param Begin first character after the opening quote.
/// \param End one past the last character that may be examined.
/// \param Verbatim the literal is a verbatim string: a backslash is an
///        ordinary character and "" stands for an escaped quote.
/// \param Interpolated the literal is an interpolated string: {{ and }} are
///        escaped braces, and quotes inside a { } hole do not end the literal.
/// \returns a pointer to the closing quote, or \p End when the literal is
///          unterminated or an unbalanced '}' is met.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // Number of '{' holes that have been opened but not yet closed.
  int OpenHoles = 0;
  const char *Cur = Begin;

  while (Cur < End) {
    const char C = *Cur;
    // True when the next character exists and equals the current one.
    const bool Doubled = Cur + 1 < End && Cur[1] == C;

    if (C == '\\') {
      // Outside verbatim strings a backslash escapes the next character.
      if (!Verbatim)
        ++Cur;
    } else if (C == '{') {
      if (Interpolated) {
        // {{ is an escaped brace; a single { opens a hole.
        if (Doubled)
          ++Cur;
        else
          ++OpenHoles;
      }
    } else if (C == '}') {
      if (Interpolated) {
        // }} is an escaped brace; a single } closes a hole, and is an error
        // when no hole is open.
        if (Doubled)
          ++Cur;
        else if (OpenHoles > 0)
          --OpenHoles;
        else
          return End;
      }
    } else if (C == '"' && OpenHoles == 0) {
      // "" within a verbatim string is an escaped quote; any other quote
      // outside a hole terminates the literal.
      if (!(Verbatim && Doubled))
        return Cur;
      ++Cur;
    }

    ++Cur;
  }

  return End;
}
673*12c85518Srobert
handleCSharpVerbatimAndInterpolatedStrings()674*12c85518Srobert void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
675*12c85518Srobert FormatToken *CSharpStringLiteral = Tokens.back();
676*12c85518Srobert
677*12c85518Srobert if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
678*12c85518Srobert return;
679*12c85518Srobert
680*12c85518Srobert auto &TokenText = CSharpStringLiteral->TokenText;
681*12c85518Srobert
682*12c85518Srobert bool Verbatim = false;
683*12c85518Srobert bool Interpolated = false;
684*12c85518Srobert if (TokenText.startswith(R"($@")") || TokenText.startswith(R"(@$")")) {
685*12c85518Srobert Verbatim = true;
686*12c85518Srobert Interpolated = true;
687*12c85518Srobert } else if (TokenText.startswith(R"(@")")) {
688*12c85518Srobert Verbatim = true;
689*12c85518Srobert } else if (TokenText.startswith(R"($")")) {
690*12c85518Srobert Interpolated = true;
691*12c85518Srobert }
692*12c85518Srobert
693*12c85518Srobert // Deal with multiline strings.
694*12c85518Srobert if (!Verbatim && !Interpolated)
695*12c85518Srobert return;
696*12c85518Srobert
697*12c85518Srobert const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
698*12c85518Srobert const char *Offset = StrBegin;
699*12c85518Srobert if (Verbatim && Interpolated)
700*12c85518Srobert Offset += 3;
701*12c85518Srobert else
702*12c85518Srobert Offset += 2;
703*12c85518Srobert
704*12c85518Srobert const auto End = Lex->getBuffer().end();
705*12c85518Srobert Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
706ec727ea7Spatrick
707ec727ea7Spatrick // Make no attempt to format code properly if a verbatim string is
708ec727ea7Spatrick // unterminated.
709*12c85518Srobert if (Offset >= End)
710ec727ea7Spatrick return;
711ec727ea7Spatrick
712ec727ea7Spatrick StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
713*12c85518Srobert TokenText = LiteralText;
714ec727ea7Spatrick
715ec727ea7Spatrick // Adjust width for potentially multiline string literals.
716ec727ea7Spatrick size_t FirstBreak = LiteralText.find('\n');
717ec727ea7Spatrick StringRef FirstLineText = FirstBreak == StringRef::npos
718ec727ea7Spatrick ? LiteralText
719ec727ea7Spatrick : LiteralText.substr(0, FirstBreak);
720ec727ea7Spatrick CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
721ec727ea7Spatrick FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
722ec727ea7Spatrick Encoding);
723ec727ea7Spatrick size_t LastBreak = LiteralText.rfind('\n');
724ec727ea7Spatrick if (LastBreak != StringRef::npos) {
725ec727ea7Spatrick CSharpStringLiteral->IsMultiline = true;
726ec727ea7Spatrick unsigned StartColumn = 0;
727*12c85518Srobert CSharpStringLiteral->LastLineColumnWidth =
728*12c85518Srobert encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
729*12c85518Srobert StartColumn, Style.TabWidth, Encoding);
730ec727ea7Spatrick }
731ec727ea7Spatrick
732*12c85518Srobert assert(Offset < End);
733*12c85518Srobert resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
734ec727ea7Spatrick }
735ec727ea7Spatrick
handleTemplateStrings()736e5dd7070Spatrick void FormatTokenLexer::handleTemplateStrings() {
737e5dd7070Spatrick FormatToken *BacktickToken = Tokens.back();
738e5dd7070Spatrick
739e5dd7070Spatrick if (BacktickToken->is(tok::l_brace)) {
740e5dd7070Spatrick StateStack.push(LexerState::NORMAL);
741e5dd7070Spatrick return;
742e5dd7070Spatrick }
743e5dd7070Spatrick if (BacktickToken->is(tok::r_brace)) {
744e5dd7070Spatrick if (StateStack.size() == 1)
745e5dd7070Spatrick return;
746e5dd7070Spatrick StateStack.pop();
747e5dd7070Spatrick if (StateStack.top() != LexerState::TEMPLATE_STRING)
748e5dd7070Spatrick return;
749e5dd7070Spatrick // If back in TEMPLATE_STRING, fallthrough and continue parsing the
750e5dd7070Spatrick } else if (BacktickToken->is(tok::unknown) &&
751e5dd7070Spatrick BacktickToken->TokenText == "`") {
752e5dd7070Spatrick StateStack.push(LexerState::TEMPLATE_STRING);
753e5dd7070Spatrick } else {
754e5dd7070Spatrick return; // Not actually a template
755e5dd7070Spatrick }
756e5dd7070Spatrick
757e5dd7070Spatrick // 'Manually' lex ahead in the current file buffer.
758e5dd7070Spatrick const char *Offset = Lex->getBufferLocation();
759e5dd7070Spatrick const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
760e5dd7070Spatrick for (; Offset != Lex->getBuffer().end(); ++Offset) {
761e5dd7070Spatrick if (Offset[0] == '`') {
762e5dd7070Spatrick StateStack.pop();
763*12c85518Srobert ++Offset;
764e5dd7070Spatrick break;
765e5dd7070Spatrick }
766e5dd7070Spatrick if (Offset[0] == '\\') {
767e5dd7070Spatrick ++Offset; // Skip the escaped character.
768e5dd7070Spatrick } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
769e5dd7070Spatrick Offset[1] == '{') {
770e5dd7070Spatrick // '${' introduces an expression interpolation in the template string.
771e5dd7070Spatrick StateStack.push(LexerState::NORMAL);
772*12c85518Srobert Offset += 2;
773e5dd7070Spatrick break;
774e5dd7070Spatrick }
775e5dd7070Spatrick }
776e5dd7070Spatrick
777*12c85518Srobert StringRef LiteralText(TmplBegin, Offset - TmplBegin);
778ec727ea7Spatrick BacktickToken->setType(TT_TemplateString);
779e5dd7070Spatrick BacktickToken->Tok.setKind(tok::string_literal);
780e5dd7070Spatrick BacktickToken->TokenText = LiteralText;
781e5dd7070Spatrick
782e5dd7070Spatrick // Adjust width for potentially multiline string literals.
783e5dd7070Spatrick size_t FirstBreak = LiteralText.find('\n');
784e5dd7070Spatrick StringRef FirstLineText = FirstBreak == StringRef::npos
785e5dd7070Spatrick ? LiteralText
786e5dd7070Spatrick : LiteralText.substr(0, FirstBreak);
787e5dd7070Spatrick BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
788e5dd7070Spatrick FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
789e5dd7070Spatrick size_t LastBreak = LiteralText.rfind('\n');
790e5dd7070Spatrick if (LastBreak != StringRef::npos) {
791e5dd7070Spatrick BacktickToken->IsMultiline = true;
792e5dd7070Spatrick unsigned StartColumn = 0; // The template tail spans the entire line.
793*12c85518Srobert BacktickToken->LastLineColumnWidth =
794*12c85518Srobert encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
795*12c85518Srobert StartColumn, Style.TabWidth, Encoding);
796e5dd7070Spatrick }
797e5dd7070Spatrick
798*12c85518Srobert SourceLocation loc = Lex->getSourceLocation(Offset);
799e5dd7070Spatrick resetLexer(SourceMgr.getFileOffset(loc));
800e5dd7070Spatrick }
801e5dd7070Spatrick
tryParsePythonComment()802e5dd7070Spatrick void FormatTokenLexer::tryParsePythonComment() {
803e5dd7070Spatrick FormatToken *HashToken = Tokens.back();
804e5dd7070Spatrick if (!HashToken->isOneOf(tok::hash, tok::hashhash))
805e5dd7070Spatrick return;
806e5dd7070Spatrick // Turn the remainder of this line into a comment.
807e5dd7070Spatrick const char *CommentBegin =
808e5dd7070Spatrick Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
809e5dd7070Spatrick size_t From = CommentBegin - Lex->getBuffer().begin();
810e5dd7070Spatrick size_t To = Lex->getBuffer().find_first_of('\n', From);
811e5dd7070Spatrick if (To == StringRef::npos)
812e5dd7070Spatrick To = Lex->getBuffer().size();
813e5dd7070Spatrick size_t Len = To - From;
814ec727ea7Spatrick HashToken->setType(TT_LineComment);
815e5dd7070Spatrick HashToken->Tok.setKind(tok::comment);
816e5dd7070Spatrick HashToken->TokenText = Lex->getBuffer().substr(From, Len);
817e5dd7070Spatrick SourceLocation Loc = To < Lex->getBuffer().size()
818e5dd7070Spatrick ? Lex->getSourceLocation(CommentBegin + Len)
819e5dd7070Spatrick : SourceMgr.getLocForEndOfFile(ID);
820e5dd7070Spatrick resetLexer(SourceMgr.getFileOffset(Loc));
821e5dd7070Spatrick }
822e5dd7070Spatrick
tryMerge_TMacro()823e5dd7070Spatrick bool FormatTokenLexer::tryMerge_TMacro() {
824e5dd7070Spatrick if (Tokens.size() < 4)
825e5dd7070Spatrick return false;
826e5dd7070Spatrick FormatToken *Last = Tokens.back();
827e5dd7070Spatrick if (!Last->is(tok::r_paren))
828e5dd7070Spatrick return false;
829e5dd7070Spatrick
830e5dd7070Spatrick FormatToken *String = Tokens[Tokens.size() - 2];
831e5dd7070Spatrick if (!String->is(tok::string_literal) || String->IsMultiline)
832e5dd7070Spatrick return false;
833e5dd7070Spatrick
834e5dd7070Spatrick if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
835e5dd7070Spatrick return false;
836e5dd7070Spatrick
837e5dd7070Spatrick FormatToken *Macro = Tokens[Tokens.size() - 4];
838e5dd7070Spatrick if (Macro->TokenText != "_T")
839e5dd7070Spatrick return false;
840e5dd7070Spatrick
841e5dd7070Spatrick const char *Start = Macro->TokenText.data();
842e5dd7070Spatrick const char *End = Last->TokenText.data() + Last->TokenText.size();
843e5dd7070Spatrick String->TokenText = StringRef(Start, End - Start);
844e5dd7070Spatrick String->IsFirst = Macro->IsFirst;
845e5dd7070Spatrick String->LastNewlineOffset = Macro->LastNewlineOffset;
846e5dd7070Spatrick String->WhitespaceRange = Macro->WhitespaceRange;
847e5dd7070Spatrick String->OriginalColumn = Macro->OriginalColumn;
848e5dd7070Spatrick String->ColumnWidth = encoding::columnWidthWithTabs(
849e5dd7070Spatrick String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
850e5dd7070Spatrick String->NewlinesBefore = Macro->NewlinesBefore;
851e5dd7070Spatrick String->HasUnescapedNewline = Macro->HasUnescapedNewline;
852e5dd7070Spatrick
853e5dd7070Spatrick Tokens.pop_back();
854e5dd7070Spatrick Tokens.pop_back();
855e5dd7070Spatrick Tokens.pop_back();
856e5dd7070Spatrick Tokens.back() = String;
857*12c85518Srobert if (FirstInLineIndex >= Tokens.size())
858*12c85518Srobert FirstInLineIndex = Tokens.size() - 1;
859e5dd7070Spatrick return true;
860e5dd7070Spatrick }
861e5dd7070Spatrick
tryMergeConflictMarkers()862e5dd7070Spatrick bool FormatTokenLexer::tryMergeConflictMarkers() {
863e5dd7070Spatrick if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
864e5dd7070Spatrick return false;
865e5dd7070Spatrick
866e5dd7070Spatrick // Conflict lines look like:
867e5dd7070Spatrick // <marker> <text from the vcs>
868e5dd7070Spatrick // For example:
869e5dd7070Spatrick // >>>>>>> /file/in/file/system at revision 1234
870e5dd7070Spatrick //
871e5dd7070Spatrick // We merge all tokens in a line that starts with a conflict marker
872e5dd7070Spatrick // into a single token with a special token type that the unwrapped line
873e5dd7070Spatrick // parser will use to correctly rebuild the underlying code.
874e5dd7070Spatrick
875e5dd7070Spatrick FileID ID;
876e5dd7070Spatrick // Get the position of the first token in the line.
877e5dd7070Spatrick unsigned FirstInLineOffset;
878e5dd7070Spatrick std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
879e5dd7070Spatrick Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
880a9ac8606Spatrick StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
881e5dd7070Spatrick // Calculate the offset of the start of the current line.
882e5dd7070Spatrick auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
883*12c85518Srobert if (LineOffset == StringRef::npos)
884e5dd7070Spatrick LineOffset = 0;
885*12c85518Srobert else
886e5dd7070Spatrick ++LineOffset;
887e5dd7070Spatrick
888e5dd7070Spatrick auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
889e5dd7070Spatrick StringRef LineStart;
890*12c85518Srobert if (FirstSpace == StringRef::npos)
891e5dd7070Spatrick LineStart = Buffer.substr(LineOffset);
892*12c85518Srobert else
893e5dd7070Spatrick LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
894e5dd7070Spatrick
895e5dd7070Spatrick TokenType Type = TT_Unknown;
896e5dd7070Spatrick if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
897e5dd7070Spatrick Type = TT_ConflictStart;
898e5dd7070Spatrick } else if (LineStart == "|||||||" || LineStart == "=======" ||
899e5dd7070Spatrick LineStart == "====") {
900e5dd7070Spatrick Type = TT_ConflictAlternative;
901e5dd7070Spatrick } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
902e5dd7070Spatrick Type = TT_ConflictEnd;
903e5dd7070Spatrick }
904e5dd7070Spatrick
905e5dd7070Spatrick if (Type != TT_Unknown) {
906e5dd7070Spatrick FormatToken *Next = Tokens.back();
907e5dd7070Spatrick
908e5dd7070Spatrick Tokens.resize(FirstInLineIndex + 1);
909e5dd7070Spatrick // We do not need to build a complete token here, as we will skip it
910e5dd7070Spatrick // during parsing anyway (as we must not touch whitespace around conflict
911e5dd7070Spatrick // markers).
912ec727ea7Spatrick Tokens.back()->setType(Type);
913e5dd7070Spatrick Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
914e5dd7070Spatrick
915e5dd7070Spatrick Tokens.push_back(Next);
916e5dd7070Spatrick return true;
917e5dd7070Spatrick }
918e5dd7070Spatrick
919e5dd7070Spatrick return false;
920e5dd7070Spatrick }
921e5dd7070Spatrick
getStashedToken()922e5dd7070Spatrick FormatToken *FormatTokenLexer::getStashedToken() {
923e5dd7070Spatrick // Create a synthesized second '>' or '<' token.
924e5dd7070Spatrick Token Tok = FormatTok->Tok;
925e5dd7070Spatrick StringRef TokenText = FormatTok->TokenText;
926e5dd7070Spatrick
927e5dd7070Spatrick unsigned OriginalColumn = FormatTok->OriginalColumn;
928e5dd7070Spatrick FormatTok = new (Allocator.Allocate()) FormatToken;
929e5dd7070Spatrick FormatTok->Tok = Tok;
930e5dd7070Spatrick SourceLocation TokLocation =
931e5dd7070Spatrick FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
932e5dd7070Spatrick FormatTok->Tok.setLocation(TokLocation);
933e5dd7070Spatrick FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
934e5dd7070Spatrick FormatTok->TokenText = TokenText;
935e5dd7070Spatrick FormatTok->ColumnWidth = 1;
936e5dd7070Spatrick FormatTok->OriginalColumn = OriginalColumn + 1;
937e5dd7070Spatrick
938e5dd7070Spatrick return FormatTok;
939e5dd7070Spatrick }
940e5dd7070Spatrick
941*12c85518Srobert /// Truncate the current token to the new length and make the lexer continue
942*12c85518Srobert /// from the end of the truncated token. Used for other languages that have
943*12c85518Srobert /// different token boundaries, like JavaScript in which a comment ends at a
944*12c85518Srobert /// line break regardless of whether the line break follows a backslash. Also
945*12c85518Srobert /// used to set the lexer to the end of whitespace if the lexer regards
946*12c85518Srobert /// whitespace and an unrecognized symbol as one token.
truncateToken(size_t NewLen)947*12c85518Srobert void FormatTokenLexer::truncateToken(size_t NewLen) {
948*12c85518Srobert assert(NewLen <= FormatTok->TokenText.size());
949*12c85518Srobert resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
950*12c85518Srobert Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
951*12c85518Srobert FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
952*12c85518Srobert FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
953*12c85518Srobert FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
954*12c85518Srobert Encoding);
955*12c85518Srobert FormatTok->Tok.setLength(NewLen);
956*12c85518Srobert }
957*12c85518Srobert
958*12c85518Srobert /// Count the length of leading whitespace in a token.
countLeadingWhitespace(StringRef Text)959*12c85518Srobert static size_t countLeadingWhitespace(StringRef Text) {
960*12c85518Srobert // Basically counting the length matched by this regex.
961*12c85518Srobert // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
962*12c85518Srobert // Directly using the regex turned out to be slow. With the regex
963*12c85518Srobert // version formatting all files in this directory took about 1.25
964*12c85518Srobert // seconds. This version took about 0.5 seconds.
965*12c85518Srobert const unsigned char *const Begin = Text.bytes_begin();
966*12c85518Srobert const unsigned char *const End = Text.bytes_end();
967*12c85518Srobert const unsigned char *Cur = Begin;
968*12c85518Srobert while (Cur < End) {
969*12c85518Srobert if (isspace(Cur[0])) {
970*12c85518Srobert ++Cur;
971*12c85518Srobert } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
972*12c85518Srobert // A '\' followed by a newline always escapes the newline, regardless
973*12c85518Srobert // of whether there is another '\' before it.
974*12c85518Srobert // The source has a null byte at the end. So the end of the entire input
975*12c85518Srobert // isn't reached yet. Also the lexer doesn't break apart an escaped
976*12c85518Srobert // newline.
977*12c85518Srobert assert(End - Cur >= 2);
978*12c85518Srobert Cur += 2;
979*12c85518Srobert } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
980*12c85518Srobert (Cur[3] == '\n' || Cur[3] == '\r')) {
981*12c85518Srobert // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
982*12c85518Srobert // characters are quoted individually in this comment because if we write
983*12c85518Srobert // them together some compilers warn that we have a trigraph in the code.
984*12c85518Srobert assert(End - Cur >= 4);
985*12c85518Srobert Cur += 4;
986*12c85518Srobert } else {
987*12c85518Srobert break;
988*12c85518Srobert }
989*12c85518Srobert }
990*12c85518Srobert return Cur - Begin;
991*12c85518Srobert }
992*12c85518Srobert
getNextToken()993e5dd7070Spatrick FormatToken *FormatTokenLexer::getNextToken() {
994e5dd7070Spatrick if (StateStack.top() == LexerState::TOKEN_STASHED) {
995e5dd7070Spatrick StateStack.pop();
996e5dd7070Spatrick return getStashedToken();
997e5dd7070Spatrick }
998e5dd7070Spatrick
999e5dd7070Spatrick FormatTok = new (Allocator.Allocate()) FormatToken;
1000e5dd7070Spatrick readRawToken(*FormatTok);
1001e5dd7070Spatrick SourceLocation WhitespaceStart =
1002e5dd7070Spatrick FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1003e5dd7070Spatrick FormatTok->IsFirst = IsFirstToken;
1004e5dd7070Spatrick IsFirstToken = false;
1005e5dd7070Spatrick
1006e5dd7070Spatrick // Consume and record whitespace until we find a significant token.
1007*12c85518Srobert // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1008*12c85518Srobert // followed by a symbol such as backtick. Those symbols may be
1009*12c85518Srobert // significant in other languages.
1010e5dd7070Spatrick unsigned WhitespaceLength = TrailingWhitespace;
1011*12c85518Srobert while (FormatTok->isNot(tok::eof)) {
1012*12c85518Srobert auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1013*12c85518Srobert if (LeadingWhitespace == 0)
1014e5dd7070Spatrick break;
1015*12c85518Srobert if (LeadingWhitespace < FormatTok->TokenText.size())
1016*12c85518Srobert truncateToken(LeadingWhitespace);
1017*12c85518Srobert StringRef Text = FormatTok->TokenText;
1018*12c85518Srobert bool InEscape = false;
1019e5dd7070Spatrick for (int i = 0, e = Text.size(); i != e; ++i) {
1020e5dd7070Spatrick switch (Text[i]) {
1021*12c85518Srobert case '\r':
1022*12c85518Srobert // If this is a CRLF sequence, break here and the LF will be handled on
1023*12c85518Srobert // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1024*12c85518Srobert // the same as a single LF.
1025*12c85518Srobert if (i + 1 < e && Text[i + 1] == '\n')
1026*12c85518Srobert break;
1027*12c85518Srobert [[fallthrough]];
1028e5dd7070Spatrick case '\n':
1029e5dd7070Spatrick ++FormatTok->NewlinesBefore;
1030*12c85518Srobert if (!InEscape)
1031*12c85518Srobert FormatTok->HasUnescapedNewline = true;
1032*12c85518Srobert else
1033*12c85518Srobert InEscape = false;
1034e5dd7070Spatrick FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1035e5dd7070Spatrick Column = 0;
1036e5dd7070Spatrick break;
1037e5dd7070Spatrick case '\f':
1038e5dd7070Spatrick case '\v':
1039e5dd7070Spatrick Column = 0;
1040e5dd7070Spatrick break;
1041e5dd7070Spatrick case ' ':
1042e5dd7070Spatrick ++Column;
1043e5dd7070Spatrick break;
1044e5dd7070Spatrick case '\t':
1045e5dd7070Spatrick Column +=
1046e5dd7070Spatrick Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1047e5dd7070Spatrick break;
1048e5dd7070Spatrick case '\\':
1049*12c85518Srobert case '?':
1050*12c85518Srobert case '/':
1051*12c85518Srobert // The text was entirely whitespace when this loop was entered. Thus
1052*12c85518Srobert // this has to be an escape sequence.
1053*12c85518Srobert assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1054*12c85518Srobert Text.substr(i, 4) == "\?\?/\r" ||
1055*12c85518Srobert Text.substr(i, 4) == "\?\?/\n" ||
1056*12c85518Srobert (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1057*12c85518Srobert Text.substr(i - 1, 4) == "\?\?/\n")) ||
1058*12c85518Srobert (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1059*12c85518Srobert Text.substr(i - 2, 4) == "\?\?/\n")));
1060*12c85518Srobert InEscape = true;
1061e5dd7070Spatrick break;
1062e5dd7070Spatrick default:
1063*12c85518Srobert // This shouldn't happen.
1064*12c85518Srobert assert(false);
1065e5dd7070Spatrick break;
1066e5dd7070Spatrick }
1067e5dd7070Spatrick }
1068*12c85518Srobert WhitespaceLength += Text.size();
1069e5dd7070Spatrick readRawToken(*FormatTok);
1070e5dd7070Spatrick }
1071e5dd7070Spatrick
1072*12c85518Srobert if (FormatTok->is(tok::unknown))
1073*12c85518Srobert FormatTok->setType(TT_ImplicitStringLiteral);
1074*12c85518Srobert
1075e5dd7070Spatrick // JavaScript and Java do not allow to escape the end of the line with a
1076e5dd7070Spatrick // backslash. Backslashes are syntax errors in plain source, but can occur in
1077e5dd7070Spatrick // comments. When a single line comment ends with a \, it'll cause the next
1078e5dd7070Spatrick // line of code to be lexed as a comment, breaking formatting. The code below
1079e5dd7070Spatrick // finds comments that contain a backslash followed by a line break, truncates
1080e5dd7070Spatrick // the comment token at the backslash, and resets the lexer to restart behind
1081e5dd7070Spatrick // the backslash.
1082*12c85518Srobert if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1083e5dd7070Spatrick FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
1084e5dd7070Spatrick size_t BackslashPos = FormatTok->TokenText.find('\\');
1085e5dd7070Spatrick while (BackslashPos != StringRef::npos) {
1086e5dd7070Spatrick if (BackslashPos + 1 < FormatTok->TokenText.size() &&
1087e5dd7070Spatrick FormatTok->TokenText[BackslashPos + 1] == '\n') {
1088*12c85518Srobert truncateToken(BackslashPos + 1);
1089e5dd7070Spatrick break;
1090e5dd7070Spatrick }
1091e5dd7070Spatrick BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
1092e5dd7070Spatrick }
1093e5dd7070Spatrick }
1094e5dd7070Spatrick
1095*12c85518Srobert if (Style.isVerilog()) {
1096*12c85518Srobert static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1097*12c85518Srobert SmallVector<StringRef, 1> Matches;
1098*12c85518Srobert // Verilog uses the backtick instead of the hash for preprocessor stuff.
1099*12c85518Srobert // And it uses the hash for delays and parameter lists. In order to continue
1100*12c85518Srobert // using `tok::hash` in other places, the backtick gets marked as the hash
1101*12c85518Srobert // here. And in order to tell the backtick and hash apart for
1102*12c85518Srobert // Verilog-specific stuff, the hash becomes an identifier.
1103*12c85518Srobert if (FormatTok->is(tok::numeric_constant)) {
1104*12c85518Srobert // In Verilog the quote is not part of a number.
1105*12c85518Srobert auto Quote = FormatTok->TokenText.find('\'');
1106*12c85518Srobert if (Quote != StringRef::npos)
1107*12c85518Srobert truncateToken(Quote);
1108*12c85518Srobert } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1109*12c85518Srobert FormatTok->Tok.setKind(tok::raw_identifier);
1110*12c85518Srobert } else if (FormatTok->is(tok::raw_identifier)) {
1111*12c85518Srobert if (FormatTok->TokenText == "`") {
1112*12c85518Srobert FormatTok->Tok.setIdentifierInfo(nullptr);
1113*12c85518Srobert FormatTok->Tok.setKind(tok::hash);
1114*12c85518Srobert } else if (FormatTok->TokenText == "``") {
1115*12c85518Srobert FormatTok->Tok.setIdentifierInfo(nullptr);
1116*12c85518Srobert FormatTok->Tok.setKind(tok::hashhash);
1117*12c85518Srobert } else if (Tokens.size() > 0 &&
1118*12c85518Srobert Tokens.back()->is(Keywords.kw_apostrophe) &&
1119*12c85518Srobert NumberBase.match(FormatTok->TokenText, &Matches)) {
1120*12c85518Srobert // In Verilog in a based number literal like `'b10`, there may be
1121*12c85518Srobert // whitespace between `'b` and `10`. Therefore we handle the base and
1122*12c85518Srobert // the rest of the number literal as two tokens. But if there is no
1123*12c85518Srobert // space in the input code, we need to manually separate the two parts.
1124*12c85518Srobert truncateToken(Matches[0].size());
1125*12c85518Srobert FormatTok->setFinalizedType(TT_VerilogNumberBase);
1126*12c85518Srobert }
1127*12c85518Srobert }
1128e5dd7070Spatrick }
1129e5dd7070Spatrick
1130e5dd7070Spatrick FormatTok->WhitespaceRange = SourceRange(
1131e5dd7070Spatrick WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1132e5dd7070Spatrick
1133e5dd7070Spatrick FormatTok->OriginalColumn = Column;
1134e5dd7070Spatrick
1135e5dd7070Spatrick TrailingWhitespace = 0;
1136*12c85518Srobert if (FormatTok->is(tok::comment)) {
1137e5dd7070Spatrick // FIXME: Add the trimmed whitespace to Column.
1138e5dd7070Spatrick StringRef UntrimmedText = FormatTok->TokenText;
1139e5dd7070Spatrick FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1140e5dd7070Spatrick TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1141*12c85518Srobert } else if (FormatTok->is(tok::raw_identifier)) {
1142e5dd7070Spatrick IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1143e5dd7070Spatrick FormatTok->Tok.setIdentifierInfo(&Info);
1144e5dd7070Spatrick FormatTok->Tok.setKind(Info.getTokenID());
1145e5dd7070Spatrick if (Style.Language == FormatStyle::LK_Java &&
1146e5dd7070Spatrick FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1147e5dd7070Spatrick tok::kw_operator)) {
1148e5dd7070Spatrick FormatTok->Tok.setKind(tok::identifier);
1149e5dd7070Spatrick FormatTok->Tok.setIdentifierInfo(nullptr);
1150*12c85518Srobert } else if (Style.isJavaScript() &&
1151e5dd7070Spatrick FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1152e5dd7070Spatrick tok::kw_operator)) {
1153e5dd7070Spatrick FormatTok->Tok.setKind(tok::identifier);
1154e5dd7070Spatrick FormatTok->Tok.setIdentifierInfo(nullptr);
1155e5dd7070Spatrick }
1156*12c85518Srobert } else if (FormatTok->is(tok::greatergreater)) {
1157e5dd7070Spatrick FormatTok->Tok.setKind(tok::greater);
1158e5dd7070Spatrick FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1159e5dd7070Spatrick ++Column;
1160e5dd7070Spatrick StateStack.push(LexerState::TOKEN_STASHED);
1161*12c85518Srobert } else if (FormatTok->is(tok::lessless)) {
1162e5dd7070Spatrick FormatTok->Tok.setKind(tok::less);
1163e5dd7070Spatrick FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1164e5dd7070Spatrick ++Column;
1165e5dd7070Spatrick StateStack.push(LexerState::TOKEN_STASHED);
1166e5dd7070Spatrick }
1167e5dd7070Spatrick
1168*12c85518Srobert if (Style.isVerilog() && Tokens.size() > 0 &&
1169*12c85518Srobert Tokens.back()->is(TT_VerilogNumberBase) &&
1170*12c85518Srobert FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1171*12c85518Srobert // Mark the number following a base like `'h?a0` as a number.
1172*12c85518Srobert FormatTok->Tok.setKind(tok::numeric_constant);
1173*12c85518Srobert }
1174*12c85518Srobert
1175e5dd7070Spatrick // Now FormatTok is the next non-whitespace token.
1176e5dd7070Spatrick
1177e5dd7070Spatrick StringRef Text = FormatTok->TokenText;
1178e5dd7070Spatrick size_t FirstNewlinePos = Text.find('\n');
1179e5dd7070Spatrick if (FirstNewlinePos == StringRef::npos) {
1180e5dd7070Spatrick // FIXME: ColumnWidth actually depends on the start column, we need to
1181e5dd7070Spatrick // take this into account when the token is moved.
1182e5dd7070Spatrick FormatTok->ColumnWidth =
1183e5dd7070Spatrick encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1184e5dd7070Spatrick Column += FormatTok->ColumnWidth;
1185e5dd7070Spatrick } else {
1186e5dd7070Spatrick FormatTok->IsMultiline = true;
1187e5dd7070Spatrick // FIXME: ColumnWidth actually depends on the start column, we need to
1188e5dd7070Spatrick // take this into account when the token is moved.
1189e5dd7070Spatrick FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1190e5dd7070Spatrick Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1191e5dd7070Spatrick
1192e5dd7070Spatrick // The last line of the token always starts in column 0.
1193e5dd7070Spatrick // Thus, the length can be precomputed even in the presence of tabs.
1194e5dd7070Spatrick FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1195e5dd7070Spatrick Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1196e5dd7070Spatrick Column = FormatTok->LastLineColumnWidth;
1197e5dd7070Spatrick }
1198e5dd7070Spatrick
1199e5dd7070Spatrick if (Style.isCpp()) {
1200e5dd7070Spatrick auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
1201e5dd7070Spatrick if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1202e5dd7070Spatrick Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1203e5dd7070Spatrick tok::pp_define) &&
1204e5dd7070Spatrick it != Macros.end()) {
1205ec727ea7Spatrick FormatTok->setType(it->second);
1206a9ac8606Spatrick if (it->second == TT_IfMacro) {
1207a9ac8606Spatrick // The lexer token currently has type tok::kw_unknown. However, for this
1208a9ac8606Spatrick // substitution to be treated correctly in the TokenAnnotator, faking
1209a9ac8606Spatrick // the tok value seems to be needed. Not sure if there's a more elegant
1210a9ac8606Spatrick // way.
1211a9ac8606Spatrick FormatTok->Tok.setKind(tok::kw_if);
1212a9ac8606Spatrick }
1213e5dd7070Spatrick } else if (FormatTok->is(tok::identifier)) {
1214*12c85518Srobert if (MacroBlockBeginRegex.match(Text))
1215ec727ea7Spatrick FormatTok->setType(TT_MacroBlockBegin);
1216*12c85518Srobert else if (MacroBlockEndRegex.match(Text))
1217ec727ea7Spatrick FormatTok->setType(TT_MacroBlockEnd);
1218e5dd7070Spatrick }
1219e5dd7070Spatrick }
1220e5dd7070Spatrick
1221e5dd7070Spatrick return FormatTok;
1222e5dd7070Spatrick }
1223e5dd7070Spatrick
readRawTokenVerilogSpecific(Token & Tok)1224*12c85518Srobert bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
1225*12c85518Srobert // In Verilog the quote is not a character literal.
1226*12c85518Srobert //
1227*12c85518Srobert // Make the backtick and double backtick identifiers to match against them
1228*12c85518Srobert // more easily.
1229*12c85518Srobert //
1230*12c85518Srobert // In Verilog an escaped identifier starts with backslash and ends with
1231*12c85518Srobert // whitespace. Unless that whitespace is an escaped newline. A backslash can
1232*12c85518Srobert // also begin an escaped newline outside of an escaped identifier. We check
1233*12c85518Srobert // for that outside of the Regex since we can't use negative lookhead
1234*12c85518Srobert // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1235*12c85518Srobert // identifier may have a length of 0 according to Section A.9.3.
1236*12c85518Srobert // FIXME: If there is an escaped newline in the middle of an escaped
1237*12c85518Srobert // identifier, allow for pasting the two lines together, But escaped
1238*12c85518Srobert // identifiers usually occur only in generated code anyway.
1239*12c85518Srobert static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
1240*12c85518Srobert "(\r?\n|\r)|[^[:space:]])*)");
1241*12c85518Srobert
1242*12c85518Srobert SmallVector<StringRef, 4> Matches;
1243*12c85518Srobert const char *Start = Lex->getBufferLocation();
1244*12c85518Srobert if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
1245*12c85518Srobert &Matches)) {
1246*12c85518Srobert return false;
1247*12c85518Srobert }
1248*12c85518Srobert // There is a null byte at the end of the buffer, so we don't have to check
1249*12c85518Srobert // Start[1] is within the buffer.
1250*12c85518Srobert if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
1251*12c85518Srobert return false;
1252*12c85518Srobert size_t Len = Matches[0].size();
1253*12c85518Srobert
1254*12c85518Srobert // The kind has to be an identifier so we can match it against those defined
1255*12c85518Srobert // in Keywords. The kind has to be set before the length because the setLength
1256*12c85518Srobert // function checks that the kind is not an annotation.
1257*12c85518Srobert Tok.setKind(tok::raw_identifier);
1258*12c85518Srobert Tok.setLength(Len);
1259*12c85518Srobert Tok.setLocation(Lex->getSourceLocation(Start, Len));
1260*12c85518Srobert Tok.setRawIdentifierData(Start);
1261*12c85518Srobert Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1262*12c85518Srobert return true;
1263*12c85518Srobert }
1264*12c85518Srobert
readRawToken(FormatToken & Tok)1265e5dd7070Spatrick void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1266*12c85518Srobert // For Verilog, first see if there is a special token, and fall back to the
1267*12c85518Srobert // normal lexer if there isn't one.
1268*12c85518Srobert if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1269e5dd7070Spatrick Lex->LexFromRawLexer(Tok.Tok);
1270e5dd7070Spatrick Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1271e5dd7070Spatrick Tok.Tok.getLength());
1272e5dd7070Spatrick // For formatting, treat unterminated string literals like normal string
1273e5dd7070Spatrick // literals.
1274e5dd7070Spatrick if (Tok.is(tok::unknown)) {
1275e5dd7070Spatrick if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
1276e5dd7070Spatrick Tok.Tok.setKind(tok::string_literal);
1277e5dd7070Spatrick Tok.IsUnterminatedLiteral = true;
1278*12c85518Srobert } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1279e5dd7070Spatrick Tok.Tok.setKind(tok::string_literal);
1280e5dd7070Spatrick }
1281e5dd7070Spatrick }
1282e5dd7070Spatrick
1283*12c85518Srobert if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Proto ||
1284e5dd7070Spatrick Style.Language == FormatStyle::LK_TextProto) &&
1285e5dd7070Spatrick Tok.is(tok::char_constant)) {
1286e5dd7070Spatrick Tok.Tok.setKind(tok::string_literal);
1287e5dd7070Spatrick }
1288e5dd7070Spatrick
1289e5dd7070Spatrick if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
1290e5dd7070Spatrick Tok.TokenText == "/* clang-format on */")) {
1291e5dd7070Spatrick FormattingDisabled = false;
1292e5dd7070Spatrick }
1293e5dd7070Spatrick
1294e5dd7070Spatrick Tok.Finalized = FormattingDisabled;
1295e5dd7070Spatrick
1296e5dd7070Spatrick if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
1297e5dd7070Spatrick Tok.TokenText == "/* clang-format off */")) {
1298e5dd7070Spatrick FormattingDisabled = true;
1299e5dd7070Spatrick }
1300e5dd7070Spatrick }
1301e5dd7070Spatrick
resetLexer(unsigned Offset)1302e5dd7070Spatrick void FormatTokenLexer::resetLexer(unsigned Offset) {
1303e5dd7070Spatrick StringRef Buffer = SourceMgr.getBufferData(ID);
1304*12c85518Srobert LangOpts = getFormattingLangOpts(Style);
1305*12c85518Srobert Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1306*12c85518Srobert Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1307e5dd7070Spatrick Lex->SetKeepWhitespaceMode(true);
1308e5dd7070Spatrick TrailingWhitespace = 0;
1309e5dd7070Spatrick }
1310e5dd7070Spatrick
1311e5dd7070Spatrick } // namespace format
1312e5dd7070Spatrick } // namespace clang
1313