xref: /openbsd-src/gnu/llvm/clang/lib/Format/FormatTokenLexer.h (revision 12c855180aad702bbcca06e0398d774beeafb155)
//===--- FormatTokenLexer.h - Format C++ code ----------------*- C++ ----*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains FormatTokenLexer, which tokenizes a source file
/// into a token stream suitable for ClangFormat.
///
//===----------------------------------------------------------------------===//
14e5dd7070Spatrick 
15e5dd7070Spatrick #ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
16e5dd7070Spatrick #define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
17e5dd7070Spatrick 
18e5dd7070Spatrick #include "Encoding.h"
19e5dd7070Spatrick #include "FormatToken.h"
20*12c85518Srobert #include "clang/Basic/LangOptions.h"
21e5dd7070Spatrick #include "clang/Basic/SourceLocation.h"
22e5dd7070Spatrick #include "clang/Basic/SourceManager.h"
23e5dd7070Spatrick #include "clang/Format/Format.h"
24e5dd7070Spatrick #include "llvm/ADT/MapVector.h"
25ec727ea7Spatrick #include "llvm/ADT/StringSet.h"
26e5dd7070Spatrick #include "llvm/Support/Regex.h"
27e5dd7070Spatrick 
28e5dd7070Spatrick #include <stack>
29e5dd7070Spatrick 
30e5dd7070Spatrick namespace clang {
31e5dd7070Spatrick namespace format {
32e5dd7070Spatrick 
/// Lexing modes of FormatTokenLexer; the lexer keeps a std::stack of these
/// (see FormatTokenLexer::StateStack).
enum LexerState {
  /// Default mode.
  NORMAL,
  /// NOTE(review): presumably active while inside a JavaScript template
  /// string (see handleTemplateStrings) -- confirm in the implementation.
  TEMPLATE_STRING,
  /// NOTE(review): presumably signals that a token has been stashed and is
  /// returned by getStashedToken() -- confirm in the implementation.
  TOKEN_STASHED,
};
38e5dd7070Spatrick 
39e5dd7070Spatrick class FormatTokenLexer {
40e5dd7070Spatrick public:
41e5dd7070Spatrick   FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column,
42ec727ea7Spatrick                    const FormatStyle &Style, encoding::Encoding Encoding,
43ec727ea7Spatrick                    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
44ec727ea7Spatrick                    IdentifierTable &IdentTable);
45e5dd7070Spatrick 
46e5dd7070Spatrick   ArrayRef<FormatToken *> lex();
47e5dd7070Spatrick 
getKeywords()48e5dd7070Spatrick   const AdditionalKeywords &getKeywords() { return Keywords; }
49e5dd7070Spatrick 
50e5dd7070Spatrick private:
51e5dd7070Spatrick   void tryMergePreviousTokens();
52e5dd7070Spatrick 
53e5dd7070Spatrick   bool tryMergeLessLess();
54e5dd7070Spatrick   bool tryMergeNSStringLiteral();
55e5dd7070Spatrick   bool tryMergeJSPrivateIdentifier();
56ec727ea7Spatrick   bool tryMergeCSharpStringLiteral();
57e5dd7070Spatrick   bool tryMergeCSharpKeywordVariables();
58a9ac8606Spatrick   bool tryMergeNullishCoalescingEqual();
59e5dd7070Spatrick   bool tryTransformCSharpForEach();
60ec727ea7Spatrick   bool tryMergeForEach();
61ec727ea7Spatrick   bool tryTransformTryUsageForC();
62e5dd7070Spatrick 
63*12c85518Srobert   // Merge the most recently lexed tokens into a single token if their kinds are
64*12c85518Srobert   // correct.
65e5dd7070Spatrick   bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType);
66*12c85518Srobert   // Merge without checking their kinds.
67*12c85518Srobert   bool tryMergeTokens(size_t Count, TokenType NewType);
68*12c85518Srobert   // Merge if their kinds match any one of Kinds.
69*12c85518Srobert   bool tryMergeTokensAny(ArrayRef<ArrayRef<tok::TokenKind>> Kinds,
70*12c85518Srobert                          TokenType NewType);
71e5dd7070Spatrick 
72e5dd7070Spatrick   // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
73e5dd7070Spatrick   bool precedesOperand(FormatToken *Tok);
74e5dd7070Spatrick 
75e5dd7070Spatrick   bool canPrecedeRegexLiteral(FormatToken *Prev);
76e5dd7070Spatrick 
77e5dd7070Spatrick   // Tries to parse a JavaScript Regex literal starting at the current token,
78e5dd7070Spatrick   // if that begins with a slash and is in a location where JavaScript allows
79e5dd7070Spatrick   // regex literals. Changes the current token to a regex literal and updates
80e5dd7070Spatrick   // its text if successful.
81e5dd7070Spatrick   void tryParseJSRegexLiteral();
82e5dd7070Spatrick 
83e5dd7070Spatrick   // Handles JavaScript template strings.
84e5dd7070Spatrick   //
85e5dd7070Spatrick   // JavaScript template strings use backticks ('`') as delimiters, and allow
86e5dd7070Spatrick   // embedding expressions nested in ${expr-here}. Template strings can be
87e5dd7070Spatrick   // nested recursively, i.e. expressions can contain template strings in turn.
88e5dd7070Spatrick   //
89e5dd7070Spatrick   // The code below parses starting from a backtick, up to a closing backtick or
90e5dd7070Spatrick   // an opening ${. It also maintains a stack of lexing contexts to handle
91e5dd7070Spatrick   // nested template parts by balancing curly braces.
92e5dd7070Spatrick   void handleTemplateStrings();
93e5dd7070Spatrick 
94ec727ea7Spatrick   void handleCSharpVerbatimAndInterpolatedStrings();
95ec727ea7Spatrick 
96e5dd7070Spatrick   void tryParsePythonComment();
97e5dd7070Spatrick 
98e5dd7070Spatrick   bool tryMerge_TMacro();
99e5dd7070Spatrick 
100e5dd7070Spatrick   bool tryMergeConflictMarkers();
101e5dd7070Spatrick 
102*12c85518Srobert   void truncateToken(size_t NewLen);
103*12c85518Srobert 
104e5dd7070Spatrick   FormatToken *getStashedToken();
105e5dd7070Spatrick 
106e5dd7070Spatrick   FormatToken *getNextToken();
107e5dd7070Spatrick 
108e5dd7070Spatrick   FormatToken *FormatTok;
109e5dd7070Spatrick   bool IsFirstToken;
110e5dd7070Spatrick   std::stack<LexerState> StateStack;
111e5dd7070Spatrick   unsigned Column;
112e5dd7070Spatrick   unsigned TrailingWhitespace;
113e5dd7070Spatrick   std::unique_ptr<Lexer> Lex;
114*12c85518Srobert   LangOptions LangOpts;
115e5dd7070Spatrick   const SourceManager &SourceMgr;
116e5dd7070Spatrick   FileID ID;
117e5dd7070Spatrick   const FormatStyle &Style;
118ec727ea7Spatrick   IdentifierTable &IdentTable;
119e5dd7070Spatrick   AdditionalKeywords Keywords;
120e5dd7070Spatrick   encoding::Encoding Encoding;
121ec727ea7Spatrick   llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
122e5dd7070Spatrick   // Index (in 'Tokens') of the last token that starts a new line.
123e5dd7070Spatrick   unsigned FirstInLineIndex;
124e5dd7070Spatrick   SmallVector<FormatToken *, 16> Tokens;
125e5dd7070Spatrick 
126e5dd7070Spatrick   llvm::SmallMapVector<IdentifierInfo *, TokenType, 8> Macros;
127e5dd7070Spatrick 
128e5dd7070Spatrick   bool FormattingDisabled;
129e5dd7070Spatrick 
130e5dd7070Spatrick   llvm::Regex MacroBlockBeginRegex;
131e5dd7070Spatrick   llvm::Regex MacroBlockEndRegex;
132e5dd7070Spatrick 
133ec727ea7Spatrick   // Targets that may appear inside a C# attribute.
134ec727ea7Spatrick   static const llvm::StringSet<> CSharpAttributeTargets;
135ec727ea7Spatrick 
136*12c85518Srobert   /// Handle Verilog-specific tokens.
137*12c85518Srobert   bool readRawTokenVerilogSpecific(Token &Tok);
138*12c85518Srobert 
139e5dd7070Spatrick   void readRawToken(FormatToken &Tok);
140e5dd7070Spatrick 
141e5dd7070Spatrick   void resetLexer(unsigned Offset);
142e5dd7070Spatrick };
143e5dd7070Spatrick 
144e5dd7070Spatrick } // namespace format
145e5dd7070Spatrick } // namespace clang
146e5dd7070Spatrick 
147e5dd7070Spatrick #endif
148