xref: /freebsd-src/contrib/llvm-project/clang/lib/Format/FormatTokenLexer.h (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric //===--- FormatTokenLexer.h - Format C++ code ----------------*- C++ ----*-===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric ///
90b57cec5SDimitry Andric /// \file
100b57cec5SDimitry Andric /// This file contains FormatTokenLexer, which tokenizes a source file
110b57cec5SDimitry Andric /// into a token stream suitable for ClangFormat.
120b57cec5SDimitry Andric ///
130b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
140b57cec5SDimitry Andric 
150b57cec5SDimitry Andric #ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
160b57cec5SDimitry Andric #define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
170b57cec5SDimitry Andric 
180b57cec5SDimitry Andric #include "Encoding.h"
190b57cec5SDimitry Andric #include "FormatToken.h"
200b57cec5SDimitry Andric #include "llvm/ADT/MapVector.h"
2106c3fb27SDimitry Andric #include "llvm/ADT/SmallPtrSet.h"
225ffd83dbSDimitry Andric #include "llvm/ADT/StringSet.h"
230b57cec5SDimitry Andric 
240b57cec5SDimitry Andric #include <stack>
250b57cec5SDimitry Andric 
260b57cec5SDimitry Andric namespace clang {
270b57cec5SDimitry Andric namespace format {
280b57cec5SDimitry Andric 
// States of the lexer; FormatTokenLexer keeps a stack of these in its
// StateStack member.
// NOTE(review): the meaning of each enumerator is not visible in this header.
// TEMPLATE_STRING presumably relates to handleTemplateStrings() and
// TOKEN_STASHED to getStashedToken() -- confirm in FormatTokenLexer.cpp.
290b57cec5SDimitry Andric enum LexerState {
300b57cec5SDimitry Andric   NORMAL,
310b57cec5SDimitry Andric   TEMPLATE_STRING,
320b57cec5SDimitry Andric   TOKEN_STASHED,
330b57cec5SDimitry Andric };
340b57cec5SDimitry Andric 
/// Tokenizes a source file into a stream of FormatToken objects suitable for
/// ClangFormat. Only declarations live in this header; the behavior of each
/// member function is defined elsewhere.
350b57cec5SDimitry Andric class FormatTokenLexer {
360b57cec5SDimitry Andric public:
  // Declares the constructor. Each parameter has a like-named data member
  // below (SourceMgr, ID, Column, Style, Encoding, Allocator, IdentTable).
  // NOTE(review): presumably each parameter initializes that member -- the
  // definition is not in this header; confirm there.
370b57cec5SDimitry Andric   FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column,
385ffd83dbSDimitry Andric                    const FormatStyle &Style, encoding::Encoding Encoding,
395ffd83dbSDimitry Andric                    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
405ffd83dbSDimitry Andric                    IdentifierTable &IdentTable);
410b57cec5SDimitry Andric 
  // Entry point of the lexer: returns the resulting tokens as a non-owning
  // array of FormatToken pointers.
  // NOTE(review): presumably a view over the Tokens member -- confirm.
420b57cec5SDimitry Andric   ArrayRef<FormatToken *> lex();
430b57cec5SDimitry Andric 
  // Returns a reference to the Keywords member.
  // NOTE(review): this accessor does not modify the object and could be
  // const-qualified.
440b57cec5SDimitry Andric   const AdditionalKeywords &getKeywords() { return Keywords; }
450b57cec5SDimitry Andric 
460b57cec5SDimitry Andric private:
470b57cec5SDimitry Andric   void tryMergePreviousTokens();
480b57cec5SDimitry Andric 
  // Language- or construct-specific merge/transform helpers.
  // NOTE(review): the bool result presumably reports whether a merge or
  // transformation took place -- confirm in the implementation.
490b57cec5SDimitry Andric   bool tryMergeLessLess();
5006c3fb27SDimitry Andric   bool tryMergeGreaterGreater();
510b57cec5SDimitry Andric   bool tryMergeNSStringLiteral();
520b57cec5SDimitry Andric   bool tryMergeJSPrivateIdentifier();
535ffd83dbSDimitry Andric   bool tryMergeCSharpStringLiteral();
540b57cec5SDimitry Andric   bool tryMergeCSharpKeywordVariables();
55fe6060f1SDimitry Andric   bool tryMergeNullishCoalescingEqual();
56a7dea167SDimitry Andric   bool tryTransformCSharpForEach();
575ffd83dbSDimitry Andric   bool tryMergeForEach();
585ffd83dbSDimitry Andric   bool tryTransformTryUsageForC();
590b57cec5SDimitry Andric 
60bdd1243dSDimitry Andric   // Merge the most recently lexed tokens into a single token if their kinds are
61bdd1243dSDimitry Andric   // correct.
620b57cec5SDimitry Andric   bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType);
63bdd1243dSDimitry Andric   // Merge without checking their kinds.
64bdd1243dSDimitry Andric   bool tryMergeTokens(size_t Count, TokenType NewType);
65bdd1243dSDimitry Andric   // Merge if their kinds match any one of Kinds.
66bdd1243dSDimitry Andric   bool tryMergeTokensAny(ArrayRef<ArrayRef<tok::TokenKind>> Kinds,
67bdd1243dSDimitry Andric                          TokenType NewType);
680b57cec5SDimitry Andric 
690b57cec5SDimitry Andric   // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
700b57cec5SDimitry Andric   bool precedesOperand(FormatToken *Tok);
710b57cec5SDimitry Andric 
720b57cec5SDimitry Andric   bool canPrecedeRegexLiteral(FormatToken *Prev);
730b57cec5SDimitry Andric 
740b57cec5SDimitry Andric   // Tries to parse a JavaScript Regex literal starting at the current token,
750b57cec5SDimitry Andric   // if that begins with a slash and is in a location where JavaScript allows
760b57cec5SDimitry Andric   // regex literals. Changes the current token to a regex literal and updates
770b57cec5SDimitry Andric   // its text if successful.
780b57cec5SDimitry Andric   void tryParseJSRegexLiteral();
790b57cec5SDimitry Andric 
800b57cec5SDimitry Andric   // Handles JavaScript template strings.
810b57cec5SDimitry Andric   //
820b57cec5SDimitry Andric   // JavaScript template strings use backticks ('`') as delimiters, and allow
830b57cec5SDimitry Andric   // embedding expressions nested in ${expr-here}. Template strings can be
840b57cec5SDimitry Andric   // nested recursively, i.e. expressions can contain template strings in turn.
850b57cec5SDimitry Andric   //
860b57cec5SDimitry Andric   // The code below parses starting from a backtick, up to a closing backtick or
870b57cec5SDimitry Andric   // an opening ${. It also maintains a stack of lexing contexts to handle
880b57cec5SDimitry Andric   // nested template parts by balancing curly braces.
890b57cec5SDimitry Andric   void handleTemplateStrings();
900b57cec5SDimitry Andric 
915ffd83dbSDimitry Andric   void handleCSharpVerbatimAndInterpolatedStrings();
925ffd83dbSDimitry Andric 
93*7a6dacacSDimitry Andric   // Handles TableGen multiline strings. It has the form [{ ... }].
94*7a6dacacSDimitry Andric   void handleTableGenMultilineString();
95*7a6dacacSDimitry Andric   // Handles TableGen numeric like identifiers.
96*7a6dacacSDimitry Andric   // They have the form [0-9]*[_a-zA-Z]([_a-zA-Z0-9]*), but this is limited to
97*7a6dacacSDimitry Andric   // the case where the text is not lexed as an integer.
98*7a6dacacSDimitry Andric   void handleTableGenNumericLikeIdentifier();
99*7a6dacacSDimitry Andric 
1000b57cec5SDimitry Andric   void tryParsePythonComment();
1010b57cec5SDimitry Andric 
1020b57cec5SDimitry Andric   bool tryMerge_TMacro();
1030b57cec5SDimitry Andric 
1040b57cec5SDimitry Andric   bool tryMergeConflictMarkers();
1050b57cec5SDimitry Andric 
10681ad6265SDimitry Andric   void truncateToken(size_t NewLen);
10781ad6265SDimitry Andric 
1080b57cec5SDimitry Andric   FormatToken *getStashedToken();
1090b57cec5SDimitry Andric 
1100b57cec5SDimitry Andric   FormatToken *getNextToken();
1110b57cec5SDimitry Andric 
  // Lexer state.
  // NOTE(review): FormatTok is presumably the token currently being
  // processed -- confirm in the implementation.
1120b57cec5SDimitry Andric   FormatToken *FormatTok;
1130b57cec5SDimitry Andric   bool IsFirstToken;
  // Stack of LexerState values.
  // NOTE(review): presumably the "stack of lexing contexts" mentioned in the
  // handleTemplateStrings() comment -- confirm.
1140b57cec5SDimitry Andric   std::stack<LexerState> StateStack;
1150b57cec5SDimitry Andric   unsigned Column;
1160b57cec5SDimitry Andric   unsigned TrailingWhitespace;
1170b57cec5SDimitry Andric   std::unique_ptr<Lexer> Lex;
11881ad6265SDimitry Andric   LangOptions LangOpts;
  // SourceMgr, Style, IdentTable and Allocator are reference members, so the
  // referenced objects must outlive this lexer.
1190b57cec5SDimitry Andric   const SourceManager &SourceMgr;
1200b57cec5SDimitry Andric   FileID ID;
1210b57cec5SDimitry Andric   const FormatStyle &Style;
1225ffd83dbSDimitry Andric   IdentifierTable &IdentTable;
1230b57cec5SDimitry Andric   AdditionalKeywords Keywords;
1240b57cec5SDimitry Andric   encoding::Encoding Encoding;
1255ffd83dbSDimitry Andric   llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
1260b57cec5SDimitry Andric   // Index (in 'Tokens') of the last token that starts a new line.
1270b57cec5SDimitry Andric   unsigned FirstInLineIndex;
1280b57cec5SDimitry Andric   SmallVector<FormatToken *, 16> Tokens;
1290b57cec5SDimitry Andric 
  // Maps an identifier to the TokenType associated with it.
  // NOTE(review): presumably filled from macro lists in the style -- confirm.
1300b57cec5SDimitry Andric   llvm::SmallMapVector<IdentifierInfo *, TokenType, 8> Macros;
1310b57cec5SDimitry Andric 
  // Set of identifiers.
  // NOTE(review): presumably identifiers to be treated as type names --
  // confirm against the implementation.
13206c3fb27SDimitry Andric   llvm::SmallPtrSet<IdentifierInfo *, 8> TypeNames;
13306c3fb27SDimitry Andric 
1340b57cec5SDimitry Andric   bool FormattingDisabled;
1350b57cec5SDimitry Andric 
1360b57cec5SDimitry Andric   llvm::Regex MacroBlockBeginRegex;
1370b57cec5SDimitry Andric   llvm::Regex MacroBlockEndRegex;
1380b57cec5SDimitry Andric 
1395ffd83dbSDimitry Andric   // Targets that may appear inside a C# attribute.
1405ffd83dbSDimitry Andric   static const llvm::StringSet<> CSharpAttributeTargets;
1415ffd83dbSDimitry Andric 
14281ad6265SDimitry Andric   /// Handle Verilog-specific tokens.
14381ad6265SDimitry Andric   bool readRawTokenVerilogSpecific(Token &Tok);
14481ad6265SDimitry Andric 
1450b57cec5SDimitry Andric   void readRawToken(FormatToken &Tok);
1460b57cec5SDimitry Andric 
1470b57cec5SDimitry Andric   void resetLexer(unsigned Offset);
1480b57cec5SDimitry Andric };
1490b57cec5SDimitry Andric 
1500b57cec5SDimitry Andric } // namespace format
1510b57cec5SDimitry Andric } // namespace clang
1520b57cec5SDimitry Andric 
1530b57cec5SDimitry Andric #endif
154