1e5dd7070Spatrick //===--- FormatTokenLexer.h - Format C++ code ----------------*- C++ ----*-===// 2e5dd7070Spatrick // 3e5dd7070Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4e5dd7070Spatrick // See https://llvm.org/LICENSE.txt for license information. 5e5dd7070Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6e5dd7070Spatrick // 7e5dd7070Spatrick //===----------------------------------------------------------------------===// 8e5dd7070Spatrick /// 9e5dd7070Spatrick /// \file 10e5dd7070Spatrick /// This file contains FormatTokenLexer, which tokenizes a source file 11e5dd7070Spatrick /// into a token stream suitable for ClangFormat. 12e5dd7070Spatrick /// 13e5dd7070Spatrick //===----------------------------------------------------------------------===// 14e5dd7070Spatrick 15e5dd7070Spatrick #ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H 16e5dd7070Spatrick #define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H 17e5dd7070Spatrick 18e5dd7070Spatrick #include "Encoding.h" 19e5dd7070Spatrick #include "FormatToken.h" 20*12c85518Srobert #include "clang/Basic/LangOptions.h" 21e5dd7070Spatrick #include "clang/Basic/SourceLocation.h" 22e5dd7070Spatrick #include "clang/Basic/SourceManager.h" 23e5dd7070Spatrick #include "clang/Format/Format.h" 24e5dd7070Spatrick #include "llvm/ADT/MapVector.h" 25ec727ea7Spatrick #include "llvm/ADT/StringSet.h" 26e5dd7070Spatrick #include "llvm/Support/Regex.h" 27e5dd7070Spatrick 28e5dd7070Spatrick #include <stack> 29e5dd7070Spatrick 30e5dd7070Spatrick namespace clang { 31e5dd7070Spatrick namespace format { 32e5dd7070Spatrick 33e5dd7070Spatrick enum LexerState { 34e5dd7070Spatrick NORMAL, 35e5dd7070Spatrick TEMPLATE_STRING, 36e5dd7070Spatrick TOKEN_STASHED, 37e5dd7070Spatrick }; 38e5dd7070Spatrick 39e5dd7070Spatrick class FormatTokenLexer { 40e5dd7070Spatrick public: 41e5dd7070Spatrick FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, 42ec727ea7Spatrick const FormatStyle &Style, encoding::Encoding Encoding, 43ec727ea7Spatrick llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, 44ec727ea7Spatrick IdentifierTable &IdentTable); 45e5dd7070Spatrick 46e5dd7070Spatrick ArrayRef<FormatToken *> lex(); 47e5dd7070Spatrick getKeywords()48e5dd7070Spatrick const AdditionalKeywords &getKeywords() { return Keywords; } 49e5dd7070Spatrick 50e5dd7070Spatrick private: 51e5dd7070Spatrick void tryMergePreviousTokens(); 52e5dd7070Spatrick 53e5dd7070Spatrick bool tryMergeLessLess(); 54e5dd7070Spatrick bool tryMergeNSStringLiteral(); 55e5dd7070Spatrick bool tryMergeJSPrivateIdentifier(); 56ec727ea7Spatrick bool tryMergeCSharpStringLiteral(); 57e5dd7070Spatrick bool tryMergeCSharpKeywordVariables(); 58a9ac8606Spatrick bool tryMergeNullishCoalescingEqual(); 59e5dd7070Spatrick bool tryTransformCSharpForEach(); 60ec727ea7Spatrick bool tryMergeForEach(); 61ec727ea7Spatrick bool tryTransformTryUsageForC(); 62e5dd7070Spatrick 63*12c85518Srobert // Merge the most recently lexed tokens into a single token if their kinds are 64*12c85518Srobert // correct. 65e5dd7070Spatrick bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType); 66*12c85518Srobert // Merge without checking their kinds. 67*12c85518Srobert bool tryMergeTokens(size_t Count, TokenType NewType); 68*12c85518Srobert // Merge if their kinds match any one of Kinds. 69*12c85518Srobert bool tryMergeTokensAny(ArrayRef<ArrayRef<tok::TokenKind>> Kinds, 70*12c85518Srobert TokenType NewType); 71e5dd7070Spatrick 72e5dd7070Spatrick // Returns \c true if \p Tok can only be followed by an operand in JavaScript. 73e5dd7070Spatrick bool precedesOperand(FormatToken *Tok); 74e5dd7070Spatrick 75e5dd7070Spatrick bool canPrecedeRegexLiteral(FormatToken *Prev); 76e5dd7070Spatrick 77e5dd7070Spatrick // Tries to parse a JavaScript Regex literal starting at the current token, 78e5dd7070Spatrick // if that begins with a slash and is in a location where JavaScript allows 79e5dd7070Spatrick // regex literals. Changes the current token to a regex literal and updates 80e5dd7070Spatrick // its text if successful. 81e5dd7070Spatrick void tryParseJSRegexLiteral(); 82e5dd7070Spatrick 83e5dd7070Spatrick // Handles JavaScript template strings. 84e5dd7070Spatrick // 85e5dd7070Spatrick // JavaScript template strings use backticks ('`') as delimiters, and allow 86e5dd7070Spatrick // embedding expressions nested in ${expr-here}. Template strings can be 87e5dd7070Spatrick // nested recursively, i.e. expressions can contain template strings in turn. 88e5dd7070Spatrick // 89e5dd7070Spatrick // The code below parses starting from a backtick, up to a closing backtick or 90e5dd7070Spatrick // an opening ${. It also maintains a stack of lexing contexts to handle 91e5dd7070Spatrick // nested template parts by balancing curly braces. 92e5dd7070Spatrick void handleTemplateStrings(); 93e5dd7070Spatrick 94ec727ea7Spatrick void handleCSharpVerbatimAndInterpolatedStrings(); 95ec727ea7Spatrick 96e5dd7070Spatrick void tryParsePythonComment(); 97e5dd7070Spatrick 98e5dd7070Spatrick bool tryMerge_TMacro(); 99e5dd7070Spatrick 100e5dd7070Spatrick bool tryMergeConflictMarkers(); 101e5dd7070Spatrick 102*12c85518Srobert void truncateToken(size_t NewLen); 103*12c85518Srobert 104e5dd7070Spatrick FormatToken *getStashedToken(); 105e5dd7070Spatrick 106e5dd7070Spatrick FormatToken *getNextToken(); 107e5dd7070Spatrick 108e5dd7070Spatrick FormatToken *FormatTok; 109e5dd7070Spatrick bool IsFirstToken; 110e5dd7070Spatrick std::stack<LexerState> StateStack; 111e5dd7070Spatrick unsigned Column; 112e5dd7070Spatrick unsigned TrailingWhitespace; 113e5dd7070Spatrick std::unique_ptr<Lexer> Lex; 114*12c85518Srobert LangOptions LangOpts; 115e5dd7070Spatrick const SourceManager &SourceMgr; 116e5dd7070Spatrick FileID ID; 117e5dd7070Spatrick const FormatStyle &Style; 118ec727ea7Spatrick IdentifierTable &IdentTable; 119e5dd7070Spatrick AdditionalKeywords Keywords; 120e5dd7070Spatrick encoding::Encoding Encoding; 121ec727ea7Spatrick llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator; 122e5dd7070Spatrick // Index (in 'Tokens') of the last token that starts a new line. 123e5dd7070Spatrick unsigned FirstInLineIndex; 124e5dd7070Spatrick SmallVector<FormatToken *, 16> Tokens; 125e5dd7070Spatrick 126e5dd7070Spatrick llvm::SmallMapVector<IdentifierInfo *, TokenType, 8> Macros; 127e5dd7070Spatrick 128e5dd7070Spatrick bool FormattingDisabled; 129e5dd7070Spatrick 130e5dd7070Spatrick llvm::Regex MacroBlockBeginRegex; 131e5dd7070Spatrick llvm::Regex MacroBlockEndRegex; 132e5dd7070Spatrick 133ec727ea7Spatrick // Targets that may appear inside a C# attribute. 134ec727ea7Spatrick static const llvm::StringSet<> CSharpAttributeTargets; 135ec727ea7Spatrick 136*12c85518Srobert /// Handle Verilog-specific tokens. 137*12c85518Srobert bool readRawTokenVerilogSpecific(Token &Tok); 138*12c85518Srobert 139e5dd7070Spatrick void readRawToken(FormatToken &Tok); 140e5dd7070Spatrick 141e5dd7070Spatrick void resetLexer(unsigned Offset); 142e5dd7070Spatrick }; 143e5dd7070Spatrick 144e5dd7070Spatrick } // namespace format 145e5dd7070Spatrick } // namespace clang 146e5dd7070Spatrick 147e5dd7070Spatrick #endif 148