//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Tokens are the first level of abstraction above bytes used in pseudoparsing.
// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
// The tokens are wrapped into pseudo::Token, along with line/indent info.
//
// Unlike clang, we make multiple passes over the whole file, out-of-order.
// Therefore we retain the whole token sequence in memory. (This is feasible as
// we process one file at a time). pseudo::TokenStream holds such a stream.
// The initial stream holds the raw tokens read from the file, later passes
// operate on derived TokenStreams (e.g. with directives stripped).
//
// Similar facilities from clang that are *not* used:
//  - SourceManager: designed around multiple files and precise macro expansion.
//  - clang::Token: coupled to SourceManager, doesn't retain layout info.
//    (pseudo::Token is similar, but without SourceLocations).
//  - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
24*ed8f7882SAaron Ballman // (pseudo::TokenStream is similar, but a flat token list). 25*ed8f7882SAaron Ballman // 26*ed8f7882SAaron Ballman //===----------------------------------------------------------------------===// 27*ed8f7882SAaron Ballman 28*ed8f7882SAaron Ballman #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H 29*ed8f7882SAaron Ballman #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H 30*ed8f7882SAaron Ballman 31*ed8f7882SAaron Ballman #include "clang/Basic/LLVM.h" 32*ed8f7882SAaron Ballman #include "clang/Basic/LangStandard.h" 33*ed8f7882SAaron Ballman #include "clang/Basic/TokenKinds.h" 34*ed8f7882SAaron Ballman #include "llvm/ADT/ArrayRef.h" 35*ed8f7882SAaron Ballman #include "llvm/ADT/STLForwardCompat.h" 36*ed8f7882SAaron Ballman #include "llvm/Support/raw_ostream.h" 37*ed8f7882SAaron Ballman #include <cstdint> 38*ed8f7882SAaron Ballman #include <limits> 39*ed8f7882SAaron Ballman #include <memory> 40*ed8f7882SAaron Ballman #include <vector> 41*ed8f7882SAaron Ballman 42*ed8f7882SAaron Ballman namespace clang { 43*ed8f7882SAaron Ballman class LangOptions; 44*ed8f7882SAaron Ballman namespace clangd { 45*ed8f7882SAaron Ballman 46*ed8f7882SAaron Ballman /// A single C++ or preprocessor token. 47*ed8f7882SAaron Ballman /// 48*ed8f7882SAaron Ballman /// Unlike clang::Token and syntax::Token, these tokens are not connected to a 49*ed8f7882SAaron Ballman /// SourceManager - we are not dealing with multiple files. 50*ed8f7882SAaron Ballman struct Token { 51*ed8f7882SAaron Ballman /// An Index identifies a token within a stream. 52*ed8f7882SAaron Ballman using Index = uint32_t; 53*ed8f7882SAaron Ballman /// A sentinel Index indicating no token. 54*ed8f7882SAaron Ballman constexpr static Index Invalid = std::numeric_limits<Index>::max(); 55*ed8f7882SAaron Ballman struct Range; 56*ed8f7882SAaron Ballman 57*ed8f7882SAaron Ballman /// The token text. 
58*ed8f7882SAaron Ballman /// 59*ed8f7882SAaron Ballman /// Typically from the original source file, but may have been synthesized. 60*ed8f7882SAaron Ballman StringRef text() const { return StringRef(Data, Length); } 61*ed8f7882SAaron Ballman const char *Data = nullptr; 62*ed8f7882SAaron Ballman uint32_t Length = 0; 63*ed8f7882SAaron Ballman 64*ed8f7882SAaron Ballman /// Zero-based line number for the start of the token. 65*ed8f7882SAaron Ballman /// This refers to the original source file as written. 66*ed8f7882SAaron Ballman uint32_t Line = 0; 67*ed8f7882SAaron Ballman /// Width of whitespace before the first token on this line. 68*ed8f7882SAaron Ballman uint8_t Indent = 0; 69*ed8f7882SAaron Ballman /// Flags have some meaning defined by the function that produced this stream. 70*ed8f7882SAaron Ballman uint8_t Flags = 0; 71*ed8f7882SAaron Ballman /// Index into the original token stream (as raw-lexed from the source code). 72*ed8f7882SAaron Ballman Index OriginalIndex = Invalid; 73*ed8f7882SAaron Ballman // Helpers to get/set Flags based on `enum class`. 74*ed8f7882SAaron Ballman template <class T> bool flag(T Mask) const { 75*ed8f7882SAaron Ballman return Flags & uint8_t{llvm::to_underlying(Mask)}; 76*ed8f7882SAaron Ballman } 77*ed8f7882SAaron Ballman template <class T> void setFlag(T Mask) { 78*ed8f7882SAaron Ballman Flags |= uint8_t{llvm::to_underlying(Mask)}; 79*ed8f7882SAaron Ballman } 80*ed8f7882SAaron Ballman 81*ed8f7882SAaron Ballman /// Returns the next token in the stream. this may not be a sentinel. 82*ed8f7882SAaron Ballman const Token &next() const { 83*ed8f7882SAaron Ballman assert(Kind != tok::eof); 84*ed8f7882SAaron Ballman return *(this + 1); 85*ed8f7882SAaron Ballman } 86*ed8f7882SAaron Ballman /// Returns the next token in the stream, skipping over comments. 
87*ed8f7882SAaron Ballman const Token &nextNC() const { 88*ed8f7882SAaron Ballman const Token *T = this; 89*ed8f7882SAaron Ballman do 90*ed8f7882SAaron Ballman T = &T->next(); 91*ed8f7882SAaron Ballman while (T->Kind == tok::comment); 92*ed8f7882SAaron Ballman return *T; 93*ed8f7882SAaron Ballman } 94*ed8f7882SAaron Ballman /// Returns the previous token in the stream. this may not be a sentinel. 95*ed8f7882SAaron Ballman const Token &prev() const { 96*ed8f7882SAaron Ballman assert(Kind != tok::eof); 97*ed8f7882SAaron Ballman return *(this - 1); 98*ed8f7882SAaron Ballman } 99*ed8f7882SAaron Ballman /// Returns the bracket paired with this one, if any. 100*ed8f7882SAaron Ballman const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; } 101*ed8f7882SAaron Ballman 102*ed8f7882SAaron Ballman /// The type of token as determined by clang's lexer. 103*ed8f7882SAaron Ballman clang::tok::TokenKind Kind = clang::tok::unknown; 104*ed8f7882SAaron Ballman /// If this token is a paired bracket, the offset of the pair in the stream. 105*ed8f7882SAaron Ballman int32_t Pair = 0; 106*ed8f7882SAaron Ballman }; 107*ed8f7882SAaron Ballman static_assert(sizeof(Token) <= sizeof(char *) + 24, "Careful with layout!"); 108*ed8f7882SAaron Ballman llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &); 109*ed8f7882SAaron Ballman 110*ed8f7882SAaron Ballman /// A half-open range of tokens within a stream. 111*ed8f7882SAaron Ballman struct Token::Range { 112*ed8f7882SAaron Ballman Index Begin = 0; 113*ed8f7882SAaron Ballman Index End = 0; 114*ed8f7882SAaron Ballman 115*ed8f7882SAaron Ballman uint32_t size() const { return End - Begin; } 116*ed8f7882SAaron Ballman static Range emptyAt(Index Index) { return Range{Index, Index}; } 117*ed8f7882SAaron Ballman }; 118*ed8f7882SAaron Ballman llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &); 119*ed8f7882SAaron Ballman 120*ed8f7882SAaron Ballman /// A complete sequence of Tokens representing a source file. 
121*ed8f7882SAaron Ballman /// 122*ed8f7882SAaron Ballman /// This may match a raw file from disk, or be derived from a previous stream. 123*ed8f7882SAaron Ballman /// For example, stripping comments from a TokenStream results in a new stream. 124*ed8f7882SAaron Ballman /// 125*ed8f7882SAaron Ballman /// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes: 126*ed8f7882SAaron Ballman /// int main ( ) ; 127*ed8f7882SAaron Ballman /// eof kw_int ident l_paren r_paren semi eof 128*ed8f7882SAaron Ballman /// front() back() 129*ed8f7882SAaron Ballman /// 0 1 2 3 4 5 130*ed8f7882SAaron Ballman class TokenStream { 131*ed8f7882SAaron Ballman public: 132*ed8f7882SAaron Ballman /// Create an empty stream. 133*ed8f7882SAaron Ballman /// 134*ed8f7882SAaron Ballman /// Initially, the stream is appendable and not finalized. 135*ed8f7882SAaron Ballman /// The token sequence may only be accessed after finalize() is called. 136*ed8f7882SAaron Ballman /// 137*ed8f7882SAaron Ballman /// Payload is an opaque object which will be owned by the stream. 138*ed8f7882SAaron Ballman /// e.g. an allocator to hold backing storage for synthesized token text. 139*ed8f7882SAaron Ballman explicit TokenStream(std::shared_ptr<void> Payload = nullptr); 140*ed8f7882SAaron Ballman 141*ed8f7882SAaron Ballman /// Append a token to the stream, which must not be finalized. 142*ed8f7882SAaron Ballman void push(Token T) { 143*ed8f7882SAaron Ballman assert(!isFinalized()); 144*ed8f7882SAaron Ballman Storage.push_back(std::move(T)); 145*ed8f7882SAaron Ballman } 146*ed8f7882SAaron Ballman 147*ed8f7882SAaron Ballman /// Finalize the token stream, allowing tokens to be accessed. 148*ed8f7882SAaron Ballman /// Tokens may no longer be appended. 149*ed8f7882SAaron Ballman void finalize(); 150*ed8f7882SAaron Ballman bool isFinalized() const; 151*ed8f7882SAaron Ballman 152*ed8f7882SAaron Ballman /// Returns the index of T within the stream. 
153*ed8f7882SAaron Ballman /// 154*ed8f7882SAaron Ballman /// T must be within the stream or the end sentinel (not the start sentinel). 155*ed8f7882SAaron Ballman Token::Index index(const Token &T) const { 156*ed8f7882SAaron Ballman assert(isFinalized()); 157*ed8f7882SAaron Ballman assert(&T >= Storage.data() && &T < Storage.data() + Storage.size()); 158*ed8f7882SAaron Ballman assert(&T != Storage.data() && "start sentinel"); 159*ed8f7882SAaron Ballman return &T - Tokens.data(); 160*ed8f7882SAaron Ballman } 161*ed8f7882SAaron Ballman 162*ed8f7882SAaron Ballman ArrayRef<Token> tokens() const { 163*ed8f7882SAaron Ballman assert(isFinalized()); 164*ed8f7882SAaron Ballman return Tokens; 165*ed8f7882SAaron Ballman } 166*ed8f7882SAaron Ballman ArrayRef<Token> tokens(Token::Range R) const { 167*ed8f7882SAaron Ballman return tokens().slice(R.Begin, R.End - R.Begin); 168*ed8f7882SAaron Ballman } 169*ed8f7882SAaron Ballman 170*ed8f7882SAaron Ballman MutableArrayRef<Token> tokens() { 171*ed8f7882SAaron Ballman assert(isFinalized()); 172*ed8f7882SAaron Ballman return Tokens; 173*ed8f7882SAaron Ballman } 174*ed8f7882SAaron Ballman 175*ed8f7882SAaron Ballman /// May return the end sentinel if the stream is empty. 176*ed8f7882SAaron Ballman const Token &front() const { 177*ed8f7882SAaron Ballman assert(isFinalized()); 178*ed8f7882SAaron Ballman return Storage[1]; 179*ed8f7882SAaron Ballman } 180*ed8f7882SAaron Ballman 181*ed8f7882SAaron Ballman /// Returns the shared payload. 182*ed8f7882SAaron Ballman std::shared_ptr<void> getPayload() const { return Payload; } 183*ed8f7882SAaron Ballman /// Adds the given payload to the stream. 
184*ed8f7882SAaron Ballman void addPayload(std::shared_ptr<void> P) { 185*ed8f7882SAaron Ballman if (!Payload) 186*ed8f7882SAaron Ballman Payload = std::move(P); 187*ed8f7882SAaron Ballman else 188*ed8f7882SAaron Ballman Payload = std::make_shared< 189*ed8f7882SAaron Ballman std::pair<std::shared_ptr<void>, std::shared_ptr<void>>>( 190*ed8f7882SAaron Ballman std::move(P), std::move(Payload)); 191*ed8f7882SAaron Ballman } 192*ed8f7882SAaron Ballman 193*ed8f7882SAaron Ballman /// Print the tokens in this stream to the output stream. 194*ed8f7882SAaron Ballman /// 195*ed8f7882SAaron Ballman /// The presence of newlines/spaces is preserved, but not the quantity. 196*ed8f7882SAaron Ballman void print(llvm::raw_ostream &) const; 197*ed8f7882SAaron Ballman 198*ed8f7882SAaron Ballman private: 199*ed8f7882SAaron Ballman std::shared_ptr<void> Payload; 200*ed8f7882SAaron Ballman 201*ed8f7882SAaron Ballman MutableArrayRef<Token> Tokens; 202*ed8f7882SAaron Ballman std::vector<Token> Storage; // eof + Tokens + eof 203*ed8f7882SAaron Ballman }; 204*ed8f7882SAaron Ballman llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &); 205*ed8f7882SAaron Ballman 206*ed8f7882SAaron Ballman /// Extracts a raw token stream from the source code. 207*ed8f7882SAaron Ballman /// 208*ed8f7882SAaron Ballman /// All tokens will reference the data of the provided string. 209*ed8f7882SAaron Ballman /// "word-like" tokens such as identifiers and keywords will be raw_identifier. 210*ed8f7882SAaron Ballman TokenStream lex(const std::string &, const clang::LangOptions &); 211*ed8f7882SAaron Ballman enum class LexFlags : uint8_t { 212*ed8f7882SAaron Ballman /// Marks the token at the start of a logical preprocessor line. 213*ed8f7882SAaron Ballman /// This is a position where a directive might start. 214*ed8f7882SAaron Ballman /// 215*ed8f7882SAaron Ballman /// Here, the first # is StartsPPLine, but second is not (same logical line). 
216*ed8f7882SAaron Ballman /// #define X(error) \ 217*ed8f7882SAaron Ballman /// #error // not a directive! 218*ed8f7882SAaron Ballman /// 219*ed8f7882SAaron Ballman /// Careful, the directive may not start exactly on the StartsPPLine token: 220*ed8f7882SAaron Ballman /// /*comment*/ #include <foo.h> 221*ed8f7882SAaron Ballman StartsPPLine = 1 << 0, 222*ed8f7882SAaron Ballman /// Marks tokens containing trigraphs, escaped newlines, UCNs etc. 223*ed8f7882SAaron Ballman /// The text() of such tokens will contain the raw trigrah. 224*ed8f7882SAaron Ballman NeedsCleaning = 1 << 1, 225*ed8f7882SAaron Ballman }; 226*ed8f7882SAaron Ballman /// A generic lang options suitable for lexing/parsing a langage. 227*ed8f7882SAaron Ballman clang::LangOptions genericLangOpts( 228*ed8f7882SAaron Ballman clang::Language = clang::Language::CXX, 229*ed8f7882SAaron Ballman clang::LangStandard::Kind = clang::LangStandard::lang_unspecified); 230*ed8f7882SAaron Ballman 231*ed8f7882SAaron Ballman /// Decoding raw tokens written in the source code, returning a derived stream. 232*ed8f7882SAaron Ballman /// 233*ed8f7882SAaron Ballman /// - escaped newlines within tokens are removed 234*ed8f7882SAaron Ballman /// - trigraphs are replaced with the characters they encode 235*ed8f7882SAaron Ballman /// - UCNs within raw_identifiers are replaced by the characters they encode 236*ed8f7882SAaron Ballman /// (UCNs within strings, comments etc are not translated) 237*ed8f7882SAaron Ballman /// - raw_identifier tokens are assigned their correct keyword type 238*ed8f7882SAaron Ballman /// - the >> token is split into separate > > tokens 239*ed8f7882SAaron Ballman /// (we use a modified grammar where >> is a nonterminal, not a token) 240*ed8f7882SAaron Ballman /// 241*ed8f7882SAaron Ballman /// The StartsPPLine flag is preserved. 
242*ed8f7882SAaron Ballman /// 243*ed8f7882SAaron Ballman /// Formally the identifier correctly happens before preprocessing, while we 244*ed8f7882SAaron Ballman /// should only cook raw_identifiers that survive preprocessing. 245*ed8f7882SAaron Ballman /// However, ignoring the Token::Kind of tokens in directives achieves the same. 246*ed8f7882SAaron Ballman /// (And having cooked token kinds in PP-disabled sections is useful for us). 247*ed8f7882SAaron Ballman TokenStream cook(const TokenStream &, const clang::LangOptions &); 248*ed8f7882SAaron Ballman 249*ed8f7882SAaron Ballman /// Drops comment tokens. 250*ed8f7882SAaron Ballman TokenStream stripComments(const TokenStream &); 251*ed8f7882SAaron Ballman 252*ed8f7882SAaron Ballman } // namespace clangd 253*ed8f7882SAaron Ballman } // namespace clang 254*ed8f7882SAaron Ballman 255*ed8f7882SAaron Ballman #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H 256