1 //===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Tokens are the first level of abstraction above bytes used in pseudoparsing. 10 // We use clang's lexer to scan the bytes (in raw mode, with no preprocessor). 11 // The tokens is wrapped into pseudo::Token, along with line/indent info. 12 // 13 // Unlike clang, we make multiple passes over the whole file, out-of-order. 14 // Therefore we retain the whole token sequence in memory. (This is feasible as 15 // we process one file at a time). pseudo::TokenStream holds such a stream. 16 // The initial stream holds the raw tokens read from the file, later passes 17 // operate on derived TokenStreams (e.g. with directives stripped). 18 // 19 // Similar facilities from clang that are *not* used: 20 // - SourceManager: designed around multiple files and precise macro expansion. 21 // - clang::Token: coupled to SourceManager, doesn't retain layout info. 22 // (pseudo::Token is similar, but without SourceLocations). 23 // - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros. 24 // (pseudo::TokenStream is similar, but a flat token list). 25 // 26 //===----------------------------------------------------------------------===// 27 28 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H 29 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H 30 31 #include "clang/Basic/LLVM.h" 32 #include "clang/Basic/LangStandard.h" 33 #include "clang/Basic/TokenKinds.h" 34 #include "llvm/ADT/ArrayRef.h" 35 #include "llvm/ADT/STLForwardCompat.h" 36 #include "llvm/Support/raw_ostream.h" 37 #include <cstdint> 38 #include <limits> 39 #include <memory> 40 #include <vector> 41 42 namespace clang { 43 class LangOptions; 44 namespace clangd { 45 46 /// A single C++ or preprocessor token. 47 /// 48 /// Unlike clang::Token and syntax::Token, these tokens are not connected to a 49 /// SourceManager - we are not dealing with multiple files. 50 struct Token { 51 /// An Index identifies a token within a stream. 52 using Index = uint32_t; 53 /// A sentinel Index indicating no token. 54 constexpr static Index Invalid = std::numeric_limits<Index>::max(); 55 struct Range; 56 57 /// The token text. 58 /// 59 /// Typically from the original source file, but may have been synthesized. 60 StringRef text() const { return StringRef(Data, Length); } 61 const char *Data = nullptr; 62 uint32_t Length = 0; 63 64 /// Zero-based line number for the start of the token. 65 /// This refers to the original source file as written. 66 uint32_t Line = 0; 67 /// Width of whitespace before the first token on this line. 68 uint8_t Indent = 0; 69 /// Flags have some meaning defined by the function that produced this stream. 70 uint8_t Flags = 0; 71 /// Index into the original token stream (as raw-lexed from the source code). 72 Index OriginalIndex = Invalid; 73 // Helpers to get/set Flags based on `enum class`. 74 template <class T> bool flag(T Mask) const { 75 return Flags & uint8_t{llvm::to_underlying(Mask)}; 76 } 77 template <class T> void setFlag(T Mask) { 78 Flags |= uint8_t{llvm::to_underlying(Mask)}; 79 } 80 81 /// Returns the next token in the stream. this may not be a sentinel. 82 const Token &next() const { 83 assert(Kind != tok::eof); 84 return *(this + 1); 85 } 86 /// Returns the next token in the stream, skipping over comments. 87 const Token &nextNC() const { 88 const Token *T = this; 89 do 90 T = &T->next(); 91 while (T->Kind == tok::comment); 92 return *T; 93 } 94 /// Returns the previous token in the stream. this may not be a sentinel. 95 const Token &prev() const { 96 assert(Kind != tok::eof); 97 return *(this - 1); 98 } 99 /// Returns the bracket paired with this one, if any. 100 const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; } 101 102 /// The type of token as determined by clang's lexer. 103 clang::tok::TokenKind Kind = clang::tok::unknown; 104 /// If this token is a paired bracket, the offset of the pair in the stream. 105 int32_t Pair = 0; 106 }; 107 static_assert(sizeof(Token) <= sizeof(char *) + 24, "Careful with layout!"); 108 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &); 109 110 /// A half-open range of tokens within a stream. 111 struct Token::Range { 112 Index Begin = 0; 113 Index End = 0; 114 115 uint32_t size() const { return End - Begin; } 116 static Range emptyAt(Index Index) { return Range{Index, Index}; } 117 }; 118 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &); 119 120 /// A complete sequence of Tokens representing a source file. 121 /// 122 /// This may match a raw file from disk, or be derived from a previous stream. 123 /// For example, stripping comments from a TokenStream results in a new stream. 124 /// 125 /// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes: 126 /// int main ( ) ; 127 /// eof kw_int ident l_paren r_paren semi eof 128 /// front() back() 129 /// 0 1 2 3 4 5 130 class TokenStream { 131 public: 132 /// Create an empty stream. 133 /// 134 /// Initially, the stream is appendable and not finalized. 135 /// The token sequence may only be accessed after finalize() is called. 136 /// 137 /// Payload is an opaque object which will be owned by the stream. 138 /// e.g. an allocator to hold backing storage for synthesized token text. 139 explicit TokenStream(std::shared_ptr<void> Payload = nullptr); 140 141 /// Append a token to the stream, which must not be finalized. 142 void push(Token T) { 143 assert(!isFinalized()); 144 Storage.push_back(std::move(T)); 145 } 146 147 /// Finalize the token stream, allowing tokens to be accessed. 148 /// Tokens may no longer be appended. 149 void finalize(); 150 bool isFinalized() const; 151 152 /// Returns the index of T within the stream. 153 /// 154 /// T must be within the stream or the end sentinel (not the start sentinel). 155 Token::Index index(const Token &T) const { 156 assert(isFinalized()); 157 assert(&T >= Storage.data() && &T < Storage.data() + Storage.size()); 158 assert(&T != Storage.data() && "start sentinel"); 159 return &T - Tokens.data(); 160 } 161 162 ArrayRef<Token> tokens() const { 163 assert(isFinalized()); 164 return Tokens; 165 } 166 ArrayRef<Token> tokens(Token::Range R) const { 167 return tokens().slice(R.Begin, R.End - R.Begin); 168 } 169 170 MutableArrayRef<Token> tokens() { 171 assert(isFinalized()); 172 return Tokens; 173 } 174 175 /// May return the end sentinel if the stream is empty. 176 const Token &front() const { 177 assert(isFinalized()); 178 return Storage[1]; 179 } 180 181 /// Returns the shared payload. 182 std::shared_ptr<void> getPayload() const { return Payload; } 183 /// Adds the given payload to the stream. 184 void addPayload(std::shared_ptr<void> P) { 185 if (!Payload) 186 Payload = std::move(P); 187 else 188 Payload = std::make_shared< 189 std::pair<std::shared_ptr<void>, std::shared_ptr<void>>>( 190 std::move(P), std::move(Payload)); 191 } 192 193 /// Print the tokens in this stream to the output stream. 194 /// 195 /// The presence of newlines/spaces is preserved, but not the quantity. 196 void print(llvm::raw_ostream &) const; 197 198 private: 199 std::shared_ptr<void> Payload; 200 201 MutableArrayRef<Token> Tokens; 202 std::vector<Token> Storage; // eof + Tokens + eof 203 }; 204 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &); 205 206 /// Extracts a raw token stream from the source code. 207 /// 208 /// All tokens will reference the data of the provided string. 209 /// "word-like" tokens such as identifiers and keywords will be raw_identifier. 210 TokenStream lex(const std::string &, const clang::LangOptions &); 211 enum class LexFlags : uint8_t { 212 /// Marks the token at the start of a logical preprocessor line. 213 /// This is a position where a directive might start. 214 /// 215 /// Here, the first # is StartsPPLine, but second is not (same logical line). 216 /// #define X(error) \ 217 /// #error // not a directive! 218 /// 219 /// Careful, the directive may not start exactly on the StartsPPLine token: 220 /// /*comment*/ #include <foo.h> 221 StartsPPLine = 1 << 0, 222 /// Marks tokens containing trigraphs, escaped newlines, UCNs etc. 223 /// The text() of such tokens will contain the raw trigrah. 224 NeedsCleaning = 1 << 1, 225 }; 226 /// A generic lang options suitable for lexing/parsing a langage. 227 clang::LangOptions genericLangOpts( 228 clang::Language = clang::Language::CXX, 229 clang::LangStandard::Kind = clang::LangStandard::lang_unspecified); 230 231 /// Decoding raw tokens written in the source code, returning a derived stream. 232 /// 233 /// - escaped newlines within tokens are removed 234 /// - trigraphs are replaced with the characters they encode 235 /// - UCNs within raw_identifiers are replaced by the characters they encode 236 /// (UCNs within strings, comments etc are not translated) 237 /// - raw_identifier tokens are assigned their correct keyword type 238 /// - the >> token is split into separate > > tokens 239 /// (we use a modified grammar where >> is a nonterminal, not a token) 240 /// 241 /// The StartsPPLine flag is preserved. 242 /// 243 /// Formally the identifier correctly happens before preprocessing, while we 244 /// should only cook raw_identifiers that survive preprocessing. 245 /// However, ignoring the Token::Kind of tokens in directives achieves the same. 246 /// (And having cooked token kinds in PP-disabled sections is useful for us). 247 TokenStream cook(const TokenStream &, const clang::LangOptions &); 248 249 /// Drops comment tokens. 250 TokenStream stripComments(const TokenStream &); 251 252 } // namespace clangd 253 } // namespace clang 254 255 #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H 256