xref: /llvm-project/clang-tools-extra/clangd/support/Token.h (revision ed8f78827895050442f544edef2933a60d4a7935)
1 //===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Tokens are the first level of abstraction above bytes used in pseudoparsing.
10 // We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
// The tokens are wrapped into clangd::Token, along with line/indent info.
12 //
13 // Unlike clang, we make multiple passes over the whole file, out-of-order.
14 // Therefore we retain the whole token sequence in memory. (This is feasible as
// we process one file at a time). clangd::TokenStream holds such a stream.
16 // The initial stream holds the raw tokens read from the file, later passes
17 // operate on derived TokenStreams (e.g. with directives stripped).
18 //
19 // Similar facilities from clang that are *not* used:
20 //  - SourceManager: designed around multiple files and precise macro expansion.
21 //  - clang::Token: coupled to SourceManager, doesn't retain layout info.
//                  (clangd::Token is similar, but without SourceLocations).
23 //  - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
//                  (clangd::TokenStream is similar, but a flat token list).
25 //
26 //===----------------------------------------------------------------------===//
27 
28 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
29 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
30 
#include "clang/Basic/LLVM.h"
#include "clang/Basic/LangStandard.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>
41 
42 namespace clang {
43 class LangOptions;
44 namespace clangd {
45 
46 /// A single C++ or preprocessor token.
47 ///
48 /// Unlike clang::Token and syntax::Token, these tokens are not connected to a
49 /// SourceManager - we are not dealing with multiple files.
50 struct Token {
51   /// An Index identifies a token within a stream.
52   using Index = uint32_t;
53   /// A sentinel Index indicating no token.
54   constexpr static Index Invalid = std::numeric_limits<Index>::max();
55   struct Range;
56 
57   /// The token text.
58   ///
59   /// Typically from the original source file, but may have been synthesized.
60   StringRef text() const { return StringRef(Data, Length); }
61   const char *Data = nullptr;
62   uint32_t Length = 0;
63 
64   /// Zero-based line number for the start of the token.
65   /// This refers to the original source file as written.
66   uint32_t Line = 0;
67   /// Width of whitespace before the first token on this line.
68   uint8_t Indent = 0;
69   /// Flags have some meaning defined by the function that produced this stream.
70   uint8_t Flags = 0;
71   /// Index into the original token stream (as raw-lexed from the source code).
72   Index OriginalIndex = Invalid;
73   // Helpers to get/set Flags based on `enum class`.
74   template <class T> bool flag(T Mask) const {
75     return Flags & uint8_t{llvm::to_underlying(Mask)};
76   }
77   template <class T> void setFlag(T Mask) {
78     Flags |= uint8_t{llvm::to_underlying(Mask)};
79   }
80 
81   /// Returns the next token in the stream. this may not be a sentinel.
82   const Token &next() const {
83     assert(Kind != tok::eof);
84     return *(this + 1);
85   }
86   /// Returns the next token in the stream, skipping over comments.
87   const Token &nextNC() const {
88     const Token *T = this;
89     do
90       T = &T->next();
91     while (T->Kind == tok::comment);
92     return *T;
93   }
94   /// Returns the previous token in the stream. this may not be a sentinel.
95   const Token &prev() const {
96     assert(Kind != tok::eof);
97     return *(this - 1);
98   }
99   /// Returns the bracket paired with this one, if any.
100   const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; }
101 
102   /// The type of token as determined by clang's lexer.
103   clang::tok::TokenKind Kind = clang::tok::unknown;
104   /// If this token is a paired bracket, the offset of the pair in the stream.
105   int32_t Pair = 0;
106 };
107 static_assert(sizeof(Token) <= sizeof(char *) + 24, "Careful with layout!");
108 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
109 
110 /// A half-open range of tokens within a stream.
111 struct Token::Range {
112   Index Begin = 0;
113   Index End = 0;
114 
115   uint32_t size() const { return End - Begin; }
116   static Range emptyAt(Index Index) { return Range{Index, Index}; }
117 };
118 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
119 
120 /// A complete sequence of Tokens representing a source file.
121 ///
122 /// This may match a raw file from disk, or be derived from a previous stream.
123 /// For example, stripping comments from a TokenStream results in a new stream.
124 ///
125 /// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
126 ///       int      main   (        )        ;
127 ///   eof kw_int   ident  l_paren  r_paren  semi   eof
128 ///       front()                           back()
129 ///       0        1      2        3        4      5
130 class TokenStream {
131 public:
132   /// Create an empty stream.
133   ///
134   /// Initially, the stream is appendable and not finalized.
135   /// The token sequence may only be accessed after finalize() is called.
136   ///
137   /// Payload is an opaque object which will be owned by the stream.
138   /// e.g. an allocator to hold backing storage for synthesized token text.
139   explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
140 
141   /// Append a token to the stream, which must not be finalized.
142   void push(Token T) {
143     assert(!isFinalized());
144     Storage.push_back(std::move(T));
145   }
146 
147   /// Finalize the token stream, allowing tokens to be accessed.
148   /// Tokens may no longer be appended.
149   void finalize();
150   bool isFinalized() const;
151 
152   /// Returns the index of T within the stream.
153   ///
154   /// T must be within the stream or the end sentinel (not the start sentinel).
155   Token::Index index(const Token &T) const {
156     assert(isFinalized());
157     assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
158     assert(&T != Storage.data() && "start sentinel");
159     return &T - Tokens.data();
160   }
161 
162   ArrayRef<Token> tokens() const {
163     assert(isFinalized());
164     return Tokens;
165   }
166   ArrayRef<Token> tokens(Token::Range R) const {
167     return tokens().slice(R.Begin, R.End - R.Begin);
168   }
169 
170   MutableArrayRef<Token> tokens() {
171     assert(isFinalized());
172     return Tokens;
173   }
174 
175   /// May return the end sentinel if the stream is empty.
176   const Token &front() const {
177     assert(isFinalized());
178     return Storage[1];
179   }
180 
181   /// Returns the shared payload.
182   std::shared_ptr<void> getPayload() const { return Payload; }
183   /// Adds the given payload to the stream.
184   void addPayload(std::shared_ptr<void> P) {
185     if (!Payload)
186       Payload = std::move(P);
187     else
188       Payload = std::make_shared<
189           std::pair<std::shared_ptr<void>, std::shared_ptr<void>>>(
190           std::move(P), std::move(Payload));
191   }
192 
193   /// Print the tokens in this stream to the output stream.
194   ///
195   /// The presence of newlines/spaces is preserved, but not the quantity.
196   void print(llvm::raw_ostream &) const;
197 
198 private:
199   std::shared_ptr<void> Payload;
200 
201   MutableArrayRef<Token> Tokens;
202   std::vector<Token> Storage; // eof + Tokens + eof
203 };
204 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
205 
206 /// Extracts a raw token stream from the source code.
207 ///
208 /// All tokens will reference the data of the provided string.
209 /// "word-like" tokens such as identifiers and keywords will be raw_identifier.
210 TokenStream lex(const std::string &, const clang::LangOptions &);
/// Bit masks for use with Token::flag() / Token::setFlag().
/// NOTE(review): going by the name, these are the flags set by lex() - confirm
/// against the implementation.
enum class LexFlags : uint8_t {
  /// Set on the token that begins a logical preprocessor line, i.e. a
  /// position at which a directive could start.
  ///
  /// Below, the first # is StartsPPLine, but second is not (same logical line).
  ///   #define X(error) \
  ///   #error // not a directive!
  ///
  /// Note that the directive need not begin exactly at the StartsPPLine token:
  ///   /*comment*/ #include <foo.h>
  StartsPPLine = 0x01,
  /// Set on tokens that contain trigraphs, escaped newlines, UCNs etc.
  /// The text() of such a token still contains the raw trigraph.
  NeedsCleaning = 0x02,
};
226 /// A generic lang options suitable for lexing/parsing a langage.
227 clang::LangOptions genericLangOpts(
228     clang::Language = clang::Language::CXX,
229     clang::LangStandard::Kind = clang::LangStandard::lang_unspecified);
230 
231 /// Decoding raw tokens written in the source code, returning a derived stream.
232 ///
233 /// - escaped newlines within tokens are removed
234 /// - trigraphs are replaced with the characters they encode
235 /// - UCNs within raw_identifiers are replaced by the characters they encode
236 ///   (UCNs within strings, comments etc are not translated)
237 /// - raw_identifier tokens are assigned their correct keyword type
238 /// - the >> token is split into separate > > tokens
239 ///   (we use a modified grammar where >> is a nonterminal, not a token)
240 ///
241 /// The StartsPPLine flag is preserved.
242 ///
243 /// Formally the identifier correctly happens before preprocessing, while we
244 /// should only cook raw_identifiers that survive preprocessing.
245 /// However, ignoring the Token::Kind of tokens in directives achieves the same.
246 /// (And having cooked token kinds in PP-disabled sections is useful for us).
247 TokenStream cook(const TokenStream &, const clang::LangOptions &);
248 
249 /// Drops comment tokens.
250 TokenStream stripComments(const TokenStream &);
251 
252 } // namespace clangd
253 } // namespace clang
254 
255 #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
256