xref: /llvm-project/clang-tools-extra/clangd/support/Token.h (revision ed8f78827895050442f544edef2933a60d4a7935)
1*ed8f7882SAaron Ballman //===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
2*ed8f7882SAaron Ballman //
3*ed8f7882SAaron Ballman // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*ed8f7882SAaron Ballman // See https://llvm.org/LICENSE.txt for license information.
5*ed8f7882SAaron Ballman // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*ed8f7882SAaron Ballman //
7*ed8f7882SAaron Ballman //===----------------------------------------------------------------------===//
8*ed8f7882SAaron Ballman //
9*ed8f7882SAaron Ballman // Tokens are the first level of abstraction above bytes used in pseudoparsing.
10*ed8f7882SAaron Ballman // We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
11*ed8f7882SAaron Ballman // The tokens are wrapped into pseudo::Token, along with line/indent info.
12*ed8f7882SAaron Ballman //
13*ed8f7882SAaron Ballman // Unlike clang, we make multiple passes over the whole file, out-of-order.
14*ed8f7882SAaron Ballman // Therefore we retain the whole token sequence in memory. (This is feasible as
15*ed8f7882SAaron Ballman // we process one file at a time). pseudo::TokenStream holds such a stream.
16*ed8f7882SAaron Ballman // The initial stream holds the raw tokens read from the file, later passes
17*ed8f7882SAaron Ballman // operate on derived TokenStreams (e.g. with directives stripped).
18*ed8f7882SAaron Ballman //
19*ed8f7882SAaron Ballman // Similar facilities from clang that are *not* used:
20*ed8f7882SAaron Ballman //  - SourceManager: designed around multiple files and precise macro expansion.
21*ed8f7882SAaron Ballman //  - clang::Token: coupled to SourceManager, doesn't retain layout info.
22*ed8f7882SAaron Ballman //                  (pseudo::Token is similar, but without SourceLocations).
23*ed8f7882SAaron Ballman //  - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
24*ed8f7882SAaron Ballman //                  (pseudo::TokenStream is similar, but a flat token list).
25*ed8f7882SAaron Ballman //
26*ed8f7882SAaron Ballman //===----------------------------------------------------------------------===//
27*ed8f7882SAaron Ballman 
28*ed8f7882SAaron Ballman #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
29*ed8f7882SAaron Ballman #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
30*ed8f7882SAaron Ballman 
31*ed8f7882SAaron Ballman #include "clang/Basic/LLVM.h"
32*ed8f7882SAaron Ballman #include "clang/Basic/LangStandard.h"
33*ed8f7882SAaron Ballman #include "clang/Basic/TokenKinds.h"
34*ed8f7882SAaron Ballman #include "llvm/ADT/ArrayRef.h"
35*ed8f7882SAaron Ballman #include "llvm/ADT/STLForwardCompat.h"
36*ed8f7882SAaron Ballman #include "llvm/Support/raw_ostream.h"
37*ed8f7882SAaron Ballman #include <cstdint>
38*ed8f7882SAaron Ballman #include <limits>
39*ed8f7882SAaron Ballman #include <memory>
40*ed8f7882SAaron Ballman #include <vector>
41*ed8f7882SAaron Ballman 
42*ed8f7882SAaron Ballman namespace clang {
43*ed8f7882SAaron Ballman class LangOptions;
44*ed8f7882SAaron Ballman namespace clangd {
45*ed8f7882SAaron Ballman 
46*ed8f7882SAaron Ballman /// A single C++ or preprocessor token.
47*ed8f7882SAaron Ballman ///
48*ed8f7882SAaron Ballman /// Unlike clang::Token and syntax::Token, these tokens are not connected to a
49*ed8f7882SAaron Ballman /// SourceManager - we are not dealing with multiple files.
50*ed8f7882SAaron Ballman struct Token {
51*ed8f7882SAaron Ballman   /// An Index identifies a token within a stream.
52*ed8f7882SAaron Ballman   using Index = uint32_t;
53*ed8f7882SAaron Ballman   /// A sentinel Index indicating no token.
54*ed8f7882SAaron Ballman   constexpr static Index Invalid = std::numeric_limits<Index>::max();
55*ed8f7882SAaron Ballman   struct Range;
56*ed8f7882SAaron Ballman 
57*ed8f7882SAaron Ballman   /// The token text.
58*ed8f7882SAaron Ballman   ///
59*ed8f7882SAaron Ballman   /// Typically from the original source file, but may have been synthesized.
60*ed8f7882SAaron Ballman   StringRef text() const { return StringRef(Data, Length); }
61*ed8f7882SAaron Ballman   const char *Data = nullptr;
62*ed8f7882SAaron Ballman   uint32_t Length = 0;
63*ed8f7882SAaron Ballman 
64*ed8f7882SAaron Ballman   /// Zero-based line number for the start of the token.
65*ed8f7882SAaron Ballman   /// This refers to the original source file as written.
66*ed8f7882SAaron Ballman   uint32_t Line = 0;
67*ed8f7882SAaron Ballman   /// Width of whitespace before the first token on this line.
68*ed8f7882SAaron Ballman   uint8_t Indent = 0;
69*ed8f7882SAaron Ballman   /// Flags have some meaning defined by the function that produced this stream.
70*ed8f7882SAaron Ballman   uint8_t Flags = 0;
71*ed8f7882SAaron Ballman   /// Index into the original token stream (as raw-lexed from the source code).
72*ed8f7882SAaron Ballman   Index OriginalIndex = Invalid;
73*ed8f7882SAaron Ballman   // Helpers to get/set Flags based on `enum class`.
74*ed8f7882SAaron Ballman   template <class T> bool flag(T Mask) const {
75*ed8f7882SAaron Ballman     return Flags & uint8_t{llvm::to_underlying(Mask)};
76*ed8f7882SAaron Ballman   }
77*ed8f7882SAaron Ballman   template <class T> void setFlag(T Mask) {
78*ed8f7882SAaron Ballman     Flags |= uint8_t{llvm::to_underlying(Mask)};
79*ed8f7882SAaron Ballman   }
80*ed8f7882SAaron Ballman 
81*ed8f7882SAaron Ballman   /// Returns the next token in the stream. this may not be a sentinel.
82*ed8f7882SAaron Ballman   const Token &next() const {
83*ed8f7882SAaron Ballman     assert(Kind != tok::eof);
84*ed8f7882SAaron Ballman     return *(this + 1);
85*ed8f7882SAaron Ballman   }
86*ed8f7882SAaron Ballman   /// Returns the next token in the stream, skipping over comments.
87*ed8f7882SAaron Ballman   const Token &nextNC() const {
88*ed8f7882SAaron Ballman     const Token *T = this;
89*ed8f7882SAaron Ballman     do
90*ed8f7882SAaron Ballman       T = &T->next();
91*ed8f7882SAaron Ballman     while (T->Kind == tok::comment);
92*ed8f7882SAaron Ballman     return *T;
93*ed8f7882SAaron Ballman   }
94*ed8f7882SAaron Ballman   /// Returns the previous token in the stream. this may not be a sentinel.
95*ed8f7882SAaron Ballman   const Token &prev() const {
96*ed8f7882SAaron Ballman     assert(Kind != tok::eof);
97*ed8f7882SAaron Ballman     return *(this - 1);
98*ed8f7882SAaron Ballman   }
99*ed8f7882SAaron Ballman   /// Returns the bracket paired with this one, if any.
100*ed8f7882SAaron Ballman   const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; }
101*ed8f7882SAaron Ballman 
102*ed8f7882SAaron Ballman   /// The type of token as determined by clang's lexer.
103*ed8f7882SAaron Ballman   clang::tok::TokenKind Kind = clang::tok::unknown;
104*ed8f7882SAaron Ballman   /// If this token is a paired bracket, the offset of the pair in the stream.
105*ed8f7882SAaron Ballman   int32_t Pair = 0;
106*ed8f7882SAaron Ballman };
107*ed8f7882SAaron Ballman static_assert(sizeof(Token) <= sizeof(char *) + 24, "Careful with layout!");
108*ed8f7882SAaron Ballman llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
109*ed8f7882SAaron Ballman 
110*ed8f7882SAaron Ballman /// A half-open range of tokens within a stream.
111*ed8f7882SAaron Ballman struct Token::Range {
112*ed8f7882SAaron Ballman   Index Begin = 0;
113*ed8f7882SAaron Ballman   Index End = 0;
114*ed8f7882SAaron Ballman 
115*ed8f7882SAaron Ballman   uint32_t size() const { return End - Begin; }
116*ed8f7882SAaron Ballman   static Range emptyAt(Index Index) { return Range{Index, Index}; }
117*ed8f7882SAaron Ballman };
118*ed8f7882SAaron Ballman llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
119*ed8f7882SAaron Ballman 
120*ed8f7882SAaron Ballman /// A complete sequence of Tokens representing a source file.
121*ed8f7882SAaron Ballman ///
122*ed8f7882SAaron Ballman /// This may match a raw file from disk, or be derived from a previous stream.
123*ed8f7882SAaron Ballman /// For example, stripping comments from a TokenStream results in a new stream.
124*ed8f7882SAaron Ballman ///
125*ed8f7882SAaron Ballman /// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
126*ed8f7882SAaron Ballman ///       int      main   (        )        ;
127*ed8f7882SAaron Ballman ///   eof kw_int   ident  l_paren  r_paren  semi   eof
128*ed8f7882SAaron Ballman ///       front()                           back()
129*ed8f7882SAaron Ballman ///       0        1      2        3        4      5
130*ed8f7882SAaron Ballman class TokenStream {
131*ed8f7882SAaron Ballman public:
132*ed8f7882SAaron Ballman   /// Create an empty stream.
133*ed8f7882SAaron Ballman   ///
134*ed8f7882SAaron Ballman   /// Initially, the stream is appendable and not finalized.
135*ed8f7882SAaron Ballman   /// The token sequence may only be accessed after finalize() is called.
136*ed8f7882SAaron Ballman   ///
137*ed8f7882SAaron Ballman   /// Payload is an opaque object which will be owned by the stream.
138*ed8f7882SAaron Ballman   /// e.g. an allocator to hold backing storage for synthesized token text.
139*ed8f7882SAaron Ballman   explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
140*ed8f7882SAaron Ballman 
141*ed8f7882SAaron Ballman   /// Append a token to the stream, which must not be finalized.
142*ed8f7882SAaron Ballman   void push(Token T) {
143*ed8f7882SAaron Ballman     assert(!isFinalized());
144*ed8f7882SAaron Ballman     Storage.push_back(std::move(T));
145*ed8f7882SAaron Ballman   }
146*ed8f7882SAaron Ballman 
147*ed8f7882SAaron Ballman   /// Finalize the token stream, allowing tokens to be accessed.
148*ed8f7882SAaron Ballman   /// Tokens may no longer be appended.
149*ed8f7882SAaron Ballman   void finalize();
150*ed8f7882SAaron Ballman   bool isFinalized() const;
151*ed8f7882SAaron Ballman 
152*ed8f7882SAaron Ballman   /// Returns the index of T within the stream.
153*ed8f7882SAaron Ballman   ///
154*ed8f7882SAaron Ballman   /// T must be within the stream or the end sentinel (not the start sentinel).
155*ed8f7882SAaron Ballman   Token::Index index(const Token &T) const {
156*ed8f7882SAaron Ballman     assert(isFinalized());
157*ed8f7882SAaron Ballman     assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
158*ed8f7882SAaron Ballman     assert(&T != Storage.data() && "start sentinel");
159*ed8f7882SAaron Ballman     return &T - Tokens.data();
160*ed8f7882SAaron Ballman   }
161*ed8f7882SAaron Ballman 
162*ed8f7882SAaron Ballman   ArrayRef<Token> tokens() const {
163*ed8f7882SAaron Ballman     assert(isFinalized());
164*ed8f7882SAaron Ballman     return Tokens;
165*ed8f7882SAaron Ballman   }
166*ed8f7882SAaron Ballman   ArrayRef<Token> tokens(Token::Range R) const {
167*ed8f7882SAaron Ballman     return tokens().slice(R.Begin, R.End - R.Begin);
168*ed8f7882SAaron Ballman   }
169*ed8f7882SAaron Ballman 
170*ed8f7882SAaron Ballman   MutableArrayRef<Token> tokens() {
171*ed8f7882SAaron Ballman     assert(isFinalized());
172*ed8f7882SAaron Ballman     return Tokens;
173*ed8f7882SAaron Ballman   }
174*ed8f7882SAaron Ballman 
175*ed8f7882SAaron Ballman   /// May return the end sentinel if the stream is empty.
176*ed8f7882SAaron Ballman   const Token &front() const {
177*ed8f7882SAaron Ballman     assert(isFinalized());
178*ed8f7882SAaron Ballman     return Storage[1];
179*ed8f7882SAaron Ballman   }
180*ed8f7882SAaron Ballman 
181*ed8f7882SAaron Ballman   /// Returns the shared payload.
182*ed8f7882SAaron Ballman   std::shared_ptr<void> getPayload() const { return Payload; }
183*ed8f7882SAaron Ballman   /// Adds the given payload to the stream.
184*ed8f7882SAaron Ballman   void addPayload(std::shared_ptr<void> P) {
185*ed8f7882SAaron Ballman     if (!Payload)
186*ed8f7882SAaron Ballman       Payload = std::move(P);
187*ed8f7882SAaron Ballman     else
188*ed8f7882SAaron Ballman       Payload = std::make_shared<
189*ed8f7882SAaron Ballman           std::pair<std::shared_ptr<void>, std::shared_ptr<void>>>(
190*ed8f7882SAaron Ballman           std::move(P), std::move(Payload));
191*ed8f7882SAaron Ballman   }
192*ed8f7882SAaron Ballman 
193*ed8f7882SAaron Ballman   /// Print the tokens in this stream to the output stream.
194*ed8f7882SAaron Ballman   ///
195*ed8f7882SAaron Ballman   /// The presence of newlines/spaces is preserved, but not the quantity.
196*ed8f7882SAaron Ballman   void print(llvm::raw_ostream &) const;
197*ed8f7882SAaron Ballman 
198*ed8f7882SAaron Ballman private:
199*ed8f7882SAaron Ballman   std::shared_ptr<void> Payload;
200*ed8f7882SAaron Ballman 
201*ed8f7882SAaron Ballman   MutableArrayRef<Token> Tokens;
202*ed8f7882SAaron Ballman   std::vector<Token> Storage; // eof + Tokens + eof
203*ed8f7882SAaron Ballman };
204*ed8f7882SAaron Ballman llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
205*ed8f7882SAaron Ballman 
206*ed8f7882SAaron Ballman /// Extracts a raw token stream from the source code.
207*ed8f7882SAaron Ballman ///
208*ed8f7882SAaron Ballman /// All tokens will reference the data of the provided string.
209*ed8f7882SAaron Ballman /// "word-like" tokens such as identifiers and keywords will be raw_identifier.
210*ed8f7882SAaron Ballman TokenStream lex(const std::string &, const clang::LangOptions &);
/// Bit values for the per-token flags (each enumerator is a single bit).
enum class LexFlags : uint8_t {
  /// Set on the token at the start of a logical preprocessor line, i.e. at a
  /// position where a directive might start.
  ///
  /// In this example the first # is StartsPPLine but the second is not,
  /// because the escaped newline keeps both on the same logical line:
  ///   #define X(error) \<newline>
  ///   #error // not a directive!
  ///
  /// Careful: the directive may not begin exactly on the StartsPPLine token:
  ///   /*comment*/ #include <foo.h>
  StartsPPLine = 0x01,
  /// Set on tokens that contain trigraphs, escaped newlines, UCNs etc.
  /// The text() of such a token still contains the raw trigraph.
  NeedsCleaning = 0x02,
};
226*ed8f7882SAaron Ballman /// A generic lang options suitable for lexing/parsing a langage.
227*ed8f7882SAaron Ballman clang::LangOptions genericLangOpts(
228*ed8f7882SAaron Ballman     clang::Language = clang::Language::CXX,
229*ed8f7882SAaron Ballman     clang::LangStandard::Kind = clang::LangStandard::lang_unspecified);
230*ed8f7882SAaron Ballman 
231*ed8f7882SAaron Ballman /// Decoding raw tokens written in the source code, returning a derived stream.
232*ed8f7882SAaron Ballman ///
233*ed8f7882SAaron Ballman /// - escaped newlines within tokens are removed
234*ed8f7882SAaron Ballman /// - trigraphs are replaced with the characters they encode
235*ed8f7882SAaron Ballman /// - UCNs within raw_identifiers are replaced by the characters they encode
236*ed8f7882SAaron Ballman ///   (UCNs within strings, comments etc are not translated)
237*ed8f7882SAaron Ballman /// - raw_identifier tokens are assigned their correct keyword type
238*ed8f7882SAaron Ballman /// - the >> token is split into separate > > tokens
239*ed8f7882SAaron Ballman ///   (we use a modified grammar where >> is a nonterminal, not a token)
240*ed8f7882SAaron Ballman ///
241*ed8f7882SAaron Ballman /// The StartsPPLine flag is preserved.
242*ed8f7882SAaron Ballman ///
243*ed8f7882SAaron Ballman /// Formally the identifier correctly happens before preprocessing, while we
244*ed8f7882SAaron Ballman /// should only cook raw_identifiers that survive preprocessing.
245*ed8f7882SAaron Ballman /// However, ignoring the Token::Kind of tokens in directives achieves the same.
246*ed8f7882SAaron Ballman /// (And having cooked token kinds in PP-disabled sections is useful for us).
247*ed8f7882SAaron Ballman TokenStream cook(const TokenStream &, const clang::LangOptions &);
248*ed8f7882SAaron Ballman 
249*ed8f7882SAaron Ballman /// Drops comment tokens.
250*ed8f7882SAaron Ballman TokenStream stripComments(const TokenStream &);
251*ed8f7882SAaron Ballman 
252*ed8f7882SAaron Ballman } // namespace clangd
253*ed8f7882SAaron Ballman } // namespace clang
254*ed8f7882SAaron Ballman 
255*ed8f7882SAaron Ballman #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
256