xref: /llvm-project/clang-tools-extra/clangd/support/Lex.cpp (revision ed8f78827895050442f544edef2933a60d4a7935)
1*ed8f7882SAaron Ballman //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
2*ed8f7882SAaron Ballman //
3*ed8f7882SAaron Ballman // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*ed8f7882SAaron Ballman // See https://llvm.org/LICENSE.txt for license information.
5*ed8f7882SAaron Ballman // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*ed8f7882SAaron Ballman //
7*ed8f7882SAaron Ballman //===----------------------------------------------------------------------===//
8*ed8f7882SAaron Ballman 
9*ed8f7882SAaron Ballman #include "Token.h"
10*ed8f7882SAaron Ballman #include "clang/Basic/IdentifierTable.h"
11*ed8f7882SAaron Ballman #include "clang/Basic/SourceLocation.h"
12*ed8f7882SAaron Ballman #include "clang/Basic/TokenKinds.h"
13*ed8f7882SAaron Ballman #include "clang/Lex/Lexer.h"
14*ed8f7882SAaron Ballman #include "clang/Lex/LiteralSupport.h"
15*ed8f7882SAaron Ballman 
16*ed8f7882SAaron Ballman namespace clang {
17*ed8f7882SAaron Ballman namespace clangd {
18*ed8f7882SAaron Ballman 
19*ed8f7882SAaron Ballman TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
20*ed8f7882SAaron Ballman   clang::SourceLocation Start;
21*ed8f7882SAaron Ballman   // Tokenize using clang's lexer in raw mode.
22*ed8f7882SAaron Ballman   // std::string guarantees null-termination, which the lexer needs.
23*ed8f7882SAaron Ballman   clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
24*ed8f7882SAaron Ballman                      Code.data() + Code.size());
25*ed8f7882SAaron Ballman   Lexer.SetCommentRetentionState(true);
26*ed8f7882SAaron Ballman 
27*ed8f7882SAaron Ballman   TokenStream Result;
28*ed8f7882SAaron Ballman   clang::Token CT;
29*ed8f7882SAaron Ballman   // Index into the token stream of original source code.
30*ed8f7882SAaron Ballman   Token::Index TokenIndex = 0;
31*ed8f7882SAaron Ballman   unsigned LastOffset = 0;
32*ed8f7882SAaron Ballman   unsigned Line = 0;
33*ed8f7882SAaron Ballman   unsigned Indent = 0;
34*ed8f7882SAaron Ballman   for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
35*ed8f7882SAaron Ballman        Lexer.LexFromRawLexer(CT)) {
36*ed8f7882SAaron Ballman     unsigned Offset =
37*ed8f7882SAaron Ballman         CT.getLocation().getRawEncoding() - Start.getRawEncoding();
38*ed8f7882SAaron Ballman 
39*ed8f7882SAaron Ballman     Token Tok;
40*ed8f7882SAaron Ballman     Tok.Data = &Code[Offset];
41*ed8f7882SAaron Ballman     Tok.Length = CT.getLength();
42*ed8f7882SAaron Ballman     Tok.Kind = CT.getKind();
43*ed8f7882SAaron Ballman 
44*ed8f7882SAaron Ballman     // Update current line number and indentation from raw source code.
45*ed8f7882SAaron Ballman     unsigned NewLineStart = 0;
46*ed8f7882SAaron Ballman     for (unsigned I = LastOffset; I < Offset; ++I) {
47*ed8f7882SAaron Ballman       if (Code[I] == '\n') {
48*ed8f7882SAaron Ballman         NewLineStart = I + 1;
49*ed8f7882SAaron Ballman         ++Line;
50*ed8f7882SAaron Ballman       }
51*ed8f7882SAaron Ballman     }
52*ed8f7882SAaron Ballman     if (NewLineStart || !LastOffset) {
53*ed8f7882SAaron Ballman       Indent = 0;
54*ed8f7882SAaron Ballman       for (char C : StringRef(Code).slice(NewLineStart, Offset)) {
55*ed8f7882SAaron Ballman         if (C == ' ')
56*ed8f7882SAaron Ballman           ++Indent;
57*ed8f7882SAaron Ballman         else if (C == '\t')
58*ed8f7882SAaron Ballman           Indent += 8;
59*ed8f7882SAaron Ballman         else
60*ed8f7882SAaron Ballman           break;
61*ed8f7882SAaron Ballman       }
62*ed8f7882SAaron Ballman     }
63*ed8f7882SAaron Ballman     Tok.Indent = Indent;
64*ed8f7882SAaron Ballman     Tok.Line = Line;
65*ed8f7882SAaron Ballman 
66*ed8f7882SAaron Ballman     if (CT.isAtStartOfLine())
67*ed8f7882SAaron Ballman       Tok.setFlag(LexFlags::StartsPPLine);
68*ed8f7882SAaron Ballman     if (CT.needsCleaning() || CT.hasUCN())
69*ed8f7882SAaron Ballman       Tok.setFlag(LexFlags::NeedsCleaning);
70*ed8f7882SAaron Ballman 
71*ed8f7882SAaron Ballman     Tok.OriginalIndex = TokenIndex++;
72*ed8f7882SAaron Ballman     Result.push(Tok);
73*ed8f7882SAaron Ballman     LastOffset = Offset;
74*ed8f7882SAaron Ballman   }
75*ed8f7882SAaron Ballman   Result.finalize();
76*ed8f7882SAaron Ballman   return Result;
77*ed8f7882SAaron Ballman }
78*ed8f7882SAaron Ballman 
79*ed8f7882SAaron Ballman TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
80*ed8f7882SAaron Ballman   auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
81*ed8f7882SAaron Ballman   clang::IdentifierTable Identifiers(LangOpts);
82*ed8f7882SAaron Ballman   TokenStream Result(CleanedStorage);
83*ed8f7882SAaron Ballman   Result.addPayload(Code.getPayload());
84*ed8f7882SAaron Ballman   for (auto Tok : Code.tokens()) {
85*ed8f7882SAaron Ballman     if (Tok.flag(LexFlags::NeedsCleaning)) {
86*ed8f7882SAaron Ballman       // Remove escaped newlines and trigraphs.
87*ed8f7882SAaron Ballman       llvm::SmallString<64> CleanBuffer;
88*ed8f7882SAaron Ballman       const char *Pos = Tok.text().begin();
89*ed8f7882SAaron Ballman       while (Pos < Tok.text().end()) {
90*ed8f7882SAaron Ballman         auto [Char, CharSize] =
91*ed8f7882SAaron Ballman             clang::Lexer::getCharAndSizeNoWarn(Pos, LangOpts);
92*ed8f7882SAaron Ballman         CleanBuffer.push_back(Char);
93*ed8f7882SAaron Ballman         assert(CharSize != 0 && "no progress!");
94*ed8f7882SAaron Ballman         Pos += CharSize;
95*ed8f7882SAaron Ballman       }
96*ed8f7882SAaron Ballman       llvm::StringRef Text = CleanBuffer;
97*ed8f7882SAaron Ballman       llvm::SmallString<64> UCNBuffer;
98*ed8f7882SAaron Ballman       // A surface reading of the standard suggests UCNs might appear anywhere.
99*ed8f7882SAaron Ballman       // But we need only decode them in raw_identifiers.
100*ed8f7882SAaron Ballman       //  - they cannot appear in punctuation/keyword tokens, because UCNs
101*ed8f7882SAaron Ballman       //    cannot encode basic characters outside of literals [lex.charset]
102*ed8f7882SAaron Ballman       //  - they can appear in literals, but we need not unescape them now.
103*ed8f7882SAaron Ballman       //    We treat them as escape sequences when evaluating the literal.
104*ed8f7882SAaron Ballman       //  - comments are handled similarly to literals
105*ed8f7882SAaron Ballman       // This is good fortune, because expandUCNs requires its input to be a
106*ed8f7882SAaron Ballman       // reasonably valid identifier (e.g. without stray backslashes).
107*ed8f7882SAaron Ballman       if (Tok.Kind == tok::raw_identifier) {
108*ed8f7882SAaron Ballman         clang::expandUCNs(UCNBuffer, CleanBuffer);
109*ed8f7882SAaron Ballman         Text = UCNBuffer;
110*ed8f7882SAaron Ballman       }
111*ed8f7882SAaron Ballman 
112*ed8f7882SAaron Ballman       Tok.Data = Text.copy(*CleanedStorage).data();
113*ed8f7882SAaron Ballman       Tok.Length = Text.size();
114*ed8f7882SAaron Ballman       Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
115*ed8f7882SAaron Ballman     }
116*ed8f7882SAaron Ballman 
117*ed8f7882SAaron Ballman     if (Tok.Kind == tok::raw_identifier) {
118*ed8f7882SAaron Ballman       // Cook raw_identifiers into identifier, keyword, etc.
119*ed8f7882SAaron Ballman       Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
120*ed8f7882SAaron Ballman     } else if (Tok.Kind == tok::greatergreater) {
121*ed8f7882SAaron Ballman       // Split the greatergreater token.
122*ed8f7882SAaron Ballman       // FIXME: split lessless token to support Cuda triple angle brackets <<<.
123*ed8f7882SAaron Ballman       assert(Tok.text() == ">>");
124*ed8f7882SAaron Ballman       Tok.Kind = tok::greater;
125*ed8f7882SAaron Ballman       Tok.Length = 1;
126*ed8f7882SAaron Ballman       Result.push(Tok);
127*ed8f7882SAaron Ballman       // Line is wrong if the first greater is followed by an escaped newline!
128*ed8f7882SAaron Ballman       Tok.Data = Tok.text().data() + 1;
129*ed8f7882SAaron Ballman     }
130*ed8f7882SAaron Ballman 
131*ed8f7882SAaron Ballman     Result.push(std::move(Tok));
132*ed8f7882SAaron Ballman   }
133*ed8f7882SAaron Ballman 
134*ed8f7882SAaron Ballman   Result.finalize();
135*ed8f7882SAaron Ballman   return Result;
136*ed8f7882SAaron Ballman }
137*ed8f7882SAaron Ballman 
138*ed8f7882SAaron Ballman } // namespace clangd
139*ed8f7882SAaron Ballman } // namespace clang
140