xref: /llvm-project/clang-tools-extra/clangd/support/Lex.cpp (revision ed8f78827895050442f544edef2933a60d4a7935)
1 //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Token.h"
10 #include "clang/Basic/IdentifierTable.h"
11 #include "clang/Basic/SourceLocation.h"
12 #include "clang/Basic/TokenKinds.h"
13 #include "clang/Lex/Lexer.h"
14 #include "clang/Lex/LiteralSupport.h"
15 
16 namespace clang {
17 namespace clangd {
18 
19 TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
20   clang::SourceLocation Start;
21   // Tokenize using clang's lexer in raw mode.
22   // std::string guarantees null-termination, which the lexer needs.
23   clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
24                      Code.data() + Code.size());
25   Lexer.SetCommentRetentionState(true);
26 
27   TokenStream Result;
28   clang::Token CT;
29   // Index into the token stream of original source code.
30   Token::Index TokenIndex = 0;
31   unsigned LastOffset = 0;
32   unsigned Line = 0;
33   unsigned Indent = 0;
34   for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
35        Lexer.LexFromRawLexer(CT)) {
36     unsigned Offset =
37         CT.getLocation().getRawEncoding() - Start.getRawEncoding();
38 
39     Token Tok;
40     Tok.Data = &Code[Offset];
41     Tok.Length = CT.getLength();
42     Tok.Kind = CT.getKind();
43 
44     // Update current line number and indentation from raw source code.
45     unsigned NewLineStart = 0;
46     for (unsigned I = LastOffset; I < Offset; ++I) {
47       if (Code[I] == '\n') {
48         NewLineStart = I + 1;
49         ++Line;
50       }
51     }
52     if (NewLineStart || !LastOffset) {
53       Indent = 0;
54       for (char C : StringRef(Code).slice(NewLineStart, Offset)) {
55         if (C == ' ')
56           ++Indent;
57         else if (C == '\t')
58           Indent += 8;
59         else
60           break;
61       }
62     }
63     Tok.Indent = Indent;
64     Tok.Line = Line;
65 
66     if (CT.isAtStartOfLine())
67       Tok.setFlag(LexFlags::StartsPPLine);
68     if (CT.needsCleaning() || CT.hasUCN())
69       Tok.setFlag(LexFlags::NeedsCleaning);
70 
71     Tok.OriginalIndex = TokenIndex++;
72     Result.push(Tok);
73     LastOffset = Offset;
74   }
75   Result.finalize();
76   return Result;
77 }
78 
79 TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
80   auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
81   clang::IdentifierTable Identifiers(LangOpts);
82   TokenStream Result(CleanedStorage);
83   Result.addPayload(Code.getPayload());
84   for (auto Tok : Code.tokens()) {
85     if (Tok.flag(LexFlags::NeedsCleaning)) {
86       // Remove escaped newlines and trigraphs.
87       llvm::SmallString<64> CleanBuffer;
88       const char *Pos = Tok.text().begin();
89       while (Pos < Tok.text().end()) {
90         auto [Char, CharSize] =
91             clang::Lexer::getCharAndSizeNoWarn(Pos, LangOpts);
92         CleanBuffer.push_back(Char);
93         assert(CharSize != 0 && "no progress!");
94         Pos += CharSize;
95       }
96       llvm::StringRef Text = CleanBuffer;
97       llvm::SmallString<64> UCNBuffer;
98       // A surface reading of the standard suggests UCNs might appear anywhere.
99       // But we need only decode them in raw_identifiers.
100       //  - they cannot appear in punctuation/keyword tokens, because UCNs
101       //    cannot encode basic characters outside of literals [lex.charset]
102       //  - they can appear in literals, but we need not unescape them now.
103       //    We treat them as escape sequences when evaluating the literal.
104       //  - comments are handled similarly to literals
105       // This is good fortune, because expandUCNs requires its input to be a
106       // reasonably valid identifier (e.g. without stray backslashes).
107       if (Tok.Kind == tok::raw_identifier) {
108         clang::expandUCNs(UCNBuffer, CleanBuffer);
109         Text = UCNBuffer;
110       }
111 
112       Tok.Data = Text.copy(*CleanedStorage).data();
113       Tok.Length = Text.size();
114       Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
115     }
116 
117     if (Tok.Kind == tok::raw_identifier) {
118       // Cook raw_identifiers into identifier, keyword, etc.
119       Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
120     } else if (Tok.Kind == tok::greatergreater) {
121       // Split the greatergreater token.
122       // FIXME: split lessless token to support Cuda triple angle brackets <<<.
123       assert(Tok.text() == ">>");
124       Tok.Kind = tok::greater;
125       Tok.Length = 1;
126       Result.push(Tok);
127       // Line is wrong if the first greater is followed by an escaped newline!
128       Tok.Data = Tok.text().data() + 1;
129     }
130 
131     Result.push(std::move(Tok));
132   }
133 
134   Result.finalize();
135   return Result;
136 }
137 
138 } // namespace clangd
139 } // namespace clang
140