1*ed8f7882SAaron Ballman //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===// 2*ed8f7882SAaron Ballman // 3*ed8f7882SAaron Ballman // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*ed8f7882SAaron Ballman // See https://llvm.org/LICENSE.txt for license information. 5*ed8f7882SAaron Ballman // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*ed8f7882SAaron Ballman // 7*ed8f7882SAaron Ballman //===----------------------------------------------------------------------===// 8*ed8f7882SAaron Ballman 9*ed8f7882SAaron Ballman #include "Token.h" 10*ed8f7882SAaron Ballman #include "clang/Basic/IdentifierTable.h" 11*ed8f7882SAaron Ballman #include "clang/Basic/SourceLocation.h" 12*ed8f7882SAaron Ballman #include "clang/Basic/TokenKinds.h" 13*ed8f7882SAaron Ballman #include "clang/Lex/Lexer.h" 14*ed8f7882SAaron Ballman #include "clang/Lex/LiteralSupport.h" 15*ed8f7882SAaron Ballman 16*ed8f7882SAaron Ballman namespace clang { 17*ed8f7882SAaron Ballman namespace clangd { 18*ed8f7882SAaron Ballman 19*ed8f7882SAaron Ballman TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) { 20*ed8f7882SAaron Ballman clang::SourceLocation Start; 21*ed8f7882SAaron Ballman // Tokenize using clang's lexer in raw mode. 22*ed8f7882SAaron Ballman // std::string guarantees null-termination, which the lexer needs. 23*ed8f7882SAaron Ballman clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(), 24*ed8f7882SAaron Ballman Code.data() + Code.size()); 25*ed8f7882SAaron Ballman Lexer.SetCommentRetentionState(true); 26*ed8f7882SAaron Ballman 27*ed8f7882SAaron Ballman TokenStream Result; 28*ed8f7882SAaron Ballman clang::Token CT; 29*ed8f7882SAaron Ballman // Index into the token stream of original source code. 30*ed8f7882SAaron Ballman Token::Index TokenIndex = 0; 31*ed8f7882SAaron Ballman unsigned LastOffset = 0; 32*ed8f7882SAaron Ballman unsigned Line = 0; 33*ed8f7882SAaron Ballman unsigned Indent = 0; 34*ed8f7882SAaron Ballman for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof; 35*ed8f7882SAaron Ballman Lexer.LexFromRawLexer(CT)) { 36*ed8f7882SAaron Ballman unsigned Offset = 37*ed8f7882SAaron Ballman CT.getLocation().getRawEncoding() - Start.getRawEncoding(); 38*ed8f7882SAaron Ballman 39*ed8f7882SAaron Ballman Token Tok; 40*ed8f7882SAaron Ballman Tok.Data = &Code[Offset]; 41*ed8f7882SAaron Ballman Tok.Length = CT.getLength(); 42*ed8f7882SAaron Ballman Tok.Kind = CT.getKind(); 43*ed8f7882SAaron Ballman 44*ed8f7882SAaron Ballman // Update current line number and indentation from raw source code. 45*ed8f7882SAaron Ballman unsigned NewLineStart = 0; 46*ed8f7882SAaron Ballman for (unsigned I = LastOffset; I < Offset; ++I) { 47*ed8f7882SAaron Ballman if (Code[I] == '\n') { 48*ed8f7882SAaron Ballman NewLineStart = I + 1; 49*ed8f7882SAaron Ballman ++Line; 50*ed8f7882SAaron Ballman } 51*ed8f7882SAaron Ballman } 52*ed8f7882SAaron Ballman if (NewLineStart || !LastOffset) { 53*ed8f7882SAaron Ballman Indent = 0; 54*ed8f7882SAaron Ballman for (char C : StringRef(Code).slice(NewLineStart, Offset)) { 55*ed8f7882SAaron Ballman if (C == ' ') 56*ed8f7882SAaron Ballman ++Indent; 57*ed8f7882SAaron Ballman else if (C == '\t') 58*ed8f7882SAaron Ballman Indent += 8; 59*ed8f7882SAaron Ballman else 60*ed8f7882SAaron Ballman break; 61*ed8f7882SAaron Ballman } 62*ed8f7882SAaron Ballman } 63*ed8f7882SAaron Ballman Tok.Indent = Indent; 64*ed8f7882SAaron Ballman Tok.Line = Line; 65*ed8f7882SAaron Ballman 66*ed8f7882SAaron Ballman if (CT.isAtStartOfLine()) 67*ed8f7882SAaron Ballman Tok.setFlag(LexFlags::StartsPPLine); 68*ed8f7882SAaron Ballman if (CT.needsCleaning() || CT.hasUCN()) 69*ed8f7882SAaron Ballman Tok.setFlag(LexFlags::NeedsCleaning); 70*ed8f7882SAaron Ballman 71*ed8f7882SAaron Ballman Tok.OriginalIndex = TokenIndex++; 72*ed8f7882SAaron Ballman Result.push(Tok); 73*ed8f7882SAaron Ballman LastOffset = Offset; 74*ed8f7882SAaron Ballman } 75*ed8f7882SAaron Ballman Result.finalize(); 76*ed8f7882SAaron Ballman return Result; 77*ed8f7882SAaron Ballman } 78*ed8f7882SAaron Ballman 79*ed8f7882SAaron Ballman TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) { 80*ed8f7882SAaron Ballman auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>(); 81*ed8f7882SAaron Ballman clang::IdentifierTable Identifiers(LangOpts); 82*ed8f7882SAaron Ballman TokenStream Result(CleanedStorage); 83*ed8f7882SAaron Ballman Result.addPayload(Code.getPayload()); 84*ed8f7882SAaron Ballman for (auto Tok : Code.tokens()) { 85*ed8f7882SAaron Ballman if (Tok.flag(LexFlags::NeedsCleaning)) { 86*ed8f7882SAaron Ballman // Remove escaped newlines and trigraphs. 87*ed8f7882SAaron Ballman llvm::SmallString<64> CleanBuffer; 88*ed8f7882SAaron Ballman const char *Pos = Tok.text().begin(); 89*ed8f7882SAaron Ballman while (Pos < Tok.text().end()) { 90*ed8f7882SAaron Ballman auto [Char, CharSize] = 91*ed8f7882SAaron Ballman clang::Lexer::getCharAndSizeNoWarn(Pos, LangOpts); 92*ed8f7882SAaron Ballman CleanBuffer.push_back(Char); 93*ed8f7882SAaron Ballman assert(CharSize != 0 && "no progress!"); 94*ed8f7882SAaron Ballman Pos += CharSize; 95*ed8f7882SAaron Ballman } 96*ed8f7882SAaron Ballman llvm::StringRef Text = CleanBuffer; 97*ed8f7882SAaron Ballman llvm::SmallString<64> UCNBuffer; 98*ed8f7882SAaron Ballman // A surface reading of the standard suggests UCNs might appear anywhere. 99*ed8f7882SAaron Ballman // But we need only decode them in raw_identifiers. 100*ed8f7882SAaron Ballman // - they cannot appear in punctuation/keyword tokens, because UCNs 101*ed8f7882SAaron Ballman // cannot encode basic characters outside of literals [lex.charset] 102*ed8f7882SAaron Ballman // - they can appear in literals, but we need not unescape them now. 103*ed8f7882SAaron Ballman // We treat them as escape sequences when evaluating the literal. 104*ed8f7882SAaron Ballman // - comments are handled similarly to literals 105*ed8f7882SAaron Ballman // This is good fortune, because expandUCNs requires its input to be a 106*ed8f7882SAaron Ballman // reasonably valid identifier (e.g. without stray backslashes). 107*ed8f7882SAaron Ballman if (Tok.Kind == tok::raw_identifier) { 108*ed8f7882SAaron Ballman clang::expandUCNs(UCNBuffer, CleanBuffer); 109*ed8f7882SAaron Ballman Text = UCNBuffer; 110*ed8f7882SAaron Ballman } 111*ed8f7882SAaron Ballman 112*ed8f7882SAaron Ballman Tok.Data = Text.copy(*CleanedStorage).data(); 113*ed8f7882SAaron Ballman Tok.Length = Text.size(); 114*ed8f7882SAaron Ballman Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning); 115*ed8f7882SAaron Ballman } 116*ed8f7882SAaron Ballman 117*ed8f7882SAaron Ballman if (Tok.Kind == tok::raw_identifier) { 118*ed8f7882SAaron Ballman // Cook raw_identifiers into identifier, keyword, etc. 119*ed8f7882SAaron Ballman Tok.Kind = Identifiers.get(Tok.text()).getTokenID(); 120*ed8f7882SAaron Ballman } else if (Tok.Kind == tok::greatergreater) { 121*ed8f7882SAaron Ballman // Split the greatergreater token. 122*ed8f7882SAaron Ballman // FIXME: split lessless token to support Cuda triple angle brackets <<<. 123*ed8f7882SAaron Ballman assert(Tok.text() == ">>"); 124*ed8f7882SAaron Ballman Tok.Kind = tok::greater; 125*ed8f7882SAaron Ballman Tok.Length = 1; 126*ed8f7882SAaron Ballman Result.push(Tok); 127*ed8f7882SAaron Ballman // Line is wrong if the first greater is followed by an escaped newline! 128*ed8f7882SAaron Ballman Tok.Data = Tok.text().data() + 1; 129*ed8f7882SAaron Ballman } 130*ed8f7882SAaron Ballman 131*ed8f7882SAaron Ballman Result.push(std::move(Tok)); 132*ed8f7882SAaron Ballman } 133*ed8f7882SAaron Ballman 134*ed8f7882SAaron Ballman Result.finalize(); 135*ed8f7882SAaron Ballman return Result; 136*ed8f7882SAaron Ballman } 137*ed8f7882SAaron Ballman 138*ed8f7882SAaron Ballman } // namespace clangd 139*ed8f7882SAaron Ballman } // namespace clang 140