1 //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "Token.h" 10 #include "clang/Basic/IdentifierTable.h" 11 #include "clang/Basic/SourceLocation.h" 12 #include "clang/Basic/TokenKinds.h" 13 #include "clang/Lex/Lexer.h" 14 #include "clang/Lex/LiteralSupport.h" 15 16 namespace clang { 17 namespace clangd { 18 19 TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) { 20 clang::SourceLocation Start; 21 // Tokenize using clang's lexer in raw mode. 22 // std::string guarantees null-termination, which the lexer needs. 23 clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(), 24 Code.data() + Code.size()); 25 Lexer.SetCommentRetentionState(true); 26 27 TokenStream Result; 28 clang::Token CT; 29 // Index into the token stream of original source code. 30 Token::Index TokenIndex = 0; 31 unsigned LastOffset = 0; 32 unsigned Line = 0; 33 unsigned Indent = 0; 34 for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof; 35 Lexer.LexFromRawLexer(CT)) { 36 unsigned Offset = 37 CT.getLocation().getRawEncoding() - Start.getRawEncoding(); 38 39 Token Tok; 40 Tok.Data = &Code[Offset]; 41 Tok.Length = CT.getLength(); 42 Tok.Kind = CT.getKind(); 43 44 // Update current line number and indentation from raw source code. 45 unsigned NewLineStart = 0; 46 for (unsigned I = LastOffset; I < Offset; ++I) { 47 if (Code[I] == '\n') { 48 NewLineStart = I + 1; 49 ++Line; 50 } 51 } 52 if (NewLineStart || !LastOffset) { 53 Indent = 0; 54 for (char C : StringRef(Code).slice(NewLineStart, Offset)) { 55 if (C == ' ') 56 ++Indent; 57 else if (C == '\t') 58 Indent += 8; 59 else 60 break; 61 } 62 } 63 Tok.Indent = Indent; 64 Tok.Line = Line; 65 66 if (CT.isAtStartOfLine()) 67 Tok.setFlag(LexFlags::StartsPPLine); 68 if (CT.needsCleaning() || CT.hasUCN()) 69 Tok.setFlag(LexFlags::NeedsCleaning); 70 71 Tok.OriginalIndex = TokenIndex++; 72 Result.push(Tok); 73 LastOffset = Offset; 74 } 75 Result.finalize(); 76 return Result; 77 } 78 79 TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) { 80 auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>(); 81 clang::IdentifierTable Identifiers(LangOpts); 82 TokenStream Result(CleanedStorage); 83 Result.addPayload(Code.getPayload()); 84 for (auto Tok : Code.tokens()) { 85 if (Tok.flag(LexFlags::NeedsCleaning)) { 86 // Remove escaped newlines and trigraphs. 87 llvm::SmallString<64> CleanBuffer; 88 const char *Pos = Tok.text().begin(); 89 while (Pos < Tok.text().end()) { 90 auto [Char, CharSize] = 91 clang::Lexer::getCharAndSizeNoWarn(Pos, LangOpts); 92 CleanBuffer.push_back(Char); 93 assert(CharSize != 0 && "no progress!"); 94 Pos += CharSize; 95 } 96 llvm::StringRef Text = CleanBuffer; 97 llvm::SmallString<64> UCNBuffer; 98 // A surface reading of the standard suggests UCNs might appear anywhere. 99 // But we need only decode them in raw_identifiers. 100 // - they cannot appear in punctuation/keyword tokens, because UCNs 101 // cannot encode basic characters outside of literals [lex.charset] 102 // - they can appear in literals, but we need not unescape them now. 103 // We treat them as escape sequences when evaluating the literal. 104 // - comments are handled similarly to literals 105 // This is good fortune, because expandUCNs requires its input to be a 106 // reasonably valid identifier (e.g. without stray backslashes). 107 if (Tok.Kind == tok::raw_identifier) { 108 clang::expandUCNs(UCNBuffer, CleanBuffer); 109 Text = UCNBuffer; 110 } 111 112 Tok.Data = Text.copy(*CleanedStorage).data(); 113 Tok.Length = Text.size(); 114 Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning); 115 } 116 117 if (Tok.Kind == tok::raw_identifier) { 118 // Cook raw_identifiers into identifier, keyword, etc. 119 Tok.Kind = Identifiers.get(Tok.text()).getTokenID(); 120 } else if (Tok.Kind == tok::greatergreater) { 121 // Split the greatergreater token. 122 // FIXME: split lessless token to support Cuda triple angle brackets <<<. 123 assert(Tok.text() == ">>"); 124 Tok.Kind = tok::greater; 125 Tok.Length = 1; 126 Result.push(Tok); 127 // Line is wrong if the first greater is followed by an escaped newline! 128 Tok.Data = Tok.text().data() + 1; 129 } 130 131 Result.push(std::move(Tok)); 132 } 133 134 Result.finalize(); 135 return Result; 136 } 137 138 } // namespace clangd 139 } // namespace clang 140