10b57cec5SDimitry Andric //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric // This file implements the Lexer and Token interfaces. 100b57cec5SDimitry Andric // 110b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 120b57cec5SDimitry Andric 130b57cec5SDimitry Andric #include "clang/Lex/Lexer.h" 140b57cec5SDimitry Andric #include "UnicodeCharSets.h" 150b57cec5SDimitry Andric #include "clang/Basic/CharInfo.h" 16e8d8bef9SDimitry Andric #include "clang/Basic/Diagnostic.h" 170b57cec5SDimitry Andric #include "clang/Basic/IdentifierTable.h" 18e8d8bef9SDimitry Andric #include "clang/Basic/LLVM.h" 190b57cec5SDimitry Andric #include "clang/Basic/LangOptions.h" 200b57cec5SDimitry Andric #include "clang/Basic/SourceLocation.h" 210b57cec5SDimitry Andric #include "clang/Basic/SourceManager.h" 220b57cec5SDimitry Andric #include "clang/Basic/TokenKinds.h" 230b57cec5SDimitry Andric #include "clang/Lex/LexDiagnostic.h" 240b57cec5SDimitry Andric #include "clang/Lex/LiteralSupport.h" 250b57cec5SDimitry Andric #include "clang/Lex/MultipleIncludeOpt.h" 260b57cec5SDimitry Andric #include "clang/Lex/Preprocessor.h" 270b57cec5SDimitry Andric #include "clang/Lex/PreprocessorOptions.h" 280b57cec5SDimitry Andric #include "clang/Lex/Token.h" 295ffd83dbSDimitry Andric #include "llvm/ADT/STLExtras.h" 300b57cec5SDimitry Andric #include "llvm/ADT/StringExtras.h" 310b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h" 
32e8d8bef9SDimitry Andric #include "llvm/ADT/StringSwitch.h" 330b57cec5SDimitry Andric #include "llvm/Support/Compiler.h" 340b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h" 350b57cec5SDimitry Andric #include "llvm/Support/MathExtras.h" 36e8d8bef9SDimitry Andric #include "llvm/Support/MemoryBufferRef.h" 370b57cec5SDimitry Andric #include "llvm/Support/NativeFormatting.h" 3881ad6265SDimitry Andric #include "llvm/Support/Unicode.h" 390b57cec5SDimitry Andric #include "llvm/Support/UnicodeCharRanges.h" 400b57cec5SDimitry Andric #include <algorithm> 410b57cec5SDimitry Andric #include <cassert> 420b57cec5SDimitry Andric #include <cstddef> 430b57cec5SDimitry Andric #include <cstdint> 440b57cec5SDimitry Andric #include <cstring> 45bdd1243dSDimitry Andric #include <optional> 460b57cec5SDimitry Andric #include <string> 470b57cec5SDimitry Andric #include <tuple> 480b57cec5SDimitry Andric #include <utility> 490b57cec5SDimitry Andric 505f757f3fSDimitry Andric #ifdef __SSE4_2__ 515f757f3fSDimitry Andric #include <nmmintrin.h> 525f757f3fSDimitry Andric #endif 535f757f3fSDimitry Andric 540b57cec5SDimitry Andric using namespace clang; 550b57cec5SDimitry Andric 560b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 570b57cec5SDimitry Andric // Token Class Implementation 580b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 590b57cec5SDimitry Andric 600b57cec5SDimitry Andric /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 
610b57cec5SDimitry Andric bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 620b57cec5SDimitry Andric if (isAnnotation()) 630b57cec5SDimitry Andric return false; 645f757f3fSDimitry Andric if (const IdentifierInfo *II = getIdentifierInfo()) 650b57cec5SDimitry Andric return II->getObjCKeywordID() == objcKey; 660b57cec5SDimitry Andric return false; 670b57cec5SDimitry Andric } 680b57cec5SDimitry Andric 690b57cec5SDimitry Andric /// getObjCKeywordID - Return the ObjC keyword kind. 700b57cec5SDimitry Andric tok::ObjCKeywordKind Token::getObjCKeywordID() const { 710b57cec5SDimitry Andric if (isAnnotation()) 720b57cec5SDimitry Andric return tok::objc_not_keyword; 735f757f3fSDimitry Andric const IdentifierInfo *specId = getIdentifierInfo(); 740b57cec5SDimitry Andric return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 750b57cec5SDimitry Andric } 760b57cec5SDimitry Andric 77*0fca6ea1SDimitry Andric /// Determine whether the token kind starts a simple-type-specifier. 
78*0fca6ea1SDimitry Andric bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const { 79*0fca6ea1SDimitry Andric switch (getKind()) { 80*0fca6ea1SDimitry Andric case tok::annot_typename: 81*0fca6ea1SDimitry Andric case tok::annot_decltype: 82*0fca6ea1SDimitry Andric case tok::annot_pack_indexing_type: 83*0fca6ea1SDimitry Andric return true; 84*0fca6ea1SDimitry Andric 85*0fca6ea1SDimitry Andric case tok::kw_short: 86*0fca6ea1SDimitry Andric case tok::kw_long: 87*0fca6ea1SDimitry Andric case tok::kw___int64: 88*0fca6ea1SDimitry Andric case tok::kw___int128: 89*0fca6ea1SDimitry Andric case tok::kw_signed: 90*0fca6ea1SDimitry Andric case tok::kw_unsigned: 91*0fca6ea1SDimitry Andric case tok::kw_void: 92*0fca6ea1SDimitry Andric case tok::kw_char: 93*0fca6ea1SDimitry Andric case tok::kw_int: 94*0fca6ea1SDimitry Andric case tok::kw_half: 95*0fca6ea1SDimitry Andric case tok::kw_float: 96*0fca6ea1SDimitry Andric case tok::kw_double: 97*0fca6ea1SDimitry Andric case tok::kw___bf16: 98*0fca6ea1SDimitry Andric case tok::kw__Float16: 99*0fca6ea1SDimitry Andric case tok::kw___float128: 100*0fca6ea1SDimitry Andric case tok::kw___ibm128: 101*0fca6ea1SDimitry Andric case tok::kw_wchar_t: 102*0fca6ea1SDimitry Andric case tok::kw_bool: 103*0fca6ea1SDimitry Andric case tok::kw__Bool: 104*0fca6ea1SDimitry Andric case tok::kw__Accum: 105*0fca6ea1SDimitry Andric case tok::kw__Fract: 106*0fca6ea1SDimitry Andric case tok::kw__Sat: 107*0fca6ea1SDimitry Andric #define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait: 108*0fca6ea1SDimitry Andric #include "clang/Basic/TransformTypeTraits.def" 109*0fca6ea1SDimitry Andric case tok::kw___auto_type: 110*0fca6ea1SDimitry Andric case tok::kw_char16_t: 111*0fca6ea1SDimitry Andric case tok::kw_char32_t: 112*0fca6ea1SDimitry Andric case tok::kw_typeof: 113*0fca6ea1SDimitry Andric case tok::kw_decltype: 114*0fca6ea1SDimitry Andric case tok::kw_char8_t: 115*0fca6ea1SDimitry Andric return getIdentifierInfo()->isKeyword(LangOpts); 
116*0fca6ea1SDimitry Andric 117*0fca6ea1SDimitry Andric default: 118*0fca6ea1SDimitry Andric return false; 119*0fca6ea1SDimitry Andric } 120*0fca6ea1SDimitry Andric } 121*0fca6ea1SDimitry Andric 1220b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 1230b57cec5SDimitry Andric // Lexer Class Implementation 1240b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 1250b57cec5SDimitry Andric 1260b57cec5SDimitry Andric void Lexer::anchor() {} 1270b57cec5SDimitry Andric 1280b57cec5SDimitry Andric void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 1290b57cec5SDimitry Andric const char *BufEnd) { 1300b57cec5SDimitry Andric BufferStart = BufStart; 1310b57cec5SDimitry Andric BufferPtr = BufPtr; 1320b57cec5SDimitry Andric BufferEnd = BufEnd; 1330b57cec5SDimitry Andric 1340b57cec5SDimitry Andric assert(BufEnd[0] == 0 && 1350b57cec5SDimitry Andric "We assume that the input buffer has a null character at the end" 1360b57cec5SDimitry Andric " to simplify lexing!"); 1370b57cec5SDimitry Andric 1380b57cec5SDimitry Andric // Check whether we have a BOM in the beginning of the buffer. If yes - act 1390b57cec5SDimitry Andric // accordingly. Right now we support only UTF-8 with and without BOM, so, just 1400b57cec5SDimitry Andric // skip the UTF-8 BOM if it's present. 1410b57cec5SDimitry Andric if (BufferStart == BufferPtr) { 1420b57cec5SDimitry Andric // Determine the size of the BOM. 1430b57cec5SDimitry Andric StringRef Buf(BufferStart, BufferEnd - BufferStart); 1440b57cec5SDimitry Andric size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 1450b57cec5SDimitry Andric .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 1460b57cec5SDimitry Andric .Default(0); 1470b57cec5SDimitry Andric 1480b57cec5SDimitry Andric // Skip the BOM. 
1490b57cec5SDimitry Andric BufferPtr += BOMLength; 1500b57cec5SDimitry Andric } 1510b57cec5SDimitry Andric 1520b57cec5SDimitry Andric Is_PragmaLexer = false; 1530b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 1540b57cec5SDimitry Andric 1550b57cec5SDimitry Andric // Start of the file is a start of line. 1560b57cec5SDimitry Andric IsAtStartOfLine = true; 1570b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 1580b57cec5SDimitry Andric 1590b57cec5SDimitry Andric HasLeadingSpace = false; 1600b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 1610b57cec5SDimitry Andric 1620b57cec5SDimitry Andric // We are not after parsing a #. 1630b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 1640b57cec5SDimitry Andric 1650b57cec5SDimitry Andric // We are not after parsing #include. 1660b57cec5SDimitry Andric ParsingFilename = false; 1670b57cec5SDimitry Andric 1680b57cec5SDimitry Andric // We are not in raw mode. Raw mode disables diagnostics and interpretation 1690b57cec5SDimitry Andric // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 1700b57cec5SDimitry Andric // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 1710b57cec5SDimitry Andric // or otherwise skipping over tokens. 1720b57cec5SDimitry Andric LexingRawMode = false; 1730b57cec5SDimitry Andric 1740b57cec5SDimitry Andric // Default to not keeping comments. 1750b57cec5SDimitry Andric ExtendedTokenMode = 0; 176e8d8bef9SDimitry Andric 177e8d8bef9SDimitry Andric NewLinePtr = nullptr; 1780b57cec5SDimitry Andric } 1790b57cec5SDimitry Andric 1800b57cec5SDimitry Andric /// Lexer constructor - Create a new lexer object for the specified buffer 1810b57cec5SDimitry Andric /// with the specified preprocessor managing the lexing process. This lexer 1820b57cec5SDimitry Andric /// assumes that the associated file buffer and Preprocessor objects will 1830b57cec5SDimitry Andric /// outlive it, so it doesn't take ownership of either of them. 
184e8d8bef9SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, 185349cc55cSDimitry Andric Preprocessor &PP, bool IsFirstIncludeOfFile) 1860b57cec5SDimitry Andric : PreprocessorLexer(&PP, FID), 1870b57cec5SDimitry Andric FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 18881ad6265SDimitry Andric LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment), 18981ad6265SDimitry Andric IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 190e8d8bef9SDimitry Andric InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(), 191e8d8bef9SDimitry Andric InputFile.getBufferEnd()); 1920b57cec5SDimitry Andric 1930b57cec5SDimitry Andric resetExtendedTokenMode(); 1940b57cec5SDimitry Andric } 1950b57cec5SDimitry Andric 1960b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 1970b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 1980b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 1990b57cec5SDimitry Andric Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 200349cc55cSDimitry Andric const char *BufStart, const char *BufPtr, const char *BufEnd, 201349cc55cSDimitry Andric bool IsFirstIncludeOfFile) 20281ad6265SDimitry Andric : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment), 203349cc55cSDimitry Andric IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 2040b57cec5SDimitry Andric InitLexer(BufStart, BufPtr, BufEnd); 2050b57cec5SDimitry Andric 2060b57cec5SDimitry Andric // We *are* in raw mode. 2070b57cec5SDimitry Andric LexingRawMode = true; 2080b57cec5SDimitry Andric } 2090b57cec5SDimitry Andric 2100b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 2110b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. 
This lexer assumes that the text 2120b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 213e8d8bef9SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile, 214349cc55cSDimitry Andric const SourceManager &SM, const LangOptions &langOpts, 215349cc55cSDimitry Andric bool IsFirstIncludeOfFile) 216e8d8bef9SDimitry Andric : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(), 217349cc55cSDimitry Andric FromFile.getBufferStart(), FromFile.getBufferEnd(), 218349cc55cSDimitry Andric IsFirstIncludeOfFile) {} 2190b57cec5SDimitry Andric 2200b57cec5SDimitry Andric void Lexer::resetExtendedTokenMode() { 2210b57cec5SDimitry Andric assert(PP && "Cannot reset token mode without a preprocessor"); 2220b57cec5SDimitry Andric if (LangOpts.TraditionalCPP) 2230b57cec5SDimitry Andric SetKeepWhitespaceMode(true); 2240b57cec5SDimitry Andric else 2250b57cec5SDimitry Andric SetCommentRetentionState(PP->getCommentRetentionState()); 2260b57cec5SDimitry Andric } 2270b57cec5SDimitry Andric 2280b57cec5SDimitry Andric /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 2290b57cec5SDimitry Andric /// _Pragma expansion. This has a variety of magic semantics that this method 2300b57cec5SDimitry Andric /// sets up. It returns a new'd Lexer that must be delete'd when done. 2310b57cec5SDimitry Andric /// 2320b57cec5SDimitry Andric /// On entrance to this routine, TokStartLoc is a macro location which has a 2330b57cec5SDimitry Andric /// spelling loc that indicates the bytes to be lexed for the token and an 2340b57cec5SDimitry Andric /// expansion location that indicates where all lexed tokens should be 2350b57cec5SDimitry Andric /// "expanded from". 2360b57cec5SDimitry Andric /// 2370b57cec5SDimitry Andric /// TODO: It would really be nice to make _Pragma just be a wrapper around a 2380b57cec5SDimitry Andric /// normal lexer that remaps tokens as they fly by. 
This would require making 2390b57cec5SDimitry Andric /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 2400b57cec5SDimitry Andric /// interface that could handle this stuff. This would pull GetMappedTokenLoc 2410b57cec5SDimitry Andric /// out of the critical path of the lexer! 2420b57cec5SDimitry Andric /// 2430b57cec5SDimitry Andric Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 2440b57cec5SDimitry Andric SourceLocation ExpansionLocStart, 2450b57cec5SDimitry Andric SourceLocation ExpansionLocEnd, 2460b57cec5SDimitry Andric unsigned TokLen, Preprocessor &PP) { 2470b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 2480b57cec5SDimitry Andric 2490b57cec5SDimitry Andric // Create the lexer as if we were going to lex the file normally. 2500b57cec5SDimitry Andric FileID SpellingFID = SM.getFileID(SpellingLoc); 251e8d8bef9SDimitry Andric llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID); 2520b57cec5SDimitry Andric Lexer *L = new Lexer(SpellingFID, InputFile, PP); 2530b57cec5SDimitry Andric 2540b57cec5SDimitry Andric // Now that the lexer is created, change the start/end locations so that we 2550b57cec5SDimitry Andric // just lex the subsection of the file that we want. This is lexing from a 2560b57cec5SDimitry Andric // scratch buffer. 2570b57cec5SDimitry Andric const char *StrData = SM.getCharacterData(SpellingLoc); 2580b57cec5SDimitry Andric 2590b57cec5SDimitry Andric L->BufferPtr = StrData; 2600b57cec5SDimitry Andric L->BufferEnd = StrData+TokLen; 2610b57cec5SDimitry Andric assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 2620b57cec5SDimitry Andric 2630b57cec5SDimitry Andric // Set the SourceLocation with the remapping information. This ensures that 2640b57cec5SDimitry Andric // GetMappedTokenLoc will remap the tokens as they are lexed. 
2650b57cec5SDimitry Andric L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 2660b57cec5SDimitry Andric ExpansionLocStart, 2670b57cec5SDimitry Andric ExpansionLocEnd, TokLen); 2680b57cec5SDimitry Andric 2690b57cec5SDimitry Andric // Ensure that the lexer thinks it is inside a directive, so that end \n will 2700b57cec5SDimitry Andric // return an EOD token. 2710b57cec5SDimitry Andric L->ParsingPreprocessorDirective = true; 2720b57cec5SDimitry Andric 2730b57cec5SDimitry Andric // This lexer really is for _Pragma. 2740b57cec5SDimitry Andric L->Is_PragmaLexer = true; 2750b57cec5SDimitry Andric return L; 2760b57cec5SDimitry Andric } 2770b57cec5SDimitry Andric 27881ad6265SDimitry Andric void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) { 27981ad6265SDimitry Andric this->IsAtPhysicalStartOfLine = IsAtStartOfLine; 28081ad6265SDimitry Andric this->IsAtStartOfLine = IsAtStartOfLine; 28181ad6265SDimitry Andric assert((BufferStart + Offset) <= BufferEnd); 28281ad6265SDimitry Andric BufferPtr = BufferStart + Offset; 283a7dea167SDimitry Andric } 284a7dea167SDimitry Andric 2850b57cec5SDimitry Andric template <typename T> static void StringifyImpl(T &Str, char Quote) { 2860b57cec5SDimitry Andric typename T::size_type i = 0, e = Str.size(); 2870b57cec5SDimitry Andric while (i < e) { 2880b57cec5SDimitry Andric if (Str[i] == '\\' || Str[i] == Quote) { 2890b57cec5SDimitry Andric Str.insert(Str.begin() + i, '\\'); 2900b57cec5SDimitry Andric i += 2; 2910b57cec5SDimitry Andric ++e; 2920b57cec5SDimitry Andric } else if (Str[i] == '\n' || Str[i] == '\r') { 2930b57cec5SDimitry Andric // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 
2940b57cec5SDimitry Andric if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 2950b57cec5SDimitry Andric Str[i] != Str[i + 1]) { 2960b57cec5SDimitry Andric Str[i] = '\\'; 2970b57cec5SDimitry Andric Str[i + 1] = 'n'; 2980b57cec5SDimitry Andric } else { 2990b57cec5SDimitry Andric // Replace '\n' and '\r' to '\\' followed by 'n'. 3000b57cec5SDimitry Andric Str[i] = '\\'; 3010b57cec5SDimitry Andric Str.insert(Str.begin() + i + 1, 'n'); 3020b57cec5SDimitry Andric ++e; 3030b57cec5SDimitry Andric } 3040b57cec5SDimitry Andric i += 2; 3050b57cec5SDimitry Andric } else 3060b57cec5SDimitry Andric ++i; 3070b57cec5SDimitry Andric } 3080b57cec5SDimitry Andric } 3090b57cec5SDimitry Andric 3100b57cec5SDimitry Andric std::string Lexer::Stringify(StringRef Str, bool Charify) { 3115ffd83dbSDimitry Andric std::string Result = std::string(Str); 3120b57cec5SDimitry Andric char Quote = Charify ? '\'' : '"'; 3130b57cec5SDimitry Andric StringifyImpl(Result, Quote); 3140b57cec5SDimitry Andric return Result; 3150b57cec5SDimitry Andric } 3160b57cec5SDimitry Andric 3170b57cec5SDimitry Andric void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 3180b57cec5SDimitry Andric 3190b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 3200b57cec5SDimitry Andric // Token Spelling 3210b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 3220b57cec5SDimitry Andric 3230b57cec5SDimitry Andric /// Slow case of getSpelling. Extract the characters comprising the 3240b57cec5SDimitry Andric /// spelling of this token from the provided input buffer. 
3250b57cec5SDimitry Andric static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 3260b57cec5SDimitry Andric const LangOptions &LangOpts, char *Spelling) { 3270b57cec5SDimitry Andric assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 3280b57cec5SDimitry Andric 3290b57cec5SDimitry Andric size_t Length = 0; 3300b57cec5SDimitry Andric const char *BufEnd = BufPtr + Tok.getLength(); 3310b57cec5SDimitry Andric 3320b57cec5SDimitry Andric if (tok::isStringLiteral(Tok.getKind())) { 3330b57cec5SDimitry Andric // Munch the encoding-prefix and opening double-quote. 3340b57cec5SDimitry Andric while (BufPtr < BufEnd) { 3355f757f3fSDimitry Andric auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts); 3365f757f3fSDimitry Andric Spelling[Length++] = CharAndSize.Char; 3375f757f3fSDimitry Andric BufPtr += CharAndSize.Size; 3380b57cec5SDimitry Andric 3390b57cec5SDimitry Andric if (Spelling[Length - 1] == '"') 3400b57cec5SDimitry Andric break; 3410b57cec5SDimitry Andric } 3420b57cec5SDimitry Andric 3430b57cec5SDimitry Andric // Raw string literals need special handling; trigraph expansion and line 3440b57cec5SDimitry Andric // splicing do not occur within their d-char-sequence nor within their 3450b57cec5SDimitry Andric // r-char-sequence. 3460b57cec5SDimitry Andric if (Length >= 2 && 3470b57cec5SDimitry Andric Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 3480b57cec5SDimitry Andric // Search backwards from the end of the token to find the matching closing 3490b57cec5SDimitry Andric // quote. 3500b57cec5SDimitry Andric const char *RawEnd = BufEnd; 3510b57cec5SDimitry Andric do --RawEnd; while (*RawEnd != '"'); 3520b57cec5SDimitry Andric size_t RawLength = RawEnd - BufPtr + 1; 3530b57cec5SDimitry Andric 3540b57cec5SDimitry Andric // Everything between the quotes is included verbatim in the spelling. 
3550b57cec5SDimitry Andric memcpy(Spelling + Length, BufPtr, RawLength); 3560b57cec5SDimitry Andric Length += RawLength; 3570b57cec5SDimitry Andric BufPtr += RawLength; 3580b57cec5SDimitry Andric 3590b57cec5SDimitry Andric // The rest of the token is lexed normally. 3600b57cec5SDimitry Andric } 3610b57cec5SDimitry Andric } 3620b57cec5SDimitry Andric 3630b57cec5SDimitry Andric while (BufPtr < BufEnd) { 3645f757f3fSDimitry Andric auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts); 3655f757f3fSDimitry Andric Spelling[Length++] = CharAndSize.Char; 3665f757f3fSDimitry Andric BufPtr += CharAndSize.Size; 3670b57cec5SDimitry Andric } 3680b57cec5SDimitry Andric 3690b57cec5SDimitry Andric assert(Length < Tok.getLength() && 3700b57cec5SDimitry Andric "NeedsCleaning flag set on token that didn't need cleaning!"); 3710b57cec5SDimitry Andric return Length; 3720b57cec5SDimitry Andric } 3730b57cec5SDimitry Andric 3740b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 3750b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 3760b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 3770b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 3780b57cec5SDimitry Andric /// UCNs, etc. 3790b57cec5SDimitry Andric StringRef Lexer::getSpelling(SourceLocation loc, 3800b57cec5SDimitry Andric SmallVectorImpl<char> &buffer, 3810b57cec5SDimitry Andric const SourceManager &SM, 3820b57cec5SDimitry Andric const LangOptions &options, 3830b57cec5SDimitry Andric bool *invalid) { 3840b57cec5SDimitry Andric // Break down the source location. 3850b57cec5SDimitry Andric std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 3860b57cec5SDimitry Andric 3870b57cec5SDimitry Andric // Try to the load the file buffer. 
3880b57cec5SDimitry Andric bool invalidTemp = false; 3890b57cec5SDimitry Andric StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 3900b57cec5SDimitry Andric if (invalidTemp) { 3910b57cec5SDimitry Andric if (invalid) *invalid = true; 3920b57cec5SDimitry Andric return {}; 3930b57cec5SDimitry Andric } 3940b57cec5SDimitry Andric 3950b57cec5SDimitry Andric const char *tokenBegin = file.data() + locInfo.second; 3960b57cec5SDimitry Andric 3970b57cec5SDimitry Andric // Lex from the start of the given location. 3980b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 3990b57cec5SDimitry Andric file.begin(), tokenBegin, file.end()); 4000b57cec5SDimitry Andric Token token; 4010b57cec5SDimitry Andric lexer.LexFromRawLexer(token); 4020b57cec5SDimitry Andric 4030b57cec5SDimitry Andric unsigned length = token.getLength(); 4040b57cec5SDimitry Andric 4050b57cec5SDimitry Andric // Common case: no need for cleaning. 4060b57cec5SDimitry Andric if (!token.needsCleaning()) 4070b57cec5SDimitry Andric return StringRef(tokenBegin, length); 4080b57cec5SDimitry Andric 4090b57cec5SDimitry Andric // Hard case, we need to relex the characters into the string. 4100b57cec5SDimitry Andric buffer.resize(length); 4110b57cec5SDimitry Andric buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 4120b57cec5SDimitry Andric return StringRef(buffer.data(), buffer.size()); 4130b57cec5SDimitry Andric } 4140b57cec5SDimitry Andric 4150b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 4160b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 4170b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 4180b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 4190b57cec5SDimitry Andric /// UCNs, etc. 
4200b57cec5SDimitry Andric std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 4210b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 4220b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 4230b57cec5SDimitry Andric 4240b57cec5SDimitry Andric bool CharDataInvalid = false; 4250b57cec5SDimitry Andric const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 4260b57cec5SDimitry Andric &CharDataInvalid); 4270b57cec5SDimitry Andric if (Invalid) 4280b57cec5SDimitry Andric *Invalid = CharDataInvalid; 4290b57cec5SDimitry Andric if (CharDataInvalid) 4300b57cec5SDimitry Andric return {}; 4310b57cec5SDimitry Andric 4320b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 4330b57cec5SDimitry Andric if (!Tok.needsCleaning()) 4340b57cec5SDimitry Andric return std::string(TokStart, TokStart + Tok.getLength()); 4350b57cec5SDimitry Andric 4360b57cec5SDimitry Andric std::string Result; 4370b57cec5SDimitry Andric Result.resize(Tok.getLength()); 4380b57cec5SDimitry Andric Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 4390b57cec5SDimitry Andric return Result; 4400b57cec5SDimitry Andric } 4410b57cec5SDimitry Andric 4420b57cec5SDimitry Andric /// getSpelling - This method is used to get the spelling of a token into a 4430b57cec5SDimitry Andric /// preallocated buffer, instead of as an std::string. The caller is required 4440b57cec5SDimitry Andric /// to allocate enough space for the token, which is guaranteed to be at least 4450b57cec5SDimitry Andric /// Tok.getLength() bytes long. The actual length of the token is returned. 
4460b57cec5SDimitry Andric /// 4470b57cec5SDimitry Andric /// Note that this method may do two possible things: it may either fill in 4480b57cec5SDimitry Andric /// the buffer specified with characters, or it may *change the input pointer* 4490b57cec5SDimitry Andric /// to point to a constant buffer with the data already in it (avoiding a 4500b57cec5SDimitry Andric /// copy). The caller is not allowed to modify the returned buffer pointer 4510b57cec5SDimitry Andric /// if an internal buffer is returned. 4520b57cec5SDimitry Andric unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 4530b57cec5SDimitry Andric const SourceManager &SourceMgr, 4540b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 4550b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 4560b57cec5SDimitry Andric 4570b57cec5SDimitry Andric const char *TokStart = nullptr; 4580b57cec5SDimitry Andric // NOTE: this has to be checked *before* testing for an IdentifierInfo. 4590b57cec5SDimitry Andric if (Tok.is(tok::raw_identifier)) 4600b57cec5SDimitry Andric TokStart = Tok.getRawIdentifier().data(); 4610b57cec5SDimitry Andric else if (!Tok.hasUCN()) { 4620b57cec5SDimitry Andric if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 4630b57cec5SDimitry Andric // Just return the string from the identifier table, which is very quick. 4640b57cec5SDimitry Andric Buffer = II->getNameStart(); 4650b57cec5SDimitry Andric return II->getLength(); 4660b57cec5SDimitry Andric } 4670b57cec5SDimitry Andric } 4680b57cec5SDimitry Andric 4690b57cec5SDimitry Andric // NOTE: this can be checked even after testing for an IdentifierInfo. 4700b57cec5SDimitry Andric if (Tok.isLiteral()) 4710b57cec5SDimitry Andric TokStart = Tok.getLiteralData(); 4720b57cec5SDimitry Andric 4730b57cec5SDimitry Andric if (!TokStart) { 4740b57cec5SDimitry Andric // Compute the start of the token in the input lexer buffer. 
4750b57cec5SDimitry Andric bool CharDataInvalid = false; 4760b57cec5SDimitry Andric TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 4770b57cec5SDimitry Andric if (Invalid) 4780b57cec5SDimitry Andric *Invalid = CharDataInvalid; 4790b57cec5SDimitry Andric if (CharDataInvalid) { 4800b57cec5SDimitry Andric Buffer = ""; 4810b57cec5SDimitry Andric return 0; 4820b57cec5SDimitry Andric } 4830b57cec5SDimitry Andric } 4840b57cec5SDimitry Andric 4850b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 4860b57cec5SDimitry Andric if (!Tok.needsCleaning()) { 4870b57cec5SDimitry Andric Buffer = TokStart; 4880b57cec5SDimitry Andric return Tok.getLength(); 4890b57cec5SDimitry Andric } 4900b57cec5SDimitry Andric 4910b57cec5SDimitry Andric // Otherwise, hard case, relex the characters into the string. 4920b57cec5SDimitry Andric return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 4930b57cec5SDimitry Andric } 4940b57cec5SDimitry Andric 4950b57cec5SDimitry Andric /// MeasureTokenLength - Relex the token at the specified location and return 4960b57cec5SDimitry Andric /// its length in bytes in the input file. If the token needs cleaning (e.g. 4970b57cec5SDimitry Andric /// includes a trigraph or an escaped newline) then this count includes bytes 4980b57cec5SDimitry Andric /// that are part of that. 4990b57cec5SDimitry Andric unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 5000b57cec5SDimitry Andric const SourceManager &SM, 5010b57cec5SDimitry Andric const LangOptions &LangOpts) { 5020b57cec5SDimitry Andric Token TheTok; 5030b57cec5SDimitry Andric if (getRawToken(Loc, TheTok, SM, LangOpts)) 5040b57cec5SDimitry Andric return 0; 5050b57cec5SDimitry Andric return TheTok.getLength(); 5060b57cec5SDimitry Andric } 5070b57cec5SDimitry Andric 5080b57cec5SDimitry Andric /// Relex the token at the specified location. 5090b57cec5SDimitry Andric /// \returns true if there was a failure, false on success. 
5100b57cec5SDimitry Andric bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 5110b57cec5SDimitry Andric const SourceManager &SM, 5120b57cec5SDimitry Andric const LangOptions &LangOpts, 5130b57cec5SDimitry Andric bool IgnoreWhiteSpace) { 5140b57cec5SDimitry Andric // TODO: this could be special cased for common tokens like identifiers, ')', 5150b57cec5SDimitry Andric // etc to make this faster, if it mattered. Just look at StrData[0] to handle 5160b57cec5SDimitry Andric // all obviously single-char tokens. This could use 5170b57cec5SDimitry Andric // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 5180b57cec5SDimitry Andric // something. 5190b57cec5SDimitry Andric 5200b57cec5SDimitry Andric // If this comes from a macro expansion, we really do want the macro name, not 5210b57cec5SDimitry Andric // the token this macro expanded to. 5220b57cec5SDimitry Andric Loc = SM.getExpansionLoc(Loc); 5230b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 5240b57cec5SDimitry Andric bool Invalid = false; 5250b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 5260b57cec5SDimitry Andric if (Invalid) 5270b57cec5SDimitry Andric return true; 5280b57cec5SDimitry Andric 5290b57cec5SDimitry Andric const char *StrData = Buffer.data()+LocInfo.second; 5300b57cec5SDimitry Andric 5310b57cec5SDimitry Andric if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 5320b57cec5SDimitry Andric return true; 5330b57cec5SDimitry Andric 5340b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 
5350b57cec5SDimitry Andric Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 5360b57cec5SDimitry Andric Buffer.begin(), StrData, Buffer.end()); 5370b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 5380b57cec5SDimitry Andric TheLexer.LexFromRawLexer(Result); 5390b57cec5SDimitry Andric return false; 5400b57cec5SDimitry Andric } 5410b57cec5SDimitry Andric 5420b57cec5SDimitry Andric /// Returns the pointer that points to the beginning of line that contains 5430b57cec5SDimitry Andric /// the given offset, or null if the offset if invalid. 5440b57cec5SDimitry Andric static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { 5450b57cec5SDimitry Andric const char *BufStart = Buffer.data(); 5460b57cec5SDimitry Andric if (Offset >= Buffer.size()) 5470b57cec5SDimitry Andric return nullptr; 5480b57cec5SDimitry Andric 5490b57cec5SDimitry Andric const char *LexStart = BufStart + Offset; 5500b57cec5SDimitry Andric for (; LexStart != BufStart; --LexStart) { 5510b57cec5SDimitry Andric if (isVerticalWhitespace(LexStart[0]) && 5520b57cec5SDimitry Andric !Lexer::isNewLineEscaped(BufStart, LexStart)) { 5530b57cec5SDimitry Andric // LexStart should point at first character of logical line. 
5540b57cec5SDimitry Andric ++LexStart; 5550b57cec5SDimitry Andric break; 5560b57cec5SDimitry Andric } 5570b57cec5SDimitry Andric } 5580b57cec5SDimitry Andric return LexStart; 5590b57cec5SDimitry Andric } 5600b57cec5SDimitry Andric 5610b57cec5SDimitry Andric static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 5620b57cec5SDimitry Andric const SourceManager &SM, 5630b57cec5SDimitry Andric const LangOptions &LangOpts) { 5640b57cec5SDimitry Andric assert(Loc.isFileID()); 5650b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 5660b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 5670b57cec5SDimitry Andric return Loc; 5680b57cec5SDimitry Andric 5690b57cec5SDimitry Andric bool Invalid = false; 5700b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 5710b57cec5SDimitry Andric if (Invalid) 5720b57cec5SDimitry Andric return Loc; 5730b57cec5SDimitry Andric 5740b57cec5SDimitry Andric // Back up from the current location until we hit the beginning of a line 5750b57cec5SDimitry Andric // (or the buffer). We'll relex from that point. 5760b57cec5SDimitry Andric const char *StrData = Buffer.data() + LocInfo.second; 5770b57cec5SDimitry Andric const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); 5780b57cec5SDimitry Andric if (!LexStart || LexStart == StrData) 5790b57cec5SDimitry Andric return Loc; 5800b57cec5SDimitry Andric 5810b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 5820b57cec5SDimitry Andric SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 5830b57cec5SDimitry Andric Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, 5840b57cec5SDimitry Andric Buffer.end()); 5850b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 5860b57cec5SDimitry Andric 5870b57cec5SDimitry Andric // Lex tokens until we find the token that contains the source location. 
5880b57cec5SDimitry Andric Token TheTok; 5890b57cec5SDimitry Andric do { 5900b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 5910b57cec5SDimitry Andric 5920b57cec5SDimitry Andric if (TheLexer.getBufferLocation() > StrData) { 5930b57cec5SDimitry Andric // Lexing this token has taken the lexer past the source location we're 5940b57cec5SDimitry Andric // looking for. If the current token encompasses our source location, 5950b57cec5SDimitry Andric // return the beginning of that token. 5960b57cec5SDimitry Andric if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 5970b57cec5SDimitry Andric return TheTok.getLocation(); 5980b57cec5SDimitry Andric 5990b57cec5SDimitry Andric // We ended up skipping over the source location entirely, which means 6000b57cec5SDimitry Andric // that it points into whitespace. We're done here. 6010b57cec5SDimitry Andric break; 6020b57cec5SDimitry Andric } 6030b57cec5SDimitry Andric } while (TheTok.getKind() != tok::eof); 6040b57cec5SDimitry Andric 6050b57cec5SDimitry Andric // We've passed our source location; just return the original source location. 
6060b57cec5SDimitry Andric return Loc; 6070b57cec5SDimitry Andric } 6080b57cec5SDimitry Andric 6090b57cec5SDimitry Andric SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 6100b57cec5SDimitry Andric const SourceManager &SM, 6110b57cec5SDimitry Andric const LangOptions &LangOpts) { 6120b57cec5SDimitry Andric if (Loc.isFileID()) 6130b57cec5SDimitry Andric return getBeginningOfFileToken(Loc, SM, LangOpts); 6140b57cec5SDimitry Andric 6150b57cec5SDimitry Andric if (!SM.isMacroArgExpansion(Loc)) 6160b57cec5SDimitry Andric return Loc; 6170b57cec5SDimitry Andric 6180b57cec5SDimitry Andric SourceLocation FileLoc = SM.getSpellingLoc(Loc); 6190b57cec5SDimitry Andric SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 6200b57cec5SDimitry Andric std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 6210b57cec5SDimitry Andric std::pair<FileID, unsigned> BeginFileLocInfo = 6220b57cec5SDimitry Andric SM.getDecomposedLoc(BeginFileLoc); 6230b57cec5SDimitry Andric assert(FileLocInfo.first == BeginFileLocInfo.first && 6240b57cec5SDimitry Andric FileLocInfo.second >= BeginFileLocInfo.second); 6250b57cec5SDimitry Andric return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 6260b57cec5SDimitry Andric } 6270b57cec5SDimitry Andric 6280b57cec5SDimitry Andric namespace { 6290b57cec5SDimitry Andric 6300b57cec5SDimitry Andric enum PreambleDirectiveKind { 6310b57cec5SDimitry Andric PDK_Skipped, 6320b57cec5SDimitry Andric PDK_Unknown 6330b57cec5SDimitry Andric }; 6340b57cec5SDimitry Andric 6350b57cec5SDimitry Andric } // namespace 6360b57cec5SDimitry Andric 6370b57cec5SDimitry Andric PreambleBounds Lexer::ComputePreamble(StringRef Buffer, 6380b57cec5SDimitry Andric const LangOptions &LangOpts, 6390b57cec5SDimitry Andric unsigned MaxLines) { 6400b57cec5SDimitry Andric // Create a lexer starting at the beginning of the file. 
Note that we use a 6410b57cec5SDimitry Andric // "fake" file source location at offset 1 so that the lexer will track our 6420b57cec5SDimitry Andric // position within the file. 643fe6060f1SDimitry Andric const SourceLocation::UIntTy StartOffset = 1; 6440b57cec5SDimitry Andric SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); 6450b57cec5SDimitry Andric Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(), 6460b57cec5SDimitry Andric Buffer.end()); 6470b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 6480b57cec5SDimitry Andric 6490b57cec5SDimitry Andric bool InPreprocessorDirective = false; 6500b57cec5SDimitry Andric Token TheTok; 6510b57cec5SDimitry Andric SourceLocation ActiveCommentLoc; 6520b57cec5SDimitry Andric 6530b57cec5SDimitry Andric unsigned MaxLineOffset = 0; 6540b57cec5SDimitry Andric if (MaxLines) { 6550b57cec5SDimitry Andric const char *CurPtr = Buffer.begin(); 6560b57cec5SDimitry Andric unsigned CurLine = 0; 6570b57cec5SDimitry Andric while (CurPtr != Buffer.end()) { 6580b57cec5SDimitry Andric char ch = *CurPtr++; 6590b57cec5SDimitry Andric if (ch == '\n') { 6600b57cec5SDimitry Andric ++CurLine; 6610b57cec5SDimitry Andric if (CurLine == MaxLines) 6620b57cec5SDimitry Andric break; 6630b57cec5SDimitry Andric } 6640b57cec5SDimitry Andric } 6650b57cec5SDimitry Andric if (CurPtr != Buffer.end()) 6660b57cec5SDimitry Andric MaxLineOffset = CurPtr - Buffer.begin(); 6670b57cec5SDimitry Andric } 6680b57cec5SDimitry Andric 6690b57cec5SDimitry Andric do { 6700b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 6710b57cec5SDimitry Andric 6720b57cec5SDimitry Andric if (InPreprocessorDirective) { 6730b57cec5SDimitry Andric // If we've hit the end of the file, we're done. 
6740b57cec5SDimitry Andric if (TheTok.getKind() == tok::eof) { 6750b57cec5SDimitry Andric break; 6760b57cec5SDimitry Andric } 6770b57cec5SDimitry Andric 6780b57cec5SDimitry Andric // If we haven't hit the end of the preprocessor directive, skip this 6790b57cec5SDimitry Andric // token. 6800b57cec5SDimitry Andric if (!TheTok.isAtStartOfLine()) 6810b57cec5SDimitry Andric continue; 6820b57cec5SDimitry Andric 6830b57cec5SDimitry Andric // We've passed the end of the preprocessor directive, and will look 6840b57cec5SDimitry Andric // at this token again below. 6850b57cec5SDimitry Andric InPreprocessorDirective = false; 6860b57cec5SDimitry Andric } 6870b57cec5SDimitry Andric 6880b57cec5SDimitry Andric // Keep track of the # of lines in the preamble. 6890b57cec5SDimitry Andric if (TheTok.isAtStartOfLine()) { 6900b57cec5SDimitry Andric unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 6910b57cec5SDimitry Andric 6920b57cec5SDimitry Andric // If we were asked to limit the number of lines in the preamble, 6930b57cec5SDimitry Andric // and we're about to exceed that limit, we're done. 6940b57cec5SDimitry Andric if (MaxLineOffset && TokOffset >= MaxLineOffset) 6950b57cec5SDimitry Andric break; 6960b57cec5SDimitry Andric } 6970b57cec5SDimitry Andric 6980b57cec5SDimitry Andric // Comments are okay; skip over them. 6990b57cec5SDimitry Andric if (TheTok.getKind() == tok::comment) { 7000b57cec5SDimitry Andric if (ActiveCommentLoc.isInvalid()) 7010b57cec5SDimitry Andric ActiveCommentLoc = TheTok.getLocation(); 7020b57cec5SDimitry Andric continue; 7030b57cec5SDimitry Andric } 7040b57cec5SDimitry Andric 7050b57cec5SDimitry Andric if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 7060b57cec5SDimitry Andric // This is the start of a preprocessor directive. 
7070b57cec5SDimitry Andric Token HashTok = TheTok; 7080b57cec5SDimitry Andric InPreprocessorDirective = true; 7090b57cec5SDimitry Andric ActiveCommentLoc = SourceLocation(); 7100b57cec5SDimitry Andric 7110b57cec5SDimitry Andric // Figure out which directive this is. Since we're lexing raw tokens, 7120b57cec5SDimitry Andric // we don't have an identifier table available. Instead, just look at 7130b57cec5SDimitry Andric // the raw identifier to recognize and categorize preprocessor directives. 7140b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 7150b57cec5SDimitry Andric if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 7160b57cec5SDimitry Andric StringRef Keyword = TheTok.getRawIdentifier(); 7170b57cec5SDimitry Andric PreambleDirectiveKind PDK 7180b57cec5SDimitry Andric = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 7190b57cec5SDimitry Andric .Case("include", PDK_Skipped) 7200b57cec5SDimitry Andric .Case("__include_macros", PDK_Skipped) 7210b57cec5SDimitry Andric .Case("define", PDK_Skipped) 7220b57cec5SDimitry Andric .Case("undef", PDK_Skipped) 7230b57cec5SDimitry Andric .Case("line", PDK_Skipped) 7240b57cec5SDimitry Andric .Case("error", PDK_Skipped) 7250b57cec5SDimitry Andric .Case("pragma", PDK_Skipped) 7260b57cec5SDimitry Andric .Case("import", PDK_Skipped) 7270b57cec5SDimitry Andric .Case("include_next", PDK_Skipped) 7280b57cec5SDimitry Andric .Case("warning", PDK_Skipped) 7290b57cec5SDimitry Andric .Case("ident", PDK_Skipped) 7300b57cec5SDimitry Andric .Case("sccs", PDK_Skipped) 7310b57cec5SDimitry Andric .Case("assert", PDK_Skipped) 7320b57cec5SDimitry Andric .Case("unassert", PDK_Skipped) 7330b57cec5SDimitry Andric .Case("if", PDK_Skipped) 7340b57cec5SDimitry Andric .Case("ifdef", PDK_Skipped) 7350b57cec5SDimitry Andric .Case("ifndef", PDK_Skipped) 7360b57cec5SDimitry Andric .Case("elif", PDK_Skipped) 737fe6060f1SDimitry Andric .Case("elifdef", PDK_Skipped) 738fe6060f1SDimitry Andric .Case("elifndef", PDK_Skipped) 
7390b57cec5SDimitry Andric .Case("else", PDK_Skipped) 7400b57cec5SDimitry Andric .Case("endif", PDK_Skipped) 7410b57cec5SDimitry Andric .Default(PDK_Unknown); 7420b57cec5SDimitry Andric 7430b57cec5SDimitry Andric switch (PDK) { 7440b57cec5SDimitry Andric case PDK_Skipped: 7450b57cec5SDimitry Andric continue; 7460b57cec5SDimitry Andric 7470b57cec5SDimitry Andric case PDK_Unknown: 7480b57cec5SDimitry Andric // We don't know what this directive is; stop at the '#'. 7490b57cec5SDimitry Andric break; 7500b57cec5SDimitry Andric } 7510b57cec5SDimitry Andric } 7520b57cec5SDimitry Andric 7530b57cec5SDimitry Andric // We only end up here if we didn't recognize the preprocessor 7540b57cec5SDimitry Andric // directive or it was one that can't occur in the preamble at this 7550b57cec5SDimitry Andric // point. Roll back the current token to the location of the '#'. 7560b57cec5SDimitry Andric TheTok = HashTok; 7575f757f3fSDimitry Andric } else if (TheTok.isAtStartOfLine() && 7585f757f3fSDimitry Andric TheTok.getKind() == tok::raw_identifier && 7595f757f3fSDimitry Andric TheTok.getRawIdentifier() == "module" && 7605f757f3fSDimitry Andric LangOpts.CPlusPlusModules) { 7615f757f3fSDimitry Andric // The initial global module fragment introducer "module;" is part of 7625f757f3fSDimitry Andric // the preamble, which runs up to the module declaration "module foo;". 7635f757f3fSDimitry Andric Token ModuleTok = TheTok; 7645f757f3fSDimitry Andric do { 7655f757f3fSDimitry Andric TheLexer.LexFromRawLexer(TheTok); 7665f757f3fSDimitry Andric } while (TheTok.getKind() == tok::comment); 7675f757f3fSDimitry Andric if (TheTok.getKind() != tok::semi) { 7685f757f3fSDimitry Andric // Not global module fragment, roll back. 
7695f757f3fSDimitry Andric TheTok = ModuleTok; 7705f757f3fSDimitry Andric break; 7715f757f3fSDimitry Andric } 7725f757f3fSDimitry Andric continue; 7730b57cec5SDimitry Andric } 7740b57cec5SDimitry Andric 7750b57cec5SDimitry Andric // We hit a token that we don't recognize as being in the 7760b57cec5SDimitry Andric // "preprocessing only" part of the file, so we're no longer in 7770b57cec5SDimitry Andric // the preamble. 7780b57cec5SDimitry Andric break; 7790b57cec5SDimitry Andric } while (true); 7800b57cec5SDimitry Andric 7810b57cec5SDimitry Andric SourceLocation End; 7820b57cec5SDimitry Andric if (ActiveCommentLoc.isValid()) 7830b57cec5SDimitry Andric End = ActiveCommentLoc; // don't truncate a decl comment. 7840b57cec5SDimitry Andric else 7850b57cec5SDimitry Andric End = TheTok.getLocation(); 7860b57cec5SDimitry Andric 7870b57cec5SDimitry Andric return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), 7880b57cec5SDimitry Andric TheTok.isAtStartOfLine()); 7890b57cec5SDimitry Andric } 7900b57cec5SDimitry Andric 7910b57cec5SDimitry Andric unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, 7920b57cec5SDimitry Andric const SourceManager &SM, 7930b57cec5SDimitry Andric const LangOptions &LangOpts) { 7940b57cec5SDimitry Andric // Figure out how many physical characters away the specified expansion 7950b57cec5SDimitry Andric // character is. This needs to take into consideration newlines and 7960b57cec5SDimitry Andric // trigraphs. 7970b57cec5SDimitry Andric bool Invalid = false; 7980b57cec5SDimitry Andric const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 7990b57cec5SDimitry Andric 8000b57cec5SDimitry Andric // If they request the first char of the token, we're trivially done. 
8010b57cec5SDimitry Andric if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 8020b57cec5SDimitry Andric return 0; 8030b57cec5SDimitry Andric 8040b57cec5SDimitry Andric unsigned PhysOffset = 0; 8050b57cec5SDimitry Andric 8060b57cec5SDimitry Andric // The usual case is that tokens don't contain anything interesting. Skip 8070b57cec5SDimitry Andric // over the uninteresting characters. If a token only consists of simple 8080b57cec5SDimitry Andric // chars, this method is extremely fast. 8090b57cec5SDimitry Andric while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 8100b57cec5SDimitry Andric if (CharNo == 0) 8110b57cec5SDimitry Andric return PhysOffset; 8120b57cec5SDimitry Andric ++TokPtr; 8130b57cec5SDimitry Andric --CharNo; 8140b57cec5SDimitry Andric ++PhysOffset; 8150b57cec5SDimitry Andric } 8160b57cec5SDimitry Andric 8170b57cec5SDimitry Andric // If we have a character that may be a trigraph or escaped newline, use a 8180b57cec5SDimitry Andric // lexer to parse it correctly. 8190b57cec5SDimitry Andric for (; CharNo; --CharNo) { 8205f757f3fSDimitry Andric auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts); 8215f757f3fSDimitry Andric TokPtr += CharAndSize.Size; 8225f757f3fSDimitry Andric PhysOffset += CharAndSize.Size; 8230b57cec5SDimitry Andric } 8240b57cec5SDimitry Andric 8250b57cec5SDimitry Andric // Final detail: if we end up on an escaped newline, we want to return the 8260b57cec5SDimitry Andric // location of the actual byte of the token. For example foo\<newline>bar 8270b57cec5SDimitry Andric // advanced by 3 should return the location of b, not of \\. One compounding 8280b57cec5SDimitry Andric // detail of this is that the escape may be made by a trigraph. 
8290b57cec5SDimitry Andric if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 8300b57cec5SDimitry Andric PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 8310b57cec5SDimitry Andric 8320b57cec5SDimitry Andric return PhysOffset; 8330b57cec5SDimitry Andric } 8340b57cec5SDimitry Andric 8350b57cec5SDimitry Andric /// Computes the source location just past the end of the 8360b57cec5SDimitry Andric /// token at this source location. 8370b57cec5SDimitry Andric /// 8380b57cec5SDimitry Andric /// This routine can be used to produce a source location that 8390b57cec5SDimitry Andric /// points just past the end of the token referenced by \p Loc, and 8400b57cec5SDimitry Andric /// is generally used when a diagnostic needs to point just after a 8410b57cec5SDimitry Andric /// token where it expected something different that it received. If 8420b57cec5SDimitry Andric /// the returned source location would not be meaningful (e.g., if 8430b57cec5SDimitry Andric /// it points into a macro), this routine returns an invalid 8440b57cec5SDimitry Andric /// source location. 8450b57cec5SDimitry Andric /// 8460b57cec5SDimitry Andric /// \param Offset an offset from the end of the token, where the source 8470b57cec5SDimitry Andric /// location should refer to. The default offset (0) produces a source 8480b57cec5SDimitry Andric /// location pointing just past the end of the token; an offset of 1 produces 8490b57cec5SDimitry Andric /// a source location pointing to the last character in the token, etc. 
8500b57cec5SDimitry Andric SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 8510b57cec5SDimitry Andric const SourceManager &SM, 8520b57cec5SDimitry Andric const LangOptions &LangOpts) { 8530b57cec5SDimitry Andric if (Loc.isInvalid()) 8540b57cec5SDimitry Andric return {}; 8550b57cec5SDimitry Andric 8560b57cec5SDimitry Andric if (Loc.isMacroID()) { 8570b57cec5SDimitry Andric if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 8580b57cec5SDimitry Andric return {}; // Points inside the macro expansion. 8590b57cec5SDimitry Andric } 8600b57cec5SDimitry Andric 8610b57cec5SDimitry Andric unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 8620b57cec5SDimitry Andric if (Len > Offset) 8630b57cec5SDimitry Andric Len = Len - Offset; 8640b57cec5SDimitry Andric else 8650b57cec5SDimitry Andric return Loc; 8660b57cec5SDimitry Andric 8670b57cec5SDimitry Andric return Loc.getLocWithOffset(Len); 8680b57cec5SDimitry Andric } 8690b57cec5SDimitry Andric 8700b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the first 8710b57cec5SDimitry Andric /// token of the macro expansion. 8720b57cec5SDimitry Andric bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 8730b57cec5SDimitry Andric const SourceManager &SM, 8740b57cec5SDimitry Andric const LangOptions &LangOpts, 8750b57cec5SDimitry Andric SourceLocation *MacroBegin) { 8760b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 8770b57cec5SDimitry Andric 8780b57cec5SDimitry Andric SourceLocation expansionLoc; 8790b57cec5SDimitry Andric if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) 8800b57cec5SDimitry Andric return false; 8810b57cec5SDimitry Andric 8820b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 8830b57cec5SDimitry Andric // No other macro expansions, this is the first. 
8840b57cec5SDimitry Andric if (MacroBegin) 8850b57cec5SDimitry Andric *MacroBegin = expansionLoc; 8860b57cec5SDimitry Andric return true; 8870b57cec5SDimitry Andric } 8880b57cec5SDimitry Andric 8890b57cec5SDimitry Andric return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 8900b57cec5SDimitry Andric } 8910b57cec5SDimitry Andric 8920b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the last 8930b57cec5SDimitry Andric /// token of the macro expansion. 8940b57cec5SDimitry Andric bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 8950b57cec5SDimitry Andric const SourceManager &SM, 8960b57cec5SDimitry Andric const LangOptions &LangOpts, 8970b57cec5SDimitry Andric SourceLocation *MacroEnd) { 8980b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 8990b57cec5SDimitry Andric 9000b57cec5SDimitry Andric SourceLocation spellLoc = SM.getSpellingLoc(loc); 9010b57cec5SDimitry Andric unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 9020b57cec5SDimitry Andric if (tokLen == 0) 9030b57cec5SDimitry Andric return false; 9040b57cec5SDimitry Andric 9050b57cec5SDimitry Andric SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 9060b57cec5SDimitry Andric SourceLocation expansionLoc; 9070b57cec5SDimitry Andric if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 9080b57cec5SDimitry Andric return false; 9090b57cec5SDimitry Andric 9100b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 9110b57cec5SDimitry Andric // No other macro expansions. 
9120b57cec5SDimitry Andric if (MacroEnd) 9130b57cec5SDimitry Andric *MacroEnd = expansionLoc; 9140b57cec5SDimitry Andric return true; 9150b57cec5SDimitry Andric } 9160b57cec5SDimitry Andric 9170b57cec5SDimitry Andric return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 9180b57cec5SDimitry Andric } 9190b57cec5SDimitry Andric 9200b57cec5SDimitry Andric static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 9210b57cec5SDimitry Andric const SourceManager &SM, 9220b57cec5SDimitry Andric const LangOptions &LangOpts) { 9230b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 9240b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 9250b57cec5SDimitry Andric assert(Begin.isFileID() && End.isFileID()); 9260b57cec5SDimitry Andric if (Range.isTokenRange()) { 9270b57cec5SDimitry Andric End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 9280b57cec5SDimitry Andric if (End.isInvalid()) 9290b57cec5SDimitry Andric return {}; 9300b57cec5SDimitry Andric } 9310b57cec5SDimitry Andric 9320b57cec5SDimitry Andric // Break down the source locations. 9330b57cec5SDimitry Andric FileID FID; 9340b57cec5SDimitry Andric unsigned BeginOffs; 9350b57cec5SDimitry Andric std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 9360b57cec5SDimitry Andric if (FID.isInvalid()) 9370b57cec5SDimitry Andric return {}; 9380b57cec5SDimitry Andric 9390b57cec5SDimitry Andric unsigned EndOffs; 9400b57cec5SDimitry Andric if (!SM.isInFileID(End, FID, &EndOffs) || 9410b57cec5SDimitry Andric BeginOffs > EndOffs) 9420b57cec5SDimitry Andric return {}; 9430b57cec5SDimitry Andric 9440b57cec5SDimitry Andric return CharSourceRange::getCharRange(Begin, End); 9450b57cec5SDimitry Andric } 9460b57cec5SDimitry Andric 947fe6060f1SDimitry Andric // Assumes that `Loc` is in an expansion. 
948fe6060f1SDimitry Andric static bool isInExpansionTokenRange(const SourceLocation Loc, 949fe6060f1SDimitry Andric const SourceManager &SM) { 950fe6060f1SDimitry Andric return SM.getSLocEntry(SM.getFileID(Loc)) 951fe6060f1SDimitry Andric .getExpansion() 952fe6060f1SDimitry Andric .isExpansionTokenRange(); 953fe6060f1SDimitry Andric } 954fe6060f1SDimitry Andric 9550b57cec5SDimitry Andric CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 9560b57cec5SDimitry Andric const SourceManager &SM, 9570b57cec5SDimitry Andric const LangOptions &LangOpts) { 9580b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 9590b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 9600b57cec5SDimitry Andric if (Begin.isInvalid() || End.isInvalid()) 9610b57cec5SDimitry Andric return {}; 9620b57cec5SDimitry Andric 9630b57cec5SDimitry Andric if (Begin.isFileID() && End.isFileID()) 9640b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9650b57cec5SDimitry Andric 9660b57cec5SDimitry Andric if (Begin.isMacroID() && End.isFileID()) { 9670b57cec5SDimitry Andric if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 9680b57cec5SDimitry Andric return {}; 9690b57cec5SDimitry Andric Range.setBegin(Begin); 9700b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9710b57cec5SDimitry Andric } 9720b57cec5SDimitry Andric 9730b57cec5SDimitry Andric if (Begin.isFileID() && End.isMacroID()) { 974fe6060f1SDimitry Andric if (Range.isTokenRange()) { 975fe6060f1SDimitry Andric if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End)) 976fe6060f1SDimitry Andric return {}; 977fe6060f1SDimitry Andric // Use the *original* end, not the expanded one in `End`. 
978fe6060f1SDimitry Andric Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM)); 979fe6060f1SDimitry Andric } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End)) 9800b57cec5SDimitry Andric return {}; 9810b57cec5SDimitry Andric Range.setEnd(End); 9820b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9830b57cec5SDimitry Andric } 9840b57cec5SDimitry Andric 9850b57cec5SDimitry Andric assert(Begin.isMacroID() && End.isMacroID()); 9860b57cec5SDimitry Andric SourceLocation MacroBegin, MacroEnd; 9870b57cec5SDimitry Andric if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 9880b57cec5SDimitry Andric ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 9890b57cec5SDimitry Andric &MacroEnd)) || 9900b57cec5SDimitry Andric (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 9910b57cec5SDimitry Andric &MacroEnd)))) { 9920b57cec5SDimitry Andric Range.setBegin(MacroBegin); 9930b57cec5SDimitry Andric Range.setEnd(MacroEnd); 994fe6060f1SDimitry Andric // Use the *original* `End`, not the expanded one in `MacroEnd`. 
995fe6060f1SDimitry Andric if (Range.isTokenRange()) 996fe6060f1SDimitry Andric Range.setTokenRange(isInExpansionTokenRange(End, SM)); 9970b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9980b57cec5SDimitry Andric } 9990b57cec5SDimitry Andric 10000b57cec5SDimitry Andric bool Invalid = false; 10010b57cec5SDimitry Andric const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), 10020b57cec5SDimitry Andric &Invalid); 10030b57cec5SDimitry Andric if (Invalid) 10040b57cec5SDimitry Andric return {}; 10050b57cec5SDimitry Andric 10060b57cec5SDimitry Andric if (BeginEntry.getExpansion().isMacroArgExpansion()) { 10070b57cec5SDimitry Andric const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), 10080b57cec5SDimitry Andric &Invalid); 10090b57cec5SDimitry Andric if (Invalid) 10100b57cec5SDimitry Andric return {}; 10110b57cec5SDimitry Andric 10120b57cec5SDimitry Andric if (EndEntry.getExpansion().isMacroArgExpansion() && 10130b57cec5SDimitry Andric BeginEntry.getExpansion().getExpansionLocStart() == 10140b57cec5SDimitry Andric EndEntry.getExpansion().getExpansionLocStart()) { 10150b57cec5SDimitry Andric Range.setBegin(SM.getImmediateSpellingLoc(Begin)); 10160b57cec5SDimitry Andric Range.setEnd(SM.getImmediateSpellingLoc(End)); 10170b57cec5SDimitry Andric return makeFileCharRange(Range, SM, LangOpts); 10180b57cec5SDimitry Andric } 10190b57cec5SDimitry Andric } 10200b57cec5SDimitry Andric 10210b57cec5SDimitry Andric return {}; 10220b57cec5SDimitry Andric } 10230b57cec5SDimitry Andric 10240b57cec5SDimitry Andric StringRef Lexer::getSourceText(CharSourceRange Range, 10250b57cec5SDimitry Andric const SourceManager &SM, 10260b57cec5SDimitry Andric const LangOptions &LangOpts, 10270b57cec5SDimitry Andric bool *Invalid) { 10280b57cec5SDimitry Andric Range = makeFileCharRange(Range, SM, LangOpts); 10290b57cec5SDimitry Andric if (Range.isInvalid()) { 10300b57cec5SDimitry Andric if (Invalid) *Invalid = true; 10310b57cec5SDimitry 
Andric return {}; 10320b57cec5SDimitry Andric } 10330b57cec5SDimitry Andric 10340b57cec5SDimitry Andric // Break down the source location. 10350b57cec5SDimitry Andric std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 10360b57cec5SDimitry Andric if (beginInfo.first.isInvalid()) { 10370b57cec5SDimitry Andric if (Invalid) *Invalid = true; 10380b57cec5SDimitry Andric return {}; 10390b57cec5SDimitry Andric } 10400b57cec5SDimitry Andric 10410b57cec5SDimitry Andric unsigned EndOffs; 10420b57cec5SDimitry Andric if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 10430b57cec5SDimitry Andric beginInfo.second > EndOffs) { 10440b57cec5SDimitry Andric if (Invalid) *Invalid = true; 10450b57cec5SDimitry Andric return {}; 10460b57cec5SDimitry Andric } 10470b57cec5SDimitry Andric 10480b57cec5SDimitry Andric // Try to the load the file buffer. 10490b57cec5SDimitry Andric bool invalidTemp = false; 10500b57cec5SDimitry Andric StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 10510b57cec5SDimitry Andric if (invalidTemp) { 10520b57cec5SDimitry Andric if (Invalid) *Invalid = true; 10530b57cec5SDimitry Andric return {}; 10540b57cec5SDimitry Andric } 10550b57cec5SDimitry Andric 10560b57cec5SDimitry Andric if (Invalid) *Invalid = false; 10570b57cec5SDimitry Andric return file.substr(beginInfo.second, EndOffs - beginInfo.second); 10580b57cec5SDimitry Andric } 10590b57cec5SDimitry Andric 10600b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 10610b57cec5SDimitry Andric const SourceManager &SM, 10620b57cec5SDimitry Andric const LangOptions &LangOpts) { 10630b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 10640b57cec5SDimitry Andric 10650b57cec5SDimitry Andric // Find the location of the immediate macro expansion. 
10660b57cec5SDimitry Andric while (true) { 10670b57cec5SDimitry Andric FileID FID = SM.getFileID(Loc); 10680b57cec5SDimitry Andric const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 10690b57cec5SDimitry Andric const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 10700b57cec5SDimitry Andric Loc = Expansion.getExpansionLocStart(); 10710b57cec5SDimitry Andric if (!Expansion.isMacroArgExpansion()) 10720b57cec5SDimitry Andric break; 10730b57cec5SDimitry Andric 10740b57cec5SDimitry Andric // For macro arguments we need to check that the argument did not come 10750b57cec5SDimitry Andric // from an inner macro, e.g: "MAC1( MAC2(foo) )" 10760b57cec5SDimitry Andric 10770b57cec5SDimitry Andric // Loc points to the argument id of the macro definition, move to the 10780b57cec5SDimitry Andric // macro expansion. 10790b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 10800b57cec5SDimitry Andric SourceLocation SpellLoc = Expansion.getSpellingLoc(); 10810b57cec5SDimitry Andric if (SpellLoc.isFileID()) 10820b57cec5SDimitry Andric break; // No inner macro. 10830b57cec5SDimitry Andric 10840b57cec5SDimitry Andric // If spelling location resides in the same FileID as macro expansion 10850b57cec5SDimitry Andric // location, it means there is no inner macro. 10860b57cec5SDimitry Andric FileID MacroFID = SM.getFileID(Loc); 10870b57cec5SDimitry Andric if (SM.isInFileID(SpellLoc, MacroFID)) 10880b57cec5SDimitry Andric break; 10890b57cec5SDimitry Andric 10900b57cec5SDimitry Andric // Argument came from inner macro. 10910b57cec5SDimitry Andric Loc = SpellLoc; 10920b57cec5SDimitry Andric } 10930b57cec5SDimitry Andric 10940b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 10950b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 10960b57cec5SDimitry Andric // expanding this macro. 
10970b57cec5SDimitry Andric Loc = SM.getSpellingLoc(Loc); 10980b57cec5SDimitry Andric 10990b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 11000b57cec5SDimitry Andric // name so that we can render it into the expansion note. 11010b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 11020b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 11030b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 11040b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 11050b57cec5SDimitry Andric } 11060b57cec5SDimitry Andric 11070b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroNameForDiagnostics( 11080b57cec5SDimitry Andric SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { 11090b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 11100b57cec5SDimitry Andric // Walk past macro argument expansions. 11110b57cec5SDimitry Andric while (SM.isMacroArgExpansion(Loc)) 11120b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 11130b57cec5SDimitry Andric 1114bdd1243dSDimitry Andric // If the macro's spelling isn't FileID or from scratch space, then it's 1115bdd1243dSDimitry Andric // actually a token paste or stringization (or similar) and not a macro at 1116bdd1243dSDimitry Andric // all. 1117bdd1243dSDimitry Andric SourceLocation SpellLoc = SM.getSpellingLoc(Loc); 1118bdd1243dSDimitry Andric if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc)) 11190b57cec5SDimitry Andric return {}; 11200b57cec5SDimitry Andric 11210b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 11220b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 11230b57cec5SDimitry Andric // expanding this macro. 
11240b57cec5SDimitry Andric Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); 11250b57cec5SDimitry Andric 11260b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 11270b57cec5SDimitry Andric // name so that we can render it into the expansion note. 11280b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 11290b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 11300b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 11310b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 11320b57cec5SDimitry Andric } 11330b57cec5SDimitry Andric 1134349cc55cSDimitry Andric bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) { 1135349cc55cSDimitry Andric return isAsciiIdentifierContinue(c, LangOpts.DollarIdents); 11360b57cec5SDimitry Andric } 11370b57cec5SDimitry Andric 11380b57cec5SDimitry Andric bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { 11390b57cec5SDimitry Andric assert(isVerticalWhitespace(Str[0])); 11400b57cec5SDimitry Andric if (Str - 1 < BufferStart) 11410b57cec5SDimitry Andric return false; 11420b57cec5SDimitry Andric 11430b57cec5SDimitry Andric if ((Str[0] == '\n' && Str[-1] == '\r') || 11440b57cec5SDimitry Andric (Str[0] == '\r' && Str[-1] == '\n')) { 11450b57cec5SDimitry Andric if (Str - 2 < BufferStart) 11460b57cec5SDimitry Andric return false; 11470b57cec5SDimitry Andric --Str; 11480b57cec5SDimitry Andric } 11490b57cec5SDimitry Andric --Str; 11500b57cec5SDimitry Andric 11510b57cec5SDimitry Andric // Rewind to first non-space character: 11520b57cec5SDimitry Andric while (Str > BufferStart && isHorizontalWhitespace(*Str)) 11530b57cec5SDimitry Andric --Str; 11540b57cec5SDimitry Andric 11550b57cec5SDimitry Andric return *Str == '\\'; 11560b57cec5SDimitry Andric } 11570b57cec5SDimitry 
Andric 11580b57cec5SDimitry Andric StringRef Lexer::getIndentationForLine(SourceLocation Loc, 11590b57cec5SDimitry Andric const SourceManager &SM) { 11600b57cec5SDimitry Andric if (Loc.isInvalid() || Loc.isMacroID()) 11610b57cec5SDimitry Andric return {}; 11620b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 11630b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 11640b57cec5SDimitry Andric return {}; 11650b57cec5SDimitry Andric bool Invalid = false; 11660b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 11670b57cec5SDimitry Andric if (Invalid) 11680b57cec5SDimitry Andric return {}; 11690b57cec5SDimitry Andric const char *Line = findBeginningOfLine(Buffer, LocInfo.second); 11700b57cec5SDimitry Andric if (!Line) 11710b57cec5SDimitry Andric return {}; 11720b57cec5SDimitry Andric StringRef Rest = Buffer.substr(Line - Buffer.data()); 11730b57cec5SDimitry Andric size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); 11740b57cec5SDimitry Andric return NumWhitespaceChars == StringRef::npos 11750b57cec5SDimitry Andric ? "" 11760b57cec5SDimitry Andric : Rest.take_front(NumWhitespaceChars); 11770b57cec5SDimitry Andric } 11780b57cec5SDimitry Andric 11790b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11800b57cec5SDimitry Andric // Diagnostics forwarding code. 11810b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11820b57cec5SDimitry Andric 11830b57cec5SDimitry Andric /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 11840b57cec5SDimitry Andric /// lexer buffer was all expanded at a single point, perform the mapping. 11850b57cec5SDimitry Andric /// This is currently only used for _Pragma implementation, so it is the slow 11860b57cec5SDimitry Andric /// path of the hot getSourceLocation method. Do not allow it to be inlined. 
11870b57cec5SDimitry Andric static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 11880b57cec5SDimitry Andric Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 11890b57cec5SDimitry Andric static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 11900b57cec5SDimitry Andric SourceLocation FileLoc, 11910b57cec5SDimitry Andric unsigned CharNo, unsigned TokLen) { 11920b57cec5SDimitry Andric assert(FileLoc.isMacroID() && "Must be a macro expansion"); 11930b57cec5SDimitry Andric 11940b57cec5SDimitry Andric // Otherwise, we're lexing "mapped tokens". This is used for things like 11950b57cec5SDimitry Andric // _Pragma handling. Combine the expansion location of FileLoc with the 11960b57cec5SDimitry Andric // spelling location. 11970b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 11980b57cec5SDimitry Andric 11990b57cec5SDimitry Andric // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 12000b57cec5SDimitry Andric // characters come from spelling(FileLoc)+Offset. 12010b57cec5SDimitry Andric SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 12020b57cec5SDimitry Andric SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 12030b57cec5SDimitry Andric 12040b57cec5SDimitry Andric // Figure out the expansion loc range, which is the range covered by the 12050b57cec5SDimitry Andric // original _Pragma(...) sequence. 12060b57cec5SDimitry Andric CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 12070b57cec5SDimitry Andric 12080b57cec5SDimitry Andric return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 12090b57cec5SDimitry Andric } 12100b57cec5SDimitry Andric 12110b57cec5SDimitry Andric /// getSourceLocation - Return a source location identifier for the specified 12120b57cec5SDimitry Andric /// offset in the current file. 
12130b57cec5SDimitry Andric SourceLocation Lexer::getSourceLocation(const char *Loc, 12140b57cec5SDimitry Andric unsigned TokLen) const { 12150b57cec5SDimitry Andric assert(Loc >= BufferStart && Loc <= BufferEnd && 12160b57cec5SDimitry Andric "Location out of range for this buffer!"); 12170b57cec5SDimitry Andric 12180b57cec5SDimitry Andric // In the normal case, we're just lexing from a simple file buffer, return 12190b57cec5SDimitry Andric // the file id from FileLoc with the offset specified. 12200b57cec5SDimitry Andric unsigned CharNo = Loc-BufferStart; 12210b57cec5SDimitry Andric if (FileLoc.isFileID()) 12220b57cec5SDimitry Andric return FileLoc.getLocWithOffset(CharNo); 12230b57cec5SDimitry Andric 12240b57cec5SDimitry Andric // Otherwise, this is the _Pragma lexer case, which pretends that all of the 12250b57cec5SDimitry Andric // tokens are lexed from where the _Pragma was defined. 12260b57cec5SDimitry Andric assert(PP && "This doesn't work on raw lexers"); 12270b57cec5SDimitry Andric return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 12280b57cec5SDimitry Andric } 12290b57cec5SDimitry Andric 12300b57cec5SDimitry Andric /// Diag - Forwarding function for diagnostics. This translate a source 12310b57cec5SDimitry Andric /// position in the current buffer into a SourceLocation object for rendering. 12320b57cec5SDimitry Andric DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 12330b57cec5SDimitry Andric return PP->Diag(getSourceLocation(Loc), DiagID); 12340b57cec5SDimitry Andric } 12350b57cec5SDimitry Andric 12360b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 12370b57cec5SDimitry Andric // Trigraph and Escaped Newline Handling Code. 12380b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 12390b57cec5SDimitry Andric 12400b57cec5SDimitry Andric /// GetTrigraphCharForLetter - Given a character that occurs after a ?? 
/// pair, return the decoded trigraph letter it corresponds to, or '\0' if
/// nothing.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: Letters[I] is the third character of a trigraph and
  // Decoded[I] is the character that trigraph stands for.
  static constexpr char Letters[] = "=)(!'>/<-";
  static constexpr char Decoded[] = "#][|^}\\{~";
  static_assert(sizeof(Letters) == sizeof(Decoded),
                "trigraph tables must have the same length");

  // The terminating NUL is excluded so that Letter == '\0' maps to "nothing".
  for (unsigned I = 0; I != sizeof(Letters) - 1; ++I)
    if (Letters[I] == Letter)
      return Decoded[I];
  return 0;
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
126181ad6265SDimitry Andric static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) { 12620b57cec5SDimitry Andric char Res = GetTrigraphCharForLetter(*CP); 1263bdd1243dSDimitry Andric if (!Res) 1264bdd1243dSDimitry Andric return Res; 12650b57cec5SDimitry Andric 126681ad6265SDimitry Andric if (!Trigraphs) { 1267bdd1243dSDimitry Andric if (L && !L->isLexingRawMode()) 12680b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_ignored); 12690b57cec5SDimitry Andric return 0; 12700b57cec5SDimitry Andric } 12710b57cec5SDimitry Andric 1272bdd1243dSDimitry Andric if (L && !L->isLexingRawMode()) 12730b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 12740b57cec5SDimitry Andric return Res; 12750b57cec5SDimitry Andric } 12760b57cec5SDimitry Andric 12770b57cec5SDimitry Andric /// getEscapedNewLineSize - Return the size of the specified escaped newline, 12780b57cec5SDimitry Andric /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 12790b57cec5SDimitry Andric /// trigraph equivalent on entry to this function. 12800b57cec5SDimitry Andric unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 12810b57cec5SDimitry Andric unsigned Size = 0; 12820b57cec5SDimitry Andric while (isWhitespace(Ptr[Size])) { 12830b57cec5SDimitry Andric ++Size; 12840b57cec5SDimitry Andric 12850b57cec5SDimitry Andric if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 12860b57cec5SDimitry Andric continue; 12870b57cec5SDimitry Andric 12880b57cec5SDimitry Andric // If this is a \r\n or \n\r, skip the other half. 12890b57cec5SDimitry Andric if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 12900b57cec5SDimitry Andric Ptr[Size-1] != Ptr[Size]) 12910b57cec5SDimitry Andric ++Size; 12920b57cec5SDimitry Andric 12930b57cec5SDimitry Andric return Size; 12940b57cec5SDimitry Andric } 12950b57cec5SDimitry Andric 12960b57cec5SDimitry Andric // Not an escaped newline, must be a \t or something else. 
12970b57cec5SDimitry Andric return 0; 12980b57cec5SDimitry Andric } 12990b57cec5SDimitry Andric 13000b57cec5SDimitry Andric /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 13010b57cec5SDimitry Andric /// them), skip over them and return the first non-escaped-newline found, 13020b57cec5SDimitry Andric /// otherwise return P. 13030b57cec5SDimitry Andric const char *Lexer::SkipEscapedNewLines(const char *P) { 13040b57cec5SDimitry Andric while (true) { 13050b57cec5SDimitry Andric const char *AfterEscape; 13060b57cec5SDimitry Andric if (*P == '\\') { 13070b57cec5SDimitry Andric AfterEscape = P+1; 13080b57cec5SDimitry Andric } else if (*P == '?') { 13090b57cec5SDimitry Andric // If not a trigraph for escape, bail out. 13100b57cec5SDimitry Andric if (P[1] != '?' || P[2] != '/') 13110b57cec5SDimitry Andric return P; 13120b57cec5SDimitry Andric // FIXME: Take LangOpts into account; the language might not 13130b57cec5SDimitry Andric // support trigraphs. 13140b57cec5SDimitry Andric AfterEscape = P+3; 13150b57cec5SDimitry Andric } else { 13160b57cec5SDimitry Andric return P; 13170b57cec5SDimitry Andric } 13180b57cec5SDimitry Andric 13190b57cec5SDimitry Andric unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 13200b57cec5SDimitry Andric if (NewLineSize == 0) return P; 13210b57cec5SDimitry Andric P = AfterEscape+NewLineSize; 13220b57cec5SDimitry Andric } 13230b57cec5SDimitry Andric } 13240b57cec5SDimitry Andric 1325bdd1243dSDimitry Andric std::optional<Token> Lexer::findNextToken(SourceLocation Loc, 13260b57cec5SDimitry Andric const SourceManager &SM, 13270b57cec5SDimitry Andric const LangOptions &LangOpts) { 13280b57cec5SDimitry Andric if (Loc.isMacroID()) { 13290b57cec5SDimitry Andric if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 1330bdd1243dSDimitry Andric return std::nullopt; 13310b57cec5SDimitry Andric } 13320b57cec5SDimitry Andric Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 13330b57cec5SDimitry Andric 
13340b57cec5SDimitry Andric // Break down the source location. 13350b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 13360b57cec5SDimitry Andric 13370b57cec5SDimitry Andric // Try to load the file buffer. 13380b57cec5SDimitry Andric bool InvalidTemp = false; 13390b57cec5SDimitry Andric StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 13400b57cec5SDimitry Andric if (InvalidTemp) 1341bdd1243dSDimitry Andric return std::nullopt; 13420b57cec5SDimitry Andric 13430b57cec5SDimitry Andric const char *TokenBegin = File.data() + LocInfo.second; 13440b57cec5SDimitry Andric 13450b57cec5SDimitry Andric // Lex from the start of the given location. 13460b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 13470b57cec5SDimitry Andric TokenBegin, File.end()); 13480b57cec5SDimitry Andric // Find the token. 13490b57cec5SDimitry Andric Token Tok; 13500b57cec5SDimitry Andric lexer.LexFromRawLexer(Tok); 13510b57cec5SDimitry Andric return Tok; 13520b57cec5SDimitry Andric } 13530b57cec5SDimitry Andric 13540b57cec5SDimitry Andric /// Checks that the given token is the first token that occurs after the 13550b57cec5SDimitry Andric /// given location (this excludes comments and whitespace). Returns the location 13560b57cec5SDimitry Andric /// immediately after the specified token. If the token is not found or the 13570b57cec5SDimitry Andric /// location is inside a macro, the returned source location will be invalid. 
13580b57cec5SDimitry Andric SourceLocation Lexer::findLocationAfterToken( 13590b57cec5SDimitry Andric SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 13600b57cec5SDimitry Andric const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 1361bdd1243dSDimitry Andric std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 13620b57cec5SDimitry Andric if (!Tok || Tok->isNot(TKind)) 13630b57cec5SDimitry Andric return {}; 13640b57cec5SDimitry Andric SourceLocation TokenLoc = Tok->getLocation(); 13650b57cec5SDimitry Andric 13660b57cec5SDimitry Andric // Calculate how much whitespace needs to be skipped if any. 13670b57cec5SDimitry Andric unsigned NumWhitespaceChars = 0; 13680b57cec5SDimitry Andric if (SkipTrailingWhitespaceAndNewLine) { 13690b57cec5SDimitry Andric const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 13700b57cec5SDimitry Andric unsigned char C = *TokenEnd; 13710b57cec5SDimitry Andric while (isHorizontalWhitespace(C)) { 13720b57cec5SDimitry Andric C = *(++TokenEnd); 13730b57cec5SDimitry Andric NumWhitespaceChars++; 13740b57cec5SDimitry Andric } 13750b57cec5SDimitry Andric 13760b57cec5SDimitry Andric // Skip \r, \n, \r\n, or \n\r 13770b57cec5SDimitry Andric if (C == '\n' || C == '\r') { 13780b57cec5SDimitry Andric char PrevC = C; 13790b57cec5SDimitry Andric C = *(++TokenEnd); 13800b57cec5SDimitry Andric NumWhitespaceChars++; 13810b57cec5SDimitry Andric if ((C == '\n' || C == '\r') && C != PrevC) 13820b57cec5SDimitry Andric NumWhitespaceChars++; 13830b57cec5SDimitry Andric } 13840b57cec5SDimitry Andric } 13850b57cec5SDimitry Andric 13860b57cec5SDimitry Andric return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 13870b57cec5SDimitry Andric } 13880b57cec5SDimitry Andric 13890b57cec5SDimitry Andric /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 13900b57cec5SDimitry Andric /// get its size, and return it. 
/// This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///
/// This handles the slow/uncommon case of the getCharAndSize method.  The
/// returned SizedChar pairs the decoded character with the number of bytes
/// that were consumed from \p Ptr to produce it.
///
/// \p Tok may be null.  When it is non-null it is flagged NeedsCleaning
/// whenever an escaped newline or a trigraph is folded away; when it is null
/// no diagnostics are requested from this function.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Reached either by falling through from the backslash check above or by
    // the goto after decoding the "??/" trigraph; in both cases Ptr is just
    // past the (possibly trigraph-spelled) backslash and Size counts it.

    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Recurse for the character after the escaped newline, then add the
      // bytes already skipped here so the reported size covers everything.
      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    // A null lexer is passed when Tok is null, which keeps DecodeTrigraphChar
    // from diagnosing.
    if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                    LangOpts.Trigraphs)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // "??/" spells a backslash, which may itself start an escaped newline.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  It mirrors getCharAndSizeSlow but takes the
/// language options explicitly, touches no Token and emits no diagnostics;
/// trigraphs are only decoded when LangOpts.Trigraphs is set.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
                                                 const LangOptions &LangOpts) {

  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Also the target of the goto taken after decoding the "??/" trigraph.

    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Recurse for the character after the escaped newline, then add the
      // bytes already skipped here so the reported size covers everything.
      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // "??/" spells a backslash, which may itself start an escaped newline.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// Routine that indiscriminately sets the offset into the source file.
15150b57cec5SDimitry Andric void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { 15160b57cec5SDimitry Andric BufferPtr = BufferStart + Offset; 15170b57cec5SDimitry Andric if (BufferPtr > BufferEnd) 15180b57cec5SDimitry Andric BufferPtr = BufferEnd; 15190b57cec5SDimitry Andric // FIXME: What exactly does the StartOfLine bit mean? There are two 15200b57cec5SDimitry Andric // possible meanings for the "start" of the line: the first token on the 15210b57cec5SDimitry Andric // unexpanded line, or the first token on the expanded line. 15220b57cec5SDimitry Andric IsAtStartOfLine = StartOfLine; 15230b57cec5SDimitry Andric IsAtPhysicalStartOfLine = StartOfLine; 15240b57cec5SDimitry Andric } 15250b57cec5SDimitry Andric 1526349cc55cSDimitry Andric static bool isUnicodeWhitespace(uint32_t Codepoint) { 1527349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 1528349cc55cSDimitry Andric UnicodeWhitespaceCharRanges); 1529349cc55cSDimitry Andric return UnicodeWhitespaceChars.contains(Codepoint); 1530349cc55cSDimitry Andric } 1531349cc55cSDimitry Andric 1532bdd1243dSDimitry Andric static llvm::SmallString<5> codepointAsHexString(uint32_t C) { 1533bdd1243dSDimitry Andric llvm::SmallString<5> CharBuf; 1534bdd1243dSDimitry Andric llvm::raw_svector_ostream CharOS(CharBuf); 1535bdd1243dSDimitry Andric llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); 1536bdd1243dSDimitry Andric return CharBuf; 1537bdd1243dSDimitry Andric } 1538bdd1243dSDimitry Andric 1539bdd1243dSDimitry Andric // To mitigate https://github.com/llvm/llvm-project/issues/54732, 1540bdd1243dSDimitry Andric // we allow "Mathematical Notation Characters" in identifiers. 1541bdd1243dSDimitry Andric // This is a proposed profile that extends the XID_Start/XID_continue 1542bdd1243dSDimitry Andric // with mathematical symbols, superscipts and subscripts digits 1543bdd1243dSDimitry Andric // found in some production software. 
1544bdd1243dSDimitry Andric // https://www.unicode.org/L2/L2022/22230-math-profile.pdf 1545bdd1243dSDimitry Andric static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, 1546bdd1243dSDimitry Andric bool IsStart, bool &IsExtension) { 1547bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathStartChars( 1548bdd1243dSDimitry Andric MathematicalNotationProfileIDStartRanges); 1549bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathContinueChars( 1550bdd1243dSDimitry Andric MathematicalNotationProfileIDContinueRanges); 1551bdd1243dSDimitry Andric if (MathStartChars.contains(C) || 1552bdd1243dSDimitry Andric (!IsStart && MathContinueChars.contains(C))) { 1553bdd1243dSDimitry Andric IsExtension = true; 1554bdd1243dSDimitry Andric return true; 1555bdd1243dSDimitry Andric } 1556bdd1243dSDimitry Andric return false; 1557bdd1243dSDimitry Andric } 1558bdd1243dSDimitry Andric 1559bdd1243dSDimitry Andric static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, 1560bdd1243dSDimitry Andric bool &IsExtension) { 15610b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 15620b57cec5SDimitry Andric return false; 1563480093f4SDimitry Andric } else if (LangOpts.DollarIdents && '$' == C) { 1564480093f4SDimitry Andric return true; 15655f757f3fSDimitry Andric } else if (LangOpts.CPlusPlus || LangOpts.C23) { 1566349cc55cSDimitry Andric // A non-leading codepoint must have the XID_Continue property. 1567349cc55cSDimitry Andric // XIDContinueRanges doesn't contains characters also in XIDStartRanges, 1568349cc55cSDimitry Andric // so we need to check both tables. 1569fcaf7f86SDimitry Andric // '_' doesn't have the XID_Continue property but is allowed in C and C++. 
1570349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1571349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges); 1572bdd1243dSDimitry Andric if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C)) 1573bdd1243dSDimitry Andric return true; 1574bdd1243dSDimitry Andric return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false, 1575bdd1243dSDimitry Andric IsExtension); 1576349cc55cSDimitry Andric } else if (LangOpts.C11) { 15770b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 15780b57cec5SDimitry Andric C11AllowedIDCharRanges); 15790b57cec5SDimitry Andric return C11AllowedIDChars.contains(C); 15800b57cec5SDimitry Andric } else { 15810b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 15820b57cec5SDimitry Andric C99AllowedIDCharRanges); 15830b57cec5SDimitry Andric return C99AllowedIDChars.contains(C); 15840b57cec5SDimitry Andric } 15850b57cec5SDimitry Andric } 15860b57cec5SDimitry Andric 1587bdd1243dSDimitry Andric static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, 1588bdd1243dSDimitry Andric bool &IsExtension) { 1589bdd1243dSDimitry Andric assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint"); 1590bdd1243dSDimitry Andric IsExtension = false; 15910b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 15920b57cec5SDimitry Andric return false; 1593349cc55cSDimitry Andric } 15945f757f3fSDimitry Andric if (LangOpts.CPlusPlus || LangOpts.C23) { 1595349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1596bdd1243dSDimitry Andric if (XIDStartChars.contains(C)) 1597bdd1243dSDimitry Andric return true; 1598bdd1243dSDimitry Andric return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true, 1599bdd1243dSDimitry Andric IsExtension); 1600349cc55cSDimitry Andric } 1601bdd1243dSDimitry Andric if (!isAllowedIDChar(C, 
LangOpts, IsExtension)) 1602349cc55cSDimitry Andric return false; 1603349cc55cSDimitry Andric if (LangOpts.C11) { 16040b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 16050b57cec5SDimitry Andric C11DisallowedInitialIDCharRanges); 16060b57cec5SDimitry Andric return !C11DisallowedInitialIDChars.contains(C); 1607349cc55cSDimitry Andric } 16080b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 16090b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 16100b57cec5SDimitry Andric return !C99DisallowedInitialIDChars.contains(C); 16110b57cec5SDimitry Andric } 16120b57cec5SDimitry Andric 1613bdd1243dSDimitry Andric static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, 1614bdd1243dSDimitry Andric CharSourceRange Range) { 1615bdd1243dSDimitry Andric 1616bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathStartChars( 1617bdd1243dSDimitry Andric MathematicalNotationProfileIDStartRanges); 1618bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathContinueChars( 1619bdd1243dSDimitry Andric MathematicalNotationProfileIDContinueRanges); 1620bdd1243dSDimitry Andric 1621bdd1243dSDimitry Andric (void)MathStartChars; 1622bdd1243dSDimitry Andric (void)MathContinueChars; 1623bdd1243dSDimitry Andric assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) && 1624bdd1243dSDimitry Andric "Unexpected mathematical notation codepoint"); 1625bdd1243dSDimitry Andric Diags.Report(Range.getBegin(), diag::ext_mathematical_notation) 1626bdd1243dSDimitry Andric << codepointAsHexString(C) << Range; 1627bdd1243dSDimitry Andric } 1628bdd1243dSDimitry Andric 16290b57cec5SDimitry Andric static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 16300b57cec5SDimitry Andric const char *End) { 16310b57cec5SDimitry Andric return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 16320b57cec5SDimitry Andric L.getSourceLocation(End)); 
16330b57cec5SDimitry Andric } 16340b57cec5SDimitry Andric 16350b57cec5SDimitry Andric static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 16360b57cec5SDimitry Andric CharSourceRange Range, bool IsFirst) { 16370b57cec5SDimitry Andric // Check C99 compatibility. 16380b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 16390b57cec5SDimitry Andric enum { 16400b57cec5SDimitry Andric CannotAppearInIdentifier = 0, 16410b57cec5SDimitry Andric CannotStartIdentifier 16420b57cec5SDimitry Andric }; 16430b57cec5SDimitry Andric 16440b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 16450b57cec5SDimitry Andric C99AllowedIDCharRanges); 16460b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 16470b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 16480b57cec5SDimitry Andric if (!C99AllowedIDChars.contains(C)) { 16490b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 16500b57cec5SDimitry Andric << Range 16510b57cec5SDimitry Andric << CannotAppearInIdentifier; 16520b57cec5SDimitry Andric } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 16530b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 16540b57cec5SDimitry Andric << Range 16550b57cec5SDimitry Andric << CannotStartIdentifier; 16560b57cec5SDimitry Andric } 16570b57cec5SDimitry Andric } 16580b57cec5SDimitry Andric } 16590b57cec5SDimitry Andric 16600b57cec5SDimitry Andric /// After encountering UTF-8 character C and interpreting it as an identifier 16610b57cec5SDimitry Andric /// character, check whether it's a homoglyph for a common non-identifier 16620b57cec5SDimitry Andric /// source character that is unlikely to be an intentional identifier 16630b57cec5SDimitry Andric /// character and warn if so. 
16640b57cec5SDimitry Andric static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 16650b57cec5SDimitry Andric CharSourceRange Range) { 16660b57cec5SDimitry Andric // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 16670b57cec5SDimitry Andric struct HomoglyphPair { 16680b57cec5SDimitry Andric uint32_t Character; 16690b57cec5SDimitry Andric char LooksLike; 16700b57cec5SDimitry Andric bool operator<(HomoglyphPair R) const { return Character < R.Character; } 16710b57cec5SDimitry Andric }; 16720b57cec5SDimitry Andric static constexpr HomoglyphPair SortedHomoglyphs[] = { 16730b57cec5SDimitry Andric {U'\u00ad', 0}, // SOFT HYPHEN 16740b57cec5SDimitry Andric {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK 16750b57cec5SDimitry Andric {U'\u037e', ';'}, // GREEK QUESTION MARK 16760b57cec5SDimitry Andric {U'\u200b', 0}, // ZERO WIDTH SPACE 16770b57cec5SDimitry Andric {U'\u200c', 0}, // ZERO WIDTH NON-JOINER 16780b57cec5SDimitry Andric {U'\u200d', 0}, // ZERO WIDTH JOINER 16790b57cec5SDimitry Andric {U'\u2060', 0}, // WORD JOINER 16800b57cec5SDimitry Andric {U'\u2061', 0}, // FUNCTION APPLICATION 16810b57cec5SDimitry Andric {U'\u2062', 0}, // INVISIBLE TIMES 16820b57cec5SDimitry Andric {U'\u2063', 0}, // INVISIBLE SEPARATOR 16830b57cec5SDimitry Andric {U'\u2064', 0}, // INVISIBLE PLUS 16840b57cec5SDimitry Andric {U'\u2212', '-'}, // MINUS SIGN 16850b57cec5SDimitry Andric {U'\u2215', '/'}, // DIVISION SLASH 16860b57cec5SDimitry Andric {U'\u2216', '\\'}, // SET MINUS 16870b57cec5SDimitry Andric {U'\u2217', '*'}, // ASTERISK OPERATOR 16880b57cec5SDimitry Andric {U'\u2223', '|'}, // DIVIDES 16890b57cec5SDimitry Andric {U'\u2227', '^'}, // LOGICAL AND 16900b57cec5SDimitry Andric {U'\u2236', ':'}, // RATIO 16910b57cec5SDimitry Andric {U'\u223c', '~'}, // TILDE OPERATOR 16920b57cec5SDimitry Andric {U'\ua789', ':'}, // MODIFIER LETTER COLON 16930b57cec5SDimitry Andric {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE 16940b57cec5SDimitry 
Andric {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK 16950b57cec5SDimitry Andric {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN 16960b57cec5SDimitry Andric {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN 16970b57cec5SDimitry Andric {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN 16980b57cec5SDimitry Andric {U'\uff06', '&'}, // FULLWIDTH AMPERSAND 16990b57cec5SDimitry Andric {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS 17000b57cec5SDimitry Andric {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS 17010b57cec5SDimitry Andric {U'\uff0a', '*'}, // FULLWIDTH ASTERISK 17020b57cec5SDimitry Andric {U'\uff0b', '+'}, // FULLWIDTH ASTERISK 17030b57cec5SDimitry Andric {U'\uff0c', ','}, // FULLWIDTH COMMA 17040b57cec5SDimitry Andric {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS 17050b57cec5SDimitry Andric {U'\uff0e', '.'}, // FULLWIDTH FULL STOP 17060b57cec5SDimitry Andric {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS 17070b57cec5SDimitry Andric {U'\uff1a', ':'}, // FULLWIDTH COLON 17080b57cec5SDimitry Andric {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON 17090b57cec5SDimitry Andric {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN 17100b57cec5SDimitry Andric {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN 17110b57cec5SDimitry Andric {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN 17120b57cec5SDimitry Andric {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK 17130b57cec5SDimitry Andric {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT 17140b57cec5SDimitry Andric {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET 17150b57cec5SDimitry Andric {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS 17160b57cec5SDimitry Andric {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET 17170b57cec5SDimitry Andric {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT 17180b57cec5SDimitry Andric {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET 17190b57cec5SDimitry Andric {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE 17200b57cec5SDimitry Andric {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET 17210b57cec5SDimitry Andric {U'\uff5e', '~'}, // 
FULLWIDTH TILDE 17220b57cec5SDimitry Andric {0, 0} 17230b57cec5SDimitry Andric }; 17240b57cec5SDimitry Andric auto Homoglyph = 17250b57cec5SDimitry Andric std::lower_bound(std::begin(SortedHomoglyphs), 17260b57cec5SDimitry Andric std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); 17270b57cec5SDimitry Andric if (Homoglyph->Character == C) { 17280b57cec5SDimitry Andric if (Homoglyph->LooksLike) { 17290b57cec5SDimitry Andric const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; 17300b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) 1731bdd1243dSDimitry Andric << Range << codepointAsHexString(C) << LooksLikeStr; 17320b57cec5SDimitry Andric } else { 17330b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) 1734bdd1243dSDimitry Andric << Range << codepointAsHexString(C); 17350b57cec5SDimitry Andric } 17360b57cec5SDimitry Andric } 17370b57cec5SDimitry Andric } 17380b57cec5SDimitry Andric 1739349cc55cSDimitry Andric static void diagnoseInvalidUnicodeCodepointInIdentifier( 1740349cc55cSDimitry Andric DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, 1741349cc55cSDimitry Andric CharSourceRange Range, bool IsFirst) { 1742349cc55cSDimitry Andric if (isASCII(CodePoint)) 1743349cc55cSDimitry Andric return; 1744349cc55cSDimitry Andric 1745bdd1243dSDimitry Andric bool IsExtension; 1746bdd1243dSDimitry Andric bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension); 1747bdd1243dSDimitry Andric bool IsIDContinue = 1748bdd1243dSDimitry Andric IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension); 1749349cc55cSDimitry Andric 1750349cc55cSDimitry Andric if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue)) 1751349cc55cSDimitry Andric return; 1752349cc55cSDimitry Andric 1753349cc55cSDimitry Andric bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue; 1754349cc55cSDimitry Andric 1755349cc55cSDimitry Andric if (!IsFirst || InvalidOnlyAtStart) 
{ 1756349cc55cSDimitry Andric Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier) 1757bdd1243dSDimitry Andric << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart) 1758349cc55cSDimitry Andric << FixItHint::CreateRemoval(Range); 1759349cc55cSDimitry Andric } else { 1760349cc55cSDimitry Andric Diags.Report(Range.getBegin(), diag::err_character_not_allowed) 1761bdd1243dSDimitry Andric << Range << codepointAsHexString(CodePoint) 1762bdd1243dSDimitry Andric << FixItHint::CreateRemoval(Range); 1763349cc55cSDimitry Andric } 1764349cc55cSDimitry Andric } 1765349cc55cSDimitry Andric 17660b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, 17670b57cec5SDimitry Andric Token &Result) { 17680b57cec5SDimitry Andric const char *UCNPtr = CurPtr + Size; 17690b57cec5SDimitry Andric uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr); 1770349cc55cSDimitry Andric if (CodePoint == 0) { 17710b57cec5SDimitry Andric return false; 1772349cc55cSDimitry Andric } 1773bdd1243dSDimitry Andric bool IsExtension = false; 1774bdd1243dSDimitry Andric if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) { 1775349cc55cSDimitry Andric if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1776349cc55cSDimitry Andric return false; 1777349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1778349cc55cSDimitry Andric !PP->isPreprocessedOutput()) 1779349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1780349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, CodePoint, 1781349cc55cSDimitry Andric makeCharRange(*this, CurPtr, UCNPtr), 1782349cc55cSDimitry Andric /*IsFirst=*/false); 1783349cc55cSDimitry Andric 1784349cc55cSDimitry Andric // We got a unicode codepoint that is neither a space nor a 1785349cc55cSDimitry Andric // a valid identifier part. 1786349cc55cSDimitry Andric // Carry on as if the codepoint was valid for recovery purposes. 
1787bdd1243dSDimitry Andric } else if (!isLexingRawMode()) { 1788bdd1243dSDimitry Andric if (IsExtension) 1789bdd1243dSDimitry Andric diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint, 1790bdd1243dSDimitry Andric makeCharRange(*this, CurPtr, UCNPtr)); 1791bdd1243dSDimitry Andric 17920b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 17930b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UCNPtr), 17940b57cec5SDimitry Andric /*IsFirst=*/false); 1795bdd1243dSDimitry Andric } 17960b57cec5SDimitry Andric 17970b57cec5SDimitry Andric Result.setFlag(Token::HasUCN); 17980b57cec5SDimitry Andric if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || 17990b57cec5SDimitry Andric (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) 18000b57cec5SDimitry Andric CurPtr = UCNPtr; 18010b57cec5SDimitry Andric else 18020b57cec5SDimitry Andric while (CurPtr != UCNPtr) 18030b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 18040b57cec5SDimitry Andric return true; 18050b57cec5SDimitry Andric } 18060b57cec5SDimitry Andric 18075f757f3fSDimitry Andric bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) { 18080b57cec5SDimitry Andric llvm::UTF32 CodePoint; 18095f757f3fSDimitry Andric 18105f757f3fSDimitry Andric // If a UTF-8 codepoint appears immediately after an escaped new line, 18115f757f3fSDimitry Andric // CurPtr may point to the splicing \ on the preceding line, 18125f757f3fSDimitry Andric // so we need to skip it. 
18135f757f3fSDimitry Andric unsigned FirstCodeUnitSize; 18145f757f3fSDimitry Andric getCharAndSize(CurPtr, FirstCodeUnitSize); 18155f757f3fSDimitry Andric const char *CharStart = CurPtr + FirstCodeUnitSize - 1; 18165f757f3fSDimitry Andric const char *UnicodePtr = CharStart; 18175f757f3fSDimitry Andric 18185f757f3fSDimitry Andric llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence( 18195f757f3fSDimitry Andric (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd, 18205f757f3fSDimitry Andric &CodePoint, llvm::strictConversion); 18215f757f3fSDimitry Andric if (ConvResult != llvm::conversionOK) 18220b57cec5SDimitry Andric return false; 18230b57cec5SDimitry Andric 1824bdd1243dSDimitry Andric bool IsExtension = false; 1825bdd1243dSDimitry Andric if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts, 1826bdd1243dSDimitry Andric IsExtension)) { 1827349cc55cSDimitry Andric if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1828349cc55cSDimitry Andric return false; 1829349cc55cSDimitry Andric 1830349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1831349cc55cSDimitry Andric !PP->isPreprocessedOutput()) 1832349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1833349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, CodePoint, 18345f757f3fSDimitry Andric makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false); 1835349cc55cSDimitry Andric // We got a unicode codepoint that is neither a space nor a 1836349cc55cSDimitry Andric // a valid identifier part. Carry on as if the codepoint was 1837349cc55cSDimitry Andric // valid for recovery purposes. 
1838349cc55cSDimitry Andric } else if (!isLexingRawMode()) { 1839bdd1243dSDimitry Andric if (IsExtension) 18405f757f3fSDimitry Andric diagnoseExtensionInIdentifier( 18415f757f3fSDimitry Andric PP->getDiagnostics(), CodePoint, 18425f757f3fSDimitry Andric makeCharRange(*this, CharStart, UnicodePtr)); 18430b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 18445f757f3fSDimitry Andric makeCharRange(*this, CharStart, UnicodePtr), 18450b57cec5SDimitry Andric /*IsFirst=*/false); 18460b57cec5SDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, 18475f757f3fSDimitry Andric makeCharRange(*this, CharStart, UnicodePtr)); 18480b57cec5SDimitry Andric } 18490b57cec5SDimitry Andric 18505f757f3fSDimitry Andric // Once we sucessfully parsed some UTF-8, 18515f757f3fSDimitry Andric // calling ConsumeChar ensures the NeedsCleaning flag is set on the token 18525f757f3fSDimitry Andric // being lexed, and that warnings about trailing spaces are emitted. 18535f757f3fSDimitry Andric ConsumeChar(CurPtr, FirstCodeUnitSize, Result); 18540b57cec5SDimitry Andric CurPtr = UnicodePtr; 18550b57cec5SDimitry Andric return true; 18560b57cec5SDimitry Andric } 18570b57cec5SDimitry Andric 1858349cc55cSDimitry Andric bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, 1859349cc55cSDimitry Andric const char *CurPtr) { 1860bdd1243dSDimitry Andric bool IsExtension = false; 1861bdd1243dSDimitry Andric if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) { 1862349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1863349cc55cSDimitry Andric !PP->isPreprocessedOutput()) { 1864bdd1243dSDimitry Andric if (IsExtension) 1865bdd1243dSDimitry Andric diagnoseExtensionInIdentifier(PP->getDiagnostics(), C, 1866bdd1243dSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr)); 1867349cc55cSDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 1868349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), 
1869349cc55cSDimitry Andric /*IsFirst=*/true); 1870349cc55cSDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, 1871349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr)); 1872349cc55cSDimitry Andric } 1873349cc55cSDimitry Andric 1874349cc55cSDimitry Andric MIOpt.ReadToken(); 1875349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 1876349cc55cSDimitry Andric } 1877349cc55cSDimitry Andric 1878349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1879349cc55cSDimitry Andric !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && 1880bdd1243dSDimitry Andric !isUnicodeWhitespace(C)) { 1881349cc55cSDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 1882349cc55cSDimitry Andric // Instead of letting the parser complain about the unknown token, 1883349cc55cSDimitry Andric // just drop the character. 1884349cc55cSDimitry Andric // Note that we can /only/ do this when the non-ASCII character is actually 1885349cc55cSDimitry Andric // spelled as Unicode, not written as a UCN. The standard requires that 1886349cc55cSDimitry Andric // we not throw away any possible preprocessor tokens, but there's a 1887349cc55cSDimitry Andric // loophole in the mapping of Unicode characters to basic character set 1888349cc55cSDimitry Andric // characters that allows us to map these particular characters to, say, 1889349cc55cSDimitry Andric // whitespace. 1890349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1891349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, C, 1892349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); 1893349cc55cSDimitry Andric BufferPtr = CurPtr; 1894349cc55cSDimitry Andric return false; 1895349cc55cSDimitry Andric } 1896349cc55cSDimitry Andric 1897349cc55cSDimitry Andric // Otherwise, we have an explicit UCN or a character that's unlikely to show 1898349cc55cSDimitry Andric // up by accident. 
1899349cc55cSDimitry Andric MIOpt.ReadToken(); 1900349cc55cSDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 1901349cc55cSDimitry Andric return true; 1902349cc55cSDimitry Andric } 1903349cc55cSDimitry Andric 19045f757f3fSDimitry Andric static const char * 19055f757f3fSDimitry Andric fastParseASCIIIdentifier(const char *CurPtr, 19065f757f3fSDimitry Andric [[maybe_unused]] const char *BufferEnd) { 19075f757f3fSDimitry Andric #ifdef __SSE4_2__ 19085f757f3fSDimitry Andric alignas(16) static constexpr char AsciiIdentifierRange[16] = { 19095f757f3fSDimitry Andric '_', '_', 'A', 'Z', 'a', 'z', '0', '9', 19105f757f3fSDimitry Andric }; 19115f757f3fSDimitry Andric constexpr ssize_t BytesPerRegister = 16; 19125f757f3fSDimitry Andric 19135f757f3fSDimitry Andric __m128i AsciiIdentifierRangeV = 19145f757f3fSDimitry Andric _mm_load_si128((const __m128i *)AsciiIdentifierRange); 19155f757f3fSDimitry Andric 19165f757f3fSDimitry Andric while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) { 19175f757f3fSDimitry Andric __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr)); 19185f757f3fSDimitry Andric 19195f757f3fSDimitry Andric int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv, 19205f757f3fSDimitry Andric _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | 19215f757f3fSDimitry Andric _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY); 19225f757f3fSDimitry Andric CurPtr += Consumed; 19235f757f3fSDimitry Andric if (Consumed == BytesPerRegister) 19245f757f3fSDimitry Andric continue; 19255f757f3fSDimitry Andric return CurPtr; 19265f757f3fSDimitry Andric } 19275f757f3fSDimitry Andric #endif 19285f757f3fSDimitry Andric 19295f757f3fSDimitry Andric unsigned char C = *CurPtr; 19305f757f3fSDimitry Andric while (isAsciiIdentifierContinue(C)) 19315f757f3fSDimitry Andric C = *++CurPtr; 19325f757f3fSDimitry Andric return CurPtr; 19335f757f3fSDimitry Andric } 19345f757f3fSDimitry Andric 1935349cc55cSDimitry Andric bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) { 
1936349cc55cSDimitry Andric // Match [_A-Za-z0-9]*, we have already matched an identifier start. 19375f757f3fSDimitry Andric 1938349cc55cSDimitry Andric while (true) { 19395f757f3fSDimitry Andric 19405f757f3fSDimitry Andric CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd); 1941349cc55cSDimitry Andric 19420b57cec5SDimitry Andric unsigned Size; 1943349cc55cSDimitry Andric // Slow path: handle trigraph, unicode codepoints, UCNs. 19445f757f3fSDimitry Andric unsigned char C = getCharAndSize(CurPtr, Size); 1945349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 1946349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1947349cc55cSDimitry Andric continue; 1948349cc55cSDimitry Andric } 1949349cc55cSDimitry Andric if (C == '$') { 1950349cc55cSDimitry Andric // If we hit a $ and they are not supported in identifiers, we are done. 1951349cc55cSDimitry Andric if (!LangOpts.DollarIdents) 1952349cc55cSDimitry Andric break; 1953349cc55cSDimitry Andric // Otherwise, emit a diagnostic and continue. 1954349cc55cSDimitry Andric if (!isLexingRawMode()) 1955349cc55cSDimitry Andric Diag(CurPtr, diag::ext_dollar_in_identifier); 1956349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1957349cc55cSDimitry Andric continue; 1958349cc55cSDimitry Andric } 1959349cc55cSDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 1960349cc55cSDimitry Andric continue; 19615f757f3fSDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 1962349cc55cSDimitry Andric continue; 1963349cc55cSDimitry Andric // Neither an expected Unicode codepoint nor a UCN. 
1964349cc55cSDimitry Andric break; 1965349cc55cSDimitry Andric } 19660b57cec5SDimitry Andric 19670b57cec5SDimitry Andric const char *IdStart = BufferPtr; 19680b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 19690b57cec5SDimitry Andric Result.setRawIdentifierData(IdStart); 19700b57cec5SDimitry Andric 19710b57cec5SDimitry Andric // If we are in raw mode, return this identifier raw. There is no need to 19720b57cec5SDimitry Andric // look up identifier information or attempt to macro expand it. 19730b57cec5SDimitry Andric if (LexingRawMode) 19740b57cec5SDimitry Andric return true; 19750b57cec5SDimitry Andric 19760b57cec5SDimitry Andric // Fill in Result.IdentifierInfo and update the token kind, 19770b57cec5SDimitry Andric // looking up the identifier in the identifier table. 19785f757f3fSDimitry Andric const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 19790b57cec5SDimitry Andric // Note that we have to call PP->LookUpIdentifierInfo() even for code 19800b57cec5SDimitry Andric // completion, it writes IdentifierInfo into Result, and callers rely on it. 19810b57cec5SDimitry Andric 19820b57cec5SDimitry Andric // If the completion point is at the end of an identifier, we want to treat 19830b57cec5SDimitry Andric // the identifier as incomplete even if it resolves to a macro or a keyword. 19840b57cec5SDimitry Andric // This allows e.g. 'class^' to complete to 'classifier'. 19850b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr)) { 19860b57cec5SDimitry Andric // Return the code-completion token. 19870b57cec5SDimitry Andric Result.setKind(tok::code_completion); 19880b57cec5SDimitry Andric // Skip the code-completion char and all immediate identifier characters. 19890b57cec5SDimitry Andric // This ensures we get consistent behavior when completing at any point in 19900b57cec5SDimitry Andric // an identifier (i.e. at the start, in the middle, at the end). Note that 19910b57cec5SDimitry Andric // only simple cases (i.e. 
[a-zA-Z0-9_]) are supported to keep the code 19920b57cec5SDimitry Andric // simpler. 19930b57cec5SDimitry Andric assert(*CurPtr == 0 && "Completion character must be 0"); 19940b57cec5SDimitry Andric ++CurPtr; 19950b57cec5SDimitry Andric // Note that code completion token is not added as a separate character 19960b57cec5SDimitry Andric // when the completion point is at the end of the buffer. Therefore, we need 19970b57cec5SDimitry Andric // to check if the buffer has ended. 19980b57cec5SDimitry Andric if (CurPtr < BufferEnd) { 1999349cc55cSDimitry Andric while (isAsciiIdentifierContinue(*CurPtr)) 20000b57cec5SDimitry Andric ++CurPtr; 20010b57cec5SDimitry Andric } 20020b57cec5SDimitry Andric BufferPtr = CurPtr; 20030b57cec5SDimitry Andric return true; 20040b57cec5SDimitry Andric } 20050b57cec5SDimitry Andric 20060b57cec5SDimitry Andric // Finally, now that we know we have an identifier, pass this off to the 20070b57cec5SDimitry Andric // preprocessor, which may macro expand it or something. 20080b57cec5SDimitry Andric if (II->isHandleIdentifierCase()) 20090b57cec5SDimitry Andric return PP->HandleIdentifier(Result); 20100b57cec5SDimitry Andric 20110b57cec5SDimitry Andric return true; 20120b57cec5SDimitry Andric } 20130b57cec5SDimitry Andric 20140b57cec5SDimitry Andric /// isHexaLiteral - Return true if Start points to a hex constant. 20150b57cec5SDimitry Andric /// in microsoft mode (where this is supposed to be several different tokens). 
20160b57cec5SDimitry Andric bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { 20175f757f3fSDimitry Andric auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts); 20185f757f3fSDimitry Andric char C1 = CharAndSize1.Char; 20190b57cec5SDimitry Andric if (C1 != '0') 20200b57cec5SDimitry Andric return false; 20215f757f3fSDimitry Andric 20225f757f3fSDimitry Andric auto CharAndSize2 = 20235f757f3fSDimitry Andric Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts); 20245f757f3fSDimitry Andric char C2 = CharAndSize2.Char; 20250b57cec5SDimitry Andric return (C2 == 'x' || C2 == 'X'); 20260b57cec5SDimitry Andric } 20270b57cec5SDimitry Andric 20280b57cec5SDimitry Andric /// LexNumericConstant - Lex the remainder of a integer or floating point 20290b57cec5SDimitry Andric /// constant. From[-1] is the first character lexed. Return the end of the 20300b57cec5SDimitry Andric /// constant. 20310b57cec5SDimitry Andric bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 20320b57cec5SDimitry Andric unsigned Size; 20330b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 20340b57cec5SDimitry Andric char PrevCh = 0; 20350b57cec5SDimitry Andric while (isPreprocessingNumberBody(C)) { 20360b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 20370b57cec5SDimitry Andric PrevCh = C; 20385f757f3fSDimitry Andric if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) { 20395f757f3fSDimitry Andric CurPtr -= Size; 20405f757f3fSDimitry Andric break; 20415f757f3fSDimitry Andric } 20420b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 20430b57cec5SDimitry Andric } 20440b57cec5SDimitry Andric 20450b57cec5SDimitry Andric // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 20460b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 20470b57cec5SDimitry Andric // If we are in Microsoft mode, don't continue if the constant is hex. 
20480b57cec5SDimitry Andric // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 20490b57cec5SDimitry Andric if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 20500b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 20510b57cec5SDimitry Andric } 20520b57cec5SDimitry Andric 20530b57cec5SDimitry Andric // If we have a hex FP constant, continue. 20540b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { 20550b57cec5SDimitry Andric // Outside C99 and C++17, we accept hexadecimal floating point numbers as a 20560b57cec5SDimitry Andric // not-quite-conforming extension. Only do so if this looks like it's 20570b57cec5SDimitry Andric // actually meant to be a hexfloat, and not if it has a ud-suffix. 20580b57cec5SDimitry Andric bool IsHexFloat = true; 20590b57cec5SDimitry Andric if (!LangOpts.C99) { 20600b57cec5SDimitry Andric if (!isHexaLiteral(BufferPtr, LangOpts)) 20610b57cec5SDimitry Andric IsHexFloat = false; 206281ad6265SDimitry Andric else if (!LangOpts.CPlusPlus17 && 20630b57cec5SDimitry Andric std::find(BufferPtr, CurPtr, '_') != CurPtr) 20640b57cec5SDimitry Andric IsHexFloat = false; 20650b57cec5SDimitry Andric } 20660b57cec5SDimitry Andric if (IsHexFloat) 20670b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 20680b57cec5SDimitry Andric } 20690b57cec5SDimitry Andric 20700b57cec5SDimitry Andric // If we have a digit separator, continue. 20715f757f3fSDimitry Andric if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) { 20725f757f3fSDimitry Andric auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts); 2073349cc55cSDimitry Andric if (isAsciiIdentifierContinue(Next)) { 20740b57cec5SDimitry Andric if (!isLexingRawMode()) 207581ad6265SDimitry Andric Diag(CurPtr, LangOpts.CPlusPlus 2076fe6060f1SDimitry Andric ? 
diag::warn_cxx11_compat_digit_separator 20775f757f3fSDimitry Andric : diag::warn_c23_compat_digit_separator); 20780b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 20790b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, NextSize, Result); 20800b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 20810b57cec5SDimitry Andric } 20820b57cec5SDimitry Andric } 20830b57cec5SDimitry Andric 20840b57cec5SDimitry Andric // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 20850b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 20860b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 20875f757f3fSDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 20880b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 20890b57cec5SDimitry Andric 20900b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 20910b57cec5SDimitry Andric const char *TokStart = BufferPtr; 20920b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 20930b57cec5SDimitry Andric Result.setLiteralData(TokStart); 20940b57cec5SDimitry Andric return true; 20950b57cec5SDimitry Andric } 20960b57cec5SDimitry Andric 20970b57cec5SDimitry Andric /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 20980b57cec5SDimitry Andric /// in C++11, or warn on a ud-suffix in C++98. 20990b57cec5SDimitry Andric const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 21000b57cec5SDimitry Andric bool IsStringLiteral) { 210181ad6265SDimitry Andric assert(LangOpts.CPlusPlus); 21020b57cec5SDimitry Andric 21030b57cec5SDimitry Andric // Maximally munch an identifier. 
21040b57cec5SDimitry Andric unsigned Size; 21050b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 21060b57cec5SDimitry Andric bool Consumed = false; 21070b57cec5SDimitry Andric 2108349cc55cSDimitry Andric if (!isAsciiIdentifierStart(C)) { 21090b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 21100b57cec5SDimitry Andric Consumed = true; 21115f757f3fSDimitry Andric else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 21120b57cec5SDimitry Andric Consumed = true; 21130b57cec5SDimitry Andric else 21140b57cec5SDimitry Andric return CurPtr; 21150b57cec5SDimitry Andric } 21160b57cec5SDimitry Andric 211781ad6265SDimitry Andric if (!LangOpts.CPlusPlus11) { 21180b57cec5SDimitry Andric if (!isLexingRawMode()) 21190b57cec5SDimitry Andric Diag(CurPtr, 21200b57cec5SDimitry Andric C == '_' ? diag::warn_cxx11_compat_user_defined_literal 21210b57cec5SDimitry Andric : diag::warn_cxx11_compat_reserved_user_defined_literal) 21220b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 21230b57cec5SDimitry Andric return CurPtr; 21240b57cec5SDimitry Andric } 21250b57cec5SDimitry Andric 21260b57cec5SDimitry Andric // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 21270b57cec5SDimitry Andric // that does not start with an underscore is ill-formed. As a conforming 21280b57cec5SDimitry Andric // extension, we treat all such suffixes as if they had whitespace before 21290b57cec5SDimitry Andric // them. We assume a suffix beginning with a UCN or UTF-8 character is more 21300b57cec5SDimitry Andric // likely to be a ud-suffix than a macro, however, and accept that. 
21310b57cec5SDimitry Andric if (!Consumed) { 21320b57cec5SDimitry Andric bool IsUDSuffix = false; 21330b57cec5SDimitry Andric if (C == '_') 21340b57cec5SDimitry Andric IsUDSuffix = true; 213581ad6265SDimitry Andric else if (IsStringLiteral && LangOpts.CPlusPlus14) { 21360b57cec5SDimitry Andric // In C++1y, we need to look ahead a few characters to see if this is a 21370b57cec5SDimitry Andric // valid suffix for a string literal or a numeric literal (this could be 21380b57cec5SDimitry Andric // the 'operator""if' defining a numeric literal operator). 21390b57cec5SDimitry Andric const unsigned MaxStandardSuffixLength = 3; 21400b57cec5SDimitry Andric char Buffer[MaxStandardSuffixLength] = { C }; 21410b57cec5SDimitry Andric unsigned Consumed = Size; 21420b57cec5SDimitry Andric unsigned Chars = 1; 21430b57cec5SDimitry Andric while (true) { 21445f757f3fSDimitry Andric auto [Next, NextSize] = 21455f757f3fSDimitry Andric getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts); 2146349cc55cSDimitry Andric if (!isAsciiIdentifierContinue(Next)) { 21475ffd83dbSDimitry Andric // End of suffix. Check whether this is on the allowed list. 21480b57cec5SDimitry Andric const StringRef CompleteSuffix(Buffer, Chars); 214981ad6265SDimitry Andric IsUDSuffix = 215081ad6265SDimitry Andric StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix); 21510b57cec5SDimitry Andric break; 21520b57cec5SDimitry Andric } 21530b57cec5SDimitry Andric 21540b57cec5SDimitry Andric if (Chars == MaxStandardSuffixLength) 21550b57cec5SDimitry Andric // Too long: can't be a standard suffix. 
21560b57cec5SDimitry Andric break; 21570b57cec5SDimitry Andric 21580b57cec5SDimitry Andric Buffer[Chars++] = Next; 21590b57cec5SDimitry Andric Consumed += NextSize; 21600b57cec5SDimitry Andric } 21610b57cec5SDimitry Andric } 21620b57cec5SDimitry Andric 21630b57cec5SDimitry Andric if (!IsUDSuffix) { 21640b57cec5SDimitry Andric if (!isLexingRawMode()) 216581ad6265SDimitry Andric Diag(CurPtr, LangOpts.MSVCCompat 21660b57cec5SDimitry Andric ? diag::ext_ms_reserved_user_defined_literal 21670b57cec5SDimitry Andric : diag::ext_reserved_user_defined_literal) 21680b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 21690b57cec5SDimitry Andric return CurPtr; 21700b57cec5SDimitry Andric } 21710b57cec5SDimitry Andric 21720b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 21730b57cec5SDimitry Andric } 21740b57cec5SDimitry Andric 21750b57cec5SDimitry Andric Result.setFlag(Token::HasUDSuffix); 21760b57cec5SDimitry Andric while (true) { 21770b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 2178349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 2179349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 2180349cc55cSDimitry Andric } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { 21815f757f3fSDimitry Andric } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) { 2182349cc55cSDimitry Andric } else 2183349cc55cSDimitry Andric break; 21840b57cec5SDimitry Andric } 21850b57cec5SDimitry Andric 21860b57cec5SDimitry Andric return CurPtr; 21870b57cec5SDimitry Andric } 21880b57cec5SDimitry Andric 21890b57cec5SDimitry Andric /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 21900b57cec5SDimitry Andric /// either " or L" or u8" or u" or U". 
21910b57cec5SDimitry Andric bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 21920b57cec5SDimitry Andric tok::TokenKind Kind) { 21930b57cec5SDimitry Andric const char *AfterQuote = CurPtr; 21940b57cec5SDimitry Andric // Does this string contain the \0 character? 21950b57cec5SDimitry Andric const char *NulCharacter = nullptr; 21960b57cec5SDimitry Andric 21970b57cec5SDimitry Andric if (!isLexingRawMode() && 21980b57cec5SDimitry Andric (Kind == tok::utf8_string_literal || 21990b57cec5SDimitry Andric Kind == tok::utf16_string_literal || 22000b57cec5SDimitry Andric Kind == tok::utf32_string_literal)) 220181ad6265SDimitry Andric Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal 22020b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 22030b57cec5SDimitry Andric 22040b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 22050b57cec5SDimitry Andric while (C != '"') { 22060b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 22070b57cec5SDimitry Andric // getAndAdvanceChar. 22080b57cec5SDimitry Andric if (C == '\\') 22090b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 22100b57cec5SDimitry Andric 22110b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 22120b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
22130b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 22140b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1; 22150b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 22160b57cec5SDimitry Andric return true; 22170b57cec5SDimitry Andric } 22180b57cec5SDimitry Andric 22190b57cec5SDimitry Andric if (C == 0) { 22200b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 22210b57cec5SDimitry Andric if (ParsingFilename) 22220b57cec5SDimitry Andric codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false); 22230b57cec5SDimitry Andric else 22240b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 22250b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 22260b57cec5SDimitry Andric cutOffLexing(); 22270b57cec5SDimitry Andric return true; 22280b57cec5SDimitry Andric } 22290b57cec5SDimitry Andric 22300b57cec5SDimitry Andric NulCharacter = CurPtr-1; 22310b57cec5SDimitry Andric } 22320b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 22330b57cec5SDimitry Andric } 22340b57cec5SDimitry Andric 22350b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 223681ad6265SDimitry Andric if (LangOpts.CPlusPlus) 22370b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 22380b57cec5SDimitry Andric 22390b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 22400b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 22410b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 22420b57cec5SDimitry Andric 22430b57cec5SDimitry Andric // Update the location of the token as well as the BufferPtr instance var. 
22440b57cec5SDimitry Andric const char *TokStart = BufferPtr; 22450b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 22460b57cec5SDimitry Andric Result.setLiteralData(TokStart); 22470b57cec5SDimitry Andric return true; 22480b57cec5SDimitry Andric } 22490b57cec5SDimitry Andric 22500b57cec5SDimitry Andric /// LexRawStringLiteral - Lex the remainder of a raw string literal, after 22510b57cec5SDimitry Andric /// having lexed R", LR", u8R", uR", or UR". 22520b57cec5SDimitry Andric bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 22530b57cec5SDimitry Andric tok::TokenKind Kind) { 22540b57cec5SDimitry Andric // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 22550b57cec5SDimitry Andric // Between the initial and final double quote characters of the raw string, 22560b57cec5SDimitry Andric // any transformations performed in phases 1 and 2 (trigraphs, 22570b57cec5SDimitry Andric // universal-character-names, and line splicing) are reverted. 22580b57cec5SDimitry Andric 22590b57cec5SDimitry Andric if (!isLexingRawMode()) 22600b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 22610b57cec5SDimitry Andric 22620b57cec5SDimitry Andric unsigned PrefixLen = 0; 22630b57cec5SDimitry Andric 2264*0fca6ea1SDimitry Andric while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) { 2265*0fca6ea1SDimitry Andric if (!isLexingRawMode() && 2266*0fca6ea1SDimitry Andric llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) { 2267*0fca6ea1SDimitry Andric const char *Pos = &CurPtr[PrefixLen]; 2268*0fca6ea1SDimitry Andric Diag(Pos, LangOpts.CPlusPlus26 2269*0fca6ea1SDimitry Andric ? 
diag::warn_cxx26_compat_raw_string_literal_character_set 2270*0fca6ea1SDimitry Andric : diag::ext_cxx26_raw_string_literal_character_set) 2271*0fca6ea1SDimitry Andric << StringRef(Pos, 1); 2272*0fca6ea1SDimitry Andric } 22730b57cec5SDimitry Andric ++PrefixLen; 2274*0fca6ea1SDimitry Andric } 22750b57cec5SDimitry Andric 22760b57cec5SDimitry Andric // If the last character was not a '(', then we didn't lex a valid delimiter. 22770b57cec5SDimitry Andric if (CurPtr[PrefixLen] != '(') { 22780b57cec5SDimitry Andric if (!isLexingRawMode()) { 22790b57cec5SDimitry Andric const char *PrefixEnd = &CurPtr[PrefixLen]; 22800b57cec5SDimitry Andric if (PrefixLen == 16) { 22810b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_raw_delim_too_long); 2282*0fca6ea1SDimitry Andric } else if (*PrefixEnd == '\n') { 2283*0fca6ea1SDimitry Andric Diag(PrefixEnd, diag::err_invalid_newline_raw_delim); 22840b57cec5SDimitry Andric } else { 22850b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 22860b57cec5SDimitry Andric << StringRef(PrefixEnd, 1); 22870b57cec5SDimitry Andric } 22880b57cec5SDimitry Andric } 22890b57cec5SDimitry Andric 22900b57cec5SDimitry Andric // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 22910b57cec5SDimitry Andric // it's possible the '"' was intended to be part of the raw string, but 22920b57cec5SDimitry Andric // there's not much we can do about that. 
22930b57cec5SDimitry Andric while (true) { 22940b57cec5SDimitry Andric char C = *CurPtr++; 22950b57cec5SDimitry Andric 22960b57cec5SDimitry Andric if (C == '"') 22970b57cec5SDimitry Andric break; 22980b57cec5SDimitry Andric if (C == 0 && CurPtr-1 == BufferEnd) { 22990b57cec5SDimitry Andric --CurPtr; 23000b57cec5SDimitry Andric break; 23010b57cec5SDimitry Andric } 23020b57cec5SDimitry Andric } 23030b57cec5SDimitry Andric 23040b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 23050b57cec5SDimitry Andric return true; 23060b57cec5SDimitry Andric } 23070b57cec5SDimitry Andric 23080b57cec5SDimitry Andric // Save prefix and move CurPtr past it 23090b57cec5SDimitry Andric const char *Prefix = CurPtr; 23100b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '(' 23110b57cec5SDimitry Andric 23120b57cec5SDimitry Andric while (true) { 23130b57cec5SDimitry Andric char C = *CurPtr++; 23140b57cec5SDimitry Andric 23150b57cec5SDimitry Andric if (C == ')') { 23160b57cec5SDimitry Andric // Check for prefix match and closing quote. 23170b57cec5SDimitry Andric if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 23180b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '"' 23190b57cec5SDimitry Andric break; 23200b57cec5SDimitry Andric } 23210b57cec5SDimitry Andric } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 23220b57cec5SDimitry Andric if (!isLexingRawMode()) 23230b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_raw_string) 23240b57cec5SDimitry Andric << StringRef(Prefix, PrefixLen); 23250b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 23260b57cec5SDimitry Andric return true; 23270b57cec5SDimitry Andric } 23280b57cec5SDimitry Andric } 23290b57cec5SDimitry Andric 23300b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 
233181ad6265SDimitry Andric if (LangOpts.CPlusPlus) 23320b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 23330b57cec5SDimitry Andric 23340b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 23350b57cec5SDimitry Andric const char *TokStart = BufferPtr; 23360b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 23370b57cec5SDimitry Andric Result.setLiteralData(TokStart); 23380b57cec5SDimitry Andric return true; 23390b57cec5SDimitry Andric } 23400b57cec5SDimitry Andric 23410b57cec5SDimitry Andric /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 23420b57cec5SDimitry Andric /// after having lexed the '<' character. This is used for #include filenames. 23430b57cec5SDimitry Andric bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 23440b57cec5SDimitry Andric // Does this string contain the \0 character? 23450b57cec5SDimitry Andric const char *NulCharacter = nullptr; 23460b57cec5SDimitry Andric const char *AfterLessPos = CurPtr; 23470b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 23480b57cec5SDimitry Andric while (C != '>') { 23490b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 23500b57cec5SDimitry Andric // getAndAdvanceChar. 23510b57cec5SDimitry Andric if (C == '\\') 23520b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 23530b57cec5SDimitry Andric 2354fe6060f1SDimitry Andric if (isVerticalWhitespace(C) || // Newline. 23550b57cec5SDimitry Andric (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file. 23560b57cec5SDimitry Andric // If the filename is unterminated, then it must just be a lone < 23570b57cec5SDimitry Andric // character. Return this as such. 
23580b57cec5SDimitry Andric FormTokenWithChars(Result, AfterLessPos, tok::less); 23590b57cec5SDimitry Andric return true; 23600b57cec5SDimitry Andric } 23610b57cec5SDimitry Andric 23620b57cec5SDimitry Andric if (C == 0) { 23630b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr - 1)) { 23640b57cec5SDimitry Andric codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true); 23650b57cec5SDimitry Andric cutOffLexing(); 23660b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 23670b57cec5SDimitry Andric return true; 23680b57cec5SDimitry Andric } 23690b57cec5SDimitry Andric NulCharacter = CurPtr-1; 23700b57cec5SDimitry Andric } 23710b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 23720b57cec5SDimitry Andric } 23730b57cec5SDimitry Andric 23740b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 23750b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 23760b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 23770b57cec5SDimitry Andric 23780b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 23790b57cec5SDimitry Andric const char *TokStart = BufferPtr; 23800b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::header_name); 23810b57cec5SDimitry Andric Result.setLiteralData(TokStart); 23820b57cec5SDimitry Andric return true; 23830b57cec5SDimitry Andric } 23840b57cec5SDimitry Andric 23850b57cec5SDimitry Andric void Lexer::codeCompleteIncludedFile(const char *PathStart, 23860b57cec5SDimitry Andric const char *CompletionPoint, 23870b57cec5SDimitry Andric bool IsAngled) { 23880b57cec5SDimitry Andric // Completion only applies to the filename, after the last slash. 23890b57cec5SDimitry Andric StringRef PartialPath(PathStart, CompletionPoint - PathStart); 23905ffd83dbSDimitry Andric llvm::StringRef SlashChars = LangOpts.MSVCCompat ? 
"/\\" : "/"; 23915ffd83dbSDimitry Andric auto Slash = PartialPath.find_last_of(SlashChars); 23920b57cec5SDimitry Andric StringRef Dir = 23930b57cec5SDimitry Andric (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash); 23940b57cec5SDimitry Andric const char *StartOfFilename = 23950b57cec5SDimitry Andric (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1; 23960b57cec5SDimitry Andric // Code completion filter range is the filename only, up to completion point. 23970b57cec5SDimitry Andric PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get( 23980b57cec5SDimitry Andric StringRef(StartOfFilename, CompletionPoint - StartOfFilename))); 23995ffd83dbSDimitry Andric // We should replace the characters up to the closing quote or closest slash, 24005ffd83dbSDimitry Andric // if any. 24010b57cec5SDimitry Andric while (CompletionPoint < BufferEnd) { 24020b57cec5SDimitry Andric char Next = *(CompletionPoint + 1); 24030b57cec5SDimitry Andric if (Next == 0 || Next == '\r' || Next == '\n') 24040b57cec5SDimitry Andric break; 24050b57cec5SDimitry Andric ++CompletionPoint; 24060b57cec5SDimitry Andric if (Next == (IsAngled ? '>' : '"')) 24070b57cec5SDimitry Andric break; 240806c3fb27SDimitry Andric if (SlashChars.contains(Next)) 24095ffd83dbSDimitry Andric break; 24100b57cec5SDimitry Andric } 24115ffd83dbSDimitry Andric 24120b57cec5SDimitry Andric PP->setCodeCompletionTokenRange( 24130b57cec5SDimitry Andric FileLoc.getLocWithOffset(StartOfFilename - BufferStart), 24140b57cec5SDimitry Andric FileLoc.getLocWithOffset(CompletionPoint - BufferStart)); 24150b57cec5SDimitry Andric PP->CodeCompleteIncludedFile(Dir, IsAngled); 24160b57cec5SDimitry Andric } 24170b57cec5SDimitry Andric 24180b57cec5SDimitry Andric /// LexCharConstant - Lex the remainder of a character constant, after having 24190b57cec5SDimitry Andric /// lexed either ' or L' or u8' or u' or U'. 
24200b57cec5SDimitry Andric bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, 24210b57cec5SDimitry Andric tok::TokenKind Kind) { 24220b57cec5SDimitry Andric // Does this character contain the \0 character? 24230b57cec5SDimitry Andric const char *NulCharacter = nullptr; 24240b57cec5SDimitry Andric 24250b57cec5SDimitry Andric if (!isLexingRawMode()) { 24260b57cec5SDimitry Andric if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant) 242781ad6265SDimitry Andric Diag(BufferPtr, LangOpts.CPlusPlus 24280b57cec5SDimitry Andric ? diag::warn_cxx98_compat_unicode_literal 24290b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 24300b57cec5SDimitry Andric else if (Kind == tok::utf8_char_constant) 24310b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); 24320b57cec5SDimitry Andric } 24330b57cec5SDimitry Andric 24340b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 24350b57cec5SDimitry Andric if (C == '\'') { 24360b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 24370b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_empty_character); 24380b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 24390b57cec5SDimitry Andric return true; 24400b57cec5SDimitry Andric } 24410b57cec5SDimitry Andric 24420b57cec5SDimitry Andric while (C != '\'') { 24430b57cec5SDimitry Andric // Skip escaped characters. 24440b57cec5SDimitry Andric if (C == '\\') 24450b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 24460b57cec5SDimitry Andric 24470b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 24480b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
24490b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 24500b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0; 24510b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 24520b57cec5SDimitry Andric return true; 24530b57cec5SDimitry Andric } 24540b57cec5SDimitry Andric 24550b57cec5SDimitry Andric if (C == 0) { 24560b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 24570b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 24580b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 24590b57cec5SDimitry Andric cutOffLexing(); 24600b57cec5SDimitry Andric return true; 24610b57cec5SDimitry Andric } 24620b57cec5SDimitry Andric 24630b57cec5SDimitry Andric NulCharacter = CurPtr-1; 24640b57cec5SDimitry Andric } 24650b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 24660b57cec5SDimitry Andric } 24670b57cec5SDimitry Andric 24680b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 246981ad6265SDimitry Andric if (LangOpts.CPlusPlus) 24700b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, false); 24710b57cec5SDimitry Andric 24720b57cec5SDimitry Andric // If a nul character existed in the character, warn about it. 24730b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 24740b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 0; 24750b57cec5SDimitry Andric 24760b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 24770b57cec5SDimitry Andric const char *TokStart = BufferPtr; 24780b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 24790b57cec5SDimitry Andric Result.setLiteralData(TokStart); 24800b57cec5SDimitry Andric return true; 24810b57cec5SDimitry Andric } 24820b57cec5SDimitry Andric 24830b57cec5SDimitry Andric /// SkipWhitespace - Efficiently skip over a series of whitespace characters. 
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
/// Otherwise it returns false, including when it stops at a newline while a
/// preprocessor directive is being parsed.
///
/// \param Result token that receives the LeadingSpace/StartOfLine flags, or
///        the whitespace token itself in keep-whitespace mode.
/// \param CurPtr position just past the first whitespace character; CurPtr[-1]
///        is inspected to see whether that character was a newline.
/// \param TokAtPhysicalStartOfLine set to true when a newline was skipped
///        (not set in keep-whitespace mode).
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Track the last newline seen by this call; NewLinePtr keeps the first
  // newline recorded since it was last reset elsewhere.
  const char *lastNewLine = nullptr;
  auto setLastNewLine = [&](const char *Ptr) {
    lastNewLine = Ptr;
    if (!NewLinePtr)
      NewLinePtr = Ptr;
  };
  if (SawNewline)
    setLastNewLine(CurPtr - 1);

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline. Only '\n' is recorded as a newline position.
    if (*CurPtr == '\n')
      setLastNewLine(CurPtr);
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;

    // Two distinct newline positions mean at least one line in between held
    // nothing but whitespace; report that range to the empty-line handler.
    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
      if (auto *Handler = PP->getEmptylineHandler())
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
                                             getSourceLocation(lastNewLine)));
    }
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
///
/// \param Result token that receives flags, or the comment token when one is
///        returned.
/// \param CurPtr position inside the comment at which scanning starts.
/// \param TokAtPhysicalStartOfLine set to true when the terminating newline
///        is consumed.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment. The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them. As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    // Non-ASCII byte: step over a whole UTF-8 sequence, or over one byte
    // (diagnosing it) when no valid sequence starts here.
    if (!isASCII(C)) {
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    // Remember where the newline (or nul) is; CurPtr may be moved backwards
    // below to re-read an escaped newline.
    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
    // properly decode the character. Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs. If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment. Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    // The decoded character ended the comment (newline or end of buffer):
    // back up so CurPtr points at the terminator rather than past it.
    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline. Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character. We don't care if this is a \n\r or
  // \r\n sequence. This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness. Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  NewLinePtr = CurPtr++;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
///
/// \param Result receives the comment token.
/// \param CurPtr end of the comment text; the token spans BufferPtr..CurPtr.
/// \returns true on every path.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  // Replace the token's text with the rewritten spelling.
  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
/// a diagnostic if so. We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it.
We allow whitespace 27800b57cec5SDimitry Andric // between the slash and newline. 27810b57cec5SDimitry Andric while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 2782fe6060f1SDimitry Andric SpacePos = CurPtr; 27830b57cec5SDimitry Andric --CurPtr; 27840b57cec5SDimitry Andric } 27850b57cec5SDimitry Andric 2786fe6060f1SDimitry Andric // If we have a slash, this is an escaped newline. 27870b57cec5SDimitry Andric if (*CurPtr == '\\') { 2788fe6060f1SDimitry Andric --CurPtr; 2789fe6060f1SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') { 2790fe6060f1SDimitry Andric // This is a trigraph encoding of a slash. 2791fe6060f1SDimitry Andric TrigraphPos = CurPtr - 2; 2792fe6060f1SDimitry Andric CurPtr -= 3; 27930b57cec5SDimitry Andric } else { 27940b57cec5SDimitry Andric return false; 2795fe6060f1SDimitry Andric } 27960b57cec5SDimitry Andric 2797fe6060f1SDimitry Andric // If the character preceding the escaped newline is a '*', then after line 2798fe6060f1SDimitry Andric // splicing we have a '*/' ending the comment. 2799fe6060f1SDimitry Andric if (*CurPtr == '*') 2800fe6060f1SDimitry Andric break; 28010b57cec5SDimitry Andric 2802fe6060f1SDimitry Andric if (*CurPtr != '\n' && *CurPtr != '\r') 2803fe6060f1SDimitry Andric return false; 2804fe6060f1SDimitry Andric } 2805fe6060f1SDimitry Andric 2806fe6060f1SDimitry Andric if (TrigraphPos) { 28070b57cec5SDimitry Andric // If no trigraphs are enabled, warn that we ignored this trigraph and 28080b57cec5SDimitry Andric // ignore this * character. 
280981ad6265SDimitry Andric if (!Trigraphs) { 28100b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2811fe6060f1SDimitry Andric L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment); 28120b57cec5SDimitry Andric return false; 28130b57cec5SDimitry Andric } 28140b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2815fe6060f1SDimitry Andric L->Diag(TrigraphPos, diag::trigraph_ends_block_comment); 28160b57cec5SDimitry Andric } 28170b57cec5SDimitry Andric 28180b57cec5SDimitry Andric // Warn about having an escaped newline between the */ characters. 28190b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2820fe6060f1SDimitry Andric L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end); 28210b57cec5SDimitry Andric 28220b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 2823fe6060f1SDimitry Andric if (SpacePos && !L->isLexingRawMode()) 2824fe6060f1SDimitry Andric L->Diag(SpacePos, diag::backslash_newline_space); 28250b57cec5SDimitry Andric 28260b57cec5SDimitry Andric return true; 28270b57cec5SDimitry Andric } 28280b57cec5SDimitry Andric 28290b57cec5SDimitry Andric #ifdef __SSE2__ 28300b57cec5SDimitry Andric #include <emmintrin.h> 28310b57cec5SDimitry Andric #elif __ALTIVEC__ 28320b57cec5SDimitry Andric #include <altivec.h> 28330b57cec5SDimitry Andric #undef bool 28340b57cec5SDimitry Andric #endif 28350b57cec5SDimitry Andric 28360b57cec5SDimitry Andric /// We have just read from input the / and * characters that started a comment. 28370b57cec5SDimitry Andric /// Read until we find the * and / characters that terminate the comment. 28380b57cec5SDimitry Andric /// Note that we don't bother decoding trigraphs or escaped newlines in block 28390b57cec5SDimitry Andric /// comments, because they cannot cause the comment to end. The only thing 28400b57cec5SDimitry Andric /// that can happen is the comment could end with an escaped newline between 28410b57cec5SDimitry Andric /// the terminating * and /. 
28420b57cec5SDimitry Andric /// 28430b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 28440b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 28450b57cec5SDimitry Andric bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, 28460b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 28470b57cec5SDimitry Andric // Scan one character past where we should, looking for a '/' character. Once 28480b57cec5SDimitry Andric // we find it, check to see if it was preceded by a *. This common 28490b57cec5SDimitry Andric // optimization helps people who like to put a lot of * characters in their 28500b57cec5SDimitry Andric // comments. 28510b57cec5SDimitry Andric 28520b57cec5SDimitry Andric // The first character we get with newlines and trigraphs skipped to handle 28530b57cec5SDimitry Andric // the degenerate /*/ case below correctly if the * has an escaped newline 28540b57cec5SDimitry Andric // after it. 28550b57cec5SDimitry Andric unsigned CharSize; 28560b57cec5SDimitry Andric unsigned char C = getCharAndSize(CurPtr, CharSize); 28570b57cec5SDimitry Andric CurPtr += CharSize; 28580b57cec5SDimitry Andric if (C == 0 && CurPtr == BufferEnd+1) { 28590b57cec5SDimitry Andric if (!isLexingRawMode()) 28600b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 28610b57cec5SDimitry Andric --CurPtr; 28620b57cec5SDimitry Andric 28630b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 28640b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 
28650b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 28660b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 28670b57cec5SDimitry Andric return true; 28680b57cec5SDimitry Andric } 28690b57cec5SDimitry Andric 28700b57cec5SDimitry Andric BufferPtr = CurPtr; 28710b57cec5SDimitry Andric return false; 28720b57cec5SDimitry Andric } 28730b57cec5SDimitry Andric 28740b57cec5SDimitry Andric // Check to see if the first character after the '/*' is another /. If so, 28750b57cec5SDimitry Andric // then this slash does not end the block comment, it is part of it. 28760b57cec5SDimitry Andric if (C == '/') 28770b57cec5SDimitry Andric C = *CurPtr++; 28780b57cec5SDimitry Andric 2879753f127fSDimitry Andric // C++23 [lex.phases] p1 2880753f127fSDimitry Andric // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a 2881753f127fSDimitry Andric // diagnostic only once per entire ill-formed subsequence to avoid 2882753f127fSDimitry Andric // emiting to many diagnostics (see http://unicode.org/review/pr-121.html). 2883753f127fSDimitry Andric bool UnicodeDecodingAlreadyDiagnosed = false; 2884753f127fSDimitry Andric 28850b57cec5SDimitry Andric while (true) { 28860b57cec5SDimitry Andric // Skip over all non-interesting characters until we find end of buffer or a 28870b57cec5SDimitry Andric // (probably ending) '/' character. 28880b57cec5SDimitry Andric if (CurPtr + 24 < BufferEnd && 28890b57cec5SDimitry Andric // If there is a code-completion point avoid the fast scan because it 28900b57cec5SDimitry Andric // doesn't check for '\0'. 28910b57cec5SDimitry Andric !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 28920b57cec5SDimitry Andric // While not aligned to a 16-byte boundary. 
2893753f127fSDimitry Andric while (C != '/' && (intptr_t)CurPtr % 16 != 0) { 2894753f127fSDimitry Andric if (!isASCII(C)) 2895753f127fSDimitry Andric goto MultiByteUTF8; 28960b57cec5SDimitry Andric C = *CurPtr++; 2897753f127fSDimitry Andric } 28980b57cec5SDimitry Andric if (C == '/') goto FoundSlash; 28990b57cec5SDimitry Andric 29000b57cec5SDimitry Andric #ifdef __SSE2__ 29010b57cec5SDimitry Andric __m128i Slashes = _mm_set1_epi8('/'); 2902753f127fSDimitry Andric while (CurPtr + 16 < BufferEnd) { 2903753f127fSDimitry Andric int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr); 2904753f127fSDimitry Andric if (LLVM_UNLIKELY(Mask != 0)) { 2905753f127fSDimitry Andric goto MultiByteUTF8; 2906753f127fSDimitry Andric } 2907753f127fSDimitry Andric // look for slashes 29080b57cec5SDimitry Andric int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, 29090b57cec5SDimitry Andric Slashes)); 29100b57cec5SDimitry Andric if (cmp != 0) { 29110b57cec5SDimitry Andric // Adjust the pointer to point directly after the first slash. It's 29120b57cec5SDimitry Andric // not necessary to set C here, it will be overwritten at the end of 29130b57cec5SDimitry Andric // the outer loop. 
291406c3fb27SDimitry Andric CurPtr += llvm::countr_zero<unsigned>(cmp) + 1; 29150b57cec5SDimitry Andric goto FoundSlash; 29160b57cec5SDimitry Andric } 29170b57cec5SDimitry Andric CurPtr += 16; 29180b57cec5SDimitry Andric } 29190b57cec5SDimitry Andric #elif __ALTIVEC__ 2920753f127fSDimitry Andric __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2921753f127fSDimitry Andric 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2922753f127fSDimitry Andric 0x80, 0x80, 0x80, 0x80}; 29230b57cec5SDimitry Andric __vector unsigned char Slashes = { 29240b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/', 29250b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/' 29260b57cec5SDimitry Andric }; 2927753f127fSDimitry Andric while (CurPtr + 16 < BufferEnd) { 2928753f127fSDimitry Andric if (LLVM_UNLIKELY( 2929753f127fSDimitry Andric vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF))) 2930753f127fSDimitry Andric goto MultiByteUTF8; 2931753f127fSDimitry Andric if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) { 2932753f127fSDimitry Andric break; 2933753f127fSDimitry Andric } 29340b57cec5SDimitry Andric CurPtr += 16; 2935753f127fSDimitry Andric } 2936753f127fSDimitry Andric 29370b57cec5SDimitry Andric #else 2938753f127fSDimitry Andric while (CurPtr + 16 < BufferEnd) { 2939753f127fSDimitry Andric bool HasNonASCII = false; 2940753f127fSDimitry Andric for (unsigned I = 0; I < 16; ++I) 2941753f127fSDimitry Andric HasNonASCII |= !isASCII(CurPtr[I]); 2942753f127fSDimitry Andric 2943753f127fSDimitry Andric if (LLVM_UNLIKELY(HasNonASCII)) 2944753f127fSDimitry Andric goto MultiByteUTF8; 2945753f127fSDimitry Andric 2946753f127fSDimitry Andric bool HasSlash = false; 2947753f127fSDimitry Andric for (unsigned I = 0; I < 16; ++I) 2948753f127fSDimitry Andric HasSlash |= CurPtr[I] == '/'; 2949753f127fSDimitry Andric if (HasSlash) 2950753f127fSDimitry Andric break; 2951753f127fSDimitry Andric CurPtr += 16; 29520b57cec5SDimitry Andric } 
29530b57cec5SDimitry Andric #endif 29540b57cec5SDimitry Andric 29550b57cec5SDimitry Andric // It has to be one of the bytes scanned, increment to it and read one. 29560b57cec5SDimitry Andric C = *CurPtr++; 29570b57cec5SDimitry Andric } 29580b57cec5SDimitry Andric 2959753f127fSDimitry Andric // Loop to scan the remainder, warning on invalid UTF-8 2960753f127fSDimitry Andric // if the corresponding warning is enabled, emitting a diagnostic only once 2961753f127fSDimitry Andric // per sequence that cannot be decoded. 2962753f127fSDimitry Andric while (C != '/' && C != '\0') { 2963753f127fSDimitry Andric if (isASCII(C)) { 2964753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = false; 29650b57cec5SDimitry Andric C = *CurPtr++; 2966753f127fSDimitry Andric continue; 2967753f127fSDimitry Andric } 2968753f127fSDimitry Andric MultiByteUTF8: 2969753f127fSDimitry Andric // CurPtr is 1 code unit past C, so to decode 2970753f127fSDimitry Andric // the codepoint, we need to read from the previous position. 2971753f127fSDimitry Andric unsigned Length = llvm::getUTF8SequenceSize( 2972753f127fSDimitry Andric (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd); 2973753f127fSDimitry Andric if (Length == 0) { 2974753f127fSDimitry Andric if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode()) 2975753f127fSDimitry Andric Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment); 2976753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = true; 2977753f127fSDimitry Andric } else { 2978753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = false; 2979753f127fSDimitry Andric CurPtr += Length - 1; 2980753f127fSDimitry Andric } 2981753f127fSDimitry Andric C = *CurPtr++; 2982753f127fSDimitry Andric } 29830b57cec5SDimitry Andric 29840b57cec5SDimitry Andric if (C == '/') { 29850b57cec5SDimitry Andric FoundSlash: 29860b57cec5SDimitry Andric if (CurPtr[-2] == '*') // We found the final */. We're done! 
29870b57cec5SDimitry Andric break; 29880b57cec5SDimitry Andric 29890b57cec5SDimitry Andric if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 299081ad6265SDimitry Andric if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this, 299181ad6265SDimitry Andric LangOpts.Trigraphs)) { 29920b57cec5SDimitry Andric // We found the final */, though it had an escaped newline between the 29930b57cec5SDimitry Andric // * and /. We're done! 29940b57cec5SDimitry Andric break; 29950b57cec5SDimitry Andric } 29960b57cec5SDimitry Andric } 29970b57cec5SDimitry Andric if (CurPtr[0] == '*' && CurPtr[1] != '/') { 29980b57cec5SDimitry Andric // If this is a /* inside of the comment, emit a warning. Don't do this 29990b57cec5SDimitry Andric // if this is a /*/, which will end the comment. This misses cases with 30000b57cec5SDimitry Andric // embedded escaped newlines, but oh well. 30010b57cec5SDimitry Andric if (!isLexingRawMode()) 30020b57cec5SDimitry Andric Diag(CurPtr-1, diag::warn_nested_block_comment); 30030b57cec5SDimitry Andric } 30040b57cec5SDimitry Andric } else if (C == 0 && CurPtr == BufferEnd+1) { 30050b57cec5SDimitry Andric if (!isLexingRawMode()) 30060b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 30070b57cec5SDimitry Andric // Note: the user probably forgot a */. We could continue immediately 30080b57cec5SDimitry Andric // after the /*, but this would involve lexing a lot of what really is the 30090b57cec5SDimitry Andric // comment, which surely would confuse the parser. 30100b57cec5SDimitry Andric --CurPtr; 30110b57cec5SDimitry Andric 30120b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 30130b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 
30140b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 30150b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 30160b57cec5SDimitry Andric return true; 30170b57cec5SDimitry Andric } 30180b57cec5SDimitry Andric 30190b57cec5SDimitry Andric BufferPtr = CurPtr; 30200b57cec5SDimitry Andric return false; 30210b57cec5SDimitry Andric } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 30220b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 30230b57cec5SDimitry Andric cutOffLexing(); 30240b57cec5SDimitry Andric return false; 30250b57cec5SDimitry Andric } 30260b57cec5SDimitry Andric 30270b57cec5SDimitry Andric C = *CurPtr++; 30280b57cec5SDimitry Andric } 30290b57cec5SDimitry Andric 30300b57cec5SDimitry Andric // Notify comment handlers about the comment unless we're in a #if 0 block. 30310b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 30320b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 30330b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 30340b57cec5SDimitry Andric BufferPtr = CurPtr; 30350b57cec5SDimitry Andric return true; // A token has to be returned. 30360b57cec5SDimitry Andric } 30370b57cec5SDimitry Andric 30380b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 30390b57cec5SDimitry Andric if (inKeepCommentMode()) { 30400b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 30410b57cec5SDimitry Andric return true; 30420b57cec5SDimitry Andric } 30430b57cec5SDimitry Andric 30440b57cec5SDimitry Andric // It is common for the tokens immediately after a /**/ comment to be 30450b57cec5SDimitry Andric // whitespace. Instead of going through the big switch, handle it 30460b57cec5SDimitry Andric // efficiently now. This is safe even in KeepWhitespaceMode because we would 30470b57cec5SDimitry Andric // have already returned above with the comment as a token. 
30480b57cec5SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) { 30490b57cec5SDimitry Andric SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 30500b57cec5SDimitry Andric return false; 30510b57cec5SDimitry Andric } 30520b57cec5SDimitry Andric 30530b57cec5SDimitry Andric // Otherwise, just return so that the next character will be lexed as a token. 30540b57cec5SDimitry Andric BufferPtr = CurPtr; 30550b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 30560b57cec5SDimitry Andric return false; 30570b57cec5SDimitry Andric } 30580b57cec5SDimitry Andric 30590b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 30600b57cec5SDimitry Andric // Primary Lexing Entry Points 30610b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 30620b57cec5SDimitry Andric 30630b57cec5SDimitry Andric /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 30640b57cec5SDimitry Andric /// uninterpreted string. This switches the lexer out of directive mode. 30650b57cec5SDimitry Andric void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 30660b57cec5SDimitry Andric assert(ParsingPreprocessorDirective && ParsingFilename == false && 30670b57cec5SDimitry Andric "Must be in a preprocessing directive!"); 30680b57cec5SDimitry Andric Token Tmp; 3069480093f4SDimitry Andric Tmp.startToken(); 30700b57cec5SDimitry Andric 30710b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 30720b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 30730b57cec5SDimitry Andric while (true) { 30740b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Tmp); 30750b57cec5SDimitry Andric switch (Char) { 30760b57cec5SDimitry Andric default: 30770b57cec5SDimitry Andric if (Result) 30780b57cec5SDimitry Andric Result->push_back(Char); 30790b57cec5SDimitry Andric break; 30800b57cec5SDimitry Andric case 0: // Null. 
30810b57cec5SDimitry Andric // Found end of file? 30820b57cec5SDimitry Andric if (CurPtr-1 != BufferEnd) { 30830b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 30840b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 30850b57cec5SDimitry Andric cutOffLexing(); 30860b57cec5SDimitry Andric return; 30870b57cec5SDimitry Andric } 30880b57cec5SDimitry Andric 30890b57cec5SDimitry Andric // Nope, normal character, continue. 30900b57cec5SDimitry Andric if (Result) 30910b57cec5SDimitry Andric Result->push_back(Char); 30920b57cec5SDimitry Andric break; 30930b57cec5SDimitry Andric } 30940b57cec5SDimitry Andric // FALL THROUGH. 3095bdd1243dSDimitry Andric [[fallthrough]]; 30960b57cec5SDimitry Andric case '\r': 30970b57cec5SDimitry Andric case '\n': 30980b57cec5SDimitry Andric // Okay, we found the end of the line. First, back up past the \0, \r, \n. 30990b57cec5SDimitry Andric assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 31000b57cec5SDimitry Andric BufferPtr = CurPtr-1; 31010b57cec5SDimitry Andric 31020b57cec5SDimitry Andric // Next, lex the character, which should handle the EOD transition. 31030b57cec5SDimitry Andric Lex(Tmp); 31040b57cec5SDimitry Andric if (Tmp.is(tok::code_completion)) { 31050b57cec5SDimitry Andric if (PP) 31060b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 31070b57cec5SDimitry Andric Lex(Tmp); 31080b57cec5SDimitry Andric } 31090b57cec5SDimitry Andric assert(Tmp.is(tok::eod) && "Unexpected token!"); 31100b57cec5SDimitry Andric 31110b57cec5SDimitry Andric // Finally, we're done; 31120b57cec5SDimitry Andric return; 31130b57cec5SDimitry Andric } 31140b57cec5SDimitry Andric } 31150b57cec5SDimitry Andric } 31160b57cec5SDimitry Andric 31170b57cec5SDimitry Andric /// LexEndOfFile - CurPtr points to the end of this file. Handle this 31180b57cec5SDimitry Andric /// condition, reporting diagnostics and handling other edge cases as required. 
31190b57cec5SDimitry Andric /// This returns true if Result contains a token, false if PP.Lex should be 31200b57cec5SDimitry Andric /// called again. 31210b57cec5SDimitry Andric bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 31220b57cec5SDimitry Andric // If we hit the end of the file while parsing a preprocessor directive, 31230b57cec5SDimitry Andric // end the preprocessor directive first. The next token returned will 31240b57cec5SDimitry Andric // then be the end of file. 31250b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 31260b57cec5SDimitry Andric // Done parsing the "line". 31270b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 31280b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 31290b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::eod); 31300b57cec5SDimitry Andric 31310b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 31320b57cec5SDimitry Andric if (PP) 31330b57cec5SDimitry Andric resetExtendedTokenMode(); 31340b57cec5SDimitry Andric return true; // Have a token. 31350b57cec5SDimitry Andric } 31360b57cec5SDimitry Andric 31370b57cec5SDimitry Andric // If we are in raw mode, return this event as an EOF token. Let the caller 31380b57cec5SDimitry Andric // that put us in raw mode handle the event. 31390b57cec5SDimitry Andric if (isLexingRawMode()) { 31400b57cec5SDimitry Andric Result.startToken(); 31410b57cec5SDimitry Andric BufferPtr = BufferEnd; 31420b57cec5SDimitry Andric FormTokenWithChars(Result, BufferEnd, tok::eof); 31430b57cec5SDimitry Andric return true; 31440b57cec5SDimitry Andric } 31450b57cec5SDimitry Andric 31460b57cec5SDimitry Andric if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 31470b57cec5SDimitry Andric PP->setRecordedPreambleConditionalStack(ConditionalStack); 3148fe6060f1SDimitry Andric // If the preamble cuts off the end of a header guard, consider it guarded. 
3149fe6060f1SDimitry Andric // The guard is valid for the preamble content itself, and for tools the 3150fe6060f1SDimitry Andric // most useful answer is "yes, this file has a header guard". 3151fe6060f1SDimitry Andric if (!ConditionalStack.empty()) 3152fe6060f1SDimitry Andric MIOpt.ExitTopLevelConditional(); 31530b57cec5SDimitry Andric ConditionalStack.clear(); 31540b57cec5SDimitry Andric } 31550b57cec5SDimitry Andric 31560b57cec5SDimitry Andric // Issue diagnostics for unterminated #if and missing newline. 31570b57cec5SDimitry Andric 31580b57cec5SDimitry Andric // If we are in a #if directive, emit an error. 31590b57cec5SDimitry Andric while (!ConditionalStack.empty()) { 31600b57cec5SDimitry Andric if (PP->getCodeCompletionFileLoc() != FileLoc) 31610b57cec5SDimitry Andric PP->Diag(ConditionalStack.back().IfLoc, 31620b57cec5SDimitry Andric diag::err_pp_unterminated_conditional); 31630b57cec5SDimitry Andric ConditionalStack.pop_back(); 31640b57cec5SDimitry Andric } 31650b57cec5SDimitry Andric 31660b57cec5SDimitry Andric // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 31670b57cec5SDimitry Andric // a pedwarn. 31680b57cec5SDimitry Andric if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 31690b57cec5SDimitry Andric DiagnosticsEngine &Diags = PP->getDiagnostics(); 317081ad6265SDimitry Andric SourceLocation EndLoc = getSourceLocation(BufferEnd); 31710b57cec5SDimitry Andric unsigned DiagID; 31720b57cec5SDimitry Andric 31730b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 31740b57cec5SDimitry Andric // C++11 [lex.phases] 2.2 p2 31750b57cec5SDimitry Andric // Prefer the C++98 pedantic compatibility warning over the generic, 31760b57cec5SDimitry Andric // non-extension, user-requested "missing newline at EOF" warning. 
31770b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 31780b57cec5SDimitry Andric DiagID = diag::warn_cxx98_compat_no_newline_eof; 31790b57cec5SDimitry Andric } else { 31800b57cec5SDimitry Andric DiagID = diag::warn_no_newline_eof; 31810b57cec5SDimitry Andric } 31820b57cec5SDimitry Andric } else { 31830b57cec5SDimitry Andric DiagID = diag::ext_no_newline_eof; 31840b57cec5SDimitry Andric } 31850b57cec5SDimitry Andric 31860b57cec5SDimitry Andric Diag(BufferEnd, DiagID) 31870b57cec5SDimitry Andric << FixItHint::CreateInsertion(EndLoc, "\n"); 31880b57cec5SDimitry Andric } 31890b57cec5SDimitry Andric 31900b57cec5SDimitry Andric BufferPtr = CurPtr; 31910b57cec5SDimitry Andric 31920b57cec5SDimitry Andric // Finally, let the preprocessor handle this. 319381ad6265SDimitry Andric return PP->HandleEndOfFile(Result, isPragmaLexer()); 31940b57cec5SDimitry Andric } 31950b57cec5SDimitry Andric 31960b57cec5SDimitry Andric /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 31970b57cec5SDimitry Andric /// the specified lexer will return a tok::l_paren token, 0 if it is something 31980b57cec5SDimitry Andric /// else and 2 if there are no more tokens in the buffer controlled by the 31990b57cec5SDimitry Andric /// lexer. 32000b57cec5SDimitry Andric unsigned Lexer::isNextPPTokenLParen() { 32010b57cec5SDimitry Andric assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 32020b57cec5SDimitry Andric 320381ad6265SDimitry Andric if (isDependencyDirectivesLexer()) { 320481ad6265SDimitry Andric if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) 320581ad6265SDimitry Andric return 2; 320681ad6265SDimitry Andric return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 320781ad6265SDimitry Andric tok::l_paren); 320881ad6265SDimitry Andric } 320981ad6265SDimitry Andric 32100b57cec5SDimitry Andric // Switch to 'skipping' mode. 
This will ensure that we can lex a token 32110b57cec5SDimitry Andric // without emitting diagnostics, disables macro expansion, and will cause EOF 32120b57cec5SDimitry Andric // to return an EOF token instead of popping the include stack. 32130b57cec5SDimitry Andric LexingRawMode = true; 32140b57cec5SDimitry Andric 32150b57cec5SDimitry Andric // Save state that can be changed while lexing so that we can restore it. 32160b57cec5SDimitry Andric const char *TmpBufferPtr = BufferPtr; 32170b57cec5SDimitry Andric bool inPPDirectiveMode = ParsingPreprocessorDirective; 32180b57cec5SDimitry Andric bool atStartOfLine = IsAtStartOfLine; 32190b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 32200b57cec5SDimitry Andric bool leadingSpace = HasLeadingSpace; 32210b57cec5SDimitry Andric 32220b57cec5SDimitry Andric Token Tok; 32230b57cec5SDimitry Andric Lex(Tok); 32240b57cec5SDimitry Andric 32250b57cec5SDimitry Andric // Restore state that may have changed. 32260b57cec5SDimitry Andric BufferPtr = TmpBufferPtr; 32270b57cec5SDimitry Andric ParsingPreprocessorDirective = inPPDirectiveMode; 32280b57cec5SDimitry Andric HasLeadingSpace = leadingSpace; 32290b57cec5SDimitry Andric IsAtStartOfLine = atStartOfLine; 32300b57cec5SDimitry Andric IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 32310b57cec5SDimitry Andric 32320b57cec5SDimitry Andric // Restore the lexer back to non-skipping mode. 32330b57cec5SDimitry Andric LexingRawMode = false; 32340b57cec5SDimitry Andric 32350b57cec5SDimitry Andric if (Tok.is(tok::eof)) 32360b57cec5SDimitry Andric return 2; 32370b57cec5SDimitry Andric return Tok.is(tok::l_paren); 32380b57cec5SDimitry Andric } 32390b57cec5SDimitry Andric 32400b57cec5SDimitry Andric /// Find the end of a version control conflict marker. 
32410b57cec5SDimitry Andric static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 32420b57cec5SDimitry Andric ConflictMarkerKind CMK) { 32430b57cec5SDimitry Andric const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 32440b57cec5SDimitry Andric size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 32450b57cec5SDimitry Andric auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); 32460b57cec5SDimitry Andric size_t Pos = RestOfBuffer.find(Terminator); 32470b57cec5SDimitry Andric while (Pos != StringRef::npos) { 32480b57cec5SDimitry Andric // Must occur at start of line. 32490b57cec5SDimitry Andric if (Pos == 0 || 32500b57cec5SDimitry Andric (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { 32510b57cec5SDimitry Andric RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 32520b57cec5SDimitry Andric Pos = RestOfBuffer.find(Terminator); 32530b57cec5SDimitry Andric continue; 32540b57cec5SDimitry Andric } 32550b57cec5SDimitry Andric return RestOfBuffer.data()+Pos; 32560b57cec5SDimitry Andric } 32570b57cec5SDimitry Andric return nullptr; 32580b57cec5SDimitry Andric } 32590b57cec5SDimitry Andric 32600b57cec5SDimitry Andric /// IsStartOfConflictMarker - If the specified pointer is the start of a version 32610b57cec5SDimitry Andric /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 32620b57cec5SDimitry Andric /// and recover nicely. This returns true if it is a conflict marker and false 32630b57cec5SDimitry Andric /// if not. 32640b57cec5SDimitry Andric bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 32650b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 32660b57cec5SDimitry Andric if (CurPtr != BufferStart && 32670b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 32680b57cec5SDimitry Andric return false; 32690b57cec5SDimitry Andric 32700b57cec5SDimitry Andric // Check to see if we have <<<<<<< or >>>>. 
32715f757f3fSDimitry Andric if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") && 32725f757f3fSDimitry Andric !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> ")) 32730b57cec5SDimitry Andric return false; 32740b57cec5SDimitry Andric 32750b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 32760b57cec5SDimitry Andric // it. 32770b57cec5SDimitry Andric if (CurrentConflictMarkerState || isLexingRawMode()) 32780b57cec5SDimitry Andric return false; 32790b57cec5SDimitry Andric 32800b57cec5SDimitry Andric ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 32810b57cec5SDimitry Andric 32820b57cec5SDimitry Andric // Check to see if there is an ending marker somewhere in the buffer at the 32830b57cec5SDimitry Andric // start of a line to terminate this conflict marker. 32840b57cec5SDimitry Andric if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 32850b57cec5SDimitry Andric // We found a match. We are really in a conflict marker. 32860b57cec5SDimitry Andric // Diagnose this, and ignore to the end of line. 32870b57cec5SDimitry Andric Diag(CurPtr, diag::err_conflict_marker); 32880b57cec5SDimitry Andric CurrentConflictMarkerState = Kind; 32890b57cec5SDimitry Andric 32900b57cec5SDimitry Andric // Skip ahead to the end of line. We know this exists because the 32910b57cec5SDimitry Andric // end-of-conflict marker starts with \r or \n. 32920b57cec5SDimitry Andric while (*CurPtr != '\r' && *CurPtr != '\n') { 32930b57cec5SDimitry Andric assert(CurPtr != BufferEnd && "Didn't find end of line"); 32940b57cec5SDimitry Andric ++CurPtr; 32950b57cec5SDimitry Andric } 32960b57cec5SDimitry Andric BufferPtr = CurPtr; 32970b57cec5SDimitry Andric return true; 32980b57cec5SDimitry Andric } 32990b57cec5SDimitry Andric 33000b57cec5SDimitry Andric // No end of conflict marker found. 
33010b57cec5SDimitry Andric return false; 33020b57cec5SDimitry Andric } 33030b57cec5SDimitry Andric 33040b57cec5SDimitry Andric /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 33050b57cec5SDimitry Andric /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 33060b57cec5SDimitry Andric /// is the end of a conflict marker. Handle it by ignoring up until the end of 33070b57cec5SDimitry Andric /// the line. This returns true if it is a conflict marker and false if not. 33080b57cec5SDimitry Andric bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 33090b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 33100b57cec5SDimitry Andric if (CurPtr != BufferStart && 33110b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 33120b57cec5SDimitry Andric return false; 33130b57cec5SDimitry Andric 33140b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 33150b57cec5SDimitry Andric // it. 33160b57cec5SDimitry Andric if (!CurrentConflictMarkerState || isLexingRawMode()) 33170b57cec5SDimitry Andric return false; 33180b57cec5SDimitry Andric 33190b57cec5SDimitry Andric // Check to see if we have the marker (4 characters in a row). 33200b57cec5SDimitry Andric for (unsigned i = 1; i != 4; ++i) 33210b57cec5SDimitry Andric if (CurPtr[i] != CurPtr[0]) 33220b57cec5SDimitry Andric return false; 33230b57cec5SDimitry Andric 33240b57cec5SDimitry Andric // If we do have it, search for the end of the conflict marker. This could 33250b57cec5SDimitry Andric // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 33260b57cec5SDimitry Andric // be the end of conflict marker. 
33270b57cec5SDimitry Andric if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 33280b57cec5SDimitry Andric CurrentConflictMarkerState)) { 33290b57cec5SDimitry Andric CurPtr = End; 33300b57cec5SDimitry Andric 33310b57cec5SDimitry Andric // Skip ahead to the end of line. 33320b57cec5SDimitry Andric while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 33330b57cec5SDimitry Andric ++CurPtr; 33340b57cec5SDimitry Andric 33350b57cec5SDimitry Andric BufferPtr = CurPtr; 33360b57cec5SDimitry Andric 33370b57cec5SDimitry Andric // No longer in the conflict marker. 33380b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 33390b57cec5SDimitry Andric return true; 33400b57cec5SDimitry Andric } 33410b57cec5SDimitry Andric 33420b57cec5SDimitry Andric return false; 33430b57cec5SDimitry Andric } 33440b57cec5SDimitry Andric 33450b57cec5SDimitry Andric static const char *findPlaceholderEnd(const char *CurPtr, 33460b57cec5SDimitry Andric const char *BufferEnd) { 33470b57cec5SDimitry Andric if (CurPtr == BufferEnd) 33480b57cec5SDimitry Andric return nullptr; 33490b57cec5SDimitry Andric BufferEnd -= 1; // Scan until the second last character. 
33500b57cec5SDimitry Andric for (; CurPtr != BufferEnd; ++CurPtr) { 33510b57cec5SDimitry Andric if (CurPtr[0] == '#' && CurPtr[1] == '>') 33520b57cec5SDimitry Andric return CurPtr + 2; 33530b57cec5SDimitry Andric } 33540b57cec5SDimitry Andric return nullptr; 33550b57cec5SDimitry Andric } 33560b57cec5SDimitry Andric 33570b57cec5SDimitry Andric bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { 33580b57cec5SDimitry Andric assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); 33590b57cec5SDimitry Andric if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) 33600b57cec5SDimitry Andric return false; 33610b57cec5SDimitry Andric const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd); 33620b57cec5SDimitry Andric if (!End) 33630b57cec5SDimitry Andric return false; 33640b57cec5SDimitry Andric const char *Start = CurPtr - 1; 33650b57cec5SDimitry Andric if (!LangOpts.AllowEditorPlaceholders) 33660b57cec5SDimitry Andric Diag(Start, diag::err_placeholder_in_source); 33670b57cec5SDimitry Andric Result.startToken(); 33680b57cec5SDimitry Andric FormTokenWithChars(Result, End, tok::raw_identifier); 33690b57cec5SDimitry Andric Result.setRawIdentifierData(Start); 33700b57cec5SDimitry Andric PP->LookUpIdentifierInfo(Result); 33710b57cec5SDimitry Andric Result.setFlag(Token::IsEditorPlaceholder); 33720b57cec5SDimitry Andric BufferPtr = End; 33730b57cec5SDimitry Andric return true; 33740b57cec5SDimitry Andric } 33750b57cec5SDimitry Andric 33760b57cec5SDimitry Andric bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 33770b57cec5SDimitry Andric if (PP && PP->isCodeCompletionEnabled()) { 33780b57cec5SDimitry Andric SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 33790b57cec5SDimitry Andric return Loc == PP->getCodeCompletionLoc(); 33800b57cec5SDimitry Andric } 33810b57cec5SDimitry Andric 33820b57cec5SDimitry Andric return false; 33830b57cec5SDimitry Andric } 33840b57cec5SDimitry Andric 
3385bdd1243dSDimitry Andric std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr, 338681ad6265SDimitry Andric const char *SlashLoc, 33870b57cec5SDimitry Andric Token *Result) { 33880b57cec5SDimitry Andric unsigned CharSize; 33890b57cec5SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize); 339081ad6265SDimitry Andric assert((Kind == 'u' || Kind == 'U') && "expected a UCN"); 33910b57cec5SDimitry Andric 33920b57cec5SDimitry Andric unsigned NumHexDigits; 33930b57cec5SDimitry Andric if (Kind == 'u') 33940b57cec5SDimitry Andric NumHexDigits = 4; 33950b57cec5SDimitry Andric else if (Kind == 'U') 33960b57cec5SDimitry Andric NumHexDigits = 8; 339781ad6265SDimitry Andric 339881ad6265SDimitry Andric bool Delimited = false; 339981ad6265SDimitry Andric bool FoundEndDelimiter = false; 340081ad6265SDimitry Andric unsigned Count = 0; 340181ad6265SDimitry Andric bool Diagnose = Result && !isLexingRawMode(); 34020b57cec5SDimitry Andric 34030b57cec5SDimitry Andric if (!LangOpts.CPlusPlus && !LangOpts.C99) { 3404349cc55cSDimitry Andric if (Diagnose) 34050b57cec5SDimitry Andric Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 3406bdd1243dSDimitry Andric return std::nullopt; 34070b57cec5SDimitry Andric } 34080b57cec5SDimitry Andric 34090b57cec5SDimitry Andric const char *CurPtr = StartPtr + CharSize; 34100b57cec5SDimitry Andric const char *KindLoc = &CurPtr[-1]; 34110b57cec5SDimitry Andric 34120b57cec5SDimitry Andric uint32_t CodePoint = 0; 3413349cc55cSDimitry Andric while (Count != NumHexDigits || Delimited) { 34140b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, CharSize); 3415bdd1243dSDimitry Andric if (!Delimited && Count == 0 && C == '{') { 3416349cc55cSDimitry Andric Delimited = true; 3417349cc55cSDimitry Andric CurPtr += CharSize; 3418349cc55cSDimitry Andric continue; 3419349cc55cSDimitry Andric } 3420349cc55cSDimitry Andric 3421349cc55cSDimitry Andric if (Delimited && C == '}') { 3422349cc55cSDimitry Andric CurPtr += CharSize; 
3423349cc55cSDimitry Andric FoundEndDelimiter = true; 3424349cc55cSDimitry Andric break; 3425349cc55cSDimitry Andric } 34260b57cec5SDimitry Andric 34270b57cec5SDimitry Andric unsigned Value = llvm::hexDigitValue(C); 34280b57cec5SDimitry Andric if (Value == -1U) { 3429349cc55cSDimitry Andric if (!Delimited) 3430349cc55cSDimitry Andric break; 3431349cc55cSDimitry Andric if (Diagnose) 3432bdd1243dSDimitry Andric Diag(SlashLoc, diag::warn_delimited_ucn_incomplete) 343381ad6265SDimitry Andric << StringRef(KindLoc, 1); 3434bdd1243dSDimitry Andric return std::nullopt; 3435349cc55cSDimitry Andric } 34360b57cec5SDimitry Andric 3437349cc55cSDimitry Andric if (CodePoint & 0xF000'0000) { 3438349cc55cSDimitry Andric if (Diagnose) 3439349cc55cSDimitry Andric Diag(KindLoc, diag::err_escape_too_large) << 0; 3440bdd1243dSDimitry Andric return std::nullopt; 3441349cc55cSDimitry Andric } 3442349cc55cSDimitry Andric 3443349cc55cSDimitry Andric CodePoint <<= 4; 3444349cc55cSDimitry Andric CodePoint |= Value; 3445349cc55cSDimitry Andric CurPtr += CharSize; 3446349cc55cSDimitry Andric Count++; 3447349cc55cSDimitry Andric } 3448349cc55cSDimitry Andric 3449349cc55cSDimitry Andric if (Count == 0) { 3450349cc55cSDimitry Andric if (Diagnose) 3451bdd1243dSDimitry Andric Diag(SlashLoc, FoundEndDelimiter ? 
diag::warn_delimited_ucn_empty 3452349cc55cSDimitry Andric : diag::warn_ucn_escape_no_digits) 3453349cc55cSDimitry Andric << StringRef(KindLoc, 1); 3454bdd1243dSDimitry Andric return std::nullopt; 345581ad6265SDimitry Andric } 345681ad6265SDimitry Andric 345781ad6265SDimitry Andric if (Delimited && Kind == 'U') { 345881ad6265SDimitry Andric if (Diagnose) 3459bdd1243dSDimitry Andric Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1); 3460bdd1243dSDimitry Andric return std::nullopt; 3461349cc55cSDimitry Andric } 3462349cc55cSDimitry Andric 3463349cc55cSDimitry Andric if (!Delimited && Count != NumHexDigits) { 3464349cc55cSDimitry Andric if (Diagnose) { 3465bdd1243dSDimitry Andric Diag(SlashLoc, diag::warn_ucn_escape_incomplete); 34660b57cec5SDimitry Andric // If the user wrote \U1234, suggest a fixit to \u. 3467349cc55cSDimitry Andric if (Count == 4 && NumHexDigits == 8) { 34680b57cec5SDimitry Andric CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 34690b57cec5SDimitry Andric Diag(KindLoc, diag::note_ucn_four_not_eight) 34700b57cec5SDimitry Andric << FixItHint::CreateReplacement(URange, "u"); 34710b57cec5SDimitry Andric } 34720b57cec5SDimitry Andric } 3473bdd1243dSDimitry Andric return std::nullopt; 34740b57cec5SDimitry Andric } 34750b57cec5SDimitry Andric 3476349cc55cSDimitry Andric if (Delimited && PP) { 347706c3fb27SDimitry Andric Diag(SlashLoc, PP->getLangOpts().CPlusPlus23 347806c3fb27SDimitry Andric ? diag::warn_cxx23_delimited_escape_sequence 3479753f127fSDimitry Andric : diag::ext_delimited_escape_sequence) 3480753f127fSDimitry Andric << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 
1 : 0); 34810b57cec5SDimitry Andric } 34820b57cec5SDimitry Andric 34830b57cec5SDimitry Andric if (Result) { 34840b57cec5SDimitry Andric Result->setFlag(Token::HasUCN); 3485bdd1243dSDimitry Andric // If the UCN contains either a trigraph or a line splicing, 3486bdd1243dSDimitry Andric // we need to call getAndAdvanceChar again to set the appropriate flags 3487bdd1243dSDimitry Andric // on Result. 3488bdd1243dSDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0))) 34890b57cec5SDimitry Andric StartPtr = CurPtr; 34900b57cec5SDimitry Andric else 34910b57cec5SDimitry Andric while (StartPtr != CurPtr) 34920b57cec5SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result); 34930b57cec5SDimitry Andric } else { 34940b57cec5SDimitry Andric StartPtr = CurPtr; 34950b57cec5SDimitry Andric } 349681ad6265SDimitry Andric return CodePoint; 349781ad6265SDimitry Andric } 349881ad6265SDimitry Andric 3499bdd1243dSDimitry Andric std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr, 3500bdd1243dSDimitry Andric const char *SlashLoc, 350181ad6265SDimitry Andric Token *Result) { 350281ad6265SDimitry Andric unsigned CharSize; 350381ad6265SDimitry Andric bool Diagnose = Result && !isLexingRawMode(); 350481ad6265SDimitry Andric 350581ad6265SDimitry Andric char C = getCharAndSize(StartPtr, CharSize); 350681ad6265SDimitry Andric assert(C == 'N' && "expected \\N{...}"); 350781ad6265SDimitry Andric 350881ad6265SDimitry Andric const char *CurPtr = StartPtr + CharSize; 350981ad6265SDimitry Andric const char *KindLoc = &CurPtr[-1]; 351081ad6265SDimitry Andric 351181ad6265SDimitry Andric C = getCharAndSize(CurPtr, CharSize); 351281ad6265SDimitry Andric if (C != '{') { 351381ad6265SDimitry Andric if (Diagnose) 3514bdd1243dSDimitry Andric Diag(SlashLoc, diag::warn_ucn_escape_incomplete); 3515bdd1243dSDimitry Andric return std::nullopt; 351681ad6265SDimitry Andric } 351781ad6265SDimitry Andric CurPtr += CharSize; 351881ad6265SDimitry Andric const char 
*StartName = CurPtr; 351981ad6265SDimitry Andric bool FoundEndDelimiter = false; 352081ad6265SDimitry Andric llvm::SmallVector<char, 30> Buffer; 352181ad6265SDimitry Andric while (C) { 352281ad6265SDimitry Andric C = getCharAndSize(CurPtr, CharSize); 352381ad6265SDimitry Andric CurPtr += CharSize; 352481ad6265SDimitry Andric if (C == '}') { 352581ad6265SDimitry Andric FoundEndDelimiter = true; 352681ad6265SDimitry Andric break; 352781ad6265SDimitry Andric } 352881ad6265SDimitry Andric 3529bdd1243dSDimitry Andric if (isVerticalWhitespace(C)) 353081ad6265SDimitry Andric break; 353181ad6265SDimitry Andric Buffer.push_back(C); 353281ad6265SDimitry Andric } 353381ad6265SDimitry Andric 353481ad6265SDimitry Andric if (!FoundEndDelimiter || Buffer.empty()) { 353581ad6265SDimitry Andric if (Diagnose) 3536bdd1243dSDimitry Andric Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty 353781ad6265SDimitry Andric : diag::warn_delimited_ucn_incomplete) 353881ad6265SDimitry Andric << StringRef(KindLoc, 1); 3539bdd1243dSDimitry Andric return std::nullopt; 354081ad6265SDimitry Andric } 354181ad6265SDimitry Andric 354281ad6265SDimitry Andric StringRef Name(Buffer.data(), Buffer.size()); 3543bdd1243dSDimitry Andric std::optional<char32_t> Match = 354481ad6265SDimitry Andric llvm::sys::unicode::nameToCodepointStrict(Name); 3545bdd1243dSDimitry Andric std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch; 3546bdd1243dSDimitry Andric if (!Match) { 354781ad6265SDimitry Andric LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name); 3548bdd1243dSDimitry Andric if (Diagnose) { 3549bdd1243dSDimitry Andric Diag(StartName, diag::err_invalid_ucn_name) 3550bdd1243dSDimitry Andric << StringRef(Buffer.data(), Buffer.size()) 3551bdd1243dSDimitry Andric << makeCharRange(*this, StartName, CurPtr - CharSize); 355281ad6265SDimitry Andric if (LooseMatch) { 355381ad6265SDimitry Andric Diag(StartName, diag::note_invalid_ucn_name_loose_matching) 355481ad6265SDimitry 
Andric << FixItHint::CreateReplacement( 355581ad6265SDimitry Andric makeCharRange(*this, StartName, CurPtr - CharSize), 355681ad6265SDimitry Andric LooseMatch->Name); 355781ad6265SDimitry Andric } 355881ad6265SDimitry Andric } 3559bdd1243dSDimitry Andric // We do not offer misspelled character names suggestions here 356081ad6265SDimitry Andric // as the set of what would be a valid suggestion depends on context, 356181ad6265SDimitry Andric // and we should not make invalid suggestions. 356281ad6265SDimitry Andric } 356381ad6265SDimitry Andric 3564bdd1243dSDimitry Andric if (Diagnose && Match) 356506c3fb27SDimitry Andric Diag(SlashLoc, PP->getLangOpts().CPlusPlus23 356606c3fb27SDimitry Andric ? diag::warn_cxx23_delimited_escape_sequence 3567753f127fSDimitry Andric : diag::ext_delimited_escape_sequence) 3568753f127fSDimitry Andric << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0); 356981ad6265SDimitry Andric 3570bdd1243dSDimitry Andric // If no diagnostic has been emitted yet, likely because we are doing a 3571bdd1243dSDimitry Andric // tentative lexing, we do not want to recover here to make sure the token 3572bdd1243dSDimitry Andric // will not be incorrectly considered valid. This function will be called 3573bdd1243dSDimitry Andric // again and a diagnostic emitted then. 3574bdd1243dSDimitry Andric if (LooseMatch && Diagnose) 3575bdd1243dSDimitry Andric Match = LooseMatch->CodePoint; 357681ad6265SDimitry Andric 357781ad6265SDimitry Andric if (Result) { 357881ad6265SDimitry Andric Result->setFlag(Token::HasUCN); 3579bdd1243dSDimitry Andric // If the UCN contains either a trigraph or a line splicing, 3580bdd1243dSDimitry Andric // we need to call getAndAdvanceChar again to set the appropriate flags 3581bdd1243dSDimitry Andric // on Result. 
3582bdd1243dSDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3)) 358381ad6265SDimitry Andric StartPtr = CurPtr; 358481ad6265SDimitry Andric else 358581ad6265SDimitry Andric while (StartPtr != CurPtr) 358681ad6265SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result); 358781ad6265SDimitry Andric } else { 358881ad6265SDimitry Andric StartPtr = CurPtr; 358981ad6265SDimitry Andric } 3590bdd1243dSDimitry Andric return Match ? std::optional<uint32_t>(*Match) : std::nullopt; 359181ad6265SDimitry Andric } 359281ad6265SDimitry Andric 359381ad6265SDimitry Andric uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 359481ad6265SDimitry Andric Token *Result) { 359581ad6265SDimitry Andric 359681ad6265SDimitry Andric unsigned CharSize; 3597bdd1243dSDimitry Andric std::optional<uint32_t> CodePointOpt; 359881ad6265SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize); 359981ad6265SDimitry Andric if (Kind == 'u' || Kind == 'U') 360081ad6265SDimitry Andric CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result); 360181ad6265SDimitry Andric else if (Kind == 'N') 3602bdd1243dSDimitry Andric CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result); 360381ad6265SDimitry Andric 360481ad6265SDimitry Andric if (!CodePointOpt) 360581ad6265SDimitry Andric return 0; 360681ad6265SDimitry Andric 360781ad6265SDimitry Andric uint32_t CodePoint = *CodePointOpt; 36080b57cec5SDimitry Andric 36090b57cec5SDimitry Andric // Don't apply C family restrictions to UCNs in assembly mode 36100b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) 36110b57cec5SDimitry Andric return CodePoint; 36120b57cec5SDimitry Andric 36135f757f3fSDimitry Andric // C23 6.4.3p2: A universal character name shall not designate a code point 361406c3fb27SDimitry Andric // where the hexadecimal value is: 361506c3fb27SDimitry Andric // - in the range D800 through DFFF inclusive; or 361606c3fb27SDimitry Andric // - greater than 10FFFF. 
361706c3fb27SDimitry Andric // A universal-character-name outside the c-char-sequence of a character 361806c3fb27SDimitry Andric // constant, or the s-char-sequence of a string-literal shall not designate 361906c3fb27SDimitry Andric // a control character or a character in the basic character set. 362006c3fb27SDimitry Andric 36210b57cec5SDimitry Andric // C++11 [lex.charset]p2: If the hexadecimal value for a 36220b57cec5SDimitry Andric // universal-character-name corresponds to a surrogate code point (in the 36230b57cec5SDimitry Andric // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 36240b57cec5SDimitry Andric // if the hexadecimal value for a universal-character-name outside the 36250b57cec5SDimitry Andric // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 36260b57cec5SDimitry Andric // string literal corresponds to a control character (in either of the 36270b57cec5SDimitry Andric // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 36280b57cec5SDimitry Andric // basic source character set, the program is ill-formed. 36290b57cec5SDimitry Andric if (CodePoint < 0xA0) { 36300b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to warn about bad 36310b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 
36320b57cec5SDimitry Andric if (Result && PP) { 36330b57cec5SDimitry Andric if (CodePoint < 0x20 || CodePoint >= 0x7F) 36340b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_control_character); 36350b57cec5SDimitry Andric else { 36360b57cec5SDimitry Andric char C = static_cast<char>(CodePoint); 36370b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 36380b57cec5SDimitry Andric } 36390b57cec5SDimitry Andric } 36400b57cec5SDimitry Andric 36410b57cec5SDimitry Andric return 0; 36420b57cec5SDimitry Andric } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 36430b57cec5SDimitry Andric // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 36440b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to diagnose bad 36450b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 36460b57cec5SDimitry Andric if (Result && PP) { 36470b57cec5SDimitry Andric if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 36480b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 36490b57cec5SDimitry Andric else 36500b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_invalid); 36510b57cec5SDimitry Andric } 36520b57cec5SDimitry Andric return 0; 36530b57cec5SDimitry Andric } 36540b57cec5SDimitry Andric 36550b57cec5SDimitry Andric return CodePoint; 36560b57cec5SDimitry Andric } 36570b57cec5SDimitry Andric 36580b57cec5SDimitry Andric bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 36590b57cec5SDimitry Andric const char *CurPtr) { 36600b57cec5SDimitry Andric if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3661349cc55cSDimitry Andric isUnicodeWhitespace(C)) { 36620b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unicode_whitespace) 36630b57cec5SDimitry Andric << makeCharRange(*this, BufferPtr, CurPtr); 36640b57cec5SDimitry Andric 36650b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 36660b57cec5SDimitry Andric return 
true; 36670b57cec5SDimitry Andric } 36680b57cec5SDimitry Andric return false; 36690b57cec5SDimitry Andric } 36700b57cec5SDimitry Andric 36710b57cec5SDimitry Andric void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 36720b57cec5SDimitry Andric IsAtStartOfLine = Result.isAtStartOfLine(); 36730b57cec5SDimitry Andric HasLeadingSpace = Result.hasLeadingSpace(); 36740b57cec5SDimitry Andric HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 36750b57cec5SDimitry Andric // Note that this doesn't affect IsAtPhysicalStartOfLine. 36760b57cec5SDimitry Andric } 36770b57cec5SDimitry Andric 36780b57cec5SDimitry Andric bool Lexer::Lex(Token &Result) { 367981ad6265SDimitry Andric assert(!isDependencyDirectivesLexer()); 368081ad6265SDimitry Andric 36810b57cec5SDimitry Andric // Start a new token. 36820b57cec5SDimitry Andric Result.startToken(); 36830b57cec5SDimitry Andric 36840b57cec5SDimitry Andric // Set up misc whitespace flags for LexTokenInternal. 36850b57cec5SDimitry Andric if (IsAtStartOfLine) { 36860b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 36870b57cec5SDimitry Andric IsAtStartOfLine = false; 36880b57cec5SDimitry Andric } 36890b57cec5SDimitry Andric 36900b57cec5SDimitry Andric if (HasLeadingSpace) { 36910b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 36920b57cec5SDimitry Andric HasLeadingSpace = false; 36930b57cec5SDimitry Andric } 36940b57cec5SDimitry Andric 36950b57cec5SDimitry Andric if (HasLeadingEmptyMacro) { 36960b57cec5SDimitry Andric Result.setFlag(Token::LeadingEmptyMacro); 36970b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 36980b57cec5SDimitry Andric } 36990b57cec5SDimitry Andric 37000b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 37010b57cec5SDimitry Andric IsAtPhysicalStartOfLine = false; 37020b57cec5SDimitry Andric bool isRawLex = isLexingRawMode(); 37030b57cec5SDimitry Andric (void) isRawLex; 37040b57cec5SDimitry Andric bool returnedToken = LexTokenInternal(Result, 
atPhysicalStartOfLine); 37050b57cec5SDimitry Andric // (After the LexTokenInternal call, the lexer might be destroyed.) 37060b57cec5SDimitry Andric assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 37070b57cec5SDimitry Andric return returnedToken; 37080b57cec5SDimitry Andric } 37090b57cec5SDimitry Andric 37100b57cec5SDimitry Andric /// LexTokenInternal - This implements a simple C family lexer. It is an 37110b57cec5SDimitry Andric /// extremely performance critical piece of code. This assumes that the buffer 37120b57cec5SDimitry Andric /// has a null character at the end of the file. This returns a preprocessing 37130b57cec5SDimitry Andric /// token, not a normal token, as such, it is an internal interface. It assumes 37140b57cec5SDimitry Andric /// that the Flags of result have been cleared before calling this. 37150b57cec5SDimitry Andric bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 3716bdd1243dSDimitry Andric LexStart: 3717bdd1243dSDimitry Andric assert(!Result.needsCleaning() && "Result needs cleaning"); 3718bdd1243dSDimitry Andric assert(!Result.hasPtrData() && "Result has not been reset"); 37190b57cec5SDimitry Andric 37200b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 37210b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 37220b57cec5SDimitry Andric 37230b57cec5SDimitry Andric // Small amounts of horizontal whitespace is very common between tokens. 3724fe6060f1SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) { 3725fe6060f1SDimitry Andric do { 37260b57cec5SDimitry Andric ++CurPtr; 3727fe6060f1SDimitry Andric } while (isHorizontalWhitespace(*CurPtr)); 37280b57cec5SDimitry Andric 37290b57cec5SDimitry Andric // If we are keeping whitespace and other tokens, just return what we just 37300b57cec5SDimitry Andric // skipped. The next lexer invocation will return the token after the 37310b57cec5SDimitry Andric // whitespace. 
37320b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 37330b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 37340b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 37350b57cec5SDimitry Andric return true; 37360b57cec5SDimitry Andric } 37370b57cec5SDimitry Andric 37380b57cec5SDimitry Andric BufferPtr = CurPtr; 37390b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 37400b57cec5SDimitry Andric } 37410b57cec5SDimitry Andric 37420b57cec5SDimitry Andric unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 37430b57cec5SDimitry Andric 37440b57cec5SDimitry Andric // Read a character, advancing over it. 37450b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Result); 37460b57cec5SDimitry Andric tok::TokenKind Kind; 37470b57cec5SDimitry Andric 3748e8d8bef9SDimitry Andric if (!isVerticalWhitespace(Char)) 3749e8d8bef9SDimitry Andric NewLinePtr = nullptr; 3750e8d8bef9SDimitry Andric 37510b57cec5SDimitry Andric switch (Char) { 37520b57cec5SDimitry Andric case 0: // Null. 37530b57cec5SDimitry Andric // Found end of file? 37540b57cec5SDimitry Andric if (CurPtr-1 == BufferEnd) 37550b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 37560b57cec5SDimitry Andric 37570b57cec5SDimitry Andric // Check if we are performing code completion. 37580b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 37590b57cec5SDimitry Andric // Return the code-completion token. 
37600b57cec5SDimitry Andric Result.startToken(); 37610b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::code_completion); 37620b57cec5SDimitry Andric return true; 37630b57cec5SDimitry Andric } 37640b57cec5SDimitry Andric 37650b57cec5SDimitry Andric if (!isLexingRawMode()) 37660b57cec5SDimitry Andric Diag(CurPtr-1, diag::null_in_file); 37670b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 37680b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 37690b57cec5SDimitry Andric return true; // KeepWhitespaceMode 37700b57cec5SDimitry Andric 37710b57cec5SDimitry Andric // We know the lexer hasn't changed, so just try again with this lexer. 37720b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 37730b57cec5SDimitry Andric goto LexNextToken; 37740b57cec5SDimitry Andric 37750b57cec5SDimitry Andric case 26: // DOS & CP/M EOF: "^Z". 37760b57cec5SDimitry Andric // If we're in Microsoft extensions mode, treat this as end of file. 37770b57cec5SDimitry Andric if (LangOpts.MicrosoftExt) { 37780b57cec5SDimitry Andric if (!isLexingRawMode()) 37790b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); 37800b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 37810b57cec5SDimitry Andric } 37820b57cec5SDimitry Andric 37830b57cec5SDimitry Andric // If Microsoft extensions are disabled, this is just random garbage. 37840b57cec5SDimitry Andric Kind = tok::unknown; 37850b57cec5SDimitry Andric break; 37860b57cec5SDimitry Andric 37870b57cec5SDimitry Andric case '\r': 37880b57cec5SDimitry Andric if (CurPtr[0] == '\n') 37890b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 3790bdd1243dSDimitry Andric [[fallthrough]]; 37910b57cec5SDimitry Andric case '\n': 37920b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 37930b57cec5SDimitry Andric // we know we are done with the directive, so return an EOD token. 
37940b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 37950b57cec5SDimitry Andric // Done parsing the "line". 37960b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 37970b57cec5SDimitry Andric 37980b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 37990b57cec5SDimitry Andric if (PP) 38000b57cec5SDimitry Andric resetExtendedTokenMode(); 38010b57cec5SDimitry Andric 38020b57cec5SDimitry Andric // Since we consumed a newline, we are back at the start of a line. 38030b57cec5SDimitry Andric IsAtStartOfLine = true; 38040b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 3805e8d8bef9SDimitry Andric NewLinePtr = CurPtr - 1; 38060b57cec5SDimitry Andric 38070b57cec5SDimitry Andric Kind = tok::eod; 38080b57cec5SDimitry Andric break; 38090b57cec5SDimitry Andric } 38100b57cec5SDimitry Andric 38110b57cec5SDimitry Andric // No leading whitespace seen so far. 38120b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 38130b57cec5SDimitry Andric 38140b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 38150b57cec5SDimitry Andric return true; // KeepWhitespaceMode 38160b57cec5SDimitry Andric 38170b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 38180b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 
38190b57cec5SDimitry Andric goto LexNextToken; 38200b57cec5SDimitry Andric case ' ': 38210b57cec5SDimitry Andric case '\t': 38220b57cec5SDimitry Andric case '\f': 38230b57cec5SDimitry Andric case '\v': 38240b57cec5SDimitry Andric SkipHorizontalWhitespace: 38250b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 38260b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 38270b57cec5SDimitry Andric return true; // KeepWhitespaceMode 38280b57cec5SDimitry Andric 38290b57cec5SDimitry Andric SkipIgnoredUnits: 38300b57cec5SDimitry Andric CurPtr = BufferPtr; 38310b57cec5SDimitry Andric 38320b57cec5SDimitry Andric // If the next token is obviously a // or /* */ comment, skip it efficiently 38330b57cec5SDimitry Andric // too (without going through the big switch stmt). 38340b57cec5SDimitry Andric if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 383581ad6265SDimitry Andric LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 38360b57cec5SDimitry Andric if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 38370b57cec5SDimitry Andric return true; // There is a token to return. 38380b57cec5SDimitry Andric goto SkipIgnoredUnits; 38390b57cec5SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 38400b57cec5SDimitry Andric if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 38410b57cec5SDimitry Andric return true; // There is a token to return. 38420b57cec5SDimitry Andric goto SkipIgnoredUnits; 38430b57cec5SDimitry Andric } else if (isHorizontalWhitespace(*CurPtr)) { 38440b57cec5SDimitry Andric goto SkipHorizontalWhitespace; 38450b57cec5SDimitry Andric } 38460b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 38470b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 
38480b57cec5SDimitry Andric goto LexNextToken; 38490b57cec5SDimitry Andric 38500b57cec5SDimitry Andric // C99 6.4.4.1: Integer Constants. 38510b57cec5SDimitry Andric // C99 6.4.4.2: Floating Constants. 38520b57cec5SDimitry Andric case '0': case '1': case '2': case '3': case '4': 38530b57cec5SDimitry Andric case '5': case '6': case '7': case '8': case '9': 38540b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 38550b57cec5SDimitry Andric MIOpt.ReadToken(); 38560b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 38570b57cec5SDimitry Andric 385881ad6265SDimitry Andric // Identifier (e.g., uber), or 38595f757f3fSDimitry Andric // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or 386081ad6265SDimitry Andric // UTF-8 or UTF-16 string literal (C11/C++11). 386181ad6265SDimitry Andric case 'u': 38620b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 38630b57cec5SDimitry Andric MIOpt.ReadToken(); 38640b57cec5SDimitry Andric 38650b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 38660b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38670b57cec5SDimitry Andric 38680b57cec5SDimitry Andric // UTF-16 string literal 38690b57cec5SDimitry Andric if (Char == '"') 38700b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 38710b57cec5SDimitry Andric tok::utf16_string_literal); 38720b57cec5SDimitry Andric 38730b57cec5SDimitry Andric // UTF-16 character constant 38740b57cec5SDimitry Andric if (Char == '\'') 38750b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 38760b57cec5SDimitry Andric tok::utf16_char_constant); 38770b57cec5SDimitry Andric 38780b57cec5SDimitry Andric // UTF-16 raw string literal 3879*0fca6ea1SDimitry Andric if (Char == 'R' && LangOpts.RawStringLiterals && 38800b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 38810b57cec5SDimitry 
Andric return LexRawStringLiteral(Result, 38820b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 38830b57cec5SDimitry Andric SizeTmp2, Result), 38840b57cec5SDimitry Andric tok::utf16_string_literal); 38850b57cec5SDimitry Andric 38860b57cec5SDimitry Andric if (Char == '8') { 38870b57cec5SDimitry Andric char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 38880b57cec5SDimitry Andric 38890b57cec5SDimitry Andric // UTF-8 string literal 38900b57cec5SDimitry Andric if (Char2 == '"') 38910b57cec5SDimitry Andric return LexStringLiteral(Result, 38920b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 38930b57cec5SDimitry Andric SizeTmp2, Result), 38940b57cec5SDimitry Andric tok::utf8_string_literal); 38955f757f3fSDimitry Andric if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23)) 38960b57cec5SDimitry Andric return LexCharConstant( 38970b57cec5SDimitry Andric Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 38980b57cec5SDimitry Andric SizeTmp2, Result), 38990b57cec5SDimitry Andric tok::utf8_char_constant); 39000b57cec5SDimitry Andric 3901*0fca6ea1SDimitry Andric if (Char2 == 'R' && LangOpts.RawStringLiterals) { 39020b57cec5SDimitry Andric unsigned SizeTmp3; 39030b57cec5SDimitry Andric char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 39040b57cec5SDimitry Andric // UTF-8 raw string literal 39050b57cec5SDimitry Andric if (Char3 == '"') { 39060b57cec5SDimitry Andric return LexRawStringLiteral(Result, 39070b57cec5SDimitry Andric ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39080b57cec5SDimitry Andric SizeTmp2, Result), 39090b57cec5SDimitry Andric SizeTmp3, Result), 39100b57cec5SDimitry Andric tok::utf8_string_literal); 39110b57cec5SDimitry Andric } 39120b57cec5SDimitry Andric } 39130b57cec5SDimitry Andric } 39140b57cec5SDimitry Andric } 39150b57cec5SDimitry Andric 39160b57cec5SDimitry Andric // treat u like the start of an identifier. 
3917349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 39180b57cec5SDimitry Andric 391981ad6265SDimitry Andric case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal 39200b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 39210b57cec5SDimitry Andric MIOpt.ReadToken(); 39220b57cec5SDimitry Andric 39230b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 39240b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39250b57cec5SDimitry Andric 39260b57cec5SDimitry Andric // UTF-32 string literal 39270b57cec5SDimitry Andric if (Char == '"') 39280b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 39290b57cec5SDimitry Andric tok::utf32_string_literal); 39300b57cec5SDimitry Andric 39310b57cec5SDimitry Andric // UTF-32 character constant 39320b57cec5SDimitry Andric if (Char == '\'') 39330b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 39340b57cec5SDimitry Andric tok::utf32_char_constant); 39350b57cec5SDimitry Andric 39360b57cec5SDimitry Andric // UTF-32 raw string literal 3937*0fca6ea1SDimitry Andric if (Char == 'R' && LangOpts.RawStringLiterals && 39380b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 39390b57cec5SDimitry Andric return LexRawStringLiteral(Result, 39400b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39410b57cec5SDimitry Andric SizeTmp2, Result), 39420b57cec5SDimitry Andric tok::utf32_string_literal); 39430b57cec5SDimitry Andric } 39440b57cec5SDimitry Andric 39450b57cec5SDimitry Andric // treat U like the start of an identifier. 3946349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 39470b57cec5SDimitry Andric 39480b57cec5SDimitry Andric case 'R': // Identifier or C++0x raw string literal 39490b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
39500b57cec5SDimitry Andric MIOpt.ReadToken(); 39510b57cec5SDimitry Andric 3952*0fca6ea1SDimitry Andric if (LangOpts.RawStringLiterals) { 39530b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39540b57cec5SDimitry Andric 39550b57cec5SDimitry Andric if (Char == '"') 39560b57cec5SDimitry Andric return LexRawStringLiteral(Result, 39570b57cec5SDimitry Andric ConsumeChar(CurPtr, SizeTmp, Result), 39580b57cec5SDimitry Andric tok::string_literal); 39590b57cec5SDimitry Andric } 39600b57cec5SDimitry Andric 39610b57cec5SDimitry Andric // treat R like the start of an identifier. 3962349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 39630b57cec5SDimitry Andric 39640b57cec5SDimitry Andric case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 39650b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 39660b57cec5SDimitry Andric MIOpt.ReadToken(); 39670b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39680b57cec5SDimitry Andric 39690b57cec5SDimitry Andric // Wide string literal. 39700b57cec5SDimitry Andric if (Char == '"') 39710b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 39720b57cec5SDimitry Andric tok::wide_string_literal); 39730b57cec5SDimitry Andric 39740b57cec5SDimitry Andric // Wide raw string literal. 3975*0fca6ea1SDimitry Andric if (LangOpts.RawStringLiterals && Char == 'R' && 39760b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 39770b57cec5SDimitry Andric return LexRawStringLiteral(Result, 39780b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39790b57cec5SDimitry Andric SizeTmp2, Result), 39800b57cec5SDimitry Andric tok::wide_string_literal); 39810b57cec5SDimitry Andric 39820b57cec5SDimitry Andric // Wide character constant. 
39830b57cec5SDimitry Andric if (Char == '\'') 39840b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 39850b57cec5SDimitry Andric tok::wide_char_constant); 39860b57cec5SDimitry Andric // FALL THROUGH, treating L like the start of an identifier. 3987bdd1243dSDimitry Andric [[fallthrough]]; 39880b57cec5SDimitry Andric 39890b57cec5SDimitry Andric // C99 6.4.2: Identifiers. 39900b57cec5SDimitry Andric case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 39910b57cec5SDimitry Andric case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 39920b57cec5SDimitry Andric case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 39930b57cec5SDimitry Andric case 'V': case 'W': case 'X': case 'Y': case 'Z': 39940b57cec5SDimitry Andric case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 39950b57cec5SDimitry Andric case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 39960b57cec5SDimitry Andric case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 39970b57cec5SDimitry Andric case 'v': case 'w': case 'x': case 'y': case 'z': 39980b57cec5SDimitry Andric case '_': 39990b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 40000b57cec5SDimitry Andric MIOpt.ReadToken(); 4001349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 40020b57cec5SDimitry Andric 40030b57cec5SDimitry Andric case '$': // $ in identifiers. 40040b57cec5SDimitry Andric if (LangOpts.DollarIdents) { 40050b57cec5SDimitry Andric if (!isLexingRawMode()) 40060b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_dollar_in_identifier); 40070b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
40080b57cec5SDimitry Andric MIOpt.ReadToken(); 4009349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 40100b57cec5SDimitry Andric } 40110b57cec5SDimitry Andric 40120b57cec5SDimitry Andric Kind = tok::unknown; 40130b57cec5SDimitry Andric break; 40140b57cec5SDimitry Andric 40150b57cec5SDimitry Andric // C99 6.4.4: Character Constants. 40160b57cec5SDimitry Andric case '\'': 40170b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 40180b57cec5SDimitry Andric MIOpt.ReadToken(); 40190b57cec5SDimitry Andric return LexCharConstant(Result, CurPtr, tok::char_constant); 40200b57cec5SDimitry Andric 40210b57cec5SDimitry Andric // C99 6.4.5: String Literals. 40220b57cec5SDimitry Andric case '"': 40230b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 40240b57cec5SDimitry Andric MIOpt.ReadToken(); 40250b57cec5SDimitry Andric return LexStringLiteral(Result, CurPtr, 40260b57cec5SDimitry Andric ParsingFilename ? tok::header_name 40270b57cec5SDimitry Andric : tok::string_literal); 40280b57cec5SDimitry Andric 40290b57cec5SDimitry Andric // C99 6.4.6: Punctuators. 
40300b57cec5SDimitry Andric case '?': 40310b57cec5SDimitry Andric Kind = tok::question; 40320b57cec5SDimitry Andric break; 40330b57cec5SDimitry Andric case '[': 40340b57cec5SDimitry Andric Kind = tok::l_square; 40350b57cec5SDimitry Andric break; 40360b57cec5SDimitry Andric case ']': 40370b57cec5SDimitry Andric Kind = tok::r_square; 40380b57cec5SDimitry Andric break; 40390b57cec5SDimitry Andric case '(': 40400b57cec5SDimitry Andric Kind = tok::l_paren; 40410b57cec5SDimitry Andric break; 40420b57cec5SDimitry Andric case ')': 40430b57cec5SDimitry Andric Kind = tok::r_paren; 40440b57cec5SDimitry Andric break; 40450b57cec5SDimitry Andric case '{': 40460b57cec5SDimitry Andric Kind = tok::l_brace; 40470b57cec5SDimitry Andric break; 40480b57cec5SDimitry Andric case '}': 40490b57cec5SDimitry Andric Kind = tok::r_brace; 40500b57cec5SDimitry Andric break; 40510b57cec5SDimitry Andric case '.': 40520b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40530b57cec5SDimitry Andric if (Char >= '0' && Char <= '9') { 40540b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 40550b57cec5SDimitry Andric MIOpt.ReadToken(); 40560b57cec5SDimitry Andric 40570b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 40580b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus && Char == '*') { 40590b57cec5SDimitry Andric Kind = tok::periodstar; 40600b57cec5SDimitry Andric CurPtr += SizeTmp; 40610b57cec5SDimitry Andric } else if (Char == '.' 
&& 40620b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 40630b57cec5SDimitry Andric Kind = tok::ellipsis; 40640b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 40650b57cec5SDimitry Andric SizeTmp2, Result); 40660b57cec5SDimitry Andric } else { 40670b57cec5SDimitry Andric Kind = tok::period; 40680b57cec5SDimitry Andric } 40690b57cec5SDimitry Andric break; 40700b57cec5SDimitry Andric case '&': 40710b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40720b57cec5SDimitry Andric if (Char == '&') { 40730b57cec5SDimitry Andric Kind = tok::ampamp; 40740b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40750b57cec5SDimitry Andric } else if (Char == '=') { 40760b57cec5SDimitry Andric Kind = tok::ampequal; 40770b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40780b57cec5SDimitry Andric } else { 40790b57cec5SDimitry Andric Kind = tok::amp; 40800b57cec5SDimitry Andric } 40810b57cec5SDimitry Andric break; 40820b57cec5SDimitry Andric case '*': 40830b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 40840b57cec5SDimitry Andric Kind = tok::starequal; 40850b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40860b57cec5SDimitry Andric } else { 40870b57cec5SDimitry Andric Kind = tok::star; 40880b57cec5SDimitry Andric } 40890b57cec5SDimitry Andric break; 40900b57cec5SDimitry Andric case '+': 40910b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40920b57cec5SDimitry Andric if (Char == '+') { 40930b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40940b57cec5SDimitry Andric Kind = tok::plusplus; 40950b57cec5SDimitry Andric } else if (Char == '=') { 40960b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40970b57cec5SDimitry Andric Kind = tok::plusequal; 40980b57cec5SDimitry Andric } else { 40990b57cec5SDimitry Andric Kind = tok::plus; 41000b57cec5SDimitry Andric } 41010b57cec5SDimitry 
Andric break; 41020b57cec5SDimitry Andric case '-': 41030b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 41040b57cec5SDimitry Andric if (Char == '-') { // -- 41050b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41060b57cec5SDimitry Andric Kind = tok::minusminus; 41070b57cec5SDimitry Andric } else if (Char == '>' && LangOpts.CPlusPlus && 41080b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 41090b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 41100b57cec5SDimitry Andric SizeTmp2, Result); 41110b57cec5SDimitry Andric Kind = tok::arrowstar; 41120b57cec5SDimitry Andric } else if (Char == '>') { // -> 41130b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41140b57cec5SDimitry Andric Kind = tok::arrow; 41150b57cec5SDimitry Andric } else if (Char == '=') { // -= 41160b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41170b57cec5SDimitry Andric Kind = tok::minusequal; 41180b57cec5SDimitry Andric } else { 41190b57cec5SDimitry Andric Kind = tok::minus; 41200b57cec5SDimitry Andric } 41210b57cec5SDimitry Andric break; 41220b57cec5SDimitry Andric case '~': 41230b57cec5SDimitry Andric Kind = tok::tilde; 41240b57cec5SDimitry Andric break; 41250b57cec5SDimitry Andric case '!': 41260b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 41270b57cec5SDimitry Andric Kind = tok::exclaimequal; 41280b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41290b57cec5SDimitry Andric } else { 41300b57cec5SDimitry Andric Kind = tok::exclaim; 41310b57cec5SDimitry Andric } 41320b57cec5SDimitry Andric break; 41330b57cec5SDimitry Andric case '/': 41340b57cec5SDimitry Andric // 6.4.9: Comments 41350b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 41360b57cec5SDimitry Andric if (Char == '/') { // Line comment. 41370b57cec5SDimitry Andric // Even if Line comments are disabled (e.g. 
in C89 mode), we generally 41380b57cec5SDimitry Andric // want to lex this as a comment. There is one problem with this though, 41390b57cec5SDimitry Andric // that in one particular corner case, this can change the behavior of the 41400b57cec5SDimitry Andric // resultant program. For example, In "foo //**/ bar", C89 would lex 41410b57cec5SDimitry Andric // this as "foo / bar" and languages with Line comments would lex it as 41420b57cec5SDimitry Andric // "foo". Check to see if the character after the second slash is a '*'. 41430b57cec5SDimitry Andric // If so, we will lex that as a "/" instead of the start of a comment. 41440b57cec5SDimitry Andric // However, we never do this if we are just preprocessing. 414581ad6265SDimitry Andric bool TreatAsComment = 414681ad6265SDimitry Andric LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 41470b57cec5SDimitry Andric if (!TreatAsComment) 41480b57cec5SDimitry Andric if (!(PP && PP->isPreprocessedOutput())) 41490b57cec5SDimitry Andric TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 41500b57cec5SDimitry Andric 41510b57cec5SDimitry Andric if (TreatAsComment) { 41520b57cec5SDimitry Andric if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 41530b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 41540b57cec5SDimitry Andric return true; // There is a token to return. 41550b57cec5SDimitry Andric 41560b57cec5SDimitry Andric // It is common for the tokens immediately after a // comment to be 41570b57cec5SDimitry Andric // whitespace (indentation for the next line). Instead of going through 41580b57cec5SDimitry Andric // the big switch, handle it efficiently now. 41590b57cec5SDimitry Andric goto SkipIgnoredUnits; 41600b57cec5SDimitry Andric } 41610b57cec5SDimitry Andric } 41620b57cec5SDimitry Andric 41630b57cec5SDimitry Andric if (Char == '*') { // /**/ comment. 
41640b57cec5SDimitry Andric if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 41650b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 41660b57cec5SDimitry Andric return true; // There is a token to return. 41670b57cec5SDimitry Andric 41680b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 41690b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 41700b57cec5SDimitry Andric goto LexNextToken; 41710b57cec5SDimitry Andric } 41720b57cec5SDimitry Andric 41730b57cec5SDimitry Andric if (Char == '=') { 41740b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41750b57cec5SDimitry Andric Kind = tok::slashequal; 41760b57cec5SDimitry Andric } else { 41770b57cec5SDimitry Andric Kind = tok::slash; 41780b57cec5SDimitry Andric } 41790b57cec5SDimitry Andric break; 41800b57cec5SDimitry Andric case '%': 41810b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 41820b57cec5SDimitry Andric if (Char == '=') { 41830b57cec5SDimitry Andric Kind = tok::percentequal; 41840b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41850b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '>') { 41860b57cec5SDimitry Andric Kind = tok::r_brace; // '%>' -> '}' 41870b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41880b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { 41890b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41900b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 41910b57cec5SDimitry Andric if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 41920b57cec5SDimitry Andric Kind = tok::hashhash; // '%:%:' -> '##' 41930b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 41940b57cec5SDimitry Andric SizeTmp2, Result); 41950b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 41960b57cec5SDimitry 
Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41970b57cec5SDimitry Andric if (!isLexingRawMode()) 41980b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 41990b57cec5SDimitry Andric Kind = tok::hashat; 42000b57cec5SDimitry Andric } else { // '%:' -> '#' 42010b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 42020b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 42030b57cec5SDimitry Andric // the preprocessor to handle it. 42040b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 42050b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 42060b57cec5SDimitry Andric goto HandleDirective; 42070b57cec5SDimitry Andric 42080b57cec5SDimitry Andric Kind = tok::hash; 42090b57cec5SDimitry Andric } 42100b57cec5SDimitry Andric } else { 42110b57cec5SDimitry Andric Kind = tok::percent; 42120b57cec5SDimitry Andric } 42130b57cec5SDimitry Andric break; 42140b57cec5SDimitry Andric case '<': 42150b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 42160b57cec5SDimitry Andric if (ParsingFilename) { 42170b57cec5SDimitry Andric return LexAngledStringLiteral(Result, CurPtr); 42180b57cec5SDimitry Andric } else if (Char == '<') { 42190b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 42200b57cec5SDimitry Andric if (After == '=') { 42210b57cec5SDimitry Andric Kind = tok::lesslessequal; 42220b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 42230b57cec5SDimitry Andric SizeTmp2, Result); 42240b57cec5SDimitry Andric } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 42250b57cec5SDimitry Andric // If this is actually a '<<<<<<<' version control conflict marker, 42260b57cec5SDimitry Andric // recognize it as such and recover nicely. 
42270b57cec5SDimitry Andric goto LexNextToken; 42280b57cec5SDimitry Andric } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 42290b57cec5SDimitry Andric // If this is '<<<<' and we're in a Perforce-style conflict marker, 42300b57cec5SDimitry Andric // ignore it. 42310b57cec5SDimitry Andric goto LexNextToken; 42320b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '<') { 42330b57cec5SDimitry Andric Kind = tok::lesslessless; 42340b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 42350b57cec5SDimitry Andric SizeTmp2, Result); 42360b57cec5SDimitry Andric } else { 42370b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42380b57cec5SDimitry Andric Kind = tok::lessless; 42390b57cec5SDimitry Andric } 42400b57cec5SDimitry Andric } else if (Char == '=') { 42410b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 42420b57cec5SDimitry Andric if (After == '>') { 424381ad6265SDimitry Andric if (LangOpts.CPlusPlus20) { 42440b57cec5SDimitry Andric if (!isLexingRawMode()) 42450b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 42460b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 42470b57cec5SDimitry Andric SizeTmp2, Result); 42480b57cec5SDimitry Andric Kind = tok::spaceship; 42490b57cec5SDimitry Andric break; 42500b57cec5SDimitry Andric } 42510b57cec5SDimitry Andric // Suggest adding a space between the '<=' and the '>' to avoid a 42520b57cec5SDimitry Andric // change in semantics if this turns up in C++ <=17 mode. 
425381ad6265SDimitry Andric if (LangOpts.CPlusPlus && !isLexingRawMode()) { 42545ffd83dbSDimitry Andric Diag(BufferPtr, diag::warn_cxx20_compat_spaceship) 42550b57cec5SDimitry Andric << FixItHint::CreateInsertion( 42560b57cec5SDimitry Andric getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 42570b57cec5SDimitry Andric } 42580b57cec5SDimitry Andric } 42590b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42600b57cec5SDimitry Andric Kind = tok::lessequal; 42610b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 42620b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && 42630b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 42640b57cec5SDimitry Andric // C++0x [lex.pptoken]p3: 42650b57cec5SDimitry Andric // Otherwise, if the next three characters are <:: and the subsequent 42660b57cec5SDimitry Andric // character is neither : nor >, the < is treated as a preprocessor 42670b57cec5SDimitry Andric // token by itself and not as the first character of the alternative 42680b57cec5SDimitry Andric // token <:. 
42690b57cec5SDimitry Andric unsigned SizeTmp3; 42700b57cec5SDimitry Andric char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 42710b57cec5SDimitry Andric if (After != ':' && After != '>') { 42720b57cec5SDimitry Andric Kind = tok::less; 42730b57cec5SDimitry Andric if (!isLexingRawMode()) 42740b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 42750b57cec5SDimitry Andric break; 42760b57cec5SDimitry Andric } 42770b57cec5SDimitry Andric } 42780b57cec5SDimitry Andric 42790b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42800b57cec5SDimitry Andric Kind = tok::l_square; 42810b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 42820b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42830b57cec5SDimitry Andric Kind = tok::l_brace; 42840b57cec5SDimitry Andric } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 42850b57cec5SDimitry Andric lexEditorPlaceholder(Result, CurPtr)) { 42860b57cec5SDimitry Andric return true; 42870b57cec5SDimitry Andric } else { 42880b57cec5SDimitry Andric Kind = tok::less; 42890b57cec5SDimitry Andric } 42900b57cec5SDimitry Andric break; 42910b57cec5SDimitry Andric case '>': 42920b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 42930b57cec5SDimitry Andric if (Char == '=') { 42940b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42950b57cec5SDimitry Andric Kind = tok::greaterequal; 42960b57cec5SDimitry Andric } else if (Char == '>') { 42970b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 42980b57cec5SDimitry Andric if (After == '=') { 42990b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 43000b57cec5SDimitry Andric SizeTmp2, Result); 43010b57cec5SDimitry Andric Kind = tok::greatergreaterequal; 43020b57cec5SDimitry Andric } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 43030b57cec5SDimitry Andric // If this is 
actually a '>>>>' conflict marker, recognize it as such 43040b57cec5SDimitry Andric // and recover nicely. 43050b57cec5SDimitry Andric goto LexNextToken; 43060b57cec5SDimitry Andric } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 43070b57cec5SDimitry Andric // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 43080b57cec5SDimitry Andric goto LexNextToken; 43090b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '>') { 43100b57cec5SDimitry Andric Kind = tok::greatergreatergreater; 43110b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 43120b57cec5SDimitry Andric SizeTmp2, Result); 43130b57cec5SDimitry Andric } else { 43140b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 43150b57cec5SDimitry Andric Kind = tok::greatergreater; 43160b57cec5SDimitry Andric } 43170b57cec5SDimitry Andric } else { 43180b57cec5SDimitry Andric Kind = tok::greater; 43190b57cec5SDimitry Andric } 43200b57cec5SDimitry Andric break; 43210b57cec5SDimitry Andric case '^': 43220b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 43230b57cec5SDimitry Andric if (Char == '=') { 43240b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 43250b57cec5SDimitry Andric Kind = tok::caretequal; 43260b57cec5SDimitry Andric } else if (LangOpts.OpenCL && Char == '^') { 43270b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 43280b57cec5SDimitry Andric Kind = tok::caretcaret; 43290b57cec5SDimitry Andric } else { 43300b57cec5SDimitry Andric Kind = tok::caret; 43310b57cec5SDimitry Andric } 43320b57cec5SDimitry Andric break; 43330b57cec5SDimitry Andric case '|': 43340b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 43350b57cec5SDimitry Andric if (Char == '=') { 43360b57cec5SDimitry Andric Kind = tok::pipeequal; 43370b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 43380b57cec5SDimitry Andric } else if (Char == '|') { 43390b57cec5SDimitry Andric 
// If this is '|||||||' and we're in a conflict marker, ignore it. 43400b57cec5SDimitry Andric if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 43410b57cec5SDimitry Andric goto LexNextToken; 43420b57cec5SDimitry Andric Kind = tok::pipepipe; 43430b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 43440b57cec5SDimitry Andric } else { 43450b57cec5SDimitry Andric Kind = tok::pipe; 43460b57cec5SDimitry Andric } 43470b57cec5SDimitry Andric break; 43480b57cec5SDimitry Andric case ':': 43490b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 43500b57cec5SDimitry Andric if (LangOpts.Digraphs && Char == '>') { 43510b57cec5SDimitry Andric Kind = tok::r_square; // ':>' -> ']' 43520b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 435306c3fb27SDimitry Andric } else if (Char == ':') { 43540b57cec5SDimitry Andric Kind = tok::coloncolon; 43550b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 43560b57cec5SDimitry Andric } else { 43570b57cec5SDimitry Andric Kind = tok::colon; 43580b57cec5SDimitry Andric } 43590b57cec5SDimitry Andric break; 43600b57cec5SDimitry Andric case ';': 43610b57cec5SDimitry Andric Kind = tok::semi; 43620b57cec5SDimitry Andric break; 43630b57cec5SDimitry Andric case '=': 43640b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 43650b57cec5SDimitry Andric if (Char == '=') { 43660b57cec5SDimitry Andric // If this is '====' and we're in a conflict marker, ignore it. 
43670b57cec5SDimitry Andric if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 43680b57cec5SDimitry Andric goto LexNextToken; 43690b57cec5SDimitry Andric 43700b57cec5SDimitry Andric Kind = tok::equalequal; 43710b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 43720b57cec5SDimitry Andric } else { 43730b57cec5SDimitry Andric Kind = tok::equal; 43740b57cec5SDimitry Andric } 43750b57cec5SDimitry Andric break; 43760b57cec5SDimitry Andric case ',': 43770b57cec5SDimitry Andric Kind = tok::comma; 43780b57cec5SDimitry Andric break; 43790b57cec5SDimitry Andric case '#': 43800b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 43810b57cec5SDimitry Andric if (Char == '#') { 43820b57cec5SDimitry Andric Kind = tok::hashhash; 43830b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 43840b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 43850b57cec5SDimitry Andric Kind = tok::hashat; 43860b57cec5SDimitry Andric if (!isLexingRawMode()) 43870b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 43880b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 43890b57cec5SDimitry Andric } else { 43900b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 43910b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 43920b57cec5SDimitry Andric // the preprocessor to handle it. 43930b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 43940b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 43950b57cec5SDimitry Andric goto HandleDirective; 43960b57cec5SDimitry Andric 43970b57cec5SDimitry Andric Kind = tok::hash; 43980b57cec5SDimitry Andric } 43990b57cec5SDimitry Andric break; 44000b57cec5SDimitry Andric 44010b57cec5SDimitry Andric case '@': 44020b57cec5SDimitry Andric // Objective C support. 
44030b57cec5SDimitry Andric if (CurPtr[-1] == '@' && LangOpts.ObjC) 44040b57cec5SDimitry Andric Kind = tok::at; 44050b57cec5SDimitry Andric else 44060b57cec5SDimitry Andric Kind = tok::unknown; 44070b57cec5SDimitry Andric break; 44080b57cec5SDimitry Andric 44090b57cec5SDimitry Andric // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 44100b57cec5SDimitry Andric case '\\': 44110b57cec5SDimitry Andric if (!LangOpts.AsmPreprocessor) { 44120b57cec5SDimitry Andric if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 44130b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 44140b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 44150b57cec5SDimitry Andric return true; // KeepWhitespaceMode 44160b57cec5SDimitry Andric 44170b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 44180b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 44190b57cec5SDimitry Andric goto LexNextToken; 44200b57cec5SDimitry Andric } 44210b57cec5SDimitry Andric 4422349cc55cSDimitry Andric return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 44230b57cec5SDimitry Andric } 44240b57cec5SDimitry Andric } 44250b57cec5SDimitry Andric 44260b57cec5SDimitry Andric Kind = tok::unknown; 44270b57cec5SDimitry Andric break; 44280b57cec5SDimitry Andric 44290b57cec5SDimitry Andric default: { 44300b57cec5SDimitry Andric if (isASCII(Char)) { 44310b57cec5SDimitry Andric Kind = tok::unknown; 44320b57cec5SDimitry Andric break; 44330b57cec5SDimitry Andric } 44340b57cec5SDimitry Andric 44350b57cec5SDimitry Andric llvm::UTF32 CodePoint; 44360b57cec5SDimitry Andric 44370b57cec5SDimitry Andric // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 44380b57cec5SDimitry Andric // an escaped newline. 
44390b57cec5SDimitry Andric --CurPtr; 44400b57cec5SDimitry Andric llvm::ConversionResult Status = 44410b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 44420b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 44430b57cec5SDimitry Andric &CodePoint, 44440b57cec5SDimitry Andric llvm::strictConversion); 44450b57cec5SDimitry Andric if (Status == llvm::conversionOK) { 44460b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 44470b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 44480b57cec5SDimitry Andric return true; // KeepWhitespaceMode 44490b57cec5SDimitry Andric 44500b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 44510b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 44520b57cec5SDimitry Andric goto LexNextToken; 44530b57cec5SDimitry Andric } 4454349cc55cSDimitry Andric return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 44550b57cec5SDimitry Andric } 44560b57cec5SDimitry Andric 44570b57cec5SDimitry Andric if (isLexingRawMode() || ParsingPreprocessorDirective || 44580b57cec5SDimitry Andric PP->isPreprocessedOutput()) { 44590b57cec5SDimitry Andric ++CurPtr; 44600b57cec5SDimitry Andric Kind = tok::unknown; 44610b57cec5SDimitry Andric break; 44620b57cec5SDimitry Andric } 44630b57cec5SDimitry Andric 44640b57cec5SDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 44650b57cec5SDimitry Andric // Instead of letting the parser complain about the unknown token, 44660b57cec5SDimitry Andric // just diagnose the invalid UTF-8, then drop the character. 44670b57cec5SDimitry Andric Diag(CurPtr, diag::err_invalid_utf8); 44680b57cec5SDimitry Andric 44690b57cec5SDimitry Andric BufferPtr = CurPtr+1; 44700b57cec5SDimitry Andric // We're pretending the character didn't exist, so just try again with 44710b57cec5SDimitry Andric // this lexer. 
44720b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 44730b57cec5SDimitry Andric goto LexNextToken; 44740b57cec5SDimitry Andric } 44750b57cec5SDimitry Andric } 44760b57cec5SDimitry Andric 44770b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 44780b57cec5SDimitry Andric MIOpt.ReadToken(); 44790b57cec5SDimitry Andric 44800b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 44810b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 44820b57cec5SDimitry Andric return true; 44830b57cec5SDimitry Andric 44840b57cec5SDimitry Andric HandleDirective: 44850b57cec5SDimitry Andric // We parsed a # character and it's the start of a preprocessing directive. 44860b57cec5SDimitry Andric 44870b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::hash); 44880b57cec5SDimitry Andric PP->HandleDirective(Result); 44890b57cec5SDimitry Andric 449006c3fb27SDimitry Andric if (PP->hadModuleLoaderFatalFailure()) 44910b57cec5SDimitry Andric // With a fatal failure in the module loader, we abort parsing. 44920b57cec5SDimitry Andric return true; 44930b57cec5SDimitry Andric 44940b57cec5SDimitry Andric // We parsed the directive; lex a token with the new state. 
44950b57cec5SDimitry Andric return false; 4496bdd1243dSDimitry Andric 4497bdd1243dSDimitry Andric LexNextToken: 4498bdd1243dSDimitry Andric Result.clearFlag(Token::NeedsCleaning); 4499bdd1243dSDimitry Andric goto LexStart; 45000b57cec5SDimitry Andric } 450181ad6265SDimitry Andric 450281ad6265SDimitry Andric const char *Lexer::convertDependencyDirectiveToken( 450381ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok, Token &Result) { 450481ad6265SDimitry Andric const char *TokPtr = BufferStart + DDTok.Offset; 450581ad6265SDimitry Andric Result.startToken(); 450681ad6265SDimitry Andric Result.setLocation(getSourceLocation(TokPtr)); 450781ad6265SDimitry Andric Result.setKind(DDTok.Kind); 450881ad6265SDimitry Andric Result.setFlag((Token::TokenFlags)DDTok.Flags); 450981ad6265SDimitry Andric Result.setLength(DDTok.Length); 451081ad6265SDimitry Andric BufferPtr = TokPtr + DDTok.Length; 451181ad6265SDimitry Andric return TokPtr; 451281ad6265SDimitry Andric } 451381ad6265SDimitry Andric 451481ad6265SDimitry Andric bool Lexer::LexDependencyDirectiveToken(Token &Result) { 451581ad6265SDimitry Andric assert(isDependencyDirectivesLexer()); 451681ad6265SDimitry Andric 451781ad6265SDimitry Andric using namespace dependency_directives_scan; 451881ad6265SDimitry Andric 451981ad6265SDimitry Andric while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) { 452081ad6265SDimitry Andric if (DepDirectives.front().Kind == pp_eof) 452181ad6265SDimitry Andric return LexEndOfFile(Result, BufferEnd); 4522bdd1243dSDimitry Andric if (DepDirectives.front().Kind == tokens_present_before_eof) 4523bdd1243dSDimitry Andric MIOpt.ReadToken(); 452481ad6265SDimitry Andric NextDepDirectiveTokenIndex = 0; 452581ad6265SDimitry Andric DepDirectives = DepDirectives.drop_front(); 452681ad6265SDimitry Andric } 452781ad6265SDimitry Andric 452881ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok = 452981ad6265SDimitry Andric 
DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++]; 453081ad6265SDimitry Andric if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) { 453181ad6265SDimitry Andric // Read something other than a preprocessor directive hash. 453281ad6265SDimitry Andric MIOpt.ReadToken(); 453381ad6265SDimitry Andric } 453481ad6265SDimitry Andric 4535bdd1243dSDimitry Andric if (ParsingFilename && DDTok.is(tok::less)) { 4536bdd1243dSDimitry Andric BufferPtr = BufferStart + DDTok.Offset; 4537bdd1243dSDimitry Andric LexAngledStringLiteral(Result, BufferPtr + 1); 4538bdd1243dSDimitry Andric if (Result.isNot(tok::header_name)) 4539bdd1243dSDimitry Andric return true; 4540bdd1243dSDimitry Andric // Advance the index of lexed tokens. 4541bdd1243dSDimitry Andric while (true) { 4542bdd1243dSDimitry Andric const dependency_directives_scan::Token &NextTok = 4543bdd1243dSDimitry Andric DepDirectives.front().Tokens[NextDepDirectiveTokenIndex]; 4544bdd1243dSDimitry Andric if (BufferStart + NextTok.Offset >= BufferPtr) 4545bdd1243dSDimitry Andric break; 4546bdd1243dSDimitry Andric ++NextDepDirectiveTokenIndex; 4547bdd1243dSDimitry Andric } 4548bdd1243dSDimitry Andric return true; 4549bdd1243dSDimitry Andric } 4550bdd1243dSDimitry Andric 455181ad6265SDimitry Andric const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result); 455281ad6265SDimitry Andric 455381ad6265SDimitry Andric if (Result.is(tok::hash) && Result.isAtStartOfLine()) { 455481ad6265SDimitry Andric PP->HandleDirective(Result); 455581ad6265SDimitry Andric return false; 455681ad6265SDimitry Andric } 455781ad6265SDimitry Andric if (Result.is(tok::raw_identifier)) { 455881ad6265SDimitry Andric Result.setRawIdentifierData(TokPtr); 455981ad6265SDimitry Andric if (!isLexingRawMode()) { 45605f757f3fSDimitry Andric const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 456181ad6265SDimitry Andric if (II->isHandleIdentifierCase()) 456281ad6265SDimitry Andric return PP->HandleIdentifier(Result); 456381ad6265SDimitry 
Andric } 456481ad6265SDimitry Andric return true; 456581ad6265SDimitry Andric } 456681ad6265SDimitry Andric if (Result.isLiteral()) { 456781ad6265SDimitry Andric Result.setLiteralData(TokPtr); 456881ad6265SDimitry Andric return true; 456981ad6265SDimitry Andric } 457006c3fb27SDimitry Andric if (Result.is(tok::colon)) { 457181ad6265SDimitry Andric // Convert consecutive colons to 'tok::coloncolon'. 457281ad6265SDimitry Andric if (*BufferPtr == ':') { 457381ad6265SDimitry Andric assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 457481ad6265SDimitry Andric tok::colon)); 457581ad6265SDimitry Andric ++NextDepDirectiveTokenIndex; 457681ad6265SDimitry Andric Result.setKind(tok::coloncolon); 457781ad6265SDimitry Andric } 457881ad6265SDimitry Andric return true; 457981ad6265SDimitry Andric } 458081ad6265SDimitry Andric if (Result.is(tok::eod)) 458181ad6265SDimitry Andric ParsingPreprocessorDirective = false; 458281ad6265SDimitry Andric 458381ad6265SDimitry Andric return true; 458481ad6265SDimitry Andric } 458581ad6265SDimitry Andric 458681ad6265SDimitry Andric bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) { 458781ad6265SDimitry Andric assert(isDependencyDirectivesLexer()); 458881ad6265SDimitry Andric 458981ad6265SDimitry Andric using namespace dependency_directives_scan; 459081ad6265SDimitry Andric 459181ad6265SDimitry Andric bool Stop = false; 459281ad6265SDimitry Andric unsigned NestedIfs = 0; 459381ad6265SDimitry Andric do { 459481ad6265SDimitry Andric DepDirectives = DepDirectives.drop_front(); 459581ad6265SDimitry Andric switch (DepDirectives.front().Kind) { 459681ad6265SDimitry Andric case pp_none: 459781ad6265SDimitry Andric llvm_unreachable("unexpected 'pp_none'"); 459881ad6265SDimitry Andric case pp_include: 459981ad6265SDimitry Andric case pp___include_macros: 460081ad6265SDimitry Andric case pp_define: 460181ad6265SDimitry Andric case pp_undef: 460281ad6265SDimitry Andric case pp_import: 460381ad6265SDimitry Andric case 
pp_pragma_import: 460481ad6265SDimitry Andric case pp_pragma_once: 460581ad6265SDimitry Andric case pp_pragma_push_macro: 460681ad6265SDimitry Andric case pp_pragma_pop_macro: 460781ad6265SDimitry Andric case pp_pragma_include_alias: 460806c3fb27SDimitry Andric case pp_pragma_system_header: 460981ad6265SDimitry Andric case pp_include_next: 461081ad6265SDimitry Andric case decl_at_import: 461181ad6265SDimitry Andric case cxx_module_decl: 461281ad6265SDimitry Andric case cxx_import_decl: 461381ad6265SDimitry Andric case cxx_export_module_decl: 461481ad6265SDimitry Andric case cxx_export_import_decl: 4615bdd1243dSDimitry Andric case tokens_present_before_eof: 461681ad6265SDimitry Andric break; 461781ad6265SDimitry Andric case pp_if: 461881ad6265SDimitry Andric case pp_ifdef: 461981ad6265SDimitry Andric case pp_ifndef: 462081ad6265SDimitry Andric ++NestedIfs; 462181ad6265SDimitry Andric break; 462281ad6265SDimitry Andric case pp_elif: 462381ad6265SDimitry Andric case pp_elifdef: 462481ad6265SDimitry Andric case pp_elifndef: 462581ad6265SDimitry Andric case pp_else: 462681ad6265SDimitry Andric if (!NestedIfs) { 462781ad6265SDimitry Andric Stop = true; 462881ad6265SDimitry Andric } 462981ad6265SDimitry Andric break; 463081ad6265SDimitry Andric case pp_endif: 463181ad6265SDimitry Andric if (!NestedIfs) { 463281ad6265SDimitry Andric Stop = true; 463381ad6265SDimitry Andric } else { 463481ad6265SDimitry Andric --NestedIfs; 463581ad6265SDimitry Andric } 463681ad6265SDimitry Andric break; 463781ad6265SDimitry Andric case pp_eof: 463881ad6265SDimitry Andric NextDepDirectiveTokenIndex = 0; 463981ad6265SDimitry Andric return LexEndOfFile(Result, BufferEnd); 464081ad6265SDimitry Andric } 464181ad6265SDimitry Andric } while (!Stop); 464281ad6265SDimitry Andric 464381ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok = 464481ad6265SDimitry Andric DepDirectives.front().Tokens.front(); 464581ad6265SDimitry Andric assert(DDTok.is(tok::hash)); 464681ad6265SDimitry 
Andric NextDepDirectiveTokenIndex = 1; 464781ad6265SDimitry Andric 464881ad6265SDimitry Andric convertDependencyDirectiveToken(DDTok, Result); 464981ad6265SDimitry Andric return false; 465081ad6265SDimitry Andric } 4651