//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Lexer.h"
#include "UnicodeCharSets.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/LiteralSupport.h"
#include "clang/Lex/MultipleIncludeOpt.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBufferRef.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/Unicode.h"
#include "llvm/Support/UnicodeCharRanges.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <optional>
#include <string>
#include <tuple>
#include <utility>

#ifdef __SSE4_2__
#include <nmmintrin.h>
#endif

using namespace clang;

//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//

/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  if (isAnnotation())
    return false;
  if (const IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}

/// getObjCKeywordID - Return the ObjC keyword kind.
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  if (isAnnotation())
    return tok::objc_not_keyword;
  const IdentifierInfo *specId = getIdentifierInfo();
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}

//===----------------------------------------------------------------------===//
// Lexer Class Implementation
//===----------------------------------------------------------------------===//

void Lexer::anchor() {}

void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
                           .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
                           .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;
  IsAtPhysicalStartOfLine = true;

  HasLeadingSpace = false;
  HasLeadingEmptyMacro = false;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode. Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;

  NewLinePtr = nullptr;
}

/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process. This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
             Preprocessor &PP, bool IsFirstIncludeOfFile)
    : PreprocessorLexer(&PP, FID),
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
      LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
            InputFile.getBufferEnd());

  resetExtendedTokenMode();
}

/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
             const SourceManager &SM, const LangOptions &langOpts,
             bool IsFirstIncludeOfFile)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
            FromFile.getBufferStart(), FromFile.getBufferEnd(),
            IsFirstIncludeOfFile) {}

void Lexer::resetExtendedTokenMode() {
  assert(PP && "Cannot reset token mode without a preprocessor");
  if (LangOpts.TraditionalCPP)
    SetKeepWhitespaceMode(true);
  else
    SetCommentRetentionState(PP->getCommentRetentionState());
}

/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion. This has a variety of magic semantics that this method
/// sets up. It returns a new'd Lexer that must be delete'd when done.
///
/// On entrance to this routine, TokStartLoc is a macro location which has a
/// spelling loc that indicates the bytes to be lexed for the token and an
/// expansion location that indicates where all lexed tokens should be
/// "expanded from".
///
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
/// normal lexer that remaps tokens as they fly by. This would require making
/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
/// interface that could handle this stuff. This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
                                 SourceLocation ExpansionLocStart,
                                 SourceLocation ExpansionLocEnd,
                                 unsigned TokLen, Preprocessor &PP) {
  SourceManager &SM = PP.getSourceManager();

  // Create the lexer as if we were going to lex the file normally.
  FileID SpellingFID = SM.getFileID(SpellingLoc);
  llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);

  // Now that the lexer is created, change the start/end locations so that we
  // just lex the subsection of the file that we want. This is lexing from a
  // scratch buffer.
  const char *StrData = SM.getCharacterData(SpellingLoc);

  L->BufferPtr = StrData;
  L->BufferEnd = StrData + TokLen;
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

  // Set the SourceLocation with the remapping information. This ensures that
  // GetMappedTokenLoc will remap the tokens as they are lexed.
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
                                     ExpansionLocStart,
                                     ExpansionLocEnd, TokLen);

  // Ensure that the lexer thinks it is inside a directive, so that end \n will
  // return an EOD token.
  L->ParsingPreprocessorDirective = true;

  // This lexer really is for _Pragma.
  L->Is_PragmaLexer = true;
  return L;
}

void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
  this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
  this->IsAtStartOfLine = IsAtStartOfLine;
  assert((BufferStart + Offset) <= BufferEnd);
  BufferPtr = BufferStart + Offset;
}

template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' with '\\' followed by 'n'.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace '\n' and '\r' with '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}

std::string Lexer::Stringify(StringRef Str, bool Charify) {
  std::string Result = std::string(Str);
  char Quote = Charify ? '\'' : '"';
  StringifyImpl(Result, Quote);
  return Result;
}

void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }

//===----------------------------------------------------------------------===//
// Token Spelling
//===----------------------------------------------------------------------===//

/// Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer.
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (tok::isStringLiteral(Tok.getKind())) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
      Spelling[Length++] = CharAndSize.Char;
      BufPtr += CharAndSize.Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching
      // closing quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  while (BufPtr < BufEnd) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
    Spelling[Length++] = CharAndSize.Char;
    BufPtr += CharAndSize.Size;
  }

  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}

/// getSpelling() - Return the 'spelling' of this token. The spelling of a
/// token is the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding. In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs,
/// UCNs, etc.
StringRef Lexer::getSpelling(SourceLocation loc,
                             SmallVectorImpl<char> &buffer,
                             const SourceManager &SM,
                             const LangOptions &options,
                             bool *invalid) {
  // Break down the source location.
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (invalid) *invalid = true;
    return {};
  }

  const char *tokenBegin = file.data() + locInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
              file.begin(), tokenBegin, file.end());
  Token token;
  lexer.LexFromRawLexer(token);

  unsigned length = token.getLength();

  // Common case: no need for cleaning.
  if (!token.needsCleaning())
    return StringRef(tokenBegin, length);

  // Hard case, we need to relex the characters into the string.
  buffer.resize(length);
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
  return StringRef(buffer.data(), buffer.size());
}

/// getSpelling() - Return the 'spelling' of this token. The spelling of a
/// token is the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding. In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs,
/// UCNs, etc.
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                               const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  bool CharDataInvalid = false;
  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
                                                    &CharDataInvalid);
  if (Invalid)
    *Invalid = CharDataInvalid;
  if (CharDataInvalid)
    return {};

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning())
    return std::string(TokStart, TokStart + Tok.getLength());

  std::string Result;
  Result.resize(Tok.getLength());
  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
  return Result;
}

/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string. The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long. The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy). The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = nullptr;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifier().data();
  else if (!Tok.hasUCN()) {
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
      // Just return the string from the identifier table, which is very quick.
      Buffer = II->getNameStart();
      return II->getLength();
    }
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (!TokStart) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning()) {
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
}

/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
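///
/// A minimal usage sketch (the SM and LangOpts variables are assumed from
/// context, shown for illustration only): given a location at the start of a
/// token, relex that one token to find where it ends.
/// \code
///   unsigned Len = Lexer::MeasureTokenLength(TokBegin, SM, LangOpts);
///   SourceLocation TokEnd = TokBegin.getLocWithOffset(Len);
/// \endcode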
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
                                   const SourceManager &SM,
                                   const LangOptions &LangOpts) {
  Token TheTok;
  if (getRawToken(Loc, TheTok, SM, LangOpts))
    return 0;
  return TheTok.getLength();
}

/// Relex the token at the specified location.
/// \returns true if there was a failure, false on success.
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
                        const SourceManager &SM,
                        const LangOptions &LangOpts,
                        bool IgnoreWhiteSpace) {
  // TODO: this could be special cased for common tokens like identifiers, ')',
  // etc to make this faster, if it mattered. Just look at StrData[0] to handle
  // all obviously single-char tokens. This could use
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  // something.

  // If this comes from a macro expansion, we really do want the macro name, not
  // the token this macro expanded to.
  Loc = SM.getExpansionLoc(Loc);
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return true;

  const char *StrData = Buffer.data() + LocInfo.second;

  if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
    return true;

  // Create a lexer starting at the beginning of this token.
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
                 Buffer.begin(), StrData, Buffer.end());
  TheLexer.SetCommentRetentionState(true);
  TheLexer.LexFromRawLexer(Result);
  return false;
}

/// Returns the pointer that points to the beginning of line that contains
/// the given offset, or null if the offset is invalid.
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
  const char *BufStart = Buffer.data();
  if (Offset >= Buffer.size())
    return nullptr;

  const char *LexStart = BufStart + Offset;
  for (; LexStart != BufStart; --LexStart) {
    if (isVerticalWhitespace(LexStart[0]) &&
        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
      // LexStart should point at first character of logical line.
      ++LexStart;
      break;
    }
  }
  return LexStart;
}

static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer). We'll relex from that point.
  const char *StrData = Buffer.data() + LocInfo.second;
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
  if (!LexStart || LexStart == StrData)
    return Loc;

  // Create a lexer starting at the beginning of this token.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
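  // Each raw token either ends before the target location or spans it. If a
  // token spans it, its start is the answer; if we step past the location
  // without covering it, the location points into whitespace between tokens.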
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for. If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source location.
  return Loc;
}

SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  std::pair<FileID, unsigned> BeginFileLocInfo =
      SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}

namespace {

enum PreambleDirectiveKind {
  PDK_Skipped,
  PDK_Unknown
};

} // namespace

PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
                                      const LangOptions &LangOpts,
                                      unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const SourceLocation::UIntTy StartOffset = 1;
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  bool InPreprocessorDirective = false;
  Token TheTok;
  SourceLocation ActiveCommentLoc;

  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
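      // (A MaxLineOffset of zero means no limit applies: either MaxLines was
      // zero, or the buffer has fewer than MaxLines lines.)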
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is. Since we're lexing raw tokens,
      // we don't have an identifier table available. Instead, just look at
      // the raw identifier to recognize and categorize preprocessor directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_Skipped)
              .Case("ifdef", PDK_Skipped)
              .Case("ifndef", PDK_Skipped)
              .Case("elif", PDK_Skipped)
              .Case("elifdef", PDK_Skipped)
              .Case("elifndef", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_Skipped)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point. Roll back the current token to the location of the '#'.
      TheTok = HashTok;
    } else if (TheTok.isAtStartOfLine() &&
               TheTok.getKind() == tok::raw_identifier &&
               TheTok.getRawIdentifier() == "module" &&
               LangOpts.CPlusPlusModules) {
      // The initial global module fragment introducer "module;" is part of
      // the preamble, which runs up to the module declaration "module foo;".
      Token ModuleTok = TheTok;
      do {
        TheLexer.LexFromRawLexer(TheTok);
      } while (TheTok.getKind() == tok::comment);
      if (TheTok.getKind() != tok::semi) {
        // Not global module fragment, roll back.
        TheTok = ModuleTok;
        break;
      }
      continue;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  SourceLocation End;
  if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                        TheTok.isAtStartOfLine());
}

unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is. This needs to take into consideration newlines and
  // trigraphs.
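  // In other words, CharNo indexes the cleaned (post-trigraph, post-splice)
  // character stream, while the return value is an offset in raw source bytes.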
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting. Skip
  // over the uninteresting characters. If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
    TokPtr += CharAndSize.Size;
    PhysOffset += CharAndSize.Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token. For example foo\<newline>bar
  // advanced by 3 should return the location of b, not of \\. One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr) - TokPtr;

  return PhysOffset;
}

/// Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different than it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isInvalid())
    return {};

  if (Loc.isMacroID()) {
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return {}; // Points inside the macro expansion.
  }

  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;

  return Loc.getLocWithOffset(Len);
}

/// Returns true if the given MacroID location points at the first
/// token of the macro expansion.
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroBegin) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation expansionLoc;
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions, this is the first.
    if (MacroBegin)
      *MacroBegin = expansionLoc;
    return true;
  }

  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
}

/// Returns true if the given MacroID location points at the last
/// token of the macro expansion.
bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
                                    const SourceManager &SM,
                                    const LangOptions &LangOpts,
                                    SourceLocation *MacroEnd) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation spellLoc = SM.getSpellingLoc(loc);
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
  if (tokLen == 0)
    return false;

  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
  SourceLocation expansionLoc;
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions.
    if (MacroEnd)
      *MacroEnd = expansionLoc;
    return true;
  }

  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
}

static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
                                             const SourceManager &SM,
                                             const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  assert(Begin.isFileID() && End.isFileID());
  if (Range.isTokenRange()) {
    End = Lexer::getLocForEndOfToken(End, 0, SM, LangOpts);
    if (End.isInvalid())
      return {};
  }

  // Break down the source locations.
  FileID FID;
  unsigned BeginOffs;
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return {};

  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return {};

  return CharSourceRange::getCharRange(Begin, End);
}

// Assumes that `Loc` is in an expansion.
static bool isInExpansionTokenRange(const SourceLocation Loc,
                                    const SourceManager &SM) {
  return SM.getSLocEntry(SM.getFileID(Loc))
      .getExpansion()
      .isExpansionTokenRange();
}

CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return {};

  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return {};
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  if (Begin.isFileID() && End.isMacroID()) {
    if (Range.isTokenRange()) {
      if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
        return {};
      // Use the *original* end, not the expanded one in `End`.
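      // (isAtEndOfMacroExpansion has overwritten `End` with a file location;
      // Range.getEnd() still holds the original macro location.)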
      Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
    } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
      return {};
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    // Use the *original* `End`, not the expanded one in `MacroEnd`.
    if (Range.isTokenRange())
      Range.setTokenRange(isInExpansionTokenRange(End, SM));
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  bool Invalid = false;
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                        &Invalid);
  if (Invalid)
    return {};

  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                        &Invalid);
    if (Invalid)
      return {};

    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return {};
}

StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return {};
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}

StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (true) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

StringRef Lexer::getImmediateMacroNameForDiagnostics(
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
  // Walk past macro argument expansions.
  while (SM.isMacroArgExpansion(Loc))
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();

  // If the macro's spelling isn't FileID or from scratch space, then it's
  // actually a token paste or stringization (or similar) and not a macro at
  // all.
  SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
  if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
    return {};

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
  return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
}

bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
  assert(isVerticalWhitespace(Str[0]));
  if (Str - 1 < BufferStart)
    return false;

  if ((Str[0] == '\n' && Str[-1] == '\r') ||
      (Str[0] == '\r' && Str[-1] == '\n')) {
    if (Str - 2 < BufferStart)
      return false;
    --Str;
  }
  --Str;

  // Rewind to first non-space character:
  while (Str > BufferStart && isHorizontalWhitespace(*Str))
    --Str;

  return *Str == '\\';
}

StringRef Lexer::getIndentationForLine(SourceLocation Loc,
                                       const SourceManager &SM) {
  if (Loc.isInvalid() || Loc.isMacroID())
    return {};
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return {};
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return {};
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
  if (!Line)
    return {};
  StringRef Rest = Buffer.substr(Line - Buffer.data());
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
  return NumWhitespaceChars == StringRef::npos
             ? ""
             : Rest.take_front(NumWhitespaceChars);
}

//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
//===----------------------------------------------------------------------===//

/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
/// lexer buffer was all expanded at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method. Do not allow it to be inlined.
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo, unsigned TokLen) {
  assert(FileLoc.isMacroID() && "Must be a macro expansion");

  // Otherwise, we're lexing "mapped tokens". This is used for things like
  // _Pragma handling. Combine the expansion location of FileLoc with the
  // spelling location.
  SourceManager &SM = PP.getSourceManager();

  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);

  // Figure out the expansion loc range, which is the range covered by the
  // original _Pragma(...) sequence.
  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);

  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
}

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
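///
/// A sketch of the common FileID case (illustrative only): a pointer into
/// the buffer maps to FileLoc advanced by its byte offset from BufferStart.
/// \code
///   // Loc points into [BufferStart, BufferEnd]:
///   SourceLocation SLoc = getSourceLocation(Loc, TokLen);
/// \endcode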
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc - BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics. This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a warning about the trigraph use; this is done
/// whether trigraphs are enabled or not. If trigraphs are enabled, also
/// return the decoded character.
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
  char Res = GetTrigraphCharForLetter(*CP);
  if (!Res)
    return Res;

  if (!Trigraphs) {
    if (L && !L->isLexingRawMode())
      L->Diag(CP - 2, diag::trigraph_ignored);
    return 0;
  }

  if (L && !L->isLexingRawMode())
    L->Diag(CP - 2, diag::trigraph_converted) << StringRef(&Res, 1);
  return Res;
}

/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    if (Ptr[Size - 1] != '\n' && Ptr[Size - 1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size - 1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
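///
/// For example (an illustrative sketch): if P points at the backslash in
/// "\<newline>\<newline>x", the returned pointer points at 'x'; if P does
/// not point at an escaped newline, P is returned unchanged.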
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (true) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P + 1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      // FIXME: Take LangOpts into account; the language might not
      // support trigraphs.
      AfterEscape = P + 3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape + NewLineSize;
  }
}

std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isMacroID()) {
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return std::nullopt;
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return std::nullopt;

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  return Tok;
}

/// Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
  std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
  if (!Tok || Tok->isNot(TKind))
    return {};
  SourceLocation TokenLoc = Tok->getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }

    // Skip \r, \n, \r\n, or \n\r
    if (C == '\n' || C == '\r') {
      char PrevC = C;
      C = *(++TokenEnd);
      NumWhitespaceChars++;
      if ((C == '\n' || C == '\r') && C != PrevC)
        NumWhitespaceChars++;
    }
  }

  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
}

/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it. This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
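///
/// For example (illustrative only): given the bytes "\<newline>b", the
/// result is the character 'b' with a size of 3, since the backslash and
/// the newline are folded away.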
///
/// This handles the slow/uncommon case of the getCharAndSize method. Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning. If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                    LangOpts.Trigraphs)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
                                                 const LangOptions &LangOpts) {

  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// Routine that indiscriminately sets the offset into the source file.
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
  BufferPtr = BufferStart + Offset;
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  // FIXME: What exactly does the StartOfLine bit mean? There are two
  // possible meanings for the "start" of the line: the first token on the
  // unexpanded line, or the first token on the expanded line.
  IsAtStartOfLine = StartOfLine;
  IsAtPhysicalStartOfLine = StartOfLine;
}

static bool isUnicodeWhitespace(uint32_t Codepoint) {
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
      UnicodeWhitespaceCharRanges);
  return UnicodeWhitespaceChars.contains(Codepoint);
}

static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
  llvm::SmallString<5> CharBuf;
  llvm::raw_svector_ostream CharOS(CharBuf);
  llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
  return CharBuf;
}

// To mitigate https://github.com/llvm/llvm-project/issues/54732,
// we allow "Mathematical Notation Characters" in identifiers.
// This is a proposed profile that extends XID_Start/XID_Continue
// with mathematical symbols, superscript and subscript digits
// found in some production software.
// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
                                      bool IsStart, bool &IsExtension) {
  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);
  if (MathStartChars.contains(C) ||
      (!IsStart && MathContinueChars.contains(C))) {
    IsExtension = true;
    return true;
  }
  return false;
}

static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
                            bool &IsExtension) {
  if (LangOpts.AsmPreprocessor) {
    return false;
  } else if (LangOpts.DollarIdents && '$' == C) {
    return true;
  } else if (LangOpts.CPlusPlus || LangOpts.C23) {
    // A non-leading codepoint must have the XID_Continue property.
    // XIDContinueRanges doesn't contain characters also in XIDStartRanges,
    // so we need to check both tables.
    // '_' doesn't have the XID_Continue property but is allowed in C and C++.
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
    if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
      return true;
    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
                                     IsExtension);
  } else if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
        C11AllowedIDCharRanges);
    return C11AllowedIDChars.contains(C);
  } else {
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    return C99AllowedIDChars.contains(C);
  }
}

static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
                                     bool &IsExtension) {
  assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
  IsExtension = false;
  if (LangOpts.AsmPreprocessor) {
    return false;
  }
  if (LangOpts.CPlusPlus || LangOpts.C23) {
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    if (XIDStartChars.contains(C))
      return true;
    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
                                     IsExtension);
  }
  if (!isAllowedIDChar(C, LangOpts, IsExtension))
    return false;
  if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
        C11DisallowedInitialIDCharRanges);
    return !C11DisallowedInitialIDChars.contains(C);
  }
  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
      C99DisallowedInitialIDCharRanges);
  return !C99DisallowedInitialIDChars.contains(C);
}

static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
                                          CharSourceRange Range) {

  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);

  (void)MathStartChars;
  (void)MathContinueChars;
  assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
         "Unexpected mathematical notation codepoint");
  Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
      << codepointAsHexString(C) << Range;
}

static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
                                            const char *End) {
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
                                       L.getSourceLocation(End));
}

static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
                                      CharSourceRange Range, bool IsFirst) {
  // Check C99 compatibility.
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
    enum {
      CannotAppearInIdentifier = 0,
      CannotStartIdentifier
    };

    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    if (!C99AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
          << Range
          << CannotAppearInIdentifier;
    } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
          << Range
          << CannotStartIdentifier;
    }
  }
}

/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}
  };
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << codepointAsHexString(C) << LooksLikeStr;
    } else {
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << codepointAsHexString(C);
    }
  }
}

static void diagnoseInvalidUnicodeCodepointInIdentifier(
    DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
    CharSourceRange Range, bool IsFirst) {
  if (isASCII(CodePoint))
    return;

  bool IsExtension;
  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
  bool IsIDContinue =
      IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);

  if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
    return;

  bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

  if (!IsFirst || InvalidOnlyAtStart) {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
        << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
        << FixItHint::CreateRemoval(Range);
  } else {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
        << Range << codepointAsHexString(CodePoint)
        << FixItHint::CreateRemoval(Range);
  }
}

bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0) {
    return false;
  }
  bool IsExtension = false;
  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CurPtr, UCNPtr),
          /*IsFirst=*/false);

    // We got a unicode codepoint that is neither a space nor a valid
    // identifier part. Carry on as if the codepoint was valid for recovery
    // purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
                                    makeCharRange(*this, CurPtr, UCNPtr));

    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);
  }

  Result.setFlag(Token::HasUCN);
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}

bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
  llvm::UTF32 CodePoint;

  // If a UTF-8 codepoint appears immediately after an escaped new line,
  // CurPtr may point to the splicing \ on the preceding line,
  // so we need to skip it.
  unsigned FirstCodeUnitSize;
  getCharAndSize(CurPtr, FirstCodeUnitSize);
  const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
  const char *UnicodePtr = CharStart;

  llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
      (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
      &CodePoint, llvm::strictConversion);
  if (ConvResult != llvm::conversionOK)
    return false;

  bool IsExtension = false;
  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
                       IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;

    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
    // We got a unicode codepoint that is neither a space nor a valid
    // identifier part. Carry on as if the codepoint was valid for recovery
    // purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(
          PP->getDiagnostics(), CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr));
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CharStart, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CharStart, UnicodePtr));
  }

  // Once we successfully parsed some UTF-8,
  // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
  // being lexed, and that warnings about trailing spaces are emitted.
  ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
  CurPtr = UnicodePtr;
  return true;
}

bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
                                      const char *CurPtr) {
  bool IsExtension = false;
  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput()) {
      if (IsExtension)
        diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
                                      makeCharRange(*this, BufferPtr, CurPtr));
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                makeCharRange(*this, BufferPtr, CurPtr),
                                /*IsFirst=*/true);
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr));
    }

    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);
  }

  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
      !isUnicodeWhitespace(C)) {
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
    diagnoseInvalidUnicodeCodepointInIdentifier(
        PP->getDiagnostics(), LangOpts, C,
        makeCharRange(*this, BufferPtr, CurPtr), /*IsFirst=*/true);
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}

static const char *
fastParseASCIIIdentifier(const char *CurPtr,
                         [[maybe_unused]] const char *BufferEnd) {
#ifdef __SSE4_2__
  // Each pair of bytes below encodes an inclusive character range; with
  // _SIDD_CMP_RANGES and _SIDD_NEGATIVE_POLARITY, _mm_cmpistri returns the
  // index of the first byte that falls outside all of the ranges.
  alignas(16) static constexpr char AsciiIdentifierRange[16] = {
      '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
  };
  constexpr ssize_t BytesPerRegister = 16;

  __m128i AsciiIdentifierRangeV =
      _mm_load_si128((const __m128i *)AsciiIdentifierRange);

  while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
    __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));

    int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
                                _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES |
                                    _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY);
    CurPtr += Consumed;
    if (Consumed == BytesPerRegister)
      continue;
    return CurPtr;
  }
#endif

  unsigned char C = *CurPtr;
  while (isAsciiIdentifierContinue(C))
    C = *++CurPtr;
  return CurPtr;
}

bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*; we have already matched an identifier start.

  while (true) {

    CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);

    unsigned Size;
    // Slow path: handle trigraphs, unicode codepoints, and UCNs.
    unsigned char C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents)
        break;
      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      continue;
    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      continue;
    // Neither an expected Unicode codepoint nor a UCN.
    break;
  }

  const char *IdStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
  Result.setRawIdentifierData(IdStart);

  // If we are in raw mode, return this identifier raw. There is no need to
  // look up identifier information or attempt to macro expand it.
  if (LexingRawMode)
    return true;

  // Fill in Result.IdentifierInfo and update the token kind,
  // looking up the identifier in the identifier table.
  const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
  // Note that we have to call PP->LookUpIdentifierInfo() even for code
  // completion: it writes IdentifierInfo into Result, and callers rely on it.

  // If the completion point is at the end of an identifier, we want to treat
  // the identifier as incomplete even if it resolves to a macro or a keyword.
  // This allows e.g. 'class^' to complete to 'classifier'.
  if (isCodeCompletionPoint(CurPtr)) {
    // Return the code-completion token.
    Result.setKind(tok::code_completion);
    // Skip the code-completion char and all immediate identifier characters.
    // This ensures we get consistent behavior when completing at any point in
    // an identifier (i.e. at the start, in the middle, at the end). Note that
    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
    // simpler.
    assert(*CurPtr == 0 && "Completion character must be 0");
    ++CurPtr;
    // Note that the code-completion token is not added as a separate character
    // when the completion point is at the end of the buffer. Therefore, we
    // need to check if the buffer has ended.
    if (CurPtr < BufferEnd) {
      while (isAsciiIdentifierContinue(*CurPtr))
        ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // Finally, now that we know we have an identifier, pass this off to the
  // preprocessor, which may macro expand it or something.
  if (II->isHandleIdentifierCase())
    return PP->HandleIdentifier(Result);

  return true;
}

/// isHexaLiteral - Return true if Start points to a hex constant, in Microsoft
/// mode (where this is supposed to be several different tokens).
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
  auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
  char C1 = CharAndSize1.Char;
  if (C1 != '0')
    return false;

  auto CharAndSize2 =
      Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
  char C2 = CharAndSize2.Char;
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant. From[-1] is the first character lexed. Return the end of the
/// constant.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!LangOpts.CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
2022 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) { 2023 auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts); 2024 if (isAsciiIdentifierContinue(Next)) { 2025 if (!isLexingRawMode()) 2026 Diag(CurPtr, LangOpts.CPlusPlus 2027 ? diag::warn_cxx11_compat_digit_separator 2028 : diag::warn_c23_compat_digit_separator); 2029 CurPtr = ConsumeChar(CurPtr, Size, Result); 2030 CurPtr = ConsumeChar(CurPtr, NextSize, Result); 2031 return LexNumericConstant(Result, CurPtr); 2032 } 2033 } 2034 2035 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 2036 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 2037 return LexNumericConstant(Result, CurPtr); 2038 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 2039 return LexNumericConstant(Result, CurPtr); 2040 2041 // Update the location of token as well as BufferPtr. 2042 const char *TokStart = BufferPtr; 2043 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 2044 Result.setLiteralData(TokStart); 2045 return true; 2046 } 2047 2048 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 2049 /// in C++11, or warn on a ud-suffix in C++98. 2050 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 2051 bool IsStringLiteral) { 2052 assert(LangOpts.CPlusPlus); 2053 2054 // Maximally munch an identifier. 2055 unsigned Size; 2056 char C = getCharAndSize(CurPtr, Size); 2057 bool Consumed = false; 2058 2059 if (!isAsciiIdentifierStart(C)) { 2060 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 2061 Consumed = true; 2062 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 2063 Consumed = true; 2064 else 2065 return CurPtr; 2066 } 2067 2068 if (!LangOpts.CPlusPlus11) { 2069 if (!isLexingRawMode()) 2070 Diag(CurPtr, 2071 C == '_' ? diag::warn_cxx11_compat_user_defined_literal 2072 : diag::warn_cxx11_compat_reserved_user_defined_literal) 2073 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 2074 return CurPtr; 2075 } 2076 2077 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 2078 // that does not start with an underscore is ill-formed. As a conforming 2079 // extension, we treat all such suffixes as if they had whitespace before 2080 // them. We assume a suffix beginning with a UCN or UTF-8 character is more 2081 // likely to be a ud-suffix than a macro, however, and accept that. 2082 if (!Consumed) { 2083 bool IsUDSuffix = false; 2084 if (C == '_') 2085 IsUDSuffix = true; 2086 else if (IsStringLiteral && LangOpts.CPlusPlus14) { 2087 // In C++1y, we need to look ahead a few characters to see if this is a 2088 // valid suffix for a string literal or a numeric literal (this could be 2089 // the 'operator""if' defining a numeric literal operator). 2090 const unsigned MaxStandardSuffixLength = 3; 2091 char Buffer[MaxStandardSuffixLength] = { C }; 2092 unsigned Consumed = Size; 2093 unsigned Chars = 1; 2094 while (true) { 2095 auto [Next, NextSize] = 2096 getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts); 2097 if (!isAsciiIdentifierContinue(Next)) { 2098 // End of suffix. Check whether this is on the allowed list. 2099 const StringRef CompleteSuffix(Buffer, Chars); 2100 IsUDSuffix = 2101 StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix); 2102 break; 2103 } 2104 2105 if (Chars == MaxStandardSuffixLength) 2106 // Too long: can't be a standard suffix. 
2107 break; 2108 2109 Buffer[Chars++] = Next; 2110 Consumed += NextSize; 2111 } 2112 } 2113 2114 if (!IsUDSuffix) { 2115 if (!isLexingRawMode()) 2116 Diag(CurPtr, LangOpts.MSVCCompat 2117 ? diag::ext_ms_reserved_user_defined_literal 2118 : diag::ext_reserved_user_defined_literal) 2119 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 2120 return CurPtr; 2121 } 2122 2123 CurPtr = ConsumeChar(CurPtr, Size, Result); 2124 } 2125 2126 Result.setFlag(Token::HasUDSuffix); 2127 while (true) { 2128 C = getCharAndSize(CurPtr, Size); 2129 if (isAsciiIdentifierContinue(C)) { 2130 CurPtr = ConsumeChar(CurPtr, Size, Result); 2131 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { 2132 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) { 2133 } else 2134 break; 2135 } 2136 2137 return CurPtr; 2138 } 2139 2140 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 2141 /// either " or L" or u8" or u" or U". 2142 bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 2143 tok::TokenKind Kind) { 2144 const char *AfterQuote = CurPtr; 2145 // Does this string contain the \0 character? 2146 const char *NulCharacter = nullptr; 2147 2148 if (!isLexingRawMode() && 2149 (Kind == tok::utf8_string_literal || 2150 Kind == tok::utf16_string_literal || 2151 Kind == tok::utf32_string_literal)) 2152 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal 2153 : diag::warn_c99_compat_unicode_literal); 2154 2155 char C = getAndAdvanceChar(CurPtr, Result); 2156 while (C != '"') { 2157 // Skip escaped characters. Escaped newlines will already be processed by 2158 // getAndAdvanceChar. 2159 if (C == '\\') 2160 C = getAndAdvanceChar(CurPtr, Result); 2161 2162 if (C == '\n' || C == '\r' || // Newline. 2163 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 2164 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 2165 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1; 2166 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 2167 return true; 2168 } 2169 2170 if (C == 0) { 2171 if (isCodeCompletionPoint(CurPtr-1)) { 2172 if (ParsingFilename) 2173 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false); 2174 else 2175 PP->CodeCompleteNaturalLanguage(); 2176 FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 2177 cutOffLexing(); 2178 return true; 2179 } 2180 2181 NulCharacter = CurPtr-1; 2182 } 2183 C = getAndAdvanceChar(CurPtr, Result); 2184 } 2185 2186 // If we are in C++11, lex the optional ud-suffix. 2187 if (LangOpts.CPlusPlus) 2188 CurPtr = LexUDSuffix(Result, CurPtr, true); 2189 2190 // If a nul character existed in the string, warn about it. 2191 if (NulCharacter && !isLexingRawMode()) 2192 Diag(NulCharacter, diag::null_in_char_or_string) << 1; 2193 2194 // Update the location of the token as well as the BufferPtr instance var. 2195 const char *TokStart = BufferPtr; 2196 FormTokenWithChars(Result, CurPtr, Kind); 2197 Result.setLiteralData(TokStart); 2198 return true; 2199 } 2200 2201 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after 2202 /// having lexed R", LR", u8R", uR", or UR". 
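///
/// For illustration, a sketch of the forms handled here (not normative; the
/// delimiter "xyz" below is just an example): everything between the
/// parentheses is taken verbatim, with no trigraph, escape, or line-splice
/// processing, and the d-char-sequence may be at most 16 characters long.
///
///   const char *P = R"(no "escaping" \n needed)";   // empty delimiter
///   const char *Q = R"xyz(a ")" inside)xyz";        // delimited form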
2203 bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 2204 tok::TokenKind Kind) { 2205 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 2206 // Between the initial and final double quote characters of the raw string, 2207 // any transformations performed in phases 1 and 2 (trigraphs, 2208 // universal-character-names, and line splicing) are reverted. 2209 2210 if (!isLexingRawMode()) 2211 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 2212 2213 unsigned PrefixLen = 0; 2214 2215 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) 2216 ++PrefixLen; 2217 2218 // If the last character was not a '(', then we didn't lex a valid delimiter. 2219 if (CurPtr[PrefixLen] != '(') { 2220 if (!isLexingRawMode()) { 2221 const char *PrefixEnd = &CurPtr[PrefixLen]; 2222 if (PrefixLen == 16) { 2223 Diag(PrefixEnd, diag::err_raw_delim_too_long); 2224 } else { 2225 Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 2226 << StringRef(PrefixEnd, 1); 2227 } 2228 } 2229 2230 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 2231 // it's possible the '"' was intended to be part of the raw string, but 2232 // there's not much we can do about that. 2233 while (true) { 2234 char C = *CurPtr++; 2235 2236 if (C == '"') 2237 break; 2238 if (C == 0 && CurPtr-1 == BufferEnd) { 2239 --CurPtr; 2240 break; 2241 } 2242 } 2243 2244 FormTokenWithChars(Result, CurPtr, tok::unknown); 2245 return true; 2246 } 2247 2248 // Save prefix and move CurPtr past it 2249 const char *Prefix = CurPtr; 2250 CurPtr += PrefixLen + 1; // skip over prefix and '(' 2251 2252 while (true) { 2253 char C = *CurPtr++; 2254 2255 if (C == ')') { 2256 // Check for prefix match and closing quote. 2257 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 2258 CurPtr += PrefixLen + 1; // skip over prefix and '"' 2259 break; 2260 } 2261 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 2262 if (!isLexingRawMode()) 2263 Diag(BufferPtr, diag::err_unterminated_raw_string) 2264 << StringRef(Prefix, PrefixLen); 2265 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 2266 return true; 2267 } 2268 } 2269 2270 // If we are in C++11, lex the optional ud-suffix. 2271 if (LangOpts.CPlusPlus) 2272 CurPtr = LexUDSuffix(Result, CurPtr, true); 2273 2274 // Update the location of token as well as BufferPtr. 2275 const char *TokStart = BufferPtr; 2276 FormTokenWithChars(Result, CurPtr, Kind); 2277 Result.setLiteralData(TokStart); 2278 return true; 2279 } 2280 2281 /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 2282 /// after having lexed the '<' character. This is used for #include filenames. 2283 bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 2284 // Does this string contain the \0 character? 2285 const char *NulCharacter = nullptr; 2286 const char *AfterLessPos = CurPtr; 2287 char C = getAndAdvanceChar(CurPtr, Result); 2288 while (C != '>') { 2289 // Skip escaped characters. Escaped newlines will already be processed by 2290 // getAndAdvanceChar. 2291 if (C == '\\') 2292 C = getAndAdvanceChar(CurPtr, Result); 2293 2294 if (isVerticalWhitespace(C) || // Newline. 2295 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file. 2296 // If the filename is unterminated, then it must just be a lone < 2297 // character. Return this as such. 
2298 FormTokenWithChars(Result, AfterLessPos, tok::less); 2299 return true; 2300 } 2301 2302 if (C == 0) { 2303 if (isCodeCompletionPoint(CurPtr - 1)) { 2304 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true); 2305 cutOffLexing(); 2306 FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 2307 return true; 2308 } 2309 NulCharacter = CurPtr-1; 2310 } 2311 C = getAndAdvanceChar(CurPtr, Result); 2312 } 2313 2314 // If a nul character existed in the string, warn about it. 2315 if (NulCharacter && !isLexingRawMode()) 2316 Diag(NulCharacter, diag::null_in_char_or_string) << 1; 2317 2318 // Update the location of token as well as BufferPtr. 2319 const char *TokStart = BufferPtr; 2320 FormTokenWithChars(Result, CurPtr, tok::header_name); 2321 Result.setLiteralData(TokStart); 2322 return true; 2323 } 2324 2325 void Lexer::codeCompleteIncludedFile(const char *PathStart, 2326 const char *CompletionPoint, 2327 bool IsAngled) { 2328 // Completion only applies to the filename, after the last slash. 2329 StringRef PartialPath(PathStart, CompletionPoint - PathStart); 2330 llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/"; 2331 auto Slash = PartialPath.find_last_of(SlashChars); 2332 StringRef Dir = 2333 (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash); 2334 const char *StartOfFilename = 2335 (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1; 2336 // Code completion filter range is the filename only, up to completion point. 2337 PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get( 2338 StringRef(StartOfFilename, CompletionPoint - StartOfFilename))); 2339 // We should replace the characters up to the closing quote or closest slash, 2340 // if any. 2341 while (CompletionPoint < BufferEnd) { 2342 char Next = *(CompletionPoint + 1); 2343 if (Next == 0 || Next == '\r' || Next == '\n') 2344 break; 2345 ++CompletionPoint; 2346 if (Next == (IsAngled ? '>' : '"')) 2347 break; 2348 if (SlashChars.contains(Next)) 2349 break; 2350 } 2351 2352 PP->setCodeCompletionTokenRange( 2353 FileLoc.getLocWithOffset(StartOfFilename - BufferStart), 2354 FileLoc.getLocWithOffset(CompletionPoint - BufferStart)); 2355 PP->CodeCompleteIncludedFile(Dir, IsAngled); 2356 } 2357 2358 /// LexCharConstant - Lex the remainder of a character constant, after having 2359 /// lexed either ' or L' or u8' or u' or U'. 2360 bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, 2361 tok::TokenKind Kind) { 2362 // Does this character contain the \0 character? 2363 const char *NulCharacter = nullptr; 2364 2365 if (!isLexingRawMode()) { 2366 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant) 2367 Diag(BufferPtr, LangOpts.CPlusPlus 2368 ? diag::warn_cxx98_compat_unicode_literal 2369 : diag::warn_c99_compat_unicode_literal); 2370 else if (Kind == tok::utf8_char_constant) 2371 Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); 2372 } 2373 2374 char C = getAndAdvanceChar(CurPtr, Result); 2375 if (C == '\'') { 2376 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 2377 Diag(BufferPtr, diag::ext_empty_character); 2378 FormTokenWithChars(Result, CurPtr, tok::unknown); 2379 return true; 2380 } 2381 2382 while (C != '\'') { 2383 // Skip escaped characters. 2384 if (C == '\\') 2385 C = getAndAdvanceChar(CurPtr, Result); 2386 2387 if (C == '\n' || C == '\r' || // Newline. 2388 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
2389 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 2390 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0; 2391 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 2392 return true; 2393 } 2394 2395 if (C == 0) { 2396 if (isCodeCompletionPoint(CurPtr-1)) { 2397 PP->CodeCompleteNaturalLanguage(); 2398 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 2399 cutOffLexing(); 2400 return true; 2401 } 2402 2403 NulCharacter = CurPtr-1; 2404 } 2405 C = getAndAdvanceChar(CurPtr, Result); 2406 } 2407 2408 // If we are in C++11, lex the optional ud-suffix. 2409 if (LangOpts.CPlusPlus) 2410 CurPtr = LexUDSuffix(Result, CurPtr, false); 2411 2412 // If a nul character existed in the character, warn about it. 2413 if (NulCharacter && !isLexingRawMode()) 2414 Diag(NulCharacter, diag::null_in_char_or_string) << 0; 2415 2416 // Update the location of token as well as BufferPtr. 2417 const char *TokStart = BufferPtr; 2418 FormTokenWithChars(Result, CurPtr, Kind); 2419 Result.setLiteralData(TokStart); 2420 return true; 2421 } 2422 2423 /// SkipWhitespace - Efficiently skip over a series of whitespace characters. 2424 /// Update BufferPtr to point to the next non-whitespace character and return. 2425 /// 2426 /// This method forms a token and returns true if KeepWhitespaceMode is enabled. 2427 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr, 2428 bool &TokAtPhysicalStartOfLine) { 2429 // Whitespace - Skip it, then return the token after the whitespace. 2430 bool SawNewline = isVerticalWhitespace(CurPtr[-1]); 2431 2432 unsigned char Char = *CurPtr; 2433 2434 const char *lastNewLine = nullptr; 2435 auto setLastNewLine = [&](const char *Ptr) { 2436 lastNewLine = Ptr; 2437 if (!NewLinePtr) 2438 NewLinePtr = Ptr; 2439 }; 2440 if (SawNewline) 2441 setLastNewLine(CurPtr - 1); 2442 2443 // Skip consecutive spaces efficiently. 2444 while (true) { 2445 // Skip horizontal whitespace very aggressively. 2446 while (isHorizontalWhitespace(Char)) 2447 Char = *++CurPtr; 2448 2449 // Otherwise if we have something other than whitespace, we're done. 2450 if (!isVerticalWhitespace(Char)) 2451 break; 2452 2453 if (ParsingPreprocessorDirective) { 2454 // End of preprocessor directive line, let LexTokenInternal handle this. 2455 BufferPtr = CurPtr; 2456 return false; 2457 } 2458 2459 // OK, but handle newline. 2460 if (*CurPtr == '\n') 2461 setLastNewLine(CurPtr); 2462 SawNewline = true; 2463 Char = *++CurPtr; 2464 } 2465 2466 // If the client wants us to return whitespace, return it now. 2467 if (isKeepWhitespaceMode()) { 2468 FormTokenWithChars(Result, CurPtr, tok::unknown); 2469 if (SawNewline) { 2470 IsAtStartOfLine = true; 2471 IsAtPhysicalStartOfLine = true; 2472 } 2473 // FIXME: The next token will not have LeadingSpace set. 2474 return true; 2475 } 2476 2477 // If this isn't immediately after a newline, there is leading space. 2478 char PrevChar = CurPtr[-1]; 2479 bool HasLeadingSpace = !isVerticalWhitespace(PrevChar); 2480 2481 Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace); 2482 if (SawNewline) { 2483 Result.setFlag(Token::StartOfLine); 2484 TokAtPhysicalStartOfLine = true; 2485 2486 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) { 2487 if (auto *Handler = PP->getEmptylineHandler()) 2488 Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1), 2489 getSourceLocation(lastNewLine))); 2490 } 2491 } 2492 2493 BufferPtr = CurPtr; 2494 return false; 2495 } 2496 2497 /// We have just read the // characters from input. 
/// Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so we can't emit
                            // diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment. The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them. As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    if (!isASCII(C)) {
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
    // properly decode the character. Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs. If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we read only one character, then no special handling is needed.
2590 // We're done and can skip forward to the newline. 2591 if (C != 0 && CurPtr == OldPtr+1) { 2592 CurPtr = NextLine; 2593 break; 2594 } 2595 2596 // If we read multiple characters, and one of those characters was a \r or 2597 // \n, then we had an escaped newline within the comment. Emit diagnostic 2598 // unless the next line is also a // comment. 2599 if (CurPtr != OldPtr + 1 && C != '/' && 2600 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) { 2601 for (; OldPtr != CurPtr; ++OldPtr) 2602 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 2603 // Okay, we found a // comment that ends in a newline, if the next 2604 // line is also a // comment, but has spaces, don't emit a diagnostic. 2605 if (isWhitespace(C)) { 2606 const char *ForwardPtr = CurPtr; 2607 while (isWhitespace(*ForwardPtr)) // Skip whitespace. 2608 ++ForwardPtr; 2609 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 2610 break; 2611 } 2612 2613 if (!isLexingRawMode()) 2614 Diag(OldPtr-1, diag::ext_multi_line_line_comment); 2615 break; 2616 } 2617 } 2618 2619 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) { 2620 --CurPtr; 2621 break; 2622 } 2623 2624 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 2625 PP->CodeCompleteNaturalLanguage(); 2626 cutOffLexing(); 2627 return false; 2628 } 2629 } 2630 2631 // Found but did not consume the newline. Notify comment handlers about the 2632 // comment unless we're in a #if 0 block. 2633 if (PP && !isLexingRawMode() && 2634 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 2635 getSourceLocation(CurPtr)))) { 2636 BufferPtr = CurPtr; 2637 return true; // A token has to be returned. 2638 } 2639 2640 // If we are returning comments as tokens, return this comment as a token. 2641 if (inKeepCommentMode()) 2642 return SaveLineComment(Result, CurPtr); 2643 2644 // If we are inside a preprocessor directive and we see the end of line, 2645 // return immediately, so that the lexer can return this as an EOD token. 2646 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 2647 BufferPtr = CurPtr; 2648 return false; 2649 } 2650 2651 // Otherwise, eat the \n character. We don't care if this is a \n\r or 2652 // \r\n sequence. This is an efficiency hack (because we know the \n can't 2653 // contribute to another token), it isn't needed for correctness. Note that 2654 // this is ok even in KeepWhitespaceMode, because we would have returned the 2655 // comment above in that mode. 2656 NewLinePtr = CurPtr++; 2657 2658 // The next returned token is at the start of the line. 2659 Result.setFlag(Token::StartOfLine); 2660 TokAtPhysicalStartOfLine = true; 2661 // No leading whitespace seen so far. 2662 Result.clearFlag(Token::LeadingSpace); 2663 BufferPtr = CurPtr; 2664 return false; 2665 } 2666 2667 /// If in save-comment mode, package up this Line comment in an appropriate 2668 /// way and return it. 2669 bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) { 2670 // If we're not in a preprocessor directive, just return the // comment 2671 // directly. 2672 FormTokenWithChars(Result, CurPtr, tok::comment); 2673 2674 if (!ParsingPreprocessorDirective || LexingRawMode) 2675 return true; 2676 2677 // If this Line-style comment is in a macro definition, transmogrify it into 2678 // a C-style block comment. 2679 bool Invalid = false; 2680 std::string Spelling = PP->getSpelling(Result, &Invalid); 2681 if (Invalid) 2682 return true; 2683 2684 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); 2685 Spelling[1] = '*'; // Change prefix to "/*". 
  Spelling += "*/"; // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified
/// newline character (either \\n or \\r) is part of an escaped newline
/// sequence. Issue a diagnostic if so. We know that the newline is inside of a
/// block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it. We allow whitespace
    // between the slash and newline.
    while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
      SpacePos = CurPtr;
      --CurPtr;
    }

    // If we have a slash, this is an escaped newline.
    if (*CurPtr == '\\') {
      --CurPtr;
    } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash.
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If trigraphs are not enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end. The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
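///
/// For illustration (a sketch, not normative), the interesting inputs are:
///
///   /* text */     -> ends at the first "*/"
///   /* a /* b */   -> the inner "/*" only triggers a -Wcomment warning; the
///                     comment still ends at the first "*/"
///   a "*", then an escaped newline (backslash plus newline), then "/"
///                  -> after line splicing this is "*/" and still terminates
///                     the comment (diagnosed by the routine above)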
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character. Once
  // we find it, check to see if it was preceded by a *. This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character is read with newlines and trigraphs skipped, so that
  // the degenerate /*/ case below is handled correctly if the * has an
  // escaped newline after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token. Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /. If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or
    // a (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
        if (!isASCII(C))
          goto MultiByteUTF8;
        C = *CurPtr++;
      }
      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr + 16 < BufferEnd) {
        // A set high bit in any byte means non-ASCII; take the UTF-8 path.
        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
        if (LLVM_UNLIKELY(Mask != 0)) {
          goto MultiByteUTF8;
        }
        // look for slashes
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                                   Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
2854 CurPtr += llvm::countr_zero<unsigned>(cmp) + 1; 2855 goto FoundSlash; 2856 } 2857 CurPtr += 16; 2858 } 2859 #elif __ALTIVEC__ 2860 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2861 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2862 0x80, 0x80, 0x80, 0x80}; 2863 __vector unsigned char Slashes = { 2864 '/', '/', '/', '/', '/', '/', '/', '/', 2865 '/', '/', '/', '/', '/', '/', '/', '/' 2866 }; 2867 while (CurPtr + 16 < BufferEnd) { 2868 if (LLVM_UNLIKELY( 2869 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF))) 2870 goto MultiByteUTF8; 2871 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) { 2872 break; 2873 } 2874 CurPtr += 16; 2875 } 2876 2877 #else 2878 while (CurPtr + 16 < BufferEnd) { 2879 bool HasNonASCII = false; 2880 for (unsigned I = 0; I < 16; ++I) 2881 HasNonASCII |= !isASCII(CurPtr[I]); 2882 2883 if (LLVM_UNLIKELY(HasNonASCII)) 2884 goto MultiByteUTF8; 2885 2886 bool HasSlash = false; 2887 for (unsigned I = 0; I < 16; ++I) 2888 HasSlash |= CurPtr[I] == '/'; 2889 if (HasSlash) 2890 break; 2891 CurPtr += 16; 2892 } 2893 #endif 2894 2895 // It has to be one of the bytes scanned, increment to it and read one. 2896 C = *CurPtr++; 2897 } 2898 2899 // Loop to scan the remainder, warning on invalid UTF-8 2900 // if the corresponding warning is enabled, emitting a diagnostic only once 2901 // per sequence that cannot be decoded. 2902 while (C != '/' && C != '\0') { 2903 if (isASCII(C)) { 2904 UnicodeDecodingAlreadyDiagnosed = false; 2905 C = *CurPtr++; 2906 continue; 2907 } 2908 MultiByteUTF8: 2909 // CurPtr is 1 code unit past C, so to decode 2910 // the codepoint, we need to read from the previous position. 2911 unsigned Length = llvm::getUTF8SequenceSize( 2912 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd); 2913 if (Length == 0) { 2914 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode()) 2915 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment); 2916 UnicodeDecodingAlreadyDiagnosed = true; 2917 } else { 2918 UnicodeDecodingAlreadyDiagnosed = false; 2919 CurPtr += Length - 1; 2920 } 2921 C = *CurPtr++; 2922 } 2923 2924 if (C == '/') { 2925 FoundSlash: 2926 if (CurPtr[-2] == '*') // We found the final */. We're done! 2927 break; 2928 2929 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 2930 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this, 2931 LangOpts.Trigraphs)) { 2932 // We found the final */, though it had an escaped newline between the 2933 // * and /. We're done! 2934 break; 2935 } 2936 } 2937 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 2938 // If this is a /* inside of the comment, emit a warning. Don't do this 2939 // if this is a /*/, which will end the comment. This misses cases with 2940 // embedded escaped newlines, but oh well. 2941 if (!isLexingRawMode()) 2942 Diag(CurPtr-1, diag::warn_nested_block_comment); 2943 } 2944 } else if (C == 0 && CurPtr == BufferEnd+1) { 2945 if (!isLexingRawMode()) 2946 Diag(BufferPtr, diag::err_unterminated_block_comment); 2947 // Note: the user probably forgot a */. We could continue immediately 2948 // after the /*, but this would involve lexing a lot of what really is the 2949 // comment, which surely would confuse the parser. 2950 --CurPtr; 2951 2952 // KeepWhitespaceMode should return this broken comment as a token. Since 2953 // it isn't a well formed comment, just return it as an 'unknown' token. 
2954 if (isKeepWhitespaceMode()) { 2955 FormTokenWithChars(Result, CurPtr, tok::unknown); 2956 return true; 2957 } 2958 2959 BufferPtr = CurPtr; 2960 return false; 2961 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 2962 PP->CodeCompleteNaturalLanguage(); 2963 cutOffLexing(); 2964 return false; 2965 } 2966 2967 C = *CurPtr++; 2968 } 2969 2970 // Notify comment handlers about the comment unless we're in a #if 0 block. 2971 if (PP && !isLexingRawMode() && 2972 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 2973 getSourceLocation(CurPtr)))) { 2974 BufferPtr = CurPtr; 2975 return true; // A token has to be returned. 2976 } 2977 2978 // If we are returning comments as tokens, return this comment as a token. 2979 if (inKeepCommentMode()) { 2980 FormTokenWithChars(Result, CurPtr, tok::comment); 2981 return true; 2982 } 2983 2984 // It is common for the tokens immediately after a /**/ comment to be 2985 // whitespace. Instead of going through the big switch, handle it 2986 // efficiently now. This is safe even in KeepWhitespaceMode because we would 2987 // have already returned above with the comment as a token. 2988 if (isHorizontalWhitespace(*CurPtr)) { 2989 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 2990 return false; 2991 } 2992 2993 // Otherwise, just return so that the next character will be lexed as a token. 2994 BufferPtr = CurPtr; 2995 Result.setFlag(Token::LeadingSpace); 2996 return false; 2997 } 2998 2999 //===----------------------------------------------------------------------===// 3000 // Primary Lexing Entry Points 3001 //===----------------------------------------------------------------------===// 3002 3003 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 3004 /// uninterpreted string. This switches the lexer out of directive mode. 3005 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 3006 assert(ParsingPreprocessorDirective && ParsingFilename == false && 3007 "Must be in a preprocessing directive!"); 3008 Token Tmp; 3009 Tmp.startToken(); 3010 3011 // CurPtr - Cache BufferPtr in an automatic variable. 3012 const char *CurPtr = BufferPtr; 3013 while (true) { 3014 char Char = getAndAdvanceChar(CurPtr, Tmp); 3015 switch (Char) { 3016 default: 3017 if (Result) 3018 Result->push_back(Char); 3019 break; 3020 case 0: // Null. 3021 // Found end of file? 3022 if (CurPtr-1 != BufferEnd) { 3023 if (isCodeCompletionPoint(CurPtr-1)) { 3024 PP->CodeCompleteNaturalLanguage(); 3025 cutOffLexing(); 3026 return; 3027 } 3028 3029 // Nope, normal character, continue. 3030 if (Result) 3031 Result->push_back(Char); 3032 break; 3033 } 3034 // FALL THROUGH. 3035 [[fallthrough]]; 3036 case '\r': 3037 case '\n': 3038 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 3039 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 3040 BufferPtr = CurPtr-1; 3041 3042 // Next, lex the character, which should handle the EOD transition. 3043 Lex(Tmp); 3044 if (Tmp.is(tok::code_completion)) { 3045 if (PP) 3046 PP->CodeCompleteNaturalLanguage(); 3047 Lex(Tmp); 3048 } 3049 assert(Tmp.is(tok::eod) && "Unexpected token!"); 3050 3051 // Finally, we're done; 3052 return; 3053 } 3054 } 3055 } 3056 3057 /// LexEndOfFile - CurPtr points to the end of this file. Handle this 3058 /// condition, reporting diagnostics and handling other edge cases as required. 3059 /// This returns true if Result contains a token, false if PP.Lex should be 3060 /// called again. 
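///
/// As a usage sketch (assumed driver code, not part of this file; SM, FID,
/// Opts, and Buf are hypothetical locals): a raw lexer relies on the
/// early-return path below to surface EOF as a token instead of popping an
/// include stack.
///
///   Lexer RawLex(SM.getLocForStartOfFile(FID), Opts,
///                Buf.getBufferStart(), Buf.getBufferStart(),
///                Buf.getBufferEnd());
///   Token Tok;
///   do {
///     RawLex.LexFromRawLexer(Tok); // EOF comes back as tok::eof here.
///   } while (Tok.isNot(tok::eof));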
3061 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 3062 // If we hit the end of the file while parsing a preprocessor directive, 3063 // end the preprocessor directive first. The next token returned will 3064 // then be the end of file. 3065 if (ParsingPreprocessorDirective) { 3066 // Done parsing the "line". 3067 ParsingPreprocessorDirective = false; 3068 // Update the location of token as well as BufferPtr. 3069 FormTokenWithChars(Result, CurPtr, tok::eod); 3070 3071 // Restore comment saving mode, in case it was disabled for directive. 3072 if (PP) 3073 resetExtendedTokenMode(); 3074 return true; // Have a token. 3075 } 3076 3077 // If we are in raw mode, return this event as an EOF token. Let the caller 3078 // that put us in raw mode handle the event. 3079 if (isLexingRawMode()) { 3080 Result.startToken(); 3081 BufferPtr = BufferEnd; 3082 FormTokenWithChars(Result, BufferEnd, tok::eof); 3083 return true; 3084 } 3085 3086 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 3087 PP->setRecordedPreambleConditionalStack(ConditionalStack); 3088 // If the preamble cuts off the end of a header guard, consider it guarded. 3089 // The guard is valid for the preamble content itself, and for tools the 3090 // most useful answer is "yes, this file has a header guard". 3091 if (!ConditionalStack.empty()) 3092 MIOpt.ExitTopLevelConditional(); 3093 ConditionalStack.clear(); 3094 } 3095 3096 // Issue diagnostics for unterminated #if and missing newline. 3097 3098 // If we are in a #if directive, emit an error. 3099 while (!ConditionalStack.empty()) { 3100 if (PP->getCodeCompletionFileLoc() != FileLoc) 3101 PP->Diag(ConditionalStack.back().IfLoc, 3102 diag::err_pp_unterminated_conditional); 3103 ConditionalStack.pop_back(); 3104 } 3105 3106 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 3107 // a pedwarn. 3108 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 3109 DiagnosticsEngine &Diags = PP->getDiagnostics(); 3110 SourceLocation EndLoc = getSourceLocation(BufferEnd); 3111 unsigned DiagID; 3112 3113 if (LangOpts.CPlusPlus11) { 3114 // C++11 [lex.phases] 2.2 p2 3115 // Prefer the C++98 pedantic compatibility warning over the generic, 3116 // non-extension, user-requested "missing newline at EOF" warning. 3117 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 3118 DiagID = diag::warn_cxx98_compat_no_newline_eof; 3119 } else { 3120 DiagID = diag::warn_no_newline_eof; 3121 } 3122 } else { 3123 DiagID = diag::ext_no_newline_eof; 3124 } 3125 3126 Diag(BufferEnd, DiagID) 3127 << FixItHint::CreateInsertion(EndLoc, "\n"); 3128 } 3129 3130 BufferPtr = CurPtr; 3131 3132 // Finally, let the preprocessor handle this. 3133 return PP->HandleEndOfFile(Result, isPragmaLexer()); 3134 } 3135 3136 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 3137 /// the specified lexer will return a tok::l_paren token, 0 if it is something 3138 /// else and 2 if there are no more tokens in the buffer controlled by the 3139 /// lexer. 3140 unsigned Lexer::isNextPPTokenLParen() { 3141 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 3142 3143 if (isDependencyDirectivesLexer()) { 3144 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) 3145 return 2; 3146 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 3147 tok::l_paren); 3148 } 3149 3150 // Switch to 'skipping' mode. 

  // Switch to 'skipping' mode.  This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
  bool atStartOfLine = IsAtStartOfLine;
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  bool leadingSpace = HasLeadingSpace;

  Token Tok;
  Lex(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;
  ParsingPreprocessorDirective = inPPDirectiveMode;
  HasLeadingSpace = leadingSpace;
  IsAtStartOfLine = atStartOfLine;
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}

/// Find the end of a version control conflict marker.
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
                                   ConflictMarkerKind CMK) {
  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
  size_t Pos = RestOfBuffer.find(Terminator);
  while (Pos != StringRef::npos) {
    // Must occur at start of line.
    if (Pos == 0 ||
        (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
      Pos = RestOfBuffer.find(Terminator);
      continue;
    }
    return RestOfBuffer.data()+Pos;
  }
  return nullptr;
}

/// IsStartOfConflictMarker - If the specified pointer is the start of a version
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
/// and recover nicely.  This returns true if it is a conflict marker and false
/// if not.
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // Check to see if we have <<<<<<< or >>>>.
  if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
      !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (CurrentConflictMarkerState || isLexingRawMode())
    return false;

  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;

  // Check to see if there is an ending marker somewhere in the buffer at the
  // start of a line to terminate this conflict marker.
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
    // We found a match.  We are really in a conflict marker.
    // Diagnose this, and ignore to the end of line.
    Diag(CurPtr, diag::err_conflict_marker);
    CurrentConflictMarkerState = Kind;
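
    // (In a typical git-style conflict region
    //      <<<<<<< HEAD
    //      int x = 1;
    //      =======
    //      int x = 2;
    //      >>>>>>> feature
    //  we diagnose once here at the '<<<<<<<'; when lexing later reaches the
    //  '=======' row, HandleEndOfConflictMarker below skips everything
    //  through the '>>>>>>>' terminator line.)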

    // Skip ahead to the end of line.  We know this exists because the
    // end-of-conflict marker starts with \r or \n.
    while (*CurPtr != '\r' && *CurPtr != '\n') {
      assert(CurPtr != BufferEnd && "Didn't find end of line");
      ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // No end of conflict marker found.
  return false;
}

/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
/// is the end of a conflict marker.  Handle it by ignoring up until the end of
/// the line.  This returns true if it is a conflict marker and false if not.
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (!CurrentConflictMarkerState || isLexingRawMode())
    return false;

  // Check to see if we have the marker (4 characters in a row).
  for (unsigned i = 1; i != 4; ++i)
    if (CurPtr[i] != CurPtr[0])
      return false;

  // If we do have it, search for the end of the conflict marker.  This could
  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
  // be the end of conflict marker.
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                        CurrentConflictMarkerState)) {
    CurPtr = End;

    // Skip ahead to the end of line.
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
      ++CurPtr;

    BufferPtr = CurPtr;

    // No longer in the conflict marker.
    CurrentConflictMarkerState = CMK_None;
    return true;
  }

  return false;
}

static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  BufferEnd -= 1; // Scan until the second last character.
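  // (Editor placeholders have the form "<#...#>"; the caller has already
  //  consumed the "<#", so we only look for the closing "#>".  Stopping one
  //  character early keeps the CurPtr[1] read below in bounds.)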
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}

bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
    return false;
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start, diag::err_placeholder_in_source);
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  PP->LookUpIdentifierInfo(Result);
  Result.setFlag(Token::IsEditorPlaceholder);
  BufferPtr = End;
  return true;
}

bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  if (PP && PP->isCodeCompletionEnabled()) {
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
    return Loc == PP->getCodeCompletionLoc();
  }

  return false;
}

std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
                                                 const char *SlashLoc,
                                                 Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;

  bool Delimited = false;
  bool FoundEndDelimiter = false;
  unsigned Count = 0;
  bool Diagnose = Result && !isLexingRawMode();

  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return std::nullopt;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  while (Count != NumHexDigits || Delimited) {
    char C = getCharAndSize(CurPtr, CharSize);
    if (!Delimited && Count == 0 && C == '{') {
      Delimited = true;
      CurPtr += CharSize;
      continue;
    }

    if (Delimited && C == '}') {
      CurPtr += CharSize;
      FoundEndDelimiter = true;
      break;
    }

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      if (!Delimited)
        break;
      if (Diagnose)
        Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
            << StringRef(KindLoc, 1);
      return std::nullopt;
    }

    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
      return std::nullopt;
    }

    CodePoint <<= 4;
    CodePoint |= Value;
    CurPtr += CharSize;
    Count++;
  }

  if (Count == 0) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (Delimited && Kind == 'U') {
    if (Diagnose)
      Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (!Delimited && Count != NumHexDigits) {
    if (Diagnose) {
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
      // If the user wrote \U1234, suggest a fixit to \u.
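      // (e.g. for '\U0041' we emit a note whose fix-it rewrites it to
      //  '\u0041'.)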
      if (Count == 4 && NumHexDigits == 8) {
        CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
        Diag(KindLoc, diag::note_ucn_four_not_eight)
            << FixItHint::CreateReplacement(URange, "u");
      }
    }
    return std::nullopt;
  }

  if (Delimited && PP) {
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return CodePoint;
}

std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                               const char *SlashLoc,
                                               Token *Result) {
  unsigned CharSize;
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
    return std::nullopt;
  }
  CurPtr += CharSize;
  const char *StartName = CurPtr;
  bool FoundEndDelimiter = false;
  llvm::SmallVector<char, 30> Buffer;
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

    if (isVerticalWhitespace(C))
      break;
    Buffer.push_back(C);
  }

  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  StringRef Name(Buffer.data(), Buffer.size());
  std::optional<char32_t> Match =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
  if (!Match) {
    LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
    if (Diagnose) {
      Diag(StartName, diag::err_invalid_ucn_name)
          << StringRef(Buffer.data(), Buffer.size())
          << makeCharRange(*this, StartName, CurPtr - CharSize);
      if (LooseMatch) {
        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
            << FixItHint::CreateReplacement(
                   makeCharRange(*this, StartName, CurPtr - CharSize),
                   LooseMatch->Name);
      }
    }
    // We do not offer suggestions for misspelled character names here, as the
    // set of valid suggestions depends on context, and we should not make
    // invalid suggestions.
  }

  if (Diagnose && Match)
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
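
  // (A well-formed named escape looks like "\N{GREEK SMALL LETTER ALPHA}";
  //  the loose-matching path above lets us suggest that spelling for near
  //  misses such as "\N{Greek small letter alpha}".)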

  // If no diagnostic has been emitted yet, likely because we are doing
  // tentative lexing, we do not want to recover here to make sure the token
  // will not be incorrectly considered valid.  This function will be called
  // again and a diagnostic emitted then.
  if (LooseMatch && Diagnose)
    Match = LooseMatch->CodePoint;

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
}

uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
                           Token *Result) {
  unsigned CharSize;
  std::optional<uint32_t> CodePointOpt;
  char Kind = getCharAndSize(StartPtr, CharSize);
  if (Kind == 'u' || Kind == 'U')
    CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
  else if (Kind == 'N')
    CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);

  if (!CodePointOpt)
    return 0;

  uint32_t CodePoint = *CodePointOpt;

  // Don't apply C family restrictions to UCNs in assembly mode.
  if (LangOpts.AsmPreprocessor)
    return CodePoint;

  // C23 6.4.3p2: A universal character name shall not designate a code point
  // where the hexadecimal value is:
  // - in the range D800 through DFFF inclusive; or
  // - greater than 10FFFF.
  // A universal-character-name outside the c-char-sequence of a character
  // constant, or the s-char-sequence of a string-literal shall not designate
  // a control character or a character in the basic character set.

  // C++11 [lex.charset]p2: If the hexadecimal value for a
  // universal-character-name corresponds to a surrogate code point (in the
  // range 0xD800-0xDFFF, inclusive), the program is ill-formed.  Additionally,
  // if the hexadecimal value for a universal-character-name outside the
  // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
  // string literal corresponds to a control character (in either of the
  // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
  // basic source character set, the program is ill-formed.
  if (CodePoint < 0xA0) {
    // We don't use isLexingRawMode() here because we need to warn about bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (CodePoint < 0x20 || CodePoint >= 0x7F)
        Diag(BufferPtr, diag::err_ucn_control_character);
      else {
        char C = static_cast<char>(CodePoint);
        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
      }
    }

    return 0;
  } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
    // C++03 allows UCNs representing surrogate characters.  C99 and C++11
    // don't.  We don't use isLexingRawMode() here because we need to diagnose
    // bad UCNs even when skipping preprocessing tokens in a #if block.
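    // (e.g. '\uD800' designates a lone high surrogate: accepted with a
    //  warning in C++03 modes, rejected outright from C99/C++11 onwards.)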
    if (Result && PP) {
      if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
        Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
      else
        Diag(BufferPtr, diag::err_ucn_escape_invalid);
    }
    return 0;
  }

  return CodePoint;
}

bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
                                   const char *CurPtr) {
  if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
      isUnicodeWhitespace(C)) {
    Diag(BufferPtr, diag::ext_unicode_whitespace)
        << makeCharRange(*this, BufferPtr, CurPtr);

    Result.setFlag(Token::LeadingSpace);
    return true;
  }
  return false;
}

void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
  IsAtStartOfLine = Result.isAtStartOfLine();
  HasLeadingSpace = Result.hasLeadingSpace();
  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
  // Note that this doesn't affect IsAtPhysicalStartOfLine.
}

bool Lexer::Lex(Token &Result) {
  assert(!isDependencyDirectivesLexer());

  // Start a new token.
  Result.startToken();

  // Set up misc whitespace flags for LexTokenInternal.
  if (IsAtStartOfLine) {
    Result.setFlag(Token::StartOfLine);
    IsAtStartOfLine = false;
  }

  if (HasLeadingSpace) {
    Result.setFlag(Token::LeadingSpace);
    HasLeadingSpace = false;
  }

  if (HasLeadingEmptyMacro) {
    Result.setFlag(Token::LeadingEmptyMacro);
    HasLeadingEmptyMacro = false;
  }

  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  IsAtPhysicalStartOfLine = false;
  bool isRawLex = isLexingRawMode();
  (void) isRawLex;
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
  // (After the LexTokenInternal call, the lexer might be destroyed.)
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
  return returnedToken;
}

/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token; as such, it is an internal interface.  It assumes
/// that the flags of Result have been cleared before calling this.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexStart:
  assert(!Result.needsCleaning() && "Result needs cleaning");
  assert(!Result.hasPtrData() && "Result has not been reset");

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace are very common between tokens.
  if (isHorizontalWhitespace(*CurPtr)) {
    do {
      ++CurPtr;
    } while (isHorizontalWhitespace(*CurPtr));

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.

  // Read a character, advancing over it.
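  // (getAndAdvanceChar is not a plain dereference: it also folds away
  //  trigraphs and escaped newlines, so 'Char' may correspond to more than
  //  one byte of the buffer.)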
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  if (!isVerticalWhitespace(Char))
    NewLinePtr = nullptr;

  switch (Char) {
  case 0: // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26: // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    [[fallthrough]];
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for the directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
      NewLinePtr = CurPtr - 1;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  // Identifier (e.g., uber), or
  // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
  // UTF-8 or UTF-16 string literal (C11/C++11).
  case 'u':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.CPlusPlus11) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);
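
  // (So in C11/C++11 modes, u"text" above lexes as a single
  //  utf16_string_literal token, u8'x' as a utf8_char_constant in C++17/C23,
  //  and uR"(text)" as a raw UTF-16 string; in earlier modes, 'u' is simply
  //  the start of an identifier.)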
  case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'R': // Identifier or C++11 raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    [[fallthrough]];

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);

  case '$': // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifierContinue(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr,
                            ParsingFilename ? tok::header_name
                                            : tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') { // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') { // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') { // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // C99 6.4.9: Comments.
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') { // Line comment.
      // Even if line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this,
      // though: in one particular corner case, it can change the behavior of
      // the resulting program.  For example, in "foo //**/ bar", C89 would
      // lex this as "foo / bar", while languages with line comments would lex
      // it as "foo".  Check to see if the character after the second slash is
      // a '*'.  If so, we will lex that as a "/" instead of the start of a
      // comment.  However, we never do this if we are just preprocessing.
      bool TreatAsComment =
          LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going
        // through the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') { // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace; // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash; // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) { // %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else { // '%:' -> '#'
        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (LangOpts.CPlusPlus20) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (LangOpts.CPlusPlus && !isLexingRawMode()) {
          Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
              << FixItHint::CreateInsertion(
                     getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
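        // (So "xs<::ys" lexes as "xs", "<", "::", "ys" rather than as
        //  "xs", "[", ":", "ys", which is what the "<:" digraph would give.)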
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else if (LangOpts.OpenCL && Char == '^') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretcaret;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;

  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
    }

    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);
  PP->HandleDirective(Result);

  if (PP->hadModuleLoaderFatalFailure())
    // With a fatal failure in the module loader, we abort parsing.
    return true;

  // We parsed the directive; lex a token with the new state.
  return false;

LexNextToken:
  Result.clearFlag(Token::NeedsCleaning);
  goto LexStart;
}

const char *Lexer::convertDependencyDirectiveToken(
    const dependency_directives_scan::Token &DDTok, Token &Result) {
  const char *TokPtr = BufferStart + DDTok.Offset;
  Result.startToken();
  Result.setLocation(getSourceLocation(TokPtr));
  Result.setKind(DDTok.Kind);
  Result.setFlag((Token::TokenFlags)DDTok.Flags);
  Result.setLength(DDTok.Length);
  BufferPtr = TokPtr + DDTok.Length;
  return TokPtr;
}

bool Lexer::LexDependencyDirectiveToken(Token &Result) {
  assert(isDependencyDirectivesLexer());

  using namespace dependency_directives_scan;

  while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
    if (DepDirectives.front().Kind == pp_eof)
      return LexEndOfFile(Result, BufferEnd);
    if (DepDirectives.front().Kind == tokens_present_before_eof)
      MIOpt.ReadToken();
    NextDepDirectiveTokenIndex = 0;
    DepDirectives = DepDirectives.drop_front();
  }

  const dependency_directives_scan::Token &DDTok =
      DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
  if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
    // Read something other than a preprocessor directive hash.
    MIOpt.ReadToken();
  }

  if (ParsingFilename && DDTok.is(tok::less)) {
    BufferPtr = BufferStart + DDTok.Offset;
    LexAngledStringLiteral(Result, BufferPtr + 1);
    if (Result.isNot(tok::header_name))
      return true;
    // Advance the index of lexed tokens.
    while (true) {
      const dependency_directives_scan::Token &NextTok =
          DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
      if (BufferStart + NextTok.Offset >= BufferPtr)
        break;
      ++NextDepDirectiveTokenIndex;
    }
    return true;
  }

  const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);

  if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
    PP->HandleDirective(Result);
    return false;
  }
  if (Result.is(tok::raw_identifier)) {
    Result.setRawIdentifierData(TokPtr);
    if (!isLexingRawMode()) {
      const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
      if (II->isHandleIdentifierCase())
        return PP->HandleIdentifier(Result);
    }
    return true;
  }
  if (Result.isLiteral()) {
    Result.setLiteralData(TokPtr);
    return true;
  }
  if (Result.is(tok::colon)) {
    // Convert consecutive colons to 'tok::coloncolon'.
    if (*BufferPtr == ':') {
      assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
          tok::colon));
      ++NextDepDirectiveTokenIndex;
      Result.setKind(tok::coloncolon);
    }
    return true;
  }
  if (Result.is(tok::eod))
    ParsingPreprocessorDirective = false;

  return true;
}

bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
  assert(isDependencyDirectivesLexer());

  using namespace dependency_directives_scan;

  bool Stop = false;
  unsigned NestedIfs = 0;
  do {
    DepDirectives = DepDirectives.drop_front();
    switch (DepDirectives.front().Kind) {
    case pp_none:
      llvm_unreachable("unexpected 'pp_none'");
    case pp_include:
    case pp___include_macros:
    case pp_define:
    case pp_undef:
    case pp_import:
    case pp_pragma_import:
    case pp_pragma_once:
    case pp_pragma_push_macro:
    case pp_pragma_pop_macro:
    case pp_pragma_include_alias:
    case pp_pragma_system_header:
    case pp_include_next:
    case decl_at_import:
    case cxx_module_decl:
    case cxx_import_decl:
    case cxx_export_module_decl:
    case cxx_export_import_decl:
    case tokens_present_before_eof:
      break;
    case pp_if:
    case pp_ifdef:
    case pp_ifndef:
      ++NestedIfs;
      break;
    case pp_elif:
    case pp_elifdef:
    case pp_elifndef:
    case pp_else:
      if (!NestedIfs) {
        Stop = true;
      }
      break;
    case pp_endif:
      if (!NestedIfs) {
        Stop = true;
      } else {
        --NestedIfs;
      }
      break;
    case pp_eof:
      NextDepDirectiveTokenIndex = 0;
      return LexEndOfFile(Result, BufferEnd);
    }
  } while (!Stop);

  const dependency_directives_scan::Token &DDTok =
      DepDirectives.front().Tokens.front();
  assert(DDTok.is(tok::hash));
  NextDepDirectiveTokenIndex = 1;

  convertDependencyDirectiveToken(DDTok, Result);
  return false;
}