1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Lexer and Token interfaces. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "clang/Lex/Lexer.h" 14 #include "UnicodeCharSets.h" 15 #include "clang/Basic/CharInfo.h" 16 #include "clang/Basic/Diagnostic.h" 17 #include "clang/Basic/IdentifierTable.h" 18 #include "clang/Basic/LLVM.h" 19 #include "clang/Basic/LangOptions.h" 20 #include "clang/Basic/SourceLocation.h" 21 #include "clang/Basic/SourceManager.h" 22 #include "clang/Basic/TokenKinds.h" 23 #include "clang/Lex/LexDiagnostic.h" 24 #include "clang/Lex/LiteralSupport.h" 25 #include "clang/Lex/MultipleIncludeOpt.h" 26 #include "clang/Lex/Preprocessor.h" 27 #include "clang/Lex/PreprocessorOptions.h" 28 #include "clang/Lex/Token.h" 29 #include "llvm/ADT/STLExtras.h" 30 #include "llvm/ADT/StringExtras.h" 31 #include "llvm/ADT/StringRef.h" 32 #include "llvm/ADT/StringSwitch.h" 33 #include "llvm/Support/Compiler.h" 34 #include "llvm/Support/ConvertUTF.h" 35 #include "llvm/Support/MemoryBufferRef.h" 36 #include "llvm/Support/NativeFormatting.h" 37 #include "llvm/Support/Unicode.h" 38 #include "llvm/Support/UnicodeCharRanges.h" 39 #include <algorithm> 40 #include <cassert> 41 #include <cstddef> 42 #include <cstdint> 43 #include <cstring> 44 #include <optional> 45 #include <string> 46 #include <tuple> 47 #include <utility> 48 49 #ifdef __SSE4_2__ 50 #include <nmmintrin.h> 51 #endif 52 53 using namespace clang; 54 55 //===----------------------------------------------------------------------===// 56 // Token Class Implementation 57 
//===----------------------------------------------------------------------===// 58 59 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 60 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 61 if (isAnnotation()) 62 return false; 63 if (const IdentifierInfo *II = getIdentifierInfo()) 64 return II->getObjCKeywordID() == objcKey; 65 return false; 66 } 67 68 /// getObjCKeywordID - Return the ObjC keyword kind. 69 tok::ObjCKeywordKind Token::getObjCKeywordID() const { 70 if (isAnnotation()) 71 return tok::objc_not_keyword; 72 const IdentifierInfo *specId = getIdentifierInfo(); 73 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 74 } 75 76 /// Determine whether the token kind starts a simple-type-specifier. 77 bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const { 78 switch (getKind()) { 79 case tok::annot_typename: 80 case tok::annot_decltype: 81 case tok::annot_pack_indexing_type: 82 return true; 83 84 case tok::kw_short: 85 case tok::kw_long: 86 case tok::kw___int64: 87 case tok::kw___int128: 88 case tok::kw_signed: 89 case tok::kw_unsigned: 90 case tok::kw_void: 91 case tok::kw_char: 92 case tok::kw_int: 93 case tok::kw_half: 94 case tok::kw_float: 95 case tok::kw_double: 96 case tok::kw___bf16: 97 case tok::kw__Float16: 98 case tok::kw___float128: 99 case tok::kw___ibm128: 100 case tok::kw_wchar_t: 101 case tok::kw_bool: 102 case tok::kw__Bool: 103 case tok::kw__Accum: 104 case tok::kw__Fract: 105 case tok::kw__Sat: 106 #define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait: 107 #include "clang/Basic/TransformTypeTraits.def" 108 case tok::kw___auto_type: 109 case tok::kw_char16_t: 110 case tok::kw_char32_t: 111 case tok::kw_typeof: 112 case tok::kw_decltype: 113 case tok::kw_char8_t: 114 return getIdentifierInfo()->isKeyword(LangOpts); 115 116 default: 117 return false; 118 } 119 } 120 121 //===----------------------------------------------------------------------===// 122 // Lexer 
// Class Implementation
//===----------------------------------------------------------------------===//

void Lexer::anchor() {}

/// Shared initialization used by all Lexer constructors: record the buffer
/// bounds, skip a UTF-8 BOM when positioned at the very start of the file,
/// and reset all per-lexer state to "start of file, not in a directive"
/// defaults. Requires the buffer to be NUL-terminated (asserted below).
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
                           .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
                           .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;
  IsAtPhysicalStartOfLine = true;

  HasLeadingSpace = false;
  HasLeadingEmptyMacro = false;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode. Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;

  NewLinePtr = nullptr;
}

/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process. This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
             Preprocessor &PP, bool IsFirstIncludeOfFile)
    : PreprocessorLexer(&PP, FID),
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
      LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
            InputFile.getBufferEnd());

  // Comment/whitespace retention is derived from the preprocessor's settings.
  resetExtendedTokenMode();
}

/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
             const SourceManager &SM, const LangOptions &langOpts,
             bool IsFirstIncludeOfFile)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
            FromFile.getBufferStart(), FromFile.getBufferEnd(),
            IsFirstIncludeOfFile) {}

/// Switch the lexer's comment/whitespace retention to match the current
/// preprocessor settings. Traditional-CPP mode must keep all whitespace.
void Lexer::resetExtendedTokenMode() {
  assert(PP && "Cannot reset token mode without a preprocessor");
  if (LangOpts.TraditionalCPP)
    SetKeepWhitespaceMode(true);
  else
    SetCommentRetentionState(PP->getCommentRetentionState());
}

/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion. This has a variety of magic semantics that this method
/// sets up. It returns a new'd Lexer that must be delete'd when done.
///
/// On entrance to this routine, TokStartLoc is a macro location which has a
/// spelling loc that indicates the bytes to be lexed for the token and an
/// expansion location that indicates where all lexed tokens should be
/// "expanded from".
///
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
/// normal lexer that remaps tokens as they fly by. This would require making
/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
/// interface that could handle this stuff. This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
                                 SourceLocation ExpansionLocStart,
                                 SourceLocation ExpansionLocEnd,
                                 unsigned TokLen, Preprocessor &PP) {
  SourceManager &SM = PP.getSourceManager();

  // Create the lexer as if we were going to lex the file normally.
  FileID SpellingFID = SM.getFileID(SpellingLoc);
  llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);

  // Now that the lexer is created, change the start/end locations so that we
  // just lex the subsection of the file that we want. This is lexing from a
  // scratch buffer.
  const char *StrData = SM.getCharacterData(SpellingLoc);

  L->BufferPtr = StrData;
  L->BufferEnd = StrData+TokLen;
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

  // Set the SourceLocation with the remapping information. This ensures that
  // GetMappedTokenLoc will remap the tokens as they are lexed.
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
                                     ExpansionLocStart,
                                     ExpansionLocEnd, TokLen);

  // Ensure that the lexer thinks it is inside a directive, so that end \n will
  // return an EOD token.
  L->ParsingPreprocessorDirective = true;

  // This lexer really is for _Pragma.
  L->Is_PragmaLexer = true;
  return L;
}

/// Reposition the lexer at the given byte offset into the buffer, recording
/// whether that position should be treated as the start of a line.
void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
  this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
  this->IsAtStartOfLine = IsAtStartOfLine;
  assert((BufferStart + Offset) <= BufferEnd);
  BufferPtr = BufferStart + Offset;
}

/// Escape a string in place so it can be re-lexed inside a string literal:
/// backslashes and the active quote character get a leading '\', and every
/// newline/carriage-return (including two-character \r\n / \n\r pairs) is
/// rewritten as the two characters '\' 'n'. Templated so it works on both
/// std::string and SmallVectorImpl<char>.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace '\n' and '\r' to '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}

std::string Lexer::Stringify(StringRef Str, bool Charify) {
  std::string Result = std::string(Str);
  char Quote = Charify ? '\'' : '"';
  StringifyImpl(Result, Quote);
  return Result;
}

void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }

//===----------------------------------------------------------------------===//
// Token Spelling
//===----------------------------------------------------------------------===//

/// Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer. The caller must
/// have allocated at least Tok.getLength() bytes at Spelling; the returned
/// length is always strictly smaller (asserted below), since cleaning only
/// removes characters.
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (tok::isStringLiteral(Tok.getKind())) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
      Spelling[Length++] = CharAndSize.Char;
      BufPtr += CharAndSize.Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching closing
      // quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  // Copy the remainder of the token one cleaned character at a time,
  // expanding trigraphs and folding escaped newlines as we go.
  while (BufPtr < BufEnd) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
    Spelling[Length++] = CharAndSize.Char;
    BufPtr += CharAndSize.Size;
  }

  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}

/// getSpelling() - Return the 'spelling' of this token. The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding. In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
StringRef Lexer::getSpelling(SourceLocation loc,
                             SmallVectorImpl<char> &buffer,
                             const SourceManager &SM,
                             const LangOptions &options,
                             bool *invalid) {
  // Break down the source location.
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (invalid) *invalid = true;
    return {};
  }

  const char *tokenBegin = file.data() + locInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
              file.begin(), tokenBegin, file.end());
  Token token;
  lexer.LexFromRawLexer(token);

  unsigned length = token.getLength();

  // Common case: no need for cleaning.
  if (!token.needsCleaning())
    return StringRef(tokenBegin, length);

  // Hard case, we need to relex the characters into the string.
  buffer.resize(length);
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
  return StringRef(buffer.data(), buffer.size());
}

/// getSpelling() - Return the 'spelling' of this token. The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding. In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                               const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  bool CharDataInvalid = false;
  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
                                                    &CharDataInvalid);
  if (Invalid)
    *Invalid = CharDataInvalid;
  if (CharDataInvalid)
    return {};

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning())
    return std::string(TokStart, TokStart + Tok.getLength());

  std::string Result;
  Result.resize(Tok.getLength());
  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
  return Result;
}

/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string. The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long. The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy). The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = nullptr;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifier().data();
  else if (!Tok.hasUCN()) {
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
      // Just return the string from the identifier table, which is very quick.
      Buffer = II->getNameStart();
      return II->getLength();
    }
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (!TokStart) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly. This is
  // the "change the input pointer" path: Buffer now aliases internal storage.
  if (!Tok.needsCleaning()) {
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string. This writes
  // into the caller-preallocated Buffer (see the contract documented above).
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
}

/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
498 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 499 const SourceManager &SM, 500 const LangOptions &LangOpts) { 501 Token TheTok; 502 if (getRawToken(Loc, TheTok, SM, LangOpts)) 503 return 0; 504 return TheTok.getLength(); 505 } 506 507 /// Relex the token at the specified location. 508 /// \returns true if there was a failure, false on success. 509 bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 510 const SourceManager &SM, 511 const LangOptions &LangOpts, 512 bool IgnoreWhiteSpace) { 513 // TODO: this could be special cased for common tokens like identifiers, ')', 514 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 515 // all obviously single-char tokens. This could use 516 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 517 // something. 518 519 // If this comes from a macro expansion, we really do want the macro name, not 520 // the token this macro expanded to. 521 Loc = SM.getExpansionLoc(Loc); 522 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 523 bool Invalid = false; 524 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 525 if (Invalid) 526 return true; 527 528 const char *StrData = Buffer.data()+LocInfo.second; 529 530 if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 531 return true; 532 533 // Create a lexer starting at the beginning of this token. 534 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 535 Buffer.begin(), StrData, Buffer.end()); 536 TheLexer.SetCommentRetentionState(true); 537 TheLexer.LexFromRawLexer(Result); 538 return false; 539 } 540 541 /// Returns the pointer that points to the beginning of line that contains 542 /// the given offset, or null if the offset if invalid. 
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
  const char *BufStart = Buffer.data();
  if (Offset >= Buffer.size())
    return nullptr;

  // Walk backwards until we cross a real (non-escaped) newline; escaped
  // newlines continue the same logical line, so they are skipped over.
  const char *LexStart = BufStart + Offset;
  for (; LexStart != BufStart; --LexStart) {
    if (isVerticalWhitespace(LexStart[0]) &&
        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
      // LexStart should point at first character of logical line.
      ++LexStart;
      break;
    }
  }
  return LexStart;
}

/// Find the start of the token containing (or nearest before) \p Loc by
/// relexing forward from the beginning of its logical line. Returns \p Loc
/// unchanged when the buffer is unavailable or the position cannot be
/// improved.
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer). We'll relex from that point.
  const char *StrData = Buffer.data() + LocInfo.second;
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
  if (!LexStart || LexStart == StrData)
    return Loc;

  // Create a lexer starting at the beginning of this token.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for. If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source location.
  return Loc;
}

SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  // Only macro-argument expansions can be mapped back meaningfully; any
  // other macro location is returned unchanged.
  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  // Compute the token beginning on the spelling location, then translate the
  // resulting backward offset onto the original macro location.
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  std::pair<FileID, unsigned> BeginFileLocInfo =
      SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}

namespace {

// Classification of a preprocessor directive keyword while scanning for the
// preamble: either one that may appear in a preamble (skipped over) or an
// unrecognized one (which terminates the preamble at its '#').
enum PreambleDirectiveKind {
  PDK_Skipped,
  PDK_Unknown
};

} // namespace

PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
                                      const LangOptions &LangOpts,
                                      unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const SourceLocation::UIntTy StartOffset = 1;
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  bool InPreprocessorDirective = false;
  Token TheTok;
  SourceLocation ActiveCommentLoc;

  // Translate the MaxLines limit into a byte offset so the token loop below
  // can compare raw offsets instead of counting lines per token.
  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is. Since we're lexing raw tokens,
      // we don't have an identifier table available. Instead, just look at
      // the raw identifier to recognize and categorize preprocessor directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_Skipped)
              .Case("ifdef", PDK_Skipped)
              .Case("ifndef", PDK_Skipped)
              .Case("elif", PDK_Skipped)
              .Case("elifdef", PDK_Skipped)
              .Case("elifndef", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_Skipped)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point. Roll back the current token to the location of the '#'.
      TheTok = HashTok;
    } else if (TheTok.isAtStartOfLine() &&
               TheTok.getKind() == tok::raw_identifier &&
               TheTok.getRawIdentifier() == "module" &&
               LangOpts.CPlusPlusModules) {
      // The initial global module fragment introducer "module;" is part of
      // the preamble, which runs up to the module declaration "module foo;".
      Token ModuleTok = TheTok;
      do {
        TheLexer.LexFromRawLexer(TheTok);
      } while (TheTok.getKind() == tok::comment);
      if (TheTok.getKind() != tok::semi) {
        // Not global module fragment, roll back.
        TheTok = ModuleTok;
        break;
      }
      continue;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  SourceLocation End;
  if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                        TheTok.isAtStartOfLine());
}

unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is. This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting. Skip
  // over the uninteresting characters. If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
    TokPtr += CharAndSize.Size;
    PhysOffset += CharAndSize.Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token. For example foo\<newline>bar
  // advanced by 3 should return the location of b, not of \\. One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr) - TokPtr;

  return PhysOffset;
}

/// Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different that it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isInvalid())
    return {};

  if (Loc.isMacroID()) {
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return {}; // Points inside the macro expansion.
  }

  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc; // Offset consumed the whole token; stay at its start.

  return Loc.getLocWithOffset(Len);
}

/// Returns true if the given MacroID location points at the first
/// token of the macro expansion.
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroBegin) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation expansionLoc;
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions, this is the first.
    if (MacroBegin)
      *MacroBegin = expansionLoc;
    return true;
  }

  // Still inside a nested macro expansion; recurse outward until we either
  // reach a file location or fall off the start of an expansion.
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
}

/// Returns true if the given MacroID location points at the last
/// token of the macro expansion.
893 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 894 const SourceManager &SM, 895 const LangOptions &LangOpts, 896 SourceLocation *MacroEnd) { 897 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 898 899 SourceLocation spellLoc = SM.getSpellingLoc(loc); 900 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 901 if (tokLen == 0) 902 return false; 903 904 SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 905 SourceLocation expansionLoc; 906 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 907 return false; 908 909 if (expansionLoc.isFileID()) { 910 // No other macro expansions. 911 if (MacroEnd) 912 *MacroEnd = expansionLoc; 913 return true; 914 } 915 916 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 917 } 918 919 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 920 const SourceManager &SM, 921 const LangOptions &LangOpts) { 922 SourceLocation Begin = Range.getBegin(); 923 SourceLocation End = Range.getEnd(); 924 assert(Begin.isFileID() && End.isFileID()); 925 if (Range.isTokenRange()) { 926 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 927 if (End.isInvalid()) 928 return {}; 929 } 930 931 // Break down the source locations. 932 FileID FID; 933 unsigned BeginOffs; 934 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 935 if (FID.isInvalid()) 936 return {}; 937 938 unsigned EndOffs; 939 if (!SM.isInFileID(End, FID, &EndOffs) || 940 BeginOffs > EndOffs) 941 return {}; 942 943 return CharSourceRange::getCharRange(Begin, End); 944 } 945 946 // Assumes that `Loc` is in an expansion. 
static bool isInExpansionTokenRange(const SourceLocation Loc,
                                    const SourceManager &SM) {
  return SM.getSLocEntry(SM.getFileID(Loc))
      .getExpansion()
      .isExpansionTokenRange();
}

/// Accepts a range whose endpoints may be macro locations and returns a
/// character range with file locations, or an invalid range when the
/// endpoints cannot be mapped to a contiguous region of one file.
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return {};

  // Both endpoints already in a file: nothing to map.
  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  // Begin inside a macro: it must be the first token of its expansion for
  // the range to be expressible in the file.
  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return {};
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // End inside a macro: for a token range it must be the last token of the
  // expansion; for a char range, the start of one.
  if (Begin.isFileID() && End.isMacroID()) {
    if (Range.isTokenRange()) {
      if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
        return {};
      // Use the *original* end, not the expanded one in `End`.
      Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
    } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
      return {};
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Both endpoints inside macros: try to map the whole expansion(s) back to
  // file locations.
  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    // Use the *original* `End`, not the expanded one in `MacroEnd`.
    if (Range.isTokenRange())
      Range.setTokenRange(isInExpansionTokenRange(End, SM));
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Last resort: if both endpoints come from argument expansions of the same
  // macro invocation, retry with their immediate spelling locations.
  bool Invalid = false;
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                        &Invalid);
  if (Invalid)
    return {};

  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                        &Invalid);
    if (Invalid)
      return {};

    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return {};
}

/// Returns a string for the source that the range encompasses.  On failure
/// (the range cannot be mapped to a single file region, or the buffer cannot
/// be loaded) sets \p Invalid to true if provided and returns an empty
/// StringRef.
StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // The end must be in the same file, at or after the begin offset.
  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return {};
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}

/// Returns the name of the macro responsible for the immediate expansion
/// containing \p Loc, as spelled in its source buffer.
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (true) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

/// Like getImmediateMacroName, but for diagnostics: walks past macro argument
/// expansions and returns an empty string when the spelling is not a plain
/// file location (i.e. a token paste, stringization, or scratch space).
StringRef Lexer::getImmediateMacroNameForDiagnostics(
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
  // Walk past macro argument expansions.
  while (SM.isMacroArgExpansion(Loc))
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();

  // If the macro's spelling isn't FileID or from scratch space, then it's
  // actually a token paste or stringization (or similar) and not a macro at
  // all.
  SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
  if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
    return {};

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

/// Returns true if \p c is an ASCII identifier-continue character; '$' is
/// accepted only when LangOpts.DollarIdents is enabled.
bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
  return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
}

/// Returns true if the vertical whitespace at \p Str is preceded - ignoring
/// any horizontal whitespace - by a backslash, i.e. it ends an escaped
/// (continued) line.  \p Str must point at vertical whitespace inside the
/// buffer beginning at \p BufferStart.
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
  assert(isVerticalWhitespace(Str[0]));
  if (Str - 1 < BufferStart)
    return false;

  // Treat \r\n and \n\r as a single newline; step back over both halves.
  if ((Str[0] == '\n' && Str[-1] == '\r') ||
      (Str[0] == '\r' && Str[-1] == '\n')) {
    if (Str - 2 < BufferStart)
      return false;
    --Str;
  }
  --Str;

  // Rewind to first non-space character:
  while (Str > BufferStart && isHorizontalWhitespace(*Str))
    --Str;

  return *Str == '\\';
}

/// Returns the leading whitespace (spaces and tabs) of the line containing
/// \p Loc; returns an empty StringRef when \p Loc is invalid, a macro
/// location, or the line cannot be found in the buffer.
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
                                       const SourceManager &SM) {
  if (Loc.isInvalid() || Loc.isMacroID())
    return {};
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return {};
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return {};
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
  if (!Line)
    return {};
  StringRef Rest = Buffer.substr(Line - Buffer.data());
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
  return NumWhitespaceChars == StringRef::npos
             ? ""
             : Rest.take_front(NumWhitespaceChars);
}

//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
//===----------------------------------------------------------------------===//

/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
/// lexer buffer was all expanded at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
///
/// \param FileLoc  The macro location the whole mapped buffer was expanded at.
/// \param CharNo   Byte offset of the token from the spelling location of
///                 \p FileLoc.
/// \param TokLen   Token length, passed through to the new expansion SLoc.
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo, unsigned TokLen) {
  assert(FileLoc.isMacroID() && "Must be a macro expansion");

  // Otherwise, we're lexing "mapped tokens".  This is used for things like
  // _Pragma handling.  Combine the expansion location of FileLoc with the
  // spelling location.
  SourceManager &SM = PP.getSourceManager();

  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);

  // Figure out the expansion loc range, which is the range covered by the
  // original _Pragma(...) sequence.
  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);

  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
}

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics.  This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
  char Res = GetTrigraphCharForLetter(*CP);
  if (!Res)
    return Res;

  // Trigraphs disabled: diagnose (unless raw lexing) and do not convert.
  if (!Trigraphs) {
    if (L && !L->isLexingRawMode())
      L->Diag(CP-2, diag::trigraph_ignored);
    return 0;
  }

  // Trigraphs enabled: diagnose the conversion and return the decoded char.
  if (L && !L->isLexingRawMode())
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
  return Res;
}

/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    // Keep scanning until we reach the vertical whitespace.
    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (true) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      // FIXME: Take LangOpts into account; the language might not
      // support trigraphs.
      AfterEscape = P+3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}

/// Returns the raw token that starts immediately after the token containing
/// \p Loc, or std::nullopt when \p Loc is inside a macro but not at the end
/// of its expansion, or when the file buffer cannot be loaded.
std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isMacroID()) {
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return std::nullopt;
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return std::nullopt;

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  return Tok;
}

/// Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
  std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
  if (!Tok || Tok->isNot(TKind))
    return {};
  SourceLocation TokenLoc = Tok->getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }

    // Skip \r, \n, \r\n, or \n\r
    if (C == '\n' || C == '\r') {
      char PrevC = C;
      C = *(++TokenEnd);
      NumWhitespaceChars++;
      if ((C == '\n' || C == '\r') && C != PrevC)
        NumWhitespaceChars++;
    }
  }

  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
}

/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                    LangOpts.Trigraphs)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // A trigraph can decode to '\\', which may begin an escaped newline.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
                                                 const LangOptions &LangOpts) {

  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // A trigraph can decode to '\\', which may begin an escaped newline.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// Routine that indiscriminately sets the offset into the source file.
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
  // Clamp to the end of the buffer so we never point past it.
  BufferPtr = BufferStart + Offset;
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  // FIXME: What exactly does the StartOfLine bit mean?  There are two
  // possible meanings for the "start" of the line: the first token on the
  // unexpanded line, or the first token on the expanded line.
  IsAtStartOfLine = StartOfLine;
  IsAtPhysicalStartOfLine = StartOfLine;
}

// Returns true if \p Codepoint is a Unicode whitespace character, per the
// UnicodeWhitespaceCharRanges table.
static bool isUnicodeWhitespace(uint32_t Codepoint) {
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
      UnicodeWhitespaceCharRanges);
  return UnicodeWhitespaceChars.contains(Codepoint);
}

// Formats codepoint \p C as an uppercase hexadecimal string of at least four
// digits (e.g. U+0041 -> "0041"), for use in diagnostics.
static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
  llvm::SmallString<5> CharBuf;
  llvm::raw_svector_ostream CharOS(CharBuf);
  llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
  return CharBuf;
}

// To mitigate https://github.com/llvm/llvm-project/issues/54732,
// we allow "Mathematical Notation Characters" in identifiers.
// This is a proposed profile that extends the XID_Start/XID_continue
// with mathematical symbols, superscripts and subscripts digits
// found in some production software.
// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
                                      bool IsStart, bool &IsExtension) {
  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);
  // Continue-only characters are accepted in non-leading positions only.
  if (MathStartChars.contains(C) ||
      (!IsStart && MathContinueChars.contains(C))) {
    IsExtension = true;
    return true;
  }
  return false;
}

// Returns true if codepoint \p C may appear in a non-leading position of an
// identifier under the given language options.  \p IsExtension is set when
// the character is only accepted via the mathematical-notation extension
// profile.
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
                            bool &IsExtension) {
  if (LangOpts.AsmPreprocessor) {
    return false;
  } else if (LangOpts.DollarIdents && '$' == C) {
    return true;
  } else if (LangOpts.CPlusPlus || LangOpts.C23) {
    // A non-leading codepoint must have the XID_Continue property.
    // XIDContinueRanges doesn't contain characters also in XIDStartRanges,
    // so we need to check both tables.
    // '_' doesn't have the XID_Continue property but is allowed in C and C++.
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
    if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
      return true;
    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
                                     IsExtension);
  } else if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
        C11AllowedIDCharRanges);
    return C11AllowedIDChars.contains(C);
  } else {
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    return C99AllowedIDChars.contains(C);
  }
}

// Returns true if the non-ASCII codepoint \p C may start an identifier under
// the given language options.  \p IsExtension is set when the character is
// only accepted via the mathematical-notation extension profile.
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
                                     bool &IsExtension) {
  assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
  IsExtension = false;
  if (LangOpts.AsmPreprocessor) {
    return false;
  }
  if (LangOpts.CPlusPlus || LangOpts.C23) {
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    if (XIDStartChars.contains(C))
      return true;
    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
                                     IsExtension);
  }
  // Pre-C11/C99 modes: allowed as a continue character, minus the explicitly
  // disallowed initial characters.
  if (!isAllowedIDChar(C, LangOpts, IsExtension))
    return false;
  if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
        C11DisallowedInitialIDCharRanges);
    return !C11DisallowedInitialIDChars.contains(C);
  }
  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
      C99DisallowedInitialIDCharRanges);
  return !C99DisallowedInitialIDChars.contains(C);
}

// Emits the "mathematical notation" extension diagnostic for codepoint \p C
// spanning \p Range.
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
                                          CharSourceRange Range) {

  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);

  // The sets are referenced only by the assert below; the void-casts silence
  // unused-variable warnings in builds with assertions disabled.
  (void)MathStartChars;
  (void)MathContinueChars;
  assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
         "Unexpected mathematical notation codepoint");
  Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
      << codepointAsHexString(C) << Range;
}

// Builds a char range covering [Begin, End) in the buffer being lexed by
// \p L.
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
                                            const char *End) {
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
                                       L.getSourceLocation(End));
}

// Warns when an extended identifier character accepted under the current
// standard would not have been valid in C99: either anywhere in an
// identifier, or specifically as its first character.
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
                                      CharSourceRange Range, bool IsFirst) {
  // Check C99 compatibility.
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
    enum {
      CannotAppearInIdentifier = 0,
      CannotStartIdentifier
    };

    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    if (!C99AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
          << Range
          << CannotAppearInIdentifier;
    } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
          << Range
          << CannotStartIdentifier;
    }
  }
}

/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // Sorted by codepoint so it can be binary-searched. LooksLike == 0 marks an
  // invisible (zero-width) character rather than a lookalike of an ASCII one.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}
  };
  // The trailing {0, 0} sentinel is excluded from the search range.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << codepointAsHexString(C) << LooksLikeStr;
    } else {
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << codepointAsHexString(C);
    }
  }
}

/// Diagnose a non-ASCII codepoint that is not valid in an identifier (or not
/// valid in this position), emitting a fix-it that removes it.
static void diagnoseInvalidUnicodeCodepointInIdentifier(
    DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
    CharSourceRange Range, bool IsFirst) {
  if (isASCII(CodePoint))
    return;

  bool IsExtension;
  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
  bool IsIDContinue =
      IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);

  // Valid in this position: nothing to diagnose.
  if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
    return;

  bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

  if (!IsFirst || InvalidOnlyAtStart) {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
        << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
        << FixItHint::CreateRemoval(Range);
  } else {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
        << Range << codepointAsHexString(CodePoint)
        << FixItHint::CreateRemoval(Range);
  }
}

/// Try to consume a universal character name (\uXXXX or \UXXXXXXXX) at
/// CurPtr + Size as an identifier character. Returns true (and advances
/// CurPtr past the UCN) on success.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0) {
    return false;
  }
  bool IsExtension = false;
  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CurPtr, UCNPtr),
          /*IsFirst=*/false);

    // We got a unicode codepoint that is neither a space nor
    // a valid identifier part.
    // Carry on as if the codepoint was valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
                                    makeCharRange(*this, CurPtr, UCNPtr));

    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);
  }

  Result.setFlag(Token::HasUCN);
  // If the UCN occupies exactly its spelled length (no escaped newlines or
  // trigraphs inside), advance directly; otherwise re-lex it character by
  // character so the cleaning is recorded on the token.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}

/// Try to consume a non-ASCII UTF-8 encoded codepoint at CurPtr as an
/// identifier character. Returns true (and advances CurPtr past the
/// sequence) on success.
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
  llvm::UTF32 CodePoint;

  // If a UTF-8 codepoint appears immediately after an escaped new line,
  // CurPtr may point to the splicing \ on the preceding line,
  // so we need to skip it.
  unsigned FirstCodeUnitSize;
  getCharAndSize(CurPtr, FirstCodeUnitSize);
  const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
  const char *UnicodePtr = CharStart;

  llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
      (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
      &CodePoint, llvm::strictConversion);
  if (ConvResult != llvm::conversionOK)
    return false;

  bool IsExtension = false;
  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
                       IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;

    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
    // We got a unicode codepoint that is neither a space nor
    // a valid identifier part. Carry on as if the codepoint was
    // valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(
          PP->getDiagnostics(), CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr));
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CharStart, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CharStart, UnicodePtr));
  }

  // Once we successfully parsed some UTF-8,
  // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
  // being lexed, and that warnings about trailing spaces are emitted.
  ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
  CurPtr = UnicodePtr;
  return true;
}

/// Lex a token whose first codepoint C is a non-ASCII character spelled in
/// the source between BufferPtr and CurPtr. Produces an identifier if C may
/// start one, otherwise drops the character or forms a tok::unknown.
bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
                                      const char *CurPtr) {
  bool IsExtension = false;
  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput()) {
      if (IsExtension)
        diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
                                      makeCharRange(*this, BufferPtr, CurPtr));
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                makeCharRange(*this, BufferPtr, CurPtr),
                                /*IsFirst=*/true);
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr));
    }

    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);
  }

  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
      !isUnicodeWhitespace(C)) {
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
    diagnoseInvalidUnicodeCodepointInIdentifier(
        PP->getDiagnostics(), LangOpts, C,
        makeCharRange(*this, BufferPtr, CurPtr), /*IsFirst=*/true);
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}

/// Scan forward over plain ASCII identifier characters [_A-Za-z0-9], using
/// SSE4.2 string instructions when available. Returns a pointer to the first
/// character that is not one of them.
static const char *
fastParseASCIIIdentifier(const char *CurPtr,
                         [[maybe_unused]] const char *BufferEnd) {
#ifdef __SSE4_2__
  // Character ranges for PCMPISTRI: '_'-'_', 'A'-'Z', 'a'-'z', '0'-'9'.
  alignas(16) static constexpr char AsciiIdentifierRange[16] = {
      '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
  };
  constexpr ssize_t BytesPerRegister = 16;

  __m128i AsciiIdentifierRangeV =
      _mm_load_si128((const __m128i *)AsciiIdentifierRange);

  while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
    __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));

    // Index of the first byte outside the identifier ranges, or 16 when all
    // 16 loaded bytes are identifier characters.
    int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
                                _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES |
                                    _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY);
    CurPtr += Consumed;
    if (Consumed == BytesPerRegister)
      continue;
    return CurPtr;
  }
#endif

  // Scalar loop: also serves as the tail when fewer than 16 bytes remain.
  unsigned char C = *CurPtr;
  while (isAsciiIdentifierContinue(C))
    C = *++CurPtr;
  return CurPtr;
}

bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched an identifier start.

  while (true) {

    CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);

    unsigned Size;
    // Slow path: handle trigraph, unicode codepoints, UCNs.
    unsigned char C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents)
        break;
      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      continue;
    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      continue;
    // Neither an expected Unicode codepoint nor a UCN.
    break;
  }

  const char *IdStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
  Result.setRawIdentifierData(IdStart);

  // If we are in raw mode, return this identifier raw. There is no need to
  // look up identifier information or attempt to macro expand it.
  if (LexingRawMode)
    return true;

  // Fill in Result.IdentifierInfo and update the token kind,
  // looking up the identifier in the identifier table.
  const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
  // Note that we have to call PP->LookUpIdentifierInfo() even for code
  // completion, it writes IdentifierInfo into Result, and callers rely on it.

  // If the completion point is at the end of an identifier, we want to treat
  // the identifier as incomplete even if it resolves to a macro or a keyword.
  // This allows e.g. 'class^' to complete to 'classifier'.
  if (isCodeCompletionPoint(CurPtr)) {
    // Return the code-completion token.
    Result.setKind(tok::code_completion);
    // Skip the code-completion char and all immediate identifier characters.
    // This ensures we get consistent behavior when completing at any point in
    // an identifier (i.e. at the start, in the middle, at the end). Note that
    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
    // simpler.
    assert(*CurPtr == 0 && "Completion character must be 0");
    ++CurPtr;
    // Note that code completion token is not added as a separate character
    // when the completion point is at the end of the buffer. Therefore, we need
    // to check if the buffer has ended.
    if (CurPtr < BufferEnd) {
      while (isAsciiIdentifierContinue(*CurPtr))
        ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // Finally, now that we know we have an identifier, pass this off to the
  // preprocessor, which may macro expand it or something.
  if (II->isHandleIdentifierCase())
    return PP->HandleIdentifier(Result);

  return true;
}

/// isHexaLiteral - Return true if Start points to a hex constant.
/// in microsoft mode (where this is supposed to be several different tokens).
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
  auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
  char C1 = CharAndSize1.Char;
  if (C1 != '0')
    return false;

  auto CharAndSize2 =
      Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
  char C2 = CharAndSize2.Char;
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed. Return the end of the
/// constant.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    // In HLSL, '.' followed by 'x' or 'r' is not part of the number: back up
    // over the '.' and end the constant here (likely a vector swizzle such as
    // '1.xxxx' — TODO confirm intent against HLSL lexing tests).
    if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
      CurPtr -= Size;
      break;
    }
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!LangOpts.CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
    auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
    if (isAsciiIdentifierContinue(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.CPlusPlus
                         ? diag::warn_cxx11_compat_digit_separator
                         : diag::warn_c23_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(LangOpts.CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isAsciiIdentifierStart(C)) {
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      Consumed = true;
    else
      return CurPtr; // No ud-suffix follows the literal.
  }

  if (!LangOpts.CPlusPlus11) {
    // Pre-C++11: warn that C++11 would treat this as a ud-suffix, and
    // suggest inserting whitespace to separate the tokens.
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && LangOpts.CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        auto [Next, NextSize] =
            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
        if (!isAsciiIdentifierContinue(Next)) {
          // End of suffix. Check whether this is on the allowed list.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix =
              StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
            << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  // Consume the rest of the suffix, which may include UCNs and UTF-8.
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
    } else
      break;
  }

  return CurPtr;
}

/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
                                       : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  // Between the initial and final double quote characters of the raw string,
  // any transformations performed in phases 1 and 2 (trigraphs,
  // universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // Scan the d-char-sequence delimiter, which is at most 16 characters.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
    if (!isLexingRawMode() &&
        llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
      const char *Pos = &CurPtr[PrefixLen];
      Diag(Pos, LangOpts.CPlusPlus26
                    ? diag::warn_cxx26_compat_raw_string_literal_character_set
                    : diag::ext_cxx26_raw_string_literal_character_set)
          << StringRef(Pos, 1);
    }
    ++PrefixLen;
  }

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else if (*PrefixEnd == '\n') {
        Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
            << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
            << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character. This is used for #include filenames.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (isVerticalWhitespace(C) ||               // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character. Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}

/// Handle a code-completion point inside an #include filename, where
/// CompletionPoint lies within the partial path starting at PathStart.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
  auto Slash = PartialPath.find_last_of(SlashChars);
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote or closest slash,
  // if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    // Stop before the end of the line; keep the closing quote/angle or the
    // next path separator inside the replaced range.
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
    if (SlashChars.contains(Next))
      break;
  }

  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}

/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx14_compat_u8_character_literal
                          : diag::warn_c17_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant; diagnose and return tok::unknown.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  const char *lastNewLine = nullptr;
  // Track the last newline seen; NewLinePtr keeps the first one, so together
  // they delimit any run of blank lines.
  auto setLastNewLine = [&](const char *Ptr) {
    lastNewLine = Ptr;
    if (!NewLinePtr)
      NewLinePtr = Ptr;
  };
  if (SawNewline)
    setLastNewLine(CurPtr - 1);

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    if (*CurPtr == '\n')
      setLastNewLine(CurPtr);
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;

    // Distinct first and last newlines mean at least one empty line was
    // skipped; notify the registered EmptylineHandler, if any.
    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
      if (auto *Handler = PP->getEmptylineHandler())
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
                                             getSourceLocation(lastNewLine)));
    }
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    if (!isASCII(C)) {
      // Non-ASCII byte: validate the UTF-8 sequence, diagnosing once per
      // ill-formed run.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline (??/ is the trigraph for '\').
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  NewLinePtr = CurPtr++;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*'; // Change prefix to "/*".
  Spelling += "*/";  // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
/// a diagnostic if so.
/// We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  // Walk backwards from the newline over any chain of escaped newlines until
  // we either find the '*' that (after splicing) forms '*/', or prove this is
  // not an escaped-newline comment end.
  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it.  We allow whitespace
    // between the slash and newline.
    while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
      SpacePos = CurPtr;
      --CurPtr;
    }

    // If we have a slash, this is an escaped newline.
    if (*CurPtr == '\\') {
      --CurPtr;
    } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash.
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end.  The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
        if (!isASCII(C))
          goto MultiByteUTF8;
        C = *CurPtr++;
      }
      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr + 16 < BufferEnd) {
        // Any byte with the high bit set (non-ASCII) bails to the slow path.
        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
        if (LLVM_UNLIKELY(Mask != 0)) {
          goto MultiByteUTF8;
        }
        // look for slashes
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                                   Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80};
      __vector unsigned char Slashes = {
        '/', '/', '/', '/', '/', '/', '/', '/',
        '/', '/', '/', '/', '/', '/', '/', '/'
      };
      while (CurPtr + 16 < BufferEnd) {
        if (LLVM_UNLIKELY(
                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
          goto MultiByteUTF8;
        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
          break;
        }
        CurPtr += 16;
      }

#else
      // Portable scalar fallback: scan 16 bytes at a time.
      while (CurPtr + 16 < BufferEnd) {
        bool HasNonASCII = false;
        for (unsigned I = 0; I < 16; ++I)
          HasNonASCII |= !isASCII(CurPtr[I]);

        if (LLVM_UNLIKELY(HasNonASCII))
          goto MultiByteUTF8;

        bool HasSlash = false;
        for (unsigned I = 0; I < 16; ++I)
          HasSlash |= CurPtr[I] == '/';
        if (HasSlash)
          break;
        CurPtr += 16;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder, warning on invalid UTF-8
    // if the corresponding warning is enabled, emitting a diagnostic only once
    // per sequence that cannot be decoded.
    while (C != '/' && C != '\0') {
      if (isASCII(C)) {
        UnicodeDecodingAlreadyDiagnosed = false;
        C = *CurPtr++;
        continue;
      }
    MultiByteUTF8:
      // CurPtr is 1 code unit past C, so to decode
      // the codepoint, we need to read from the previous position.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length - 1;
      }
      C = *CurPtr++;
    }

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*') // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
                                                  LangOpts.Trigraphs)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.  This switches the lexer out of directive mode.
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;
  Tmp.startToken();

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (true) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0: // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
      [[fallthrough]];
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          PP->CodeCompleteNaturalLanguage();
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done;
      return;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.
/// Handle this condition, reporting diagnostics and handling other edge cases
/// as required.  This returns true if Result contains a token, false if
/// PP.Lex should be called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    if (PP)
      resetExtendedTokenMode();
    return true; // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
    // If the preamble cuts off the end of a header guard, consider it guarded.
    // The guard is valid for the preamble content itself, and for tools the
    // most useful answer is "yes, this file has a header guard".
    if (!ConditionalStack.empty())
      MIOpt.ExitTopLevelConditional();
    ConditionalStack.clear();
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error.  The diagnostic is suppressed
  // when this is the code-completion file.
  while (!ConditionalStack.empty()) {
    if (PP->getCodeCompletionFileLoc() != FileLoc)
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
    DiagnosticsEngine &Diags = PP->getDiagnostics();
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
    unsigned DiagID;

    if (LangOpts.CPlusPlus11) {
      // C++11 [lex.phases] 2.2 p2
      // Prefer the C++98 pedantic compatibility warning over the generic,
      // non-extension, user-requested "missing newline at EOF" warning.
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
      } else {
        DiagID = diag::warn_no_newline_eof;
      }
    } else {
      DiagID = diag::ext_no_newline_eof;
    }

    Diag(BufferEnd, DiagID)
        << FixItHint::CreateInsertion(EndLoc, "\n");
  }

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result, isPragmaLexer());
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  if (isDependencyDirectivesLexer()) {
    if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
      return 2;
    return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
        tok::l_paren);
  }

  // Switch to 'skipping' mode.  This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
  bool atStartOfLine = IsAtStartOfLine;
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  bool leadingSpace = HasLeadingSpace;

  Token Tok;
  Lex(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;
  ParsingPreprocessorDirective = inPPDirectiveMode;
  HasLeadingSpace = leadingSpace;
  IsAtStartOfLine = atStartOfLine;
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}

/// Find the end of a version control conflict marker.
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
                                   ConflictMarkerKind CMK) {
  // Perforce markers end with "<<<<\n"; normal (diff3/git) markers with
  // ">>>>>>>".
  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
  size_t Pos = RestOfBuffer.find(Terminator);
  while (Pos != StringRef::npos) {
    // Must occur at start of line.
    if (Pos == 0 ||
        (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
      Pos = RestOfBuffer.find(Terminator);
      continue;
    }
    return RestOfBuffer.data()+Pos;
  }
  return nullptr;
}

/// IsStartOfConflictMarker - If the specified pointer is the start of a version
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
/// and recover nicely.  This returns true if it is a conflict marker and false
/// if not.
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // Check to see if we have <<<<<<< or >>>>.
  if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
      !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (CurrentConflictMarkerState || isLexingRawMode())
    return false;

  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;

  // Check to see if there is an ending marker somewhere in the buffer at the
  // start of a line to terminate this conflict marker.
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
    // We found a match.  We are really in a conflict marker.
    // Diagnose this, and ignore to the end of line.
    Diag(CurPtr, diag::err_conflict_marker);
    CurrentConflictMarkerState = Kind;

    // Skip ahead to the end of line.  We know this exists because the
    // end-of-conflict marker starts with \r or \n.
    while (*CurPtr != '\r' && *CurPtr != '\n') {
      assert(CurPtr != BufferEnd && "Didn't find end of line");
      ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // No end of conflict marker found.
  return false;
}

/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
/// is the end of a conflict marker.  Handle it by ignoring up until the end of
/// the line.  This returns true if it is a conflict marker and false if not.
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (!CurrentConflictMarkerState || isLexingRawMode())
    return false;

  // Check to see if we have the marker (4 characters in a row).
  for (unsigned i = 1; i != 4; ++i)
    if (CurPtr[i] != CurPtr[0])
      return false;

  // If we do have it, search for the end of the conflict marker.  This could
  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
  // be the end of conflict marker.
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                        CurrentConflictMarkerState)) {
    CurPtr = End;

    // Skip ahead to the end of line.
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
      ++CurPtr;

    BufferPtr = CurPtr;

    // No longer in the conflict marker.
    CurrentConflictMarkerState = CMK_None;
    return true;
  }

  return false;
}

/// Find the '#>' that terminates an editor placeholder.  Returns a pointer
/// just past the '#>', or nullptr if no terminator is found in the buffer.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  BufferEnd -= 1; // Scan until the second last character.
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}

/// Lex an editor placeholder token (<#...#>) as a raw identifier, diagnosing
/// it unless placeholders are explicitly allowed by the language options.
/// Returns false if placeholders are disabled or no terminator is found.
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
    return false;
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start, diag::err_placeholder_in_source);
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  PP->LookUpIdentifierInfo(Result);
  Result.setFlag(Token::IsEditorPlaceholder);
  BufferPtr = End;
  return true;
}

/// Return true if the given buffer position is the code-completion point
/// registered with the preprocessor.
bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  if (PP && PP->isCodeCompletionEnabled()) {
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
    return Loc == PP->getCodeCompletionLoc();
  }

  return false;
}

/// Try to lex a numeric universal-character-name (\uXXXX, \UXXXXXXXX, or the
/// delimited \u{...} form), returning the code point on success or
/// std::nullopt on error.
std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
                                                 const char *SlashLoc,
                                                 Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;

  bool Delimited = false;
  bool FoundEndDelimiter = false;
  unsigned Count = 0;
  // Only diagnose when a Token is supplied and we aren't in raw mode.
  bool Diagnose = Result && !isLexingRawMode();

  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return std::nullopt;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  while (Count != NumHexDigits || Delimited) {
    char C = getCharAndSize(CurPtr, CharSize);
    if (!Delimited && Count == 0 && C == '{') {
      Delimited = true;
      CurPtr += CharSize;
      continue;
    }

    if (Delimited && C == '}') {
      CurPtr += CharSize;
      FoundEndDelimiter = true;
      break;
    }

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      if (!Delimited)
        break;
      if (Diagnose)
        Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
            << StringRef(KindLoc, 1);
      return std::nullopt;
    }

    // Overflow check: the next shift would discard significant bits.
    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
      return std::nullopt;
    }

    CodePoint <<= 4;
    CodePoint |= Value;
    CurPtr += CharSize;
    Count++;
  }

  if (Count == 0) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (Delimited && Kind == 'U') {
    if (Diagnose)
      Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (!Delimited && Count != NumHexDigits) {
    if (Diagnose) {
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
      // If the user wrote \U1234, suggest a fixit to \u.
3468 if (Count == 4 && NumHexDigits == 8) { 3469 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 3470 Diag(KindLoc, diag::note_ucn_four_not_eight) 3471 << FixItHint::CreateReplacement(URange, "u"); 3472 } 3473 } 3474 return std::nullopt; 3475 } 3476 3477 if (Delimited && PP) { 3478 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23 3479 ? diag::warn_cxx23_delimited_escape_sequence 3480 : diag::ext_delimited_escape_sequence) 3481 << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0); 3482 } 3483 3484 if (Result) { 3485 Result->setFlag(Token::HasUCN); 3486 // If the UCN contains either a trigraph or a line splicing, 3487 // we need to call getAndAdvanceChar again to set the appropriate flags 3488 // on Result. 3489 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0))) 3490 StartPtr = CurPtr; 3491 else 3492 while (StartPtr != CurPtr) 3493 (void)getAndAdvanceChar(StartPtr, *Result); 3494 } else { 3495 StartPtr = CurPtr; 3496 } 3497 return CodePoint; 3498 } 3499 3500 std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr, 3501 const char *SlashLoc, 3502 Token *Result) { 3503 unsigned CharSize; 3504 bool Diagnose = Result && !isLexingRawMode(); 3505 3506 char C = getCharAndSize(StartPtr, CharSize); 3507 assert(C == 'N' && "expected \\N{...}"); 3508 3509 const char *CurPtr = StartPtr + CharSize; 3510 const char *KindLoc = &CurPtr[-1]; 3511 3512 C = getCharAndSize(CurPtr, CharSize); 3513 if (C != '{') { 3514 if (Diagnose) 3515 Diag(SlashLoc, diag::warn_ucn_escape_incomplete); 3516 return std::nullopt; 3517 } 3518 CurPtr += CharSize; 3519 const char *StartName = CurPtr; 3520 bool FoundEndDelimiter = false; 3521 llvm::SmallVector<char, 30> Buffer; 3522 while (C) { 3523 C = getCharAndSize(CurPtr, CharSize); 3524 CurPtr += CharSize; 3525 if (C == '}') { 3526 FoundEndDelimiter = true; 3527 break; 3528 } 3529 3530 if (isVerticalWhitespace(C)) 3531 break; 3532 Buffer.push_back(C); 3533 } 3534 3535 if (!FoundEndDelimiter || 
Buffer.empty()) { 3536 if (Diagnose) 3537 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty 3538 : diag::warn_delimited_ucn_incomplete) 3539 << StringRef(KindLoc, 1); 3540 return std::nullopt; 3541 } 3542 3543 StringRef Name(Buffer.data(), Buffer.size()); 3544 std::optional<char32_t> Match = 3545 llvm::sys::unicode::nameToCodepointStrict(Name); 3546 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch; 3547 if (!Match) { 3548 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name); 3549 if (Diagnose) { 3550 Diag(StartName, diag::err_invalid_ucn_name) 3551 << StringRef(Buffer.data(), Buffer.size()) 3552 << makeCharRange(*this, StartName, CurPtr - CharSize); 3553 if (LooseMatch) { 3554 Diag(StartName, diag::note_invalid_ucn_name_loose_matching) 3555 << FixItHint::CreateReplacement( 3556 makeCharRange(*this, StartName, CurPtr - CharSize), 3557 LooseMatch->Name); 3558 } 3559 } 3560 // We do not offer misspelled character names suggestions here 3561 // as the set of what would be a valid suggestion depends on context, 3562 // and we should not make invalid suggestions. 3563 } 3564 3565 if (Diagnose && Match) 3566 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23 3567 ? diag::warn_cxx23_delimited_escape_sequence 3568 : diag::ext_delimited_escape_sequence) 3569 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0); 3570 3571 // If no diagnostic has been emitted yet, likely because we are doing a 3572 // tentative lexing, we do not want to recover here to make sure the token 3573 // will not be incorrectly considered valid. This function will be called 3574 // again and a diagnostic emitted then. 3575 if (LooseMatch && Diagnose) 3576 Match = LooseMatch->CodePoint; 3577 3578 if (Result) { 3579 Result->setFlag(Token::HasUCN); 3580 // If the UCN contains either a trigraph or a line splicing, 3581 // we need to call getAndAdvanceChar again to set the appropriate flags 3582 // on Result. 
3583 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3)) 3584 StartPtr = CurPtr; 3585 else 3586 while (StartPtr != CurPtr) 3587 (void)getAndAdvanceChar(StartPtr, *Result); 3588 } else { 3589 StartPtr = CurPtr; 3590 } 3591 return Match ? std::optional<uint32_t>(*Match) : std::nullopt; 3592 } 3593 3594 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 3595 Token *Result) { 3596 3597 unsigned CharSize; 3598 std::optional<uint32_t> CodePointOpt; 3599 char Kind = getCharAndSize(StartPtr, CharSize); 3600 if (Kind == 'u' || Kind == 'U') 3601 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result); 3602 else if (Kind == 'N') 3603 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result); 3604 3605 if (!CodePointOpt) 3606 return 0; 3607 3608 uint32_t CodePoint = *CodePointOpt; 3609 3610 // Don't apply C family restrictions to UCNs in assembly mode 3611 if (LangOpts.AsmPreprocessor) 3612 return CodePoint; 3613 3614 // C23 6.4.3p2: A universal character name shall not designate a code point 3615 // where the hexadecimal value is: 3616 // - in the range D800 through DFFF inclusive; or 3617 // - greater than 10FFFF. 3618 // A universal-character-name outside the c-char-sequence of a character 3619 // constant, or the s-char-sequence of a string-literal shall not designate 3620 // a control character or a character in the basic character set. 3621 3622 // C++11 [lex.charset]p2: If the hexadecimal value for a 3623 // universal-character-name corresponds to a surrogate code point (in the 3624 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 3625 // if the hexadecimal value for a universal-character-name outside the 3626 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 3627 // string literal corresponds to a control character (in either of the 3628 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 3629 // basic source character set, the program is ill-formed. 
3630 if (CodePoint < 0xA0) { 3631 // We don't use isLexingRawMode() here because we need to warn about bad 3632 // UCNs even when skipping preprocessing tokens in a #if block. 3633 if (Result && PP) { 3634 if (CodePoint < 0x20 || CodePoint >= 0x7F) 3635 Diag(BufferPtr, diag::err_ucn_control_character); 3636 else { 3637 char C = static_cast<char>(CodePoint); 3638 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 3639 } 3640 } 3641 3642 return 0; 3643 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 3644 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 3645 // We don't use isLexingRawMode() here because we need to diagnose bad 3646 // UCNs even when skipping preprocessing tokens in a #if block. 3647 if (Result && PP) { 3648 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 3649 Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 3650 else 3651 Diag(BufferPtr, diag::err_ucn_escape_invalid); 3652 } 3653 return 0; 3654 } 3655 3656 return CodePoint; 3657 } 3658 3659 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 3660 const char *CurPtr) { 3661 if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3662 isUnicodeWhitespace(C)) { 3663 Diag(BufferPtr, diag::ext_unicode_whitespace) 3664 << makeCharRange(*this, BufferPtr, CurPtr); 3665 3666 Result.setFlag(Token::LeadingSpace); 3667 return true; 3668 } 3669 return false; 3670 } 3671 3672 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 3673 IsAtStartOfLine = Result.isAtStartOfLine(); 3674 HasLeadingSpace = Result.hasLeadingSpace(); 3675 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 3676 // Note that this doesn't affect IsAtPhysicalStartOfLine. 3677 } 3678 3679 bool Lexer::Lex(Token &Result) { 3680 assert(!isDependencyDirectivesLexer()); 3681 3682 // Start a new token. 3683 Result.startToken(); 3684 3685 // Set up misc whitespace flags for LexTokenInternal. 
3686 if (IsAtStartOfLine) { 3687 Result.setFlag(Token::StartOfLine); 3688 IsAtStartOfLine = false; 3689 } 3690 3691 if (HasLeadingSpace) { 3692 Result.setFlag(Token::LeadingSpace); 3693 HasLeadingSpace = false; 3694 } 3695 3696 if (HasLeadingEmptyMacro) { 3697 Result.setFlag(Token::LeadingEmptyMacro); 3698 HasLeadingEmptyMacro = false; 3699 } 3700 3701 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 3702 IsAtPhysicalStartOfLine = false; 3703 bool isRawLex = isLexingRawMode(); 3704 (void) isRawLex; 3705 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 3706 // (After the LexTokenInternal call, the lexer might be destroyed.) 3707 assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 3708 return returnedToken; 3709 } 3710 3711 /// LexTokenInternal - This implements a simple C family lexer. It is an 3712 /// extremely performance critical piece of code. This assumes that the buffer 3713 /// has a null character at the end of the file. This returns a preprocessing 3714 /// token, not a normal token, as such, it is an internal interface. It assumes 3715 /// that the Flags of result have been cleared before calling this. 3716 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 3717 LexStart: 3718 assert(!Result.needsCleaning() && "Result needs cleaning"); 3719 assert(!Result.hasPtrData() && "Result has not been reset"); 3720 3721 // CurPtr - Cache BufferPtr in an automatic variable. 3722 const char *CurPtr = BufferPtr; 3723 3724 // Small amounts of horizontal whitespace is very common between tokens. 3725 if (isHorizontalWhitespace(*CurPtr)) { 3726 do { 3727 ++CurPtr; 3728 } while (isHorizontalWhitespace(*CurPtr)); 3729 3730 // If we are keeping whitespace and other tokens, just return what we just 3731 // skipped. The next lexer invocation will return the token after the 3732 // whitespace. 
3733 if (isKeepWhitespaceMode()) { 3734 FormTokenWithChars(Result, CurPtr, tok::unknown); 3735 // FIXME: The next token will not have LeadingSpace set. 3736 return true; 3737 } 3738 3739 BufferPtr = CurPtr; 3740 Result.setFlag(Token::LeadingSpace); 3741 } 3742 3743 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 3744 3745 // Read a character, advancing over it. 3746 char Char = getAndAdvanceChar(CurPtr, Result); 3747 tok::TokenKind Kind; 3748 3749 if (!isVerticalWhitespace(Char)) 3750 NewLinePtr = nullptr; 3751 3752 switch (Char) { 3753 case 0: // Null. 3754 // Found end of file? 3755 if (CurPtr-1 == BufferEnd) 3756 return LexEndOfFile(Result, CurPtr-1); 3757 3758 // Check if we are performing code completion. 3759 if (isCodeCompletionPoint(CurPtr-1)) { 3760 // Return the code-completion token. 3761 Result.startToken(); 3762 FormTokenWithChars(Result, CurPtr, tok::code_completion); 3763 return true; 3764 } 3765 3766 if (!isLexingRawMode()) 3767 Diag(CurPtr-1, diag::null_in_file); 3768 Result.setFlag(Token::LeadingSpace); 3769 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3770 return true; // KeepWhitespaceMode 3771 3772 // We know the lexer hasn't changed, so just try again with this lexer. 3773 // (We manually eliminate the tail call to avoid recursion.) 3774 goto LexNextToken; 3775 3776 case 26: // DOS & CP/M EOF: "^Z". 3777 // If we're in Microsoft extensions mode, treat this as end of file. 3778 if (LangOpts.MicrosoftExt) { 3779 if (!isLexingRawMode()) 3780 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); 3781 return LexEndOfFile(Result, CurPtr-1); 3782 } 3783 3784 // If Microsoft extensions are disabled, this is just random garbage. 
3785 Kind = tok::unknown; 3786 break; 3787 3788 case '\r': 3789 if (CurPtr[0] == '\n') 3790 (void)getAndAdvanceChar(CurPtr, Result); 3791 [[fallthrough]]; 3792 case '\n': 3793 // If we are inside a preprocessor directive and we see the end of line, 3794 // we know we are done with the directive, so return an EOD token. 3795 if (ParsingPreprocessorDirective) { 3796 // Done parsing the "line". 3797 ParsingPreprocessorDirective = false; 3798 3799 // Restore comment saving mode, in case it was disabled for directive. 3800 if (PP) 3801 resetExtendedTokenMode(); 3802 3803 // Since we consumed a newline, we are back at the start of a line. 3804 IsAtStartOfLine = true; 3805 IsAtPhysicalStartOfLine = true; 3806 NewLinePtr = CurPtr - 1; 3807 3808 Kind = tok::eod; 3809 break; 3810 } 3811 3812 // No leading whitespace seen so far. 3813 Result.clearFlag(Token::LeadingSpace); 3814 3815 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3816 return true; // KeepWhitespaceMode 3817 3818 // We only saw whitespace, so just try again with this lexer. 3819 // (We manually eliminate the tail call to avoid recursion.) 3820 goto LexNextToken; 3821 case ' ': 3822 case '\t': 3823 case '\f': 3824 case '\v': 3825 SkipHorizontalWhitespace: 3826 Result.setFlag(Token::LeadingSpace); 3827 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3828 return true; // KeepWhitespaceMode 3829 3830 SkipIgnoredUnits: 3831 CurPtr = BufferPtr; 3832 3833 // If the next token is obviously a // or /* */ comment, skip it efficiently 3834 // too (without going through the big switch stmt). 3835 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 3836 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 3837 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 3838 return true; // There is a token to return. 
3839 goto SkipIgnoredUnits; 3840 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 3841 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 3842 return true; // There is a token to return. 3843 goto SkipIgnoredUnits; 3844 } else if (isHorizontalWhitespace(*CurPtr)) { 3845 goto SkipHorizontalWhitespace; 3846 } 3847 // We only saw whitespace, so just try again with this lexer. 3848 // (We manually eliminate the tail call to avoid recursion.) 3849 goto LexNextToken; 3850 3851 // C99 6.4.4.1: Integer Constants. 3852 // C99 6.4.4.2: Floating Constants. 3853 case '0': case '1': case '2': case '3': case '4': 3854 case '5': case '6': case '7': case '8': case '9': 3855 // Notify MIOpt that we read a non-whitespace/non-comment token. 3856 MIOpt.ReadToken(); 3857 return LexNumericConstant(Result, CurPtr); 3858 3859 // Identifier (e.g., uber), or 3860 // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or 3861 // UTF-8 or UTF-16 string literal (C11/C++11). 3862 case 'u': 3863 // Notify MIOpt that we read a non-whitespace/non-comment token. 
3864 MIOpt.ReadToken(); 3865 3866 if (LangOpts.CPlusPlus11 || LangOpts.C11) { 3867 Char = getCharAndSize(CurPtr, SizeTmp); 3868 3869 // UTF-16 string literal 3870 if (Char == '"') 3871 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3872 tok::utf16_string_literal); 3873 3874 // UTF-16 character constant 3875 if (Char == '\'') 3876 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3877 tok::utf16_char_constant); 3878 3879 // UTF-16 raw string literal 3880 if (Char == 'R' && LangOpts.RawStringLiterals && 3881 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3882 return LexRawStringLiteral(Result, 3883 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3884 SizeTmp2, Result), 3885 tok::utf16_string_literal); 3886 3887 if (Char == '8') { 3888 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 3889 3890 // UTF-8 string literal 3891 if (Char2 == '"') 3892 return LexStringLiteral(Result, 3893 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3894 SizeTmp2, Result), 3895 tok::utf8_string_literal); 3896 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23)) 3897 return LexCharConstant( 3898 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3899 SizeTmp2, Result), 3900 tok::utf8_char_constant); 3901 3902 if (Char2 == 'R' && LangOpts.RawStringLiterals) { 3903 unsigned SizeTmp3; 3904 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3905 // UTF-8 raw string literal 3906 if (Char3 == '"') { 3907 return LexRawStringLiteral(Result, 3908 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3909 SizeTmp2, Result), 3910 SizeTmp3, Result), 3911 tok::utf8_string_literal); 3912 } 3913 } 3914 } 3915 } 3916 3917 // treat u like the start of an identifier. 3918 return LexIdentifierContinue(Result, CurPtr); 3919 3920 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal 3921 // Notify MIOpt that we read a non-whitespace/non-comment token. 
3922 MIOpt.ReadToken(); 3923 3924 if (LangOpts.CPlusPlus11 || LangOpts.C11) { 3925 Char = getCharAndSize(CurPtr, SizeTmp); 3926 3927 // UTF-32 string literal 3928 if (Char == '"') 3929 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3930 tok::utf32_string_literal); 3931 3932 // UTF-32 character constant 3933 if (Char == '\'') 3934 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3935 tok::utf32_char_constant); 3936 3937 // UTF-32 raw string literal 3938 if (Char == 'R' && LangOpts.RawStringLiterals && 3939 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3940 return LexRawStringLiteral(Result, 3941 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3942 SizeTmp2, Result), 3943 tok::utf32_string_literal); 3944 } 3945 3946 // treat U like the start of an identifier. 3947 return LexIdentifierContinue(Result, CurPtr); 3948 3949 case 'R': // Identifier or C++0x raw string literal 3950 // Notify MIOpt that we read a non-whitespace/non-comment token. 3951 MIOpt.ReadToken(); 3952 3953 if (LangOpts.RawStringLiterals) { 3954 Char = getCharAndSize(CurPtr, SizeTmp); 3955 3956 if (Char == '"') 3957 return LexRawStringLiteral(Result, 3958 ConsumeChar(CurPtr, SizeTmp, Result), 3959 tok::string_literal); 3960 } 3961 3962 // treat R like the start of an identifier. 3963 return LexIdentifierContinue(Result, CurPtr); 3964 3965 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 3966 // Notify MIOpt that we read a non-whitespace/non-comment token. 3967 MIOpt.ReadToken(); 3968 Char = getCharAndSize(CurPtr, SizeTmp); 3969 3970 // Wide string literal. 3971 if (Char == '"') 3972 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3973 tok::wide_string_literal); 3974 3975 // Wide raw string literal. 
3976 if (LangOpts.RawStringLiterals && Char == 'R' && 3977 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3978 return LexRawStringLiteral(Result, 3979 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3980 SizeTmp2, Result), 3981 tok::wide_string_literal); 3982 3983 // Wide character constant. 3984 if (Char == '\'') 3985 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3986 tok::wide_char_constant); 3987 // FALL THROUGH, treating L like the start of an identifier. 3988 [[fallthrough]]; 3989 3990 // C99 6.4.2: Identifiers. 3991 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 3992 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 3993 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 3994 case 'V': case 'W': case 'X': case 'Y': case 'Z': 3995 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 3996 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 3997 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 3998 case 'v': case 'w': case 'x': case 'y': case 'z': 3999 case '_': 4000 // Notify MIOpt that we read a non-whitespace/non-comment token. 4001 MIOpt.ReadToken(); 4002 return LexIdentifierContinue(Result, CurPtr); 4003 4004 case '$': // $ in identifiers. 4005 if (LangOpts.DollarIdents) { 4006 if (!isLexingRawMode()) 4007 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 4008 // Notify MIOpt that we read a non-whitespace/non-comment token. 4009 MIOpt.ReadToken(); 4010 return LexIdentifierContinue(Result, CurPtr); 4011 } 4012 4013 Kind = tok::unknown; 4014 break; 4015 4016 // C99 6.4.4: Character Constants. 4017 case '\'': 4018 // Notify MIOpt that we read a non-whitespace/non-comment token. 4019 MIOpt.ReadToken(); 4020 return LexCharConstant(Result, CurPtr, tok::char_constant); 4021 4022 // C99 6.4.5: String Literals. 4023 case '"': 4024 // Notify MIOpt that we read a non-whitespace/non-comment token. 
4025 MIOpt.ReadToken(); 4026 return LexStringLiteral(Result, CurPtr, 4027 ParsingFilename ? tok::header_name 4028 : tok::string_literal); 4029 4030 // C99 6.4.6: Punctuators. 4031 case '?': 4032 Kind = tok::question; 4033 break; 4034 case '[': 4035 Kind = tok::l_square; 4036 break; 4037 case ']': 4038 Kind = tok::r_square; 4039 break; 4040 case '(': 4041 Kind = tok::l_paren; 4042 break; 4043 case ')': 4044 Kind = tok::r_paren; 4045 break; 4046 case '{': 4047 Kind = tok::l_brace; 4048 break; 4049 case '}': 4050 Kind = tok::r_brace; 4051 break; 4052 case '.': 4053 Char = getCharAndSize(CurPtr, SizeTmp); 4054 if (Char >= '0' && Char <= '9') { 4055 // Notify MIOpt that we read a non-whitespace/non-comment token. 4056 MIOpt.ReadToken(); 4057 4058 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 4059 } else if (LangOpts.CPlusPlus && Char == '*') { 4060 Kind = tok::periodstar; 4061 CurPtr += SizeTmp; 4062 } else if (Char == '.' && 4063 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 4064 Kind = tok::ellipsis; 4065 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4066 SizeTmp2, Result); 4067 } else { 4068 Kind = tok::period; 4069 } 4070 break; 4071 case '&': 4072 Char = getCharAndSize(CurPtr, SizeTmp); 4073 if (Char == '&') { 4074 Kind = tok::ampamp; 4075 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4076 } else if (Char == '=') { 4077 Kind = tok::ampequal; 4078 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4079 } else { 4080 Kind = tok::amp; 4081 } 4082 break; 4083 case '*': 4084 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 4085 Kind = tok::starequal; 4086 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4087 } else { 4088 Kind = tok::star; 4089 } 4090 break; 4091 case '+': 4092 Char = getCharAndSize(CurPtr, SizeTmp); 4093 if (Char == '+') { 4094 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4095 Kind = tok::plusplus; 4096 } else if (Char == '=') { 4097 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4098 Kind = tok::plusequal; 4099 } 
else { 4100 Kind = tok::plus; 4101 } 4102 break; 4103 case '-': 4104 Char = getCharAndSize(CurPtr, SizeTmp); 4105 if (Char == '-') { // -- 4106 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4107 Kind = tok::minusminus; 4108 } else if (Char == '>' && LangOpts.CPlusPlus && 4109 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 4110 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4111 SizeTmp2, Result); 4112 Kind = tok::arrowstar; 4113 } else if (Char == '>') { // -> 4114 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4115 Kind = tok::arrow; 4116 } else if (Char == '=') { // -= 4117 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4118 Kind = tok::minusequal; 4119 } else { 4120 Kind = tok::minus; 4121 } 4122 break; 4123 case '~': 4124 Kind = tok::tilde; 4125 break; 4126 case '!': 4127 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 4128 Kind = tok::exclaimequal; 4129 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4130 } else { 4131 Kind = tok::exclaim; 4132 } 4133 break; 4134 case '/': 4135 // 6.4.9: Comments 4136 Char = getCharAndSize(CurPtr, SizeTmp); 4137 if (Char == '/') { // Line comment. 4138 // Even if Line comments are disabled (e.g. in C89 mode), we generally 4139 // want to lex this as a comment. There is one problem with this though, 4140 // that in one particular corner case, this can change the behavior of the 4141 // resultant program. For example, In "foo //**/ bar", C89 would lex 4142 // this as "foo / bar" and languages with Line comments would lex it as 4143 // "foo". Check to see if the character after the second slash is a '*'. 4144 // If so, we will lex that as a "/" instead of the start of a comment. 4145 // However, we never do this if we are just preprocessing. 
4146 bool TreatAsComment = 4147 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 4148 if (!TreatAsComment) 4149 if (!(PP && PP->isPreprocessedOutput())) 4150 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 4151 4152 if (TreatAsComment) { 4153 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 4154 TokAtPhysicalStartOfLine)) 4155 return true; // There is a token to return. 4156 4157 // It is common for the tokens immediately after a // comment to be 4158 // whitespace (indentation for the next line). Instead of going through 4159 // the big switch, handle it efficiently now. 4160 goto SkipIgnoredUnits; 4161 } 4162 } 4163 4164 if (Char == '*') { // /**/ comment. 4165 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 4166 TokAtPhysicalStartOfLine)) 4167 return true; // There is a token to return. 4168 4169 // We only saw whitespace, so just try again with this lexer. 4170 // (We manually eliminate the tail call to avoid recursion.) 
4171 goto LexNextToken; 4172 } 4173 4174 if (Char == '=') { 4175 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4176 Kind = tok::slashequal; 4177 } else { 4178 Kind = tok::slash; 4179 } 4180 break; 4181 case '%': 4182 Char = getCharAndSize(CurPtr, SizeTmp); 4183 if (Char == '=') { 4184 Kind = tok::percentequal; 4185 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4186 } else if (LangOpts.Digraphs && Char == '>') { 4187 Kind = tok::r_brace; // '%>' -> '}' 4188 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4189 } else if (LangOpts.Digraphs && Char == ':') { 4190 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4191 Char = getCharAndSize(CurPtr, SizeTmp); 4192 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 4193 Kind = tok::hashhash; // '%:%:' -> '##' 4194 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4195 SizeTmp2, Result); 4196 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 4197 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4198 if (!isLexingRawMode()) 4199 Diag(BufferPtr, diag::ext_charize_microsoft); 4200 Kind = tok::hashat; 4201 } else { // '%:' -> '#' 4202 // We parsed a # character. If this occurs at the start of the line, 4203 // it's actually the start of a preprocessing directive. Callback to 4204 // the preprocessor to handle it. 4205 // TODO: -fpreprocessed mode?? 
4206 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 4207 goto HandleDirective; 4208 4209 Kind = tok::hash; 4210 } 4211 } else { 4212 Kind = tok::percent; 4213 } 4214 break; 4215 case '<': 4216 Char = getCharAndSize(CurPtr, SizeTmp); 4217 if (ParsingFilename) { 4218 return LexAngledStringLiteral(Result, CurPtr); 4219 } else if (Char == '<') { 4220 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 4221 if (After == '=') { 4222 Kind = tok::lesslessequal; 4223 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4224 SizeTmp2, Result); 4225 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 4226 // If this is actually a '<<<<<<<' version control conflict marker, 4227 // recognize it as such and recover nicely. 4228 goto LexNextToken; 4229 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 4230 // If this is '<<<<' and we're in a Perforce-style conflict marker, 4231 // ignore it. 4232 goto LexNextToken; 4233 } else if (LangOpts.CUDA && After == '<') { 4234 Kind = tok::lesslessless; 4235 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4236 SizeTmp2, Result); 4237 } else { 4238 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4239 Kind = tok::lessless; 4240 } 4241 } else if (Char == '=') { 4242 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 4243 if (After == '>') { 4244 if (LangOpts.CPlusPlus20) { 4245 if (!isLexingRawMode()) 4246 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 4247 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4248 SizeTmp2, Result); 4249 Kind = tok::spaceship; 4250 break; 4251 } 4252 // Suggest adding a space between the '<=' and the '>' to avoid a 4253 // change in semantics if this turns up in C++ <=17 mode. 
4254 if (LangOpts.CPlusPlus && !isLexingRawMode()) { 4255 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship) 4256 << FixItHint::CreateInsertion( 4257 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 4258 } 4259 } 4260 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4261 Kind = tok::lessequal; 4262 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 4263 if (LangOpts.CPlusPlus11 && 4264 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 4265 // C++0x [lex.pptoken]p3: 4266 // Otherwise, if the next three characters are <:: and the subsequent 4267 // character is neither : nor >, the < is treated as a preprocessor 4268 // token by itself and not as the first character of the alternative 4269 // token <:. 4270 unsigned SizeTmp3; 4271 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 4272 if (After != ':' && After != '>') { 4273 Kind = tok::less; 4274 if (!isLexingRawMode()) 4275 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 4276 break; 4277 } 4278 } 4279 4280 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4281 Kind = tok::l_square; 4282 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 4283 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4284 Kind = tok::l_brace; 4285 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 4286 lexEditorPlaceholder(Result, CurPtr)) { 4287 return true; 4288 } else { 4289 Kind = tok::less; 4290 } 4291 break; 4292 case '>': 4293 Char = getCharAndSize(CurPtr, SizeTmp); 4294 if (Char == '=') { 4295 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4296 Kind = tok::greaterequal; 4297 } else if (Char == '>') { 4298 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 4299 if (After == '=') { 4300 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4301 SizeTmp2, Result); 4302 Kind = tok::greatergreaterequal; 4303 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 4304 // If this is actually a '>>>>' conflict marker, recognize it as such 4305 // and recover 
nicely. 4306 goto LexNextToken; 4307 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 4308 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 4309 goto LexNextToken; 4310 } else if (LangOpts.CUDA && After == '>') { 4311 Kind = tok::greatergreatergreater; 4312 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4313 SizeTmp2, Result); 4314 } else { 4315 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4316 Kind = tok::greatergreater; 4317 } 4318 } else { 4319 Kind = tok::greater; 4320 } 4321 break; 4322 case '^': 4323 Char = getCharAndSize(CurPtr, SizeTmp); 4324 if (Char == '=') { 4325 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4326 Kind = tok::caretequal; 4327 } else { 4328 if (LangOpts.OpenCL && Char == '^') 4329 Diag(CurPtr, diag::err_opencl_logical_exclusive_or); 4330 Kind = tok::caret; 4331 } 4332 break; 4333 case '|': 4334 Char = getCharAndSize(CurPtr, SizeTmp); 4335 if (Char == '=') { 4336 Kind = tok::pipeequal; 4337 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4338 } else if (Char == '|') { 4339 // If this is '|||||||' and we're in a conflict marker, ignore it. 4340 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 4341 goto LexNextToken; 4342 Kind = tok::pipepipe; 4343 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4344 } else { 4345 Kind = tok::pipe; 4346 } 4347 break; 4348 case ':': 4349 Char = getCharAndSize(CurPtr, SizeTmp); 4350 if (LangOpts.Digraphs && Char == '>') { 4351 Kind = tok::r_square; // ':>' -> ']' 4352 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4353 } else if (Char == ':') { 4354 Kind = tok::coloncolon; 4355 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4356 } else { 4357 Kind = tok::colon; 4358 } 4359 break; 4360 case ';': 4361 Kind = tok::semi; 4362 break; 4363 case '=': 4364 Char = getCharAndSize(CurPtr, SizeTmp); 4365 if (Char == '=') { 4366 // If this is '====' and we're in a conflict marker, ignore it. 
4367 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 4368 goto LexNextToken; 4369 4370 Kind = tok::equalequal; 4371 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4372 } else { 4373 Kind = tok::equal; 4374 } 4375 break; 4376 case ',': 4377 Kind = tok::comma; 4378 break; 4379 case '#': 4380 Char = getCharAndSize(CurPtr, SizeTmp); 4381 if (Char == '#') { 4382 Kind = tok::hashhash; 4383 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4384 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 4385 Kind = tok::hashat; 4386 if (!isLexingRawMode()) 4387 Diag(BufferPtr, diag::ext_charize_microsoft); 4388 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4389 } else { 4390 // We parsed a # character. If this occurs at the start of the line, 4391 // it's actually the start of a preprocessing directive. Callback to 4392 // the preprocessor to handle it. 4393 // TODO: -fpreprocessed mode?? 4394 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 4395 goto HandleDirective; 4396 4397 Kind = tok::hash; 4398 } 4399 break; 4400 4401 case '@': 4402 // Objective C support. 4403 if (CurPtr[-1] == '@' && LangOpts.ObjC) 4404 Kind = tok::at; 4405 else 4406 Kind = tok::unknown; 4407 break; 4408 4409 // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 4410 case '\\': 4411 if (!LangOpts.AsmPreprocessor) { 4412 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 4413 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 4414 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 4415 return true; // KeepWhitespaceMode 4416 4417 // We only saw whitespace, so just try again with this lexer. 4418 // (We manually eliminate the tail call to avoid recursion.) 
4419 goto LexNextToken; 4420 } 4421 4422 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 4423 } 4424 } 4425 4426 Kind = tok::unknown; 4427 break; 4428 4429 default: { 4430 if (isASCII(Char)) { 4431 Kind = tok::unknown; 4432 break; 4433 } 4434 4435 llvm::UTF32 CodePoint; 4436 4437 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 4438 // an escaped newline. 4439 --CurPtr; 4440 llvm::ConversionResult Status = 4441 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 4442 (const llvm::UTF8 *)BufferEnd, 4443 &CodePoint, 4444 llvm::strictConversion); 4445 if (Status == llvm::conversionOK) { 4446 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 4447 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 4448 return true; // KeepWhitespaceMode 4449 4450 // We only saw whitespace, so just try again with this lexer. 4451 // (We manually eliminate the tail call to avoid recursion.) 4452 goto LexNextToken; 4453 } 4454 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 4455 } 4456 4457 if (isLexingRawMode() || ParsingPreprocessorDirective || 4458 PP->isPreprocessedOutput()) { 4459 ++CurPtr; 4460 Kind = tok::unknown; 4461 break; 4462 } 4463 4464 // Non-ASCII characters tend to creep into source code unintentionally. 4465 // Instead of letting the parser complain about the unknown token, 4466 // just diagnose the invalid UTF-8, then drop the character. 4467 Diag(CurPtr, diag::err_invalid_utf8); 4468 4469 BufferPtr = CurPtr+1; 4470 // We're pretending the character didn't exist, so just try again with 4471 // this lexer. 4472 // (We manually eliminate the tail call to avoid recursion.) 4473 goto LexNextToken; 4474 } 4475 } 4476 4477 // Notify MIOpt that we read a non-whitespace/non-comment token. 4478 MIOpt.ReadToken(); 4479 4480 // Update the location of token as well as BufferPtr. 
4481 FormTokenWithChars(Result, CurPtr, Kind); 4482 return true; 4483 4484 HandleDirective: 4485 // We parsed a # character and it's the start of a preprocessing directive. 4486 4487 FormTokenWithChars(Result, CurPtr, tok::hash); 4488 PP->HandleDirective(Result); 4489 4490 if (PP->hadModuleLoaderFatalFailure()) 4491 // With a fatal failure in the module loader, we abort parsing. 4492 return true; 4493 4494 // We parsed the directive; lex a token with the new state. 4495 return false; 4496 4497 LexNextToken: 4498 Result.clearFlag(Token::NeedsCleaning); 4499 goto LexStart; 4500 } 4501 4502 const char *Lexer::convertDependencyDirectiveToken( 4503 const dependency_directives_scan::Token &DDTok, Token &Result) { 4504 const char *TokPtr = BufferStart + DDTok.Offset; 4505 Result.startToken(); 4506 Result.setLocation(getSourceLocation(TokPtr)); 4507 Result.setKind(DDTok.Kind); 4508 Result.setFlag((Token::TokenFlags)DDTok.Flags); 4509 Result.setLength(DDTok.Length); 4510 BufferPtr = TokPtr + DDTok.Length; 4511 return TokPtr; 4512 } 4513 4514 bool Lexer::LexDependencyDirectiveToken(Token &Result) { 4515 assert(isDependencyDirectivesLexer()); 4516 4517 using namespace dependency_directives_scan; 4518 4519 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) { 4520 if (DepDirectives.front().Kind == pp_eof) 4521 return LexEndOfFile(Result, BufferEnd); 4522 if (DepDirectives.front().Kind == tokens_present_before_eof) 4523 MIOpt.ReadToken(); 4524 NextDepDirectiveTokenIndex = 0; 4525 DepDirectives = DepDirectives.drop_front(); 4526 } 4527 4528 const dependency_directives_scan::Token &DDTok = 4529 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++]; 4530 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) { 4531 // Read something other than a preprocessor directive hash. 
4532 MIOpt.ReadToken(); 4533 } 4534 4535 if (ParsingFilename && DDTok.is(tok::less)) { 4536 BufferPtr = BufferStart + DDTok.Offset; 4537 LexAngledStringLiteral(Result, BufferPtr + 1); 4538 if (Result.isNot(tok::header_name)) 4539 return true; 4540 // Advance the index of lexed tokens. 4541 while (true) { 4542 const dependency_directives_scan::Token &NextTok = 4543 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex]; 4544 if (BufferStart + NextTok.Offset >= BufferPtr) 4545 break; 4546 ++NextDepDirectiveTokenIndex; 4547 } 4548 return true; 4549 } 4550 4551 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result); 4552 4553 if (Result.is(tok::hash) && Result.isAtStartOfLine()) { 4554 PP->HandleDirective(Result); 4555 return false; 4556 } 4557 if (Result.is(tok::raw_identifier)) { 4558 Result.setRawIdentifierData(TokPtr); 4559 if (!isLexingRawMode()) { 4560 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 4561 if (II->isHandleIdentifierCase()) 4562 return PP->HandleIdentifier(Result); 4563 } 4564 return true; 4565 } 4566 if (Result.isLiteral()) { 4567 Result.setLiteralData(TokPtr); 4568 return true; 4569 } 4570 if (Result.is(tok::colon)) { 4571 // Convert consecutive colons to 'tok::coloncolon'. 
4572 if (*BufferPtr == ':') { 4573 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 4574 tok::colon)); 4575 ++NextDepDirectiveTokenIndex; 4576 Result.setKind(tok::coloncolon); 4577 } 4578 return true; 4579 } 4580 if (Result.is(tok::eod)) 4581 ParsingPreprocessorDirective = false; 4582 4583 return true; 4584 } 4585 4586 bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) { 4587 assert(isDependencyDirectivesLexer()); 4588 4589 using namespace dependency_directives_scan; 4590 4591 bool Stop = false; 4592 unsigned NestedIfs = 0; 4593 do { 4594 DepDirectives = DepDirectives.drop_front(); 4595 switch (DepDirectives.front().Kind) { 4596 case pp_none: 4597 llvm_unreachable("unexpected 'pp_none'"); 4598 case pp_include: 4599 case pp___include_macros: 4600 case pp_define: 4601 case pp_undef: 4602 case pp_import: 4603 case pp_pragma_import: 4604 case pp_pragma_once: 4605 case pp_pragma_push_macro: 4606 case pp_pragma_pop_macro: 4607 case pp_pragma_include_alias: 4608 case pp_pragma_system_header: 4609 case pp_include_next: 4610 case decl_at_import: 4611 case cxx_module_decl: 4612 case cxx_import_decl: 4613 case cxx_export_module_decl: 4614 case cxx_export_import_decl: 4615 case tokens_present_before_eof: 4616 break; 4617 case pp_if: 4618 case pp_ifdef: 4619 case pp_ifndef: 4620 ++NestedIfs; 4621 break; 4622 case pp_elif: 4623 case pp_elifdef: 4624 case pp_elifndef: 4625 case pp_else: 4626 if (!NestedIfs) { 4627 Stop = true; 4628 } 4629 break; 4630 case pp_endif: 4631 if (!NestedIfs) { 4632 Stop = true; 4633 } else { 4634 --NestedIfs; 4635 } 4636 break; 4637 case pp_eof: 4638 NextDepDirectiveTokenIndex = 0; 4639 return LexEndOfFile(Result, BufferEnd); 4640 } 4641 } while (!Stop); 4642 4643 const dependency_directives_scan::Token &DDTok = 4644 DepDirectives.front().Tokens.front(); 4645 assert(DDTok.is(tok::hash)); 4646 NextDepDirectiveTokenIndex = 1; 4647 4648 convertDependencyDirectiveToken(DDTok, Result); 4649 return false; 4650 } 4651