1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Lexer and Token interfaces. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "clang/Lex/Lexer.h" 14 #include "UnicodeCharSets.h" 15 #include "clang/Basic/CharInfo.h" 16 #include "clang/Basic/Diagnostic.h" 17 #include "clang/Basic/IdentifierTable.h" 18 #include "clang/Basic/LLVM.h" 19 #include "clang/Basic/LangOptions.h" 20 #include "clang/Basic/SourceLocation.h" 21 #include "clang/Basic/SourceManager.h" 22 #include "clang/Basic/TokenKinds.h" 23 #include "clang/Lex/LexDiagnostic.h" 24 #include "clang/Lex/LiteralSupport.h" 25 #include "clang/Lex/MultipleIncludeOpt.h" 26 #include "clang/Lex/Preprocessor.h" 27 #include "clang/Lex/PreprocessorOptions.h" 28 #include "clang/Lex/Token.h" 29 #include "llvm/ADT/STLExtras.h" 30 #include "llvm/ADT/StringExtras.h" 31 #include "llvm/ADT/StringRef.h" 32 #include "llvm/ADT/StringSwitch.h" 33 #include "llvm/Support/Compiler.h" 34 #include "llvm/Support/ConvertUTF.h" 35 #include "llvm/Support/MemoryBufferRef.h" 36 #include "llvm/Support/NativeFormatting.h" 37 #include "llvm/Support/Unicode.h" 38 #include "llvm/Support/UnicodeCharRanges.h" 39 #include <algorithm> 40 #include <cassert> 41 #include <cstddef> 42 #include <cstdint> 43 #include <cstring> 44 #include <optional> 45 #include <string> 46 #include <tuple> 47 #include <utility> 48 49 #ifdef __SSE4_2__ 50 #include <nmmintrin.h> 51 #endif 52 53 using namespace clang; 54 55 //===----------------------------------------------------------------------===// 56 // Token Class Implementation 57 
//===----------------------------------------------------------------------===// 58 59 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 60 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 61 if (isAnnotation()) 62 return false; 63 if (const IdentifierInfo *II = getIdentifierInfo()) 64 return II->getObjCKeywordID() == objcKey; 65 return false; 66 } 67 68 /// getObjCKeywordID - Return the ObjC keyword kind. 69 tok::ObjCKeywordKind Token::getObjCKeywordID() const { 70 if (isAnnotation()) 71 return tok::objc_not_keyword; 72 const IdentifierInfo *specId = getIdentifierInfo(); 73 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 74 } 75 76 /// Determine whether the token kind starts a simple-type-specifier. 77 bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const { 78 switch (getKind()) { 79 case tok::annot_typename: 80 case tok::annot_decltype: 81 case tok::annot_pack_indexing_type: 82 return true; 83 84 case tok::kw_short: 85 case tok::kw_long: 86 case tok::kw___int64: 87 case tok::kw___int128: 88 case tok::kw_signed: 89 case tok::kw_unsigned: 90 case tok::kw_void: 91 case tok::kw_char: 92 case tok::kw_int: 93 case tok::kw_half: 94 case tok::kw_float: 95 case tok::kw_double: 96 case tok::kw___bf16: 97 case tok::kw__Float16: 98 case tok::kw___float128: 99 case tok::kw___ibm128: 100 case tok::kw_wchar_t: 101 case tok::kw_bool: 102 case tok::kw__Bool: 103 case tok::kw__Accum: 104 case tok::kw__Fract: 105 case tok::kw__Sat: 106 #define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait: 107 #include "clang/Basic/TransformTypeTraits.def" 108 case tok::kw___auto_type: 109 case tok::kw_char16_t: 110 case tok::kw_char32_t: 111 case tok::kw_typeof: 112 case tok::kw_decltype: 113 case tok::kw_char8_t: 114 return getIdentifierInfo()->isKeyword(LangOpts); 115 116 default: 117 return false; 118 } 119 } 120 121 //===----------------------------------------------------------------------===// 122 // Lexer 
// Class Implementation
//===----------------------------------------------------------------------===//

void Lexer::anchor() {}

/// Shared initialization for all Lexer constructors: set up the buffer
/// pointers and reset all per-lexer state to its defaults.
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
                           .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
                           .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;
  IsAtPhysicalStartOfLine = true;

  HasLeadingSpace = false;
  HasLeadingEmptyMacro = false;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;

  // No newline seen yet.
  NewLinePtr = nullptr;
}

/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process.
/// This lexer assumes that the associated file buffer and Preprocessor
/// objects will outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
             Preprocessor &PP, bool IsFirstIncludeOfFile)
    : PreprocessorLexer(&PP, FID),
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
      LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
            InputFile.getBufferEnd());

  // Pick up comment/whitespace retention settings from the preprocessor.
  resetExtendedTokenMode();
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
             const SourceManager &SM, const LangOptions &langOpts,
             bool IsFirstIncludeOfFile)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
            FromFile.getBufferStart(), FromFile.getBufferEnd(),
            IsFirstIncludeOfFile) {}

/// Reset whitespace/comment retention from the current preprocessor state.
void Lexer::resetExtendedTokenMode() {
  assert(PP && "Cannot reset token mode without a preprocessor");
  if (LangOpts.TraditionalCPP)
    SetKeepWhitespaceMode(true);
  else
    SetCommentRetentionState(PP->getCommentRetentionState());
}

/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion.  This has a variety of magic semantics that this method
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
///
/// On entrance to this routine, TokStartLoc is a macro location which has a
/// spelling loc that indicates the bytes to be lexed for the token and an
/// expansion location that indicates where all lexed tokens should be
/// "expanded from".
///
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
/// normal lexer that remaps tokens as they fly by.  This would require making
/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
                                 SourceLocation ExpansionLocStart,
                                 SourceLocation ExpansionLocEnd,
                                 unsigned TokLen, Preprocessor &PP) {
  SourceManager &SM = PP.getSourceManager();

  // Create the lexer as if we were going to lex the file normally.
  FileID SpellingFID = SM.getFileID(SpellingLoc);
  llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);

  // Now that the lexer is created, change the start/end locations so that we
  // just lex the subsection of the file that we want.  This is lexing from a
  // scratch buffer.
  const char *StrData = SM.getCharacterData(SpellingLoc);

  L->BufferPtr = StrData;
  L->BufferEnd = StrData + TokLen;
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

  // Set the SourceLocation with the remapping information.  This ensures that
  // GetMappedTokenLoc will remap the tokens as they are lexed.
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
                                     ExpansionLocStart,
                                     ExpansionLocEnd, TokLen);

  // Ensure that the lexer thinks it is inside a directive, so that end \n will
  // return an EOD token.
  L->ParsingPreprocessorDirective = true;

  // This lexer really is for _Pragma.
  L->Is_PragmaLexer = true;
  return L;
}

/// Reposition the lexer at the given byte offset into the buffer.
void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
  this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
  this->IsAtStartOfLine = IsAtStartOfLine;
  assert((BufferStart + Offset) <= BufferEnd);
  BufferPtr = BufferStart + Offset;
}

/// Escape backslashes, quotes and newlines in Str, in place, so the result
/// can be embedded inside a string (or character) literal delimited by Quote.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      // Protect the character with a backslash.
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' with '\\' followed by 'n' (two characters
      // replace two characters, so no insertion is needed).
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace '\n' and '\r' with '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}

std::string Lexer::Stringify(StringRef Str, bool Charify) {
  std::string Result = std::string(Str);
  char Quote = Charify ? '\'' : '"';
  StringifyImpl(Result, Quote);
  return Result;
}

void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }

//===----------------------------------------------------------------------===//
// Token Spelling
//===----------------------------------------------------------------------===//

/// Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer.
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (tok::isStringLiteral(Tok.getKind())) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
      Spelling[Length++] = CharAndSize.Char;
      BufPtr += CharAndSize.Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching closing
      // quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  // Copy the remaining characters, expanding trigraphs and folding escaped
  // newlines as we go.
  while (BufPtr < BufEnd) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
    Spelling[Length++] = CharAndSize.Char;
    BufPtr += CharAndSize.Size;
  }

  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}

/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding.  In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
StringRef Lexer::getSpelling(SourceLocation loc,
                             SmallVectorImpl<char> &buffer,
                             const SourceManager &SM,
                             const LangOptions &options,
                             bool *invalid) {
  // Break down the source location.
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);

  // Try to the load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (invalid) *invalid = true;
    return {};
  }

  const char *tokenBegin = file.data() + locInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
              file.begin(), tokenBegin, file.end());
  Token token;
  lexer.LexFromRawLexer(token);

  unsigned length = token.getLength();

  // Common case:  no need for cleaning.
  if (!token.needsCleaning())
    return StringRef(tokenBegin, length);

  // Hard case, we need to relex the characters into the string.
  buffer.resize(length);
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
  return StringRef(buffer.data(), buffer.size());
}

/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding.  In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                               const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  bool CharDataInvalid = false;
  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
                                                    &CharDataInvalid);
  if (Invalid)
    *Invalid = CharDataInvalid;
  if (CharDataInvalid)
    return {};

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning())
    return std::string(TokStart, TokStart + Tok.getLength());

  // Otherwise relex, shrinking the result to the cleaned length.
  std::string Result;
  Result.resize(Tok.getLength());
  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
  return Result;
}

/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string.  The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long.  The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy).  The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = nullptr;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifier().data();
  else if (!Tok.hasUCN()) {
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
      // Just return the string from the identifier table, which is very quick.
      Buffer = II->getNameStart();
      return II->getLength();
    }
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (!TokStart) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning()) {
    // Point the caller at the existing data instead of copying.
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
}

/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
                                   const SourceManager &SM,
                                   const LangOptions &LangOpts) {
  Token TheTok;
  if (getRawToken(Loc, TheTok, SM, LangOpts))
    return 0;
  return TheTok.getLength();
}

/// Relex the token at the specified location.
/// \returns true if there was a failure, false on success.
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
                        const SourceManager &SM,
                        const LangOptions &LangOpts,
                        bool IgnoreWhiteSpace) {
  // TODO: this could be special cased for common tokens like identifiers, ')',
  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
  // all obviously single-char tokens.  This could use
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  // something.

  // If this comes from a macro expansion, we really do want the macro name, not
  // the token this macro expanded to.
  Loc = SM.getExpansionLoc(Loc);
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return true;

  const char *StrData = Buffer.data() + LocInfo.second;

  // Unless told otherwise, treat a location that points at whitespace (after
  // skipping escaped newlines) as a failure.
  if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
    return true;

  // Create a lexer starting at the beginning of this token.
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
                 Buffer.begin(), StrData, Buffer.end());
  TheLexer.SetCommentRetentionState(true);
  TheLexer.LexFromRawLexer(Result);
  return false;
}

/// Returns the pointer that points to the beginning of line that contains
/// the given offset, or null if the offset is invalid.
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
  const char *BufStart = Buffer.data();
  if (Offset >= Buffer.size())
    return nullptr;

  // Walk backwards until we find a real (non-escaped) newline, or hit the
  // start of the buffer.
  const char *LexStart = BufStart + Offset;
  for (; LexStart != BufStart; --LexStart) {
    if (isVerticalWhitespace(LexStart[0]) &&
        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
      // LexStart should point at first character of logical line.
      ++LexStart;
      break;
    }
  }
  return LexStart;
}

/// Find the start of the token containing \p Loc by relexing from the
/// beginning of its logical line.  Returns \p Loc itself on any failure or if
/// \p Loc points into whitespace.
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer). We'll relex from that point.
  const char *StrData = Buffer.data() + LocInfo.second;
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
  if (!LexStart || LexStart == StrData)
    return Loc;

  // Create a lexer starting at the beginning of this token.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for. If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source location.
  return Loc;
}

SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  // For macro-argument expansions, find the token start at the spelling
  // location and translate the offset back into the expansion.
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  std::pair<FileID, unsigned> BeginFileLocInfo =
      SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}

namespace {

enum PreambleDirectiveKind {
  PDK_Skipped,
  PDK_Unknown
};

} // namespace

PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
                                      const LangOptions &LangOpts,
                                      unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const SourceLocation::UIntTy StartOffset = 1;
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  bool InPreprocessorDirective = false;
  Token TheTok;
  SourceLocation ActiveCommentLoc;

  // If a line limit was requested, translate it into a byte offset limit by
  // scanning for the MaxLines'th newline.
  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is. Since we're lexing raw tokens,
      // we don't have an identifier table available. Instead, just look at
      // the raw identifier to recognize and categorize preprocessor directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_Skipped)
              .Case("ifdef", PDK_Skipped)
              .Case("ifndef", PDK_Skipped)
              .Case("elif", PDK_Skipped)
              .Case("elifdef", PDK_Skipped)
              .Case("elifndef", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_Skipped)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point. Roll back the current token to the location of the '#'.
      TheTok = HashTok;
    } else if (TheTok.isAtStartOfLine() &&
               TheTok.getKind() == tok::raw_identifier &&
               TheTok.getRawIdentifier() == "module" &&
               LangOpts.CPlusPlusModules) {
      // The initial global module fragment introducer "module;" is part of
      // the preamble, which runs up to the module declaration "module foo;".
      Token ModuleTok = TheTok;
      do {
        TheLexer.LexFromRawLexer(TheTok);
      } while (TheTok.getKind() == tok::comment);
      if (TheTok.getKind() != tok::semi) {
        // Not global module fragment, roll back.
        TheTok = ModuleTok;
        break;
      }
      continue;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  SourceLocation End;
  if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                        TheTok.isAtStartOfLine());
}

unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is. This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting.  Skip
  // over the uninteresting characters.  If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
    TokPtr += CharAndSize.Size;
    PhysOffset += CharAndSize.Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token.  For example foo\<newline>bar
  // advanced by 3 should return the location of b, not of \\.  One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr) - TokPtr;

  return PhysOffset;
}

/// Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different that it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isInvalid())
    return {};

  if (Loc.isMacroID()) {
    // Only the exact end of a macro expansion (with Offset 0) maps back to a
    // meaningful file location.
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return {}; // Points inside the macro expansion.
  }

  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;

  return Loc.getLocWithOffset(Len);
}

/// Returns true if the given MacroID location points at the first
/// token of the macro expansion.
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroBegin) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation expansionLoc;
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions, this is the first.
    if (MacroBegin)
      *MacroBegin = expansionLoc;
    return true;
  }

  // Recurse through nested macro expansions.
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
}

/// Returns true if the given MacroID location points at the last
/// token of the macro expansion.
893 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 894 const SourceManager &SM, 895 const LangOptions &LangOpts, 896 SourceLocation *MacroEnd) { 897 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 898 899 SourceLocation spellLoc = SM.getSpellingLoc(loc); 900 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 901 if (tokLen == 0) 902 return false; 903 904 SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 905 SourceLocation expansionLoc; 906 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 907 return false; 908 909 if (expansionLoc.isFileID()) { 910 // No other macro expansions. 911 if (MacroEnd) 912 *MacroEnd = expansionLoc; 913 return true; 914 } 915 916 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 917 } 918 919 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 920 const SourceManager &SM, 921 const LangOptions &LangOpts) { 922 SourceLocation Begin = Range.getBegin(); 923 SourceLocation End = Range.getEnd(); 924 assert(Begin.isFileID() && End.isFileID()); 925 if (Range.isTokenRange()) { 926 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 927 if (End.isInvalid()) 928 return {}; 929 } 930 931 // Break down the source locations. 932 FileID FID; 933 unsigned BeginOffs; 934 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 935 if (FID.isInvalid()) 936 return {}; 937 938 unsigned EndOffs; 939 if (!SM.isInFileID(End, FID, &EndOffs) || 940 BeginOffs > EndOffs) 941 return {}; 942 943 return CharSourceRange::getCharRange(Begin, End); 944 } 945 946 // Assumes that `Loc` is in an expansion. 
static bool isInExpansionTokenRange(const SourceLocation Loc,
                                    const SourceManager &SM) {
  // The caller guarantees Loc is a macro location, so the SLocEntry for its
  // FileID is an expansion entry.
  return SM.getSLocEntry(SM.getFileID(Loc))
      .getExpansion()
      .isExpansionTokenRange();
}

/// Accepts a range and returns a character range with file locations, mapping
/// macro endpoints to the file locations of their expansions where possible.
/// Returns an invalid range if the endpoints cannot be rendered in one file.
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return {};

  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  // A macro begin is only representable if it is the first token of a
  // complete macro expansion; remap it to the expansion start in the file.
  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return {};
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  if (Begin.isFileID() && End.isMacroID()) {
    if (Range.isTokenRange()) {
      if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
        return {};
      // Use the *original* end, not the expanded one in `End`.
      Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
    } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
      return {};
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    // Use the *original* `End`, not the expanded one in `MacroEnd`.
    if (Range.isTokenRange())
      Range.setTokenRange(isInExpansionTokenRange(End, SM));
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  bool Invalid = false;
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                        &Invalid);
  if (Invalid)
    return {};

  // If both endpoints are inside the *same* macro-argument expansion, map
  // them down to the spelling of the argument and retry recursively.
  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                        &Invalid);
    if (Invalid)
      return {};

    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return {};
}

/// Returns a string for the source that the range encompasses, or an empty
/// StringRef (setting *Invalid if provided) when the range cannot be mapped
/// to a contiguous span of a single file buffer.
StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return {};
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}

/// Returns the name of the macro responsible for the immediate expansion of
/// `Loc`, as spelled in the source buffer.
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (true) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

/// Like getImmediateMacroName, but walks past macro-argument expansions first
/// and returns an empty StringRef for token pastes / stringizations (spellings
/// that are not FileID or that live in scratch space).
StringRef Lexer::getImmediateMacroNameForDiagnostics(
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
  // Walk past macro argument expansions.
  while (SM.isMacroArgExpansion(Loc))
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();

  // If the macro's spelling isn't FileID or from scratch space, then it's
  // actually a token paste or stringization (or similar) and not a macro at
  // all.
  SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
  if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
    return {};

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

/// Returns true if `c` may appear after the first character of an ASCII
/// identifier, honoring the dollar-signs-in-identifiers language option.
bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
  return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
}

/// Returns true if the vertical whitespace at `Str` terminates a line ending
/// in a backslash (i.e. it is an escaped newline). `Str[0]` must be vertical
/// whitespace; only horizontal whitespace may sit between the backslash and
/// the newline.
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
  assert(isVerticalWhitespace(Str[0]));
  if (Str - 1 < BufferStart)
    return false;

  // Treat a two-character \r\n or \n\r line terminator as a single unit.
  if ((Str[0] == '\n' && Str[-1] == '\r') ||
      (Str[0] == '\r' && Str[-1] == '\n')) {
    if (Str - 2 < BufferStart)
      return false;
    --Str;
  }
  --Str;

  // Rewind to first non-space character:
  while (Str > BufferStart && isHorizontalWhitespace(*Str))
    --Str;

  return *Str == '\\';
}

/// Returns the leading whitespace (spaces and tabs) of the line containing
/// `Loc`, or an empty StringRef if it cannot be determined (invalid/macro
/// location, unreadable buffer, or no line start found).
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
                                       const SourceManager &SM) {
  if (Loc.isInvalid() || Loc.isMacroID())
    return {};
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return {};
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return {};
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
  if (!Line)
    return {};
  StringRef Rest = Buffer.substr(Line - Buffer.data());
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
  return NumWhitespaceChars == StringRef::npos
             ? ""
             : Rest.take_front(NumWhitespaceChars);
}

//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
1180 //===----------------------------------------------------------------------===// 1181 1182 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 1183 /// lexer buffer was all expanded at a single point, perform the mapping. 1184 /// This is currently only used for _Pragma implementation, so it is the slow 1185 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 1186 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 1187 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 1188 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 1189 SourceLocation FileLoc, 1190 unsigned CharNo, unsigned TokLen) { 1191 assert(FileLoc.isMacroID() && "Must be a macro expansion"); 1192 1193 // Otherwise, we're lexing "mapped tokens". This is used for things like 1194 // _Pragma handling. Combine the expansion location of FileLoc with the 1195 // spelling location. 1196 SourceManager &SM = PP.getSourceManager(); 1197 1198 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 1199 // characters come from spelling(FileLoc)+Offset. 1200 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 1201 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 1202 1203 // Figure out the expansion loc range, which is the range covered by the 1204 // original _Pragma(...) sequence. 1205 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 1206 1207 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 1208 } 1209 1210 /// getSourceLocation - Return a source location identifier for the specified 1211 /// offset in the current file. 
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics. This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
/// return the result character. Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
  char Res = GetTrigraphCharForLetter(*CP);
  if (!Res)
    return Res;

  // Trigraphs disabled: warn (CP-2 points at the leading "??") and leave the
  // characters alone.
  if (!Trigraphs) {
    if (L && !L->isLexingRawMode())
      L->Diag(CP-2, diag::trigraph_ignored);
    return 0;
  }

  if (L && !L->isLexingRawMode())
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
  return Res;
}

/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (true) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      // FIXME: Take LangOpts into account; the language might not
      // support trigraphs.
      AfterEscape = P+3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}

/// Returns the raw token following the token at `Loc` (skipping comments
/// unless `IncludeComments` is set), or std::nullopt when `Loc` is a macro
/// location that is not at the end of its expansion or the buffer cannot be
/// loaded.
std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts,
                                          bool IncludeComments) {
  if (Loc.isMacroID()) {
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return std::nullopt;
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return std::nullopt;

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  lexer.SetCommentRetentionState(IncludeComments);
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  return Tok;
}

/// Returns the raw token that precedes `Loc`, scanning backwards one
/// character at a time (skipping comments unless `IncludeComments` is set),
/// or std::nullopt if no token is found before the start of the file.
std::optional<Token> Lexer::findPreviousToken(SourceLocation Loc,
                                             const SourceManager &SM,
                                             const LangOptions &LangOpts,
                                             bool IncludeComments) {
  const auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Loc));
  while (Loc != StartOfFile) {
    Loc = Loc.getLocWithOffset(-1);
    if (Loc.isInvalid())
      return std::nullopt;

    Loc = GetBeginningOfToken(Loc, SM, LangOpts);
    Token Tok;
    if (getRawToken(Loc, Tok, SM, LangOpts))
      continue; // Not a token, go to prev location.
    if (!Tok.is(tok::comment) || IncludeComments) {
      return Tok;
    }
  }
  return std::nullopt;
}

/// Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
  std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
  if (!Tok || Tok->isNot(TKind))
    return {};
  SourceLocation TokenLoc = Tok->getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }

    // Skip \r, \n, \r\n, or \n\r
    if (C == '\n' || C == '\r') {
      char PrevC = C;
      C = *(++TokenEnd);
      NumWhitespaceChars++;
      if ((C == '\n' || C == '\r') && C != PrevC)
        NumWhitespaceChars++;
    }
  }

  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
}

/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it. This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///
/// This handles the slow/uncommon case of the getCharAndSize method. Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning. If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                    LangOpts.Trigraphs)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
                                                 const LangOptions &LangOpts) {

  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// Routine that indiscriminately sets the offset into the source file.
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
  BufferPtr = BufferStart + Offset;
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  // FIXME: What exactly does the StartOfLine bit mean? There are two
  // possible meanings for the "start" of the line: the first token on the
  // unexpanded line, or the first token on the expanded line.
  IsAtStartOfLine = StartOfLine;
  IsAtPhysicalStartOfLine = StartOfLine;
}

/// Returns true if `Codepoint` is in the table of Unicode whitespace
/// characters used by the lexer.
static bool isUnicodeWhitespace(uint32_t Codepoint) {
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
      UnicodeWhitespaceCharRanges);
  return UnicodeWhitespaceChars.contains(Codepoint);
}

/// Renders a codepoint as an uppercase hexadecimal string (at least four
/// digits) for use in diagnostics.
static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
  llvm::SmallString<5> CharBuf;
  llvm::raw_svector_ostream CharOS(CharBuf);
  llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
  return CharBuf;
}

// To mitigate https://github.com/llvm/llvm-project/issues/54732,
// we allow "Mathematical Notation Characters" in identifiers.
// This is a proposed profile that extends the XID_Start/XID_continue
// with mathematical symbols, superscripts and subscripts digits
// found in some production software.
1566 // https://www.unicode.org/L2/L2022/22230-math-profile.pdf 1567 static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, 1568 bool IsStart, bool &IsExtension) { 1569 static const llvm::sys::UnicodeCharSet MathStartChars( 1570 MathematicalNotationProfileIDStartRanges); 1571 static const llvm::sys::UnicodeCharSet MathContinueChars( 1572 MathematicalNotationProfileIDContinueRanges); 1573 if (MathStartChars.contains(C) || 1574 (!IsStart && MathContinueChars.contains(C))) { 1575 IsExtension = true; 1576 return true; 1577 } 1578 return false; 1579 } 1580 1581 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, 1582 bool &IsExtension) { 1583 if (LangOpts.AsmPreprocessor) { 1584 return false; 1585 } else if (LangOpts.DollarIdents && '$' == C) { 1586 return true; 1587 } else if (LangOpts.CPlusPlus || LangOpts.C23) { 1588 // A non-leading codepoint must have the XID_Continue property. 1589 // XIDContinueRanges doesn't contains characters also in XIDStartRanges, 1590 // so we need to check both tables. 1591 // '_' doesn't have the XID_Continue property but is allowed in C and C++. 
1592 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1593 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges); 1594 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C)) 1595 return true; 1596 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false, 1597 IsExtension); 1598 } else if (LangOpts.C11) { 1599 static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 1600 C11AllowedIDCharRanges); 1601 return C11AllowedIDChars.contains(C); 1602 } else { 1603 static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1604 C99AllowedIDCharRanges); 1605 return C99AllowedIDChars.contains(C); 1606 } 1607 } 1608 1609 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, 1610 bool &IsExtension) { 1611 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint"); 1612 IsExtension = false; 1613 if (LangOpts.AsmPreprocessor) { 1614 return false; 1615 } 1616 if (LangOpts.CPlusPlus || LangOpts.C23) { 1617 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1618 if (XIDStartChars.contains(C)) 1619 return true; 1620 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true, 1621 IsExtension); 1622 } 1623 if (!isAllowedIDChar(C, LangOpts, IsExtension)) 1624 return false; 1625 if (LangOpts.C11) { 1626 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 1627 C11DisallowedInitialIDCharRanges); 1628 return !C11DisallowedInitialIDChars.contains(C); 1629 } 1630 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1631 C99DisallowedInitialIDCharRanges); 1632 return !C99DisallowedInitialIDChars.contains(C); 1633 } 1634 1635 static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, 1636 CharSourceRange Range) { 1637 1638 static const llvm::sys::UnicodeCharSet MathStartChars( 1639 MathematicalNotationProfileIDStartRanges); 1640 static const llvm::sys::UnicodeCharSet MathContinueChars( 1641 
MathematicalNotationProfileIDContinueRanges); 1642 1643 (void)MathStartChars; 1644 (void)MathContinueChars; 1645 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) && 1646 "Unexpected mathematical notation codepoint"); 1647 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation) 1648 << codepointAsHexString(C) << Range; 1649 } 1650 1651 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 1652 const char *End) { 1653 return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 1654 L.getSourceLocation(End)); 1655 } 1656 1657 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 1658 CharSourceRange Range, bool IsFirst) { 1659 // Check C99 compatibility. 1660 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 1661 enum { 1662 CannotAppearInIdentifier = 0, 1663 CannotStartIdentifier 1664 }; 1665 1666 static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1667 C99AllowedIDCharRanges); 1668 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1669 C99DisallowedInitialIDCharRanges); 1670 if (!C99AllowedIDChars.contains(C)) { 1671 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1672 << Range 1673 << CannotAppearInIdentifier; 1674 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 1675 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1676 << Range 1677 << CannotStartIdentifier; 1678 } 1679 } 1680 } 1681 1682 /// After encountering UTF-8 character C and interpreting it as an identifier 1683 /// character, check whether it's a homoglyph for a common non-identifier 1684 /// source character that is unlikely to be an intentional identifier 1685 /// character and warn if so. 1686 static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 1687 CharSourceRange Range) { 1688 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 
1689 struct HomoglyphPair { 1690 uint32_t Character; 1691 char LooksLike; 1692 bool operator<(HomoglyphPair R) const { return Character < R.Character; } 1693 }; 1694 static constexpr HomoglyphPair SortedHomoglyphs[] = { 1695 {U'\u00ad', 0}, // SOFT HYPHEN 1696 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK 1697 {U'\u037e', ';'}, // GREEK QUESTION MARK 1698 {U'\u200b', 0}, // ZERO WIDTH SPACE 1699 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER 1700 {U'\u200d', 0}, // ZERO WIDTH JOINER 1701 {U'\u2060', 0}, // WORD JOINER 1702 {U'\u2061', 0}, // FUNCTION APPLICATION 1703 {U'\u2062', 0}, // INVISIBLE TIMES 1704 {U'\u2063', 0}, // INVISIBLE SEPARATOR 1705 {U'\u2064', 0}, // INVISIBLE PLUS 1706 {U'\u2212', '-'}, // MINUS SIGN 1707 {U'\u2215', '/'}, // DIVISION SLASH 1708 {U'\u2216', '\\'}, // SET MINUS 1709 {U'\u2217', '*'}, // ASTERISK OPERATOR 1710 {U'\u2223', '|'}, // DIVIDES 1711 {U'\u2227', '^'}, // LOGICAL AND 1712 {U'\u2236', ':'}, // RATIO 1713 {U'\u223c', '~'}, // TILDE OPERATOR 1714 {U'\ua789', ':'}, // MODIFIER LETTER COLON 1715 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE 1716 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK 1717 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN 1718 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN 1719 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN 1720 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND 1721 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS 1722 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS 1723 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK 1724 {U'\uff0b', '+'}, // FULLWIDTH ASTERISK 1725 {U'\uff0c', ','}, // FULLWIDTH COMMA 1726 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS 1727 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP 1728 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS 1729 {U'\uff1a', ':'}, // FULLWIDTH COLON 1730 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON 1731 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN 1732 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN 1733 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN 1734 {U'\uff1f', '?'}, // FULLWIDTH QUESTION 
// NOTE(review): continuation of the SortedHomoglyphs table above — the entry
// split at this point is "FULLWIDTH QUESTION MARK".
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}
  };
  // Binary-search the sorted table (excluding the trailing {0, 0} sentinel)
  // for the codepoint C.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    if (Homoglyph->LooksLike) {
      // C is visually confusable with a single ASCII punctuation character.
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << codepointAsHexString(C) << LooksLikeStr;
    } else {
      // A LooksLike of 0 marks zero-width / invisible characters.
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << codepointAsHexString(C);
    }
  }
}

/// Diagnose a non-ASCII codepoint that is not valid in an identifier at the
/// position where it appeared.
///
/// \param CodePoint the codepoint that was lexed (ASCII codepoints are
///        ignored; they are handled by the normal lexing path).
/// \param Range the source range covering the codepoint's spelling.
/// \param IsFirst true if the codepoint would begin the identifier.
static void diagnoseInvalidUnicodeCodepointInIdentifier(
    DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
    CharSourceRange Range, bool IsFirst) {
  if (isASCII(CodePoint))
    return;

  bool IsExtension;
  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
  bool IsIDContinue =
      IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);

  // Valid in this position: nothing to diagnose.
  if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
    return;

  // The codepoint may continue an identifier, just not start one.
  bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

  if (!IsFirst || InvalidOnlyAtStart) {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
        << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
        << FixItHint::CreateRemoval(Range);
  } else {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
        << Range << codepointAsHexString(CodePoint)
        << FixItHint::CreateRemoval(Range);
  }
}

/// Try to consume a UCN (\uXXXX or \UXXXXXXXX) that continues an identifier.
/// On success, advances CurPtr past the UCN, sets Token::HasUCN on Result, and
/// returns true; otherwise returns false and leaves CurPtr unchanged.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  // tryReadUCN returns 0 when no well-formed UCN is present.
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0) {
    return false;
  }
  bool IsExtension = false;
  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CurPtr, UCNPtr),
          /*IsFirst=*/false);

    // We got a unicode codepoint that is neither a space nor a
    // valid identifier part.
    // Carry on as if the codepoint was valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
                                    makeCharRange(*this, CurPtr, UCNPtr));

    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);
  }

  Result.setFlag(Token::HasUCN);
  // Fast path: a plainly-spelled \uXXXX (6 chars) or \UXXXXXXXX (10 chars)
  // with no trigraphs or escaped newlines inside can be skipped in one step;
  // otherwise re-lex each char so cleaning is tracked on the token.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}

/// Try to consume a (multi-byte) UTF-8 encoded codepoint that continues an
/// identifier. On success advances CurPtr past the sequence and returns true;
/// returns false (leaving CurPtr unchanged) on invalid UTF-8, ASCII, or
/// Unicode whitespace.
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
  llvm::UTF32 CodePoint;

  // If a UTF-8 codepoint appears immediately after an escaped new line,
  // CurPtr may point to the splicing \ on the preceding line,
  // so we need to skip it.
  unsigned FirstCodeUnitSize;
  getCharAndSize(CurPtr, FirstCodeUnitSize);
  const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
  const char *UnicodePtr = CharStart;

  // Decode one codepoint; UnicodePtr is advanced past the consumed bytes.
  llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
      (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
      &CodePoint, llvm::strictConversion);
  if (ConvResult != llvm::conversionOK)
    return false;

  bool IsExtension = false;
  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
                       IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;

    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
    // We got a unicode codepoint that is neither a space nor a
    // valid identifier part. Carry on as if the codepoint was
    // valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(
          PP->getDiagnostics(), CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr));
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CharStart, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CharStart, UnicodePtr));
  }

  // Once we successfully parsed some UTF-8,
  // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
  // being lexed, and that warnings about trailing spaces are emitted.
  ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
  CurPtr = UnicodePtr;
  return true;
}

/// Lex a token that begins with a non-ASCII codepoint C (already read;
/// CurPtr points past it). Either continues lexing an identifier, drops an
/// invalid stray codepoint, or forms a tok::unknown token.
bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
                                      const char *CurPtr) {
  bool IsExtension = false;
  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput()) {
      if (IsExtension)
        diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
                                      makeCharRange(*this, BufferPtr, CurPtr));
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                makeCharRange(*this, BufferPtr, CurPtr),
                                /*IsFirst=*/true);
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr));
    }

    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);
  }

  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
      !isUnicodeWhitespace(C)) {
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
    diagnoseInvalidUnicodeCodepointInIdentifier(
        PP->getDiagnostics(), LangOpts, C,
        makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}

/// Scan forward over a run of plain ASCII identifier-continue characters
/// [_A-Za-z0-9] and return a pointer to the first character that is not one
/// (or to within 16 bytes of BufferEnd on the SIMD path, where the scalar
/// tail loop takes over).
static const char *
fastParseASCIIIdentifier(const char *CurPtr,
                         [[maybe_unused]] const char *BufferEnd) {
#ifdef __SSE4_2__
  // Pairs of inclusive byte ranges for _SIDD_CMP_RANGES; the unused tail of
  // the 16-byte vector is zero-filled.
  alignas(16) static constexpr char AsciiIdentifierRange[16] = {
      '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
  };
  constexpr ssize_t BytesPerRegister = 16;

  __m128i AsciiIdentifierRangeV =
      _mm_load_si128((const __m128i *)AsciiIdentifierRange);

  while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
    __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));

    // With negative polarity, the result is the index of the first byte
    // that does NOT fall in any of the ranges above.
    int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
                                _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES |
                                    _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY);
    CurPtr += Consumed;
    if (Consumed == BytesPerRegister)
      continue;
    return CurPtr;
  }
#endif

  // Scalar path (and SIMD tail). Relies on the buffer's NUL terminator to
  // stop, since NUL is not an identifier character.
  unsigned char C = *CurPtr;
  while (isAsciiIdentifierContinue(C))
    C = *++CurPtr;
  return CurPtr;
}

/// Lex the remainder of an identifier whose first character has already been
/// matched. Handles '$', UCNs, and UTF-8 codepoints on the slow path, then
/// forms a raw_identifier token and (outside raw mode) resolves it via the
/// preprocessor's identifier table.
bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched an identifier start.

  while (true) {

    CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);

    unsigned Size;
    // Slow path: handle trigraph, unicode codepoints, UCNs.
    unsigned char C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents)
        break;
      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      continue;
    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      continue;
    // Neither an expected Unicode codepoint nor a UCN.
    break;
  }

  const char *IdStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
  Result.setRawIdentifierData(IdStart);

  // If we are in raw mode, return this identifier raw. There is no need to
  // look up identifier information or attempt to macro expand it.
  if (LexingRawMode)
    return true;

  // Fill in Result.IdentifierInfo and update the token kind,
  // looking up the identifier in the identifier table.
  const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
  // Note that we have to call PP->LookUpIdentifierInfo() even for code
  // completion, it writes IdentifierInfo into Result, and callers rely on it.

  // If the completion point is at the end of an identifier, we want to treat
  // the identifier as incomplete even if it resolves to a macro or a keyword.
  // This allows e.g. 'class^' to complete to 'classifier'.
  if (isCodeCompletionPoint(CurPtr)) {
    // Return the code-completion token.
    Result.setKind(tok::code_completion);
    // Skip the code-completion char and all immediate identifier characters.
    // This ensures we get consistent behavior when completing at any point in
    // an identifier (i.e. at the start, in the middle, at the end). Note that
    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
    // simpler.
    assert(*CurPtr == 0 && "Completion character must be 0");
    ++CurPtr;
    // Note that code completion token is not added as a separate character
    // when the completion point is at the end of the buffer. Therefore, we need
    // to check if the buffer has ended.
    if (CurPtr < BufferEnd) {
      while (isAsciiIdentifierContinue(*CurPtr))
        ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // Finally, now that we know we have an identifier, pass this off to the
  // preprocessor, which may macro expand it or something.
  if (II->isHandleIdentifierCase())
    return PP->HandleIdentifier(Result);

  return true;
}

/// isHexaLiteral - Return true if Start points to a hex constant.
/// in microsoft mode (where this is supposed to be several different tokens).
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
  auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
  char C1 = CharAndSize1.Char;
  if (C1 != '0')
    return false;

  // A hex literal is '0' followed by 'x' or 'X' (possibly via trigraph /
  // escaped-newline spellings, hence getCharAndSizeNoWarn).
  auto CharAndSize2 =
      Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
  char C2 = CharAndSize2.Char;
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed. Return the end of the
/// constant.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    // In HLSL, '.x'/'.r' after a number is a vector swizzle, not part of the
    // constant; back up so it is lexed separately.
    if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
      CurPtr -= Size;
      break;
    }
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!LangOpts.CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
    // Only a separator if an identifier-continue character follows the quote;
    // otherwise the quote starts a character literal.
    auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
    if (isAsciiIdentifierContinue(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.CPlusPlus
                         ? diag::warn_cxx11_compat_digit_separator
                         : diag::warn_c23_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(LangOpts.CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isAsciiIdentifierStart(C)) {
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      Consumed = true;
    else
      return CurPtr;
  }

  if (!LangOpts.CPlusPlus11) {
    // Pre-C++11: a ud-suffix is not recognized; warn and suggest separating
    // it from the literal with a space.
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && LangOpts.CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        auto [Next, NextSize] =
            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
        if (!isAsciiIdentifierContinue(Next)) {
          // End of suffix. Check whether this is on the allowed list.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix =
              StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
            << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  // Consume the rest of the suffix identifier (ASCII, UCN, or UTF-8).
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
    } else
      break;
  }

  return CurPtr;
}

/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
                                       : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated string: diagnose (except in the assembler-with-cpp
      // dialect, which tolerates stray quotes) and recover as tok::unknown.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A NUL may be the code-completion sentinel rather than real content.
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  // Between the initial and final double quote characters of the raw string,
  // any transformations performed in phases 1 and 2 (trigraphs,
  // universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  // Scan the d-char-sequence (at most 16 characters per the grammar).
  unsigned PrefixLen = 0;

  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
    if (!isLexingRawMode() &&
        llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
      // '$', '@', '`' in a raw delimiter are only valid as of C++26.
      const char *Pos = &CurPtr[PrefixLen];
      Diag(Pos, LangOpts.CPlusPlus26
                    ? diag::warn_cxx26_compat_raw_string_literal_character_set
                    : diag::ext_cxx26_raw_string_literal_character_set)
          << StringRef(Pos, 1);
    }
    ++PrefixLen;
  }

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else if (*PrefixEnd == '\n') {
        Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
            << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  // Scan for the matching ')delimiter"'.
  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
            << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character. This is used for #include filenames.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (isVerticalWhitespace(C) ||               // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character. Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      // A NUL may be the code-completion sentinel rather than real content.
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}

/// Set up code completion for a partially-lexed #include filename.
/// PathStart points just past the opening '"' or '<'; CompletionPoint is the
/// completion sentinel within the filename.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
  auto Slash = PartialPath.find_last_of(SlashChars);
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote or closest slash,
  // if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
    if (SlashChars.contains(Next))
      break;
  }

  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}

/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx14_compat_u8_character_literal
                          : diag::warn_c17_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant; recover as tok::unknown.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A NUL may be the code-completion sentinel rather than real content.
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Track the last '\n' seen (and the first, via NewLinePtr) so the
  // empty-line handler below can be told about blank-line runs.
  const char *lastNewLine = nullptr;
  auto setLastNewLine = [&](const char *Ptr) {
    lastNewLine = Ptr;
    if (!NewLinePtr)
      NewLinePtr = Ptr;
  };
  if (SawNewline)
    setLastNewLine(CurPtr - 1);

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    if (*CurPtr == '\n')
      setLastNewLine(CurPtr);
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;

    // Two distinct newlines were seen: the skipped run contained at least one
    // entirely blank line; notify the preprocessor's empty-line handler.
    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
      if (auto *Handler = PP->getEmptylineHandler())
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
                                             getSourceLocation(lastNewLine)));
    }
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
/// Skip a BCPL-style ("//") comment whose introducing characters have already
/// been consumed. On return, BufferPtr points either at the terminating
/// newline (directive mode / end of buffer) or just past it.
///
/// \param Result receives the comment token (keep-comment mode) or has its
///        StartOfLine/LeadingSpace flags updated for the following token.
/// \param CurPtr points just past the second '/' of the comment introducer.
/// \param TokAtPhysicalStartOfLine set to true when the comment's newline was
///        consumed, so the next token begins a physical line.
/// \returns true if a token was formed in Result (comment handlers fired or
///          we are keeping comments), false if lexing should continue.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    if (!isASCII(C)) {
      // Non-ASCII byte: validate the UTF-8 sequence starting here. A zero
      // length means the sequence is ill-formed; advance one byte so we make
      // progress, diagnosing only once per ill-formed run.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    // Note: BufferEnd+1 means getAndAdvanceChar consumed the EOF sentinel.
    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  NewLinePtr = CurPtr++;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
///
/// \param Result receives the comment token; inside a macro definition the
///        "//" text is rewritten as a "/* ... */" block comment so that the
///        spelling survives re-lexing of the macro body.
/// \param CurPtr points at the end of the comment text.
/// \returns true always — a comment token has been produced.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
/// a diagnostic if so.
/// We know that the newline is inside of a block comment.
///
/// Walks backwards from the newline at \p CurPtr over any two-character
/// newline pair, optional horizontal whitespace, and a '\' (or the trigraph
/// "??/"), repeating across consecutive escaped newlines, to decide whether
/// line splicing produces a "*/" that terminates the enclosing block comment.
///
/// \param CurPtr points at the '\n' or '\r' being examined.
/// \param L the lexer, used only for diagnostics.
/// \param Trigraphs whether trigraphs are enabled in this language mode.
/// \returns true if the spliced text ends the comment (diagnostics emitted).
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it.  We allow whitespace
    // between the slash and newline.
    while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
      SpacePos = CurPtr;
      --CurPtr;
    }

    // If we have a slash, this is an escaped newline.
    if (*CurPtr == '\\') {
      --CurPtr;
    } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash.
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    // Anything other than another newline means this was not a comment end.
    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end.  The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
        if (!isASCII(C))
          goto MultiByteUTF8;
        C = *CurPtr++;
      }
      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr + 16 < BufferEnd) {
        // movemask extracts the high (non-ASCII) bit of each byte; any set
        // bit means a potential multi-byte UTF-8 sequence.
        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
        if (LLVM_UNLIKELY(Mask != 0)) {
          goto MultiByteUTF8;
        }
        // look for slashes
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                    Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80};
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr + 16 < BufferEnd) {
        if (LLVM_UNLIKELY(
                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
          goto MultiByteUTF8;
        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
          break;
        }
        CurPtr += 16;
      }

#else
      // Portable fallback: scan 16 bytes at a time for non-ASCII or '/'.
      while (CurPtr + 16 < BufferEnd) {
        bool HasNonASCII = false;
        for (unsigned I = 0; I < 16; ++I)
          HasNonASCII |= !isASCII(CurPtr[I]);

        if (LLVM_UNLIKELY(HasNonASCII))
          goto MultiByteUTF8;

        bool HasSlash = false;
        for (unsigned I = 0; I < 16; ++I)
          HasSlash |= CurPtr[I] == '/';
        if (HasSlash)
          break;
        CurPtr += 16;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder, warning on invalid UTF-8
    // if the corresponding warning is enabled, emitting a diagnostic only once
    // per sequence that cannot be decoded.
    while (C != '/' && C != '\0') {
      if (isASCII(C)) {
        UnicodeDecodingAlreadyDiagnosed = false;
        C = *CurPtr++;
        continue;
      }
    MultiByteUTF8:
      // CurPtr is 1 code unit past C, so to decode
      // the codepoint, we need to read from the previous position.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length - 1;
      }
      C = *CurPtr++;
    }

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
                                                  LangOpts.Trigraphs)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.  This switches the lexer out of directive mode.
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;
  Tmp.startToken();

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (true) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
      [[fallthrough]];
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          PP->CodeCompleteNaturalLanguage();
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done;
      return;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.
Handle this 3142 /// condition, reporting diagnostics and handling other edge cases as required. 3143 /// This returns true if Result contains a token, false if PP.Lex should be 3144 /// called again. 3145 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 3146 // If we hit the end of the file while parsing a preprocessor directive, 3147 // end the preprocessor directive first. The next token returned will 3148 // then be the end of file. 3149 if (ParsingPreprocessorDirective) { 3150 // Done parsing the "line". 3151 ParsingPreprocessorDirective = false; 3152 // Update the location of token as well as BufferPtr. 3153 FormTokenWithChars(Result, CurPtr, tok::eod); 3154 3155 // Restore comment saving mode, in case it was disabled for directive. 3156 if (PP) 3157 resetExtendedTokenMode(); 3158 return true; // Have a token. 3159 } 3160 3161 // If we are in raw mode, return this event as an EOF token. Let the caller 3162 // that put us in raw mode handle the event. 3163 if (isLexingRawMode()) { 3164 Result.startToken(); 3165 BufferPtr = BufferEnd; 3166 FormTokenWithChars(Result, BufferEnd, tok::eof); 3167 return true; 3168 } 3169 3170 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 3171 PP->setRecordedPreambleConditionalStack(ConditionalStack); 3172 // If the preamble cuts off the end of a header guard, consider it guarded. 3173 // The guard is valid for the preamble content itself, and for tools the 3174 // most useful answer is "yes, this file has a header guard". 3175 if (!ConditionalStack.empty()) 3176 MIOpt.ExitTopLevelConditional(); 3177 ConditionalStack.clear(); 3178 } 3179 3180 // Issue diagnostics for unterminated #if and missing newline. 3181 3182 // If we are in a #if directive, emit an error. 
3183 while (!ConditionalStack.empty()) { 3184 if (PP->getCodeCompletionFileLoc() != FileLoc) 3185 PP->Diag(ConditionalStack.back().IfLoc, 3186 diag::err_pp_unterminated_conditional); 3187 ConditionalStack.pop_back(); 3188 } 3189 3190 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 3191 // a pedwarn. 3192 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 3193 DiagnosticsEngine &Diags = PP->getDiagnostics(); 3194 SourceLocation EndLoc = getSourceLocation(BufferEnd); 3195 unsigned DiagID; 3196 3197 if (LangOpts.CPlusPlus11) { 3198 // C++11 [lex.phases] 2.2 p2 3199 // Prefer the C++98 pedantic compatibility warning over the generic, 3200 // non-extension, user-requested "missing newline at EOF" warning. 3201 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 3202 DiagID = diag::warn_cxx98_compat_no_newline_eof; 3203 } else { 3204 DiagID = diag::warn_no_newline_eof; 3205 } 3206 } else { 3207 DiagID = diag::ext_no_newline_eof; 3208 } 3209 3210 Diag(BufferEnd, DiagID) 3211 << FixItHint::CreateInsertion(EndLoc, "\n"); 3212 } 3213 3214 BufferPtr = CurPtr; 3215 3216 // Finally, let the preprocessor handle this. 3217 return PP->HandleEndOfFile(Result, isPragmaLexer()); 3218 } 3219 3220 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 3221 /// the specified lexer will return a tok::l_paren token, 0 if it is something 3222 /// else and 2 if there are no more tokens in the buffer controlled by the 3223 /// lexer. 3224 unsigned Lexer::isNextPPTokenLParen() { 3225 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 3226 3227 if (isDependencyDirectivesLexer()) { 3228 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) 3229 return 2; 3230 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 3231 tok::l_paren); 3232 } 3233 3234 // Switch to 'skipping' mode. 
This will ensure that we can lex a token 3235 // without emitting diagnostics, disables macro expansion, and will cause EOF 3236 // to return an EOF token instead of popping the include stack. 3237 LexingRawMode = true; 3238 3239 // Save state that can be changed while lexing so that we can restore it. 3240 const char *TmpBufferPtr = BufferPtr; 3241 bool inPPDirectiveMode = ParsingPreprocessorDirective; 3242 bool atStartOfLine = IsAtStartOfLine; 3243 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 3244 bool leadingSpace = HasLeadingSpace; 3245 3246 Token Tok; 3247 Lex(Tok); 3248 3249 // Restore state that may have changed. 3250 BufferPtr = TmpBufferPtr; 3251 ParsingPreprocessorDirective = inPPDirectiveMode; 3252 HasLeadingSpace = leadingSpace; 3253 IsAtStartOfLine = atStartOfLine; 3254 IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 3255 3256 // Restore the lexer back to non-skipping mode. 3257 LexingRawMode = false; 3258 3259 if (Tok.is(tok::eof)) 3260 return 2; 3261 return Tok.is(tok::l_paren); 3262 } 3263 3264 /// Find the end of a version control conflict marker. 3265 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 3266 ConflictMarkerKind CMK) { 3267 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 3268 size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 3269 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); 3270 size_t Pos = RestOfBuffer.find(Terminator); 3271 while (Pos != StringRef::npos) { 3272 // Must occur at start of line. 
3273 if (Pos == 0 || 3274 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { 3275 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 3276 Pos = RestOfBuffer.find(Terminator); 3277 continue; 3278 } 3279 return RestOfBuffer.data()+Pos; 3280 } 3281 return nullptr; 3282 } 3283 3284 /// IsStartOfConflictMarker - If the specified pointer is the start of a version 3285 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 3286 /// and recover nicely. This returns true if it is a conflict marker and false 3287 /// if not. 3288 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 3289 // Only a conflict marker if it starts at the beginning of a line. 3290 if (CurPtr != BufferStart && 3291 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 3292 return false; 3293 3294 // Check to see if we have <<<<<<< or >>>>. 3295 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") && 3296 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> ")) 3297 return false; 3298 3299 // If we have a situation where we don't care about conflict markers, ignore 3300 // it. 3301 if (CurrentConflictMarkerState || isLexingRawMode()) 3302 return false; 3303 3304 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 3305 3306 // Check to see if there is an ending marker somewhere in the buffer at the 3307 // start of a line to terminate this conflict marker. 3308 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 3309 // We found a match. We are really in a conflict marker. 3310 // Diagnose this, and ignore to the end of line. 3311 Diag(CurPtr, diag::err_conflict_marker); 3312 CurrentConflictMarkerState = Kind; 3313 3314 // Skip ahead to the end of line. We know this exists because the 3315 // end-of-conflict marker starts with \r or \n. 
3316 while (*CurPtr != '\r' && *CurPtr != '\n') { 3317 assert(CurPtr != BufferEnd && "Didn't find end of line"); 3318 ++CurPtr; 3319 } 3320 BufferPtr = CurPtr; 3321 return true; 3322 } 3323 3324 // No end of conflict marker found. 3325 return false; 3326 } 3327 3328 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 3329 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 3330 /// is the end of a conflict marker. Handle it by ignoring up until the end of 3331 /// the line. This returns true if it is a conflict marker and false if not. 3332 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 3333 // Only a conflict marker if it starts at the beginning of a line. 3334 if (CurPtr != BufferStart && 3335 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 3336 return false; 3337 3338 // If we have a situation where we don't care about conflict markers, ignore 3339 // it. 3340 if (!CurrentConflictMarkerState || isLexingRawMode()) 3341 return false; 3342 3343 // Check to see if we have the marker (4 characters in a row). 3344 for (unsigned i = 1; i != 4; ++i) 3345 if (CurPtr[i] != CurPtr[0]) 3346 return false; 3347 3348 // If we do have it, search for the end of the conflict marker. This could 3349 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 3350 // be the end of conflict marker. 3351 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 3352 CurrentConflictMarkerState)) { 3353 CurPtr = End; 3354 3355 // Skip ahead to the end of line. 3356 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 3357 ++CurPtr; 3358 3359 BufferPtr = CurPtr; 3360 3361 // No longer in the conflict marker. 
3362 CurrentConflictMarkerState = CMK_None; 3363 return true; 3364 } 3365 3366 return false; 3367 } 3368 3369 static const char *findPlaceholderEnd(const char *CurPtr, 3370 const char *BufferEnd) { 3371 if (CurPtr == BufferEnd) 3372 return nullptr; 3373 BufferEnd -= 1; // Scan until the second last character. 3374 for (; CurPtr != BufferEnd; ++CurPtr) { 3375 if (CurPtr[0] == '#' && CurPtr[1] == '>') 3376 return CurPtr + 2; 3377 } 3378 return nullptr; 3379 } 3380 3381 bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { 3382 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); 3383 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) 3384 return false; 3385 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd); 3386 if (!End) 3387 return false; 3388 const char *Start = CurPtr - 1; 3389 if (!LangOpts.AllowEditorPlaceholders) 3390 Diag(Start, diag::err_placeholder_in_source); 3391 Result.startToken(); 3392 FormTokenWithChars(Result, End, tok::raw_identifier); 3393 Result.setRawIdentifierData(Start); 3394 PP->LookUpIdentifierInfo(Result); 3395 Result.setFlag(Token::IsEditorPlaceholder); 3396 BufferPtr = End; 3397 return true; 3398 } 3399 3400 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 3401 if (PP && PP->isCodeCompletionEnabled()) { 3402 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 3403 return Loc == PP->getCodeCompletionLoc(); 3404 } 3405 3406 return false; 3407 } 3408 3409 std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr, 3410 const char *SlashLoc, 3411 Token *Result) { 3412 unsigned CharSize; 3413 char Kind = getCharAndSize(StartPtr, CharSize); 3414 assert((Kind == 'u' || Kind == 'U') && "expected a UCN"); 3415 3416 unsigned NumHexDigits; 3417 if (Kind == 'u') 3418 NumHexDigits = 4; 3419 else if (Kind == 'U') 3420 NumHexDigits = 8; 3421 3422 bool Delimited = false; 3423 bool FoundEndDelimiter = false; 3424 unsigned Count = 0; 3425 bool 
Diagnose = Result && !isLexingRawMode(); 3426 3427 if (!LangOpts.CPlusPlus && !LangOpts.C99) { 3428 if (Diagnose) 3429 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 3430 return std::nullopt; 3431 } 3432 3433 const char *CurPtr = StartPtr + CharSize; 3434 const char *KindLoc = &CurPtr[-1]; 3435 3436 uint32_t CodePoint = 0; 3437 while (Count != NumHexDigits || Delimited) { 3438 char C = getCharAndSize(CurPtr, CharSize); 3439 if (!Delimited && Count == 0 && C == '{') { 3440 Delimited = true; 3441 CurPtr += CharSize; 3442 continue; 3443 } 3444 3445 if (Delimited && C == '}') { 3446 CurPtr += CharSize; 3447 FoundEndDelimiter = true; 3448 break; 3449 } 3450 3451 unsigned Value = llvm::hexDigitValue(C); 3452 if (Value == -1U) { 3453 if (!Delimited) 3454 break; 3455 if (Diagnose) 3456 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete) 3457 << StringRef(KindLoc, 1); 3458 return std::nullopt; 3459 } 3460 3461 if (CodePoint & 0xF000'0000) { 3462 if (Diagnose) 3463 Diag(KindLoc, diag::err_escape_too_large) << 0; 3464 return std::nullopt; 3465 } 3466 3467 CodePoint <<= 4; 3468 CodePoint |= Value; 3469 CurPtr += CharSize; 3470 Count++; 3471 } 3472 3473 if (Count == 0) { 3474 if (Diagnose) 3475 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty 3476 : diag::warn_ucn_escape_no_digits) 3477 << StringRef(KindLoc, 1); 3478 return std::nullopt; 3479 } 3480 3481 if (Delimited && Kind == 'U') { 3482 if (Diagnose) 3483 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1); 3484 return std::nullopt; 3485 } 3486 3487 if (!Delimited && Count != NumHexDigits) { 3488 if (Diagnose) { 3489 Diag(SlashLoc, diag::warn_ucn_escape_incomplete); 3490 // If the user wrote \U1234, suggest a fixit to \u. 
      if (Count == 4 && NumHexDigits == 8) {
        CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
        Diag(KindLoc, diag::note_ucn_four_not_eight)
            << FixItHint::CreateReplacement(URange, "u");
      }
    }
    return std::nullopt;
  }

  if (Delimited && PP) {
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return CodePoint;
}

/// Lex a \N{CHARACTER-NAME} escape sequence, with StartPtr pointing at the
/// 'N'. On success, StartPtr is advanced past the closing '}' and the named
/// code point is returned; std::nullopt is returned for a malformed sequence
/// or an unknown character name. Diagnostics are emitted only when Result is
/// non-null and we are not lexing in raw mode (see Diagnose below).
std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                               const char *SlashLoc,
                                               Token *Result) {
  unsigned CharSize;
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  // A named UCN must be followed immediately by '{'.
  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
    return std::nullopt;
  }
  CurPtr += CharSize;
  const char *StartName = CurPtr;
  bool FoundEndDelimiter = false;
  llvm::SmallVector<char, 30> Buffer;
  // Accumulate the character name until the closing '}'; a newline (or the
  // NUL at end of buffer terminating the while condition) aborts the scan.
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

    if (isVerticalWhitespace(C))
      break;
    Buffer.push_back(C);
  }

  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // Look the name up exactly first; fall back to Unicode loose matching only
  // to drive the fixit diagnostic below.
  StringRef Name(Buffer.data(), Buffer.size());
  std::optional<char32_t> Match =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
  if (!Match) {
    LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
    if (Diagnose) {
      Diag(StartName, diag::err_invalid_ucn_name)
          << StringRef(Buffer.data(), Buffer.size())
          << makeCharRange(*this, StartName, CurPtr - CharSize);
      if (LooseMatch) {
        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
            << FixItHint::CreateReplacement(
                   makeCharRange(*this, StartName, CurPtr - CharSize),
                   LooseMatch->Name);
      }
    }
    // We do not offer misspelled character names suggestions here
    // as the set of what would be a valid suggestion depends on context,
    // and we should not make invalid suggestions.
  }

  if (Diagnose && Match)
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);

  // If no diagnostic has been emitted yet, likely because we are doing a
  // tentative lexing, we do not want to recover here to make sure the token
  // will not be incorrectly considered valid. This function will be called
  // again and a diagnostic emitted then.
  if (LooseMatch && Diagnose)
    Match = LooseMatch->CodePoint;

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
}

/// tryReadUCN - Read a universal character name (\u..., \U..., or \N{...})
/// starting at StartPtr and validate it against the C/C++ restrictions on
/// UCN values. Returns the code point, or 0 if the escape is not a valid
/// UCN in the current language mode.
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
                           Token *Result) {

  unsigned CharSize;
  std::optional<uint32_t> CodePointOpt;
  // Dispatch on the escape kind character following the backslash.
  char Kind = getCharAndSize(StartPtr, CharSize);
  if (Kind == 'u' || Kind == 'U')
    CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
  else if (Kind == 'N')
    CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);

  if (!CodePointOpt)
    return 0;

  uint32_t CodePoint = *CodePointOpt;

  // Don't apply C family restrictions to UCNs in assembly mode
  if (LangOpts.AsmPreprocessor)
    return CodePoint;

  // C23 6.4.3p2: A universal character name shall not designate a code point
  // where the hexadecimal value is:
  // - in the range D800 through DFFF inclusive; or
  // - greater than 10FFFF.
  // A universal-character-name outside the c-char-sequence of a character
  // constant, or the s-char-sequence of a string-literal shall not designate
  // a control character or a character in the basic character set.

  // C++11 [lex.charset]p2: If the hexadecimal value for a
  // universal-character-name corresponds to a surrogate code point (in the
  // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
  // if the hexadecimal value for a universal-character-name outside the
  // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
  // string literal corresponds to a control character (in either of the
  // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
  // basic source character set, the program is ill-formed.
3653 if (CodePoint < 0xA0) { 3654 // We don't use isLexingRawMode() here because we need to warn about bad 3655 // UCNs even when skipping preprocessing tokens in a #if block. 3656 if (Result && PP) { 3657 if (CodePoint < 0x20 || CodePoint >= 0x7F) 3658 Diag(BufferPtr, diag::err_ucn_control_character); 3659 else { 3660 char C = static_cast<char>(CodePoint); 3661 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 3662 } 3663 } 3664 3665 return 0; 3666 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 3667 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 3668 // We don't use isLexingRawMode() here because we need to diagnose bad 3669 // UCNs even when skipping preprocessing tokens in a #if block. 3670 if (Result && PP) { 3671 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 3672 Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 3673 else 3674 Diag(BufferPtr, diag::err_ucn_escape_invalid); 3675 } 3676 return 0; 3677 } 3678 3679 return CodePoint; 3680 } 3681 3682 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 3683 const char *CurPtr) { 3684 if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3685 isUnicodeWhitespace(C)) { 3686 Diag(BufferPtr, diag::ext_unicode_whitespace) 3687 << makeCharRange(*this, BufferPtr, CurPtr); 3688 3689 Result.setFlag(Token::LeadingSpace); 3690 return true; 3691 } 3692 return false; 3693 } 3694 3695 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 3696 IsAtStartOfLine = Result.isAtStartOfLine(); 3697 HasLeadingSpace = Result.hasLeadingSpace(); 3698 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 3699 // Note that this doesn't affect IsAtPhysicalStartOfLine. 3700 } 3701 3702 bool Lexer::Lex(Token &Result) { 3703 assert(!isDependencyDirectivesLexer()); 3704 3705 // Start a new token. 3706 Result.startToken(); 3707 3708 // Set up misc whitespace flags for LexTokenInternal. 
3709 if (IsAtStartOfLine) { 3710 Result.setFlag(Token::StartOfLine); 3711 IsAtStartOfLine = false; 3712 } 3713 3714 if (HasLeadingSpace) { 3715 Result.setFlag(Token::LeadingSpace); 3716 HasLeadingSpace = false; 3717 } 3718 3719 if (HasLeadingEmptyMacro) { 3720 Result.setFlag(Token::LeadingEmptyMacro); 3721 HasLeadingEmptyMacro = false; 3722 } 3723 3724 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 3725 IsAtPhysicalStartOfLine = false; 3726 bool isRawLex = isLexingRawMode(); 3727 (void) isRawLex; 3728 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 3729 // (After the LexTokenInternal call, the lexer might be destroyed.) 3730 assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 3731 return returnedToken; 3732 } 3733 3734 /// LexTokenInternal - This implements a simple C family lexer. It is an 3735 /// extremely performance critical piece of code. This assumes that the buffer 3736 /// has a null character at the end of the file. This returns a preprocessing 3737 /// token, not a normal token, as such, it is an internal interface. It assumes 3738 /// that the Flags of result have been cleared before calling this. 3739 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 3740 LexStart: 3741 assert(!Result.needsCleaning() && "Result needs cleaning"); 3742 assert(!Result.hasPtrData() && "Result has not been reset"); 3743 3744 // CurPtr - Cache BufferPtr in an automatic variable. 3745 const char *CurPtr = BufferPtr; 3746 3747 // Small amounts of horizontal whitespace is very common between tokens. 3748 if (isHorizontalWhitespace(*CurPtr)) { 3749 do { 3750 ++CurPtr; 3751 } while (isHorizontalWhitespace(*CurPtr)); 3752 3753 // If we are keeping whitespace and other tokens, just return what we just 3754 // skipped. The next lexer invocation will return the token after the 3755 // whitespace. 
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  if (!isVerticalWhitespace(Char))
    NewLinePtr = nullptr;

  // Dispatch on the first character of the token.
  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    // An embedded NUL that is not EOF: diagnose and treat as whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    [[fallthrough]];
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
      NewLinePtr = CurPtr - 1;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  // Identifier (e.g., uber), or
  // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
  // UTF-8 or UTF-16 string literal (C11/C++11).
  case 'u':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.RawStringLiterals &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.RawStringLiterals) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.RawStringLiterals &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.RawStringLiterals) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.RawStringLiterals && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    [[fallthrough]];

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifierContinue(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr,
                            ParsingFilename ? tok::header_name
                                            : tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment =
          LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (LangOpts.CPlusPlus20) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (LangOpts.CPlusPlus && !isLexingRawMode()) {
          Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
              << FixItHint::CreateInsertion(
                     getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      if (LangOpts.OpenCL && Char == '^')
        Diag(CurPtr, diag::err_opencl_logical_exclusive_or);
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;

  default: {
    // Anything else: either a stray ASCII byte (unknown token) or the start
    // of a UTF-8 sequence.
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
    }

    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);
  // Hand the directive off to the preprocessor, which consumes the rest of
  // the directive line using this same lexer.
  PP->HandleDirective(Result);

  if (PP->hadModuleLoaderFatalFailure())
    // With a fatal failure in the module loader, we abort parsing.
    return true;

  // We parsed the directive; lex a token with the new state.
  return false;

LexNextToken:
  // Re-enter the main lexing loop without recursing (manual tail-call
  // elimination). Reset NeedsCleaning before restarting with a fresh token.
  Result.clearFlag(Token::NeedsCleaning);
  goto LexStart;
}

/// Translate a token produced by the dependency-directives scanner into a
/// regular Token lexed from this buffer, copying its kind, flags, location,
/// and length. Advances BufferPtr to just past the token and returns a
/// pointer to the token's first character in the buffer.
const char *Lexer::convertDependencyDirectiveToken(
    const dependency_directives_scan::Token &DDTok, Token &Result) {
  const char *TokPtr = BufferStart + DDTok.Offset;
  Result.startToken();
  Result.setLocation(getSourceLocation(TokPtr));
  Result.setKind(DDTok.Kind);
  Result.setFlag((Token::TokenFlags)DDTok.Flags);
  Result.setLength(DDTok.Length);
  BufferPtr = TokPtr + DDTok.Length;
  return TokPtr;
}

/// Lex the next token by replaying the pre-scanned dependency directives
/// rather than scanning the buffer character by character.
///
/// Returns true when \p Result holds a token for the caller; returns false
/// when the token was consumed internally (a '#' at the start of a line is
/// handed to the preprocessor as a directive).
bool Lexer::LexDependencyDirectiveToken(Token &Result) {
  assert(isDependencyDirectivesLexer());

  using namespace dependency_directives_scan;

  // Advance to the next directive that still has unread tokens, stopping at
  // end of file. A 'tokens_present_before_eof' marker tells MIOpt that
  // non-directive tokens exist, which disqualifies the include guard.
  while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
    if (DepDirectives.front().Kind == pp_eof)
      return LexEndOfFile(Result, BufferEnd);
    if (DepDirectives.front().Kind == tokens_present_before_eof)
      MIOpt.ReadToken();
    NextDepDirectiveTokenIndex = 0;
    DepDirectives = DepDirectives.drop_front();
  }

  const dependency_directives_scan::Token &DDTok =
      DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
  if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
    // Read something other than a preprocessor directive hash.
    MIOpt.ReadToken();
  }

  // When lexing a #include filename, a '<' begins an angled header-name;
  // lex it directly from the buffer, then advance the token index past all
  // pre-scanned tokens the header-name covered.
  if (ParsingFilename && DDTok.is(tok::less)) {
    BufferPtr = BufferStart + DDTok.Offset;
    LexAngledStringLiteral(Result, BufferPtr + 1);
    if (Result.isNot(tok::header_name))
      return true;
    // Advance the index of lexed tokens.
    while (true) {
      const dependency_directives_scan::Token &NextTok =
          DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
      if (BufferStart + NextTok.Offset >= BufferPtr)
        break;
      ++NextDepDirectiveTokenIndex;
    }
    return true;
  }

  const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);

  // A '#' at the start of a line introduces a directive: hand it to the
  // preprocessor and report that no token was produced for the caller.
  if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
    PP->HandleDirective(Result);
    return false;
  }
  // Raw identifiers are resolved to IdentifierInfo here (unless in raw
  // mode), mirroring what LexIdentifierContinue does for normal lexing.
  if (Result.is(tok::raw_identifier)) {
    Result.setRawIdentifierData(TokPtr);
    if (!isLexingRawMode()) {
      const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
      if (II->isHandleIdentifierCase())
        return PP->HandleIdentifier(Result);
    }
    return true;
  }
  if (Result.isLiteral()) {
    Result.setLiteralData(TokPtr);
    return true;
  }
  if (Result.is(tok::colon)) {
    // Convert consecutive colons to 'tok::coloncolon'.
    if (*BufferPtr == ':') {
      assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
          tok::colon));
      ++NextDepDirectiveTokenIndex;
      Result.setKind(tok::coloncolon);
    }
    return true;
  }
  if (Result.is(tok::eod))
    ParsingPreprocessorDirective = false;

  return true;
}

/// Skip pre-scanned directives inside a failed preprocessor conditional
/// until reaching the #else/#elif*/#endif that can terminate the region at
/// the current nesting depth (or end of file). Emits the terminating
/// directive's introducing '#' token and returns false so the caller
/// re-enters normal directive handling; the rest of that directive's tokens
/// are lexed starting at index 1.
bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
  assert(isDependencyDirectivesLexer());

  using namespace dependency_directives_scan;

  bool Stop = false;
  unsigned NestedIfs = 0;
  do {
    DepDirectives = DepDirectives.drop_front();
    switch (DepDirectives.front().Kind) {
    case pp_none:
      llvm_unreachable("unexpected 'pp_none'");
    // Directives that do not affect conditional nesting: keep skipping.
    case pp_include:
    case pp___include_macros:
    case pp_define:
    case pp_undef:
    case pp_import:
    case pp_pragma_import:
    case pp_pragma_once:
    case pp_pragma_push_macro:
    case pp_pragma_pop_macro:
    case pp_pragma_include_alias:
    case pp_pragma_system_header:
    case pp_include_next:
    case decl_at_import:
    case cxx_module_decl:
    case cxx_import_decl:
    case cxx_export_module_decl:
    case cxx_export_import_decl:
    case tokens_present_before_eof:
      break;
    case pp_if:
    case pp_ifdef:
    case pp_ifndef:
      // Entering a conditional nested inside the skipped region.
      ++NestedIfs;
      break;
    case pp_elif:
    case pp_elifdef:
    case pp_elifndef:
    case pp_else:
      // Only a branch of the outermost conditional ends the skip; branches
      // of nested conditionals are themselves skipped.
      if (!NestedIfs) {
        Stop = true;
      }
      break;
    case pp_endif:
      if (!NestedIfs) {
        Stop = true;
      } else {
        --NestedIfs;
      }
      break;
    case pp_eof:
      NextDepDirectiveTokenIndex = 0;
      return LexEndOfFile(Result, BufferEnd);
    }
  } while (!Stop);

  // Form the terminating directive's '#' token; subsequent lexing resumes
  // from the directive's second token.
  const dependency_directives_scan::Token &DDTok =
      DepDirectives.front().Tokens.front();
  assert(DDTok.is(tok::hash));
  NextDepDirectiveTokenIndex = 1;

  convertDependencyDirectiveToken(DDTok, Result);
  return false;
}