1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Lexer and Token interfaces. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "clang/Lex/Lexer.h" 14 #include "UnicodeCharSets.h" 15 #include "clang/Basic/CharInfo.h" 16 #include "clang/Basic/IdentifierTable.h" 17 #include "clang/Basic/LangOptions.h" 18 #include "clang/Basic/SourceLocation.h" 19 #include "clang/Basic/SourceManager.h" 20 #include "clang/Basic/TokenKinds.h" 21 #include "clang/Lex/LexDiagnostic.h" 22 #include "clang/Lex/LiteralSupport.h" 23 #include "clang/Lex/MultipleIncludeOpt.h" 24 #include "clang/Lex/Preprocessor.h" 25 #include "clang/Lex/PreprocessorOptions.h" 26 #include "clang/Lex/Token.h" 27 #include "clang/Basic/Diagnostic.h" 28 #include "clang/Basic/LLVM.h" 29 #include "clang/Basic/TokenKinds.h" 30 #include "llvm/ADT/None.h" 31 #include "llvm/ADT/Optional.h" 32 #include "llvm/ADT/StringExtras.h" 33 #include "llvm/ADT/StringSwitch.h" 34 #include "llvm/ADT/StringRef.h" 35 #include "llvm/Support/Compiler.h" 36 #include "llvm/Support/ConvertUTF.h" 37 #include "llvm/Support/MathExtras.h" 38 #include "llvm/Support/MemoryBuffer.h" 39 #include "llvm/Support/NativeFormatting.h" 40 #include "llvm/Support/UnicodeCharRanges.h" 41 #include <algorithm> 42 #include <cassert> 43 #include <cstddef> 44 #include <cstdint> 45 #include <cstring> 46 #include <string> 47 #include <tuple> 48 #include <utility> 49 50 using namespace clang; 51 52 //===----------------------------------------------------------------------===// 53 // Token Class Implementation 54 
//===----------------------------------------------------------------------===//

/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  // Annotation tokens carry no identifier info; they can never be @-keywords.
  if (isAnnotation())
    return false;
  if (IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}

/// getObjCKeywordID - Return the ObjC keyword kind.
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  if (isAnnotation())
    return tok::objc_not_keyword;
  IdentifierInfo *specId = getIdentifierInfo();
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}

//===----------------------------------------------------------------------===//
// Lexer Class Implementation
//===----------------------------------------------------------------------===//

// Out-of-line method to serve as the class's vtable/debug-info anchor.
void Lexer::anchor() {}

/// InitLexer - Shared initialization for all Lexer constructors: records the
/// buffer bounds, skips a UTF-8 BOM when lexing from the start of the buffer,
/// and resets all per-lexer state flags to their defaults.
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
                           .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
                           .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;
  IsAtPhysicalStartOfLine = true;

  HasLeadingSpace = false;
  HasLeadingEmptyMacro = false;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;
}

/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process.  This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
    : PreprocessorLexer(&PP, FID),
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
      LangOpts(PP.getLangOpts()) {
  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
            InputFile->getBufferEnd());

  resetExtendedTokenMode();
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.
/// This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd)
    : FileLoc(fileloc), LangOpts(langOpts) {
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
             const SourceManager &SM, const LangOptions &langOpts)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
            FromFile->getBufferStart(), FromFile->getBufferEnd()) {}

/// Re-derive whitespace/comment retention from the language options and the
/// preprocessor's current comment-retention state. Requires a preprocessor.
void Lexer::resetExtendedTokenMode() {
  assert(PP && "Cannot reset token mode without a preprocessor");
  if (LangOpts.TraditionalCPP)
    SetKeepWhitespaceMode(true);
  else
    SetCommentRetentionState(PP->getCommentRetentionState());
}

/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion.  This has a variety of magic semantics that this method
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
///
/// On entrance to this routine, TokStartLoc is a macro location which has a
/// spelling loc that indicates the bytes to be lexed for the token and an
/// expansion location that indicates where all lexed tokens should be
/// "expanded from".
///
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
/// normal lexer that remaps tokens as they fly by.  This would require making
/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
                                 SourceLocation ExpansionLocStart,
                                 SourceLocation ExpansionLocEnd,
                                 unsigned TokLen, Preprocessor &PP) {
  SourceManager &SM = PP.getSourceManager();

  // Create the lexer as if we were going to lex the file normally.
  FileID SpellingFID = SM.getFileID(SpellingLoc);
  const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);

  // Now that the lexer is created, change the start/end locations so that we
  // just lex the subsection of the file that we want.  This is lexing from a
  // scratch buffer.
  const char *StrData = SM.getCharacterData(SpellingLoc);

  L->BufferPtr = StrData;
  L->BufferEnd = StrData+TokLen;
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

  // Set the SourceLocation with the remapping information.  This ensures that
  // GetMappedTokenLoc will remap the tokens as they are lexed.
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
                                     ExpansionLocStart,
                                     ExpansionLocEnd, TokLen);

  // Ensure that the lexer thinks it is inside a directive, so that end \n will
  // return an EOD token.
  L->ParsingPreprocessorDirective = true;

  // This lexer really is for _Pragma.
  L->Is_PragmaLexer = true;
  return L;
}

/// Advance the buffer pointer by NumBytes, treating the new position as the
/// start of a line.  Returns true if the skip would run past the end of the
/// buffer (in which case the pointer is left unchanged).
bool Lexer::skipOver(unsigned NumBytes) {
  IsAtPhysicalStartOfLine = true;
  IsAtStartOfLine = true;
  if ((BufferPtr + NumBytes) > BufferEnd)
    return true;
  BufferPtr += NumBytes;
  return false;
}

/// Escape backslashes, the given quote character, and newlines in Str so the
/// result can be embedded inside a C string/character literal.  Works in place
/// on any string-like container with insert/indexing.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace '\n' and '\r' to '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}

/// Stringify - Convert the specified string into a C string, with surrounding
/// ""'s, and with escaped \ and " characters (or ' when Charify is set).
std::string Lexer::Stringify(StringRef Str, bool Charify) {
  std::string Result = Str;
  char Quote = Charify ? '\'' : '"';
  StringifyImpl(Result, Quote);
  return Result;
}

/// Stringify - Convert the specified string into a C string by i.e. escaping
/// '\\' and '"' characters.  This does not add surrounding ""'s to the string.
void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }

//===----------------------------------------------------------------------===//
// Token Spelling
//===----------------------------------------------------------------------===//

/// Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer.
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (tok::isStringLiteral(Tok.getKind())) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      unsigned Size;
      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
      BufPtr += Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching closing
      // quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  // Copy the remaining characters one at a time, folding escaped newlines and
  // expanding trigraphs as we go.
  while (BufPtr < BufEnd) {
    unsigned Size;
    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
    BufPtr += Size;
  }

  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}

/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding.  In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
StringRef Lexer::getSpelling(SourceLocation loc,
                             SmallVectorImpl<char> &buffer,
                             const SourceManager &SM,
                             const LangOptions &options,
                             bool *invalid) {
  // Break down the source location.
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);

  // Try to the load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (invalid) *invalid = true;
    return {};
  }

  const char *tokenBegin = file.data() + locInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
              file.begin(), tokenBegin, file.end());
  Token token;
  lexer.LexFromRawLexer(token);

  unsigned length = token.getLength();

  // Common case:  no need for cleaning.
  if (!token.needsCleaning())
    return StringRef(tokenBegin, length);

  // Hard case, we need to relex the characters into the string.
  buffer.resize(length);
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
  return StringRef(buffer.data(), buffer.size());
}

/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding.  In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                               const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  bool CharDataInvalid = false;
  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
                                                    &CharDataInvalid);
  if (Invalid)
    *Invalid = CharDataInvalid;
  if (CharDataInvalid)
    return {};

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning())
    return std::string(TokStart, TokStart + Tok.getLength());

  std::string Result;
  Result.resize(Tok.getLength());
  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
  return Result;
}

/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string.  The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long.  The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy).  The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
397 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 398 const SourceManager &SourceMgr, 399 const LangOptions &LangOpts, bool *Invalid) { 400 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 401 402 const char *TokStart = nullptr; 403 // NOTE: this has to be checked *before* testing for an IdentifierInfo. 404 if (Tok.is(tok::raw_identifier)) 405 TokStart = Tok.getRawIdentifier().data(); 406 else if (!Tok.hasUCN()) { 407 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 408 // Just return the string from the identifier table, which is very quick. 409 Buffer = II->getNameStart(); 410 return II->getLength(); 411 } 412 } 413 414 // NOTE: this can be checked even after testing for an IdentifierInfo. 415 if (Tok.isLiteral()) 416 TokStart = Tok.getLiteralData(); 417 418 if (!TokStart) { 419 // Compute the start of the token in the input lexer buffer. 420 bool CharDataInvalid = false; 421 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 422 if (Invalid) 423 *Invalid = CharDataInvalid; 424 if (CharDataInvalid) { 425 Buffer = ""; 426 return 0; 427 } 428 } 429 430 // If this token contains nothing interesting, return it directly. 431 if (!Tok.needsCleaning()) { 432 Buffer = TokStart; 433 return Tok.getLength(); 434 } 435 436 // Otherwise, hard case, relex the characters into the string. 437 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 438 } 439 440 /// MeasureTokenLength - Relex the token at the specified location and return 441 /// its length in bytes in the input file. If the token needs cleaning (e.g. 442 /// includes a trigraph or an escaped newline) then this count includes bytes 443 /// that are part of that. 
444 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 445 const SourceManager &SM, 446 const LangOptions &LangOpts) { 447 Token TheTok; 448 if (getRawToken(Loc, TheTok, SM, LangOpts)) 449 return 0; 450 return TheTok.getLength(); 451 } 452 453 /// Relex the token at the specified location. 454 /// \returns true if there was a failure, false on success. 455 bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 456 const SourceManager &SM, 457 const LangOptions &LangOpts, 458 bool IgnoreWhiteSpace) { 459 // TODO: this could be special cased for common tokens like identifiers, ')', 460 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 461 // all obviously single-char tokens. This could use 462 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 463 // something. 464 465 // If this comes from a macro expansion, we really do want the macro name, not 466 // the token this macro expanded to. 467 Loc = SM.getExpansionLoc(Loc); 468 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 469 bool Invalid = false; 470 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 471 if (Invalid) 472 return true; 473 474 const char *StrData = Buffer.data()+LocInfo.second; 475 476 if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 477 return true; 478 479 // Create a lexer starting at the beginning of this token. 480 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 481 Buffer.begin(), StrData, Buffer.end()); 482 TheLexer.SetCommentRetentionState(true); 483 TheLexer.LexFromRawLexer(Result); 484 return false; 485 } 486 487 /// Returns the pointer that points to the beginning of line that contains 488 /// the given offset, or null if the offset if invalid. 
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
  const char *BufStart = Buffer.data();
  if (Offset >= Buffer.size())
    return nullptr;

  // Scan backwards for a newline that is not an escaped (line-spliced)
  // newline; the logical line begins just after it.
  const char *LexStart = BufStart + Offset;
  for (; LexStart != BufStart; --LexStart) {
    if (isVerticalWhitespace(LexStart[0]) &&
        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
      // LexStart should point at first character of logical line.
      ++LexStart;
      break;
    }
  }
  return LexStart;
}

/// Return the start location of the token that contains the file offset of
/// \p Loc, by relexing from the start of its logical line.  Returns \p Loc
/// unchanged if the buffer cannot be loaded, if \p Loc is already at the
/// line/token start, or if \p Loc points into whitespace.
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer). We'll relex from that point.
  const char *StrData = Buffer.data() + LocInfo.second;
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
  if (!LexStart || LexStart == StrData)
    return Loc;

  // Create a lexer starting at the beginning of this token.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for. If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source location.
  return Loc;
}

SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  // Only macro-argument expansions can be mapped back to a file location.
  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  // Find the token start at the spelling location, then translate that file
  // offset delta back into the macro-expansion location.
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  std::pair<FileID, unsigned> BeginFileLocInfo =
      SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}

namespace {

// Classification of a preprocessor directive keyword seen while scanning the
// preamble: either one that may appear in a preamble (skipped over) or an
// unrecognized one that terminates the scan.
enum PreambleDirectiveKind {
  PDK_Skipped,
  PDK_Unknown
};

} // namespace

PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
                                      const LangOptions &LangOpts,
                                      unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const unsigned StartOffset = 1;
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  bool InPreprocessorDirective = false;
  Token TheTok;
  SourceLocation ActiveCommentLoc;

  // If a line limit was requested, translate it into a byte offset by
  // counting newlines; MaxLineOffset stays 0 (no limit) if the whole buffer
  // fits within MaxLines.
  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is. Since we're lexing raw tokens,
      // we don't have an identifier table available. Instead, just look at
      // the raw identifier to recognize and categorize preprocessor directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_Skipped)
              .Case("ifdef", PDK_Skipped)
              .Case("ifndef", PDK_Skipped)
              .Case("elif", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_Skipped)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point. Roll back the current token to the location of the '#'.
      TheTok = HashTok;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  SourceLocation End;
  if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                        TheTok.isAtStartOfLine());
}

/// Return the byte offset into the token's spelling of the \p CharNo'th
/// cleaned (post trigraph-expansion, post escaped-newline-folding) character.
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is.  This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting.  Skip
  // over the uninteresting characters.  If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    unsigned Size;
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
    TokPtr += Size;
    PhysOffset += Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token.  For example foo\<newline>bar
  // advanced by 3 should return the location of b, not of \\.  One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

  return PhysOffset;
}

/// Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different that it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isInvalid())
    return {};

  if (Loc.isMacroID()) {
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return {}; // Points inside the macro expansion.
  }

  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;

  return Loc.getLocWithOffset(Len);
}

/// Returns true if the given MacroID location points at the first
/// token of the macro expansion.
800 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 801 const SourceManager &SM, 802 const LangOptions &LangOpts, 803 SourceLocation *MacroBegin) { 804 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 805 806 SourceLocation expansionLoc; 807 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) 808 return false; 809 810 if (expansionLoc.isFileID()) { 811 // No other macro expansions, this is the first. 812 if (MacroBegin) 813 *MacroBegin = expansionLoc; 814 return true; 815 } 816 817 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 818 } 819 820 /// Returns true if the given MacroID location points at the last 821 /// token of the macro expansion. 822 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 823 const SourceManager &SM, 824 const LangOptions &LangOpts, 825 SourceLocation *MacroEnd) { 826 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 827 828 SourceLocation spellLoc = SM.getSpellingLoc(loc); 829 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 830 if (tokLen == 0) 831 return false; 832 833 SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 834 SourceLocation expansionLoc; 835 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 836 return false; 837 838 if (expansionLoc.isFileID()) { 839 // No other macro expansions. 840 if (MacroEnd) 841 *MacroEnd = expansionLoc; 842 return true; 843 } 844 845 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 846 } 847 848 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 849 const SourceManager &SM, 850 const LangOptions &LangOpts) { 851 SourceLocation Begin = Range.getBegin(); 852 SourceLocation End = Range.getEnd(); 853 assert(Begin.isFileID() && End.isFileID()); 854 if (Range.isTokenRange()) { 855 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 856 if (End.isInvalid()) 857 return {}; 858 } 859 860 // Break down the source locations. 
861 FileID FID; 862 unsigned BeginOffs; 863 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 864 if (FID.isInvalid()) 865 return {}; 866 867 unsigned EndOffs; 868 if (!SM.isInFileID(End, FID, &EndOffs) || 869 BeginOffs > EndOffs) 870 return {}; 871 872 return CharSourceRange::getCharRange(Begin, End); 873 } 874 875 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 876 const SourceManager &SM, 877 const LangOptions &LangOpts) { 878 SourceLocation Begin = Range.getBegin(); 879 SourceLocation End = Range.getEnd(); 880 if (Begin.isInvalid() || End.isInvalid()) 881 return {}; 882 883 if (Begin.isFileID() && End.isFileID()) 884 return makeRangeFromFileLocs(Range, SM, LangOpts); 885 886 if (Begin.isMacroID() && End.isFileID()) { 887 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 888 return {}; 889 Range.setBegin(Begin); 890 return makeRangeFromFileLocs(Range, SM, LangOpts); 891 } 892 893 if (Begin.isFileID() && End.isMacroID()) { 894 if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts, 895 &End)) || 896 (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts, 897 &End))) 898 return {}; 899 Range.setEnd(End); 900 return makeRangeFromFileLocs(Range, SM, LangOpts); 901 } 902 903 assert(Begin.isMacroID() && End.isMacroID()); 904 SourceLocation MacroBegin, MacroEnd; 905 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 906 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 907 &MacroEnd)) || 908 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 909 &MacroEnd)))) { 910 Range.setBegin(MacroBegin); 911 Range.setEnd(MacroEnd); 912 return makeRangeFromFileLocs(Range, SM, LangOpts); 913 } 914 915 bool Invalid = false; 916 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), 917 &Invalid); 918 if (Invalid) 919 return {}; 920 921 if (BeginEntry.getExpansion().isMacroArgExpansion()) { 922 const SrcMgr::SLocEntry &EndEntry = 
        SM.getSLocEntry(SM.getFileID(End), &Invalid);
    if (Invalid)
      return {};

    // Both endpoints must be arguments of the same macro invocation; then we
    // can retry with the locations they were spelled at.
    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return {};
}

StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  // First reduce the range to a plain file char range; failure here means the
  // range cannot be mapped out of macro expansions.
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // The end must lie in the same FileID and not precede the begin offset.
  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return {};
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}

/// Returns the name, as spelled in the source, of the macro whose expansion
/// immediately contains \p Loc.
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (true) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    // A normal (non-argument) expansion terminates the walk.
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro; keep walking from the inner macro's
    // spelling location.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

/// Like getImmediateMacroName, but fully skips past macro argument expansions
/// first and returns an empty string when the spelling does not come from a
/// real file (e.g. token pasting or stringization).
StringRef Lexer::getImmediateMacroNameForDiagnostics(
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
  // Walk past macro argument expansions.
  while (SM.isMacroArgExpansion(Loc))
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();

  // If the macro's spelling has no FileID, then it's actually a token paste
  // or stringization (or similar) and not a macro at all.
  if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
    return {};

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

/// Returns true if \p c may appear in the body of an identifier (including
/// '$' when DollarIdents is enabled).
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
  return isIdentifierBody(c, LangOpts.DollarIdents);
}

/// Returns true if the vertical whitespace at \p Str is preceded by a
/// backslash (possibly with intervening horizontal whitespace), i.e. it is an
/// escaped newline.
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
  assert(isVerticalWhitespace(Str[0]));
  if (Str - 1 < BufferStart)
    return false;

  // If this is the second half of a \r\n or \n\r pair, step back to the first
  // half before scanning for the backslash.
  if ((Str[0] == '\n' && Str[-1] == '\r') ||
      (Str[0] == '\r' && Str[-1] == '\n')) {
    if (Str - 2 < BufferStart)
      return false;
    --Str;
  }
  --Str;

  // Rewind to first non-space character:
  while (Str > BufferStart && isHorizontalWhitespace(*Str))
    --Str;

  return *Str == '\\';
}

StringRef Lexer::getIndentationForLine(SourceLocation Loc,
                                       const SourceManager &SM) {
  // Macro locations do not correspond to a physical line we can inspect.
  if (Loc.isInvalid() || Loc.isMacroID())
    return {};
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return
        {};
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return {};
  // Find the start of the physical line containing Loc, then measure its
  // leading spaces/tabs.
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
  if (!Line)
    return {};
  StringRef Rest = Buffer.substr(Line - Buffer.data());
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
  return NumWhitespaceChars == StringRef::npos
             ? ""
             : Rest.take_front(NumWhitespaceChars);
}

//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
//===----------------------------------------------------------------------===//

/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
/// lexer buffer was all expanded at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo, unsigned TokLen) {
  assert(FileLoc.isMacroID() && "Must be a macro expansion");

  // Otherwise, we're lexing "mapped tokens".  This is used for things like
  // _Pragma handling.  Combine the expansion location of FileLoc with the
  // spelling location.
  SourceManager &SM = PP.getSourceManager();

  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);

  // Figure out the expansion loc range, which is the range covered by the
  // original _Pragma(...) sequence.
  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);

  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
}

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics.  This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
1155 static char GetTrigraphCharForLetter(char Letter) { 1156 switch (Letter) { 1157 default: return 0; 1158 case '=': return '#'; 1159 case ')': return ']'; 1160 case '(': return '['; 1161 case '!': return '|'; 1162 case '\'': return '^'; 1163 case '>': return '}'; 1164 case '/': return '\\'; 1165 case '<': return '{'; 1166 case '-': return '~'; 1167 } 1168 } 1169 1170 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 1171 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 1172 /// return the result character. Finally, emit a warning about trigraph use 1173 /// whether trigraphs are enabled or not. 1174 static char DecodeTrigraphChar(const char *CP, Lexer *L) { 1175 char Res = GetTrigraphCharForLetter(*CP); 1176 if (!Res || !L) return Res; 1177 1178 if (!L->getLangOpts().Trigraphs) { 1179 if (!L->isLexingRawMode()) 1180 L->Diag(CP-2, diag::trigraph_ignored); 1181 return 0; 1182 } 1183 1184 if (!L->isLexingRawMode()) 1185 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 1186 return Res; 1187 } 1188 1189 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 1190 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 1191 /// trigraph equivalent on entry to this function. 1192 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 1193 unsigned Size = 0; 1194 while (isWhitespace(Ptr[Size])) { 1195 ++Size; 1196 1197 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 1198 continue; 1199 1200 // If this is a \r\n or \n\r, skip the other half. 1201 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 1202 Ptr[Size-1] != Ptr[Size]) 1203 ++Size; 1204 1205 return Size; 1206 } 1207 1208 // Not an escaped newline, must be a \t or something else. 1209 return 0; 1210 } 1211 1212 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 1213 /// them), skip over them and return the first non-escaped-newline found, 1214 /// otherwise return P. 
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (true) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      // FIXME: Take LangOpts into account; the language might not
      // support trigraphs.
      AfterEscape = P+3;
    } else {
      return P;
    }

    // Only advance when the backslash is actually followed by a newline.
    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}

/// Returns the raw token that follows \p Loc, or None if the location cannot
/// be mapped out of a macro or the buffer is unavailable.
Optional<Token> Lexer::findNextToken(SourceLocation Loc,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  if (Loc.isMacroID()) {
    // Only macro locations that end their expansion can be mapped to a file
    // location we can lex from.
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return None;
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return None;

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  return Tok;
}

/// Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
1270 SourceLocation Lexer::findLocationAfterToken( 1271 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 1272 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 1273 Optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 1274 if (!Tok || Tok->isNot(TKind)) 1275 return {}; 1276 SourceLocation TokenLoc = Tok->getLocation(); 1277 1278 // Calculate how much whitespace needs to be skipped if any. 1279 unsigned NumWhitespaceChars = 0; 1280 if (SkipTrailingWhitespaceAndNewLine) { 1281 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 1282 unsigned char C = *TokenEnd; 1283 while (isHorizontalWhitespace(C)) { 1284 C = *(++TokenEnd); 1285 NumWhitespaceChars++; 1286 } 1287 1288 // Skip \r, \n, \r\n, or \n\r 1289 if (C == '\n' || C == '\r') { 1290 char PrevC = C; 1291 C = *(++TokenEnd); 1292 NumWhitespaceChars++; 1293 if ((C == '\n' || C == '\r') && C != PrevC) 1294 NumWhitespaceChars++; 1295 } 1296 } 1297 1298 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 1299 } 1300 1301 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 1302 /// get its size, and return it. This is tricky in several cases: 1303 /// 1. If currently at the start of a trigraph, we warn about the trigraph, 1304 /// then either return the trigraph (skipping 3 chars) or the '?', 1305 /// depending on whether trigraphs are enabled or not. 1306 /// 2. If this is an escaped newline (potentially with whitespace between 1307 /// the backslash and newline), implicitly skip the newline and return 1308 /// the char after it. 1309 /// 1310 /// This handles the slow/uncommon case of the getCharAndSize method. Here we 1311 /// know that we can accumulate into Size, and that we have already incremented 1312 /// Ptr by Size bytes. 1313 /// 1314 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 1315 /// be updated to match. 
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field, since the next
      // char may itself be an escaped newline or trigraph.
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // A "??/" trigraph decodes to '\\', which may begin an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}

/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &LangOpts) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field, since the next
      // char may itself be an escaped newline or trigraph.
      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // A "??/" trigraph decodes to '\\', which may begin an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// Routine that indiscriminately sets the offset into the source file.
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
  BufferPtr = BufferStart + Offset;
  // Clamp to the end of the buffer so an oversized offset cannot run past it.
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  // FIXME: What exactly does the StartOfLine bit mean?  There are two
  // possible meanings for the "start" of the line: the first token on the
  // unexpanded line, or the first token on the expanded line.
  IsAtStartOfLine = StartOfLine;
  IsAtPhysicalStartOfLine = StartOfLine;
}

/// Returns true if code point \p C is allowed (non-initially) in an
/// identifier, according to the character sets of the active language mode.
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
  if (LangOpts.AsmPreprocessor) {
    // The assembler-with-cpp mode accepts no extended identifier characters.
    return false;
  } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
        C11AllowedIDCharRanges);
    return C11AllowedIDChars.contains(C);
  } else if (LangOpts.CPlusPlus) {
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
        CXX03AllowedIDCharRanges);
    return CXX03AllowedIDChars.contains(C);
  } else {
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    return C99AllowedIDChars.contains(C);
  }
}

/// Returns true if code point \p C may appear as the *first* character of an
/// identifier.  \p C must already be a valid identifier character.
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
  assert(isAllowedIDChar(C, LangOpts));
  if (LangOpts.AsmPreprocessor) {
    return false;
  } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
        C11DisallowedInitialIDCharRanges);
    return !C11DisallowedInitialIDChars.contains(C);
  } else if (LangOpts.CPlusPlus) {
    // C++03 places no extra restriction on the initial character.
    return true;
  } else {
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    return !C99DisallowedInitialIDChars.contains(C);
  }
}

/// Builds a char range covering [Begin, End) within lexer \p L's buffer.
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
                                            const char *End) {
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
                                       L.getSourceLocation(End));
}

/// Warns when an extended identifier character is valid in the current
/// language mode but would not be accepted by C99 or C++98.
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
                                      CharSourceRange Range, bool IsFirst) {
  // Check C99 compatibility.
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
    // Index selects the %select form of the diagnostic message.
    enum {
      CannotAppearInIdentifier = 0,
      CannotStartIdentifier
    };

    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    if (!C99AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotAppearInIdentifier;
    } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotStartIdentifier;
    }
  }

  // Check C++98 compatibility.
  if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
        CXX03AllowedIDCharRanges);
    if (!CXX03AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
        << Range;
    }
  }
}

/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // Sorted by code point so it can be binary-searched below.  LooksLike == 0
  // marks an invisible character rather than a lookalike of an ASCII char.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},    // SOFT HYPHEN
    {U'\u01c3', '!'},  // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'},  // GREEK QUESTION MARK
    {U'\u200b', 0},    // ZERO WIDTH SPACE
    {U'\u200c', 0},    // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},    // ZERO WIDTH JOINER
    {U'\u2060', 0},    // WORD JOINER
    {U'\u2061', 0},    // FUNCTION APPLICATION
    {U'\u2062', 0},    // INVISIBLE TIMES
    {U'\u2063', 0},    // INVISIBLE SEPARATOR
    {U'\u2064', 0},    // INVISIBLE PLUS
    {U'\u2212', '-'},  // MINUS SIGN
    {U'\u2215', '/'},  // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'},  // ASTERISK OPERATOR
    {U'\u2223', '|'},  // DIVIDES
    {U'\u2227', '^'},  // LOGICAL AND
    {U'\u2236', ':'},  // RATIO
    {U'\u223c', '~'},  // TILDE OPERATOR
    {U'\ua789', ':'},  // MODIFIER LETTER COLON
    {U'\ufeff', 0},    // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'},  // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'},  // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'},  // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'},  // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'},  // FULLWIDTH AMPERSAND
    {U'\uff08', '('},  // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'},  // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'},  // FULLWIDTH ASTERISK
    {U'\uff0b', '+'},  // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','},  // FULLWIDTH COMMA
    {U'\uff0d', '-'},  // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'},  // FULLWIDTH FULL STOP
    {U'\uff0f', '/'},  // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'},  // FULLWIDTH COLON
    {U'\uff1b', ';'},  // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'},  // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='},  // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'},  // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'},  // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'},  // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['},  // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'},  // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'},  // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'},  // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'},  // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'},  // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'},  // FULLWIDTH TILDE
    {0, 0}
  };
  // Binary-search the table, excluding the {0, 0} sentinel at the end.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    // Render the code point as uppercase hex for the diagnostic text.
    llvm::SmallString<5> CharBuf;
    {
      llvm::raw_svector_ostream CharOS(CharBuf);
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
    }
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << CharBuf << LooksLikeStr;
    } else {
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << CharBuf;
    }
  }
}

/// Attempts to consume a universal character name (\uXXXX or \UXXXXXXXX) at
/// CurPtr + Size as an identifier character; returns true and advances CurPtr
/// past it on success.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
    return false;

  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);

  Result.setFlag(Token::HasUCN);
  // If the UCN has its plain spelling ("\u" + 4 hex digits or "\U" + 8), skip
  // it wholesale; otherwise advance char-by-char with getAndAdvanceChar so
  // trigraphs/escaped newlines within the spelling are handled.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}

bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  const char *UnicodePtr = CurPtr;
  llvm::UTF32 CodePoint;
  // Strictly decode one UTF-8 sequence; on success UnicodePtr is advanced
  // past it.
  llvm::ConversionResult Result =
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
                                (const llvm::UTF8 *)BufferEnd,
                                &CodePoint,
                                llvm::strictConversion);
  if (Result != llvm::conversionOK ||
      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
    return false;

  if (!isLexingRawMode()) {
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr));
  }

  CurPtr = UnicodePtr;
  return true;
}

/// LexIdentifier - Lex the remainder of an identifier as a raw identifier
/// token; BufferPtr points at the already-matched first character and CurPtr
/// just past it. Outside raw mode, the identifier is looked up in the
/// identifier table and may be handed to the preprocessor.
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
  // TODO: Could merge these checks into an InfoTable flag to make the
  // comparison cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return true;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
    // completion, it writes IdentifierInfo into Result, and callers rely on it.

    // If the completion point is at the end of an identifier, we want to treat
    // the identifier as incomplete even if it resolves to a macro or a keyword.
    // This allows e.g. 'class^' to complete to 'classifier'.
    if (isCodeCompletionPoint(CurPtr)) {
      // Return the code-completion token.
      Result.setKind(tok::code_completion);
      // Skip the code-completion char and all immediate identifier characters.
      // This ensures we get consistent behavior when completing at any point in
      // an identifier (i.e. at the start, in the middle, at the end). Note that
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
      // simpler.
      assert(*CurPtr == 0 && "Completion character must be 0");
      ++CurPtr;
      // Note that code completion token is not added as a separate character
      // when the completion point is at the end of the buffer. Therefore, we
      // need to check if the buffer has ended.
      if (CurPtr < BufferEnd) {
        while (isIdentifierBody(*CurPtr))
          ++CurPtr;
      }
      BufferPtr = CurPtr;
      return true;
    }

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      return PP->HandleIdentifier(Result);

    return true;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  C = getCharAndSize(CurPtr, Size);
  while (true) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    // Consume any further plain identifier characters before re-entering the
    // special-case dispatch above.
    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}

/// isHexaLiteral - Return true if Start points to a hex constant.
/// in microsoft mode (where this is supposed to be several different tokens).
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
  unsigned Size;
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
  if (C1 != '0')
    return false;
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant. From[-1] is the first character lexed.  Return the end of the
/// constant.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  // Consume the maximal run of preprocessing-number body characters.
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!getLangOpts().CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && getLangOpts().CPlusPlus14) {
    // A ' is only a digit separator if it is followed by another identifier
    // character; otherwise it starts a character literal.
    unsigned NextSize;
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
    if (isIdentifierBody(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(getLangOpts().CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isIdentifierHead(C)) {
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      Consumed = true;
    else
      return CurPtr;  // Not a suffix at all.
  }

  if (!getLangOpts().CPlusPlus11) {
    // Pre-C++11: a ud-suffix is not a suffix; warn that C++11 will change the
    // meaning and suggest inserting whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      // NOTE: this inner 'Consumed' (a byte count) shadows the outer bool of
      // the same name.
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        unsigned NextSize;
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
                                         getLangOpts());
        if (!isIdentifierBody(Next)) {
          // End of suffix. Check whether this is on the whitelist.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
                                                            CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      // Reserved suffix (no leading underscore): accept as an extension and
      // suggest separating it with whitespace.
      if (!isLexingRawMode())
        Diag(CurPtr, getLangOpts().MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  // Consume the remainder of the suffix (identifier chars, UCNs, UTF-8).
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
    else break;
  }

  return CurPtr;
}

/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, getLangOpts().CPlusPlus
                        ? diag::warn_cxx98_compat_unicode_literal
                        : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        // Complete either an include filename or natural-language text
        // depending on whether we are lexing a #include path.
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // The delimiter (d-char-sequence) is limited to 16 characters.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||                  // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) {   // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}

/// Code-complete an #include filename.  PathStart points at the start of the
/// written path and CompletionPoint at the code-completion marker within it.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  // Only MSVC-compatible mode treats '\' as a path separator here.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? "/\\" : "/");
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote or angle
  // bracket, if any (stopping at end of line / end of buffer).
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
  }
  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}

/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, getLangOpts().CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // Empty character constant ('').
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input.  Skip until we find the
/// newline character that terminates the comment.  Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LangOpts.LineComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LangOpts.LineComment = true;
  }

  // Scan over the body of the comment.
  // The common case, when scanning, is that the comment contains normal ascii
  // characters with nothing interesting in them.  As such, optimize for this
  // case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.
  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline (??/ is the trigraph for '\').
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
/// a diagnostic if so.
/// We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the slash and newline.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a slash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    // Only a '*' before the escaped newline makes this the end of the comment.
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a slash, is it the ?? / trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getLangOpts().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end.  The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
2507 if (isKeepWhitespaceMode()) { 2508 FormTokenWithChars(Result, CurPtr, tok::unknown); 2509 return true; 2510 } 2511 2512 BufferPtr = CurPtr; 2513 return false; 2514 } 2515 2516 // Check to see if the first character after the '/*' is another /. If so, 2517 // then this slash does not end the block comment, it is part of it. 2518 if (C == '/') 2519 C = *CurPtr++; 2520 2521 while (true) { 2522 // Skip over all non-interesting characters until we find end of buffer or a 2523 // (probably ending) '/' character. 2524 if (CurPtr + 24 < BufferEnd && 2525 // If there is a code-completion point avoid the fast scan because it 2526 // doesn't check for '\0'. 2527 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 2528 // While not aligned to a 16-byte boundary. 2529 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 2530 C = *CurPtr++; 2531 2532 if (C == '/') goto FoundSlash; 2533 2534 #ifdef __SSE2__ 2535 __m128i Slashes = _mm_set1_epi8('/'); 2536 while (CurPtr+16 <= BufferEnd) { 2537 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, 2538 Slashes)); 2539 if (cmp != 0) { 2540 // Adjust the pointer to point directly after the first slash. It's 2541 // not necessary to set C here, it will be overwritten at the end of 2542 // the outer loop. 2543 CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1; 2544 goto FoundSlash; 2545 } 2546 CurPtr += 16; 2547 } 2548 #elif __ALTIVEC__ 2549 __vector unsigned char Slashes = { 2550 '/', '/', '/', '/', '/', '/', '/', '/', 2551 '/', '/', '/', '/', '/', '/', '/', '/' 2552 }; 2553 while (CurPtr+16 <= BufferEnd && 2554 !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes)) 2555 CurPtr += 16; 2556 #else 2557 // Scan for '/' quickly. Many block comments are very large. 
2558 while (CurPtr[0] != '/' && 2559 CurPtr[1] != '/' && 2560 CurPtr[2] != '/' && 2561 CurPtr[3] != '/' && 2562 CurPtr+4 < BufferEnd) { 2563 CurPtr += 4; 2564 } 2565 #endif 2566 2567 // It has to be one of the bytes scanned, increment to it and read one. 2568 C = *CurPtr++; 2569 } 2570 2571 // Loop to scan the remainder. 2572 while (C != '/' && C != '\0') 2573 C = *CurPtr++; 2574 2575 if (C == '/') { 2576 FoundSlash: 2577 if (CurPtr[-2] == '*') // We found the final */. We're done! 2578 break; 2579 2580 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 2581 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 2582 // We found the final */, though it had an escaped newline between the 2583 // * and /. We're done! 2584 break; 2585 } 2586 } 2587 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 2588 // If this is a /* inside of the comment, emit a warning. Don't do this 2589 // if this is a /*/, which will end the comment. This misses cases with 2590 // embedded escaped newlines, but oh well. 2591 if (!isLexingRawMode()) 2592 Diag(CurPtr-1, diag::warn_nested_block_comment); 2593 } 2594 } else if (C == 0 && CurPtr == BufferEnd+1) { 2595 if (!isLexingRawMode()) 2596 Diag(BufferPtr, diag::err_unterminated_block_comment); 2597 // Note: the user probably forgot a */. We could continue immediately 2598 // after the /*, but this would involve lexing a lot of what really is the 2599 // comment, which surely would confuse the parser. 2600 --CurPtr; 2601 2602 // KeepWhitespaceMode should return this broken comment as a token. Since 2603 // it isn't a well formed comment, just return it as an 'unknown' token. 
2604 if (isKeepWhitespaceMode()) { 2605 FormTokenWithChars(Result, CurPtr, tok::unknown); 2606 return true; 2607 } 2608 2609 BufferPtr = CurPtr; 2610 return false; 2611 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 2612 PP->CodeCompleteNaturalLanguage(); 2613 cutOffLexing(); 2614 return false; 2615 } 2616 2617 C = *CurPtr++; 2618 } 2619 2620 // Notify comment handlers about the comment unless we're in a #if 0 block. 2621 if (PP && !isLexingRawMode() && 2622 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 2623 getSourceLocation(CurPtr)))) { 2624 BufferPtr = CurPtr; 2625 return true; // A token has to be returned. 2626 } 2627 2628 // If we are returning comments as tokens, return this comment as a token. 2629 if (inKeepCommentMode()) { 2630 FormTokenWithChars(Result, CurPtr, tok::comment); 2631 return true; 2632 } 2633 2634 // It is common for the tokens immediately after a /**/ comment to be 2635 // whitespace. Instead of going through the big switch, handle it 2636 // efficiently now. This is safe even in KeepWhitespaceMode because we would 2637 // have already returned above with the comment as a token. 2638 if (isHorizontalWhitespace(*CurPtr)) { 2639 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 2640 return false; 2641 } 2642 2643 // Otherwise, just return so that the next character will be lexed as a token. 2644 BufferPtr = CurPtr; 2645 Result.setFlag(Token::LeadingSpace); 2646 return false; 2647 } 2648 2649 //===----------------------------------------------------------------------===// 2650 // Primary Lexing Entry Points 2651 //===----------------------------------------------------------------------===// 2652 2653 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 2654 /// uninterpreted string. This switches the lexer out of directive mode. 
// Reads up to (but not past) the line terminator; if Result is non-null the
// raw characters are appended to it.  Finishes by lexing the tok::eod that
// ends the directive.
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (true) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
      LLVM_FALLTHROUGH;
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          PP->CodeCompleteNaturalLanguage();
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done;
      return;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    if (PP)
      resetExtendedTokenMode();
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
    ConditionalStack.clear();
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error.  Suppressed during code
  // completion, which routinely works on truncated buffers.
  while (!ConditionalStack.empty()) {
    if (PP->getCodeCompletionFileLoc() != FileLoc)
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
    DiagnosticsEngine &Diags = PP->getDiagnostics();
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
    unsigned DiagID;

    if (LangOpts.CPlusPlus11) {
      // C++11 [lex.phases] 2.2 p2
      // Prefer the C++98 pedantic compatibility warning over the generic,
      // non-extension, user-requested "missing newline at EOF" warning.
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
      } else {
        DiagID = diag::warn_no_newline_eof;
      }
    } else {
      DiagID = diag::ext_no_newline_eof;
    }

    Diag(BufferEnd, DiagID)
      << FixItHint::CreateInsertion(EndLoc, "\n");
  }

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result, isPragmaLexer());
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  // Switch to 'skipping' mode.  This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
  bool atStartOfLine = IsAtStartOfLine;
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  bool leadingSpace = HasLeadingSpace;

  Token Tok;
  Lex(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;
  ParsingPreprocessorDirective = inPPDirectiveMode;
  HasLeadingSpace = leadingSpace;
  IsAtStartOfLine = atStartOfLine;
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}

/// Find the end of a version control conflict marker.
// For Perforce-style markers the terminator is ">>>> " introduced by a
// newline ("<<<<\n" search string); for normal markers it is ">>>>>>>".
// Returns a pointer into the buffer at the start of the terminator line,
// or nullptr if none is found at the start of a line.
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
                                   ConflictMarkerKind CMK) {
  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
  size_t Pos = RestOfBuffer.find(Terminator);
  while (Pos != StringRef::npos) {
    // Must occur at start of line.
    if (Pos == 0 ||
        (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
      Pos = RestOfBuffer.find(Terminator);
      continue;
    }
    return RestOfBuffer.data()+Pos;
  }
  return nullptr;
}

/// IsStartOfConflictMarker - If the specified pointer is the start of a version
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
/// and recover nicely.  This returns true if it is a conflict marker and false
/// if not.
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // Check to see if we have <<<<<<< or >>>>.
  if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
      !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (CurrentConflictMarkerState || isLexingRawMode())
    return false;

  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;

  // Check to see if there is an ending marker somewhere in the buffer at the
  // start of a line to terminate this conflict marker.
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
    // We found a match.  We are really in a conflict marker.
    // Diagnose this, and ignore to the end of line.
    Diag(CurPtr, diag::err_conflict_marker);
    CurrentConflictMarkerState = Kind;

    // Skip ahead to the end of line.  We know this exists because the
    // end-of-conflict marker starts with \r or \n.
    while (*CurPtr != '\r' && *CurPtr != '\n') {
      assert(CurPtr != BufferEnd && "Didn't find end of line");
      ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // No end of conflict marker found.
  return false;
}

/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
/// is the end of a conflict marker.  Handle it by ignoring up until the end of
/// the line.  This returns true if it is a conflict marker and false if not.
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (!CurrentConflictMarkerState || isLexingRawMode())
    return false;

  // Check to see if we have the marker (4 characters in a row).
  for (unsigned i = 1; i != 4; ++i)
    if (CurPtr[i] != CurPtr[0])
      return false;

  // If we do have it, search for the end of the conflict marker.  This could
  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
  // be the end of conflict marker.
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                        CurrentConflictMarkerState)) {
    CurPtr = End;

    // Skip ahead to the end of line.
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
      ++CurPtr;

    BufferPtr = CurPtr;

    // No longer in the conflict marker.
    CurrentConflictMarkerState = CMK_None;
    return true;
  }

  return false;
}

// Scan forward for the "#>" that closes an editor placeholder; returns the
// pointer just past it, or nullptr if the buffer ends first.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  BufferEnd -= 1; // Scan until the second last character.
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}

// Lex an Xcode-style "<#...#>" editor placeholder as a single raw_identifier
// token flagged IsEditorPlaceholder.  Returns false if placeholders are not
// being lexed or no closing "#>" exists, leaving the '<' to lex normally.
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
    return false;
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start, diag::err_placeholder_in_source);
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  PP->LookUpIdentifierInfo(Result);
  Result.setFlag(Token::IsEditorPlaceholder);
  BufferPtr = End;
  return true;
}

// Returns true if CurPtr is exactly the configured code-completion location.
bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  if (PP && PP->isCodeCompletionEnabled()) {
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
    return Loc == PP->getCodeCompletionLoc();
  }

  return false;
}

// Try to read a universal character name (\uXXXX or \UXXXXXXXX) starting at
// StartPtr (just past the backslash at SlashLoc).  On success returns the
// code point and advances StartPtr past the UCN; on failure returns 0 and
// leaves StartPtr unchanged (except as noted below).  Result may be null
// when only probing; diagnostics are suppressed in that case.
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
                           Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);

  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;
  else
    return 0;

  // UCNs require C99 or C++; warn (once per UCN) in C89 mode.
  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Result && !isLexingRawMode())
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return 0;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  for (unsigned i = 0; i < NumHexDigits; ++i) {
    char C = getCharAndSize(CurPtr, CharSize);

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      if (Result && !isLexingRawMode()) {
        if (i == 0) {
          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
            << StringRef(KindLoc, 1);
        } else {
          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);

          // If the user wrote \U1234, suggest a fixit to \u.
          if (i == 4 && NumHexDigits == 8) {
            CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
            Diag(KindLoc, diag::note_ucn_four_not_eight)
              << FixItHint::CreateReplacement(URange, "u");
          }
        }
      }

      return 0;
    }

    CodePoint <<= 4;
    CodePoint += Value;

    CurPtr += CharSize;
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contained no trigraphs/escaped newlines, jump straight past
    // it; otherwise replay the characters so the token is marked as needing
    // cleaning.
    if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }

  // Don't apply C family restrictions to UCNs in assembly mode
  if (LangOpts.AsmPreprocessor)
    return CodePoint;

  // C99 6.4.3p2: A universal character name shall not specify a character whose
  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
  //   0060 (`), nor one in the range D800 through DFFF inclusive.)
  // C++11 [lex.charset]p2: If the hexadecimal value for a
  //   universal-character-name corresponds to a surrogate code point (in the
  //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
  //   if the hexadecimal value for a universal-character-name outside the
  //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
  //   string literal corresponds to a control character (in either of the
  //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
  //   basic source character set, the program is ill-formed.
  if (CodePoint < 0xA0) {
    if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
      return CodePoint;

    // We don't use isLexingRawMode() here because we need to warn about bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (CodePoint < 0x20 || CodePoint >= 0x7F)
        Diag(BufferPtr, diag::err_ucn_control_character);
      else {
        char C = static_cast<char>(CodePoint);
        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
      }
    }

    return 0;
  } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
    // We don't use isLexingRawMode() here because we need to diagnose bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
        Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
      else
        Diag(BufferPtr, diag::err_ucn_escape_invalid);
    }
    return 0;
  }

  return CodePoint;
}

// If C is a Unicode whitespace code point, diagnose it (outside raw mode /
// preprocessed output), mark the token as having leading space, and return
// true so the caller skips it.
bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
                                   const char *CurPtr) {
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
      UnicodeWhitespaceCharRanges);
  if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
      UnicodeWhitespaceChars.contains(C)) {
    Diag(BufferPtr, diag::ext_unicode_whitespace)
      << makeCharRange(*this, BufferPtr, CurPtr);

    Result.setFlag(Token::LeadingSpace);
    return true;
  }
  return false;
}

// Lex a token beginning with the non-ASCII code point C (already decoded;
// CurPtr points past it).  Either starts an identifier, drops a stray
// non-ASCII character, or forms a tok::unknown.
bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
  if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput()) {
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                makeCharRange(*this, BufferPtr, CurPtr),
                                /*IsFirst=*/true);
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr));
    }

    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);
  }

  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
      !PP->isPreprocessedOutput() &&
      !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN.
    // The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
    Diag(BufferPtr, diag::err_non_ascii)
      << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));

    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}

// Copy line-start / leading-space / leading-empty-macro flags from Result
// back into the lexer so they survive re-lexing.
void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
  IsAtStartOfLine = Result.isAtStartOfLine();
  HasLeadingSpace = Result.hasLeadingSpace();
  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
  // Note that this doesn't affect IsAtPhysicalStartOfLine.
}

// Public entry point: set up the per-token whitespace flags, then delegate
// to LexTokenInternal.  Returns true if Result holds a token.
bool Lexer::Lex(Token &Result) {
  // Start a new token.
  Result.startToken();

  // Set up misc whitespace flags for LexTokenInternal.
  if (IsAtStartOfLine) {
    Result.setFlag(Token::StartOfLine);
    IsAtStartOfLine = false;
  }

  if (HasLeadingSpace) {
    Result.setFlag(Token::LeadingSpace);
    HasLeadingSpace = false;
  }

  if (HasLeadingEmptyMacro) {
    Result.setFlag(Token::LeadingEmptyMacro);
    HasLeadingEmptyMacro = false;
  }

  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  IsAtPhysicalStartOfLine = false;
  bool isRawLex = isLexingRawMode();
  (void) isRawLex;
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
  // (After the LexTokenInternal call, the lexer might be destroyed.)
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
  return returnedToken;
}

/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface.  It assumes
/// that the Flags of result have been cleared before calling this.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexNextToken:
  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(nullptr);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    LLVM_FALLTHROUGH;
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LangOpts.LineComment &&
        (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        if (Char2 == '\'' && LangOpts.CPlusPlus17)
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.CPlusPlus11) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    LLVM_FALLTHROUGH;

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr,
                            ParsingFilename ? tok::header_name
                                            : tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    }
else { 3550 Kind = tok::plus; 3551 } 3552 break; 3553 case '-': 3554 Char = getCharAndSize(CurPtr, SizeTmp); 3555 if (Char == '-') { // -- 3556 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3557 Kind = tok::minusminus; 3558 } else if (Char == '>' && LangOpts.CPlusPlus && 3559 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 3560 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3561 SizeTmp2, Result); 3562 Kind = tok::arrowstar; 3563 } else if (Char == '>') { // -> 3564 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3565 Kind = tok::arrow; 3566 } else if (Char == '=') { // -= 3567 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3568 Kind = tok::minusequal; 3569 } else { 3570 Kind = tok::minus; 3571 } 3572 break; 3573 case '~': 3574 Kind = tok::tilde; 3575 break; 3576 case '!': 3577 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 3578 Kind = tok::exclaimequal; 3579 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3580 } else { 3581 Kind = tok::exclaim; 3582 } 3583 break; 3584 case '/': 3585 // 6.4.9: Comments 3586 Char = getCharAndSize(CurPtr, SizeTmp); 3587 if (Char == '/') { // Line comment. 3588 // Even if Line comments are disabled (e.g. in C89 mode), we generally 3589 // want to lex this as a comment. There is one problem with this though, 3590 // that in one particular corner case, this can change the behavior of the 3591 // resultant program. For example, In "foo //**/ bar", C89 would lex 3592 // this as "foo / bar" and languages with Line comments would lex it as 3593 // "foo". Check to see if the character after the second slash is a '*'. 3594 // If so, we will lex that as a "/" instead of the start of a comment. 3595 // However, we never do this if we are just preprocessing. 
3596 bool TreatAsComment = LangOpts.LineComment && 3597 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 3598 if (!TreatAsComment) 3599 if (!(PP && PP->isPreprocessedOutput())) 3600 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 3601 3602 if (TreatAsComment) { 3603 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3604 TokAtPhysicalStartOfLine)) 3605 return true; // There is a token to return. 3606 3607 // It is common for the tokens immediately after a // comment to be 3608 // whitespace (indentation for the next line). Instead of going through 3609 // the big switch, handle it efficiently now. 3610 goto SkipIgnoredUnits; 3611 } 3612 } 3613 3614 if (Char == '*') { // /**/ comment. 3615 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3616 TokAtPhysicalStartOfLine)) 3617 return true; // There is a token to return. 3618 3619 // We only saw whitespace, so just try again with this lexer. 3620 // (We manually eliminate the tail call to avoid recursion.) 
3621 goto LexNextToken; 3622 } 3623 3624 if (Char == '=') { 3625 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3626 Kind = tok::slashequal; 3627 } else { 3628 Kind = tok::slash; 3629 } 3630 break; 3631 case '%': 3632 Char = getCharAndSize(CurPtr, SizeTmp); 3633 if (Char == '=') { 3634 Kind = tok::percentequal; 3635 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3636 } else if (LangOpts.Digraphs && Char == '>') { 3637 Kind = tok::r_brace; // '%>' -> '}' 3638 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3639 } else if (LangOpts.Digraphs && Char == ':') { 3640 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3641 Char = getCharAndSize(CurPtr, SizeTmp); 3642 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 3643 Kind = tok::hashhash; // '%:%:' -> '##' 3644 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3645 SizeTmp2, Result); 3646 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 3647 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3648 if (!isLexingRawMode()) 3649 Diag(BufferPtr, diag::ext_charize_microsoft); 3650 Kind = tok::hashat; 3651 } else { // '%:' -> '#' 3652 // We parsed a # character. If this occurs at the start of the line, 3653 // it's actually the start of a preprocessing directive. Callback to 3654 // the preprocessor to handle it. 3655 // TODO: -fpreprocessed mode?? 
3656 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 3657 goto HandleDirective; 3658 3659 Kind = tok::hash; 3660 } 3661 } else { 3662 Kind = tok::percent; 3663 } 3664 break; 3665 case '<': 3666 Char = getCharAndSize(CurPtr, SizeTmp); 3667 if (ParsingFilename) { 3668 return LexAngledStringLiteral(Result, CurPtr); 3669 } else if (Char == '<') { 3670 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3671 if (After == '=') { 3672 Kind = tok::lesslessequal; 3673 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3674 SizeTmp2, Result); 3675 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 3676 // If this is actually a '<<<<<<<' version control conflict marker, 3677 // recognize it as such and recover nicely. 3678 goto LexNextToken; 3679 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 3680 // If this is '<<<<' and we're in a Perforce-style conflict marker, 3681 // ignore it. 3682 goto LexNextToken; 3683 } else if (LangOpts.CUDA && After == '<') { 3684 Kind = tok::lesslessless; 3685 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3686 SizeTmp2, Result); 3687 } else { 3688 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3689 Kind = tok::lessless; 3690 } 3691 } else if (Char == '=') { 3692 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3693 if (After == '>') { 3694 if (getLangOpts().CPlusPlus2a) { 3695 if (!isLexingRawMode()) 3696 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 3697 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3698 SizeTmp2, Result); 3699 Kind = tok::spaceship; 3700 break; 3701 } 3702 // Suggest adding a space between the '<=' and the '>' to avoid a 3703 // change in semantics if this turns up in C++ <=17 mode. 
3704 if (getLangOpts().CPlusPlus && !isLexingRawMode()) { 3705 Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship) 3706 << FixItHint::CreateInsertion( 3707 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 3708 } 3709 } 3710 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3711 Kind = tok::lessequal; 3712 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 3713 if (LangOpts.CPlusPlus11 && 3714 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 3715 // C++0x [lex.pptoken]p3: 3716 // Otherwise, if the next three characters are <:: and the subsequent 3717 // character is neither : nor >, the < is treated as a preprocessor 3718 // token by itself and not as the first character of the alternative 3719 // token <:. 3720 unsigned SizeTmp3; 3721 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3722 if (After != ':' && After != '>') { 3723 Kind = tok::less; 3724 if (!isLexingRawMode()) 3725 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 3726 break; 3727 } 3728 } 3729 3730 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3731 Kind = tok::l_square; 3732 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 3733 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3734 Kind = tok::l_brace; 3735 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 3736 lexEditorPlaceholder(Result, CurPtr)) { 3737 return true; 3738 } else { 3739 Kind = tok::less; 3740 } 3741 break; 3742 case '>': 3743 Char = getCharAndSize(CurPtr, SizeTmp); 3744 if (Char == '=') { 3745 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3746 Kind = tok::greaterequal; 3747 } else if (Char == '>') { 3748 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3749 if (After == '=') { 3750 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3751 SizeTmp2, Result); 3752 Kind = tok::greatergreaterequal; 3753 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 3754 // If this is actually a '>>>>' conflict marker, recognize it as such 3755 // and recover 
nicely. 3756 goto LexNextToken; 3757 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 3758 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 3759 goto LexNextToken; 3760 } else if (LangOpts.CUDA && After == '>') { 3761 Kind = tok::greatergreatergreater; 3762 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3763 SizeTmp2, Result); 3764 } else { 3765 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3766 Kind = tok::greatergreater; 3767 } 3768 } else { 3769 Kind = tok::greater; 3770 } 3771 break; 3772 case '^': 3773 Char = getCharAndSize(CurPtr, SizeTmp); 3774 if (Char == '=') { 3775 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3776 Kind = tok::caretequal; 3777 } else if (LangOpts.OpenCL && Char == '^') { 3778 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3779 Kind = tok::caretcaret; 3780 } else { 3781 Kind = tok::caret; 3782 } 3783 break; 3784 case '|': 3785 Char = getCharAndSize(CurPtr, SizeTmp); 3786 if (Char == '=') { 3787 Kind = tok::pipeequal; 3788 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3789 } else if (Char == '|') { 3790 // If this is '|||||||' and we're in a conflict marker, ignore it. 
3791 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 3792 goto LexNextToken; 3793 Kind = tok::pipepipe; 3794 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3795 } else { 3796 Kind = tok::pipe; 3797 } 3798 break; 3799 case ':': 3800 Char = getCharAndSize(CurPtr, SizeTmp); 3801 if (LangOpts.Digraphs && Char == '>') { 3802 Kind = tok::r_square; // ':>' -> ']' 3803 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3804 } else if ((LangOpts.CPlusPlus || 3805 LangOpts.DoubleSquareBracketAttributes) && 3806 Char == ':') { 3807 Kind = tok::coloncolon; 3808 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3809 } else { 3810 Kind = tok::colon; 3811 } 3812 break; 3813 case ';': 3814 Kind = tok::semi; 3815 break; 3816 case '=': 3817 Char = getCharAndSize(CurPtr, SizeTmp); 3818 if (Char == '=') { 3819 // If this is '====' and we're in a conflict marker, ignore it. 3820 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 3821 goto LexNextToken; 3822 3823 Kind = tok::equalequal; 3824 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3825 } else { 3826 Kind = tok::equal; 3827 } 3828 break; 3829 case ',': 3830 Kind = tok::comma; 3831 break; 3832 case '#': 3833 Char = getCharAndSize(CurPtr, SizeTmp); 3834 if (Char == '#') { 3835 Kind = tok::hashhash; 3836 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3837 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 3838 Kind = tok::hashat; 3839 if (!isLexingRawMode()) 3840 Diag(BufferPtr, diag::ext_charize_microsoft); 3841 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3842 } else { 3843 // We parsed a # character. If this occurs at the start of the line, 3844 // it's actually the start of a preprocessing directive. Callback to 3845 // the preprocessor to handle it. 3846 // TODO: -fpreprocessed mode?? 3847 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 3848 goto HandleDirective; 3849 3850 Kind = tok::hash; 3851 } 3852 break; 3853 3854 case '@': 3855 // Objective C support. 
3856 if (CurPtr[-1] == '@' && LangOpts.ObjC) 3857 Kind = tok::at; 3858 else 3859 Kind = tok::unknown; 3860 break; 3861 3862 // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 3863 case '\\': 3864 if (!LangOpts.AsmPreprocessor) { 3865 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 3866 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 3867 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3868 return true; // KeepWhitespaceMode 3869 3870 // We only saw whitespace, so just try again with this lexer. 3871 // (We manually eliminate the tail call to avoid recursion.) 3872 goto LexNextToken; 3873 } 3874 3875 return LexUnicode(Result, CodePoint, CurPtr); 3876 } 3877 } 3878 3879 Kind = tok::unknown; 3880 break; 3881 3882 default: { 3883 if (isASCII(Char)) { 3884 Kind = tok::unknown; 3885 break; 3886 } 3887 3888 llvm::UTF32 CodePoint; 3889 3890 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 3891 // an escaped newline. 3892 --CurPtr; 3893 llvm::ConversionResult Status = 3894 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 3895 (const llvm::UTF8 *)BufferEnd, 3896 &CodePoint, 3897 llvm::strictConversion); 3898 if (Status == llvm::conversionOK) { 3899 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 3900 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3901 return true; // KeepWhitespaceMode 3902 3903 // We only saw whitespace, so just try again with this lexer. 3904 // (We manually eliminate the tail call to avoid recursion.) 3905 goto LexNextToken; 3906 } 3907 return LexUnicode(Result, CodePoint, CurPtr); 3908 } 3909 3910 if (isLexingRawMode() || ParsingPreprocessorDirective || 3911 PP->isPreprocessedOutput()) { 3912 ++CurPtr; 3913 Kind = tok::unknown; 3914 break; 3915 } 3916 3917 // Non-ASCII characters tend to creep into source code unintentionally. 3918 // Instead of letting the parser complain about the unknown token, 3919 // just diagnose the invalid UTF-8, then drop the character. 
3920 Diag(CurPtr, diag::err_invalid_utf8); 3921 3922 BufferPtr = CurPtr+1; 3923 // We're pretending the character didn't exist, so just try again with 3924 // this lexer. 3925 // (We manually eliminate the tail call to avoid recursion.) 3926 goto LexNextToken; 3927 } 3928 } 3929 3930 // Notify MIOpt that we read a non-whitespace/non-comment token. 3931 MIOpt.ReadToken(); 3932 3933 // Update the location of token as well as BufferPtr. 3934 FormTokenWithChars(Result, CurPtr, Kind); 3935 return true; 3936 3937 HandleDirective: 3938 // We parsed a # character and it's the start of a preprocessing directive. 3939 3940 FormTokenWithChars(Result, CurPtr, tok::hash); 3941 PP->HandleDirective(Result); 3942 3943 if (PP->hadModuleLoaderFatalFailure()) { 3944 // With a fatal failure in the module loader, we abort parsing. 3945 assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof"); 3946 return true; 3947 } 3948 3949 // We parsed the directive; lex a token with the new state. 3950 return false; 3951 } 3952