xref: /freebsd-src/contrib/llvm-project/clang/lib/Lex/DependencyDirectivesScanner.cpp (revision bdd1243df58e60e85101c09001d9812a789b6bc4)
181ad6265SDimitry Andric //===- DependencyDirectivesScanner.cpp ------------------------------------===//
281ad6265SDimitry Andric //
381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
681ad6265SDimitry Andric //
781ad6265SDimitry Andric //===----------------------------------------------------------------------===//
881ad6265SDimitry Andric ///
981ad6265SDimitry Andric /// \file
1081ad6265SDimitry Andric /// This is the interface for scanning header and source files to get the
1181ad6265SDimitry Andric /// minimum necessary preprocessor directives for evaluating includes. It
1281ad6265SDimitry Andric /// reduces the source down to #define, #include, #import, @import, and any
1381ad6265SDimitry Andric /// conditional preprocessor logic that contains one of those.
1481ad6265SDimitry Andric ///
1581ad6265SDimitry Andric //===----------------------------------------------------------------------===//
1681ad6265SDimitry Andric 
1781ad6265SDimitry Andric #include "clang/Lex/DependencyDirectivesScanner.h"
1881ad6265SDimitry Andric #include "clang/Basic/CharInfo.h"
1981ad6265SDimitry Andric #include "clang/Basic/Diagnostic.h"
2081ad6265SDimitry Andric #include "clang/Lex/LexDiagnostic.h"
2181ad6265SDimitry Andric #include "clang/Lex/Lexer.h"
2281ad6265SDimitry Andric #include "llvm/ADT/ScopeExit.h"
2381ad6265SDimitry Andric #include "llvm/ADT/SmallString.h"
2481ad6265SDimitry Andric #include "llvm/ADT/StringMap.h"
2581ad6265SDimitry Andric #include "llvm/ADT/StringSwitch.h"
26*bdd1243dSDimitry Andric #include <optional>
2781ad6265SDimitry Andric 
2881ad6265SDimitry Andric using namespace clang;
2981ad6265SDimitry Andric using namespace clang::dependency_directives_scan;
3081ad6265SDimitry Andric using namespace llvm;
3181ad6265SDimitry Andric 
3281ad6265SDimitry Andric namespace {
3381ad6265SDimitry Andric 
3481ad6265SDimitry Andric struct DirectiveWithTokens {
3581ad6265SDimitry Andric   DirectiveKind Kind;
3681ad6265SDimitry Andric   unsigned NumTokens;
3781ad6265SDimitry Andric 
3881ad6265SDimitry Andric   DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens)
3981ad6265SDimitry Andric       : Kind(Kind), NumTokens(NumTokens) {}
4081ad6265SDimitry Andric };
4181ad6265SDimitry Andric 
4281ad6265SDimitry Andric /// Does an efficient "scan" of the sources to detect the presence of
4381ad6265SDimitry Andric /// preprocessor (or module import) directives and collects the raw lexed tokens
4481ad6265SDimitry Andric /// for those directives so that the \p Lexer can "replay" them when the file is
4581ad6265SDimitry Andric /// included.
4681ad6265SDimitry Andric ///
4781ad6265SDimitry Andric /// Note that the behavior of the raw lexer is affected by the language mode,
4881ad6265SDimitry Andric /// while at this point we want to do a scan and collect tokens once,
4981ad6265SDimitry Andric /// irrespective of the language mode that the file will get included in. To
5081ad6265SDimitry Andric /// compensate for that the \p Lexer, while "replaying", will adjust a token
5181ad6265SDimitry Andric /// where appropriate, when it could affect the preprocessor's state.
5281ad6265SDimitry Andric /// For example in a directive like
5381ad6265SDimitry Andric ///
5481ad6265SDimitry Andric /// \code
5581ad6265SDimitry Andric ///   #if __has_cpp_attribute(clang::fallthrough)
5681ad6265SDimitry Andric /// \endcode
5781ad6265SDimitry Andric ///
5881ad6265SDimitry Andric /// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2
5981ad6265SDimitry Andric /// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon'
6081ad6265SDimitry Andric /// while in C++ mode.
6181ad6265SDimitry Andric struct Scanner {
6281ad6265SDimitry Andric   Scanner(StringRef Input,
6381ad6265SDimitry Andric           SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
6481ad6265SDimitry Andric           DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)
6581ad6265SDimitry Andric       : Input(Input), Tokens(Tokens), Diags(Diags),
6681ad6265SDimitry Andric         InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()),
6781ad6265SDimitry Andric         TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
6881ad6265SDimitry Andric                  Input.end()) {}
6981ad6265SDimitry Andric 
7081ad6265SDimitry Andric   static LangOptions getLangOptsForDepScanning() {
7181ad6265SDimitry Andric     LangOptions LangOpts;
7281ad6265SDimitry Andric     // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
7381ad6265SDimitry Andric     LangOpts.ObjC = true;
7481ad6265SDimitry Andric     LangOpts.LineComment = true;
7581ad6265SDimitry Andric     return LangOpts;
7681ad6265SDimitry Andric   }
7781ad6265SDimitry Andric 
7881ad6265SDimitry Andric   /// Lex the provided source and emit the directive tokens.
7981ad6265SDimitry Andric   ///
8081ad6265SDimitry Andric   /// \returns True on error.
8181ad6265SDimitry Andric   bool scan(SmallVectorImpl<Directive> &Directives);
8281ad6265SDimitry Andric 
8381ad6265SDimitry Andric private:
8481ad6265SDimitry Andric   /// Lexes next token and advances \p First and the \p Lexer.
85*bdd1243dSDimitry Andric   [[nodiscard]] dependency_directives_scan::Token &
8681ad6265SDimitry Andric   lexToken(const char *&First, const char *const End);
8781ad6265SDimitry Andric 
8881ad6265SDimitry Andric   dependency_directives_scan::Token &lexIncludeFilename(const char *&First,
8981ad6265SDimitry Andric                                                         const char *const End);
9081ad6265SDimitry Andric 
91*bdd1243dSDimitry Andric   void skipLine(const char *&First, const char *const End);
92*bdd1243dSDimitry Andric   void skipDirective(StringRef Name, const char *&First, const char *const End);
93*bdd1243dSDimitry Andric 
9481ad6265SDimitry Andric   /// Lexes next token and if it is identifier returns its string, otherwise
95*bdd1243dSDimitry Andric   /// it skips the current line and returns \p std::nullopt.
9681ad6265SDimitry Andric   ///
9781ad6265SDimitry Andric   /// In any case (whatever the token kind) \p First and the \p Lexer will
9881ad6265SDimitry Andric   /// advance beyond the token.
99*bdd1243dSDimitry Andric   [[nodiscard]] std::optional<StringRef>
10081ad6265SDimitry Andric   tryLexIdentifierOrSkipLine(const char *&First, const char *const End);
10181ad6265SDimitry Andric 
10281ad6265SDimitry Andric   /// Used when it is certain that next token is an identifier.
103*bdd1243dSDimitry Andric   [[nodiscard]] StringRef lexIdentifier(const char *&First,
10481ad6265SDimitry Andric                                         const char *const End);
10581ad6265SDimitry Andric 
10681ad6265SDimitry Andric   /// Lexes next token and returns true iff it is an identifier that matches \p
10781ad6265SDimitry Andric   /// Id, otherwise it skips the current line and returns false.
10881ad6265SDimitry Andric   ///
10981ad6265SDimitry Andric   /// In any case (whatever the token kind) \p First and the \p Lexer will
11081ad6265SDimitry Andric   /// advance beyond the token.
111*bdd1243dSDimitry Andric   [[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id,
11281ad6265SDimitry Andric                                                 const char *&First,
11381ad6265SDimitry Andric                                                 const char *const End);
11481ad6265SDimitry Andric 
115*bdd1243dSDimitry Andric   [[nodiscard]] bool scanImpl(const char *First, const char *const End);
116*bdd1243dSDimitry Andric   [[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
117*bdd1243dSDimitry Andric   [[nodiscard]] bool lexAt(const char *&First, const char *const End);
118*bdd1243dSDimitry Andric   [[nodiscard]] bool lexModule(const char *&First, const char *const End);
119*bdd1243dSDimitry Andric   [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
12081ad6265SDimitry Andric                                const char *const End);
121*bdd1243dSDimitry Andric   [[nodiscard]] bool lexPragma(const char *&First, const char *const End);
122*bdd1243dSDimitry Andric   [[nodiscard]] bool lexEndif(const char *&First, const char *const End);
123*bdd1243dSDimitry Andric   [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
12481ad6265SDimitry Andric                                 const char *const End);
125*bdd1243dSDimitry Andric   [[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind,
12681ad6265SDimitry Andric                                             const char *&First,
12781ad6265SDimitry Andric                                             const char *const End);
12881ad6265SDimitry Andric   void lexPPDirectiveBody(const char *&First, const char *const End);
12981ad6265SDimitry Andric 
13081ad6265SDimitry Andric   DirectiveWithTokens &pushDirective(DirectiveKind Kind) {
13181ad6265SDimitry Andric     Tokens.append(CurDirToks);
13281ad6265SDimitry Andric     DirsWithToks.emplace_back(Kind, CurDirToks.size());
13381ad6265SDimitry Andric     CurDirToks.clear();
13481ad6265SDimitry Andric     return DirsWithToks.back();
13581ad6265SDimitry Andric   }
13681ad6265SDimitry Andric   void popDirective() {
13781ad6265SDimitry Andric     Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);
13881ad6265SDimitry Andric   }
13981ad6265SDimitry Andric   DirectiveKind topDirective() const {
14081ad6265SDimitry Andric     return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind;
14181ad6265SDimitry Andric   }
14281ad6265SDimitry Andric 
14381ad6265SDimitry Andric   unsigned getOffsetAt(const char *CurPtr) const {
14481ad6265SDimitry Andric     return CurPtr - Input.data();
14581ad6265SDimitry Andric   }
14681ad6265SDimitry Andric 
14781ad6265SDimitry Andric   /// Reports a diagnostic if the diagnostic engine is provided. Always returns
14881ad6265SDimitry Andric   /// true at the end.
14981ad6265SDimitry Andric   bool reportError(const char *CurPtr, unsigned Err);
15081ad6265SDimitry Andric 
15181ad6265SDimitry Andric   StringMap<char> SplitIds;
15281ad6265SDimitry Andric   StringRef Input;
15381ad6265SDimitry Andric   SmallVectorImpl<dependency_directives_scan::Token> &Tokens;
15481ad6265SDimitry Andric   DiagnosticsEngine *Diags;
15581ad6265SDimitry Andric   SourceLocation InputSourceLoc;
15681ad6265SDimitry Andric 
157*bdd1243dSDimitry Andric   const char *LastTokenPtr = nullptr;
15881ad6265SDimitry Andric   /// Keeps track of the tokens for the currently lexed directive. Once a
15981ad6265SDimitry Andric   /// directive is fully lexed and "committed" then the tokens get appended to
16081ad6265SDimitry Andric   /// \p Tokens and \p CurDirToks is cleared for the next directive.
16181ad6265SDimitry Andric   SmallVector<dependency_directives_scan::Token, 32> CurDirToks;
16281ad6265SDimitry Andric   /// The directives that were lexed along with the number of tokens that each
16381ad6265SDimitry Andric   /// directive contains. The tokens of all the directives are kept in \p Tokens
16481ad6265SDimitry Andric   /// vector, in the same order as the directives order in \p DirsWithToks.
16581ad6265SDimitry Andric   SmallVector<DirectiveWithTokens, 64> DirsWithToks;
16681ad6265SDimitry Andric   LangOptions LangOpts;
16781ad6265SDimitry Andric   Lexer TheLexer;
16881ad6265SDimitry Andric };
16981ad6265SDimitry Andric 
17081ad6265SDimitry Andric } // end anonymous namespace
17181ad6265SDimitry Andric 
17281ad6265SDimitry Andric bool Scanner::reportError(const char *CurPtr, unsigned Err) {
17381ad6265SDimitry Andric   if (!Diags)
17481ad6265SDimitry Andric     return true;
17581ad6265SDimitry Andric   assert(CurPtr >= Input.data() && "invalid buffer ptr");
17681ad6265SDimitry Andric   Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err);
17781ad6265SDimitry Andric   return true;
17881ad6265SDimitry Andric }
17981ad6265SDimitry Andric 
18081ad6265SDimitry Andric static void skipOverSpaces(const char *&First, const char *const End) {
18181ad6265SDimitry Andric   while (First != End && isHorizontalWhitespace(*First))
18281ad6265SDimitry Andric     ++First;
18381ad6265SDimitry Andric }
18481ad6265SDimitry Andric 
185*bdd1243dSDimitry Andric [[nodiscard]] static bool isRawStringLiteral(const char *First,
18681ad6265SDimitry Andric                                              const char *Current) {
18781ad6265SDimitry Andric   assert(First <= Current);
18881ad6265SDimitry Andric 
18981ad6265SDimitry Andric   // Check if we can even back up.
19081ad6265SDimitry Andric   if (*Current != '"' || First == Current)
19181ad6265SDimitry Andric     return false;
19281ad6265SDimitry Andric 
19381ad6265SDimitry Andric   // Check for an "R".
19481ad6265SDimitry Andric   --Current;
19581ad6265SDimitry Andric   if (*Current != 'R')
19681ad6265SDimitry Andric     return false;
19781ad6265SDimitry Andric   if (First == Current || !isAsciiIdentifierContinue(*--Current))
19881ad6265SDimitry Andric     return true;
19981ad6265SDimitry Andric 
20081ad6265SDimitry Andric   // Check for a prefix of "u", "U", or "L".
20181ad6265SDimitry Andric   if (*Current == 'u' || *Current == 'U' || *Current == 'L')
20281ad6265SDimitry Andric     return First == Current || !isAsciiIdentifierContinue(*--Current);
20381ad6265SDimitry Andric 
20481ad6265SDimitry Andric   // Check for a prefix of "u8".
20581ad6265SDimitry Andric   if (*Current != '8' || First == Current || *Current-- != 'u')
20681ad6265SDimitry Andric     return false;
20781ad6265SDimitry Andric   return First == Current || !isAsciiIdentifierContinue(*--Current);
20881ad6265SDimitry Andric }
20981ad6265SDimitry Andric 
21081ad6265SDimitry Andric static void skipRawString(const char *&First, const char *const End) {
21181ad6265SDimitry Andric   assert(First[0] == '"');
21281ad6265SDimitry Andric   assert(First[-1] == 'R');
21381ad6265SDimitry Andric 
21481ad6265SDimitry Andric   const char *Last = ++First;
21581ad6265SDimitry Andric   while (Last != End && *Last != '(')
21681ad6265SDimitry Andric     ++Last;
21781ad6265SDimitry Andric   if (Last == End) {
21881ad6265SDimitry Andric     First = Last; // Hit the end... just give up.
21981ad6265SDimitry Andric     return;
22081ad6265SDimitry Andric   }
22181ad6265SDimitry Andric 
22281ad6265SDimitry Andric   StringRef Terminator(First, Last - First);
22381ad6265SDimitry Andric   for (;;) {
22481ad6265SDimitry Andric     // Move First to just past the next ")".
22581ad6265SDimitry Andric     First = Last;
22681ad6265SDimitry Andric     while (First != End && *First != ')')
22781ad6265SDimitry Andric       ++First;
22881ad6265SDimitry Andric     if (First == End)
22981ad6265SDimitry Andric       return;
23081ad6265SDimitry Andric     ++First;
23181ad6265SDimitry Andric 
23281ad6265SDimitry Andric     // Look ahead for the terminator sequence.
23381ad6265SDimitry Andric     Last = First;
23481ad6265SDimitry Andric     while (Last != End && size_t(Last - First) < Terminator.size() &&
23581ad6265SDimitry Andric            Terminator[Last - First] == *Last)
23681ad6265SDimitry Andric       ++Last;
23781ad6265SDimitry Andric 
23881ad6265SDimitry Andric     // Check if we hit it (or the end of the file).
23981ad6265SDimitry Andric     if (Last == End) {
24081ad6265SDimitry Andric       First = Last;
24181ad6265SDimitry Andric       return;
24281ad6265SDimitry Andric     }
24381ad6265SDimitry Andric     if (size_t(Last - First) < Terminator.size())
24481ad6265SDimitry Andric       continue;
24581ad6265SDimitry Andric     if (*Last != '"')
24681ad6265SDimitry Andric       continue;
24781ad6265SDimitry Andric     First = Last + 1;
24881ad6265SDimitry Andric     return;
24981ad6265SDimitry Andric   }
25081ad6265SDimitry Andric }
25181ad6265SDimitry Andric 
25281ad6265SDimitry Andric // Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n)
25381ad6265SDimitry Andric static unsigned isEOL(const char *First, const char *const End) {
25481ad6265SDimitry Andric   if (First == End)
25581ad6265SDimitry Andric     return 0;
25681ad6265SDimitry Andric   if (End - First > 1 && isVerticalWhitespace(First[0]) &&
25781ad6265SDimitry Andric       isVerticalWhitespace(First[1]) && First[0] != First[1])
25881ad6265SDimitry Andric     return 2;
25981ad6265SDimitry Andric   return !!isVerticalWhitespace(First[0]);
26081ad6265SDimitry Andric }
26181ad6265SDimitry Andric 
26281ad6265SDimitry Andric static void skipString(const char *&First, const char *const End) {
26381ad6265SDimitry Andric   assert(*First == '\'' || *First == '"' || *First == '<');
26481ad6265SDimitry Andric   const char Terminator = *First == '<' ? '>' : *First;
26581ad6265SDimitry Andric   for (++First; First != End && *First != Terminator; ++First) {
26681ad6265SDimitry Andric     // String and character literals don't extend past the end of the line.
26781ad6265SDimitry Andric     if (isVerticalWhitespace(*First))
26881ad6265SDimitry Andric       return;
26981ad6265SDimitry Andric     if (*First != '\\')
27081ad6265SDimitry Andric       continue;
27181ad6265SDimitry Andric     // Skip past backslash to the next character. This ensures that the
27281ad6265SDimitry Andric     // character right after it is skipped as well, which matters if it's
27381ad6265SDimitry Andric     // the terminator.
27481ad6265SDimitry Andric     if (++First == End)
27581ad6265SDimitry Andric       return;
27681ad6265SDimitry Andric     if (!isWhitespace(*First))
27781ad6265SDimitry Andric       continue;
27881ad6265SDimitry Andric     // Whitespace after the backslash might indicate a line continuation.
27981ad6265SDimitry Andric     const char *FirstAfterBackslashPastSpace = First;
28081ad6265SDimitry Andric     skipOverSpaces(FirstAfterBackslashPastSpace, End);
28181ad6265SDimitry Andric     if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) {
28281ad6265SDimitry Andric       // Advance the character pointer to the next line for the next
28381ad6265SDimitry Andric       // iteration.
28481ad6265SDimitry Andric       First = FirstAfterBackslashPastSpace + NLSize - 1;
28581ad6265SDimitry Andric     }
28681ad6265SDimitry Andric   }
28781ad6265SDimitry Andric   if (First != End)
28881ad6265SDimitry Andric     ++First; // Finish off the string.
28981ad6265SDimitry Andric }
29081ad6265SDimitry Andric 
29181ad6265SDimitry Andric // Returns the length of the skipped newline
29281ad6265SDimitry Andric static unsigned skipNewline(const char *&First, const char *End) {
29381ad6265SDimitry Andric   if (First == End)
29481ad6265SDimitry Andric     return 0;
29581ad6265SDimitry Andric   assert(isVerticalWhitespace(*First));
29681ad6265SDimitry Andric   unsigned Len = isEOL(First, End);
29781ad6265SDimitry Andric   assert(Len && "expected newline");
29881ad6265SDimitry Andric   First += Len;
29981ad6265SDimitry Andric   return Len;
30081ad6265SDimitry Andric }
30181ad6265SDimitry Andric 
/// Given \p First pointing just past an end-of-line sequence of length
/// \p EOLLen, tells whether the character right before that sequence is a
/// backslash.
static bool wasLineContinuation(const char *First, unsigned EOLLen) {
  const char *const BeforeEOL = First - static_cast<int>(EOLLen) - 1;
  return *BeforeEOL == '\\';
}
30581ad6265SDimitry Andric 
30681ad6265SDimitry Andric static void skipToNewlineRaw(const char *&First, const char *const End) {
30781ad6265SDimitry Andric   for (;;) {
30881ad6265SDimitry Andric     if (First == End)
30981ad6265SDimitry Andric       return;
31081ad6265SDimitry Andric 
31181ad6265SDimitry Andric     unsigned Len = isEOL(First, End);
31281ad6265SDimitry Andric     if (Len)
31381ad6265SDimitry Andric       return;
31481ad6265SDimitry Andric 
31581ad6265SDimitry Andric     do {
31681ad6265SDimitry Andric       if (++First == End)
31781ad6265SDimitry Andric         return;
31881ad6265SDimitry Andric       Len = isEOL(First, End);
31981ad6265SDimitry Andric     } while (!Len);
32081ad6265SDimitry Andric 
32181ad6265SDimitry Andric     if (First[-1] != '\\')
32281ad6265SDimitry Andric       return;
32381ad6265SDimitry Andric 
32481ad6265SDimitry Andric     First += Len;
32581ad6265SDimitry Andric     // Keep skipping lines...
32681ad6265SDimitry Andric   }
32781ad6265SDimitry Andric }
32881ad6265SDimitry Andric 
32981ad6265SDimitry Andric static void skipLineComment(const char *&First, const char *const End) {
33081ad6265SDimitry Andric   assert(First[0] == '/' && First[1] == '/');
33181ad6265SDimitry Andric   First += 2;
33281ad6265SDimitry Andric   skipToNewlineRaw(First, End);
33381ad6265SDimitry Andric }
33481ad6265SDimitry Andric 
/// Advances \p First, which must point at "/*", to just past the matching
/// "*/". If the comment is not closed within the buffer, \p First ends up at
/// \p End.
static void skipBlockComment(const char *&First, const char *const End) {
  assert(First[0] == '/' && First[1] == '*');

  // The shortest complete comment, "/**/", needs four characters.
  if (End - First < 4) {
    First = End;
    return;
  }

  // Start at the earliest position where the closing '/' can be (index 3) and
  // look for a '/' that is directly preceded by '*'.
  const char *Cur = First + 3;
  while (Cur != End && !(Cur[-1] == '*' && Cur[0] == '/'))
    ++Cur;

  First = (Cur == End) ? End : Cur + 1;
}
34781ad6265SDimitry Andric 
34881ad6265SDimitry Andric /// \returns True if the current single quotation mark character is a C++ 14
34981ad6265SDimitry Andric /// digit separator.
35081ad6265SDimitry Andric static bool isQuoteCppDigitSeparator(const char *const Start,
35181ad6265SDimitry Andric                                      const char *const Cur,
35281ad6265SDimitry Andric                                      const char *const End) {
35381ad6265SDimitry Andric   assert(*Cur == '\'' && "expected quotation character");
35481ad6265SDimitry Andric   // skipLine called in places where we don't expect a valid number
35581ad6265SDimitry Andric   // body before `start` on the same line, so always return false at the start.
35681ad6265SDimitry Andric   if (Start == Cur)
35781ad6265SDimitry Andric     return false;
35881ad6265SDimitry Andric   // The previous character must be a valid PP number character.
35981ad6265SDimitry Andric   // Make sure that the L, u, U, u8 prefixes don't get marked as a
36081ad6265SDimitry Andric   // separator though.
36181ad6265SDimitry Andric   char Prev = *(Cur - 1);
36281ad6265SDimitry Andric   if (Prev == 'L' || Prev == 'U' || Prev == 'u')
36381ad6265SDimitry Andric     return false;
36481ad6265SDimitry Andric   if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u')
36581ad6265SDimitry Andric     return false;
36681ad6265SDimitry Andric   if (!isPreprocessingNumberBody(Prev))
36781ad6265SDimitry Andric     return false;
36881ad6265SDimitry Andric   // The next character should be a valid identifier body character.
36981ad6265SDimitry Andric   return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));
37081ad6265SDimitry Andric }
37181ad6265SDimitry Andric 
372*bdd1243dSDimitry Andric void Scanner::skipLine(const char *&First, const char *const End) {
37381ad6265SDimitry Andric   for (;;) {
37481ad6265SDimitry Andric     assert(First <= End);
37581ad6265SDimitry Andric     if (First == End)
37681ad6265SDimitry Andric       return;
37781ad6265SDimitry Andric 
37881ad6265SDimitry Andric     if (isVerticalWhitespace(*First)) {
37981ad6265SDimitry Andric       skipNewline(First, End);
38081ad6265SDimitry Andric       return;
38181ad6265SDimitry Andric     }
38281ad6265SDimitry Andric     const char *Start = First;
38381ad6265SDimitry Andric     while (First != End && !isVerticalWhitespace(*First)) {
38481ad6265SDimitry Andric       // Iterate over strings correctly to avoid comments and newlines.
38581ad6265SDimitry Andric       if (*First == '"' ||
38681ad6265SDimitry Andric           (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) {
387*bdd1243dSDimitry Andric         LastTokenPtr = First;
38881ad6265SDimitry Andric         if (isRawStringLiteral(Start, First))
38981ad6265SDimitry Andric           skipRawString(First, End);
39081ad6265SDimitry Andric         else
39181ad6265SDimitry Andric           skipString(First, End);
39281ad6265SDimitry Andric         continue;
39381ad6265SDimitry Andric       }
39481ad6265SDimitry Andric 
39581ad6265SDimitry Andric       // Iterate over comments correctly.
39681ad6265SDimitry Andric       if (*First != '/' || End - First < 2) {
397*bdd1243dSDimitry Andric         LastTokenPtr = First;
39881ad6265SDimitry Andric         ++First;
39981ad6265SDimitry Andric         continue;
40081ad6265SDimitry Andric       }
40181ad6265SDimitry Andric 
40281ad6265SDimitry Andric       if (First[1] == '/') {
40381ad6265SDimitry Andric         // "//...".
40481ad6265SDimitry Andric         skipLineComment(First, End);
40581ad6265SDimitry Andric         continue;
40681ad6265SDimitry Andric       }
40781ad6265SDimitry Andric 
40881ad6265SDimitry Andric       if (First[1] != '*') {
409*bdd1243dSDimitry Andric         LastTokenPtr = First;
41081ad6265SDimitry Andric         ++First;
41181ad6265SDimitry Andric         continue;
41281ad6265SDimitry Andric       }
41381ad6265SDimitry Andric 
41481ad6265SDimitry Andric       // "/*...*/".
41581ad6265SDimitry Andric       skipBlockComment(First, End);
41681ad6265SDimitry Andric     }
41781ad6265SDimitry Andric     if (First == End)
41881ad6265SDimitry Andric       return;
41981ad6265SDimitry Andric 
42081ad6265SDimitry Andric     // Skip over the newline.
42181ad6265SDimitry Andric     unsigned Len = skipNewline(First, End);
42281ad6265SDimitry Andric     if (!wasLineContinuation(First, Len)) // Continue past line-continuations.
42381ad6265SDimitry Andric       break;
42481ad6265SDimitry Andric   }
42581ad6265SDimitry Andric }
42681ad6265SDimitry Andric 
427*bdd1243dSDimitry Andric void Scanner::skipDirective(StringRef Name, const char *&First,
42881ad6265SDimitry Andric                             const char *const End) {
42981ad6265SDimitry Andric   if (llvm::StringSwitch<bool>(Name)
43081ad6265SDimitry Andric           .Case("warning", true)
43181ad6265SDimitry Andric           .Case("error", true)
43281ad6265SDimitry Andric           .Default(false))
43381ad6265SDimitry Andric     // Do not process quotes or comments.
43481ad6265SDimitry Andric     skipToNewlineRaw(First, End);
43581ad6265SDimitry Andric   else
43681ad6265SDimitry Andric     skipLine(First, End);
43781ad6265SDimitry Andric }
43881ad6265SDimitry Andric 
43981ad6265SDimitry Andric static void skipWhitespace(const char *&First, const char *const End) {
44081ad6265SDimitry Andric   for (;;) {
44181ad6265SDimitry Andric     assert(First <= End);
44281ad6265SDimitry Andric     skipOverSpaces(First, End);
44381ad6265SDimitry Andric 
44481ad6265SDimitry Andric     if (End - First < 2)
44581ad6265SDimitry Andric       return;
44681ad6265SDimitry Andric 
44781ad6265SDimitry Andric     if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
44881ad6265SDimitry Andric       skipNewline(++First, End);
44981ad6265SDimitry Andric       continue;
45081ad6265SDimitry Andric     }
45181ad6265SDimitry Andric 
45281ad6265SDimitry Andric     // Check for a non-comment character.
45381ad6265SDimitry Andric     if (First[0] != '/')
45481ad6265SDimitry Andric       return;
45581ad6265SDimitry Andric 
45681ad6265SDimitry Andric     // "// ...".
45781ad6265SDimitry Andric     if (First[1] == '/') {
45881ad6265SDimitry Andric       skipLineComment(First, End);
45981ad6265SDimitry Andric       return;
46081ad6265SDimitry Andric     }
46181ad6265SDimitry Andric 
46281ad6265SDimitry Andric     // Cannot be a comment.
46381ad6265SDimitry Andric     if (First[1] != '*')
46481ad6265SDimitry Andric       return;
46581ad6265SDimitry Andric 
46681ad6265SDimitry Andric     // "/*...*/".
46781ad6265SDimitry Andric     skipBlockComment(First, End);
46881ad6265SDimitry Andric   }
46981ad6265SDimitry Andric }
47081ad6265SDimitry Andric 
47181ad6265SDimitry Andric bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,
47281ad6265SDimitry Andric                                      const char *const End) {
47381ad6265SDimitry Andric   const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;
47481ad6265SDimitry Andric   for (;;) {
47581ad6265SDimitry Andric     const dependency_directives_scan::Token &Tok = lexToken(First, End);
47681ad6265SDimitry Andric     if (Tok.is(tok::eof))
47781ad6265SDimitry Andric       return reportError(
47881ad6265SDimitry Andric           DirectiveLoc,
47981ad6265SDimitry Andric           diag::err_dep_source_scanner_missing_semi_after_at_import);
48081ad6265SDimitry Andric     if (Tok.is(tok::semi))
48181ad6265SDimitry Andric       break;
48281ad6265SDimitry Andric   }
48381ad6265SDimitry Andric   pushDirective(Kind);
48481ad6265SDimitry Andric   skipWhitespace(First, End);
48581ad6265SDimitry Andric   if (First == End)
48681ad6265SDimitry Andric     return false;
48781ad6265SDimitry Andric   if (!isVerticalWhitespace(*First))
48881ad6265SDimitry Andric     return reportError(
48981ad6265SDimitry Andric         DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import);
49081ad6265SDimitry Andric   skipNewline(First, End);
49181ad6265SDimitry Andric   return false;
49281ad6265SDimitry Andric }
49381ad6265SDimitry Andric 
49481ad6265SDimitry Andric dependency_directives_scan::Token &Scanner::lexToken(const char *&First,
49581ad6265SDimitry Andric                                                      const char *const End) {
49681ad6265SDimitry Andric   clang::Token Tok;
49781ad6265SDimitry Andric   TheLexer.LexFromRawLexer(Tok);
49881ad6265SDimitry Andric   First = Input.data() + TheLexer.getCurrentBufferOffset();
49981ad6265SDimitry Andric   assert(First <= End);
50081ad6265SDimitry Andric 
50181ad6265SDimitry Andric   unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
50281ad6265SDimitry Andric   CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
50381ad6265SDimitry Andric                           Tok.getFlags());
50481ad6265SDimitry Andric   return CurDirToks.back();
50581ad6265SDimitry Andric }
50681ad6265SDimitry Andric 
50781ad6265SDimitry Andric dependency_directives_scan::Token &
50881ad6265SDimitry Andric Scanner::lexIncludeFilename(const char *&First, const char *const End) {
50981ad6265SDimitry Andric   clang::Token Tok;
51081ad6265SDimitry Andric   TheLexer.LexIncludeFilename(Tok);
51181ad6265SDimitry Andric   First = Input.data() + TheLexer.getCurrentBufferOffset();
51281ad6265SDimitry Andric   assert(First <= End);
51381ad6265SDimitry Andric 
51481ad6265SDimitry Andric   unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
51581ad6265SDimitry Andric   CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
51681ad6265SDimitry Andric                           Tok.getFlags());
51781ad6265SDimitry Andric   return CurDirToks.back();
51881ad6265SDimitry Andric }
51981ad6265SDimitry Andric 
52081ad6265SDimitry Andric void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
52181ad6265SDimitry Andric   while (true) {
52281ad6265SDimitry Andric     const dependency_directives_scan::Token &Tok = lexToken(First, End);
52381ad6265SDimitry Andric     if (Tok.is(tok::eod))
52481ad6265SDimitry Andric       break;
52581ad6265SDimitry Andric   }
52681ad6265SDimitry Andric }
52781ad6265SDimitry Andric 
528*bdd1243dSDimitry Andric [[nodiscard]] std::optional<StringRef>
52981ad6265SDimitry Andric Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
53081ad6265SDimitry Andric   const dependency_directives_scan::Token &Tok = lexToken(First, End);
53181ad6265SDimitry Andric   if (Tok.isNot(tok::raw_identifier)) {
53281ad6265SDimitry Andric     if (!Tok.is(tok::eod))
53381ad6265SDimitry Andric       skipLine(First, End);
534*bdd1243dSDimitry Andric     return std::nullopt;
53581ad6265SDimitry Andric   }
53681ad6265SDimitry Andric 
53781ad6265SDimitry Andric   bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
53881ad6265SDimitry Andric   if (LLVM_LIKELY(!NeedsCleaning))
53981ad6265SDimitry Andric     return Input.slice(Tok.Offset, Tok.getEnd());
54081ad6265SDimitry Andric 
54181ad6265SDimitry Andric   SmallString<64> Spelling;
54281ad6265SDimitry Andric   Spelling.resize(Tok.Length);
54381ad6265SDimitry Andric 
54481ad6265SDimitry Andric   unsigned SpellingLength = 0;
54581ad6265SDimitry Andric   const char *BufPtr = Input.begin() + Tok.Offset;
54681ad6265SDimitry Andric   const char *AfterIdent = Input.begin() + Tok.getEnd();
54781ad6265SDimitry Andric   while (BufPtr < AfterIdent) {
54881ad6265SDimitry Andric     unsigned Size;
54981ad6265SDimitry Andric     Spelling[SpellingLength++] =
55081ad6265SDimitry Andric         Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
55181ad6265SDimitry Andric     BufPtr += Size;
55281ad6265SDimitry Andric   }
55381ad6265SDimitry Andric 
55481ad6265SDimitry Andric   return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
55581ad6265SDimitry Andric       .first->first();
55681ad6265SDimitry Andric }
55781ad6265SDimitry Andric 
55881ad6265SDimitry Andric StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
559*bdd1243dSDimitry Andric   std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
56081ad6265SDimitry Andric   assert(Id && "expected identifier token");
561*bdd1243dSDimitry Andric   return *Id;
56281ad6265SDimitry Andric }
56381ad6265SDimitry Andric 
56481ad6265SDimitry Andric bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
56581ad6265SDimitry Andric                                          const char *const End) {
566*bdd1243dSDimitry Andric   if (std::optional<StringRef> FoundId =
567*bdd1243dSDimitry Andric           tryLexIdentifierOrSkipLine(First, End)) {
56881ad6265SDimitry Andric     if (*FoundId == Id)
56981ad6265SDimitry Andric       return true;
57081ad6265SDimitry Andric     skipLine(First, End);
57181ad6265SDimitry Andric   }
57281ad6265SDimitry Andric   return false;
57381ad6265SDimitry Andric }
57481ad6265SDimitry Andric 
57581ad6265SDimitry Andric bool Scanner::lexAt(const char *&First, const char *const End) {
57681ad6265SDimitry Andric   // Handle "@import".
57781ad6265SDimitry Andric 
57881ad6265SDimitry Andric   // Lex '@'.
57981ad6265SDimitry Andric   const dependency_directives_scan::Token &AtTok = lexToken(First, End);
58081ad6265SDimitry Andric   assert(AtTok.is(tok::at));
58181ad6265SDimitry Andric   (void)AtTok;
58281ad6265SDimitry Andric 
58381ad6265SDimitry Andric   if (!isNextIdentifierOrSkipLine("import", First, End))
58481ad6265SDimitry Andric     return false;
58581ad6265SDimitry Andric   return lexModuleDirectiveBody(decl_at_import, First, End);
58681ad6265SDimitry Andric }
58781ad6265SDimitry Andric 
58881ad6265SDimitry Andric bool Scanner::lexModule(const char *&First, const char *const End) {
58981ad6265SDimitry Andric   StringRef Id = lexIdentifier(First, End);
59081ad6265SDimitry Andric   bool Export = false;
59181ad6265SDimitry Andric   if (Id == "export") {
59281ad6265SDimitry Andric     Export = true;
593*bdd1243dSDimitry Andric     std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End);
59481ad6265SDimitry Andric     if (!NextId)
59581ad6265SDimitry Andric       return false;
59681ad6265SDimitry Andric     Id = *NextId;
59781ad6265SDimitry Andric   }
59881ad6265SDimitry Andric 
59981ad6265SDimitry Andric   if (Id != "module" && Id != "import") {
60081ad6265SDimitry Andric     skipLine(First, End);
60181ad6265SDimitry Andric     return false;
60281ad6265SDimitry Andric   }
60381ad6265SDimitry Andric 
60481ad6265SDimitry Andric   skipWhitespace(First, End);
60581ad6265SDimitry Andric 
60681ad6265SDimitry Andric   // Ignore this as a module directive if the next character can't be part of
60781ad6265SDimitry Andric   // an import.
60881ad6265SDimitry Andric 
60981ad6265SDimitry Andric   switch (*First) {
61081ad6265SDimitry Andric   case ':':
61181ad6265SDimitry Andric   case '<':
61281ad6265SDimitry Andric   case '"':
61381ad6265SDimitry Andric     break;
61481ad6265SDimitry Andric   default:
61581ad6265SDimitry Andric     if (!isAsciiIdentifierContinue(*First)) {
61681ad6265SDimitry Andric       skipLine(First, End);
61781ad6265SDimitry Andric       return false;
61881ad6265SDimitry Andric     }
61981ad6265SDimitry Andric   }
62081ad6265SDimitry Andric 
62181ad6265SDimitry Andric   TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false);
62281ad6265SDimitry Andric 
62381ad6265SDimitry Andric   DirectiveKind Kind;
62481ad6265SDimitry Andric   if (Id == "module")
62581ad6265SDimitry Andric     Kind = Export ? cxx_export_module_decl : cxx_module_decl;
62681ad6265SDimitry Andric   else
62781ad6265SDimitry Andric     Kind = Export ? cxx_export_import_decl : cxx_import_decl;
62881ad6265SDimitry Andric 
62981ad6265SDimitry Andric   return lexModuleDirectiveBody(Kind, First, End);
63081ad6265SDimitry Andric }
63181ad6265SDimitry Andric 
63281ad6265SDimitry Andric bool Scanner::lexPragma(const char *&First, const char *const End) {
633*bdd1243dSDimitry Andric   std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
63481ad6265SDimitry Andric   if (!FoundId)
63581ad6265SDimitry Andric     return false;
63681ad6265SDimitry Andric 
63781ad6265SDimitry Andric   StringRef Id = *FoundId;
63881ad6265SDimitry Andric   auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
63981ad6265SDimitry Andric                   .Case("once", pp_pragma_once)
64081ad6265SDimitry Andric                   .Case("push_macro", pp_pragma_push_macro)
64181ad6265SDimitry Andric                   .Case("pop_macro", pp_pragma_pop_macro)
64281ad6265SDimitry Andric                   .Case("include_alias", pp_pragma_include_alias)
64381ad6265SDimitry Andric                   .Default(pp_none);
64481ad6265SDimitry Andric   if (Kind != pp_none) {
64581ad6265SDimitry Andric     lexPPDirectiveBody(First, End);
64681ad6265SDimitry Andric     pushDirective(Kind);
64781ad6265SDimitry Andric     return false;
64881ad6265SDimitry Andric   }
64981ad6265SDimitry Andric 
65081ad6265SDimitry Andric   if (Id != "clang") {
65181ad6265SDimitry Andric     skipLine(First, End);
65281ad6265SDimitry Andric     return false;
65381ad6265SDimitry Andric   }
65481ad6265SDimitry Andric 
65581ad6265SDimitry Andric   // #pragma clang.
65681ad6265SDimitry Andric   if (!isNextIdentifierOrSkipLine("module", First, End))
65781ad6265SDimitry Andric     return false;
65881ad6265SDimitry Andric 
65981ad6265SDimitry Andric   // #pragma clang module.
66081ad6265SDimitry Andric   if (!isNextIdentifierOrSkipLine("import", First, End))
66181ad6265SDimitry Andric     return false;
66281ad6265SDimitry Andric 
66381ad6265SDimitry Andric   // #pragma clang module import.
66481ad6265SDimitry Andric   lexPPDirectiveBody(First, End);
66581ad6265SDimitry Andric   pushDirective(pp_pragma_import);
66681ad6265SDimitry Andric   return false;
66781ad6265SDimitry Andric }
66881ad6265SDimitry Andric 
66981ad6265SDimitry Andric bool Scanner::lexEndif(const char *&First, const char *const End) {
67081ad6265SDimitry Andric   // Strip out "#else" if it's empty.
67181ad6265SDimitry Andric   if (topDirective() == pp_else)
67281ad6265SDimitry Andric     popDirective();
67381ad6265SDimitry Andric 
67481ad6265SDimitry Andric   // If "#ifdef" is empty, strip it and skip the "#endif".
67581ad6265SDimitry Andric   //
67681ad6265SDimitry Andric   // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
67781ad6265SDimitry Andric   // we can skip empty `#if` and `#elif` blocks as well after scanning for a
67881ad6265SDimitry Andric   // literal __has_include in the condition.  Even without that rule we could
67981ad6265SDimitry Andric   // drop the tokens if we scan for identifiers in the condition and find none.
68081ad6265SDimitry Andric   if (topDirective() == pp_ifdef || topDirective() == pp_ifndef) {
68181ad6265SDimitry Andric     popDirective();
68281ad6265SDimitry Andric     skipLine(First, End);
68381ad6265SDimitry Andric     return false;
68481ad6265SDimitry Andric   }
68581ad6265SDimitry Andric 
68681ad6265SDimitry Andric   return lexDefault(pp_endif, First, End);
68781ad6265SDimitry Andric }
68881ad6265SDimitry Andric 
68981ad6265SDimitry Andric bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
69081ad6265SDimitry Andric                          const char *const End) {
69181ad6265SDimitry Andric   lexPPDirectiveBody(First, End);
69281ad6265SDimitry Andric   pushDirective(Kind);
69381ad6265SDimitry Andric   return false;
69481ad6265SDimitry Andric }
69581ad6265SDimitry Andric 
/// Cheap first-character filter for lines worth lexing: '#' and '@', plus
/// 'i', 'e' and 'm', the first letters of "import", "export" and "module".
static bool isStartOfRelevantLine(char First) {
  return First == '#' || First == '@' || First == 'i' || First == 'e' ||
         First == 'm';
}
70781ad6265SDimitry Andric 
70881ad6265SDimitry Andric bool Scanner::lexPPLine(const char *&First, const char *const End) {
70981ad6265SDimitry Andric   assert(First != End);
71081ad6265SDimitry Andric 
71181ad6265SDimitry Andric   skipWhitespace(First, End);
71281ad6265SDimitry Andric   assert(First <= End);
71381ad6265SDimitry Andric   if (First == End)
71481ad6265SDimitry Andric     return false;
71581ad6265SDimitry Andric 
71681ad6265SDimitry Andric   if (!isStartOfRelevantLine(*First)) {
71781ad6265SDimitry Andric     skipLine(First, End);
71881ad6265SDimitry Andric     assert(First <= End);
71981ad6265SDimitry Andric     return false;
72081ad6265SDimitry Andric   }
72181ad6265SDimitry Andric 
722*bdd1243dSDimitry Andric   LastTokenPtr = First;
723*bdd1243dSDimitry Andric 
72481ad6265SDimitry Andric   TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);
72581ad6265SDimitry Andric 
72681ad6265SDimitry Andric   auto ScEx1 = make_scope_exit([&]() {
72781ad6265SDimitry Andric     /// Clear Scanner's CurDirToks before returning, in case we didn't push a
72881ad6265SDimitry Andric     /// new directive.
72981ad6265SDimitry Andric     CurDirToks.clear();
73081ad6265SDimitry Andric   });
73181ad6265SDimitry Andric 
73281ad6265SDimitry Andric   // Handle "@import".
73381ad6265SDimitry Andric   if (*First == '@')
73481ad6265SDimitry Andric     return lexAt(First, End);
73581ad6265SDimitry Andric 
73681ad6265SDimitry Andric   if (*First == 'i' || *First == 'e' || *First == 'm')
73781ad6265SDimitry Andric     return lexModule(First, End);
73881ad6265SDimitry Andric 
73981ad6265SDimitry Andric   // Handle preprocessing directives.
74081ad6265SDimitry Andric 
74181ad6265SDimitry Andric   TheLexer.setParsingPreprocessorDirective(true);
74281ad6265SDimitry Andric   auto ScEx2 = make_scope_exit(
74381ad6265SDimitry Andric       [&]() { TheLexer.setParsingPreprocessorDirective(false); });
74481ad6265SDimitry Andric 
74581ad6265SDimitry Andric   // Lex '#'.
74681ad6265SDimitry Andric   const dependency_directives_scan::Token &HashTok = lexToken(First, End);
747*bdd1243dSDimitry Andric   if (HashTok.is(tok::hashhash)) {
748*bdd1243dSDimitry Andric     // A \p tok::hashhash at this location is passed by the preprocessor to the
749*bdd1243dSDimitry Andric     // parser to interpret, like any other token. So for dependency scanning
750*bdd1243dSDimitry Andric     // skip it like a normal token not affecting the preprocessor.
751*bdd1243dSDimitry Andric     skipLine(First, End);
752*bdd1243dSDimitry Andric     assert(First <= End);
753*bdd1243dSDimitry Andric     return false;
754*bdd1243dSDimitry Andric   }
75581ad6265SDimitry Andric   assert(HashTok.is(tok::hash));
75681ad6265SDimitry Andric   (void)HashTok;
75781ad6265SDimitry Andric 
758*bdd1243dSDimitry Andric   std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
75981ad6265SDimitry Andric   if (!FoundId)
76081ad6265SDimitry Andric     return false;
76181ad6265SDimitry Andric 
76281ad6265SDimitry Andric   StringRef Id = *FoundId;
76381ad6265SDimitry Andric 
76481ad6265SDimitry Andric   if (Id == "pragma")
76581ad6265SDimitry Andric     return lexPragma(First, End);
76681ad6265SDimitry Andric 
76781ad6265SDimitry Andric   auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
76881ad6265SDimitry Andric                   .Case("include", pp_include)
76981ad6265SDimitry Andric                   .Case("__include_macros", pp___include_macros)
77081ad6265SDimitry Andric                   .Case("define", pp_define)
77181ad6265SDimitry Andric                   .Case("undef", pp_undef)
77281ad6265SDimitry Andric                   .Case("import", pp_import)
77381ad6265SDimitry Andric                   .Case("include_next", pp_include_next)
77481ad6265SDimitry Andric                   .Case("if", pp_if)
77581ad6265SDimitry Andric                   .Case("ifdef", pp_ifdef)
77681ad6265SDimitry Andric                   .Case("ifndef", pp_ifndef)
77781ad6265SDimitry Andric                   .Case("elif", pp_elif)
77881ad6265SDimitry Andric                   .Case("elifdef", pp_elifdef)
77981ad6265SDimitry Andric                   .Case("elifndef", pp_elifndef)
78081ad6265SDimitry Andric                   .Case("else", pp_else)
78181ad6265SDimitry Andric                   .Case("endif", pp_endif)
78281ad6265SDimitry Andric                   .Default(pp_none);
78381ad6265SDimitry Andric   if (Kind == pp_none) {
78481ad6265SDimitry Andric     skipDirective(Id, First, End);
78581ad6265SDimitry Andric     return false;
78681ad6265SDimitry Andric   }
78781ad6265SDimitry Andric 
78881ad6265SDimitry Andric   if (Kind == pp_endif)
78981ad6265SDimitry Andric     return lexEndif(First, End);
79081ad6265SDimitry Andric 
79181ad6265SDimitry Andric   switch (Kind) {
79281ad6265SDimitry Andric   case pp_include:
79381ad6265SDimitry Andric   case pp___include_macros:
79481ad6265SDimitry Andric   case pp_include_next:
79581ad6265SDimitry Andric   case pp_import:
79681ad6265SDimitry Andric     lexIncludeFilename(First, End);
79781ad6265SDimitry Andric     break;
79881ad6265SDimitry Andric   default:
79981ad6265SDimitry Andric     break;
80081ad6265SDimitry Andric   }
80181ad6265SDimitry Andric 
80281ad6265SDimitry Andric   // Everything else.
80381ad6265SDimitry Andric   return lexDefault(Kind, First, End);
80481ad6265SDimitry Andric }
80581ad6265SDimitry Andric 
/// Advance \p First past a UTF-8 byte order mark (the bytes EF BB BF) if the
/// range [First, End) begins with one; otherwise leave \p First untouched.
static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
  // Too short to hold the three-byte mark.
  if (End - First < 3)
    return;

  const bool StartsWithBOM =
      First[0] == '\xef' && First[1] == '\xbb' && First[2] == '\xbf';
  if (StartsWithBOM)
    First += 3;
}
81181ad6265SDimitry Andric 
81281ad6265SDimitry Andric bool Scanner::scanImpl(const char *First, const char *const End) {
81381ad6265SDimitry Andric   skipUTF8ByteOrderMark(First, End);
81481ad6265SDimitry Andric   while (First != End)
81581ad6265SDimitry Andric     if (lexPPLine(First, End))
81681ad6265SDimitry Andric       return true;
81781ad6265SDimitry Andric   return false;
81881ad6265SDimitry Andric }
81981ad6265SDimitry Andric 
82081ad6265SDimitry Andric bool Scanner::scan(SmallVectorImpl<Directive> &Directives) {
82181ad6265SDimitry Andric   bool Error = scanImpl(Input.begin(), Input.end());
82281ad6265SDimitry Andric 
82381ad6265SDimitry Andric   if (!Error) {
82481ad6265SDimitry Andric     // Add an EOF on success.
825*bdd1243dSDimitry Andric     if (LastTokenPtr &&
826*bdd1243dSDimitry Andric         (Tokens.empty() || LastTokenPtr > Input.begin() + Tokens.back().Offset))
827*bdd1243dSDimitry Andric       pushDirective(tokens_present_before_eof);
82881ad6265SDimitry Andric     pushDirective(pp_eof);
82981ad6265SDimitry Andric   }
83081ad6265SDimitry Andric 
83181ad6265SDimitry Andric   ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens;
83281ad6265SDimitry Andric   for (const DirectiveWithTokens &DirWithToks : DirsWithToks) {
83381ad6265SDimitry Andric     assert(RemainingTokens.size() >= DirWithToks.NumTokens);
83481ad6265SDimitry Andric     Directives.emplace_back(DirWithToks.Kind,
83581ad6265SDimitry Andric                             RemainingTokens.take_front(DirWithToks.NumTokens));
83681ad6265SDimitry Andric     RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens);
83781ad6265SDimitry Andric   }
83881ad6265SDimitry Andric   assert(RemainingTokens.empty());
83981ad6265SDimitry Andric 
84081ad6265SDimitry Andric   return Error;
84181ad6265SDimitry Andric }
84281ad6265SDimitry Andric 
84381ad6265SDimitry Andric bool clang::scanSourceForDependencyDirectives(
84481ad6265SDimitry Andric     StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
84581ad6265SDimitry Andric     SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags,
84681ad6265SDimitry Andric     SourceLocation InputSourceLoc) {
84781ad6265SDimitry Andric   return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives);
84881ad6265SDimitry Andric }
84981ad6265SDimitry Andric 
85081ad6265SDimitry Andric void clang::printDependencyDirectivesAsSource(
85181ad6265SDimitry Andric     StringRef Source,
85281ad6265SDimitry Andric     ArrayRef<dependency_directives_scan::Directive> Directives,
85381ad6265SDimitry Andric     llvm::raw_ostream &OS) {
85481ad6265SDimitry Andric   // Add a space separator where it is convenient for testing purposes.
85581ad6265SDimitry Andric   auto needsSpaceSeparator =
85681ad6265SDimitry Andric       [](tok::TokenKind Prev,
85781ad6265SDimitry Andric          const dependency_directives_scan::Token &Tok) -> bool {
85881ad6265SDimitry Andric     if (Prev == Tok.Kind)
85981ad6265SDimitry Andric       return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square,
86081ad6265SDimitry Andric                           tok::r_square);
86181ad6265SDimitry Andric     if (Prev == tok::raw_identifier &&
86281ad6265SDimitry Andric         Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal,
86381ad6265SDimitry Andric                     tok::char_constant, tok::header_name))
86481ad6265SDimitry Andric       return true;
86581ad6265SDimitry Andric     if (Prev == tok::r_paren &&
86681ad6265SDimitry Andric         Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal,
86781ad6265SDimitry Andric                     tok::char_constant, tok::unknown))
86881ad6265SDimitry Andric       return true;
86981ad6265SDimitry Andric     if (Prev == tok::comma &&
87081ad6265SDimitry Andric         Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less))
87181ad6265SDimitry Andric       return true;
87281ad6265SDimitry Andric     return false;
87381ad6265SDimitry Andric   };
87481ad6265SDimitry Andric 
87581ad6265SDimitry Andric   for (const dependency_directives_scan::Directive &Directive : Directives) {
876*bdd1243dSDimitry Andric     if (Directive.Kind == tokens_present_before_eof)
877*bdd1243dSDimitry Andric       OS << "<TokBeforeEOF>";
878*bdd1243dSDimitry Andric     std::optional<tok::TokenKind> PrevTokenKind;
87981ad6265SDimitry Andric     for (const dependency_directives_scan::Token &Tok : Directive.Tokens) {
88081ad6265SDimitry Andric       if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok))
88181ad6265SDimitry Andric         OS << ' ';
88281ad6265SDimitry Andric       PrevTokenKind = Tok.Kind;
88381ad6265SDimitry Andric       OS << Source.slice(Tok.Offset, Tok.getEnd());
88481ad6265SDimitry Andric     }
88581ad6265SDimitry Andric   }
88681ad6265SDimitry Andric }
887