xref: /freebsd-src/contrib/llvm-project/clang/lib/Lex/DependencyDirectivesScanner.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
181ad6265SDimitry Andric //===- DependencyDirectivesScanner.cpp ------------------------------------===//
281ad6265SDimitry Andric //
381ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
481ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
581ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
681ad6265SDimitry Andric //
781ad6265SDimitry Andric //===----------------------------------------------------------------------===//
881ad6265SDimitry Andric ///
981ad6265SDimitry Andric /// \file
1081ad6265SDimitry Andric /// This is the interface for scanning header and source files to get the
1181ad6265SDimitry Andric /// minimum necessary preprocessor directives for evaluating includes. It
1281ad6265SDimitry Andric /// reduces the source down to #define, #include, #import, @import, and any
1381ad6265SDimitry Andric /// conditional preprocessor logic that contains one of those.
1481ad6265SDimitry Andric ///
1581ad6265SDimitry Andric //===----------------------------------------------------------------------===//
1681ad6265SDimitry Andric 
1781ad6265SDimitry Andric #include "clang/Lex/DependencyDirectivesScanner.h"
1881ad6265SDimitry Andric #include "clang/Basic/CharInfo.h"
1981ad6265SDimitry Andric #include "clang/Basic/Diagnostic.h"
2081ad6265SDimitry Andric #include "clang/Lex/LexDiagnostic.h"
2181ad6265SDimitry Andric #include "clang/Lex/Lexer.h"
2206c3fb27SDimitry Andric #include "clang/Lex/Pragma.h"
2381ad6265SDimitry Andric #include "llvm/ADT/ScopeExit.h"
2481ad6265SDimitry Andric #include "llvm/ADT/SmallString.h"
2581ad6265SDimitry Andric #include "llvm/ADT/StringMap.h"
2681ad6265SDimitry Andric #include "llvm/ADT/StringSwitch.h"
27bdd1243dSDimitry Andric #include <optional>
2881ad6265SDimitry Andric 
2981ad6265SDimitry Andric using namespace clang;
3081ad6265SDimitry Andric using namespace clang::dependency_directives_scan;
3181ad6265SDimitry Andric using namespace llvm;
3281ad6265SDimitry Andric 
3381ad6265SDimitry Andric namespace {
3481ad6265SDimitry Andric 
3581ad6265SDimitry Andric struct DirectiveWithTokens {
3681ad6265SDimitry Andric   DirectiveKind Kind;
3781ad6265SDimitry Andric   unsigned NumTokens;
3881ad6265SDimitry Andric 
3981ad6265SDimitry Andric   DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens)
4081ad6265SDimitry Andric       : Kind(Kind), NumTokens(NumTokens) {}
4181ad6265SDimitry Andric };
4281ad6265SDimitry Andric 
4381ad6265SDimitry Andric /// Does an efficient "scan" of the sources to detect the presence of
4481ad6265SDimitry Andric /// preprocessor (or module import) directives and collects the raw lexed tokens
4581ad6265SDimitry Andric /// for those directives so that the \p Lexer can "replay" them when the file is
4681ad6265SDimitry Andric /// included.
4781ad6265SDimitry Andric ///
4881ad6265SDimitry Andric /// Note that the behavior of the raw lexer is affected by the language mode,
4981ad6265SDimitry Andric /// while at this point we want to do a scan and collect tokens once,
5081ad6265SDimitry Andric /// irrespective of the language mode that the file will get included in. To
5181ad6265SDimitry Andric /// compensate for that the \p Lexer, while "replaying", will adjust a token
5281ad6265SDimitry Andric /// where appropriate, when it could affect the preprocessor's state.
5381ad6265SDimitry Andric /// For example in a directive like
5481ad6265SDimitry Andric ///
5581ad6265SDimitry Andric /// \code
5681ad6265SDimitry Andric ///   #if __has_cpp_attribute(clang::fallthrough)
5781ad6265SDimitry Andric /// \endcode
5881ad6265SDimitry Andric ///
5981ad6265SDimitry Andric /// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2
6081ad6265SDimitry Andric /// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon'
6181ad6265SDimitry Andric /// while in C++ mode.
6281ad6265SDimitry Andric struct Scanner {
6381ad6265SDimitry Andric   Scanner(StringRef Input,
6481ad6265SDimitry Andric           SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
6581ad6265SDimitry Andric           DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)
6681ad6265SDimitry Andric       : Input(Input), Tokens(Tokens), Diags(Diags),
6781ad6265SDimitry Andric         InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()),
6881ad6265SDimitry Andric         TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
6981ad6265SDimitry Andric                  Input.end()) {}
7081ad6265SDimitry Andric 
7181ad6265SDimitry Andric   static LangOptions getLangOptsForDepScanning() {
7281ad6265SDimitry Andric     LangOptions LangOpts;
7381ad6265SDimitry Andric     // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
7481ad6265SDimitry Andric     LangOpts.ObjC = true;
7581ad6265SDimitry Andric     LangOpts.LineComment = true;
76*0fca6ea1SDimitry Andric     LangOpts.RawStringLiterals = true;
77*0fca6ea1SDimitry Andric     // FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"".
7881ad6265SDimitry Andric     return LangOpts;
7981ad6265SDimitry Andric   }
8081ad6265SDimitry Andric 
8181ad6265SDimitry Andric   /// Lex the provided source and emit the directive tokens.
8281ad6265SDimitry Andric   ///
8381ad6265SDimitry Andric   /// \returns True on error.
8481ad6265SDimitry Andric   bool scan(SmallVectorImpl<Directive> &Directives);
8581ad6265SDimitry Andric 
8681ad6265SDimitry Andric private:
8781ad6265SDimitry Andric   /// Lexes next token and advances \p First and the \p Lexer.
88bdd1243dSDimitry Andric   [[nodiscard]] dependency_directives_scan::Token &
8981ad6265SDimitry Andric   lexToken(const char *&First, const char *const End);
9081ad6265SDimitry Andric 
91*0fca6ea1SDimitry Andric   [[nodiscard]] dependency_directives_scan::Token &
92*0fca6ea1SDimitry Andric   lexIncludeFilename(const char *&First, const char *const End);
9381ad6265SDimitry Andric 
94bdd1243dSDimitry Andric   void skipLine(const char *&First, const char *const End);
95bdd1243dSDimitry Andric   void skipDirective(StringRef Name, const char *&First, const char *const End);
96bdd1243dSDimitry Andric 
9706c3fb27SDimitry Andric   /// Returns the spelling of a string literal or identifier after performing
9806c3fb27SDimitry Andric   /// any processing needed to handle \c clang::Token::NeedsCleaning.
9906c3fb27SDimitry Andric   StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);
10006c3fb27SDimitry Andric 
10181ad6265SDimitry Andric   /// Lexes next token and if it is identifier returns its string, otherwise
102bdd1243dSDimitry Andric   /// it skips the current line and returns \p std::nullopt.
10381ad6265SDimitry Andric   ///
10481ad6265SDimitry Andric   /// In any case (whatever the token kind) \p First and the \p Lexer will
10581ad6265SDimitry Andric   /// advance beyond the token.
106bdd1243dSDimitry Andric   [[nodiscard]] std::optional<StringRef>
10781ad6265SDimitry Andric   tryLexIdentifierOrSkipLine(const char *&First, const char *const End);
10881ad6265SDimitry Andric 
10981ad6265SDimitry Andric   /// Used when it is certain that next token is an identifier.
110bdd1243dSDimitry Andric   [[nodiscard]] StringRef lexIdentifier(const char *&First,
11181ad6265SDimitry Andric                                         const char *const End);
11281ad6265SDimitry Andric 
11381ad6265SDimitry Andric   /// Lexes next token and returns true iff it is an identifier that matches \p
11481ad6265SDimitry Andric   /// Id, otherwise it skips the current line and returns false.
11581ad6265SDimitry Andric   ///
11681ad6265SDimitry Andric   /// In any case (whatever the token kind) \p First and the \p Lexer will
11781ad6265SDimitry Andric   /// advance beyond the token.
118bdd1243dSDimitry Andric   [[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id,
11981ad6265SDimitry Andric                                                 const char *&First,
12081ad6265SDimitry Andric                                                 const char *const End);
12181ad6265SDimitry Andric 
12206c3fb27SDimitry Andric   /// Lexes next token and returns true iff it matches the kind \p K.
12306c3fb27SDimitry Andric   /// Otherwise it skips the current line and returns false.
12406c3fb27SDimitry Andric   ///
12506c3fb27SDimitry Andric   /// In any case (whatever the token kind) \p First and the \p Lexer will
12606c3fb27SDimitry Andric   /// advance beyond the token.
12706c3fb27SDimitry Andric   [[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
12806c3fb27SDimitry Andric                                            const char *const End);
12906c3fb27SDimitry Andric 
13006c3fb27SDimitry Andric   /// Lexes next token and if it is string literal, returns its string.
13106c3fb27SDimitry Andric   /// Otherwise, it skips the current line and returns \p std::nullopt.
13206c3fb27SDimitry Andric   ///
13306c3fb27SDimitry Andric   /// In any case (whatever the token kind) \p First and the \p Lexer will
13406c3fb27SDimitry Andric   /// advance beyond the token.
13506c3fb27SDimitry Andric   [[nodiscard]] std::optional<StringRef>
13606c3fb27SDimitry Andric   tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);
13706c3fb27SDimitry Andric 
138bdd1243dSDimitry Andric   [[nodiscard]] bool scanImpl(const char *First, const char *const End);
139bdd1243dSDimitry Andric   [[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
140bdd1243dSDimitry Andric   [[nodiscard]] bool lexAt(const char *&First, const char *const End);
141bdd1243dSDimitry Andric   [[nodiscard]] bool lexModule(const char *&First, const char *const End);
142bdd1243dSDimitry Andric   [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
14381ad6265SDimitry Andric                                const char *const End);
144bdd1243dSDimitry Andric   [[nodiscard]] bool lexPragma(const char *&First, const char *const End);
14506c3fb27SDimitry Andric   [[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
146bdd1243dSDimitry Andric   [[nodiscard]] bool lexEndif(const char *&First, const char *const End);
147bdd1243dSDimitry Andric   [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
14881ad6265SDimitry Andric                                 const char *const End);
149bdd1243dSDimitry Andric   [[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind,
15081ad6265SDimitry Andric                                             const char *&First,
15181ad6265SDimitry Andric                                             const char *const End);
15281ad6265SDimitry Andric   void lexPPDirectiveBody(const char *&First, const char *const End);
15381ad6265SDimitry Andric 
15481ad6265SDimitry Andric   DirectiveWithTokens &pushDirective(DirectiveKind Kind) {
15581ad6265SDimitry Andric     Tokens.append(CurDirToks);
15681ad6265SDimitry Andric     DirsWithToks.emplace_back(Kind, CurDirToks.size());
15781ad6265SDimitry Andric     CurDirToks.clear();
15881ad6265SDimitry Andric     return DirsWithToks.back();
15981ad6265SDimitry Andric   }
16081ad6265SDimitry Andric   void popDirective() {
16181ad6265SDimitry Andric     Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);
16281ad6265SDimitry Andric   }
16381ad6265SDimitry Andric   DirectiveKind topDirective() const {
16481ad6265SDimitry Andric     return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind;
16581ad6265SDimitry Andric   }
16681ad6265SDimitry Andric 
16781ad6265SDimitry Andric   unsigned getOffsetAt(const char *CurPtr) const {
16881ad6265SDimitry Andric     return CurPtr - Input.data();
16981ad6265SDimitry Andric   }
17081ad6265SDimitry Andric 
17181ad6265SDimitry Andric   /// Reports a diagnostic if the diagnostic engine is provided. Always returns
17281ad6265SDimitry Andric   /// true at the end.
17381ad6265SDimitry Andric   bool reportError(const char *CurPtr, unsigned Err);
17481ad6265SDimitry Andric 
17581ad6265SDimitry Andric   StringMap<char> SplitIds;
17681ad6265SDimitry Andric   StringRef Input;
17781ad6265SDimitry Andric   SmallVectorImpl<dependency_directives_scan::Token> &Tokens;
17881ad6265SDimitry Andric   DiagnosticsEngine *Diags;
17981ad6265SDimitry Andric   SourceLocation InputSourceLoc;
18081ad6265SDimitry Andric 
181bdd1243dSDimitry Andric   const char *LastTokenPtr = nullptr;
18281ad6265SDimitry Andric   /// Keeps track of the tokens for the currently lexed directive. Once a
18381ad6265SDimitry Andric   /// directive is fully lexed and "committed" then the tokens get appended to
18481ad6265SDimitry Andric   /// \p Tokens and \p CurDirToks is cleared for the next directive.
18581ad6265SDimitry Andric   SmallVector<dependency_directives_scan::Token, 32> CurDirToks;
18681ad6265SDimitry Andric   /// The directives that were lexed along with the number of tokens that each
18781ad6265SDimitry Andric   /// directive contains. The tokens of all the directives are kept in \p Tokens
18881ad6265SDimitry Andric   /// vector, in the same order as the directives order in \p DirsWithToks.
18981ad6265SDimitry Andric   SmallVector<DirectiveWithTokens, 64> DirsWithToks;
19081ad6265SDimitry Andric   LangOptions LangOpts;
19181ad6265SDimitry Andric   Lexer TheLexer;
19281ad6265SDimitry Andric };
19381ad6265SDimitry Andric 
19481ad6265SDimitry Andric } // end anonymous namespace
19581ad6265SDimitry Andric 
19681ad6265SDimitry Andric bool Scanner::reportError(const char *CurPtr, unsigned Err) {
19781ad6265SDimitry Andric   if (!Diags)
19881ad6265SDimitry Andric     return true;
19981ad6265SDimitry Andric   assert(CurPtr >= Input.data() && "invalid buffer ptr");
20081ad6265SDimitry Andric   Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err);
20181ad6265SDimitry Andric   return true;
20281ad6265SDimitry Andric }
20381ad6265SDimitry Andric 
20481ad6265SDimitry Andric static void skipOverSpaces(const char *&First, const char *const End) {
20581ad6265SDimitry Andric   while (First != End && isHorizontalWhitespace(*First))
20681ad6265SDimitry Andric     ++First;
20781ad6265SDimitry Andric }
20881ad6265SDimitry Andric 
209bdd1243dSDimitry Andric [[nodiscard]] static bool isRawStringLiteral(const char *First,
21081ad6265SDimitry Andric                                              const char *Current) {
21181ad6265SDimitry Andric   assert(First <= Current);
21281ad6265SDimitry Andric 
21381ad6265SDimitry Andric   // Check if we can even back up.
21481ad6265SDimitry Andric   if (*Current != '"' || First == Current)
21581ad6265SDimitry Andric     return false;
21681ad6265SDimitry Andric 
21781ad6265SDimitry Andric   // Check for an "R".
21881ad6265SDimitry Andric   --Current;
21981ad6265SDimitry Andric   if (*Current != 'R')
22081ad6265SDimitry Andric     return false;
22181ad6265SDimitry Andric   if (First == Current || !isAsciiIdentifierContinue(*--Current))
22281ad6265SDimitry Andric     return true;
22381ad6265SDimitry Andric 
22481ad6265SDimitry Andric   // Check for a prefix of "u", "U", or "L".
22581ad6265SDimitry Andric   if (*Current == 'u' || *Current == 'U' || *Current == 'L')
22681ad6265SDimitry Andric     return First == Current || !isAsciiIdentifierContinue(*--Current);
22781ad6265SDimitry Andric 
22881ad6265SDimitry Andric   // Check for a prefix of "u8".
22981ad6265SDimitry Andric   if (*Current != '8' || First == Current || *Current-- != 'u')
23081ad6265SDimitry Andric     return false;
23181ad6265SDimitry Andric   return First == Current || !isAsciiIdentifierContinue(*--Current);
23281ad6265SDimitry Andric }
23381ad6265SDimitry Andric 
23481ad6265SDimitry Andric static void skipRawString(const char *&First, const char *const End) {
23581ad6265SDimitry Andric   assert(First[0] == '"');
23681ad6265SDimitry Andric   assert(First[-1] == 'R');
23781ad6265SDimitry Andric 
23881ad6265SDimitry Andric   const char *Last = ++First;
23981ad6265SDimitry Andric   while (Last != End && *Last != '(')
24081ad6265SDimitry Andric     ++Last;
24181ad6265SDimitry Andric   if (Last == End) {
24281ad6265SDimitry Andric     First = Last; // Hit the end... just give up.
24381ad6265SDimitry Andric     return;
24481ad6265SDimitry Andric   }
24581ad6265SDimitry Andric 
24681ad6265SDimitry Andric   StringRef Terminator(First, Last - First);
24781ad6265SDimitry Andric   for (;;) {
24881ad6265SDimitry Andric     // Move First to just past the next ")".
24981ad6265SDimitry Andric     First = Last;
25081ad6265SDimitry Andric     while (First != End && *First != ')')
25181ad6265SDimitry Andric       ++First;
25281ad6265SDimitry Andric     if (First == End)
25381ad6265SDimitry Andric       return;
25481ad6265SDimitry Andric     ++First;
25581ad6265SDimitry Andric 
25681ad6265SDimitry Andric     // Look ahead for the terminator sequence.
25781ad6265SDimitry Andric     Last = First;
25881ad6265SDimitry Andric     while (Last != End && size_t(Last - First) < Terminator.size() &&
25981ad6265SDimitry Andric            Terminator[Last - First] == *Last)
26081ad6265SDimitry Andric       ++Last;
26181ad6265SDimitry Andric 
26281ad6265SDimitry Andric     // Check if we hit it (or the end of the file).
26381ad6265SDimitry Andric     if (Last == End) {
26481ad6265SDimitry Andric       First = Last;
26581ad6265SDimitry Andric       return;
26681ad6265SDimitry Andric     }
26781ad6265SDimitry Andric     if (size_t(Last - First) < Terminator.size())
26881ad6265SDimitry Andric       continue;
26981ad6265SDimitry Andric     if (*Last != '"')
27081ad6265SDimitry Andric       continue;
27181ad6265SDimitry Andric     First = Last + 1;
27281ad6265SDimitry Andric     return;
27381ad6265SDimitry Andric   }
27481ad6265SDimitry Andric }
27581ad6265SDimitry Andric 
27681ad6265SDimitry Andric // Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n)
27781ad6265SDimitry Andric static unsigned isEOL(const char *First, const char *const End) {
27881ad6265SDimitry Andric   if (First == End)
27981ad6265SDimitry Andric     return 0;
28081ad6265SDimitry Andric   if (End - First > 1 && isVerticalWhitespace(First[0]) &&
28181ad6265SDimitry Andric       isVerticalWhitespace(First[1]) && First[0] != First[1])
28281ad6265SDimitry Andric     return 2;
28381ad6265SDimitry Andric   return !!isVerticalWhitespace(First[0]);
28481ad6265SDimitry Andric }
28581ad6265SDimitry Andric 
28681ad6265SDimitry Andric static void skipString(const char *&First, const char *const End) {
28781ad6265SDimitry Andric   assert(*First == '\'' || *First == '"' || *First == '<');
28881ad6265SDimitry Andric   const char Terminator = *First == '<' ? '>' : *First;
28981ad6265SDimitry Andric   for (++First; First != End && *First != Terminator; ++First) {
29081ad6265SDimitry Andric     // String and character literals don't extend past the end of the line.
29181ad6265SDimitry Andric     if (isVerticalWhitespace(*First))
29281ad6265SDimitry Andric       return;
29381ad6265SDimitry Andric     if (*First != '\\')
29481ad6265SDimitry Andric       continue;
29581ad6265SDimitry Andric     // Skip past backslash to the next character. This ensures that the
29681ad6265SDimitry Andric     // character right after it is skipped as well, which matters if it's
29781ad6265SDimitry Andric     // the terminator.
29881ad6265SDimitry Andric     if (++First == End)
29981ad6265SDimitry Andric       return;
30081ad6265SDimitry Andric     if (!isWhitespace(*First))
30181ad6265SDimitry Andric       continue;
30281ad6265SDimitry Andric     // Whitespace after the backslash might indicate a line continuation.
30381ad6265SDimitry Andric     const char *FirstAfterBackslashPastSpace = First;
30481ad6265SDimitry Andric     skipOverSpaces(FirstAfterBackslashPastSpace, End);
30581ad6265SDimitry Andric     if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) {
30681ad6265SDimitry Andric       // Advance the character pointer to the next line for the next
30781ad6265SDimitry Andric       // iteration.
30881ad6265SDimitry Andric       First = FirstAfterBackslashPastSpace + NLSize - 1;
30981ad6265SDimitry Andric     }
31081ad6265SDimitry Andric   }
31181ad6265SDimitry Andric   if (First != End)
31281ad6265SDimitry Andric     ++First; // Finish off the string.
31381ad6265SDimitry Andric }
31481ad6265SDimitry Andric 
31581ad6265SDimitry Andric // Returns the length of the skipped newline
31681ad6265SDimitry Andric static unsigned skipNewline(const char *&First, const char *End) {
31781ad6265SDimitry Andric   if (First == End)
31881ad6265SDimitry Andric     return 0;
31981ad6265SDimitry Andric   assert(isVerticalWhitespace(*First));
32081ad6265SDimitry Andric   unsigned Len = isEOL(First, End);
32181ad6265SDimitry Andric   assert(Len && "expected newline");
32281ad6265SDimitry Andric   First += Len;
32381ad6265SDimitry Andric   return Len;
32481ad6265SDimitry Andric }
32581ad6265SDimitry Andric 
/// Given \p First positioned just past an end-of-line sequence of \p EOLLen
/// characters, tells whether the character right before that sequence is a
/// backslash.
static bool wasLineContinuation(const char *First, unsigned EOLLen) {
  const char *const BeforeEOL = First - static_cast<int>(EOLLen) - 1;
  return *BeforeEOL == '\\';
}
32981ad6265SDimitry Andric 
33081ad6265SDimitry Andric static void skipToNewlineRaw(const char *&First, const char *const End) {
33181ad6265SDimitry Andric   for (;;) {
33281ad6265SDimitry Andric     if (First == End)
33381ad6265SDimitry Andric       return;
33481ad6265SDimitry Andric 
33581ad6265SDimitry Andric     unsigned Len = isEOL(First, End);
33681ad6265SDimitry Andric     if (Len)
33781ad6265SDimitry Andric       return;
33881ad6265SDimitry Andric 
33981ad6265SDimitry Andric     do {
34081ad6265SDimitry Andric       if (++First == End)
34181ad6265SDimitry Andric         return;
34281ad6265SDimitry Andric       Len = isEOL(First, End);
34381ad6265SDimitry Andric     } while (!Len);
34481ad6265SDimitry Andric 
34581ad6265SDimitry Andric     if (First[-1] != '\\')
34681ad6265SDimitry Andric       return;
34781ad6265SDimitry Andric 
34881ad6265SDimitry Andric     First += Len;
34981ad6265SDimitry Andric     // Keep skipping lines...
35081ad6265SDimitry Andric   }
35181ad6265SDimitry Andric }
35281ad6265SDimitry Andric 
35381ad6265SDimitry Andric static void skipLineComment(const char *&First, const char *const End) {
35481ad6265SDimitry Andric   assert(First[0] == '/' && First[1] == '/');
35581ad6265SDimitry Andric   First += 2;
35681ad6265SDimitry Andric   skipToNewlineRaw(First, End);
35781ad6265SDimitry Andric }
35881ad6265SDimitry Andric 
/// Advances \p First past the block comment it points at. When no closing
/// "*/" is found, \p First is left at \p End.
static void skipBlockComment(const char *&First, const char *const End) {
  assert(First[0] == '/' && First[1] == '*');
  // The shortest complete comment, "/**/", takes four characters.
  if (End - First >= 4) {
    // Start at index 3 so that the '*' of the opening "/*" may double as the
    // '*' of the closing "*/" (as in "/**/"), while "/*/" is not mistaken
    // for a complete comment.
    for (const char *Cursor = First + 3; Cursor != End; ++Cursor) {
      if (Cursor[0] == '/' && Cursor[-1] == '*') {
        First = Cursor + 1;
        return;
      }
    }
  }
  First = End;
}
37181ad6265SDimitry Andric 
37281ad6265SDimitry Andric /// \returns True if the current single quotation mark character is a C++14
37381ad6265SDimitry Andric /// digit separator.
37481ad6265SDimitry Andric static bool isQuoteCppDigitSeparator(const char *const Start,
37581ad6265SDimitry Andric                                      const char *const Cur,
37681ad6265SDimitry Andric                                      const char *const End) {
37781ad6265SDimitry Andric   assert(*Cur == '\'' && "expected quotation character");
37881ad6265SDimitry Andric   // skipLine called in places where we don't expect a valid number
37981ad6265SDimitry Andric   // body before `start` on the same line, so always return false at the start.
38081ad6265SDimitry Andric   if (Start == Cur)
38181ad6265SDimitry Andric     return false;
38281ad6265SDimitry Andric   // The previous character must be a valid PP number character.
38381ad6265SDimitry Andric   // Make sure that the L, u, U, u8 prefixes don't get marked as a
38481ad6265SDimitry Andric   // separator though.
38581ad6265SDimitry Andric   char Prev = *(Cur - 1);
38681ad6265SDimitry Andric   if (Prev == 'L' || Prev == 'U' || Prev == 'u')
38781ad6265SDimitry Andric     return false;
38881ad6265SDimitry Andric   if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u')
38981ad6265SDimitry Andric     return false;
39081ad6265SDimitry Andric   if (!isPreprocessingNumberBody(Prev))
39181ad6265SDimitry Andric     return false;
39281ad6265SDimitry Andric   // The next character should be a valid identifier body character.
39381ad6265SDimitry Andric   return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));
39481ad6265SDimitry Andric }
39581ad6265SDimitry Andric 
396bdd1243dSDimitry Andric void Scanner::skipLine(const char *&First, const char *const End) {
39781ad6265SDimitry Andric   for (;;) {
39881ad6265SDimitry Andric     assert(First <= End);
39981ad6265SDimitry Andric     if (First == End)
40081ad6265SDimitry Andric       return;
40181ad6265SDimitry Andric 
40281ad6265SDimitry Andric     if (isVerticalWhitespace(*First)) {
40381ad6265SDimitry Andric       skipNewline(First, End);
40481ad6265SDimitry Andric       return;
40581ad6265SDimitry Andric     }
40681ad6265SDimitry Andric     const char *Start = First;
40781ad6265SDimitry Andric     while (First != End && !isVerticalWhitespace(*First)) {
40881ad6265SDimitry Andric       // Iterate over strings correctly to avoid comments and newlines.
40981ad6265SDimitry Andric       if (*First == '"' ||
41081ad6265SDimitry Andric           (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) {
411bdd1243dSDimitry Andric         LastTokenPtr = First;
41281ad6265SDimitry Andric         if (isRawStringLiteral(Start, First))
41381ad6265SDimitry Andric           skipRawString(First, End);
41481ad6265SDimitry Andric         else
41581ad6265SDimitry Andric           skipString(First, End);
41681ad6265SDimitry Andric         continue;
41781ad6265SDimitry Andric       }
41881ad6265SDimitry Andric 
41981ad6265SDimitry Andric       // Iterate over comments correctly.
42081ad6265SDimitry Andric       if (*First != '/' || End - First < 2) {
421bdd1243dSDimitry Andric         LastTokenPtr = First;
42281ad6265SDimitry Andric         ++First;
42381ad6265SDimitry Andric         continue;
42481ad6265SDimitry Andric       }
42581ad6265SDimitry Andric 
42681ad6265SDimitry Andric       if (First[1] == '/') {
42781ad6265SDimitry Andric         // "//...".
42881ad6265SDimitry Andric         skipLineComment(First, End);
42981ad6265SDimitry Andric         continue;
43081ad6265SDimitry Andric       }
43181ad6265SDimitry Andric 
43281ad6265SDimitry Andric       if (First[1] != '*') {
433bdd1243dSDimitry Andric         LastTokenPtr = First;
43481ad6265SDimitry Andric         ++First;
43581ad6265SDimitry Andric         continue;
43681ad6265SDimitry Andric       }
43781ad6265SDimitry Andric 
43881ad6265SDimitry Andric       // "/*...*/".
43981ad6265SDimitry Andric       skipBlockComment(First, End);
44081ad6265SDimitry Andric     }
44181ad6265SDimitry Andric     if (First == End)
44281ad6265SDimitry Andric       return;
44381ad6265SDimitry Andric 
44481ad6265SDimitry Andric     // Skip over the newline.
44581ad6265SDimitry Andric     unsigned Len = skipNewline(First, End);
44681ad6265SDimitry Andric     if (!wasLineContinuation(First, Len)) // Continue past line-continuations.
44781ad6265SDimitry Andric       break;
44881ad6265SDimitry Andric   }
44981ad6265SDimitry Andric }
45081ad6265SDimitry Andric 
451bdd1243dSDimitry Andric void Scanner::skipDirective(StringRef Name, const char *&First,
45281ad6265SDimitry Andric                             const char *const End) {
45381ad6265SDimitry Andric   if (llvm::StringSwitch<bool>(Name)
45481ad6265SDimitry Andric           .Case("warning", true)
45581ad6265SDimitry Andric           .Case("error", true)
45681ad6265SDimitry Andric           .Default(false))
45781ad6265SDimitry Andric     // Do not process quotes or comments.
45881ad6265SDimitry Andric     skipToNewlineRaw(First, End);
45981ad6265SDimitry Andric   else
46081ad6265SDimitry Andric     skipLine(First, End);
46181ad6265SDimitry Andric }
46281ad6265SDimitry Andric 
46381ad6265SDimitry Andric static void skipWhitespace(const char *&First, const char *const End) {
46481ad6265SDimitry Andric   for (;;) {
46581ad6265SDimitry Andric     assert(First <= End);
46681ad6265SDimitry Andric     skipOverSpaces(First, End);
46781ad6265SDimitry Andric 
46881ad6265SDimitry Andric     if (End - First < 2)
46981ad6265SDimitry Andric       return;
47081ad6265SDimitry Andric 
47181ad6265SDimitry Andric     if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
47281ad6265SDimitry Andric       skipNewline(++First, End);
47381ad6265SDimitry Andric       continue;
47481ad6265SDimitry Andric     }
47581ad6265SDimitry Andric 
47681ad6265SDimitry Andric     // Check for a non-comment character.
47781ad6265SDimitry Andric     if (First[0] != '/')
47881ad6265SDimitry Andric       return;
47981ad6265SDimitry Andric 
48081ad6265SDimitry Andric     // "// ...".
48181ad6265SDimitry Andric     if (First[1] == '/') {
48281ad6265SDimitry Andric       skipLineComment(First, End);
48381ad6265SDimitry Andric       return;
48481ad6265SDimitry Andric     }
48581ad6265SDimitry Andric 
48681ad6265SDimitry Andric     // Cannot be a comment.
48781ad6265SDimitry Andric     if (First[1] != '*')
48881ad6265SDimitry Andric       return;
48981ad6265SDimitry Andric 
49081ad6265SDimitry Andric     // "/*...*/".
49181ad6265SDimitry Andric     skipBlockComment(First, End);
49281ad6265SDimitry Andric   }
49381ad6265SDimitry Andric }
49481ad6265SDimitry Andric 
49581ad6265SDimitry Andric bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,
49681ad6265SDimitry Andric                                      const char *const End) {
49781ad6265SDimitry Andric   const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;
49881ad6265SDimitry Andric   for (;;) {
49981ad6265SDimitry Andric     const dependency_directives_scan::Token &Tok = lexToken(First, End);
50081ad6265SDimitry Andric     if (Tok.is(tok::eof))
50181ad6265SDimitry Andric       return reportError(
50281ad6265SDimitry Andric           DirectiveLoc,
50381ad6265SDimitry Andric           diag::err_dep_source_scanner_missing_semi_after_at_import);
50481ad6265SDimitry Andric     if (Tok.is(tok::semi))
50581ad6265SDimitry Andric       break;
50681ad6265SDimitry Andric   }
50781ad6265SDimitry Andric   pushDirective(Kind);
50881ad6265SDimitry Andric   skipWhitespace(First, End);
50981ad6265SDimitry Andric   if (First == End)
51081ad6265SDimitry Andric     return false;
51181ad6265SDimitry Andric   if (!isVerticalWhitespace(*First))
51281ad6265SDimitry Andric     return reportError(
51381ad6265SDimitry Andric         DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import);
51481ad6265SDimitry Andric   skipNewline(First, End);
51581ad6265SDimitry Andric   return false;
51681ad6265SDimitry Andric }
51781ad6265SDimitry Andric 
51881ad6265SDimitry Andric dependency_directives_scan::Token &Scanner::lexToken(const char *&First,
51981ad6265SDimitry Andric                                                      const char *const End) {
52081ad6265SDimitry Andric   clang::Token Tok;
52181ad6265SDimitry Andric   TheLexer.LexFromRawLexer(Tok);
52281ad6265SDimitry Andric   First = Input.data() + TheLexer.getCurrentBufferOffset();
52381ad6265SDimitry Andric   assert(First <= End);
52481ad6265SDimitry Andric 
52581ad6265SDimitry Andric   unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
52681ad6265SDimitry Andric   CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
52781ad6265SDimitry Andric                           Tok.getFlags());
52881ad6265SDimitry Andric   return CurDirToks.back();
52981ad6265SDimitry Andric }
53081ad6265SDimitry Andric 
53181ad6265SDimitry Andric dependency_directives_scan::Token &
53281ad6265SDimitry Andric Scanner::lexIncludeFilename(const char *&First, const char *const End) {
53381ad6265SDimitry Andric   clang::Token Tok;
53481ad6265SDimitry Andric   TheLexer.LexIncludeFilename(Tok);
53581ad6265SDimitry Andric   First = Input.data() + TheLexer.getCurrentBufferOffset();
53681ad6265SDimitry Andric   assert(First <= End);
53781ad6265SDimitry Andric 
53881ad6265SDimitry Andric   unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
53981ad6265SDimitry Andric   CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
54081ad6265SDimitry Andric                           Tok.getFlags());
54181ad6265SDimitry Andric   return CurDirToks.back();
54281ad6265SDimitry Andric }
54381ad6265SDimitry Andric 
54481ad6265SDimitry Andric void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
54581ad6265SDimitry Andric   while (true) {
54681ad6265SDimitry Andric     const dependency_directives_scan::Token &Tok = lexToken(First, End);
547*0fca6ea1SDimitry Andric     if (Tok.is(tok::eod) || Tok.is(tok::eof))
54881ad6265SDimitry Andric       break;
54981ad6265SDimitry Andric   }
55081ad6265SDimitry Andric }
55181ad6265SDimitry Andric 
55206c3fb27SDimitry Andric StringRef
55306c3fb27SDimitry Andric Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
55481ad6265SDimitry Andric   bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
55581ad6265SDimitry Andric   if (LLVM_LIKELY(!NeedsCleaning))
55681ad6265SDimitry Andric     return Input.slice(Tok.Offset, Tok.getEnd());
55781ad6265SDimitry Andric 
55881ad6265SDimitry Andric   SmallString<64> Spelling;
55981ad6265SDimitry Andric   Spelling.resize(Tok.Length);
56081ad6265SDimitry Andric 
56106c3fb27SDimitry Andric   // FIXME: C++11 raw string literals need special handling (see getSpellingSlow
56206c3fb27SDimitry Andric   // in the Lexer). Currently we cannot see them due to our LangOpts.
56306c3fb27SDimitry Andric 
56481ad6265SDimitry Andric   unsigned SpellingLength = 0;
56581ad6265SDimitry Andric   const char *BufPtr = Input.begin() + Tok.Offset;
56681ad6265SDimitry Andric   const char *AfterIdent = Input.begin() + Tok.getEnd();
56781ad6265SDimitry Andric   while (BufPtr < AfterIdent) {
5685f757f3fSDimitry Andric     auto [Char, Size] = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
5695f757f3fSDimitry Andric     Spelling[SpellingLength++] = Char;
57081ad6265SDimitry Andric     BufPtr += Size;
57181ad6265SDimitry Andric   }
57281ad6265SDimitry Andric 
57381ad6265SDimitry Andric   return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
57481ad6265SDimitry Andric       .first->first();
57581ad6265SDimitry Andric }
57681ad6265SDimitry Andric 
57706c3fb27SDimitry Andric std::optional<StringRef>
57806c3fb27SDimitry Andric Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
57906c3fb27SDimitry Andric   const dependency_directives_scan::Token &Tok = lexToken(First, End);
58006c3fb27SDimitry Andric   if (Tok.isNot(tok::raw_identifier)) {
58106c3fb27SDimitry Andric     if (!Tok.is(tok::eod))
58206c3fb27SDimitry Andric       skipLine(First, End);
58306c3fb27SDimitry Andric     return std::nullopt;
58406c3fb27SDimitry Andric   }
58506c3fb27SDimitry Andric 
58606c3fb27SDimitry Andric   return cleanStringIfNeeded(Tok);
58706c3fb27SDimitry Andric }
58806c3fb27SDimitry Andric 
58981ad6265SDimitry Andric StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
590bdd1243dSDimitry Andric   std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
59181ad6265SDimitry Andric   assert(Id && "expected identifier token");
592bdd1243dSDimitry Andric   return *Id;
59381ad6265SDimitry Andric }
59481ad6265SDimitry Andric 
59581ad6265SDimitry Andric bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
59681ad6265SDimitry Andric                                          const char *const End) {
597bdd1243dSDimitry Andric   if (std::optional<StringRef> FoundId =
598bdd1243dSDimitry Andric           tryLexIdentifierOrSkipLine(First, End)) {
59981ad6265SDimitry Andric     if (*FoundId == Id)
60081ad6265SDimitry Andric       return true;
60181ad6265SDimitry Andric     skipLine(First, End);
60281ad6265SDimitry Andric   }
60381ad6265SDimitry Andric   return false;
60481ad6265SDimitry Andric }
60581ad6265SDimitry Andric 
60606c3fb27SDimitry Andric bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
60706c3fb27SDimitry Andric                                     const char *const End) {
60806c3fb27SDimitry Andric   const dependency_directives_scan::Token &Tok = lexToken(First, End);
60906c3fb27SDimitry Andric   if (Tok.is(K))
61006c3fb27SDimitry Andric     return true;
61106c3fb27SDimitry Andric   skipLine(First, End);
61206c3fb27SDimitry Andric   return false;
61306c3fb27SDimitry Andric }
61406c3fb27SDimitry Andric 
61506c3fb27SDimitry Andric std::optional<StringRef>
61606c3fb27SDimitry Andric Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
61706c3fb27SDimitry Andric                                        const char *const End) {
61806c3fb27SDimitry Andric   const dependency_directives_scan::Token &Tok = lexToken(First, End);
61906c3fb27SDimitry Andric   if (!tok::isStringLiteral(Tok.Kind)) {
62006c3fb27SDimitry Andric     if (!Tok.is(tok::eod))
62106c3fb27SDimitry Andric       skipLine(First, End);
62206c3fb27SDimitry Andric     return std::nullopt;
62306c3fb27SDimitry Andric   }
62406c3fb27SDimitry Andric 
62506c3fb27SDimitry Andric   return cleanStringIfNeeded(Tok);
62606c3fb27SDimitry Andric }
62706c3fb27SDimitry Andric 
62881ad6265SDimitry Andric bool Scanner::lexAt(const char *&First, const char *const End) {
62981ad6265SDimitry Andric   // Handle "@import".
63081ad6265SDimitry Andric 
63181ad6265SDimitry Andric   // Lex '@'.
63281ad6265SDimitry Andric   const dependency_directives_scan::Token &AtTok = lexToken(First, End);
63381ad6265SDimitry Andric   assert(AtTok.is(tok::at));
63481ad6265SDimitry Andric   (void)AtTok;
63581ad6265SDimitry Andric 
63681ad6265SDimitry Andric   if (!isNextIdentifierOrSkipLine("import", First, End))
63781ad6265SDimitry Andric     return false;
63881ad6265SDimitry Andric   return lexModuleDirectiveBody(decl_at_import, First, End);
63981ad6265SDimitry Andric }
64081ad6265SDimitry Andric 
/// Handles a line whose first token is an identifier that may begin a
/// `module` or `import` declaration, optionally preceded by `export`.
///
/// Lines that turn out not to be such a declaration are skipped and false is
/// returned. Otherwise the directive kind is chosen from the keyword and the
/// presence of `export`, and the result of lexModuleDirectiveBody() is
/// returned.
bool Scanner::lexModule(const char *&First, const char *const End) {
  StringRef Id = lexIdentifier(First, End);
  bool Export = false;
  if (Id == "export") {
    Export = true;
    // The keyword of interest, if any, follows `export`.
    std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End);
    if (!NextId)
      return false;
    Id = *NextId;
  }

  if (Id != "module" && Id != "import") {
    skipLine(First, End);
    return false;
  }

  skipWhitespace(First, End);

  // Ignore this as a module directive if the next character can't be part of
  // an import.

  switch (*First) {
  case ':': {
    // `module :` is never the start of a valid module declaration.
    if (Id == "module") {
      skipLine(First, End);
      return false;
    }
    // `import:(type)name` is a valid ObjC method decl, so check one more token.
    // The first lexToken() call lexes past the ':'; the token after it must be
    // an identifier for the line to be kept.
    (void)lexToken(First, End);
    if (!tryLexIdentifierOrSkipLine(First, End))
      return false;
    break;
  }
  case '<':
  case '"':
    break;
  default:
    if (!isAsciiIdentifierContinue(*First)) {
      skipLine(First, End);
      return false;
    }
  }

  // Reposition the lexer at First before lexing the directive body.
  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false);

  // Map keyword + `export` to the directive kind.
  DirectiveKind Kind;
  if (Id == "module")
    Kind = Export ? cxx_export_module_decl : cxx_module_decl;
  else
    Kind = Export ? cxx_export_import_decl : cxx_import_decl;

  return lexModuleDirectiveBody(Kind, First, End);
}
69581ad6265SDimitry Andric 
69606c3fb27SDimitry Andric bool Scanner::lex_Pragma(const char *&First, const char *const End) {
69706c3fb27SDimitry Andric   if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
69806c3fb27SDimitry Andric     return false;
69906c3fb27SDimitry Andric 
70006c3fb27SDimitry Andric   std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);
70106c3fb27SDimitry Andric 
70206c3fb27SDimitry Andric   if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))
70306c3fb27SDimitry Andric     return false;
70406c3fb27SDimitry Andric 
70506c3fb27SDimitry Andric   SmallString<64> Buffer(*Str);
70606c3fb27SDimitry Andric   prepare_PragmaString(Buffer);
70706c3fb27SDimitry Andric 
70806c3fb27SDimitry Andric   // Use a new scanner instance since the tokens will be inside the allocated
70906c3fb27SDimitry Andric   // string. We should already have captured all the relevant tokens in the
71006c3fb27SDimitry Andric   // current scanner.
71106c3fb27SDimitry Andric   SmallVector<dependency_directives_scan::Token> DiscardTokens;
71206c3fb27SDimitry Andric   const char *Begin = Buffer.c_str();
71306c3fb27SDimitry Andric   Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
71406c3fb27SDimitry Andric                         InputSourceLoc};
71506c3fb27SDimitry Andric 
71606c3fb27SDimitry Andric   PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
71706c3fb27SDimitry Andric   if (PragmaScanner.lexPragma(Begin, Buffer.end()))
71806c3fb27SDimitry Andric     return true;
71906c3fb27SDimitry Andric 
72006c3fb27SDimitry Andric   DirectiveKind K = PragmaScanner.topDirective();
72106c3fb27SDimitry Andric   if (K == pp_none) {
72206c3fb27SDimitry Andric     skipLine(First, End);
72306c3fb27SDimitry Andric     return false;
72406c3fb27SDimitry Andric   }
72506c3fb27SDimitry Andric 
72606c3fb27SDimitry Andric   assert(Begin == Buffer.end());
72706c3fb27SDimitry Andric   pushDirective(K);
72806c3fb27SDimitry Andric   return false;
72906c3fb27SDimitry Andric }
73006c3fb27SDimitry Andric 
73181ad6265SDimitry Andric bool Scanner::lexPragma(const char *&First, const char *const End) {
732bdd1243dSDimitry Andric   std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
73381ad6265SDimitry Andric   if (!FoundId)
73481ad6265SDimitry Andric     return false;
73581ad6265SDimitry Andric 
73681ad6265SDimitry Andric   StringRef Id = *FoundId;
73781ad6265SDimitry Andric   auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
73881ad6265SDimitry Andric                   .Case("once", pp_pragma_once)
73981ad6265SDimitry Andric                   .Case("push_macro", pp_pragma_push_macro)
74081ad6265SDimitry Andric                   .Case("pop_macro", pp_pragma_pop_macro)
74181ad6265SDimitry Andric                   .Case("include_alias", pp_pragma_include_alias)
74281ad6265SDimitry Andric                   .Default(pp_none);
74381ad6265SDimitry Andric   if (Kind != pp_none) {
74481ad6265SDimitry Andric     lexPPDirectiveBody(First, End);
74581ad6265SDimitry Andric     pushDirective(Kind);
74681ad6265SDimitry Andric     return false;
74781ad6265SDimitry Andric   }
74881ad6265SDimitry Andric 
74981ad6265SDimitry Andric   if (Id != "clang") {
75081ad6265SDimitry Andric     skipLine(First, End);
75181ad6265SDimitry Andric     return false;
75281ad6265SDimitry Andric   }
75381ad6265SDimitry Andric 
75406c3fb27SDimitry Andric   FoundId = tryLexIdentifierOrSkipLine(First, End);
75506c3fb27SDimitry Andric   if (!FoundId)
75681ad6265SDimitry Andric     return false;
75706c3fb27SDimitry Andric   Id = *FoundId;
75806c3fb27SDimitry Andric 
75906c3fb27SDimitry Andric   // #pragma clang system_header
76006c3fb27SDimitry Andric   if (Id == "system_header") {
76106c3fb27SDimitry Andric     lexPPDirectiveBody(First, End);
76206c3fb27SDimitry Andric     pushDirective(pp_pragma_system_header);
76306c3fb27SDimitry Andric     return false;
76406c3fb27SDimitry Andric   }
76506c3fb27SDimitry Andric 
76606c3fb27SDimitry Andric   if (Id != "module") {
76706c3fb27SDimitry Andric     skipLine(First, End);
76806c3fb27SDimitry Andric     return false;
76906c3fb27SDimitry Andric   }
77081ad6265SDimitry Andric 
77181ad6265SDimitry Andric   // #pragma clang module.
77281ad6265SDimitry Andric   if (!isNextIdentifierOrSkipLine("import", First, End))
77381ad6265SDimitry Andric     return false;
77481ad6265SDimitry Andric 
77581ad6265SDimitry Andric   // #pragma clang module import.
77681ad6265SDimitry Andric   lexPPDirectiveBody(First, End);
77781ad6265SDimitry Andric   pushDirective(pp_pragma_import);
77881ad6265SDimitry Andric   return false;
77981ad6265SDimitry Andric }
78081ad6265SDimitry Andric 
78181ad6265SDimitry Andric bool Scanner::lexEndif(const char *&First, const char *const End) {
78281ad6265SDimitry Andric   // Strip out "#else" if it's empty.
78381ad6265SDimitry Andric   if (topDirective() == pp_else)
78481ad6265SDimitry Andric     popDirective();
78581ad6265SDimitry Andric 
78681ad6265SDimitry Andric   // If "#ifdef" is empty, strip it and skip the "#endif".
78781ad6265SDimitry Andric   //
78881ad6265SDimitry Andric   // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
78981ad6265SDimitry Andric   // we can skip empty `#if` and `#elif` blocks as well after scanning for a
79081ad6265SDimitry Andric   // literal __has_include in the condition.  Even without that rule we could
79181ad6265SDimitry Andric   // drop the tokens if we scan for identifiers in the condition and find none.
79281ad6265SDimitry Andric   if (topDirective() == pp_ifdef || topDirective() == pp_ifndef) {
79381ad6265SDimitry Andric     popDirective();
79481ad6265SDimitry Andric     skipLine(First, End);
79581ad6265SDimitry Andric     return false;
79681ad6265SDimitry Andric   }
79781ad6265SDimitry Andric 
79881ad6265SDimitry Andric   return lexDefault(pp_endif, First, End);
79981ad6265SDimitry Andric }
80081ad6265SDimitry Andric 
80181ad6265SDimitry Andric bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
80281ad6265SDimitry Andric                          const char *const End) {
80381ad6265SDimitry Andric   lexPPDirectiveBody(First, End);
80481ad6265SDimitry Andric   pushDirective(Kind);
80581ad6265SDimitry Andric   return false;
80681ad6265SDimitry Andric }
80781ad6265SDimitry Andric 
/// Returns true if \p First is one of the characters that lexPPLine() does
/// not immediately skip a line for: '#', '@', 'i', 'e', 'm' or '_'.
static bool isStartOfRelevantLine(char First) {
  return First == '#' || First == '@' || First == 'i' || First == 'e' ||
         First == 'm' || First == '_';
}
82081ad6265SDimitry Andric 
/// Processes the line starting at \p First and advances \p First past what
/// was consumed.
///
/// Lines whose first non-whitespace character is not accepted by
/// isStartOfRelevantLine() are skipped. The remaining lines are dispatched on
/// that first character: '@' to lexAt(), 'i'/'e'/'m' to lexModule(), '_' to
/// the `_Pragma` handling, and '#' to the preprocessing-directive handling
/// below.
///
/// \returns true to signal an error (scanImpl() stops at the first true
/// result), false otherwise.
bool Scanner::lexPPLine(const char *&First, const char *const End) {
  assert(First != End);

  skipWhitespace(First, End);
  assert(First <= End);
  if (First == End)
    return false;

  if (!isStartOfRelevantLine(*First)) {
    skipLine(First, End);
    assert(First <= End);
    return false;
  }

  // Remember where the most recent potentially relevant line started; scan()
  // compares this against the last recorded token.
  LastTokenPtr = First;

  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);

  auto ScEx1 = make_scope_exit([&]() {
    /// Clear Scanner's CurDirToks before returning, in case we didn't push a
    /// new directive.
    CurDirToks.clear();
  });

  // Handle "@import".
  if (*First == '@')
    return lexAt(First, End);

  // Possible `import`, `export` or `module` line.
  if (*First == 'i' || *First == 'e' || *First == 'm')
    return lexModule(First, End);

  // Possible `_Pragma(...)` operator.
  if (*First == '_') {
    if (isNextIdentifierOrSkipLine("_Pragma", First, End))
      return lex_Pragma(First, End);
    return false;
  }

  // Handle preprocessing directives.

  // Put the lexer in directive mode for the rest of this function; the scope
  // exit switches it back off on every return path.
  TheLexer.setParsingPreprocessorDirective(true);
  auto ScEx2 = make_scope_exit(
      [&]() { TheLexer.setParsingPreprocessorDirective(false); });

  // Lex '#'.
  const dependency_directives_scan::Token &HashTok = lexToken(First, End);
  if (HashTok.is(tok::hashhash)) {
    // A \p tok::hashhash at this location is passed by the preprocessor to the
    // parser to interpret, like any other token. So for dependency scanning
    // skip it like a normal token not affecting the preprocessor.
    skipLine(First, End);
    assert(First <= End);
    return false;
  }
  assert(HashTok.is(tok::hash));
  (void)HashTok;

  // The directive name must be an identifier; otherwise the line is dropped.
  std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
  if (!FoundId)
    return false;

  StringRef Id = *FoundId;

  if (Id == "pragma")
    return lexPragma(First, End);

  // Map the directive name to its kind; unknown names yield pp_none.
  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
                  .Case("include", pp_include)
                  .Case("__include_macros", pp___include_macros)
                  .Case("define", pp_define)
                  .Case("undef", pp_undef)
                  .Case("import", pp_import)
                  .Case("include_next", pp_include_next)
                  .Case("if", pp_if)
                  .Case("ifdef", pp_ifdef)
                  .Case("ifndef", pp_ifndef)
                  .Case("elif", pp_elif)
                  .Case("elifdef", pp_elifdef)
                  .Case("elifndef", pp_elifndef)
                  .Case("else", pp_else)
                  .Case("endif", pp_endif)
                  .Default(pp_none);
  if (Kind == pp_none) {
    skipDirective(Id, First, End);
    return false;
  }

  if (Kind == pp_endif)
    return lexEndif(First, End);

  switch (Kind) {
  case pp_include:
  case pp___include_macros:
  case pp_include_next:
  case pp_import:
    // Ignore missing filenames in include or import directives.
    // NOTE(review): the directive is skipped, but the `true` result is
    // propagated by scanImpl()/scan() as a scan failure; confirm that callers
    // rely on this.
    if (lexIncludeFilename(First, End).is(tok::eod)) {
      skipDirective(Id, First, End);
      return true;
    }
    break;
  default:
    break;
  }

  // Everything else.
  return lexDefault(Kind, First, End);
}
92881ad6265SDimitry Andric 
/// Advances \p First by three bytes if the range [First, End) starts with the
/// UTF-8 byte order mark (EF BB BF); otherwise leaves it untouched.
static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
  // Too short to hold a byte order mark.
  if ((End - First) < 3)
    return;

  const bool StartsWithBOM =
      First[0] == '\xef' && First[1] == '\xbb' && First[2] == '\xbf';
  if (StartsWithBOM)
    First += 3;
}
93481ad6265SDimitry Andric 
93581ad6265SDimitry Andric bool Scanner::scanImpl(const char *First, const char *const End) {
93681ad6265SDimitry Andric   skipUTF8ByteOrderMark(First, End);
93781ad6265SDimitry Andric   while (First != End)
93881ad6265SDimitry Andric     if (lexPPLine(First, End))
93981ad6265SDimitry Andric       return true;
94081ad6265SDimitry Andric   return false;
94181ad6265SDimitry Andric }
94281ad6265SDimitry Andric 
94381ad6265SDimitry Andric bool Scanner::scan(SmallVectorImpl<Directive> &Directives) {
94481ad6265SDimitry Andric   bool Error = scanImpl(Input.begin(), Input.end());
94581ad6265SDimitry Andric 
94681ad6265SDimitry Andric   if (!Error) {
94781ad6265SDimitry Andric     // Add an EOF on success.
948bdd1243dSDimitry Andric     if (LastTokenPtr &&
949bdd1243dSDimitry Andric         (Tokens.empty() || LastTokenPtr > Input.begin() + Tokens.back().Offset))
950bdd1243dSDimitry Andric       pushDirective(tokens_present_before_eof);
95181ad6265SDimitry Andric     pushDirective(pp_eof);
95281ad6265SDimitry Andric   }
95381ad6265SDimitry Andric 
95481ad6265SDimitry Andric   ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens;
95581ad6265SDimitry Andric   for (const DirectiveWithTokens &DirWithToks : DirsWithToks) {
95681ad6265SDimitry Andric     assert(RemainingTokens.size() >= DirWithToks.NumTokens);
95781ad6265SDimitry Andric     Directives.emplace_back(DirWithToks.Kind,
95881ad6265SDimitry Andric                             RemainingTokens.take_front(DirWithToks.NumTokens));
95981ad6265SDimitry Andric     RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens);
96081ad6265SDimitry Andric   }
96181ad6265SDimitry Andric   assert(RemainingTokens.empty());
96281ad6265SDimitry Andric 
96381ad6265SDimitry Andric   return Error;
96481ad6265SDimitry Andric }
96581ad6265SDimitry Andric 
96681ad6265SDimitry Andric bool clang::scanSourceForDependencyDirectives(
96781ad6265SDimitry Andric     StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
96881ad6265SDimitry Andric     SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags,
96981ad6265SDimitry Andric     SourceLocation InputSourceLoc) {
97081ad6265SDimitry Andric   return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives);
97181ad6265SDimitry Andric }
97281ad6265SDimitry Andric 
97381ad6265SDimitry Andric void clang::printDependencyDirectivesAsSource(
97481ad6265SDimitry Andric     StringRef Source,
97581ad6265SDimitry Andric     ArrayRef<dependency_directives_scan::Directive> Directives,
97681ad6265SDimitry Andric     llvm::raw_ostream &OS) {
97781ad6265SDimitry Andric   // Add a space separator where it is convenient for testing purposes.
97881ad6265SDimitry Andric   auto needsSpaceSeparator =
97981ad6265SDimitry Andric       [](tok::TokenKind Prev,
98081ad6265SDimitry Andric          const dependency_directives_scan::Token &Tok) -> bool {
98181ad6265SDimitry Andric     if (Prev == Tok.Kind)
98281ad6265SDimitry Andric       return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square,
98381ad6265SDimitry Andric                           tok::r_square);
98481ad6265SDimitry Andric     if (Prev == tok::raw_identifier &&
98581ad6265SDimitry Andric         Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal,
98681ad6265SDimitry Andric                     tok::char_constant, tok::header_name))
98781ad6265SDimitry Andric       return true;
98881ad6265SDimitry Andric     if (Prev == tok::r_paren &&
98981ad6265SDimitry Andric         Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal,
99081ad6265SDimitry Andric                     tok::char_constant, tok::unknown))
99181ad6265SDimitry Andric       return true;
99281ad6265SDimitry Andric     if (Prev == tok::comma &&
99381ad6265SDimitry Andric         Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less))
99481ad6265SDimitry Andric       return true;
99581ad6265SDimitry Andric     return false;
99681ad6265SDimitry Andric   };
99781ad6265SDimitry Andric 
99881ad6265SDimitry Andric   for (const dependency_directives_scan::Directive &Directive : Directives) {
999bdd1243dSDimitry Andric     if (Directive.Kind == tokens_present_before_eof)
1000bdd1243dSDimitry Andric       OS << "<TokBeforeEOF>";
1001bdd1243dSDimitry Andric     std::optional<tok::TokenKind> PrevTokenKind;
100281ad6265SDimitry Andric     for (const dependency_directives_scan::Token &Tok : Directive.Tokens) {
100381ad6265SDimitry Andric       if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok))
100481ad6265SDimitry Andric         OS << ' ';
100581ad6265SDimitry Andric       PrevTokenKind = Tok.Kind;
100681ad6265SDimitry Andric       OS << Source.slice(Tok.Offset, Tok.getEnd());
100781ad6265SDimitry Andric     }
100881ad6265SDimitry Andric   }
100981ad6265SDimitry Andric }
1010