xref: /llvm-project/flang/lib/Parser/prescan.cpp (revision e286ecfecf50fd8eff2e7003131d2c845cb6045f)
1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "flang/Common/idioms.h"
11 #include "flang/Parser/characters.h"
12 #include "flang/Parser/message.h"
13 #include "flang/Parser/preprocessor.h"
14 #include "flang/Parser/source.h"
15 #include "flang/Parser/token-sequence.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Deepest permitted nesting of prescanners.  Each nested Prescanner is
// constructed with a nesting count one greater than its parent's, and
// Prescan() reports an error instead of scanning once this is exceeded.
static constexpr int maxPrescannerNesting{100};
27 
28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29     Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30     : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31       allSources_{preprocessor_.allSources()}, features_{lfc},
32       backslashFreeFormContinuation_{preprocessor.AnyDefinitions()},
33       encoding_{allSources_.encoding()} {}
34 
35 Prescanner::Prescanner(const Prescanner &that, bool isNestedInIncludeDirective)
36     : messages_{that.messages_}, cooked_{that.cooked_},
37       preprocessor_{that.preprocessor_}, allSources_{that.allSources_},
38       features_{that.features_},
39       isNestedInIncludeDirective_{isNestedInIncludeDirective},
40       backslashFreeFormContinuation_{that.backslashFreeFormContinuation_},
41       inFixedForm_{that.inFixedForm_},
42       fixedFormColumnLimit_{that.fixedFormColumnLimit_},
43       encoding_{that.encoding_},
44       prescannerNesting_{that.prescannerNesting_ + 1},
45       skipLeadingAmpersand_{that.skipLeadingAmpersand_},
46       compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
47       compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
48 
// True for the characters '!', '*', 'C', and 'c', which this file treats as
// fixed-form comment markers.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
52 
53 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
54   char *p{dir.GetMutableCharData()};
55   char *limit{p + dir.SizeInChars()};
56   for (; p < limit; ++p) {
57     if (*p != ' ') {
58       CHECK(IsFixedFormCommentChar(*p));
59       *p = '!';
60       return;
61     }
62   }
63   DIE("compiler directive all blank");
64 }
65 
66 void Prescanner::Prescan(ProvenanceRange range) {
67   startProvenance_ = range.start();
68   start_ = allSources_.GetSource(range);
69   CHECK(start_);
70   limit_ = start_ + range.size();
71   nextLine_ = start_;
72   const bool beganInFixedForm{inFixedForm_};
73   if (prescannerNesting_ > maxPrescannerNesting) {
74     Say(GetProvenance(start_),
75         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
76     return;
77   }
78   while (!IsAtEnd()) {
79     Statement();
80   }
81   if (inFixedForm_ != beganInFixedForm) {
82     std::string dir{"!dir$ "};
83     if (beganInFixedForm) {
84       dir += "fixed";
85     } else {
86       dir += "free";
87     }
88     dir += '\n';
89     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
90     tokens.Emit(cooked_);
91   }
92 }
93 
94 void Prescanner::Statement() {
95   TokenSequence tokens;
96   const char *statementStart{nextLine_};
97   LineClassification line{ClassifyLine(statementStart)};
98   switch (line.kind) {
99   case LineClassification::Kind::Comment:
100     nextLine_ += line.payloadOffset; // advance to '!' or newline
101     NextLine();
102     return;
103   case LineClassification::Kind::IncludeLine:
104     FortranInclude(nextLine_ + line.payloadOffset);
105     NextLine();
106     return;
107   case LineClassification::Kind::ConditionalCompilationDirective:
108   case LineClassification::Kind::DefinitionDirective:
109   case LineClassification::Kind::PreprocessorDirective:
110     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
111     return;
112   case LineClassification::Kind::IncludeDirective:
113     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
114     afterIncludeDirective_ = true;
115     return;
116   case LineClassification::Kind::CompilerDirective: {
117     directiveSentinel_ = line.sentinel;
118     CHECK(InCompilerDirective());
119     BeginStatementAndAdvance();
120     if (inFixedForm_) {
121       CHECK(IsFixedFormCommentChar(*at_));
122     } else {
123       while (*at_ == ' ' || *at_ == '\t') {
124         ++at_, ++column_;
125       }
126       CHECK(*at_ == '!');
127     }
128     std::optional<int> condOffset;
129     if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
130       // OpenMP conditional compilation line.
131       condOffset = 2;
132     } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'c' &&
133         directiveSentinel_[2] == 'u' && directiveSentinel_[3] == 'f' &&
134         directiveSentinel_[4] == '\0') {
135       // CUDA conditional compilation line.
136       condOffset = 5;
137     }
138     if (condOffset) {
139       at_ += *condOffset, column_ += *condOffset;
140       if (auto payload{IsIncludeLine(at_)}) {
141         FortranInclude(at_ + *payload);
142         return;
143       } else if (inFixedForm_) {
144         LabelField(tokens);
145       } else {
146         SkipSpaces();
147       }
148     } else {
149       // Compiler directive.  Emit normalized sentinel, squash following spaces.
150       EmitChar(tokens, '!');
151       ++at_, ++column_;
152       for (const char *sp{directiveSentinel_}; *sp != '\0';
153            ++sp, ++at_, ++column_) {
154         EmitChar(tokens, *sp);
155       }
156       if (*at_ == ' ' || *at_ == '\t') {
157         EmitChar(tokens, ' ');
158         while (*at_ == ' ' || *at_ == '\t') {
159           ++at_, ++column_;
160         }
161       }
162       tokens.CloseToken();
163     }
164     break;
165   }
166   case LineClassification::Kind::Source:
167     BeginStatementAndAdvance();
168     if (inFixedForm_) {
169       if (features_.IsEnabled(LanguageFeature::OldDebugLines) &&
170           (*at_ == 'D' || *at_ == 'd')) {
171         NextChar();
172       }
173       LabelField(tokens);
174     } else if (skipLeadingAmpersand_) {
175       skipLeadingAmpersand_ = false;
176       const char *p{SkipWhiteSpace(at_)};
177       if (p < limit_ && *p == '&') {
178         column_ += ++p - at_;
179         at_ = p;
180       }
181     } else {
182       SkipSpaces();
183       // Check for a leading identifier that might be a keyword macro
184       // that will expand to anything indicating a non-source line, like
185       // a comment marker or directive sentinel.  If so, disable line
186       // continuation, so that NextToken() won't consume anything from
187       // following lines.
188       if (IsLegalIdentifierStart(*at_)) {
189         CHECK(NextToken(tokens));
190         CHECK(tokens.SizeInTokens() == 1);
191         CharBlock id{tokens.TokenAt(0)};
192         if (preprocessor_.IsNameDefined(id) &&
193             !preprocessor_.IsFunctionLikeDefinition(id)) {
194           if (auto replaced{preprocessor_.MacroReplacement(tokens, *this)}) {
195             auto newLineClass{ClassifyLine(*replaced, GetCurrentProvenance())};
196             disableSourceContinuation_ =
197                 newLineClass.kind != LineClassification::Kind::Source;
198             if (newLineClass.kind ==
199                 LineClassification::Kind::CompilerDirective) {
200               directiveSentinel_ = newLineClass.sentinel;
201             }
202           }
203         }
204       }
205     }
206     break;
207   }
208 
209   while (NextToken(tokens)) {
210   }
211   if (continuationLines_ > 255) {
212     if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
213       Say(GetProvenance(statementStart),
214           "%d continuation lines is more than the Fortran standard allows"_port_en_US,
215           continuationLines_);
216     }
217   }
218 
219   Provenance newlineProvenance{GetCurrentProvenance()};
220   if (std::optional<TokenSequence> preprocessed{
221           preprocessor_.MacroReplacement(tokens, *this)}) {
222     // Reprocess the preprocessed line.
223     LineClassification ppl{ClassifyLine(*preprocessed, newlineProvenance)};
224     switch (ppl.kind) {
225     case LineClassification::Kind::Comment:
226       break;
227     case LineClassification::Kind::IncludeLine:
228       FortranInclude(preprocessed->TokenAt(0).begin() + ppl.payloadOffset);
229       break;
230     case LineClassification::Kind::ConditionalCompilationDirective:
231     case LineClassification::Kind::IncludeDirective:
232     case LineClassification::Kind::DefinitionDirective:
233     case LineClassification::Kind::PreprocessorDirective:
234       if (features_.ShouldWarn(common::UsageWarning::Preprocessing)) {
235         Say(preprocessed->GetProvenanceRange(),
236             "Preprocessed line resembles a preprocessor directive"_warn_en_US);
237       }
238       CheckAndEmitLine(preprocessed->ToLowerCase(), newlineProvenance);
239       break;
240     case LineClassification::Kind::CompilerDirective:
241       if (preprocessed->HasRedundantBlanks()) {
242         preprocessed->RemoveRedundantBlanks();
243       }
244       while (CompilerDirectiveContinuation(*preprocessed, ppl.sentinel)) {
245         newlineProvenance = GetCurrentProvenance();
246       }
247       NormalizeCompilerDirectiveCommentMarker(*preprocessed);
248       preprocessed->ToLowerCase();
249       SourceFormChange(preprocessed->ToString());
250       CheckAndEmitLine(preprocessed->ToLowerCase().ClipComment(
251                            *this, true /* skip first ! */),
252           newlineProvenance);
253       break;
254     case LineClassification::Kind::Source:
255       if (inFixedForm_) {
256         if (preprocessed->HasBlanks(/*after column*/ 6)) {
257           preprocessed->RemoveBlanks(/*after column*/ 6);
258         }
259       } else {
260         while (SourceLineContinuation(*preprocessed)) {
261           newlineProvenance = GetCurrentProvenance();
262         }
263         if (preprocessed->HasRedundantBlanks()) {
264           preprocessed->RemoveRedundantBlanks();
265         }
266       }
267       CheckAndEmitLine(
268           preprocessed->ToLowerCase().ClipComment(*this), newlineProvenance);
269       break;
270     }
271   } else { // no macro replacement
272     if (line.kind == LineClassification::Kind::CompilerDirective) {
273       while (CompilerDirectiveContinuation(tokens, line.sentinel)) {
274         newlineProvenance = GetCurrentProvenance();
275       }
276       tokens.ToLowerCase();
277       SourceFormChange(tokens.ToString());
278     } else { // Kind::Source
279       tokens.ToLowerCase();
280       if (inFixedForm_) {
281         EnforceStupidEndStatementRules(tokens);
282       }
283     }
284     CheckAndEmitLine(tokens, newlineProvenance);
285   }
286   directiveSentinel_ = nullptr;
287 }
288 
289 void Prescanner::CheckAndEmitLine(
290     TokenSequence &tokens, Provenance newlineProvenance) {
291   tokens.CheckBadFortranCharacters(
292       messages_, *this, disableSourceContinuation_);
293   // Parenthesis nesting check does not apply while any #include is
294   // active, nor on the lines before and after a top-level #include.
295   // Applications play shenanigans with line continuation before and
296   // after #include'd subprogram argument lists.
297   if (!isNestedInIncludeDirective_ && !omitNewline_ &&
298       !afterIncludeDirective_) {
299     tokens.CheckBadParentheses(messages_);
300   }
301   tokens.Emit(cooked_);
302   if (omitNewline_) {
303     omitNewline_ = false;
304   } else {
305     cooked_.Put('\n', newlineProvenance);
306     afterIncludeDirective_ = false;
307   }
308 }
309 
310 TokenSequence Prescanner::TokenizePreprocessorDirective() {
311   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
312   inPreprocessorDirective_ = true;
313   BeginStatementAndAdvance();
314   TokenSequence tokens;
315   while (NextToken(tokens)) {
316   }
317   inPreprocessorDirective_ = false;
318   return tokens;
319 }
320 
321 void Prescanner::NextLine() {
322   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
323   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
324   if (!v) {
325     nextLine_ = limit_;
326   } else {
327     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
328     nextLine_ = nl + 1;
329   }
330 }
331 
332 void Prescanner::LabelField(TokenSequence &token) {
333   int outCol{1};
334   const char *start{at_};
335   std::optional<int> badColumn;
336   for (; *at_ != '\n' && column_ <= 6; ++at_) {
337     if (*at_ == '\t') {
338       ++at_;
339       column_ = 7;
340       break;
341     }
342     if (*at_ != ' ' &&
343         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
344       EmitChar(token, *at_);
345       ++outCol;
346       if (!badColumn && (column_ == 6 || !IsDecimalDigit(*at_))) {
347         badColumn = column_;
348       }
349     }
350     ++column_;
351   }
352   if (badColumn && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
353     if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
354       Say(GetProvenance(start + *badColumn - 1),
355           *badColumn == 6
356               ? "Statement should not begin with a continuation line"_warn_en_US
357               : "Character in fixed-form label field must be a digit"_warn_en_US);
358     }
359     token.clear();
360     if (*badColumn < 6) {
361       at_ = start;
362       column_ = 1;
363       return;
364     }
365     outCol = 1;
366   }
367   if (outCol == 1) { // empty label field
368     // Emit a space so that, if the line is rescanned after preprocessing,
369     // a leading 'C' or 'D' won't be left-justified and then accidentally
370     // misinterpreted as a comment card.
371     EmitChar(token, ' ');
372     ++outCol;
373   }
374   token.CloseToken();
375   SkipToNextSignificantCharacter();
376   if (IsDecimalDigit(*at_)) {
377     if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
378       Say(GetCurrentProvenance(),
379           "Label digit is not in fixed-form label field"_port_en_US);
380     }
381   }
382 }
383 
384 // 6.3.3.5: A program unit END statement, or any other statement whose
385 // initial line resembles an END statement, shall not be continued in
386 // fixed form source.
387 void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
388   CharBlock cBlock{tokens.ToCharBlock()};
389   const char *str{cBlock.begin()};
390   std::size_t n{cBlock.size()};
391   if (n < 3) {
392     return;
393   }
394   std::size_t j{0};
395   for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
396   }
397   if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
398     return;
399   }
400   // It starts with END, possibly after a label.
401   auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
402   auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
403   if (!start || !end) {
404     return;
405   }
406   if (&*start->sourceFile == &*end->sourceFile && start->line == end->line) {
407     return; // no continuation
408   }
409   j += 3;
410   static const char *const prefixes[]{"program", "subroutine", "function",
411       "blockdata", "module", "submodule", nullptr};
412   bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
413   std::size_t endOfPrefix{j - 1};
414   for (const char *const *p{prefixes}; *p; ++p) {
415     std::size_t pLen{std::strlen(*p)};
416     if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
417       isPrefix = true; // END thing as prefix
418       j += pLen;
419       endOfPrefix = j - 1;
420       for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
421       }
422       break;
423     }
424   }
425   if (isPrefix) {
426     auto range{tokens.GetTokenProvenanceRange(1)};
427     if (j == n) { // END or END thing [name]
428       Say(range,
429           "Program unit END statement may not be continued in fixed form source"_err_en_US);
430     } else {
431       auto endOfPrefixPos{
432           allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
433       auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
434       if (endOfPrefixPos && next &&
435           &*endOfPrefixPos->sourceFile == &*start->sourceFile &&
436           endOfPrefixPos->line == start->line &&
437           (&*next->sourceFile != &*start->sourceFile ||
438               next->line != start->line)) {
439         Say(range,
440             "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
441       }
442     }
443   }
444 }
445 
446 void Prescanner::SkipToEndOfLine() {
447   while (*at_ != '\n') {
448     ++at_, ++column_;
449   }
450 }
451 
452 bool Prescanner::MustSkipToEndOfLine() const {
453   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
454     return true; // skip over ignored columns in right margin (73:80)
455   } else if (*at_ == '!' && !inCharLiteral_) {
456     return true; // inline comment goes to end of source line
457   } else {
458     return false;
459   }
460 }
461 
462 void Prescanner::NextChar() {
463   CHECK(*at_ != '\n');
464   ++at_, ++column_;
465   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
466     // UTF-8 byte order mark - treat this file as UTF-8
467     at_ += 3;
468     encoding_ = Encoding::UTF_8;
469   }
470   SkipToNextSignificantCharacter();
471 }
472 
473 // Skip everything that should be ignored until the next significant
474 // character is reached; handles C-style comments in preprocessing
475 // directives, Fortran ! comments, stuff after the right margin in
476 // fixed form, and all forms of line continuation.
477 bool Prescanner::SkipToNextSignificantCharacter() {
478   auto anyContinuationLine{false};
479   if (inPreprocessorDirective_) {
480     SkipCComments();
481   } else {
482     bool mightNeedSpace{false};
483     if (MustSkipToEndOfLine()) {
484       SkipToEndOfLine();
485     } else {
486       mightNeedSpace = *at_ == '\n';
487     }
488     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
489       anyContinuationLine = true;
490       ++continuationLines_;
491       if (MustSkipToEndOfLine()) {
492         SkipToEndOfLine();
493       }
494     }
495     if (*at_ == '\t') {
496       tabInCurrentLine_ = true;
497     }
498   }
499   return anyContinuationLine;
500 }
501 
502 void Prescanner::SkipCComments() {
503   while (true) {
504     if (IsCComment(at_)) {
505       if (const char *after{SkipCComment(at_)}) {
506         column_ += after - at_;
507         // May have skipped over one or more newlines; relocate the start of
508         // the next line.
509         nextLine_ = at_ = after;
510         NextLine();
511       } else {
512         // Don't emit any messages about unclosed C-style comments, because
513         // the sequence /* can appear legally in a FORMAT statement.  There's
514         // no ambiguity, since the sequence */ cannot appear legally.
515         break;
516       }
517     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
518         at_[1] == '\n' && !IsAtEnd()) {
519       BeginSourceLineAndAdvance();
520     } else {
521       break;
522     }
523   }
524 }
525 
526 void Prescanner::SkipSpaces() {
527   while (*at_ == ' ' || *at_ == '\t') {
528     NextChar();
529   }
530   insertASpace_ = false;
531 }
532 
533 const char *Prescanner::SkipWhiteSpace(const char *p) {
534   while (*p == ' ' || *p == '\t') {
535     ++p;
536   }
537   return p;
538 }
539 
540 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
541   while (true) {
542     if (*p == ' ' || *p == '\t') {
543       ++p;
544     } else if (IsCComment(p)) {
545       if (const char *after{SkipCComment(p)}) {
546         p = after;
547       } else {
548         break;
549       }
550     } else {
551       break;
552     }
553   }
554   return p;
555 }
556 
557 const char *Prescanner::SkipCComment(const char *p) const {
558   char star{' '}, slash{' '};
559   p += 2;
560   while (star != '*' || slash != '/') {
561     if (p >= limit_) {
562       return nullptr; // signifies an unterminated comment
563     }
564     star = slash;
565     slash = *p++;
566   }
567   return p;
568 }
569 
// Scans the next preprocessing token at the scan position and appends it to
// "tokens".  Returns false when the scan position is at a newline, i.e.
// when the statement (with its continuation lines) has been exhausted, and
// true after a token has been appended and closed.
//
// In fixed form source, blanks are skipped wherever they are encountered;
// otherwise, runs of white space are compressed to a single blank token or
// dropped.  The flags preventHollerith_, slashInCurrentStatement_, and
// parenthesisNesting_ carry context from one token to the next.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(GetCurrentProvenance(),
            "nonstandard usage: C-style comment"_port_en_US);
      }
      SkipCComments();
    }
    if (*at_ == ' ' || *at_ == '\t') {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      // The character before the white space; treated as a blank at the
      // very start of the source.
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n') {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  if (insertASpace_) {
    // A space was requested earlier; it becomes the first character of
    // the token that is about to be scanned.
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // A digit string, which may turn out to be the count of a Hollerith
    // literal, a real literal, a hexadecimal constant in a directive, or
    // the kind prefix of a character literal.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      // Stop accumulating once the value is too large to be a Hollerith
      // count, so that n cannot overflow.
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // 0x... hexadecimal constant, recognized only in directives
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    } else if (IsLetter(*at_) && !preventHollerith_ &&
        parenthesisNesting_ > 0) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      // real literal with no digits before the decimal point
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    // An identifier or keyword, possibly split across continuation lines.
    int parts{1};
    const char *afterLast{nullptr};
    do {
      EmitChar(tokens, *at_);
      ++at_, ++column_;
      afterLast = at_;
      if (SkipToNextSignificantCharacter() && IsLegalIdentifierStart(*at_)) {
        // The name resumes on a continuation line; provisionally begin a
        // new token there.
        tokens.CloseToken();
        ++parts;
      }
    } while (IsLegalInIdentifier(*at_));
    if (parts >= 3) {
      // Subtlety: When an identifier is split across three or more
      // continuation lines (or two continuation lines, immediately preceded
      // or followed by '&' free form continuation line markers), its parts
      // are kept as distinct pp-tokens so that macro replacement operates on
      // them independently.  This trick accommodates the historic practice
      // of using line continuation for token pasting after replacement.
    } else if (parts == 2) {
      if ((start > start_ && start[-1] == '&') ||
          (afterLast < limit_ && (*afterLast == '&' || *afterLast == '\n'))) {
        // call &                call foo&        call foo&
        //   &MACRO&      OR       &MACRO&   OR     &MACRO
        //   &foo(...)             &(...)
      } else {
        // Otherwise the two parts are rejoined into a single token.
        tokens.ReopenLastToken();
      }
    }
    if (InFixedFormSource()) {
      SkipSpaces();
    }
    if ((*at_ == '\'' || *at_ == '"') &&
        tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
      QuotedCharacterLiteral(tokens, start);
      preventHollerith_ = false;
    } else {
      preventHollerith_ = true; // DO 10 H = ...
    }
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*'); // the token is "**"
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentStatement_;
    }
  } else {
    // Any other character: punctuation or an operator.
    char ch{*at_};
    if (ch == '(') {
      if (parenthesisNesting_++ == 0) {
        // At an outermost '(' note whether the preceding token names a
        // function-like macro.
        isPossibleMacroCall_ = tokens.SizeInTokens() > 0 &&
            preprocessor_.IsFunctionLikeDefinition(
                tokens.TokenAt(tokens.SizeInTokens() - 1));
      }
    } else if (ch == ')' && parenthesisNesting_ > 0) {
      --parenthesisNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentStatement_ = true;
    } else if (ch == ';' && InFixedFormSource()) {
      SkipSpaces();
      if (IsDecimalDigit(*at_)) {
        if (features_.ShouldWarn(
                common::LanguageFeature::MiscSourceExtensions)) {
          Say(GetProvenanceRange(at_, at_ + 1),
              "Label should be in the label field"_port_en_US);
        }
      }
    }
  }
  tokens.CloseToken();
  return true;
}
752 
753 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
754   char ed{ToLowerCaseLetter(*at_)};
755   if (ed != 'e' && ed != 'd') {
756     return false;
757   }
758   EmitCharAndAdvance(tokens, ed);
759   if (*at_ == '+' || *at_ == '-') {
760     EmitCharAndAdvance(tokens, *at_);
761   }
762   while (IsDecimalDigit(*at_)) {
763     EmitCharAndAdvance(tokens, *at_);
764   }
765   if (*at_ == '_') {
766     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
767     }
768   }
769   return true;
770 }
771 
772 void Prescanner::QuotedCharacterLiteral(
773     TokenSequence &tokens, const char *start) {
774   char quote{*at_};
775   const char *end{at_ + 1};
776   inCharLiteral_ = true;
777   continuationInCharLiteral_ = true;
778   const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
779   const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
780   bool isEscaped{false};
781   bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
782   while (true) {
783     if (*at_ == '\\') {
784       if (escapesEnabled) {
785         isEscaped = !isEscaped;
786       } else {
787         // The parser always processes escape sequences, so don't confuse it
788         // when escapes are disabled.
789         insert('\\');
790       }
791     } else {
792       isEscaped = false;
793     }
794     EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
795         Encoding::LATIN_1);
796     while (PadOutCharacterLiteral(tokens)) {
797     }
798     if (*at_ == '\n') {
799       if (!inPreprocessorDirective_) {
800         Say(GetProvenanceRange(start, end),
801             "Incomplete character literal"_err_en_US);
802       }
803       break;
804     }
805     end = at_ + 1;
806     NextChar();
807     if (*at_ == quote && !isEscaped) {
808       // A doubled unescaped quote mark becomes a single instance of that
809       // quote character in the literal (later).  There can be spaces between
810       // the quotes in fixed form source.
811       EmitChar(tokens, quote);
812       inCharLiteral_ = false; // for cases like print *, '...'!comment
813       NextChar();
814       if (InFixedFormSource()) {
815         SkipSpaces();
816       }
817       if (*at_ != quote) {
818         break;
819       }
820       inCharLiteral_ = true;
821     }
822   }
823   continuationInCharLiteral_ = false;
824   inCharLiteral_ = false;
825 }
826 
827 void Prescanner::Hollerith(
828     TokenSequence &tokens, int count, const char *start) {
829   inCharLiteral_ = true;
830   CHECK(*at_ == 'h' || *at_ == 'H');
831   EmitChar(tokens, 'H');
832   while (count-- > 0) {
833     if (PadOutCharacterLiteral(tokens)) {
834     } else if (*at_ == '\n') {
835       if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
836         Say(GetProvenanceRange(start, at_),
837             "Possible truncated Hollerith literal"_warn_en_US);
838       }
839       break;
840     } else {
841       NextChar();
842       // Each multi-byte character encoding counts as a single character.
843       // No escape sequences are recognized.
844       // Hollerith is always emitted to the cooked character
845       // stream in UTF-8.
846       DecodedCharacter decoded{DecodeCharacter(
847           encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
848       if (decoded.bytes > 0) {
849         EncodedCharacter utf8{
850             EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
851         for (int j{0}; j < utf8.bytes; ++j) {
852           EmitChar(tokens, utf8.buffer[j]);
853         }
854         at_ += decoded.bytes - 1;
855       } else {
856         Say(GetProvenanceRange(start, at_),
857             "Bad character in Hollerith literal"_err_en_US);
858         break;
859       }
860     }
861   }
862   if (*at_ != '\n') {
863     NextChar();
864   }
865   inCharLiteral_ = false;
866 }
867 
868 // In fixed form, source card images must be processed as if they were at
869 // least 72 columns wide, at least in character literal contexts.
870 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
871   while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
872     if (column_ < fixedFormColumnLimit_) {
873       tokens.PutNextTokenChar(' ', spaceProvenance_);
874       ++column_;
875       return true;
876     }
877     if (!FixedFormContinuation(false /*no need to insert space*/) ||
878         tabInCurrentLine_) {
879       return false;
880     }
881     CHECK(column_ == 7);
882     --at_; // point to column 6 of continuation line
883     column_ = 6;
884   }
885   return false;
886 }
887 
888 static bool IsAtProcess(const char *p) {
889   static const char pAtProc[]{"process"};
890   for (std::size_t i{0}; i < sizeof pAtProc - 1; ++i) {
891     if (ToLowerCaseLetter(*++p) != pAtProc[i])
892       return false;
893   }
894   return true;
895 }
896 
897 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
898   const char *p{start};
899 
900   // The @process directive must start in column 1.
901   if (*p == '@' && IsAtProcess(p)) {
902     return true;
903   }
904 
905   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
906       ((*p == 'D' || *p == 'd') &&
907           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
908     return true;
909   }
910   bool anyTabs{false};
911   while (true) {
912     if (*p == ' ') {
913       ++p;
914     } else if (*p == '\t') {
915       anyTabs = true;
916       ++p;
917     } else if (*p == '0' && !anyTabs && p == start + 5) {
918       ++p; // 0 in column 6 must treated as a space
919     } else {
920       break;
921     }
922   }
923   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
924     return true;
925   }
926   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
927     return true;
928   }
929   return *p == '\n';
930 }
931 
932 const char *Prescanner::IsFreeFormComment(const char *p) const {
933   p = SkipWhiteSpaceAndCComments(p);
934   if (*p == '!' || *p == '\n') {
935     return p;
936   } else if (*p == '@') {
937     return IsAtProcess(p) ? p : nullptr;
938   } else {
939     return nullptr;
940   }
941 }
942 
943 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
944   const char *p{SkipWhiteSpace(start)};
945   if (*p == '0' && inFixedForm_ && p == start + 5) {
946     // Accept "     0INCLUDE" in fixed form.
947     p = SkipWhiteSpace(p + 1);
948   }
949   for (const char *q{"include"}; *q; ++q) {
950     if (ToLowerCaseLetter(*p) != *q) {
951       return std::nullopt;
952     }
953     p = SkipWhiteSpace(p + 1);
954   }
955   if (IsDecimalDigit(*p)) { // accept & ignore a numeric kind prefix
956     for (p = SkipWhiteSpace(p + 1); IsDecimalDigit(*p);
957          p = SkipWhiteSpace(p + 1)) {
958     }
959     if (*p != '_') {
960       return std::nullopt;
961     }
962     p = SkipWhiteSpace(p + 1);
963   }
964   if (*p == '"' || *p == '\'') {
965     return {p - start};
966   }
967   return std::nullopt;
968 }
969 
970 void Prescanner::FortranInclude(const char *firstQuote) {
971   const char *p{firstQuote};
972   while (*p != '"' && *p != '\'') {
973     ++p;
974   }
975   char quote{*p};
976   std::string path;
977   for (++p; *p != '\n'; ++p) {
978     if (*p == quote) {
979       if (p[1] != quote) {
980         break;
981       }
982       ++p;
983     }
984     path += *p;
985   }
986   if (*p != quote) {
987     Say(GetProvenanceRange(firstQuote, p),
988         "malformed path name string"_err_en_US);
989     return;
990   }
991   p = SkipWhiteSpace(p + 1);
992   if (*p != '\n' && *p != '!') {
993     const char *garbage{p};
994     for (; *p != '\n' && *p != '!'; ++p) {
995     }
996     if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
997       Say(GetProvenanceRange(garbage, p),
998           "excess characters after path name"_warn_en_US);
999     }
1000   }
1001   std::string buf;
1002   llvm::raw_string_ostream error{buf};
1003   Provenance provenance{GetProvenance(nextLine_)};
1004   std::optional<std::string> prependPath;
1005   if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
1006     prependPath = DirectoryName(currentFile->path());
1007   }
1008   const SourceFile *included{
1009       allSources_.Open(path, error, std::move(prependPath))};
1010   if (!included) {
1011     Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
1012   } else if (included->bytes() > 0) {
1013     ProvenanceRange includeLineRange{
1014         provenance, static_cast<std::size_t>(p - nextLine_)};
1015     ProvenanceRange fileRange{
1016         allSources_.AddIncludedFile(*included, includeLineRange)};
1017     Prescanner{*this, /*isNestedInIncludeDirective=*/false}
1018         .set_encoding(included->encoding())
1019         .Prescan(fileRange);
1020   }
1021 }
1022 
1023 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
1024   const char *p{start};
1025   for (; *p == ' '; ++p) {
1026   }
1027   if (*p == '#') {
1028     if (inFixedForm_ && p == start + 5) {
1029       return nullptr;
1030     }
1031   } else {
1032     p = SkipWhiteSpace(p);
1033     if (*p != '#') {
1034       return nullptr;
1035     }
1036   }
1037   return SkipWhiteSpace(p + 1);
1038 }
1039 
1040 bool Prescanner::IsNextLinePreprocessorDirective() const {
1041   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
1042 }
1043 
1044 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
1045   if (IsAtEnd()) {
1046     if (afterAmpersand && prescannerNesting_ > 0) {
1047       // A continuation marker at the end of the last line in an
1048       // include file inhibits the newline for that line.
1049       SkipToEndOfLine();
1050       omitNewline_ = true;
1051     }
1052     return false;
1053   }
1054   auto lineClass{ClassifyLine(nextLine_)};
1055   if (lineClass.kind == LineClassification::Kind::Comment) {
1056     NextLine();
1057     return true;
1058   } else if (inPreprocessorDirective_) {
1059     return false;
1060   } else if (lineClass.kind ==
1061           LineClassification::Kind::ConditionalCompilationDirective ||
1062       lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
1063     // Allow conditional compilation directives (e.g., #ifdef) to affect
1064     // continuation lines.
1065     // Allow other preprocessor directives, too, except #include
1066     // (when it does not follow '&'), #define, and #undef (because
1067     // they cannot be allowed to affect preceding text on a
1068     // continued line).
1069     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
1070     return true;
1071   } else if (afterAmpersand &&
1072       (lineClass.kind == LineClassification::Kind::IncludeDirective ||
1073           lineClass.kind == LineClassification::Kind::IncludeLine)) {
1074     SkipToEndOfLine();
1075     omitNewline_ = true;
1076     skipLeadingAmpersand_ = true;
1077     return false;
1078   } else {
1079     return false;
1080   }
1081 }
1082 
1083 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
1084   if (IsAtEnd()) {
1085     return nullptr;
1086   }
1087   tabInCurrentLine_ = false;
1088   char col1{*nextLine_};
1089   if (IsFixedFormCommentChar(col1)) {
1090     int j{1};
1091     if (InCompilerDirective()) {
1092       // Must be a continued compiler directive.
1093       for (; j < 5; ++j) {
1094         char ch{directiveSentinel_[j - 1]};
1095         if (ch == '\0') {
1096           break;
1097         }
1098         if (ch != ToLowerCaseLetter(nextLine_[j])) {
1099           return nullptr;
1100         }
1101       }
1102     } else if (features_.IsEnabled(LanguageFeature::OpenMP)) {
1103       // Fixed Source Form Conditional Compilation Sentinels.
1104       if (nextLine_[1] != '$') {
1105         return nullptr;
1106       }
1107       j++;
1108     } else {
1109       return nullptr;
1110     }
1111     for (; j < 5; ++j) {
1112       if (nextLine_[j] != ' ') {
1113         return nullptr;
1114       }
1115     }
1116     char col6{nextLine_[5]};
1117     if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
1118       if (nextLine_[6] != ' ' && mightNeedSpace) {
1119         insertASpace_ = true;
1120       }
1121       return nextLine_ + 6;
1122     }
1123     return nullptr;
1124   } else {
1125     // Normal case: not in a compiler directive.
1126     if (col1 == '&' &&
1127         features_.IsEnabled(
1128             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
1129       // Extension: '&' as continuation marker
1130       if (features_.ShouldWarn(
1131               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
1132         Say(GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
1133       }
1134       return nextLine_ + 1;
1135     }
1136     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
1137       tabInCurrentLine_ = true;
1138       return nextLine_ + 2; // VAX extension
1139     }
1140     if ((col1 == ' ' ||
1141             ((col1 == 'D' || col1 == 'd') &&
1142                 features_.IsEnabled(LanguageFeature::OldDebugLines))) &&
1143         nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' &&
1144         nextLine_[4] == ' ') {
1145       char col6{nextLine_[5]};
1146       if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
1147         if ((col6 == 'i' || col6 == 'I') && IsIncludeLine(nextLine_)) {
1148           // It's An INCLUDE line, not a continuation
1149         } else {
1150           return nextLine_ + 6;
1151         }
1152       }
1153     }
1154     if (IsImplicitContinuation()) {
1155       return nextLine_;
1156     }
1157   }
1158   return nullptr; // not a continuation line
1159 }
1160 
1161 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
1162   const char *p{nextLine_};
1163   if (p >= limit_) {
1164     return nullptr;
1165   }
1166   p = SkipWhiteSpace(p);
1167   if (InCompilerDirective()) {
1168     if (*p++ != '!') {
1169       return nullptr;
1170     }
1171     for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
1172       if (*s != ToLowerCaseLetter(*p)) {
1173         return nullptr;
1174       }
1175     }
1176     p = SkipWhiteSpace(p);
1177     if (*p == '&') {
1178       if (!ampersand) {
1179         insertASpace_ = true;
1180       }
1181       return p + 1;
1182     } else if (ampersand) {
1183       return p;
1184     } else {
1185       return nullptr;
1186     }
1187   } else {
1188     if (*p == '&') {
1189       return p + 1;
1190     } else if (*p == '!' || *p == '\n' || *p == '#') {
1191       return nullptr;
1192     } else if (ampersand || IsImplicitContinuation()) {
1193       if (continuationInCharLiteral_) {
1194         // 'a'&            -> 'a''b' == "a'b"
1195         //   'b'
1196         if (features_.ShouldWarn(
1197                 common::LanguageFeature::MiscSourceExtensions)) {
1198           Say(GetProvenanceRange(p, p + 1),
1199               "Character literal continuation line should have been preceded by '&'"_port_en_US);
1200         }
1201       } else if (p > nextLine_) {
1202         --p;
1203       } else {
1204         insertASpace_ = true;
1205       }
1206       return p;
1207     } else {
1208       return nullptr;
1209     }
1210   }
1211 }
1212 
1213 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
1214   // N.B. We accept '&' as a continuation indicator in fixed form, too,
1215   // but not in a character literal.
1216   if (*at_ == '&' && inCharLiteral_) {
1217     return false;
1218   }
1219   do {
1220     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
1221       BeginSourceLine(cont);
1222       column_ = 7;
1223       NextLine();
1224       return true;
1225     }
1226   } while (SkipCommentLine(false /* not after ampersand */));
1227   return false;
1228 }
1229 
1230 bool Prescanner::FreeFormContinuation() {
1231   const char *p{at_};
1232   bool ampersand{*p == '&'};
1233   if (ampersand) {
1234     p = SkipWhiteSpace(p + 1);
1235   }
1236   if (*p != '\n') {
1237     if (inCharLiteral_) {
1238       return false;
1239     } else if (*p == '!') { // & ! comment - ok
1240     } else if (ampersand && isPossibleMacroCall_ && (*p == ',' || *p == ')')) {
1241       return false; // allow & at end of a macro argument
1242     } else if (features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
1243       Say(GetProvenance(p), "missing ! before comment after &"_warn_en_US);
1244     }
1245   }
1246   do {
1247     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
1248       BeginSourceLine(cont);
1249       NextLine();
1250       return true;
1251     }
1252   } while (SkipCommentLine(ampersand));
1253   return false;
1254 }
1255 
1256 // Implicit line continuation allows a preprocessor macro call with
1257 // arguments to span multiple lines.
1258 bool Prescanner::IsImplicitContinuation() const {
1259   return !inPreprocessorDirective_ && !inCharLiteral_ && isPossibleMacroCall_ &&
1260       parenthesisNesting_ > 0 && !IsAtEnd() &&
1261       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
1262 }
1263 
1264 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1265   if (disableSourceContinuation_) {
1266     return false;
1267   } else if (*at_ == '\n' || *at_ == '&') {
1268     if (inFixedForm_) {
1269       return FixedFormContinuation(mightNeedFixedFormSpace);
1270     } else {
1271       return FreeFormContinuation();
1272     }
1273   } else if (*at_ == '\\' && at_ + 2 == nextLine_ &&
1274       backslashFreeFormContinuation_ && !inFixedForm_ && nextLine_ < limit_) {
1275     // cpp-like handling of \ at end of a free form source line
1276     BeginSourceLine(nextLine_);
1277     NextLine();
1278     return true;
1279   } else {
1280     return false;
1281   }
1282 }
1283 
1284 std::optional<Prescanner::LineClassification>
1285 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1286   const char *p{start};
1287   char col1{*p++};
1288   if (!IsFixedFormCommentChar(col1)) {
1289     return std::nullopt;
1290   }
1291   char sentinel[5], *sp{sentinel};
1292   int column{2};
1293   for (; column < 6; ++column, ++p) {
1294     if (*p == ' ' || *p == '\n' || *p == '\t') {
1295       break;
1296     }
1297     if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1298       // OpenMP conditional compilation line: leave the label alone
1299       break;
1300     }
1301     *sp++ = ToLowerCaseLetter(*p);
1302   }
1303   if (column == 6) {
1304     if (*p == ' ' || *p == '\t' || *p == '0') {
1305       ++p;
1306     } else {
1307       // This is a Continuation line, not an initial directive line.
1308       return std::nullopt;
1309     }
1310   }
1311   if (sp == sentinel) {
1312     return std::nullopt;
1313   }
1314   *sp = '\0';
1315   if (const char *ss{IsCompilerDirectiveSentinel(
1316           sentinel, static_cast<std::size_t>(sp - sentinel))}) {
1317     std::size_t payloadOffset = p - start;
1318     return {LineClassification{
1319         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1320   }
1321   return std::nullopt;
1322 }
1323 
1324 std::optional<Prescanner::LineClassification>
1325 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1326   char sentinel[8];
1327   const char *p{SkipWhiteSpace(start)};
1328   if (*p++ != '!') {
1329     return std::nullopt;
1330   }
1331   for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1332     if (*p == '\n') {
1333       break;
1334     }
1335     if (*p == ' ' || *p == '\t' || *p == '&') {
1336       if (j == 0) {
1337         break;
1338       }
1339       sentinel[j] = '\0';
1340       p = SkipWhiteSpace(p + 1);
1341       if (*p == '!') {
1342         break;
1343       }
1344       if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) {
1345         std::size_t offset = p - start;
1346         return {LineClassification{
1347             LineClassification::Kind::CompilerDirective, offset, sp}};
1348       }
1349       break;
1350     }
1351     sentinel[j] = ToLowerCaseLetter(*p);
1352   }
1353   return std::nullopt;
1354 }
1355 
1356 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1357   std::uint64_t packed{0};
1358   for (char ch : dir) {
1359     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1360   }
1361   compilerDirectiveBloomFilter_.set(packed % prime1);
1362   compilerDirectiveBloomFilter_.set(packed % prime2);
1363   compilerDirectiveSentinels_.insert(dir);
1364   return *this;
1365 }
1366 
1367 const char *Prescanner::IsCompilerDirectiveSentinel(
1368     const char *sentinel, std::size_t len) const {
1369   std::uint64_t packed{0};
1370   for (std::size_t j{0}; j < len; ++j) {
1371     packed = (packed << 8) | (sentinel[j] & 0xff);
1372   }
1373   if (len == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1374       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1375     return nullptr;
1376   }
1377   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, len))};
1378   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1379 }
1380 
1381 const char *Prescanner::IsCompilerDirectiveSentinel(CharBlock token) const {
1382   const char *p{token.begin()};
1383   const char *end{p + token.size()};
1384   while (p < end && (*p == ' ' || *p == '\n')) {
1385     ++p;
1386   }
1387   if (p < end && *p == '!') {
1388     ++p;
1389   }
1390   while (end > p && (end[-1] == ' ' || end[-1] == '\t')) {
1391     --end;
1392   }
1393   return end > p && IsCompilerDirectiveSentinel(p, end - p) ? p : nullptr;
1394 }
1395 
1396 constexpr bool IsDirective(const char *match, const char *dir) {
1397   for (; *match; ++match) {
1398     if (*match != ToLowerCaseLetter(*dir++)) {
1399       return false;
1400     }
1401   }
1402   return true;
1403 }
1404 
1405 Prescanner::LineClassification Prescanner::ClassifyLine(
1406     const char *start) const {
1407   if (inFixedForm_) {
1408     if (std::optional<LineClassification> lc{
1409             IsFixedFormCompilerDirectiveLine(start)}) {
1410       return std::move(*lc);
1411     }
1412     if (IsFixedFormCommentLine(start)) {
1413       return {LineClassification::Kind::Comment};
1414     }
1415   } else {
1416     if (std::optional<LineClassification> lc{
1417             IsFreeFormCompilerDirectiveLine(start)}) {
1418       return std::move(*lc);
1419     }
1420     if (const char *bang{IsFreeFormComment(start)}) {
1421       return {LineClassification::Kind::Comment,
1422           static_cast<std::size_t>(bang - start)};
1423     }
1424   }
1425   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1426     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1427   }
1428   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1429     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1430         IsDirective("else", dir) || IsDirective("endif", dir)) {
1431       return {LineClassification::Kind::ConditionalCompilationDirective};
1432     } else if (IsDirective("include", dir)) {
1433       return {LineClassification::Kind::IncludeDirective};
1434     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1435       return {LineClassification::Kind::DefinitionDirective};
1436     } else {
1437       return {LineClassification::Kind::PreprocessorDirective};
1438     }
1439   }
1440   return {LineClassification::Kind::Source};
1441 }
1442 
1443 Prescanner::LineClassification Prescanner::ClassifyLine(
1444     TokenSequence &tokens, Provenance newlineProvenance) const {
1445   // Append a newline temporarily.
1446   tokens.PutNextTokenChar('\n', newlineProvenance);
1447   tokens.CloseToken();
1448   const char *ppd{tokens.ToCharBlock().begin()};
1449   LineClassification classification{ClassifyLine(ppd)};
1450   tokens.pop_back(); // remove the newline
1451   return classification;
1452 }
1453 
1454 void Prescanner::SourceFormChange(std::string &&dir) {
1455   if (dir == "!dir$ free") {
1456     inFixedForm_ = false;
1457   } else if (dir == "!dir$ fixed") {
1458     inFixedForm_ = true;
1459   }
1460 }
1461 
1462 // Acquire and append compiler directive continuation lines to
1463 // the tokens that constitute a compiler directive, even when those
1464 // directive continuation lines are the result of macro expansion.
1465 // (Not used when neither the original compiler directive line nor
1466 // the directive continuation line result from preprocessing; regular
1467 // line continuation during tokenization handles that normal case.)
1468 bool Prescanner::CompilerDirectiveContinuation(
1469     TokenSequence &tokens, const char *origSentinel) {
1470   if (inFixedForm_ || tokens.empty() ||
1471       tokens.TokenAt(tokens.SizeInTokens() - 1) != "&") {
1472     return false;
1473   }
1474   LineClassification followingLine{ClassifyLine(nextLine_)};
1475   if (followingLine.kind == LineClassification::Kind::Comment) {
1476     nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
1477     NextLine();
1478     return true;
1479   }
1480   CHECK(origSentinel != nullptr);
1481   directiveSentinel_ = origSentinel; // so InCompilerDirective() is true
1482   const char *nextContinuation{
1483       followingLine.kind == LineClassification::Kind::CompilerDirective
1484           ? FreeFormContinuationLine(true)
1485           : nullptr};
1486   if (!nextContinuation &&
1487       followingLine.kind != LineClassification::Kind::Source) {
1488     return false;
1489   }
1490   auto origNextLine{nextLine_};
1491   BeginSourceLine(nextLine_);
1492   NextLine();
1493   if (nextContinuation) {
1494     // What follows is !DIR$ & xxx; skip over the & so that it
1495     // doesn't cause a spurious continuation.
1496     at_ = nextContinuation;
1497   } else {
1498     // What follows looks like a source line before macro expansion,
1499     // but might become a directive continuation afterwards.
1500     SkipSpaces();
1501   }
1502   TokenSequence followingTokens;
1503   while (NextToken(followingTokens)) {
1504   }
1505   if (auto followingPrepro{
1506           preprocessor_.MacroReplacement(followingTokens, *this)}) {
1507     followingTokens = std::move(*followingPrepro);
1508   }
1509   followingTokens.RemoveRedundantBlanks();
1510   std::size_t startAt{0};
1511   std::size_t following{followingTokens.SizeInTokens()};
1512   bool ok{false};
1513   if (nextContinuation) {
1514     ok = true;
1515   } else {
1516     startAt = 2;
1517     if (startAt < following && followingTokens.TokenAt(0) == "!") {
1518       CharBlock sentinel{followingTokens.TokenAt(1)};
1519       if (!sentinel.empty() &&
1520           std::memcmp(sentinel.begin(), origSentinel, sentinel.size()) == 0) {
1521         ok = true;
1522         while (
1523             startAt < following && followingTokens.TokenAt(startAt).IsBlank()) {
1524           ++startAt;
1525         }
1526         if (startAt < following && followingTokens.TokenAt(startAt) == "&") {
1527           ++startAt;
1528         }
1529       }
1530     }
1531   }
1532   if (ok) {
1533     tokens.pop_back(); // delete original '&'
1534     tokens.Put(followingTokens, startAt, following - startAt);
1535     tokens.RemoveRedundantBlanks();
1536   } else {
1537     nextLine_ = origNextLine;
1538   }
1539   return ok;
1540 }
1541 
1542 // Similar, but for source line continuation after macro replacement.
1543 bool Prescanner::SourceLineContinuation(TokenSequence &tokens) {
1544   if (!inFixedForm_ && !tokens.empty() &&
1545       tokens.TokenAt(tokens.SizeInTokens() - 1) == "&") {
1546     LineClassification followingLine{ClassifyLine(nextLine_)};
1547     if (followingLine.kind == LineClassification::Kind::Comment) {
1548       nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
1549       NextLine();
1550       return true;
1551     } else if (const char *nextContinuation{FreeFormContinuationLine(true)}) {
1552       BeginSourceLine(nextLine_);
1553       NextLine();
1554       TokenSequence followingTokens;
1555       at_ = nextContinuation;
1556       while (NextToken(followingTokens)) {
1557       }
1558       if (auto followingPrepro{
1559               preprocessor_.MacroReplacement(followingTokens, *this)}) {
1560         followingTokens = std::move(*followingPrepro);
1561       }
1562       followingTokens.RemoveRedundantBlanks();
1563       tokens.pop_back(); // delete original '&'
1564       tokens.Put(followingTokens);
1565       return true;
1566     }
1567   }
1568   return false;
1569 }
1570 } // namespace Fortran::parser
1571