xref: /llvm-project/flang/lib/Parser/prescan.cpp (revision dfbc80febb45544710b12cc05f268f40ef88cd6e)
1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "flang/Common/idioms.h"
11 #include "flang/Parser/characters.h"
12 #include "flang/Parser/message.h"
13 #include "flang/Parser/preprocessor.h"
14 #include "flang/Parser/source.h"
15 #include "flang/Parser/token-sequence.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Largest prescannerNesting_ value that Prescan() will accept; beyond it,
// Prescan() reports "too many nested INCLUDE/#include files" and returns.
static constexpr int maxPrescannerNesting{100};
27 
// Constructs a prescanner bound to the given message sink, cooked source
// buffer, and preprocessor.  allSources_ is obtained from the preprocessor,
// encoding_ from allSources_, and backslashFreeFormContinuation_ starts out
// as the result of preprocessor.AnyDefinitions().
Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
    Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
    : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
      allSources_{preprocessor_.allSources()}, features_{lfc},
      backslashFreeFormContinuation_{preprocessor.AnyDefinitions()},
      encoding_{allSources_.encoding()} {}
34 
// Constructs a nested prescanner from an existing one.  The settings listed
// below are copied from `that`; the new instance uses `prepro` as its
// preprocessor, records whether it is nested within an include directive,
// and has a prescannerNesting_ one greater than that of `that`.
Prescanner::Prescanner(const Prescanner &that, Preprocessor &prepro,
    bool isNestedInIncludeDirective)
    : messages_{that.messages_}, cooked_{that.cooked_}, preprocessor_{prepro},
      allSources_{that.allSources_}, features_{that.features_},
      preprocessingOnly_{that.preprocessingOnly_},
      expandIncludeLines_{that.expandIncludeLines_},
      isNestedInIncludeDirective_{isNestedInIncludeDirective},
      backslashFreeFormContinuation_{that.backslashFreeFormContinuation_},
      inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_},
      prescannerNesting_{that.prescannerNesting_ + 1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
50 
// Recognizes a blank at *p and yields its length in bytes: 1 for an ASCII
// space or a LATIN-1 non-breaking space (0xA0), 2 for a UTF-8 non-breaking
// space (0xC2 0xA0), and 0 when *p is not a blank at all.
static inline int IsSpace(const char *p) {
  const char lead{p[0]};
  if (lead == ' ' || lead == '\xa0') { // ASCII space or LATIN-1 NBSP
    return 1;
  }
  if (lead == '\xc2' && p[1] == '\xa0') { // UTF-8 NBSP
    return 2;
  }
  return 0;
}
63 
64 static inline int IsSpaceOrTab(const char *p) {
65   return *p == '\t' ? 1 : IsSpace(p);
66 }
67 
// True for the characters accepted here as fixed-form comment markers:
// '!', '*', and 'C' in either case.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
71 
72 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
73   char *p{dir.GetMutableCharData()};
74   char *limit{p + dir.SizeInChars()};
75   for (; p < limit; ++p) {
76     if (*p != ' ') {
77       CHECK(IsFixedFormCommentChar(*p));
78       *p = '!';
79       return;
80     }
81   }
82   DIE("compiler directive all blank");
83 }
84 
85 void Prescanner::Prescan(ProvenanceRange range) {
86   startProvenance_ = range.start();
87   start_ = allSources_.GetSource(range);
88   CHECK(start_);
89   limit_ = start_ + range.size();
90   nextLine_ = start_;
91   const bool beganInFixedForm{inFixedForm_};
92   if (prescannerNesting_ > maxPrescannerNesting) {
93     Say(GetProvenance(start_),
94         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
95     return;
96   }
97   while (!IsAtEnd()) {
98     Statement();
99   }
100   if (inFixedForm_ != beganInFixedForm) {
101     std::string dir{"!dir$ "};
102     if (beganInFixedForm) {
103       dir += "fixed";
104     } else {
105       dir += "free";
106     }
107     dir += '\n';
108     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
109     tokens.Emit(cooked_);
110   }
111 }
112 
// Prescans one statement beginning at nextLine_.  The first line is
// classified; comment lines, INCLUDE lines, and preprocessor directives are
// handled immediately and return.  Compiler directive lines and ordinary
// source lines are tokenized with NextToken(), subjected to macro
// replacement, and then emitted via CheckAndEmitLine().
void Prescanner::Statement() {
  TokenSequence tokens;
  const char *statementStart{nextLine_};
  LineClassification line{ClassifyLine(statementStart)};
  // Every case other than CompilerDirective and Source returns from within
  // the switch.
  switch (line.kind) {
  case LineClassification::Kind::Comment:
    nextLine_ += line.payloadOffset; // advance to '!' or newline
    NextLine();
    return;
  case LineClassification::Kind::IncludeLine:
    FortranInclude(nextLine_ + line.payloadOffset);
    NextLine();
    return;
  case LineClassification::Kind::ConditionalCompilationDirective:
  case LineClassification::Kind::IncludeDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    afterPreprocessingDirective_ = true;
    skipLeadingAmpersand_ |= !inFixedForm_;
    return;
  case LineClassification::Kind::PreprocessorDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    afterPreprocessingDirective_ = true;
    // Don't set skipLeadingAmpersand_
    return;
  case LineClassification::Kind::DefinitionDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    // Don't set afterPreprocessingDirective_ or skipLeadingAmpersand_
    return;
  case LineClassification::Kind::CompilerDirective: {
    directiveSentinel_ = line.sentinel;
    CHECK(InCompilerDirective());
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      CHECK(IsFixedFormCommentChar(*at_));
    } else {
      // Free form: skip leading blanks to reach the '!'.
      while (int n{IsSpaceOrTab(at_)}) {
        at_ += n, ++column_;
      }
      CHECK(*at_ == '!');
    }
    // When set, the number of characters to skip over for a conditional
    // compilation sentinel ("$" or "@cuf") before scanning the payload.
    std::optional<int> condOffset;
    if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
      // OpenMP conditional compilation line.
      condOffset = 2;
    } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'c' &&
        directiveSentinel_[2] == 'u' && directiveSentinel_[3] == 'f' &&
        directiveSentinel_[4] == '\0') {
      // CUDA conditional compilation line.
      condOffset = 5;
    }
    if (condOffset) {
      at_ += *condOffset, column_ += *condOffset;
      if (auto payload{IsIncludeLine(at_)}) {
        FortranInclude(at_ + *payload);
        return;
      } else if (inFixedForm_) {
        LabelField(tokens);
      } else {
        SkipSpaces();
      }
    } else {
      // Compiler directive.  Emit normalized sentinel, squash following spaces.
      EmitChar(tokens, '!');
      ++at_, ++column_;
      for (const char *sp{directiveSentinel_}; *sp != '\0';
           ++sp, ++at_, ++column_) {
        EmitChar(tokens, *sp);
      }
      if (IsSpaceOrTab(at_)) {
        EmitChar(tokens, ' ');
        while (int n{IsSpaceOrTab(at_)}) {
          at_ += n, ++column_;
        }
      }
      tokens.CloseToken();
    }
    break;
  }
  case LineClassification::Kind::Source: {
    BeginStatementAndAdvance();
    bool checkLabelField{false};
    if (inFixedForm_) {
      if (features_.IsEnabled(LanguageFeature::OldDebugLines) &&
          (*at_ == 'D' || *at_ == 'd')) {
        NextChar();
      }
      checkLabelField = true;
    } else {
      if (skipLeadingAmpersand_) {
        skipLeadingAmpersand_ = false;
        const char *p{SkipWhiteSpace(at_)};
        if (p < limit_ && *p == '&') {
          column_ += ++p - at_;
          at_ = p;
        }
      } else {
        SkipSpaces();
      }
    }
    // Check for a leading identifier that might be a keyword macro
    // that will expand to anything indicating a non-source line, like
    // a comment marker or directive sentinel.  If so, disable line
    // continuation, so that NextToken() won't consume anything from
    // following lines.
    if (IsLegalIdentifierStart(*at_)) {
      // TODO: Only bother with these cases when any keyword macro has
      // been defined with replacement text that could begin a comment
      // or directive sentinel.
      const char *p{at_};
      while (IsLegalInIdentifier(*++p)) {
      }
      CharBlock id{at_, static_cast<std::size_t>(p - at_)};
      if (preprocessor_.IsNameDefined(id) &&
          !preprocessor_.IsFunctionLikeDefinition(id)) {
        checkLabelField = false;
        TokenSequence toks;
        toks.Put(id, GetProvenance(at_));
        if (auto replaced{preprocessor_.MacroReplacement(toks, *this)}) {
          auto newLineClass{ClassifyLine(*replaced, GetCurrentProvenance())};
          if (newLineClass.kind ==
              LineClassification::Kind::CompilerDirective) {
            directiveSentinel_ = newLineClass.sentinel;
            disableSourceContinuation_ = false;
          } else {
            disableSourceContinuation_ = !replaced->empty() &&
                newLineClass.kind != LineClassification::Kind::Source;
          }
        }
      }
    }
    if (checkLabelField) {
      LabelField(tokens);
    }
  } break;
  }

  // Tokenize the remainder of the statement.
  while (NextToken(tokens)) {
  }
  if (continuationLines_ > 255) {
    if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
      Say(common::LanguageFeature::MiscSourceExtensions,
          GetProvenance(statementStart),
          "%d continuation lines is more than the Fortran standard allows"_port_en_US,
          continuationLines_);
    }
  }

  // Provenance to associate with the newline that ends the emitted line.
  Provenance newlineProvenance{GetCurrentProvenance()};
  if (std::optional<TokenSequence> preprocessed{
          preprocessor_.MacroReplacement(tokens, *this)}) {
    // Reprocess the preprocessed line.
    LineClassification ppl{ClassifyLine(*preprocessed, newlineProvenance)};
    switch (ppl.kind) {
    case LineClassification::Kind::Comment:
      break;
    case LineClassification::Kind::IncludeLine:
      FortranInclude(preprocessed->TokenAt(0).begin() + ppl.payloadOffset);
      break;
    case LineClassification::Kind::ConditionalCompilationDirective:
    case LineClassification::Kind::IncludeDirective:
    case LineClassification::Kind::DefinitionDirective:
    case LineClassification::Kind::PreprocessorDirective:
      // The result of macro replacement is not processed as a directive;
      // it is emitted as a line after an optional warning.
      if (features_.ShouldWarn(common::UsageWarning::Preprocessing)) {
        Say(common::UsageWarning::Preprocessing,
            preprocessed->GetProvenanceRange(),
            "Preprocessed line resembles a preprocessor directive"_warn_en_US);
      }
      CheckAndEmitLine(preprocessed->ToLowerCase(), newlineProvenance);
      break;
    case LineClassification::Kind::CompilerDirective:
      if (preprocessed->HasRedundantBlanks()) {
        preprocessed->RemoveRedundantBlanks();
      }
      while (CompilerDirectiveContinuation(*preprocessed, ppl.sentinel)) {
        newlineProvenance = GetCurrentProvenance();
      }
      NormalizeCompilerDirectiveCommentMarker(*preprocessed);
      preprocessed->ToLowerCase();
      SourceFormChange(preprocessed->ToString());
      CheckAndEmitLine(preprocessed->ToLowerCase().ClipComment(
                           *this, true /* skip first ! */),
          newlineProvenance);
      break;
    case LineClassification::Kind::Source:
      if (inFixedForm_) {
        if (!preprocessingOnly_ && preprocessed->HasBlanks()) {
          preprocessed->RemoveBlanks();
        }
      } else {
        while (SourceLineContinuation(*preprocessed)) {
          newlineProvenance = GetCurrentProvenance();
        }
        if (preprocessed->HasRedundantBlanks()) {
          preprocessed->RemoveRedundantBlanks();
        }
      }
      CheckAndEmitLine(
          preprocessed->ToLowerCase().ClipComment(*this), newlineProvenance);
      break;
    }
  } else { // no macro replacement
    if (line.kind == LineClassification::Kind::CompilerDirective) {
      while (CompilerDirectiveContinuation(tokens, line.sentinel)) {
        newlineProvenance = GetCurrentProvenance();
      }
      tokens.ToLowerCase();
      SourceFormChange(tokens.ToString());
    } else { // Kind::Source
      tokens.ToLowerCase();
      if (inFixedForm_) {
        EnforceStupidEndStatementRules(tokens);
      }
    }
    CheckAndEmitLine(tokens, newlineProvenance);
  }
  // Forget the sentinel now that this statement has been processed.
  directiveSentinel_ = nullptr;
}
330 
331 void Prescanner::CheckAndEmitLine(
332     TokenSequence &tokens, Provenance newlineProvenance) {
333   tokens.CheckBadFortranCharacters(
334       messages_, *this, disableSourceContinuation_);
335   // Parenthesis nesting check does not apply while any #include is
336   // active, nor on the lines before and after a top-level #include,
337   // nor before or after conditional source.
338   // Applications play shenanigans with line continuation before and
339   // after #include'd subprogram argument lists and conditional source.
340   if (!isNestedInIncludeDirective_ && !omitNewline_ &&
341       !afterPreprocessingDirective_ && tokens.BadlyNestedParentheses() &&
342       !preprocessor_.InConditional()) {
343     if (nextLine_ < limit_ && IsPreprocessorDirectiveLine(nextLine_)) {
344       // don't complain
345     } else {
346       tokens.CheckBadParentheses(messages_);
347     }
348   }
349   tokens.Emit(cooked_);
350   if (omitNewline_) {
351     omitNewline_ = false;
352   } else {
353     cooked_.Put('\n', newlineProvenance);
354     afterPreprocessingDirective_ = false;
355   }
356 }
357 
358 TokenSequence Prescanner::TokenizePreprocessorDirective() {
359   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
360   inPreprocessorDirective_ = true;
361   BeginStatementAndAdvance();
362   TokenSequence tokens;
363   while (NextToken(tokens)) {
364   }
365   inPreprocessorDirective_ = false;
366   return tokens;
367 }
368 
369 void Prescanner::NextLine() {
370   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
371   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
372   if (!v) {
373     nextLine_ = limit_;
374   } else {
375     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
376     nextLine_ = nl + 1;
377   }
378 }
379 
// Scans the fixed-form label field (columns 1 through 6) of the current
// line into `token`.  Blanks are skipped, and a '0' in column 6 is treated
// like a blank.  A tab ends the field early and sets the column to 7.
// The first column holding either a non-digit or any emitted character in
// column 6 is remembered as `badColumn`; it leads to a warning or, in
// #include/conditional contexts, to marking a possible fixed-form
// continuation.  An empty label field is emitted as a single space.
void Prescanner::LabelField(TokenSequence &token) {
  int outCol{1}; // 1 + number of characters emitted for the label field
  const char *start{at_};
  std::optional<int> badColumn;
  for (; *at_ != '\n' && column_ <= 6; ++at_) {
    if (*at_ == '\t') {
      ++at_;
      column_ = 7;
      break;
    }
    if (int n{IsSpace(at_)}; n == 0 &&
        !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
      EmitChar(token, *at_);
      ++outCol;
      if (!badColumn && (column_ == 6 || !IsDecimalDigit(*at_))) {
        badColumn = column_;
      }
    }
    ++column_;
  }
  // A label field whose text is the name of a defined macro is not
  // diagnosed here.
  if (badColumn && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
    if ((prescannerNesting_ > 0 && *badColumn == 6 &&
            cooked_.BufferedBytes() == firstCookedCharacterOffset_) ||
        afterPreprocessingDirective_) {
      // This is the first source line in #include'd text or conditional
      // code under #if, or the first source line after such.
      // If it turns out that the preprocessed text begins with a
      // fixed form continuation line, the newline at the end
      // of the latest source line beforehand will be deleted in
      // CookedSource::Marshal().
      cooked_.MarkPossibleFixedFormContinuation();
    } else if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
      Say(common::UsageWarning::Scanning, GetProvenance(start + *badColumn - 1),
          *badColumn == 6
              ? "Statement should not begin with a continuation line"_warn_en_US
              : "Character in fixed-form label field must be a digit"_warn_en_US);
    }
    token.clear();
    if (*badColumn < 6) {
      // Rewind to the start of the line so that it is rescanned from
      // column 1.
      at_ = start;
      column_ = 1;
      return;
    }
    outCol = 1;
  }
  if (outCol == 1) { // empty label field
    // Emit a space so that, if the line is rescanned after preprocessing,
    // a leading 'C' or 'D' won't be left-justified and then accidentally
    // misinterpreted as a comment card.
    EmitChar(token, ' ');
    ++outCol;
  }
  token.CloseToken();
  SkipToNextSignificantCharacter();
  if (IsDecimalDigit(*at_)) {
    if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
      Say(common::LanguageFeature::MiscSourceExtensions, GetCurrentProvenance(),
          "Label digit is not in fixed-form label field"_port_en_US);
    }
  }
}
441 
442 // 6.3.3.5: A program unit END statement, or any other statement whose
443 // initial line resembles an END statement, shall not be continued in
444 // fixed form source.
445 void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
446   CharBlock cBlock{tokens.ToCharBlock()};
447   const char *str{cBlock.begin()};
448   std::size_t n{cBlock.size()};
449   if (n < 3) {
450     return;
451   }
452   std::size_t j{0};
453   for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
454   }
455   if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
456     return;
457   }
458   // It starts with END, possibly after a label.
459   auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
460   auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
461   if (!start || !end) {
462     return;
463   }
464   if (&*start->sourceFile == &*end->sourceFile && start->line == end->line) {
465     return; // no continuation
466   }
467   j += 3;
468   static const char *const prefixes[]{"program", "subroutine", "function",
469       "blockdata", "module", "submodule", nullptr};
470   bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
471   std::size_t endOfPrefix{j - 1};
472   for (const char *const *p{prefixes}; *p; ++p) {
473     std::size_t pLen{std::strlen(*p)};
474     if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
475       isPrefix = true; // END thing as prefix
476       j += pLen;
477       endOfPrefix = j - 1;
478       for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
479       }
480       break;
481     }
482   }
483   if (isPrefix) {
484     auto range{tokens.GetTokenProvenanceRange(1)};
485     if (j == n) { // END or END thing [name]
486       Say(range,
487           "Program unit END statement may not be continued in fixed form source"_err_en_US);
488     } else {
489       auto endOfPrefixPos{
490           allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
491       auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
492       if (endOfPrefixPos && next &&
493           &*endOfPrefixPos->sourceFile == &*start->sourceFile &&
494           endOfPrefixPos->line == start->line &&
495           (&*next->sourceFile != &*start->sourceFile ||
496               next->line != start->line)) {
497         Say(range,
498             "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
499       }
500     }
501   }
502 }
503 
504 void Prescanner::SkipToEndOfLine() {
505   while (*at_ != '\n') {
506     ++at_, ++column_;
507   }
508 }
509 
510 bool Prescanner::MustSkipToEndOfLine() const {
511   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
512     return true; // skip over ignored columns in right margin (73:80)
513   } else if (*at_ == '!' && !inCharLiteral_) {
514     return !IsCompilerDirectiveSentinel(at_);
515   } else {
516     return false;
517   }
518 }
519 
520 void Prescanner::NextChar() {
521   CHECK(*at_ != '\n');
522   int n{IsSpace(at_)};
523   at_ += n ? n : 1;
524   ++column_;
525   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
526     // UTF-8 byte order mark - treat this file as UTF-8
527     at_ += 3;
528     encoding_ = Encoding::UTF_8;
529   }
530   SkipToNextSignificantCharacter();
531 }
532 
533 // Skip everything that should be ignored until the next significant
534 // character is reached; handles C-style comments in preprocessing
535 // directives, Fortran ! comments, stuff after the right margin in
536 // fixed form, and all forms of line continuation.
537 bool Prescanner::SkipToNextSignificantCharacter() {
538   auto anyContinuationLine{false};
539   if (inPreprocessorDirective_) {
540     SkipCComments();
541   } else {
542     bool mightNeedSpace{false};
543     if (MustSkipToEndOfLine()) {
544       SkipToEndOfLine();
545     } else {
546       mightNeedSpace = *at_ == '\n';
547     }
548     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
549       anyContinuationLine = true;
550       ++continuationLines_;
551       if (MustSkipToEndOfLine()) {
552         SkipToEndOfLine();
553       }
554     }
555     if (*at_ == '\t') {
556       tabInCurrentLine_ = true;
557     }
558   }
559   return anyContinuationLine;
560 }
561 
562 void Prescanner::SkipCComments() {
563   while (true) {
564     if (IsCComment(at_)) {
565       if (const char *after{SkipCComment(at_)}) {
566         column_ += after - at_;
567         // May have skipped over one or more newlines; relocate the start of
568         // the next line.
569         nextLine_ = at_ = after;
570         NextLine();
571       } else {
572         // Don't emit any messages about unclosed C-style comments, because
573         // the sequence /* can appear legally in a FORMAT statement.  There's
574         // no ambiguity, since the sequence */ cannot appear legally.
575         break;
576       }
577     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
578         at_[1] == '\n' && !IsAtEnd()) {
579       BeginSourceLineAndAdvance();
580     } else {
581       break;
582     }
583   }
584 }
585 
586 void Prescanner::SkipSpaces() {
587   while (IsSpaceOrTab(at_)) {
588     NextChar();
589   }
590   insertASpace_ = false;
591 }
592 
593 const char *Prescanner::SkipWhiteSpace(const char *p) {
594   while (int n{IsSpaceOrTab(p)}) {
595     p += n;
596   }
597   return p;
598 }
599 
600 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
601   while (true) {
602     if (int n{IsSpaceOrTab(p)}) {
603       p += n;
604     } else if (IsCComment(p)) {
605       if (const char *after{SkipCComment(p)}) {
606         p = after;
607       } else {
608         break;
609       }
610     } else {
611       break;
612     }
613   }
614   return p;
615 }
616 
617 const char *Prescanner::SkipCComment(const char *p) const {
618   char star{' '}, slash{' '};
619   p += 2;
620   while (star != '*' || slash != '/') {
621     if (p >= limit_) {
622       return nullptr; // signifies an unterminated comment
623     }
624     star = slash;
625     slash = *p++;
626   }
627   return p;
628 }
629 
// Scans the next preprocessing token at at_ and appends it to `tokens`.
// Returns false once at_ has reached a newline; otherwise one token is
// emitted and closed and the result is true.  Outside fixed-form source
// (or when only preprocessing), runs of white space are compressed to at
// most a single space token.  The scan also maintains the state used to
// recognize Hollerith constants (preventHollerith_,
// slashInCurrentStatement_) and parenthesis nesting.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource() && !preprocessingOnly_) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(LanguageFeature::ClassicCComments, GetCurrentProvenance(),
            "nonstandard usage: C-style comment"_port_en_US);
      }
      SkipCComments();
    }
    if (IsSpaceOrTab(at_)) {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n' && !omitNewline_) {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  if (insertASpace_) {
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // n accumulates the value of the digit string (no longer updated once
    // it reaches maxHollerith); digits counts the digits scanned.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // 0x... hexadecimal literal, recognized only in preprocessor directives
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    } else if (IsLetter(*at_) && !preventHollerith_ &&
        parenthesisNesting_ > 0) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    // parts counts the pieces of the identifier, which is split wherever a
    // continuation line was crossed and followed by an identifier start.
    int parts{1};
    const char *afterLast{nullptr};
    do {
      EmitChar(tokens, *at_);
      ++at_, ++column_;
      afterLast = at_;
      if (SkipToNextSignificantCharacter() && IsLegalIdentifierStart(*at_)) {
        tokens.CloseToken();
        ++parts;
      }
    } while (IsLegalInIdentifier(*at_));
    if (parts >= 3) {
      // Subtlety: When an identifier is split across three or more continuation
      // lines (or two continuation lines, immediately preceded or followed
      // by '&' free form continuation line markers, its parts are kept as
      // distinct pp-tokens so that macro replacement operates on them
      // independently.  This trick accommodates the historic practice of
      // using line continuation for token pasting after replacement.
    } else if (parts == 2) {
      if (afterLast && afterLast < limit_) {
        afterLast = SkipWhiteSpace(afterLast);
      }
      if ((start > start_ && start[-1] == '&') ||
          (afterLast && afterLast < limit_ &&
              (*afterLast == '&' || *afterLast == '\n'))) {
        // call &                call foo&        call foo&
        //   &MACRO&      OR       &MACRO&   OR     &MACRO
        //   &foo(...)             &(...)
      } else {
        // Otherwise the two parts are joined back into a single token.
        tokens.ReopenLastToken();
      }
    }
    if (InFixedFormSource()) {
      SkipSpaces();
    }
    if ((*at_ == '\'' || *at_ == '"') &&
        tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
      QuotedCharacterLiteral(tokens, start);
      preventHollerith_ = false;
    } else {
      preventHollerith_ = true; // DO 10 H = ...
    }
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentStatement_;
    }
  } else {
    char ch{*at_};
    if (ch == '(') {
      if (parenthesisNesting_++ == 0) {
        // At an outermost '(', note whether the preceding token names a
        // function-like macro.
        isPossibleMacroCall_ = tokens.SizeInTokens() > 0 &&
            preprocessor_.IsFunctionLikeDefinition(
                tokens.TokenAt(tokens.SizeInTokens() - 1));
      }
    } else if (ch == ')' && parenthesisNesting_ > 0) {
      --parenthesisNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentStatement_ = true;
    } else if (ch == ';' && InFixedFormSource()) {
      SkipSpaces();
      if (IsDecimalDigit(*at_)) {
        if (features_.ShouldWarn(
                common::LanguageFeature::MiscSourceExtensions)) {
          Say(common::LanguageFeature::MiscSourceExtensions,
              GetProvenanceRange(at_, at_ + 1),
              "Label should be in the label field"_port_en_US);
        }
      }
    }
  }
  tokens.CloseToken();
  return true;
}
817 
818 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
819   char ed{ToLowerCaseLetter(*at_)};
820   if (ed != 'e' && ed != 'd') {
821     return false;
822   }
823   // Do some look-ahead to ensure that this 'e'/'d' is an exponent,
824   // not the start of an identifier that could be a macro.
825   const char *p{at_};
826   if (int n{IsSpace(++p)}) {
827     p += n;
828   }
829   if (*p == '+' || *p == '-') {
830     if (int n{IsSpace(++p)}) {
831       p += n;
832     }
833   }
834   if (IsDecimalDigit(*p)) { // it's an exponent
835     EmitCharAndAdvance(tokens, ed);
836     if (*at_ == '+' || *at_ == '-') {
837       EmitCharAndAdvance(tokens, *at_);
838     }
839     while (IsDecimalDigit(*at_)) {
840       EmitCharAndAdvance(tokens, *at_);
841     }
842     if (*at_ == '_') {
843       while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
844       }
845     }
846     return true;
847   } else {
848     return false;
849   }
850 }
851 
// Scans a quoted character literal whose opening quote mark is at at_ and
// emits its characters to "tokens".  "start" is used only as the beginning
// of the provenance range of the "Incomplete character literal" message.
// inCharLiteral_ and continuationInCharLiteral_ are set while scanning and
// are both cleared before returning.
void Prescanner::QuotedCharacterLiteral(
    TokenSequence &tokens, const char *start) {
  char quote{*at_};
  // One past the last character consumed so far; used for diagnostics only.
  const char *end{at_ + 1};
  inCharLiteral_ = true;
  continuationInCharLiteral_ = true;
  const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
  const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
  bool isEscaped{false};
  bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
  while (true) {
    if (*at_ == '\\') {
      if (escapesEnabled) {
        // Toggle rather than set, so that the second backslash of an
        // escaped backslash does not itself escape what follows.
        isEscaped = !isEscaped;
      } else {
        // The parser always processes escape sequences, so don't confuse it
        // when escapes are disabled.
        insert('\\');
      }
    } else {
      isEscaped = false;
    }
    EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
        Encoding::LATIN_1);
    // Emit any padding blanks needed at the end of a short fixed form line.
    while (PadOutCharacterLiteral(tokens)) {
    }
    if (*at_ == '\n') {
      // The line ended before a closing quote mark was seen.
      if (!inPreprocessorDirective_) {
        Say(GetProvenanceRange(start, end),
            "Incomplete character literal"_err_en_US);
      }
      break;
    }
    // Here's a weird edge case.  When there are two or more following
    // continuation lines at this point, and the entire significant part of
    // the next continuation line is the name of a keyword macro, replace
    // it in the character literal with its definition.  Example:
    //   #define FOO foo
    //   subroutine subr() bind(c, name="my_&
    //     &FOO&
    //     &_bar") ...
    // produces a binding name of "my_foo_bar".
    while (at_[1] == '&' && nextLine_ < limit_ && !InFixedFormSource()) {
      const char *idStart{nextLine_};
      // Skip an optional leading '&' on the continuation line.
      if (const char *amper{SkipWhiteSpace(nextLine_)}; *amper == '&') {
        idStart = amper + 1;
      }
      if (IsLegalIdentifierStart(*idStart)) {
        std::size_t idLen{1};
        for (; IsLegalInIdentifier(idStart[idLen]); ++idLen) {
        }
        // The identifier must be immediately followed by another '&'.
        if (idStart[idLen] == '&') {
          CharBlock id{idStart, idLen};
          if (preprocessor_.IsNameDefined(id)) {
            TokenSequence ppTokens;
            ppTokens.Put(id, GetProvenance(idStart));
            if (auto replaced{
                    preprocessor_.MacroReplacement(ppTokens, *this)}) {
              tokens.Put(*replaced);
              // Leave at_ on the last character of the macro name, so that
              // at_[1] is the trailing '&' when the loop condition is
              // evaluated again.
              at_ = &idStart[idLen - 1];
              NextLine();
              continue; // try again on the next line
            }
          }
        }
      }
      break;
    }
    end = at_ + 1;
    NextChar();
    if (*at_ == quote && !isEscaped) {
      // A doubled unescaped quote mark becomes a single instance of that
      // quote character in the literal (later).  There can be spaces between
      // the quotes in fixed form source.
      EmitChar(tokens, quote);
      inCharLiteral_ = false; // for cases like print *, '...'!comment
      NextChar();
      if (InFixedFormSource()) {
        SkipSpaces();
      }
      if (*at_ != quote) {
        // Not a doubled quote mark: the literal is complete.
        break;
      }
      inCharLiteral_ = true;
    }
  }
  continuationInCharLiteral_ = false;
  inCharLiteral_ = false;
}
941 
// Scans a Hollerith literal.  On entry at_ must be at the 'H' or 'h'
// (this is CHECKed), and "count" is the number of characters of the
// literal that follow it.  An upper case 'H' and then the literal's
// characters are emitted to "tokens".  "start" is the beginning of the
// provenance range used for messages.
void Prescanner::Hollerith(
    TokenSequence &tokens, int count, const char *start) {
  inCharLiteral_ = true;
  CHECK(*at_ == 'h' || *at_ == 'H');
  EmitChar(tokens, 'H');
  while (count-- > 0) {
    if (PadOutCharacterLiteral(tokens)) {
      // A padding blank was emitted; this iteration counts it as one
      // character of the literal.
    } else if (*at_ == '\n') {
      // The line ended before "count" characters were found.
      if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
        Say(common::UsageWarning::Scanning, GetProvenanceRange(start, at_),
            "Possible truncated Hollerith literal"_warn_en_US);
      }
      break;
    } else {
      NextChar();
      // Each multi-byte character encoding counts as a single character.
      // No escape sequences are recognized.
      // Hollerith is always emitted to the cooked character
      // stream in UTF-8.
      DecodedCharacter decoded{DecodeCharacter(
          encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
      if (decoded.bytes > 0) {
        EncodedCharacter utf8{
            EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
        for (int j{0}; j < utf8.bytes; ++j) {
          EmitChar(tokens, utf8.buffer[j]);
        }
        // Leave at_ on the last byte of the character just decoded.
        at_ += decoded.bytes - 1;
      } else {
        Say(GetProvenanceRange(start, at_),
            "Bad character in Hollerith literal"_err_en_US);
        break;
      }
    }
  }
  // Step past the last character consumed, unless at the end of the line.
  if (*at_ != '\n') {
    NextChar();
  }
  inCharLiteral_ = false;
}
982 
// In fixed form, source card images must be processed as if they were at
// least 72 columns wide, at least in character literal contexts.
// Each call that returns true has emitted exactly one padding blank to
// "tokens" and incremented column_; callers invoke this repeatedly until
// it returns false.  Nothing is done unless the source is in fixed form,
// the current line contains no tab, and the character after at_ is a
// newline.
bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
  while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
    if (column_ < fixedFormColumnLimit_) {
      tokens.PutNextTokenChar(' ', spaceProvenance_);
      ++column_;
      return true;
    }
    // The line has been padded out to the column limit; the literal can
    // only go on if a continuation line follows.
    if (!FixedFormContinuation(false /*no need to insert space*/) ||
        tabInCurrentLine_) {
      return false;
    }
    CHECK(column_ == 7);
    --at_; // point to column 6 of continuation line
    column_ = 6;
    // Loop again: the continuation line might itself end right away and
    // need padding.
  }
  return false;
}
1002 
1003 static bool IsAtProcess(const char *p) {
1004   static const char pAtProc[]{"process"};
1005   for (std::size_t i{0}; i < sizeof pAtProc - 1; ++i) {
1006     if (ToLowerCaseLetter(*++p) != pAtProc[i])
1007       return false;
1008   }
1009   return true;
1010 }
1011 
1012 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
1013   const char *p{start};
1014 
1015   // The @process directive must start in column 1.
1016   if (*p == '@' && IsAtProcess(p)) {
1017     return true;
1018   }
1019 
1020   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
1021       ((*p == 'D' || *p == 'd') &&
1022           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
1023     return true;
1024   }
1025   bool anyTabs{false};
1026   while (true) {
1027     if (int n{IsSpace(p)}) {
1028       p += n;
1029     } else if (*p == '\t') {
1030       anyTabs = true;
1031       ++p;
1032     } else if (*p == '0' && !anyTabs && p == start + 5) {
1033       ++p; // 0 in column 6 must treated as a space
1034     } else {
1035       break;
1036     }
1037   }
1038   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
1039     return true;
1040   }
1041   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
1042     return true;
1043   }
1044   return *p == '\n';
1045 }
1046 
1047 const char *Prescanner::IsFreeFormComment(const char *p) const {
1048   p = SkipWhiteSpaceAndCComments(p);
1049   if (*p == '!' || *p == '\n') {
1050     return p;
1051   } else if (*p == '@') {
1052     return IsAtProcess(p) ? p : nullptr;
1053   } else {
1054     return nullptr;
1055   }
1056 }
1057 
1058 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
1059   if (!expandIncludeLines_) {
1060     return std::nullopt;
1061   }
1062   const char *p{SkipWhiteSpace(start)};
1063   if (*p == '0' && inFixedForm_ && p == start + 5) {
1064     // Accept "     0INCLUDE" in fixed form.
1065     p = SkipWhiteSpace(p + 1);
1066   }
1067   for (const char *q{"include"}; *q; ++q) {
1068     if (ToLowerCaseLetter(*p) != *q) {
1069       return std::nullopt;
1070     }
1071     p = SkipWhiteSpace(p + 1);
1072   }
1073   if (IsDecimalDigit(*p)) { // accept & ignore a numeric kind prefix
1074     for (p = SkipWhiteSpace(p + 1); IsDecimalDigit(*p);
1075          p = SkipWhiteSpace(p + 1)) {
1076     }
1077     if (*p != '_') {
1078       return std::nullopt;
1079     }
1080     p = SkipWhiteSpace(p + 1);
1081   }
1082   if (*p == '"' || *p == '\'') {
1083     return {p - start};
1084   }
1085   return std::nullopt;
1086 }
1087 
// Processes a Fortran INCLUDE line.  "firstQuote" points at, or before,
// the quote mark that opens the path name.  The path is extracted, the
// file is opened, and, if it is not empty, its contents are prescanned by
// a nested Prescanner.  An error is emitted for an unterminated path or a
// file that can't be opened; text after the path that is not a '!' comment
// draws an optional warning.
void Prescanner::FortranInclude(const char *firstQuote) {
  const char *p{firstQuote};
  while (*p != '"' && *p != '\'') {
    ++p;
  }
  char quote{*p};
  std::string path;
  // Collect the path up to the closing quote mark.  A doubled quote mark
  // within the path contributes a single quote character.
  for (++p; *p != '\n'; ++p) {
    if (*p == quote) {
      if (p[1] != quote) {
        break;
      }
      ++p;
    }
    path += *p;
  }
  if (*p != quote) {
    // The end of the line was reached without a closing quote mark.
    Say(GetProvenanceRange(firstQuote, p),
        "malformed path name string"_err_en_US);
    return;
  }
  p = SkipWhiteSpace(p + 1);
  if (*p != '\n' && *p != '!') {
    // Something other than a comment follows the path name.
    const char *garbage{p};
    for (; *p != '\n' && *p != '!'; ++p) {
    }
    if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
      Say(common::UsageWarning::Scanning, GetProvenanceRange(garbage, p),
          "excess characters after path name"_warn_en_US);
    }
  }
  std::string buf;
  llvm::raw_string_ostream error{buf}; // receives any message from Open()
  Provenance provenance{GetProvenance(nextLine_)};
  // When the current source file is known, pass its directory to Open()
  // as the path to prepend.
  std::optional<std::string> prependPath;
  if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
    prependPath = DirectoryName(currentFile->path());
  }
  const SourceFile *included{
      allSources_.Open(path, error, std::move(prependPath))};
  if (!included) {
    Say(provenance, "INCLUDE: %s"_err_en_US, buf);
  } else if (included->bytes() > 0) {
    ProvenanceRange includeLineRange{
        provenance, static_cast<std::size_t>(p - nextLine_)};
    ProvenanceRange fileRange{
        allSources_.AddIncludedFile(*included, includeLineRange)};
    // The included file is prescanned with a separate Preprocessor
    // instance rather than with preprocessor_; only the standard macros
    // and _CUDA are carried over, and only when the current preprocessor
    // defines them.
    Preprocessor cleanPrepro{allSources_};
    if (preprocessor_.IsNameDefined("__FILE__"s)) {
      cleanPrepro.DefineStandardMacros(); // __FILE__, __LINE__, &c.
    }
    if (preprocessor_.IsNameDefined("_CUDA"s)) {
      cleanPrepro.Define("_CUDA"s, "1");
    }
    Prescanner{*this, cleanPrepro, /*isNestedInIncludeDirective=*/false}
        .set_encoding(included->encoding())
        .Prescan(fileRange);
  }
}
1147 
1148 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
1149   const char *p{start};
1150   while (int n{IsSpace(p)}) {
1151     p += n;
1152   }
1153   if (*p == '#') {
1154     if (inFixedForm_ && p == start + 5) {
1155       return nullptr;
1156     }
1157   } else {
1158     p = SkipWhiteSpace(p);
1159     if (*p != '#') {
1160       return nullptr;
1161     }
1162   }
1163   return SkipWhiteSpace(p + 1);
1164 }
1165 
1166 bool Prescanner::IsNextLinePreprocessorDirective() const {
1167   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
1168 }
1169 
// Examines the line at nextLine_ during a search for a continuation line.
// Returns true when that line was a comment that has been skipped, or a
// preprocessing directive that has been processed, so that the callers'
// loops look at the line after it; returns false otherwise.
// "afterAmpersand" is the argument passed by the caller: the free form
// caller passes whether the current line has a '&' at at_, and the fixed
// form caller always passes false.
bool Prescanner::SkipCommentLine(bool afterAmpersand) {
  if (IsAtEnd()) {
    if (afterAmpersand && prescannerNesting_ > 0) {
      // A continuation marker at the end of the last line in an
      // include file inhibits the newline for that line.
      SkipToEndOfLine();
      omitNewline_ = true;
    }
  } else if (inPreprocessorDirective_) {
    // Nothing is skipped while scanning a preprocessor directive.
  } else {
    auto lineClass{ClassifyLine(nextLine_)};
    if (lineClass.kind == LineClassification::Kind::Comment) {
      NextLine();
      return true;
    } else if (lineClass.kind ==
            LineClassification::Kind::ConditionalCompilationDirective ||
        lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
      // Allow conditional compilation directives (e.g., #ifdef) to affect
      // continuation lines.
      // Allow other preprocessor directives, too, except #include
      // (when it does not follow '&'), #define, and #undef (because
      // they cannot be allowed to affect preceding text on a
      // continued line).
      preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
      return true;
    } else if (afterAmpersand &&
        (lineClass.kind == LineClassification::Kind::DefinitionDirective ||
            lineClass.kind == LineClassification::Kind::IncludeDirective ||
            lineClass.kind == LineClassification::Kind::IncludeLine)) {
      // The directive or INCLUDE line is not consumed here.  Finish the
      // current line without a newline, and note that a leading '&' is to
      // be skipped later.
      SkipToEndOfLine();
      omitNewline_ = true;
      skipLeadingAmpersand_ = true;
    }
  }
  return false;
}
1206 
// If the line at nextLine_ is a fixed form continuation line, returns a
// pointer to the position in that line at which its continued text
// begins; otherwise returns nullptr.  Clears tabInCurrentLine_ (unless at
// the end of the input) and sets it again for a tab-form continuation
// line.  May set insertASpace_ when "mightNeedSpace" is true.
const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
  if (IsAtEnd()) {
    return nullptr;
  }
  tabInCurrentLine_ = false;
  char col1{*nextLine_};
  if (IsFixedFormCommentChar(col1)) {
    // A comment character in column 1: this can be a continuation only of
    // a compiler directive or of an OpenMP conditional compilation line.
    int j{1};
    if (InCompilerDirective()) {
      // Must be a continued compiler directive.
      // The characters after column 1 must match the current sentinel,
      // ignoring case.
      for (; j < 5; ++j) {
        char ch{directiveSentinel_[j - 1]};
        if (ch == '\0') {
          break;
        }
        if (ch != ToLowerCaseLetter(nextLine_[j])) {
          return nullptr;
        }
      }
    } else if (features_.IsEnabled(LanguageFeature::OpenMP)) {
      // Fixed Source Form Conditional Compilation Sentinels.
      if (nextLine_[1] != '$') {
        return nullptr;
      }
      j++;
    } else {
      return nullptr;
    }
    // Whatever remains of columns 2-5 after the sentinel must be blank.
    for (; j < 5; ++j) {
      if (nextLine_[j] != ' ') {
        return nullptr;
      }
    }
    // Column 6 must hold a continuation marker: any character other than
    // a newline, '0', space, or tab.
    const char *col6{nextLine_ + 5};
    if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
      if (mightNeedSpace && !IsSpace(nextLine_ + 6)) {
        insertASpace_ = true;
      }
      return nextLine_ + 6;
    }
    return nullptr;
  } else {
    // Normal case: not in a compiler directive.
    if (col1 == '&' &&
        features_.IsEnabled(
            LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
      // Extension: '&' as continuation marker
      if (features_.ShouldWarn(
              LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
        Say(LanguageFeature::FixedFormContinuationWithColumn1Ampersand,
            GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
      }
      return nextLine_ + 1;
    }
    // A tab followed by a nonzero digit also marks a continuation line.
    if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
      tabInCurrentLine_ = true;
      return nextLine_ + 2; // VAX extension
    }
    // Standard form: columns 1-5 are blank (column 1 may instead hold 'D'
    // when old-style debug lines are enabled) and column 6 holds a
    // continuation marker.
    if ((col1 == ' ' ||
            ((col1 == 'D' || col1 == 'd') &&
                features_.IsEnabled(LanguageFeature::OldDebugLines))) &&
        nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' &&
        nextLine_[4] == ' ') {
      const char *col6{nextLine_ + 5};
      if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
        if ((*col6 == 'i' || *col6 == 'I') && IsIncludeLine(nextLine_)) {
          // It's an INCLUDE line, not a continuation
        } else {
          return nextLine_ + 6;
        }
      }
    }
    // An unfinished macro call can continue onto an ordinary source line.
    if (IsImplicitContinuation()) {
      return nextLine_;
    }
  }
  return nullptr; // not a continuation line
}
1285 
// If the line at nextLine_ can continue the current free form line,
// returns a pointer to the position in that line at which scanning should
// resume; otherwise returns nullptr.  "ampersand" is true when the current
// line has a '&' continuation marker.  May set insertASpace_.
const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
  const char *p{nextLine_};
  if (p >= limit_) {
    return nullptr;
  }
  p = SkipWhiteSpace(p);
  if (InCompilerDirective()) {
    // A directive continuation line must begin with '!' and the same
    // sentinel as the directive being continued, ignoring case.
    if (*p++ != '!') {
      return nullptr;
    }
    for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
      if (*s != ToLowerCaseLetter(*p)) {
        return nullptr;
      }
    }
    p = SkipWhiteSpace(p);
    if (*p == '&') {
      // Resume just after the leading '&' of the continuation line.
      if (!ampersand) {
        insertASpace_ = true;
      }
      return p + 1;
    } else if (ampersand) {
      return p;
    } else {
      return nullptr;
    }
  } else {
    if (*p == '&') {
      // Resume just after the leading '&' of the continuation line.
      return p + 1;
    } else if (*p == '!' || *p == '\n' || *p == '#') {
      // Comment lines, empty lines, and '#' lines are not continuations.
      return nullptr;
    } else if (ampersand || IsImplicitContinuation()) {
      if (continuationInCharLiteral_) {
        // 'a'&            -> 'a''b' == "a'b"
        //   'b'
        if (features_.ShouldWarn(
                common::LanguageFeature::MiscSourceExtensions)) {
          Say(common::LanguageFeature::MiscSourceExtensions,
              GetProvenanceRange(p, p + 1),
              "Character literal continuation line should have been preceded by '&'"_port_en_US);
        }
      } else if (p > nextLine_) {
        // Back up by one, so that the character just before the
        // continued text (skipped above as white space) is scanned too.
        --p;
      } else {
        // The continued text starts in column 1; request a space instead.
        insertASpace_ = true;
      }
      return p;
    } else {
      return nullptr;
    }
  }
}
1338 
1339 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
1340   // N.B. We accept '&' as a continuation indicator in fixed form, too,
1341   // but not in a character literal.
1342   if (*at_ == '&' && inCharLiteral_) {
1343     return false;
1344   }
1345   do {
1346     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
1347       BeginSourceLine(cont);
1348       column_ = 7;
1349       NextLine();
1350       return true;
1351     }
1352   } while (SkipCommentLine(false /* not after ampersand */));
1353   return false;
1354 }
1355 
1356 bool Prescanner::FreeFormContinuation() {
1357   const char *p{at_};
1358   bool ampersand{*p == '&'};
1359   if (ampersand) {
1360     p = SkipWhiteSpace(p + 1);
1361   }
1362   if (*p != '\n') {
1363     if (inCharLiteral_) {
1364       return false;
1365     } else if (*p == '!') { // & ! comment - ok
1366     } else if (ampersand && isPossibleMacroCall_ && (*p == ',' || *p == ')')) {
1367       return false; // allow & at end of a macro argument
1368     } else if (features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
1369       Say(LanguageFeature::CruftAfterAmpersand, GetProvenance(p),
1370           "missing ! before comment after &"_warn_en_US);
1371     }
1372   }
1373   do {
1374     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
1375       BeginSourceLine(cont);
1376       NextLine();
1377       return true;
1378     }
1379   } while (SkipCommentLine(ampersand));
1380   return false;
1381 }
1382 
1383 // Implicit line continuation allows a preprocessor macro call with
1384 // arguments to span multiple lines.
1385 bool Prescanner::IsImplicitContinuation() const {
1386   return !inPreprocessorDirective_ && !inCharLiteral_ && isPossibleMacroCall_ &&
1387       parenthesisNesting_ > 0 && !IsAtEnd() &&
1388       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
1389 }
1390 
1391 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1392   if (disableSourceContinuation_) {
1393     return false;
1394   } else if (*at_ == '\n' || *at_ == '&') {
1395     if (inFixedForm_) {
1396       return FixedFormContinuation(mightNeedFixedFormSpace);
1397     } else {
1398       return FreeFormContinuation();
1399     }
1400   } else if (*at_ == '\\' && at_ + 2 == nextLine_ &&
1401       backslashFreeFormContinuation_ && !inFixedForm_ && nextLine_ < limit_) {
1402     // cpp-like handling of \ at end of a free form source line
1403     BeginSourceLine(nextLine_);
1404     NextLine();
1405     return true;
1406   } else {
1407     return false;
1408   }
1409 }
1410 
// Determines whether the fixed form line at "start" is the initial line of
// a compiler directive: a comment character in column 1 followed by a
// registered sentinel within columns 2-5.  On success, returns a
// CompilerDirective classification that holds the offset of the text after
// the sentinel and a pointer to the registered sentinel string; otherwise
// returns std::nullopt.
std::optional<Prescanner::LineClassification>
Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
  const char *p{start};
  char col1{*p++};
  if (!IsFixedFormCommentChar(col1)) {
    return std::nullopt;
  }
  // Collect up to four sentinel characters from columns 2-5 in lower
  // case; the fifth element of the buffer is for the terminating NUL.
  char sentinel[5], *sp{sentinel};
  int column{2};
  for (; column < 6; ++column, ++p) {
    if (*p == '\n' || IsSpaceOrTab(p)) {
      break;
    }
    if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
      // OpenMP conditional compilation line: leave the label alone
      break;
    }
    *sp++ = ToLowerCaseLetter(*p);
  }
  if (column == 6) {
    // Columns 2-5 were all part of the sentinel, so p is now at column 6,
    // which must hold '0' or white space on an initial directive line.
    if (*p == '0') {
      ++p;
    } else if (int n{IsSpaceOrTab(p)}) {
      p += n;
    } else {
      // This is a continuation line, not an initial directive line.
      return std::nullopt;
    }
  }
  if (sp == sentinel) {
    // No sentinel characters at all.
    return std::nullopt;
  }
  *sp = '\0';
  if (const char *ss{IsCompilerDirectiveSentinel(
          sentinel, static_cast<std::size_t>(sp - sentinel))}) {
    std::size_t payloadOffset = p - start;
    return {LineClassification{
        LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
  }
  return std::nullopt;
}
1452 
1453 std::optional<Prescanner::LineClassification>
1454 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1455   if (const char *p{SkipWhiteSpace(start)}; p && *p++ == '!') {
1456     if (auto maybePair{IsCompilerDirectiveSentinel(p)}) {
1457       auto offset{static_cast<std::size_t>(maybePair->second - start)};
1458       return {LineClassification{LineClassification::Kind::CompilerDirective,
1459           offset, maybePair->first}};
1460     }
1461   }
1462   return std::nullopt;
1463 }
1464 
1465 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1466   std::uint64_t packed{0};
1467   for (char ch : dir) {
1468     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1469   }
1470   compilerDirectiveBloomFilter_.set(packed % prime1);
1471   compilerDirectiveBloomFilter_.set(packed % prime2);
1472   compilerDirectiveSentinels_.insert(dir);
1473   return *this;
1474 }
1475 
1476 const char *Prescanner::IsCompilerDirectiveSentinel(
1477     const char *sentinel, std::size_t len) const {
1478   std::uint64_t packed{0};
1479   for (std::size_t j{0}; j < len; ++j) {
1480     packed = (packed << 8) | (sentinel[j] & 0xff);
1481   }
1482   if (len == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1483       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1484     return nullptr;
1485   }
1486   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, len))};
1487   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1488 }
1489 
1490 const char *Prescanner::IsCompilerDirectiveSentinel(CharBlock token) const {
1491   const char *p{token.begin()};
1492   const char *end{p + token.size()};
1493   while (p < end && (*p == ' ' || *p == '\n')) {
1494     ++p;
1495   }
1496   if (p < end && *p == '!') {
1497     ++p;
1498   }
1499   while (end > p && (end[-1] == ' ' || end[-1] == '\t')) {
1500     --end;
1501   }
1502   return end > p && IsCompilerDirectiveSentinel(p, end - p) ? p : nullptr;
1503 }
1504 
// Scans a candidate compiler directive sentinel beginning at p; the free
// form caller in this file passes a pointer to the character after the
// '!'.  The candidate is ended by a space, a tab, or a '&', and holds at
// most seven characters, which are converted to lower case.  When the
// candidate is a registered sentinel and the first character after the
// white space that follows it is not '!', returns a pair comprising the
// registered sentinel string and a pointer to that character.  Returns
// std::nullopt in every other case, including when the end of the line
// or the length limit is reached before any delimiter.
std::optional<std::pair<const char *, const char *>>
Prescanner::IsCompilerDirectiveSentinel(const char *p) const {
  char sentinel[8];
  for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) {
    if (int n{*p == '&' ? 1 : IsSpaceOrTab(p)}) {
      // A delimiter: it ends the candidate, which must not be empty.
      if (j > 0) {
        sentinel[j] = '\0';
        p = SkipWhiteSpace(p + n);
        if (*p != '!') {
          if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) {
            return std::make_pair(sp, p);
          }
        }
      }
      break;
    } else {
      sentinel[j] = ToLowerCaseLetter(*p);
    }
  }
  return std::nullopt;
}
1526 
1527 constexpr bool IsDirective(const char *match, const char *dir) {
1528   for (; *match; ++match) {
1529     if (*match != ToLowerCaseLetter(*dir++)) {
1530       return false;
1531     }
1532   }
1533   return true;
1534 }
1535 
1536 Prescanner::LineClassification Prescanner::ClassifyLine(
1537     const char *start) const {
1538   if (inFixedForm_) {
1539     if (std::optional<LineClassification> lc{
1540             IsFixedFormCompilerDirectiveLine(start)}) {
1541       return std::move(*lc);
1542     }
1543     if (IsFixedFormCommentLine(start)) {
1544       return {LineClassification::Kind::Comment};
1545     }
1546   } else {
1547     if (std::optional<LineClassification> lc{
1548             IsFreeFormCompilerDirectiveLine(start)}) {
1549       return std::move(*lc);
1550     }
1551     if (const char *bang{IsFreeFormComment(start)}) {
1552       return {LineClassification::Kind::Comment,
1553           static_cast<std::size_t>(bang - start)};
1554     }
1555   }
1556   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1557     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1558   }
1559   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1560     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1561         IsDirective("else", dir) || IsDirective("endif", dir)) {
1562       return {LineClassification::Kind::ConditionalCompilationDirective};
1563     } else if (IsDirective("include", dir)) {
1564       return {LineClassification::Kind::IncludeDirective};
1565     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1566       return {LineClassification::Kind::DefinitionDirective};
1567     } else {
1568       return {LineClassification::Kind::PreprocessorDirective};
1569     }
1570   }
1571   return {LineClassification::Kind::Source};
1572 }
1573 
1574 Prescanner::LineClassification Prescanner::ClassifyLine(
1575     TokenSequence &tokens, Provenance newlineProvenance) const {
1576   // Append a newline temporarily.
1577   tokens.PutNextTokenChar('\n', newlineProvenance);
1578   tokens.CloseToken();
1579   const char *ppd{tokens.ToCharBlock().begin()};
1580   LineClassification classification{ClassifyLine(ppd)};
1581   tokens.pop_back(); // remove the newline
1582   return classification;
1583 }
1584 
1585 void Prescanner::SourceFormChange(std::string &&dir) {
1586   if (dir == "!dir$ free") {
1587     inFixedForm_ = false;
1588   } else if (dir == "!dir$ fixed") {
1589     inFixedForm_ = true;
1590   }
1591 }
1592 
// Acquire and append compiler directive continuation lines to
// the tokens that constitute a compiler directive, even when those
// directive continuation lines are the result of macro expansion.
// (Not used when neither the original compiler directive line nor
// the directive continuation line result from preprocessing; regular
// line continuation during tokenization handles that normal case.)
// Returns true when a following comment line was skipped or when a
// continuation line's tokens replaced the trailing '&' of "tokens".
// "origSentinel" is the sentinel of the directive being continued.
bool Prescanner::CompilerDirectiveContinuation(
    TokenSequence &tokens, const char *origSentinel) {
  // Applies only in free form, and only when the last token is '&'.
  if (inFixedForm_ || tokens.empty() ||
      tokens.TokenAt(tokens.SizeInTokens() - 1) != "&") {
    return false;
  }
  LineClassification followingLine{ClassifyLine(nextLine_)};
  if (followingLine.kind == LineClassification::Kind::Comment) {
    // Skip the comment line; nothing is appended to the tokens.
    nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
    NextLine();
    return true;
  }
  CHECK(origSentinel != nullptr);
  directiveSentinel_ = origSentinel; // so InCompilerDirective() is true
  const char *nextContinuation{
      followingLine.kind == LineClassification::Kind::CompilerDirective
          ? FreeFormContinuationLine(true)
          : nullptr};
  // Without a directive continuation line, only an ordinary source line
  // is considered further.
  if (!nextContinuation &&
      followingLine.kind != LineClassification::Kind::Source) {
    return false;
  }
  auto origNextLine{nextLine_}; // restored below on failure
  BeginSourceLine(nextLine_);
  NextLine();
  if (nextContinuation) {
    // What follows is !DIR$ & xxx; skip over the & so that it
    // doesn't cause a spurious continuation.
    at_ = nextContinuation;
  } else {
    // What follows looks like a source line before macro expansion,
    // but might become a directive continuation afterwards.
    SkipSpaces();
  }
  // Tokenize the rest of the following line and apply macro replacement.
  TokenSequence followingTokens;
  while (NextToken(followingTokens)) {
  }
  if (auto followingPrepro{
          preprocessor_.MacroReplacement(followingTokens, *this)}) {
    followingTokens = std::move(*followingPrepro);
  }
  followingTokens.RemoveRedundantBlanks();
  std::size_t startAt{0};
  std::size_t following{followingTokens.SizeInTokens()};
  bool ok{false};
  if (nextContinuation) {
    ok = true;
  } else {
    // The replaced tokens must begin with '!' and then the sentinel,
    // which are tokens 0 and 1; the payload starts at token 2 or later.
    startAt = 2;
    if (startAt < following && followingTokens.TokenAt(0) == "!") {
      CharBlock sentinel{followingTokens.TokenAt(1)};
      // Only sentinel.size() bytes are compared, so the token has to
      // match the leading characters of origSentinel.
      if (!sentinel.empty() &&
          std::memcmp(sentinel.begin(), origSentinel, sentinel.size()) == 0) {
        ok = true;
        // Skip blanks and then one optional '&' after the sentinel.
        while (
            startAt < following && followingTokens.TokenAt(startAt).IsBlank()) {
          ++startAt;
        }
        if (startAt < following && followingTokens.TokenAt(startAt) == "&") {
          ++startAt;
        }
      }
    }
  }
  if (ok) {
    tokens.pop_back(); // delete original '&'
    tokens.Put(followingTokens, startAt, following - startAt);
    tokens.RemoveRedundantBlanks();
  } else {
    // Not a continuation after all: back up to the following line.
    nextLine_ = origNextLine;
  }
  return ok;
}
1672 
// Similar, but for source line continuation after macro replacement.
// Applies only in free form when the last token of "tokens" is '&'.
// Returns true when a following comment line was skipped or when the
// tokens of a continuation line replaced that '&'; returns false
// otherwise.
bool Prescanner::SourceLineContinuation(TokenSequence &tokens) {
  if (!inFixedForm_ && !tokens.empty() &&
      tokens.TokenAt(tokens.SizeInTokens() - 1) == "&") {
    LineClassification followingLine{ClassifyLine(nextLine_)};
    if (followingLine.kind == LineClassification::Kind::Comment) {
      // Skip the comment line; nothing is appended to the tokens.
      nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
      NextLine();
      return true;
    } else if (const char *nextContinuation{FreeFormContinuationLine(true)}) {
      BeginSourceLine(nextLine_);
      NextLine();
      TokenSequence followingTokens;
      // Tokenize from the continuation point to the end of that line and
      // apply macro replacement to the result.
      at_ = nextContinuation;
      while (NextToken(followingTokens)) {
      }
      if (auto followingPrepro{
              preprocessor_.MacroReplacement(followingTokens, *this)}) {
        followingTokens = std::move(*followingPrepro);
      }
      followingTokens.RemoveRedundantBlanks();
      tokens.pop_back(); // delete original '&'
      tokens.Put(followingTokens);
      return true;
    }
  }
  return false;
}
1701 } // namespace Fortran::parser
1702