xref: /llvm-project/flang/lib/Parser/prescan.cpp (revision bde2f39ae076c893d881d73b0d9c4ef4ea89c853)
1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "flang/Common/idioms.h"
11 #include "flang/Parser/characters.h"
12 #include "flang/Parser/message.h"
13 #include "flang/Parser/preprocessor.h"
14 #include "flang/Parser/source.h"
15 #include "flang/Parser/token-sequence.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Deepest permitted chain of nested prescanners.  Prescan() reports
// "too many nested INCLUDE/#include files" once this depth is exceeded.
static constexpr int maxPrescannerNesting{100};
27 
28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29     Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30     : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31       allSources_{preprocessor_.allSources()}, features_{lfc},
32       backslashFreeFormContinuation_{preprocessor.AnyDefinitions()},
33       encoding_{allSources_.encoding()} {}
34 
35 Prescanner::Prescanner(const Prescanner &that, Preprocessor &prepro,
36     bool isNestedInIncludeDirective)
37     : messages_{that.messages_}, cooked_{that.cooked_}, preprocessor_{prepro},
38       allSources_{that.allSources_}, features_{that.features_},
39       preprocessingOnly_{that.preprocessingOnly_},
40       expandIncludeLines_{that.expandIncludeLines_},
41       isNestedInIncludeDirective_{isNestedInIncludeDirective},
42       backslashFreeFormContinuation_{that.backslashFreeFormContinuation_},
43       inFixedForm_{that.inFixedForm_},
44       fixedFormColumnLimit_{that.fixedFormColumnLimit_},
45       encoding_{that.encoding_},
46       prescannerNesting_{that.prescannerNesting_ + 1},
47       skipLeadingAmpersand_{that.skipLeadingAmpersand_},
48       compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
49       compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
50 
// Returns the byte length of the blank that begins at p, or 0 when p does
// not point at a blank.  An ASCII space and a LATIN-1 non-breaking space
// are one byte long; a UTF-8 non-breaking space is two bytes long.
static inline int IsSpace(const char *p) {
  switch (p[0]) {
  case ' ':
  case '\xa0': // LATIN-1 NBSP
    return 1;
  case '\xc2': // possible lead byte of a UTF-8 NBSP
    return p[1] == '\xa0' ? 2 : 0;
  default:
    return 0;
  }
}
63 
64 static inline int IsSpaceOrTab(const char *p) {
65   return *p == '\t' ? 1 : IsSpace(p);
66 }
67 
// True for the characters accepted here as fixed-form comment markers:
// '!', '*', and 'C' in either case.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
71 
72 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
73   char *p{dir.GetMutableCharData()};
74   char *limit{p + dir.SizeInChars()};
75   for (; p < limit; ++p) {
76     if (*p != ' ') {
77       CHECK(IsFixedFormCommentChar(*p));
78       *p = '!';
79       return;
80     }
81   }
82   DIE("compiler directive all blank");
83 }
84 
85 void Prescanner::Prescan(ProvenanceRange range) {
86   startProvenance_ = range.start();
87   start_ = allSources_.GetSource(range);
88   CHECK(start_);
89   limit_ = start_ + range.size();
90   nextLine_ = start_;
91   const bool beganInFixedForm{inFixedForm_};
92   if (prescannerNesting_ > maxPrescannerNesting) {
93     Say(GetProvenance(start_),
94         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
95     return;
96   }
97   while (!IsAtEnd()) {
98     Statement();
99   }
100   if (inFixedForm_ != beganInFixedForm) {
101     std::string dir{"!dir$ "};
102     if (beganInFixedForm) {
103       dir += "fixed";
104     } else {
105       dir += "free";
106     }
107     dir += '\n';
108     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
109     tokens.Emit(cooked_);
110   }
111 }
112 
113 void Prescanner::Statement() {
114   TokenSequence tokens;
115   const char *statementStart{nextLine_};
116   LineClassification line{ClassifyLine(statementStart)};
117   switch (line.kind) {
118   case LineClassification::Kind::Comment:
119     nextLine_ += line.payloadOffset; // advance to '!' or newline
120     NextLine();
121     return;
122   case LineClassification::Kind::IncludeLine:
123     FortranInclude(nextLine_ + line.payloadOffset);
124     NextLine();
125     return;
126   case LineClassification::Kind::ConditionalCompilationDirective:
127   case LineClassification::Kind::IncludeDirective:
128     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
129     afterPreprocessingDirective_ = true;
130     skipLeadingAmpersand_ |= !inFixedForm_;
131     return;
132   case LineClassification::Kind::PreprocessorDirective:
133     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
134     afterPreprocessingDirective_ = true;
135     // Don't set skipLeadingAmpersand_
136     return;
137   case LineClassification::Kind::DefinitionDirective:
138     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
139     // Don't set afterPreprocessingDirective_ or skipLeadingAmpersand_
140     return;
141   case LineClassification::Kind::CompilerDirective: {
142     directiveSentinel_ = line.sentinel;
143     CHECK(InCompilerDirective());
144     BeginStatementAndAdvance();
145     if (inFixedForm_) {
146       CHECK(IsFixedFormCommentChar(*at_));
147     } else {
148       while (int n{IsSpaceOrTab(at_)}) {
149         at_ += n, ++column_;
150       }
151       CHECK(*at_ == '!');
152     }
153     std::optional<int> condOffset;
154     if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
155       // OpenMP conditional compilation line.
156       condOffset = 2;
157     } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'c' &&
158         directiveSentinel_[2] == 'u' && directiveSentinel_[3] == 'f' &&
159         directiveSentinel_[4] == '\0') {
160       // CUDA conditional compilation line.
161       condOffset = 5;
162     }
163     if (condOffset) {
164       at_ += *condOffset, column_ += *condOffset;
165       if (auto payload{IsIncludeLine(at_)}) {
166         FortranInclude(at_ + *payload);
167         return;
168       } else if (inFixedForm_) {
169         LabelField(tokens);
170       } else {
171         SkipSpaces();
172       }
173     } else {
174       // Compiler directive.  Emit normalized sentinel, squash following spaces.
175       EmitChar(tokens, '!');
176       ++at_, ++column_;
177       for (const char *sp{directiveSentinel_}; *sp != '\0';
178            ++sp, ++at_, ++column_) {
179         EmitChar(tokens, *sp);
180       }
181       if (IsSpaceOrTab(at_)) {
182         EmitChar(tokens, ' ');
183         while (int n{IsSpaceOrTab(at_)}) {
184           at_ += n, ++column_;
185         }
186       }
187       tokens.CloseToken();
188     }
189     break;
190   }
191   case LineClassification::Kind::Source: {
192     BeginStatementAndAdvance();
193     bool checkLabelField{false};
194     if (inFixedForm_) {
195       if (features_.IsEnabled(LanguageFeature::OldDebugLines) &&
196           (*at_ == 'D' || *at_ == 'd')) {
197         NextChar();
198       }
199       checkLabelField = true;
200     } else {
201       if (skipLeadingAmpersand_) {
202         skipLeadingAmpersand_ = false;
203         const char *p{SkipWhiteSpace(at_)};
204         if (p < limit_ && *p == '&') {
205           column_ += ++p - at_;
206           at_ = p;
207         }
208       } else {
209         SkipSpaces();
210       }
211     }
212     // Check for a leading identifier that might be a keyword macro
213     // that will expand to anything indicating a non-source line, like
214     // a comment marker or directive sentinel.  If so, disable line
215     // continuation, so that NextToken() won't consume anything from
216     // following lines.
217     if (IsLegalIdentifierStart(*at_)) {
218       // TODO: Only bother with these cases when any keyword macro has
219       // been defined with replacement text that could begin a comment
220       // or directive sentinel.
221       const char *p{at_};
222       while (IsLegalInIdentifier(*++p)) {
223       }
224       CharBlock id{at_, static_cast<std::size_t>(p - at_)};
225       if (preprocessor_.IsNameDefined(id) &&
226           !preprocessor_.IsFunctionLikeDefinition(id)) {
227         checkLabelField = false;
228         TokenSequence toks;
229         toks.Put(id, GetProvenance(at_));
230         if (auto replaced{preprocessor_.MacroReplacement(toks, *this)}) {
231           auto newLineClass{ClassifyLine(*replaced, GetCurrentProvenance())};
232           if (newLineClass.kind ==
233               LineClassification::Kind::CompilerDirective) {
234             directiveSentinel_ = newLineClass.sentinel;
235             disableSourceContinuation_ = false;
236           } else {
237             disableSourceContinuation_ =
238                 newLineClass.kind != LineClassification::Kind::Source;
239           }
240         }
241       }
242     }
243     if (checkLabelField) {
244       LabelField(tokens);
245     }
246   } break;
247   }
248 
249   while (NextToken(tokens)) {
250   }
251   if (continuationLines_ > 255) {
252     if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
253       Say(common::LanguageFeature::MiscSourceExtensions,
254           GetProvenance(statementStart),
255           "%d continuation lines is more than the Fortran standard allows"_port_en_US,
256           continuationLines_);
257     }
258   }
259 
260   Provenance newlineProvenance{GetCurrentProvenance()};
261   if (std::optional<TokenSequence> preprocessed{
262           preprocessor_.MacroReplacement(tokens, *this)}) {
263     // Reprocess the preprocessed line.
264     LineClassification ppl{ClassifyLine(*preprocessed, newlineProvenance)};
265     switch (ppl.kind) {
266     case LineClassification::Kind::Comment:
267       break;
268     case LineClassification::Kind::IncludeLine:
269       FortranInclude(preprocessed->TokenAt(0).begin() + ppl.payloadOffset);
270       break;
271     case LineClassification::Kind::ConditionalCompilationDirective:
272     case LineClassification::Kind::IncludeDirective:
273     case LineClassification::Kind::DefinitionDirective:
274     case LineClassification::Kind::PreprocessorDirective:
275       if (features_.ShouldWarn(common::UsageWarning::Preprocessing)) {
276         Say(common::UsageWarning::Preprocessing,
277             preprocessed->GetProvenanceRange(),
278             "Preprocessed line resembles a preprocessor directive"_warn_en_US);
279       }
280       CheckAndEmitLine(preprocessed->ToLowerCase(), newlineProvenance);
281       break;
282     case LineClassification::Kind::CompilerDirective:
283       if (preprocessed->HasRedundantBlanks()) {
284         preprocessed->RemoveRedundantBlanks();
285       }
286       while (CompilerDirectiveContinuation(*preprocessed, ppl.sentinel)) {
287         newlineProvenance = GetCurrentProvenance();
288       }
289       NormalizeCompilerDirectiveCommentMarker(*preprocessed);
290       preprocessed->ToLowerCase();
291       SourceFormChange(preprocessed->ToString());
292       CheckAndEmitLine(preprocessed->ToLowerCase().ClipComment(
293                            *this, true /* skip first ! */),
294           newlineProvenance);
295       break;
296     case LineClassification::Kind::Source:
297       if (inFixedForm_) {
298         if (!preprocessingOnly_ && preprocessed->HasBlanks()) {
299           preprocessed->RemoveBlanks();
300         }
301       } else {
302         while (SourceLineContinuation(*preprocessed)) {
303           newlineProvenance = GetCurrentProvenance();
304         }
305         if (preprocessed->HasRedundantBlanks()) {
306           preprocessed->RemoveRedundantBlanks();
307         }
308       }
309       CheckAndEmitLine(
310           preprocessed->ToLowerCase().ClipComment(*this), newlineProvenance);
311       break;
312     }
313   } else { // no macro replacement
314     if (line.kind == LineClassification::Kind::CompilerDirective) {
315       while (CompilerDirectiveContinuation(tokens, line.sentinel)) {
316         newlineProvenance = GetCurrentProvenance();
317       }
318       tokens.ToLowerCase();
319       SourceFormChange(tokens.ToString());
320     } else { // Kind::Source
321       tokens.ToLowerCase();
322       if (inFixedForm_) {
323         EnforceStupidEndStatementRules(tokens);
324       }
325     }
326     CheckAndEmitLine(tokens, newlineProvenance);
327   }
328   directiveSentinel_ = nullptr;
329 }
330 
331 void Prescanner::CheckAndEmitLine(
332     TokenSequence &tokens, Provenance newlineProvenance) {
333   tokens.CheckBadFortranCharacters(
334       messages_, *this, disableSourceContinuation_);
335   // Parenthesis nesting check does not apply while any #include is
336   // active, nor on the lines before and after a top-level #include,
337   // nor before or after conditional source.
338   // Applications play shenanigans with line continuation before and
339   // after #include'd subprogram argument lists and conditional source.
340   if (!isNestedInIncludeDirective_ && !omitNewline_ &&
341       !afterPreprocessingDirective_ && tokens.BadlyNestedParentheses() &&
342       !preprocessor_.InConditional()) {
343     if (nextLine_ < limit_ && IsPreprocessorDirectiveLine(nextLine_)) {
344       // don't complain
345     } else {
346       tokens.CheckBadParentheses(messages_);
347     }
348   }
349   tokens.Emit(cooked_);
350   if (omitNewline_) {
351     omitNewline_ = false;
352   } else {
353     cooked_.Put('\n', newlineProvenance);
354     afterPreprocessingDirective_ = false;
355   }
356 }
357 
358 TokenSequence Prescanner::TokenizePreprocessorDirective() {
359   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
360   inPreprocessorDirective_ = true;
361   BeginStatementAndAdvance();
362   TokenSequence tokens;
363   while (NextToken(tokens)) {
364   }
365   inPreprocessorDirective_ = false;
366   return tokens;
367 }
368 
369 void Prescanner::NextLine() {
370   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
371   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
372   if (!v) {
373     nextLine_ = limit_;
374   } else {
375     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
376     nextLine_ = nl + 1;
377   }
378 }
379 
380 void Prescanner::LabelField(TokenSequence &token) {
381   int outCol{1};
382   const char *start{at_};
383   std::optional<int> badColumn;
384   for (; *at_ != '\n' && column_ <= 6; ++at_) {
385     if (*at_ == '\t') {
386       ++at_;
387       column_ = 7;
388       break;
389     }
390     if (int n{IsSpace(at_)}; n == 0 &&
391         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
392       EmitChar(token, *at_);
393       ++outCol;
394       if (!badColumn && (column_ == 6 || !IsDecimalDigit(*at_))) {
395         badColumn = column_;
396       }
397     }
398     ++column_;
399   }
400   if (badColumn && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
401     if ((prescannerNesting_ > 0 && *badColumn == 6 &&
402             cooked_.BufferedBytes() == firstCookedCharacterOffset_) ||
403         afterPreprocessingDirective_) {
404       // This is the first source line in #include'd text or conditional
405       // code under #if, or the first source line after such.
406       // If it turns out that the preprocessed text begins with a
407       // fixed form continuation line, the newline at the end
408       // of the latest source line beforehand will be deleted in
409       // CookedSource::Marshal().
410       cooked_.MarkPossibleFixedFormContinuation();
411     } else if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
412       Say(common::UsageWarning::Scanning, GetProvenance(start + *badColumn - 1),
413           *badColumn == 6
414               ? "Statement should not begin with a continuation line"_warn_en_US
415               : "Character in fixed-form label field must be a digit"_warn_en_US);
416     }
417     token.clear();
418     if (*badColumn < 6) {
419       at_ = start;
420       column_ = 1;
421       return;
422     }
423     outCol = 1;
424   }
425   if (outCol == 1) { // empty label field
426     // Emit a space so that, if the line is rescanned after preprocessing,
427     // a leading 'C' or 'D' won't be left-justified and then accidentally
428     // misinterpreted as a comment card.
429     EmitChar(token, ' ');
430     ++outCol;
431   }
432   token.CloseToken();
433   SkipToNextSignificantCharacter();
434   if (IsDecimalDigit(*at_)) {
435     if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
436       Say(common::LanguageFeature::MiscSourceExtensions, GetCurrentProvenance(),
437           "Label digit is not in fixed-form label field"_port_en_US);
438     }
439   }
440 }
441 
442 // 6.3.3.5: A program unit END statement, or any other statement whose
443 // initial line resembles an END statement, shall not be continued in
444 // fixed form source.
445 void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
446   CharBlock cBlock{tokens.ToCharBlock()};
447   const char *str{cBlock.begin()};
448   std::size_t n{cBlock.size()};
449   if (n < 3) {
450     return;
451   }
452   std::size_t j{0};
453   for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
454   }
455   if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
456     return;
457   }
458   // It starts with END, possibly after a label.
459   auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
460   auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
461   if (!start || !end) {
462     return;
463   }
464   if (&*start->sourceFile == &*end->sourceFile && start->line == end->line) {
465     return; // no continuation
466   }
467   j += 3;
468   static const char *const prefixes[]{"program", "subroutine", "function",
469       "blockdata", "module", "submodule", nullptr};
470   bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
471   std::size_t endOfPrefix{j - 1};
472   for (const char *const *p{prefixes}; *p; ++p) {
473     std::size_t pLen{std::strlen(*p)};
474     if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
475       isPrefix = true; // END thing as prefix
476       j += pLen;
477       endOfPrefix = j - 1;
478       for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
479       }
480       break;
481     }
482   }
483   if (isPrefix) {
484     auto range{tokens.GetTokenProvenanceRange(1)};
485     if (j == n) { // END or END thing [name]
486       Say(range,
487           "Program unit END statement may not be continued in fixed form source"_err_en_US);
488     } else {
489       auto endOfPrefixPos{
490           allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
491       auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
492       if (endOfPrefixPos && next &&
493           &*endOfPrefixPos->sourceFile == &*start->sourceFile &&
494           endOfPrefixPos->line == start->line &&
495           (&*next->sourceFile != &*start->sourceFile ||
496               next->line != start->line)) {
497         Say(range,
498             "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
499       }
500     }
501   }
502 }
503 
504 void Prescanner::SkipToEndOfLine() {
505   while (*at_ != '\n') {
506     ++at_, ++column_;
507   }
508 }
509 
510 bool Prescanner::MustSkipToEndOfLine() const {
511   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
512     return true; // skip over ignored columns in right margin (73:80)
513   } else if (*at_ == '!' && !inCharLiteral_) {
514     return !IsCompilerDirectiveSentinel(at_);
515   } else {
516     return false;
517   }
518 }
519 
520 void Prescanner::NextChar() {
521   CHECK(*at_ != '\n');
522   int n{IsSpace(at_)};
523   at_ += n ? n : 1;
524   ++column_;
525   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
526     // UTF-8 byte order mark - treat this file as UTF-8
527     at_ += 3;
528     encoding_ = Encoding::UTF_8;
529   }
530   SkipToNextSignificantCharacter();
531 }
532 
533 // Skip everything that should be ignored until the next significant
534 // character is reached; handles C-style comments in preprocessing
535 // directives, Fortran ! comments, stuff after the right margin in
536 // fixed form, and all forms of line continuation.
537 bool Prescanner::SkipToNextSignificantCharacter() {
538   auto anyContinuationLine{false};
539   if (inPreprocessorDirective_) {
540     SkipCComments();
541   } else {
542     bool mightNeedSpace{false};
543     if (MustSkipToEndOfLine()) {
544       SkipToEndOfLine();
545     } else {
546       mightNeedSpace = *at_ == '\n';
547     }
548     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
549       anyContinuationLine = true;
550       ++continuationLines_;
551       if (MustSkipToEndOfLine()) {
552         SkipToEndOfLine();
553       }
554     }
555     if (*at_ == '\t') {
556       tabInCurrentLine_ = true;
557     }
558   }
559   return anyContinuationLine;
560 }
561 
562 void Prescanner::SkipCComments() {
563   while (true) {
564     if (IsCComment(at_)) {
565       if (const char *after{SkipCComment(at_)}) {
566         column_ += after - at_;
567         // May have skipped over one or more newlines; relocate the start of
568         // the next line.
569         nextLine_ = at_ = after;
570         NextLine();
571       } else {
572         // Don't emit any messages about unclosed C-style comments, because
573         // the sequence /* can appear legally in a FORMAT statement.  There's
574         // no ambiguity, since the sequence */ cannot appear legally.
575         break;
576       }
577     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
578         at_[1] == '\n' && !IsAtEnd()) {
579       BeginSourceLineAndAdvance();
580     } else {
581       break;
582     }
583   }
584 }
585 
586 void Prescanner::SkipSpaces() {
587   while (IsSpaceOrTab(at_)) {
588     NextChar();
589   }
590   insertASpace_ = false;
591 }
592 
593 const char *Prescanner::SkipWhiteSpace(const char *p) {
594   while (int n{IsSpaceOrTab(p)}) {
595     p += n;
596   }
597   return p;
598 }
599 
600 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
601   while (true) {
602     if (int n{IsSpaceOrTab(p)}) {
603       p += n;
604     } else if (IsCComment(p)) {
605       if (const char *after{SkipCComment(p)}) {
606         p = after;
607       } else {
608         break;
609       }
610     } else {
611       break;
612     }
613   }
614   return p;
615 }
616 
617 const char *Prescanner::SkipCComment(const char *p) const {
618   char star{' '}, slash{' '};
619   p += 2;
620   while (star != '*' || slash != '/') {
621     if (p >= limit_) {
622       return nullptr; // signifies an unterminated comment
623     }
624     star = slash;
625     slash = *p++;
626   }
627   return p;
628 }
629 
630 bool Prescanner::NextToken(TokenSequence &tokens) {
631   CHECK(at_ >= start_ && at_ < limit_);
632   if (InFixedFormSource() && !preprocessingOnly_) {
633     SkipSpaces();
634   } else {
635     if (*at_ == '/' && IsCComment(at_)) {
636       // Recognize and skip over classic C style /*comments*/ when
637       // outside a character literal.
638       if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
639         Say(LanguageFeature::ClassicCComments, GetCurrentProvenance(),
640             "nonstandard usage: C-style comment"_port_en_US);
641       }
642       SkipCComments();
643     }
644     if (IsSpaceOrTab(at_)) {
645       // Compress free-form white space into a single space character.
646       const auto theSpace{at_};
647       char previous{at_ <= start_ ? ' ' : at_[-1]};
648       NextChar();
649       SkipSpaces();
650       if (*at_ == '\n' && !omitNewline_) {
651         // Discard white space at the end of a line.
652       } else if (!inPreprocessorDirective_ &&
653           (previous == '(' || *at_ == '(' || *at_ == ')')) {
654         // Discard white space before/after '(' and before ')', unless in a
655         // preprocessor directive.  This helps yield space-free contiguous
656         // names for generic interfaces like OPERATOR( + ) and
657         // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
658         // This has the effect of silently ignoring the illegal spaces in
659         // the array constructor ( /1,2/ ) but that seems benign; it's
660         // hard to avoid that while still removing spaces from OPERATOR( / )
661         // and OPERATOR( // ).
662       } else {
663         // Preserve the squashed white space as a single space character.
664         tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
665         tokens.CloseToken();
666         return true;
667       }
668     }
669   }
670   if (insertASpace_) {
671     tokens.PutNextTokenChar(' ', spaceProvenance_);
672     insertASpace_ = false;
673   }
674   if (*at_ == '\n') {
675     return false;
676   }
677   const char *start{at_};
678   if (*at_ == '\'' || *at_ == '"') {
679     QuotedCharacterLiteral(tokens, start);
680     preventHollerith_ = false;
681   } else if (IsDecimalDigit(*at_)) {
682     int n{0}, digits{0};
683     static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
684     do {
685       if (n < maxHollerith) {
686         n = 10 * n + DecimalDigitValue(*at_);
687       }
688       EmitCharAndAdvance(tokens, *at_);
689       ++digits;
690       if (InFixedFormSource()) {
691         SkipSpaces();
692       }
693     } while (IsDecimalDigit(*at_));
694     if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
695         !preventHollerith_) {
696       Hollerith(tokens, n, start);
697     } else if (*at_ == '.') {
698       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
699       }
700       ExponentAndKind(tokens);
701     } else if (ExponentAndKind(tokens)) {
702     } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
703         inPreprocessorDirective_) {
704       do {
705         EmitCharAndAdvance(tokens, *at_);
706       } while (IsHexadecimalDigit(*at_));
707     } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
708       EmitCharAndAdvance(tokens, *at_);
709       QuotedCharacterLiteral(tokens, start);
710     } else if (IsLetter(*at_) && !preventHollerith_ &&
711         parenthesisNesting_ > 0) {
712       // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
713       // we don't misrecognize I9HOLLERITH as an identifier in the next case.
714       EmitCharAndAdvance(tokens, *at_);
715     }
716     preventHollerith_ = false;
717   } else if (*at_ == '.') {
718     char nch{EmitCharAndAdvance(tokens, '.')};
719     if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
720       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
721       }
722       ExponentAndKind(tokens);
723     } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
724       EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
725     }
726     preventHollerith_ = false;
727   } else if (IsLegalInIdentifier(*at_)) {
728     int parts{1};
729     const char *afterLast{nullptr};
730     do {
731       EmitChar(tokens, *at_);
732       ++at_, ++column_;
733       afterLast = at_;
734       if (SkipToNextSignificantCharacter() && IsLegalIdentifierStart(*at_)) {
735         tokens.CloseToken();
736         ++parts;
737       }
738     } while (IsLegalInIdentifier(*at_));
739     if (parts >= 3) {
740       // Subtlety: When an identifier is split across three or more continuation
741       // lines (or two continuation lines, immediately preceded or followed
742       // by '&' free form continuation line markers, its parts are kept as
743       // distinct pp-tokens so that macro replacement operates on them
744       // independently.  This trick accommodates the historic practice of
745       // using line continuation for token pasting after replacement.
746     } else if (parts == 2) {
747       if (afterLast && afterLast < limit_) {
748         afterLast = SkipWhiteSpace(afterLast);
749       }
750       if ((start > start_ && start[-1] == '&') ||
751           (afterLast && afterLast < limit_ &&
752               (*afterLast == '&' || *afterLast == '\n'))) {
753         // call &                call foo&        call foo&
754         //   &MACRO&      OR       &MACRO&   OR     &MACRO
755         //   &foo(...)             &(...)
756       } else {
757         tokens.ReopenLastToken();
758       }
759     }
760     if (InFixedFormSource()) {
761       SkipSpaces();
762     }
763     if ((*at_ == '\'' || *at_ == '"') &&
764         tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
765       QuotedCharacterLiteral(tokens, start);
766       preventHollerith_ = false;
767     } else {
768       preventHollerith_ = true; // DO 10 H = ...
769     }
770   } else if (*at_ == '*') {
771     if (EmitCharAndAdvance(tokens, '*') == '*') {
772       EmitCharAndAdvance(tokens, '*');
773     } else {
774       // Subtle ambiguity:
775       //  CHARACTER*2H     declares H because *2 is a kind specifier
776       //  DATAC/N*2H  /    is repeated Hollerith
777       preventHollerith_ = !slashInCurrentStatement_;
778     }
779   } else {
780     char ch{*at_};
781     if (ch == '(') {
782       if (parenthesisNesting_++ == 0) {
783         isPossibleMacroCall_ = tokens.SizeInTokens() > 0 &&
784             preprocessor_.IsFunctionLikeDefinition(
785                 tokens.TokenAt(tokens.SizeInTokens() - 1));
786       }
787     } else if (ch == ')' && parenthesisNesting_ > 0) {
788       --parenthesisNesting_;
789     }
790     char nch{EmitCharAndAdvance(tokens, ch)};
791     preventHollerith_ = false;
792     if ((nch == '=' &&
793             (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
794         (ch == nch &&
795             (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
796                 ch == '|' || ch == '<' || ch == '>')) ||
797         (ch == '=' && nch == '>')) {
798       // token comprises two characters
799       EmitCharAndAdvance(tokens, nch);
800     } else if (ch == '/') {
801       slashInCurrentStatement_ = true;
802     } else if (ch == ';' && InFixedFormSource()) {
803       SkipSpaces();
804       if (IsDecimalDigit(*at_)) {
805         if (features_.ShouldWarn(
806                 common::LanguageFeature::MiscSourceExtensions)) {
807           Say(common::LanguageFeature::MiscSourceExtensions,
808               GetProvenanceRange(at_, at_ + 1),
809               "Label should be in the label field"_port_en_US);
810         }
811       }
812     }
813   }
814   tokens.CloseToken();
815   return true;
816 }
817 
818 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
819   char ed{ToLowerCaseLetter(*at_)};
820   if (ed != 'e' && ed != 'd') {
821     return false;
822   }
823   EmitCharAndAdvance(tokens, ed);
824   if (*at_ == '+' || *at_ == '-') {
825     EmitCharAndAdvance(tokens, *at_);
826   }
827   while (IsDecimalDigit(*at_)) {
828     EmitCharAndAdvance(tokens, *at_);
829   }
830   if (*at_ == '_') {
831     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
832     }
833   }
834   return true;
835 }
836 
837 void Prescanner::QuotedCharacterLiteral(
838     TokenSequence &tokens, const char *start) {
839   char quote{*at_};
840   const char *end{at_ + 1};
841   inCharLiteral_ = true;
842   continuationInCharLiteral_ = true;
843   const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
844   const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
845   bool isEscaped{false};
846   bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
847   while (true) {
848     if (*at_ == '\\') {
849       if (escapesEnabled) {
850         isEscaped = !isEscaped;
851       } else {
852         // The parser always processes escape sequences, so don't confuse it
853         // when escapes are disabled.
854         insert('\\');
855       }
856     } else {
857       isEscaped = false;
858     }
859     EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
860         Encoding::LATIN_1);
861     while (PadOutCharacterLiteral(tokens)) {
862     }
863     if (*at_ == '\n') {
864       if (!inPreprocessorDirective_) {
865         Say(GetProvenanceRange(start, end),
866             "Incomplete character literal"_err_en_US);
867       }
868       break;
869     }
870     // Here's a weird edge case.  When there's a two or more following
871     // continuation lines at this point, and the entire significant part of
872     // the next continuation line is the name of a keyword macro, replace
873     // it in the character literal with its definition.  Example:
874     //   #define FOO foo
875     //   subroutine subr() bind(c, name="my_&
876     //     &FOO&
877     //     &_bar") ...
878     // produces a binding name of "my_foo_bar".
879     while (at_[1] == '&' && nextLine_ < limit_ && !InFixedFormSource()) {
880       const char *idStart{nextLine_};
881       if (const char *amper{SkipWhiteSpace(nextLine_)}; *amper == '&') {
882         idStart = amper + 1;
883       }
884       if (IsLegalIdentifierStart(*idStart)) {
885         std::size_t idLen{1};
886         for (; IsLegalInIdentifier(idStart[idLen]); ++idLen) {
887         }
888         if (idStart[idLen] == '&') {
889           CharBlock id{idStart, idLen};
890           if (preprocessor_.IsNameDefined(id)) {
891             TokenSequence ppTokens;
892             ppTokens.Put(id, GetProvenance(idStart));
893             if (auto replaced{
894                     preprocessor_.MacroReplacement(ppTokens, *this)}) {
895               tokens.Put(*replaced);
896               at_ = &idStart[idLen - 1];
897               NextLine();
898               continue; // try again on the next line
899             }
900           }
901         }
902       }
903       break;
904     }
905     end = at_ + 1;
906     NextChar();
907     if (*at_ == quote && !isEscaped) {
908       // A doubled unescaped quote mark becomes a single instance of that
909       // quote character in the literal (later).  There can be spaces between
910       // the quotes in fixed form source.
911       EmitChar(tokens, quote);
912       inCharLiteral_ = false; // for cases like print *, '...'!comment
913       NextChar();
914       if (InFixedFormSource()) {
915         SkipSpaces();
916       }
917       if (*at_ != quote) {
918         break;
919       }
920       inCharLiteral_ = true;
921     }
922   }
923   continuationInCharLiteral_ = false;
924   inCharLiteral_ = false;
925 }
926 
927 void Prescanner::Hollerith(
928     TokenSequence &tokens, int count, const char *start) {
929   inCharLiteral_ = true;
930   CHECK(*at_ == 'h' || *at_ == 'H');
931   EmitChar(tokens, 'H');
932   while (count-- > 0) {
933     if (PadOutCharacterLiteral(tokens)) {
934     } else if (*at_ == '\n') {
935       if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
936         Say(common::UsageWarning::Scanning, GetProvenanceRange(start, at_),
937             "Possible truncated Hollerith literal"_warn_en_US);
938       }
939       break;
940     } else {
941       NextChar();
942       // Each multi-byte character encoding counts as a single character.
943       // No escape sequences are recognized.
944       // Hollerith is always emitted to the cooked character
945       // stream in UTF-8.
946       DecodedCharacter decoded{DecodeCharacter(
947           encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
948       if (decoded.bytes > 0) {
949         EncodedCharacter utf8{
950             EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
951         for (int j{0}; j < utf8.bytes; ++j) {
952           EmitChar(tokens, utf8.buffer[j]);
953         }
954         at_ += decoded.bytes - 1;
955       } else {
956         Say(GetProvenanceRange(start, at_),
957             "Bad character in Hollerith literal"_err_en_US);
958         break;
959       }
960     }
961   }
962   if (*at_ != '\n') {
963     NextChar();
964   }
965   inCharLiteral_ = false;
966 }
967 
968 // In fixed form, source card images must be processed as if they were at
969 // least 72 columns wide, at least in character literal contexts.
970 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
971   while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
972     if (column_ < fixedFormColumnLimit_) {
973       tokens.PutNextTokenChar(' ', spaceProvenance_);
974       ++column_;
975       return true;
976     }
977     if (!FixedFormContinuation(false /*no need to insert space*/) ||
978         tabInCurrentLine_) {
979       return false;
980     }
981     CHECK(column_ == 7);
982     --at_; // point to column 6 of continuation line
983     column_ = 6;
984   }
985   return false;
986 }
987 
988 static bool IsAtProcess(const char *p) {
989   static const char pAtProc[]{"process"};
990   for (std::size_t i{0}; i < sizeof pAtProc - 1; ++i) {
991     if (ToLowerCaseLetter(*++p) != pAtProc[i])
992       return false;
993   }
994   return true;
995 }
996 
997 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
998   const char *p{start};
999 
1000   // The @process directive must start in column 1.
1001   if (*p == '@' && IsAtProcess(p)) {
1002     return true;
1003   }
1004 
1005   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
1006       ((*p == 'D' || *p == 'd') &&
1007           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
1008     return true;
1009   }
1010   bool anyTabs{false};
1011   while (true) {
1012     if (int n{IsSpace(p)}) {
1013       p += n;
1014     } else if (*p == '\t') {
1015       anyTabs = true;
1016       ++p;
1017     } else if (*p == '0' && !anyTabs && p == start + 5) {
1018       ++p; // 0 in column 6 must treated as a space
1019     } else {
1020       break;
1021     }
1022   }
1023   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
1024     return true;
1025   }
1026   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
1027     return true;
1028   }
1029   return *p == '\n';
1030 }
1031 
1032 const char *Prescanner::IsFreeFormComment(const char *p) const {
1033   p = SkipWhiteSpaceAndCComments(p);
1034   if (*p == '!' || *p == '\n') {
1035     return p;
1036   } else if (*p == '@') {
1037     return IsAtProcess(p) ? p : nullptr;
1038   } else {
1039     return nullptr;
1040   }
1041 }
1042 
1043 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
1044   if (!expandIncludeLines_) {
1045     return std::nullopt;
1046   }
1047   const char *p{SkipWhiteSpace(start)};
1048   if (*p == '0' && inFixedForm_ && p == start + 5) {
1049     // Accept "     0INCLUDE" in fixed form.
1050     p = SkipWhiteSpace(p + 1);
1051   }
1052   for (const char *q{"include"}; *q; ++q) {
1053     if (ToLowerCaseLetter(*p) != *q) {
1054       return std::nullopt;
1055     }
1056     p = SkipWhiteSpace(p + 1);
1057   }
1058   if (IsDecimalDigit(*p)) { // accept & ignore a numeric kind prefix
1059     for (p = SkipWhiteSpace(p + 1); IsDecimalDigit(*p);
1060          p = SkipWhiteSpace(p + 1)) {
1061     }
1062     if (*p != '_') {
1063       return std::nullopt;
1064     }
1065     p = SkipWhiteSpace(p + 1);
1066   }
1067   if (*p == '"' || *p == '\'') {
1068     return {p - start};
1069   }
1070   return std::nullopt;
1071 }
1072 
1073 void Prescanner::FortranInclude(const char *firstQuote) {
1074   const char *p{firstQuote};
1075   while (*p != '"' && *p != '\'') {
1076     ++p;
1077   }
1078   char quote{*p};
1079   std::string path;
1080   for (++p; *p != '\n'; ++p) {
1081     if (*p == quote) {
1082       if (p[1] != quote) {
1083         break;
1084       }
1085       ++p;
1086     }
1087     path += *p;
1088   }
1089   if (*p != quote) {
1090     Say(GetProvenanceRange(firstQuote, p),
1091         "malformed path name string"_err_en_US);
1092     return;
1093   }
1094   p = SkipWhiteSpace(p + 1);
1095   if (*p != '\n' && *p != '!') {
1096     const char *garbage{p};
1097     for (; *p != '\n' && *p != '!'; ++p) {
1098     }
1099     if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
1100       Say(common::UsageWarning::Scanning, GetProvenanceRange(garbage, p),
1101           "excess characters after path name"_warn_en_US);
1102     }
1103   }
1104   std::string buf;
1105   llvm::raw_string_ostream error{buf};
1106   Provenance provenance{GetProvenance(nextLine_)};
1107   std::optional<std::string> prependPath;
1108   if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
1109     prependPath = DirectoryName(currentFile->path());
1110   }
1111   const SourceFile *included{
1112       allSources_.Open(path, error, std::move(prependPath))};
1113   if (!included) {
1114     Say(provenance, "INCLUDE: %s"_err_en_US, buf);
1115   } else if (included->bytes() > 0) {
1116     ProvenanceRange includeLineRange{
1117         provenance, static_cast<std::size_t>(p - nextLine_)};
1118     ProvenanceRange fileRange{
1119         allSources_.AddIncludedFile(*included, includeLineRange)};
1120     Preprocessor cleanPrepro{allSources_};
1121     if (preprocessor_.IsNameDefined("__FILE__"s)) {
1122       cleanPrepro.DefineStandardMacros(); // __FILE__, __LINE__, &c.
1123     }
1124     if (preprocessor_.IsNameDefined("_CUDA"s)) {
1125       cleanPrepro.Define("_CUDA"s, "1");
1126     }
1127     Prescanner{*this, cleanPrepro, /*isNestedInIncludeDirective=*/false}
1128         .set_encoding(included->encoding())
1129         .Prescan(fileRange);
1130   }
1131 }
1132 
1133 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
1134   const char *p{start};
1135   while (int n{IsSpace(p)}) {
1136     p += n;
1137   }
1138   if (*p == '#') {
1139     if (inFixedForm_ && p == start + 5) {
1140       return nullptr;
1141     }
1142   } else {
1143     p = SkipWhiteSpace(p);
1144     if (*p != '#') {
1145       return nullptr;
1146     }
1147   }
1148   return SkipWhiteSpace(p + 1);
1149 }
1150 
1151 bool Prescanner::IsNextLinePreprocessorDirective() const {
1152   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
1153 }
1154 
1155 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
1156   if (IsAtEnd()) {
1157     if (afterAmpersand && prescannerNesting_ > 0) {
1158       // A continuation marker at the end of the last line in an
1159       // include file inhibits the newline for that line.
1160       SkipToEndOfLine();
1161       omitNewline_ = true;
1162     }
1163   } else if (inPreprocessorDirective_) {
1164   } else {
1165     auto lineClass{ClassifyLine(nextLine_)};
1166     if (lineClass.kind == LineClassification::Kind::Comment) {
1167       NextLine();
1168       return true;
1169     } else if (lineClass.kind ==
1170             LineClassification::Kind::ConditionalCompilationDirective ||
1171         lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
1172       // Allow conditional compilation directives (e.g., #ifdef) to affect
1173       // continuation lines.
1174       // Allow other preprocessor directives, too, except #include
1175       // (when it does not follow '&'), #define, and #undef (because
1176       // they cannot be allowed to affect preceding text on a
1177       // continued line).
1178       preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
1179       return true;
1180     } else if (afterAmpersand &&
1181         (lineClass.kind == LineClassification::Kind::DefinitionDirective ||
1182             lineClass.kind == LineClassification::Kind::IncludeDirective ||
1183             lineClass.kind == LineClassification::Kind::IncludeLine)) {
1184       SkipToEndOfLine();
1185       omitNewline_ = true;
1186       skipLeadingAmpersand_ = true;
1187     }
1188   }
1189   return false;
1190 }
1191 
1192 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
1193   if (IsAtEnd()) {
1194     return nullptr;
1195   }
1196   tabInCurrentLine_ = false;
1197   char col1{*nextLine_};
1198   if (IsFixedFormCommentChar(col1)) {
1199     int j{1};
1200     if (InCompilerDirective()) {
1201       // Must be a continued compiler directive.
1202       for (; j < 5; ++j) {
1203         char ch{directiveSentinel_[j - 1]};
1204         if (ch == '\0') {
1205           break;
1206         }
1207         if (ch != ToLowerCaseLetter(nextLine_[j])) {
1208           return nullptr;
1209         }
1210       }
1211     } else if (features_.IsEnabled(LanguageFeature::OpenMP)) {
1212       // Fixed Source Form Conditional Compilation Sentinels.
1213       if (nextLine_[1] != '$') {
1214         return nullptr;
1215       }
1216       j++;
1217     } else {
1218       return nullptr;
1219     }
1220     for (; j < 5; ++j) {
1221       if (nextLine_[j] != ' ') {
1222         return nullptr;
1223       }
1224     }
1225     const char *col6{nextLine_ + 5};
1226     if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
1227       if (mightNeedSpace && !IsSpace(nextLine_ + 6)) {
1228         insertASpace_ = true;
1229       }
1230       return nextLine_ + 6;
1231     }
1232     return nullptr;
1233   } else {
1234     // Normal case: not in a compiler directive.
1235     if (col1 == '&' &&
1236         features_.IsEnabled(
1237             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
1238       // Extension: '&' as continuation marker
1239       if (features_.ShouldWarn(
1240               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
1241         Say(LanguageFeature::FixedFormContinuationWithColumn1Ampersand,
1242             GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
1243       }
1244       return nextLine_ + 1;
1245     }
1246     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
1247       tabInCurrentLine_ = true;
1248       return nextLine_ + 2; // VAX extension
1249     }
1250     if ((col1 == ' ' ||
1251             ((col1 == 'D' || col1 == 'd') &&
1252                 features_.IsEnabled(LanguageFeature::OldDebugLines))) &&
1253         nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' &&
1254         nextLine_[4] == ' ') {
1255       const char *col6{nextLine_ + 5};
1256       if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
1257         if ((*col6 == 'i' || *col6 == 'I') && IsIncludeLine(nextLine_)) {
1258           // It's An INCLUDE line, not a continuation
1259         } else {
1260           return nextLine_ + 6;
1261         }
1262       }
1263     }
1264     if (IsImplicitContinuation()) {
1265       return nextLine_;
1266     }
1267   }
1268   return nullptr; // not a continuation line
1269 }
1270 
1271 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
1272   const char *p{nextLine_};
1273   if (p >= limit_) {
1274     return nullptr;
1275   }
1276   p = SkipWhiteSpace(p);
1277   if (InCompilerDirective()) {
1278     if (*p++ != '!') {
1279       return nullptr;
1280     }
1281     for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
1282       if (*s != ToLowerCaseLetter(*p)) {
1283         return nullptr;
1284       }
1285     }
1286     p = SkipWhiteSpace(p);
1287     if (*p == '&') {
1288       if (!ampersand) {
1289         insertASpace_ = true;
1290       }
1291       return p + 1;
1292     } else if (ampersand) {
1293       return p;
1294     } else {
1295       return nullptr;
1296     }
1297   } else {
1298     if (*p == '&') {
1299       return p + 1;
1300     } else if (*p == '!' || *p == '\n' || *p == '#') {
1301       return nullptr;
1302     } else if (ampersand || IsImplicitContinuation()) {
1303       if (continuationInCharLiteral_) {
1304         // 'a'&            -> 'a''b' == "a'b"
1305         //   'b'
1306         if (features_.ShouldWarn(
1307                 common::LanguageFeature::MiscSourceExtensions)) {
1308           Say(common::LanguageFeature::MiscSourceExtensions,
1309               GetProvenanceRange(p, p + 1),
1310               "Character literal continuation line should have been preceded by '&'"_port_en_US);
1311         }
1312       } else if (p > nextLine_) {
1313         --p;
1314       } else {
1315         insertASpace_ = true;
1316       }
1317       return p;
1318     } else {
1319       return nullptr;
1320     }
1321   }
1322 }
1323 
1324 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
1325   // N.B. We accept '&' as a continuation indicator in fixed form, too,
1326   // but not in a character literal.
1327   if (*at_ == '&' && inCharLiteral_) {
1328     return false;
1329   }
1330   do {
1331     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
1332       BeginSourceLine(cont);
1333       column_ = 7;
1334       NextLine();
1335       return true;
1336     }
1337   } while (SkipCommentLine(false /* not after ampersand */));
1338   return false;
1339 }
1340 
1341 bool Prescanner::FreeFormContinuation() {
1342   const char *p{at_};
1343   bool ampersand{*p == '&'};
1344   if (ampersand) {
1345     p = SkipWhiteSpace(p + 1);
1346   }
1347   if (*p != '\n') {
1348     if (inCharLiteral_) {
1349       return false;
1350     } else if (*p == '!') { // & ! comment - ok
1351     } else if (ampersand && isPossibleMacroCall_ && (*p == ',' || *p == ')')) {
1352       return false; // allow & at end of a macro argument
1353     } else if (features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
1354       Say(LanguageFeature::CruftAfterAmpersand, GetProvenance(p),
1355           "missing ! before comment after &"_warn_en_US);
1356     }
1357   }
1358   do {
1359     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
1360       BeginSourceLine(cont);
1361       NextLine();
1362       return true;
1363     }
1364   } while (SkipCommentLine(ampersand));
1365   return false;
1366 }
1367 
1368 // Implicit line continuation allows a preprocessor macro call with
1369 // arguments to span multiple lines.
1370 bool Prescanner::IsImplicitContinuation() const {
1371   return !inPreprocessorDirective_ && !inCharLiteral_ && isPossibleMacroCall_ &&
1372       parenthesisNesting_ > 0 && !IsAtEnd() &&
1373       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
1374 }
1375 
1376 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1377   if (disableSourceContinuation_) {
1378     return false;
1379   } else if (*at_ == '\n' || *at_ == '&') {
1380     if (inFixedForm_) {
1381       return FixedFormContinuation(mightNeedFixedFormSpace);
1382     } else {
1383       return FreeFormContinuation();
1384     }
1385   } else if (*at_ == '\\' && at_ + 2 == nextLine_ &&
1386       backslashFreeFormContinuation_ && !inFixedForm_ && nextLine_ < limit_) {
1387     // cpp-like handling of \ at end of a free form source line
1388     BeginSourceLine(nextLine_);
1389     NextLine();
1390     return true;
1391   } else {
1392     return false;
1393   }
1394 }
1395 
1396 std::optional<Prescanner::LineClassification>
1397 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1398   const char *p{start};
1399   char col1{*p++};
1400   if (!IsFixedFormCommentChar(col1)) {
1401     return std::nullopt;
1402   }
1403   char sentinel[5], *sp{sentinel};
1404   int column{2};
1405   for (; column < 6; ++column, ++p) {
1406     if (*p == '\n' || IsSpaceOrTab(p)) {
1407       break;
1408     }
1409     if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1410       // OpenMP conditional compilation line: leave the label alone
1411       break;
1412     }
1413     *sp++ = ToLowerCaseLetter(*p);
1414   }
1415   if (column == 6) {
1416     if (*p == '0') {
1417       ++p;
1418     } else if (int n{IsSpaceOrTab(p)}) {
1419       p += n;
1420     } else {
1421       // This is a Continuation line, not an initial directive line.
1422       return std::nullopt;
1423     }
1424   }
1425   if (sp == sentinel) {
1426     return std::nullopt;
1427   }
1428   *sp = '\0';
1429   if (const char *ss{IsCompilerDirectiveSentinel(
1430           sentinel, static_cast<std::size_t>(sp - sentinel))}) {
1431     std::size_t payloadOffset = p - start;
1432     return {LineClassification{
1433         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1434   }
1435   return std::nullopt;
1436 }
1437 
1438 std::optional<Prescanner::LineClassification>
1439 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1440   if (const char *p{SkipWhiteSpace(start)}; p && *p++ == '!') {
1441     if (auto maybePair{IsCompilerDirectiveSentinel(p)}) {
1442       auto offset{static_cast<std::size_t>(maybePair->second - start)};
1443       return {LineClassification{LineClassification::Kind::CompilerDirective,
1444           offset, maybePair->first}};
1445     }
1446   }
1447   return std::nullopt;
1448 }
1449 
1450 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1451   std::uint64_t packed{0};
1452   for (char ch : dir) {
1453     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1454   }
1455   compilerDirectiveBloomFilter_.set(packed % prime1);
1456   compilerDirectiveBloomFilter_.set(packed % prime2);
1457   compilerDirectiveSentinels_.insert(dir);
1458   return *this;
1459 }
1460 
1461 const char *Prescanner::IsCompilerDirectiveSentinel(
1462     const char *sentinel, std::size_t len) const {
1463   std::uint64_t packed{0};
1464   for (std::size_t j{0}; j < len; ++j) {
1465     packed = (packed << 8) | (sentinel[j] & 0xff);
1466   }
1467   if (len == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1468       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1469     return nullptr;
1470   }
1471   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, len))};
1472   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1473 }
1474 
1475 const char *Prescanner::IsCompilerDirectiveSentinel(CharBlock token) const {
1476   const char *p{token.begin()};
1477   const char *end{p + token.size()};
1478   while (p < end && (*p == ' ' || *p == '\n')) {
1479     ++p;
1480   }
1481   if (p < end && *p == '!') {
1482     ++p;
1483   }
1484   while (end > p && (end[-1] == ' ' || end[-1] == '\t')) {
1485     --end;
1486   }
1487   return end > p && IsCompilerDirectiveSentinel(p, end - p) ? p : nullptr;
1488 }
1489 
1490 std::optional<std::pair<const char *, const char *>>
1491 Prescanner::IsCompilerDirectiveSentinel(const char *p) const {
1492   char sentinel[8];
1493   for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) {
1494     if (int n{*p == '&' ? 1 : IsSpaceOrTab(p)}) {
1495       if (j > 0) {
1496         sentinel[j] = '\0';
1497         p = SkipWhiteSpace(p + n);
1498         if (*p != '!') {
1499           if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) {
1500             return std::make_pair(sp, p);
1501           }
1502         }
1503       }
1504       break;
1505     } else {
1506       sentinel[j] = ToLowerCaseLetter(*p);
1507     }
1508   }
1509   return std::nullopt;
1510 }
1511 
1512 constexpr bool IsDirective(const char *match, const char *dir) {
1513   for (; *match; ++match) {
1514     if (*match != ToLowerCaseLetter(*dir++)) {
1515       return false;
1516     }
1517   }
1518   return true;
1519 }
1520 
1521 Prescanner::LineClassification Prescanner::ClassifyLine(
1522     const char *start) const {
1523   if (inFixedForm_) {
1524     if (std::optional<LineClassification> lc{
1525             IsFixedFormCompilerDirectiveLine(start)}) {
1526       return std::move(*lc);
1527     }
1528     if (IsFixedFormCommentLine(start)) {
1529       return {LineClassification::Kind::Comment};
1530     }
1531   } else {
1532     if (std::optional<LineClassification> lc{
1533             IsFreeFormCompilerDirectiveLine(start)}) {
1534       return std::move(*lc);
1535     }
1536     if (const char *bang{IsFreeFormComment(start)}) {
1537       return {LineClassification::Kind::Comment,
1538           static_cast<std::size_t>(bang - start)};
1539     }
1540   }
1541   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1542     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1543   }
1544   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1545     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1546         IsDirective("else", dir) || IsDirective("endif", dir)) {
1547       return {LineClassification::Kind::ConditionalCompilationDirective};
1548     } else if (IsDirective("include", dir)) {
1549       return {LineClassification::Kind::IncludeDirective};
1550     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1551       return {LineClassification::Kind::DefinitionDirective};
1552     } else {
1553       return {LineClassification::Kind::PreprocessorDirective};
1554     }
1555   }
1556   return {LineClassification::Kind::Source};
1557 }
1558 
1559 Prescanner::LineClassification Prescanner::ClassifyLine(
1560     TokenSequence &tokens, Provenance newlineProvenance) const {
1561   // Append a newline temporarily.
1562   tokens.PutNextTokenChar('\n', newlineProvenance);
1563   tokens.CloseToken();
1564   const char *ppd{tokens.ToCharBlock().begin()};
1565   LineClassification classification{ClassifyLine(ppd)};
1566   tokens.pop_back(); // remove the newline
1567   return classification;
1568 }
1569 
1570 void Prescanner::SourceFormChange(std::string &&dir) {
1571   if (dir == "!dir$ free") {
1572     inFixedForm_ = false;
1573   } else if (dir == "!dir$ fixed") {
1574     inFixedForm_ = true;
1575   }
1576 }
1577 
1578 // Acquire and append compiler directive continuation lines to
1579 // the tokens that constitute a compiler directive, even when those
1580 // directive continuation lines are the result of macro expansion.
1581 // (Not used when neither the original compiler directive line nor
1582 // the directive continuation line result from preprocessing; regular
1583 // line continuation during tokenization handles that normal case.)
1584 bool Prescanner::CompilerDirectiveContinuation(
1585     TokenSequence &tokens, const char *origSentinel) {
1586   if (inFixedForm_ || tokens.empty() ||
1587       tokens.TokenAt(tokens.SizeInTokens() - 1) != "&") {
1588     return false;
1589   }
1590   LineClassification followingLine{ClassifyLine(nextLine_)};
1591   if (followingLine.kind == LineClassification::Kind::Comment) {
1592     nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
1593     NextLine();
1594     return true;
1595   }
1596   CHECK(origSentinel != nullptr);
1597   directiveSentinel_ = origSentinel; // so InCompilerDirective() is true
1598   const char *nextContinuation{
1599       followingLine.kind == LineClassification::Kind::CompilerDirective
1600           ? FreeFormContinuationLine(true)
1601           : nullptr};
1602   if (!nextContinuation &&
1603       followingLine.kind != LineClassification::Kind::Source) {
1604     return false;
1605   }
1606   auto origNextLine{nextLine_};
1607   BeginSourceLine(nextLine_);
1608   NextLine();
1609   if (nextContinuation) {
1610     // What follows is !DIR$ & xxx; skip over the & so that it
1611     // doesn't cause a spurious continuation.
1612     at_ = nextContinuation;
1613   } else {
1614     // What follows looks like a source line before macro expansion,
1615     // but might become a directive continuation afterwards.
1616     SkipSpaces();
1617   }
1618   TokenSequence followingTokens;
1619   while (NextToken(followingTokens)) {
1620   }
1621   if (auto followingPrepro{
1622           preprocessor_.MacroReplacement(followingTokens, *this)}) {
1623     followingTokens = std::move(*followingPrepro);
1624   }
1625   followingTokens.RemoveRedundantBlanks();
1626   std::size_t startAt{0};
1627   std::size_t following{followingTokens.SizeInTokens()};
1628   bool ok{false};
1629   if (nextContinuation) {
1630     ok = true;
1631   } else {
1632     startAt = 2;
1633     if (startAt < following && followingTokens.TokenAt(0) == "!") {
1634       CharBlock sentinel{followingTokens.TokenAt(1)};
1635       if (!sentinel.empty() &&
1636           std::memcmp(sentinel.begin(), origSentinel, sentinel.size()) == 0) {
1637         ok = true;
1638         while (
1639             startAt < following && followingTokens.TokenAt(startAt).IsBlank()) {
1640           ++startAt;
1641         }
1642         if (startAt < following && followingTokens.TokenAt(startAt) == "&") {
1643           ++startAt;
1644         }
1645       }
1646     }
1647   }
1648   if (ok) {
1649     tokens.pop_back(); // delete original '&'
1650     tokens.Put(followingTokens, startAt, following - startAt);
1651     tokens.RemoveRedundantBlanks();
1652   } else {
1653     nextLine_ = origNextLine;
1654   }
1655   return ok;
1656 }
1657 
1658 // Similar, but for source line continuation after macro replacement.
1659 bool Prescanner::SourceLineContinuation(TokenSequence &tokens) {
1660   if (!inFixedForm_ && !tokens.empty() &&
1661       tokens.TokenAt(tokens.SizeInTokens() - 1) == "&") {
1662     LineClassification followingLine{ClassifyLine(nextLine_)};
1663     if (followingLine.kind == LineClassification::Kind::Comment) {
1664       nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
1665       NextLine();
1666       return true;
1667     } else if (const char *nextContinuation{FreeFormContinuationLine(true)}) {
1668       BeginSourceLine(nextLine_);
1669       NextLine();
1670       TokenSequence followingTokens;
1671       at_ = nextContinuation;
1672       while (NextToken(followingTokens)) {
1673       }
1674       if (auto followingPrepro{
1675               preprocessor_.MacroReplacement(followingTokens, *this)}) {
1676         followingTokens = std::move(*followingPrepro);
1677       }
1678       followingTokens.RemoveRedundantBlanks();
1679       tokens.pop_back(); // delete original '&'
1680       tokens.Put(followingTokens);
1681       return true;
1682     }
1683   }
1684   return false;
1685 }
1686 } // namespace Fortran::parser
1687