xref: /llvm-project/flang/lib/Parser/prescan.cpp (revision 1324789a65665c27eda9e04bc93db81cc859924c)
1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "flang/Common/idioms.h"
11 #include "flang/Parser/characters.h"
12 #include "flang/Parser/message.h"
13 #include "flang/Parser/preprocessor.h"
14 #include "flang/Parser/source.h"
15 #include "flang/Parser/token-sequence.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Upper bound on the nesting depth of prescanners.  Prescan() emits the
// error "too many nested INCLUDE/#include files, possibly circular" and
// gives up once prescannerNesting_ exceeds this value.
26 static constexpr int maxPrescannerNesting{100};
27 
// Constructs a prescanner from its collaborators.  The set of sources is
// obtained from the preprocessor, the encoding is taken from those sources,
// and backslashFreeFormContinuation_ is initialized from
// preprocessor.AnyDefinitions().
28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29     Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30     : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31       allSources_{preprocessor_.allSources()}, features_{lfc},
32       backslashFreeFormContinuation_{preprocessor.AnyDefinitions()},
33       encoding_{allSources_.encoding()} {}
34 
// Constructs a nested prescanner from an existing one.  The new instance
// refers to the same messages, cooked source, preprocessor, and sources as
// `that`; copies its language features, source form settings, encoding,
// skipLeadingAmpersand_ flag, and compiler directive sentinel tables; and
// records a nesting depth one greater than that of `that`.
// isNestedInIncludeDirective is stored as given.
35 Prescanner::Prescanner(const Prescanner &that, bool isNestedInIncludeDirective)
36     : messages_{that.messages_}, cooked_{that.cooked_},
37       preprocessor_{that.preprocessor_}, allSources_{that.allSources_},
38       features_{that.features_},
39       isNestedInIncludeDirective_{isNestedInIncludeDirective},
40       backslashFreeFormContinuation_{that.backslashFreeFormContinuation_},
41       inFixedForm_{that.inFixedForm_},
42       fixedFormColumnLimit_{that.fixedFormColumnLimit_},
43       encoding_{that.encoding_},
44       prescannerNesting_{that.prescannerNesting_ + 1},
45       skipLeadingAmpersand_{that.skipLeadingAmpersand_},
46       compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
47       compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
48 
// Tests whether p points at a blank.  The result is the number of bytes
// occupied by that blank (i.e., how many bytes the caller should skip),
// or zero when p does not point at a blank.
static inline int IsSpace(const char *p) {
  switch (p[0]) {
  case ' ':
  case '\xa0': // LATIN-1 NBSP non-breaking space
    return 1;
  case '\xc2': // UTF-8 NBSP, but only when followed by 0xA0
    return p[1] == '\xa0' ? 2 : 0;
  default:
    return 0;
  }
}
61 
62 static inline int IsSpaceOrTab(const char *p) {
63   return *p == '\t' ? 1 : IsSpace(p);
64 }
65 
// True exactly for the characters '!', '*', 'C', and 'c'.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
69 
70 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
71   char *p{dir.GetMutableCharData()};
72   char *limit{p + dir.SizeInChars()};
73   for (; p < limit; ++p) {
74     if (*p != ' ') {
75       CHECK(IsFixedFormCommentChar(*p));
76       *p = '!';
77       return;
78     }
79   }
80   DIE("compiler directive all blank");
81 }
82 
83 void Prescanner::Prescan(ProvenanceRange range) {
84   startProvenance_ = range.start();
85   start_ = allSources_.GetSource(range);
86   CHECK(start_);
87   limit_ = start_ + range.size();
88   nextLine_ = start_;
89   const bool beganInFixedForm{inFixedForm_};
90   if (prescannerNesting_ > maxPrescannerNesting) {
91     Say(GetProvenance(start_),
92         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
93     return;
94   }
95   while (!IsAtEnd()) {
96     Statement();
97   }
98   if (inFixedForm_ != beganInFixedForm) {
99     std::string dir{"!dir$ "};
100     if (beganInFixedForm) {
101       dir += "fixed";
102     } else {
103       dir += "free";
104     }
105     dir += '\n';
106     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
107     tokens.Emit(cooked_);
108   }
109 }
110 
// Prescans one statement starting at nextLine_.  The first line is
// classified; comments, INCLUDE lines, and preprocessing directives are
// handled immediately and return early.  Compiler directive lines and
// ordinary source lines are tokenized, subjected to macro replacement, and
// (after reclassification if any replacement took place) emitted to the
// cooked source via CheckAndEmitLine().
111 void Prescanner::Statement() {
112   TokenSequence tokens;
113   const char *statementStart{nextLine_};
114   LineClassification line{ClassifyLine(statementStart)};
115   switch (line.kind) {
116   case LineClassification::Kind::Comment:
117     nextLine_ += line.payloadOffset; // advance to '!' or newline
118     NextLine();
119     return;
120   case LineClassification::Kind::IncludeLine:
121     FortranInclude(nextLine_ + line.payloadOffset);
122     NextLine();
123     return;
124   case LineClassification::Kind::ConditionalCompilationDirective:
125   case LineClassification::Kind::IncludeDirective:
126     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
127     afterPreprocessingDirective_ = true;
128     skipLeadingAmpersand_ |= !inFixedForm_;
129     return;
130   case LineClassification::Kind::PreprocessorDirective:
131     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
132     afterPreprocessingDirective_ = true;
133     // Don't set skipLeadingAmpersand_
134     return;
135   case LineClassification::Kind::DefinitionDirective:
136     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
137     // Don't set afterPreprocessingDirective_ or skipLeadingAmpersand_
138     return;
139   case LineClassification::Kind::CompilerDirective: {
140     directiveSentinel_ = line.sentinel;
141     CHECK(InCompilerDirective());
142     BeginStatementAndAdvance();
143     if (inFixedForm_) {
144       CHECK(IsFixedFormCommentChar(*at_));
145     } else {
          // Free form: the '!' may be preceded by blanks and tabs.
146       while (int n{IsSpaceOrTab(at_)}) {
147         at_ += n, ++column_;
148       }
149       CHECK(*at_ == '!');
150     }
        // When set, condOffset is the number of characters (comment
        // character plus sentinel) to skip at the start of a conditional
        // compilation line.
151     std::optional<int> condOffset;
152     if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
153       // OpenMP conditional compilation line.
154       condOffset = 2;
155     } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'c' &&
156         directiveSentinel_[2] == 'u' && directiveSentinel_[3] == 'f' &&
157         directiveSentinel_[4] == '\0') {
158       // CUDA conditional compilation line.
159       condOffset = 5;
160     }
161     if (condOffset) {
          // Skip the sentinel; what follows is either an INCLUDE line or
          // text that is scanned like the start of a source line.
162       at_ += *condOffset, column_ += *condOffset;
163       if (auto payload{IsIncludeLine(at_)}) {
164         FortranInclude(at_ + *payload);
165         return;
166       } else if (inFixedForm_) {
167         LabelField(tokens);
168       } else {
169         SkipSpaces();
170       }
171     } else {
172       // Compiler directive.  Emit normalized sentinel, squash following spaces.
173       EmitChar(tokens, '!');
174       ++at_, ++column_;
175       for (const char *sp{directiveSentinel_}; *sp != '\0';
176            ++sp, ++at_, ++column_) {
177         EmitChar(tokens, *sp);
178       }
179       if (IsSpaceOrTab(at_)) {
180         EmitChar(tokens, ' ');
181         while (int n{IsSpaceOrTab(at_)}) {
182           at_ += n, ++column_;
183         }
184       }
185       tokens.CloseToken();
186     }
187     break;
188   }
189   case LineClassification::Kind::Source:
190     BeginStatementAndAdvance();
191     if (inFixedForm_) {
          // With the OldDebugLines feature enabled, a leading 'D' or 'd'
          // is skipped before the label field is processed.
192       if (features_.IsEnabled(LanguageFeature::OldDebugLines) &&
193           (*at_ == 'D' || *at_ == 'd')) {
194         NextChar();
195       }
196       LabelField(tokens);
197     } else {
198       if (skipLeadingAmpersand_) {
            // One-shot flag: step over a leading '&' (after optional white
            // space) on this free form line, if there is one.
199         skipLeadingAmpersand_ = false;
200         const char *p{SkipWhiteSpace(at_)};
201         if (p < limit_ && *p == '&') {
202           column_ += ++p - at_;
203           at_ = p;
204         }
205       } else {
206         SkipSpaces();
207       }
208       // Check for a leading identifier that might be a keyword macro
209       // that will expand to anything indicating a non-source line, like
210       // a comment marker or directive sentinel.  If so, disable line
211       // continuation, so that NextToken() won't consume anything from
212       // following lines.
213       if (IsLegalIdentifierStart(*at_)) {
214         // TODO: Only bother with these cases when any keyword macro has
215         // been defined with replacement text that could begin a comment
216         // or directive sentinel.
217         const char *p{at_};
218         while (IsLegalInIdentifier(*++p)) {
219         }
220         CharBlock id{at_, static_cast<std::size_t>(p - at_)};
221         if (preprocessor_.IsNameDefined(id) &&
222             !preprocessor_.IsFunctionLikeDefinition(id)) {
223           TokenSequence toks;
224           toks.Put(id, GetProvenance(at_));
225           if (auto replaced{preprocessor_.MacroReplacement(toks, *this)}) {
226             auto newLineClass{ClassifyLine(*replaced, GetCurrentProvenance())};
227             if (newLineClass.kind ==
228                 LineClassification::Kind::CompilerDirective) {
229               directiveSentinel_ = newLineClass.sentinel;
230               disableSourceContinuation_ = false;
231             } else {
232               disableSourceContinuation_ =
233                   newLineClass.kind != LineClassification::Kind::Source;
234             }
235           }
236         }
237       }
238     }
239     break;
240   }
241 
      // Collect tokens until NextToken() reports the end of the statement.
242   while (NextToken(tokens)) {
243   }
244   if (continuationLines_ > 255) {
245     if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
246       Say(GetProvenance(statementStart),
247           "%d continuation lines is more than the Fortran standard allows"_port_en_US,
248           continuationLines_);
249     }
250   }
251 
252   Provenance newlineProvenance{GetCurrentProvenance()};
253   if (std::optional<TokenSequence> preprocessed{
254           preprocessor_.MacroReplacement(tokens, *this)}) {
255     // Reprocess the preprocessed line.
256     LineClassification ppl{ClassifyLine(*preprocessed, newlineProvenance)};
257     switch (ppl.kind) {
258     case LineClassification::Kind::Comment:
259       break;
260     case LineClassification::Kind::IncludeLine:
261       FortranInclude(preprocessed->TokenAt(0).begin() + ppl.payloadOffset);
262       break;
263     case LineClassification::Kind::ConditionalCompilationDirective:
264     case LineClassification::Kind::IncludeDirective:
265     case LineClassification::Kind::DefinitionDirective:
266     case LineClassification::Kind::PreprocessorDirective:
          // The replaced text is not acted upon as a directive; it is
          // emitted as a line after an optional warning.
267       if (features_.ShouldWarn(common::UsageWarning::Preprocessing)) {
268         Say(preprocessed->GetProvenanceRange(),
269             "Preprocessed line resembles a preprocessor directive"_warn_en_US);
270       }
271       CheckAndEmitLine(preprocessed->ToLowerCase(), newlineProvenance);
272       break;
273     case LineClassification::Kind::CompilerDirective:
274       if (preprocessed->HasRedundantBlanks()) {
275         preprocessed->RemoveRedundantBlanks();
276       }
277       while (CompilerDirectiveContinuation(*preprocessed, ppl.sentinel)) {
278         newlineProvenance = GetCurrentProvenance();
279       }
280       NormalizeCompilerDirectiveCommentMarker(*preprocessed);
281       preprocessed->ToLowerCase();
282       SourceFormChange(preprocessed->ToString());
283       CheckAndEmitLine(preprocessed->ToLowerCase().ClipComment(
284                            *this, true /* skip first ! */),
285           newlineProvenance);
286       break;
287     case LineClassification::Kind::Source:
288       if (inFixedForm_) {
289         if (preprocessed->HasBlanks(/*after column*/ 6)) {
290           preprocessed->RemoveBlanks(/*after column*/ 6);
291         }
292       } else {
293         while (SourceLineContinuation(*preprocessed)) {
294           newlineProvenance = GetCurrentProvenance();
295         }
296         if (preprocessed->HasRedundantBlanks()) {
297           preprocessed->RemoveRedundantBlanks();
298         }
299       }
300       CheckAndEmitLine(
301           preprocessed->ToLowerCase().ClipComment(*this), newlineProvenance);
302       break;
303     }
304   } else { // no macro replacement
305     if (line.kind == LineClassification::Kind::CompilerDirective) {
306       while (CompilerDirectiveContinuation(tokens, line.sentinel)) {
307         newlineProvenance = GetCurrentProvenance();
308       }
309       tokens.ToLowerCase();
310       SourceFormChange(tokens.ToString());
311     } else { // Kind::Source
312       tokens.ToLowerCase();
313       if (inFixedForm_) {
314         EnforceStupidEndStatementRules(tokens);
315       }
316     }
317     CheckAndEmitLine(tokens, newlineProvenance);
318   }
      // The directive sentinel pertains only to the statement just scanned.
319   directiveSentinel_ = nullptr;
320 }
321 
322 void Prescanner::CheckAndEmitLine(
323     TokenSequence &tokens, Provenance newlineProvenance) {
324   tokens.CheckBadFortranCharacters(
325       messages_, *this, disableSourceContinuation_);
326   // Parenthesis nesting check does not apply while any #include is
327   // active, nor on the lines before and after a top-level #include,
328   // nor before or after conditional source.
329   // Applications play shenanigans with line continuation before and
330   // after #include'd subprogram argument lists and conditional source.
331   if (!isNestedInIncludeDirective_ && !omitNewline_ &&
332       !afterPreprocessingDirective_ && tokens.BadlyNestedParentheses() &&
333       !preprocessor_.InConditional()) {
334     if (nextLine_ < limit_ && IsPreprocessorDirectiveLine(nextLine_)) {
335       // don't complain
336     } else {
337       tokens.CheckBadParentheses(messages_);
338     }
339   }
340   tokens.Emit(cooked_);
341   if (omitNewline_) {
342     omitNewline_ = false;
343   } else {
344     cooked_.Put('\n', newlineProvenance);
345     afterPreprocessingDirective_ = false;
346   }
347 }
348 
349 TokenSequence Prescanner::TokenizePreprocessorDirective() {
350   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
351   inPreprocessorDirective_ = true;
352   BeginStatementAndAdvance();
353   TokenSequence tokens;
354   while (NextToken(tokens)) {
355   }
356   inPreprocessorDirective_ = false;
357   return tokens;
358 }
359 
360 void Prescanner::NextLine() {
361   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
362   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
363   if (!v) {
364     nextLine_ = limit_;
365   } else {
366     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
367     nextLine_ = nl + 1;
368   }
369 }
370 
// Scans the label field (columns 1 through 6) of a fixed form line,
// emitting its non-blank characters into `token`.  A tab ends the field
// and moves the scan to column 7.  A character that is not a decimal
// digit, or any non-blank character other than '0' in column 6, is
// recorded as a "bad column" and is either tolerated as a possible
// continuation line or diagnosed with a warning.
371 void Prescanner::LabelField(TokenSequence &token) {
      // outCol counts emitted characters plus one; 1 means nothing emitted.
372   int outCol{1};
373   const char *start{at_};
374   std::optional<int> badColumn;
375   for (; *at_ != '\n' && column_ <= 6; ++at_) {
376     if (*at_ == '\t') {
377       ++at_;
378       column_ = 7;
379       break;
380     }
381     if (int n{IsSpace(at_)}; n == 0 &&
382         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
383       EmitChar(token, *at_);
384       ++outCol;
          // Only the first offending column is remembered.
385       if (!badColumn && (column_ == 6 || !IsDecimalDigit(*at_))) {
386         badColumn = column_;
387       }
388     }
389     ++column_;
390   }
      // A bad label field is not diagnosed when its text is the name of a
      // defined macro.
391   if (badColumn && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
392     if ((prescannerNesting_ > 0 && *badColumn == 6 &&
393             cooked_.BufferedBytes() == firstCookedCharacterOffset_) ||
394         afterPreprocessingDirective_) {
395       // This is the first source line in #include'd text or conditional
396       // code under #if, or the first source line after such.
397       // If it turns out that the preprocessed text begins with a
398       // fixed form continuation line, the newline at the end
399       // of the latest source line beforehand will be deleted in
400       // CookedSource::Marshal().
401       cooked_.MarkPossibleFixedFormContinuation();
402     } else if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
403       Say(GetProvenance(start + *badColumn - 1),
404           *badColumn == 6
405               ? "Statement should not begin with a continuation line"_warn_en_US
406               : "Character in fixed-form label field must be a digit"_warn_en_US);
407     }
408     token.clear();
        // A bad character before column 6: discard what was scanned and
        // restart from the beginning of the line.
409     if (*badColumn < 6) {
410       at_ = start;
411       column_ = 1;
412       return;
413     }
414     outCol = 1;
415   }
416   if (outCol == 1) { // empty label field
417     // Emit a space so that, if the line is rescanned after preprocessing,
418     // a leading 'C' or 'D' won't be left-justified and then accidentally
419     // misinterpreted as a comment card.
420     EmitChar(token, ' ');
421     ++outCol;
422   }
423   token.CloseToken();
424   SkipToNextSignificantCharacter();
      // A digit immediately after the label field draws a portability
      // warning.
425   if (IsDecimalDigit(*at_)) {
426     if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
427       Say(GetCurrentProvenance(),
428           "Label digit is not in fixed-form label field"_port_en_US);
429     }
430   }
431 }
432 
433 // 6.3.3.5: A program unit END statement, or any other statement whose
434 // initial line resembles an END statement, shall not be continued in
435 // fixed form source.
436 void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
437   CharBlock cBlock{tokens.ToCharBlock()};
438   const char *str{cBlock.begin()};
439   std::size_t n{cBlock.size()};
440   if (n < 3) {
441     return;
442   }
443   std::size_t j{0};
444   for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
445   }
446   if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
447     return;
448   }
449   // It starts with END, possibly after a label.
450   auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
451   auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
452   if (!start || !end) {
453     return;
454   }
455   if (&*start->sourceFile == &*end->sourceFile && start->line == end->line) {
456     return; // no continuation
457   }
458   j += 3;
459   static const char *const prefixes[]{"program", "subroutine", "function",
460       "blockdata", "module", "submodule", nullptr};
461   bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
462   std::size_t endOfPrefix{j - 1};
463   for (const char *const *p{prefixes}; *p; ++p) {
464     std::size_t pLen{std::strlen(*p)};
465     if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
466       isPrefix = true; // END thing as prefix
467       j += pLen;
468       endOfPrefix = j - 1;
469       for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
470       }
471       break;
472     }
473   }
474   if (isPrefix) {
475     auto range{tokens.GetTokenProvenanceRange(1)};
476     if (j == n) { // END or END thing [name]
477       Say(range,
478           "Program unit END statement may not be continued in fixed form source"_err_en_US);
479     } else {
480       auto endOfPrefixPos{
481           allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
482       auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
483       if (endOfPrefixPos && next &&
484           &*endOfPrefixPos->sourceFile == &*start->sourceFile &&
485           endOfPrefixPos->line == start->line &&
486           (&*next->sourceFile != &*start->sourceFile ||
487               next->line != start->line)) {
488         Say(range,
489             "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
490       }
491     }
492   }
493 }
494 
495 void Prescanner::SkipToEndOfLine() {
496   while (*at_ != '\n') {
497     ++at_, ++column_;
498   }
499 }
500 
501 bool Prescanner::MustSkipToEndOfLine() const {
502   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
503     return true; // skip over ignored columns in right margin (73:80)
504   } else if (*at_ == '!' && !inCharLiteral_) {
505     return !IsCompilerDirectiveSentinel(at_);
506   } else {
507     return false;
508   }
509 }
510 
511 void Prescanner::NextChar() {
512   CHECK(*at_ != '\n');
513   int n{IsSpace(at_)};
514   at_ += n ? n : 1;
515   ++column_;
516   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
517     // UTF-8 byte order mark - treat this file as UTF-8
518     at_ += 3;
519     encoding_ = Encoding::UTF_8;
520   }
521   SkipToNextSignificantCharacter();
522 }
523 
524 // Skip everything that should be ignored until the next significant
525 // character is reached; handles C-style comments in preprocessing
526 // directives, Fortran ! comments, stuff after the right margin in
527 // fixed form, and all forms of line continuation.
528 bool Prescanner::SkipToNextSignificantCharacter() {
529   auto anyContinuationLine{false};
530   if (inPreprocessorDirective_) {
531     SkipCComments();
532   } else {
533     bool mightNeedSpace{false};
534     if (MustSkipToEndOfLine()) {
535       SkipToEndOfLine();
536     } else {
537       mightNeedSpace = *at_ == '\n';
538     }
539     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
540       anyContinuationLine = true;
541       ++continuationLines_;
542       if (MustSkipToEndOfLine()) {
543         SkipToEndOfLine();
544       }
545     }
546     if (*at_ == '\t') {
547       tabInCurrentLine_ = true;
548     }
549   }
550   return anyContinuationLine;
551 }
552 
553 void Prescanner::SkipCComments() {
554   while (true) {
555     if (IsCComment(at_)) {
556       if (const char *after{SkipCComment(at_)}) {
557         column_ += after - at_;
558         // May have skipped over one or more newlines; relocate the start of
559         // the next line.
560         nextLine_ = at_ = after;
561         NextLine();
562       } else {
563         // Don't emit any messages about unclosed C-style comments, because
564         // the sequence /* can appear legally in a FORMAT statement.  There's
565         // no ambiguity, since the sequence */ cannot appear legally.
566         break;
567       }
568     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
569         at_[1] == '\n' && !IsAtEnd()) {
570       BeginSourceLineAndAdvance();
571     } else {
572       break;
573     }
574   }
575 }
576 
577 void Prescanner::SkipSpaces() {
578   while (IsSpaceOrTab(at_)) {
579     NextChar();
580   }
581   insertASpace_ = false;
582 }
583 
584 const char *Prescanner::SkipWhiteSpace(const char *p) {
585   while (int n{IsSpaceOrTab(p)}) {
586     p += n;
587   }
588   return p;
589 }
590 
591 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
592   while (true) {
593     if (int n{IsSpaceOrTab(p)}) {
594       p += n;
595     } else if (IsCComment(p)) {
596       if (const char *after{SkipCComment(p)}) {
597         p = after;
598       } else {
599         break;
600       }
601     } else {
602       break;
603     }
604   }
605   return p;
606 }
607 
608 const char *Prescanner::SkipCComment(const char *p) const {
609   char star{' '}, slash{' '};
610   p += 2;
611   while (star != '*' || slash != '/') {
612     if (p >= limit_) {
613       return nullptr; // signifies an unterminated comment
614     }
615     star = slash;
616     slash = *p++;
617   }
618   return p;
619 }
620 
// Scans the next preprocessing token at the current position into
// `tokens`.  Returns false when the newline that ends the statement's text
// has been reached, and true after a token has been closed.  In fixed form
// source, blanks are skipped; otherwise a run of white space is compressed
// to at most one space token.
621 bool Prescanner::NextToken(TokenSequence &tokens) {
622   CHECK(at_ >= start_ && at_ < limit_);
623   if (InFixedFormSource()) {
624     SkipSpaces();
625   } else {
626     if (*at_ == '/' && IsCComment(at_)) {
627       // Recognize and skip over classic C style /*comments*/ when
628       // outside a character literal.
629       if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
630         Say(GetCurrentProvenance(),
631             "nonstandard usage: C-style comment"_port_en_US);
632       }
633       SkipCComments();
634     }
635     if (IsSpaceOrTab(at_)) {
636       // Compress free-form white space into a single space character.
637       const auto theSpace{at_};
638       char previous{at_ <= start_ ? ' ' : at_[-1]};
639       NextChar();
640       SkipSpaces();
641       if (*at_ == '\n' && !omitNewline_) {
642         // Discard white space at the end of a line.
643       } else if (!inPreprocessorDirective_ &&
644           (previous == '(' || *at_ == '(' || *at_ == ')')) {
645         // Discard white space before/after '(' and before ')', unless in a
646         // preprocessor directive.  This helps yield space-free contiguous
647         // names for generic interfaces like OPERATOR( + ) and
648         // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
649         // This has the effect of silently ignoring the illegal spaces in
650         // the array constructor ( /1,2/ ) but that seems benign; it's
651         // hard to avoid that while still removing spaces from OPERATOR( / )
652         // and OPERATOR( // ).
653       } else {
654         // Preserve the squashed white space as a single space character.
655         tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
656         tokens.CloseToken();
657         return true;
658       }
659     }
660   }
      // A pending inserted space becomes the first character of the next
      // token.
661   if (insertASpace_) {
662     tokens.PutNextTokenChar(' ', spaceProvenance_);
663     insertASpace_ = false;
664   }
665   if (*at_ == '\n') {
666     return false;
667   }
668   const char *start{at_};
669   if (*at_ == '\'' || *at_ == '"') {
670     QuotedCharacterLiteral(tokens, start);
671     preventHollerith_ = false;
672   } else if (IsDecimalDigit(*at_)) {
        // A digit string, which may turn out to be a Hollerith count, the
        // start of a real literal, a hexadecimal constant in a directive,
        // or the kind prefix of a character literal.
673     int n{0}, digits{0};
674     static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
675     do {
          // n stops accumulating once it reaches maxHollerith.
676       if (n < maxHollerith) {
677         n = 10 * n + DecimalDigitValue(*at_);
678       }
679       EmitCharAndAdvance(tokens, *at_);
680       ++digits;
681       if (InFixedFormSource()) {
682         SkipSpaces();
683       }
684     } while (IsDecimalDigit(*at_));
685     if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
686         !preventHollerith_) {
687       Hollerith(tokens, n, start);
688     } else if (*at_ == '.') {
689       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
690       }
691       ExponentAndKind(tokens);
692     } else if (ExponentAndKind(tokens)) {
693     } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
694         inPreprocessorDirective_) {
          // "0x"/"0X" followed by hexadecimal digits, only in a directive
695       do {
696         EmitCharAndAdvance(tokens, *at_);
697       } while (IsHexadecimalDigit(*at_));
698     } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
699       EmitCharAndAdvance(tokens, *at_);
700       QuotedCharacterLiteral(tokens, start);
701     } else if (IsLetter(*at_) && !preventHollerith_ &&
702         parenthesisNesting_ > 0) {
703       // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
704       // we don't misrecognize I9HOLLERITH as an identifier in the next case.
705       EmitCharAndAdvance(tokens, *at_);
706     }
707     preventHollerith_ = false;
708   } else if (*at_ == '.') {
        // '.' may begin a real literal (outside directives) or an ellipsis.
709     char nch{EmitCharAndAdvance(tokens, '.')};
710     if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
711       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
712       }
713       ExponentAndKind(tokens);
714     } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
715       EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
716     }
717     preventHollerith_ = false;
718   } else if (IsLegalInIdentifier(*at_)) {
        // An identifier.  `parts` counts the pieces into which line
        // continuation has split it.
719     int parts{1};
720     const char *afterLast{nullptr};
721     do {
722       EmitChar(tokens, *at_);
723       ++at_, ++column_;
724       afterLast = at_;
725       if (SkipToNextSignificantCharacter() && IsLegalIdentifierStart(*at_)) {
726         tokens.CloseToken();
727         ++parts;
728       }
729     } while (IsLegalInIdentifier(*at_));
730     if (parts >= 3) {
731       // Subtlety: When an identifier is split across three or more continuation
732       // lines (or two continuation lines, immediately preceded or followed
733       // by '&' free form continuation line markers, its parts are kept as
734       // distinct pp-tokens so that macro replacement operates on them
735       // independently.  This trick accommodates the historic practice of
736       // using line continuation for token pasting after replacement.
737     } else if (parts == 2) {
738       if (afterLast && afterLast < limit_) {
739         afterLast = SkipWhiteSpace(afterLast);
740       }
741       if ((start > start_ && start[-1] == '&') ||
742           (afterLast && afterLast < limit_ &&
743               (*afterLast == '&' || *afterLast == '\n'))) {
744         // call &                call foo&        call foo&
745         //   &MACRO&      OR       &MACRO&   OR     &MACRO
746         //   &foo(...)             &(...)
747       } else {
            // Otherwise the two parts are rejoined into a single token.
748         tokens.ReopenLastToken();
749       }
750     }
751     if (InFixedFormSource()) {
752       SkipSpaces();
753     }
754     if ((*at_ == '\'' || *at_ == '"') &&
755         tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
756       QuotedCharacterLiteral(tokens, start);
757       preventHollerith_ = false;
758     } else {
759       preventHollerith_ = true; // DO 10 H = ...
760     }
761   } else if (*at_ == '*') {
        // "*" or "**"
762     if (EmitCharAndAdvance(tokens, '*') == '*') {
763       EmitCharAndAdvance(tokens, '*');
764     } else {
765       // Subtle ambiguity:
766       //  CHARACTER*2H     declares H because *2 is a kind specifier
767       //  DATAC/N*2H  /    is repeated Hollerith
768       preventHollerith_ = !slashInCurrentStatement_;
769     }
770   } else {
        // Any other character: punctuation or an operator of one or two
        // characters.  Parenthesis nesting is tracked here.
771     char ch{*at_};
772     if (ch == '(') {
773       if (parenthesisNesting_++ == 0) {
774         isPossibleMacroCall_ = tokens.SizeInTokens() > 0 &&
775             preprocessor_.IsFunctionLikeDefinition(
776                 tokens.TokenAt(tokens.SizeInTokens() - 1));
777       }
778     } else if (ch == ')' && parenthesisNesting_ > 0) {
779       --parenthesisNesting_;
780     }
781     char nch{EmitCharAndAdvance(tokens, ch)};
782     preventHollerith_ = false;
783     if ((nch == '=' &&
784             (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
785         (ch == nch &&
786             (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
787                 ch == '|' || ch == '<' || ch == '>')) ||
788         (ch == '=' && nch == '>')) {
789       // token comprises two characters
790       EmitCharAndAdvance(tokens, nch);
791     } else if (ch == '/') {
792       slashInCurrentStatement_ = true;
793     } else if (ch == ';' && InFixedFormSource()) {
794       SkipSpaces();
795       if (IsDecimalDigit(*at_)) {
796         if (features_.ShouldWarn(
797                 common::LanguageFeature::MiscSourceExtensions)) {
798           Say(GetProvenanceRange(at_, at_ + 1),
799               "Label should be in the label field"_port_en_US);
800         }
801       }
802     }
803   }
804   tokens.CloseToken();
805   return true;
806 }
807 
808 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
809   char ed{ToLowerCaseLetter(*at_)};
810   if (ed != 'e' && ed != 'd') {
811     return false;
812   }
813   EmitCharAndAdvance(tokens, ed);
814   if (*at_ == '+' || *at_ == '-') {
815     EmitCharAndAdvance(tokens, *at_);
816   }
817   while (IsDecimalDigit(*at_)) {
818     EmitCharAndAdvance(tokens, *at_);
819   }
820   if (*at_ == '_') {
821     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
822     }
823   }
824   return true;
825 }
826 
827 void Prescanner::QuotedCharacterLiteral(
828     TokenSequence &tokens, const char *start) {
829   char quote{*at_};
830   const char *end{at_ + 1};
831   inCharLiteral_ = true;
832   continuationInCharLiteral_ = true;
833   const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
834   const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
835   bool isEscaped{false};
836   bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
837   while (true) {
838     if (*at_ == '\\') {
839       if (escapesEnabled) {
840         isEscaped = !isEscaped;
841       } else {
842         // The parser always processes escape sequences, so don't confuse it
843         // when escapes are disabled.
844         insert('\\');
845       }
846     } else {
847       isEscaped = false;
848     }
849     EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
850         Encoding::LATIN_1);
851     while (PadOutCharacterLiteral(tokens)) {
852     }
853     if (*at_ == '\n') {
854       if (!inPreprocessorDirective_) {
855         Say(GetProvenanceRange(start, end),
856             "Incomplete character literal"_err_en_US);
857       }
858       break;
859     }
860     // Here's a weird edge case.  When there's a two or more following
861     // continuation lines at this point, and the entire significant part of
862     // the next continuation line is the name of a keyword macro, replace
863     // it in the character literal with its definition.  Example:
864     //   #define FOO foo
865     //   subroutine subr() bind(c, name="my_&
866     //     &FOO&
867     //     &_bar") ...
868     // produces a binding name of "my_foo_bar".
869     while (at_[1] == '&' && nextLine_ < limit_ && !InFixedFormSource()) {
870       const char *idStart{nextLine_};
871       if (const char *amper{SkipWhiteSpace(nextLine_)}; *amper == '&') {
872         idStart = amper + 1;
873       }
874       if (IsLegalIdentifierStart(*idStart)) {
875         std::size_t idLen{1};
876         for (; IsLegalInIdentifier(idStart[idLen]); ++idLen) {
877         }
878         if (idStart[idLen] == '&') {
879           CharBlock id{idStart, idLen};
880           if (preprocessor_.IsNameDefined(id)) {
881             TokenSequence ppTokens;
882             ppTokens.Put(id, GetProvenance(idStart));
883             if (auto replaced{
884                     preprocessor_.MacroReplacement(ppTokens, *this)}) {
885               tokens.Put(*replaced);
886               at_ = &idStart[idLen - 1];
887               NextLine();
888               continue; // try again on the next line
889             }
890           }
891         }
892       }
893       break;
894     }
895     end = at_ + 1;
896     NextChar();
897     if (*at_ == quote && !isEscaped) {
898       // A doubled unescaped quote mark becomes a single instance of that
899       // quote character in the literal (later).  There can be spaces between
900       // the quotes in fixed form source.
901       EmitChar(tokens, quote);
902       inCharLiteral_ = false; // for cases like print *, '...'!comment
903       NextChar();
904       if (InFixedFormSource()) {
905         SkipSpaces();
906       }
907       if (*at_ != quote) {
908         break;
909       }
910       inCharLiteral_ = true;
911     }
912   }
913   continuationInCharLiteral_ = false;
914   inCharLiteral_ = false;
915 }
916 
917 void Prescanner::Hollerith(
918     TokenSequence &tokens, int count, const char *start) {
919   inCharLiteral_ = true;
920   CHECK(*at_ == 'h' || *at_ == 'H');
921   EmitChar(tokens, 'H');
922   while (count-- > 0) {
923     if (PadOutCharacterLiteral(tokens)) {
924     } else if (*at_ == '\n') {
925       if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
926         Say(GetProvenanceRange(start, at_),
927             "Possible truncated Hollerith literal"_warn_en_US);
928       }
929       break;
930     } else {
931       NextChar();
932       // Each multi-byte character encoding counts as a single character.
933       // No escape sequences are recognized.
934       // Hollerith is always emitted to the cooked character
935       // stream in UTF-8.
936       DecodedCharacter decoded{DecodeCharacter(
937           encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
938       if (decoded.bytes > 0) {
939         EncodedCharacter utf8{
940             EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
941         for (int j{0}; j < utf8.bytes; ++j) {
942           EmitChar(tokens, utf8.buffer[j]);
943         }
944         at_ += decoded.bytes - 1;
945       } else {
946         Say(GetProvenanceRange(start, at_),
947             "Bad character in Hollerith literal"_err_en_US);
948         break;
949       }
950     }
951   }
952   if (*at_ != '\n') {
953     NextChar();
954   }
955   inCharLiteral_ = false;
956 }
957 
958 // In fixed form, source card images must be processed as if they were at
959 // least 72 columns wide, at least in character literal contexts.
960 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
961   while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
962     if (column_ < fixedFormColumnLimit_) {
963       tokens.PutNextTokenChar(' ', spaceProvenance_);
964       ++column_;
965       return true;
966     }
967     if (!FixedFormContinuation(false /*no need to insert space*/) ||
968         tabInCurrentLine_) {
969       return false;
970     }
971     CHECK(column_ == 7);
972     --at_; // point to column 6 of continuation line
973     column_ = 6;
974   }
975   return false;
976 }
977 
978 static bool IsAtProcess(const char *p) {
979   static const char pAtProc[]{"process"};
980   for (std::size_t i{0}; i < sizeof pAtProc - 1; ++i) {
981     if (ToLowerCaseLetter(*++p) != pAtProc[i])
982       return false;
983   }
984   return true;
985 }
986 
987 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
988   const char *p{start};
989 
990   // The @process directive must start in column 1.
991   if (*p == '@' && IsAtProcess(p)) {
992     return true;
993   }
994 
995   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
996       ((*p == 'D' || *p == 'd') &&
997           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
998     return true;
999   }
1000   bool anyTabs{false};
1001   while (true) {
1002     if (int n{IsSpace(p)}) {
1003       p += n;
1004     } else if (*p == '\t') {
1005       anyTabs = true;
1006       ++p;
1007     } else if (*p == '0' && !anyTabs && p == start + 5) {
1008       ++p; // 0 in column 6 must treated as a space
1009     } else {
1010       break;
1011     }
1012   }
1013   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
1014     return true;
1015   }
1016   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
1017     return true;
1018   }
1019   return *p == '\n';
1020 }
1021 
1022 const char *Prescanner::IsFreeFormComment(const char *p) const {
1023   p = SkipWhiteSpaceAndCComments(p);
1024   if (*p == '!' || *p == '\n') {
1025     return p;
1026   } else if (*p == '@') {
1027     return IsAtProcess(p) ? p : nullptr;
1028   } else {
1029     return nullptr;
1030   }
1031 }
1032 
1033 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
1034   const char *p{SkipWhiteSpace(start)};
1035   if (*p == '0' && inFixedForm_ && p == start + 5) {
1036     // Accept "     0INCLUDE" in fixed form.
1037     p = SkipWhiteSpace(p + 1);
1038   }
1039   for (const char *q{"include"}; *q; ++q) {
1040     if (ToLowerCaseLetter(*p) != *q) {
1041       return std::nullopt;
1042     }
1043     p = SkipWhiteSpace(p + 1);
1044   }
1045   if (IsDecimalDigit(*p)) { // accept & ignore a numeric kind prefix
1046     for (p = SkipWhiteSpace(p + 1); IsDecimalDigit(*p);
1047          p = SkipWhiteSpace(p + 1)) {
1048     }
1049     if (*p != '_') {
1050       return std::nullopt;
1051     }
1052     p = SkipWhiteSpace(p + 1);
1053   }
1054   if (*p == '"' || *p == '\'') {
1055     return {p - start};
1056   }
1057   return std::nullopt;
1058 }
1059 
1060 void Prescanner::FortranInclude(const char *firstQuote) {
1061   const char *p{firstQuote};
1062   while (*p != '"' && *p != '\'') {
1063     ++p;
1064   }
1065   char quote{*p};
1066   std::string path;
1067   for (++p; *p != '\n'; ++p) {
1068     if (*p == quote) {
1069       if (p[1] != quote) {
1070         break;
1071       }
1072       ++p;
1073     }
1074     path += *p;
1075   }
1076   if (*p != quote) {
1077     Say(GetProvenanceRange(firstQuote, p),
1078         "malformed path name string"_err_en_US);
1079     return;
1080   }
1081   p = SkipWhiteSpace(p + 1);
1082   if (*p != '\n' && *p != '!') {
1083     const char *garbage{p};
1084     for (; *p != '\n' && *p != '!'; ++p) {
1085     }
1086     if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
1087       Say(GetProvenanceRange(garbage, p),
1088           "excess characters after path name"_warn_en_US);
1089     }
1090   }
1091   std::string buf;
1092   llvm::raw_string_ostream error{buf};
1093   Provenance provenance{GetProvenance(nextLine_)};
1094   std::optional<std::string> prependPath;
1095   if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
1096     prependPath = DirectoryName(currentFile->path());
1097   }
1098   const SourceFile *included{
1099       allSources_.Open(path, error, std::move(prependPath))};
1100   if (!included) {
1101     Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
1102   } else if (included->bytes() > 0) {
1103     ProvenanceRange includeLineRange{
1104         provenance, static_cast<std::size_t>(p - nextLine_)};
1105     ProvenanceRange fileRange{
1106         allSources_.AddIncludedFile(*included, includeLineRange)};
1107     Prescanner{*this, /*isNestedInIncludeDirective=*/false}
1108         .set_encoding(included->encoding())
1109         .Prescan(fileRange);
1110   }
1111 }
1112 
1113 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
1114   const char *p{start};
1115   while (int n{IsSpace(p)}) {
1116     p += n;
1117   }
1118   if (*p == '#') {
1119     if (inFixedForm_ && p == start + 5) {
1120       return nullptr;
1121     }
1122   } else {
1123     p = SkipWhiteSpace(p);
1124     if (*p != '#') {
1125       return nullptr;
1126     }
1127   }
1128   return SkipWhiteSpace(p + 1);
1129 }
1130 
1131 bool Prescanner::IsNextLinePreprocessorDirective() const {
1132   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
1133 }
1134 
1135 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
1136   if (IsAtEnd()) {
1137     if (afterAmpersand && prescannerNesting_ > 0) {
1138       // A continuation marker at the end of the last line in an
1139       // include file inhibits the newline for that line.
1140       SkipToEndOfLine();
1141       omitNewline_ = true;
1142     }
1143   } else if (inPreprocessorDirective_) {
1144   } else {
1145     auto lineClass{ClassifyLine(nextLine_)};
1146     if (lineClass.kind == LineClassification::Kind::Comment) {
1147       NextLine();
1148       return true;
1149     } else if (lineClass.kind ==
1150             LineClassification::Kind::ConditionalCompilationDirective ||
1151         lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
1152       // Allow conditional compilation directives (e.g., #ifdef) to affect
1153       // continuation lines.
1154       // Allow other preprocessor directives, too, except #include
1155       // (when it does not follow '&'), #define, and #undef (because
1156       // they cannot be allowed to affect preceding text on a
1157       // continued line).
1158       preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
1159       return true;
1160     } else if (afterAmpersand &&
1161         (lineClass.kind == LineClassification::Kind::DefinitionDirective ||
1162             lineClass.kind == LineClassification::Kind::IncludeDirective ||
1163             lineClass.kind == LineClassification::Kind::IncludeLine)) {
1164       SkipToEndOfLine();
1165       omitNewline_ = true;
1166       skipLeadingAmpersand_ = true;
1167     }
1168   }
1169   return false;
1170 }
1171 
1172 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
1173   if (IsAtEnd()) {
1174     return nullptr;
1175   }
1176   tabInCurrentLine_ = false;
1177   char col1{*nextLine_};
1178   if (IsFixedFormCommentChar(col1)) {
1179     int j{1};
1180     if (InCompilerDirective()) {
1181       // Must be a continued compiler directive.
1182       for (; j < 5; ++j) {
1183         char ch{directiveSentinel_[j - 1]};
1184         if (ch == '\0') {
1185           break;
1186         }
1187         if (ch != ToLowerCaseLetter(nextLine_[j])) {
1188           return nullptr;
1189         }
1190       }
1191     } else if (features_.IsEnabled(LanguageFeature::OpenMP)) {
1192       // Fixed Source Form Conditional Compilation Sentinels.
1193       if (nextLine_[1] != '$') {
1194         return nullptr;
1195       }
1196       j++;
1197     } else {
1198       return nullptr;
1199     }
1200     for (; j < 5; ++j) {
1201       if (nextLine_[j] != ' ') {
1202         return nullptr;
1203       }
1204     }
1205     const char *col6{nextLine_ + 5};
1206     if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
1207       if (mightNeedSpace && !IsSpace(nextLine_ + 6)) {
1208         insertASpace_ = true;
1209       }
1210       return nextLine_ + 6;
1211     }
1212     return nullptr;
1213   } else {
1214     // Normal case: not in a compiler directive.
1215     if (col1 == '&' &&
1216         features_.IsEnabled(
1217             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
1218       // Extension: '&' as continuation marker
1219       if (features_.ShouldWarn(
1220               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
1221         Say(GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
1222       }
1223       return nextLine_ + 1;
1224     }
1225     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
1226       tabInCurrentLine_ = true;
1227       return nextLine_ + 2; // VAX extension
1228     }
1229     if ((col1 == ' ' ||
1230             ((col1 == 'D' || col1 == 'd') &&
1231                 features_.IsEnabled(LanguageFeature::OldDebugLines))) &&
1232         nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' &&
1233         nextLine_[4] == ' ') {
1234       const char *col6{nextLine_ + 5};
1235       if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
1236         if ((*col6 == 'i' || *col6 == 'I') && IsIncludeLine(nextLine_)) {
1237           // It's An INCLUDE line, not a continuation
1238         } else {
1239           return nextLine_ + 6;
1240         }
1241       }
1242     }
1243     if (IsImplicitContinuation()) {
1244       return nextLine_;
1245     }
1246   }
1247   return nullptr; // not a continuation line
1248 }
1249 
1250 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
1251   const char *p{nextLine_};
1252   if (p >= limit_) {
1253     return nullptr;
1254   }
1255   p = SkipWhiteSpace(p);
1256   if (InCompilerDirective()) {
1257     if (*p++ != '!') {
1258       return nullptr;
1259     }
1260     for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
1261       if (*s != ToLowerCaseLetter(*p)) {
1262         return nullptr;
1263       }
1264     }
1265     p = SkipWhiteSpace(p);
1266     if (*p == '&') {
1267       if (!ampersand) {
1268         insertASpace_ = true;
1269       }
1270       return p + 1;
1271     } else if (ampersand) {
1272       return p;
1273     } else {
1274       return nullptr;
1275     }
1276   } else {
1277     if (*p == '&') {
1278       return p + 1;
1279     } else if (*p == '!' || *p == '\n' || *p == '#') {
1280       return nullptr;
1281     } else if (ampersand || IsImplicitContinuation()) {
1282       if (continuationInCharLiteral_) {
1283         // 'a'&            -> 'a''b' == "a'b"
1284         //   'b'
1285         if (features_.ShouldWarn(
1286                 common::LanguageFeature::MiscSourceExtensions)) {
1287           Say(GetProvenanceRange(p, p + 1),
1288               "Character literal continuation line should have been preceded by '&'"_port_en_US);
1289         }
1290       } else if (p > nextLine_) {
1291         --p;
1292       } else {
1293         insertASpace_ = true;
1294       }
1295       return p;
1296     } else {
1297       return nullptr;
1298     }
1299   }
1300 }
1301 
1302 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
1303   // N.B. We accept '&' as a continuation indicator in fixed form, too,
1304   // but not in a character literal.
1305   if (*at_ == '&' && inCharLiteral_) {
1306     return false;
1307   }
1308   do {
1309     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
1310       BeginSourceLine(cont);
1311       column_ = 7;
1312       NextLine();
1313       return true;
1314     }
1315   } while (SkipCommentLine(false /* not after ampersand */));
1316   return false;
1317 }
1318 
1319 bool Prescanner::FreeFormContinuation() {
1320   const char *p{at_};
1321   bool ampersand{*p == '&'};
1322   if (ampersand) {
1323     p = SkipWhiteSpace(p + 1);
1324   }
1325   if (*p != '\n') {
1326     if (inCharLiteral_) {
1327       return false;
1328     } else if (*p == '!') { // & ! comment - ok
1329     } else if (ampersand && isPossibleMacroCall_ && (*p == ',' || *p == ')')) {
1330       return false; // allow & at end of a macro argument
1331     } else if (features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
1332       Say(GetProvenance(p), "missing ! before comment after &"_warn_en_US);
1333     }
1334   }
1335   do {
1336     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
1337       BeginSourceLine(cont);
1338       NextLine();
1339       return true;
1340     }
1341   } while (SkipCommentLine(ampersand));
1342   return false;
1343 }
1344 
1345 // Implicit line continuation allows a preprocessor macro call with
1346 // arguments to span multiple lines.
1347 bool Prescanner::IsImplicitContinuation() const {
1348   return !inPreprocessorDirective_ && !inCharLiteral_ && isPossibleMacroCall_ &&
1349       parenthesisNesting_ > 0 && !IsAtEnd() &&
1350       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
1351 }
1352 
1353 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1354   if (disableSourceContinuation_) {
1355     return false;
1356   } else if (*at_ == '\n' || *at_ == '&') {
1357     if (inFixedForm_) {
1358       return FixedFormContinuation(mightNeedFixedFormSpace);
1359     } else {
1360       return FreeFormContinuation();
1361     }
1362   } else if (*at_ == '\\' && at_ + 2 == nextLine_ &&
1363       backslashFreeFormContinuation_ && !inFixedForm_ && nextLine_ < limit_) {
1364     // cpp-like handling of \ at end of a free form source line
1365     BeginSourceLine(nextLine_);
1366     NextLine();
1367     return true;
1368   } else {
1369     return false;
1370   }
1371 }
1372 
1373 std::optional<Prescanner::LineClassification>
1374 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1375   const char *p{start};
1376   char col1{*p++};
1377   if (!IsFixedFormCommentChar(col1)) {
1378     return std::nullopt;
1379   }
1380   char sentinel[5], *sp{sentinel};
1381   int column{2};
1382   for (; column < 6; ++column, ++p) {
1383     if (*p == '\n' || IsSpaceOrTab(p)) {
1384       break;
1385     }
1386     if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1387       // OpenMP conditional compilation line: leave the label alone
1388       break;
1389     }
1390     *sp++ = ToLowerCaseLetter(*p);
1391   }
1392   if (column == 6) {
1393     if (*p == '0') {
1394       ++p;
1395     } else if (int n{IsSpaceOrTab(p)}) {
1396       p += n;
1397     } else {
1398       // This is a Continuation line, not an initial directive line.
1399       return std::nullopt;
1400     }
1401   }
1402   if (sp == sentinel) {
1403     return std::nullopt;
1404   }
1405   *sp = '\0';
1406   if (const char *ss{IsCompilerDirectiveSentinel(
1407           sentinel, static_cast<std::size_t>(sp - sentinel))}) {
1408     std::size_t payloadOffset = p - start;
1409     return {LineClassification{
1410         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1411   }
1412   return std::nullopt;
1413 }
1414 
1415 std::optional<Prescanner::LineClassification>
1416 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1417   if (const char *p{SkipWhiteSpace(start)}; p && *p++ == '!') {
1418     if (auto maybePair{IsCompilerDirectiveSentinel(p)}) {
1419       auto offset{static_cast<std::size_t>(maybePair->second - start)};
1420       return {LineClassification{LineClassification::Kind::CompilerDirective,
1421           offset, maybePair->first}};
1422     }
1423   }
1424   return std::nullopt;
1425 }
1426 
1427 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1428   std::uint64_t packed{0};
1429   for (char ch : dir) {
1430     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1431   }
1432   compilerDirectiveBloomFilter_.set(packed % prime1);
1433   compilerDirectiveBloomFilter_.set(packed % prime2);
1434   compilerDirectiveSentinels_.insert(dir);
1435   return *this;
1436 }
1437 
1438 const char *Prescanner::IsCompilerDirectiveSentinel(
1439     const char *sentinel, std::size_t len) const {
1440   std::uint64_t packed{0};
1441   for (std::size_t j{0}; j < len; ++j) {
1442     packed = (packed << 8) | (sentinel[j] & 0xff);
1443   }
1444   if (len == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1445       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1446     return nullptr;
1447   }
1448   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, len))};
1449   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1450 }
1451 
1452 const char *Prescanner::IsCompilerDirectiveSentinel(CharBlock token) const {
1453   const char *p{token.begin()};
1454   const char *end{p + token.size()};
1455   while (p < end && (*p == ' ' || *p == '\n')) {
1456     ++p;
1457   }
1458   if (p < end && *p == '!') {
1459     ++p;
1460   }
1461   while (end > p && (end[-1] == ' ' || end[-1] == '\t')) {
1462     --end;
1463   }
1464   return end > p && IsCompilerDirectiveSentinel(p, end - p) ? p : nullptr;
1465 }
1466 
1467 std::optional<std::pair<const char *, const char *>>
1468 Prescanner::IsCompilerDirectiveSentinel(const char *p) const {
1469   char sentinel[8];
1470   for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) {
1471     if (int n{*p == '&' ? 1 : IsSpaceOrTab(p)}) {
1472       if (j > 0) {
1473         sentinel[j] = '\0';
1474         p = SkipWhiteSpace(p + n);
1475         if (*p != '!') {
1476           if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) {
1477             return std::make_pair(sp, p);
1478           }
1479         }
1480       }
1481       break;
1482     } else {
1483       sentinel[j] = ToLowerCaseLetter(*p);
1484     }
1485   }
1486   return std::nullopt;
1487 }
1488 
1489 constexpr bool IsDirective(const char *match, const char *dir) {
1490   for (; *match; ++match) {
1491     if (*match != ToLowerCaseLetter(*dir++)) {
1492       return false;
1493     }
1494   }
1495   return true;
1496 }
1497 
1498 Prescanner::LineClassification Prescanner::ClassifyLine(
1499     const char *start) const {
1500   if (inFixedForm_) {
1501     if (std::optional<LineClassification> lc{
1502             IsFixedFormCompilerDirectiveLine(start)}) {
1503       return std::move(*lc);
1504     }
1505     if (IsFixedFormCommentLine(start)) {
1506       return {LineClassification::Kind::Comment};
1507     }
1508   } else {
1509     if (std::optional<LineClassification> lc{
1510             IsFreeFormCompilerDirectiveLine(start)}) {
1511       return std::move(*lc);
1512     }
1513     if (const char *bang{IsFreeFormComment(start)}) {
1514       return {LineClassification::Kind::Comment,
1515           static_cast<std::size_t>(bang - start)};
1516     }
1517   }
1518   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1519     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1520   }
1521   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1522     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1523         IsDirective("else", dir) || IsDirective("endif", dir)) {
1524       return {LineClassification::Kind::ConditionalCompilationDirective};
1525     } else if (IsDirective("include", dir)) {
1526       return {LineClassification::Kind::IncludeDirective};
1527     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1528       return {LineClassification::Kind::DefinitionDirective};
1529     } else {
1530       return {LineClassification::Kind::PreprocessorDirective};
1531     }
1532   }
1533   return {LineClassification::Kind::Source};
1534 }
1535 
1536 Prescanner::LineClassification Prescanner::ClassifyLine(
1537     TokenSequence &tokens, Provenance newlineProvenance) const {
1538   // Append a newline temporarily.
1539   tokens.PutNextTokenChar('\n', newlineProvenance);
1540   tokens.CloseToken();
1541   const char *ppd{tokens.ToCharBlock().begin()};
1542   LineClassification classification{ClassifyLine(ppd)};
1543   tokens.pop_back(); // remove the newline
1544   return classification;
1545 }
1546 
1547 void Prescanner::SourceFormChange(std::string &&dir) {
1548   if (dir == "!dir$ free") {
1549     inFixedForm_ = false;
1550   } else if (dir == "!dir$ fixed") {
1551     inFixedForm_ = true;
1552   }
1553 }
1554 
1555 // Acquire and append compiler directive continuation lines to
1556 // the tokens that constitute a compiler directive, even when those
1557 // directive continuation lines are the result of macro expansion.
1558 // (Not used when neither the original compiler directive line nor
1559 // the directive continuation line result from preprocessing; regular
1560 // line continuation during tokenization handles that normal case.)
1561 bool Prescanner::CompilerDirectiveContinuation(
1562     TokenSequence &tokens, const char *origSentinel) {
1563   if (inFixedForm_ || tokens.empty() ||
1564       tokens.TokenAt(tokens.SizeInTokens() - 1) != "&") {
1565     return false;
1566   }
1567   LineClassification followingLine{ClassifyLine(nextLine_)};
1568   if (followingLine.kind == LineClassification::Kind::Comment) {
1569     nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
1570     NextLine();
1571     return true;
1572   }
1573   CHECK(origSentinel != nullptr);
1574   directiveSentinel_ = origSentinel; // so InCompilerDirective() is true
1575   const char *nextContinuation{
1576       followingLine.kind == LineClassification::Kind::CompilerDirective
1577           ? FreeFormContinuationLine(true)
1578           : nullptr};
1579   if (!nextContinuation &&
1580       followingLine.kind != LineClassification::Kind::Source) {
1581     return false;
1582   }
1583   auto origNextLine{nextLine_};
1584   BeginSourceLine(nextLine_);
1585   NextLine();
1586   if (nextContinuation) {
1587     // What follows is !DIR$ & xxx; skip over the & so that it
1588     // doesn't cause a spurious continuation.
1589     at_ = nextContinuation;
1590   } else {
1591     // What follows looks like a source line before macro expansion,
1592     // but might become a directive continuation afterwards.
1593     SkipSpaces();
1594   }
1595   TokenSequence followingTokens;
1596   while (NextToken(followingTokens)) {
1597   }
1598   if (auto followingPrepro{
1599           preprocessor_.MacroReplacement(followingTokens, *this)}) {
1600     followingTokens = std::move(*followingPrepro);
1601   }
1602   followingTokens.RemoveRedundantBlanks();
1603   std::size_t startAt{0};
1604   std::size_t following{followingTokens.SizeInTokens()};
1605   bool ok{false};
1606   if (nextContinuation) {
1607     ok = true;
1608   } else {
1609     startAt = 2;
1610     if (startAt < following && followingTokens.TokenAt(0) == "!") {
1611       CharBlock sentinel{followingTokens.TokenAt(1)};
1612       if (!sentinel.empty() &&
1613           std::memcmp(sentinel.begin(), origSentinel, sentinel.size()) == 0) {
1614         ok = true;
1615         while (
1616             startAt < following && followingTokens.TokenAt(startAt).IsBlank()) {
1617           ++startAt;
1618         }
1619         if (startAt < following && followingTokens.TokenAt(startAt) == "&") {
1620           ++startAt;
1621         }
1622       }
1623     }
1624   }
1625   if (ok) {
1626     tokens.pop_back(); // delete original '&'
1627     tokens.Put(followingTokens, startAt, following - startAt);
1628     tokens.RemoveRedundantBlanks();
1629   } else {
1630     nextLine_ = origNextLine;
1631   }
1632   return ok;
1633 }
1634 
1635 // Similar, but for source line continuation after macro replacement.
1636 bool Prescanner::SourceLineContinuation(TokenSequence &tokens) {
1637   if (!inFixedForm_ && !tokens.empty() &&
1638       tokens.TokenAt(tokens.SizeInTokens() - 1) == "&") {
1639     LineClassification followingLine{ClassifyLine(nextLine_)};
1640     if (followingLine.kind == LineClassification::Kind::Comment) {
1641       nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
1642       NextLine();
1643       return true;
1644     } else if (const char *nextContinuation{FreeFormContinuationLine(true)}) {
1645       BeginSourceLine(nextLine_);
1646       NextLine();
1647       TokenSequence followingTokens;
1648       at_ = nextContinuation;
1649       while (NextToken(followingTokens)) {
1650       }
1651       if (auto followingPrepro{
1652               preprocessor_.MacroReplacement(followingTokens, *this)}) {
1653         followingTokens = std::move(*followingPrepro);
1654       }
1655       followingTokens.RemoveRedundantBlanks();
1656       tokens.pop_back(); // delete original '&'
1657       tokens.Put(followingTokens);
1658       return true;
1659     }
1660   }
1661   return false;
1662 }
1663 } // namespace Fortran::parser
1664