xref: /llvm-project/flang/lib/Parser/prescan.cpp (revision 4a3e4b99b9ab3016afe8b02c4f83f24635964f4e)
1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "flang/Common/idioms.h"
11 #include "flang/Parser/characters.h"
12 #include "flang/Parser/message.h"
13 #include "flang/Parser/preprocessor.h"
14 #include "flang/Parser/source.h"
15 #include "flang/Parser/token-sequence.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Upper bound on prescanner nesting depth: Prescan() reports an error and
// returns without scanning when prescannerNesting_ exceeds this value.
26 static constexpr int maxPrescannerNesting{100};
27 
// Constructs a prescanner that reports to `messages`, emits into `cooked`,
// and uses `preprocessor` for directives and macro replacement.
// The source collection comes from the preprocessor's allSources(), the
// encoding from that collection, the language feature settings from `lfc`,
// and backslashFreeFormContinuation_ is initialized from
// preprocessor.AnyDefinitions().
28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29     Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30     : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31       allSources_{preprocessor_.allSources()}, features_{lfc},
32       backslashFreeFormContinuation_{preprocessor.AnyDefinitions()},
33       encoding_{allSources_.encoding()} {}
34 
// Constructs a nested prescanner from an existing one.
// Messages, cooked source, source collection, features, source form settings,
// encoding, and compiler directive sentinel tables are taken from `that`;
// the preprocessor is `prepro`; the nesting depth is one more than `that`'s;
// and isNestedInIncludeDirective_ is set from the argument.
35 Prescanner::Prescanner(const Prescanner &that, Preprocessor &prepro,
36     bool isNestedInIncludeDirective)
37     : messages_{that.messages_}, cooked_{that.cooked_}, preprocessor_{prepro},
38       allSources_{that.allSources_}, features_{that.features_},
39       preprocessingOnly_{that.preprocessingOnly_},
40       expandIncludeLines_{that.expandIncludeLines_},
41       isNestedInIncludeDirective_{isNestedInIncludeDirective},
42       backslashFreeFormContinuation_{that.backslashFreeFormContinuation_},
43       inFixedForm_{that.inFixedForm_},
44       fixedFormColumnLimit_{that.fixedFormColumnLimit_},
45       encoding_{that.encoding_},
46       prescannerNesting_{that.prescannerNesting_ + 1},
47       skipLeadingAmpersand_{that.skipLeadingAmpersand_},
48       compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
49       compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
50 
// Reports whether *p begins a space character and, if so, how many bytes
// it occupies: 1 for an ASCII blank or a LATIN-1 non-breaking space (0xA0),
// 2 for a UTF-8 non-breaking space (0xC2 0xA0), and 0 for anything else.
static inline int IsSpace(const char *p) {
  switch (p[0]) {
  case ' ': // ASCII blank
  case '\xa0': // LATIN-1 NBSP
    return 1;
  case '\xc2': // possible lead byte of a UTF-8 NBSP
    return p[1] == '\xa0' ? 2 : 0;
  default:
    return 0;
  }
}
63 
64 static inline int IsSpaceOrTab(const char *p) {
65   return *p == '\t' ? 1 : IsSpace(p);
66 }
67 
// True for the characters that this file accepts as a fixed-form comment
// marker: '!', '*', 'C', and 'c'.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
71 
72 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
73   char *p{dir.GetMutableCharData()};
74   char *limit{p + dir.SizeInChars()};
75   for (; p < limit; ++p) {
76     if (*p != ' ') {
77       CHECK(IsFixedFormCommentChar(*p));
78       *p = '!';
79       return;
80     }
81   }
82   DIE("compiler directive all blank");
83 }
84 
85 void Prescanner::Prescan(ProvenanceRange range) {
86   startProvenance_ = range.start();
87   start_ = allSources_.GetSource(range);
88   CHECK(start_);
89   limit_ = start_ + range.size();
90   nextLine_ = start_;
91   const bool beganInFixedForm{inFixedForm_};
92   if (prescannerNesting_ > maxPrescannerNesting) {
93     Say(GetProvenance(start_),
94         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
95     return;
96   }
97   while (!IsAtEnd()) {
98     Statement();
99   }
100   if (inFixedForm_ != beganInFixedForm) {
101     std::string dir{"!dir$ "};
102     if (beganInFixedForm) {
103       dir += "fixed";
104     } else {
105       dir += "free";
106     }
107     dir += '\n';
108     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
109     tokens.Emit(cooked_);
110   }
111 }
112 
// Prescans one statement that begins at nextLine_.
// The line is first classified.  Comment lines, INCLUDE lines, and
// preprocessor directives are handled immediately and return early.
// Compiler directive lines and ordinary source lines are tokenized (along
// with any continuation lines that NextToken() consumes), subjected to
// macro replacement, and emitted to the cooked source through
// CheckAndEmitLine().  directiveSentinel_ is reset to null before returning
// from the tokenizing path.
113 void Prescanner::Statement() {
114   TokenSequence tokens;
115   const char *statementStart{nextLine_};
116   LineClassification line{ClassifyLine(statementStart)};
117   switch (line.kind) {
118   case LineClassification::Kind::Comment:
119     nextLine_ += line.payloadOffset; // advance to '!' or newline
120     NextLine();
121     return;
122   case LineClassification::Kind::IncludeLine:
123     FortranInclude(nextLine_ + line.payloadOffset);
124     NextLine();
125     return;
126   case LineClassification::Kind::ConditionalCompilationDirective:
127   case LineClassification::Kind::IncludeDirective:
128     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
129     afterPreprocessingDirective_ = true;
130     skipLeadingAmpersand_ |= !inFixedForm_;
131     return;
132   case LineClassification::Kind::PreprocessorDirective:
133     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
134     afterPreprocessingDirective_ = true;
135     // Don't set skipLeadingAmpersand_
136     return;
137   case LineClassification::Kind::DefinitionDirective:
138     preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
139     // Don't set afterPreprocessingDirective_ or skipLeadingAmpersand_
140     return;
141   case LineClassification::Kind::CompilerDirective: {
142     directiveSentinel_ = line.sentinel;
143     CHECK(InCompilerDirective());
144     BeginStatementAndAdvance();
145     if (inFixedForm_) {
146       CHECK(IsFixedFormCommentChar(*at_));
147     } else {
          // Free form: skip leading white space up to the '!'.
148       while (int n{IsSpaceOrTab(at_)}) {
149         at_ += n, ++column_;
150       }
151       CHECK(*at_ == '!');
152     }
        // When set, the number of characters (comment marker plus sentinel)
        // to skip over for a conditional compilation line.
153     std::optional<int> condOffset;
154     if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
155       // OpenMP conditional compilation line.
156       condOffset = 2;
157     } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'c' &&
158         directiveSentinel_[2] == 'u' && directiveSentinel_[3] == 'f' &&
159         directiveSentinel_[4] == '\0') {
160       // CUDA conditional compilation line.
161       condOffset = 5;
162     }
163     if (condOffset) {
164       at_ += *condOffset, column_ += *condOffset;
165       if (auto payload{IsIncludeLine(at_)}) {
166         FortranInclude(at_ + *payload);
167         return;
168       } else if (inFixedForm_) {
169         LabelField(tokens);
170       } else {
171         SkipSpaces();
172       }
173     } else {
174       // Compiler directive.  Emit normalized sentinel, squash following spaces.
175       EmitChar(tokens, '!');
176       ++at_, ++column_;
177       for (const char *sp{directiveSentinel_}; *sp != '\0';
178            ++sp, ++at_, ++column_) {
179         EmitChar(tokens, *sp);
180       }
181       if (IsSpaceOrTab(at_)) {
182         EmitChar(tokens, ' ');
183         while (int n{IsSpaceOrTab(at_)}) {
184           at_ += n, ++column_;
185         }
186       }
187       tokens.CloseToken();
188     }
189     break;
190   }
191   case LineClassification::Kind::Source: {
192     BeginStatementAndAdvance();
193     bool checkLabelField{false};
194     if (inFixedForm_) {
          // With OldDebugLines enabled, step over a leading 'D'/'d'.
195       if (features_.IsEnabled(LanguageFeature::OldDebugLines) &&
196           (*at_ == 'D' || *at_ == 'd')) {
197         NextChar();
198       }
199       checkLabelField = true;
200     } else {
201       if (skipLeadingAmpersand_) {
            // One-shot: step over an initial '&' (after white space) if present.
202         skipLeadingAmpersand_ = false;
203         const char *p{SkipWhiteSpace(at_)};
204         if (p < limit_ && *p == '&') {
205           column_ += ++p - at_;
206           at_ = p;
207         }
208       } else {
209         SkipSpaces();
210       }
211     }
212     // Check for a leading identifier that might be a keyword macro
213     // that will expand to anything indicating a non-source line, like
214     // a comment marker or directive sentinel.  If so, disable line
215     // continuation, so that NextToken() won't consume anything from
216     // following lines.
217     if (IsLegalIdentifierStart(*at_)) {
218       // TODO: Only bother with these cases when any keyword macro has
219       // been defined with replacement text that could begin a comment
220       // or directive sentinel.
221       const char *p{at_};
222       while (IsLegalInIdentifier(*++p)) {
223       }
224       CharBlock id{at_, static_cast<std::size_t>(p - at_)};
225       if (preprocessor_.IsNameDefined(id) &&
226           !preprocessor_.IsFunctionLikeDefinition(id)) {
227         checkLabelField = false;
228         TokenSequence toks;
229         toks.Put(id, GetProvenance(at_));
230         if (auto replaced{preprocessor_.MacroReplacement(toks, *this)}) {
231           auto newLineClass{ClassifyLine(*replaced, GetCurrentProvenance())};
232           if (newLineClass.kind ==
233               LineClassification::Kind::CompilerDirective) {
234             directiveSentinel_ = newLineClass.sentinel;
235             disableSourceContinuation_ = false;
236           } else {
237             disableSourceContinuation_ = !replaced->empty() &&
238                 newLineClass.kind != LineClassification::Kind::Source;
239           }
240         }
241       }
242     }
243     if (checkLabelField) {
244       LabelField(tokens);
245     }
246   } break;
247   }
248 
      // Tokenize the remainder of the statement.
249   while (NextToken(tokens)) {
250   }
251   if (continuationLines_ > 255) {
252     if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
253       Say(common::LanguageFeature::MiscSourceExtensions,
254           GetProvenance(statementStart),
255           "%d continuation lines is more than the Fortran standard allows"_port_en_US,
256           continuationLines_);
257     }
258   }
259 
260   Provenance newlineProvenance{GetCurrentProvenance()};
261   if (std::optional<TokenSequence> preprocessed{
262           preprocessor_.MacroReplacement(tokens, *this)}) {
263     // Reprocess the preprocessed line.
264     LineClassification ppl{ClassifyLine(*preprocessed, newlineProvenance)};
265     switch (ppl.kind) {
266     case LineClassification::Kind::Comment:
267       break;
268     case LineClassification::Kind::IncludeLine:
269       FortranInclude(preprocessed->TokenAt(0).begin() + ppl.payloadOffset);
270       break;
271     case LineClassification::Kind::ConditionalCompilationDirective:
272     case LineClassification::Kind::IncludeDirective:
273     case LineClassification::Kind::DefinitionDirective:
274     case LineClassification::Kind::PreprocessorDirective:
          // The replaced text is not processed as a directive; it is emitted
          // as a line, with an optional warning.
275       if (features_.ShouldWarn(common::UsageWarning::Preprocessing)) {
276         Say(common::UsageWarning::Preprocessing,
277             preprocessed->GetProvenanceRange(),
278             "Preprocessed line resembles a preprocessor directive"_warn_en_US);
279       }
280       CheckAndEmitLine(preprocessed->ToLowerCase(), newlineProvenance);
281       break;
282     case LineClassification::Kind::CompilerDirective:
283       if (preprocessed->HasRedundantBlanks()) {
284         preprocessed->RemoveRedundantBlanks();
285       }
286       while (CompilerDirectiveContinuation(*preprocessed, ppl.sentinel)) {
287         newlineProvenance = GetCurrentProvenance();
288       }
289       NormalizeCompilerDirectiveCommentMarker(*preprocessed);
290       preprocessed->ToLowerCase();
291       SourceFormChange(preprocessed->ToString());
292       CheckAndEmitLine(preprocessed->ToLowerCase().ClipComment(
293                            *this, true /* skip first ! */),
294           newlineProvenance);
295       break;
296     case LineClassification::Kind::Source:
297       if (inFixedForm_) {
298         if (!preprocessingOnly_ && preprocessed->HasBlanks()) {
299           preprocessed->RemoveBlanks();
300         }
301       } else {
302         while (SourceLineContinuation(*preprocessed)) {
303           newlineProvenance = GetCurrentProvenance();
304         }
305         if (preprocessed->HasRedundantBlanks()) {
306           preprocessed->RemoveRedundantBlanks();
307         }
308       }
309       CheckAndEmitLine(
310           preprocessed->ToLowerCase().ClipComment(*this), newlineProvenance);
311       break;
312     }
313   } else { // no macro replacement
314     if (line.kind == LineClassification::Kind::CompilerDirective) {
315       while (CompilerDirectiveContinuation(tokens, line.sentinel)) {
316         newlineProvenance = GetCurrentProvenance();
317       }
318       tokens.ToLowerCase();
319       SourceFormChange(tokens.ToString());
320     } else { // Kind::Source
321       tokens.ToLowerCase();
322       if (inFixedForm_) {
323         EnforceStupidEndStatementRules(tokens);
324       }
325     }
326     CheckAndEmitLine(tokens, newlineProvenance);
327   }
328   directiveSentinel_ = nullptr;
329 }
330 
331 void Prescanner::CheckAndEmitLine(
332     TokenSequence &tokens, Provenance newlineProvenance) {
333   tokens.CheckBadFortranCharacters(
334       messages_, *this, disableSourceContinuation_);
335   // Parenthesis nesting check does not apply while any #include is
336   // active, nor on the lines before and after a top-level #include,
337   // nor before or after conditional source.
338   // Applications play shenanigans with line continuation before and
339   // after #include'd subprogram argument lists and conditional source.
340   if (!isNestedInIncludeDirective_ && !omitNewline_ &&
341       !afterPreprocessingDirective_ && tokens.BadlyNestedParentheses() &&
342       !preprocessor_.InConditional()) {
343     if (nextLine_ < limit_ && IsPreprocessorDirectiveLine(nextLine_)) {
344       // don't complain
345     } else {
346       tokens.CheckBadParentheses(messages_);
347     }
348   }
349   tokens.Emit(cooked_);
350   if (omitNewline_) {
351     omitNewline_ = false;
352   } else {
353     cooked_.Put('\n', newlineProvenance);
354     afterPreprocessingDirective_ = false;
355   }
356 }
357 
358 TokenSequence Prescanner::TokenizePreprocessorDirective() {
359   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
360   inPreprocessorDirective_ = true;
361   BeginStatementAndAdvance();
362   TokenSequence tokens;
363   while (NextToken(tokens)) {
364   }
365   inPreprocessorDirective_ = false;
366   return tokens;
367 }
368 
369 void Prescanner::NextLine() {
370   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
371   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
372   if (!v) {
373     nextLine_ = limit_;
374   } else {
375     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
376     nextLine_ = nl + 1;
377   }
378 }
379 
// Scans the fixed-form label field (columns 1 through 6) at at_ into `token`.
// A tab ends the field and sets the column to 7.  Spaces and a '0' in
// column 6 are not emitted; every other character is.  The first column
// holding a non-digit, or any emitted character in column 6, is recorded
// as "bad".  Unless the emitted text is the name of a defined macro, a bad
// column either marks a possible fixed-form continuation in the cooked
// source or draws a warning; the emitted text is then discarded, and if
// the bad column precedes column 6 the scan position is rewound to the
// start of the line.  An empty label field is emitted as a single space.
380 void Prescanner::LabelField(TokenSequence &token) {
381   int outCol{1};
382   const char *start{at_};
383   std::optional<int> badColumn;
      // NOTE(review): at_ advances one byte per iteration even when IsSpace()
      // reports a two-byte UTF-8 NBSP, so its second byte (0xA0) is then seen
      // as a LATIN-1 NBSP and counted as another column -- confirm intended.
384   for (; *at_ != '\n' && column_ <= 6; ++at_) {
385     if (*at_ == '\t') {
386       ++at_;
387       column_ = 7;
388       break;
389     }
390     if (int n{IsSpace(at_)}; n == 0 &&
391         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
392       EmitChar(token, *at_);
393       ++outCol;
394       if (!badColumn && (column_ == 6 || !IsDecimalDigit(*at_))) {
395         badColumn = column_;
396       }
397     }
398     ++column_;
399   }
400   if (badColumn && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
401     if ((prescannerNesting_ > 0 && *badColumn == 6 &&
402             cooked_.BufferedBytes() == firstCookedCharacterOffset_) ||
403         afterPreprocessingDirective_) {
404       // This is the first source line in #include'd text or conditional
405       // code under #if, or the first source line after such.
406       // If it turns out that the preprocessed text begins with a
407       // fixed form continuation line, the newline at the end
408       // of the latest source line beforehand will be deleted in
409       // CookedSource::Marshal().
410       cooked_.MarkPossibleFixedFormContinuation();
411     } else if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
412       Say(common::UsageWarning::Scanning, GetProvenance(start + *badColumn - 1),
413           *badColumn == 6
414               ? "Statement should not begin with a continuation line"_warn_en_US
415               : "Character in fixed-form label field must be a digit"_warn_en_US);
416     }
417     token.clear();
418     if (*badColumn < 6) {
          // Rewind so that the whole line is rescanned from column 1.
419       at_ = start;
420       column_ = 1;
421       return;
422     }
423     outCol = 1;
424   }
425   if (outCol == 1) { // empty label field
426     // Emit a space so that, if the line is rescanned after preprocessing,
427     // a leading 'C' or 'D' won't be left-justified and then accidentally
428     // misinterpreted as a comment card.
429     EmitChar(token, ' ');
430     ++outCol;
431   }
432   token.CloseToken();
433   SkipToNextSignificantCharacter();
434   if (IsDecimalDigit(*at_)) {
435     if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
436       Say(common::LanguageFeature::MiscSourceExtensions, GetCurrentProvenance(),
437           "Label digit is not in fixed-form label field"_port_en_US);
438     }
439   }
440 }
441 
442 // 6.3.3.5: A program unit END statement, or any other statement whose
443 // initial line resembles an END statement, shall not be continued in
444 // fixed form source.
445 void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
446   CharBlock cBlock{tokens.ToCharBlock()};
447   const char *str{cBlock.begin()};
448   std::size_t n{cBlock.size()};
449   if (n < 3) {
450     return;
451   }
452   std::size_t j{0};
453   for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
454   }
455   if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
456     return;
457   }
458   // It starts with END, possibly after a label.
459   auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
460   auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
461   if (!start || !end) {
462     return;
463   }
464   if (&*start->sourceFile == &*end->sourceFile && start->line == end->line) {
465     return; // no continuation
466   }
467   j += 3;
468   static const char *const prefixes[]{"program", "subroutine", "function",
469       "blockdata", "module", "submodule", nullptr};
470   bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
471   std::size_t endOfPrefix{j - 1};
472   for (const char *const *p{prefixes}; *p; ++p) {
473     std::size_t pLen{std::strlen(*p)};
474     if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
475       isPrefix = true; // END thing as prefix
476       j += pLen;
477       endOfPrefix = j - 1;
478       for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
479       }
480       break;
481     }
482   }
483   if (isPrefix) {
484     auto range{tokens.GetTokenProvenanceRange(1)};
485     if (j == n) { // END or END thing [name]
486       Say(range,
487           "Program unit END statement may not be continued in fixed form source"_err_en_US);
488     } else {
489       auto endOfPrefixPos{
490           allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
491       auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
492       if (endOfPrefixPos && next &&
493           &*endOfPrefixPos->sourceFile == &*start->sourceFile &&
494           endOfPrefixPos->line == start->line &&
495           (&*next->sourceFile != &*start->sourceFile ||
496               next->line != start->line)) {
497         Say(range,
498             "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
499       }
500     }
501   }
502 }
503 
504 void Prescanner::SkipToEndOfLine() {
505   while (*at_ != '\n') {
506     ++at_, ++column_;
507   }
508 }
509 
510 bool Prescanner::MustSkipToEndOfLine() const {
511   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
512     return true; // skip over ignored columns in right margin (73:80)
513   } else if (*at_ == '!' && !inCharLiteral_) {
514     return !IsCompilerDirectiveSentinel(at_);
515   } else {
516     return false;
517   }
518 }
519 
520 void Prescanner::NextChar() {
521   CHECK(*at_ != '\n');
522   int n{IsSpace(at_)};
523   at_ += n ? n : 1;
524   ++column_;
525   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
526     // UTF-8 byte order mark - treat this file as UTF-8
527     at_ += 3;
528     encoding_ = Encoding::UTF_8;
529   }
530   SkipToNextSignificantCharacter();
531 }
532 
533 // Skip everything that should be ignored until the next significant
534 // character is reached; handles C-style comments in preprocessing
535 // directives, Fortran ! comments, stuff after the right margin in
536 // fixed form, and all forms of line continuation.
537 bool Prescanner::SkipToNextSignificantCharacter() {
538   auto anyContinuationLine{false};
539   if (inPreprocessorDirective_) {
540     SkipCComments();
541   } else {
542     bool mightNeedSpace{false};
543     if (MustSkipToEndOfLine()) {
544       SkipToEndOfLine();
545     } else {
546       mightNeedSpace = *at_ == '\n';
547     }
548     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
549       anyContinuationLine = true;
550       ++continuationLines_;
551       if (MustSkipToEndOfLine()) {
552         SkipToEndOfLine();
553       }
554     }
555     if (*at_ == '\t') {
556       tabInCurrentLine_ = true;
557     }
558   }
559   return anyContinuationLine;
560 }
561 
562 void Prescanner::SkipCComments() {
563   while (true) {
564     if (IsCComment(at_)) {
565       if (const char *after{SkipCComment(at_)}) {
566         column_ += after - at_;
567         // May have skipped over one or more newlines; relocate the start of
568         // the next line.
569         nextLine_ = at_ = after;
570         NextLine();
571       } else {
572         // Don't emit any messages about unclosed C-style comments, because
573         // the sequence /* can appear legally in a FORMAT statement.  There's
574         // no ambiguity, since the sequence */ cannot appear legally.
575         break;
576       }
577     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
578         at_[1] == '\n' && !IsAtEnd()) {
579       BeginSourceLineAndAdvance();
580     } else {
581       break;
582     }
583   }
584 }
585 
586 void Prescanner::SkipSpaces() {
587   while (IsSpaceOrTab(at_)) {
588     NextChar();
589   }
590   insertASpace_ = false;
591 }
592 
593 const char *Prescanner::SkipWhiteSpace(const char *p) {
594   while (int n{IsSpaceOrTab(p)}) {
595     p += n;
596   }
597   return p;
598 }
599 
600 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
601   while (true) {
602     if (int n{IsSpaceOrTab(p)}) {
603       p += n;
604     } else if (IsCComment(p)) {
605       if (const char *after{SkipCComment(p)}) {
606         p = after;
607       } else {
608         break;
609       }
610     } else {
611       break;
612     }
613   }
614   return p;
615 }
616 
617 const char *Prescanner::SkipCComment(const char *p) const {
618   char star{' '}, slash{' '};
619   p += 2;
620   while (star != '*' || slash != '/') {
621     if (p >= limit_) {
622       return nullptr; // signifies an unterminated comment
623     }
624     star = slash;
625     slash = *p++;
626   }
627   return p;
628 }
629 
// Scans the next preprocessing token at at_ and appends it to `tokens`.
// Returns false when the scan position is at the newline that ends the
// line, and true after a token has been emitted and closed.
// White space is skipped in fixed form (unless only preprocessing) and is
// otherwise squashed to at most one space token.  The token categories
// handled are: quoted character literals; numeric literals, including
// Hollerith, exponents and kind suffixes; tokens that begin with '.';
// identifiers, possibly split across continuation lines; '*' and '**';
// and other characters, with two-character operators combined.
630 bool Prescanner::NextToken(TokenSequence &tokens) {
631   CHECK(at_ >= start_ && at_ < limit_);
632   if (InFixedFormSource() && !preprocessingOnly_) {
633     SkipSpaces();
634   } else {
635     if (*at_ == '/' && IsCComment(at_)) {
636       // Recognize and skip over classic C style /*comments*/ when
637       // outside a character literal.
638       if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
639         Say(LanguageFeature::ClassicCComments, GetCurrentProvenance(),
640             "nonstandard usage: C-style comment"_port_en_US);
641       }
642       SkipCComments();
643     }
644     if (IsSpaceOrTab(at_)) {
645       // Compress free-form white space into a single space character.
646       const auto theSpace{at_};
647       char previous{at_ <= start_ ? ' ' : at_[-1]};
648       NextChar();
649       SkipSpaces();
650       if (*at_ == '\n' && !omitNewline_) {
651         // Discard white space at the end of a line.
652       } else if (!inPreprocessorDirective_ &&
653           (previous == '(' || *at_ == '(' || *at_ == ')')) {
654         // Discard white space before/after '(' and before ')', unless in a
655         // preprocessor directive.  This helps yield space-free contiguous
656         // names for generic interfaces like OPERATOR( + ) and
657         // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
658         // This has the effect of silently ignoring the illegal spaces in
659         // the array constructor ( /1,2/ ) but that seems benign; it's
660         // hard to avoid that while still removing spaces from OPERATOR( / )
661         // and OPERATOR( // ).
662       } else {
663         // Preserve the squashed white space as a single space character.
664         tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
665         tokens.CloseToken();
666         return true;
667       }
668     }
669   }
670   if (insertASpace_) {
671     tokens.PutNextTokenChar(' ', spaceProvenance_);
672     insertASpace_ = false;
673   }
674   if (*at_ == '\n') {
675     return false;
676   }
677   const char *start{at_};
678   if (*at_ == '\'' || *at_ == '"') {
679     QuotedCharacterLiteral(tokens, start);
680     preventHollerith_ = false;
681   } else if (IsDecimalDigit(*at_)) {
        // n accumulates the digit string's value, which stops growing once
        // it reaches maxHollerith; digits counts the digits emitted.
682     int n{0}, digits{0};
683     static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
684     do {
685       if (n < maxHollerith) {
686         n = 10 * n + DecimalDigitValue(*at_);
687       }
688       EmitCharAndAdvance(tokens, *at_);
689       ++digits;
690       if (InFixedFormSource()) {
691         SkipSpaces();
692       }
693     } while (IsDecimalDigit(*at_));
694     if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
695         !preventHollerith_) {
696       Hollerith(tokens, n, start);
697     } else if (*at_ == '.') {
698       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
699       }
700       ExponentAndKind(tokens);
701     } else if (ExponentAndKind(tokens)) {
702     } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
703         inPreprocessorDirective_) {
          // "0x"/"0X" followed by hexadecimal digits, in a directive only.
704       do {
705         EmitCharAndAdvance(tokens, *at_);
706       } while (IsHexadecimalDigit(*at_));
707     } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
708       EmitCharAndAdvance(tokens, *at_);
709       QuotedCharacterLiteral(tokens, start);
710     } else if (IsLetter(*at_) && !preventHollerith_ &&
711         parenthesisNesting_ > 0 &&
712         !preprocessor_.IsNameDefined(CharBlock{at_, 1})) {
713       // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
714       // we don't misrecognize I9HHOLLERITH as an identifier in the next case.
715       EmitCharAndAdvance(tokens, *at_);
716     }
717     preventHollerith_ = false;
718   } else if (*at_ == '.') {
719     char nch{EmitCharAndAdvance(tokens, '.')};
720     if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
721       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
722       }
723       ExponentAndKind(tokens);
724     } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
725       EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
726     }
727     preventHollerith_ = false;
728   } else if (IsLegalInIdentifier(*at_)) {
        // parts counts the pieces of an identifier that is split across
        // continuation lines.
729     int parts{1};
730     const char *afterLast{nullptr};
731     do {
732       EmitChar(tokens, *at_);
733       ++at_, ++column_;
734       afterLast = at_;
735       if (SkipToNextSignificantCharacter() && IsLegalIdentifierStart(*at_)) {
736         tokens.CloseToken();
737         ++parts;
738       }
739     } while (IsLegalInIdentifier(*at_));
740     if (parts >= 3) {
741       // Subtlety: When an identifier is split across three or more continuation
742       // lines (or two continuation lines, immediately preceded or followed
743       // by '&' free form continuation line markers, its parts are kept as
744       // distinct pp-tokens so that macro replacement operates on them
745       // independently.  This trick accommodates the historic practice of
746       // using line continuation for token pasting after replacement.
747     } else if (parts == 2) {
748       if (afterLast && afterLast < limit_) {
749         afterLast = SkipWhiteSpace(afterLast);
750       }
751       if ((start > start_ && start[-1] == '&') ||
752           (afterLast && afterLast < limit_ &&
753               (*afterLast == '&' || *afterLast == '\n'))) {
754         // call &                call foo&        call foo&
755         //   &MACRO&      OR       &MACRO&   OR     &MACRO
756         //   &foo(...)             &(...)
757       } else {
758         tokens.ReopenLastToken();
759       }
760     }
761     if (InFixedFormSource()) {
762       SkipSpaces();
763     }
764     if ((*at_ == '\'' || *at_ == '"') &&
765         tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
766       QuotedCharacterLiteral(tokens, start);
767       preventHollerith_ = false;
768     } else {
769       preventHollerith_ = true; // DO 10 H = ...
770     }
771   } else if (*at_ == '*') {
772     if (EmitCharAndAdvance(tokens, '*') == '*') {
773       EmitCharAndAdvance(tokens, '*');
774     } else {
775       // Subtle ambiguity:
776       //  CHARACTER*2H     declares H because *2 is a kind specifier
777       //  DATAC/N*2H  /    is repeated Hollerith
778       preventHollerith_ = !slashInCurrentStatement_;
779     }
780   } else {
781     char ch{*at_};
782     if (ch == '(') {
          // On an outermost '(' note whether the preceding token names a
          // function-like macro.
783       if (parenthesisNesting_++ == 0) {
784         isPossibleMacroCall_ = tokens.SizeInTokens() > 0 &&
785             preprocessor_.IsFunctionLikeDefinition(
786                 tokens.TokenAt(tokens.SizeInTokens() - 1));
787       }
788     } else if (ch == ')' && parenthesisNesting_ > 0) {
789       --parenthesisNesting_;
790     }
791     char nch{EmitCharAndAdvance(tokens, ch)};
792     preventHollerith_ = false;
793     if ((nch == '=' &&
794             (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
795         (ch == nch &&
796             (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
797                 ch == '|' || ch == '<' || ch == '>')) ||
798         (ch == '=' && nch == '>')) {
799       // token comprises two characters
800       EmitCharAndAdvance(tokens, nch);
801     } else if (ch == '/') {
802       slashInCurrentStatement_ = true;
803     } else if (ch == ';' && InFixedFormSource()) {
804       SkipSpaces();
805       if (IsDecimalDigit(*at_)) {
806         if (features_.ShouldWarn(
807                 common::LanguageFeature::MiscSourceExtensions)) {
808           Say(common::LanguageFeature::MiscSourceExtensions,
809               GetProvenanceRange(at_, at_ + 1),
810               "Label should be in the label field"_port_en_US);
811         }
812       }
813     }
814   }
815   tokens.CloseToken();
816   return true;
817 }
818 
819 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
820   char ed{ToLowerCaseLetter(*at_)};
821   if (ed != 'e' && ed != 'd') {
822     return false;
823   }
824   // Do some look-ahead to ensure that this 'e'/'d' is an exponent,
825   // not the start of an identifier that could be a macro.
826   const char *p{at_};
827   if (int n{IsSpace(++p)}) {
828     p += n;
829   }
830   if (*p == '+' || *p == '-') {
831     if (int n{IsSpace(++p)}) {
832       p += n;
833     }
834   }
835   if (IsDecimalDigit(*p)) { // it's an exponent
836     EmitCharAndAdvance(tokens, ed);
837     if (*at_ == '+' || *at_ == '-') {
838       EmitCharAndAdvance(tokens, *at_);
839     }
840     while (IsDecimalDigit(*at_)) {
841       EmitCharAndAdvance(tokens, *at_);
842     }
843     if (*at_ == '_') {
844       while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
845       }
846     }
847     return true;
848   } else {
849     return false;
850   }
851 }
852 
853 void Prescanner::QuotedCharacterLiteral(
854     TokenSequence &tokens, const char *start) {
855   char quote{*at_};
856   const char *end{at_ + 1};
857   inCharLiteral_ = true;
858   continuationInCharLiteral_ = true;
859   const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
860   const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
861   bool isEscaped{false};
862   bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
863   while (true) {
864     if (*at_ == '\\') {
865       if (escapesEnabled) {
866         isEscaped = !isEscaped;
867       } else {
868         // The parser always processes escape sequences, so don't confuse it
869         // when escapes are disabled.
870         insert('\\');
871       }
872     } else {
873       isEscaped = false;
874     }
875     EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
876         Encoding::LATIN_1);
877     while (PadOutCharacterLiteral(tokens)) {
878     }
879     if (*at_ == '\n') {
880       if (!inPreprocessorDirective_) {
881         Say(GetProvenanceRange(start, end),
882             "Incomplete character literal"_err_en_US);
883       }
884       break;
885     }
886     // Here's a weird edge case.  When there's a two or more following
887     // continuation lines at this point, and the entire significant part of
888     // the next continuation line is the name of a keyword macro, replace
889     // it in the character literal with its definition.  Example:
890     //   #define FOO foo
891     //   subroutine subr() bind(c, name="my_&
892     //     &FOO&
893     //     &_bar") ...
894     // produces a binding name of "my_foo_bar".
895     while (at_[1] == '&' && nextLine_ < limit_ && !InFixedFormSource()) {
896       const char *idStart{nextLine_};
897       if (const char *amper{SkipWhiteSpace(nextLine_)}; *amper == '&') {
898         idStart = amper + 1;
899       }
900       if (IsLegalIdentifierStart(*idStart)) {
901         std::size_t idLen{1};
902         for (; IsLegalInIdentifier(idStart[idLen]); ++idLen) {
903         }
904         if (idStart[idLen] == '&') {
905           CharBlock id{idStart, idLen};
906           if (preprocessor_.IsNameDefined(id)) {
907             TokenSequence ppTokens;
908             ppTokens.Put(id, GetProvenance(idStart));
909             if (auto replaced{
910                     preprocessor_.MacroReplacement(ppTokens, *this)}) {
911               tokens.Put(*replaced);
912               at_ = &idStart[idLen - 1];
913               NextLine();
914               continue; // try again on the next line
915             }
916           }
917         }
918       }
919       break;
920     }
921     end = at_ + 1;
922     NextChar();
923     if (*at_ == quote && !isEscaped) {
924       // A doubled unescaped quote mark becomes a single instance of that
925       // quote character in the literal (later).  There can be spaces between
926       // the quotes in fixed form source.
927       EmitChar(tokens, quote);
928       inCharLiteral_ = false; // for cases like print *, '...'!comment
929       NextChar();
930       if (InFixedFormSource()) {
931         SkipSpaces();
932       }
933       if (*at_ != quote) {
934         break;
935       }
936       inCharLiteral_ = true;
937     }
938   }
939   continuationInCharLiteral_ = false;
940   inCharLiteral_ = false;
941 }
942 
943 void Prescanner::Hollerith(
944     TokenSequence &tokens, int count, const char *start) {
945   inCharLiteral_ = true;
946   CHECK(*at_ == 'h' || *at_ == 'H');
947   EmitChar(tokens, 'H');
948   while (count-- > 0) {
949     if (PadOutCharacterLiteral(tokens)) {
950     } else if (*at_ == '\n') {
951       if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
952         Say(common::UsageWarning::Scanning, GetProvenanceRange(start, at_),
953             "Possible truncated Hollerith literal"_warn_en_US);
954       }
955       break;
956     } else {
957       NextChar();
958       // Each multi-byte character encoding counts as a single character.
959       // No escape sequences are recognized.
960       // Hollerith is always emitted to the cooked character
961       // stream in UTF-8.
962       DecodedCharacter decoded{DecodeCharacter(
963           encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
964       if (decoded.bytes > 0) {
965         EncodedCharacter utf8{
966             EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
967         for (int j{0}; j < utf8.bytes; ++j) {
968           EmitChar(tokens, utf8.buffer[j]);
969         }
970         at_ += decoded.bytes - 1;
971       } else {
972         Say(GetProvenanceRange(start, at_),
973             "Bad character in Hollerith literal"_err_en_US);
974         break;
975       }
976     }
977   }
978   if (*at_ != '\n') {
979     NextChar();
980   }
981   inCharLiteral_ = false;
982 }
983 
984 // In fixed form, source card images must be processed as if they were at
985 // least 72 columns wide, at least in character literal contexts.
986 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
987   while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
988     if (column_ < fixedFormColumnLimit_) {
989       tokens.PutNextTokenChar(' ', spaceProvenance_);
990       ++column_;
991       return true;
992     }
993     if (!FixedFormContinuation(false /*no need to insert space*/) ||
994         tabInCurrentLine_) {
995       return false;
996     }
997     CHECK(column_ == 7);
998     --at_; // point to column 6 of continuation line
999     column_ = 6;
1000   }
1001   return false;
1002 }
1003 
1004 static bool IsAtProcess(const char *p) {
1005   static const char pAtProc[]{"process"};
1006   for (std::size_t i{0}; i < sizeof pAtProc - 1; ++i) {
1007     if (ToLowerCaseLetter(*++p) != pAtProc[i])
1008       return false;
1009   }
1010   return true;
1011 }
1012 
1013 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
1014   const char *p{start};
1015 
1016   // The @process directive must start in column 1.
1017   if (*p == '@' && IsAtProcess(p)) {
1018     return true;
1019   }
1020 
1021   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
1022       ((*p == 'D' || *p == 'd') &&
1023           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
1024     return true;
1025   }
1026   bool anyTabs{false};
1027   while (true) {
1028     if (int n{IsSpace(p)}) {
1029       p += n;
1030     } else if (*p == '\t') {
1031       anyTabs = true;
1032       ++p;
1033     } else if (*p == '0' && !anyTabs && p == start + 5) {
1034       ++p; // 0 in column 6 must treated as a space
1035     } else {
1036       break;
1037     }
1038   }
1039   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
1040     return true;
1041   }
1042   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
1043     return true;
1044   }
1045   return *p == '\n';
1046 }
1047 
1048 const char *Prescanner::IsFreeFormComment(const char *p) const {
1049   p = SkipWhiteSpaceAndCComments(p);
1050   if (*p == '!' || *p == '\n') {
1051     return p;
1052   } else if (*p == '@') {
1053     return IsAtProcess(p) ? p : nullptr;
1054   } else {
1055     return nullptr;
1056   }
1057 }
1058 
1059 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
1060   if (!expandIncludeLines_) {
1061     return std::nullopt;
1062   }
1063   const char *p{SkipWhiteSpace(start)};
1064   if (*p == '0' && inFixedForm_ && p == start + 5) {
1065     // Accept "     0INCLUDE" in fixed form.
1066     p = SkipWhiteSpace(p + 1);
1067   }
1068   for (const char *q{"include"}; *q; ++q) {
1069     if (ToLowerCaseLetter(*p) != *q) {
1070       return std::nullopt;
1071     }
1072     p = SkipWhiteSpace(p + 1);
1073   }
1074   if (IsDecimalDigit(*p)) { // accept & ignore a numeric kind prefix
1075     for (p = SkipWhiteSpace(p + 1); IsDecimalDigit(*p);
1076          p = SkipWhiteSpace(p + 1)) {
1077     }
1078     if (*p != '_') {
1079       return std::nullopt;
1080     }
1081     p = SkipWhiteSpace(p + 1);
1082   }
1083   if (*p == '"' || *p == '\'') {
1084     return {p - start};
1085   }
1086   return std::nullopt;
1087 }
1088 
1089 void Prescanner::FortranInclude(const char *firstQuote) {
1090   const char *p{firstQuote};
1091   while (*p != '"' && *p != '\'') {
1092     ++p;
1093   }
1094   char quote{*p};
1095   std::string path;
1096   for (++p; *p != '\n'; ++p) {
1097     if (*p == quote) {
1098       if (p[1] != quote) {
1099         break;
1100       }
1101       ++p;
1102     }
1103     path += *p;
1104   }
1105   if (*p != quote) {
1106     Say(GetProvenanceRange(firstQuote, p),
1107         "malformed path name string"_err_en_US);
1108     return;
1109   }
1110   p = SkipWhiteSpace(p + 1);
1111   if (*p != '\n' && *p != '!') {
1112     const char *garbage{p};
1113     for (; *p != '\n' && *p != '!'; ++p) {
1114     }
1115     if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
1116       Say(common::UsageWarning::Scanning, GetProvenanceRange(garbage, p),
1117           "excess characters after path name"_warn_en_US);
1118     }
1119   }
1120   std::string buf;
1121   llvm::raw_string_ostream error{buf};
1122   Provenance provenance{GetProvenance(nextLine_)};
1123   std::optional<std::string> prependPath;
1124   if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
1125     prependPath = DirectoryName(currentFile->path());
1126   }
1127   const SourceFile *included{
1128       allSources_.Open(path, error, std::move(prependPath))};
1129   if (!included) {
1130     Say(provenance, "INCLUDE: %s"_err_en_US, buf);
1131   } else if (included->bytes() > 0) {
1132     ProvenanceRange includeLineRange{
1133         provenance, static_cast<std::size_t>(p - nextLine_)};
1134     ProvenanceRange fileRange{
1135         allSources_.AddIncludedFile(*included, includeLineRange)};
1136     Preprocessor cleanPrepro{allSources_};
1137     if (preprocessor_.IsNameDefined("__FILE__"s)) {
1138       cleanPrepro.DefineStandardMacros(); // __FILE__, __LINE__, &c.
1139     }
1140     if (preprocessor_.IsNameDefined("_CUDA"s)) {
1141       cleanPrepro.Define("_CUDA"s, "1");
1142     }
1143     Prescanner{*this, cleanPrepro, /*isNestedInIncludeDirective=*/false}
1144         .set_encoding(included->encoding())
1145         .Prescan(fileRange);
1146   }
1147 }
1148 
1149 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
1150   const char *p{start};
1151   while (int n{IsSpace(p)}) {
1152     p += n;
1153   }
1154   if (*p == '#') {
1155     if (inFixedForm_ && p == start + 5) {
1156       return nullptr;
1157     }
1158   } else {
1159     p = SkipWhiteSpace(p);
1160     if (*p != '#') {
1161       return nullptr;
1162     }
1163   }
1164   return SkipWhiteSpace(p + 1);
1165 }
1166 
1167 bool Prescanner::IsNextLinePreprocessorDirective() const {
1168   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
1169 }
1170 
1171 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
1172   if (IsAtEnd()) {
1173     if (afterAmpersand && prescannerNesting_ > 0) {
1174       // A continuation marker at the end of the last line in an
1175       // include file inhibits the newline for that line.
1176       SkipToEndOfLine();
1177       omitNewline_ = true;
1178     }
1179   } else if (inPreprocessorDirective_) {
1180   } else {
1181     auto lineClass{ClassifyLine(nextLine_)};
1182     if (lineClass.kind == LineClassification::Kind::Comment) {
1183       NextLine();
1184       return true;
1185     } else if (lineClass.kind ==
1186             LineClassification::Kind::ConditionalCompilationDirective ||
1187         lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
1188       // Allow conditional compilation directives (e.g., #ifdef) to affect
1189       // continuation lines.
1190       // Allow other preprocessor directives, too, except #include
1191       // (when it does not follow '&'), #define, and #undef (because
1192       // they cannot be allowed to affect preceding text on a
1193       // continued line).
1194       preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
1195       return true;
1196     } else if (afterAmpersand &&
1197         (lineClass.kind == LineClassification::Kind::DefinitionDirective ||
1198             lineClass.kind == LineClassification::Kind::IncludeDirective ||
1199             lineClass.kind == LineClassification::Kind::IncludeLine)) {
1200       SkipToEndOfLine();
1201       omitNewline_ = true;
1202       skipLeadingAmpersand_ = true;
1203     }
1204   }
1205   return false;
1206 }
1207 
1208 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
1209   if (IsAtEnd()) {
1210     return nullptr;
1211   }
1212   tabInCurrentLine_ = false;
1213   char col1{*nextLine_};
1214   if (IsFixedFormCommentChar(col1)) {
1215     int j{1};
1216     if (InCompilerDirective()) {
1217       // Must be a continued compiler directive.
1218       for (; j < 5; ++j) {
1219         char ch{directiveSentinel_[j - 1]};
1220         if (ch == '\0') {
1221           break;
1222         }
1223         if (ch != ToLowerCaseLetter(nextLine_[j])) {
1224           return nullptr;
1225         }
1226       }
1227     } else if (features_.IsEnabled(LanguageFeature::OpenMP)) {
1228       // Fixed Source Form Conditional Compilation Sentinels.
1229       if (nextLine_[1] != '$') {
1230         return nullptr;
1231       }
1232       j++;
1233     } else {
1234       return nullptr;
1235     }
1236     for (; j < 5; ++j) {
1237       if (nextLine_[j] != ' ') {
1238         return nullptr;
1239       }
1240     }
1241     const char *col6{nextLine_ + 5};
1242     if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
1243       if (mightNeedSpace && !IsSpace(nextLine_ + 6)) {
1244         insertASpace_ = true;
1245       }
1246       return nextLine_ + 6;
1247     }
1248     return nullptr;
1249   } else {
1250     // Normal case: not in a compiler directive.
1251     if (col1 == '&' &&
1252         features_.IsEnabled(
1253             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
1254       // Extension: '&' as continuation marker
1255       if (features_.ShouldWarn(
1256               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
1257         Say(LanguageFeature::FixedFormContinuationWithColumn1Ampersand,
1258             GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
1259       }
1260       return nextLine_ + 1;
1261     }
1262     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
1263       tabInCurrentLine_ = true;
1264       return nextLine_ + 2; // VAX extension
1265     }
1266     if ((col1 == ' ' ||
1267             ((col1 == 'D' || col1 == 'd') &&
1268                 features_.IsEnabled(LanguageFeature::OldDebugLines))) &&
1269         nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' &&
1270         nextLine_[4] == ' ') {
1271       const char *col6{nextLine_ + 5};
1272       if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
1273         if ((*col6 == 'i' || *col6 == 'I') && IsIncludeLine(nextLine_)) {
1274           // It's An INCLUDE line, not a continuation
1275         } else {
1276           return nextLine_ + 6;
1277         }
1278       }
1279     }
1280     if (IsImplicitContinuation()) {
1281       return nextLine_;
1282     }
1283   }
1284   return nullptr; // not a continuation line
1285 }
1286 
1287 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
1288   const char *p{nextLine_};
1289   if (p >= limit_) {
1290     return nullptr;
1291   }
1292   p = SkipWhiteSpace(p);
1293   if (*p == '!') {
1294     ++p;
1295     if (InCompilerDirective()) {
1296       for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
1297         if (*s != ToLowerCaseLetter(*p)) {
1298           return nullptr;
1299         }
1300       }
1301     } else if (features_.IsEnabled(LanguageFeature::OpenMP) && *p == '$') {
1302       ++p;
1303     } else {
1304       return nullptr;
1305     }
1306     p = SkipWhiteSpace(p);
1307     if (*p == '&') {
1308       if (!ampersand) {
1309         insertASpace_ = true;
1310       }
1311       return p + 1;
1312     } else if (ampersand) {
1313       return p;
1314     } else {
1315       return nullptr;
1316     }
1317   } else {
1318     if (*p == '&') {
1319       return p + 1;
1320     } else if (*p == '!' || *p == '\n' || *p == '#') {
1321       return nullptr;
1322     } else if (ampersand || IsImplicitContinuation()) {
1323       if (continuationInCharLiteral_) {
1324         // 'a'&            -> 'a''b' == "a'b"
1325         //   'b'
1326         if (features_.ShouldWarn(
1327                 common::LanguageFeature::MiscSourceExtensions)) {
1328           Say(common::LanguageFeature::MiscSourceExtensions,
1329               GetProvenanceRange(p, p + 1),
1330               "Character literal continuation line should have been preceded by '&'"_port_en_US);
1331         }
1332       } else if (p > nextLine_) {
1333         --p;
1334       } else {
1335         insertASpace_ = true;
1336       }
1337       return p;
1338     } else {
1339       return nullptr;
1340     }
1341   }
1342 }
1343 
1344 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
1345   // N.B. We accept '&' as a continuation indicator in fixed form, too,
1346   // but not in a character literal.
1347   if (*at_ == '&' && inCharLiteral_) {
1348     return false;
1349   }
1350   do {
1351     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
1352       BeginSourceLine(cont);
1353       column_ = 7;
1354       NextLine();
1355       return true;
1356     }
1357   } while (SkipCommentLine(false /* not after ampersand */));
1358   return false;
1359 }
1360 
1361 bool Prescanner::FreeFormContinuation() {
1362   const char *p{at_};
1363   bool ampersand{*p == '&'};
1364   if (ampersand) {
1365     p = SkipWhiteSpace(p + 1);
1366   }
1367   if (*p != '\n') {
1368     if (inCharLiteral_) {
1369       return false;
1370     } else if (*p == '!') { // & ! comment - ok
1371     } else if (ampersand && isPossibleMacroCall_ && (*p == ',' || *p == ')')) {
1372       return false; // allow & at end of a macro argument
1373     } else if (features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
1374       Say(LanguageFeature::CruftAfterAmpersand, GetProvenance(p),
1375           "missing ! before comment after &"_warn_en_US);
1376     }
1377   }
1378   do {
1379     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
1380       BeginSourceLine(cont);
1381       NextLine();
1382       return true;
1383     }
1384   } while (SkipCommentLine(ampersand));
1385   return false;
1386 }
1387 
1388 // Implicit line continuation allows a preprocessor macro call with
1389 // arguments to span multiple lines.
1390 bool Prescanner::IsImplicitContinuation() const {
1391   return !inPreprocessorDirective_ && !inCharLiteral_ && isPossibleMacroCall_ &&
1392       parenthesisNesting_ > 0 && !IsAtEnd() &&
1393       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
1394 }
1395 
1396 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1397   if (disableSourceContinuation_) {
1398     return false;
1399   } else if (*at_ == '\n' || *at_ == '&') {
1400     if (inFixedForm_) {
1401       return FixedFormContinuation(mightNeedFixedFormSpace);
1402     } else {
1403       return FreeFormContinuation();
1404     }
1405   } else if (*at_ == '\\' && at_ + 2 == nextLine_ &&
1406       backslashFreeFormContinuation_ && !inFixedForm_ && nextLine_ < limit_) {
1407     // cpp-like handling of \ at end of a free form source line
1408     BeginSourceLine(nextLine_);
1409     NextLine();
1410     return true;
1411   } else {
1412     return false;
1413   }
1414 }
1415 
1416 std::optional<Prescanner::LineClassification>
1417 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1418   const char *p{start};
1419   char col1{*p++};
1420   if (!IsFixedFormCommentChar(col1)) {
1421     return std::nullopt;
1422   }
1423   char sentinel[5], *sp{sentinel};
1424   int column{2};
1425   for (; column < 6; ++column, ++p) {
1426     if (*p == '\n' || IsSpaceOrTab(p)) {
1427       break;
1428     }
1429     if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1430       // OpenMP conditional compilation line: leave the label alone
1431       break;
1432     }
1433     *sp++ = ToLowerCaseLetter(*p);
1434   }
1435   if (column == 6) {
1436     if (*p == '0') {
1437       ++p;
1438     } else if (int n{IsSpaceOrTab(p)}) {
1439       p += n;
1440     } else {
1441       // This is a Continuation line, not an initial directive line.
1442       return std::nullopt;
1443     }
1444   }
1445   if (sp == sentinel) {
1446     return std::nullopt;
1447   }
1448   *sp = '\0';
1449   if (const char *ss{IsCompilerDirectiveSentinel(
1450           sentinel, static_cast<std::size_t>(sp - sentinel))}) {
1451     std::size_t payloadOffset = p - start;
1452     return {LineClassification{
1453         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1454   }
1455   return std::nullopt;
1456 }
1457 
1458 std::optional<Prescanner::LineClassification>
1459 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1460   if (const char *p{SkipWhiteSpace(start)}; p && *p++ == '!') {
1461     if (auto maybePair{IsCompilerDirectiveSentinel(p)}) {
1462       auto offset{static_cast<std::size_t>(maybePair->second - start)};
1463       return {LineClassification{LineClassification::Kind::CompilerDirective,
1464           offset, maybePair->first}};
1465     }
1466   }
1467   return std::nullopt;
1468 }
1469 
1470 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1471   std::uint64_t packed{0};
1472   for (char ch : dir) {
1473     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1474   }
1475   compilerDirectiveBloomFilter_.set(packed % prime1);
1476   compilerDirectiveBloomFilter_.set(packed % prime2);
1477   compilerDirectiveSentinels_.insert(dir);
1478   return *this;
1479 }
1480 
1481 const char *Prescanner::IsCompilerDirectiveSentinel(
1482     const char *sentinel, std::size_t len) const {
1483   std::uint64_t packed{0};
1484   for (std::size_t j{0}; j < len; ++j) {
1485     packed = (packed << 8) | (sentinel[j] & 0xff);
1486   }
1487   if (len == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1488       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1489     return nullptr;
1490   }
1491   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, len))};
1492   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1493 }
1494 
1495 const char *Prescanner::IsCompilerDirectiveSentinel(CharBlock token) const {
1496   const char *p{token.begin()};
1497   const char *end{p + token.size()};
1498   while (p < end && (*p == ' ' || *p == '\n')) {
1499     ++p;
1500   }
1501   if (p < end && *p == '!') {
1502     ++p;
1503   }
1504   while (end > p && (end[-1] == ' ' || end[-1] == '\t')) {
1505     --end;
1506   }
1507   return end > p && IsCompilerDirectiveSentinel(p, end - p) ? p : nullptr;
1508 }
1509 
1510 std::optional<std::pair<const char *, const char *>>
1511 Prescanner::IsCompilerDirectiveSentinel(const char *p) const {
1512   char sentinel[8];
1513   for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) {
1514     if (int n{*p == '&' ? 1 : IsSpaceOrTab(p)}) {
1515       if (j > 0) {
1516         sentinel[j] = '\0';
1517         p = SkipWhiteSpace(p + n);
1518         if (*p != '!') {
1519           if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) {
1520             return std::make_pair(sp, p);
1521           }
1522         }
1523       }
1524       break;
1525     } else {
1526       sentinel[j] = ToLowerCaseLetter(*p);
1527     }
1528   }
1529   return std::nullopt;
1530 }
1531 
1532 constexpr bool IsDirective(const char *match, const char *dir) {
1533   for (; *match; ++match) {
1534     if (*match != ToLowerCaseLetter(*dir++)) {
1535       return false;
1536     }
1537   }
1538   return true;
1539 }
1540 
1541 Prescanner::LineClassification Prescanner::ClassifyLine(
1542     const char *start) const {
1543   if (inFixedForm_) {
1544     if (std::optional<LineClassification> lc{
1545             IsFixedFormCompilerDirectiveLine(start)}) {
1546       return std::move(*lc);
1547     }
1548     if (IsFixedFormCommentLine(start)) {
1549       return {LineClassification::Kind::Comment};
1550     }
1551   } else {
1552     if (std::optional<LineClassification> lc{
1553             IsFreeFormCompilerDirectiveLine(start)}) {
1554       return std::move(*lc);
1555     }
1556     if (const char *bang{IsFreeFormComment(start)}) {
1557       return {LineClassification::Kind::Comment,
1558           static_cast<std::size_t>(bang - start)};
1559     }
1560   }
1561   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1562     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1563   }
1564   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1565     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1566         IsDirective("else", dir) || IsDirective("endif", dir)) {
1567       return {LineClassification::Kind::ConditionalCompilationDirective};
1568     } else if (IsDirective("include", dir)) {
1569       return {LineClassification::Kind::IncludeDirective};
1570     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1571       return {LineClassification::Kind::DefinitionDirective};
1572     } else {
1573       return {LineClassification::Kind::PreprocessorDirective};
1574     }
1575   }
1576   return {LineClassification::Kind::Source};
1577 }
1578 
1579 Prescanner::LineClassification Prescanner::ClassifyLine(
1580     TokenSequence &tokens, Provenance newlineProvenance) const {
1581   // Append a newline temporarily.
1582   tokens.PutNextTokenChar('\n', newlineProvenance);
1583   tokens.CloseToken();
1584   const char *ppd{tokens.ToCharBlock().begin()};
1585   LineClassification classification{ClassifyLine(ppd)};
1586   tokens.pop_back(); // remove the newline
1587   return classification;
1588 }
1589 
1590 void Prescanner::SourceFormChange(std::string &&dir) {
1591   if (dir == "!dir$ free") {
1592     inFixedForm_ = false;
1593   } else if (dir == "!dir$ fixed") {
1594     inFixedForm_ = true;
1595   }
1596 }
1597 
1598 // Acquire and append compiler directive continuation lines to
1599 // the tokens that constitute a compiler directive, even when those
1600 // directive continuation lines are the result of macro expansion.
1601 // (Not used when neither the original compiler directive line nor
1602 // the directive continuation line result from preprocessing; regular
1603 // line continuation during tokenization handles that normal case.)
1604 bool Prescanner::CompilerDirectiveContinuation(
1605     TokenSequence &tokens, const char *origSentinel) {
1606   if (inFixedForm_ || tokens.empty() ||
1607       tokens.TokenAt(tokens.SizeInTokens() - 1) != "&") {
1608     return false;
1609   }
1610   LineClassification followingLine{ClassifyLine(nextLine_)};
1611   if (followingLine.kind == LineClassification::Kind::Comment) {
1612     nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
1613     NextLine();
1614     return true;
1615   }
1616   CHECK(origSentinel != nullptr);
1617   directiveSentinel_ = origSentinel; // so InCompilerDirective() is true
1618   const char *nextContinuation{
1619       followingLine.kind == LineClassification::Kind::CompilerDirective
1620           ? FreeFormContinuationLine(true)
1621           : nullptr};
1622   if (!nextContinuation &&
1623       followingLine.kind != LineClassification::Kind::Source) {
1624     return false;
1625   }
1626   auto origNextLine{nextLine_};
1627   BeginSourceLine(nextLine_);
1628   NextLine();
1629   if (nextContinuation) {
1630     // What follows is !DIR$ & xxx; skip over the & so that it
1631     // doesn't cause a spurious continuation.
1632     at_ = nextContinuation;
1633   } else {
1634     // What follows looks like a source line before macro expansion,
1635     // but might become a directive continuation afterwards.
1636     SkipSpaces();
1637   }
1638   TokenSequence followingTokens;
1639   while (NextToken(followingTokens)) {
1640   }
1641   if (auto followingPrepro{
1642           preprocessor_.MacroReplacement(followingTokens, *this)}) {
1643     followingTokens = std::move(*followingPrepro);
1644   }
1645   followingTokens.RemoveRedundantBlanks();
1646   std::size_t startAt{0};
1647   std::size_t following{followingTokens.SizeInTokens()};
1648   bool ok{false};
1649   if (nextContinuation) {
1650     ok = true;
1651   } else {
1652     startAt = 2;
1653     if (startAt < following && followingTokens.TokenAt(0) == "!") {
1654       CharBlock sentinel{followingTokens.TokenAt(1)};
1655       if (!sentinel.empty() &&
1656           std::memcmp(sentinel.begin(), origSentinel, sentinel.size()) == 0) {
1657         ok = true;
1658         while (
1659             startAt < following && followingTokens.TokenAt(startAt).IsBlank()) {
1660           ++startAt;
1661         }
1662         if (startAt < following && followingTokens.TokenAt(startAt) == "&") {
1663           ++startAt;
1664         }
1665       }
1666     }
1667   }
1668   if (ok) {
1669     tokens.pop_back(); // delete original '&'
1670     tokens.Put(followingTokens, startAt, following - startAt);
1671     tokens.RemoveRedundantBlanks();
1672   } else {
1673     nextLine_ = origNextLine;
1674   }
1675   return ok;
1676 }
1677 
1678 // Similar, but for source line continuation after macro replacement.
1679 bool Prescanner::SourceLineContinuation(TokenSequence &tokens) {
1680   if (!inFixedForm_ && !tokens.empty() &&
1681       tokens.TokenAt(tokens.SizeInTokens() - 1) == "&") {
1682     LineClassification followingLine{ClassifyLine(nextLine_)};
1683     if (followingLine.kind == LineClassification::Kind::Comment) {
1684       nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
1685       NextLine();
1686       return true;
1687     } else if (const char *nextContinuation{FreeFormContinuationLine(true)}) {
1688       BeginSourceLine(nextLine_);
1689       NextLine();
1690       TokenSequence followingTokens;
1691       at_ = nextContinuation;
1692       while (NextToken(followingTokens)) {
1693       }
1694       if (auto followingPrepro{
1695               preprocessor_.MacroReplacement(followingTokens, *this)}) {
1696         followingTokens = std::move(*followingPrepro);
1697       }
1698       followingTokens.RemoveRedundantBlanks();
1699       tokens.pop_back(); // delete original '&'
1700       tokens.Put(followingTokens);
1701       return true;
1702     }
1703   }
1704   return false;
1705 }
1706 } // namespace Fortran::parser
1707