xref: /llvm-project/flang/lib/Parser/prescan.cpp (revision f099f76bb2a55bb6a90b30b81bae9f55ea37fcb5)
1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "flang/Common/idioms.h"
11 #include "flang/Parser/characters.h"
12 #include "flang/Parser/message.h"
13 #include "flang/Parser/preprocessor.h"
14 #include "flang/Parser/source.h"
15 #include "flang/Parser/token-sequence.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Deepest prescanner nesting level tolerated; Prescan() emits an error and
// returns without scanning once prescannerNesting_ exceeds this value.
static constexpr int maxPrescannerNesting{100};
27 
// Builds a prescanner from its collaborators.  The source collection and
// the character encoding are both obtained through the preprocessor, and
// backslashFreeFormContinuation_ is initialized from the preprocessor's
// AnyDefinitions() result at construction time.
Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
    Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
    : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
      allSources_{preprocessor_.allSources()}, features_{lfc},
      backslashFreeFormContinuation_{preprocessor.AnyDefinitions()},
      encoding_{allSources_.encoding()} {}
34 
// Builds a nested prescanner from an existing one.  The new instance refers
// to the same messages, cooked source, preprocessor, and source collection
// as `that`, copies its source form settings, encoding, and compiler
// directive sentinel tables, and records a nesting depth one greater than
// that of `that`.  `isNestedInIncludeDirective` is stored as given.
Prescanner::Prescanner(const Prescanner &that, bool isNestedInIncludeDirective)
    : messages_{that.messages_}, cooked_{that.cooked_},
      preprocessor_{that.preprocessor_}, allSources_{that.allSources_},
      features_{that.features_},
      isNestedInIncludeDirective_{isNestedInIncludeDirective},
      backslashFreeFormContinuation_{that.backslashFreeFormContinuation_},
      inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_},
      prescannerNesting_{that.prescannerNesting_ + 1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
48 
// True for the characters accepted here as fixed-form comment markers:
// '!', '*', and the letter C in either case.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
52 
53 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
54   char *p{dir.GetMutableCharData()};
55   char *limit{p + dir.SizeInChars()};
56   for (; p < limit; ++p) {
57     if (*p != ' ') {
58       CHECK(IsFixedFormCommentChar(*p));
59       *p = '!';
60       return;
61     }
62   }
63   DIE("compiler directive all blank");
64 }
65 
66 void Prescanner::Prescan(ProvenanceRange range) {
67   startProvenance_ = range.start();
68   start_ = allSources_.GetSource(range);
69   CHECK(start_);
70   limit_ = start_ + range.size();
71   nextLine_ = start_;
72   const bool beganInFixedForm{inFixedForm_};
73   if (prescannerNesting_ > maxPrescannerNesting) {
74     Say(GetProvenance(start_),
75         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
76     return;
77   }
78   while (!IsAtEnd()) {
79     Statement();
80   }
81   if (inFixedForm_ != beganInFixedForm) {
82     std::string dir{"!dir$ "};
83     if (beganInFixedForm) {
84       dir += "fixed";
85     } else {
86       dir += "free";
87     }
88     dir += '\n';
89     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
90     tokens.Emit(cooked_);
91   }
92 }
93 
// Prescans one statement starting at nextLine_.  The line is classified
// first: comments are skipped, INCLUDE lines and preprocessor directives
// are handed off and the function returns.  Compiler directive and source
// lines are tokenized (NextToken() follows continuation lines), subjected
// to macro replacement, and emitted via CheckAndEmitLine().  When macro
// replacement changes the line, the result is classified again and handled
// according to its new kind.
void Prescanner::Statement() {
  TokenSequence tokens;
  const char *statementStart{nextLine_};
  LineClassification line{ClassifyLine(statementStart)};
  switch (line.kind) {
  case LineClassification::Kind::Comment:
    nextLine_ += line.payloadOffset; // advance to '!' or newline
    NextLine();
    return;
  case LineClassification::Kind::IncludeLine:
    FortranInclude(nextLine_ + line.payloadOffset);
    NextLine();
    return;
  case LineClassification::Kind::ConditionalCompilationDirective:
  case LineClassification::Kind::IncludeDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    afterPreprocessingDirective_ = true;
    // In free form, the next source line may start with a leading '&'
    // that should be skipped.
    skipLeadingAmpersand_ |= !inFixedForm_;
    return;
  case LineClassification::Kind::PreprocessorDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    afterPreprocessingDirective_ = true;
    // Don't set skipLeadingAmpersand_
    return;
  case LineClassification::Kind::DefinitionDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    // Don't set afterPreprocessingDirective_ or skipLeadingAmpersand_
    return;
  case LineClassification::Kind::CompilerDirective: {
    directiveSentinel_ = line.sentinel;
    CHECK(InCompilerDirective());
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      CHECK(IsFixedFormCommentChar(*at_));
    } else {
      while (*at_ == ' ' || *at_ == '\t') {
        ++at_, ++column_;
      }
      CHECK(*at_ == '!');
    }
    // Number of characters (comment marker plus sentinel) to step over
    // when the line is a conditional compilation line rather than a
    // directive proper.
    std::optional<int> condOffset;
    if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
      // OpenMP conditional compilation line.
      condOffset = 2;
    } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'c' &&
        directiveSentinel_[2] == 'u' && directiveSentinel_[3] == 'f' &&
        directiveSentinel_[4] == '\0') {
      // CUDA conditional compilation line.
      condOffset = 5;
    }
    if (condOffset) {
      at_ += *condOffset, column_ += *condOffset;
      if (auto payload{IsIncludeLine(at_)}) {
        FortranInclude(at_ + *payload);
        return;
      } else if (inFixedForm_) {
        LabelField(tokens);
      } else {
        SkipSpaces();
      }
    } else {
      // Compiler directive.  Emit normalized sentinel, squash following spaces.
      EmitChar(tokens, '!');
      ++at_, ++column_;
      for (const char *sp{directiveSentinel_}; *sp != '\0';
           ++sp, ++at_, ++column_) {
        EmitChar(tokens, *sp);
      }
      if (*at_ == ' ' || *at_ == '\t') {
        EmitChar(tokens, ' ');
        while (*at_ == ' ' || *at_ == '\t') {
          ++at_, ++column_;
        }
      }
      tokens.CloseToken();
    }
    break;
  }
  case LineClassification::Kind::Source:
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      // With the OldDebugLines feature enabled, a leading 'D' or 'd' is
      // stepped over before the label field is scanned.
      if (features_.IsEnabled(LanguageFeature::OldDebugLines) &&
          (*at_ == 'D' || *at_ == 'd')) {
        NextChar();
      }
      LabelField(tokens);
    } else {
      if (skipLeadingAmpersand_) {
        skipLeadingAmpersand_ = false;
        const char *p{SkipWhiteSpace(at_)};
        if (p < limit_ && *p == '&') {
          column_ += ++p - at_;
          at_ = p;
        }
      } else {
        SkipSpaces();
      }
      // Check for a leading identifier that might be a keyword macro
      // that will expand to anything indicating a non-source line, like
      // a comment marker or directive sentinel.  If so, disable line
      // continuation, so that NextToken() won't consume anything from
      // following lines.
      if (IsLegalIdentifierStart(*at_)) {
        // TODO: Only bother with these cases when any keyword macro has
        // been defined with replacement text that could begin a comment
        // or directive sentinel.
        const char *p{at_};
        while (IsLegalInIdentifier(*++p)) {
        }
        CharBlock id{at_, static_cast<std::size_t>(p - at_)};
        if (preprocessor_.IsNameDefined(id) &&
            !preprocessor_.IsFunctionLikeDefinition(id)) {
          TokenSequence toks;
          toks.Put(id, GetProvenance(at_));
          if (auto replaced{preprocessor_.MacroReplacement(toks, *this)}) {
            auto newLineClass{ClassifyLine(*replaced, GetCurrentProvenance())};
            if (newLineClass.kind ==
                LineClassification::Kind::CompilerDirective) {
              directiveSentinel_ = newLineClass.sentinel;
              disableSourceContinuation_ = false;
            } else {
              disableSourceContinuation_ =
                  newLineClass.kind != LineClassification::Kind::Source;
            }
          }
        }
      }
    }
    break;
  }

  // Tokenize the remainder of the statement.
  while (NextToken(tokens)) {
  }
  if (continuationLines_ > 255) {
    if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
      Say(GetProvenance(statementStart),
          "%d continuation lines is more than the Fortran standard allows"_port_en_US,
          continuationLines_);
    }
  }

  Provenance newlineProvenance{GetCurrentProvenance()};
  if (std::optional<TokenSequence> preprocessed{
          preprocessor_.MacroReplacement(tokens, *this)}) {
    // Reprocess the preprocessed line.
    LineClassification ppl{ClassifyLine(*preprocessed, newlineProvenance)};
    switch (ppl.kind) {
    case LineClassification::Kind::Comment:
      break; // nothing is emitted for a line that became a comment
    case LineClassification::Kind::IncludeLine:
      FortranInclude(preprocessed->TokenAt(0).begin() + ppl.payloadOffset);
      break;
    case LineClassification::Kind::ConditionalCompilationDirective:
    case LineClassification::Kind::IncludeDirective:
    case LineClassification::Kind::DefinitionDirective:
    case LineClassification::Kind::PreprocessorDirective:
      // The replaced text is not processed as a directive; it is emitted
      // as an ordinary line, optionally with a warning.
      if (features_.ShouldWarn(common::UsageWarning::Preprocessing)) {
        Say(preprocessed->GetProvenanceRange(),
            "Preprocessed line resembles a preprocessor directive"_warn_en_US);
      }
      CheckAndEmitLine(preprocessed->ToLowerCase(), newlineProvenance);
      break;
    case LineClassification::Kind::CompilerDirective:
      if (preprocessed->HasRedundantBlanks()) {
        preprocessed->RemoveRedundantBlanks();
      }
      while (CompilerDirectiveContinuation(*preprocessed, ppl.sentinel)) {
        newlineProvenance = GetCurrentProvenance();
      }
      NormalizeCompilerDirectiveCommentMarker(*preprocessed);
      preprocessed->ToLowerCase();
      SourceFormChange(preprocessed->ToString());
      CheckAndEmitLine(preprocessed->ToLowerCase().ClipComment(
                           *this, true /* skip first ! */),
          newlineProvenance);
      break;
    case LineClassification::Kind::Source:
      if (inFixedForm_) {
        if (preprocessed->HasBlanks(/*after column*/ 6)) {
          preprocessed->RemoveBlanks(/*after column*/ 6);
        }
      } else {
        while (SourceLineContinuation(*preprocessed)) {
          newlineProvenance = GetCurrentProvenance();
        }
        if (preprocessed->HasRedundantBlanks()) {
          preprocessed->RemoveRedundantBlanks();
        }
      }
      CheckAndEmitLine(
          preprocessed->ToLowerCase().ClipComment(*this), newlineProvenance);
      break;
    }
  } else { // no macro replacement
    if (line.kind == LineClassification::Kind::CompilerDirective) {
      while (CompilerDirectiveContinuation(tokens, line.sentinel)) {
        newlineProvenance = GetCurrentProvenance();
      }
      tokens.ToLowerCase();
      SourceFormChange(tokens.ToString());
    } else { // Kind::Source
      tokens.ToLowerCase();
      if (inFixedForm_) {
        EnforceStupidEndStatementRules(tokens);
      }
    }
    CheckAndEmitLine(tokens, newlineProvenance);
  }
  // The statement is finished; no compiler directive is active any more.
  directiveSentinel_ = nullptr;
}
304 
305 void Prescanner::CheckAndEmitLine(
306     TokenSequence &tokens, Provenance newlineProvenance) {
307   tokens.CheckBadFortranCharacters(
308       messages_, *this, disableSourceContinuation_);
309   // Parenthesis nesting check does not apply while any #include is
310   // active, nor on the lines before and after a top-level #include,
311   // nor before or after conditional source.
312   // Applications play shenanigans with line continuation before and
313   // after #include'd subprogram argument lists and conditional source.
314   if (!isNestedInIncludeDirective_ && !omitNewline_ &&
315       !afterPreprocessingDirective_ && tokens.BadlyNestedParentheses() &&
316       !preprocessor_.InConditional()) {
317     if (nextLine_ < limit_ && IsPreprocessorDirectiveLine(nextLine_)) {
318       // don't complain
319     } else {
320       tokens.CheckBadParentheses(messages_);
321     }
322   }
323   tokens.Emit(cooked_);
324   if (omitNewline_) {
325     omitNewline_ = false;
326   } else {
327     cooked_.Put('\n', newlineProvenance);
328     afterPreprocessingDirective_ = false;
329   }
330 }
331 
332 TokenSequence Prescanner::TokenizePreprocessorDirective() {
333   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
334   inPreprocessorDirective_ = true;
335   BeginStatementAndAdvance();
336   TokenSequence tokens;
337   while (NextToken(tokens)) {
338   }
339   inPreprocessorDirective_ = false;
340   return tokens;
341 }
342 
343 void Prescanner::NextLine() {
344   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
345   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
346   if (!v) {
347     nextLine_ = limit_;
348   } else {
349     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
350     nextLine_ = nl + 1;
351   }
352 }
353 
354 void Prescanner::LabelField(TokenSequence &token) {
355   int outCol{1};
356   const char *start{at_};
357   std::optional<int> badColumn;
358   for (; *at_ != '\n' && column_ <= 6; ++at_) {
359     if (*at_ == '\t') {
360       ++at_;
361       column_ = 7;
362       break;
363     }
364     if (*at_ != ' ' &&
365         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
366       EmitChar(token, *at_);
367       ++outCol;
368       if (!badColumn && (column_ == 6 || !IsDecimalDigit(*at_))) {
369         badColumn = column_;
370       }
371     }
372     ++column_;
373   }
374   if (badColumn && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
375     if ((prescannerNesting_ > 0 && *badColumn == 6 &&
376             cooked_.BufferedBytes() == firstCookedCharacterOffset_) ||
377         afterPreprocessingDirective_) {
378       // This is the first source line in #include'd text or conditional
379       // code under #if, or the first source line after such.
380       // If it turns out that the preprocessed text begins with a
381       // fixed form continuation line, the newline at the end
382       // of the latest source line beforehand will be deleted in
383       // CookedSource::Marshal().
384       cooked_.MarkPossibleFixedFormContinuation();
385     } else if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
386       Say(GetProvenance(start + *badColumn - 1),
387           *badColumn == 6
388               ? "Statement should not begin with a continuation line"_warn_en_US
389               : "Character in fixed-form label field must be a digit"_warn_en_US);
390     }
391     token.clear();
392     if (*badColumn < 6) {
393       at_ = start;
394       column_ = 1;
395       return;
396     }
397     outCol = 1;
398   }
399   if (outCol == 1) { // empty label field
400     // Emit a space so that, if the line is rescanned after preprocessing,
401     // a leading 'C' or 'D' won't be left-justified and then accidentally
402     // misinterpreted as a comment card.
403     EmitChar(token, ' ');
404     ++outCol;
405   }
406   token.CloseToken();
407   SkipToNextSignificantCharacter();
408   if (IsDecimalDigit(*at_)) {
409     if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
410       Say(GetCurrentProvenance(),
411           "Label digit is not in fixed-form label field"_port_en_US);
412     }
413   }
414 }
415 
416 // 6.3.3.5: A program unit END statement, or any other statement whose
417 // initial line resembles an END statement, shall not be continued in
418 // fixed form source.
419 void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
420   CharBlock cBlock{tokens.ToCharBlock()};
421   const char *str{cBlock.begin()};
422   std::size_t n{cBlock.size()};
423   if (n < 3) {
424     return;
425   }
426   std::size_t j{0};
427   for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
428   }
429   if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
430     return;
431   }
432   // It starts with END, possibly after a label.
433   auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
434   auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
435   if (!start || !end) {
436     return;
437   }
438   if (&*start->sourceFile == &*end->sourceFile && start->line == end->line) {
439     return; // no continuation
440   }
441   j += 3;
442   static const char *const prefixes[]{"program", "subroutine", "function",
443       "blockdata", "module", "submodule", nullptr};
444   bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
445   std::size_t endOfPrefix{j - 1};
446   for (const char *const *p{prefixes}; *p; ++p) {
447     std::size_t pLen{std::strlen(*p)};
448     if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
449       isPrefix = true; // END thing as prefix
450       j += pLen;
451       endOfPrefix = j - 1;
452       for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
453       }
454       break;
455     }
456   }
457   if (isPrefix) {
458     auto range{tokens.GetTokenProvenanceRange(1)};
459     if (j == n) { // END or END thing [name]
460       Say(range,
461           "Program unit END statement may not be continued in fixed form source"_err_en_US);
462     } else {
463       auto endOfPrefixPos{
464           allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
465       auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
466       if (endOfPrefixPos && next &&
467           &*endOfPrefixPos->sourceFile == &*start->sourceFile &&
468           endOfPrefixPos->line == start->line &&
469           (&*next->sourceFile != &*start->sourceFile ||
470               next->line != start->line)) {
471         Say(range,
472             "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
473       }
474     }
475   }
476 }
477 
478 void Prescanner::SkipToEndOfLine() {
479   while (*at_ != '\n') {
480     ++at_, ++column_;
481   }
482 }
483 
484 bool Prescanner::MustSkipToEndOfLine() const {
485   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
486     return true; // skip over ignored columns in right margin (73:80)
487   } else if (*at_ == '!' && !inCharLiteral_) {
488     return !IsCompilerDirectiveSentinel(at_);
489   } else {
490     return false;
491   }
492 }
493 
494 void Prescanner::NextChar() {
495   CHECK(*at_ != '\n');
496   ++at_, ++column_;
497   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
498     // UTF-8 byte order mark - treat this file as UTF-8
499     at_ += 3;
500     encoding_ = Encoding::UTF_8;
501   }
502   SkipToNextSignificantCharacter();
503 }
504 
505 // Skip everything that should be ignored until the next significant
506 // character is reached; handles C-style comments in preprocessing
507 // directives, Fortran ! comments, stuff after the right margin in
508 // fixed form, and all forms of line continuation.
509 bool Prescanner::SkipToNextSignificantCharacter() {
510   auto anyContinuationLine{false};
511   if (inPreprocessorDirective_) {
512     SkipCComments();
513   } else {
514     bool mightNeedSpace{false};
515     if (MustSkipToEndOfLine()) {
516       SkipToEndOfLine();
517     } else {
518       mightNeedSpace = *at_ == '\n';
519     }
520     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
521       anyContinuationLine = true;
522       ++continuationLines_;
523       if (MustSkipToEndOfLine()) {
524         SkipToEndOfLine();
525       }
526     }
527     if (*at_ == '\t') {
528       tabInCurrentLine_ = true;
529     }
530   }
531   return anyContinuationLine;
532 }
533 
534 void Prescanner::SkipCComments() {
535   while (true) {
536     if (IsCComment(at_)) {
537       if (const char *after{SkipCComment(at_)}) {
538         column_ += after - at_;
539         // May have skipped over one or more newlines; relocate the start of
540         // the next line.
541         nextLine_ = at_ = after;
542         NextLine();
543       } else {
544         // Don't emit any messages about unclosed C-style comments, because
545         // the sequence /* can appear legally in a FORMAT statement.  There's
546         // no ambiguity, since the sequence */ cannot appear legally.
547         break;
548       }
549     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
550         at_[1] == '\n' && !IsAtEnd()) {
551       BeginSourceLineAndAdvance();
552     } else {
553       break;
554     }
555   }
556 }
557 
558 void Prescanner::SkipSpaces() {
559   while (*at_ == ' ' || *at_ == '\t') {
560     NextChar();
561   }
562   insertASpace_ = false;
563 }
564 
565 const char *Prescanner::SkipWhiteSpace(const char *p) {
566   while (*p == ' ' || *p == '\t') {
567     ++p;
568   }
569   return p;
570 }
571 
572 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
573   while (true) {
574     if (*p == ' ' || *p == '\t') {
575       ++p;
576     } else if (IsCComment(p)) {
577       if (const char *after{SkipCComment(p)}) {
578         p = after;
579       } else {
580         break;
581       }
582     } else {
583       break;
584     }
585   }
586   return p;
587 }
588 
589 const char *Prescanner::SkipCComment(const char *p) const {
590   char star{' '}, slash{' '};
591   p += 2;
592   while (star != '*' || slash != '/') {
593     if (p >= limit_) {
594       return nullptr; // signifies an unterminated comment
595     }
596     star = slash;
597     slash = *p++;
598   }
599   return p;
600 }
601 
// Scans one preprocessing token at the current position and appends it
// to `tokens`.  Returns false when a newline is reached (the line is
// finished); otherwise closes the token and returns true.  Handles
// white space compression, character literals, numeric literals and
// Hollerith, identifiers (including ones split by line continuation),
// and one- or two-character operators.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(GetCurrentProvenance(),
            "nonstandard usage: C-style comment"_port_en_US);
      }
      SkipCComments();
    }
    if (*at_ == ' ' || *at_ == '\t') {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n' && !omitNewline_) {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  // A pending inserted space becomes the first character of the next token.
  if (insertASpace_) {
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // n accumulates the value of the digit string (no longer updated once
    // it reaches maxHollerith); digits counts the digits seen.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // "0x..." hexadecimal literal, recognized only in preprocessor
      // directives.
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    } else if (IsLetter(*at_) && !preventHollerith_ &&
        parenthesisNesting_ > 0) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    // parts counts the pieces of an identifier that was split by line
    // continuation; afterLast points just past the last character emitted.
    int parts{1};
    const char *afterLast{nullptr};
    do {
      EmitChar(tokens, *at_);
      ++at_, ++column_;
      afterLast = at_;
      if (SkipToNextSignificantCharacter() && IsLegalIdentifierStart(*at_)) {
        tokens.CloseToken();
        ++parts;
      }
    } while (IsLegalInIdentifier(*at_));
    if (parts >= 3) {
      // Subtlety: When an identifier is split across three or more
      // continuation lines (or two continuation lines, immediately preceded
      // or followed by '&' free form continuation line markers), its parts
      // are kept as distinct pp-tokens so that macro replacement operates on
      // them independently.
      // This trick accommodates the historic practice of using line
      // continuation for token pasting after replacement.
    } else if (parts == 2) {
      if ((start > start_ && start[-1] == '&') ||
          (afterLast < limit_ && (*afterLast == '&' || *afterLast == '\n'))) {
        // call &                call foo&        call foo&
        //   &MACRO&      OR       &MACRO&   OR     &MACRO
        //   &foo(...)             &(...)
      } else {
        tokens.ReopenLastToken();
      }
    }
    if (InFixedFormSource()) {
      SkipSpaces();
    }
    if ((*at_ == '\'' || *at_ == '"') &&
        tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
      QuotedCharacterLiteral(tokens, start);
      preventHollerith_ = false;
    } else {
      preventHollerith_ = true; // DO 10 H = ...
    }
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentStatement_;
    }
  } else {
    char ch{*at_};
    if (ch == '(') {
      // At an outermost '(', note whether the preceding token names a
      // function-like macro.
      if (parenthesisNesting_++ == 0) {
        isPossibleMacroCall_ = tokens.SizeInTokens() > 0 &&
            preprocessor_.IsFunctionLikeDefinition(
                tokens.TokenAt(tokens.SizeInTokens() - 1));
      }
    } else if (ch == ')' && parenthesisNesting_ > 0) {
      --parenthesisNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentStatement_ = true;
    } else if (ch == ';' && InFixedFormSource()) {
      SkipSpaces();
      if (IsDecimalDigit(*at_)) {
        if (features_.ShouldWarn(
                common::LanguageFeature::MiscSourceExtensions)) {
          Say(GetProvenanceRange(at_, at_ + 1),
              "Label should be in the label field"_port_en_US);
        }
      }
    }
  }
  tokens.CloseToken();
  return true;
}
784 
785 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
786   char ed{ToLowerCaseLetter(*at_)};
787   if (ed != 'e' && ed != 'd') {
788     return false;
789   }
790   EmitCharAndAdvance(tokens, ed);
791   if (*at_ == '+' || *at_ == '-') {
792     EmitCharAndAdvance(tokens, *at_);
793   }
794   while (IsDecimalDigit(*at_)) {
795     EmitCharAndAdvance(tokens, *at_);
796   }
797   if (*at_ == '_') {
798     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
799     }
800   }
801   return true;
802 }
803 
804 void Prescanner::QuotedCharacterLiteral(
805     TokenSequence &tokens, const char *start) {
806   char quote{*at_};
807   const char *end{at_ + 1};
808   inCharLiteral_ = true;
809   continuationInCharLiteral_ = true;
810   const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
811   const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
812   bool isEscaped{false};
813   bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
814   while (true) {
815     if (*at_ == '\\') {
816       if (escapesEnabled) {
817         isEscaped = !isEscaped;
818       } else {
819         // The parser always processes escape sequences, so don't confuse it
820         // when escapes are disabled.
821         insert('\\');
822       }
823     } else {
824       isEscaped = false;
825     }
826     EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
827         Encoding::LATIN_1);
828     while (PadOutCharacterLiteral(tokens)) {
829     }
830     if (*at_ == '\n') {
831       if (!inPreprocessorDirective_) {
832         Say(GetProvenanceRange(start, end),
833             "Incomplete character literal"_err_en_US);
834       }
835       break;
836     }
837     // Here's a weird edge case.  When there's a two or more following
838     // continuation lines at this point, and the entire significant part of
839     // the next continuation line is the name of a keyword macro, replace
840     // it in the character literal with its definition.  Example:
841     //   #define FOO foo
842     //   subroutine subr() bind(c, name="my_&
843     //     &FOO&
844     //     &_bar") ...
845     // produces a binding name of "my_foo_bar".
846     while (at_[1] == '&' && nextLine_ < limit_ && !InFixedFormSource()) {
847       const char *idStart{nextLine_};
848       if (const char *amper{SkipWhiteSpace(nextLine_)}; *amper == '&') {
849         idStart = amper + 1;
850       }
851       if (IsLegalIdentifierStart(*idStart)) {
852         std::size_t idLen{1};
853         for (; IsLegalInIdentifier(idStart[idLen]); ++idLen) {
854         }
855         if (idStart[idLen] == '&') {
856           CharBlock id{idStart, idLen};
857           if (preprocessor_.IsNameDefined(id)) {
858             TokenSequence ppTokens;
859             ppTokens.Put(id, GetProvenance(idStart));
860             if (auto replaced{
861                     preprocessor_.MacroReplacement(ppTokens, *this)}) {
862               tokens.Put(*replaced);
863               at_ = &idStart[idLen - 1];
864               NextLine();
865               continue; // try again on the next line
866             }
867           }
868         }
869       }
870       break;
871     }
872     end = at_ + 1;
873     NextChar();
874     if (*at_ == quote && !isEscaped) {
875       // A doubled unescaped quote mark becomes a single instance of that
876       // quote character in the literal (later).  There can be spaces between
877       // the quotes in fixed form source.
878       EmitChar(tokens, quote);
879       inCharLiteral_ = false; // for cases like print *, '...'!comment
880       NextChar();
881       if (InFixedFormSource()) {
882         SkipSpaces();
883       }
884       if (*at_ != quote) {
885         break;
886       }
887       inCharLiteral_ = true;
888     }
889   }
890   continuationInCharLiteral_ = false;
891   inCharLiteral_ = false;
892 }
893 
894 void Prescanner::Hollerith(
895     TokenSequence &tokens, int count, const char *start) {
896   inCharLiteral_ = true;
897   CHECK(*at_ == 'h' || *at_ == 'H');
898   EmitChar(tokens, 'H');
899   while (count-- > 0) {
900     if (PadOutCharacterLiteral(tokens)) {
901     } else if (*at_ == '\n') {
902       if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
903         Say(GetProvenanceRange(start, at_),
904             "Possible truncated Hollerith literal"_warn_en_US);
905       }
906       break;
907     } else {
908       NextChar();
909       // Each multi-byte character encoding counts as a single character.
910       // No escape sequences are recognized.
911       // Hollerith is always emitted to the cooked character
912       // stream in UTF-8.
913       DecodedCharacter decoded{DecodeCharacter(
914           encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
915       if (decoded.bytes > 0) {
916         EncodedCharacter utf8{
917             EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
918         for (int j{0}; j < utf8.bytes; ++j) {
919           EmitChar(tokens, utf8.buffer[j]);
920         }
921         at_ += decoded.bytes - 1;
922       } else {
923         Say(GetProvenanceRange(start, at_),
924             "Bad character in Hollerith literal"_err_en_US);
925         break;
926       }
927     }
928   }
929   if (*at_ != '\n') {
930     NextChar();
931   }
932   inCharLiteral_ = false;
933 }
934 
935 // In fixed form, source card images must be processed as if they were at
936 // least 72 columns wide, at least in character literal contexts.
937 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
938   while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
939     if (column_ < fixedFormColumnLimit_) {
940       tokens.PutNextTokenChar(' ', spaceProvenance_);
941       ++column_;
942       return true;
943     }
944     if (!FixedFormContinuation(false /*no need to insert space*/) ||
945         tabInCurrentLine_) {
946       return false;
947     }
948     CHECK(column_ == 7);
949     --at_; // point to column 6 of continuation line
950     column_ = 6;
951   }
952   return false;
953 }
954 
955 static bool IsAtProcess(const char *p) {
956   static const char pAtProc[]{"process"};
957   for (std::size_t i{0}; i < sizeof pAtProc - 1; ++i) {
958     if (ToLowerCaseLetter(*++p) != pAtProc[i])
959       return false;
960   }
961   return true;
962 }
963 
964 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
965   const char *p{start};
966 
967   // The @process directive must start in column 1.
968   if (*p == '@' && IsAtProcess(p)) {
969     return true;
970   }
971 
972   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
973       ((*p == 'D' || *p == 'd') &&
974           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
975     return true;
976   }
977   bool anyTabs{false};
978   while (true) {
979     if (*p == ' ') {
980       ++p;
981     } else if (*p == '\t') {
982       anyTabs = true;
983       ++p;
984     } else if (*p == '0' && !anyTabs && p == start + 5) {
985       ++p; // 0 in column 6 must treated as a space
986     } else {
987       break;
988     }
989   }
990   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
991     return true;
992   }
993   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
994     return true;
995   }
996   return *p == '\n';
997 }
998 
999 const char *Prescanner::IsFreeFormComment(const char *p) const {
1000   p = SkipWhiteSpaceAndCComments(p);
1001   if (*p == '!' || *p == '\n') {
1002     return p;
1003   } else if (*p == '@') {
1004     return IsAtProcess(p) ? p : nullptr;
1005   } else {
1006     return nullptr;
1007   }
1008 }
1009 
1010 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
1011   const char *p{SkipWhiteSpace(start)};
1012   if (*p == '0' && inFixedForm_ && p == start + 5) {
1013     // Accept "     0INCLUDE" in fixed form.
1014     p = SkipWhiteSpace(p + 1);
1015   }
1016   for (const char *q{"include"}; *q; ++q) {
1017     if (ToLowerCaseLetter(*p) != *q) {
1018       return std::nullopt;
1019     }
1020     p = SkipWhiteSpace(p + 1);
1021   }
1022   if (IsDecimalDigit(*p)) { // accept & ignore a numeric kind prefix
1023     for (p = SkipWhiteSpace(p + 1); IsDecimalDigit(*p);
1024          p = SkipWhiteSpace(p + 1)) {
1025     }
1026     if (*p != '_') {
1027       return std::nullopt;
1028     }
1029     p = SkipWhiteSpace(p + 1);
1030   }
1031   if (*p == '"' || *p == '\'') {
1032     return {p - start};
1033   }
1034   return std::nullopt;
1035 }
1036 
1037 void Prescanner::FortranInclude(const char *firstQuote) {
1038   const char *p{firstQuote};
1039   while (*p != '"' && *p != '\'') {
1040     ++p;
1041   }
1042   char quote{*p};
1043   std::string path;
1044   for (++p; *p != '\n'; ++p) {
1045     if (*p == quote) {
1046       if (p[1] != quote) {
1047         break;
1048       }
1049       ++p;
1050     }
1051     path += *p;
1052   }
1053   if (*p != quote) {
1054     Say(GetProvenanceRange(firstQuote, p),
1055         "malformed path name string"_err_en_US);
1056     return;
1057   }
1058   p = SkipWhiteSpace(p + 1);
1059   if (*p != '\n' && *p != '!') {
1060     const char *garbage{p};
1061     for (; *p != '\n' && *p != '!'; ++p) {
1062     }
1063     if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
1064       Say(GetProvenanceRange(garbage, p),
1065           "excess characters after path name"_warn_en_US);
1066     }
1067   }
1068   std::string buf;
1069   llvm::raw_string_ostream error{buf};
1070   Provenance provenance{GetProvenance(nextLine_)};
1071   std::optional<std::string> prependPath;
1072   if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
1073     prependPath = DirectoryName(currentFile->path());
1074   }
1075   const SourceFile *included{
1076       allSources_.Open(path, error, std::move(prependPath))};
1077   if (!included) {
1078     Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
1079   } else if (included->bytes() > 0) {
1080     ProvenanceRange includeLineRange{
1081         provenance, static_cast<std::size_t>(p - nextLine_)};
1082     ProvenanceRange fileRange{
1083         allSources_.AddIncludedFile(*included, includeLineRange)};
1084     Prescanner{*this, /*isNestedInIncludeDirective=*/false}
1085         .set_encoding(included->encoding())
1086         .Prescan(fileRange);
1087   }
1088 }
1089 
1090 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
1091   const char *p{start};
1092   for (; *p == ' '; ++p) {
1093   }
1094   if (*p == '#') {
1095     if (inFixedForm_ && p == start + 5) {
1096       return nullptr;
1097     }
1098   } else {
1099     p = SkipWhiteSpace(p);
1100     if (*p != '#') {
1101       return nullptr;
1102     }
1103   }
1104   return SkipWhiteSpace(p + 1);
1105 }
1106 
1107 bool Prescanner::IsNextLinePreprocessorDirective() const {
1108   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
1109 }
1110 
1111 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
1112   if (IsAtEnd()) {
1113     if (afterAmpersand && prescannerNesting_ > 0) {
1114       // A continuation marker at the end of the last line in an
1115       // include file inhibits the newline for that line.
1116       SkipToEndOfLine();
1117       omitNewline_ = true;
1118     }
1119   } else if (inPreprocessorDirective_) {
1120   } else {
1121     auto lineClass{ClassifyLine(nextLine_)};
1122     if (lineClass.kind == LineClassification::Kind::Comment) {
1123       NextLine();
1124       return true;
1125     } else if (lineClass.kind ==
1126             LineClassification::Kind::ConditionalCompilationDirective ||
1127         lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
1128       // Allow conditional compilation directives (e.g., #ifdef) to affect
1129       // continuation lines.
1130       // Allow other preprocessor directives, too, except #include
1131       // (when it does not follow '&'), #define, and #undef (because
1132       // they cannot be allowed to affect preceding text on a
1133       // continued line).
1134       preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
1135       return true;
1136     } else if (afterAmpersand &&
1137         (lineClass.kind == LineClassification::Kind::DefinitionDirective ||
1138             lineClass.kind == LineClassification::Kind::IncludeDirective ||
1139             lineClass.kind == LineClassification::Kind::IncludeLine)) {
1140       SkipToEndOfLine();
1141       omitNewline_ = true;
1142       skipLeadingAmpersand_ = true;
1143     }
1144   }
1145   return false;
1146 }
1147 
1148 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
1149   if (IsAtEnd()) {
1150     return nullptr;
1151   }
1152   tabInCurrentLine_ = false;
1153   char col1{*nextLine_};
1154   if (IsFixedFormCommentChar(col1)) {
1155     int j{1};
1156     if (InCompilerDirective()) {
1157       // Must be a continued compiler directive.
1158       for (; j < 5; ++j) {
1159         char ch{directiveSentinel_[j - 1]};
1160         if (ch == '\0') {
1161           break;
1162         }
1163         if (ch != ToLowerCaseLetter(nextLine_[j])) {
1164           return nullptr;
1165         }
1166       }
1167     } else if (features_.IsEnabled(LanguageFeature::OpenMP)) {
1168       // Fixed Source Form Conditional Compilation Sentinels.
1169       if (nextLine_[1] != '$') {
1170         return nullptr;
1171       }
1172       j++;
1173     } else {
1174       return nullptr;
1175     }
1176     for (; j < 5; ++j) {
1177       if (nextLine_[j] != ' ') {
1178         return nullptr;
1179       }
1180     }
1181     char col6{nextLine_[5]};
1182     if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
1183       if (nextLine_[6] != ' ' && mightNeedSpace) {
1184         insertASpace_ = true;
1185       }
1186       return nextLine_ + 6;
1187     }
1188     return nullptr;
1189   } else {
1190     // Normal case: not in a compiler directive.
1191     if (col1 == '&' &&
1192         features_.IsEnabled(
1193             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
1194       // Extension: '&' as continuation marker
1195       if (features_.ShouldWarn(
1196               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
1197         Say(GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
1198       }
1199       return nextLine_ + 1;
1200     }
1201     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
1202       tabInCurrentLine_ = true;
1203       return nextLine_ + 2; // VAX extension
1204     }
1205     if ((col1 == ' ' ||
1206             ((col1 == 'D' || col1 == 'd') &&
1207                 features_.IsEnabled(LanguageFeature::OldDebugLines))) &&
1208         nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' &&
1209         nextLine_[4] == ' ') {
1210       char col6{nextLine_[5]};
1211       if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
1212         if ((col6 == 'i' || col6 == 'I') && IsIncludeLine(nextLine_)) {
1213           // It's An INCLUDE line, not a continuation
1214         } else {
1215           return nextLine_ + 6;
1216         }
1217       }
1218     }
1219     if (IsImplicitContinuation()) {
1220       return nextLine_;
1221     }
1222   }
1223   return nullptr; // not a continuation line
1224 }
1225 
1226 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
1227   const char *p{nextLine_};
1228   if (p >= limit_) {
1229     return nullptr;
1230   }
1231   p = SkipWhiteSpace(p);
1232   if (InCompilerDirective()) {
1233     if (*p++ != '!') {
1234       return nullptr;
1235     }
1236     for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
1237       if (*s != ToLowerCaseLetter(*p)) {
1238         return nullptr;
1239       }
1240     }
1241     p = SkipWhiteSpace(p);
1242     if (*p == '&') {
1243       if (!ampersand) {
1244         insertASpace_ = true;
1245       }
1246       return p + 1;
1247     } else if (ampersand) {
1248       return p;
1249     } else {
1250       return nullptr;
1251     }
1252   } else {
1253     if (*p == '&') {
1254       return p + 1;
1255     } else if (*p == '!' || *p == '\n' || *p == '#') {
1256       return nullptr;
1257     } else if (ampersand || IsImplicitContinuation()) {
1258       if (continuationInCharLiteral_) {
1259         // 'a'&            -> 'a''b' == "a'b"
1260         //   'b'
1261         if (features_.ShouldWarn(
1262                 common::LanguageFeature::MiscSourceExtensions)) {
1263           Say(GetProvenanceRange(p, p + 1),
1264               "Character literal continuation line should have been preceded by '&'"_port_en_US);
1265         }
1266       } else if (p > nextLine_) {
1267         --p;
1268       } else {
1269         insertASpace_ = true;
1270       }
1271       return p;
1272     } else {
1273       return nullptr;
1274     }
1275   }
1276 }
1277 
1278 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
1279   // N.B. We accept '&' as a continuation indicator in fixed form, too,
1280   // but not in a character literal.
1281   if (*at_ == '&' && inCharLiteral_) {
1282     return false;
1283   }
1284   do {
1285     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
1286       BeginSourceLine(cont);
1287       column_ = 7;
1288       NextLine();
1289       return true;
1290     }
1291   } while (SkipCommentLine(false /* not after ampersand */));
1292   return false;
1293 }
1294 
1295 bool Prescanner::FreeFormContinuation() {
1296   const char *p{at_};
1297   bool ampersand{*p == '&'};
1298   if (ampersand) {
1299     p = SkipWhiteSpace(p + 1);
1300   }
1301   if (*p != '\n') {
1302     if (inCharLiteral_) {
1303       return false;
1304     } else if (*p == '!') { // & ! comment - ok
1305     } else if (ampersand && isPossibleMacroCall_ && (*p == ',' || *p == ')')) {
1306       return false; // allow & at end of a macro argument
1307     } else if (features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
1308       Say(GetProvenance(p), "missing ! before comment after &"_warn_en_US);
1309     }
1310   }
1311   do {
1312     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
1313       BeginSourceLine(cont);
1314       NextLine();
1315       return true;
1316     }
1317   } while (SkipCommentLine(ampersand));
1318   return false;
1319 }
1320 
1321 // Implicit line continuation allows a preprocessor macro call with
1322 // arguments to span multiple lines.
1323 bool Prescanner::IsImplicitContinuation() const {
1324   return !inPreprocessorDirective_ && !inCharLiteral_ && isPossibleMacroCall_ &&
1325       parenthesisNesting_ > 0 && !IsAtEnd() &&
1326       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
1327 }
1328 
1329 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
1330   if (disableSourceContinuation_) {
1331     return false;
1332   } else if (*at_ == '\n' || *at_ == '&') {
1333     if (inFixedForm_) {
1334       return FixedFormContinuation(mightNeedFixedFormSpace);
1335     } else {
1336       return FreeFormContinuation();
1337     }
1338   } else if (*at_ == '\\' && at_ + 2 == nextLine_ &&
1339       backslashFreeFormContinuation_ && !inFixedForm_ && nextLine_ < limit_) {
1340     // cpp-like handling of \ at end of a free form source line
1341     BeginSourceLine(nextLine_);
1342     NextLine();
1343     return true;
1344   } else {
1345     return false;
1346   }
1347 }
1348 
1349 std::optional<Prescanner::LineClassification>
1350 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1351   const char *p{start};
1352   char col1{*p++};
1353   if (!IsFixedFormCommentChar(col1)) {
1354     return std::nullopt;
1355   }
1356   char sentinel[5], *sp{sentinel};
1357   int column{2};
1358   for (; column < 6; ++column, ++p) {
1359     if (*p == ' ' || *p == '\n' || *p == '\t') {
1360       break;
1361     }
1362     if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1363       // OpenMP conditional compilation line: leave the label alone
1364       break;
1365     }
1366     *sp++ = ToLowerCaseLetter(*p);
1367   }
1368   if (column == 6) {
1369     if (*p == ' ' || *p == '\t' || *p == '0') {
1370       ++p;
1371     } else {
1372       // This is a Continuation line, not an initial directive line.
1373       return std::nullopt;
1374     }
1375   }
1376   if (sp == sentinel) {
1377     return std::nullopt;
1378   }
1379   *sp = '\0';
1380   if (const char *ss{IsCompilerDirectiveSentinel(
1381           sentinel, static_cast<std::size_t>(sp - sentinel))}) {
1382     std::size_t payloadOffset = p - start;
1383     return {LineClassification{
1384         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1385   }
1386   return std::nullopt;
1387 }
1388 
1389 std::optional<Prescanner::LineClassification>
1390 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1391   if (const char *p{SkipWhiteSpace(start)}; p && *p++ == '!') {
1392     if (auto maybePair{IsCompilerDirectiveSentinel(p)}) {
1393       auto offset{static_cast<std::size_t>(maybePair->second - start)};
1394       return {LineClassification{LineClassification::Kind::CompilerDirective,
1395           offset, maybePair->first}};
1396     }
1397   }
1398   return std::nullopt;
1399 }
1400 
1401 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1402   std::uint64_t packed{0};
1403   for (char ch : dir) {
1404     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1405   }
1406   compilerDirectiveBloomFilter_.set(packed % prime1);
1407   compilerDirectiveBloomFilter_.set(packed % prime2);
1408   compilerDirectiveSentinels_.insert(dir);
1409   return *this;
1410 }
1411 
1412 const char *Prescanner::IsCompilerDirectiveSentinel(
1413     const char *sentinel, std::size_t len) const {
1414   std::uint64_t packed{0};
1415   for (std::size_t j{0}; j < len; ++j) {
1416     packed = (packed << 8) | (sentinel[j] & 0xff);
1417   }
1418   if (len == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1419       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1420     return nullptr;
1421   }
1422   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, len))};
1423   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1424 }
1425 
1426 const char *Prescanner::IsCompilerDirectiveSentinel(CharBlock token) const {
1427   const char *p{token.begin()};
1428   const char *end{p + token.size()};
1429   while (p < end && (*p == ' ' || *p == '\n')) {
1430     ++p;
1431   }
1432   if (p < end && *p == '!') {
1433     ++p;
1434   }
1435   while (end > p && (end[-1] == ' ' || end[-1] == '\t')) {
1436     --end;
1437   }
1438   return end > p && IsCompilerDirectiveSentinel(p, end - p) ? p : nullptr;
1439 }
1440 
1441 std::optional<std::pair<const char *, const char *>>
1442 Prescanner::IsCompilerDirectiveSentinel(const char *p) const {
1443   char sentinel[8];
1444   for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) {
1445     if (*p == ' ' || *p == '\t' || *p == '&') {
1446       if (j > 0) {
1447         sentinel[j] = '\0';
1448         p = SkipWhiteSpace(p + 1);
1449         if (*p != '!') {
1450           if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) {
1451             return std::make_pair(sp, p);
1452           }
1453         }
1454       }
1455       break;
1456     } else {
1457       sentinel[j] = ToLowerCaseLetter(*p);
1458     }
1459   }
1460   return std::nullopt;
1461 }
1462 
1463 constexpr bool IsDirective(const char *match, const char *dir) {
1464   for (; *match; ++match) {
1465     if (*match != ToLowerCaseLetter(*dir++)) {
1466       return false;
1467     }
1468   }
1469   return true;
1470 }
1471 
1472 Prescanner::LineClassification Prescanner::ClassifyLine(
1473     const char *start) const {
1474   if (inFixedForm_) {
1475     if (std::optional<LineClassification> lc{
1476             IsFixedFormCompilerDirectiveLine(start)}) {
1477       return std::move(*lc);
1478     }
1479     if (IsFixedFormCommentLine(start)) {
1480       return {LineClassification::Kind::Comment};
1481     }
1482   } else {
1483     if (std::optional<LineClassification> lc{
1484             IsFreeFormCompilerDirectiveLine(start)}) {
1485       return std::move(*lc);
1486     }
1487     if (const char *bang{IsFreeFormComment(start)}) {
1488       return {LineClassification::Kind::Comment,
1489           static_cast<std::size_t>(bang - start)};
1490     }
1491   }
1492   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1493     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1494   }
1495   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1496     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1497         IsDirective("else", dir) || IsDirective("endif", dir)) {
1498       return {LineClassification::Kind::ConditionalCompilationDirective};
1499     } else if (IsDirective("include", dir)) {
1500       return {LineClassification::Kind::IncludeDirective};
1501     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1502       return {LineClassification::Kind::DefinitionDirective};
1503     } else {
1504       return {LineClassification::Kind::PreprocessorDirective};
1505     }
1506   }
1507   return {LineClassification::Kind::Source};
1508 }
1509 
1510 Prescanner::LineClassification Prescanner::ClassifyLine(
1511     TokenSequence &tokens, Provenance newlineProvenance) const {
1512   // Append a newline temporarily.
1513   tokens.PutNextTokenChar('\n', newlineProvenance);
1514   tokens.CloseToken();
1515   const char *ppd{tokens.ToCharBlock().begin()};
1516   LineClassification classification{ClassifyLine(ppd)};
1517   tokens.pop_back(); // remove the newline
1518   return classification;
1519 }
1520 
1521 void Prescanner::SourceFormChange(std::string &&dir) {
1522   if (dir == "!dir$ free") {
1523     inFixedForm_ = false;
1524   } else if (dir == "!dir$ fixed") {
1525     inFixedForm_ = true;
1526   }
1527 }
1528 
1529 // Acquire and append compiler directive continuation lines to
1530 // the tokens that constitute a compiler directive, even when those
1531 // directive continuation lines are the result of macro expansion.
1532 // (Not used when neither the original compiler directive line nor
1533 // the directive continuation line result from preprocessing; regular
1534 // line continuation during tokenization handles that normal case.)
1535 bool Prescanner::CompilerDirectiveContinuation(
1536     TokenSequence &tokens, const char *origSentinel) {
1537   if (inFixedForm_ || tokens.empty() ||
1538       tokens.TokenAt(tokens.SizeInTokens() - 1) != "&") {
1539     return false;
1540   }
1541   LineClassification followingLine{ClassifyLine(nextLine_)};
1542   if (followingLine.kind == LineClassification::Kind::Comment) {
1543     nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
1544     NextLine();
1545     return true;
1546   }
1547   CHECK(origSentinel != nullptr);
1548   directiveSentinel_ = origSentinel; // so InCompilerDirective() is true
1549   const char *nextContinuation{
1550       followingLine.kind == LineClassification::Kind::CompilerDirective
1551           ? FreeFormContinuationLine(true)
1552           : nullptr};
1553   if (!nextContinuation &&
1554       followingLine.kind != LineClassification::Kind::Source) {
1555     return false;
1556   }
1557   auto origNextLine{nextLine_};
1558   BeginSourceLine(nextLine_);
1559   NextLine();
1560   if (nextContinuation) {
1561     // What follows is !DIR$ & xxx; skip over the & so that it
1562     // doesn't cause a spurious continuation.
1563     at_ = nextContinuation;
1564   } else {
1565     // What follows looks like a source line before macro expansion,
1566     // but might become a directive continuation afterwards.
1567     SkipSpaces();
1568   }
1569   TokenSequence followingTokens;
1570   while (NextToken(followingTokens)) {
1571   }
1572   if (auto followingPrepro{
1573           preprocessor_.MacroReplacement(followingTokens, *this)}) {
1574     followingTokens = std::move(*followingPrepro);
1575   }
1576   followingTokens.RemoveRedundantBlanks();
1577   std::size_t startAt{0};
1578   std::size_t following{followingTokens.SizeInTokens()};
1579   bool ok{false};
1580   if (nextContinuation) {
1581     ok = true;
1582   } else {
1583     startAt = 2;
1584     if (startAt < following && followingTokens.TokenAt(0) == "!") {
1585       CharBlock sentinel{followingTokens.TokenAt(1)};
1586       if (!sentinel.empty() &&
1587           std::memcmp(sentinel.begin(), origSentinel, sentinel.size()) == 0) {
1588         ok = true;
1589         while (
1590             startAt < following && followingTokens.TokenAt(startAt).IsBlank()) {
1591           ++startAt;
1592         }
1593         if (startAt < following && followingTokens.TokenAt(startAt) == "&") {
1594           ++startAt;
1595         }
1596       }
1597     }
1598   }
1599   if (ok) {
1600     tokens.pop_back(); // delete original '&'
1601     tokens.Put(followingTokens, startAt, following - startAt);
1602     tokens.RemoveRedundantBlanks();
1603   } else {
1604     nextLine_ = origNextLine;
1605   }
1606   return ok;
1607 }
1608 
1609 // Similar, but for source line continuation after macro replacement.
1610 bool Prescanner::SourceLineContinuation(TokenSequence &tokens) {
1611   if (!inFixedForm_ && !tokens.empty() &&
1612       tokens.TokenAt(tokens.SizeInTokens() - 1) == "&") {
1613     LineClassification followingLine{ClassifyLine(nextLine_)};
1614     if (followingLine.kind == LineClassification::Kind::Comment) {
1615       nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
1616       NextLine();
1617       return true;
1618     } else if (const char *nextContinuation{FreeFormContinuationLine(true)}) {
1619       BeginSourceLine(nextLine_);
1620       NextLine();
1621       TokenSequence followingTokens;
1622       at_ = nextContinuation;
1623       while (NextToken(followingTokens)) {
1624       }
1625       if (auto followingPrepro{
1626               preprocessor_.MacroReplacement(followingTokens, *this)}) {
1627         followingTokens = std::move(*followingPrepro);
1628       }
1629       followingTokens.RemoveRedundantBlanks();
1630       tokens.pop_back(); // delete original '&'
1631       tokens.Put(followingTokens);
1632       return true;
1633     }
1634   }
1635   return false;
1636 }
1637 } // namespace Fortran::parser
1638