//===-- lib/Parser/token-sequence.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Parser/token-sequence.h"

#include "prescan.h"
#include "flang/Parser/characters.h"
#include "flang/Parser/message.h"
#include "llvm/Support/raw_ostream.h"

namespace Fortran::parser {

TokenSequence &TokenSequence::operator=(TokenSequence &&that) {
  clear();
  swap(that);
  return *this;
}

void TokenSequence::clear() {
  start_.clear();
  nextStart_ = 0;
  char_.clear();
  provenances_.clear();
}

void TokenSequence::pop_back() {
  CHECK(!start_.empty());
  CHECK(nextStart_ > start_.back());
  std::size_t bytes{nextStart_ - start_.back()};
  nextStart_ = start_.back();
  start_.pop_back();
  char_.resize(nextStart_);
  provenances_.RemoveLastBytes(bytes);
}

void TokenSequence::shrink_to_fit() {
  start_.shrink_to_fit();
  char_.shrink_to_fit();
  provenances_.shrink_to_fit();
}

void TokenSequence::swap(TokenSequence &that) {
  start_.swap(that.start_);
  std::swap(nextStart_, that.nextStart_);
  char_.swap(that.char_);
  provenances_.swap(that.provenances_);
}

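// Returns the index of the first non-blank token at or after 'at',
// or the total token count if there is none.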
std::size_t TokenSequence::SkipBlanks(std::size_t at) const {
  std::size_t tokens{start_.size()};
  for (; at < tokens; ++at) {
    if (!TokenAt(at).IsBlank()) {
      return at;
    }
  }
  return tokens; // even if at > tokens
}

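// Returns the index of the last non-blank token strictly before 'at',
// if there is one.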
std::optional<std::size_t> TokenSequence::SkipBlanksBackwards(
    std::size_t at) const {
  while (at-- > 0) {
    if (!TokenAt(at).IsBlank()) {
      return at;
    }
  }
  return std::nullopt;
}

// C-style /*comments*/ are removed from preprocessing directive
// token sequences by the prescanner, but not C++ or Fortran
// free-form line-ending comments (//... and !...) because
// ignoring them is directive-specific.
bool TokenSequence::IsAnythingLeft(std::size_t at) const {
  std::size_t tokens{start_.size()};
  for (; at < tokens; ++at) {
    auto tok{TokenAt(at)};
    const char *end{tok.end()};
    for (const char *p{tok.begin()}; p < end; ++p) {
      switch (*p) {
      case '/':
        return p + 1 >= end || p[1] != '/';
      case '!':
        return false;
      case ' ':
        break;
      default:
        return true;
      }
    }
  }
  return false;
}

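// Appends all of the tokens of 'that'.  Any partially built token in this
// sequence is closed first, and the copied token start offsets are rebased
// onto the end of this sequence's character buffer.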
void TokenSequence::Put(const TokenSequence &that) {
  if (nextStart_ < char_.size()) {
    start_.push_back(nextStart_);
  }
  std::size_t offset{char_.size()};
  for (std::size_t st : that.start_) {
    start_.push_back(st + offset);
  }
  char_.insert(char_.end(), that.char_.begin(), that.char_.end());
  nextStart_ = char_.size();
  provenances_.Put(that.provenances_);
}

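// Appends the tokens of 'that', assigning them provenance from successive
// offsets of 'range'; the range must cover exactly the characters copied.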
void TokenSequence::Put(const TokenSequence &that, ProvenanceRange range) {
  std::size_t offset{0};
  std::size_t tokens{that.SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock tok{that.TokenAt(j)};
    Put(tok, range.OffsetMember(offset));
    offset += tok.size();
  }
  CHECK(offset == range.size());
}

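// Copies 'tokens' tokens from 'that', starting at token index 'at',
// preserving each character's original provenance.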
void TokenSequence::Put(
    const TokenSequence &that, std::size_t at, std::size_t tokens) {
  ProvenanceRange provenance;
  std::size_t offset{0};
  for (; tokens-- > 0; ++at) {
    CharBlock tok{that.TokenAt(at)};
    std::size_t tokBytes{tok.size()};
    for (std::size_t j{0}; j < tokBytes; ++j) {
      if (offset == provenance.size()) {
        provenance = that.provenances_.Map(that.start_[at] + j);
        offset = 0;
      }
      PutNextTokenChar(tok[j], provenance.OffsetMember(offset++));
    }
    CloseToken();
  }
}

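// Appends a single token built from the 'bytes' characters at 's', with
// contiguous provenance beginning at 'provenance'.  A minimal usage sketch
// (the Provenance value 'where' below is a placeholder, not defined here):
//   TokenSequence tokens;
//   tokens.Put("call", 4, where);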
void TokenSequence::Put(
    const char *s, std::size_t bytes, Provenance provenance) {
  for (std::size_t j{0}; j < bytes; ++j) {
    PutNextTokenChar(s[j], provenance + j);
  }
  CloseToken();
}

void TokenSequence::Put(const CharBlock &t, Provenance provenance) {
  // Avoid t[0] if t is empty: it would create a reference to nullptr,
  // which is UB.
  const char *addr{t.size() ? &t[0] : nullptr};
  Put(addr, t.size(), provenance);
}

void TokenSequence::Put(const std::string &s, Provenance provenance) {
  Put(s.data(), s.size(), provenance);
}

void TokenSequence::Put(llvm::raw_string_ostream &ss, Provenance provenance) {
  Put(ss.str(), provenance);
}

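// Converts the tokens to lower case in place, except for the contents of
// character literals and Hollerith constants.  Returns *this.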
TokenSequence &TokenSequence::ToLowerCase() {
  std::size_t tokens{start_.size()};
  std::size_t chars{char_.size()};
  std::size_t atToken{0};
  for (std::size_t j{0}; j < chars;) {
    std::size_t nextStart{atToken + 1 < tokens ? start_[++atToken] : chars};
    char *p{&char_[j]};
    char const *limit{char_.data() + nextStart};
    const char *lastChar{limit - 1};
    j = nextStart;
    // Skip leading whitespace
    while (p < limit - 1 && *p == ' ') {
      ++p;
    }
    // Find last non-whitespace char
    while (lastChar > p + 1 && *lastChar == ' ') {
      --lastChar;
    }
    if (IsDecimalDigit(*p)) {
      while (p < limit && IsDecimalDigit(*p)) {
        ++p;
      }
      if (p >= limit) {
      } else if (*p == 'h' || *p == 'H') {
        // Hollerith
        *p = 'h';
      } else if (*p == '_') {
        // kind-prefixed character literal (e.g., 1_"ABC")
      } else {
        // exponent
        for (; p < limit; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      }
    } else if (*lastChar == '\'' || *lastChar == '"') {
      if (*p == *lastChar) {
        // Character literal without prefix
      } else if (p[1] == *lastChar) {
        // BOZX-prefixed constant
        for (; p < limit; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      } else {
        // Literal with kind-param prefix name (e.g., K_"ABC").
        for (; *p != *lastChar; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      }
    } else {
      for (; p < limit; ++p) {
        *p = ToLowerCaseLetter(*p);
      }
    }
  }
  return *this;
}

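// True if any blank token begins at or after character offset 'firstChar'.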
bool TokenSequence::HasBlanks(std::size_t firstChar) const {
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    if (start_[j] >= firstChar && TokenAt(j).IsBlank()) {
      return true;
    }
  }
  return false;
}

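// True if a blank token that begins at or after character offset 'firstChar'
// immediately follows another blank token.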
bool TokenSequence::HasRedundantBlanks(std::size_t firstChar) const {
  std::size_t tokens{SizeInTokens()};
  bool lastWasBlank{false};
  for (std::size_t j{0}; j < tokens; ++j) {
    bool isBlank{TokenAt(j).IsBlank()};
    if (isBlank && lastWasBlank && start_[j] >= firstChar) {
      return true;
    }
    lastWasBlank = isBlank;
  }
  return false;
}

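// Removes the blank tokens that begin at or after character offset
// 'firstChar'.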
TokenSequence &TokenSequence::RemoveBlanks(std::size_t firstChar) {
  std::size_t tokens{SizeInTokens()};
  TokenSequence result;
  for (std::size_t j{0}; j < tokens; ++j) {
    if (!TokenAt(j).IsBlank() || start_[j] < firstChar) {
      result.Put(*this, j);
    }
  }
  swap(result);
  return *this;
}

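// Collapses runs of consecutive blank tokens at or after character offset
// 'firstChar' down to a single blank.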
TokenSequence &TokenSequence::RemoveRedundantBlanks(std::size_t firstChar) {
  std::size_t tokens{SizeInTokens()};
  TokenSequence result;
  bool lastWasBlank{false};
  for (std::size_t j{0}; j < tokens; ++j) {
    bool isBlank{TokenAt(j).IsBlank()};
    if (!isBlank || !lastWasBlank || start_[j] < firstChar) {
      result.Put(*this, j);
    }
    lastWasBlank = isBlank;
  }
  swap(result);
  return *this;
}

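// Truncates the sequence when a '!' comment that is not a recognized
// compiler directive sentinel is found; when 'skipFirst' is set, the first
// such comment is retained and scanning continues.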
TokenSequence &TokenSequence::ClipComment(
    const Prescanner &prescanner, bool skipFirst) {
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock tok{TokenAt(j)};
    if (std::size_t blanks{tok.CountLeadingBlanks()};
        blanks < tok.size() && tok[blanks] == '!') {
      // Retain active compiler directive sentinels (e.g. "!dir$")
      for (std::size_t k{j + 1}; k < tokens && tok.size() <= blanks + 5; ++k) {
        if (tok.begin() + tok.size() == TokenAt(k).begin()) {
          tok.ExtendToCover(TokenAt(k));
        } else {
          break;
        }
      }
      bool isSentinel{false};
      if (tok.size() > blanks + 5) {
        isSentinel = prescanner.IsCompilerDirectiveSentinel(&tok[blanks + 1])
                         .has_value();
      }
      if (isSentinel) {
      } else if (skipFirst) {
        skipFirst = false;
      } else {
        TokenSequence result;
        if (j > 0) {
          result.Put(*this, 0, j - 1);
        }
        swap(result);
        return *this;
      }
    }
  }
  return *this;
}

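// Appends this sequence's characters and their provenance mappings to the
// cooked character stream.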
void TokenSequence::Emit(CookedSource &cooked) const {
  if (auto n{char_.size()}) {
    cooked.Put(&char_[0], n);
    cooked.PutProvenanceMappings(provenances_);
  }
}

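// Dumps the token boundaries and text for debugging.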
llvm::raw_ostream &TokenSequence::Dump(llvm::raw_ostream &o) const {
  o << "TokenSequence has " << char_.size() << " chars; nextStart_ "
    << nextStart_ << '\n';
  for (std::size_t j{0}; j < start_.size(); ++j) {
    o << '[' << j << "] @ " << start_[j] << " '" << TokenAt(j).ToString()
      << "'\n";
  }
  return o;
}

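// Maps a character offset in this sequence back to the provenance of its
// original source character.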
Provenance TokenSequence::GetCharProvenance(std::size_t offset) const {
  ProvenanceRange range{provenances_.Map(offset)};
  return range.start();
}

Provenance TokenSequence::GetTokenProvenance(
    std::size_t token, std::size_t offset) const {
  return GetCharProvenance(start_[token] + offset);
}

ProvenanceRange TokenSequence::GetTokenProvenanceRange(
    std::size_t token, std::size_t offset) const {
  ProvenanceRange range{provenances_.Map(start_[token] + offset)};
  return range.Prefix(TokenBytes(token) - offset);
}

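// Returns a provenance range beginning at 'token' and extending over up to
// 'tokens' tokens, stopping at the first break in contiguous provenance.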
ProvenanceRange TokenSequence::GetIntervalProvenanceRange(
    std::size_t token, std::size_t tokens) const {
  if (tokens == 0) {
    return {};
  }
  ProvenanceRange range{provenances_.Map(start_[token])};
  while (--tokens > 0 &&
      range.AnnexIfPredecessor(provenances_.Map(start_[++token]))) {
  }
  return range;
}

ProvenanceRange TokenSequence::GetProvenanceRange() const {
  return GetIntervalProvenanceRange(0, start_.size());
}

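// Emits an error for each token that begins with a character that is not
// valid in Fortran source, allowing compiler directive sentinels and, when
// 'allowAmpersand' is set, ampersands.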
const TokenSequence &TokenSequence::CheckBadFortranCharacters(
    Messages &messages, const Prescanner &prescanner,
    bool allowAmpersand) const {
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock token{TokenAt(j)};
    char ch{token.FirstNonBlank()};
    if (ch != ' ' && !IsValidFortranTokenCharacter(ch)) {
      if (ch == '!') {
        if (prescanner.IsCompilerDirectiveSentinel(token)) {
          continue;
        } else if (j + 1 < tokens &&
            prescanner.IsCompilerDirectiveSentinel(
                TokenAt(j + 1))) { // !dir$, &c.
          ++j;
          continue;
        }
      } else if (ch == '&' && allowAmpersand) {
        continue;
      }
      if (ch < ' ' || ch >= '\x7f') {
        messages.Say(GetTokenProvenanceRange(j),
            "bad character (0x%02x) in Fortran token"_err_en_US, ch & 0xff);
      } else {
        messages.Say(GetTokenProvenanceRange(j),
            "bad character ('%c') in Fortran token"_err_en_US, ch);
      }
    }
  }
  return *this;
}

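// True if the sequence contains unbalanced parentheses or a ')' that
// appears before its matching '('.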
bool TokenSequence::BadlyNestedParentheses() const {
  int nesting{0};
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock token{TokenAt(j)};
    char ch{token.OnlyNonBlank()};
    if (ch == '(') {
      ++nesting;
    } else if (ch == ')') {
      if (nesting-- == 0) {
        break;
      }
    }
  }
  return nesting != 0;
}

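// If the parentheses are badly nested, reports either the first unmatched
// ')' or an unmatched '('.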
const TokenSequence &TokenSequence::CheckBadParentheses(
    Messages &messages) const {
  if (BadlyNestedParentheses()) {
    // There's an error; diagnose it
    std::size_t tokens{SizeInTokens()};
    std::vector<std::size_t> stack;
    for (std::size_t j{0}; j < tokens; ++j) {
      CharBlock token{TokenAt(j)};
      char ch{token.OnlyNonBlank()};
      if (ch == '(') {
        stack.push_back(j);
      } else if (ch == ')') {
        if (stack.empty()) {
          messages.Say(GetTokenProvenanceRange(j), "Unmatched ')'"_err_en_US);
          return *this;
        }
        stack.pop_back();
      }
    }
    CHECK(!stack.empty());
    messages.Say(
        GetTokenProvenanceRange(stack.back()), "Unmatched '('"_err_en_US);
  }
  return *this;
}
} // namespace Fortran::parser