xref: /llvm-project/flang/lib/Parser/token-sequence.cpp (revision cbc5d42fcedace0b9dcfa2e2a91d41e3ce84908c)
1 //===-- lib/Parser/token-sequence.cpp -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "token-sequence.h"
10 #include "prescan.h"
11 #include "flang/Parser/characters.h"
12 #include "flang/Parser/message.h"
13 #include "llvm/Support/raw_ostream.h"
14 
15 namespace Fortran::parser {
16 
// Move assignment: discards this sequence's current contents, then takes
// over "that"'s storage via swap.  "that" is left in the cleared state.
TokenSequence &TokenSequence::operator=(TokenSequence &&that) {
  clear();
  swap(that);
  return *this;
}
22 
23 void TokenSequence::clear() {
24   start_.clear();
25   nextStart_ = 0;
26   char_.clear();
27   provenances_.clear();
28 }
29 
30 void TokenSequence::pop_back() {
31   CHECK(!start_.empty());
32   CHECK(nextStart_ > start_.back());
33   std::size_t bytes{nextStart_ - start_.back()};
34   nextStart_ = start_.back();
35   start_.pop_back();
36   char_.resize(nextStart_);
37   provenances_.RemoveLastBytes(bytes);
38 }
39 
40 void TokenSequence::shrink_to_fit() {
41   start_.shrink_to_fit();
42   char_.shrink_to_fit();
43   provenances_.shrink_to_fit();
44 }
45 
46 void TokenSequence::swap(TokenSequence &that) {
47   start_.swap(that.start_);
48   std::swap(nextStart_, that.nextStart_);
49   char_.swap(that.char_);
50   provenances_.swap(that.provenances_);
51 }
52 
53 std::size_t TokenSequence::SkipBlanks(std::size_t at) const {
54   std::size_t tokens{start_.size()};
55   for (; at < tokens; ++at) {
56     if (!TokenAt(at).IsBlank()) {
57       return at;
58     }
59   }
60   return tokens; // even if at > tokens
61 }
62 
63 // C-style /*comments*/ are removed from preprocessing directive
64 // token sequences by the prescanner, but not C++ or Fortran
65 // free-form line-ending comments (//...  and !...) because
66 // ignoring them is directive-specific.
67 bool TokenSequence::IsAnythingLeft(std::size_t at) const {
68   std::size_t tokens{start_.size()};
69   for (; at < tokens; ++at) {
70     auto tok{TokenAt(at)};
71     const char *end{tok.end()};
72     for (const char *p{tok.begin()}; p < end; ++p) {
73       switch (*p) {
74       case '/':
75         return p + 1 >= end || p[1] != '/';
76       case '!':
77         return false;
78       case ' ':
79         break;
80       default:
81         return true;
82       }
83     }
84   }
85   return false;
86 }
87 
88 void TokenSequence::Put(const TokenSequence &that) {
89   if (nextStart_ < char_.size()) {
90     start_.push_back(nextStart_);
91   }
92   int offset = char_.size();
93   for (int st : that.start_) {
94     start_.push_back(st + offset);
95   }
96   char_.insert(char_.end(), that.char_.begin(), that.char_.end());
97   nextStart_ = char_.size();
98   provenances_.Put(that.provenances_);
99 }
100 
101 void TokenSequence::Put(const TokenSequence &that, ProvenanceRange range) {
102   std::size_t offset{0};
103   std::size_t tokens{that.SizeInTokens()};
104   for (std::size_t j{0}; j < tokens; ++j) {
105     CharBlock tok{that.TokenAt(j)};
106     Put(tok, range.OffsetMember(offset));
107     offset += tok.size();
108   }
109   CHECK(offset == range.size());
110 }
111 
// Appends "tokens" tokens from "that", starting at token index "at",
// preserving each character's original provenance.
void TokenSequence::Put(
    const TokenSequence &that, std::size_t at, std::size_t tokens) {
  ProvenanceRange provenance; // empty initially, so the first Map() fires
  std::size_t offset{0};
  for (; tokens-- > 0; ++at) {
    CharBlock tok{that.TokenAt(at)};
    std::size_t tokBytes{tok.size()};
    for (std::size_t j{0}; j < tokBytes; ++j) {
      // Fetch a fresh provenance range lazily whenever the current one is
      // exhausted (at the start and at each mapping discontinuity).
      if (offset == provenance.size()) {
        provenance = that.provenances_.Map(that.start_[at] + j);
        offset = 0;
      }
      PutNextTokenChar(tok[j], provenance.OffsetMember(offset++));
    }
    CloseToken();
  }
}
129 
130 void TokenSequence::Put(
131     const char *s, std::size_t bytes, Provenance provenance) {
132   for (std::size_t j{0}; j < bytes; ++j) {
133     PutNextTokenChar(s[j], provenance + j);
134   }
135   CloseToken();
136 }
137 
138 void TokenSequence::Put(const CharBlock &t, Provenance provenance) {
139   Put(&t[0], t.size(), provenance);
140 }
141 
142 void TokenSequence::Put(const std::string &s, Provenance provenance) {
143   Put(s.data(), s.size(), provenance);
144 }
145 
146 void TokenSequence::Put(llvm::raw_string_ostream &ss, Provenance provenance) {
147   Put(ss.str(), provenance);
148 }
149 
// Lowers the case of letters in place, token by token, while leaving the
// payloads of character literals and Hollerith constants untouched.
// Token boundaries are tracked via start_ so each token is classified by
// its first and last characters.
TokenSequence &TokenSequence::ToLowerCase() {
  std::size_t tokens{start_.size()};
  std::size_t chars{char_.size()};
  std::size_t atToken{0};
  for (std::size_t j{0}; j < chars;) {
    // [j, nextStart) spans the current token's characters.
    std::size_t nextStart{atToken + 1 < tokens ? start_[++atToken] : chars};
    char *p{&char_[j]};
    char const *limit{char_.data() + nextStart};
    j = nextStart;
    if (IsDecimalDigit(*p)) {
      // Token begins with digits: skip over them to see what follows.
      while (p < limit && IsDecimalDigit(*p)) {
        ++p;
      }
      if (p >= limit) {
        // all digits: nothing to lower
      } else if (*p == 'h' || *p == 'H') {
        // Hollerith
        *p = 'h';
      } else if (*p == '_') {
        // kind-prefixed character literal (e.g., 1_"ABC")
      } else {
        // exponent
        for (; p < limit; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      }
    } else if (limit[-1] == '\'' || limit[-1] == '"') {
      // Token ends with a quote: some flavor of character literal.
      if (*p == limit[-1]) {
        // Character literal without prefix
      } else if (p[1] == limit[-1]) {
        // BOZX-prefixed constant
        for (; p < limit; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      } else {
        // Literal with kind-param prefix name (e.g., K_"ABC").
        // Lower only the prefix, stopping at the opening quote.
        for (; *p != limit[-1]; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      }
    } else {
      // Ordinary token: lower every letter.
      for (; p < limit; ++p) {
        *p = ToLowerCaseLetter(*p);
      }
    }
  }
  return *this;
}
197 
198 bool TokenSequence::HasBlanks(std::size_t firstChar) const {
199   std::size_t tokens{SizeInTokens()};
200   for (std::size_t j{0}; j < tokens; ++j) {
201     if (start_[j] >= firstChar && TokenAt(j).IsBlank()) {
202       return true;
203     }
204   }
205   return false;
206 }
207 
208 bool TokenSequence::HasRedundantBlanks(std::size_t firstChar) const {
209   std::size_t tokens{SizeInTokens()};
210   bool lastWasBlank{false};
211   for (std::size_t j{0}; j < tokens; ++j) {
212     bool isBlank{TokenAt(j).IsBlank()};
213     if (isBlank && lastWasBlank && start_[j] >= firstChar) {
214       return true;
215     }
216     lastWasBlank = isBlank;
217   }
218   return false;
219 }
220 
221 TokenSequence &TokenSequence::RemoveBlanks(std::size_t firstChar) {
222   std::size_t tokens{SizeInTokens()};
223   TokenSequence result;
224   for (std::size_t j{0}; j < tokens; ++j) {
225     if (!TokenAt(j).IsBlank() || start_[j] < firstChar) {
226       result.Put(*this, j);
227     }
228   }
229   swap(result);
230   return *this;
231 }
232 
233 TokenSequence &TokenSequence::RemoveRedundantBlanks(std::size_t firstChar) {
234   std::size_t tokens{SizeInTokens()};
235   TokenSequence result;
236   bool lastWasBlank{false};
237   for (std::size_t j{0}; j < tokens; ++j) {
238     bool isBlank{TokenAt(j).IsBlank()};
239     if (!isBlank || !lastWasBlank || start_[j] < firstChar) {
240       result.Put(*this, j);
241     }
242     lastWasBlank = isBlank;
243   }
244   swap(result);
245   return *this;
246 }
247 
// Truncates the sequence at a '!' comment token (optionally preceded by
// blanks), unless the comment is an active compiler directive sentinel
// (e.g. "!dir$").  When skipFirst is set, the first such comment is
// retained and clipping applies only to a later one.
TokenSequence &TokenSequence::ClipComment(
    const Prescanner &prescanner, bool skipFirst) {
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock tok{TokenAt(j)};
    if (std::size_t blanks{tok.CountLeadingBlanks()};
        blanks < tok.size() && tok[blanks] == '!') {
      // Retain active compiler directive sentinels (e.g. "!dir$")
      // Merge immediately adjacent tokens until the candidate is long
      // enough to hold '!' plus a four-character sentinel name.
      for (std::size_t k{j + 1}; k < tokens && tok.size() < blanks + 5; ++k) {
        if (tok.begin() + tok.size() == TokenAt(k).begin()) {
          tok.ExtendToCover(TokenAt(k));
        } else {
          break;
        }
      }
      bool isSentinel{false};
      if (tok.size() == blanks + 5) {
        // Lower-case the four characters after '!' and test them.
        char sentinel[4];
        for (int k{0}; k < 4; ++k) {
          sentinel[k] = ToLowerCaseLetter(tok[blanks + k + 1]);
        }
        isSentinel = prescanner.IsCompilerDirectiveSentinel(sentinel, 4);
      }
      if (isSentinel) {
        // keep compiler directives
      } else if (skipFirst) {
        skipFirst = false;
      } else {
        TokenSequence result;
        if (j > 0) {
          // NOTE(review): this copies j-1 tokens (indices [0, j-1)), so
          // the token immediately before the '!' is dropped as well —
          // presumably the blank separating code from comment; confirm
          // this off-by-one is intentional when the predecessor is not
          // a blank.
          result.Put(*this, 0, j - 1);
        }
        swap(result);
        return *this;
      }
    }
  }
  return *this;
}
286 
287 void TokenSequence::Emit(CookedSource &cooked) const {
288   if (auto n{char_.size()}) {
289     cooked.Put(&char_[0], n);
290     cooked.PutProvenanceMappings(provenances_);
291   }
292 }
293 
294 llvm::raw_ostream &TokenSequence::Dump(llvm::raw_ostream &o) const {
295   o << "TokenSequence has " << char_.size() << " chars; nextStart_ "
296     << nextStart_ << '\n';
297   for (std::size_t j{0}; j < start_.size(); ++j) {
298     o << '[' << j << "] @ " << start_[j] << " '" << TokenAt(j).ToString()
299       << "'\n";
300   }
301   return o;
302 }
303 
304 Provenance TokenSequence::GetCharProvenance(std::size_t offset) const {
305   ProvenanceRange range{provenances_.Map(offset)};
306   return range.start();
307 }
308 
309 Provenance TokenSequence::GetTokenProvenance(
310     std::size_t token, std::size_t offset) const {
311   return GetCharProvenance(start_[token] + offset);
312 }
313 
314 ProvenanceRange TokenSequence::GetTokenProvenanceRange(
315     std::size_t token, std::size_t offset) const {
316   ProvenanceRange range{provenances_.Map(start_[token] + offset)};
317   return range.Prefix(TokenBytes(token) - offset);
318 }
319 
// Returns a provenance range covering tokens [token, token+tokens), but
// only while successive tokens' provenances are contiguous; extension
// stops at the first discontinuity.
ProvenanceRange TokenSequence::GetIntervalProvenanceRange(
    std::size_t token, std::size_t tokens) const {
  if (tokens == 0) {
    return {};
  }
  ProvenanceRange range{provenances_.Map(start_[token])};
  // AnnexIfPredecessor extends "range" when the next token's mapped
  // range immediately follows it; note the side-effecting ++token in
  // the condition advances through the interval.
  while (--tokens > 0 &&
      range.AnnexIfPredecessor(provenances_.Map(start_[++token]))) {
  }
  return range;
}
331 
// Returns the (contiguous prefix of the) provenance range covering the
// whole sequence.
ProvenanceRange TokenSequence::GetProvenanceRange() const {
  return GetIntervalProvenanceRange(0, start_.size());
}
335 
336 const TokenSequence &TokenSequence::CheckBadFortranCharacters(
337     Messages &messages) const {
338   std::size_t tokens{SizeInTokens()};
339   bool isBangOk{true};
340   for (std::size_t j{0}; j < tokens; ++j) {
341     CharBlock token{TokenAt(j)};
342     char ch{token.FirstNonBlank()};
343     if (ch != ' ' && !IsValidFortranTokenCharacter(ch)) {
344       if (ch == '!' && isBangOk) {
345         // allow in !dir$
346       } else if (ch < ' ' || ch >= '\x7f') {
347         messages.Say(GetTokenProvenanceRange(j),
348             "bad character (0x%02x) in Fortran token"_err_en_US, ch & 0xff);
349       } else {
350         messages.Say(GetTokenProvenanceRange(j),
351             "bad character ('%c') in Fortran token"_err_en_US, ch);
352       }
353     }
354     if (ch == ';') {
355       isBangOk = true;
356     } else if (ch != ' ') {
357       isBangOk = false;
358     }
359   }
360   return *this;
361 }
362 
363 const TokenSequence &TokenSequence::CheckBadParentheses(
364     Messages &messages) const {
365   // First, a quick pass with no allocation for the common case
366   int nesting{0};
367   std::size_t tokens{SizeInTokens()};
368   for (std::size_t j{0}; j < tokens; ++j) {
369     CharBlock token{TokenAt(j)};
370     char ch{token.FirstNonBlank()};
371     if (ch == '(') {
372       ++nesting;
373     } else if (ch == ')') {
374       --nesting;
375     }
376   }
377   if (nesting != 0) {
378     // There's an error; diagnose it
379     std::vector<std::size_t> stack;
380     for (std::size_t j{0}; j < tokens; ++j) {
381       CharBlock token{TokenAt(j)};
382       char ch{token.FirstNonBlank()};
383       if (ch == '(') {
384         stack.push_back(j);
385       } else if (ch == ')') {
386         if (stack.empty()) {
387           messages.Say(GetTokenProvenanceRange(j), "Unmatched ')'"_err_en_US);
388           return *this;
389         }
390         stack.pop_back();
391       }
392     }
393     CHECK(!stack.empty());
394     messages.Say(
395         GetTokenProvenanceRange(stack.back()), "Unmatched '('"_err_en_US);
396   }
397   return *this;
398 }
399 } // namespace Fortran::parser
400