1 //===-- lib/Parser/token-sequence.cpp -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "flang/Parser/token-sequence.h" 10 11 #include "prescan.h" 12 #include "flang/Parser/characters.h" 13 #include "flang/Parser/message.h" 14 #include "llvm/Support/raw_ostream.h" 15 16 namespace Fortran::parser { 17 18 TokenSequence &TokenSequence::operator=(TokenSequence &&that) { 19 clear(); 20 swap(that); 21 return *this; 22 } 23 24 void TokenSequence::clear() { 25 start_.clear(); 26 nextStart_ = 0; 27 char_.clear(); 28 provenances_.clear(); 29 } 30 31 void TokenSequence::pop_back() { 32 CHECK(!start_.empty()); 33 CHECK(nextStart_ > start_.back()); 34 std::size_t bytes{nextStart_ - start_.back()}; 35 nextStart_ = start_.back(); 36 start_.pop_back(); 37 char_.resize(nextStart_); 38 provenances_.RemoveLastBytes(bytes); 39 } 40 41 void TokenSequence::shrink_to_fit() { 42 start_.shrink_to_fit(); 43 char_.shrink_to_fit(); 44 provenances_.shrink_to_fit(); 45 } 46 47 void TokenSequence::swap(TokenSequence &that) { 48 start_.swap(that.start_); 49 std::swap(nextStart_, that.nextStart_); 50 char_.swap(that.char_); 51 provenances_.swap(that.provenances_); 52 } 53 54 std::size_t TokenSequence::SkipBlanks(std::size_t at) const { 55 std::size_t tokens{start_.size()}; 56 for (; at < tokens; ++at) { 57 if (!TokenAt(at).IsBlank()) { 58 return at; 59 } 60 } 61 return tokens; // even if at > tokens 62 } 63 64 std::optional<std::size_t> TokenSequence::SkipBlanksBackwards( 65 std::size_t at) const { 66 while (at-- > 0) { 67 if (!TokenAt(at).IsBlank()) { 68 return at; 69 } 70 } 71 return std::nullopt; 72 } 73 74 // C-style /*comments*/ are removed from preprocessing directive 75 // token sequences by the prescanner, but not C++ or Fortran 76 // free-form line-ending comments (//... and !...) because 77 // ignoring them is directive-specific. 78 bool TokenSequence::IsAnythingLeft(std::size_t at) const { 79 std::size_t tokens{start_.size()}; 80 for (; at < tokens; ++at) { 81 auto tok{TokenAt(at)}; 82 const char *end{tok.end()}; 83 for (const char *p{tok.begin()}; p < end; ++p) { 84 switch (*p) { 85 case '/': 86 return p + 1 >= end || p[1] != '/'; 87 case '!': 88 return false; 89 case ' ': 90 break; 91 default: 92 return true; 93 } 94 } 95 } 96 return false; 97 } 98 99 void TokenSequence::Put(const TokenSequence &that) { 100 if (nextStart_ < char_.size()) { 101 start_.push_back(nextStart_); 102 } 103 int offset = char_.size(); 104 for (int st : that.start_) { 105 start_.push_back(st + offset); 106 } 107 char_.insert(char_.end(), that.char_.begin(), that.char_.end()); 108 nextStart_ = char_.size(); 109 provenances_.Put(that.provenances_); 110 } 111 112 void TokenSequence::Put(const TokenSequence &that, ProvenanceRange range) { 113 std::size_t offset{0}; 114 std::size_t tokens{that.SizeInTokens()}; 115 for (std::size_t j{0}; j < tokens; ++j) { 116 CharBlock tok{that.TokenAt(j)}; 117 Put(tok, range.OffsetMember(offset)); 118 offset += tok.size(); 119 } 120 CHECK(offset == range.size()); 121 } 122 123 void TokenSequence::Put( 124 const TokenSequence &that, std::size_t at, std::size_t tokens) { 125 ProvenanceRange provenance; 126 std::size_t offset{0}; 127 for (; tokens-- > 0; ++at) { 128 CharBlock tok{that.TokenAt(at)}; 129 std::size_t tokBytes{tok.size()}; 130 for (std::size_t j{0}; j < tokBytes; ++j) { 131 if (offset == provenance.size()) { 132 provenance = that.provenances_.Map(that.start_[at] + j); 133 offset = 0; 134 } 135 PutNextTokenChar(tok[j], provenance.OffsetMember(offset++)); 136 } 137 CloseToken(); 138 } 139 } 140 141 void TokenSequence::Put( 142 const char *s, std::size_t bytes, Provenance provenance) { 143 for (std::size_t j{0}; j < bytes; ++j) { 144 PutNextTokenChar(s[j], provenance + j); 145 } 146 CloseToken(); 147 } 148 149 void TokenSequence::Put(const CharBlock &t, Provenance provenance) { 150 // Avoid t[0] if t is empty: it would create a reference to nullptr, 151 // which is UB. 152 const char *addr{t.size() ? &t[0] : nullptr}; 153 Put(addr, t.size(), provenance); 154 } 155 156 void TokenSequence::Put(const std::string &s, Provenance provenance) { 157 Put(s.data(), s.size(), provenance); 158 } 159 160 void TokenSequence::Put(llvm::raw_string_ostream &ss, Provenance provenance) { 161 Put(ss.str(), provenance); 162 } 163 164 TokenSequence &TokenSequence::ToLowerCase() { 165 std::size_t tokens{start_.size()}; 166 std::size_t chars{char_.size()}; 167 std::size_t atToken{0}; 168 for (std::size_t j{0}; j < chars;) { 169 std::size_t nextStart{atToken + 1 < tokens ? start_[++atToken] : chars}; 170 char *p{&char_[j]}; 171 char const *limit{char_.data() + nextStart}; 172 const char *lastChar{limit - 1}; 173 j = nextStart; 174 // Skip leading whitespaces 175 while (p < limit - 1 && *p == ' ') { 176 ++p; 177 } 178 // Find last non-whitespace char 179 while (lastChar > p + 1 && *lastChar == ' ') { 180 --lastChar; 181 } 182 if (IsDecimalDigit(*p)) { 183 while (p < limit && IsDecimalDigit(*p)) { 184 ++p; 185 } 186 if (p >= limit) { 187 } else if (*p == 'h' || *p == 'H') { 188 // Hollerith 189 *p = 'h'; 190 } else if (*p == '_') { 191 // kind-prefixed character literal (e.g., 1_"ABC") 192 } else { 193 // exponent 194 for (; p < limit; ++p) { 195 *p = ToLowerCaseLetter(*p); 196 } 197 } 198 } else if (*lastChar == '\'' || *lastChar == '"') { 199 if (*p == *lastChar) { 200 // Character literal without prefix 201 } else if (p[1] == *lastChar) { 202 // BOZX-prefixed constant 203 for (; p < limit; ++p) { 204 *p = ToLowerCaseLetter(*p); 205 } 206 } else { 207 // Literal with kind-param prefix name (e.g., K_"ABC"). 208 for (; *p != *lastChar; ++p) { 209 *p = ToLowerCaseLetter(*p); 210 } 211 } 212 } else { 213 for (; p < limit; ++p) { 214 *p = ToLowerCaseLetter(*p); 215 } 216 } 217 } 218 return *this; 219 } 220 221 bool TokenSequence::HasBlanks(std::size_t firstChar) const { 222 std::size_t tokens{SizeInTokens()}; 223 for (std::size_t j{0}; j < tokens; ++j) { 224 if (start_[j] >= firstChar && TokenAt(j).IsBlank()) { 225 return true; 226 } 227 } 228 return false; 229 } 230 231 bool TokenSequence::HasRedundantBlanks(std::size_t firstChar) const { 232 std::size_t tokens{SizeInTokens()}; 233 bool lastWasBlank{false}; 234 for (std::size_t j{0}; j < tokens; ++j) { 235 bool isBlank{TokenAt(j).IsBlank()}; 236 if (isBlank && lastWasBlank && start_[j] >= firstChar) { 237 return true; 238 } 239 lastWasBlank = isBlank; 240 } 241 return false; 242 } 243 244 TokenSequence &TokenSequence::RemoveBlanks(std::size_t firstChar) { 245 std::size_t tokens{SizeInTokens()}; 246 TokenSequence result; 247 for (std::size_t j{0}; j < tokens; ++j) { 248 if (!TokenAt(j).IsBlank() || start_[j] < firstChar) { 249 result.Put(*this, j); 250 } 251 } 252 swap(result); 253 return *this; 254 } 255 256 TokenSequence &TokenSequence::RemoveRedundantBlanks(std::size_t firstChar) { 257 std::size_t tokens{SizeInTokens()}; 258 TokenSequence result; 259 bool lastWasBlank{false}; 260 for (std::size_t j{0}; j < tokens; ++j) { 261 bool isBlank{TokenAt(j).IsBlank()}; 262 if (!isBlank || !lastWasBlank || start_[j] < firstChar) { 263 result.Put(*this, j); 264 } 265 lastWasBlank = isBlank; 266 } 267 swap(result); 268 return *this; 269 } 270 271 TokenSequence &TokenSequence::ClipComment( 272 const Prescanner &prescanner, bool skipFirst) { 273 std::size_t tokens{SizeInTokens()}; 274 for (std::size_t j{0}; j < tokens; ++j) { 275 CharBlock tok{TokenAt(j)}; 276 if (std::size_t blanks{tok.CountLeadingBlanks()}; 277 blanks < tok.size() && tok[blanks] == '!') { 278 // Retain active compiler directive sentinels (e.g. "!dir$") 279 for (std::size_t k{j + 1}; k < tokens && tok.size() <= blanks + 5; ++k) { 280 if (tok.begin() + tok.size() == TokenAt(k).begin()) { 281 tok.ExtendToCover(TokenAt(k)); 282 } else { 283 break; 284 } 285 } 286 bool isSentinel{false}; 287 if (tok.size() > blanks + 5) { 288 isSentinel = prescanner.IsCompilerDirectiveSentinel(&tok[blanks + 1]) 289 .has_value(); 290 } 291 if (isSentinel) { 292 } else if (skipFirst) { 293 skipFirst = false; 294 } else { 295 TokenSequence result; 296 if (j > 0) { 297 result.Put(*this, 0, j - 1); 298 } 299 swap(result); 300 return *this; 301 } 302 } 303 } 304 return *this; 305 } 306 307 void TokenSequence::Emit(CookedSource &cooked) const { 308 if (auto n{char_.size()}) { 309 cooked.Put(&char_[0], n); 310 cooked.PutProvenanceMappings(provenances_); 311 } 312 } 313 314 llvm::raw_ostream &TokenSequence::Dump(llvm::raw_ostream &o) const { 315 o << "TokenSequence has " << char_.size() << " chars; nextStart_ " 316 << nextStart_ << '\n'; 317 for (std::size_t j{0}; j < start_.size(); ++j) { 318 o << '[' << j << "] @ " << start_[j] << " '" << TokenAt(j).ToString() 319 << "'\n"; 320 } 321 return o; 322 } 323 324 Provenance TokenSequence::GetCharProvenance(std::size_t offset) const { 325 ProvenanceRange range{provenances_.Map(offset)}; 326 return range.start(); 327 } 328 329 Provenance TokenSequence::GetTokenProvenance( 330 std::size_t token, std::size_t offset) const { 331 return GetCharProvenance(start_[token] + offset); 332 } 333 334 ProvenanceRange TokenSequence::GetTokenProvenanceRange( 335 std::size_t token, std::size_t offset) const { 336 ProvenanceRange range{provenances_.Map(start_[token] + offset)}; 337 return range.Prefix(TokenBytes(token) - offset); 338 } 339 340 ProvenanceRange TokenSequence::GetIntervalProvenanceRange( 341 std::size_t token, std::size_t tokens) const { 342 if (tokens == 0) { 343 return {}; 344 } 345 ProvenanceRange range{provenances_.Map(start_[token])}; 346 while (--tokens > 0 && 347 range.AnnexIfPredecessor(provenances_.Map(start_[++token]))) { 348 } 349 return range; 350 } 351 352 ProvenanceRange TokenSequence::GetProvenanceRange() const { 353 return GetIntervalProvenanceRange(0, start_.size()); 354 } 355 356 const TokenSequence &TokenSequence::CheckBadFortranCharacters( 357 Messages &messages, const Prescanner &prescanner, 358 bool allowAmpersand) const { 359 std::size_t tokens{SizeInTokens()}; 360 for (std::size_t j{0}; j < tokens; ++j) { 361 CharBlock token{TokenAt(j)}; 362 char ch{token.FirstNonBlank()}; 363 if (ch != ' ' && !IsValidFortranTokenCharacter(ch)) { 364 if (ch == '!') { 365 if (prescanner.IsCompilerDirectiveSentinel(token)) { 366 continue; 367 } else if (j + 1 < tokens && 368 prescanner.IsCompilerDirectiveSentinel( 369 TokenAt(j + 1))) { // !dir$, &c. 370 ++j; 371 continue; 372 } 373 } else if (ch == '&' && allowAmpersand) { 374 continue; 375 } 376 if (ch < ' ' || ch >= '\x7f') { 377 messages.Say(GetTokenProvenanceRange(j), 378 "bad character (0x%02x) in Fortran token"_err_en_US, ch & 0xff); 379 } else { 380 messages.Say(GetTokenProvenanceRange(j), 381 "bad character ('%c') in Fortran token"_err_en_US, ch); 382 } 383 } 384 } 385 return *this; 386 } 387 388 bool TokenSequence::BadlyNestedParentheses() const { 389 int nesting{0}; 390 std::size_t tokens{SizeInTokens()}; 391 for (std::size_t j{0}; j < tokens; ++j) { 392 CharBlock token{TokenAt(j)}; 393 char ch{token.OnlyNonBlank()}; 394 if (ch == '(') { 395 ++nesting; 396 } else if (ch == ')') { 397 if (nesting-- == 0) { 398 break; 399 } 400 } 401 } 402 return nesting != 0; 403 } 404 405 const TokenSequence &TokenSequence::CheckBadParentheses( 406 Messages &messages) const { 407 if (BadlyNestedParentheses()) { 408 // There's an error; diagnose it 409 std::size_t tokens{SizeInTokens()}; 410 std::vector<std::size_t> stack; 411 for (std::size_t j{0}; j < tokens; ++j) { 412 CharBlock token{TokenAt(j)}; 413 char ch{token.OnlyNonBlank()}; 414 if (ch == '(') { 415 stack.push_back(j); 416 } else if (ch == ')') { 417 if (stack.empty()) { 418 messages.Say(GetTokenProvenanceRange(j), "Unmatched ')'"_err_en_US); 419 return *this; 420 } 421 stack.pop_back(); 422 } 423 } 424 CHECK(!stack.empty()); 425 messages.Say( 426 GetTokenProvenanceRange(stack.back()), "Unmatched '('"_err_en_US); 427 } 428 return *this; 429 } 430 } // namespace Fortran::parser 431