1 //===-- lib/Parser/token-sequence.cpp -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "token-sequence.h" 10 #include "prescan.h" 11 #include "flang/Parser/characters.h" 12 #include "flang/Parser/message.h" 13 #include "llvm/Support/raw_ostream.h" 14 15 namespace Fortran::parser { 16 17 TokenSequence &TokenSequence::operator=(TokenSequence &&that) { 18 clear(); 19 swap(that); 20 return *this; 21 } 22 23 void TokenSequence::clear() { 24 start_.clear(); 25 nextStart_ = 0; 26 char_.clear(); 27 provenances_.clear(); 28 } 29 30 void TokenSequence::pop_back() { 31 CHECK(!start_.empty()); 32 CHECK(nextStart_ > start_.back()); 33 std::size_t bytes{nextStart_ - start_.back()}; 34 nextStart_ = start_.back(); 35 start_.pop_back(); 36 char_.resize(nextStart_); 37 provenances_.RemoveLastBytes(bytes); 38 } 39 40 void TokenSequence::shrink_to_fit() { 41 start_.shrink_to_fit(); 42 char_.shrink_to_fit(); 43 provenances_.shrink_to_fit(); 44 } 45 46 void TokenSequence::swap(TokenSequence &that) { 47 start_.swap(that.start_); 48 std::swap(nextStart_, that.nextStart_); 49 char_.swap(that.char_); 50 provenances_.swap(that.provenances_); 51 } 52 53 std::size_t TokenSequence::SkipBlanks(std::size_t at) const { 54 std::size_t tokens{start_.size()}; 55 for (; at < tokens; ++at) { 56 if (!TokenAt(at).IsBlank()) { 57 return at; 58 } 59 } 60 return tokens; // even if at > tokens 61 } 62 63 // C-style /*comments*/ are removed from preprocessing directive 64 // token sequences by the prescanner, but not C++ or Fortran 65 // free-form line-ending comments (//... and !...) because 66 // ignoring them is directive-specific. 67 bool TokenSequence::IsAnythingLeft(std::size_t at) const { 68 std::size_t tokens{start_.size()}; 69 for (; at < tokens; ++at) { 70 auto tok{TokenAt(at)}; 71 const char *end{tok.end()}; 72 for (const char *p{tok.begin()}; p < end; ++p) { 73 switch (*p) { 74 case '/': 75 return p + 1 >= end || p[1] != '/'; 76 case '!': 77 return false; 78 case ' ': 79 break; 80 default: 81 return true; 82 } 83 } 84 } 85 return false; 86 } 87 88 void TokenSequence::Put(const TokenSequence &that) { 89 if (nextStart_ < char_.size()) { 90 start_.push_back(nextStart_); 91 } 92 int offset = char_.size(); 93 for (int st : that.start_) { 94 start_.push_back(st + offset); 95 } 96 char_.insert(char_.end(), that.char_.begin(), that.char_.end()); 97 nextStart_ = char_.size(); 98 provenances_.Put(that.provenances_); 99 } 100 101 void TokenSequence::Put(const TokenSequence &that, ProvenanceRange range) { 102 std::size_t offset{0}; 103 std::size_t tokens{that.SizeInTokens()}; 104 for (std::size_t j{0}; j < tokens; ++j) { 105 CharBlock tok{that.TokenAt(j)}; 106 Put(tok, range.OffsetMember(offset)); 107 offset += tok.size(); 108 } 109 CHECK(offset == range.size()); 110 } 111 112 void TokenSequence::Put( 113 const TokenSequence &that, std::size_t at, std::size_t tokens) { 114 ProvenanceRange provenance; 115 std::size_t offset{0}; 116 for (; tokens-- > 0; ++at) { 117 CharBlock tok{that.TokenAt(at)}; 118 std::size_t tokBytes{tok.size()}; 119 for (std::size_t j{0}; j < tokBytes; ++j) { 120 if (offset == provenance.size()) { 121 provenance = that.provenances_.Map(that.start_[at] + j); 122 offset = 0; 123 } 124 PutNextTokenChar(tok[j], provenance.OffsetMember(offset++)); 125 } 126 CloseToken(); 127 } 128 } 129 130 void TokenSequence::Put( 131 const char *s, std::size_t bytes, Provenance provenance) { 132 for (std::size_t j{0}; j < bytes; ++j) { 133 PutNextTokenChar(s[j], provenance + j); 134 } 135 CloseToken(); 136 } 137 138 void TokenSequence::Put(const CharBlock &t, Provenance provenance) { 139 Put(&t[0], t.size(), provenance); 140 } 141 142 void TokenSequence::Put(const std::string &s, Provenance provenance) { 143 Put(s.data(), s.size(), provenance); 144 } 145 146 void TokenSequence::Put(llvm::raw_string_ostream &ss, Provenance provenance) { 147 Put(ss.str(), provenance); 148 } 149 150 TokenSequence &TokenSequence::ToLowerCase() { 151 std::size_t tokens{start_.size()}; 152 std::size_t chars{char_.size()}; 153 std::size_t atToken{0}; 154 for (std::size_t j{0}; j < chars;) { 155 std::size_t nextStart{atToken + 1 < tokens ? start_[++atToken] : chars}; 156 char *p{&char_[j]}; 157 char const *limit{char_.data() + nextStart}; 158 j = nextStart; 159 if (IsDecimalDigit(*p)) { 160 while (p < limit && IsDecimalDigit(*p)) { 161 ++p; 162 } 163 if (p >= limit) { 164 } else if (*p == 'h' || *p == 'H') { 165 // Hollerith 166 *p = 'h'; 167 } else if (*p == '_') { 168 // kind-prefixed character literal (e.g., 1_"ABC") 169 } else { 170 // exponent 171 for (; p < limit; ++p) { 172 *p = ToLowerCaseLetter(*p); 173 } 174 } 175 } else if (limit[-1] == '\'' || limit[-1] == '"') { 176 if (*p == limit[-1]) { 177 // Character literal without prefix 178 } else if (p[1] == limit[-1]) { 179 // BOZX-prefixed constant 180 for (; p < limit; ++p) { 181 *p = ToLowerCaseLetter(*p); 182 } 183 } else { 184 // Literal with kind-param prefix name (e.g., K_"ABC"). 185 for (; *p != limit[-1]; ++p) { 186 *p = ToLowerCaseLetter(*p); 187 } 188 } 189 } else { 190 for (; p < limit; ++p) { 191 *p = ToLowerCaseLetter(*p); 192 } 193 } 194 } 195 return *this; 196 } 197 198 bool TokenSequence::HasBlanks(std::size_t firstChar) const { 199 std::size_t tokens{SizeInTokens()}; 200 for (std::size_t j{0}; j < tokens; ++j) { 201 if (start_[j] >= firstChar && TokenAt(j).IsBlank()) { 202 return true; 203 } 204 } 205 return false; 206 } 207 208 bool TokenSequence::HasRedundantBlanks(std::size_t firstChar) const { 209 std::size_t tokens{SizeInTokens()}; 210 bool lastWasBlank{false}; 211 for (std::size_t j{0}; j < tokens; ++j) { 212 bool isBlank{TokenAt(j).IsBlank()}; 213 if (isBlank && lastWasBlank && start_[j] >= firstChar) { 214 return true; 215 } 216 lastWasBlank = isBlank; 217 } 218 return false; 219 } 220 221 TokenSequence &TokenSequence::RemoveBlanks(std::size_t firstChar) { 222 std::size_t tokens{SizeInTokens()}; 223 TokenSequence result; 224 for (std::size_t j{0}; j < tokens; ++j) { 225 if (!TokenAt(j).IsBlank() || start_[j] < firstChar) { 226 result.Put(*this, j); 227 } 228 } 229 swap(result); 230 return *this; 231 } 232 233 TokenSequence &TokenSequence::RemoveRedundantBlanks(std::size_t firstChar) { 234 std::size_t tokens{SizeInTokens()}; 235 TokenSequence result; 236 bool lastWasBlank{false}; 237 for (std::size_t j{0}; j < tokens; ++j) { 238 bool isBlank{TokenAt(j).IsBlank()}; 239 if (!isBlank || !lastWasBlank || start_[j] < firstChar) { 240 result.Put(*this, j); 241 } 242 lastWasBlank = isBlank; 243 } 244 swap(result); 245 return *this; 246 } 247 248 TokenSequence &TokenSequence::ClipComment( 249 const Prescanner &prescanner, bool skipFirst) { 250 std::size_t tokens{SizeInTokens()}; 251 for (std::size_t j{0}; j < tokens; ++j) { 252 CharBlock tok{TokenAt(j)}; 253 if (std::size_t blanks{tok.CountLeadingBlanks()}; 254 blanks < tok.size() && tok[blanks] == '!') { 255 // Retain active compiler directive sentinels (e.g. "!dir$") 256 for (std::size_t k{j + 1}; k < tokens && tok.size() < blanks + 5; ++k) { 257 if (tok.begin() + tok.size() == TokenAt(k).begin()) { 258 tok.ExtendToCover(TokenAt(k)); 259 } else { 260 break; 261 } 262 } 263 bool isSentinel{false}; 264 if (tok.size() == blanks + 5) { 265 char sentinel[4]; 266 for (int k{0}; k < 4; ++k) { 267 sentinel[k] = ToLowerCaseLetter(tok[blanks + k + 1]); 268 } 269 isSentinel = prescanner.IsCompilerDirectiveSentinel(sentinel, 4); 270 } 271 if (isSentinel) { 272 } else if (skipFirst) { 273 skipFirst = false; 274 } else { 275 TokenSequence result; 276 if (j > 0) { 277 result.Put(*this, 0, j - 1); 278 } 279 swap(result); 280 return *this; 281 } 282 } 283 } 284 return *this; 285 } 286 287 void TokenSequence::Emit(CookedSource &cooked) const { 288 if (auto n{char_.size()}) { 289 cooked.Put(&char_[0], n); 290 cooked.PutProvenanceMappings(provenances_); 291 } 292 } 293 294 llvm::raw_ostream &TokenSequence::Dump(llvm::raw_ostream &o) const { 295 o << "TokenSequence has " << char_.size() << " chars; nextStart_ " 296 << nextStart_ << '\n'; 297 for (std::size_t j{0}; j < start_.size(); ++j) { 298 o << '[' << j << "] @ " << start_[j] << " '" << TokenAt(j).ToString() 299 << "'\n"; 300 } 301 return o; 302 } 303 304 Provenance TokenSequence::GetCharProvenance(std::size_t offset) const { 305 ProvenanceRange range{provenances_.Map(offset)}; 306 return range.start(); 307 } 308 309 Provenance TokenSequence::GetTokenProvenance( 310 std::size_t token, std::size_t offset) const { 311 return GetCharProvenance(start_[token] + offset); 312 } 313 314 ProvenanceRange TokenSequence::GetTokenProvenanceRange( 315 std::size_t token, std::size_t offset) const { 316 ProvenanceRange range{provenances_.Map(start_[token] + offset)}; 317 return range.Prefix(TokenBytes(token) - offset); 318 } 319 320 ProvenanceRange TokenSequence::GetIntervalProvenanceRange( 321 std::size_t token, std::size_t tokens) const { 322 if (tokens == 0) { 323 return {}; 324 } 325 ProvenanceRange range{provenances_.Map(start_[token])}; 326 while (--tokens > 0 && 327 range.AnnexIfPredecessor(provenances_.Map(start_[++token]))) { 328 } 329 return range; 330 } 331 332 ProvenanceRange TokenSequence::GetProvenanceRange() const { 333 return GetIntervalProvenanceRange(0, start_.size()); 334 } 335 336 const TokenSequence &TokenSequence::CheckBadFortranCharacters( 337 Messages &messages) const { 338 std::size_t tokens{SizeInTokens()}; 339 bool isBangOk{true}; 340 for (std::size_t j{0}; j < tokens; ++j) { 341 CharBlock token{TokenAt(j)}; 342 char ch{token.FirstNonBlank()}; 343 if (ch != ' ' && !IsValidFortranTokenCharacter(ch)) { 344 if (ch == '!' && isBangOk) { 345 // allow in !dir$ 346 } else if (ch < ' ' || ch >= '\x7f') { 347 messages.Say(GetTokenProvenanceRange(j), 348 "bad character (0x%02x) in Fortran token"_err_en_US, ch & 0xff); 349 } else { 350 messages.Say(GetTokenProvenanceRange(j), 351 "bad character ('%c') in Fortran token"_err_en_US, ch); 352 } 353 } 354 if (ch == ';') { 355 isBangOk = true; 356 } else if (ch != ' ') { 357 isBangOk = false; 358 } 359 } 360 return *this; 361 } 362 363 const TokenSequence &TokenSequence::CheckBadParentheses( 364 Messages &messages) const { 365 // First, a quick pass with no allocation for the common case 366 int nesting{0}; 367 std::size_t tokens{SizeInTokens()}; 368 for (std::size_t j{0}; j < tokens; ++j) { 369 CharBlock token{TokenAt(j)}; 370 char ch{token.FirstNonBlank()}; 371 if (ch == '(') { 372 ++nesting; 373 } else if (ch == ')') { 374 --nesting; 375 } 376 } 377 if (nesting != 0) { 378 // There's an error; diagnose it 379 std::vector<std::size_t> stack; 380 for (std::size_t j{0}; j < tokens; ++j) { 381 CharBlock token{TokenAt(j)}; 382 char ch{token.FirstNonBlank()}; 383 if (ch == '(') { 384 stack.push_back(j); 385 } else if (ch == ')') { 386 if (stack.empty()) { 387 messages.Say(GetTokenProvenanceRange(j), "Unmatched ')'"_err_en_US); 388 return *this; 389 } 390 stack.pop_back(); 391 } 392 } 393 CHECK(!stack.empty()); 394 messages.Say( 395 GetTokenProvenanceRange(stack.back()), "Unmatched '('"_err_en_US); 396 } 397 return *this; 398 } 399 } // namespace Fortran::parser 400