1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a YAML parser. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/YAMLParser.h" 14 #include "llvm/ADT/AllocatorList.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallString.h" 18 #include "llvm/ADT/SmallVector.h" 19 #include "llvm/ADT/StringExtras.h" 20 #include "llvm/ADT/StringRef.h" 21 #include "llvm/ADT/Twine.h" 22 #include "llvm/Support/Compiler.h" 23 #include "llvm/Support/ErrorHandling.h" 24 #include "llvm/Support/MemoryBuffer.h" 25 #include "llvm/Support/SMLoc.h" 26 #include "llvm/Support/SourceMgr.h" 27 #include "llvm/Support/Unicode.h" 28 #include "llvm/Support/raw_ostream.h" 29 #include <cassert> 30 #include <cstddef> 31 #include <cstdint> 32 #include <map> 33 #include <memory> 34 #include <string> 35 #include <system_error> 36 #include <utility> 37 38 using namespace llvm; 39 using namespace yaml; 40 41 enum UnicodeEncodingForm { 42 UEF_UTF32_LE, ///< UTF-32 Little Endian 43 UEF_UTF32_BE, ///< UTF-32 Big Endian 44 UEF_UTF16_LE, ///< UTF-16 Little Endian 45 UEF_UTF16_BE, ///< UTF-16 Big Endian 46 UEF_UTF8, ///< UTF-8 or ascii. 47 UEF_Unknown ///< Not a valid Unicode encoding. 48 }; 49 50 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 51 /// it exists. Length is in {0, 2, 3, 4}. 52 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>; 53 54 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 55 /// encoding form of \a Input. 56 /// 57 /// @param Input A string of length 0 or more. 58 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 59 /// and how long the byte order mark is if one exists. 60 static EncodingInfo getUnicodeEncoding(StringRef Input) { 61 if (Input.empty()) 62 return std::make_pair(UEF_Unknown, 0); 63 64 switch (uint8_t(Input[0])) { 65 case 0x00: 66 if (Input.size() >= 4) { 67 if ( Input[1] == 0 68 && uint8_t(Input[2]) == 0xFE 69 && uint8_t(Input[3]) == 0xFF) 70 return std::make_pair(UEF_UTF32_BE, 4); 71 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 72 return std::make_pair(UEF_UTF32_BE, 0); 73 } 74 75 if (Input.size() >= 2 && Input[1] != 0) 76 return std::make_pair(UEF_UTF16_BE, 0); 77 return std::make_pair(UEF_Unknown, 0); 78 case 0xFF: 79 if ( Input.size() >= 4 80 && uint8_t(Input[1]) == 0xFE 81 && Input[2] == 0 82 && Input[3] == 0) 83 return std::make_pair(UEF_UTF32_LE, 4); 84 85 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 86 return std::make_pair(UEF_UTF16_LE, 2); 87 return std::make_pair(UEF_Unknown, 0); 88 case 0xFE: 89 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 90 return std::make_pair(UEF_UTF16_BE, 2); 91 return std::make_pair(UEF_Unknown, 0); 92 case 0xEF: 93 if ( Input.size() >= 3 94 && uint8_t(Input[1]) == 0xBB 95 && uint8_t(Input[2]) == 0xBF) 96 return std::make_pair(UEF_UTF8, 3); 97 return std::make_pair(UEF_Unknown, 0); 98 } 99 100 // It could still be utf-32 or utf-16. 101 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 102 return std::make_pair(UEF_UTF32_LE, 0); 103 104 if (Input.size() >= 2 && Input[1] == 0) 105 return std::make_pair(UEF_UTF16_LE, 0); 106 107 return std::make_pair(UEF_UTF8, 0); 108 } 109 110 /// Pin the vtables to this file. 111 void Node::anchor() {} 112 void NullNode::anchor() {} 113 void ScalarNode::anchor() {} 114 void BlockScalarNode::anchor() {} 115 void KeyValueNode::anchor() {} 116 void MappingNode::anchor() {} 117 void SequenceNode::anchor() {} 118 void AliasNode::anchor() {} 119 120 namespace llvm { 121 namespace yaml { 122 123 /// Token - A single YAML token. 124 struct Token { 125 enum TokenKind { 126 TK_Error, // Uninitialized token. 127 TK_StreamStart, 128 TK_StreamEnd, 129 TK_VersionDirective, 130 TK_TagDirective, 131 TK_DocumentStart, 132 TK_DocumentEnd, 133 TK_BlockEntry, 134 TK_BlockEnd, 135 TK_BlockSequenceStart, 136 TK_BlockMappingStart, 137 TK_FlowEntry, 138 TK_FlowSequenceStart, 139 TK_FlowSequenceEnd, 140 TK_FlowMappingStart, 141 TK_FlowMappingEnd, 142 TK_Key, 143 TK_Value, 144 TK_Scalar, 145 TK_BlockScalar, 146 TK_Alias, 147 TK_Anchor, 148 TK_Tag 149 } Kind = TK_Error; 150 151 /// A string of length 0 or more whose begin() points to the logical location 152 /// of the token in the input. 153 StringRef Range; 154 155 /// The value of a block scalar node. 156 std::string Value; 157 158 Token() = default; 159 }; 160 161 } // end namespace yaml 162 } // end namespace llvm 163 164 using TokenQueueT = BumpPtrList<Token>; 165 166 namespace { 167 168 /// This struct is used to track simple keys. 169 /// 170 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 171 /// which could legally be the start of a simple key. When peekNext is called, 172 /// if the Token To be returned is referenced by a SimpleKey, we continue 173 /// tokenizing until that potential simple key has either been found to not be 174 /// a simple key (we moved on to the next line or went further than 1024 chars). 175 /// Or when we run into a Value, and then insert a Key token (and possibly 176 /// others) before the SimpleKey's Tok. 177 struct SimpleKey { 178 TokenQueueT::iterator Tok; 179 unsigned Column = 0; 180 unsigned Line = 0; 181 unsigned FlowLevel = 0; 182 bool IsRequired = false; 183 184 bool operator ==(const SimpleKey &Other) { 185 return Tok == Other.Tok; 186 } 187 }; 188 189 } // end anonymous namespace 190 191 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit 192 /// subsequence and the subsequence's length in code units (uint8_t). 193 /// A length of 0 represents an error. 194 using UTF8Decoded = std::pair<uint32_t, unsigned>; 195 196 static UTF8Decoded decodeUTF8(StringRef Range) { 197 StringRef::iterator Position= Range.begin(); 198 StringRef::iterator End = Range.end(); 199 // 1 byte: [0x00, 0x7f] 200 // Bit pattern: 0xxxxxxx 201 if (Position < End && (*Position & 0x80) == 0) { 202 return std::make_pair(*Position, 1); 203 } 204 // 2 bytes: [0x80, 0x7ff] 205 // Bit pattern: 110xxxxx 10xxxxxx 206 if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) && 207 ((*(Position + 1) & 0xC0) == 0x80)) { 208 uint32_t codepoint = ((*Position & 0x1F) << 6) | 209 (*(Position + 1) & 0x3F); 210 if (codepoint >= 0x80) 211 return std::make_pair(codepoint, 2); 212 } 213 // 3 bytes: [0x8000, 0xffff] 214 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 215 if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) && 216 ((*(Position + 1) & 0xC0) == 0x80) && 217 ((*(Position + 2) & 0xC0) == 0x80)) { 218 uint32_t codepoint = ((*Position & 0x0F) << 12) | 219 ((*(Position + 1) & 0x3F) << 6) | 220 (*(Position + 2) & 0x3F); 221 // Codepoints between 0xD800 and 0xDFFF are invalid, as 222 // they are high / low surrogate halves used by UTF-16. 223 if (codepoint >= 0x800 && 224 (codepoint < 0xD800 || codepoint > 0xDFFF)) 225 return std::make_pair(codepoint, 3); 226 } 227 // 4 bytes: [0x10000, 0x10FFFF] 228 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 229 if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) && 230 ((*(Position + 1) & 0xC0) == 0x80) && 231 ((*(Position + 2) & 0xC0) == 0x80) && 232 ((*(Position + 3) & 0xC0) == 0x80)) { 233 uint32_t codepoint = ((*Position & 0x07) << 18) | 234 ((*(Position + 1) & 0x3F) << 12) | 235 ((*(Position + 2) & 0x3F) << 6) | 236 (*(Position + 3) & 0x3F); 237 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 238 return std::make_pair(codepoint, 4); 239 } 240 return std::make_pair(0, 0); 241 } 242 243 namespace llvm { 244 namespace yaml { 245 246 /// Scans YAML tokens from a MemoryBuffer. 247 class Scanner { 248 public: 249 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 250 std::error_code *EC = nullptr); 251 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 252 std::error_code *EC = nullptr); 253 254 /// Parse the next token and return it without popping it. 255 Token &peekNext(); 256 257 /// Parse the next token and pop it from the queue. 258 Token getNext(); 259 260 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 261 ArrayRef<SMRange> Ranges = {}) { 262 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ {}, ShowColors); 263 } 264 265 void setError(const Twine &Message, StringRef::iterator Position) { 266 if (Position >= End) 267 Position = End - 1; 268 269 // propagate the error if possible 270 if (EC) 271 *EC = make_error_code(std::errc::invalid_argument); 272 273 // Don't print out more errors after the first one we encounter. The rest 274 // are just the result of the first, and have no meaning. 275 if (!Failed) 276 printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message); 277 Failed = true; 278 } 279 280 /// Returns true if an error occurred while parsing. 281 bool failed() { 282 return Failed; 283 } 284 285 private: 286 void init(MemoryBufferRef Buffer); 287 288 StringRef currentInput() { 289 return StringRef(Current, End - Current); 290 } 291 292 /// Decode a UTF-8 minimal well-formed code unit subsequence starting 293 /// at \a Position. 294 /// 295 /// If the UTF-8 code units starting at Position do not form a well-formed 296 /// code unit subsequence, then the Unicode scalar value is 0, and the length 297 /// is 0. 298 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 299 return ::decodeUTF8(StringRef(Position, End - Position)); 300 } 301 302 // The following functions are based on the gramar rules in the YAML spec. The 303 // style of the function names it meant to closely match how they are written 304 // in the spec. The number within the [] is the number of the grammar rule in 305 // the spec. 306 // 307 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 308 // 309 // c- 310 // A production starting and ending with a special character. 311 // b- 312 // A production matching a single line break. 313 // nb- 314 // A production starting and ending with a non-break character. 315 // s- 316 // A production starting and ending with a white space character. 317 // ns- 318 // A production starting and ending with a non-space character. 319 // l- 320 // A production matching complete line(s). 321 322 /// Skip a single nb-char[27] starting at Position. 323 /// 324 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 325 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 326 /// 327 /// @returns The code unit after the nb-char, or Position if it's not an 328 /// nb-char. 329 StringRef::iterator skip_nb_char(StringRef::iterator Position); 330 331 /// Skip a single b-break[28] starting at Position. 332 /// 333 /// A b-break is 0xD 0xA | 0xD | 0xA 334 /// 335 /// @returns The code unit after the b-break, or Position if it's not a 336 /// b-break. 337 StringRef::iterator skip_b_break(StringRef::iterator Position); 338 339 /// Skip a single s-space[31] starting at Position. 340 /// 341 /// An s-space is 0x20 342 /// 343 /// @returns The code unit after the s-space, or Position if it's not a 344 /// s-space. 345 StringRef::iterator skip_s_space(StringRef::iterator Position); 346 347 /// Skip a single s-white[33] starting at Position. 348 /// 349 /// A s-white is 0x20 | 0x9 350 /// 351 /// @returns The code unit after the s-white, or Position if it's not a 352 /// s-white. 353 StringRef::iterator skip_s_white(StringRef::iterator Position); 354 355 /// Skip a single ns-char[34] starting at Position. 356 /// 357 /// A ns-char is nb-char - s-white 358 /// 359 /// @returns The code unit after the ns-char, or Position if it's not a 360 /// ns-char. 361 StringRef::iterator skip_ns_char(StringRef::iterator Position); 362 363 using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); 364 365 /// Skip minimal well-formed code unit subsequences until Func 366 /// returns its input. 367 /// 368 /// @returns The code unit after the last minimal well-formed code unit 369 /// subsequence that Func accepted. 370 StringRef::iterator skip_while( SkipWhileFunc Func 371 , StringRef::iterator Position); 372 373 /// Skip minimal well-formed code unit subsequences until Func returns its 374 /// input. 375 void advanceWhile(SkipWhileFunc Func); 376 377 /// Scan ns-uri-char[39]s starting at Cur. 378 /// 379 /// This updates Cur and Column while scanning. 380 void scan_ns_uri_char(); 381 382 /// Consume a minimal well-formed code unit subsequence starting at 383 /// \a Cur. Return false if it is not the same Unicode scalar value as 384 /// \a Expected. This updates \a Column. 385 bool consume(uint32_t Expected); 386 387 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 388 void skip(uint32_t Distance); 389 390 /// Return true if the minimal well-formed code unit subsequence at 391 /// Pos is whitespace or a new line 392 bool isBlankOrBreak(StringRef::iterator Position); 393 394 /// Return true if the minimal well-formed code unit subsequence at 395 /// Pos is considered a "safe" character for plain scalars. 396 bool isPlainSafeNonBlank(StringRef::iterator Position); 397 398 /// Return true if the line is a line break, false otherwise. 399 bool isLineEmpty(StringRef Line); 400 401 /// Consume a single b-break[28] if it's present at the current position. 402 /// 403 /// Return false if the code unit at the current position isn't a line break. 404 bool consumeLineBreakIfPresent(); 405 406 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 407 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 408 , unsigned AtColumn 409 , bool IsRequired); 410 411 /// Remove simple keys that can no longer be valid simple keys. 412 /// 413 /// Invalid simple keys are not on the current line or are further than 1024 414 /// columns back. 415 void removeStaleSimpleKeyCandidates(); 416 417 /// Remove all simple keys on FlowLevel \a Level. 418 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 419 420 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 421 /// tokens if needed. 422 bool unrollIndent(int ToColumn); 423 424 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 425 /// if needed. 426 bool rollIndent( int ToColumn 427 , Token::TokenKind Kind 428 , TokenQueueT::iterator InsertPoint); 429 430 /// Skip a single-line comment when the comment starts at the current 431 /// position of the scanner. 432 void skipComment(); 433 434 /// Skip whitespace and comments until the start of the next token. 435 void scanToNextToken(); 436 437 /// Must be the first token generated. 438 bool scanStreamStart(); 439 440 /// Generate tokens needed to close out the stream. 441 bool scanStreamEnd(); 442 443 /// Scan a %BLAH directive. 444 bool scanDirective(); 445 446 /// Scan a ... or ---. 447 bool scanDocumentIndicator(bool IsStart); 448 449 /// Scan a [ or { and generate the proper flow collection start token. 450 bool scanFlowCollectionStart(bool IsSequence); 451 452 /// Scan a ] or } and generate the proper flow collection end token. 453 bool scanFlowCollectionEnd(bool IsSequence); 454 455 /// Scan the , that separates entries in a flow collection. 456 bool scanFlowEntry(); 457 458 /// Scan the - that starts block sequence entries. 459 bool scanBlockEntry(); 460 461 /// Scan an explicit ? indicating a key. 462 bool scanKey(); 463 464 /// Scan an explicit : indicating a value. 465 bool scanValue(); 466 467 /// Scan a quoted scalar. 468 bool scanFlowScalar(bool IsDoubleQuoted); 469 470 /// Scan an unquoted scalar. 471 bool scanPlainScalar(); 472 473 /// Scan an Alias or Anchor starting with * or &. 474 bool scanAliasOrAnchor(bool IsAlias); 475 476 /// Scan a block scalar starting with | or >. 477 bool scanBlockScalar(bool IsLiteral); 478 479 /// Scan a block scalar style indicator and header. 480 /// 481 /// Note: This is distinct from scanBlockScalarHeader to mirror the fact that 482 /// YAML does not consider the style indicator to be a part of the header. 483 /// 484 /// Return false if an error occurred. 485 bool scanBlockScalarIndicators(char &StyleIndicator, char &ChompingIndicator, 486 unsigned &IndentIndicator, bool &IsDone); 487 488 /// Scan a style indicator in a block scalar header. 489 char scanBlockStyleIndicator(); 490 491 /// Scan a chomping indicator in a block scalar header. 492 char scanBlockChompingIndicator(); 493 494 /// Scan an indentation indicator in a block scalar header. 495 unsigned scanBlockIndentationIndicator(); 496 497 /// Scan a block scalar header. 498 /// 499 /// Return false if an error occurred. 500 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 501 bool &IsDone); 502 503 /// Look for the indentation level of a block scalar. 504 /// 505 /// Return false if an error occurred. 506 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 507 unsigned &LineBreaks, bool &IsDone); 508 509 /// Scan the indentation of a text line in a block scalar. 510 /// 511 /// Return false if an error occurred. 512 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 513 bool &IsDone); 514 515 /// Scan a tag of the form !stuff. 516 bool scanTag(); 517 518 /// Dispatch to the next scanning function based on \a *Cur. 519 bool fetchMoreTokens(); 520 521 /// The SourceMgr used for diagnostics and buffer management. 522 SourceMgr &SM; 523 524 /// The original input. 525 MemoryBufferRef InputBuffer; 526 527 /// The current position of the scanner. 528 StringRef::iterator Current; 529 530 /// The end of the input (one past the last character). 531 StringRef::iterator End; 532 533 /// Current YAML indentation level in spaces. 534 int Indent; 535 536 /// Current column number in Unicode code points. 537 unsigned Column; 538 539 /// Current line number. 540 unsigned Line; 541 542 /// How deep we are in flow style containers. 0 Means at block level. 543 unsigned FlowLevel; 544 545 /// Are we at the start of the stream? 546 bool IsStartOfStream; 547 548 /// Can the next token be the start of a simple key? 549 bool IsSimpleKeyAllowed; 550 551 /// Can the next token be a value indicator even if it does not have a 552 /// trailing space? 553 bool IsAdjacentValueAllowedInFlow; 554 555 /// True if an error has occurred. 556 bool Failed; 557 558 /// Should colors be used when printing out the diagnostic messages? 559 bool ShowColors; 560 561 /// Queue of tokens. This is required to queue up tokens while looking 562 /// for the end of a simple key. And for cases where a single character 563 /// can produce multiple tokens (e.g. BlockEnd). 564 TokenQueueT TokenQueue; 565 566 /// Indentation levels. 567 SmallVector<int, 4> Indents; 568 569 /// Potential simple keys. 570 SmallVector<SimpleKey, 4> SimpleKeys; 571 572 std::error_code *EC; 573 }; 574 575 } // end namespace yaml 576 } // end namespace llvm 577 578 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 579 static void encodeUTF8( uint32_t UnicodeScalarValue 580 , SmallVectorImpl<char> &Result) { 581 if (UnicodeScalarValue <= 0x7F) { 582 Result.push_back(UnicodeScalarValue & 0x7F); 583 } else if (UnicodeScalarValue <= 0x7FF) { 584 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 585 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 586 Result.push_back(FirstByte); 587 Result.push_back(SecondByte); 588 } else if (UnicodeScalarValue <= 0xFFFF) { 589 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 590 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 591 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 592 Result.push_back(FirstByte); 593 Result.push_back(SecondByte); 594 Result.push_back(ThirdByte); 595 } else if (UnicodeScalarValue <= 0x10FFFF) { 596 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 597 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 598 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 599 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 600 Result.push_back(FirstByte); 601 Result.push_back(SecondByte); 602 Result.push_back(ThirdByte); 603 Result.push_back(FourthByte); 604 } 605 } 606 607 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 608 SourceMgr SM; 609 Scanner scanner(Input, SM); 610 while (true) { 611 Token T = scanner.getNext(); 612 switch (T.Kind) { 613 case Token::TK_StreamStart: 614 OS << "Stream-Start: "; 615 break; 616 case Token::TK_StreamEnd: 617 OS << "Stream-End: "; 618 break; 619 case Token::TK_VersionDirective: 620 OS << "Version-Directive: "; 621 break; 622 case Token::TK_TagDirective: 623 OS << "Tag-Directive: "; 624 break; 625 case Token::TK_DocumentStart: 626 OS << "Document-Start: "; 627 break; 628 case Token::TK_DocumentEnd: 629 OS << "Document-End: "; 630 break; 631 case Token::TK_BlockEntry: 632 OS << "Block-Entry: "; 633 break; 634 case Token::TK_BlockEnd: 635 OS << "Block-End: "; 636 break; 637 case Token::TK_BlockSequenceStart: 638 OS << "Block-Sequence-Start: "; 639 break; 640 case Token::TK_BlockMappingStart: 641 OS << "Block-Mapping-Start: "; 642 break; 643 case Token::TK_FlowEntry: 644 OS << "Flow-Entry: "; 645 break; 646 case Token::TK_FlowSequenceStart: 647 OS << "Flow-Sequence-Start: "; 648 break; 649 case Token::TK_FlowSequenceEnd: 650 OS << "Flow-Sequence-End: "; 651 break; 652 case Token::TK_FlowMappingStart: 653 OS << "Flow-Mapping-Start: "; 654 break; 655 case Token::TK_FlowMappingEnd: 656 OS << "Flow-Mapping-End: "; 657 break; 658 case Token::TK_Key: 659 OS << "Key: "; 660 break; 661 case Token::TK_Value: 662 OS << "Value: "; 663 break; 664 case Token::TK_Scalar: 665 OS << "Scalar: "; 666 break; 667 case Token::TK_BlockScalar: 668 OS << "Block Scalar: "; 669 break; 670 case Token::TK_Alias: 671 OS << "Alias: "; 672 break; 673 case Token::TK_Anchor: 674 OS << "Anchor: "; 675 break; 676 case Token::TK_Tag: 677 OS << "Tag: "; 678 break; 679 case Token::TK_Error: 680 break; 681 } 682 OS << T.Range << "\n"; 683 if (T.Kind == Token::TK_StreamEnd) 684 break; 685 else if (T.Kind == Token::TK_Error) 686 return false; 687 } 688 return true; 689 } 690 691 bool yaml::scanTokens(StringRef Input) { 692 SourceMgr SM; 693 Scanner scanner(Input, SM); 694 while (true) { 695 Token T = scanner.getNext(); 696 if (T.Kind == Token::TK_StreamEnd) 697 break; 698 else if (T.Kind == Token::TK_Error) 699 return false; 700 } 701 return true; 702 } 703 704 std::string yaml::escape(StringRef Input, bool EscapePrintable) { 705 std::string EscapedInput; 706 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 707 if (*i == '\\') 708 EscapedInput += "\\\\"; 709 else if (*i == '"') 710 EscapedInput += "\\\""; 711 else if (*i == 0) 712 EscapedInput += "\\0"; 713 else if (*i == 0x07) 714 EscapedInput += "\\a"; 715 else if (*i == 0x08) 716 EscapedInput += "\\b"; 717 else if (*i == 0x09) 718 EscapedInput += "\\t"; 719 else if (*i == 0x0A) 720 EscapedInput += "\\n"; 721 else if (*i == 0x0B) 722 EscapedInput += "\\v"; 723 else if (*i == 0x0C) 724 EscapedInput += "\\f"; 725 else if (*i == 0x0D) 726 EscapedInput += "\\r"; 727 else if (*i == 0x1B) 728 EscapedInput += "\\e"; 729 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 730 std::string HexStr = utohexstr(*i); 731 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 732 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 733 UTF8Decoded UnicodeScalarValue 734 = decodeUTF8(StringRef(i, Input.end() - i)); 735 if (UnicodeScalarValue.second == 0) { 736 // Found invalid char. 737 SmallString<4> Val; 738 encodeUTF8(0xFFFD, Val); 739 llvm::append_range(EscapedInput, Val); 740 // FIXME: Error reporting. 741 return EscapedInput; 742 } 743 if (UnicodeScalarValue.first == 0x85) 744 EscapedInput += "\\N"; 745 else if (UnicodeScalarValue.first == 0xA0) 746 EscapedInput += "\\_"; 747 else if (UnicodeScalarValue.first == 0x2028) 748 EscapedInput += "\\L"; 749 else if (UnicodeScalarValue.first == 0x2029) 750 EscapedInput += "\\P"; 751 else if (!EscapePrintable && 752 sys::unicode::isPrintable(UnicodeScalarValue.first)) 753 EscapedInput += StringRef(i, UnicodeScalarValue.second); 754 else { 755 std::string HexStr = utohexstr(UnicodeScalarValue.first); 756 if (HexStr.size() <= 2) 757 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 758 else if (HexStr.size() <= 4) 759 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 760 else if (HexStr.size() <= 8) 761 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 762 } 763 i += UnicodeScalarValue.second - 1; 764 } else 765 EscapedInput.push_back(*i); 766 } 767 return EscapedInput; 768 } 769 770 std::optional<bool> yaml::parseBool(StringRef S) { 771 switch (S.size()) { 772 case 1: 773 switch (S.front()) { 774 case 'y': 775 case 'Y': 776 return true; 777 case 'n': 778 case 'N': 779 return false; 780 default: 781 return std::nullopt; 782 } 783 case 2: 784 switch (S.front()) { 785 case 'O': 786 if (S[1] == 'N') // ON 787 return true; 788 [[fallthrough]]; 789 case 'o': 790 if (S[1] == 'n') //[Oo]n 791 return true; 792 return std::nullopt; 793 case 'N': 794 if (S[1] == 'O') // NO 795 return false; 796 [[fallthrough]]; 797 case 'n': 798 if (S[1] == 'o') //[Nn]o 799 return false; 800 return std::nullopt; 801 default: 802 return std::nullopt; 803 } 804 case 3: 805 switch (S.front()) { 806 case 'O': 807 if (S.drop_front() == "FF") // OFF 808 return false; 809 [[fallthrough]]; 810 case 'o': 811 if (S.drop_front() == "ff") //[Oo]ff 812 return false; 813 return std::nullopt; 814 case 'Y': 815 if (S.drop_front() == "ES") // YES 816 return true; 817 [[fallthrough]]; 818 case 'y': 819 if (S.drop_front() == "es") //[Yy]es 820 return true; 821 return std::nullopt; 822 default: 823 return std::nullopt; 824 } 825 case 4: 826 switch (S.front()) { 827 case 'T': 828 if (S.drop_front() == "RUE") // TRUE 829 return true; 830 [[fallthrough]]; 831 case 't': 832 if (S.drop_front() == "rue") //[Tt]rue 833 return true; 834 return std::nullopt; 835 default: 836 return std::nullopt; 837 } 838 case 5: 839 switch (S.front()) { 840 case 'F': 841 if (S.drop_front() == "ALSE") // FALSE 842 return false; 843 [[fallthrough]]; 844 case 'f': 845 if (S.drop_front() == "alse") //[Ff]alse 846 return false; 847 return std::nullopt; 848 default: 849 return std::nullopt; 850 } 851 default: 852 return std::nullopt; 853 } 854 } 855 856 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 857 std::error_code *EC) 858 : SM(sm), ShowColors(ShowColors), EC(EC) { 859 init(MemoryBufferRef(Input, "YAML")); 860 } 861 862 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 863 std::error_code *EC) 864 : SM(SM_), ShowColors(ShowColors), EC(EC) { 865 init(Buffer); 866 } 867 868 void Scanner::init(MemoryBufferRef Buffer) { 869 InputBuffer = Buffer; 870 Current = InputBuffer.getBufferStart(); 871 End = InputBuffer.getBufferEnd(); 872 Indent = -1; 873 Column = 0; 874 Line = 0; 875 FlowLevel = 0; 876 IsStartOfStream = true; 877 IsSimpleKeyAllowed = true; 878 IsAdjacentValueAllowedInFlow = false; 879 Failed = false; 880 std::unique_ptr<MemoryBuffer> InputBufferOwner = 881 MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false); 882 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 883 } 884 885 Token &Scanner::peekNext() { 886 // If the current token is a possible simple key, keep parsing until we 887 // can confirm. 888 bool NeedMore = false; 889 while (true) { 890 if (TokenQueue.empty() || NeedMore) { 891 if (!fetchMoreTokens()) { 892 TokenQueue.clear(); 893 SimpleKeys.clear(); 894 TokenQueue.push_back(Token()); 895 return TokenQueue.front(); 896 } 897 } 898 assert(!TokenQueue.empty() && 899 "fetchMoreTokens lied about getting tokens!"); 900 901 removeStaleSimpleKeyCandidates(); 902 SimpleKey SK; 903 SK.Tok = TokenQueue.begin(); 904 if (!is_contained(SimpleKeys, SK)) 905 break; 906 else 907 NeedMore = true; 908 } 909 return TokenQueue.front(); 910 } 911 912 Token Scanner::getNext() { 913 Token Ret = peekNext(); 914 // TokenQueue can be empty if there was an error getting the next token. 915 if (!TokenQueue.empty()) 916 TokenQueue.pop_front(); 917 918 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 919 // quick deallocation of them all. 920 if (TokenQueue.empty()) 921 TokenQueue.resetAlloc(); 922 923 return Ret; 924 } 925 926 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 927 if (Position == End) 928 return Position; 929 // Check 7 bit c-printable - b-char. 930 if ( *Position == 0x09 931 || (*Position >= 0x20 && *Position <= 0x7E)) 932 return Position + 1; 933 934 // Check for valid UTF-8. 935 if (uint8_t(*Position) & 0x80) { 936 UTF8Decoded u8d = decodeUTF8(Position); 937 if ( u8d.second != 0 938 && u8d.first != 0xFEFF 939 && ( u8d.first == 0x85 940 || ( u8d.first >= 0xA0 941 && u8d.first <= 0xD7FF) 942 || ( u8d.first >= 0xE000 943 && u8d.first <= 0xFFFD) 944 || ( u8d.first >= 0x10000 945 && u8d.first <= 0x10FFFF))) 946 return Position + u8d.second; 947 } 948 return Position; 949 } 950 951 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 952 if (Position == End) 953 return Position; 954 if (*Position == 0x0D) { 955 if (Position + 1 != End && *(Position + 1) == 0x0A) 956 return Position + 2; 957 return Position + 1; 958 } 959 960 if (*Position == 0x0A) 961 return Position + 1; 962 return Position; 963 } 964 965 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 966 if (Position == End) 967 return Position; 968 if (*Position == ' ') 969 return Position + 1; 970 return Position; 971 } 972 973 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 974 if (Position == End) 975 return Position; 976 if (*Position == ' ' || *Position == '\t') 977 return Position + 1; 978 return Position; 979 } 980 981 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 982 if (Position == End) 983 return Position; 984 if (*Position == ' ' || *Position == '\t') 985 return Position; 986 return skip_nb_char(Position); 987 } 988 989 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 990 , StringRef::iterator Position) { 991 while (true) { 992 StringRef::iterator i = (this->*Func)(Position); 993 if (i == Position) 994 break; 995 Position = i; 996 } 997 return Position; 998 } 999 1000 void Scanner::advanceWhile(SkipWhileFunc Func) { 1001 auto Final = skip_while(Func, Current); 1002 Column += Final - Current; 1003 Current = Final; 1004 } 1005 1006 static bool is_ns_hex_digit(const char C) { return isAlnum(C); } 1007 1008 static bool is_ns_word_char(const char C) { return C == '-' || isAlpha(C); } 1009 1010 void Scanner::scan_ns_uri_char() { 1011 while (true) { 1012 if (Current == End) 1013 break; 1014 if (( *Current == '%' 1015 && Current + 2 < End 1016 && is_ns_hex_digit(*(Current + 1)) 1017 && is_ns_hex_digit(*(Current + 2))) 1018 || is_ns_word_char(*Current) 1019 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 1020 != StringRef::npos) { 1021 ++Current; 1022 ++Column; 1023 } else 1024 break; 1025 } 1026 } 1027 1028 bool Scanner::consume(uint32_t Expected) { 1029 if (Expected >= 0x80) { 1030 setError("Cannot consume non-ascii characters", Current); 1031 return false; 1032 } 1033 if (Current == End) 1034 return false; 1035 if (uint8_t(*Current) >= 0x80) { 1036 setError("Cannot consume non-ascii characters", Current); 1037 return false; 1038 } 1039 if (uint8_t(*Current) == Expected) { 1040 ++Current; 1041 ++Column; 1042 return true; 1043 } 1044 return false; 1045 } 1046 1047 void Scanner::skip(uint32_t Distance) { 1048 Current += Distance; 1049 Column += Distance; 1050 assert(Current <= End && "Skipped past the end"); 1051 } 1052 1053 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 1054 if (Position == End) 1055 return false; 1056 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 1057 *Position == '\n'; 1058 } 1059 1060 bool Scanner::isPlainSafeNonBlank(StringRef::iterator Position) { 1061 if (Position == End || isBlankOrBreak(Position)) 1062 return false; 1063 if (FlowLevel && 1064 StringRef(Position, 1).find_first_of(",[]{}") != StringRef::npos) 1065 return false; 1066 return true; 1067 } 1068 1069 bool Scanner::isLineEmpty(StringRef Line) { 1070 for (const auto *Position = Line.begin(); Position != Line.end(); ++Position) 1071 if (!isBlankOrBreak(Position)) 1072 return false; 1073 return true; 1074 } 1075 1076 bool Scanner::consumeLineBreakIfPresent() { 1077 auto Next = skip_b_break(Current); 1078 if (Next == Current) 1079 return false; 1080 Column = 0; 1081 ++Line; 1082 Current = Next; 1083 return true; 1084 } 1085 1086 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 1087 , unsigned AtColumn 1088 , bool IsRequired) { 1089 if (IsSimpleKeyAllowed) { 1090 SimpleKey SK; 1091 SK.Tok = Tok; 1092 SK.Line = Line; 1093 SK.Column = AtColumn; 1094 SK.IsRequired = IsRequired; 1095 SK.FlowLevel = FlowLevel; 1096 SimpleKeys.push_back(SK); 1097 } 1098 } 1099 1100 void Scanner::removeStaleSimpleKeyCandidates() { 1101 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 1102 i != SimpleKeys.end();) { 1103 if (i->Line != Line || i->Column + 1024 < Column) { 1104 if (i->IsRequired) 1105 setError( "Could not find expected : for simple key" 1106 , i->Tok->Range.begin()); 1107 i = SimpleKeys.erase(i); 1108 } else 1109 ++i; 1110 } 1111 } 1112 1113 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 1114 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 1115 SimpleKeys.pop_back(); 1116 } 1117 1118 bool Scanner::unrollIndent(int ToColumn) { 1119 Token T; 1120 // Indentation is ignored in flow. 1121 if (FlowLevel != 0) 1122 return true; 1123 1124 while (Indent > ToColumn) { 1125 T.Kind = Token::TK_BlockEnd; 1126 T.Range = StringRef(Current, 1); 1127 TokenQueue.push_back(T); 1128 Indent = Indents.pop_back_val(); 1129 } 1130 1131 return true; 1132 } 1133 1134 bool Scanner::rollIndent( int ToColumn 1135 , Token::TokenKind Kind 1136 , TokenQueueT::iterator InsertPoint) { 1137 if (FlowLevel) 1138 return true; 1139 if (Indent < ToColumn) { 1140 Indents.push_back(Indent); 1141 Indent = ToColumn; 1142 1143 Token T; 1144 T.Kind = Kind; 1145 T.Range = StringRef(Current, 0); 1146 TokenQueue.insert(InsertPoint, T); 1147 } 1148 return true; 1149 } 1150 1151 void Scanner::skipComment() { 1152 if (Current == End || *Current != '#') 1153 return; 1154 while (true) { 1155 // This may skip more than one byte, thus Column is only incremented 1156 // for code points. 1157 StringRef::iterator I = skip_nb_char(Current); 1158 if (I == Current) 1159 break; 1160 Current = I; 1161 ++Column; 1162 } 1163 } 1164 1165 void Scanner::scanToNextToken() { 1166 while (true) { 1167 while (Current != End && (*Current == ' ' || *Current == '\t')) { 1168 skip(1); 1169 } 1170 1171 skipComment(); 1172 1173 // Skip EOL. 1174 StringRef::iterator i = skip_b_break(Current); 1175 if (i == Current) 1176 break; 1177 Current = i; 1178 ++Line; 1179 Column = 0; 1180 // New lines may start a simple key. 1181 if (!FlowLevel) 1182 IsSimpleKeyAllowed = true; 1183 } 1184 } 1185 1186 bool Scanner::scanStreamStart() { 1187 IsStartOfStream = false; 1188 1189 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1190 1191 Token T; 1192 T.Kind = Token::TK_StreamStart; 1193 T.Range = StringRef(Current, EI.second); 1194 TokenQueue.push_back(T); 1195 Current += EI.second; 1196 return true; 1197 } 1198 1199 bool Scanner::scanStreamEnd() { 1200 // Force an ending new line if one isn't present. 1201 if (Column != 0) { 1202 Column = 0; 1203 ++Line; 1204 } 1205 1206 unrollIndent(-1); 1207 SimpleKeys.clear(); 1208 IsSimpleKeyAllowed = false; 1209 IsAdjacentValueAllowedInFlow = false; 1210 1211 Token T; 1212 T.Kind = Token::TK_StreamEnd; 1213 T.Range = StringRef(Current, 0); 1214 TokenQueue.push_back(T); 1215 return true; 1216 } 1217 1218 bool Scanner::scanDirective() { 1219 // Reset the indentation level. 1220 unrollIndent(-1); 1221 SimpleKeys.clear(); 1222 IsSimpleKeyAllowed = false; 1223 IsAdjacentValueAllowedInFlow = false; 1224 1225 StringRef::iterator Start = Current; 1226 consume('%'); 1227 StringRef::iterator NameStart = Current; 1228 Current = skip_while(&Scanner::skip_ns_char, Current); 1229 StringRef Name(NameStart, Current - NameStart); 1230 Current = skip_while(&Scanner::skip_s_white, Current); 1231 1232 Token T; 1233 if (Name == "YAML") { 1234 Current = skip_while(&Scanner::skip_ns_char, Current); 1235 T.Kind = Token::TK_VersionDirective; 1236 T.Range = StringRef(Start, Current - Start); 1237 TokenQueue.push_back(T); 1238 return true; 1239 } else if(Name == "TAG") { 1240 Current = skip_while(&Scanner::skip_ns_char, Current); 1241 Current = skip_while(&Scanner::skip_s_white, Current); 1242 Current = skip_while(&Scanner::skip_ns_char, Current); 1243 T.Kind = Token::TK_TagDirective; 1244 T.Range = StringRef(Start, Current - Start); 1245 TokenQueue.push_back(T); 1246 return true; 1247 } 1248 return false; 1249 } 1250 1251 bool Scanner::scanDocumentIndicator(bool IsStart) { 1252 unrollIndent(-1); 1253 SimpleKeys.clear(); 1254 IsSimpleKeyAllowed = false; 1255 IsAdjacentValueAllowedInFlow = false; 1256 1257 Token T; 1258 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1259 T.Range = StringRef(Current, 3); 1260 skip(3); 1261 TokenQueue.push_back(T); 1262 return true; 1263 } 1264 1265 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1266 Token T; 1267 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1268 : Token::TK_FlowMappingStart; 1269 T.Range = StringRef(Current, 1); 1270 skip(1); 1271 TokenQueue.push_back(T); 1272 1273 // [ and { may begin a simple key. 1274 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1275 1276 // And may also be followed by a simple key. 1277 IsSimpleKeyAllowed = true; 1278 // Adjacent values are allowed in flows only after JSON-style keys. 1279 IsAdjacentValueAllowedInFlow = false; 1280 ++FlowLevel; 1281 return true; 1282 } 1283 1284 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1285 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1286 IsSimpleKeyAllowed = false; 1287 IsAdjacentValueAllowedInFlow = true; 1288 Token T; 1289 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1290 : Token::TK_FlowMappingEnd; 1291 T.Range = StringRef(Current, 1); 1292 skip(1); 1293 TokenQueue.push_back(T); 1294 if (FlowLevel) 1295 --FlowLevel; 1296 return true; 1297 } 1298 1299 bool Scanner::scanFlowEntry() { 1300 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1301 IsSimpleKeyAllowed = true; 1302 IsAdjacentValueAllowedInFlow = false; 1303 Token T; 1304 T.Kind = Token::TK_FlowEntry; 1305 T.Range = StringRef(Current, 1); 1306 skip(1); 1307 TokenQueue.push_back(T); 1308 return true; 1309 } 1310 1311 bool Scanner::scanBlockEntry() { 1312 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1313 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1314 IsSimpleKeyAllowed = true; 1315 IsAdjacentValueAllowedInFlow = false; 1316 Token T; 1317 T.Kind = Token::TK_BlockEntry; 1318 T.Range = StringRef(Current, 1); 1319 skip(1); 1320 TokenQueue.push_back(T); 1321 return true; 1322 } 1323 1324 bool Scanner::scanKey() { 1325 if (!FlowLevel) 1326 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1327 1328 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1329 IsSimpleKeyAllowed = !FlowLevel; 1330 IsAdjacentValueAllowedInFlow = false; 1331 1332 Token T; 1333 T.Kind = Token::TK_Key; 1334 T.Range = StringRef(Current, 1); 1335 skip(1); 1336 TokenQueue.push_back(T); 1337 return true; 1338 } 1339 1340 bool Scanner::scanValue() { 1341 // If the previous token could have been a simple key, insert the key token 1342 // into the token queue. 1343 if (!SimpleKeys.empty()) { 1344 SimpleKey SK = SimpleKeys.pop_back_val(); 1345 Token T; 1346 T.Kind = Token::TK_Key; 1347 T.Range = SK.Tok->Range; 1348 TokenQueueT::iterator i, e; 1349 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1350 if (i == SK.Tok) 1351 break; 1352 } 1353 if (i == e) { 1354 Failed = true; 1355 return false; 1356 } 1357 i = TokenQueue.insert(i, T); 1358 1359 // We may also need to add a Block-Mapping-Start token. 1360 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1361 1362 IsSimpleKeyAllowed = false; 1363 } else { 1364 if (!FlowLevel) 1365 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1366 IsSimpleKeyAllowed = !FlowLevel; 1367 } 1368 IsAdjacentValueAllowedInFlow = false; 1369 1370 Token T; 1371 T.Kind = Token::TK_Value; 1372 T.Range = StringRef(Current, 1); 1373 skip(1); 1374 TokenQueue.push_back(T); 1375 return true; 1376 } 1377 1378 // Forbidding inlining improves performance by roughly 20%. 1379 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1380 LLVM_ATTRIBUTE_NOINLINE static bool 1381 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1382 1383 // Returns whether a character at 'Position' was escaped with a leading '\'. 1384 // 'First' specifies the position of the first character in the string. 1385 static bool wasEscaped(StringRef::iterator First, 1386 StringRef::iterator Position) { 1387 assert(Position - 1 >= First); 1388 StringRef::iterator I = Position - 1; 1389 // We calculate the number of consecutive '\'s before the current position 1390 // by iterating backwards through our string. 1391 while (I >= First && *I == '\\') --I; 1392 // (Position - 1 - I) now contains the number of '\'s before the current 1393 // position. If it is odd, the character at 'Position' was escaped. 1394 return (Position - 1 - I) % 2 == 1; 1395 } 1396 1397 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1398 StringRef::iterator Start = Current; 1399 unsigned ColStart = Column; 1400 if (IsDoubleQuoted) { 1401 do { 1402 ++Current; 1403 while (Current != End && *Current != '"') 1404 ++Current; 1405 // Repeat until the previous character was not a '\' or was an escaped 1406 // backslash. 1407 } while ( Current != End 1408 && *(Current - 1) == '\\' 1409 && wasEscaped(Start + 1, Current)); 1410 } else { 1411 skip(1); 1412 while (Current != End) { 1413 // Skip a ' followed by another '. 1414 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1415 skip(2); 1416 continue; 1417 } else if (*Current == '\'') 1418 break; 1419 StringRef::iterator i = skip_nb_char(Current); 1420 if (i == Current) { 1421 i = skip_b_break(Current); 1422 if (i == Current) 1423 break; 1424 Current = i; 1425 Column = 0; 1426 ++Line; 1427 } else { 1428 if (i == End) 1429 break; 1430 Current = i; 1431 ++Column; 1432 } 1433 } 1434 } 1435 1436 if (Current == End) { 1437 setError("Expected quote at end of scalar", Current); 1438 return false; 1439 } 1440 1441 skip(1); // Skip ending quote. 1442 Token T; 1443 T.Kind = Token::TK_Scalar; 1444 T.Range = StringRef(Start, Current - Start); 1445 TokenQueue.push_back(T); 1446 1447 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1448 1449 IsSimpleKeyAllowed = false; 1450 IsAdjacentValueAllowedInFlow = true; 1451 1452 return true; 1453 } 1454 1455 bool Scanner::scanPlainScalar() { 1456 StringRef::iterator Start = Current; 1457 unsigned ColStart = Column; 1458 unsigned LeadingBlanks = 0; 1459 assert(Indent >= -1 && "Indent must be >= -1 !"); 1460 unsigned indent = static_cast<unsigned>(Indent + 1); 1461 while (Current != End) { 1462 if (*Current == '#') 1463 break; 1464 1465 while (Current != End && 1466 ((*Current != ':' && isPlainSafeNonBlank(Current)) || 1467 (*Current == ':' && isPlainSafeNonBlank(Current + 1)))) { 1468 StringRef::iterator i = skip_nb_char(Current); 1469 if (i == Current) 1470 break; 1471 Current = i; 1472 ++Column; 1473 } 1474 1475 // Are we at the end? 1476 if (!isBlankOrBreak(Current)) 1477 break; 1478 1479 // Eat blanks. 1480 StringRef::iterator Tmp = Current; 1481 while (isBlankOrBreak(Tmp)) { 1482 StringRef::iterator i = skip_s_white(Tmp); 1483 if (i != Tmp) { 1484 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1485 setError("Found invalid tab character in indentation", Tmp); 1486 return false; 1487 } 1488 Tmp = i; 1489 ++Column; 1490 } else { 1491 i = skip_b_break(Tmp); 1492 if (!LeadingBlanks) 1493 LeadingBlanks = 1; 1494 Tmp = i; 1495 Column = 0; 1496 ++Line; 1497 } 1498 } 1499 1500 if (!FlowLevel && Column < indent) 1501 break; 1502 1503 Current = Tmp; 1504 } 1505 if (Start == Current) { 1506 setError("Got empty plain scalar", Start); 1507 return false; 1508 } 1509 Token T; 1510 T.Kind = Token::TK_Scalar; 1511 T.Range = StringRef(Start, Current - Start); 1512 TokenQueue.push_back(T); 1513 1514 // Plain scalars can be simple keys. 1515 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1516 1517 IsSimpleKeyAllowed = false; 1518 IsAdjacentValueAllowedInFlow = false; 1519 1520 return true; 1521 } 1522 1523 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1524 StringRef::iterator Start = Current; 1525 unsigned ColStart = Column; 1526 skip(1); 1527 while (Current != End) { 1528 if ( *Current == '[' || *Current == ']' 1529 || *Current == '{' || *Current == '}' 1530 || *Current == ',' 1531 || *Current == ':') 1532 break; 1533 StringRef::iterator i = skip_ns_char(Current); 1534 if (i == Current) 1535 break; 1536 Current = i; 1537 ++Column; 1538 } 1539 1540 if (Start + 1 == Current) { 1541 setError("Got empty alias or anchor", Start); 1542 return false; 1543 } 1544 1545 Token T; 1546 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1547 T.Range = StringRef(Start, Current - Start); 1548 TokenQueue.push_back(T); 1549 1550 // Alias and anchors can be simple keys. 1551 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1552 1553 IsSimpleKeyAllowed = false; 1554 IsAdjacentValueAllowedInFlow = false; 1555 1556 return true; 1557 } 1558 1559 bool Scanner::scanBlockScalarIndicators(char &StyleIndicator, 1560 char &ChompingIndicator, 1561 unsigned &IndentIndicator, 1562 bool &IsDone) { 1563 StyleIndicator = scanBlockStyleIndicator(); 1564 if (!scanBlockScalarHeader(ChompingIndicator, IndentIndicator, IsDone)) 1565 return false; 1566 return true; 1567 } 1568 1569 char Scanner::scanBlockStyleIndicator() { 1570 char Indicator = ' '; 1571 if (Current != End && (*Current == '>' || *Current == '|')) { 1572 Indicator = *Current; 1573 skip(1); 1574 } 1575 return Indicator; 1576 } 1577 1578 char Scanner::scanBlockChompingIndicator() { 1579 char Indicator = ' '; 1580 if (Current != End && (*Current == '+' || *Current == '-')) { 1581 Indicator = *Current; 1582 skip(1); 1583 } 1584 return Indicator; 1585 } 1586 1587 /// Get the number of line breaks after chomping. 1588 /// 1589 /// Return the number of trailing line breaks to emit, depending on 1590 /// \p ChompingIndicator. 1591 static unsigned getChompedLineBreaks(char ChompingIndicator, 1592 unsigned LineBreaks, StringRef Str) { 1593 if (ChompingIndicator == '-') // Strip all line breaks. 1594 return 0; 1595 if (ChompingIndicator == '+') // Keep all line breaks. 1596 return LineBreaks; 1597 // Clip trailing lines. 1598 return Str.empty() ? 0 : 1; 1599 } 1600 1601 unsigned Scanner::scanBlockIndentationIndicator() { 1602 unsigned Indent = 0; 1603 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1604 Indent = unsigned(*Current - '0'); 1605 skip(1); 1606 } 1607 return Indent; 1608 } 1609 1610 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1611 unsigned &IndentIndicator, bool &IsDone) { 1612 auto Start = Current; 1613 1614 ChompingIndicator = scanBlockChompingIndicator(); 1615 IndentIndicator = scanBlockIndentationIndicator(); 1616 // Check for the chomping indicator once again. 1617 if (ChompingIndicator == ' ') 1618 ChompingIndicator = scanBlockChompingIndicator(); 1619 Current = skip_while(&Scanner::skip_s_white, Current); 1620 skipComment(); 1621 1622 if (Current == End) { // EOF, we have an empty scalar. 1623 Token T; 1624 T.Kind = Token::TK_BlockScalar; 1625 T.Range = StringRef(Start, Current - Start); 1626 TokenQueue.push_back(T); 1627 IsDone = true; 1628 return true; 1629 } 1630 1631 if (!consumeLineBreakIfPresent()) { 1632 setError("Expected a line break after block scalar header", Current); 1633 return false; 1634 } 1635 return true; 1636 } 1637 1638 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1639 unsigned BlockExitIndent, 1640 unsigned &LineBreaks, bool &IsDone) { 1641 unsigned MaxAllSpaceLineCharacters = 0; 1642 StringRef::iterator LongestAllSpaceLine; 1643 1644 while (true) { 1645 advanceWhile(&Scanner::skip_s_space); 1646 if (skip_nb_char(Current) != Current) { 1647 // This line isn't empty, so try and find the indentation. 1648 if (Column <= BlockExitIndent) { // End of the block literal. 1649 IsDone = true; 1650 return true; 1651 } 1652 // We found the block's indentation. 1653 BlockIndent = Column; 1654 if (MaxAllSpaceLineCharacters > BlockIndent) { 1655 setError( 1656 "Leading all-spaces line must be smaller than the block indent", 1657 LongestAllSpaceLine); 1658 return false; 1659 } 1660 return true; 1661 } 1662 if (skip_b_break(Current) != Current && 1663 Column > MaxAllSpaceLineCharacters) { 1664 // Record the longest all-space line in case it's longer than the 1665 // discovered block indent. 1666 MaxAllSpaceLineCharacters = Column; 1667 LongestAllSpaceLine = Current; 1668 } 1669 1670 // Check for EOF. 1671 if (Current == End) { 1672 IsDone = true; 1673 return true; 1674 } 1675 1676 if (!consumeLineBreakIfPresent()) { 1677 IsDone = true; 1678 return true; 1679 } 1680 ++LineBreaks; 1681 } 1682 return true; 1683 } 1684 1685 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1686 unsigned BlockExitIndent, bool &IsDone) { 1687 // Skip the indentation. 1688 while (Column < BlockIndent) { 1689 auto I = skip_s_space(Current); 1690 if (I == Current) 1691 break; 1692 Current = I; 1693 ++Column; 1694 } 1695 1696 if (skip_nb_char(Current) == Current) 1697 return true; 1698 1699 if (Column <= BlockExitIndent) { // End of the block literal. 1700 IsDone = true; 1701 return true; 1702 } 1703 1704 if (Column < BlockIndent) { 1705 if (Current != End && *Current == '#') { // Trailing comment. 1706 IsDone = true; 1707 return true; 1708 } 1709 setError("A text line is less indented than the block scalar", Current); 1710 return false; 1711 } 1712 return true; // A normal text line. 1713 } 1714 1715 bool Scanner::scanBlockScalar(bool IsLiteral) { 1716 assert(*Current == '|' || *Current == '>'); 1717 char StyleIndicator; 1718 char ChompingIndicator; 1719 unsigned BlockIndent; 1720 bool IsDone = false; 1721 if (!scanBlockScalarIndicators(StyleIndicator, ChompingIndicator, BlockIndent, 1722 IsDone)) 1723 return false; 1724 if (IsDone) 1725 return true; 1726 bool IsFolded = StyleIndicator == '>'; 1727 1728 const auto *Start = Current; 1729 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1730 unsigned LineBreaks = 0; 1731 if (BlockIndent == 0) { 1732 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1733 IsDone)) 1734 return false; 1735 } 1736 1737 // Scan the block's scalars body. 1738 SmallString<256> Str; 1739 while (!IsDone) { 1740 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1741 return false; 1742 if (IsDone) 1743 break; 1744 1745 // Parse the current line. 1746 auto LineStart = Current; 1747 advanceWhile(&Scanner::skip_nb_char); 1748 if (LineStart != Current) { 1749 if (LineBreaks && IsFolded && !Scanner::isLineEmpty(Str)) { 1750 // The folded style "folds" any single line break between content into a 1751 // single space, except when that content is "empty" (only contains 1752 // whitespace) in which case the line break is left as-is. 1753 if (LineBreaks == 1) { 1754 Str.append(LineBreaks, 1755 isLineEmpty(StringRef(LineStart, Current - LineStart)) 1756 ? '\n' 1757 : ' '); 1758 } 1759 // If we saw a single line break, we are completely replacing it and so 1760 // want `LineBreaks == 0`. Otherwise this decrement accounts for the 1761 // fact that the first line break is "trimmed", only being used to 1762 // signal a sequence of line breaks which should not be folded. 1763 LineBreaks--; 1764 } 1765 Str.append(LineBreaks, '\n'); 1766 Str.append(StringRef(LineStart, Current - LineStart)); 1767 LineBreaks = 0; 1768 } 1769 1770 // Check for EOF. 1771 if (Current == End) 1772 break; 1773 1774 if (!consumeLineBreakIfPresent()) 1775 break; 1776 ++LineBreaks; 1777 } 1778 1779 if (Current == End && !LineBreaks) 1780 // Ensure that there is at least one line break before the end of file. 1781 LineBreaks = 1; 1782 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1783 1784 // New lines may start a simple key. 1785 if (!FlowLevel) 1786 IsSimpleKeyAllowed = true; 1787 IsAdjacentValueAllowedInFlow = false; 1788 1789 Token T; 1790 T.Kind = Token::TK_BlockScalar; 1791 T.Range = StringRef(Start, Current - Start); 1792 T.Value = std::string(Str); 1793 TokenQueue.push_back(T); 1794 return true; 1795 } 1796 1797 bool Scanner::scanTag() { 1798 StringRef::iterator Start = Current; 1799 unsigned ColStart = Column; 1800 skip(1); // Eat !. 1801 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1802 else if (*Current == '<') { 1803 skip(1); 1804 scan_ns_uri_char(); 1805 if (!consume('>')) 1806 return false; 1807 } else { 1808 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1809 Current = skip_while(&Scanner::skip_ns_char, Current); 1810 } 1811 1812 Token T; 1813 T.Kind = Token::TK_Tag; 1814 T.Range = StringRef(Start, Current - Start); 1815 TokenQueue.push_back(T); 1816 1817 // Tags can be simple keys. 1818 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1819 1820 IsSimpleKeyAllowed = false; 1821 IsAdjacentValueAllowedInFlow = false; 1822 1823 return true; 1824 } 1825 1826 bool Scanner::fetchMoreTokens() { 1827 if (IsStartOfStream) 1828 return scanStreamStart(); 1829 1830 scanToNextToken(); 1831 1832 if (Current == End) 1833 return scanStreamEnd(); 1834 1835 removeStaleSimpleKeyCandidates(); 1836 1837 unrollIndent(Column); 1838 1839 if (Column == 0 && *Current == '%') 1840 return scanDirective(); 1841 1842 if (Column == 0 && Current + 4 <= End 1843 && *Current == '-' 1844 && *(Current + 1) == '-' 1845 && *(Current + 2) == '-' 1846 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1847 return scanDocumentIndicator(true); 1848 1849 if (Column == 0 && Current + 4 <= End 1850 && *Current == '.' 1851 && *(Current + 1) == '.' 1852 && *(Current + 2) == '.' 1853 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1854 return scanDocumentIndicator(false); 1855 1856 if (*Current == '[') 1857 return scanFlowCollectionStart(true); 1858 1859 if (*Current == '{') 1860 return scanFlowCollectionStart(false); 1861 1862 if (*Current == ']') 1863 return scanFlowCollectionEnd(true); 1864 1865 if (*Current == '}') 1866 return scanFlowCollectionEnd(false); 1867 1868 if (*Current == ',') 1869 return scanFlowEntry(); 1870 1871 if (*Current == '-' && (isBlankOrBreak(Current + 1) || Current + 1 == End)) 1872 return scanBlockEntry(); 1873 1874 if (*Current == '?' && (Current + 1 == End || isBlankOrBreak(Current + 1))) 1875 return scanKey(); 1876 1877 if (*Current == ':' && 1878 (!isPlainSafeNonBlank(Current + 1) || IsAdjacentValueAllowedInFlow)) 1879 return scanValue(); 1880 1881 if (*Current == '*') 1882 return scanAliasOrAnchor(true); 1883 1884 if (*Current == '&') 1885 return scanAliasOrAnchor(false); 1886 1887 if (*Current == '!') 1888 return scanTag(); 1889 1890 if (*Current == '|' && !FlowLevel) 1891 return scanBlockScalar(true); 1892 1893 if (*Current == '>' && !FlowLevel) 1894 return scanBlockScalar(false); 1895 1896 if (*Current == '\'') 1897 return scanFlowScalar(false); 1898 1899 if (*Current == '"') 1900 return scanFlowScalar(true); 1901 1902 // Get a plain scalar. 1903 StringRef FirstChar(Current, 1); 1904 if ((!isBlankOrBreak(Current) && 1905 FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") == StringRef::npos) || 1906 (FirstChar.find_first_of("?:-") != StringRef::npos && 1907 isPlainSafeNonBlank(Current + 1))) 1908 return scanPlainScalar(); 1909 1910 setError("Unrecognized character while tokenizing.", Current); 1911 return false; 1912 } 1913 1914 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1915 std::error_code *EC) 1916 : scanner(new Scanner(Input, SM, ShowColors, EC)) {} 1917 1918 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1919 std::error_code *EC) 1920 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)) {} 1921 1922 Stream::~Stream() = default; 1923 1924 bool Stream::failed() { return scanner->failed(); } 1925 1926 void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) { 1927 printError(N ? N->getSourceRange() : SMRange(), Msg, Kind); 1928 } 1929 1930 void Stream::printError(const SMRange &Range, const Twine &Msg, 1931 SourceMgr::DiagKind Kind) { 1932 scanner->printError(Range.Start, Kind, Msg, Range); 1933 } 1934 1935 document_iterator Stream::begin() { 1936 if (CurrentDoc) 1937 report_fatal_error("Can only iterate over the stream once"); 1938 1939 // Skip Stream-Start. 1940 scanner->getNext(); 1941 1942 CurrentDoc.reset(new Document(*this)); 1943 return document_iterator(CurrentDoc); 1944 } 1945 1946 document_iterator Stream::end() { 1947 return document_iterator(); 1948 } 1949 1950 void Stream::skip() { 1951 for (Document &Doc : *this) 1952 Doc.skip(); 1953 } 1954 1955 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1956 StringRef T) 1957 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1958 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1959 SourceRange = SMRange(Start, Start); 1960 } 1961 1962 std::string Node::getVerbatimTag() const { 1963 StringRef Raw = getRawTag(); 1964 if (!Raw.empty() && Raw != "!") { 1965 std::string Ret; 1966 if (Raw.find_last_of('!') == 0) { 1967 Ret = std::string(Doc->getTagMap().find("!")->second); 1968 Ret += Raw.substr(1); 1969 return Ret; 1970 } else if (Raw.starts_with("!!")) { 1971 Ret = std::string(Doc->getTagMap().find("!!")->second); 1972 Ret += Raw.substr(2); 1973 return Ret; 1974 } else { 1975 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1976 std::map<StringRef, StringRef>::const_iterator It = 1977 Doc->getTagMap().find(TagHandle); 1978 if (It != Doc->getTagMap().end()) 1979 Ret = std::string(It->second); 1980 else { 1981 Token T; 1982 T.Kind = Token::TK_Tag; 1983 T.Range = TagHandle; 1984 setError(Twine("Unknown tag handle ") + TagHandle, T); 1985 } 1986 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1987 return Ret; 1988 } 1989 } 1990 1991 switch (getType()) { 1992 case NK_Null: 1993 return "tag:yaml.org,2002:null"; 1994 case NK_Scalar: 1995 case NK_BlockScalar: 1996 // TODO: Tag resolution. 1997 return "tag:yaml.org,2002:str"; 1998 case NK_Mapping: 1999 return "tag:yaml.org,2002:map"; 2000 case NK_Sequence: 2001 return "tag:yaml.org,2002:seq"; 2002 } 2003 2004 return ""; 2005 } 2006 2007 Token &Node::peekNext() { 2008 return Doc->peekNext(); 2009 } 2010 2011 Token Node::getNext() { 2012 return Doc->getNext(); 2013 } 2014 2015 Node *Node::parseBlockNode() { 2016 return Doc->parseBlockNode(); 2017 } 2018 2019 BumpPtrAllocator &Node::getAllocator() { 2020 return Doc->NodeAllocator; 2021 } 2022 2023 void Node::setError(const Twine &Msg, Token &Tok) const { 2024 Doc->setError(Msg, Tok); 2025 } 2026 2027 bool Node::failed() const { 2028 return Doc->failed(); 2029 } 2030 2031 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 2032 if (Value[0] == '"') 2033 return getDoubleQuotedValue(Value, Storage); 2034 if (Value[0] == '\'') 2035 return getSingleQuotedValue(Value, Storage); 2036 return getPlainValue(Value, Storage); 2037 } 2038 2039 /// parseScalarValue - A common parsing routine for all flow scalar styles. 2040 /// It handles line break characters by itself, adds regular content characters 2041 /// to the result, and forwards escaped sequences to the provided routine for 2042 /// the style-specific processing. 2043 /// 2044 /// \param UnquotedValue - An input value without quotation marks. 2045 /// \param Storage - A storage for the result if the input value is multiline or 2046 /// contains escaped characters. 2047 /// \param LookupChars - A set of special characters to search in the input 2048 /// string. Should include line break characters and the escape character 2049 /// specific for the processing scalar style, if any. 2050 /// \param UnescapeCallback - This is called when the escape character is found 2051 /// in the input. 2052 /// \returns - The unfolded and unescaped value. 2053 static StringRef 2054 parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage, 2055 StringRef LookupChars, 2056 std::function<StringRef(StringRef, SmallVectorImpl<char> &)> 2057 UnescapeCallback) { 2058 size_t I = UnquotedValue.find_first_of(LookupChars); 2059 if (I == StringRef::npos) 2060 return UnquotedValue; 2061 2062 Storage.clear(); 2063 Storage.reserve(UnquotedValue.size()); 2064 char LastNewLineAddedAs = '\0'; 2065 for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) { 2066 if (UnquotedValue[I] != '\r' && UnquotedValue[I] != '\n') { 2067 llvm::append_range(Storage, UnquotedValue.take_front(I)); 2068 UnquotedValue = UnescapeCallback(UnquotedValue.drop_front(I), Storage); 2069 LastNewLineAddedAs = '\0'; 2070 continue; 2071 } 2072 if (size_t LastNonSWhite = UnquotedValue.find_last_not_of(" \t", I); 2073 LastNonSWhite != StringRef::npos) { 2074 llvm::append_range(Storage, UnquotedValue.take_front(LastNonSWhite + 1)); 2075 Storage.push_back(' '); 2076 LastNewLineAddedAs = ' '; 2077 } else { 2078 // Note: we can't just check if the last character in Storage is ' ', 2079 // '\n', or something else; that would give a wrong result for double 2080 // quoted values containing an escaped space character before a new-line 2081 // character. 2082 switch (LastNewLineAddedAs) { 2083 case ' ': 2084 assert(!Storage.empty() && Storage.back() == ' '); 2085 Storage.back() = '\n'; 2086 LastNewLineAddedAs = '\n'; 2087 break; 2088 case '\n': 2089 assert(!Storage.empty() && Storage.back() == '\n'); 2090 Storage.push_back('\n'); 2091 break; 2092 default: 2093 Storage.push_back(' '); 2094 LastNewLineAddedAs = ' '; 2095 break; 2096 } 2097 } 2098 // Handle Windows-style EOL 2099 if (UnquotedValue.substr(I, 2) == "\r\n") 2100 I++; 2101 UnquotedValue = UnquotedValue.drop_front(I + 1).ltrim(" \t"); 2102 } 2103 llvm::append_range(Storage, UnquotedValue); 2104 return StringRef(Storage.begin(), Storage.size()); 2105 } 2106 2107 StringRef 2108 ScalarNode::getDoubleQuotedValue(StringRef RawValue, 2109 SmallVectorImpl<char> &Storage) const { 2110 assert(RawValue.size() >= 2 && RawValue.front() == '"' && 2111 RawValue.back() == '"'); 2112 StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2); 2113 2114 auto UnescapeFunc = [this](StringRef UnquotedValue, 2115 SmallVectorImpl<char> &Storage) { 2116 assert(UnquotedValue.take_front(1) == "\\"); 2117 if (UnquotedValue.size() == 1) { 2118 Token T; 2119 T.Range = UnquotedValue; 2120 setError("Unrecognized escape code", T); 2121 Storage.clear(); 2122 return StringRef(); 2123 } 2124 UnquotedValue = UnquotedValue.drop_front(1); 2125 switch (UnquotedValue[0]) { 2126 default: { 2127 Token T; 2128 T.Range = UnquotedValue.take_front(1); 2129 setError("Unrecognized escape code", T); 2130 Storage.clear(); 2131 return StringRef(); 2132 } 2133 case '\r': 2134 // Shrink the Windows-style EOL. 2135 if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n') 2136 UnquotedValue = UnquotedValue.drop_front(1); 2137 [[fallthrough]]; 2138 case '\n': 2139 return UnquotedValue.drop_front(1).ltrim(" \t"); 2140 case '0': 2141 Storage.push_back(0x00); 2142 break; 2143 case 'a': 2144 Storage.push_back(0x07); 2145 break; 2146 case 'b': 2147 Storage.push_back(0x08); 2148 break; 2149 case 't': 2150 case 0x09: 2151 Storage.push_back(0x09); 2152 break; 2153 case 'n': 2154 Storage.push_back(0x0A); 2155 break; 2156 case 'v': 2157 Storage.push_back(0x0B); 2158 break; 2159 case 'f': 2160 Storage.push_back(0x0C); 2161 break; 2162 case 'r': 2163 Storage.push_back(0x0D); 2164 break; 2165 case 'e': 2166 Storage.push_back(0x1B); 2167 break; 2168 case ' ': 2169 Storage.push_back(0x20); 2170 break; 2171 case '"': 2172 Storage.push_back(0x22); 2173 break; 2174 case '/': 2175 Storage.push_back(0x2F); 2176 break; 2177 case '\\': 2178 Storage.push_back(0x5C); 2179 break; 2180 case 'N': 2181 encodeUTF8(0x85, Storage); 2182 break; 2183 case '_': 2184 encodeUTF8(0xA0, Storage); 2185 break; 2186 case 'L': 2187 encodeUTF8(0x2028, Storage); 2188 break; 2189 case 'P': 2190 encodeUTF8(0x2029, Storage); 2191 break; 2192 case 'x': { 2193 if (UnquotedValue.size() < 3) 2194 // TODO: Report error. 2195 break; 2196 unsigned int UnicodeScalarValue; 2197 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2198 // TODO: Report error. 2199 UnicodeScalarValue = 0xFFFD; 2200 encodeUTF8(UnicodeScalarValue, Storage); 2201 return UnquotedValue.drop_front(3); 2202 } 2203 case 'u': { 2204 if (UnquotedValue.size() < 5) 2205 // TODO: Report error. 2206 break; 2207 unsigned int UnicodeScalarValue; 2208 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2209 // TODO: Report error. 2210 UnicodeScalarValue = 0xFFFD; 2211 encodeUTF8(UnicodeScalarValue, Storage); 2212 return UnquotedValue.drop_front(5); 2213 } 2214 case 'U': { 2215 if (UnquotedValue.size() < 9) 2216 // TODO: Report error. 2217 break; 2218 unsigned int UnicodeScalarValue; 2219 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2220 // TODO: Report error. 2221 UnicodeScalarValue = 0xFFFD; 2222 encodeUTF8(UnicodeScalarValue, Storage); 2223 return UnquotedValue.drop_front(9); 2224 } 2225 } 2226 return UnquotedValue.drop_front(1); 2227 }; 2228 2229 return parseScalarValue(UnquotedValue, Storage, "\\\r\n", UnescapeFunc); 2230 } 2231 2232 StringRef ScalarNode::getSingleQuotedValue(StringRef RawValue, 2233 SmallVectorImpl<char> &Storage) { 2234 assert(RawValue.size() >= 2 && RawValue.front() == '\'' && 2235 RawValue.back() == '\''); 2236 StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2); 2237 2238 auto UnescapeFunc = [](StringRef UnquotedValue, 2239 SmallVectorImpl<char> &Storage) { 2240 assert(UnquotedValue.take_front(2) == "''"); 2241 Storage.push_back('\''); 2242 return UnquotedValue.drop_front(2); 2243 }; 2244 2245 return parseScalarValue(UnquotedValue, Storage, "'\r\n", UnescapeFunc); 2246 } 2247 2248 StringRef ScalarNode::getPlainValue(StringRef RawValue, 2249 SmallVectorImpl<char> &Storage) { 2250 // Trim trailing whitespace ('b-char' and 's-white'). 2251 // NOTE: Alternatively we could change the scanner to not include whitespace 2252 // here in the first place. 2253 RawValue = RawValue.rtrim("\r\n \t"); 2254 return parseScalarValue(RawValue, Storage, "\r\n", nullptr); 2255 } 2256 2257 Node *KeyValueNode::getKey() { 2258 if (Key) 2259 return Key; 2260 // Handle implicit null keys. 2261 { 2262 Token &t = peekNext(); 2263 if ( t.Kind == Token::TK_BlockEnd 2264 || t.Kind == Token::TK_Value 2265 || t.Kind == Token::TK_Error) { 2266 return Key = new (getAllocator()) NullNode(Doc); 2267 } 2268 if (t.Kind == Token::TK_Key) 2269 getNext(); // skip TK_Key. 2270 } 2271 2272 // Handle explicit null keys. 2273 Token &t = peekNext(); 2274 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2275 return Key = new (getAllocator()) NullNode(Doc); 2276 } 2277 2278 // We've got a normal key. 2279 return Key = parseBlockNode(); 2280 } 2281 2282 Node *KeyValueNode::getValue() { 2283 if (Value) 2284 return Value; 2285 2286 if (Node* Key = getKey()) 2287 Key->skip(); 2288 else { 2289 setError("Null key in Key Value.", peekNext()); 2290 return Value = new (getAllocator()) NullNode(Doc); 2291 } 2292 2293 if (failed()) 2294 return Value = new (getAllocator()) NullNode(Doc); 2295 2296 // Handle implicit null values. 2297 { 2298 Token &t = peekNext(); 2299 if ( t.Kind == Token::TK_BlockEnd 2300 || t.Kind == Token::TK_FlowMappingEnd 2301 || t.Kind == Token::TK_Key 2302 || t.Kind == Token::TK_FlowEntry 2303 || t.Kind == Token::TK_Error) { 2304 return Value = new (getAllocator()) NullNode(Doc); 2305 } 2306 2307 if (t.Kind != Token::TK_Value) { 2308 setError("Unexpected token in Key Value.", t); 2309 return Value = new (getAllocator()) NullNode(Doc); 2310 } 2311 getNext(); // skip TK_Value. 2312 } 2313 2314 // Handle explicit null values. 2315 Token &t = peekNext(); 2316 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2317 return Value = new (getAllocator()) NullNode(Doc); 2318 } 2319 2320 // We got a normal value. 2321 return Value = parseBlockNode(); 2322 } 2323 2324 void MappingNode::increment() { 2325 if (failed()) { 2326 IsAtEnd = true; 2327 CurrentEntry = nullptr; 2328 return; 2329 } 2330 if (CurrentEntry) { 2331 CurrentEntry->skip(); 2332 if (Type == MT_Inline) { 2333 IsAtEnd = true; 2334 CurrentEntry = nullptr; 2335 return; 2336 } 2337 } 2338 Token T = peekNext(); 2339 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2340 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2341 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2342 } else if (Type == MT_Block) { 2343 switch (T.Kind) { 2344 case Token::TK_BlockEnd: 2345 getNext(); 2346 IsAtEnd = true; 2347 CurrentEntry = nullptr; 2348 break; 2349 default: 2350 setError("Unexpected token. Expected Key or Block End", T); 2351 [[fallthrough]]; 2352 case Token::TK_Error: 2353 IsAtEnd = true; 2354 CurrentEntry = nullptr; 2355 } 2356 } else { 2357 switch (T.Kind) { 2358 case Token::TK_FlowEntry: 2359 // Eat the flow entry and recurse. 2360 getNext(); 2361 return increment(); 2362 case Token::TK_FlowMappingEnd: 2363 getNext(); 2364 [[fallthrough]]; 2365 case Token::TK_Error: 2366 // Set this to end iterator. 2367 IsAtEnd = true; 2368 CurrentEntry = nullptr; 2369 break; 2370 default: 2371 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2372 "Mapping End." 2373 , T); 2374 IsAtEnd = true; 2375 CurrentEntry = nullptr; 2376 } 2377 } 2378 } 2379 2380 void SequenceNode::increment() { 2381 if (failed()) { 2382 IsAtEnd = true; 2383 CurrentEntry = nullptr; 2384 return; 2385 } 2386 if (CurrentEntry) 2387 CurrentEntry->skip(); 2388 Token T = peekNext(); 2389 if (SeqType == ST_Block) { 2390 switch (T.Kind) { 2391 case Token::TK_BlockEntry: 2392 getNext(); 2393 CurrentEntry = parseBlockNode(); 2394 if (!CurrentEntry) { // An error occurred. 2395 IsAtEnd = true; 2396 CurrentEntry = nullptr; 2397 } 2398 break; 2399 case Token::TK_BlockEnd: 2400 getNext(); 2401 IsAtEnd = true; 2402 CurrentEntry = nullptr; 2403 break; 2404 default: 2405 setError( "Unexpected token. Expected Block Entry or Block End." 2406 , T); 2407 [[fallthrough]]; 2408 case Token::TK_Error: 2409 IsAtEnd = true; 2410 CurrentEntry = nullptr; 2411 } 2412 } else if (SeqType == ST_Indentless) { 2413 switch (T.Kind) { 2414 case Token::TK_BlockEntry: 2415 getNext(); 2416 CurrentEntry = parseBlockNode(); 2417 if (!CurrentEntry) { // An error occurred. 2418 IsAtEnd = true; 2419 CurrentEntry = nullptr; 2420 } 2421 break; 2422 default: 2423 case Token::TK_Error: 2424 IsAtEnd = true; 2425 CurrentEntry = nullptr; 2426 } 2427 } else if (SeqType == ST_Flow) { 2428 switch (T.Kind) { 2429 case Token::TK_FlowEntry: 2430 // Eat the flow entry and recurse. 2431 getNext(); 2432 WasPreviousTokenFlowEntry = true; 2433 return increment(); 2434 case Token::TK_FlowSequenceEnd: 2435 getNext(); 2436 [[fallthrough]]; 2437 case Token::TK_Error: 2438 // Set this to end iterator. 2439 IsAtEnd = true; 2440 CurrentEntry = nullptr; 2441 break; 2442 case Token::TK_StreamEnd: 2443 case Token::TK_DocumentEnd: 2444 case Token::TK_DocumentStart: 2445 setError("Could not find closing ]!", T); 2446 // Set this to end iterator. 2447 IsAtEnd = true; 2448 CurrentEntry = nullptr; 2449 break; 2450 default: 2451 if (!WasPreviousTokenFlowEntry) { 2452 setError("Expected , between entries!", T); 2453 IsAtEnd = true; 2454 CurrentEntry = nullptr; 2455 break; 2456 } 2457 // Otherwise it must be a flow entry. 2458 CurrentEntry = parseBlockNode(); 2459 if (!CurrentEntry) { 2460 IsAtEnd = true; 2461 } 2462 WasPreviousTokenFlowEntry = false; 2463 break; 2464 } 2465 } 2466 } 2467 2468 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2469 // Tag maps starts with two default mappings. 2470 TagMap["!"] = "!"; 2471 TagMap["!!"] = "tag:yaml.org,2002:"; 2472 2473 if (parseDirectives()) 2474 expectToken(Token::TK_DocumentStart); 2475 Token &T = peekNext(); 2476 if (T.Kind == Token::TK_DocumentStart) 2477 getNext(); 2478 } 2479 2480 bool Document::skip() { 2481 if (stream.scanner->failed()) 2482 return false; 2483 if (!Root && !getRoot()) 2484 return false; 2485 Root->skip(); 2486 Token &T = peekNext(); 2487 if (T.Kind == Token::TK_StreamEnd) 2488 return false; 2489 if (T.Kind == Token::TK_DocumentEnd) { 2490 getNext(); 2491 return skip(); 2492 } 2493 return true; 2494 } 2495 2496 Token &Document::peekNext() { 2497 return stream.scanner->peekNext(); 2498 } 2499 2500 Token Document::getNext() { 2501 return stream.scanner->getNext(); 2502 } 2503 2504 void Document::setError(const Twine &Message, Token &Location) const { 2505 stream.scanner->setError(Message, Location.Range.begin()); 2506 } 2507 2508 bool Document::failed() const { 2509 return stream.scanner->failed(); 2510 } 2511 2512 Node *Document::parseBlockNode() { 2513 Token T = peekNext(); 2514 // Handle properties. 2515 Token AnchorInfo; 2516 Token TagInfo; 2517 parse_property: 2518 switch (T.Kind) { 2519 case Token::TK_Alias: 2520 getNext(); 2521 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2522 case Token::TK_Anchor: 2523 if (AnchorInfo.Kind == Token::TK_Anchor) { 2524 setError("Already encountered an anchor for this node!", T); 2525 return nullptr; 2526 } 2527 AnchorInfo = getNext(); // Consume TK_Anchor. 2528 T = peekNext(); 2529 goto parse_property; 2530 case Token::TK_Tag: 2531 if (TagInfo.Kind == Token::TK_Tag) { 2532 setError("Already encountered a tag for this node!", T); 2533 return nullptr; 2534 } 2535 TagInfo = getNext(); // Consume TK_Tag. 2536 T = peekNext(); 2537 goto parse_property; 2538 default: 2539 break; 2540 } 2541 2542 switch (T.Kind) { 2543 case Token::TK_BlockEntry: 2544 // We got an unindented BlockEntry sequence. This is not terminated with 2545 // a BlockEnd. 2546 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2547 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2548 , AnchorInfo.Range.substr(1) 2549 , TagInfo.Range 2550 , SequenceNode::ST_Indentless); 2551 case Token::TK_BlockSequenceStart: 2552 getNext(); 2553 return new (NodeAllocator) 2554 SequenceNode( stream.CurrentDoc 2555 , AnchorInfo.Range.substr(1) 2556 , TagInfo.Range 2557 , SequenceNode::ST_Block); 2558 case Token::TK_BlockMappingStart: 2559 getNext(); 2560 return new (NodeAllocator) 2561 MappingNode( stream.CurrentDoc 2562 , AnchorInfo.Range.substr(1) 2563 , TagInfo.Range 2564 , MappingNode::MT_Block); 2565 case Token::TK_FlowSequenceStart: 2566 getNext(); 2567 return new (NodeAllocator) 2568 SequenceNode( stream.CurrentDoc 2569 , AnchorInfo.Range.substr(1) 2570 , TagInfo.Range 2571 , SequenceNode::ST_Flow); 2572 case Token::TK_FlowMappingStart: 2573 getNext(); 2574 return new (NodeAllocator) 2575 MappingNode( stream.CurrentDoc 2576 , AnchorInfo.Range.substr(1) 2577 , TagInfo.Range 2578 , MappingNode::MT_Flow); 2579 case Token::TK_Scalar: 2580 getNext(); 2581 return new (NodeAllocator) 2582 ScalarNode( stream.CurrentDoc 2583 , AnchorInfo.Range.substr(1) 2584 , TagInfo.Range 2585 , T.Range); 2586 case Token::TK_BlockScalar: { 2587 getNext(); 2588 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2589 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2590 return new (NodeAllocator) 2591 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2592 TagInfo.Range, StrCopy, T.Range); 2593 } 2594 case Token::TK_Key: 2595 // Don't eat the TK_Key, KeyValueNode expects it. 2596 return new (NodeAllocator) 2597 MappingNode( stream.CurrentDoc 2598 , AnchorInfo.Range.substr(1) 2599 , TagInfo.Range 2600 , MappingNode::MT_Inline); 2601 case Token::TK_DocumentStart: 2602 case Token::TK_DocumentEnd: 2603 case Token::TK_StreamEnd: 2604 default: 2605 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2606 // !!null null. 2607 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2608 case Token::TK_FlowMappingEnd: 2609 case Token::TK_FlowSequenceEnd: 2610 case Token::TK_FlowEntry: { 2611 if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root))) 2612 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2613 2614 setError("Unexpected token", T); 2615 return nullptr; 2616 } 2617 case Token::TK_Error: 2618 return nullptr; 2619 } 2620 llvm_unreachable("Control flow shouldn't reach here."); 2621 return nullptr; 2622 } 2623 2624 bool Document::parseDirectives() { 2625 bool isDirective = false; 2626 while (true) { 2627 Token T = peekNext(); 2628 if (T.Kind == Token::TK_TagDirective) { 2629 parseTAGDirective(); 2630 isDirective = true; 2631 } else if (T.Kind == Token::TK_VersionDirective) { 2632 parseYAMLDirective(); 2633 isDirective = true; 2634 } else 2635 break; 2636 } 2637 return isDirective; 2638 } 2639 2640 void Document::parseYAMLDirective() { 2641 getNext(); // Eat %YAML <version> 2642 } 2643 2644 void Document::parseTAGDirective() { 2645 Token Tag = getNext(); // %TAG <handle> <prefix> 2646 StringRef T = Tag.Range; 2647 // Strip %TAG 2648 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2649 std::size_t HandleEnd = T.find_first_of(" \t"); 2650 StringRef TagHandle = T.substr(0, HandleEnd); 2651 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2652 TagMap[TagHandle] = TagPrefix; 2653 } 2654 2655 bool Document::expectToken(int TK) { 2656 Token T = getNext(); 2657 if (T.Kind != TK) { 2658 setError("Unexpected token", T); 2659 return false; 2660 } 2661 return true; 2662 } 2663