1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a YAML parser. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/YAMLParser.h" 14 #include "llvm/ADT/AllocatorList.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/None.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/ADT/SmallString.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/StringExtras.h" 21 #include "llvm/ADT/StringRef.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Support/Compiler.h" 24 #include "llvm/Support/ErrorHandling.h" 25 #include "llvm/Support/MemoryBuffer.h" 26 #include "llvm/Support/SMLoc.h" 27 #include "llvm/Support/SourceMgr.h" 28 #include "llvm/Support/Unicode.h" 29 #include "llvm/Support/raw_ostream.h" 30 #include <cassert> 31 #include <cstddef> 32 #include <cstdint> 33 #include <map> 34 #include <memory> 35 #include <string> 36 #include <system_error> 37 #include <utility> 38 39 using namespace llvm; 40 using namespace yaml; 41 42 enum UnicodeEncodingForm { 43 UEF_UTF32_LE, ///< UTF-32 Little Endian 44 UEF_UTF32_BE, ///< UTF-32 Big Endian 45 UEF_UTF16_LE, ///< UTF-16 Little Endian 46 UEF_UTF16_BE, ///< UTF-16 Big Endian 47 UEF_UTF8, ///< UTF-8 or ascii. 48 UEF_Unknown ///< Not a valid Unicode encoding. 49 }; 50 51 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 52 /// it exists. Length is in {0, 2, 3, 4}. 53 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>; 54 55 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 56 /// encoding form of \a Input. 57 /// 58 /// @param Input A string of length 0 or more. 59 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 60 /// and how long the byte order mark is if one exists. 61 static EncodingInfo getUnicodeEncoding(StringRef Input) { 62 if (Input.empty()) 63 return std::make_pair(UEF_Unknown, 0); 64 65 switch (uint8_t(Input[0])) { 66 case 0x00: 67 if (Input.size() >= 4) { 68 if ( Input[1] == 0 69 && uint8_t(Input[2]) == 0xFE 70 && uint8_t(Input[3]) == 0xFF) 71 return std::make_pair(UEF_UTF32_BE, 4); 72 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 73 return std::make_pair(UEF_UTF32_BE, 0); 74 } 75 76 if (Input.size() >= 2 && Input[1] != 0) 77 return std::make_pair(UEF_UTF16_BE, 0); 78 return std::make_pair(UEF_Unknown, 0); 79 case 0xFF: 80 if ( Input.size() >= 4 81 && uint8_t(Input[1]) == 0xFE 82 && Input[2] == 0 83 && Input[3] == 0) 84 return std::make_pair(UEF_UTF32_LE, 4); 85 86 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 87 return std::make_pair(UEF_UTF16_LE, 2); 88 return std::make_pair(UEF_Unknown, 0); 89 case 0xFE: 90 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 91 return std::make_pair(UEF_UTF16_BE, 2); 92 return std::make_pair(UEF_Unknown, 0); 93 case 0xEF: 94 if ( Input.size() >= 3 95 && uint8_t(Input[1]) == 0xBB 96 && uint8_t(Input[2]) == 0xBF) 97 return std::make_pair(UEF_UTF8, 3); 98 return std::make_pair(UEF_Unknown, 0); 99 } 100 101 // It could still be utf-32 or utf-16. 102 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 103 return std::make_pair(UEF_UTF32_LE, 0); 104 105 if (Input.size() >= 2 && Input[1] == 0) 106 return std::make_pair(UEF_UTF16_LE, 0); 107 108 return std::make_pair(UEF_UTF8, 0); 109 } 110 111 /// Pin the vtables to this file. 112 void Node::anchor() {} 113 void NullNode::anchor() {} 114 void ScalarNode::anchor() {} 115 void BlockScalarNode::anchor() {} 116 void KeyValueNode::anchor() {} 117 void MappingNode::anchor() {} 118 void SequenceNode::anchor() {} 119 void AliasNode::anchor() {} 120 121 namespace llvm { 122 namespace yaml { 123 124 /// Token - A single YAML token. 125 struct Token { 126 enum TokenKind { 127 TK_Error, // Uninitialized token. 128 TK_StreamStart, 129 TK_StreamEnd, 130 TK_VersionDirective, 131 TK_TagDirective, 132 TK_DocumentStart, 133 TK_DocumentEnd, 134 TK_BlockEntry, 135 TK_BlockEnd, 136 TK_BlockSequenceStart, 137 TK_BlockMappingStart, 138 TK_FlowEntry, 139 TK_FlowSequenceStart, 140 TK_FlowSequenceEnd, 141 TK_FlowMappingStart, 142 TK_FlowMappingEnd, 143 TK_Key, 144 TK_Value, 145 TK_Scalar, 146 TK_BlockScalar, 147 TK_Alias, 148 TK_Anchor, 149 TK_Tag 150 } Kind = TK_Error; 151 152 /// A string of length 0 or more whose begin() points to the logical location 153 /// of the token in the input. 154 StringRef Range; 155 156 /// The value of a block scalar node. 157 std::string Value; 158 159 Token() = default; 160 }; 161 162 } // end namespace yaml 163 } // end namespace llvm 164 165 using TokenQueueT = BumpPtrList<Token>; 166 167 namespace { 168 169 /// This struct is used to track simple keys. 170 /// 171 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 172 /// which could legally be the start of a simple key. When peekNext is called, 173 /// if the Token To be returned is referenced by a SimpleKey, we continue 174 /// tokenizing until that potential simple key has either been found to not be 175 /// a simple key (we moved on to the next line or went further than 1024 chars). 176 /// Or when we run into a Value, and then insert a Key token (and possibly 177 /// others) before the SimpleKey's Tok. 178 struct SimpleKey { 179 TokenQueueT::iterator Tok; 180 unsigned Column = 0; 181 unsigned Line = 0; 182 unsigned FlowLevel = 0; 183 bool IsRequired = false; 184 185 bool operator ==(const SimpleKey &Other) { 186 return Tok == Other.Tok; 187 } 188 }; 189 190 } // end anonymous namespace 191 192 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit 193 /// subsequence and the subsequence's length in code units (uint8_t). 194 /// A length of 0 represents an error. 195 using UTF8Decoded = std::pair<uint32_t, unsigned>; 196 197 static UTF8Decoded decodeUTF8(StringRef Range) { 198 StringRef::iterator Position= Range.begin(); 199 StringRef::iterator End = Range.end(); 200 // 1 byte: [0x00, 0x7f] 201 // Bit pattern: 0xxxxxxx 202 if (Position < End && (*Position & 0x80) == 0) { 203 return std::make_pair(*Position, 1); 204 } 205 // 2 bytes: [0x80, 0x7ff] 206 // Bit pattern: 110xxxxx 10xxxxxx 207 if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) && 208 ((*(Position + 1) & 0xC0) == 0x80)) { 209 uint32_t codepoint = ((*Position & 0x1F) << 6) | 210 (*(Position + 1) & 0x3F); 211 if (codepoint >= 0x80) 212 return std::make_pair(codepoint, 2); 213 } 214 // 3 bytes: [0x8000, 0xffff] 215 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 216 if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) && 217 ((*(Position + 1) & 0xC0) == 0x80) && 218 ((*(Position + 2) & 0xC0) == 0x80)) { 219 uint32_t codepoint = ((*Position & 0x0F) << 12) | 220 ((*(Position + 1) & 0x3F) << 6) | 221 (*(Position + 2) & 0x3F); 222 // Codepoints between 0xD800 and 0xDFFF are invalid, as 223 // they are high / low surrogate halves used by UTF-16. 224 if (codepoint >= 0x800 && 225 (codepoint < 0xD800 || codepoint > 0xDFFF)) 226 return std::make_pair(codepoint, 3); 227 } 228 // 4 bytes: [0x10000, 0x10FFFF] 229 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 230 if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) && 231 ((*(Position + 1) & 0xC0) == 0x80) && 232 ((*(Position + 2) & 0xC0) == 0x80) && 233 ((*(Position + 3) & 0xC0) == 0x80)) { 234 uint32_t codepoint = ((*Position & 0x07) << 18) | 235 ((*(Position + 1) & 0x3F) << 12) | 236 ((*(Position + 2) & 0x3F) << 6) | 237 (*(Position + 3) & 0x3F); 238 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 239 return std::make_pair(codepoint, 4); 240 } 241 return std::make_pair(0, 0); 242 } 243 244 namespace llvm { 245 namespace yaml { 246 247 /// Scans YAML tokens from a MemoryBuffer. 248 class Scanner { 249 public: 250 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 251 std::error_code *EC = nullptr); 252 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 253 std::error_code *EC = nullptr); 254 255 /// Parse the next token and return it without popping it. 256 Token &peekNext(); 257 258 /// Parse the next token and pop it from the queue. 259 Token getNext(); 260 261 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 262 ArrayRef<SMRange> Ranges = std::nullopt) { 263 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ std::nullopt, 264 ShowColors); 265 } 266 267 void setError(const Twine &Message, StringRef::iterator Position) { 268 if (Position >= End) 269 Position = End - 1; 270 271 // propagate the error if possible 272 if (EC) 273 *EC = make_error_code(std::errc::invalid_argument); 274 275 // Don't print out more errors after the first one we encounter. The rest 276 // are just the result of the first, and have no meaning. 277 if (!Failed) 278 printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message); 279 Failed = true; 280 } 281 282 /// Returns true if an error occurred while parsing. 283 bool failed() { 284 return Failed; 285 } 286 287 private: 288 void init(MemoryBufferRef Buffer); 289 290 StringRef currentInput() { 291 return StringRef(Current, End - Current); 292 } 293 294 /// Decode a UTF-8 minimal well-formed code unit subsequence starting 295 /// at \a Position. 296 /// 297 /// If the UTF-8 code units starting at Position do not form a well-formed 298 /// code unit subsequence, then the Unicode scalar value is 0, and the length 299 /// is 0. 300 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 301 return ::decodeUTF8(StringRef(Position, End - Position)); 302 } 303 304 // The following functions are based on the gramar rules in the YAML spec. The 305 // style of the function names it meant to closely match how they are written 306 // in the spec. The number within the [] is the number of the grammar rule in 307 // the spec. 308 // 309 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 310 // 311 // c- 312 // A production starting and ending with a special character. 313 // b- 314 // A production matching a single line break. 315 // nb- 316 // A production starting and ending with a non-break character. 317 // s- 318 // A production starting and ending with a white space character. 319 // ns- 320 // A production starting and ending with a non-space character. 321 // l- 322 // A production matching complete line(s). 323 324 /// Skip a single nb-char[27] starting at Position. 325 /// 326 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 327 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 328 /// 329 /// @returns The code unit after the nb-char, or Position if it's not an 330 /// nb-char. 331 StringRef::iterator skip_nb_char(StringRef::iterator Position); 332 333 /// Skip a single b-break[28] starting at Position. 334 /// 335 /// A b-break is 0xD 0xA | 0xD | 0xA 336 /// 337 /// @returns The code unit after the b-break, or Position if it's not a 338 /// b-break. 339 StringRef::iterator skip_b_break(StringRef::iterator Position); 340 341 /// Skip a single s-space[31] starting at Position. 342 /// 343 /// An s-space is 0x20 344 /// 345 /// @returns The code unit after the s-space, or Position if it's not a 346 /// s-space. 347 StringRef::iterator skip_s_space(StringRef::iterator Position); 348 349 /// Skip a single s-white[33] starting at Position. 350 /// 351 /// A s-white is 0x20 | 0x9 352 /// 353 /// @returns The code unit after the s-white, or Position if it's not a 354 /// s-white. 355 StringRef::iterator skip_s_white(StringRef::iterator Position); 356 357 /// Skip a single ns-char[34] starting at Position. 358 /// 359 /// A ns-char is nb-char - s-white 360 /// 361 /// @returns The code unit after the ns-char, or Position if it's not a 362 /// ns-char. 363 StringRef::iterator skip_ns_char(StringRef::iterator Position); 364 365 using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); 366 367 /// Skip minimal well-formed code unit subsequences until Func 368 /// returns its input. 369 /// 370 /// @returns The code unit after the last minimal well-formed code unit 371 /// subsequence that Func accepted. 372 StringRef::iterator skip_while( SkipWhileFunc Func 373 , StringRef::iterator Position); 374 375 /// Skip minimal well-formed code unit subsequences until Func returns its 376 /// input. 377 void advanceWhile(SkipWhileFunc Func); 378 379 /// Scan ns-uri-char[39]s starting at Cur. 380 /// 381 /// This updates Cur and Column while scanning. 382 void scan_ns_uri_char(); 383 384 /// Consume a minimal well-formed code unit subsequence starting at 385 /// \a Cur. Return false if it is not the same Unicode scalar value as 386 /// \a Expected. This updates \a Column. 387 bool consume(uint32_t Expected); 388 389 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 390 void skip(uint32_t Distance); 391 392 /// Return true if the minimal well-formed code unit subsequence at 393 /// Pos is whitespace or a new line 394 bool isBlankOrBreak(StringRef::iterator Position); 395 396 /// Return true if the line is a line break, false otherwise. 397 bool isLineEmpty(StringRef Line); 398 399 /// Consume a single b-break[28] if it's present at the current position. 400 /// 401 /// Return false if the code unit at the current position isn't a line break. 402 bool consumeLineBreakIfPresent(); 403 404 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 405 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 406 , unsigned AtColumn 407 , bool IsRequired); 408 409 /// Remove simple keys that can no longer be valid simple keys. 410 /// 411 /// Invalid simple keys are not on the current line or are further than 1024 412 /// columns back. 413 void removeStaleSimpleKeyCandidates(); 414 415 /// Remove all simple keys on FlowLevel \a Level. 416 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 417 418 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 419 /// tokens if needed. 420 bool unrollIndent(int ToColumn); 421 422 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 423 /// if needed. 424 bool rollIndent( int ToColumn 425 , Token::TokenKind Kind 426 , TokenQueueT::iterator InsertPoint); 427 428 /// Skip a single-line comment when the comment starts at the current 429 /// position of the scanner. 430 void skipComment(); 431 432 /// Skip whitespace and comments until the start of the next token. 433 void scanToNextToken(); 434 435 /// Must be the first token generated. 436 bool scanStreamStart(); 437 438 /// Generate tokens needed to close out the stream. 439 bool scanStreamEnd(); 440 441 /// Scan a %BLAH directive. 442 bool scanDirective(); 443 444 /// Scan a ... or ---. 445 bool scanDocumentIndicator(bool IsStart); 446 447 /// Scan a [ or { and generate the proper flow collection start token. 448 bool scanFlowCollectionStart(bool IsSequence); 449 450 /// Scan a ] or } and generate the proper flow collection end token. 451 bool scanFlowCollectionEnd(bool IsSequence); 452 453 /// Scan the , that separates entries in a flow collection. 454 bool scanFlowEntry(); 455 456 /// Scan the - that starts block sequence entries. 457 bool scanBlockEntry(); 458 459 /// Scan an explicit ? indicating a key. 460 bool scanKey(); 461 462 /// Scan an explicit : indicating a value. 463 bool scanValue(); 464 465 /// Scan a quoted scalar. 466 bool scanFlowScalar(bool IsDoubleQuoted); 467 468 /// Scan an unquoted scalar. 469 bool scanPlainScalar(); 470 471 /// Scan an Alias or Anchor starting with * or &. 472 bool scanAliasOrAnchor(bool IsAlias); 473 474 /// Scan a block scalar starting with | or >. 475 bool scanBlockScalar(bool IsLiteral); 476 477 /// Scan a block scalar style indicator and header. 478 /// 479 /// Note: This is distinct from scanBlockScalarHeader to mirror the fact that 480 /// YAML does not consider the style indicator to be a part of the header. 481 /// 482 /// Return false if an error occurred. 483 bool scanBlockScalarIndicators(char &StyleIndicator, char &ChompingIndicator, 484 unsigned &IndentIndicator, bool &IsDone); 485 486 /// Scan a style indicator in a block scalar header. 487 char scanBlockStyleIndicator(); 488 489 /// Scan a chomping indicator in a block scalar header. 490 char scanBlockChompingIndicator(); 491 492 /// Scan an indentation indicator in a block scalar header. 493 unsigned scanBlockIndentationIndicator(); 494 495 /// Scan a block scalar header. 496 /// 497 /// Return false if an error occurred. 498 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 499 bool &IsDone); 500 501 /// Look for the indentation level of a block scalar. 502 /// 503 /// Return false if an error occurred. 504 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 505 unsigned &LineBreaks, bool &IsDone); 506 507 /// Scan the indentation of a text line in a block scalar. 508 /// 509 /// Return false if an error occurred. 510 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 511 bool &IsDone); 512 513 /// Scan a tag of the form !stuff. 514 bool scanTag(); 515 516 /// Dispatch to the next scanning function based on \a *Cur. 517 bool fetchMoreTokens(); 518 519 /// The SourceMgr used for diagnostics and buffer management. 520 SourceMgr &SM; 521 522 /// The original input. 523 MemoryBufferRef InputBuffer; 524 525 /// The current position of the scanner. 526 StringRef::iterator Current; 527 528 /// The end of the input (one past the last character). 529 StringRef::iterator End; 530 531 /// Current YAML indentation level in spaces. 532 int Indent; 533 534 /// Current column number in Unicode code points. 535 unsigned Column; 536 537 /// Current line number. 538 unsigned Line; 539 540 /// How deep we are in flow style containers. 0 Means at block level. 541 unsigned FlowLevel; 542 543 /// Are we at the start of the stream? 544 bool IsStartOfStream; 545 546 /// Can the next token be the start of a simple key? 547 bool IsSimpleKeyAllowed; 548 549 /// True if an error has occurred. 550 bool Failed; 551 552 /// Should colors be used when printing out the diagnostic messages? 553 bool ShowColors; 554 555 /// Queue of tokens. This is required to queue up tokens while looking 556 /// for the end of a simple key. And for cases where a single character 557 /// can produce multiple tokens (e.g. BlockEnd). 558 TokenQueueT TokenQueue; 559 560 /// Indentation levels. 561 SmallVector<int, 4> Indents; 562 563 /// Potential simple keys. 564 SmallVector<SimpleKey, 4> SimpleKeys; 565 566 std::error_code *EC; 567 }; 568 569 } // end namespace yaml 570 } // end namespace llvm 571 572 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 573 static void encodeUTF8( uint32_t UnicodeScalarValue 574 , SmallVectorImpl<char> &Result) { 575 if (UnicodeScalarValue <= 0x7F) { 576 Result.push_back(UnicodeScalarValue & 0x7F); 577 } else if (UnicodeScalarValue <= 0x7FF) { 578 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 579 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 580 Result.push_back(FirstByte); 581 Result.push_back(SecondByte); 582 } else if (UnicodeScalarValue <= 0xFFFF) { 583 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 584 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 585 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 586 Result.push_back(FirstByte); 587 Result.push_back(SecondByte); 588 Result.push_back(ThirdByte); 589 } else if (UnicodeScalarValue <= 0x10FFFF) { 590 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 591 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 592 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 593 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 594 Result.push_back(FirstByte); 595 Result.push_back(SecondByte); 596 Result.push_back(ThirdByte); 597 Result.push_back(FourthByte); 598 } 599 } 600 601 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 602 SourceMgr SM; 603 Scanner scanner(Input, SM); 604 while (true) { 605 Token T = scanner.getNext(); 606 switch (T.Kind) { 607 case Token::TK_StreamStart: 608 OS << "Stream-Start: "; 609 break; 610 case Token::TK_StreamEnd: 611 OS << "Stream-End: "; 612 break; 613 case Token::TK_VersionDirective: 614 OS << "Version-Directive: "; 615 break; 616 case Token::TK_TagDirective: 617 OS << "Tag-Directive: "; 618 break; 619 case Token::TK_DocumentStart: 620 OS << "Document-Start: "; 621 break; 622 case Token::TK_DocumentEnd: 623 OS << "Document-End: "; 624 break; 625 case Token::TK_BlockEntry: 626 OS << "Block-Entry: "; 627 break; 628 case Token::TK_BlockEnd: 629 OS << "Block-End: "; 630 break; 631 case Token::TK_BlockSequenceStart: 632 OS << "Block-Sequence-Start: "; 633 break; 634 case Token::TK_BlockMappingStart: 635 OS << "Block-Mapping-Start: "; 636 break; 637 case Token::TK_FlowEntry: 638 OS << "Flow-Entry: "; 639 break; 640 case Token::TK_FlowSequenceStart: 641 OS << "Flow-Sequence-Start: "; 642 break; 643 case Token::TK_FlowSequenceEnd: 644 OS << "Flow-Sequence-End: "; 645 break; 646 case Token::TK_FlowMappingStart: 647 OS << "Flow-Mapping-Start: "; 648 break; 649 case Token::TK_FlowMappingEnd: 650 OS << "Flow-Mapping-End: "; 651 break; 652 case Token::TK_Key: 653 OS << "Key: "; 654 break; 655 case Token::TK_Value: 656 OS << "Value: "; 657 break; 658 case Token::TK_Scalar: 659 OS << "Scalar: "; 660 break; 661 case Token::TK_BlockScalar: 662 OS << "Block Scalar: "; 663 break; 664 case Token::TK_Alias: 665 OS << "Alias: "; 666 break; 667 case Token::TK_Anchor: 668 OS << "Anchor: "; 669 break; 670 case Token::TK_Tag: 671 OS << "Tag: "; 672 break; 673 case Token::TK_Error: 674 break; 675 } 676 OS << T.Range << "\n"; 677 if (T.Kind == Token::TK_StreamEnd) 678 break; 679 else if (T.Kind == Token::TK_Error) 680 return false; 681 } 682 return true; 683 } 684 685 bool yaml::scanTokens(StringRef Input) { 686 SourceMgr SM; 687 Scanner scanner(Input, SM); 688 while (true) { 689 Token T = scanner.getNext(); 690 if (T.Kind == Token::TK_StreamEnd) 691 break; 692 else if (T.Kind == Token::TK_Error) 693 return false; 694 } 695 return true; 696 } 697 698 std::string yaml::escape(StringRef Input, bool EscapePrintable) { 699 std::string EscapedInput; 700 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 701 if (*i == '\\') 702 EscapedInput += "\\\\"; 703 else if (*i == '"') 704 EscapedInput += "\\\""; 705 else if (*i == 0) 706 EscapedInput += "\\0"; 707 else if (*i == 0x07) 708 EscapedInput += "\\a"; 709 else if (*i == 0x08) 710 EscapedInput += "\\b"; 711 else if (*i == 0x09) 712 EscapedInput += "\\t"; 713 else if (*i == 0x0A) 714 EscapedInput += "\\n"; 715 else if (*i == 0x0B) 716 EscapedInput += "\\v"; 717 else if (*i == 0x0C) 718 EscapedInput += "\\f"; 719 else if (*i == 0x0D) 720 EscapedInput += "\\r"; 721 else if (*i == 0x1B) 722 EscapedInput += "\\e"; 723 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 724 std::string HexStr = utohexstr(*i); 725 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 726 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 727 UTF8Decoded UnicodeScalarValue 728 = decodeUTF8(StringRef(i, Input.end() - i)); 729 if (UnicodeScalarValue.second == 0) { 730 // Found invalid char. 731 SmallString<4> Val; 732 encodeUTF8(0xFFFD, Val); 733 llvm::append_range(EscapedInput, Val); 734 // FIXME: Error reporting. 735 return EscapedInput; 736 } 737 if (UnicodeScalarValue.first == 0x85) 738 EscapedInput += "\\N"; 739 else if (UnicodeScalarValue.first == 0xA0) 740 EscapedInput += "\\_"; 741 else if (UnicodeScalarValue.first == 0x2028) 742 EscapedInput += "\\L"; 743 else if (UnicodeScalarValue.first == 0x2029) 744 EscapedInput += "\\P"; 745 else if (!EscapePrintable && 746 sys::unicode::isPrintable(UnicodeScalarValue.first)) 747 EscapedInput += StringRef(i, UnicodeScalarValue.second); 748 else { 749 std::string HexStr = utohexstr(UnicodeScalarValue.first); 750 if (HexStr.size() <= 2) 751 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 752 else if (HexStr.size() <= 4) 753 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 754 else if (HexStr.size() <= 8) 755 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 756 } 757 i += UnicodeScalarValue.second - 1; 758 } else 759 EscapedInput.push_back(*i); 760 } 761 return EscapedInput; 762 } 763 764 llvm::Optional<bool> yaml::parseBool(StringRef S) { 765 switch (S.size()) { 766 case 1: 767 switch (S.front()) { 768 case 'y': 769 case 'Y': 770 return true; 771 case 'n': 772 case 'N': 773 return false; 774 default: 775 return std::nullopt; 776 } 777 case 2: 778 switch (S.front()) { 779 case 'O': 780 if (S[1] == 'N') // ON 781 return true; 782 [[fallthrough]]; 783 case 'o': 784 if (S[1] == 'n') //[Oo]n 785 return true; 786 return std::nullopt; 787 case 'N': 788 if (S[1] == 'O') // NO 789 return false; 790 [[fallthrough]]; 791 case 'n': 792 if (S[1] == 'o') //[Nn]o 793 return false; 794 return std::nullopt; 795 default: 796 return std::nullopt; 797 } 798 case 3: 799 switch (S.front()) { 800 case 'O': 801 if (S.drop_front() == "FF") // OFF 802 return false; 803 [[fallthrough]]; 804 case 'o': 805 if (S.drop_front() == "ff") //[Oo]ff 806 return false; 807 return std::nullopt; 808 case 'Y': 809 if (S.drop_front() == "ES") // YES 810 return true; 811 [[fallthrough]]; 812 case 'y': 813 if (S.drop_front() == "es") //[Yy]es 814 return true; 815 return std::nullopt; 816 default: 817 return std::nullopt; 818 } 819 case 4: 820 switch (S.front()) { 821 case 'T': 822 if (S.drop_front() == "RUE") // TRUE 823 return true; 824 [[fallthrough]]; 825 case 't': 826 if (S.drop_front() == "rue") //[Tt]rue 827 return true; 828 return std::nullopt; 829 default: 830 return std::nullopt; 831 } 832 case 5: 833 switch (S.front()) { 834 case 'F': 835 if (S.drop_front() == "ALSE") // FALSE 836 return false; 837 [[fallthrough]]; 838 case 'f': 839 if (S.drop_front() == "alse") //[Ff]alse 840 return false; 841 return std::nullopt; 842 default: 843 return std::nullopt; 844 } 845 default: 846 return std::nullopt; 847 } 848 } 849 850 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 851 std::error_code *EC) 852 : SM(sm), ShowColors(ShowColors), EC(EC) { 853 init(MemoryBufferRef(Input, "YAML")); 854 } 855 856 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 857 std::error_code *EC) 858 : SM(SM_), ShowColors(ShowColors), EC(EC) { 859 init(Buffer); 860 } 861 862 void Scanner::init(MemoryBufferRef Buffer) { 863 InputBuffer = Buffer; 864 Current = InputBuffer.getBufferStart(); 865 End = InputBuffer.getBufferEnd(); 866 Indent = -1; 867 Column = 0; 868 Line = 0; 869 FlowLevel = 0; 870 IsStartOfStream = true; 871 IsSimpleKeyAllowed = true; 872 Failed = false; 873 std::unique_ptr<MemoryBuffer> InputBufferOwner = 874 MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false); 875 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 876 } 877 878 Token &Scanner::peekNext() { 879 // If the current token is a possible simple key, keep parsing until we 880 // can confirm. 881 bool NeedMore = false; 882 while (true) { 883 if (TokenQueue.empty() || NeedMore) { 884 if (!fetchMoreTokens()) { 885 TokenQueue.clear(); 886 SimpleKeys.clear(); 887 TokenQueue.push_back(Token()); 888 return TokenQueue.front(); 889 } 890 } 891 assert(!TokenQueue.empty() && 892 "fetchMoreTokens lied about getting tokens!"); 893 894 removeStaleSimpleKeyCandidates(); 895 SimpleKey SK; 896 SK.Tok = TokenQueue.begin(); 897 if (!is_contained(SimpleKeys, SK)) 898 break; 899 else 900 NeedMore = true; 901 } 902 return TokenQueue.front(); 903 } 904 905 Token Scanner::getNext() { 906 Token Ret = peekNext(); 907 // TokenQueue can be empty if there was an error getting the next token. 908 if (!TokenQueue.empty()) 909 TokenQueue.pop_front(); 910 911 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 912 // quick deallocation of them all. 913 if (TokenQueue.empty()) 914 TokenQueue.resetAlloc(); 915 916 return Ret; 917 } 918 919 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 920 if (Position == End) 921 return Position; 922 // Check 7 bit c-printable - b-char. 923 if ( *Position == 0x09 924 || (*Position >= 0x20 && *Position <= 0x7E)) 925 return Position + 1; 926 927 // Check for valid UTF-8. 928 if (uint8_t(*Position) & 0x80) { 929 UTF8Decoded u8d = decodeUTF8(Position); 930 if ( u8d.second != 0 931 && u8d.first != 0xFEFF 932 && ( u8d.first == 0x85 933 || ( u8d.first >= 0xA0 934 && u8d.first <= 0xD7FF) 935 || ( u8d.first >= 0xE000 936 && u8d.first <= 0xFFFD) 937 || ( u8d.first >= 0x10000 938 && u8d.first <= 0x10FFFF))) 939 return Position + u8d.second; 940 } 941 return Position; 942 } 943 944 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 945 if (Position == End) 946 return Position; 947 if (*Position == 0x0D) { 948 if (Position + 1 != End && *(Position + 1) == 0x0A) 949 return Position + 2; 950 return Position + 1; 951 } 952 953 if (*Position == 0x0A) 954 return Position + 1; 955 return Position; 956 } 957 958 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 959 if (Position == End) 960 return Position; 961 if (*Position == ' ') 962 return Position + 1; 963 return Position; 964 } 965 966 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 967 if (Position == End) 968 return Position; 969 if (*Position == ' ' || *Position == '\t') 970 return Position + 1; 971 return Position; 972 } 973 974 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 975 if (Position == End) 976 return Position; 977 if (*Position == ' ' || *Position == '\t') 978 return Position; 979 return skip_nb_char(Position); 980 } 981 982 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 983 , StringRef::iterator Position) { 984 while (true) { 985 StringRef::iterator i = (this->*Func)(Position); 986 if (i == Position) 987 break; 988 Position = i; 989 } 990 return Position; 991 } 992 993 void Scanner::advanceWhile(SkipWhileFunc Func) { 994 auto Final = skip_while(Func, Current); 995 Column += Final - Current; 996 Current = Final; 997 } 998 999 static bool is_ns_hex_digit(const char C) { return isAlnum(C); } 1000 1001 static bool is_ns_word_char(const char C) { return C == '-' || isAlpha(C); } 1002 1003 void Scanner::scan_ns_uri_char() { 1004 while (true) { 1005 if (Current == End) 1006 break; 1007 if (( *Current == '%' 1008 && Current + 2 < End 1009 && is_ns_hex_digit(*(Current + 1)) 1010 && is_ns_hex_digit(*(Current + 2))) 1011 || is_ns_word_char(*Current) 1012 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 1013 != StringRef::npos) { 1014 ++Current; 1015 ++Column; 1016 } else 1017 break; 1018 } 1019 } 1020 1021 bool Scanner::consume(uint32_t Expected) { 1022 if (Expected >= 0x80) { 1023 setError("Cannot consume non-ascii characters", Current); 1024 return false; 1025 } 1026 if (Current == End) 1027 return false; 1028 if (uint8_t(*Current) >= 0x80) { 1029 setError("Cannot consume non-ascii characters", Current); 1030 return false; 1031 } 1032 if (uint8_t(*Current) == Expected) { 1033 ++Current; 1034 ++Column; 1035 return true; 1036 } 1037 return false; 1038 } 1039 1040 void Scanner::skip(uint32_t Distance) { 1041 Current += Distance; 1042 Column += Distance; 1043 assert(Current <= End && "Skipped past the end"); 1044 } 1045 1046 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 1047 if (Position == End) 1048 return false; 1049 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 1050 *Position == '\n'; 1051 } 1052 1053 bool Scanner::isLineEmpty(StringRef Line) { 1054 for (const auto *Position = Line.begin(); Position != Line.end(); ++Position) 1055 if (!isBlankOrBreak(Position)) 1056 return false; 1057 return true; 1058 } 1059 1060 bool Scanner::consumeLineBreakIfPresent() { 1061 auto Next = skip_b_break(Current); 1062 if (Next == Current) 1063 return false; 1064 Column = 0; 1065 ++Line; 1066 Current = Next; 1067 return true; 1068 } 1069 1070 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 1071 , unsigned AtColumn 1072 , bool IsRequired) { 1073 if (IsSimpleKeyAllowed) { 1074 SimpleKey SK; 1075 SK.Tok = Tok; 1076 SK.Line = Line; 1077 SK.Column = AtColumn; 1078 SK.IsRequired = IsRequired; 1079 SK.FlowLevel = FlowLevel; 1080 SimpleKeys.push_back(SK); 1081 } 1082 } 1083 1084 void Scanner::removeStaleSimpleKeyCandidates() { 1085 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 1086 i != SimpleKeys.end();) { 1087 if (i->Line != Line || i->Column + 1024 < Column) { 1088 if (i->IsRequired) 1089 setError( "Could not find expected : for simple key" 1090 , i->Tok->Range.begin()); 1091 i = SimpleKeys.erase(i); 1092 } else 1093 ++i; 1094 } 1095 } 1096 1097 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 1098 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 1099 SimpleKeys.pop_back(); 1100 } 1101 1102 bool Scanner::unrollIndent(int ToColumn) { 1103 Token T; 1104 // Indentation is ignored in flow. 1105 if (FlowLevel != 0) 1106 return true; 1107 1108 while (Indent > ToColumn) { 1109 T.Kind = Token::TK_BlockEnd; 1110 T.Range = StringRef(Current, 1); 1111 TokenQueue.push_back(T); 1112 Indent = Indents.pop_back_val(); 1113 } 1114 1115 return true; 1116 } 1117 1118 bool Scanner::rollIndent( int ToColumn 1119 , Token::TokenKind Kind 1120 , TokenQueueT::iterator InsertPoint) { 1121 if (FlowLevel) 1122 return true; 1123 if (Indent < ToColumn) { 1124 Indents.push_back(Indent); 1125 Indent = ToColumn; 1126 1127 Token T; 1128 T.Kind = Kind; 1129 T.Range = StringRef(Current, 0); 1130 TokenQueue.insert(InsertPoint, T); 1131 } 1132 return true; 1133 } 1134 1135 void Scanner::skipComment() { 1136 if (Current == End || *Current != '#') 1137 return; 1138 while (true) { 1139 // This may skip more than one byte, thus Column is only incremented 1140 // for code points. 1141 StringRef::iterator I = skip_nb_char(Current); 1142 if (I == Current) 1143 break; 1144 Current = I; 1145 ++Column; 1146 } 1147 } 1148 1149 void Scanner::scanToNextToken() { 1150 while (true) { 1151 while (Current != End && (*Current == ' ' || *Current == '\t')) { 1152 skip(1); 1153 } 1154 1155 skipComment(); 1156 1157 // Skip EOL. 1158 StringRef::iterator i = skip_b_break(Current); 1159 if (i == Current) 1160 break; 1161 Current = i; 1162 ++Line; 1163 Column = 0; 1164 // New lines may start a simple key. 1165 if (!FlowLevel) 1166 IsSimpleKeyAllowed = true; 1167 } 1168 } 1169 1170 bool Scanner::scanStreamStart() { 1171 IsStartOfStream = false; 1172 1173 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1174 1175 Token T; 1176 T.Kind = Token::TK_StreamStart; 1177 T.Range = StringRef(Current, EI.second); 1178 TokenQueue.push_back(T); 1179 Current += EI.second; 1180 return true; 1181 } 1182 1183 bool Scanner::scanStreamEnd() { 1184 // Force an ending new line if one isn't present. 1185 if (Column != 0) { 1186 Column = 0; 1187 ++Line; 1188 } 1189 1190 unrollIndent(-1); 1191 SimpleKeys.clear(); 1192 IsSimpleKeyAllowed = false; 1193 1194 Token T; 1195 T.Kind = Token::TK_StreamEnd; 1196 T.Range = StringRef(Current, 0); 1197 TokenQueue.push_back(T); 1198 return true; 1199 } 1200 1201 bool Scanner::scanDirective() { 1202 // Reset the indentation level. 1203 unrollIndent(-1); 1204 SimpleKeys.clear(); 1205 IsSimpleKeyAllowed = false; 1206 1207 StringRef::iterator Start = Current; 1208 consume('%'); 1209 StringRef::iterator NameStart = Current; 1210 Current = skip_while(&Scanner::skip_ns_char, Current); 1211 StringRef Name(NameStart, Current - NameStart); 1212 Current = skip_while(&Scanner::skip_s_white, Current); 1213 1214 Token T; 1215 if (Name == "YAML") { 1216 Current = skip_while(&Scanner::skip_ns_char, Current); 1217 T.Kind = Token::TK_VersionDirective; 1218 T.Range = StringRef(Start, Current - Start); 1219 TokenQueue.push_back(T); 1220 return true; 1221 } else if(Name == "TAG") { 1222 Current = skip_while(&Scanner::skip_ns_char, Current); 1223 Current = skip_while(&Scanner::skip_s_white, Current); 1224 Current = skip_while(&Scanner::skip_ns_char, Current); 1225 T.Kind = Token::TK_TagDirective; 1226 T.Range = StringRef(Start, Current - Start); 1227 TokenQueue.push_back(T); 1228 return true; 1229 } 1230 return false; 1231 } 1232 1233 bool Scanner::scanDocumentIndicator(bool IsStart) { 1234 unrollIndent(-1); 1235 SimpleKeys.clear(); 1236 IsSimpleKeyAllowed = false; 1237 1238 Token T; 1239 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1240 T.Range = StringRef(Current, 3); 1241 skip(3); 1242 TokenQueue.push_back(T); 1243 return true; 1244 } 1245 1246 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1247 Token T; 1248 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1249 : Token::TK_FlowMappingStart; 1250 T.Range = StringRef(Current, 1); 1251 skip(1); 1252 TokenQueue.push_back(T); 1253 1254 // [ and { may begin a simple key. 1255 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1256 1257 // And may also be followed by a simple key. 1258 IsSimpleKeyAllowed = true; 1259 ++FlowLevel; 1260 return true; 1261 } 1262 1263 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1264 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1265 IsSimpleKeyAllowed = false; 1266 Token T; 1267 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1268 : Token::TK_FlowMappingEnd; 1269 T.Range = StringRef(Current, 1); 1270 skip(1); 1271 TokenQueue.push_back(T); 1272 if (FlowLevel) 1273 --FlowLevel; 1274 return true; 1275 } 1276 1277 bool Scanner::scanFlowEntry() { 1278 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1279 IsSimpleKeyAllowed = true; 1280 Token T; 1281 T.Kind = Token::TK_FlowEntry; 1282 T.Range = StringRef(Current, 1); 1283 skip(1); 1284 TokenQueue.push_back(T); 1285 return true; 1286 } 1287 1288 bool Scanner::scanBlockEntry() { 1289 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1290 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1291 IsSimpleKeyAllowed = true; 1292 Token T; 1293 T.Kind = Token::TK_BlockEntry; 1294 T.Range = StringRef(Current, 1); 1295 skip(1); 1296 TokenQueue.push_back(T); 1297 return true; 1298 } 1299 1300 bool Scanner::scanKey() { 1301 if (!FlowLevel) 1302 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1303 1304 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1305 IsSimpleKeyAllowed = !FlowLevel; 1306 1307 Token T; 1308 T.Kind = Token::TK_Key; 1309 T.Range = StringRef(Current, 1); 1310 skip(1); 1311 TokenQueue.push_back(T); 1312 return true; 1313 } 1314 1315 bool Scanner::scanValue() { 1316 // If the previous token could have been a simple key, insert the key token 1317 // into the token queue. 1318 if (!SimpleKeys.empty()) { 1319 SimpleKey SK = SimpleKeys.pop_back_val(); 1320 Token T; 1321 T.Kind = Token::TK_Key; 1322 T.Range = SK.Tok->Range; 1323 TokenQueueT::iterator i, e; 1324 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1325 if (i == SK.Tok) 1326 break; 1327 } 1328 if (i == e) { 1329 Failed = true; 1330 return false; 1331 } 1332 i = TokenQueue.insert(i, T); 1333 1334 // We may also need to add a Block-Mapping-Start token. 1335 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1336 1337 IsSimpleKeyAllowed = false; 1338 } else { 1339 if (!FlowLevel) 1340 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1341 IsSimpleKeyAllowed = !FlowLevel; 1342 } 1343 1344 Token T; 1345 T.Kind = Token::TK_Value; 1346 T.Range = StringRef(Current, 1); 1347 skip(1); 1348 TokenQueue.push_back(T); 1349 return true; 1350 } 1351 1352 // Forbidding inlining improves performance by roughly 20%. 1353 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1354 LLVM_ATTRIBUTE_NOINLINE static bool 1355 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1356 1357 // Returns whether a character at 'Position' was escaped with a leading '\'. 1358 // 'First' specifies the position of the first character in the string. 1359 static bool wasEscaped(StringRef::iterator First, 1360 StringRef::iterator Position) { 1361 assert(Position - 1 >= First); 1362 StringRef::iterator I = Position - 1; 1363 // We calculate the number of consecutive '\'s before the current position 1364 // by iterating backwards through our string. 1365 while (I >= First && *I == '\\') --I; 1366 // (Position - 1 - I) now contains the number of '\'s before the current 1367 // position. If it is odd, the character at 'Position' was escaped. 1368 return (Position - 1 - I) % 2 == 1; 1369 } 1370 1371 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1372 StringRef::iterator Start = Current; 1373 unsigned ColStart = Column; 1374 if (IsDoubleQuoted) { 1375 do { 1376 ++Current; 1377 while (Current != End && *Current != '"') 1378 ++Current; 1379 // Repeat until the previous character was not a '\' or was an escaped 1380 // backslash. 1381 } while ( Current != End 1382 && *(Current - 1) == '\\' 1383 && wasEscaped(Start + 1, Current)); 1384 } else { 1385 skip(1); 1386 while (Current != End) { 1387 // Skip a ' followed by another '. 1388 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1389 skip(2); 1390 continue; 1391 } else if (*Current == '\'') 1392 break; 1393 StringRef::iterator i = skip_nb_char(Current); 1394 if (i == Current) { 1395 i = skip_b_break(Current); 1396 if (i == Current) 1397 break; 1398 Current = i; 1399 Column = 0; 1400 ++Line; 1401 } else { 1402 if (i == End) 1403 break; 1404 Current = i; 1405 ++Column; 1406 } 1407 } 1408 } 1409 1410 if (Current == End) { 1411 setError("Expected quote at end of scalar", Current); 1412 return false; 1413 } 1414 1415 skip(1); // Skip ending quote. 1416 Token T; 1417 T.Kind = Token::TK_Scalar; 1418 T.Range = StringRef(Start, Current - Start); 1419 TokenQueue.push_back(T); 1420 1421 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1422 1423 IsSimpleKeyAllowed = false; 1424 1425 return true; 1426 } 1427 1428 bool Scanner::scanPlainScalar() { 1429 StringRef::iterator Start = Current; 1430 unsigned ColStart = Column; 1431 unsigned LeadingBlanks = 0; 1432 assert(Indent >= -1 && "Indent must be >= -1 !"); 1433 unsigned indent = static_cast<unsigned>(Indent + 1); 1434 while (Current != End) { 1435 if (*Current == '#') 1436 break; 1437 1438 while (Current != End && !isBlankOrBreak(Current)) { 1439 if (FlowLevel && *Current == ':' && 1440 (Current + 1 == End || 1441 !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) { 1442 setError("Found unexpected ':' while scanning a plain scalar", Current); 1443 return false; 1444 } 1445 1446 // Check for the end of the plain scalar. 1447 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1448 || ( FlowLevel 1449 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1450 != StringRef::npos))) 1451 break; 1452 1453 StringRef::iterator i = skip_nb_char(Current); 1454 if (i == Current) 1455 break; 1456 Current = i; 1457 ++Column; 1458 } 1459 1460 // Are we at the end? 1461 if (!isBlankOrBreak(Current)) 1462 break; 1463 1464 // Eat blanks. 1465 StringRef::iterator Tmp = Current; 1466 while (isBlankOrBreak(Tmp)) { 1467 StringRef::iterator i = skip_s_white(Tmp); 1468 if (i != Tmp) { 1469 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1470 setError("Found invalid tab character in indentation", Tmp); 1471 return false; 1472 } 1473 Tmp = i; 1474 ++Column; 1475 } else { 1476 i = skip_b_break(Tmp); 1477 if (!LeadingBlanks) 1478 LeadingBlanks = 1; 1479 Tmp = i; 1480 Column = 0; 1481 ++Line; 1482 } 1483 } 1484 1485 if (!FlowLevel && Column < indent) 1486 break; 1487 1488 Current = Tmp; 1489 } 1490 if (Start == Current) { 1491 setError("Got empty plain scalar", Start); 1492 return false; 1493 } 1494 Token T; 1495 T.Kind = Token::TK_Scalar; 1496 T.Range = StringRef(Start, Current - Start); 1497 TokenQueue.push_back(T); 1498 1499 // Plain scalars can be simple keys. 1500 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1501 1502 IsSimpleKeyAllowed = false; 1503 1504 return true; 1505 } 1506 1507 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1508 StringRef::iterator Start = Current; 1509 unsigned ColStart = Column; 1510 skip(1); 1511 while (Current != End) { 1512 if ( *Current == '[' || *Current == ']' 1513 || *Current == '{' || *Current == '}' 1514 || *Current == ',' 1515 || *Current == ':') 1516 break; 1517 StringRef::iterator i = skip_ns_char(Current); 1518 if (i == Current) 1519 break; 1520 Current = i; 1521 ++Column; 1522 } 1523 1524 if (Start + 1 == Current) { 1525 setError("Got empty alias or anchor", Start); 1526 return false; 1527 } 1528 1529 Token T; 1530 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1531 T.Range = StringRef(Start, Current - Start); 1532 TokenQueue.push_back(T); 1533 1534 // Alias and anchors can be simple keys. 1535 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1536 1537 IsSimpleKeyAllowed = false; 1538 1539 return true; 1540 } 1541 1542 bool Scanner::scanBlockScalarIndicators(char &StyleIndicator, 1543 char &ChompingIndicator, 1544 unsigned &IndentIndicator, 1545 bool &IsDone) { 1546 StyleIndicator = scanBlockStyleIndicator(); 1547 if (!scanBlockScalarHeader(ChompingIndicator, IndentIndicator, IsDone)) 1548 return false; 1549 return true; 1550 } 1551 1552 char Scanner::scanBlockStyleIndicator() { 1553 char Indicator = ' '; 1554 if (Current != End && (*Current == '>' || *Current == '|')) { 1555 Indicator = *Current; 1556 skip(1); 1557 } 1558 return Indicator; 1559 } 1560 1561 char Scanner::scanBlockChompingIndicator() { 1562 char Indicator = ' '; 1563 if (Current != End && (*Current == '+' || *Current == '-')) { 1564 Indicator = *Current; 1565 skip(1); 1566 } 1567 return Indicator; 1568 } 1569 1570 /// Get the number of line breaks after chomping. 1571 /// 1572 /// Return the number of trailing line breaks to emit, depending on 1573 /// \p ChompingIndicator. 1574 static unsigned getChompedLineBreaks(char ChompingIndicator, 1575 unsigned LineBreaks, StringRef Str) { 1576 if (ChompingIndicator == '-') // Strip all line breaks. 1577 return 0; 1578 if (ChompingIndicator == '+') // Keep all line breaks. 1579 return LineBreaks; 1580 // Clip trailing lines. 1581 return Str.empty() ? 0 : 1; 1582 } 1583 1584 unsigned Scanner::scanBlockIndentationIndicator() { 1585 unsigned Indent = 0; 1586 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1587 Indent = unsigned(*Current - '0'); 1588 skip(1); 1589 } 1590 return Indent; 1591 } 1592 1593 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1594 unsigned &IndentIndicator, bool &IsDone) { 1595 auto Start = Current; 1596 1597 ChompingIndicator = scanBlockChompingIndicator(); 1598 IndentIndicator = scanBlockIndentationIndicator(); 1599 // Check for the chomping indicator once again. 1600 if (ChompingIndicator == ' ') 1601 ChompingIndicator = scanBlockChompingIndicator(); 1602 Current = skip_while(&Scanner::skip_s_white, Current); 1603 skipComment(); 1604 1605 if (Current == End) { // EOF, we have an empty scalar. 1606 Token T; 1607 T.Kind = Token::TK_BlockScalar; 1608 T.Range = StringRef(Start, Current - Start); 1609 TokenQueue.push_back(T); 1610 IsDone = true; 1611 return true; 1612 } 1613 1614 if (!consumeLineBreakIfPresent()) { 1615 setError("Expected a line break after block scalar header", Current); 1616 return false; 1617 } 1618 return true; 1619 } 1620 1621 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1622 unsigned BlockExitIndent, 1623 unsigned &LineBreaks, bool &IsDone) { 1624 unsigned MaxAllSpaceLineCharacters = 0; 1625 StringRef::iterator LongestAllSpaceLine; 1626 1627 while (true) { 1628 advanceWhile(&Scanner::skip_s_space); 1629 if (skip_nb_char(Current) != Current) { 1630 // This line isn't empty, so try and find the indentation. 1631 if (Column <= BlockExitIndent) { // End of the block literal. 1632 IsDone = true; 1633 return true; 1634 } 1635 // We found the block's indentation. 1636 BlockIndent = Column; 1637 if (MaxAllSpaceLineCharacters > BlockIndent) { 1638 setError( 1639 "Leading all-spaces line must be smaller than the block indent", 1640 LongestAllSpaceLine); 1641 return false; 1642 } 1643 return true; 1644 } 1645 if (skip_b_break(Current) != Current && 1646 Column > MaxAllSpaceLineCharacters) { 1647 // Record the longest all-space line in case it's longer than the 1648 // discovered block indent. 1649 MaxAllSpaceLineCharacters = Column; 1650 LongestAllSpaceLine = Current; 1651 } 1652 1653 // Check for EOF. 1654 if (Current == End) { 1655 IsDone = true; 1656 return true; 1657 } 1658 1659 if (!consumeLineBreakIfPresent()) { 1660 IsDone = true; 1661 return true; 1662 } 1663 ++LineBreaks; 1664 } 1665 return true; 1666 } 1667 1668 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1669 unsigned BlockExitIndent, bool &IsDone) { 1670 // Skip the indentation. 1671 while (Column < BlockIndent) { 1672 auto I = skip_s_space(Current); 1673 if (I == Current) 1674 break; 1675 Current = I; 1676 ++Column; 1677 } 1678 1679 if (skip_nb_char(Current) == Current) 1680 return true; 1681 1682 if (Column <= BlockExitIndent) { // End of the block literal. 1683 IsDone = true; 1684 return true; 1685 } 1686 1687 if (Column < BlockIndent) { 1688 if (Current != End && *Current == '#') { // Trailing comment. 1689 IsDone = true; 1690 return true; 1691 } 1692 setError("A text line is less indented than the block scalar", Current); 1693 return false; 1694 } 1695 return true; // A normal text line. 1696 } 1697 1698 bool Scanner::scanBlockScalar(bool IsLiteral) { 1699 assert(*Current == '|' || *Current == '>'); 1700 char StyleIndicator; 1701 char ChompingIndicator; 1702 unsigned BlockIndent; 1703 bool IsDone = false; 1704 if (!scanBlockScalarIndicators(StyleIndicator, ChompingIndicator, BlockIndent, 1705 IsDone)) 1706 return false; 1707 if (IsDone) 1708 return true; 1709 bool IsFolded = StyleIndicator == '>'; 1710 1711 const auto *Start = Current; 1712 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1713 unsigned LineBreaks = 0; 1714 if (BlockIndent == 0) { 1715 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1716 IsDone)) 1717 return false; 1718 } 1719 1720 // Scan the block's scalars body. 1721 SmallString<256> Str; 1722 while (!IsDone) { 1723 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1724 return false; 1725 if (IsDone) 1726 break; 1727 1728 // Parse the current line. 1729 auto LineStart = Current; 1730 advanceWhile(&Scanner::skip_nb_char); 1731 if (LineStart != Current) { 1732 if (LineBreaks && IsFolded && !Scanner::isLineEmpty(Str)) { 1733 // The folded style "folds" any single line break between content into a 1734 // single space, except when that content is "empty" (only contains 1735 // whitespace) in which case the line break is left as-is. 1736 if (LineBreaks == 1) { 1737 Str.append(LineBreaks, 1738 isLineEmpty(StringRef(LineStart, Current - LineStart)) 1739 ? '\n' 1740 : ' '); 1741 } 1742 // If we saw a single line break, we are completely replacing it and so 1743 // want `LineBreaks == 0`. Otherwise this decrement accounts for the 1744 // fact that the first line break is "trimmed", only being used to 1745 // signal a sequence of line breaks which should not be folded. 1746 LineBreaks--; 1747 } 1748 Str.append(LineBreaks, '\n'); 1749 Str.append(StringRef(LineStart, Current - LineStart)); 1750 LineBreaks = 0; 1751 } 1752 1753 // Check for EOF. 1754 if (Current == End) 1755 break; 1756 1757 if (!consumeLineBreakIfPresent()) 1758 break; 1759 ++LineBreaks; 1760 } 1761 1762 if (Current == End && !LineBreaks) 1763 // Ensure that there is at least one line break before the end of file. 1764 LineBreaks = 1; 1765 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1766 1767 // New lines may start a simple key. 1768 if (!FlowLevel) 1769 IsSimpleKeyAllowed = true; 1770 1771 Token T; 1772 T.Kind = Token::TK_BlockScalar; 1773 T.Range = StringRef(Start, Current - Start); 1774 T.Value = std::string(Str); 1775 TokenQueue.push_back(T); 1776 return true; 1777 } 1778 1779 bool Scanner::scanTag() { 1780 StringRef::iterator Start = Current; 1781 unsigned ColStart = Column; 1782 skip(1); // Eat !. 1783 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1784 else if (*Current == '<') { 1785 skip(1); 1786 scan_ns_uri_char(); 1787 if (!consume('>')) 1788 return false; 1789 } else { 1790 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1791 Current = skip_while(&Scanner::skip_ns_char, Current); 1792 } 1793 1794 Token T; 1795 T.Kind = Token::TK_Tag; 1796 T.Range = StringRef(Start, Current - Start); 1797 TokenQueue.push_back(T); 1798 1799 // Tags can be simple keys. 1800 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1801 1802 IsSimpleKeyAllowed = false; 1803 1804 return true; 1805 } 1806 1807 bool Scanner::fetchMoreTokens() { 1808 if (IsStartOfStream) 1809 return scanStreamStart(); 1810 1811 scanToNextToken(); 1812 1813 if (Current == End) 1814 return scanStreamEnd(); 1815 1816 removeStaleSimpleKeyCandidates(); 1817 1818 unrollIndent(Column); 1819 1820 if (Column == 0 && *Current == '%') 1821 return scanDirective(); 1822 1823 if (Column == 0 && Current + 4 <= End 1824 && *Current == '-' 1825 && *(Current + 1) == '-' 1826 && *(Current + 2) == '-' 1827 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1828 return scanDocumentIndicator(true); 1829 1830 if (Column == 0 && Current + 4 <= End 1831 && *Current == '.' 1832 && *(Current + 1) == '.' 1833 && *(Current + 2) == '.' 1834 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1835 return scanDocumentIndicator(false); 1836 1837 if (*Current == '[') 1838 return scanFlowCollectionStart(true); 1839 1840 if (*Current == '{') 1841 return scanFlowCollectionStart(false); 1842 1843 if (*Current == ']') 1844 return scanFlowCollectionEnd(true); 1845 1846 if (*Current == '}') 1847 return scanFlowCollectionEnd(false); 1848 1849 if (*Current == ',') 1850 return scanFlowEntry(); 1851 1852 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1853 return scanBlockEntry(); 1854 1855 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1856 return scanKey(); 1857 1858 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1859 return scanValue(); 1860 1861 if (*Current == '*') 1862 return scanAliasOrAnchor(true); 1863 1864 if (*Current == '&') 1865 return scanAliasOrAnchor(false); 1866 1867 if (*Current == '!') 1868 return scanTag(); 1869 1870 if (*Current == '|' && !FlowLevel) 1871 return scanBlockScalar(true); 1872 1873 if (*Current == '>' && !FlowLevel) 1874 return scanBlockScalar(false); 1875 1876 if (*Current == '\'') 1877 return scanFlowScalar(false); 1878 1879 if (*Current == '"') 1880 return scanFlowScalar(true); 1881 1882 // Get a plain scalar. 1883 StringRef FirstChar(Current, 1); 1884 if (!(isBlankOrBreak(Current) 1885 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1886 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1887 || (!FlowLevel && (*Current == '?' || *Current == ':') 1888 && isBlankOrBreak(Current + 1)) 1889 || (!FlowLevel && *Current == ':' 1890 && Current + 2 < End 1891 && *(Current + 1) == ':' 1892 && !isBlankOrBreak(Current + 2))) 1893 return scanPlainScalar(); 1894 1895 setError("Unrecognized character while tokenizing.", Current); 1896 return false; 1897 } 1898 1899 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1900 std::error_code *EC) 1901 : scanner(new Scanner(Input, SM, ShowColors, EC)) {} 1902 1903 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1904 std::error_code *EC) 1905 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)) {} 1906 1907 Stream::~Stream() = default; 1908 1909 bool Stream::failed() { return scanner->failed(); } 1910 1911 void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) { 1912 printError(N ? N->getSourceRange() : SMRange(), Msg, Kind); 1913 } 1914 1915 void Stream::printError(const SMRange &Range, const Twine &Msg, 1916 SourceMgr::DiagKind Kind) { 1917 scanner->printError(Range.Start, Kind, Msg, Range); 1918 } 1919 1920 document_iterator Stream::begin() { 1921 if (CurrentDoc) 1922 report_fatal_error("Can only iterate over the stream once"); 1923 1924 // Skip Stream-Start. 1925 scanner->getNext(); 1926 1927 CurrentDoc.reset(new Document(*this)); 1928 return document_iterator(CurrentDoc); 1929 } 1930 1931 document_iterator Stream::end() { 1932 return document_iterator(); 1933 } 1934 1935 void Stream::skip() { 1936 for (Document &Doc : *this) 1937 Doc.skip(); 1938 } 1939 1940 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1941 StringRef T) 1942 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1943 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1944 SourceRange = SMRange(Start, Start); 1945 } 1946 1947 std::string Node::getVerbatimTag() const { 1948 StringRef Raw = getRawTag(); 1949 if (!Raw.empty() && Raw != "!") { 1950 std::string Ret; 1951 if (Raw.find_last_of('!') == 0) { 1952 Ret = std::string(Doc->getTagMap().find("!")->second); 1953 Ret += Raw.substr(1); 1954 return Ret; 1955 } else if (Raw.startswith("!!")) { 1956 Ret = std::string(Doc->getTagMap().find("!!")->second); 1957 Ret += Raw.substr(2); 1958 return Ret; 1959 } else { 1960 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1961 std::map<StringRef, StringRef>::const_iterator It = 1962 Doc->getTagMap().find(TagHandle); 1963 if (It != Doc->getTagMap().end()) 1964 Ret = std::string(It->second); 1965 else { 1966 Token T; 1967 T.Kind = Token::TK_Tag; 1968 T.Range = TagHandle; 1969 setError(Twine("Unknown tag handle ") + TagHandle, T); 1970 } 1971 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1972 return Ret; 1973 } 1974 } 1975 1976 switch (getType()) { 1977 case NK_Null: 1978 return "tag:yaml.org,2002:null"; 1979 case NK_Scalar: 1980 case NK_BlockScalar: 1981 // TODO: Tag resolution. 1982 return "tag:yaml.org,2002:str"; 1983 case NK_Mapping: 1984 return "tag:yaml.org,2002:map"; 1985 case NK_Sequence: 1986 return "tag:yaml.org,2002:seq"; 1987 } 1988 1989 return ""; 1990 } 1991 1992 Token &Node::peekNext() { 1993 return Doc->peekNext(); 1994 } 1995 1996 Token Node::getNext() { 1997 return Doc->getNext(); 1998 } 1999 2000 Node *Node::parseBlockNode() { 2001 return Doc->parseBlockNode(); 2002 } 2003 2004 BumpPtrAllocator &Node::getAllocator() { 2005 return Doc->NodeAllocator; 2006 } 2007 2008 void Node::setError(const Twine &Msg, Token &Tok) const { 2009 Doc->setError(Msg, Tok); 2010 } 2011 2012 bool Node::failed() const { 2013 return Doc->failed(); 2014 } 2015 2016 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 2017 // TODO: Handle newlines properly. We need to remove leading whitespace. 2018 if (Value[0] == '"') { // Double quoted. 2019 // Pull off the leading and trailing "s. 2020 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 2021 // Search for characters that would require unescaping the value. 2022 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 2023 if (i != StringRef::npos) 2024 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 2025 return UnquotedValue; 2026 } else if (Value[0] == '\'') { // Single quoted. 2027 // Pull off the leading and trailing 's. 2028 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 2029 StringRef::size_type i = UnquotedValue.find('\''); 2030 if (i != StringRef::npos) { 2031 // We're going to need Storage. 2032 Storage.clear(); 2033 Storage.reserve(UnquotedValue.size()); 2034 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 2035 StringRef Valid(UnquotedValue.begin(), i); 2036 llvm::append_range(Storage, Valid); 2037 Storage.push_back('\''); 2038 UnquotedValue = UnquotedValue.substr(i + 2); 2039 } 2040 llvm::append_range(Storage, UnquotedValue); 2041 return StringRef(Storage.begin(), Storage.size()); 2042 } 2043 return UnquotedValue; 2044 } 2045 // Plain or block. 2046 return Value.rtrim(' '); 2047 } 2048 2049 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 2050 , StringRef::size_type i 2051 , SmallVectorImpl<char> &Storage) 2052 const { 2053 // Use Storage to build proper value. 2054 Storage.clear(); 2055 Storage.reserve(UnquotedValue.size()); 2056 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 2057 // Insert all previous chars into Storage. 2058 StringRef Valid(UnquotedValue.begin(), i); 2059 llvm::append_range(Storage, Valid); 2060 // Chop off inserted chars. 2061 UnquotedValue = UnquotedValue.substr(i); 2062 2063 assert(!UnquotedValue.empty() && "Can't be empty!"); 2064 2065 // Parse escape or line break. 2066 switch (UnquotedValue[0]) { 2067 case '\r': 2068 case '\n': 2069 Storage.push_back('\n'); 2070 if ( UnquotedValue.size() > 1 2071 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 2072 UnquotedValue = UnquotedValue.substr(1); 2073 UnquotedValue = UnquotedValue.substr(1); 2074 break; 2075 default: 2076 if (UnquotedValue.size() == 1) { 2077 Token T; 2078 T.Range = StringRef(UnquotedValue.begin(), 1); 2079 setError("Unrecognized escape code", T); 2080 return ""; 2081 } 2082 UnquotedValue = UnquotedValue.substr(1); 2083 switch (UnquotedValue[0]) { 2084 default: { 2085 Token T; 2086 T.Range = StringRef(UnquotedValue.begin(), 1); 2087 setError("Unrecognized escape code", T); 2088 return ""; 2089 } 2090 case '\r': 2091 case '\n': 2092 // Remove the new line. 2093 if ( UnquotedValue.size() > 1 2094 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 2095 UnquotedValue = UnquotedValue.substr(1); 2096 // If this was just a single byte newline, it will get skipped 2097 // below. 2098 break; 2099 case '0': 2100 Storage.push_back(0x00); 2101 break; 2102 case 'a': 2103 Storage.push_back(0x07); 2104 break; 2105 case 'b': 2106 Storage.push_back(0x08); 2107 break; 2108 case 't': 2109 case 0x09: 2110 Storage.push_back(0x09); 2111 break; 2112 case 'n': 2113 Storage.push_back(0x0A); 2114 break; 2115 case 'v': 2116 Storage.push_back(0x0B); 2117 break; 2118 case 'f': 2119 Storage.push_back(0x0C); 2120 break; 2121 case 'r': 2122 Storage.push_back(0x0D); 2123 break; 2124 case 'e': 2125 Storage.push_back(0x1B); 2126 break; 2127 case ' ': 2128 Storage.push_back(0x20); 2129 break; 2130 case '"': 2131 Storage.push_back(0x22); 2132 break; 2133 case '/': 2134 Storage.push_back(0x2F); 2135 break; 2136 case '\\': 2137 Storage.push_back(0x5C); 2138 break; 2139 case 'N': 2140 encodeUTF8(0x85, Storage); 2141 break; 2142 case '_': 2143 encodeUTF8(0xA0, Storage); 2144 break; 2145 case 'L': 2146 encodeUTF8(0x2028, Storage); 2147 break; 2148 case 'P': 2149 encodeUTF8(0x2029, Storage); 2150 break; 2151 case 'x': { 2152 if (UnquotedValue.size() < 3) 2153 // TODO: Report error. 2154 break; 2155 unsigned int UnicodeScalarValue; 2156 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2157 // TODO: Report error. 2158 UnicodeScalarValue = 0xFFFD; 2159 encodeUTF8(UnicodeScalarValue, Storage); 2160 UnquotedValue = UnquotedValue.substr(2); 2161 break; 2162 } 2163 case 'u': { 2164 if (UnquotedValue.size() < 5) 2165 // TODO: Report error. 2166 break; 2167 unsigned int UnicodeScalarValue; 2168 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2169 // TODO: Report error. 2170 UnicodeScalarValue = 0xFFFD; 2171 encodeUTF8(UnicodeScalarValue, Storage); 2172 UnquotedValue = UnquotedValue.substr(4); 2173 break; 2174 } 2175 case 'U': { 2176 if (UnquotedValue.size() < 9) 2177 // TODO: Report error. 2178 break; 2179 unsigned int UnicodeScalarValue; 2180 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2181 // TODO: Report error. 2182 UnicodeScalarValue = 0xFFFD; 2183 encodeUTF8(UnicodeScalarValue, Storage); 2184 UnquotedValue = UnquotedValue.substr(8); 2185 break; 2186 } 2187 } 2188 UnquotedValue = UnquotedValue.substr(1); 2189 } 2190 } 2191 llvm::append_range(Storage, UnquotedValue); 2192 return StringRef(Storage.begin(), Storage.size()); 2193 } 2194 2195 Node *KeyValueNode::getKey() { 2196 if (Key) 2197 return Key; 2198 // Handle implicit null keys. 2199 { 2200 Token &t = peekNext(); 2201 if ( t.Kind == Token::TK_BlockEnd 2202 || t.Kind == Token::TK_Value 2203 || t.Kind == Token::TK_Error) { 2204 return Key = new (getAllocator()) NullNode(Doc); 2205 } 2206 if (t.Kind == Token::TK_Key) 2207 getNext(); // skip TK_Key. 2208 } 2209 2210 // Handle explicit null keys. 2211 Token &t = peekNext(); 2212 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2213 return Key = new (getAllocator()) NullNode(Doc); 2214 } 2215 2216 // We've got a normal key. 2217 return Key = parseBlockNode(); 2218 } 2219 2220 Node *KeyValueNode::getValue() { 2221 if (Value) 2222 return Value; 2223 2224 if (Node* Key = getKey()) 2225 Key->skip(); 2226 else { 2227 setError("Null key in Key Value.", peekNext()); 2228 return Value = new (getAllocator()) NullNode(Doc); 2229 } 2230 2231 if (failed()) 2232 return Value = new (getAllocator()) NullNode(Doc); 2233 2234 // Handle implicit null values. 2235 { 2236 Token &t = peekNext(); 2237 if ( t.Kind == Token::TK_BlockEnd 2238 || t.Kind == Token::TK_FlowMappingEnd 2239 || t.Kind == Token::TK_Key 2240 || t.Kind == Token::TK_FlowEntry 2241 || t.Kind == Token::TK_Error) { 2242 return Value = new (getAllocator()) NullNode(Doc); 2243 } 2244 2245 if (t.Kind != Token::TK_Value) { 2246 setError("Unexpected token in Key Value.", t); 2247 return Value = new (getAllocator()) NullNode(Doc); 2248 } 2249 getNext(); // skip TK_Value. 2250 } 2251 2252 // Handle explicit null values. 2253 Token &t = peekNext(); 2254 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2255 return Value = new (getAllocator()) NullNode(Doc); 2256 } 2257 2258 // We got a normal value. 2259 return Value = parseBlockNode(); 2260 } 2261 2262 void MappingNode::increment() { 2263 if (failed()) { 2264 IsAtEnd = true; 2265 CurrentEntry = nullptr; 2266 return; 2267 } 2268 if (CurrentEntry) { 2269 CurrentEntry->skip(); 2270 if (Type == MT_Inline) { 2271 IsAtEnd = true; 2272 CurrentEntry = nullptr; 2273 return; 2274 } 2275 } 2276 Token T = peekNext(); 2277 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2278 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2279 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2280 } else if (Type == MT_Block) { 2281 switch (T.Kind) { 2282 case Token::TK_BlockEnd: 2283 getNext(); 2284 IsAtEnd = true; 2285 CurrentEntry = nullptr; 2286 break; 2287 default: 2288 setError("Unexpected token. Expected Key or Block End", T); 2289 [[fallthrough]]; 2290 case Token::TK_Error: 2291 IsAtEnd = true; 2292 CurrentEntry = nullptr; 2293 } 2294 } else { 2295 switch (T.Kind) { 2296 case Token::TK_FlowEntry: 2297 // Eat the flow entry and recurse. 2298 getNext(); 2299 return increment(); 2300 case Token::TK_FlowMappingEnd: 2301 getNext(); 2302 [[fallthrough]]; 2303 case Token::TK_Error: 2304 // Set this to end iterator. 2305 IsAtEnd = true; 2306 CurrentEntry = nullptr; 2307 break; 2308 default: 2309 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2310 "Mapping End." 2311 , T); 2312 IsAtEnd = true; 2313 CurrentEntry = nullptr; 2314 } 2315 } 2316 } 2317 2318 void SequenceNode::increment() { 2319 if (failed()) { 2320 IsAtEnd = true; 2321 CurrentEntry = nullptr; 2322 return; 2323 } 2324 if (CurrentEntry) 2325 CurrentEntry->skip(); 2326 Token T = peekNext(); 2327 if (SeqType == ST_Block) { 2328 switch (T.Kind) { 2329 case Token::TK_BlockEntry: 2330 getNext(); 2331 CurrentEntry = parseBlockNode(); 2332 if (!CurrentEntry) { // An error occurred. 2333 IsAtEnd = true; 2334 CurrentEntry = nullptr; 2335 } 2336 break; 2337 case Token::TK_BlockEnd: 2338 getNext(); 2339 IsAtEnd = true; 2340 CurrentEntry = nullptr; 2341 break; 2342 default: 2343 setError( "Unexpected token. Expected Block Entry or Block End." 2344 , T); 2345 [[fallthrough]]; 2346 case Token::TK_Error: 2347 IsAtEnd = true; 2348 CurrentEntry = nullptr; 2349 } 2350 } else if (SeqType == ST_Indentless) { 2351 switch (T.Kind) { 2352 case Token::TK_BlockEntry: 2353 getNext(); 2354 CurrentEntry = parseBlockNode(); 2355 if (!CurrentEntry) { // An error occurred. 2356 IsAtEnd = true; 2357 CurrentEntry = nullptr; 2358 } 2359 break; 2360 default: 2361 case Token::TK_Error: 2362 IsAtEnd = true; 2363 CurrentEntry = nullptr; 2364 } 2365 } else if (SeqType == ST_Flow) { 2366 switch (T.Kind) { 2367 case Token::TK_FlowEntry: 2368 // Eat the flow entry and recurse. 2369 getNext(); 2370 WasPreviousTokenFlowEntry = true; 2371 return increment(); 2372 case Token::TK_FlowSequenceEnd: 2373 getNext(); 2374 [[fallthrough]]; 2375 case Token::TK_Error: 2376 // Set this to end iterator. 2377 IsAtEnd = true; 2378 CurrentEntry = nullptr; 2379 break; 2380 case Token::TK_StreamEnd: 2381 case Token::TK_DocumentEnd: 2382 case Token::TK_DocumentStart: 2383 setError("Could not find closing ]!", T); 2384 // Set this to end iterator. 2385 IsAtEnd = true; 2386 CurrentEntry = nullptr; 2387 break; 2388 default: 2389 if (!WasPreviousTokenFlowEntry) { 2390 setError("Expected , between entries!", T); 2391 IsAtEnd = true; 2392 CurrentEntry = nullptr; 2393 break; 2394 } 2395 // Otherwise it must be a flow entry. 2396 CurrentEntry = parseBlockNode(); 2397 if (!CurrentEntry) { 2398 IsAtEnd = true; 2399 } 2400 WasPreviousTokenFlowEntry = false; 2401 break; 2402 } 2403 } 2404 } 2405 2406 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2407 // Tag maps starts with two default mappings. 2408 TagMap["!"] = "!"; 2409 TagMap["!!"] = "tag:yaml.org,2002:"; 2410 2411 if (parseDirectives()) 2412 expectToken(Token::TK_DocumentStart); 2413 Token &T = peekNext(); 2414 if (T.Kind == Token::TK_DocumentStart) 2415 getNext(); 2416 } 2417 2418 bool Document::skip() { 2419 if (stream.scanner->failed()) 2420 return false; 2421 if (!Root && !getRoot()) 2422 return false; 2423 Root->skip(); 2424 Token &T = peekNext(); 2425 if (T.Kind == Token::TK_StreamEnd) 2426 return false; 2427 if (T.Kind == Token::TK_DocumentEnd) { 2428 getNext(); 2429 return skip(); 2430 } 2431 return true; 2432 } 2433 2434 Token &Document::peekNext() { 2435 return stream.scanner->peekNext(); 2436 } 2437 2438 Token Document::getNext() { 2439 return stream.scanner->getNext(); 2440 } 2441 2442 void Document::setError(const Twine &Message, Token &Location) const { 2443 stream.scanner->setError(Message, Location.Range.begin()); 2444 } 2445 2446 bool Document::failed() const { 2447 return stream.scanner->failed(); 2448 } 2449 2450 Node *Document::parseBlockNode() { 2451 Token T = peekNext(); 2452 // Handle properties. 2453 Token AnchorInfo; 2454 Token TagInfo; 2455 parse_property: 2456 switch (T.Kind) { 2457 case Token::TK_Alias: 2458 getNext(); 2459 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2460 case Token::TK_Anchor: 2461 if (AnchorInfo.Kind == Token::TK_Anchor) { 2462 setError("Already encountered an anchor for this node!", T); 2463 return nullptr; 2464 } 2465 AnchorInfo = getNext(); // Consume TK_Anchor. 2466 T = peekNext(); 2467 goto parse_property; 2468 case Token::TK_Tag: 2469 if (TagInfo.Kind == Token::TK_Tag) { 2470 setError("Already encountered a tag for this node!", T); 2471 return nullptr; 2472 } 2473 TagInfo = getNext(); // Consume TK_Tag. 2474 T = peekNext(); 2475 goto parse_property; 2476 default: 2477 break; 2478 } 2479 2480 switch (T.Kind) { 2481 case Token::TK_BlockEntry: 2482 // We got an unindented BlockEntry sequence. This is not terminated with 2483 // a BlockEnd. 2484 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2485 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2486 , AnchorInfo.Range.substr(1) 2487 , TagInfo.Range 2488 , SequenceNode::ST_Indentless); 2489 case Token::TK_BlockSequenceStart: 2490 getNext(); 2491 return new (NodeAllocator) 2492 SequenceNode( stream.CurrentDoc 2493 , AnchorInfo.Range.substr(1) 2494 , TagInfo.Range 2495 , SequenceNode::ST_Block); 2496 case Token::TK_BlockMappingStart: 2497 getNext(); 2498 return new (NodeAllocator) 2499 MappingNode( stream.CurrentDoc 2500 , AnchorInfo.Range.substr(1) 2501 , TagInfo.Range 2502 , MappingNode::MT_Block); 2503 case Token::TK_FlowSequenceStart: 2504 getNext(); 2505 return new (NodeAllocator) 2506 SequenceNode( stream.CurrentDoc 2507 , AnchorInfo.Range.substr(1) 2508 , TagInfo.Range 2509 , SequenceNode::ST_Flow); 2510 case Token::TK_FlowMappingStart: 2511 getNext(); 2512 return new (NodeAllocator) 2513 MappingNode( stream.CurrentDoc 2514 , AnchorInfo.Range.substr(1) 2515 , TagInfo.Range 2516 , MappingNode::MT_Flow); 2517 case Token::TK_Scalar: 2518 getNext(); 2519 return new (NodeAllocator) 2520 ScalarNode( stream.CurrentDoc 2521 , AnchorInfo.Range.substr(1) 2522 , TagInfo.Range 2523 , T.Range); 2524 case Token::TK_BlockScalar: { 2525 getNext(); 2526 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2527 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2528 return new (NodeAllocator) 2529 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2530 TagInfo.Range, StrCopy, T.Range); 2531 } 2532 case Token::TK_Key: 2533 // Don't eat the TK_Key, KeyValueNode expects it. 2534 return new (NodeAllocator) 2535 MappingNode( stream.CurrentDoc 2536 , AnchorInfo.Range.substr(1) 2537 , TagInfo.Range 2538 , MappingNode::MT_Inline); 2539 case Token::TK_DocumentStart: 2540 case Token::TK_DocumentEnd: 2541 case Token::TK_StreamEnd: 2542 default: 2543 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2544 // !!null null. 2545 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2546 case Token::TK_FlowMappingEnd: 2547 case Token::TK_FlowSequenceEnd: 2548 case Token::TK_FlowEntry: { 2549 if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root))) 2550 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2551 2552 setError("Unexpected token", T); 2553 return nullptr; 2554 } 2555 case Token::TK_Error: 2556 return nullptr; 2557 } 2558 llvm_unreachable("Control flow shouldn't reach here."); 2559 return nullptr; 2560 } 2561 2562 bool Document::parseDirectives() { 2563 bool isDirective = false; 2564 while (true) { 2565 Token T = peekNext(); 2566 if (T.Kind == Token::TK_TagDirective) { 2567 parseTAGDirective(); 2568 isDirective = true; 2569 } else if (T.Kind == Token::TK_VersionDirective) { 2570 parseYAMLDirective(); 2571 isDirective = true; 2572 } else 2573 break; 2574 } 2575 return isDirective; 2576 } 2577 2578 void Document::parseYAMLDirective() { 2579 getNext(); // Eat %YAML <version> 2580 } 2581 2582 void Document::parseTAGDirective() { 2583 Token Tag = getNext(); // %TAG <handle> <prefix> 2584 StringRef T = Tag.Range; 2585 // Strip %TAG 2586 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2587 std::size_t HandleEnd = T.find_first_of(" \t"); 2588 StringRef TagHandle = T.substr(0, HandleEnd); 2589 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2590 TagMap[TagHandle] = TagPrefix; 2591 } 2592 2593 bool Document::expectToken(int TK) { 2594 Token T = getNext(); 2595 if (T.Kind != TK) { 2596 setError("Unexpected token", T); 2597 return false; 2598 } 2599 return true; 2600 } 2601