1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a YAML parser. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/YAMLParser.h" 14 #include "llvm/ADT/AllocatorList.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/None.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/ADT/SmallString.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/StringExtras.h" 21 #include "llvm/ADT/StringRef.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Support/Compiler.h" 24 #include "llvm/Support/ErrorHandling.h" 25 #include "llvm/Support/MemoryBuffer.h" 26 #include "llvm/Support/SMLoc.h" 27 #include "llvm/Support/SourceMgr.h" 28 #include "llvm/Support/Unicode.h" 29 #include "llvm/Support/raw_ostream.h" 30 #include <algorithm> 31 #include <cassert> 32 #include <cstddef> 33 #include <cstdint> 34 #include <map> 35 #include <memory> 36 #include <string> 37 #include <system_error> 38 #include <utility> 39 40 using namespace llvm; 41 using namespace yaml; 42 43 enum UnicodeEncodingForm { 44 UEF_UTF32_LE, ///< UTF-32 Little Endian 45 UEF_UTF32_BE, ///< UTF-32 Big Endian 46 UEF_UTF16_LE, ///< UTF-16 Little Endian 47 UEF_UTF16_BE, ///< UTF-16 Big Endian 48 UEF_UTF8, ///< UTF-8 or ascii. 49 UEF_Unknown ///< Not a valid Unicode encoding. 50 }; 51 52 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 53 /// it exists. Length is in {0, 2, 3, 4}. 
54 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>; 55 56 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 57 /// encoding form of \a Input. 58 /// 59 /// @param Input A string of length 0 or more. 60 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 61 /// and how long the byte order mark is if one exists. 62 static EncodingInfo getUnicodeEncoding(StringRef Input) { 63 if (Input.empty()) 64 return std::make_pair(UEF_Unknown, 0); 65 66 switch (uint8_t(Input[0])) { 67 case 0x00: 68 if (Input.size() >= 4) { 69 if ( Input[1] == 0 70 && uint8_t(Input[2]) == 0xFE 71 && uint8_t(Input[3]) == 0xFF) 72 return std::make_pair(UEF_UTF32_BE, 4); 73 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 74 return std::make_pair(UEF_UTF32_BE, 0); 75 } 76 77 if (Input.size() >= 2 && Input[1] != 0) 78 return std::make_pair(UEF_UTF16_BE, 0); 79 return std::make_pair(UEF_Unknown, 0); 80 case 0xFF: 81 if ( Input.size() >= 4 82 && uint8_t(Input[1]) == 0xFE 83 && Input[2] == 0 84 && Input[3] == 0) 85 return std::make_pair(UEF_UTF32_LE, 4); 86 87 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 88 return std::make_pair(UEF_UTF16_LE, 2); 89 return std::make_pair(UEF_Unknown, 0); 90 case 0xFE: 91 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 92 return std::make_pair(UEF_UTF16_BE, 2); 93 return std::make_pair(UEF_Unknown, 0); 94 case 0xEF: 95 if ( Input.size() >= 3 96 && uint8_t(Input[1]) == 0xBB 97 && uint8_t(Input[2]) == 0xBF) 98 return std::make_pair(UEF_UTF8, 3); 99 return std::make_pair(UEF_Unknown, 0); 100 } 101 102 // It could still be utf-32 or utf-16. 103 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 104 return std::make_pair(UEF_UTF32_LE, 0); 105 106 if (Input.size() >= 2 && Input[1] == 0) 107 return std::make_pair(UEF_UTF16_LE, 0); 108 109 return std::make_pair(UEF_UTF8, 0); 110 } 111 112 /// Pin the vtables to this file. 
113 void Node::anchor() {} 114 void NullNode::anchor() {} 115 void ScalarNode::anchor() {} 116 void BlockScalarNode::anchor() {} 117 void KeyValueNode::anchor() {} 118 void MappingNode::anchor() {} 119 void SequenceNode::anchor() {} 120 void AliasNode::anchor() {} 121 122 namespace llvm { 123 namespace yaml { 124 125 /// Token - A single YAML token. 126 struct Token { 127 enum TokenKind { 128 TK_Error, // Uninitialized token. 129 TK_StreamStart, 130 TK_StreamEnd, 131 TK_VersionDirective, 132 TK_TagDirective, 133 TK_DocumentStart, 134 TK_DocumentEnd, 135 TK_BlockEntry, 136 TK_BlockEnd, 137 TK_BlockSequenceStart, 138 TK_BlockMappingStart, 139 TK_FlowEntry, 140 TK_FlowSequenceStart, 141 TK_FlowSequenceEnd, 142 TK_FlowMappingStart, 143 TK_FlowMappingEnd, 144 TK_Key, 145 TK_Value, 146 TK_Scalar, 147 TK_BlockScalar, 148 TK_Alias, 149 TK_Anchor, 150 TK_Tag 151 } Kind = TK_Error; 152 153 /// A string of length 0 or more whose begin() points to the logical location 154 /// of the token in the input. 155 StringRef Range; 156 157 /// The value of a block scalar node. 158 std::string Value; 159 160 Token() = default; 161 }; 162 163 } // end namespace yaml 164 } // end namespace llvm 165 166 using TokenQueueT = BumpPtrList<Token>; 167 168 namespace { 169 170 /// This struct is used to track simple keys. 171 /// 172 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 173 /// which could legally be the start of a simple key. When peekNext is called, 174 /// if the Token To be returned is referenced by a SimpleKey, we continue 175 /// tokenizing until that potential simple key has either been found to not be 176 /// a simple key (we moved on to the next line or went further than 1024 chars). 177 /// Or when we run into a Value, and then insert a Key token (and possibly 178 /// others) before the SimpleKey's Tok. 
179 struct SimpleKey { 180 TokenQueueT::iterator Tok; 181 unsigned Column = 0; 182 unsigned Line = 0; 183 unsigned FlowLevel = 0; 184 bool IsRequired = false; 185 186 bool operator ==(const SimpleKey &Other) { 187 return Tok == Other.Tok; 188 } 189 }; 190 191 } // end anonymous namespace 192 193 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit 194 /// subsequence and the subsequence's length in code units (uint8_t). 195 /// A length of 0 represents an error. 196 using UTF8Decoded = std::pair<uint32_t, unsigned>; 197 198 static UTF8Decoded decodeUTF8(StringRef Range) { 199 StringRef::iterator Position= Range.begin(); 200 StringRef::iterator End = Range.end(); 201 // 1 byte: [0x00, 0x7f] 202 // Bit pattern: 0xxxxxxx 203 if (Position < End && (*Position & 0x80) == 0) { 204 return std::make_pair(*Position, 1); 205 } 206 // 2 bytes: [0x80, 0x7ff] 207 // Bit pattern: 110xxxxx 10xxxxxx 208 if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) && 209 ((*(Position + 1) & 0xC0) == 0x80)) { 210 uint32_t codepoint = ((*Position & 0x1F) << 6) | 211 (*(Position + 1) & 0x3F); 212 if (codepoint >= 0x80) 213 return std::make_pair(codepoint, 2); 214 } 215 // 3 bytes: [0x8000, 0xffff] 216 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 217 if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) && 218 ((*(Position + 1) & 0xC0) == 0x80) && 219 ((*(Position + 2) & 0xC0) == 0x80)) { 220 uint32_t codepoint = ((*Position & 0x0F) << 12) | 221 ((*(Position + 1) & 0x3F) << 6) | 222 (*(Position + 2) & 0x3F); 223 // Codepoints between 0xD800 and 0xDFFF are invalid, as 224 // they are high / low surrogate halves used by UTF-16. 
225 if (codepoint >= 0x800 && 226 (codepoint < 0xD800 || codepoint > 0xDFFF)) 227 return std::make_pair(codepoint, 3); 228 } 229 // 4 bytes: [0x10000, 0x10FFFF] 230 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 231 if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) && 232 ((*(Position + 1) & 0xC0) == 0x80) && 233 ((*(Position + 2) & 0xC0) == 0x80) && 234 ((*(Position + 3) & 0xC0) == 0x80)) { 235 uint32_t codepoint = ((*Position & 0x07) << 18) | 236 ((*(Position + 1) & 0x3F) << 12) | 237 ((*(Position + 2) & 0x3F) << 6) | 238 (*(Position + 3) & 0x3F); 239 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 240 return std::make_pair(codepoint, 4); 241 } 242 return std::make_pair(0, 0); 243 } 244 245 namespace llvm { 246 namespace yaml { 247 248 /// Scans YAML tokens from a MemoryBuffer. 249 class Scanner { 250 public: 251 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 252 std::error_code *EC = nullptr); 253 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 254 std::error_code *EC = nullptr); 255 256 /// Parse the next token and return it without popping it. 257 Token &peekNext(); 258 259 /// Parse the next token and pop it from the queue. 260 Token getNext(); 261 262 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 263 ArrayRef<SMRange> Ranges = None) { 264 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 265 } 266 267 void setError(const Twine &Message, StringRef::iterator Position) { 268 if (Position >= End) 269 Position = End - 1; 270 271 // propagate the error if possible 272 if (EC) 273 *EC = make_error_code(std::errc::invalid_argument); 274 275 // Don't print out more errors after the first one we encounter. The rest 276 // are just the result of the first, and have no meaning. 277 if (!Failed) 278 printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message); 279 Failed = true; 280 } 281 282 /// Returns true if an error occurred while parsing. 
283 bool failed() { 284 return Failed; 285 } 286 287 private: 288 void init(MemoryBufferRef Buffer); 289 290 StringRef currentInput() { 291 return StringRef(Current, End - Current); 292 } 293 294 /// Decode a UTF-8 minimal well-formed code unit subsequence starting 295 /// at \a Position. 296 /// 297 /// If the UTF-8 code units starting at Position do not form a well-formed 298 /// code unit subsequence, then the Unicode scalar value is 0, and the length 299 /// is 0. 300 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 301 return ::decodeUTF8(StringRef(Position, End - Position)); 302 } 303 304 // The following functions are based on the gramar rules in the YAML spec. The 305 // style of the function names it meant to closely match how they are written 306 // in the spec. The number within the [] is the number of the grammar rule in 307 // the spec. 308 // 309 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 310 // 311 // c- 312 // A production starting and ending with a special character. 313 // b- 314 // A production matching a single line break. 315 // nb- 316 // A production starting and ending with a non-break character. 317 // s- 318 // A production starting and ending with a white space character. 319 // ns- 320 // A production starting and ending with a non-space character. 321 // l- 322 // A production matching complete line(s). 323 324 /// Skip a single nb-char[27] starting at Position. 325 /// 326 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 327 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 328 /// 329 /// @returns The code unit after the nb-char, or Position if it's not an 330 /// nb-char. 331 StringRef::iterator skip_nb_char(StringRef::iterator Position); 332 333 /// Skip a single b-break[28] starting at Position. 334 /// 335 /// A b-break is 0xD 0xA | 0xD | 0xA 336 /// 337 /// @returns The code unit after the b-break, or Position if it's not a 338 /// b-break. 
339 StringRef::iterator skip_b_break(StringRef::iterator Position); 340 341 /// Skip a single s-space[31] starting at Position. 342 /// 343 /// An s-space is 0x20 344 /// 345 /// @returns The code unit after the s-space, or Position if it's not a 346 /// s-space. 347 StringRef::iterator skip_s_space(StringRef::iterator Position); 348 349 /// Skip a single s-white[33] starting at Position. 350 /// 351 /// A s-white is 0x20 | 0x9 352 /// 353 /// @returns The code unit after the s-white, or Position if it's not a 354 /// s-white. 355 StringRef::iterator skip_s_white(StringRef::iterator Position); 356 357 /// Skip a single ns-char[34] starting at Position. 358 /// 359 /// A ns-char is nb-char - s-white 360 /// 361 /// @returns The code unit after the ns-char, or Position if it's not a 362 /// ns-char. 363 StringRef::iterator skip_ns_char(StringRef::iterator Position); 364 365 using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); 366 367 /// Skip minimal well-formed code unit subsequences until Func 368 /// returns its input. 369 /// 370 /// @returns The code unit after the last minimal well-formed code unit 371 /// subsequence that Func accepted. 372 StringRef::iterator skip_while( SkipWhileFunc Func 373 , StringRef::iterator Position); 374 375 /// Skip minimal well-formed code unit subsequences until Func returns its 376 /// input. 377 void advanceWhile(SkipWhileFunc Func); 378 379 /// Scan ns-uri-char[39]s starting at Cur. 380 /// 381 /// This updates Cur and Column while scanning. 382 void scan_ns_uri_char(); 383 384 /// Consume a minimal well-formed code unit subsequence starting at 385 /// \a Cur. Return false if it is not the same Unicode scalar value as 386 /// \a Expected. This updates \a Column. 387 bool consume(uint32_t Expected); 388 389 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 
390 void skip(uint32_t Distance); 391 392 /// Return true if the minimal well-formed code unit subsequence at 393 /// Pos is whitespace or a new line 394 bool isBlankOrBreak(StringRef::iterator Position); 395 396 /// Consume a single b-break[28] if it's present at the current position. 397 /// 398 /// Return false if the code unit at the current position isn't a line break. 399 bool consumeLineBreakIfPresent(); 400 401 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 402 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 403 , unsigned AtColumn 404 , bool IsRequired); 405 406 /// Remove simple keys that can no longer be valid simple keys. 407 /// 408 /// Invalid simple keys are not on the current line or are further than 1024 409 /// columns back. 410 void removeStaleSimpleKeyCandidates(); 411 412 /// Remove all simple keys on FlowLevel \a Level. 413 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 414 415 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 416 /// tokens if needed. 417 bool unrollIndent(int ToColumn); 418 419 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 420 /// if needed. 421 bool rollIndent( int ToColumn 422 , Token::TokenKind Kind 423 , TokenQueueT::iterator InsertPoint); 424 425 /// Skip a single-line comment when the comment starts at the current 426 /// position of the scanner. 427 void skipComment(); 428 429 /// Skip whitespace and comments until the start of the next token. 430 void scanToNextToken(); 431 432 /// Must be the first token generated. 433 bool scanStreamStart(); 434 435 /// Generate tokens needed to close out the stream. 436 bool scanStreamEnd(); 437 438 /// Scan a %BLAH directive. 439 bool scanDirective(); 440 441 /// Scan a ... or ---. 442 bool scanDocumentIndicator(bool IsStart); 443 444 /// Scan a [ or { and generate the proper flow collection start token. 
445 bool scanFlowCollectionStart(bool IsSequence); 446 447 /// Scan a ] or } and generate the proper flow collection end token. 448 bool scanFlowCollectionEnd(bool IsSequence); 449 450 /// Scan the , that separates entries in a flow collection. 451 bool scanFlowEntry(); 452 453 /// Scan the - that starts block sequence entries. 454 bool scanBlockEntry(); 455 456 /// Scan an explicit ? indicating a key. 457 bool scanKey(); 458 459 /// Scan an explicit : indicating a value. 460 bool scanValue(); 461 462 /// Scan a quoted scalar. 463 bool scanFlowScalar(bool IsDoubleQuoted); 464 465 /// Scan an unquoted scalar. 466 bool scanPlainScalar(); 467 468 /// Scan an Alias or Anchor starting with * or &. 469 bool scanAliasOrAnchor(bool IsAlias); 470 471 /// Scan a block scalar starting with | or >. 472 bool scanBlockScalar(bool IsLiteral); 473 474 /// Scan a chomping indicator in a block scalar header. 475 char scanBlockChompingIndicator(); 476 477 /// Scan an indentation indicator in a block scalar header. 478 unsigned scanBlockIndentationIndicator(); 479 480 /// Scan a block scalar header. 481 /// 482 /// Return false if an error occurred. 483 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 484 bool &IsDone); 485 486 /// Look for the indentation level of a block scalar. 487 /// 488 /// Return false if an error occurred. 489 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 490 unsigned &LineBreaks, bool &IsDone); 491 492 /// Scan the indentation of a text line in a block scalar. 493 /// 494 /// Return false if an error occurred. 495 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 496 bool &IsDone); 497 498 /// Scan a tag of the form !stuff. 499 bool scanTag(); 500 501 /// Dispatch to the next scanning function based on \a *Cur. 502 bool fetchMoreTokens(); 503 504 /// The SourceMgr used for diagnostics and buffer management. 505 SourceMgr &SM; 506 507 /// The original input. 
508 MemoryBufferRef InputBuffer; 509 510 /// The current position of the scanner. 511 StringRef::iterator Current; 512 513 /// The end of the input (one past the last character). 514 StringRef::iterator End; 515 516 /// Current YAML indentation level in spaces. 517 int Indent; 518 519 /// Current column number in Unicode code points. 520 unsigned Column; 521 522 /// Current line number. 523 unsigned Line; 524 525 /// How deep we are in flow style containers. 0 Means at block level. 526 unsigned FlowLevel; 527 528 /// Are we at the start of the stream? 529 bool IsStartOfStream; 530 531 /// Can the next token be the start of a simple key? 532 bool IsSimpleKeyAllowed; 533 534 /// True if an error has occurred. 535 bool Failed; 536 537 /// Should colors be used when printing out the diagnostic messages? 538 bool ShowColors; 539 540 /// Queue of tokens. This is required to queue up tokens while looking 541 /// for the end of a simple key. And for cases where a single character 542 /// can produce multiple tokens (e.g. BlockEnd). 543 TokenQueueT TokenQueue; 544 545 /// Indentation levels. 546 SmallVector<int, 4> Indents; 547 548 /// Potential simple keys. 549 SmallVector<SimpleKey, 4> SimpleKeys; 550 551 std::error_code *EC; 552 }; 553 554 } // end namespace yaml 555 } // end namespace llvm 556 557 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 
558 static void encodeUTF8( uint32_t UnicodeScalarValue 559 , SmallVectorImpl<char> &Result) { 560 if (UnicodeScalarValue <= 0x7F) { 561 Result.push_back(UnicodeScalarValue & 0x7F); 562 } else if (UnicodeScalarValue <= 0x7FF) { 563 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 564 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 565 Result.push_back(FirstByte); 566 Result.push_back(SecondByte); 567 } else if (UnicodeScalarValue <= 0xFFFF) { 568 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 569 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 570 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 571 Result.push_back(FirstByte); 572 Result.push_back(SecondByte); 573 Result.push_back(ThirdByte); 574 } else if (UnicodeScalarValue <= 0x10FFFF) { 575 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 576 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 577 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 578 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 579 Result.push_back(FirstByte); 580 Result.push_back(SecondByte); 581 Result.push_back(ThirdByte); 582 Result.push_back(FourthByte); 583 } 584 } 585 586 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 587 SourceMgr SM; 588 Scanner scanner(Input, SM); 589 while (true) { 590 Token T = scanner.getNext(); 591 switch (T.Kind) { 592 case Token::TK_StreamStart: 593 OS << "Stream-Start: "; 594 break; 595 case Token::TK_StreamEnd: 596 OS << "Stream-End: "; 597 break; 598 case Token::TK_VersionDirective: 599 OS << "Version-Directive: "; 600 break; 601 case Token::TK_TagDirective: 602 OS << "Tag-Directive: "; 603 break; 604 case Token::TK_DocumentStart: 605 OS << "Document-Start: "; 606 break; 607 case Token::TK_DocumentEnd: 608 OS << "Document-End: "; 609 break; 610 case Token::TK_BlockEntry: 611 OS << "Block-Entry: "; 612 break; 613 case Token::TK_BlockEnd: 614 OS << "Block-End: "; 615 break; 
616 case Token::TK_BlockSequenceStart: 617 OS << "Block-Sequence-Start: "; 618 break; 619 case Token::TK_BlockMappingStart: 620 OS << "Block-Mapping-Start: "; 621 break; 622 case Token::TK_FlowEntry: 623 OS << "Flow-Entry: "; 624 break; 625 case Token::TK_FlowSequenceStart: 626 OS << "Flow-Sequence-Start: "; 627 break; 628 case Token::TK_FlowSequenceEnd: 629 OS << "Flow-Sequence-End: "; 630 break; 631 case Token::TK_FlowMappingStart: 632 OS << "Flow-Mapping-Start: "; 633 break; 634 case Token::TK_FlowMappingEnd: 635 OS << "Flow-Mapping-End: "; 636 break; 637 case Token::TK_Key: 638 OS << "Key: "; 639 break; 640 case Token::TK_Value: 641 OS << "Value: "; 642 break; 643 case Token::TK_Scalar: 644 OS << "Scalar: "; 645 break; 646 case Token::TK_BlockScalar: 647 OS << "Block Scalar: "; 648 break; 649 case Token::TK_Alias: 650 OS << "Alias: "; 651 break; 652 case Token::TK_Anchor: 653 OS << "Anchor: "; 654 break; 655 case Token::TK_Tag: 656 OS << "Tag: "; 657 break; 658 case Token::TK_Error: 659 break; 660 } 661 OS << T.Range << "\n"; 662 if (T.Kind == Token::TK_StreamEnd) 663 break; 664 else if (T.Kind == Token::TK_Error) 665 return false; 666 } 667 return true; 668 } 669 670 bool yaml::scanTokens(StringRef Input) { 671 SourceMgr SM; 672 Scanner scanner(Input, SM); 673 while (true) { 674 Token T = scanner.getNext(); 675 if (T.Kind == Token::TK_StreamEnd) 676 break; 677 else if (T.Kind == Token::TK_Error) 678 return false; 679 } 680 return true; 681 } 682 683 std::string yaml::escape(StringRef Input, bool EscapePrintable) { 684 std::string EscapedInput; 685 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 686 if (*i == '\\') 687 EscapedInput += "\\\\"; 688 else if (*i == '"') 689 EscapedInput += "\\\""; 690 else if (*i == 0) 691 EscapedInput += "\\0"; 692 else if (*i == 0x07) 693 EscapedInput += "\\a"; 694 else if (*i == 0x08) 695 EscapedInput += "\\b"; 696 else if (*i == 0x09) 697 EscapedInput += "\\t"; 698 else if (*i == 0x0A) 699 
EscapedInput += "\\n"; 700 else if (*i == 0x0B) 701 EscapedInput += "\\v"; 702 else if (*i == 0x0C) 703 EscapedInput += "\\f"; 704 else if (*i == 0x0D) 705 EscapedInput += "\\r"; 706 else if (*i == 0x1B) 707 EscapedInput += "\\e"; 708 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 709 std::string HexStr = utohexstr(*i); 710 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 711 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 712 UTF8Decoded UnicodeScalarValue 713 = decodeUTF8(StringRef(i, Input.end() - i)); 714 if (UnicodeScalarValue.second == 0) { 715 // Found invalid char. 716 SmallString<4> Val; 717 encodeUTF8(0xFFFD, Val); 718 llvm::append_range(EscapedInput, Val); 719 // FIXME: Error reporting. 720 return EscapedInput; 721 } 722 if (UnicodeScalarValue.first == 0x85) 723 EscapedInput += "\\N"; 724 else if (UnicodeScalarValue.first == 0xA0) 725 EscapedInput += "\\_"; 726 else if (UnicodeScalarValue.first == 0x2028) 727 EscapedInput += "\\L"; 728 else if (UnicodeScalarValue.first == 0x2029) 729 EscapedInput += "\\P"; 730 else if (!EscapePrintable && 731 sys::unicode::isPrintable(UnicodeScalarValue.first)) 732 EscapedInput += StringRef(i, UnicodeScalarValue.second); 733 else { 734 std::string HexStr = utohexstr(UnicodeScalarValue.first); 735 if (HexStr.size() <= 2) 736 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 737 else if (HexStr.size() <= 4) 738 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 739 else if (HexStr.size() <= 8) 740 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 741 } 742 i += UnicodeScalarValue.second - 1; 743 } else 744 EscapedInput.push_back(*i); 745 } 746 return EscapedInput; 747 } 748 749 llvm::Optional<bool> yaml::parseBool(StringRef S) { 750 switch (S.size()) { 751 case 1: 752 switch (S.front()) { 753 case 'y': 754 case 'Y': 755 return true; 756 case 'n': 757 case 'N': 758 return false; 759 default: 760 
return None; 761 } 762 case 2: 763 switch (S.front()) { 764 case 'O': 765 if (S[1] == 'N') // ON 766 return true; 767 LLVM_FALLTHROUGH; 768 case 'o': 769 if (S[1] == 'n') //[Oo]n 770 return true; 771 return None; 772 case 'N': 773 if (S[1] == 'O') // NO 774 return false; 775 LLVM_FALLTHROUGH; 776 case 'n': 777 if (S[1] == 'o') //[Nn]o 778 return false; 779 return None; 780 default: 781 return None; 782 } 783 case 3: 784 switch (S.front()) { 785 case 'O': 786 if (S.drop_front() == "FF") // OFF 787 return false; 788 LLVM_FALLTHROUGH; 789 case 'o': 790 if (S.drop_front() == "ff") //[Oo]ff 791 return false; 792 return None; 793 case 'Y': 794 if (S.drop_front() == "ES") // YES 795 return true; 796 LLVM_FALLTHROUGH; 797 case 'y': 798 if (S.drop_front() == "es") //[Yy]es 799 return true; 800 return None; 801 default: 802 return None; 803 } 804 case 4: 805 switch (S.front()) { 806 case 'T': 807 if (S.drop_front() == "RUE") // TRUE 808 return true; 809 LLVM_FALLTHROUGH; 810 case 't': 811 if (S.drop_front() == "rue") //[Tt]rue 812 return true; 813 return None; 814 default: 815 return None; 816 } 817 case 5: 818 switch (S.front()) { 819 case 'F': 820 if (S.drop_front() == "ALSE") // FALSE 821 return false; 822 LLVM_FALLTHROUGH; 823 case 'f': 824 if (S.drop_front() == "alse") //[Ff]alse 825 return false; 826 return None; 827 default: 828 return None; 829 } 830 default: 831 return None; 832 } 833 } 834 835 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 836 std::error_code *EC) 837 : SM(sm), ShowColors(ShowColors), EC(EC) { 838 init(MemoryBufferRef(Input, "YAML")); 839 } 840 841 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 842 std::error_code *EC) 843 : SM(SM_), ShowColors(ShowColors), EC(EC) { 844 init(Buffer); 845 } 846 847 void Scanner::init(MemoryBufferRef Buffer) { 848 InputBuffer = Buffer; 849 Current = InputBuffer.getBufferStart(); 850 End = InputBuffer.getBufferEnd(); 851 Indent = -1; 852 Column = 0; 853 Line = 0; 854 
FlowLevel = 0; 855 IsStartOfStream = true; 856 IsSimpleKeyAllowed = true; 857 Failed = false; 858 std::unique_ptr<MemoryBuffer> InputBufferOwner = 859 MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false); 860 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 861 } 862 863 Token &Scanner::peekNext() { 864 // If the current token is a possible simple key, keep parsing until we 865 // can confirm. 866 bool NeedMore = false; 867 while (true) { 868 if (TokenQueue.empty() || NeedMore) { 869 if (!fetchMoreTokens()) { 870 TokenQueue.clear(); 871 SimpleKeys.clear(); 872 TokenQueue.push_back(Token()); 873 return TokenQueue.front(); 874 } 875 } 876 assert(!TokenQueue.empty() && 877 "fetchMoreTokens lied about getting tokens!"); 878 879 removeStaleSimpleKeyCandidates(); 880 SimpleKey SK; 881 SK.Tok = TokenQueue.begin(); 882 if (!is_contained(SimpleKeys, SK)) 883 break; 884 else 885 NeedMore = true; 886 } 887 return TokenQueue.front(); 888 } 889 890 Token Scanner::getNext() { 891 Token Ret = peekNext(); 892 // TokenQueue can be empty if there was an error getting the next token. 893 if (!TokenQueue.empty()) 894 TokenQueue.pop_front(); 895 896 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 897 // quick deallocation of them all. 898 if (TokenQueue.empty()) 899 TokenQueue.resetAlloc(); 900 901 return Ret; 902 } 903 904 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 905 if (Position == End) 906 return Position; 907 // Check 7 bit c-printable - b-char. 908 if ( *Position == 0x09 909 || (*Position >= 0x20 && *Position <= 0x7E)) 910 return Position + 1; 911 912 // Check for valid UTF-8. 
913 if (uint8_t(*Position) & 0x80) { 914 UTF8Decoded u8d = decodeUTF8(Position); 915 if ( u8d.second != 0 916 && u8d.first != 0xFEFF 917 && ( u8d.first == 0x85 918 || ( u8d.first >= 0xA0 919 && u8d.first <= 0xD7FF) 920 || ( u8d.first >= 0xE000 921 && u8d.first <= 0xFFFD) 922 || ( u8d.first >= 0x10000 923 && u8d.first <= 0x10FFFF))) 924 return Position + u8d.second; 925 } 926 return Position; 927 } 928 929 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 930 if (Position == End) 931 return Position; 932 if (*Position == 0x0D) { 933 if (Position + 1 != End && *(Position + 1) == 0x0A) 934 return Position + 2; 935 return Position + 1; 936 } 937 938 if (*Position == 0x0A) 939 return Position + 1; 940 return Position; 941 } 942 943 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 944 if (Position == End) 945 return Position; 946 if (*Position == ' ') 947 return Position + 1; 948 return Position; 949 } 950 951 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 952 if (Position == End) 953 return Position; 954 if (*Position == ' ' || *Position == '\t') 955 return Position + 1; 956 return Position; 957 } 958 959 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 960 if (Position == End) 961 return Position; 962 if (*Position == ' ' || *Position == '\t') 963 return Position; 964 return skip_nb_char(Position); 965 } 966 967 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 968 , StringRef::iterator Position) { 969 while (true) { 970 StringRef::iterator i = (this->*Func)(Position); 971 if (i == Position) 972 break; 973 Position = i; 974 } 975 return Position; 976 } 977 978 void Scanner::advanceWhile(SkipWhileFunc Func) { 979 auto Final = skip_while(Func, Current); 980 Column += Final - Current; 981 Current = Final; 982 } 983 984 static bool is_ns_hex_digit(const char C) { 985 return (C >= '0' && C <= '9') 986 || (C >= 'a' && C <= 'z') 987 || (C >= 'A' && C <= 'Z'); 988 } 
989 990 static bool is_ns_word_char(const char C) { 991 return C == '-' 992 || (C >= 'a' && C <= 'z') 993 || (C >= 'A' && C <= 'Z'); 994 } 995 996 void Scanner::scan_ns_uri_char() { 997 while (true) { 998 if (Current == End) 999 break; 1000 if (( *Current == '%' 1001 && Current + 2 < End 1002 && is_ns_hex_digit(*(Current + 1)) 1003 && is_ns_hex_digit(*(Current + 2))) 1004 || is_ns_word_char(*Current) 1005 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 1006 != StringRef::npos) { 1007 ++Current; 1008 ++Column; 1009 } else 1010 break; 1011 } 1012 } 1013 1014 bool Scanner::consume(uint32_t Expected) { 1015 if (Expected >= 0x80) { 1016 setError("Cannot consume non-ascii characters", Current); 1017 return false; 1018 } 1019 if (Current == End) 1020 return false; 1021 if (uint8_t(*Current) >= 0x80) { 1022 setError("Cannot consume non-ascii characters", Current); 1023 return false; 1024 } 1025 if (uint8_t(*Current) == Expected) { 1026 ++Current; 1027 ++Column; 1028 return true; 1029 } 1030 return false; 1031 } 1032 1033 void Scanner::skip(uint32_t Distance) { 1034 Current += Distance; 1035 Column += Distance; 1036 assert(Current <= End && "Skipped past the end"); 1037 } 1038 1039 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 1040 if (Position == End) 1041 return false; 1042 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 1043 *Position == '\n'; 1044 } 1045 1046 bool Scanner::consumeLineBreakIfPresent() { 1047 auto Next = skip_b_break(Current); 1048 if (Next == Current) 1049 return false; 1050 Column = 0; 1051 ++Line; 1052 Current = Next; 1053 return true; 1054 } 1055 1056 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 1057 , unsigned AtColumn 1058 , bool IsRequired) { 1059 if (IsSimpleKeyAllowed) { 1060 SimpleKey SK; 1061 SK.Tok = Tok; 1062 SK.Line = Line; 1063 SK.Column = AtColumn; 1064 SK.IsRequired = IsRequired; 1065 SK.FlowLevel = FlowLevel; 1066 SimpleKeys.push_back(SK); 1067 } 1068 } 1069 1070 void 
Scanner::removeStaleSimpleKeyCandidates() { 1071 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 1072 i != SimpleKeys.end();) { 1073 if (i->Line != Line || i->Column + 1024 < Column) { 1074 if (i->IsRequired) 1075 setError( "Could not find expected : for simple key" 1076 , i->Tok->Range.begin()); 1077 i = SimpleKeys.erase(i); 1078 } else 1079 ++i; 1080 } 1081 } 1082 1083 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 1084 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 1085 SimpleKeys.pop_back(); 1086 } 1087 1088 bool Scanner::unrollIndent(int ToColumn) { 1089 Token T; 1090 // Indentation is ignored in flow. 1091 if (FlowLevel != 0) 1092 return true; 1093 1094 while (Indent > ToColumn) { 1095 T.Kind = Token::TK_BlockEnd; 1096 T.Range = StringRef(Current, 1); 1097 TokenQueue.push_back(T); 1098 Indent = Indents.pop_back_val(); 1099 } 1100 1101 return true; 1102 } 1103 1104 bool Scanner::rollIndent( int ToColumn 1105 , Token::TokenKind Kind 1106 , TokenQueueT::iterator InsertPoint) { 1107 if (FlowLevel) 1108 return true; 1109 if (Indent < ToColumn) { 1110 Indents.push_back(Indent); 1111 Indent = ToColumn; 1112 1113 Token T; 1114 T.Kind = Kind; 1115 T.Range = StringRef(Current, 0); 1116 TokenQueue.insert(InsertPoint, T); 1117 } 1118 return true; 1119 } 1120 1121 void Scanner::skipComment() { 1122 if (Current == End || *Current != '#') 1123 return; 1124 while (true) { 1125 // This may skip more than one byte, thus Column is only incremented 1126 // for code points. 1127 StringRef::iterator I = skip_nb_char(Current); 1128 if (I == Current) 1129 break; 1130 Current = I; 1131 ++Column; 1132 } 1133 } 1134 1135 void Scanner::scanToNextToken() { 1136 while (true) { 1137 while (Current != End && (*Current == ' ' || *Current == '\t')) { 1138 skip(1); 1139 } 1140 1141 skipComment(); 1142 1143 // Skip EOL. 
1144 StringRef::iterator i = skip_b_break(Current); 1145 if (i == Current) 1146 break; 1147 Current = i; 1148 ++Line; 1149 Column = 0; 1150 // New lines may start a simple key. 1151 if (!FlowLevel) 1152 IsSimpleKeyAllowed = true; 1153 } 1154 } 1155 1156 bool Scanner::scanStreamStart() { 1157 IsStartOfStream = false; 1158 1159 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1160 1161 Token T; 1162 T.Kind = Token::TK_StreamStart; 1163 T.Range = StringRef(Current, EI.second); 1164 TokenQueue.push_back(T); 1165 Current += EI.second; 1166 return true; 1167 } 1168 1169 bool Scanner::scanStreamEnd() { 1170 // Force an ending new line if one isn't present. 1171 if (Column != 0) { 1172 Column = 0; 1173 ++Line; 1174 } 1175 1176 unrollIndent(-1); 1177 SimpleKeys.clear(); 1178 IsSimpleKeyAllowed = false; 1179 1180 Token T; 1181 T.Kind = Token::TK_StreamEnd; 1182 T.Range = StringRef(Current, 0); 1183 TokenQueue.push_back(T); 1184 return true; 1185 } 1186 1187 bool Scanner::scanDirective() { 1188 // Reset the indentation level. 
1189 unrollIndent(-1); 1190 SimpleKeys.clear(); 1191 IsSimpleKeyAllowed = false; 1192 1193 StringRef::iterator Start = Current; 1194 consume('%'); 1195 StringRef::iterator NameStart = Current; 1196 Current = skip_while(&Scanner::skip_ns_char, Current); 1197 StringRef Name(NameStart, Current - NameStart); 1198 Current = skip_while(&Scanner::skip_s_white, Current); 1199 1200 Token T; 1201 if (Name == "YAML") { 1202 Current = skip_while(&Scanner::skip_ns_char, Current); 1203 T.Kind = Token::TK_VersionDirective; 1204 T.Range = StringRef(Start, Current - Start); 1205 TokenQueue.push_back(T); 1206 return true; 1207 } else if(Name == "TAG") { 1208 Current = skip_while(&Scanner::skip_ns_char, Current); 1209 Current = skip_while(&Scanner::skip_s_white, Current); 1210 Current = skip_while(&Scanner::skip_ns_char, Current); 1211 T.Kind = Token::TK_TagDirective; 1212 T.Range = StringRef(Start, Current - Start); 1213 TokenQueue.push_back(T); 1214 return true; 1215 } 1216 return false; 1217 } 1218 1219 bool Scanner::scanDocumentIndicator(bool IsStart) { 1220 unrollIndent(-1); 1221 SimpleKeys.clear(); 1222 IsSimpleKeyAllowed = false; 1223 1224 Token T; 1225 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1226 T.Range = StringRef(Current, 3); 1227 skip(3); 1228 TokenQueue.push_back(T); 1229 return true; 1230 } 1231 1232 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1233 Token T; 1234 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1235 : Token::TK_FlowMappingStart; 1236 T.Range = StringRef(Current, 1); 1237 skip(1); 1238 TokenQueue.push_back(T); 1239 1240 // [ and { may begin a simple key. 1241 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1242 1243 // And may also be followed by a simple key. 
1244 IsSimpleKeyAllowed = true; 1245 ++FlowLevel; 1246 return true; 1247 } 1248 1249 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1250 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1251 IsSimpleKeyAllowed = false; 1252 Token T; 1253 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1254 : Token::TK_FlowMappingEnd; 1255 T.Range = StringRef(Current, 1); 1256 skip(1); 1257 TokenQueue.push_back(T); 1258 if (FlowLevel) 1259 --FlowLevel; 1260 return true; 1261 } 1262 1263 bool Scanner::scanFlowEntry() { 1264 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1265 IsSimpleKeyAllowed = true; 1266 Token T; 1267 T.Kind = Token::TK_FlowEntry; 1268 T.Range = StringRef(Current, 1); 1269 skip(1); 1270 TokenQueue.push_back(T); 1271 return true; 1272 } 1273 1274 bool Scanner::scanBlockEntry() { 1275 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1276 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1277 IsSimpleKeyAllowed = true; 1278 Token T; 1279 T.Kind = Token::TK_BlockEntry; 1280 T.Range = StringRef(Current, 1); 1281 skip(1); 1282 TokenQueue.push_back(T); 1283 return true; 1284 } 1285 1286 bool Scanner::scanKey() { 1287 if (!FlowLevel) 1288 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1289 1290 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1291 IsSimpleKeyAllowed = !FlowLevel; 1292 1293 Token T; 1294 T.Kind = Token::TK_Key; 1295 T.Range = StringRef(Current, 1); 1296 skip(1); 1297 TokenQueue.push_back(T); 1298 return true; 1299 } 1300 1301 bool Scanner::scanValue() { 1302 // If the previous token could have been a simple key, insert the key token 1303 // into the token queue. 
// Body of Scanner::scanValue(): emits TK_Value for the ':' at Current.
  if (!SimpleKeys.empty()) {
    // Take the most recent candidate and turn it into a real key.
    SimpleKey SK = SimpleKeys.pop_back_val();
    Token T;
    T.Kind = Token::TK_Key;
    // NOTE(review): SK.Tok is dereferenced here, before the loop below checks
    // that it still refers to an element of TokenQueue.
    T.Range = SK.Tok->Range;
    // Locate the candidate's token in the queue; the TK_Key token is inserted
    // directly in front of it.
    TokenQueueT::iterator i, e;
    for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
      if (i == SK.Tok)
        break;
    }
    if (i == e) {
      // The candidate's token is not in the queue. Fail without a message.
      Failed = true;
      return false;
    }
    i = TokenQueue.insert(i, T);

    // We may also need to add a Block-Mapping-Start token.
    rollIndent(SK.Column, Token::TK_BlockMappingStart, i);

    IsSimpleKeyAllowed = false;
  } else {
    // No candidate: the value has no preceding simple key.
    if (!FlowLevel)
      rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
    IsSimpleKeyAllowed = !FlowLevel;
  }

  Token T;
  T.Kind = Token::TK_Value;
  T.Range = StringRef(Current, 1);
  skip(1);
  TokenQueue.push_back(T);
  return true;
}

// Forbidding inlining improves performance by roughly 20%.
// FIXME: Remove once llvm optimizes this to the faster version without hints.
LLVM_ATTRIBUTE_NOINLINE static bool
wasEscaped(StringRef::iterator First, StringRef::iterator Position);

// Returns whether a character at 'Position' was escaped with a leading '\'.
// 'First' specifies the position of the first character in the string.
// 'Position' must be strictly after 'First' (asserted below).
static bool wasEscaped(StringRef::iterator First,
                       StringRef::iterator Position) {
  assert(Position - 1 >= First);
  StringRef::iterator I = Position - 1;
  // We calculate the number of consecutive '\'s before the current position
  // by iterating backwards through our string.
  while (I >= First && *I == '\\') --I;
  // (Position - 1 - I) now contains the number of '\'s before the current
  // position. If it is odd, the character at 'Position' was escaped.
1354 return (Position - 1 - I) % 2 == 1; 1355 } 1356 1357 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1358 StringRef::iterator Start = Current; 1359 unsigned ColStart = Column; 1360 if (IsDoubleQuoted) { 1361 do { 1362 ++Current; 1363 while (Current != End && *Current != '"') 1364 ++Current; 1365 // Repeat until the previous character was not a '\' or was an escaped 1366 // backslash. 1367 } while ( Current != End 1368 && *(Current - 1) == '\\' 1369 && wasEscaped(Start + 1, Current)); 1370 } else { 1371 skip(1); 1372 while (Current != End) { 1373 // Skip a ' followed by another '. 1374 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1375 skip(2); 1376 continue; 1377 } else if (*Current == '\'') 1378 break; 1379 StringRef::iterator i = skip_nb_char(Current); 1380 if (i == Current) { 1381 i = skip_b_break(Current); 1382 if (i == Current) 1383 break; 1384 Current = i; 1385 Column = 0; 1386 ++Line; 1387 } else { 1388 if (i == End) 1389 break; 1390 Current = i; 1391 ++Column; 1392 } 1393 } 1394 } 1395 1396 if (Current == End) { 1397 setError("Expected quote at end of scalar", Current); 1398 return false; 1399 } 1400 1401 skip(1); // Skip ending quote. 
1402 Token T; 1403 T.Kind = Token::TK_Scalar; 1404 T.Range = StringRef(Start, Current - Start); 1405 TokenQueue.push_back(T); 1406 1407 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1408 1409 IsSimpleKeyAllowed = false; 1410 1411 return true; 1412 } 1413 1414 bool Scanner::scanPlainScalar() { 1415 StringRef::iterator Start = Current; 1416 unsigned ColStart = Column; 1417 unsigned LeadingBlanks = 0; 1418 assert(Indent >= -1 && "Indent must be >= -1 !"); 1419 unsigned indent = static_cast<unsigned>(Indent + 1); 1420 while (Current != End) { 1421 if (*Current == '#') 1422 break; 1423 1424 while (Current != End && !isBlankOrBreak(Current)) { 1425 if (FlowLevel && *Current == ':' && 1426 (Current + 1 == End || 1427 !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) { 1428 setError("Found unexpected ':' while scanning a plain scalar", Current); 1429 return false; 1430 } 1431 1432 // Check for the end of the plain scalar. 1433 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1434 || ( FlowLevel 1435 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1436 != StringRef::npos))) 1437 break; 1438 1439 StringRef::iterator i = skip_nb_char(Current); 1440 if (i == Current) 1441 break; 1442 Current = i; 1443 ++Column; 1444 } 1445 1446 // Are we at the end? 1447 if (!isBlankOrBreak(Current)) 1448 break; 1449 1450 // Eat blanks. 
// Tail of Scanner::scanPlainScalar(): look ahead over blanks and line breaks.
    // Tmp runs ahead of Current; Current is only moved up to Tmp once the
    // white space is known to belong to the scalar.
    StringRef::iterator Tmp = Current;
    while (isBlankOrBreak(Tmp)) {
      StringRef::iterator i = skip_s_white(Tmp);
      if (i != Tmp) {
        // A tab is rejected while still inside the indentation of a
        // continuation line.
        if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
          setError("Found invalid tab character in indentation", Tmp);
          return false;
        }
        Tmp = i;
        ++Column;
      } else {
        // Not white space, so this must be a line break.
        i = skip_b_break(Tmp);
        if (!LeadingBlanks)
          LeadingBlanks = 1;
        Tmp = i;
        Column = 0;
        ++Line;
      }
    }

    // In block context a less indented line ends the scalar.
    if (!FlowLevel && Column < indent)
      break;

    Current = Tmp;
  }
  if (Start == Current) {
    setError("Got empty plain scalar", Start);
    return false;
  }
  Token T;
  T.Kind = Token::TK_Scalar;
  T.Range = StringRef(Start, Current - Start);
  TokenQueue.push_back(T);

  // Plain scalars can be simple keys.
  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

  IsSimpleKeyAllowed = false;

  return true;
}

/// Scans an alias ('*name', \p IsAlias true) or an anchor ('&name') starting
/// at the indicator at Current and emits TK_Alias or TK_Anchor.
///
/// \returns false, with an error set, if the name is empty.
bool Scanner::scanAliasOrAnchor(bool IsAlias) {
  StringRef::iterator Start = Current;
  unsigned ColStart = Column;
  skip(1); // Eat the '*' or '&' indicator.
  while (Current != End) {
    // Flow indicators and ':' terminate the name.
    if (   *Current == '[' || *Current == ']'
        || *Current == '{' || *Current == '}'
        || *Current == ','
        || *Current == ':')
      break;
    StringRef::iterator i = skip_ns_char(Current);
    if (i == Current)
      break;
    Current = i;
    ++Column;
  }

  // Nothing was consumed after the indicator.
  if (Start + 1 == Current) {
    setError("Got empty alias or anchor", Start);
    return false;
  }

  Token T;
  T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
  T.Range = StringRef(Start, Current - Start);
  TokenQueue.push_back(T);

  // Alias and anchors can be simple keys.
1521 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1522 1523 IsSimpleKeyAllowed = false; 1524 1525 return true; 1526 } 1527 1528 char Scanner::scanBlockChompingIndicator() { 1529 char Indicator = ' '; 1530 if (Current != End && (*Current == '+' || *Current == '-')) { 1531 Indicator = *Current; 1532 skip(1); 1533 } 1534 return Indicator; 1535 } 1536 1537 /// Get the number of line breaks after chomping. 1538 /// 1539 /// Return the number of trailing line breaks to emit, depending on 1540 /// \p ChompingIndicator. 1541 static unsigned getChompedLineBreaks(char ChompingIndicator, 1542 unsigned LineBreaks, StringRef Str) { 1543 if (ChompingIndicator == '-') // Strip all line breaks. 1544 return 0; 1545 if (ChompingIndicator == '+') // Keep all line breaks. 1546 return LineBreaks; 1547 // Clip trailing lines. 1548 return Str.empty() ? 0 : 1; 1549 } 1550 1551 unsigned Scanner::scanBlockIndentationIndicator() { 1552 unsigned Indent = 0; 1553 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1554 Indent = unsigned(*Current - '0'); 1555 skip(1); 1556 } 1557 return Indent; 1558 } 1559 1560 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1561 unsigned &IndentIndicator, bool &IsDone) { 1562 auto Start = Current; 1563 1564 ChompingIndicator = scanBlockChompingIndicator(); 1565 IndentIndicator = scanBlockIndentationIndicator(); 1566 // Check for the chomping indicator once again. 1567 if (ChompingIndicator == ' ') 1568 ChompingIndicator = scanBlockChompingIndicator(); 1569 Current = skip_while(&Scanner::skip_s_white, Current); 1570 skipComment(); 1571 1572 if (Current == End) { // EOF, we have an empty scalar. 
1573 Token T; 1574 T.Kind = Token::TK_BlockScalar; 1575 T.Range = StringRef(Start, Current - Start); 1576 TokenQueue.push_back(T); 1577 IsDone = true; 1578 return true; 1579 } 1580 1581 if (!consumeLineBreakIfPresent()) { 1582 setError("Expected a line break after block scalar header", Current); 1583 return false; 1584 } 1585 return true; 1586 } 1587 1588 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1589 unsigned BlockExitIndent, 1590 unsigned &LineBreaks, bool &IsDone) { 1591 unsigned MaxAllSpaceLineCharacters = 0; 1592 StringRef::iterator LongestAllSpaceLine; 1593 1594 while (true) { 1595 advanceWhile(&Scanner::skip_s_space); 1596 if (skip_nb_char(Current) != Current) { 1597 // This line isn't empty, so try and find the indentation. 1598 if (Column <= BlockExitIndent) { // End of the block literal. 1599 IsDone = true; 1600 return true; 1601 } 1602 // We found the block's indentation. 1603 BlockIndent = Column; 1604 if (MaxAllSpaceLineCharacters > BlockIndent) { 1605 setError( 1606 "Leading all-spaces line must be smaller than the block indent", 1607 LongestAllSpaceLine); 1608 return false; 1609 } 1610 return true; 1611 } 1612 if (skip_b_break(Current) != Current && 1613 Column > MaxAllSpaceLineCharacters) { 1614 // Record the longest all-space line in case it's longer than the 1615 // discovered block indent. 1616 MaxAllSpaceLineCharacters = Column; 1617 LongestAllSpaceLine = Current; 1618 } 1619 1620 // Check for EOF. 1621 if (Current == End) { 1622 IsDone = true; 1623 return true; 1624 } 1625 1626 if (!consumeLineBreakIfPresent()) { 1627 IsDone = true; 1628 return true; 1629 } 1630 ++LineBreaks; 1631 } 1632 return true; 1633 } 1634 1635 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1636 unsigned BlockExitIndent, bool &IsDone) { 1637 // Skip the indentation. 
// Body of Scanner::scanBlockScalarIndent(): consume up to BlockIndent spaces.
  while (Column < BlockIndent) {
    auto I = skip_s_space(Current);
    if (I == Current)
      break;
    Current = I;
    ++Column;
  }

  // No printable character follows the indentation, so there is nothing to
  // check for this line.
  if (skip_nb_char(Current) == Current)
    return true;

  if (Column <= BlockExitIndent) { // End of the block literal.
    IsDone = true;
    return true;
  }

  if (Column < BlockIndent) {
    if (Current != End && *Current == '#') { // Trailing comment.
      IsDone = true;
      return true;
    }
    setError("A text line is less indented than the block scalar", Current);
    return false;
  }
  return true; // A normal text line.
}

/// Scans a block scalar introduced by '|' or '>' at Current.
///
/// NOTE(review): \p IsLiteral is not consulted in the portion of the function
/// shown here.
bool Scanner::scanBlockScalar(bool IsLiteral) {
  // Eat '|' or '>'
  assert(*Current == '|' || *Current == '>');
  skip(1);

  char ChompingIndicator;
  unsigned BlockIndent;
  bool IsDone = false;
  if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
    return false;
  // The header scan already produced the whole (empty) scalar.
  if (IsDone)
    return true;

  auto Start = Current;
  // A negative Indent is treated as column 0.
  unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
  unsigned LineBreaks = 0;
  // No explicit indentation indicator: detect it from the content.
  if (BlockIndent == 0) {
    if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
                               IsDone))
      return false;
  }

  // Scan the block's scalars body.
  SmallString<256> Str;
  while (!IsDone) {
    if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
      return false;
    if (IsDone)
      break;

    // Parse the current line.
    auto LineStart = Current;
    advanceWhile(&Scanner::skip_nb_char);
    if (LineStart != Current) {
      // Line breaks seen since the last content line are emitted only now,
      // in front of the next content line.
      Str.append(LineBreaks, '\n');
      Str.append(StringRef(LineStart, Current - LineStart));
      LineBreaks = 0;
    }

    // Check for EOF.
// Tail of Scanner::scanBlockScalar(): finish the line loop and emit the token.
    if (Current == End)
      break;

    // A line that is not terminated by a line break ends the scalar.
    if (!consumeLineBreakIfPresent())
      break;
    ++LineBreaks;
  }

  if (Current == End && !LineBreaks)
    // Ensure that there is at least one line break before the end of file.
    LineBreaks = 1;
  // Append the trailing line breaks that survive chomping.
  Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');

  // New lines may start a simple key.
  if (!FlowLevel)
    IsSimpleKeyAllowed = true;

  Token T;
  T.Kind = Token::TK_BlockScalar;
  T.Range = StringRef(Start, Current - Start);
  // The processed text travels in Value; Range only covers the source text.
  T.Value = std::string(Str);
  TokenQueue.push_back(T);
  return true;
}

/// Scans a tag starting at the '!' at Current and emits a TK_Tag token.
///
/// \returns false if a verbatim tag ('!<...>') is not closed by '>'.
bool Scanner::scanTag() {
  StringRef::iterator Start = Current;
  unsigned ColStart = Column;
  skip(1); // Eat !.
  if (Current == End || isBlankOrBreak(Current)); // An empty tag.
  else if (*Current == '<') {
    // Verbatim tag: '<' URI characters '>'.
    skip(1);
    scan_ns_uri_char();
    if (!consume('>'))
      return false;
  } else {
    // FIXME: Actually parse the c-ns-shorthand-tag rule.
    Current = skip_while(&Scanner::skip_ns_char, Current);
  }

  Token T;
  T.Kind = Token::TK_Tag;
  T.Range = StringRef(Start, Current - Start);
  TokenQueue.push_back(T);

  // Tags can be simple keys.
1751 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1752 1753 IsSimpleKeyAllowed = false; 1754 1755 return true; 1756 } 1757 1758 bool Scanner::fetchMoreTokens() { 1759 if (IsStartOfStream) 1760 return scanStreamStart(); 1761 1762 scanToNextToken(); 1763 1764 if (Current == End) 1765 return scanStreamEnd(); 1766 1767 removeStaleSimpleKeyCandidates(); 1768 1769 unrollIndent(Column); 1770 1771 if (Column == 0 && *Current == '%') 1772 return scanDirective(); 1773 1774 if (Column == 0 && Current + 4 <= End 1775 && *Current == '-' 1776 && *(Current + 1) == '-' 1777 && *(Current + 2) == '-' 1778 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1779 return scanDocumentIndicator(true); 1780 1781 if (Column == 0 && Current + 4 <= End 1782 && *Current == '.' 1783 && *(Current + 1) == '.' 1784 && *(Current + 2) == '.' 1785 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1786 return scanDocumentIndicator(false); 1787 1788 if (*Current == '[') 1789 return scanFlowCollectionStart(true); 1790 1791 if (*Current == '{') 1792 return scanFlowCollectionStart(false); 1793 1794 if (*Current == ']') 1795 return scanFlowCollectionEnd(true); 1796 1797 if (*Current == '}') 1798 return scanFlowCollectionEnd(false); 1799 1800 if (*Current == ',') 1801 return scanFlowEntry(); 1802 1803 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1804 return scanBlockEntry(); 1805 1806 if (*Current == '?' 
&& (FlowLevel || isBlankOrBreak(Current + 1))) 1807 return scanKey(); 1808 1809 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1810 return scanValue(); 1811 1812 if (*Current == '*') 1813 return scanAliasOrAnchor(true); 1814 1815 if (*Current == '&') 1816 return scanAliasOrAnchor(false); 1817 1818 if (*Current == '!') 1819 return scanTag(); 1820 1821 if (*Current == '|' && !FlowLevel) 1822 return scanBlockScalar(true); 1823 1824 if (*Current == '>' && !FlowLevel) 1825 return scanBlockScalar(false); 1826 1827 if (*Current == '\'') 1828 return scanFlowScalar(false); 1829 1830 if (*Current == '"') 1831 return scanFlowScalar(true); 1832 1833 // Get a plain scalar. 1834 StringRef FirstChar(Current, 1); 1835 if (!(isBlankOrBreak(Current) 1836 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1837 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1838 || (!FlowLevel && (*Current == '?' || *Current == ':') 1839 && isBlankOrBreak(Current + 1)) 1840 || (!FlowLevel && *Current == ':' 1841 && Current + 2 < End 1842 && *(Current + 1) == ':' 1843 && !isBlankOrBreak(Current + 2))) 1844 return scanPlainScalar(); 1845 1846 setError("Unrecognized character while tokenizing.", Current); 1847 return false; 1848 } 1849 1850 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1851 std::error_code *EC) 1852 : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {} 1853 1854 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1855 std::error_code *EC) 1856 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} 1857 1858 Stream::~Stream() = default; 1859 1860 bool Stream::failed() { return scanner->failed(); } 1861 1862 void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) { 1863 printError(N ? 
N->getSourceRange() : SMRange(), Msg, Kind); 1864 } 1865 1866 void Stream::printError(const SMRange &Range, const Twine &Msg, 1867 SourceMgr::DiagKind Kind) { 1868 scanner->printError(Range.Start, Kind, Msg, Range); 1869 } 1870 1871 document_iterator Stream::begin() { 1872 if (CurrentDoc) 1873 report_fatal_error("Can only iterate over the stream once"); 1874 1875 // Skip Stream-Start. 1876 scanner->getNext(); 1877 1878 CurrentDoc.reset(new Document(*this)); 1879 return document_iterator(CurrentDoc); 1880 } 1881 1882 document_iterator Stream::end() { 1883 return document_iterator(); 1884 } 1885 1886 void Stream::skip() { 1887 for (document_iterator i = begin(), e = end(); i != e; ++i) 1888 i->skip(); 1889 } 1890 1891 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1892 StringRef T) 1893 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1894 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1895 SourceRange = SMRange(Start, Start); 1896 } 1897 1898 std::string Node::getVerbatimTag() const { 1899 StringRef Raw = getRawTag(); 1900 if (!Raw.empty() && Raw != "!") { 1901 std::string Ret; 1902 if (Raw.find_last_of('!') == 0) { 1903 Ret = std::string(Doc->getTagMap().find("!")->second); 1904 Ret += Raw.substr(1); 1905 return Ret; 1906 } else if (Raw.startswith("!!")) { 1907 Ret = std::string(Doc->getTagMap().find("!!")->second); 1908 Ret += Raw.substr(2); 1909 return Ret; 1910 } else { 1911 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1912 std::map<StringRef, StringRef>::const_iterator It = 1913 Doc->getTagMap().find(TagHandle); 1914 if (It != Doc->getTagMap().end()) 1915 Ret = std::string(It->second); 1916 else { 1917 Token T; 1918 T.Kind = Token::TK_Tag; 1919 T.Range = TagHandle; 1920 setError(Twine("Unknown tag handle ") + TagHandle, T); 1921 } 1922 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1923 return Ret; 1924 } 1925 } 1926 1927 switch (getType()) { 1928 case NK_Null: 1929 return "tag:yaml.org,2002:null"; 1930 
// Tail of Node::getVerbatimTag(): default tags for the remaining node kinds.
  case NK_Scalar:
  case NK_BlockScalar:
    // TODO: Tag resolution.
    return "tag:yaml.org,2002:str";
  case NK_Mapping:
    return "tag:yaml.org,2002:map";
  case NK_Sequence:
    return "tag:yaml.org,2002:seq";
  }

  // None of the cases above matched.
  return "";
}

/// Forwards to Document::peekNext().
Token &Node::peekNext() {
  return Doc->peekNext();
}

/// Forwards to Document::getNext().
Token Node::getNext() {
  return Doc->getNext();
}

/// Forwards to Document::parseBlockNode().
Node *Node::parseBlockNode() {
  return Doc->parseBlockNode();
}

/// Returns the owning document's node allocator.
BumpPtrAllocator &Node::getAllocator() {
  return Doc->NodeAllocator;
}

/// Forwards to Document::setError().
void Node::setError(const Twine &Msg, Token &Tok) const {
  Doc->setError(Msg, Tok);
}

/// Forwards to Document::failed().
bool Node::failed() const {
  return Doc->failed();
}

/// Returns the value of this scalar with quotes removed and escapes resolved.
///
/// \param Storage scratch buffer; the result refers to it when the value had
///                to be rewritten, otherwise it refers to the original text.
StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
  // TODO: Handle newlines properly. We need to remove leading whitespace.
  // NOTE(review): Value[0] assumes Value is non-empty -- TODO confirm.
  if (Value[0] == '"') { // Double quoted.
    // Pull off the leading and trailing "s.
    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
    // Search for characters that would require unescaping the value.
    StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
    if (i != StringRef::npos)
      return unescapeDoubleQuoted(UnquotedValue, i, Storage);
    return UnquotedValue;
  } else if (Value[0] == '\'') { // Single quoted.
    // Pull off the leading and trailing 's.
    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
    StringRef::size_type i = UnquotedValue.find('\'');
    if (i != StringRef::npos) {
      // We're going to need Storage.
// Tail of ScalarNode::getValue(): collapse each quote found in a
// single-quoted value into one '.
      Storage.clear();
      Storage.reserve(UnquotedValue.size());
      for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
        // Copy the text in front of the quote, emit one quote, and drop two
        // characters from the input.
        StringRef Valid(UnquotedValue.begin(), i);
        llvm::append_range(Storage, Valid);
        Storage.push_back('\'');
        UnquotedValue = UnquotedValue.substr(i + 2);
      }
      llvm::append_range(Storage, UnquotedValue);
      return StringRef(Storage.begin(), Storage.size());
    }
    return UnquotedValue;
  }
  // Plain or block.
  return Value.rtrim(' ');
}

/// Resolves escapes and line breaks of a double-quoted scalar into
/// \p Storage.
///
/// \param UnquotedValue the scalar text without the surrounding quotes.
/// \param i             index of the first '\\', CR or LF in
///                      \p UnquotedValue.
/// \param Storage       receives the result; the return value refers to it.
/// \returns the unescaped text, or "" after reporting an unrecognized escape.
StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
                                          , StringRef::size_type i
                                          , SmallVectorImpl<char> &Storage)
                                          const {
  // Use Storage to build proper value.
  Storage.clear();
  Storage.reserve(UnquotedValue.size());
  for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
    // Insert all previous chars into Storage.
    StringRef Valid(UnquotedValue.begin(), i);
    llvm::append_range(Storage, Valid);
    // Chop off inserted chars.
    UnquotedValue = UnquotedValue.substr(i);

    assert(!UnquotedValue.empty() && "Can't be empty!");

    // Parse escape or line break.
    switch (UnquotedValue[0]) {
    case '\r':
    case '\n':
      // An unescaped line break becomes a single '\n'. A directly following
      // CR or LF is dropped together with it.
      Storage.push_back('\n');
      if (   UnquotedValue.size() > 1
          && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
        UnquotedValue = UnquotedValue.substr(1);
      UnquotedValue = UnquotedValue.substr(1);
      break;
    default:
      // A backslash. It must be followed by at least one more character.
      if (UnquotedValue.size() == 1) {
        Token T;
        T.Range = StringRef(UnquotedValue.begin(), 1);
        setError("Unrecognized escape code", T);
        return "";
      }
      // Drop the backslash; UnquotedValue[0] is now the escape character.
      UnquotedValue = UnquotedValue.substr(1);
      switch (UnquotedValue[0]) {
      default: {
          Token T;
          T.Range = StringRef(UnquotedValue.begin(), 1);
          setError("Unrecognized escape code", T);
          return "";
        }
      case '\r':
      case '\n':
        // Remove the new line.
// Middle of ScalarNode::unescapeDoubleQuoted(): the escape character table.
        // Escaped line break: nothing is emitted. A directly following CR or
        // LF is dropped here as well.
        if (   UnquotedValue.size() > 1
            && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
          UnquotedValue = UnquotedValue.substr(1);
        // If this was just a single byte newline, it will get skipped
        // below.
        break;
      // Single-character escapes that map to one byte.
      case '0':
        Storage.push_back(0x00);
        break;
      case 'a':
        Storage.push_back(0x07);
        break;
      case 'b':
        Storage.push_back(0x08);
        break;
      case 't':
      case 0x09:
        Storage.push_back(0x09);
        break;
      case 'n':
        Storage.push_back(0x0A);
        break;
      case 'v':
        Storage.push_back(0x0B);
        break;
      case 'f':
        Storage.push_back(0x0C);
        break;
      case 'r':
        Storage.push_back(0x0D);
        break;
      case 'e':
        Storage.push_back(0x1B);
        break;
      case ' ':
        Storage.push_back(0x20);
        break;
      case '"':
        Storage.push_back(0x22);
        break;
      case '/':
        Storage.push_back(0x2F);
        break;
      case '\\':
        Storage.push_back(0x5C);
        break;
      // Single-character escapes that are emitted through encodeUTF8().
      case 'N':
        encodeUTF8(0x85, Storage);
        break;
      case '_':
        encodeUTF8(0xA0, Storage);
        break;
      case 'L':
        encodeUTF8(0x2028, Storage);
        break;
      case 'P':
        encodeUTF8(0x2029, Storage);
        break;
      // \xNN: two hex digits. If fewer than two characters follow, nothing is
      // emitted for this escape.
      case 'x': {
          if (UnquotedValue.size() < 3)
            // TODO: Report error.
            break;
          unsigned int UnicodeScalarValue;
          // getAsInteger() returns true on failure; fall back to 0xFFFD.
          if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
            // TODO: Report error.
            UnicodeScalarValue = 0xFFFD;
          encodeUTF8(UnicodeScalarValue, Storage);
          UnquotedValue = UnquotedValue.substr(2);
          break;
        }
      // \uNNNN: four hex digits, handled like \x.
      case 'u': {
          if (UnquotedValue.size() < 5)
            // TODO: Report error.
            break;
          unsigned int UnicodeScalarValue;
          if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
            // TODO: Report error.
2121 UnicodeScalarValue = 0xFFFD; 2122 encodeUTF8(UnicodeScalarValue, Storage); 2123 UnquotedValue = UnquotedValue.substr(4); 2124 break; 2125 } 2126 case 'U': { 2127 if (UnquotedValue.size() < 9) 2128 // TODO: Report error. 2129 break; 2130 unsigned int UnicodeScalarValue; 2131 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2132 // TODO: Report error. 2133 UnicodeScalarValue = 0xFFFD; 2134 encodeUTF8(UnicodeScalarValue, Storage); 2135 UnquotedValue = UnquotedValue.substr(8); 2136 break; 2137 } 2138 } 2139 UnquotedValue = UnquotedValue.substr(1); 2140 } 2141 } 2142 llvm::append_range(Storage, UnquotedValue); 2143 return StringRef(Storage.begin(), Storage.size()); 2144 } 2145 2146 Node *KeyValueNode::getKey() { 2147 if (Key) 2148 return Key; 2149 // Handle implicit null keys. 2150 { 2151 Token &t = peekNext(); 2152 if ( t.Kind == Token::TK_BlockEnd 2153 || t.Kind == Token::TK_Value 2154 || t.Kind == Token::TK_Error) { 2155 return Key = new (getAllocator()) NullNode(Doc); 2156 } 2157 if (t.Kind == Token::TK_Key) 2158 getNext(); // skip TK_Key. 2159 } 2160 2161 // Handle explicit null keys. 2162 Token &t = peekNext(); 2163 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2164 return Key = new (getAllocator()) NullNode(Doc); 2165 } 2166 2167 // We've got a normal key. 2168 return Key = parseBlockNode(); 2169 } 2170 2171 Node *KeyValueNode::getValue() { 2172 if (Value) 2173 return Value; 2174 2175 if (Node* Key = getKey()) 2176 Key->skip(); 2177 else { 2178 setError("Null key in Key Value.", peekNext()); 2179 return Value = new (getAllocator()) NullNode(Doc); 2180 } 2181 2182 if (failed()) 2183 return Value = new (getAllocator()) NullNode(Doc); 2184 2185 // Handle implicit null values. 
// Tail of KeyValueNode::getValue(): decide between a null and a real value.
  {
    Token &t = peekNext();
    // Any of these tokens means the entry ended without a value.
    if (   t.Kind == Token::TK_BlockEnd
        || t.Kind == Token::TK_FlowMappingEnd
        || t.Kind == Token::TK_Key
        || t.Kind == Token::TK_FlowEntry
        || t.Kind == Token::TK_Error) {
      return Value = new (getAllocator()) NullNode(Doc);
    }

    if (t.Kind != Token::TK_Value) {
      setError("Unexpected token in Key Value.", t);
      return Value = new (getAllocator()) NullNode(Doc);
    }
    getNext(); // skip TK_Value.
  }

  // Handle explicit null values.
  Token &t = peekNext();
  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
    return Value = new (getAllocator()) NullNode(Doc);
  }

  // We got a normal value.
  return Value = parseBlockNode();
}

/// Advances the mapping iterator to the next key/value entry, or marks it as
/// being at the end (IsAtEnd set, CurrentEntry null).
void MappingNode::increment() {
  if (failed()) {
    IsAtEnd = true;
    CurrentEntry = nullptr;
    return;
  }
  if (CurrentEntry) {
    // Consume whatever is left of the current entry before looking ahead.
    CurrentEntry->skip();
    // An inline mapping ends after its first entry.
    if (Type == MT_Inline) {
      IsAtEnd = true;
      CurrentEntry = nullptr;
      return;
    }
  }
  Token T = peekNext();
  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
    // KeyValueNode eats the TK_Key. That way it can detect null keys.
    CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
  } else if (Type == MT_Block) {
    switch (T.Kind) {
    case Token::TK_BlockEnd:
      getNext();
      IsAtEnd = true;
      CurrentEntry = nullptr;
      break;
    default:
      setError("Unexpected token. Expected Key or Block End", T);
      LLVM_FALLTHROUGH;
    case Token::TK_Error:
      IsAtEnd = true;
      CurrentEntry = nullptr;
    }
  } else {
    switch (T.Kind) {
    case Token::TK_FlowEntry:
      // Eat the flow entry and recurse.
      getNext();
      return increment();
    case Token::TK_FlowMappingEnd:
      getNext();
      LLVM_FALLTHROUGH;
    case Token::TK_Error:
      // Set this to end iterator.
2256 IsAtEnd = true; 2257 CurrentEntry = nullptr; 2258 break; 2259 default: 2260 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2261 "Mapping End." 2262 , T); 2263 IsAtEnd = true; 2264 CurrentEntry = nullptr; 2265 } 2266 } 2267 } 2268 2269 void SequenceNode::increment() { 2270 if (failed()) { 2271 IsAtEnd = true; 2272 CurrentEntry = nullptr; 2273 return; 2274 } 2275 if (CurrentEntry) 2276 CurrentEntry->skip(); 2277 Token T = peekNext(); 2278 if (SeqType == ST_Block) { 2279 switch (T.Kind) { 2280 case Token::TK_BlockEntry: 2281 getNext(); 2282 CurrentEntry = parseBlockNode(); 2283 if (!CurrentEntry) { // An error occurred. 2284 IsAtEnd = true; 2285 CurrentEntry = nullptr; 2286 } 2287 break; 2288 case Token::TK_BlockEnd: 2289 getNext(); 2290 IsAtEnd = true; 2291 CurrentEntry = nullptr; 2292 break; 2293 default: 2294 setError( "Unexpected token. Expected Block Entry or Block End." 2295 , T); 2296 LLVM_FALLTHROUGH; 2297 case Token::TK_Error: 2298 IsAtEnd = true; 2299 CurrentEntry = nullptr; 2300 } 2301 } else if (SeqType == ST_Indentless) { 2302 switch (T.Kind) { 2303 case Token::TK_BlockEntry: 2304 getNext(); 2305 CurrentEntry = parseBlockNode(); 2306 if (!CurrentEntry) { // An error occurred. 2307 IsAtEnd = true; 2308 CurrentEntry = nullptr; 2309 } 2310 break; 2311 default: 2312 case Token::TK_Error: 2313 IsAtEnd = true; 2314 CurrentEntry = nullptr; 2315 } 2316 } else if (SeqType == ST_Flow) { 2317 switch (T.Kind) { 2318 case Token::TK_FlowEntry: 2319 // Eat the flow entry and recurse. 2320 getNext(); 2321 WasPreviousTokenFlowEntry = true; 2322 return increment(); 2323 case Token::TK_FlowSequenceEnd: 2324 getNext(); 2325 LLVM_FALLTHROUGH; 2326 case Token::TK_Error: 2327 // Set this to end iterator. 2328 IsAtEnd = true; 2329 CurrentEntry = nullptr; 2330 break; 2331 case Token::TK_StreamEnd: 2332 case Token::TK_DocumentEnd: 2333 case Token::TK_DocumentStart: 2334 setError("Could not find closing ]!", T); 2335 // Set this to end iterator. 
2336 IsAtEnd = true; 2337 CurrentEntry = nullptr; 2338 break; 2339 default: 2340 if (!WasPreviousTokenFlowEntry) { 2341 setError("Expected , between entries!", T); 2342 IsAtEnd = true; 2343 CurrentEntry = nullptr; 2344 break; 2345 } 2346 // Otherwise it must be a flow entry. 2347 CurrentEntry = parseBlockNode(); 2348 if (!CurrentEntry) { 2349 IsAtEnd = true; 2350 } 2351 WasPreviousTokenFlowEntry = false; 2352 break; 2353 } 2354 } 2355 } 2356 2357 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2358 // Tag maps starts with two default mappings. 2359 TagMap["!"] = "!"; 2360 TagMap["!!"] = "tag:yaml.org,2002:"; 2361 2362 if (parseDirectives()) 2363 expectToken(Token::TK_DocumentStart); 2364 Token &T = peekNext(); 2365 if (T.Kind == Token::TK_DocumentStart) 2366 getNext(); 2367 } 2368 2369 bool Document::skip() { 2370 if (stream.scanner->failed()) 2371 return false; 2372 if (!Root && !getRoot()) 2373 return false; 2374 Root->skip(); 2375 Token &T = peekNext(); 2376 if (T.Kind == Token::TK_StreamEnd) 2377 return false; 2378 if (T.Kind == Token::TK_DocumentEnd) { 2379 getNext(); 2380 return skip(); 2381 } 2382 return true; 2383 } 2384 2385 Token &Document::peekNext() { 2386 return stream.scanner->peekNext(); 2387 } 2388 2389 Token Document::getNext() { 2390 return stream.scanner->getNext(); 2391 } 2392 2393 void Document::setError(const Twine &Message, Token &Location) const { 2394 stream.scanner->setError(Message, Location.Range.begin()); 2395 } 2396 2397 bool Document::failed() const { 2398 return stream.scanner->failed(); 2399 } 2400 2401 Node *Document::parseBlockNode() { 2402 Token T = peekNext(); 2403 // Handle properties. 
2404 Token AnchorInfo; 2405 Token TagInfo; 2406 parse_property: 2407 switch (T.Kind) { 2408 case Token::TK_Alias: 2409 getNext(); 2410 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2411 case Token::TK_Anchor: 2412 if (AnchorInfo.Kind == Token::TK_Anchor) { 2413 setError("Already encountered an anchor for this node!", T); 2414 return nullptr; 2415 } 2416 AnchorInfo = getNext(); // Consume TK_Anchor. 2417 T = peekNext(); 2418 goto parse_property; 2419 case Token::TK_Tag: 2420 if (TagInfo.Kind == Token::TK_Tag) { 2421 setError("Already encountered a tag for this node!", T); 2422 return nullptr; 2423 } 2424 TagInfo = getNext(); // Consume TK_Tag. 2425 T = peekNext(); 2426 goto parse_property; 2427 default: 2428 break; 2429 } 2430 2431 switch (T.Kind) { 2432 case Token::TK_BlockEntry: 2433 // We got an unindented BlockEntry sequence. This is not terminated with 2434 // a BlockEnd. 2435 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2436 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2437 , AnchorInfo.Range.substr(1) 2438 , TagInfo.Range 2439 , SequenceNode::ST_Indentless); 2440 case Token::TK_BlockSequenceStart: 2441 getNext(); 2442 return new (NodeAllocator) 2443 SequenceNode( stream.CurrentDoc 2444 , AnchorInfo.Range.substr(1) 2445 , TagInfo.Range 2446 , SequenceNode::ST_Block); 2447 case Token::TK_BlockMappingStart: 2448 getNext(); 2449 return new (NodeAllocator) 2450 MappingNode( stream.CurrentDoc 2451 , AnchorInfo.Range.substr(1) 2452 , TagInfo.Range 2453 , MappingNode::MT_Block); 2454 case Token::TK_FlowSequenceStart: 2455 getNext(); 2456 return new (NodeAllocator) 2457 SequenceNode( stream.CurrentDoc 2458 , AnchorInfo.Range.substr(1) 2459 , TagInfo.Range 2460 , SequenceNode::ST_Flow); 2461 case Token::TK_FlowMappingStart: 2462 getNext(); 2463 return new (NodeAllocator) 2464 MappingNode( stream.CurrentDoc 2465 , AnchorInfo.Range.substr(1) 2466 , TagInfo.Range 2467 , MappingNode::MT_Flow); 2468 case 
Token::TK_Scalar: 2469 getNext(); 2470 return new (NodeAllocator) 2471 ScalarNode( stream.CurrentDoc 2472 , AnchorInfo.Range.substr(1) 2473 , TagInfo.Range 2474 , T.Range); 2475 case Token::TK_BlockScalar: { 2476 getNext(); 2477 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2478 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2479 return new (NodeAllocator) 2480 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2481 TagInfo.Range, StrCopy, T.Range); 2482 } 2483 case Token::TK_Key: 2484 // Don't eat the TK_Key, KeyValueNode expects it. 2485 return new (NodeAllocator) 2486 MappingNode( stream.CurrentDoc 2487 , AnchorInfo.Range.substr(1) 2488 , TagInfo.Range 2489 , MappingNode::MT_Inline); 2490 case Token::TK_DocumentStart: 2491 case Token::TK_DocumentEnd: 2492 case Token::TK_StreamEnd: 2493 default: 2494 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2495 // !!null null. 2496 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2497 case Token::TK_FlowMappingEnd: 2498 case Token::TK_FlowSequenceEnd: 2499 case Token::TK_FlowEntry: { 2500 if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root))) 2501 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2502 2503 setError("Unexpected token", T); 2504 return nullptr; 2505 } 2506 case Token::TK_Error: 2507 return nullptr; 2508 } 2509 llvm_unreachable("Control flow shouldn't reach here."); 2510 return nullptr; 2511 } 2512 2513 bool Document::parseDirectives() { 2514 bool isDirective = false; 2515 while (true) { 2516 Token T = peekNext(); 2517 if (T.Kind == Token::TK_TagDirective) { 2518 parseTAGDirective(); 2519 isDirective = true; 2520 } else if (T.Kind == Token::TK_VersionDirective) { 2521 parseYAMLDirective(); 2522 isDirective = true; 2523 } else 2524 break; 2525 } 2526 return isDirective; 2527 } 2528 2529 void Document::parseYAMLDirective() { 2530 getNext(); // Eat %YAML <version> 2531 } 2532 2533 void 
Document::parseTAGDirective() { 2534 Token Tag = getNext(); // %TAG <handle> <prefix> 2535 StringRef T = Tag.Range; 2536 // Strip %TAG 2537 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2538 std::size_t HandleEnd = T.find_first_of(" \t"); 2539 StringRef TagHandle = T.substr(0, HandleEnd); 2540 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2541 TagMap[TagHandle] = TagPrefix; 2542 } 2543 2544 bool Document::expectToken(int TK) { 2545 Token T = getNext(); 2546 if (T.Kind != TK) { 2547 setError("Unexpected token", T); 2548 return false; 2549 } 2550 return true; 2551 } 2552