xref: /llvm-project/llvm/lib/Support/YAMLParser.cpp (revision aadaaface2ec96ee30d92bf46faa41dd9e68b64d)
1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //  This file implements a YAML parser.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/YAMLParser.h"
14 #include "llvm/ADT/AllocatorList.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/None.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/SmallString.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/StringExtras.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/ADT/Twine.h"
23 #include "llvm/Support/Compiler.h"
24 #include "llvm/Support/ErrorHandling.h"
25 #include "llvm/Support/MemoryBuffer.h"
26 #include "llvm/Support/SMLoc.h"
27 #include "llvm/Support/SourceMgr.h"
28 #include "llvm/Support/Unicode.h"
29 #include "llvm/Support/raw_ostream.h"
30 #include <cassert>
31 #include <cstddef>
32 #include <cstdint>
33 #include <map>
34 #include <memory>
35 #include <string>
36 #include <system_error>
37 #include <utility>
38 
39 using namespace llvm;
40 using namespace yaml;
41 
42 enum UnicodeEncodingForm {
43   UEF_UTF32_LE, ///< UTF-32 Little Endian
44   UEF_UTF32_BE, ///< UTF-32 Big Endian
45   UEF_UTF16_LE, ///< UTF-16 Little Endian
46   UEF_UTF16_BE, ///< UTF-16 Big Endian
47   UEF_UTF8,     ///< UTF-8 or ascii.
48   UEF_Unknown   ///< Not a valid Unicode encoding.
49 };
50 
51 /// EncodingInfo - Holds the encoding type and length of the byte order mark if
52 ///                it exists. Length is in {0, 2, 3, 4}.
53 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;
54 
55 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
56 ///                      encoding form of \a Input.
57 ///
58 /// @param Input A string of length 0 or more.
59 /// @returns An EncodingInfo indicating the Unicode encoding form of the input
60 ///          and how long the byte order mark is if one exists.
61 static EncodingInfo getUnicodeEncoding(StringRef Input) {
62   if (Input.empty())
63     return std::make_pair(UEF_Unknown, 0);
64 
65   switch (uint8_t(Input[0])) {
66   case 0x00:
67     if (Input.size() >= 4) {
68       if (  Input[1] == 0
69          && uint8_t(Input[2]) == 0xFE
70          && uint8_t(Input[3]) == 0xFF)
71         return std::make_pair(UEF_UTF32_BE, 4);
72       if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
73         return std::make_pair(UEF_UTF32_BE, 0);
74     }
75 
76     if (Input.size() >= 2 && Input[1] != 0)
77       return std::make_pair(UEF_UTF16_BE, 0);
78     return std::make_pair(UEF_Unknown, 0);
79   case 0xFF:
80     if (  Input.size() >= 4
81        && uint8_t(Input[1]) == 0xFE
82        && Input[2] == 0
83        && Input[3] == 0)
84       return std::make_pair(UEF_UTF32_LE, 4);
85 
86     if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
87       return std::make_pair(UEF_UTF16_LE, 2);
88     return std::make_pair(UEF_Unknown, 0);
89   case 0xFE:
90     if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
91       return std::make_pair(UEF_UTF16_BE, 2);
92     return std::make_pair(UEF_Unknown, 0);
93   case 0xEF:
94     if (  Input.size() >= 3
95        && uint8_t(Input[1]) == 0xBB
96        && uint8_t(Input[2]) == 0xBF)
97       return std::make_pair(UEF_UTF8, 3);
98     return std::make_pair(UEF_Unknown, 0);
99   }
100 
101   // It could still be utf-32 or utf-16.
102   if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
103     return std::make_pair(UEF_UTF32_LE, 0);
104 
105   if (Input.size() >= 2 && Input[1] == 0)
106     return std::make_pair(UEF_UTF16_LE, 0);
107 
108   return std::make_pair(UEF_UTF8, 0);
109 }
110 
111 /// Pin the vtables to this file.
112 void Node::anchor() {}
113 void NullNode::anchor() {}
114 void ScalarNode::anchor() {}
115 void BlockScalarNode::anchor() {}
116 void KeyValueNode::anchor() {}
117 void MappingNode::anchor() {}
118 void SequenceNode::anchor() {}
119 void AliasNode::anchor() {}
120 
121 namespace llvm {
122 namespace yaml {
123 
124 /// Token - A single YAML token.
125 struct Token {
126   enum TokenKind {
127     TK_Error, // Uninitialized token.
128     TK_StreamStart,
129     TK_StreamEnd,
130     TK_VersionDirective,
131     TK_TagDirective,
132     TK_DocumentStart,
133     TK_DocumentEnd,
134     TK_BlockEntry,
135     TK_BlockEnd,
136     TK_BlockSequenceStart,
137     TK_BlockMappingStart,
138     TK_FlowEntry,
139     TK_FlowSequenceStart,
140     TK_FlowSequenceEnd,
141     TK_FlowMappingStart,
142     TK_FlowMappingEnd,
143     TK_Key,
144     TK_Value,
145     TK_Scalar,
146     TK_BlockScalar,
147     TK_Alias,
148     TK_Anchor,
149     TK_Tag
150   } Kind = TK_Error;
151 
152   /// A string of length 0 or more whose begin() points to the logical location
153   /// of the token in the input.
154   StringRef Range;
155 
156   /// The value of a block scalar node.
157   std::string Value;
158 
159   Token() = default;
160 };
161 
162 } // end namespace yaml
163 } // end namespace llvm
164 
165 using TokenQueueT = BumpPtrList<Token>;
166 
167 namespace {
168 
169 /// This struct is used to track simple keys.
170 ///
171 /// Simple keys are handled by creating an entry in SimpleKeys for each Token
172 /// which could legally be the start of a simple key. When peekNext is called,
173 /// if the Token To be returned is referenced by a SimpleKey, we continue
174 /// tokenizing until that potential simple key has either been found to not be
175 /// a simple key (we moved on to the next line or went further than 1024 chars).
176 /// Or when we run into a Value, and then insert a Key token (and possibly
177 /// others) before the SimpleKey's Tok.
178 struct SimpleKey {
179   TokenQueueT::iterator Tok;
180   unsigned Column = 0;
181   unsigned Line = 0;
182   unsigned FlowLevel = 0;
183   bool IsRequired = false;
184 
185   bool operator ==(const SimpleKey &Other) {
186     return Tok == Other.Tok;
187   }
188 };
189 
190 } // end anonymous namespace
191 
192 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit
193 ///        subsequence and the subsequence's length in code units (uint8_t).
194 ///        A length of 0 represents an error.
195 using UTF8Decoded = std::pair<uint32_t, unsigned>;
196 
197 static UTF8Decoded decodeUTF8(StringRef Range) {
198   StringRef::iterator Position= Range.begin();
199   StringRef::iterator End = Range.end();
200   // 1 byte: [0x00, 0x7f]
201   // Bit pattern: 0xxxxxxx
202   if (Position < End && (*Position & 0x80) == 0) {
203     return std::make_pair(*Position, 1);
204   }
205   // 2 bytes: [0x80, 0x7ff]
206   // Bit pattern: 110xxxxx 10xxxxxx
207   if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) &&
208       ((*(Position + 1) & 0xC0) == 0x80)) {
209     uint32_t codepoint = ((*Position & 0x1F) << 6) |
210                           (*(Position + 1) & 0x3F);
211     if (codepoint >= 0x80)
212       return std::make_pair(codepoint, 2);
213   }
214   // 3 bytes: [0x8000, 0xffff]
215   // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
216   if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) &&
217       ((*(Position + 1) & 0xC0) == 0x80) &&
218       ((*(Position + 2) & 0xC0) == 0x80)) {
219     uint32_t codepoint = ((*Position & 0x0F) << 12) |
220                          ((*(Position + 1) & 0x3F) << 6) |
221                           (*(Position + 2) & 0x3F);
222     // Codepoints between 0xD800 and 0xDFFF are invalid, as
223     // they are high / low surrogate halves used by UTF-16.
224     if (codepoint >= 0x800 &&
225         (codepoint < 0xD800 || codepoint > 0xDFFF))
226       return std::make_pair(codepoint, 3);
227   }
228   // 4 bytes: [0x10000, 0x10FFFF]
229   // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
230   if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) &&
231       ((*(Position + 1) & 0xC0) == 0x80) &&
232       ((*(Position + 2) & 0xC0) == 0x80) &&
233       ((*(Position + 3) & 0xC0) == 0x80)) {
234     uint32_t codepoint = ((*Position & 0x07) << 18) |
235                          ((*(Position + 1) & 0x3F) << 12) |
236                          ((*(Position + 2) & 0x3F) << 6) |
237                           (*(Position + 3) & 0x3F);
238     if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
239       return std::make_pair(codepoint, 4);
240   }
241   return std::make_pair(0, 0);
242 }
243 
244 namespace llvm {
245 namespace yaml {
246 
247 /// Scans YAML tokens from a MemoryBuffer.
248 class Scanner {
249 public:
250   Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true,
251           std::error_code *EC = nullptr);
252   Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true,
253           std::error_code *EC = nullptr);
254 
255   /// Parse the next token and return it without popping it.
256   Token &peekNext();
257 
258   /// Parse the next token and pop it from the queue.
259   Token getNext();
260 
261   void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
262                   ArrayRef<SMRange> Ranges = std::nullopt) {
263     SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ std::nullopt,
264                     ShowColors);
265   }
266 
267   void setError(const Twine &Message, StringRef::iterator Position) {
268     if (Position >= End)
269       Position = End - 1;
270 
271     // propagate the error if possible
272     if (EC)
273       *EC = make_error_code(std::errc::invalid_argument);
274 
275     // Don't print out more errors after the first one we encounter. The rest
276     // are just the result of the first, and have no meaning.
277     if (!Failed)
278       printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message);
279     Failed = true;
280   }
281 
282   /// Returns true if an error occurred while parsing.
283   bool failed() {
284     return Failed;
285   }
286 
287 private:
288   void init(MemoryBufferRef Buffer);
289 
290   StringRef currentInput() {
291     return StringRef(Current, End - Current);
292   }
293 
294   /// Decode a UTF-8 minimal well-formed code unit subsequence starting
295   ///        at \a Position.
296   ///
297   /// If the UTF-8 code units starting at Position do not form a well-formed
298   /// code unit subsequence, then the Unicode scalar value is 0, and the length
299   /// is 0.
300   UTF8Decoded decodeUTF8(StringRef::iterator Position) {
301     return ::decodeUTF8(StringRef(Position, End - Position));
302   }
303 
304   // The following functions are based on the gramar rules in the YAML spec. The
305   // style of the function names it meant to closely match how they are written
306   // in the spec. The number within the [] is the number of the grammar rule in
307   // the spec.
308   //
309   // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
310   //
311   // c-
312   //   A production starting and ending with a special character.
313   // b-
314   //   A production matching a single line break.
315   // nb-
316   //   A production starting and ending with a non-break character.
317   // s-
318   //   A production starting and ending with a white space character.
319   // ns-
320   //   A production starting and ending with a non-space character.
321   // l-
322   //   A production matching complete line(s).
323 
324   /// Skip a single nb-char[27] starting at Position.
325   ///
326   /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
327   ///                  | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
328   ///
329   /// @returns The code unit after the nb-char, or Position if it's not an
330   ///          nb-char.
331   StringRef::iterator skip_nb_char(StringRef::iterator Position);
332 
333   /// Skip a single b-break[28] starting at Position.
334   ///
335   /// A b-break is 0xD 0xA | 0xD | 0xA
336   ///
337   /// @returns The code unit after the b-break, or Position if it's not a
338   ///          b-break.
339   StringRef::iterator skip_b_break(StringRef::iterator Position);
340 
341   /// Skip a single s-space[31] starting at Position.
342   ///
343   /// An s-space is 0x20
344   ///
345   /// @returns The code unit after the s-space, or Position if it's not a
346   ///          s-space.
347   StringRef::iterator skip_s_space(StringRef::iterator Position);
348 
349   /// Skip a single s-white[33] starting at Position.
350   ///
351   /// A s-white is 0x20 | 0x9
352   ///
353   /// @returns The code unit after the s-white, or Position if it's not a
354   ///          s-white.
355   StringRef::iterator skip_s_white(StringRef::iterator Position);
356 
357   /// Skip a single ns-char[34] starting at Position.
358   ///
359   /// A ns-char is nb-char - s-white
360   ///
361   /// @returns The code unit after the ns-char, or Position if it's not a
362   ///          ns-char.
363   StringRef::iterator skip_ns_char(StringRef::iterator Position);
364 
365   using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator);
366 
367   /// Skip minimal well-formed code unit subsequences until Func
368   ///        returns its input.
369   ///
370   /// @returns The code unit after the last minimal well-formed code unit
371   ///          subsequence that Func accepted.
372   StringRef::iterator skip_while( SkipWhileFunc Func
373                                 , StringRef::iterator Position);
374 
375   /// Skip minimal well-formed code unit subsequences until Func returns its
376   /// input.
377   void advanceWhile(SkipWhileFunc Func);
378 
379   /// Scan ns-uri-char[39]s starting at Cur.
380   ///
381   /// This updates Cur and Column while scanning.
382   void scan_ns_uri_char();
383 
384   /// Consume a minimal well-formed code unit subsequence starting at
385   ///        \a Cur. Return false if it is not the same Unicode scalar value as
386   ///        \a Expected. This updates \a Column.
387   bool consume(uint32_t Expected);
388 
389   /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
390   void skip(uint32_t Distance);
391 
392   /// Return true if the minimal well-formed code unit subsequence at
393   ///        Pos is whitespace or a new line
394   bool isBlankOrBreak(StringRef::iterator Position);
395 
396   /// Return true if the line is a line break, false otherwise.
397   bool isLineEmpty(StringRef Line);
398 
399   /// Consume a single b-break[28] if it's present at the current position.
400   ///
401   /// Return false if the code unit at the current position isn't a line break.
402   bool consumeLineBreakIfPresent();
403 
404   /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
405   void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
406                              , unsigned AtColumn
407                              , bool IsRequired);
408 
409   /// Remove simple keys that can no longer be valid simple keys.
410   ///
411   /// Invalid simple keys are not on the current line or are further than 1024
412   /// columns back.
413   void removeStaleSimpleKeyCandidates();
414 
415   /// Remove all simple keys on FlowLevel \a Level.
416   void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
417 
418   /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
419   ///        tokens if needed.
420   bool unrollIndent(int ToColumn);
421 
422   /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
423   ///        if needed.
424   bool rollIndent( int ToColumn
425                  , Token::TokenKind Kind
426                  , TokenQueueT::iterator InsertPoint);
427 
428   /// Skip a single-line comment when the comment starts at the current
429   /// position of the scanner.
430   void skipComment();
431 
432   /// Skip whitespace and comments until the start of the next token.
433   void scanToNextToken();
434 
435   /// Must be the first token generated.
436   bool scanStreamStart();
437 
438   /// Generate tokens needed to close out the stream.
439   bool scanStreamEnd();
440 
441   /// Scan a %BLAH directive.
442   bool scanDirective();
443 
444   /// Scan a ... or ---.
445   bool scanDocumentIndicator(bool IsStart);
446 
447   /// Scan a [ or { and generate the proper flow collection start token.
448   bool scanFlowCollectionStart(bool IsSequence);
449 
450   /// Scan a ] or } and generate the proper flow collection end token.
451   bool scanFlowCollectionEnd(bool IsSequence);
452 
453   /// Scan the , that separates entries in a flow collection.
454   bool scanFlowEntry();
455 
456   /// Scan the - that starts block sequence entries.
457   bool scanBlockEntry();
458 
459   /// Scan an explicit ? indicating a key.
460   bool scanKey();
461 
462   /// Scan an explicit : indicating a value.
463   bool scanValue();
464 
465   /// Scan a quoted scalar.
466   bool scanFlowScalar(bool IsDoubleQuoted);
467 
468   /// Scan an unquoted scalar.
469   bool scanPlainScalar();
470 
471   /// Scan an Alias or Anchor starting with * or &.
472   bool scanAliasOrAnchor(bool IsAlias);
473 
474   /// Scan a block scalar starting with | or >.
475   bool scanBlockScalar(bool IsLiteral);
476 
477   /// Scan a block scalar style indicator and header.
478   ///
479   /// Note: This is distinct from scanBlockScalarHeader to mirror the fact that
480   /// YAML does not consider the style indicator to be a part of the header.
481   ///
482   /// Return false if an error occurred.
483   bool scanBlockScalarIndicators(char &StyleIndicator, char &ChompingIndicator,
484                                  unsigned &IndentIndicator, bool &IsDone);
485 
486   /// Scan a style indicator in a block scalar header.
487   char scanBlockStyleIndicator();
488 
489   /// Scan a chomping indicator in a block scalar header.
490   char scanBlockChompingIndicator();
491 
492   /// Scan an indentation indicator in a block scalar header.
493   unsigned scanBlockIndentationIndicator();
494 
495   /// Scan a block scalar header.
496   ///
497   /// Return false if an error occurred.
498   bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
499                              bool &IsDone);
500 
501   /// Look for the indentation level of a block scalar.
502   ///
503   /// Return false if an error occurred.
504   bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
505                              unsigned &LineBreaks, bool &IsDone);
506 
507   /// Scan the indentation of a text line in a block scalar.
508   ///
509   /// Return false if an error occurred.
510   bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
511                              bool &IsDone);
512 
513   /// Scan a tag of the form !stuff.
514   bool scanTag();
515 
516   /// Dispatch to the next scanning function based on \a *Cur.
517   bool fetchMoreTokens();
518 
519   /// The SourceMgr used for diagnostics and buffer management.
520   SourceMgr &SM;
521 
522   /// The original input.
523   MemoryBufferRef InputBuffer;
524 
525   /// The current position of the scanner.
526   StringRef::iterator Current;
527 
528   /// The end of the input (one past the last character).
529   StringRef::iterator End;
530 
531   /// Current YAML indentation level in spaces.
532   int Indent;
533 
534   /// Current column number in Unicode code points.
535   unsigned Column;
536 
537   /// Current line number.
538   unsigned Line;
539 
540   /// How deep we are in flow style containers. 0 Means at block level.
541   unsigned FlowLevel;
542 
543   /// Are we at the start of the stream?
544   bool IsStartOfStream;
545 
546   /// Can the next token be the start of a simple key?
547   bool IsSimpleKeyAllowed;
548 
549   /// True if an error has occurred.
550   bool Failed;
551 
552   /// Should colors be used when printing out the diagnostic messages?
553   bool ShowColors;
554 
555   /// Queue of tokens. This is required to queue up tokens while looking
556   ///        for the end of a simple key. And for cases where a single character
557   ///        can produce multiple tokens (e.g. BlockEnd).
558   TokenQueueT TokenQueue;
559 
560   /// Indentation levels.
561   SmallVector<int, 4> Indents;
562 
563   /// Potential simple keys.
564   SmallVector<SimpleKey, 4> SimpleKeys;
565 
566   std::error_code *EC;
567 };
568 
569 } // end namespace yaml
570 } // end namespace llvm
571 
572 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
573 static void encodeUTF8( uint32_t UnicodeScalarValue
574                       , SmallVectorImpl<char> &Result) {
575   if (UnicodeScalarValue <= 0x7F) {
576     Result.push_back(UnicodeScalarValue & 0x7F);
577   } else if (UnicodeScalarValue <= 0x7FF) {
578     uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
579     uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
580     Result.push_back(FirstByte);
581     Result.push_back(SecondByte);
582   } else if (UnicodeScalarValue <= 0xFFFF) {
583     uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
584     uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
585     uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
586     Result.push_back(FirstByte);
587     Result.push_back(SecondByte);
588     Result.push_back(ThirdByte);
589   } else if (UnicodeScalarValue <= 0x10FFFF) {
590     uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
591     uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
592     uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
593     uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
594     Result.push_back(FirstByte);
595     Result.push_back(SecondByte);
596     Result.push_back(ThirdByte);
597     Result.push_back(FourthByte);
598   }
599 }
600 
601 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
602   SourceMgr SM;
603   Scanner scanner(Input, SM);
604   while (true) {
605     Token T = scanner.getNext();
606     switch (T.Kind) {
607     case Token::TK_StreamStart:
608       OS << "Stream-Start: ";
609       break;
610     case Token::TK_StreamEnd:
611       OS << "Stream-End: ";
612       break;
613     case Token::TK_VersionDirective:
614       OS << "Version-Directive: ";
615       break;
616     case Token::TK_TagDirective:
617       OS << "Tag-Directive: ";
618       break;
619     case Token::TK_DocumentStart:
620       OS << "Document-Start: ";
621       break;
622     case Token::TK_DocumentEnd:
623       OS << "Document-End: ";
624       break;
625     case Token::TK_BlockEntry:
626       OS << "Block-Entry: ";
627       break;
628     case Token::TK_BlockEnd:
629       OS << "Block-End: ";
630       break;
631     case Token::TK_BlockSequenceStart:
632       OS << "Block-Sequence-Start: ";
633       break;
634     case Token::TK_BlockMappingStart:
635       OS << "Block-Mapping-Start: ";
636       break;
637     case Token::TK_FlowEntry:
638       OS << "Flow-Entry: ";
639       break;
640     case Token::TK_FlowSequenceStart:
641       OS << "Flow-Sequence-Start: ";
642       break;
643     case Token::TK_FlowSequenceEnd:
644       OS << "Flow-Sequence-End: ";
645       break;
646     case Token::TK_FlowMappingStart:
647       OS << "Flow-Mapping-Start: ";
648       break;
649     case Token::TK_FlowMappingEnd:
650       OS << "Flow-Mapping-End: ";
651       break;
652     case Token::TK_Key:
653       OS << "Key: ";
654       break;
655     case Token::TK_Value:
656       OS << "Value: ";
657       break;
658     case Token::TK_Scalar:
659       OS << "Scalar: ";
660       break;
661     case Token::TK_BlockScalar:
662       OS << "Block Scalar: ";
663       break;
664     case Token::TK_Alias:
665       OS << "Alias: ";
666       break;
667     case Token::TK_Anchor:
668       OS << "Anchor: ";
669       break;
670     case Token::TK_Tag:
671       OS << "Tag: ";
672       break;
673     case Token::TK_Error:
674       break;
675     }
676     OS << T.Range << "\n";
677     if (T.Kind == Token::TK_StreamEnd)
678       break;
679     else if (T.Kind == Token::TK_Error)
680       return false;
681   }
682   return true;
683 }
684 
685 bool yaml::scanTokens(StringRef Input) {
686   SourceMgr SM;
687   Scanner scanner(Input, SM);
688   while (true) {
689     Token T = scanner.getNext();
690     if (T.Kind == Token::TK_StreamEnd)
691       break;
692     else if (T.Kind == Token::TK_Error)
693       return false;
694   }
695   return true;
696 }
697 
698 std::string yaml::escape(StringRef Input, bool EscapePrintable) {
699   std::string EscapedInput;
700   for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
701     if (*i == '\\')
702       EscapedInput += "\\\\";
703     else if (*i == '"')
704       EscapedInput += "\\\"";
705     else if (*i == 0)
706       EscapedInput += "\\0";
707     else if (*i == 0x07)
708       EscapedInput += "\\a";
709     else if (*i == 0x08)
710       EscapedInput += "\\b";
711     else if (*i == 0x09)
712       EscapedInput += "\\t";
713     else if (*i == 0x0A)
714       EscapedInput += "\\n";
715     else if (*i == 0x0B)
716       EscapedInput += "\\v";
717     else if (*i == 0x0C)
718       EscapedInput += "\\f";
719     else if (*i == 0x0D)
720       EscapedInput += "\\r";
721     else if (*i == 0x1B)
722       EscapedInput += "\\e";
723     else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
724       std::string HexStr = utohexstr(*i);
725       EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
726     } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
727       UTF8Decoded UnicodeScalarValue
728         = decodeUTF8(StringRef(i, Input.end() - i));
729       if (UnicodeScalarValue.second == 0) {
730         // Found invalid char.
731         SmallString<4> Val;
732         encodeUTF8(0xFFFD, Val);
733         llvm::append_range(EscapedInput, Val);
734         // FIXME: Error reporting.
735         return EscapedInput;
736       }
737       if (UnicodeScalarValue.first == 0x85)
738         EscapedInput += "\\N";
739       else if (UnicodeScalarValue.first == 0xA0)
740         EscapedInput += "\\_";
741       else if (UnicodeScalarValue.first == 0x2028)
742         EscapedInput += "\\L";
743       else if (UnicodeScalarValue.first == 0x2029)
744         EscapedInput += "\\P";
745       else if (!EscapePrintable &&
746                sys::unicode::isPrintable(UnicodeScalarValue.first))
747         EscapedInput += StringRef(i, UnicodeScalarValue.second);
748       else {
749         std::string HexStr = utohexstr(UnicodeScalarValue.first);
750         if (HexStr.size() <= 2)
751           EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
752         else if (HexStr.size() <= 4)
753           EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
754         else if (HexStr.size() <= 8)
755           EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
756       }
757       i += UnicodeScalarValue.second - 1;
758     } else
759       EscapedInput.push_back(*i);
760   }
761   return EscapedInput;
762 }
763 
764 llvm::Optional<bool> yaml::parseBool(StringRef S) {
765   switch (S.size()) {
766   case 1:
767     switch (S.front()) {
768     case 'y':
769     case 'Y':
770       return true;
771     case 'n':
772     case 'N':
773       return false;
774     default:
775       return std::nullopt;
776     }
777   case 2:
778     switch (S.front()) {
779     case 'O':
780       if (S[1] == 'N') // ON
781         return true;
782       [[fallthrough]];
783     case 'o':
784       if (S[1] == 'n') //[Oo]n
785         return true;
786       return std::nullopt;
787     case 'N':
788       if (S[1] == 'O') // NO
789         return false;
790       [[fallthrough]];
791     case 'n':
792       if (S[1] == 'o') //[Nn]o
793         return false;
794       return std::nullopt;
795     default:
796       return std::nullopt;
797     }
798   case 3:
799     switch (S.front()) {
800     case 'O':
801       if (S.drop_front() == "FF") // OFF
802         return false;
803       [[fallthrough]];
804     case 'o':
805       if (S.drop_front() == "ff") //[Oo]ff
806         return false;
807       return std::nullopt;
808     case 'Y':
809       if (S.drop_front() == "ES") // YES
810         return true;
811       [[fallthrough]];
812     case 'y':
813       if (S.drop_front() == "es") //[Yy]es
814         return true;
815       return std::nullopt;
816     default:
817       return std::nullopt;
818     }
819   case 4:
820     switch (S.front()) {
821     case 'T':
822       if (S.drop_front() == "RUE") // TRUE
823         return true;
824       [[fallthrough]];
825     case 't':
826       if (S.drop_front() == "rue") //[Tt]rue
827         return true;
828       return std::nullopt;
829     default:
830       return std::nullopt;
831     }
832   case 5:
833     switch (S.front()) {
834     case 'F':
835       if (S.drop_front() == "ALSE") // FALSE
836         return false;
837       [[fallthrough]];
838     case 'f':
839       if (S.drop_front() == "alse") //[Ff]alse
840         return false;
841       return std::nullopt;
842     default:
843       return std::nullopt;
844     }
845   default:
846     return std::nullopt;
847   }
848 }
849 
850 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors,
851                  std::error_code *EC)
852     : SM(sm), ShowColors(ShowColors), EC(EC) {
853   init(MemoryBufferRef(Input, "YAML"));
854 }
855 
856 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors,
857                  std::error_code *EC)
858     : SM(SM_), ShowColors(ShowColors), EC(EC) {
859   init(Buffer);
860 }
861 
862 void Scanner::init(MemoryBufferRef Buffer) {
863   InputBuffer = Buffer;
864   Current = InputBuffer.getBufferStart();
865   End = InputBuffer.getBufferEnd();
866   Indent = -1;
867   Column = 0;
868   Line = 0;
869   FlowLevel = 0;
870   IsStartOfStream = true;
871   IsSimpleKeyAllowed = true;
872   Failed = false;
873   std::unique_ptr<MemoryBuffer> InputBufferOwner =
874       MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false);
875   SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
876 }
877 
878 Token &Scanner::peekNext() {
879   // If the current token is a possible simple key, keep parsing until we
880   // can confirm.
881   bool NeedMore = false;
882   while (true) {
883     if (TokenQueue.empty() || NeedMore) {
884       if (!fetchMoreTokens()) {
885         TokenQueue.clear();
886         SimpleKeys.clear();
887         TokenQueue.push_back(Token());
888         return TokenQueue.front();
889       }
890     }
891     assert(!TokenQueue.empty() &&
892             "fetchMoreTokens lied about getting tokens!");
893 
894     removeStaleSimpleKeyCandidates();
895     SimpleKey SK;
896     SK.Tok = TokenQueue.begin();
897     if (!is_contained(SimpleKeys, SK))
898       break;
899     else
900       NeedMore = true;
901   }
902   return TokenQueue.front();
903 }
904 
905 Token Scanner::getNext() {
906   Token Ret = peekNext();
907   // TokenQueue can be empty if there was an error getting the next token.
908   if (!TokenQueue.empty())
909     TokenQueue.pop_front();
910 
911   // There cannot be any referenced Token's if the TokenQueue is empty. So do a
912   // quick deallocation of them all.
913   if (TokenQueue.empty())
914     TokenQueue.resetAlloc();
915 
916   return Ret;
917 }
918 
919 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
920   if (Position == End)
921     return Position;
922   // Check 7 bit c-printable - b-char.
923   if (   *Position == 0x09
924       || (*Position >= 0x20 && *Position <= 0x7E))
925     return Position + 1;
926 
927   // Check for valid UTF-8.
928   if (uint8_t(*Position) & 0x80) {
929     UTF8Decoded u8d = decodeUTF8(Position);
930     if (   u8d.second != 0
931         && u8d.first != 0xFEFF
932         && ( u8d.first == 0x85
933           || ( u8d.first >= 0xA0
934             && u8d.first <= 0xD7FF)
935           || ( u8d.first >= 0xE000
936             && u8d.first <= 0xFFFD)
937           || ( u8d.first >= 0x10000
938             && u8d.first <= 0x10FFFF)))
939       return Position + u8d.second;
940   }
941   return Position;
942 }
943 
944 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
945   if (Position == End)
946     return Position;
947   if (*Position == 0x0D) {
948     if (Position + 1 != End && *(Position + 1) == 0x0A)
949       return Position + 2;
950     return Position + 1;
951   }
952 
953   if (*Position == 0x0A)
954     return Position + 1;
955   return Position;
956 }
957 
958 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
959   if (Position == End)
960     return Position;
961   if (*Position == ' ')
962     return Position + 1;
963   return Position;
964 }
965 
966 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
967   if (Position == End)
968     return Position;
969   if (*Position == ' ' || *Position == '\t')
970     return Position + 1;
971   return Position;
972 }
973 
974 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
975   if (Position == End)
976     return Position;
977   if (*Position == ' ' || *Position == '\t')
978     return Position;
979   return skip_nb_char(Position);
980 }
981 
982 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
983                                        , StringRef::iterator Position) {
984   while (true) {
985     StringRef::iterator i = (this->*Func)(Position);
986     if (i == Position)
987       break;
988     Position = i;
989   }
990   return Position;
991 }
992 
993 void Scanner::advanceWhile(SkipWhileFunc Func) {
994   auto Final = skip_while(Func, Current);
995   Column += Final - Current;
996   Current = Final;
997 }
998 
999 static bool is_ns_hex_digit(const char C) { return isAlnum(C); }
1000 
1001 static bool is_ns_word_char(const char C) { return C == '-' || isAlpha(C); }
1002 
1003 void Scanner::scan_ns_uri_char() {
1004   while (true) {
1005     if (Current == End)
1006       break;
1007     if ((   *Current == '%'
1008           && Current + 2 < End
1009           && is_ns_hex_digit(*(Current + 1))
1010           && is_ns_hex_digit(*(Current + 2)))
1011         || is_ns_word_char(*Current)
1012         || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
1013           != StringRef::npos) {
1014       ++Current;
1015       ++Column;
1016     } else
1017       break;
1018   }
1019 }
1020 
1021 bool Scanner::consume(uint32_t Expected) {
1022   if (Expected >= 0x80) {
1023     setError("Cannot consume non-ascii characters", Current);
1024     return false;
1025   }
1026   if (Current == End)
1027     return false;
1028   if (uint8_t(*Current) >= 0x80) {
1029     setError("Cannot consume non-ascii characters", Current);
1030     return false;
1031   }
1032   if (uint8_t(*Current) == Expected) {
1033     ++Current;
1034     ++Column;
1035     return true;
1036   }
1037   return false;
1038 }
1039 
1040 void Scanner::skip(uint32_t Distance) {
1041   Current += Distance;
1042   Column += Distance;
1043   assert(Current <= End && "Skipped past the end");
1044 }
1045 
1046 bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
1047   if (Position == End)
1048     return false;
1049   return *Position == ' ' || *Position == '\t' || *Position == '\r' ||
1050          *Position == '\n';
1051 }
1052 
1053 bool Scanner::isLineEmpty(StringRef Line) {
1054   for (const auto *Position = Line.begin(); Position != Line.end(); ++Position)
1055     if (!isBlankOrBreak(Position))
1056       return false;
1057   return true;
1058 }
1059 
1060 bool Scanner::consumeLineBreakIfPresent() {
1061   auto Next = skip_b_break(Current);
1062   if (Next == Current)
1063     return false;
1064   Column = 0;
1065   ++Line;
1066   Current = Next;
1067   return true;
1068 }
1069 
1070 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
1071                                     , unsigned AtColumn
1072                                     , bool IsRequired) {
1073   if (IsSimpleKeyAllowed) {
1074     SimpleKey SK;
1075     SK.Tok = Tok;
1076     SK.Line = Line;
1077     SK.Column = AtColumn;
1078     SK.IsRequired = IsRequired;
1079     SK.FlowLevel = FlowLevel;
1080     SimpleKeys.push_back(SK);
1081   }
1082 }
1083 
1084 void Scanner::removeStaleSimpleKeyCandidates() {
1085   for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
1086                                             i != SimpleKeys.end();) {
1087     if (i->Line != Line || i->Column + 1024 < Column) {
1088       if (i->IsRequired)
1089         setError( "Could not find expected : for simple key"
1090                 , i->Tok->Range.begin());
1091       i = SimpleKeys.erase(i);
1092     } else
1093       ++i;
1094   }
1095 }
1096 
1097 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
1098   if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
1099     SimpleKeys.pop_back();
1100 }
1101 
1102 bool Scanner::unrollIndent(int ToColumn) {
1103   Token T;
1104   // Indentation is ignored in flow.
1105   if (FlowLevel != 0)
1106     return true;
1107 
1108   while (Indent > ToColumn) {
1109     T.Kind = Token::TK_BlockEnd;
1110     T.Range = StringRef(Current, 1);
1111     TokenQueue.push_back(T);
1112     Indent = Indents.pop_back_val();
1113   }
1114 
1115   return true;
1116 }
1117 
1118 bool Scanner::rollIndent( int ToColumn
1119                         , Token::TokenKind Kind
1120                         , TokenQueueT::iterator InsertPoint) {
1121   if (FlowLevel)
1122     return true;
1123   if (Indent < ToColumn) {
1124     Indents.push_back(Indent);
1125     Indent = ToColumn;
1126 
1127     Token T;
1128     T.Kind = Kind;
1129     T.Range = StringRef(Current, 0);
1130     TokenQueue.insert(InsertPoint, T);
1131   }
1132   return true;
1133 }
1134 
1135 void Scanner::skipComment() {
1136   if (Current == End || *Current != '#')
1137     return;
1138   while (true) {
1139     // This may skip more than one byte, thus Column is only incremented
1140     // for code points.
1141     StringRef::iterator I = skip_nb_char(Current);
1142     if (I == Current)
1143       break;
1144     Current = I;
1145     ++Column;
1146   }
1147 }
1148 
1149 void Scanner::scanToNextToken() {
1150   while (true) {
1151     while (Current != End && (*Current == ' ' || *Current == '\t')) {
1152       skip(1);
1153     }
1154 
1155     skipComment();
1156 
1157     // Skip EOL.
1158     StringRef::iterator i = skip_b_break(Current);
1159     if (i == Current)
1160       break;
1161     Current = i;
1162     ++Line;
1163     Column = 0;
1164     // New lines may start a simple key.
1165     if (!FlowLevel)
1166       IsSimpleKeyAllowed = true;
1167   }
1168 }
1169 
1170 bool Scanner::scanStreamStart() {
1171   IsStartOfStream = false;
1172 
1173   EncodingInfo EI = getUnicodeEncoding(currentInput());
1174 
1175   Token T;
1176   T.Kind = Token::TK_StreamStart;
1177   T.Range = StringRef(Current, EI.second);
1178   TokenQueue.push_back(T);
1179   Current += EI.second;
1180   return true;
1181 }
1182 
1183 bool Scanner::scanStreamEnd() {
1184   // Force an ending new line if one isn't present.
1185   if (Column != 0) {
1186     Column = 0;
1187     ++Line;
1188   }
1189 
1190   unrollIndent(-1);
1191   SimpleKeys.clear();
1192   IsSimpleKeyAllowed = false;
1193 
1194   Token T;
1195   T.Kind = Token::TK_StreamEnd;
1196   T.Range = StringRef(Current, 0);
1197   TokenQueue.push_back(T);
1198   return true;
1199 }
1200 
1201 bool Scanner::scanDirective() {
1202   // Reset the indentation level.
1203   unrollIndent(-1);
1204   SimpleKeys.clear();
1205   IsSimpleKeyAllowed = false;
1206 
1207   StringRef::iterator Start = Current;
1208   consume('%');
1209   StringRef::iterator NameStart = Current;
1210   Current = skip_while(&Scanner::skip_ns_char, Current);
1211   StringRef Name(NameStart, Current - NameStart);
1212   Current = skip_while(&Scanner::skip_s_white, Current);
1213 
1214   Token T;
1215   if (Name == "YAML") {
1216     Current = skip_while(&Scanner::skip_ns_char, Current);
1217     T.Kind = Token::TK_VersionDirective;
1218     T.Range = StringRef(Start, Current - Start);
1219     TokenQueue.push_back(T);
1220     return true;
1221   } else if(Name == "TAG") {
1222     Current = skip_while(&Scanner::skip_ns_char, Current);
1223     Current = skip_while(&Scanner::skip_s_white, Current);
1224     Current = skip_while(&Scanner::skip_ns_char, Current);
1225     T.Kind = Token::TK_TagDirective;
1226     T.Range = StringRef(Start, Current - Start);
1227     TokenQueue.push_back(T);
1228     return true;
1229   }
1230   return false;
1231 }
1232 
1233 bool Scanner::scanDocumentIndicator(bool IsStart) {
1234   unrollIndent(-1);
1235   SimpleKeys.clear();
1236   IsSimpleKeyAllowed = false;
1237 
1238   Token T;
1239   T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
1240   T.Range = StringRef(Current, 3);
1241   skip(3);
1242   TokenQueue.push_back(T);
1243   return true;
1244 }
1245 
1246 bool Scanner::scanFlowCollectionStart(bool IsSequence) {
1247   Token T;
1248   T.Kind = IsSequence ? Token::TK_FlowSequenceStart
1249                       : Token::TK_FlowMappingStart;
1250   T.Range = StringRef(Current, 1);
1251   skip(1);
1252   TokenQueue.push_back(T);
1253 
1254   // [ and { may begin a simple key.
1255   saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);
1256 
1257   // And may also be followed by a simple key.
1258   IsSimpleKeyAllowed = true;
1259   ++FlowLevel;
1260   return true;
1261 }
1262 
1263 bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
1264   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1265   IsSimpleKeyAllowed = false;
1266   Token T;
1267   T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
1268                       : Token::TK_FlowMappingEnd;
1269   T.Range = StringRef(Current, 1);
1270   skip(1);
1271   TokenQueue.push_back(T);
1272   if (FlowLevel)
1273     --FlowLevel;
1274   return true;
1275 }
1276 
1277 bool Scanner::scanFlowEntry() {
1278   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1279   IsSimpleKeyAllowed = true;
1280   Token T;
1281   T.Kind = Token::TK_FlowEntry;
1282   T.Range = StringRef(Current, 1);
1283   skip(1);
1284   TokenQueue.push_back(T);
1285   return true;
1286 }
1287 
1288 bool Scanner::scanBlockEntry() {
1289   rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
1290   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1291   IsSimpleKeyAllowed = true;
1292   Token T;
1293   T.Kind = Token::TK_BlockEntry;
1294   T.Range = StringRef(Current, 1);
1295   skip(1);
1296   TokenQueue.push_back(T);
1297   return true;
1298 }
1299 
1300 bool Scanner::scanKey() {
1301   if (!FlowLevel)
1302     rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1303 
1304   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1305   IsSimpleKeyAllowed = !FlowLevel;
1306 
1307   Token T;
1308   T.Kind = Token::TK_Key;
1309   T.Range = StringRef(Current, 1);
1310   skip(1);
1311   TokenQueue.push_back(T);
1312   return true;
1313 }
1314 
1315 bool Scanner::scanValue() {
1316   // If the previous token could have been a simple key, insert the key token
1317   // into the token queue.
1318   if (!SimpleKeys.empty()) {
1319     SimpleKey SK = SimpleKeys.pop_back_val();
1320     Token T;
1321     T.Kind = Token::TK_Key;
1322     T.Range = SK.Tok->Range;
1323     TokenQueueT::iterator i, e;
1324     for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
1325       if (i == SK.Tok)
1326         break;
1327     }
1328     if (i == e) {
1329       Failed = true;
1330       return false;
1331     }
1332     i = TokenQueue.insert(i, T);
1333 
1334     // We may also need to add a Block-Mapping-Start token.
1335     rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
1336 
1337     IsSimpleKeyAllowed = false;
1338   } else {
1339     if (!FlowLevel)
1340       rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1341     IsSimpleKeyAllowed = !FlowLevel;
1342   }
1343 
1344   Token T;
1345   T.Kind = Token::TK_Value;
1346   T.Range = StringRef(Current, 1);
1347   skip(1);
1348   TokenQueue.push_back(T);
1349   return true;
1350 }
1351 
1352 // Forbidding inlining improves performance by roughly 20%.
1353 // FIXME: Remove once llvm optimizes this to the faster version without hints.
1354 LLVM_ATTRIBUTE_NOINLINE static bool
1355 wasEscaped(StringRef::iterator First, StringRef::iterator Position);
1356 
1357 // Returns whether a character at 'Position' was escaped with a leading '\'.
1358 // 'First' specifies the position of the first character in the string.
1359 static bool wasEscaped(StringRef::iterator First,
1360                        StringRef::iterator Position) {
1361   assert(Position - 1 >= First);
1362   StringRef::iterator I = Position - 1;
1363   // We calculate the number of consecutive '\'s before the current position
1364   // by iterating backwards through our string.
1365   while (I >= First && *I == '\\') --I;
1366   // (Position - 1 - I) now contains the number of '\'s before the current
1367   // position. If it is odd, the character at 'Position' was escaped.
1368   return (Position - 1 - I) % 2 == 1;
1369 }
1370 
1371 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
1372   StringRef::iterator Start = Current;
1373   unsigned ColStart = Column;
1374   if (IsDoubleQuoted) {
1375     do {
1376       ++Current;
1377       while (Current != End && *Current != '"')
1378         ++Current;
1379       // Repeat until the previous character was not a '\' or was an escaped
1380       // backslash.
1381     } while (   Current != End
1382              && *(Current - 1) == '\\'
1383              && wasEscaped(Start + 1, Current));
1384   } else {
1385     skip(1);
1386     while (Current != End) {
1387       // Skip a ' followed by another '.
1388       if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
1389         skip(2);
1390         continue;
1391       } else if (*Current == '\'')
1392         break;
1393       StringRef::iterator i = skip_nb_char(Current);
1394       if (i == Current) {
1395         i = skip_b_break(Current);
1396         if (i == Current)
1397           break;
1398         Current = i;
1399         Column = 0;
1400         ++Line;
1401       } else {
1402         if (i == End)
1403           break;
1404         Current = i;
1405         ++Column;
1406       }
1407     }
1408   }
1409 
1410   if (Current == End) {
1411     setError("Expected quote at end of scalar", Current);
1412     return false;
1413   }
1414 
1415   skip(1); // Skip ending quote.
1416   Token T;
1417   T.Kind = Token::TK_Scalar;
1418   T.Range = StringRef(Start, Current - Start);
1419   TokenQueue.push_back(T);
1420 
1421   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1422 
1423   IsSimpleKeyAllowed = false;
1424 
1425   return true;
1426 }
1427 
1428 bool Scanner::scanPlainScalar() {
1429   StringRef::iterator Start = Current;
1430   unsigned ColStart = Column;
1431   unsigned LeadingBlanks = 0;
1432   assert(Indent >= -1 && "Indent must be >= -1 !");
1433   unsigned indent = static_cast<unsigned>(Indent + 1);
1434   while (Current != End) {
1435     if (*Current == '#')
1436       break;
1437 
1438     while (Current != End && !isBlankOrBreak(Current)) {
1439       if (FlowLevel && *Current == ':' &&
1440           (Current + 1 == End ||
1441            !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) {
1442         setError("Found unexpected ':' while scanning a plain scalar", Current);
1443         return false;
1444       }
1445 
1446       // Check for the end of the plain scalar.
1447       if (  (*Current == ':' && isBlankOrBreak(Current + 1))
1448           || (  FlowLevel
1449           && (StringRef(Current, 1).find_first_of(",:?[]{}")
1450               != StringRef::npos)))
1451         break;
1452 
1453       StringRef::iterator i = skip_nb_char(Current);
1454       if (i == Current)
1455         break;
1456       Current = i;
1457       ++Column;
1458     }
1459 
1460     // Are we at the end?
1461     if (!isBlankOrBreak(Current))
1462       break;
1463 
1464     // Eat blanks.
1465     StringRef::iterator Tmp = Current;
1466     while (isBlankOrBreak(Tmp)) {
1467       StringRef::iterator i = skip_s_white(Tmp);
1468       if (i != Tmp) {
1469         if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
1470           setError("Found invalid tab character in indentation", Tmp);
1471           return false;
1472         }
1473         Tmp = i;
1474         ++Column;
1475       } else {
1476         i = skip_b_break(Tmp);
1477         if (!LeadingBlanks)
1478           LeadingBlanks = 1;
1479         Tmp = i;
1480         Column = 0;
1481         ++Line;
1482       }
1483     }
1484 
1485     if (!FlowLevel && Column < indent)
1486       break;
1487 
1488     Current = Tmp;
1489   }
1490   if (Start == Current) {
1491     setError("Got empty plain scalar", Start);
1492     return false;
1493   }
1494   Token T;
1495   T.Kind = Token::TK_Scalar;
1496   T.Range = StringRef(Start, Current - Start);
1497   TokenQueue.push_back(T);
1498 
1499   // Plain scalars can be simple keys.
1500   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1501 
1502   IsSimpleKeyAllowed = false;
1503 
1504   return true;
1505 }
1506 
1507 bool Scanner::scanAliasOrAnchor(bool IsAlias) {
1508   StringRef::iterator Start = Current;
1509   unsigned ColStart = Column;
1510   skip(1);
1511   while (Current != End) {
1512     if (   *Current == '[' || *Current == ']'
1513         || *Current == '{' || *Current == '}'
1514         || *Current == ','
1515         || *Current == ':')
1516       break;
1517     StringRef::iterator i = skip_ns_char(Current);
1518     if (i == Current)
1519       break;
1520     Current = i;
1521     ++Column;
1522   }
1523 
1524   if (Start + 1 == Current) {
1525     setError("Got empty alias or anchor", Start);
1526     return false;
1527   }
1528 
1529   Token T;
1530   T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
1531   T.Range = StringRef(Start, Current - Start);
1532   TokenQueue.push_back(T);
1533 
1534   // Alias and anchors can be simple keys.
1535   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1536 
1537   IsSimpleKeyAllowed = false;
1538 
1539   return true;
1540 }
1541 
1542 bool Scanner::scanBlockScalarIndicators(char &StyleIndicator,
1543                                         char &ChompingIndicator,
1544                                         unsigned &IndentIndicator,
1545                                         bool &IsDone) {
1546   StyleIndicator = scanBlockStyleIndicator();
1547   if (!scanBlockScalarHeader(ChompingIndicator, IndentIndicator, IsDone))
1548     return false;
1549   return true;
1550 }
1551 
1552 char Scanner::scanBlockStyleIndicator() {
1553   char Indicator = ' ';
1554   if (Current != End && (*Current == '>' || *Current == '|')) {
1555     Indicator = *Current;
1556     skip(1);
1557   }
1558   return Indicator;
1559 }
1560 
1561 char Scanner::scanBlockChompingIndicator() {
1562   char Indicator = ' ';
1563   if (Current != End && (*Current == '+' || *Current == '-')) {
1564     Indicator = *Current;
1565     skip(1);
1566   }
1567   return Indicator;
1568 }
1569 
1570 /// Get the number of line breaks after chomping.
1571 ///
1572 /// Return the number of trailing line breaks to emit, depending on
1573 /// \p ChompingIndicator.
1574 static unsigned getChompedLineBreaks(char ChompingIndicator,
1575                                      unsigned LineBreaks, StringRef Str) {
1576   if (ChompingIndicator == '-') // Strip all line breaks.
1577     return 0;
1578   if (ChompingIndicator == '+') // Keep all line breaks.
1579     return LineBreaks;
1580   // Clip trailing lines.
1581   return Str.empty() ? 0 : 1;
1582 }
1583 
1584 unsigned Scanner::scanBlockIndentationIndicator() {
1585   unsigned Indent = 0;
1586   if (Current != End && (*Current >= '1' && *Current <= '9')) {
1587     Indent = unsigned(*Current - '0');
1588     skip(1);
1589   }
1590   return Indent;
1591 }
1592 
1593 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
1594                                     unsigned &IndentIndicator, bool &IsDone) {
1595   auto Start = Current;
1596 
1597   ChompingIndicator = scanBlockChompingIndicator();
1598   IndentIndicator = scanBlockIndentationIndicator();
1599   // Check for the chomping indicator once again.
1600   if (ChompingIndicator == ' ')
1601     ChompingIndicator = scanBlockChompingIndicator();
1602   Current = skip_while(&Scanner::skip_s_white, Current);
1603   skipComment();
1604 
1605   if (Current == End) { // EOF, we have an empty scalar.
1606     Token T;
1607     T.Kind = Token::TK_BlockScalar;
1608     T.Range = StringRef(Start, Current - Start);
1609     TokenQueue.push_back(T);
1610     IsDone = true;
1611     return true;
1612   }
1613 
1614   if (!consumeLineBreakIfPresent()) {
1615     setError("Expected a line break after block scalar header", Current);
1616     return false;
1617   }
1618   return true;
1619 }
1620 
1621 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
1622                                     unsigned BlockExitIndent,
1623                                     unsigned &LineBreaks, bool &IsDone) {
1624   unsigned MaxAllSpaceLineCharacters = 0;
1625   StringRef::iterator LongestAllSpaceLine;
1626 
1627   while (true) {
1628     advanceWhile(&Scanner::skip_s_space);
1629     if (skip_nb_char(Current) != Current) {
1630       // This line isn't empty, so try and find the indentation.
1631       if (Column <= BlockExitIndent) { // End of the block literal.
1632         IsDone = true;
1633         return true;
1634       }
1635       // We found the block's indentation.
1636       BlockIndent = Column;
1637       if (MaxAllSpaceLineCharacters > BlockIndent) {
1638         setError(
1639             "Leading all-spaces line must be smaller than the block indent",
1640             LongestAllSpaceLine);
1641         return false;
1642       }
1643       return true;
1644     }
1645     if (skip_b_break(Current) != Current &&
1646         Column > MaxAllSpaceLineCharacters) {
1647       // Record the longest all-space line in case it's longer than the
1648       // discovered block indent.
1649       MaxAllSpaceLineCharacters = Column;
1650       LongestAllSpaceLine = Current;
1651     }
1652 
1653     // Check for EOF.
1654     if (Current == End) {
1655       IsDone = true;
1656       return true;
1657     }
1658 
1659     if (!consumeLineBreakIfPresent()) {
1660       IsDone = true;
1661       return true;
1662     }
1663     ++LineBreaks;
1664   }
1665   return true;
1666 }
1667 
1668 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
1669                                     unsigned BlockExitIndent, bool &IsDone) {
1670   // Skip the indentation.
1671   while (Column < BlockIndent) {
1672     auto I = skip_s_space(Current);
1673     if (I == Current)
1674       break;
1675     Current = I;
1676     ++Column;
1677   }
1678 
1679   if (skip_nb_char(Current) == Current)
1680     return true;
1681 
1682   if (Column <= BlockExitIndent) { // End of the block literal.
1683     IsDone = true;
1684     return true;
1685   }
1686 
1687   if (Column < BlockIndent) {
1688     if (Current != End && *Current == '#') { // Trailing comment.
1689       IsDone = true;
1690       return true;
1691     }
1692     setError("A text line is less indented than the block scalar", Current);
1693     return false;
1694   }
1695   return true; // A normal text line.
1696 }
1697 
1698 bool Scanner::scanBlockScalar(bool IsLiteral) {
1699   assert(*Current == '|' || *Current == '>');
1700   char StyleIndicator;
1701   char ChompingIndicator;
1702   unsigned BlockIndent;
1703   bool IsDone = false;
1704   if (!scanBlockScalarIndicators(StyleIndicator, ChompingIndicator, BlockIndent,
1705                                  IsDone))
1706     return false;
1707   if (IsDone)
1708     return true;
1709   bool IsFolded = StyleIndicator == '>';
1710 
1711   const auto *Start = Current;
1712   unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
1713   unsigned LineBreaks = 0;
1714   if (BlockIndent == 0) {
1715     if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
1716                                IsDone))
1717       return false;
1718   }
1719 
1720   // Scan the block's scalars body.
1721   SmallString<256> Str;
1722   while (!IsDone) {
1723     if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
1724       return false;
1725     if (IsDone)
1726       break;
1727 
1728     // Parse the current line.
1729     auto LineStart = Current;
1730     advanceWhile(&Scanner::skip_nb_char);
1731     if (LineStart != Current) {
1732       if (LineBreaks && IsFolded && !Scanner::isLineEmpty(Str)) {
1733         // The folded style "folds" any single line break between content into a
1734         // single space, except when that content is "empty" (only contains
1735         // whitespace) in which case the line break is left as-is.
1736         if (LineBreaks == 1) {
1737           Str.append(LineBreaks,
1738                      isLineEmpty(StringRef(LineStart, Current - LineStart))
1739                          ? '\n'
1740                          : ' ');
1741         }
1742         // If we saw a single line break, we are completely replacing it and so
1743         // want `LineBreaks == 0`. Otherwise this decrement accounts for the
1744         // fact that the first line break is "trimmed", only being used to
1745         // signal a sequence of line breaks which should not be folded.
1746         LineBreaks--;
1747       }
1748       Str.append(LineBreaks, '\n');
1749       Str.append(StringRef(LineStart, Current - LineStart));
1750       LineBreaks = 0;
1751     }
1752 
1753     // Check for EOF.
1754     if (Current == End)
1755       break;
1756 
1757     if (!consumeLineBreakIfPresent())
1758       break;
1759     ++LineBreaks;
1760   }
1761 
1762   if (Current == End && !LineBreaks)
1763     // Ensure that there is at least one line break before the end of file.
1764     LineBreaks = 1;
1765   Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
1766 
1767   // New lines may start a simple key.
1768   if (!FlowLevel)
1769     IsSimpleKeyAllowed = true;
1770 
1771   Token T;
1772   T.Kind = Token::TK_BlockScalar;
1773   T.Range = StringRef(Start, Current - Start);
1774   T.Value = std::string(Str);
1775   TokenQueue.push_back(T);
1776   return true;
1777 }
1778 
1779 bool Scanner::scanTag() {
1780   StringRef::iterator Start = Current;
1781   unsigned ColStart = Column;
1782   skip(1); // Eat !.
1783   if (Current == End || isBlankOrBreak(Current)); // An empty tag.
1784   else if (*Current == '<') {
1785     skip(1);
1786     scan_ns_uri_char();
1787     if (!consume('>'))
1788       return false;
1789   } else {
1790     // FIXME: Actually parse the c-ns-shorthand-tag rule.
1791     Current = skip_while(&Scanner::skip_ns_char, Current);
1792   }
1793 
1794   Token T;
1795   T.Kind = Token::TK_Tag;
1796   T.Range = StringRef(Start, Current - Start);
1797   TokenQueue.push_back(T);
1798 
1799   // Tags can be simple keys.
1800   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1801 
1802   IsSimpleKeyAllowed = false;
1803 
1804   return true;
1805 }
1806 
1807 bool Scanner::fetchMoreTokens() {
1808   if (IsStartOfStream)
1809     return scanStreamStart();
1810 
1811   scanToNextToken();
1812 
1813   if (Current == End)
1814     return scanStreamEnd();
1815 
1816   removeStaleSimpleKeyCandidates();
1817 
1818   unrollIndent(Column);
1819 
1820   if (Column == 0 && *Current == '%')
1821     return scanDirective();
1822 
1823   if (Column == 0 && Current + 4 <= End
1824       && *Current == '-'
1825       && *(Current + 1) == '-'
1826       && *(Current + 2) == '-'
1827       && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1828     return scanDocumentIndicator(true);
1829 
1830   if (Column == 0 && Current + 4 <= End
1831       && *Current == '.'
1832       && *(Current + 1) == '.'
1833       && *(Current + 2) == '.'
1834       && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1835     return scanDocumentIndicator(false);
1836 
1837   if (*Current == '[')
1838     return scanFlowCollectionStart(true);
1839 
1840   if (*Current == '{')
1841     return scanFlowCollectionStart(false);
1842 
1843   if (*Current == ']')
1844     return scanFlowCollectionEnd(true);
1845 
1846   if (*Current == '}')
1847     return scanFlowCollectionEnd(false);
1848 
1849   if (*Current == ',')
1850     return scanFlowEntry();
1851 
1852   if (*Current == '-' && isBlankOrBreak(Current + 1))
1853     return scanBlockEntry();
1854 
1855   if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
1856     return scanKey();
1857 
1858   if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
1859     return scanValue();
1860 
1861   if (*Current == '*')
1862     return scanAliasOrAnchor(true);
1863 
1864   if (*Current == '&')
1865     return scanAliasOrAnchor(false);
1866 
1867   if (*Current == '!')
1868     return scanTag();
1869 
1870   if (*Current == '|' && !FlowLevel)
1871     return scanBlockScalar(true);
1872 
1873   if (*Current == '>' && !FlowLevel)
1874     return scanBlockScalar(false);
1875 
1876   if (*Current == '\'')
1877     return scanFlowScalar(false);
1878 
1879   if (*Current == '"')
1880     return scanFlowScalar(true);
1881 
1882   // Get a plain scalar.
1883   StringRef FirstChar(Current, 1);
1884   if (!(isBlankOrBreak(Current)
1885         || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
1886       || (*Current == '-' && !isBlankOrBreak(Current + 1))
1887       || (!FlowLevel && (*Current == '?' || *Current == ':')
1888           && isBlankOrBreak(Current + 1))
1889       || (!FlowLevel && *Current == ':'
1890                       && Current + 2 < End
1891                       && *(Current + 1) == ':'
1892                       && !isBlankOrBreak(Current + 2)))
1893     return scanPlainScalar();
1894 
1895   setError("Unrecognized character while tokenizing.", Current);
1896   return false;
1897 }
1898 
1899 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors,
1900                std::error_code *EC)
1901     : scanner(new Scanner(Input, SM, ShowColors, EC)) {}
1902 
1903 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors,
1904                std::error_code *EC)
1905     : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)) {}
1906 
1907 Stream::~Stream() = default;
1908 
1909 bool Stream::failed() { return scanner->failed(); }
1910 
1911 void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) {
1912   printError(N ? N->getSourceRange() : SMRange(), Msg, Kind);
1913 }
1914 
1915 void Stream::printError(const SMRange &Range, const Twine &Msg,
1916                         SourceMgr::DiagKind Kind) {
1917   scanner->printError(Range.Start, Kind, Msg, Range);
1918 }
1919 
1920 document_iterator Stream::begin() {
1921   if (CurrentDoc)
1922     report_fatal_error("Can only iterate over the stream once");
1923 
1924   // Skip Stream-Start.
1925   scanner->getNext();
1926 
1927   CurrentDoc.reset(new Document(*this));
1928   return document_iterator(CurrentDoc);
1929 }
1930 
1931 document_iterator Stream::end() {
1932   return document_iterator();
1933 }
1934 
1935 void Stream::skip() {
1936   for (Document &Doc : *this)
1937     Doc.skip();
1938 }
1939 
1940 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
1941            StringRef T)
1942     : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
1943   SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
1944   SourceRange = SMRange(Start, Start);
1945 }
1946 
1947 std::string Node::getVerbatimTag() const {
1948   StringRef Raw = getRawTag();
1949   if (!Raw.empty() && Raw != "!") {
1950     std::string Ret;
1951     if (Raw.find_last_of('!') == 0) {
1952       Ret = std::string(Doc->getTagMap().find("!")->second);
1953       Ret += Raw.substr(1);
1954       return Ret;
1955     } else if (Raw.startswith("!!")) {
1956       Ret = std::string(Doc->getTagMap().find("!!")->second);
1957       Ret += Raw.substr(2);
1958       return Ret;
1959     } else {
1960       StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
1961       std::map<StringRef, StringRef>::const_iterator It =
1962           Doc->getTagMap().find(TagHandle);
1963       if (It != Doc->getTagMap().end())
1964         Ret = std::string(It->second);
1965       else {
1966         Token T;
1967         T.Kind = Token::TK_Tag;
1968         T.Range = TagHandle;
1969         setError(Twine("Unknown tag handle ") + TagHandle, T);
1970       }
1971       Ret += Raw.substr(Raw.find_last_of('!') + 1);
1972       return Ret;
1973     }
1974   }
1975 
1976   switch (getType()) {
1977   case NK_Null:
1978     return "tag:yaml.org,2002:null";
1979   case NK_Scalar:
1980   case NK_BlockScalar:
1981     // TODO: Tag resolution.
1982     return "tag:yaml.org,2002:str";
1983   case NK_Mapping:
1984     return "tag:yaml.org,2002:map";
1985   case NK_Sequence:
1986     return "tag:yaml.org,2002:seq";
1987   }
1988 
1989   return "";
1990 }
1991 
1992 Token &Node::peekNext() {
1993   return Doc->peekNext();
1994 }
1995 
1996 Token Node::getNext() {
1997   return Doc->getNext();
1998 }
1999 
2000 Node *Node::parseBlockNode() {
2001   return Doc->parseBlockNode();
2002 }
2003 
2004 BumpPtrAllocator &Node::getAllocator() {
2005   return Doc->NodeAllocator;
2006 }
2007 
2008 void Node::setError(const Twine &Msg, Token &Tok) const {
2009   Doc->setError(Msg, Tok);
2010 }
2011 
2012 bool Node::failed() const {
2013   return Doc->failed();
2014 }
2015 
2016 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
2017   // TODO: Handle newlines properly. We need to remove leading whitespace.
2018   if (Value[0] == '"') { // Double quoted.
2019     // Pull off the leading and trailing "s.
2020     StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
2021     // Search for characters that would require unescaping the value.
2022     StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
2023     if (i != StringRef::npos)
2024       return unescapeDoubleQuoted(UnquotedValue, i, Storage);
2025     return UnquotedValue;
2026   } else if (Value[0] == '\'') { // Single quoted.
2027     // Pull off the leading and trailing 's.
2028     StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
2029     StringRef::size_type i = UnquotedValue.find('\'');
2030     if (i != StringRef::npos) {
2031       // We're going to need Storage.
2032       Storage.clear();
2033       Storage.reserve(UnquotedValue.size());
2034       for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
2035         StringRef Valid(UnquotedValue.begin(), i);
2036         llvm::append_range(Storage, Valid);
2037         Storage.push_back('\'');
2038         UnquotedValue = UnquotedValue.substr(i + 2);
2039       }
2040       llvm::append_range(Storage, UnquotedValue);
2041       return StringRef(Storage.begin(), Storage.size());
2042     }
2043     return UnquotedValue;
2044   }
2045   // Plain or block.
2046   return Value.rtrim(' ');
2047 }
2048 
2049 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
2050                                           , StringRef::size_type i
2051                                           , SmallVectorImpl<char> &Storage)
2052                                           const {
2053   // Use Storage to build proper value.
2054   Storage.clear();
2055   Storage.reserve(UnquotedValue.size());
2056   for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
2057     // Insert all previous chars into Storage.
2058     StringRef Valid(UnquotedValue.begin(), i);
2059     llvm::append_range(Storage, Valid);
2060     // Chop off inserted chars.
2061     UnquotedValue = UnquotedValue.substr(i);
2062 
2063     assert(!UnquotedValue.empty() && "Can't be empty!");
2064 
2065     // Parse escape or line break.
2066     switch (UnquotedValue[0]) {
2067     case '\r':
2068     case '\n':
2069       Storage.push_back('\n');
2070       if (   UnquotedValue.size() > 1
2071           && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
2072         UnquotedValue = UnquotedValue.substr(1);
2073       UnquotedValue = UnquotedValue.substr(1);
2074       break;
2075     default:
2076       if (UnquotedValue.size() == 1) {
2077         Token T;
2078         T.Range = StringRef(UnquotedValue.begin(), 1);
2079         setError("Unrecognized escape code", T);
2080         return "";
2081       }
2082       UnquotedValue = UnquotedValue.substr(1);
2083       switch (UnquotedValue[0]) {
2084       default: {
2085           Token T;
2086           T.Range = StringRef(UnquotedValue.begin(), 1);
2087           setError("Unrecognized escape code", T);
2088           return "";
2089         }
2090       case '\r':
2091       case '\n':
2092         // Remove the new line.
2093         if (   UnquotedValue.size() > 1
2094             && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
2095           UnquotedValue = UnquotedValue.substr(1);
2096         // If this was just a single byte newline, it will get skipped
2097         // below.
2098         break;
2099       case '0':
2100         Storage.push_back(0x00);
2101         break;
2102       case 'a':
2103         Storage.push_back(0x07);
2104         break;
2105       case 'b':
2106         Storage.push_back(0x08);
2107         break;
2108       case 't':
2109       case 0x09:
2110         Storage.push_back(0x09);
2111         break;
2112       case 'n':
2113         Storage.push_back(0x0A);
2114         break;
2115       case 'v':
2116         Storage.push_back(0x0B);
2117         break;
2118       case 'f':
2119         Storage.push_back(0x0C);
2120         break;
2121       case 'r':
2122         Storage.push_back(0x0D);
2123         break;
2124       case 'e':
2125         Storage.push_back(0x1B);
2126         break;
2127       case ' ':
2128         Storage.push_back(0x20);
2129         break;
2130       case '"':
2131         Storage.push_back(0x22);
2132         break;
2133       case '/':
2134         Storage.push_back(0x2F);
2135         break;
2136       case '\\':
2137         Storage.push_back(0x5C);
2138         break;
2139       case 'N':
2140         encodeUTF8(0x85, Storage);
2141         break;
2142       case '_':
2143         encodeUTF8(0xA0, Storage);
2144         break;
2145       case 'L':
2146         encodeUTF8(0x2028, Storage);
2147         break;
2148       case 'P':
2149         encodeUTF8(0x2029, Storage);
2150         break;
2151       case 'x': {
2152           if (UnquotedValue.size() < 3)
2153             // TODO: Report error.
2154             break;
2155           unsigned int UnicodeScalarValue;
2156           if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
2157             // TODO: Report error.
2158             UnicodeScalarValue = 0xFFFD;
2159           encodeUTF8(UnicodeScalarValue, Storage);
2160           UnquotedValue = UnquotedValue.substr(2);
2161           break;
2162         }
2163       case 'u': {
2164           if (UnquotedValue.size() < 5)
2165             // TODO: Report error.
2166             break;
2167           unsigned int UnicodeScalarValue;
2168           if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
2169             // TODO: Report error.
2170             UnicodeScalarValue = 0xFFFD;
2171           encodeUTF8(UnicodeScalarValue, Storage);
2172           UnquotedValue = UnquotedValue.substr(4);
2173           break;
2174         }
2175       case 'U': {
2176           if (UnquotedValue.size() < 9)
2177             // TODO: Report error.
2178             break;
2179           unsigned int UnicodeScalarValue;
2180           if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
2181             // TODO: Report error.
2182             UnicodeScalarValue = 0xFFFD;
2183           encodeUTF8(UnicodeScalarValue, Storage);
2184           UnquotedValue = UnquotedValue.substr(8);
2185           break;
2186         }
2187       }
2188       UnquotedValue = UnquotedValue.substr(1);
2189     }
2190   }
2191   llvm::append_range(Storage, UnquotedValue);
2192   return StringRef(Storage.begin(), Storage.size());
2193 }
2194 
2195 Node *KeyValueNode::getKey() {
2196   if (Key)
2197     return Key;
2198   // Handle implicit null keys.
2199   {
2200     Token &t = peekNext();
2201     if (   t.Kind == Token::TK_BlockEnd
2202         || t.Kind == Token::TK_Value
2203         || t.Kind == Token::TK_Error) {
2204       return Key = new (getAllocator()) NullNode(Doc);
2205     }
2206     if (t.Kind == Token::TK_Key)
2207       getNext(); // skip TK_Key.
2208   }
2209 
2210   // Handle explicit null keys.
2211   Token &t = peekNext();
2212   if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
2213     return Key = new (getAllocator()) NullNode(Doc);
2214   }
2215 
2216   // We've got a normal key.
2217   return Key = parseBlockNode();
2218 }
2219 
2220 Node *KeyValueNode::getValue() {
2221   if (Value)
2222     return Value;
2223 
2224   if (Node* Key = getKey())
2225     Key->skip();
2226   else {
2227     setError("Null key in Key Value.", peekNext());
2228     return Value = new (getAllocator()) NullNode(Doc);
2229   }
2230 
2231   if (failed())
2232     return Value = new (getAllocator()) NullNode(Doc);
2233 
2234   // Handle implicit null values.
2235   {
2236     Token &t = peekNext();
2237     if (   t.Kind == Token::TK_BlockEnd
2238         || t.Kind == Token::TK_FlowMappingEnd
2239         || t.Kind == Token::TK_Key
2240         || t.Kind == Token::TK_FlowEntry
2241         || t.Kind == Token::TK_Error) {
2242       return Value = new (getAllocator()) NullNode(Doc);
2243     }
2244 
2245     if (t.Kind != Token::TK_Value) {
2246       setError("Unexpected token in Key Value.", t);
2247       return Value = new (getAllocator()) NullNode(Doc);
2248     }
2249     getNext(); // skip TK_Value.
2250   }
2251 
2252   // Handle explicit null values.
2253   Token &t = peekNext();
2254   if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
2255     return Value = new (getAllocator()) NullNode(Doc);
2256   }
2257 
2258   // We got a normal value.
2259   return Value = parseBlockNode();
2260 }
2261 
2262 void MappingNode::increment() {
2263   if (failed()) {
2264     IsAtEnd = true;
2265     CurrentEntry = nullptr;
2266     return;
2267   }
2268   if (CurrentEntry) {
2269     CurrentEntry->skip();
2270     if (Type == MT_Inline) {
2271       IsAtEnd = true;
2272       CurrentEntry = nullptr;
2273       return;
2274     }
2275   }
2276   Token T = peekNext();
2277   if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
2278     // KeyValueNode eats the TK_Key. That way it can detect null keys.
2279     CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
2280   } else if (Type == MT_Block) {
2281     switch (T.Kind) {
2282     case Token::TK_BlockEnd:
2283       getNext();
2284       IsAtEnd = true;
2285       CurrentEntry = nullptr;
2286       break;
2287     default:
2288       setError("Unexpected token. Expected Key or Block End", T);
2289       [[fallthrough]];
2290     case Token::TK_Error:
2291       IsAtEnd = true;
2292       CurrentEntry = nullptr;
2293     }
2294   } else {
2295     switch (T.Kind) {
2296     case Token::TK_FlowEntry:
2297       // Eat the flow entry and recurse.
2298       getNext();
2299       return increment();
2300     case Token::TK_FlowMappingEnd:
2301       getNext();
2302       [[fallthrough]];
2303     case Token::TK_Error:
2304       // Set this to end iterator.
2305       IsAtEnd = true;
2306       CurrentEntry = nullptr;
2307       break;
2308     default:
2309       setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
2310                 "Mapping End."
2311               , T);
2312       IsAtEnd = true;
2313       CurrentEntry = nullptr;
2314     }
2315   }
2316 }
2317 
2318 void SequenceNode::increment() {
2319   if (failed()) {
2320     IsAtEnd = true;
2321     CurrentEntry = nullptr;
2322     return;
2323   }
2324   if (CurrentEntry)
2325     CurrentEntry->skip();
2326   Token T = peekNext();
2327   if (SeqType == ST_Block) {
2328     switch (T.Kind) {
2329     case Token::TK_BlockEntry:
2330       getNext();
2331       CurrentEntry = parseBlockNode();
2332       if (!CurrentEntry) { // An error occurred.
2333         IsAtEnd = true;
2334         CurrentEntry = nullptr;
2335       }
2336       break;
2337     case Token::TK_BlockEnd:
2338       getNext();
2339       IsAtEnd = true;
2340       CurrentEntry = nullptr;
2341       break;
2342     default:
2343       setError( "Unexpected token. Expected Block Entry or Block End."
2344               , T);
2345       [[fallthrough]];
2346     case Token::TK_Error:
2347       IsAtEnd = true;
2348       CurrentEntry = nullptr;
2349     }
2350   } else if (SeqType == ST_Indentless) {
2351     switch (T.Kind) {
2352     case Token::TK_BlockEntry:
2353       getNext();
2354       CurrentEntry = parseBlockNode();
2355       if (!CurrentEntry) { // An error occurred.
2356         IsAtEnd = true;
2357         CurrentEntry = nullptr;
2358       }
2359       break;
2360     default:
2361     case Token::TK_Error:
2362       IsAtEnd = true;
2363       CurrentEntry = nullptr;
2364     }
2365   } else if (SeqType == ST_Flow) {
2366     switch (T.Kind) {
2367     case Token::TK_FlowEntry:
2368       // Eat the flow entry and recurse.
2369       getNext();
2370       WasPreviousTokenFlowEntry = true;
2371       return increment();
2372     case Token::TK_FlowSequenceEnd:
2373       getNext();
2374       [[fallthrough]];
2375     case Token::TK_Error:
2376       // Set this to end iterator.
2377       IsAtEnd = true;
2378       CurrentEntry = nullptr;
2379       break;
2380     case Token::TK_StreamEnd:
2381     case Token::TK_DocumentEnd:
2382     case Token::TK_DocumentStart:
2383       setError("Could not find closing ]!", T);
2384       // Set this to end iterator.
2385       IsAtEnd = true;
2386       CurrentEntry = nullptr;
2387       break;
2388     default:
2389       if (!WasPreviousTokenFlowEntry) {
2390         setError("Expected , between entries!", T);
2391         IsAtEnd = true;
2392         CurrentEntry = nullptr;
2393         break;
2394       }
2395       // Otherwise it must be a flow entry.
2396       CurrentEntry = parseBlockNode();
2397       if (!CurrentEntry) {
2398         IsAtEnd = true;
2399       }
2400       WasPreviousTokenFlowEntry = false;
2401       break;
2402     }
2403   }
2404 }
2405 
2406 Document::Document(Stream &S) : stream(S), Root(nullptr) {
2407   // Tag maps starts with two default mappings.
2408   TagMap["!"] = "!";
2409   TagMap["!!"] = "tag:yaml.org,2002:";
2410 
2411   if (parseDirectives())
2412     expectToken(Token::TK_DocumentStart);
2413   Token &T = peekNext();
2414   if (T.Kind == Token::TK_DocumentStart)
2415     getNext();
2416 }
2417 
2418 bool Document::skip()  {
2419   if (stream.scanner->failed())
2420     return false;
2421   if (!Root && !getRoot())
2422     return false;
2423   Root->skip();
2424   Token &T = peekNext();
2425   if (T.Kind == Token::TK_StreamEnd)
2426     return false;
2427   if (T.Kind == Token::TK_DocumentEnd) {
2428     getNext();
2429     return skip();
2430   }
2431   return true;
2432 }
2433 
2434 Token &Document::peekNext() {
2435   return stream.scanner->peekNext();
2436 }
2437 
2438 Token Document::getNext() {
2439   return stream.scanner->getNext();
2440 }
2441 
2442 void Document::setError(const Twine &Message, Token &Location) const {
2443   stream.scanner->setError(Message, Location.Range.begin());
2444 }
2445 
2446 bool Document::failed() const {
2447   return stream.scanner->failed();
2448 }
2449 
2450 Node *Document::parseBlockNode() {
2451   Token T = peekNext();
2452   // Handle properties.
2453   Token AnchorInfo;
2454   Token TagInfo;
2455 parse_property:
2456   switch (T.Kind) {
2457   case Token::TK_Alias:
2458     getNext();
2459     return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
2460   case Token::TK_Anchor:
2461     if (AnchorInfo.Kind == Token::TK_Anchor) {
2462       setError("Already encountered an anchor for this node!", T);
2463       return nullptr;
2464     }
2465     AnchorInfo = getNext(); // Consume TK_Anchor.
2466     T = peekNext();
2467     goto parse_property;
2468   case Token::TK_Tag:
2469     if (TagInfo.Kind == Token::TK_Tag) {
2470       setError("Already encountered a tag for this node!", T);
2471       return nullptr;
2472     }
2473     TagInfo = getNext(); // Consume TK_Tag.
2474     T = peekNext();
2475     goto parse_property;
2476   default:
2477     break;
2478   }
2479 
2480   switch (T.Kind) {
2481   case Token::TK_BlockEntry:
2482     // We got an unindented BlockEntry sequence. This is not terminated with
2483     // a BlockEnd.
2484     // Don't eat the TK_BlockEntry, SequenceNode needs it.
2485     return new (NodeAllocator) SequenceNode( stream.CurrentDoc
2486                                            , AnchorInfo.Range.substr(1)
2487                                            , TagInfo.Range
2488                                            , SequenceNode::ST_Indentless);
2489   case Token::TK_BlockSequenceStart:
2490     getNext();
2491     return new (NodeAllocator)
2492       SequenceNode( stream.CurrentDoc
2493                   , AnchorInfo.Range.substr(1)
2494                   , TagInfo.Range
2495                   , SequenceNode::ST_Block);
2496   case Token::TK_BlockMappingStart:
2497     getNext();
2498     return new (NodeAllocator)
2499       MappingNode( stream.CurrentDoc
2500                  , AnchorInfo.Range.substr(1)
2501                  , TagInfo.Range
2502                  , MappingNode::MT_Block);
2503   case Token::TK_FlowSequenceStart:
2504     getNext();
2505     return new (NodeAllocator)
2506       SequenceNode( stream.CurrentDoc
2507                   , AnchorInfo.Range.substr(1)
2508                   , TagInfo.Range
2509                   , SequenceNode::ST_Flow);
2510   case Token::TK_FlowMappingStart:
2511     getNext();
2512     return new (NodeAllocator)
2513       MappingNode( stream.CurrentDoc
2514                  , AnchorInfo.Range.substr(1)
2515                  , TagInfo.Range
2516                  , MappingNode::MT_Flow);
2517   case Token::TK_Scalar:
2518     getNext();
2519     return new (NodeAllocator)
2520       ScalarNode( stream.CurrentDoc
2521                 , AnchorInfo.Range.substr(1)
2522                 , TagInfo.Range
2523                 , T.Range);
2524   case Token::TK_BlockScalar: {
2525     getNext();
2526     StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
2527     StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
2528     return new (NodeAllocator)
2529         BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
2530                         TagInfo.Range, StrCopy, T.Range);
2531   }
2532   case Token::TK_Key:
2533     // Don't eat the TK_Key, KeyValueNode expects it.
2534     return new (NodeAllocator)
2535       MappingNode( stream.CurrentDoc
2536                  , AnchorInfo.Range.substr(1)
2537                  , TagInfo.Range
2538                  , MappingNode::MT_Inline);
2539   case Token::TK_DocumentStart:
2540   case Token::TK_DocumentEnd:
2541   case Token::TK_StreamEnd:
2542   default:
2543     // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2544     //       !!null null.
2545     return new (NodeAllocator) NullNode(stream.CurrentDoc);
2546   case Token::TK_FlowMappingEnd:
2547   case Token::TK_FlowSequenceEnd:
2548   case Token::TK_FlowEntry: {
2549     if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root)))
2550       return new (NodeAllocator) NullNode(stream.CurrentDoc);
2551 
2552     setError("Unexpected token", T);
2553     return nullptr;
2554   }
2555   case Token::TK_Error:
2556     return nullptr;
2557   }
2558   llvm_unreachable("Control flow shouldn't reach here.");
2559   return nullptr;
2560 }
2561 
2562 bool Document::parseDirectives() {
2563   bool isDirective = false;
2564   while (true) {
2565     Token T = peekNext();
2566     if (T.Kind == Token::TK_TagDirective) {
2567       parseTAGDirective();
2568       isDirective = true;
2569     } else if (T.Kind == Token::TK_VersionDirective) {
2570       parseYAMLDirective();
2571       isDirective = true;
2572     } else
2573       break;
2574   }
2575   return isDirective;
2576 }
2577 
2578 void Document::parseYAMLDirective() {
2579   getNext(); // Eat %YAML <version>
2580 }
2581 
2582 void Document::parseTAGDirective() {
2583   Token Tag = getNext(); // %TAG <handle> <prefix>
2584   StringRef T = Tag.Range;
2585   // Strip %TAG
2586   T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
2587   std::size_t HandleEnd = T.find_first_of(" \t");
2588   StringRef TagHandle = T.substr(0, HandleEnd);
2589   StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
2590   TagMap[TagHandle] = TagPrefix;
2591 }
2592 
2593 bool Document::expectToken(int TK) {
2594   Token T = getNext();
2595   if (T.Kind != TK) {
2596     setError("Unexpected token", T);
2597     return false;
2598   }
2599   return true;
2600 }
2601