1//===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9//  This file implements a YAML parser.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/Support/YAMLParser.h"
14#include "llvm/ADT/AllocatorList.h"
15#include "llvm/ADT/ArrayRef.h"
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallString.h"
18#include "llvm/ADT/SmallVector.h"
19#include "llvm/ADT/StringExtras.h"
20#include "llvm/ADT/StringRef.h"
21#include "llvm/ADT/Twine.h"
22#include "llvm/Support/Compiler.h"
23#include "llvm/Support/ErrorHandling.h"
24#include "llvm/Support/MemoryBuffer.h"
25#include "llvm/Support/SMLoc.h"
26#include "llvm/Support/SourceMgr.h"
27#include "llvm/Support/Unicode.h"
28#include "llvm/Support/raw_ostream.h"
29#include <cassert>
30#include <cstddef>
31#include <cstdint>
32#include <map>
33#include <memory>
34#include <string>
35#include <system_error>
36#include <utility>
37
38using namespace llvm;
39using namespace yaml;
40
41enum UnicodeEncodingForm {
42  UEF_UTF32_LE, ///< UTF-32 Little Endian
43  UEF_UTF32_BE, ///< UTF-32 Big Endian
44  UEF_UTF16_LE, ///< UTF-16 Little Endian
45  UEF_UTF16_BE, ///< UTF-16 Big Endian
46  UEF_UTF8,     ///< UTF-8 or ascii.
47  UEF_Unknown   ///< Not a valid Unicode encoding.
48};
49
50/// EncodingInfo - Holds the encoding type and length of the byte order mark if
51///                it exists. Length is in {0, 2, 3, 4}.
52using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;
53
54/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
55///                      encoding form of \a Input.
56///
57/// @param Input A string of length 0 or more.
58/// @returns An EncodingInfo indicating the Unicode encoding form of the input
59///          and how long the byte order mark is if one exists.
60static EncodingInfo getUnicodeEncoding(StringRef Input) {
61  if (Input.empty())
62    return std::make_pair(UEF_Unknown, 0);
63
64  switch (uint8_t(Input[0])) {
65  case 0x00:
66    if (Input.size() >= 4) {
67      if (  Input[1] == 0
68         && uint8_t(Input[2]) == 0xFE
69         && uint8_t(Input[3]) == 0xFF)
70        return std::make_pair(UEF_UTF32_BE, 4);
71      if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
72        return std::make_pair(UEF_UTF32_BE, 0);
73    }
74
75    if (Input.size() >= 2 && Input[1] != 0)
76      return std::make_pair(UEF_UTF16_BE, 0);
77    return std::make_pair(UEF_Unknown, 0);
78  case 0xFF:
79    if (  Input.size() >= 4
80       && uint8_t(Input[1]) == 0xFE
81       && Input[2] == 0
82       && Input[3] == 0)
83      return std::make_pair(UEF_UTF32_LE, 4);
84
85    if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
86      return std::make_pair(UEF_UTF16_LE, 2);
87    return std::make_pair(UEF_Unknown, 0);
88  case 0xFE:
89    if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
90      return std::make_pair(UEF_UTF16_BE, 2);
91    return std::make_pair(UEF_Unknown, 0);
92  case 0xEF:
93    if (  Input.size() >= 3
94       && uint8_t(Input[1]) == 0xBB
95       && uint8_t(Input[2]) == 0xBF)
96      return std::make_pair(UEF_UTF8, 3);
97    return std::make_pair(UEF_Unknown, 0);
98  }
99
100  // It could still be utf-32 or utf-16.
101  if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
102    return std::make_pair(UEF_UTF32_LE, 0);
103
104  if (Input.size() >= 2 && Input[1] == 0)
105    return std::make_pair(UEF_UTF16_LE, 0);
106
107  return std::make_pair(UEF_UTF8, 0);
108}
109
110/// Pin the vtables to this file.
111void Node::anchor() {}
112void NullNode::anchor() {}
113void ScalarNode::anchor() {}
114void BlockScalarNode::anchor() {}
115void KeyValueNode::anchor() {}
116void MappingNode::anchor() {}
117void SequenceNode::anchor() {}
118void AliasNode::anchor() {}
119
120namespace llvm {
121namespace yaml {
122
123/// Token - A single YAML token.
124struct Token {
125  enum TokenKind {
126    TK_Error, // Uninitialized token.
127    TK_StreamStart,
128    TK_StreamEnd,
129    TK_VersionDirective,
130    TK_TagDirective,
131    TK_DocumentStart,
132    TK_DocumentEnd,
133    TK_BlockEntry,
134    TK_BlockEnd,
135    TK_BlockSequenceStart,
136    TK_BlockMappingStart,
137    TK_FlowEntry,
138    TK_FlowSequenceStart,
139    TK_FlowSequenceEnd,
140    TK_FlowMappingStart,
141    TK_FlowMappingEnd,
142    TK_Key,
143    TK_Value,
144    TK_Scalar,
145    TK_BlockScalar,
146    TK_Alias,
147    TK_Anchor,
148    TK_Tag
149  } Kind = TK_Error;
150
151  /// A string of length 0 or more whose begin() points to the logical location
152  /// of the token in the input.
153  StringRef Range;
154
155  /// The value of a block scalar node.
156  std::string Value;
157
158  Token() = default;
159};
160
161} // end namespace yaml
162} // end namespace llvm
163
164using TokenQueueT = BumpPtrList<Token>;
165
166namespace {
167
168/// This struct is used to track simple keys.
169///
170/// Simple keys are handled by creating an entry in SimpleKeys for each Token
171/// which could legally be the start of a simple key. When peekNext is called,
172/// if the Token To be returned is referenced by a SimpleKey, we continue
173/// tokenizing until that potential simple key has either been found to not be
174/// a simple key (we moved on to the next line or went further than 1024 chars).
175/// Or when we run into a Value, and then insert a Key token (and possibly
176/// others) before the SimpleKey's Tok.
177struct SimpleKey {
178  TokenQueueT::iterator Tok;
179  unsigned Column = 0;
180  unsigned Line = 0;
181  unsigned FlowLevel = 0;
182  bool IsRequired = false;
183
184  bool operator ==(const SimpleKey &Other) {
185    return Tok == Other.Tok;
186  }
187};
188
189} // end anonymous namespace
190
191/// The Unicode scalar value of a UTF-8 minimal well-formed code unit
192///        subsequence and the subsequence's length in code units (uint8_t).
193///        A length of 0 represents an error.
194using UTF8Decoded = std::pair<uint32_t, unsigned>;
195
196static UTF8Decoded decodeUTF8(StringRef Range) {
197  StringRef::iterator Position= Range.begin();
198  StringRef::iterator End = Range.end();
199  // 1 byte: [0x00, 0x7f]
200  // Bit pattern: 0xxxxxxx
201  if (Position < End && (*Position & 0x80) == 0) {
202    return std::make_pair(*Position, 1);
203  }
204  // 2 bytes: [0x80, 0x7ff]
205  // Bit pattern: 110xxxxx 10xxxxxx
206  if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) &&
207      ((*(Position + 1) & 0xC0) == 0x80)) {
208    uint32_t codepoint = ((*Position & 0x1F) << 6) |
209                          (*(Position + 1) & 0x3F);
210    if (codepoint >= 0x80)
211      return std::make_pair(codepoint, 2);
212  }
213  // 3 bytes: [0x8000, 0xffff]
214  // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
215  if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) &&
216      ((*(Position + 1) & 0xC0) == 0x80) &&
217      ((*(Position + 2) & 0xC0) == 0x80)) {
218    uint32_t codepoint = ((*Position & 0x0F) << 12) |
219                         ((*(Position + 1) & 0x3F) << 6) |
220                          (*(Position + 2) & 0x3F);
221    // Codepoints between 0xD800 and 0xDFFF are invalid, as
222    // they are high / low surrogate halves used by UTF-16.
223    if (codepoint >= 0x800 &&
224        (codepoint < 0xD800 || codepoint > 0xDFFF))
225      return std::make_pair(codepoint, 3);
226  }
227  // 4 bytes: [0x10000, 0x10FFFF]
228  // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
229  if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) &&
230      ((*(Position + 1) & 0xC0) == 0x80) &&
231      ((*(Position + 2) & 0xC0) == 0x80) &&
232      ((*(Position + 3) & 0xC0) == 0x80)) {
233    uint32_t codepoint = ((*Position & 0x07) << 18) |
234                         ((*(Position + 1) & 0x3F) << 12) |
235                         ((*(Position + 2) & 0x3F) << 6) |
236                          (*(Position + 3) & 0x3F);
237    if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
238      return std::make_pair(codepoint, 4);
239  }
240  return std::make_pair(0, 0);
241}
242
243namespace llvm {
244namespace yaml {
245
246/// Scans YAML tokens from a MemoryBuffer.
247class Scanner {
248public:
249  Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true,
250          std::error_code *EC = nullptr);
251  Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true,
252          std::error_code *EC = nullptr);
253
254  /// Parse the next token and return it without popping it.
255  Token &peekNext();
256
257  /// Parse the next token and pop it from the queue.
258  Token getNext();
259
260  void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
261                  ArrayRef<SMRange> Ranges = std::nullopt) {
262    SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ std::nullopt,
263                    ShowColors);
264  }
265
266  void setError(const Twine &Message, StringRef::iterator Position) {
267    if (Position >= End)
268      Position = End - 1;
269
270    // propagate the error if possible
271    if (EC)
272      *EC = make_error_code(std::errc::invalid_argument);
273
274    // Don't print out more errors after the first one we encounter. The rest
275    // are just the result of the first, and have no meaning.
276    if (!Failed)
277      printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message);
278    Failed = true;
279  }
280
281  /// Returns true if an error occurred while parsing.
282  bool failed() {
283    return Failed;
284  }
285
286private:
287  void init(MemoryBufferRef Buffer);
288
289  StringRef currentInput() {
290    return StringRef(Current, End - Current);
291  }
292
293  /// Decode a UTF-8 minimal well-formed code unit subsequence starting
294  ///        at \a Position.
295  ///
296  /// If the UTF-8 code units starting at Position do not form a well-formed
297  /// code unit subsequence, then the Unicode scalar value is 0, and the length
298  /// is 0.
299  UTF8Decoded decodeUTF8(StringRef::iterator Position) {
300    return ::decodeUTF8(StringRef(Position, End - Position));
301  }
302
303  // The following functions are based on the gramar rules in the YAML spec. The
304  // style of the function names it meant to closely match how they are written
305  // in the spec. The number within the [] is the number of the grammar rule in
306  // the spec.
307  //
308  // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
309  //
310  // c-
311  //   A production starting and ending with a special character.
312  // b-
313  //   A production matching a single line break.
314  // nb-
315  //   A production starting and ending with a non-break character.
316  // s-
317  //   A production starting and ending with a white space character.
318  // ns-
319  //   A production starting and ending with a non-space character.
320  // l-
321  //   A production matching complete line(s).
322
323  /// Skip a single nb-char[27] starting at Position.
324  ///
325  /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
326  ///                  | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
327  ///
328  /// @returns The code unit after the nb-char, or Position if it's not an
329  ///          nb-char.
330  StringRef::iterator skip_nb_char(StringRef::iterator Position);
331
332  /// Skip a single b-break[28] starting at Position.
333  ///
334  /// A b-break is 0xD 0xA | 0xD | 0xA
335  ///
336  /// @returns The code unit after the b-break, or Position if it's not a
337  ///          b-break.
338  StringRef::iterator skip_b_break(StringRef::iterator Position);
339
340  /// Skip a single s-space[31] starting at Position.
341  ///
342  /// An s-space is 0x20
343  ///
344  /// @returns The code unit after the s-space, or Position if it's not a
345  ///          s-space.
346  StringRef::iterator skip_s_space(StringRef::iterator Position);
347
348  /// Skip a single s-white[33] starting at Position.
349  ///
350  /// A s-white is 0x20 | 0x9
351  ///
352  /// @returns The code unit after the s-white, or Position if it's not a
353  ///          s-white.
354  StringRef::iterator skip_s_white(StringRef::iterator Position);
355
356  /// Skip a single ns-char[34] starting at Position.
357  ///
358  /// A ns-char is nb-char - s-white
359  ///
360  /// @returns The code unit after the ns-char, or Position if it's not a
361  ///          ns-char.
362  StringRef::iterator skip_ns_char(StringRef::iterator Position);
363
364  using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator);
365
366  /// Skip minimal well-formed code unit subsequences until Func
367  ///        returns its input.
368  ///
369  /// @returns The code unit after the last minimal well-formed code unit
370  ///          subsequence that Func accepted.
371  StringRef::iterator skip_while( SkipWhileFunc Func
372                                , StringRef::iterator Position);
373
374  /// Skip minimal well-formed code unit subsequences until Func returns its
375  /// input.
376  void advanceWhile(SkipWhileFunc Func);
377
378  /// Scan ns-uri-char[39]s starting at Cur.
379  ///
380  /// This updates Cur and Column while scanning.
381  void scan_ns_uri_char();
382
383  /// Consume a minimal well-formed code unit subsequence starting at
384  ///        \a Cur. Return false if it is not the same Unicode scalar value as
385  ///        \a Expected. This updates \a Column.
386  bool consume(uint32_t Expected);
387
388  /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
389  void skip(uint32_t Distance);
390
391  /// Return true if the minimal well-formed code unit subsequence at
392  ///        Pos is whitespace or a new line
393  bool isBlankOrBreak(StringRef::iterator Position);
394
395  /// Return true if the minimal well-formed code unit subsequence at
396  ///        Pos is considered a "safe" character for plain scalars.
397  bool isPlainSafeNonBlank(StringRef::iterator Position);
398
399  /// Return true if the line is a line break, false otherwise.
400  bool isLineEmpty(StringRef Line);
401
402  /// Consume a single b-break[28] if it's present at the current position.
403  ///
404  /// Return false if the code unit at the current position isn't a line break.
405  bool consumeLineBreakIfPresent();
406
407  /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
408  void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
409                             , unsigned AtColumn
410                             , bool IsRequired);
411
412  /// Remove simple keys that can no longer be valid simple keys.
413  ///
414  /// Invalid simple keys are not on the current line or are further than 1024
415  /// columns back.
416  void removeStaleSimpleKeyCandidates();
417
418  /// Remove all simple keys on FlowLevel \a Level.
419  void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
420
421  /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
422  ///        tokens if needed.
423  bool unrollIndent(int ToColumn);
424
425  /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
426  ///        if needed.
427  bool rollIndent( int ToColumn
428                 , Token::TokenKind Kind
429                 , TokenQueueT::iterator InsertPoint);
430
431  /// Skip a single-line comment when the comment starts at the current
432  /// position of the scanner.
433  void skipComment();
434
435  /// Skip whitespace and comments until the start of the next token.
436  void scanToNextToken();
437
438  /// Must be the first token generated.
439  bool scanStreamStart();
440
441  /// Generate tokens needed to close out the stream.
442  bool scanStreamEnd();
443
444  /// Scan a %BLAH directive.
445  bool scanDirective();
446
447  /// Scan a ... or ---.
448  bool scanDocumentIndicator(bool IsStart);
449
450  /// Scan a [ or { and generate the proper flow collection start token.
451  bool scanFlowCollectionStart(bool IsSequence);
452
453  /// Scan a ] or } and generate the proper flow collection end token.
454  bool scanFlowCollectionEnd(bool IsSequence);
455
456  /// Scan the , that separates entries in a flow collection.
457  bool scanFlowEntry();
458
459  /// Scan the - that starts block sequence entries.
460  bool scanBlockEntry();
461
462  /// Scan an explicit ? indicating a key.
463  bool scanKey();
464
465  /// Scan an explicit : indicating a value.
466  bool scanValue();
467
468  /// Scan a quoted scalar.
469  bool scanFlowScalar(bool IsDoubleQuoted);
470
471  /// Scan an unquoted scalar.
472  bool scanPlainScalar();
473
474  /// Scan an Alias or Anchor starting with * or &.
475  bool scanAliasOrAnchor(bool IsAlias);
476
477  /// Scan a block scalar starting with | or >.
478  bool scanBlockScalar(bool IsLiteral);
479
480  /// Scan a block scalar style indicator and header.
481  ///
482  /// Note: This is distinct from scanBlockScalarHeader to mirror the fact that
483  /// YAML does not consider the style indicator to be a part of the header.
484  ///
485  /// Return false if an error occurred.
486  bool scanBlockScalarIndicators(char &StyleIndicator, char &ChompingIndicator,
487                                 unsigned &IndentIndicator, bool &IsDone);
488
489  /// Scan a style indicator in a block scalar header.
490  char scanBlockStyleIndicator();
491
492  /// Scan a chomping indicator in a block scalar header.
493  char scanBlockChompingIndicator();
494
495  /// Scan an indentation indicator in a block scalar header.
496  unsigned scanBlockIndentationIndicator();
497
498  /// Scan a block scalar header.
499  ///
500  /// Return false if an error occurred.
501  bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
502                             bool &IsDone);
503
504  /// Look for the indentation level of a block scalar.
505  ///
506  /// Return false if an error occurred.
507  bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
508                             unsigned &LineBreaks, bool &IsDone);
509
510  /// Scan the indentation of a text line in a block scalar.
511  ///
512  /// Return false if an error occurred.
513  bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
514                             bool &IsDone);
515
516  /// Scan a tag of the form !stuff.
517  bool scanTag();
518
519  /// Dispatch to the next scanning function based on \a *Cur.
520  bool fetchMoreTokens();
521
522  /// The SourceMgr used for diagnostics and buffer management.
523  SourceMgr &SM;
524
525  /// The original input.
526  MemoryBufferRef InputBuffer;
527
528  /// The current position of the scanner.
529  StringRef::iterator Current;
530
531  /// The end of the input (one past the last character).
532  StringRef::iterator End;
533
534  /// Current YAML indentation level in spaces.
535  int Indent;
536
537  /// Current column number in Unicode code points.
538  unsigned Column;
539
540  /// Current line number.
541  unsigned Line;
542
543  /// How deep we are in flow style containers. 0 Means at block level.
544  unsigned FlowLevel;
545
546  /// Are we at the start of the stream?
547  bool IsStartOfStream;
548
549  /// Can the next token be the start of a simple key?
550  bool IsSimpleKeyAllowed;
551
552  /// Can the next token be a value indicator even if it does not have a
553  /// trailing space?
554  bool IsAdjacentValueAllowedInFlow;
555
556  /// True if an error has occurred.
557  bool Failed;
558
559  /// Should colors be used when printing out the diagnostic messages?
560  bool ShowColors;
561
562  /// Queue of tokens. This is required to queue up tokens while looking
563  ///        for the end of a simple key. And for cases where a single character
564  ///        can produce multiple tokens (e.g. BlockEnd).
565  TokenQueueT TokenQueue;
566
567  /// Indentation levels.
568  SmallVector<int, 4> Indents;
569
570  /// Potential simple keys.
571  SmallVector<SimpleKey, 4> SimpleKeys;
572
573  std::error_code *EC;
574};
575
576} // end namespace yaml
577} // end namespace llvm
578
579/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
580static void encodeUTF8( uint32_t UnicodeScalarValue
581                      , SmallVectorImpl<char> &Result) {
582  if (UnicodeScalarValue <= 0x7F) {
583    Result.push_back(UnicodeScalarValue & 0x7F);
584  } else if (UnicodeScalarValue <= 0x7FF) {
585    uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
586    uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
587    Result.push_back(FirstByte);
588    Result.push_back(SecondByte);
589  } else if (UnicodeScalarValue <= 0xFFFF) {
590    uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
591    uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
592    uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
593    Result.push_back(FirstByte);
594    Result.push_back(SecondByte);
595    Result.push_back(ThirdByte);
596  } else if (UnicodeScalarValue <= 0x10FFFF) {
597    uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
598    uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
599    uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
600    uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
601    Result.push_back(FirstByte);
602    Result.push_back(SecondByte);
603    Result.push_back(ThirdByte);
604    Result.push_back(FourthByte);
605  }
606}
607
608bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
609  SourceMgr SM;
610  Scanner scanner(Input, SM);
611  while (true) {
612    Token T = scanner.getNext();
613    switch (T.Kind) {
614    case Token::TK_StreamStart:
615      OS << "Stream-Start: ";
616      break;
617    case Token::TK_StreamEnd:
618      OS << "Stream-End: ";
619      break;
620    case Token::TK_VersionDirective:
621      OS << "Version-Directive: ";
622      break;
623    case Token::TK_TagDirective:
624      OS << "Tag-Directive: ";
625      break;
626    case Token::TK_DocumentStart:
627      OS << "Document-Start: ";
628      break;
629    case Token::TK_DocumentEnd:
630      OS << "Document-End: ";
631      break;
632    case Token::TK_BlockEntry:
633      OS << "Block-Entry: ";
634      break;
635    case Token::TK_BlockEnd:
636      OS << "Block-End: ";
637      break;
638    case Token::TK_BlockSequenceStart:
639      OS << "Block-Sequence-Start: ";
640      break;
641    case Token::TK_BlockMappingStart:
642      OS << "Block-Mapping-Start: ";
643      break;
644    case Token::TK_FlowEntry:
645      OS << "Flow-Entry: ";
646      break;
647    case Token::TK_FlowSequenceStart:
648      OS << "Flow-Sequence-Start: ";
649      break;
650    case Token::TK_FlowSequenceEnd:
651      OS << "Flow-Sequence-End: ";
652      break;
653    case Token::TK_FlowMappingStart:
654      OS << "Flow-Mapping-Start: ";
655      break;
656    case Token::TK_FlowMappingEnd:
657      OS << "Flow-Mapping-End: ";
658      break;
659    case Token::TK_Key:
660      OS << "Key: ";
661      break;
662    case Token::TK_Value:
663      OS << "Value: ";
664      break;
665    case Token::TK_Scalar:
666      OS << "Scalar: ";
667      break;
668    case Token::TK_BlockScalar:
669      OS << "Block Scalar: ";
670      break;
671    case Token::TK_Alias:
672      OS << "Alias: ";
673      break;
674    case Token::TK_Anchor:
675      OS << "Anchor: ";
676      break;
677    case Token::TK_Tag:
678      OS << "Tag: ";
679      break;
680    case Token::TK_Error:
681      break;
682    }
683    OS << T.Range << "\n";
684    if (T.Kind == Token::TK_StreamEnd)
685      break;
686    else if (T.Kind == Token::TK_Error)
687      return false;
688  }
689  return true;
690}
691
692bool yaml::scanTokens(StringRef Input) {
693  SourceMgr SM;
694  Scanner scanner(Input, SM);
695  while (true) {
696    Token T = scanner.getNext();
697    if (T.Kind == Token::TK_StreamEnd)
698      break;
699    else if (T.Kind == Token::TK_Error)
700      return false;
701  }
702  return true;
703}
704
705std::string yaml::escape(StringRef Input, bool EscapePrintable) {
706  std::string EscapedInput;
707  for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
708    if (*i == '\\')
709      EscapedInput += "\\\\";
710    else if (*i == '"')
711      EscapedInput += "\\\"";
712    else if (*i == 0)
713      EscapedInput += "\\0";
714    else if (*i == 0x07)
715      EscapedInput += "\\a";
716    else if (*i == 0x08)
717      EscapedInput += "\\b";
718    else if (*i == 0x09)
719      EscapedInput += "\\t";
720    else if (*i == 0x0A)
721      EscapedInput += "\\n";
722    else if (*i == 0x0B)
723      EscapedInput += "\\v";
724    else if (*i == 0x0C)
725      EscapedInput += "\\f";
726    else if (*i == 0x0D)
727      EscapedInput += "\\r";
728    else if (*i == 0x1B)
729      EscapedInput += "\\e";
730    else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
731      std::string HexStr = utohexstr(*i);
732      EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
733    } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
734      UTF8Decoded UnicodeScalarValue
735        = decodeUTF8(StringRef(i, Input.end() - i));
736      if (UnicodeScalarValue.second == 0) {
737        // Found invalid char.
738        SmallString<4> Val;
739        encodeUTF8(0xFFFD, Val);
740        llvm::append_range(EscapedInput, Val);
741        // FIXME: Error reporting.
742        return EscapedInput;
743      }
744      if (UnicodeScalarValue.first == 0x85)
745        EscapedInput += "\\N";
746      else if (UnicodeScalarValue.first == 0xA0)
747        EscapedInput += "\\_";
748      else if (UnicodeScalarValue.first == 0x2028)
749        EscapedInput += "\\L";
750      else if (UnicodeScalarValue.first == 0x2029)
751        EscapedInput += "\\P";
752      else if (!EscapePrintable &&
753               sys::unicode::isPrintable(UnicodeScalarValue.first))
754        EscapedInput += StringRef(i, UnicodeScalarValue.second);
755      else {
756        std::string HexStr = utohexstr(UnicodeScalarValue.first);
757        if (HexStr.size() <= 2)
758          EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
759        else if (HexStr.size() <= 4)
760          EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
761        else if (HexStr.size() <= 8)
762          EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
763      }
764      i += UnicodeScalarValue.second - 1;
765    } else
766      EscapedInput.push_back(*i);
767  }
768  return EscapedInput;
769}
770
771std::optional<bool> yaml::parseBool(StringRef S) {
772  switch (S.size()) {
773  case 1:
774    switch (S.front()) {
775    case 'y':
776    case 'Y':
777      return true;
778    case 'n':
779    case 'N':
780      return false;
781    default:
782      return std::nullopt;
783    }
784  case 2:
785    switch (S.front()) {
786    case 'O':
787      if (S[1] == 'N') // ON
788        return true;
789      [[fallthrough]];
790    case 'o':
791      if (S[1] == 'n') //[Oo]n
792        return true;
793      return std::nullopt;
794    case 'N':
795      if (S[1] == 'O') // NO
796        return false;
797      [[fallthrough]];
798    case 'n':
799      if (S[1] == 'o') //[Nn]o
800        return false;
801      return std::nullopt;
802    default:
803      return std::nullopt;
804    }
805  case 3:
806    switch (S.front()) {
807    case 'O':
808      if (S.drop_front() == "FF") // OFF
809        return false;
810      [[fallthrough]];
811    case 'o':
812      if (S.drop_front() == "ff") //[Oo]ff
813        return false;
814      return std::nullopt;
815    case 'Y':
816      if (S.drop_front() == "ES") // YES
817        return true;
818      [[fallthrough]];
819    case 'y':
820      if (S.drop_front() == "es") //[Yy]es
821        return true;
822      return std::nullopt;
823    default:
824      return std::nullopt;
825    }
826  case 4:
827    switch (S.front()) {
828    case 'T':
829      if (S.drop_front() == "RUE") // TRUE
830        return true;
831      [[fallthrough]];
832    case 't':
833      if (S.drop_front() == "rue") //[Tt]rue
834        return true;
835      return std::nullopt;
836    default:
837      return std::nullopt;
838    }
839  case 5:
840    switch (S.front()) {
841    case 'F':
842      if (S.drop_front() == "ALSE") // FALSE
843        return false;
844      [[fallthrough]];
845    case 'f':
846      if (S.drop_front() == "alse") //[Ff]alse
847        return false;
848      return std::nullopt;
849    default:
850      return std::nullopt;
851    }
852  default:
853    return std::nullopt;
854  }
855}
856
857Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors,
858                 std::error_code *EC)
859    : SM(sm), ShowColors(ShowColors), EC(EC) {
860  init(MemoryBufferRef(Input, "YAML"));
861}
862
863Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors,
864                 std::error_code *EC)
865    : SM(SM_), ShowColors(ShowColors), EC(EC) {
866  init(Buffer);
867}
868
869void Scanner::init(MemoryBufferRef Buffer) {
870  InputBuffer = Buffer;
871  Current = InputBuffer.getBufferStart();
872  End = InputBuffer.getBufferEnd();
873  Indent = -1;
874  Column = 0;
875  Line = 0;
876  FlowLevel = 0;
877  IsStartOfStream = true;
878  IsSimpleKeyAllowed = true;
879  IsAdjacentValueAllowedInFlow = false;
880  Failed = false;
881  std::unique_ptr<MemoryBuffer> InputBufferOwner =
882      MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false);
883  SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
884}
885
886Token &Scanner::peekNext() {
887  // If the current token is a possible simple key, keep parsing until we
888  // can confirm.
889  bool NeedMore = false;
890  while (true) {
891    if (TokenQueue.empty() || NeedMore) {
892      if (!fetchMoreTokens()) {
893        TokenQueue.clear();
894        SimpleKeys.clear();
895        TokenQueue.push_back(Token());
896        return TokenQueue.front();
897      }
898    }
899    assert(!TokenQueue.empty() &&
900            "fetchMoreTokens lied about getting tokens!");
901
902    removeStaleSimpleKeyCandidates();
903    SimpleKey SK;
904    SK.Tok = TokenQueue.begin();
905    if (!is_contained(SimpleKeys, SK))
906      break;
907    else
908      NeedMore = true;
909  }
910  return TokenQueue.front();
911}
912
913Token Scanner::getNext() {
914  Token Ret = peekNext();
915  // TokenQueue can be empty if there was an error getting the next token.
916  if (!TokenQueue.empty())
917    TokenQueue.pop_front();
918
919  // There cannot be any referenced Token's if the TokenQueue is empty. So do a
920  // quick deallocation of them all.
921  if (TokenQueue.empty())
922    TokenQueue.resetAlloc();
923
924  return Ret;
925}
926
927StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
928  if (Position == End)
929    return Position;
930  // Check 7 bit c-printable - b-char.
931  if (   *Position == 0x09
932      || (*Position >= 0x20 && *Position <= 0x7E))
933    return Position + 1;
934
935  // Check for valid UTF-8.
936  if (uint8_t(*Position) & 0x80) {
937    UTF8Decoded u8d = decodeUTF8(Position);
938    if (   u8d.second != 0
939        && u8d.first != 0xFEFF
940        && ( u8d.first == 0x85
941          || ( u8d.first >= 0xA0
942            && u8d.first <= 0xD7FF)
943          || ( u8d.first >= 0xE000
944            && u8d.first <= 0xFFFD)
945          || ( u8d.first >= 0x10000
946            && u8d.first <= 0x10FFFF)))
947      return Position + u8d.second;
948  }
949  return Position;
950}
951
952StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
953  if (Position == End)
954    return Position;
955  if (*Position == 0x0D) {
956    if (Position + 1 != End && *(Position + 1) == 0x0A)
957      return Position + 2;
958    return Position + 1;
959  }
960
961  if (*Position == 0x0A)
962    return Position + 1;
963  return Position;
964}
965
966StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
967  if (Position == End)
968    return Position;
969  if (*Position == ' ')
970    return Position + 1;
971  return Position;
972}
973
974StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
975  if (Position == End)
976    return Position;
977  if (*Position == ' ' || *Position == '\t')
978    return Position + 1;
979  return Position;
980}
981
982StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
983  if (Position == End)
984    return Position;
985  if (*Position == ' ' || *Position == '\t')
986    return Position;
987  return skip_nb_char(Position);
988}
989
990StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
991                                       , StringRef::iterator Position) {
992  while (true) {
993    StringRef::iterator i = (this->*Func)(Position);
994    if (i == Position)
995      break;
996    Position = i;
997  }
998  return Position;
999}
1000
1001void Scanner::advanceWhile(SkipWhileFunc Func) {
1002  auto Final = skip_while(Func, Current);
1003  Column += Final - Current;
1004  Current = Final;
1005}
1006
1007static bool is_ns_hex_digit(const char C) { return isAlnum(C); }
1008
1009static bool is_ns_word_char(const char C) { return C == '-' || isAlpha(C); }
1010
1011void Scanner::scan_ns_uri_char() {
1012  while (true) {
1013    if (Current == End)
1014      break;
1015    if ((   *Current == '%'
1016          && Current + 2 < End
1017          && is_ns_hex_digit(*(Current + 1))
1018          && is_ns_hex_digit(*(Current + 2)))
1019        || is_ns_word_char(*Current)
1020        || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
1021          != StringRef::npos) {
1022      ++Current;
1023      ++Column;
1024    } else
1025      break;
1026  }
1027}
1028
1029bool Scanner::consume(uint32_t Expected) {
1030  if (Expected >= 0x80) {
1031    setError("Cannot consume non-ascii characters", Current);
1032    return false;
1033  }
1034  if (Current == End)
1035    return false;
1036  if (uint8_t(*Current) >= 0x80) {
1037    setError("Cannot consume non-ascii characters", Current);
1038    return false;
1039  }
1040  if (uint8_t(*Current) == Expected) {
1041    ++Current;
1042    ++Column;
1043    return true;
1044  }
1045  return false;
1046}
1047
1048void Scanner::skip(uint32_t Distance) {
1049  Current += Distance;
1050  Column += Distance;
1051  assert(Current <= End && "Skipped past the end");
1052}
1053
1054bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
1055  if (Position == End)
1056    return false;
1057  return *Position == ' ' || *Position == '\t' || *Position == '\r' ||
1058         *Position == '\n';
1059}
1060
1061bool Scanner::isPlainSafeNonBlank(StringRef::iterator Position) {
1062  if (Position == End || isBlankOrBreak(Position))
1063    return false;
1064  if (FlowLevel &&
1065      StringRef(Position, 1).find_first_of(",[]{}") != StringRef::npos)
1066    return false;
1067  return true;
1068}
1069
1070bool Scanner::isLineEmpty(StringRef Line) {
1071  for (const auto *Position = Line.begin(); Position != Line.end(); ++Position)
1072    if (!isBlankOrBreak(Position))
1073      return false;
1074  return true;
1075}
1076
1077bool Scanner::consumeLineBreakIfPresent() {
1078  auto Next = skip_b_break(Current);
1079  if (Next == Current)
1080    return false;
1081  Column = 0;
1082  ++Line;
1083  Current = Next;
1084  return true;
1085}
1086
1087void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
1088                                    , unsigned AtColumn
1089                                    , bool IsRequired) {
1090  if (IsSimpleKeyAllowed) {
1091    SimpleKey SK;
1092    SK.Tok = Tok;
1093    SK.Line = Line;
1094    SK.Column = AtColumn;
1095    SK.IsRequired = IsRequired;
1096    SK.FlowLevel = FlowLevel;
1097    SimpleKeys.push_back(SK);
1098  }
1099}
1100
1101void Scanner::removeStaleSimpleKeyCandidates() {
1102  for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
1103                                            i != SimpleKeys.end();) {
1104    if (i->Line != Line || i->Column + 1024 < Column) {
1105      if (i->IsRequired)
1106        setError( "Could not find expected : for simple key"
1107                , i->Tok->Range.begin());
1108      i = SimpleKeys.erase(i);
1109    } else
1110      ++i;
1111  }
1112}
1113
1114void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
1115  if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
1116    SimpleKeys.pop_back();
1117}
1118
1119bool Scanner::unrollIndent(int ToColumn) {
1120  Token T;
1121  // Indentation is ignored in flow.
1122  if (FlowLevel != 0)
1123    return true;
1124
1125  while (Indent > ToColumn) {
1126    T.Kind = Token::TK_BlockEnd;
1127    T.Range = StringRef(Current, 1);
1128    TokenQueue.push_back(T);
1129    Indent = Indents.pop_back_val();
1130  }
1131
1132  return true;
1133}
1134
1135bool Scanner::rollIndent( int ToColumn
1136                        , Token::TokenKind Kind
1137                        , TokenQueueT::iterator InsertPoint) {
1138  if (FlowLevel)
1139    return true;
1140  if (Indent < ToColumn) {
1141    Indents.push_back(Indent);
1142    Indent = ToColumn;
1143
1144    Token T;
1145    T.Kind = Kind;
1146    T.Range = StringRef(Current, 0);
1147    TokenQueue.insert(InsertPoint, T);
1148  }
1149  return true;
1150}
1151
1152void Scanner::skipComment() {
1153  if (Current == End || *Current != '#')
1154    return;
1155  while (true) {
1156    // This may skip more than one byte, thus Column is only incremented
1157    // for code points.
1158    StringRef::iterator I = skip_nb_char(Current);
1159    if (I == Current)
1160      break;
1161    Current = I;
1162    ++Column;
1163  }
1164}
1165
1166void Scanner::scanToNextToken() {
1167  while (true) {
1168    while (Current != End && (*Current == ' ' || *Current == '\t')) {
1169      skip(1);
1170    }
1171
1172    skipComment();
1173
1174    // Skip EOL.
1175    StringRef::iterator i = skip_b_break(Current);
1176    if (i == Current)
1177      break;
1178    Current = i;
1179    ++Line;
1180    Column = 0;
1181    // New lines may start a simple key.
1182    if (!FlowLevel)
1183      IsSimpleKeyAllowed = true;
1184  }
1185}
1186
1187bool Scanner::scanStreamStart() {
1188  IsStartOfStream = false;
1189
1190  EncodingInfo EI = getUnicodeEncoding(currentInput());
1191
1192  Token T;
1193  T.Kind = Token::TK_StreamStart;
1194  T.Range = StringRef(Current, EI.second);
1195  TokenQueue.push_back(T);
1196  Current += EI.second;
1197  return true;
1198}
1199
1200bool Scanner::scanStreamEnd() {
1201  // Force an ending new line if one isn't present.
1202  if (Column != 0) {
1203    Column = 0;
1204    ++Line;
1205  }
1206
1207  unrollIndent(-1);
1208  SimpleKeys.clear();
1209  IsSimpleKeyAllowed = false;
1210  IsAdjacentValueAllowedInFlow = false;
1211
1212  Token T;
1213  T.Kind = Token::TK_StreamEnd;
1214  T.Range = StringRef(Current, 0);
1215  TokenQueue.push_back(T);
1216  return true;
1217}
1218
1219bool Scanner::scanDirective() {
1220  // Reset the indentation level.
1221  unrollIndent(-1);
1222  SimpleKeys.clear();
1223  IsSimpleKeyAllowed = false;
1224  IsAdjacentValueAllowedInFlow = false;
1225
1226  StringRef::iterator Start = Current;
1227  consume('%');
1228  StringRef::iterator NameStart = Current;
1229  Current = skip_while(&Scanner::skip_ns_char, Current);
1230  StringRef Name(NameStart, Current - NameStart);
1231  Current = skip_while(&Scanner::skip_s_white, Current);
1232
1233  Token T;
1234  if (Name == "YAML") {
1235    Current = skip_while(&Scanner::skip_ns_char, Current);
1236    T.Kind = Token::TK_VersionDirective;
1237    T.Range = StringRef(Start, Current - Start);
1238    TokenQueue.push_back(T);
1239    return true;
1240  } else if(Name == "TAG") {
1241    Current = skip_while(&Scanner::skip_ns_char, Current);
1242    Current = skip_while(&Scanner::skip_s_white, Current);
1243    Current = skip_while(&Scanner::skip_ns_char, Current);
1244    T.Kind = Token::TK_TagDirective;
1245    T.Range = StringRef(Start, Current - Start);
1246    TokenQueue.push_back(T);
1247    return true;
1248  }
1249  return false;
1250}
1251
1252bool Scanner::scanDocumentIndicator(bool IsStart) {
1253  unrollIndent(-1);
1254  SimpleKeys.clear();
1255  IsSimpleKeyAllowed = false;
1256  IsAdjacentValueAllowedInFlow = false;
1257
1258  Token T;
1259  T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
1260  T.Range = StringRef(Current, 3);
1261  skip(3);
1262  TokenQueue.push_back(T);
1263  return true;
1264}
1265
1266bool Scanner::scanFlowCollectionStart(bool IsSequence) {
1267  Token T;
1268  T.Kind = IsSequence ? Token::TK_FlowSequenceStart
1269                      : Token::TK_FlowMappingStart;
1270  T.Range = StringRef(Current, 1);
1271  skip(1);
1272  TokenQueue.push_back(T);
1273
1274  // [ and { may begin a simple key.
1275  saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);
1276
1277  // And may also be followed by a simple key.
1278  IsSimpleKeyAllowed = true;
1279  // Adjacent values are allowed in flows only after JSON-style keys.
1280  IsAdjacentValueAllowedInFlow = false;
1281  ++FlowLevel;
1282  return true;
1283}
1284
1285bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
1286  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1287  IsSimpleKeyAllowed = false;
1288  IsAdjacentValueAllowedInFlow = true;
1289  Token T;
1290  T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
1291                      : Token::TK_FlowMappingEnd;
1292  T.Range = StringRef(Current, 1);
1293  skip(1);
1294  TokenQueue.push_back(T);
1295  if (FlowLevel)
1296    --FlowLevel;
1297  return true;
1298}
1299
1300bool Scanner::scanFlowEntry() {
1301  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1302  IsSimpleKeyAllowed = true;
1303  IsAdjacentValueAllowedInFlow = false;
1304  Token T;
1305  T.Kind = Token::TK_FlowEntry;
1306  T.Range = StringRef(Current, 1);
1307  skip(1);
1308  TokenQueue.push_back(T);
1309  return true;
1310}
1311
1312bool Scanner::scanBlockEntry() {
1313  rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
1314  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1315  IsSimpleKeyAllowed = true;
1316  IsAdjacentValueAllowedInFlow = false;
1317  Token T;
1318  T.Kind = Token::TK_BlockEntry;
1319  T.Range = StringRef(Current, 1);
1320  skip(1);
1321  TokenQueue.push_back(T);
1322  return true;
1323}
1324
1325bool Scanner::scanKey() {
1326  if (!FlowLevel)
1327    rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1328
1329  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1330  IsSimpleKeyAllowed = !FlowLevel;
1331  IsAdjacentValueAllowedInFlow = false;
1332
1333  Token T;
1334  T.Kind = Token::TK_Key;
1335  T.Range = StringRef(Current, 1);
1336  skip(1);
1337  TokenQueue.push_back(T);
1338  return true;
1339}
1340
1341bool Scanner::scanValue() {
1342  // If the previous token could have been a simple key, insert the key token
1343  // into the token queue.
1344  if (!SimpleKeys.empty()) {
1345    SimpleKey SK = SimpleKeys.pop_back_val();
1346    Token T;
1347    T.Kind = Token::TK_Key;
1348    T.Range = SK.Tok->Range;
1349    TokenQueueT::iterator i, e;
1350    for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
1351      if (i == SK.Tok)
1352        break;
1353    }
1354    if (i == e) {
1355      Failed = true;
1356      return false;
1357    }
1358    i = TokenQueue.insert(i, T);
1359
1360    // We may also need to add a Block-Mapping-Start token.
1361    rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
1362
1363    IsSimpleKeyAllowed = false;
1364  } else {
1365    if (!FlowLevel)
1366      rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1367    IsSimpleKeyAllowed = !FlowLevel;
1368  }
1369  IsAdjacentValueAllowedInFlow = false;
1370
1371  Token T;
1372  T.Kind = Token::TK_Value;
1373  T.Range = StringRef(Current, 1);
1374  skip(1);
1375  TokenQueue.push_back(T);
1376  return true;
1377}
1378
1379// Forbidding inlining improves performance by roughly 20%.
1380// FIXME: Remove once llvm optimizes this to the faster version without hints.
1381LLVM_ATTRIBUTE_NOINLINE static bool
1382wasEscaped(StringRef::iterator First, StringRef::iterator Position);
1383
1384// Returns whether a character at 'Position' was escaped with a leading '\'.
1385// 'First' specifies the position of the first character in the string.
1386static bool wasEscaped(StringRef::iterator First,
1387                       StringRef::iterator Position) {
1388  assert(Position - 1 >= First);
1389  StringRef::iterator I = Position - 1;
1390  // We calculate the number of consecutive '\'s before the current position
1391  // by iterating backwards through our string.
1392  while (I >= First && *I == '\\') --I;
1393  // (Position - 1 - I) now contains the number of '\'s before the current
1394  // position. If it is odd, the character at 'Position' was escaped.
1395  return (Position - 1 - I) % 2 == 1;
1396}
1397
1398bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
1399  StringRef::iterator Start = Current;
1400  unsigned ColStart = Column;
1401  if (IsDoubleQuoted) {
1402    do {
1403      ++Current;
1404      while (Current != End && *Current != '"')
1405        ++Current;
1406      // Repeat until the previous character was not a '\' or was an escaped
1407      // backslash.
1408    } while (   Current != End
1409             && *(Current - 1) == '\\'
1410             && wasEscaped(Start + 1, Current));
1411  } else {
1412    skip(1);
1413    while (Current != End) {
1414      // Skip a ' followed by another '.
1415      if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
1416        skip(2);
1417        continue;
1418      } else if (*Current == '\'')
1419        break;
1420      StringRef::iterator i = skip_nb_char(Current);
1421      if (i == Current) {
1422        i = skip_b_break(Current);
1423        if (i == Current)
1424          break;
1425        Current = i;
1426        Column = 0;
1427        ++Line;
1428      } else {
1429        if (i == End)
1430          break;
1431        Current = i;
1432        ++Column;
1433      }
1434    }
1435  }
1436
1437  if (Current == End) {
1438    setError("Expected quote at end of scalar", Current);
1439    return false;
1440  }
1441
1442  skip(1); // Skip ending quote.
1443  Token T;
1444  T.Kind = Token::TK_Scalar;
1445  T.Range = StringRef(Start, Current - Start);
1446  TokenQueue.push_back(T);
1447
1448  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1449
1450  IsSimpleKeyAllowed = false;
1451  IsAdjacentValueAllowedInFlow = true;
1452
1453  return true;
1454}
1455
1456bool Scanner::scanPlainScalar() {
1457  StringRef::iterator Start = Current;
1458  unsigned ColStart = Column;
1459  unsigned LeadingBlanks = 0;
1460  assert(Indent >= -1 && "Indent must be >= -1 !");
1461  unsigned indent = static_cast<unsigned>(Indent + 1);
1462  while (Current != End) {
1463    if (*Current == '#')
1464      break;
1465
1466    while (Current != End &&
1467           ((*Current != ':' && isPlainSafeNonBlank(Current)) ||
1468            (*Current == ':' && isPlainSafeNonBlank(Current + 1)))) {
1469      StringRef::iterator i = skip_nb_char(Current);
1470      if (i == Current)
1471        break;
1472      Current = i;
1473      ++Column;
1474    }
1475
1476    // Are we at the end?
1477    if (!isBlankOrBreak(Current))
1478      break;
1479
1480    // Eat blanks.
1481    StringRef::iterator Tmp = Current;
1482    while (isBlankOrBreak(Tmp)) {
1483      StringRef::iterator i = skip_s_white(Tmp);
1484      if (i != Tmp) {
1485        if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
1486          setError("Found invalid tab character in indentation", Tmp);
1487          return false;
1488        }
1489        Tmp = i;
1490        ++Column;
1491      } else {
1492        i = skip_b_break(Tmp);
1493        if (!LeadingBlanks)
1494          LeadingBlanks = 1;
1495        Tmp = i;
1496        Column = 0;
1497        ++Line;
1498      }
1499    }
1500
1501    if (!FlowLevel && Column < indent)
1502      break;
1503
1504    Current = Tmp;
1505  }
1506  if (Start == Current) {
1507    setError("Got empty plain scalar", Start);
1508    return false;
1509  }
1510  Token T;
1511  T.Kind = Token::TK_Scalar;
1512  T.Range = StringRef(Start, Current - Start);
1513  TokenQueue.push_back(T);
1514
1515  // Plain scalars can be simple keys.
1516  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1517
1518  IsSimpleKeyAllowed = false;
1519  IsAdjacentValueAllowedInFlow = false;
1520
1521  return true;
1522}
1523
1524bool Scanner::scanAliasOrAnchor(bool IsAlias) {
1525  StringRef::iterator Start = Current;
1526  unsigned ColStart = Column;
1527  skip(1);
1528  while (Current != End) {
1529    if (   *Current == '[' || *Current == ']'
1530        || *Current == '{' || *Current == '}'
1531        || *Current == ','
1532        || *Current == ':')
1533      break;
1534    StringRef::iterator i = skip_ns_char(Current);
1535    if (i == Current)
1536      break;
1537    Current = i;
1538    ++Column;
1539  }
1540
1541  if (Start + 1 == Current) {
1542    setError("Got empty alias or anchor", Start);
1543    return false;
1544  }
1545
1546  Token T;
1547  T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
1548  T.Range = StringRef(Start, Current - Start);
1549  TokenQueue.push_back(T);
1550
1551  // Alias and anchors can be simple keys.
1552  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1553
1554  IsSimpleKeyAllowed = false;
1555  IsAdjacentValueAllowedInFlow = false;
1556
1557  return true;
1558}
1559
1560bool Scanner::scanBlockScalarIndicators(char &StyleIndicator,
1561                                        char &ChompingIndicator,
1562                                        unsigned &IndentIndicator,
1563                                        bool &IsDone) {
1564  StyleIndicator = scanBlockStyleIndicator();
1565  if (!scanBlockScalarHeader(ChompingIndicator, IndentIndicator, IsDone))
1566    return false;
1567  return true;
1568}
1569
1570char Scanner::scanBlockStyleIndicator() {
1571  char Indicator = ' ';
1572  if (Current != End && (*Current == '>' || *Current == '|')) {
1573    Indicator = *Current;
1574    skip(1);
1575  }
1576  return Indicator;
1577}
1578
1579char Scanner::scanBlockChompingIndicator() {
1580  char Indicator = ' ';
1581  if (Current != End && (*Current == '+' || *Current == '-')) {
1582    Indicator = *Current;
1583    skip(1);
1584  }
1585  return Indicator;
1586}
1587
1588/// Get the number of line breaks after chomping.
1589///
1590/// Return the number of trailing line breaks to emit, depending on
1591/// \p ChompingIndicator.
1592static unsigned getChompedLineBreaks(char ChompingIndicator,
1593                                     unsigned LineBreaks, StringRef Str) {
1594  if (ChompingIndicator == '-') // Strip all line breaks.
1595    return 0;
1596  if (ChompingIndicator == '+') // Keep all line breaks.
1597    return LineBreaks;
1598  // Clip trailing lines.
1599  return Str.empty() ? 0 : 1;
1600}
1601
1602unsigned Scanner::scanBlockIndentationIndicator() {
1603  unsigned Indent = 0;
1604  if (Current != End && (*Current >= '1' && *Current <= '9')) {
1605    Indent = unsigned(*Current - '0');
1606    skip(1);
1607  }
1608  return Indent;
1609}
1610
1611bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
1612                                    unsigned &IndentIndicator, bool &IsDone) {
1613  auto Start = Current;
1614
1615  ChompingIndicator = scanBlockChompingIndicator();
1616  IndentIndicator = scanBlockIndentationIndicator();
1617  // Check for the chomping indicator once again.
1618  if (ChompingIndicator == ' ')
1619    ChompingIndicator = scanBlockChompingIndicator();
1620  Current = skip_while(&Scanner::skip_s_white, Current);
1621  skipComment();
1622
1623  if (Current == End) { // EOF, we have an empty scalar.
1624    Token T;
1625    T.Kind = Token::TK_BlockScalar;
1626    T.Range = StringRef(Start, Current - Start);
1627    TokenQueue.push_back(T);
1628    IsDone = true;
1629    return true;
1630  }
1631
1632  if (!consumeLineBreakIfPresent()) {
1633    setError("Expected a line break after block scalar header", Current);
1634    return false;
1635  }
1636  return true;
1637}
1638
1639bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
1640                                    unsigned BlockExitIndent,
1641                                    unsigned &LineBreaks, bool &IsDone) {
1642  unsigned MaxAllSpaceLineCharacters = 0;
1643  StringRef::iterator LongestAllSpaceLine;
1644
1645  while (true) {
1646    advanceWhile(&Scanner::skip_s_space);
1647    if (skip_nb_char(Current) != Current) {
1648      // This line isn't empty, so try and find the indentation.
1649      if (Column <= BlockExitIndent) { // End of the block literal.
1650        IsDone = true;
1651        return true;
1652      }
1653      // We found the block's indentation.
1654      BlockIndent = Column;
1655      if (MaxAllSpaceLineCharacters > BlockIndent) {
1656        setError(
1657            "Leading all-spaces line must be smaller than the block indent",
1658            LongestAllSpaceLine);
1659        return false;
1660      }
1661      return true;
1662    }
1663    if (skip_b_break(Current) != Current &&
1664        Column > MaxAllSpaceLineCharacters) {
1665      // Record the longest all-space line in case it's longer than the
1666      // discovered block indent.
1667      MaxAllSpaceLineCharacters = Column;
1668      LongestAllSpaceLine = Current;
1669    }
1670
1671    // Check for EOF.
1672    if (Current == End) {
1673      IsDone = true;
1674      return true;
1675    }
1676
1677    if (!consumeLineBreakIfPresent()) {
1678      IsDone = true;
1679      return true;
1680    }
1681    ++LineBreaks;
1682  }
1683  return true;
1684}
1685
1686bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
1687                                    unsigned BlockExitIndent, bool &IsDone) {
1688  // Skip the indentation.
1689  while (Column < BlockIndent) {
1690    auto I = skip_s_space(Current);
1691    if (I == Current)
1692      break;
1693    Current = I;
1694    ++Column;
1695  }
1696
1697  if (skip_nb_char(Current) == Current)
1698    return true;
1699
1700  if (Column <= BlockExitIndent) { // End of the block literal.
1701    IsDone = true;
1702    return true;
1703  }
1704
1705  if (Column < BlockIndent) {
1706    if (Current != End && *Current == '#') { // Trailing comment.
1707      IsDone = true;
1708      return true;
1709    }
1710    setError("A text line is less indented than the block scalar", Current);
1711    return false;
1712  }
1713  return true; // A normal text line.
1714}
1715
1716bool Scanner::scanBlockScalar(bool IsLiteral) {
1717  assert(*Current == '|' || *Current == '>');
1718  char StyleIndicator;
1719  char ChompingIndicator;
1720  unsigned BlockIndent;
1721  bool IsDone = false;
1722  if (!scanBlockScalarIndicators(StyleIndicator, ChompingIndicator, BlockIndent,
1723                                 IsDone))
1724    return false;
1725  if (IsDone)
1726    return true;
1727  bool IsFolded = StyleIndicator == '>';
1728
1729  const auto *Start = Current;
1730  unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
1731  unsigned LineBreaks = 0;
1732  if (BlockIndent == 0) {
1733    if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
1734                               IsDone))
1735      return false;
1736  }
1737
1738  // Scan the block's scalars body.
1739  SmallString<256> Str;
1740  while (!IsDone) {
1741    if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
1742      return false;
1743    if (IsDone)
1744      break;
1745
1746    // Parse the current line.
1747    auto LineStart = Current;
1748    advanceWhile(&Scanner::skip_nb_char);
1749    if (LineStart != Current) {
1750      if (LineBreaks && IsFolded && !Scanner::isLineEmpty(Str)) {
1751        // The folded style "folds" any single line break between content into a
1752        // single space, except when that content is "empty" (only contains
1753        // whitespace) in which case the line break is left as-is.
1754        if (LineBreaks == 1) {
1755          Str.append(LineBreaks,
1756                     isLineEmpty(StringRef(LineStart, Current - LineStart))
1757                         ? '\n'
1758                         : ' ');
1759        }
1760        // If we saw a single line break, we are completely replacing it and so
1761        // want `LineBreaks == 0`. Otherwise this decrement accounts for the
1762        // fact that the first line break is "trimmed", only being used to
1763        // signal a sequence of line breaks which should not be folded.
1764        LineBreaks--;
1765      }
1766      Str.append(LineBreaks, '\n');
1767      Str.append(StringRef(LineStart, Current - LineStart));
1768      LineBreaks = 0;
1769    }
1770
1771    // Check for EOF.
1772    if (Current == End)
1773      break;
1774
1775    if (!consumeLineBreakIfPresent())
1776      break;
1777    ++LineBreaks;
1778  }
1779
1780  if (Current == End && !LineBreaks)
1781    // Ensure that there is at least one line break before the end of file.
1782    LineBreaks = 1;
1783  Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
1784
1785  // New lines may start a simple key.
1786  if (!FlowLevel)
1787    IsSimpleKeyAllowed = true;
1788  IsAdjacentValueAllowedInFlow = false;
1789
1790  Token T;
1791  T.Kind = Token::TK_BlockScalar;
1792  T.Range = StringRef(Start, Current - Start);
1793  T.Value = std::string(Str);
1794  TokenQueue.push_back(T);
1795  return true;
1796}
1797
1798bool Scanner::scanTag() {
1799  StringRef::iterator Start = Current;
1800  unsigned ColStart = Column;
1801  skip(1); // Eat !.
1802  if (Current == End || isBlankOrBreak(Current)); // An empty tag.
1803  else if (*Current == '<') {
1804    skip(1);
1805    scan_ns_uri_char();
1806    if (!consume('>'))
1807      return false;
1808  } else {
1809    // FIXME: Actually parse the c-ns-shorthand-tag rule.
1810    Current = skip_while(&Scanner::skip_ns_char, Current);
1811  }
1812
1813  Token T;
1814  T.Kind = Token::TK_Tag;
1815  T.Range = StringRef(Start, Current - Start);
1816  TokenQueue.push_back(T);
1817
1818  // Tags can be simple keys.
1819  saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1820
1821  IsSimpleKeyAllowed = false;
1822  IsAdjacentValueAllowedInFlow = false;
1823
1824  return true;
1825}
1826
1827bool Scanner::fetchMoreTokens() {
1828  if (IsStartOfStream)
1829    return scanStreamStart();
1830
1831  scanToNextToken();
1832
1833  if (Current == End)
1834    return scanStreamEnd();
1835
1836  removeStaleSimpleKeyCandidates();
1837
1838  unrollIndent(Column);
1839
1840  if (Column == 0 && *Current == '%')
1841    return scanDirective();
1842
1843  if (Column == 0 && Current + 4 <= End
1844      && *Current == '-'
1845      && *(Current + 1) == '-'
1846      && *(Current + 2) == '-'
1847      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1848    return scanDocumentIndicator(true);
1849
1850  if (Column == 0 && Current + 4 <= End
1851      && *Current == '.'
1852      && *(Current + 1) == '.'
1853      && *(Current + 2) == '.'
1854      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1855    return scanDocumentIndicator(false);
1856
1857  if (*Current == '[')
1858    return scanFlowCollectionStart(true);
1859
1860  if (*Current == '{')
1861    return scanFlowCollectionStart(false);
1862
1863  if (*Current == ']')
1864    return scanFlowCollectionEnd(true);
1865
1866  if (*Current == '}')
1867    return scanFlowCollectionEnd(false);
1868
1869  if (*Current == ',')
1870    return scanFlowEntry();
1871
1872  if (*Current == '-' && (isBlankOrBreak(Current + 1) || Current + 1 == End))
1873    return scanBlockEntry();
1874
1875  if (*Current == '?' && (Current + 1 == End || isBlankOrBreak(Current + 1)))
1876    return scanKey();
1877
1878  if (*Current == ':' &&
1879      (!isPlainSafeNonBlank(Current + 1) || IsAdjacentValueAllowedInFlow))
1880    return scanValue();
1881
1882  if (*Current == '*')
1883    return scanAliasOrAnchor(true);
1884
1885  if (*Current == '&')
1886    return scanAliasOrAnchor(false);
1887
1888  if (*Current == '!')
1889    return scanTag();
1890
1891  if (*Current == '|' && !FlowLevel)
1892    return scanBlockScalar(true);
1893
1894  if (*Current == '>' && !FlowLevel)
1895    return scanBlockScalar(false);
1896
1897  if (*Current == '\'')
1898    return scanFlowScalar(false);
1899
1900  if (*Current == '"')
1901    return scanFlowScalar(true);
1902
1903  // Get a plain scalar.
1904  StringRef FirstChar(Current, 1);
1905  if ((!isBlankOrBreak(Current) &&
1906       FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") == StringRef::npos) ||
1907      (FirstChar.find_first_of("?:-") != StringRef::npos &&
1908       isPlainSafeNonBlank(Current + 1)))
1909    return scanPlainScalar();
1910
1911  setError("Unrecognized character while tokenizing.", Current);
1912  return false;
1913}
1914
1915Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors,
1916               std::error_code *EC)
1917    : scanner(new Scanner(Input, SM, ShowColors, EC)) {}
1918
1919Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors,
1920               std::error_code *EC)
1921    : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)) {}
1922
1923Stream::~Stream() = default;
1924
1925bool Stream::failed() { return scanner->failed(); }
1926
1927void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) {
1928  printError(N ? N->getSourceRange() : SMRange(), Msg, Kind);
1929}
1930
1931void Stream::printError(const SMRange &Range, const Twine &Msg,
1932                        SourceMgr::DiagKind Kind) {
1933  scanner->printError(Range.Start, Kind, Msg, Range);
1934}
1935
1936document_iterator Stream::begin() {
1937  if (CurrentDoc)
1938    report_fatal_error("Can only iterate over the stream once");
1939
1940  // Skip Stream-Start.
1941  scanner->getNext();
1942
1943  CurrentDoc.reset(new Document(*this));
1944  return document_iterator(CurrentDoc);
1945}
1946
1947document_iterator Stream::end() {
1948  return document_iterator();
1949}
1950
1951void Stream::skip() {
1952  for (Document &Doc : *this)
1953    Doc.skip();
1954}
1955
1956Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
1957           StringRef T)
1958    : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
1959  SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
1960  SourceRange = SMRange(Start, Start);
1961}
1962
1963std::string Node::getVerbatimTag() const {
1964  StringRef Raw = getRawTag();
1965  if (!Raw.empty() && Raw != "!") {
1966    std::string Ret;
1967    if (Raw.find_last_of('!') == 0) {
1968      Ret = std::string(Doc->getTagMap().find("!")->second);
1969      Ret += Raw.substr(1);
1970      return Ret;
1971    } else if (Raw.starts_with("!!")) {
1972      Ret = std::string(Doc->getTagMap().find("!!")->second);
1973      Ret += Raw.substr(2);
1974      return Ret;
1975    } else {
1976      StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
1977      std::map<StringRef, StringRef>::const_iterator It =
1978          Doc->getTagMap().find(TagHandle);
1979      if (It != Doc->getTagMap().end())
1980        Ret = std::string(It->second);
1981      else {
1982        Token T;
1983        T.Kind = Token::TK_Tag;
1984        T.Range = TagHandle;
1985        setError(Twine("Unknown tag handle ") + TagHandle, T);
1986      }
1987      Ret += Raw.substr(Raw.find_last_of('!') + 1);
1988      return Ret;
1989    }
1990  }
1991
1992  switch (getType()) {
1993  case NK_Null:
1994    return "tag:yaml.org,2002:null";
1995  case NK_Scalar:
1996  case NK_BlockScalar:
1997    // TODO: Tag resolution.
1998    return "tag:yaml.org,2002:str";
1999  case NK_Mapping:
2000    return "tag:yaml.org,2002:map";
2001  case NK_Sequence:
2002    return "tag:yaml.org,2002:seq";
2003  }
2004
2005  return "";
2006}
2007
2008Token &Node::peekNext() {
2009  return Doc->peekNext();
2010}
2011
2012Token Node::getNext() {
2013  return Doc->getNext();
2014}
2015
2016Node *Node::parseBlockNode() {
2017  return Doc->parseBlockNode();
2018}
2019
2020BumpPtrAllocator &Node::getAllocator() {
2021  return Doc->NodeAllocator;
2022}
2023
2024void Node::setError(const Twine &Msg, Token &Tok) const {
2025  Doc->setError(Msg, Tok);
2026}
2027
2028bool Node::failed() const {
2029  return Doc->failed();
2030}
2031
2032StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
2033  if (Value[0] == '"')
2034    return getDoubleQuotedValue(Value, Storage);
2035  if (Value[0] == '\'')
2036    return getSingleQuotedValue(Value, Storage);
2037  return getPlainValue(Value, Storage);
2038}
2039
2040/// parseScalarValue - A common parsing routine for all flow scalar styles.
2041/// It handles line break characters by itself, adds regular content characters
2042/// to the result, and forwards escaped sequences to the provided routine for
2043/// the style-specific processing.
2044///
2045/// \param UnquotedValue - An input value without quotation marks.
2046/// \param Storage - A storage for the result if the input value is multiline or
2047/// contains escaped characters.
2048/// \param LookupChars - A set of special characters to search in the input
2049/// string. Should include line break characters and the escape character
2050/// specific for the processing scalar style, if any.
2051/// \param UnescapeCallback - This is called when the escape character is found
2052/// in the input.
2053/// \returns - The unfolded and unescaped value.
2054static StringRef
2055parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage,
2056                 StringRef LookupChars,
2057                 std::function<StringRef(StringRef, SmallVectorImpl<char> &)>
2058                     UnescapeCallback) {
2059  size_t I = UnquotedValue.find_first_of(LookupChars);
2060  if (I == StringRef::npos)
2061    return UnquotedValue;
2062
2063  Storage.clear();
2064  Storage.reserve(UnquotedValue.size());
2065  char LastNewLineAddedAs = '\0';
2066  for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) {
2067    if (UnquotedValue[I] != '\r' && UnquotedValue[I] != '\n') {
2068      llvm::append_range(Storage, UnquotedValue.take_front(I));
2069      UnquotedValue = UnescapeCallback(UnquotedValue.drop_front(I), Storage);
2070      LastNewLineAddedAs = '\0';
2071      continue;
2072    }
2073    if (size_t LastNonSWhite = UnquotedValue.find_last_not_of(" \t", I);
2074        LastNonSWhite != StringRef::npos) {
2075      llvm::append_range(Storage, UnquotedValue.take_front(LastNonSWhite + 1));
2076      Storage.push_back(' ');
2077      LastNewLineAddedAs = ' ';
2078    } else {
2079      // Note: we can't just check if the last character in Storage is ' ',
2080      // '\n', or something else; that would give a wrong result for double
2081      // quoted values containing an escaped space character before a new-line
2082      // character.
2083      switch (LastNewLineAddedAs) {
2084      case ' ':
2085        assert(!Storage.empty() && Storage.back() == ' ');
2086        Storage.back() = '\n';
2087        LastNewLineAddedAs = '\n';
2088        break;
2089      case '\n':
2090        assert(!Storage.empty() && Storage.back() == '\n');
2091        Storage.push_back('\n');
2092        break;
2093      default:
2094        Storage.push_back(' ');
2095        LastNewLineAddedAs = ' ';
2096        break;
2097      }
2098    }
2099    // Handle Windows-style EOL
2100    if (UnquotedValue.substr(I, 2) == "\r\n")
2101      I++;
2102    UnquotedValue = UnquotedValue.drop_front(I + 1).ltrim(" \t");
2103  }
2104  llvm::append_range(Storage, UnquotedValue);
2105  return StringRef(Storage.begin(), Storage.size());
2106}
2107
2108StringRef
2109ScalarNode::getDoubleQuotedValue(StringRef RawValue,
2110                                 SmallVectorImpl<char> &Storage) const {
2111  assert(RawValue.size() >= 2 && RawValue.front() == '"' &&
2112         RawValue.back() == '"');
2113  StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2);
2114
2115  auto UnescapeFunc = [this](StringRef UnquotedValue,
2116                             SmallVectorImpl<char> &Storage) {
2117    assert(UnquotedValue.take_front(1) == "\\");
2118    if (UnquotedValue.size() == 1) {
2119      Token T;
2120      T.Range = UnquotedValue;
2121      setError("Unrecognized escape code", T);
2122      Storage.clear();
2123      return StringRef();
2124    }
2125    UnquotedValue = UnquotedValue.drop_front(1);
2126    switch (UnquotedValue[0]) {
2127    default: {
2128      Token T;
2129      T.Range = UnquotedValue.take_front(1);
2130      setError("Unrecognized escape code", T);
2131      Storage.clear();
2132      return StringRef();
2133    }
2134    case '\r':
2135      // Shrink the Windows-style EOL.
2136      if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n')
2137        UnquotedValue = UnquotedValue.drop_front(1);
2138      [[fallthrough]];
2139    case '\n':
2140      return UnquotedValue.drop_front(1).ltrim(" \t");
2141    case '0':
2142      Storage.push_back(0x00);
2143      break;
2144    case 'a':
2145      Storage.push_back(0x07);
2146      break;
2147    case 'b':
2148      Storage.push_back(0x08);
2149      break;
2150    case 't':
2151    case 0x09:
2152      Storage.push_back(0x09);
2153      break;
2154    case 'n':
2155      Storage.push_back(0x0A);
2156      break;
2157    case 'v':
2158      Storage.push_back(0x0B);
2159      break;
2160    case 'f':
2161      Storage.push_back(0x0C);
2162      break;
2163    case 'r':
2164      Storage.push_back(0x0D);
2165      break;
2166    case 'e':
2167      Storage.push_back(0x1B);
2168      break;
2169    case ' ':
2170      Storage.push_back(0x20);
2171      break;
2172    case '"':
2173      Storage.push_back(0x22);
2174      break;
2175    case '/':
2176      Storage.push_back(0x2F);
2177      break;
2178    case '\\':
2179      Storage.push_back(0x5C);
2180      break;
2181    case 'N':
2182      encodeUTF8(0x85, Storage);
2183      break;
2184    case '_':
2185      encodeUTF8(0xA0, Storage);
2186      break;
2187    case 'L':
2188      encodeUTF8(0x2028, Storage);
2189      break;
2190    case 'P':
2191      encodeUTF8(0x2029, Storage);
2192      break;
2193    case 'x': {
2194      if (UnquotedValue.size() < 3)
2195        // TODO: Report error.
2196        break;
2197      unsigned int UnicodeScalarValue;
2198      if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
2199        // TODO: Report error.
2200        UnicodeScalarValue = 0xFFFD;
2201      encodeUTF8(UnicodeScalarValue, Storage);
2202      return UnquotedValue.drop_front(3);
2203    }
2204    case 'u': {
2205      if (UnquotedValue.size() < 5)
2206        // TODO: Report error.
2207        break;
2208      unsigned int UnicodeScalarValue;
2209      if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
2210        // TODO: Report error.
2211        UnicodeScalarValue = 0xFFFD;
2212      encodeUTF8(UnicodeScalarValue, Storage);
2213      return UnquotedValue.drop_front(5);
2214    }
2215    case 'U': {
2216      if (UnquotedValue.size() < 9)
2217        // TODO: Report error.
2218        break;
2219      unsigned int UnicodeScalarValue;
2220      if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
2221        // TODO: Report error.
2222        UnicodeScalarValue = 0xFFFD;
2223      encodeUTF8(UnicodeScalarValue, Storage);
2224      return UnquotedValue.drop_front(9);
2225    }
2226    }
2227    return UnquotedValue.drop_front(1);
2228  };
2229
2230  return parseScalarValue(UnquotedValue, Storage, "\\\r\n", UnescapeFunc);
2231}
2232
2233StringRef ScalarNode::getSingleQuotedValue(StringRef RawValue,
2234                                           SmallVectorImpl<char> &Storage) {
2235  assert(RawValue.size() >= 2 && RawValue.front() == '\'' &&
2236         RawValue.back() == '\'');
2237  StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2);
2238
2239  auto UnescapeFunc = [](StringRef UnquotedValue,
2240                         SmallVectorImpl<char> &Storage) {
2241    assert(UnquotedValue.take_front(2) == "''");
2242    Storage.push_back('\'');
2243    return UnquotedValue.drop_front(2);
2244  };
2245
2246  return parseScalarValue(UnquotedValue, Storage, "'\r\n", UnescapeFunc);
2247}
2248
2249StringRef ScalarNode::getPlainValue(StringRef RawValue,
2250                                    SmallVectorImpl<char> &Storage) {
2251  // Trim trailing whitespace ('b-char' and 's-white').
2252  // NOTE: Alternatively we could change the scanner to not include whitespace
2253  //       here in the first place.
2254  RawValue = RawValue.rtrim("\r\n \t");
2255  return parseScalarValue(RawValue, Storage, "\r\n", nullptr);
2256}
2257
2258Node *KeyValueNode::getKey() {
2259  if (Key)
2260    return Key;
2261  // Handle implicit null keys.
2262  {
2263    Token &t = peekNext();
2264    if (   t.Kind == Token::TK_BlockEnd
2265        || t.Kind == Token::TK_Value
2266        || t.Kind == Token::TK_Error) {
2267      return Key = new (getAllocator()) NullNode(Doc);
2268    }
2269    if (t.Kind == Token::TK_Key)
2270      getNext(); // skip TK_Key.
2271  }
2272
2273  // Handle explicit null keys.
2274  Token &t = peekNext();
2275  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
2276    return Key = new (getAllocator()) NullNode(Doc);
2277  }
2278
2279  // We've got a normal key.
2280  return Key = parseBlockNode();
2281}
2282
2283Node *KeyValueNode::getValue() {
2284  if (Value)
2285    return Value;
2286
2287  if (Node* Key = getKey())
2288    Key->skip();
2289  else {
2290    setError("Null key in Key Value.", peekNext());
2291    return Value = new (getAllocator()) NullNode(Doc);
2292  }
2293
2294  if (failed())
2295    return Value = new (getAllocator()) NullNode(Doc);
2296
2297  // Handle implicit null values.
2298  {
2299    Token &t = peekNext();
2300    if (   t.Kind == Token::TK_BlockEnd
2301        || t.Kind == Token::TK_FlowMappingEnd
2302        || t.Kind == Token::TK_Key
2303        || t.Kind == Token::TK_FlowEntry
2304        || t.Kind == Token::TK_Error) {
2305      return Value = new (getAllocator()) NullNode(Doc);
2306    }
2307
2308    if (t.Kind != Token::TK_Value) {
2309      setError("Unexpected token in Key Value.", t);
2310      return Value = new (getAllocator()) NullNode(Doc);
2311    }
2312    getNext(); // skip TK_Value.
2313  }
2314
2315  // Handle explicit null values.
2316  Token &t = peekNext();
2317  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
2318    return Value = new (getAllocator()) NullNode(Doc);
2319  }
2320
2321  // We got a normal value.
2322  return Value = parseBlockNode();
2323}
2324
2325void MappingNode::increment() {
2326  if (failed()) {
2327    IsAtEnd = true;
2328    CurrentEntry = nullptr;
2329    return;
2330  }
2331  if (CurrentEntry) {
2332    CurrentEntry->skip();
2333    if (Type == MT_Inline) {
2334      IsAtEnd = true;
2335      CurrentEntry = nullptr;
2336      return;
2337    }
2338  }
2339  Token T = peekNext();
2340  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
2341    // KeyValueNode eats the TK_Key. That way it can detect null keys.
2342    CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
2343  } else if (Type == MT_Block) {
2344    switch (T.Kind) {
2345    case Token::TK_BlockEnd:
2346      getNext();
2347      IsAtEnd = true;
2348      CurrentEntry = nullptr;
2349      break;
2350    default:
2351      setError("Unexpected token. Expected Key or Block End", T);
2352      [[fallthrough]];
2353    case Token::TK_Error:
2354      IsAtEnd = true;
2355      CurrentEntry = nullptr;
2356    }
2357  } else {
2358    switch (T.Kind) {
2359    case Token::TK_FlowEntry:
2360      // Eat the flow entry and recurse.
2361      getNext();
2362      return increment();
2363    case Token::TK_FlowMappingEnd:
2364      getNext();
2365      [[fallthrough]];
2366    case Token::TK_Error:
2367      // Set this to end iterator.
2368      IsAtEnd = true;
2369      CurrentEntry = nullptr;
2370      break;
2371    default:
2372      setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
2373                "Mapping End."
2374              , T);
2375      IsAtEnd = true;
2376      CurrentEntry = nullptr;
2377    }
2378  }
2379}
2380
2381void SequenceNode::increment() {
2382  if (failed()) {
2383    IsAtEnd = true;
2384    CurrentEntry = nullptr;
2385    return;
2386  }
2387  if (CurrentEntry)
2388    CurrentEntry->skip();
2389  Token T = peekNext();
2390  if (SeqType == ST_Block) {
2391    switch (T.Kind) {
2392    case Token::TK_BlockEntry:
2393      getNext();
2394      CurrentEntry = parseBlockNode();
2395      if (!CurrentEntry) { // An error occurred.
2396        IsAtEnd = true;
2397        CurrentEntry = nullptr;
2398      }
2399      break;
2400    case Token::TK_BlockEnd:
2401      getNext();
2402      IsAtEnd = true;
2403      CurrentEntry = nullptr;
2404      break;
2405    default:
2406      setError( "Unexpected token. Expected Block Entry or Block End."
2407              , T);
2408      [[fallthrough]];
2409    case Token::TK_Error:
2410      IsAtEnd = true;
2411      CurrentEntry = nullptr;
2412    }
2413  } else if (SeqType == ST_Indentless) {
2414    switch (T.Kind) {
2415    case Token::TK_BlockEntry:
2416      getNext();
2417      CurrentEntry = parseBlockNode();
2418      if (!CurrentEntry) { // An error occurred.
2419        IsAtEnd = true;
2420        CurrentEntry = nullptr;
2421      }
2422      break;
2423    default:
2424    case Token::TK_Error:
2425      IsAtEnd = true;
2426      CurrentEntry = nullptr;
2427    }
2428  } else if (SeqType == ST_Flow) {
2429    switch (T.Kind) {
2430    case Token::TK_FlowEntry:
2431      // Eat the flow entry and recurse.
2432      getNext();
2433      WasPreviousTokenFlowEntry = true;
2434      return increment();
2435    case Token::TK_FlowSequenceEnd:
2436      getNext();
2437      [[fallthrough]];
2438    case Token::TK_Error:
2439      // Set this to end iterator.
2440      IsAtEnd = true;
2441      CurrentEntry = nullptr;
2442      break;
2443    case Token::TK_StreamEnd:
2444    case Token::TK_DocumentEnd:
2445    case Token::TK_DocumentStart:
2446      setError("Could not find closing ]!", T);
2447      // Set this to end iterator.
2448      IsAtEnd = true;
2449      CurrentEntry = nullptr;
2450      break;
2451    default:
2452      if (!WasPreviousTokenFlowEntry) {
2453        setError("Expected , between entries!", T);
2454        IsAtEnd = true;
2455        CurrentEntry = nullptr;
2456        break;
2457      }
2458      // Otherwise it must be a flow entry.
2459      CurrentEntry = parseBlockNode();
2460      if (!CurrentEntry) {
2461        IsAtEnd = true;
2462      }
2463      WasPreviousTokenFlowEntry = false;
2464      break;
2465    }
2466  }
2467}
2468
2469Document::Document(Stream &S) : stream(S), Root(nullptr) {
2470  // Tag maps starts with two default mappings.
2471  TagMap["!"] = "!";
2472  TagMap["!!"] = "tag:yaml.org,2002:";
2473
2474  if (parseDirectives())
2475    expectToken(Token::TK_DocumentStart);
2476  Token &T = peekNext();
2477  if (T.Kind == Token::TK_DocumentStart)
2478    getNext();
2479}
2480
2481bool Document::skip()  {
2482  if (stream.scanner->failed())
2483    return false;
2484  if (!Root && !getRoot())
2485    return false;
2486  Root->skip();
2487  Token &T = peekNext();
2488  if (T.Kind == Token::TK_StreamEnd)
2489    return false;
2490  if (T.Kind == Token::TK_DocumentEnd) {
2491    getNext();
2492    return skip();
2493  }
2494  return true;
2495}
2496
2497Token &Document::peekNext() {
2498  return stream.scanner->peekNext();
2499}
2500
2501Token Document::getNext() {
2502  return stream.scanner->getNext();
2503}
2504
2505void Document::setError(const Twine &Message, Token &Location) const {
2506  stream.scanner->setError(Message, Location.Range.begin());
2507}
2508
2509bool Document::failed() const {
2510  return stream.scanner->failed();
2511}
2512
2513Node *Document::parseBlockNode() {
2514  Token T = peekNext();
2515  // Handle properties.
2516  Token AnchorInfo;
2517  Token TagInfo;
2518parse_property:
2519  switch (T.Kind) {
2520  case Token::TK_Alias:
2521    getNext();
2522    return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
2523  case Token::TK_Anchor:
2524    if (AnchorInfo.Kind == Token::TK_Anchor) {
2525      setError("Already encountered an anchor for this node!", T);
2526      return nullptr;
2527    }
2528    AnchorInfo = getNext(); // Consume TK_Anchor.
2529    T = peekNext();
2530    goto parse_property;
2531  case Token::TK_Tag:
2532    if (TagInfo.Kind == Token::TK_Tag) {
2533      setError("Already encountered a tag for this node!", T);
2534      return nullptr;
2535    }
2536    TagInfo = getNext(); // Consume TK_Tag.
2537    T = peekNext();
2538    goto parse_property;
2539  default:
2540    break;
2541  }
2542
2543  switch (T.Kind) {
2544  case Token::TK_BlockEntry:
2545    // We got an unindented BlockEntry sequence. This is not terminated with
2546    // a BlockEnd.
2547    // Don't eat the TK_BlockEntry, SequenceNode needs it.
2548    return new (NodeAllocator) SequenceNode( stream.CurrentDoc
2549                                           , AnchorInfo.Range.substr(1)
2550                                           , TagInfo.Range
2551                                           , SequenceNode::ST_Indentless);
2552  case Token::TK_BlockSequenceStart:
2553    getNext();
2554    return new (NodeAllocator)
2555      SequenceNode( stream.CurrentDoc
2556                  , AnchorInfo.Range.substr(1)
2557                  , TagInfo.Range
2558                  , SequenceNode::ST_Block);
2559  case Token::TK_BlockMappingStart:
2560    getNext();
2561    return new (NodeAllocator)
2562      MappingNode( stream.CurrentDoc
2563                 , AnchorInfo.Range.substr(1)
2564                 , TagInfo.Range
2565                 , MappingNode::MT_Block);
2566  case Token::TK_FlowSequenceStart:
2567    getNext();
2568    return new (NodeAllocator)
2569      SequenceNode( stream.CurrentDoc
2570                  , AnchorInfo.Range.substr(1)
2571                  , TagInfo.Range
2572                  , SequenceNode::ST_Flow);
2573  case Token::TK_FlowMappingStart:
2574    getNext();
2575    return new (NodeAllocator)
2576      MappingNode( stream.CurrentDoc
2577                 , AnchorInfo.Range.substr(1)
2578                 , TagInfo.Range
2579                 , MappingNode::MT_Flow);
2580  case Token::TK_Scalar:
2581    getNext();
2582    return new (NodeAllocator)
2583      ScalarNode( stream.CurrentDoc
2584                , AnchorInfo.Range.substr(1)
2585                , TagInfo.Range
2586                , T.Range);
2587  case Token::TK_BlockScalar: {
2588    getNext();
2589    StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
2590    StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
2591    return new (NodeAllocator)
2592        BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
2593                        TagInfo.Range, StrCopy, T.Range);
2594  }
2595  case Token::TK_Key:
2596    // Don't eat the TK_Key, KeyValueNode expects it.
2597    return new (NodeAllocator)
2598      MappingNode( stream.CurrentDoc
2599                 , AnchorInfo.Range.substr(1)
2600                 , TagInfo.Range
2601                 , MappingNode::MT_Inline);
2602  case Token::TK_DocumentStart:
2603  case Token::TK_DocumentEnd:
2604  case Token::TK_StreamEnd:
2605  default:
2606    // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2607    //       !!null null.
2608    return new (NodeAllocator) NullNode(stream.CurrentDoc);
2609  case Token::TK_FlowMappingEnd:
2610  case Token::TK_FlowSequenceEnd:
2611  case Token::TK_FlowEntry: {
2612    if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root)))
2613      return new (NodeAllocator) NullNode(stream.CurrentDoc);
2614
2615    setError("Unexpected token", T);
2616    return nullptr;
2617  }
2618  case Token::TK_Error:
2619    return nullptr;
2620  }
2621  llvm_unreachable("Control flow shouldn't reach here.");
2622  return nullptr;
2623}
2624
2625bool Document::parseDirectives() {
2626  bool isDirective = false;
2627  while (true) {
2628    Token T = peekNext();
2629    if (T.Kind == Token::TK_TagDirective) {
2630      parseTAGDirective();
2631      isDirective = true;
2632    } else if (T.Kind == Token::TK_VersionDirective) {
2633      parseYAMLDirective();
2634      isDirective = true;
2635    } else
2636      break;
2637  }
2638  return isDirective;
2639}
2640
2641void Document::parseYAMLDirective() {
2642  getNext(); // Eat %YAML <version>
2643}
2644
2645void Document::parseTAGDirective() {
2646  Token Tag = getNext(); // %TAG <handle> <prefix>
2647  StringRef T = Tag.Range;
2648  // Strip %TAG
2649  T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
2650  std::size_t HandleEnd = T.find_first_of(" \t");
2651  StringRef TagHandle = T.substr(0, HandleEnd);
2652  StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
2653  TagMap[TagHandle] = TagPrefix;
2654}
2655
2656bool Document::expectToken(int TK) {
2657  Token T = getNext();
2658  if (T.Kind != TK) {
2659    setError("Unexpected token", T);
2660    return false;
2661  }
2662  return true;
2663}
2664