1234285Sdim//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2234285Sdim// 3234285Sdim// The LLVM Compiler Infrastructure 4234285Sdim// 5234285Sdim// This file is distributed under the University of Illinois Open Source 6234285Sdim// License. See LICENSE.TXT for details. 7234285Sdim// 8234285Sdim//===----------------------------------------------------------------------===// 9234285Sdim// 10234285Sdim// This file implements a YAML parser. 11234285Sdim// 12234285Sdim//===----------------------------------------------------------------------===// 13234285Sdim 14234285Sdim#include "llvm/Support/YAMLParser.h" 15234285Sdim#include "llvm/ADT/SmallVector.h" 16234285Sdim#include "llvm/ADT/StringExtras.h" 17234285Sdim#include "llvm/ADT/Twine.h" 18252723Sdim#include "llvm/ADT/ilist.h" 19252723Sdim#include "llvm/ADT/ilist_node.h" 20234285Sdim#include "llvm/Support/ErrorHandling.h" 21234285Sdim#include "llvm/Support/MemoryBuffer.h" 22252723Sdim#include "llvm/Support/SourceMgr.h" 23234285Sdim#include "llvm/Support/raw_ostream.h" 24234285Sdim 25234285Sdimusing namespace llvm; 26234285Sdimusing namespace yaml; 27234285Sdim 28234285Sdimenum UnicodeEncodingForm { 29245431Sdim UEF_UTF32_LE, ///< UTF-32 Little Endian 30245431Sdim UEF_UTF32_BE, ///< UTF-32 Big Endian 31245431Sdim UEF_UTF16_LE, ///< UTF-16 Little Endian 32245431Sdim UEF_UTF16_BE, ///< UTF-16 Big Endian 33245431Sdim UEF_UTF8, ///< UTF-8 or ascii. 34245431Sdim UEF_Unknown ///< Not a valid Unicode encoding. 35234285Sdim}; 36234285Sdim 37234285Sdim/// EncodingInfo - Holds the encoding type and length of the byte order mark if 38234285Sdim/// it exists. Length is in {0, 2, 3, 4}. 39234285Sdimtypedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 40234285Sdim 41234285Sdim/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 42234285Sdim/// encoding form of \a Input. 43234285Sdim/// 44234285Sdim/// @param Input A string of length 0 or more. 45234285Sdim/// @returns An EncodingInfo indicating the Unicode encoding form of the input 46234285Sdim/// and how long the byte order mark is if one exists. 47234285Sdimstatic EncodingInfo getUnicodeEncoding(StringRef Input) { 48234285Sdim if (Input.size() == 0) 49234285Sdim return std::make_pair(UEF_Unknown, 0); 50234285Sdim 51234285Sdim switch (uint8_t(Input[0])) { 52234285Sdim case 0x00: 53234285Sdim if (Input.size() >= 4) { 54234285Sdim if ( Input[1] == 0 55234285Sdim && uint8_t(Input[2]) == 0xFE 56234285Sdim && uint8_t(Input[3]) == 0xFF) 57234285Sdim return std::make_pair(UEF_UTF32_BE, 4); 58234285Sdim if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 59234285Sdim return std::make_pair(UEF_UTF32_BE, 0); 60234285Sdim } 61234285Sdim 62234285Sdim if (Input.size() >= 2 && Input[1] != 0) 63234285Sdim return std::make_pair(UEF_UTF16_BE, 0); 64234285Sdim return std::make_pair(UEF_Unknown, 0); 65234285Sdim case 0xFF: 66234285Sdim if ( Input.size() >= 4 67234285Sdim && uint8_t(Input[1]) == 0xFE 68234285Sdim && Input[2] == 0 69234285Sdim && Input[3] == 0) 70234285Sdim return std::make_pair(UEF_UTF32_LE, 4); 71234285Sdim 72234285Sdim if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 73234285Sdim return std::make_pair(UEF_UTF16_LE, 2); 74234285Sdim return std::make_pair(UEF_Unknown, 0); 75234285Sdim case 0xFE: 76234285Sdim if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 77234285Sdim return std::make_pair(UEF_UTF16_BE, 2); 78234285Sdim return std::make_pair(UEF_Unknown, 0); 79234285Sdim case 0xEF: 80234285Sdim if ( Input.size() >= 3 81234285Sdim && uint8_t(Input[1]) == 0xBB 82234285Sdim && uint8_t(Input[2]) == 0xBF) 83234285Sdim return std::make_pair(UEF_UTF8, 3); 84234285Sdim return std::make_pair(UEF_Unknown, 0); 85234285Sdim } 86234285Sdim 87234285Sdim // It could still be utf-32 or utf-16. 88234285Sdim if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 89234285Sdim return std::make_pair(UEF_UTF32_LE, 0); 90234285Sdim 91234285Sdim if (Input.size() >= 2 && Input[1] == 0) 92234285Sdim return std::make_pair(UEF_UTF16_LE, 0); 93234285Sdim 94234285Sdim return std::make_pair(UEF_UTF8, 0); 95234285Sdim} 96234285Sdim 97234285Sdimnamespace llvm { 98234285Sdimnamespace yaml { 99263509Sdim/// Pin the vtables to this file. 100263509Sdimvoid Node::anchor() {} 101263509Sdimvoid NullNode::anchor() {} 102263509Sdimvoid ScalarNode::anchor() {} 103263509Sdimvoid KeyValueNode::anchor() {} 104263509Sdimvoid MappingNode::anchor() {} 105263509Sdimvoid SequenceNode::anchor() {} 106263509Sdimvoid AliasNode::anchor() {} 107263509Sdim 108234285Sdim/// Token - A single YAML token. 109234285Sdimstruct Token : ilist_node<Token> { 110234285Sdim enum TokenKind { 111234285Sdim TK_Error, // Uninitialized token. 112234285Sdim TK_StreamStart, 113234285Sdim TK_StreamEnd, 114234285Sdim TK_VersionDirective, 115234285Sdim TK_TagDirective, 116234285Sdim TK_DocumentStart, 117234285Sdim TK_DocumentEnd, 118234285Sdim TK_BlockEntry, 119234285Sdim TK_BlockEnd, 120234285Sdim TK_BlockSequenceStart, 121234285Sdim TK_BlockMappingStart, 122234285Sdim TK_FlowEntry, 123234285Sdim TK_FlowSequenceStart, 124234285Sdim TK_FlowSequenceEnd, 125234285Sdim TK_FlowMappingStart, 126234285Sdim TK_FlowMappingEnd, 127234285Sdim TK_Key, 128234285Sdim TK_Value, 129234285Sdim TK_Scalar, 130234285Sdim TK_Alias, 131234285Sdim TK_Anchor, 132234285Sdim TK_Tag 133234285Sdim } Kind; 134234285Sdim 135234285Sdim /// A string of length 0 or more whose begin() points to the logical location 136234285Sdim /// of the token in the input. 137234285Sdim StringRef Range; 138234285Sdim 139234285Sdim Token() : Kind(TK_Error) {} 140234285Sdim}; 141234285Sdim} 142234285Sdim} 143234285Sdim 144234285Sdimnamespace llvm { 145234285Sdimtemplate<> 146234285Sdimstruct ilist_sentinel_traits<Token> { 147234285Sdim Token *createSentinel() const { 148234285Sdim return &Sentinel; 149234285Sdim } 150234285Sdim static void destroySentinel(Token*) {} 151234285Sdim 152234285Sdim Token *provideInitialHead() const { return createSentinel(); } 153234285Sdim Token *ensureHead(Token*) const { return createSentinel(); } 154234285Sdim static void noteHead(Token*, Token*) {} 155234285Sdim 156234285Sdimprivate: 157234285Sdim mutable Token Sentinel; 158234285Sdim}; 159234285Sdim 160234285Sdimtemplate<> 161234285Sdimstruct ilist_node_traits<Token> { 162234285Sdim Token *createNode(const Token &V) { 163234285Sdim return new (Alloc.Allocate<Token>()) Token(V); 164234285Sdim } 165234285Sdim static void deleteNode(Token *V) {} 166234285Sdim 167234285Sdim void addNodeToList(Token *) {} 168234285Sdim void removeNodeFromList(Token *) {} 169234285Sdim void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 170234285Sdim ilist_iterator<Token> /*first*/, 171234285Sdim ilist_iterator<Token> /*last*/) {} 172234285Sdim 173234285Sdim BumpPtrAllocator Alloc; 174234285Sdim}; 175234285Sdim} 176234285Sdim 177234285Sdimtypedef ilist<Token> TokenQueueT; 178234285Sdim 179234285Sdimnamespace { 180234285Sdim/// @brief This struct is used to track simple keys. 181234285Sdim/// 182234285Sdim/// Simple keys are handled by creating an entry in SimpleKeys for each Token 183234285Sdim/// which could legally be the start of a simple key. When peekNext is called, 184234285Sdim/// if the Token To be returned is referenced by a SimpleKey, we continue 185234285Sdim/// tokenizing until that potential simple key has either been found to not be 186234285Sdim/// a simple key (we moved on to the next line or went further than 1024 chars). 187234285Sdim/// Or when we run into a Value, and then insert a Key token (and possibly 188234285Sdim/// others) before the SimpleKey's Tok. 189234285Sdimstruct SimpleKey { 190234285Sdim TokenQueueT::iterator Tok; 191234285Sdim unsigned Column; 192234285Sdim unsigned Line; 193234285Sdim unsigned FlowLevel; 194234285Sdim bool IsRequired; 195234285Sdim 196234285Sdim bool operator ==(const SimpleKey &Other) { 197234285Sdim return Tok == Other.Tok; 198234285Sdim } 199234285Sdim}; 200234285Sdim} 201234285Sdim 202234285Sdim/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 203234285Sdim/// subsequence and the subsequence's length in code units (uint8_t). 204234285Sdim/// A length of 0 represents an error. 205234285Sdimtypedef std::pair<uint32_t, unsigned> UTF8Decoded; 206234285Sdim 207234285Sdimstatic UTF8Decoded decodeUTF8(StringRef Range) { 208234285Sdim StringRef::iterator Position= Range.begin(); 209234285Sdim StringRef::iterator End = Range.end(); 210234285Sdim // 1 byte: [0x00, 0x7f] 211234285Sdim // Bit pattern: 0xxxxxxx 212234285Sdim if ((*Position & 0x80) == 0) { 213234285Sdim return std::make_pair(*Position, 1); 214234285Sdim } 215234285Sdim // 2 bytes: [0x80, 0x7ff] 216234285Sdim // Bit pattern: 110xxxxx 10xxxxxx 217234285Sdim if (Position + 1 != End && 218234285Sdim ((*Position & 0xE0) == 0xC0) && 219234285Sdim ((*(Position + 1) & 0xC0) == 0x80)) { 220234285Sdim uint32_t codepoint = ((*Position & 0x1F) << 6) | 221234285Sdim (*(Position + 1) & 0x3F); 222234285Sdim if (codepoint >= 0x80) 223234285Sdim return std::make_pair(codepoint, 2); 224234285Sdim } 225234285Sdim // 3 bytes: [0x8000, 0xffff] 226234285Sdim // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 227234285Sdim if (Position + 2 != End && 228234285Sdim ((*Position & 0xF0) == 0xE0) && 229234285Sdim ((*(Position + 1) & 0xC0) == 0x80) && 230234285Sdim ((*(Position + 2) & 0xC0) == 0x80)) { 231234285Sdim uint32_t codepoint = ((*Position & 0x0F) << 12) | 232234285Sdim ((*(Position + 1) & 0x3F) << 6) | 233234285Sdim (*(Position + 2) & 0x3F); 234234285Sdim // Codepoints between 0xD800 and 0xDFFF are invalid, as 235234285Sdim // they are high / low surrogate halves used by UTF-16. 236234285Sdim if (codepoint >= 0x800 && 237234285Sdim (codepoint < 0xD800 || codepoint > 0xDFFF)) 238234285Sdim return std::make_pair(codepoint, 3); 239234285Sdim } 240234285Sdim // 4 bytes: [0x10000, 0x10FFFF] 241234285Sdim // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 242234285Sdim if (Position + 3 != End && 243234285Sdim ((*Position & 0xF8) == 0xF0) && 244234285Sdim ((*(Position + 1) & 0xC0) == 0x80) && 245234285Sdim ((*(Position + 2) & 0xC0) == 0x80) && 246234285Sdim ((*(Position + 3) & 0xC0) == 0x80)) { 247234285Sdim uint32_t codepoint = ((*Position & 0x07) << 18) | 248234285Sdim ((*(Position + 1) & 0x3F) << 12) | 249234285Sdim ((*(Position + 2) & 0x3F) << 6) | 250234285Sdim (*(Position + 3) & 0x3F); 251234285Sdim if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 252234285Sdim return std::make_pair(codepoint, 4); 253234285Sdim } 254234285Sdim return std::make_pair(0, 0); 255234285Sdim} 256234285Sdim 257234285Sdimnamespace llvm { 258234285Sdimnamespace yaml { 259234285Sdim/// @brief Scans YAML tokens from a MemoryBuffer. 260234285Sdimclass Scanner { 261234285Sdimpublic: 262234285Sdim Scanner(const StringRef Input, SourceMgr &SM); 263252723Sdim Scanner(MemoryBuffer *Buffer, SourceMgr &SM_); 264234285Sdim 265234285Sdim /// @brief Parse the next token and return it without popping it. 266234285Sdim Token &peekNext(); 267234285Sdim 268234285Sdim /// @brief Parse the next token and pop it from the queue. 269234285Sdim Token getNext(); 270234285Sdim 271234285Sdim void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 272252723Sdim ArrayRef<SMRange> Ranges = None) { 273234285Sdim SM.PrintMessage(Loc, Kind, Message, Ranges); 274234285Sdim } 275234285Sdim 276234285Sdim void setError(const Twine &Message, StringRef::iterator Position) { 277234285Sdim if (Current >= End) 278234285Sdim Current = End - 1; 279234285Sdim 280234285Sdim // Don't print out more errors after the first one we encounter. The rest 281234285Sdim // are just the result of the first, and have no meaning. 282234285Sdim if (!Failed) 283234285Sdim printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 284234285Sdim Failed = true; 285234285Sdim } 286234285Sdim 287234285Sdim void setError(const Twine &Message) { 288234285Sdim setError(Message, Current); 289234285Sdim } 290234285Sdim 291234285Sdim /// @brief Returns true if an error occurred while parsing. 292234285Sdim bool failed() { 293234285Sdim return Failed; 294234285Sdim } 295234285Sdim 296234285Sdimprivate: 297234285Sdim StringRef currentInput() { 298234285Sdim return StringRef(Current, End - Current); 299234285Sdim } 300234285Sdim 301234285Sdim /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 302234285Sdim /// at \a Position. 303234285Sdim /// 304234285Sdim /// If the UTF-8 code units starting at Position do not form a well-formed 305234285Sdim /// code unit subsequence, then the Unicode scalar value is 0, and the length 306234285Sdim /// is 0. 307234285Sdim UTF8Decoded decodeUTF8(StringRef::iterator Position) { 308234285Sdim return ::decodeUTF8(StringRef(Position, End - Position)); 309234285Sdim } 310234285Sdim 311234285Sdim // The following functions are based on the gramar rules in the YAML spec. The 312234285Sdim // style of the function names it meant to closely match how they are written 313234285Sdim // in the spec. The number within the [] is the number of the grammar rule in 314234285Sdim // the spec. 315234285Sdim // 316234285Sdim // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 317234285Sdim // 318234285Sdim // c- 319234285Sdim // A production starting and ending with a special character. 320234285Sdim // b- 321234285Sdim // A production matching a single line break. 322234285Sdim // nb- 323234285Sdim // A production starting and ending with a non-break character. 324234285Sdim // s- 325234285Sdim // A production starting and ending with a white space character. 326234285Sdim // ns- 327234285Sdim // A production starting and ending with a non-space character. 328234285Sdim // l- 329234285Sdim // A production matching complete line(s). 330234285Sdim 331234285Sdim /// @brief Skip a single nb-char[27] starting at Position. 332234285Sdim /// 333234285Sdim /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 334234285Sdim /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 335234285Sdim /// 336234285Sdim /// @returns The code unit after the nb-char, or Position if it's not an 337234285Sdim /// nb-char. 338234285Sdim StringRef::iterator skip_nb_char(StringRef::iterator Position); 339234285Sdim 340234285Sdim /// @brief Skip a single b-break[28] starting at Position. 341234285Sdim /// 342234285Sdim /// A b-break is 0xD 0xA | 0xD | 0xA 343234285Sdim /// 344234285Sdim /// @returns The code unit after the b-break, or Position if it's not a 345234285Sdim /// b-break. 346234285Sdim StringRef::iterator skip_b_break(StringRef::iterator Position); 347234285Sdim 348234285Sdim /// @brief Skip a single s-white[33] starting at Position. 349234285Sdim /// 350234285Sdim /// A s-white is 0x20 | 0x9 351234285Sdim /// 352234285Sdim /// @returns The code unit after the s-white, or Position if it's not a 353234285Sdim /// s-white. 354234285Sdim StringRef::iterator skip_s_white(StringRef::iterator Position); 355234285Sdim 356234285Sdim /// @brief Skip a single ns-char[34] starting at Position. 357234285Sdim /// 358234285Sdim /// A ns-char is nb-char - s-white 359234285Sdim /// 360234285Sdim /// @returns The code unit after the ns-char, or Position if it's not a 361234285Sdim /// ns-char. 362234285Sdim StringRef::iterator skip_ns_char(StringRef::iterator Position); 363234285Sdim 364234285Sdim typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 365234285Sdim /// @brief Skip minimal well-formed code unit subsequences until Func 366234285Sdim /// returns its input. 367234285Sdim /// 368234285Sdim /// @returns The code unit after the last minimal well-formed code unit 369234285Sdim /// subsequence that Func accepted. 370234285Sdim StringRef::iterator skip_while( SkipWhileFunc Func 371234285Sdim , StringRef::iterator Position); 372234285Sdim 373234285Sdim /// @brief Scan ns-uri-char[39]s starting at Cur. 374234285Sdim /// 375234285Sdim /// This updates Cur and Column while scanning. 376234285Sdim /// 377234285Sdim /// @returns A StringRef starting at Cur which covers the longest contiguous 378234285Sdim /// sequence of ns-uri-char. 379234285Sdim StringRef scan_ns_uri_char(); 380234285Sdim 381234285Sdim /// @brief Scan ns-plain-one-line[133] starting at \a Cur. 382234285Sdim StringRef scan_ns_plain_one_line(); 383234285Sdim 384234285Sdim /// @brief Consume a minimal well-formed code unit subsequence starting at 385234285Sdim /// \a Cur. Return false if it is not the same Unicode scalar value as 386234285Sdim /// \a Expected. This updates \a Column. 387234285Sdim bool consume(uint32_t Expected); 388234285Sdim 389234285Sdim /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 390234285Sdim void skip(uint32_t Distance); 391234285Sdim 392234285Sdim /// @brief Return true if the minimal well-formed code unit subsequence at 393234285Sdim /// Pos is whitespace or a new line 394234285Sdim bool isBlankOrBreak(StringRef::iterator Position); 395234285Sdim 396234285Sdim /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 397234285Sdim void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 398234285Sdim , unsigned AtColumn 399234285Sdim , bool IsRequired); 400234285Sdim 401234285Sdim /// @brief Remove simple keys that can no longer be valid simple keys. 402234285Sdim /// 403234285Sdim /// Invalid simple keys are not on the current line or are further than 1024 404234285Sdim /// columns back. 405234285Sdim void removeStaleSimpleKeyCandidates(); 406234285Sdim 407234285Sdim /// @brief Remove all simple keys on FlowLevel \a Level. 408234285Sdim void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 409234285Sdim 410234285Sdim /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 411234285Sdim /// tokens if needed. 412234285Sdim bool unrollIndent(int ToColumn); 413234285Sdim 414234285Sdim /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 415234285Sdim /// if needed. 416234285Sdim bool rollIndent( int ToColumn 417234285Sdim , Token::TokenKind Kind 418234285Sdim , TokenQueueT::iterator InsertPoint); 419234285Sdim 420234285Sdim /// @brief Skip whitespace and comments until the start of the next token. 421234285Sdim void scanToNextToken(); 422234285Sdim 423234285Sdim /// @brief Must be the first token generated. 424234285Sdim bool scanStreamStart(); 425234285Sdim 426234285Sdim /// @brief Generate tokens needed to close out the stream. 427234285Sdim bool scanStreamEnd(); 428234285Sdim 429234285Sdim /// @brief Scan a %BLAH directive. 430234285Sdim bool scanDirective(); 431234285Sdim 432234285Sdim /// @brief Scan a ... or ---. 433234285Sdim bool scanDocumentIndicator(bool IsStart); 434234285Sdim 435234285Sdim /// @brief Scan a [ or { and generate the proper flow collection start token. 436234285Sdim bool scanFlowCollectionStart(bool IsSequence); 437234285Sdim 438234285Sdim /// @brief Scan a ] or } and generate the proper flow collection end token. 439234285Sdim bool scanFlowCollectionEnd(bool IsSequence); 440234285Sdim 441234285Sdim /// @brief Scan the , that separates entries in a flow collection. 442234285Sdim bool scanFlowEntry(); 443234285Sdim 444234285Sdim /// @brief Scan the - that starts block sequence entries. 445234285Sdim bool scanBlockEntry(); 446234285Sdim 447234285Sdim /// @brief Scan an explicit ? indicating a key. 448234285Sdim bool scanKey(); 449234285Sdim 450234285Sdim /// @brief Scan an explicit : indicating a value. 451234285Sdim bool scanValue(); 452234285Sdim 453234285Sdim /// @brief Scan a quoted scalar. 454234285Sdim bool scanFlowScalar(bool IsDoubleQuoted); 455234285Sdim 456234285Sdim /// @brief Scan an unquoted scalar. 457234285Sdim bool scanPlainScalar(); 458234285Sdim 459234285Sdim /// @brief Scan an Alias or Anchor starting with * or &. 460234285Sdim bool scanAliasOrAnchor(bool IsAlias); 461234285Sdim 462234285Sdim /// @brief Scan a block scalar starting with | or >. 463234285Sdim bool scanBlockScalar(bool IsLiteral); 464234285Sdim 465234285Sdim /// @brief Scan a tag of the form !stuff. 466234285Sdim bool scanTag(); 467234285Sdim 468234285Sdim /// @brief Dispatch to the next scanning function based on \a *Cur. 469234285Sdim bool fetchMoreTokens(); 470234285Sdim 471234285Sdim /// @brief The SourceMgr used for diagnostics and buffer management. 472234285Sdim SourceMgr &SM; 473234285Sdim 474234285Sdim /// @brief The original input. 475234285Sdim MemoryBuffer *InputBuffer; 476234285Sdim 477234285Sdim /// @brief The current position of the scanner. 478234285Sdim StringRef::iterator Current; 479234285Sdim 480234285Sdim /// @brief The end of the input (one past the last character). 481234285Sdim StringRef::iterator End; 482234285Sdim 483234285Sdim /// @brief Current YAML indentation level in spaces. 484234285Sdim int Indent; 485234285Sdim 486234285Sdim /// @brief Current column number in Unicode code points. 487234285Sdim unsigned Column; 488234285Sdim 489234285Sdim /// @brief Current line number. 490234285Sdim unsigned Line; 491234285Sdim 492234285Sdim /// @brief How deep we are in flow style containers. 0 Means at block level. 493234285Sdim unsigned FlowLevel; 494234285Sdim 495234285Sdim /// @brief Are we at the start of the stream? 496234285Sdim bool IsStartOfStream; 497234285Sdim 498234285Sdim /// @brief Can the next token be the start of a simple key? 499234285Sdim bool IsSimpleKeyAllowed; 500234285Sdim 501234285Sdim /// @brief True if an error has occurred. 502234285Sdim bool Failed; 503234285Sdim 504234285Sdim /// @brief Queue of tokens. This is required to queue up tokens while looking 505234285Sdim /// for the end of a simple key. And for cases where a single character 506234285Sdim /// can produce multiple tokens (e.g. BlockEnd). 507234285Sdim TokenQueueT TokenQueue; 508234285Sdim 509234285Sdim /// @brief Indentation levels. 510234285Sdim SmallVector<int, 4> Indents; 511234285Sdim 512234285Sdim /// @brief Potential simple keys. 513234285Sdim SmallVector<SimpleKey, 4> SimpleKeys; 514234285Sdim}; 515234285Sdim 516234285Sdim} // end namespace yaml 517234285Sdim} // end namespace llvm 518234285Sdim 519234285Sdim/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 520234285Sdimstatic void encodeUTF8( uint32_t UnicodeScalarValue 521234285Sdim , SmallVectorImpl<char> &Result) { 522234285Sdim if (UnicodeScalarValue <= 0x7F) { 523234285Sdim Result.push_back(UnicodeScalarValue & 0x7F); 524234285Sdim } else if (UnicodeScalarValue <= 0x7FF) { 525234285Sdim uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 526234285Sdim uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 527234285Sdim Result.push_back(FirstByte); 528234285Sdim Result.push_back(SecondByte); 529234285Sdim } else if (UnicodeScalarValue <= 0xFFFF) { 530234285Sdim uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 531234285Sdim uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 532234285Sdim uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 533234285Sdim Result.push_back(FirstByte); 534234285Sdim Result.push_back(SecondByte); 535234285Sdim Result.push_back(ThirdByte); 536234285Sdim } else if (UnicodeScalarValue <= 0x10FFFF) { 537234285Sdim uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 538234285Sdim uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 539234285Sdim uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 540234285Sdim uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 541234285Sdim Result.push_back(FirstByte); 542234285Sdim Result.push_back(SecondByte); 543234285Sdim Result.push_back(ThirdByte); 544234285Sdim Result.push_back(FourthByte); 545234285Sdim } 546234285Sdim} 547234285Sdim 548234285Sdimbool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 549234285Sdim SourceMgr SM; 550234285Sdim Scanner scanner(Input, SM); 551234285Sdim while (true) { 552234285Sdim Token T = scanner.getNext(); 553234285Sdim switch (T.Kind) { 554234285Sdim case Token::TK_StreamStart: 555234285Sdim OS << "Stream-Start: "; 556234285Sdim break; 557234285Sdim case Token::TK_StreamEnd: 558234285Sdim OS << "Stream-End: "; 559234285Sdim break; 560234285Sdim case Token::TK_VersionDirective: 561234285Sdim OS << "Version-Directive: "; 562234285Sdim break; 563234285Sdim case Token::TK_TagDirective: 564234285Sdim OS << "Tag-Directive: "; 565234285Sdim break; 566234285Sdim case Token::TK_DocumentStart: 567234285Sdim OS << "Document-Start: "; 568234285Sdim break; 569234285Sdim case Token::TK_DocumentEnd: 570234285Sdim OS << "Document-End: "; 571234285Sdim break; 572234285Sdim case Token::TK_BlockEntry: 573234285Sdim OS << "Block-Entry: "; 574234285Sdim break; 575234285Sdim case Token::TK_BlockEnd: 576234285Sdim OS << "Block-End: "; 577234285Sdim break; 578234285Sdim case Token::TK_BlockSequenceStart: 579234285Sdim OS << "Block-Sequence-Start: "; 580234285Sdim break; 581234285Sdim case Token::TK_BlockMappingStart: 582234285Sdim OS << "Block-Mapping-Start: "; 583234285Sdim break; 584234285Sdim case Token::TK_FlowEntry: 585234285Sdim OS << "Flow-Entry: "; 586234285Sdim break; 587234285Sdim case Token::TK_FlowSequenceStart: 588234285Sdim OS << "Flow-Sequence-Start: "; 589234285Sdim break; 590234285Sdim case Token::TK_FlowSequenceEnd: 591234285Sdim OS << "Flow-Sequence-End: "; 592234285Sdim break; 593234285Sdim case Token::TK_FlowMappingStart: 594234285Sdim OS << "Flow-Mapping-Start: "; 595234285Sdim break; 596234285Sdim case Token::TK_FlowMappingEnd: 597234285Sdim OS << "Flow-Mapping-End: "; 598234285Sdim break; 599234285Sdim case Token::TK_Key: 600234285Sdim OS << "Key: "; 601234285Sdim break; 602234285Sdim case Token::TK_Value: 603234285Sdim OS << "Value: "; 604234285Sdim break; 605234285Sdim case Token::TK_Scalar: 606234285Sdim OS << "Scalar: "; 607234285Sdim break; 608234285Sdim case Token::TK_Alias: 609234285Sdim OS << "Alias: "; 610234285Sdim break; 611234285Sdim case Token::TK_Anchor: 612234285Sdim OS << "Anchor: "; 613234285Sdim break; 614234285Sdim case Token::TK_Tag: 615234285Sdim OS << "Tag: "; 616234285Sdim break; 617234285Sdim case Token::TK_Error: 618234285Sdim break; 619234285Sdim } 620234285Sdim OS << T.Range << "\n"; 621234285Sdim if (T.Kind == Token::TK_StreamEnd) 622234285Sdim break; 623234285Sdim else if (T.Kind == Token::TK_Error) 624234285Sdim return false; 625234285Sdim } 626234285Sdim return true; 627234285Sdim} 628234285Sdim 629234285Sdimbool yaml::scanTokens(StringRef Input) { 630234285Sdim llvm::SourceMgr SM; 631234285Sdim llvm::yaml::Scanner scanner(Input, SM); 632234285Sdim for (;;) { 633234285Sdim llvm::yaml::Token T = scanner.getNext(); 634234285Sdim if (T.Kind == Token::TK_StreamEnd) 635234285Sdim break; 636234285Sdim else if (T.Kind == Token::TK_Error) 637234285Sdim return false; 638234285Sdim } 639234285Sdim return true; 640234285Sdim} 641234285Sdim 642234285Sdimstd::string yaml::escape(StringRef Input) { 643234285Sdim std::string EscapedInput; 644234285Sdim for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 645234285Sdim if (*i == '\\') 646234285Sdim EscapedInput += "\\\\"; 647234285Sdim else if (*i == '"') 648234285Sdim EscapedInput += "\\\""; 649234285Sdim else if (*i == 0) 650234285Sdim EscapedInput += "\\0"; 651234285Sdim else if (*i == 0x07) 652234285Sdim EscapedInput += "\\a"; 653234285Sdim else if (*i == 0x08) 654234285Sdim EscapedInput += "\\b"; 655234285Sdim else if (*i == 0x09) 656234285Sdim EscapedInput += "\\t"; 657234285Sdim else if (*i == 0x0A) 658234285Sdim EscapedInput += "\\n"; 659234285Sdim else if (*i == 0x0B) 660234285Sdim EscapedInput += "\\v"; 661234285Sdim else if (*i == 0x0C) 662234285Sdim EscapedInput += "\\f"; 663234285Sdim else if (*i == 0x0D) 664234285Sdim EscapedInput += "\\r"; 665234285Sdim else if (*i == 0x1B) 666234285Sdim EscapedInput += "\\e"; 667245431Sdim else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 668234285Sdim std::string HexStr = utohexstr(*i); 669234285Sdim EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 670234285Sdim } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 671234285Sdim UTF8Decoded UnicodeScalarValue 672234285Sdim = decodeUTF8(StringRef(i, Input.end() - i)); 673234285Sdim if (UnicodeScalarValue.second == 0) { 674234285Sdim // Found invalid char. 675234285Sdim SmallString<4> Val; 676234285Sdim encodeUTF8(0xFFFD, Val); 677234285Sdim EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 678234285Sdim // FIXME: Error reporting. 679234285Sdim return EscapedInput; 680234285Sdim } 681234285Sdim if (UnicodeScalarValue.first == 0x85) 682234285Sdim EscapedInput += "\\N"; 683234285Sdim else if (UnicodeScalarValue.first == 0xA0) 684234285Sdim EscapedInput += "\\_"; 685234285Sdim else if (UnicodeScalarValue.first == 0x2028) 686234285Sdim EscapedInput += "\\L"; 687234285Sdim else if (UnicodeScalarValue.first == 0x2029) 688234285Sdim EscapedInput += "\\P"; 689234285Sdim else { 690234285Sdim std::string HexStr = utohexstr(UnicodeScalarValue.first); 691234285Sdim if (HexStr.size() <= 2) 692234285Sdim EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 693234285Sdim else if (HexStr.size() <= 4) 694234285Sdim EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 695234285Sdim else if (HexStr.size() <= 8) 696234285Sdim EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 697234285Sdim } 698234285Sdim i += UnicodeScalarValue.second - 1; 699234285Sdim } else 700234285Sdim EscapedInput.push_back(*i); 701234285Sdim } 702234285Sdim return EscapedInput; 703234285Sdim} 704234285Sdim 705234285SdimScanner::Scanner(StringRef Input, SourceMgr &sm) 706234285Sdim : SM(sm) 707234285Sdim , Indent(-1) 708234285Sdim , Column(0) 709234285Sdim , Line(0) 710234285Sdim , FlowLevel(0) 711234285Sdim , IsStartOfStream(true) 712234285Sdim , IsSimpleKeyAllowed(true) 713234285Sdim , Failed(false) { 714234285Sdim InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); 715234285Sdim SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 716234285Sdim Current = InputBuffer->getBufferStart(); 717234285Sdim End = InputBuffer->getBufferEnd(); 718234285Sdim} 719234285Sdim 720252723SdimScanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_) 721252723Sdim : SM(SM_) 722252723Sdim , InputBuffer(Buffer) 723252723Sdim , Current(InputBuffer->getBufferStart()) 724252723Sdim , End(InputBuffer->getBufferEnd()) 725252723Sdim , Indent(-1) 726252723Sdim , Column(0) 727252723Sdim , Line(0) 728252723Sdim , FlowLevel(0) 729252723Sdim , IsStartOfStream(true) 730252723Sdim , IsSimpleKeyAllowed(true) 731252723Sdim , Failed(false) { 732252723Sdim SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 733252723Sdim} 734252723Sdim 735234285SdimToken &Scanner::peekNext() { 736234285Sdim // If the current token is a possible simple key, keep parsing until we 737234285Sdim // can confirm. 738234285Sdim bool NeedMore = false; 739234285Sdim while (true) { 740234285Sdim if (TokenQueue.empty() || NeedMore) { 741234285Sdim if (!fetchMoreTokens()) { 742234285Sdim TokenQueue.clear(); 743234285Sdim TokenQueue.push_back(Token()); 744234285Sdim return TokenQueue.front(); 745234285Sdim } 746234285Sdim } 747234285Sdim assert(!TokenQueue.empty() && 748234285Sdim "fetchMoreTokens lied about getting tokens!"); 749234285Sdim 750234285Sdim removeStaleSimpleKeyCandidates(); 751234285Sdim SimpleKey SK; 752234285Sdim SK.Tok = TokenQueue.front(); 753234285Sdim if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 754234285Sdim == SimpleKeys.end()) 755234285Sdim break; 756234285Sdim else 757234285Sdim NeedMore = true; 758234285Sdim } 759234285Sdim return TokenQueue.front(); 760234285Sdim} 761234285Sdim 762234285SdimToken Scanner::getNext() { 763234285Sdim Token Ret = peekNext(); 764234285Sdim // TokenQueue can be empty if there was an error getting the next token. 765234285Sdim if (!TokenQueue.empty()) 766234285Sdim TokenQueue.pop_front(); 767234285Sdim 768234285Sdim // There cannot be any referenced Token's if the TokenQueue is empty. So do a 769234285Sdim // quick deallocation of them all. 770234285Sdim if (TokenQueue.empty()) { 771234285Sdim TokenQueue.Alloc.Reset(); 772234285Sdim } 773234285Sdim 774234285Sdim return Ret; 775234285Sdim} 776234285Sdim 777234285SdimStringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 778245431Sdim if (Position == End) 779245431Sdim return Position; 780234285Sdim // Check 7 bit c-printable - b-char. 781234285Sdim if ( *Position == 0x09 782234285Sdim || (*Position >= 0x20 && *Position <= 0x7E)) 783234285Sdim return Position + 1; 784234285Sdim 785234285Sdim // Check for valid UTF-8. 786234285Sdim if (uint8_t(*Position) & 0x80) { 787234285Sdim UTF8Decoded u8d = decodeUTF8(Position); 788234285Sdim if ( u8d.second != 0 789234285Sdim && u8d.first != 0xFEFF 790234285Sdim && ( u8d.first == 0x85 791234285Sdim || ( u8d.first >= 0xA0 792234285Sdim && u8d.first <= 0xD7FF) 793234285Sdim || ( u8d.first >= 0xE000 794234285Sdim && u8d.first <= 0xFFFD) 795234285Sdim || ( u8d.first >= 0x10000 796234285Sdim && u8d.first <= 0x10FFFF))) 797234285Sdim return Position + u8d.second; 798234285Sdim } 799234285Sdim return Position; 800234285Sdim} 801234285Sdim 802234285SdimStringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 803245431Sdim if (Position == End) 804245431Sdim return Position; 805234285Sdim if (*Position == 0x0D) { 806234285Sdim if (Position + 1 != End && *(Position + 1) == 0x0A) 807234285Sdim return Position + 2; 808234285Sdim return Position + 1; 809234285Sdim } 810234285Sdim 811234285Sdim if (*Position == 0x0A) 812234285Sdim return Position + 1; 813234285Sdim return Position; 814234285Sdim} 815234285Sdim 816234285Sdim 817234285SdimStringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 818234285Sdim if (Position == End) 819234285Sdim return Position; 820234285Sdim if (*Position == ' ' || *Position == '\t') 821234285Sdim return Position + 1; 822234285Sdim return Position; 823234285Sdim} 824234285Sdim 825234285SdimStringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 826234285Sdim if (Position == End) 827234285Sdim return Position; 828234285Sdim if (*Position == ' ' || *Position == '\t') 829234285Sdim return Position; 830234285Sdim return skip_nb_char(Position); 831234285Sdim} 832234285Sdim 833234285SdimStringRef::iterator Scanner::skip_while( SkipWhileFunc Func 834234285Sdim , StringRef::iterator Position) { 835234285Sdim while (true) { 836234285Sdim StringRef::iterator i = (this->*Func)(Position); 837234285Sdim if (i == Position) 838234285Sdim break; 839234285Sdim Position = i; 840234285Sdim } 841234285Sdim return Position; 842234285Sdim} 843234285Sdim 844234285Sdimstatic bool is_ns_hex_digit(const char C) { 845234285Sdim return (C >= '0' && C <= '9') 846234285Sdim || (C >= 'a' && C <= 'z') 847234285Sdim || (C >= 'A' && C <= 'Z'); 848234285Sdim} 849234285Sdim 850234285Sdimstatic bool is_ns_word_char(const char C) { 851234285Sdim return C == '-' 852234285Sdim || (C >= 'a' && C <= 'z') 853234285Sdim || (C >= 'A' && C <= 'Z'); 854234285Sdim} 855234285Sdim 856234285SdimStringRef Scanner::scan_ns_uri_char() { 857234285Sdim StringRef::iterator Start = Current; 858234285Sdim while (true) { 859234285Sdim if (Current == End) 860234285Sdim break; 861234285Sdim if (( *Current == '%' 862234285Sdim && Current + 2 < End 863234285Sdim && is_ns_hex_digit(*(Current + 1)) 864234285Sdim && is_ns_hex_digit(*(Current + 2))) 865234285Sdim || is_ns_word_char(*Current) 866234285Sdim || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 867234285Sdim != StringRef::npos) { 868234285Sdim ++Current; 869234285Sdim ++Column; 870234285Sdim } else 871234285Sdim break; 872234285Sdim } 873234285Sdim return StringRef(Start, Current - Start); 874234285Sdim} 875234285Sdim 876234285SdimStringRef Scanner::scan_ns_plain_one_line() { 877234285Sdim StringRef::iterator start = Current; 878234285Sdim // The first character must already be verified. 879234285Sdim ++Current; 880234285Sdim while (true) { 881234285Sdim if (Current == End) { 882234285Sdim break; 883234285Sdim } else if (*Current == ':') { 884234285Sdim // Check if the next character is a ns-char. 885234285Sdim if (Current + 1 == End) 886234285Sdim break; 887234285Sdim StringRef::iterator i = skip_ns_char(Current + 1); 888234285Sdim if (Current + 1 != i) { 889234285Sdim Current = i; 890234285Sdim Column += 2; // Consume both the ':' and ns-char. 891234285Sdim } else 892234285Sdim break; 893234285Sdim } else if (*Current == '#') { 894234285Sdim // Check if the previous character was a ns-char. 895234285Sdim // The & 0x80 check is to check for the trailing byte of a utf-8 896234285Sdim if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) { 897234285Sdim ++Current; 898234285Sdim ++Column; 899234285Sdim } else 900234285Sdim break; 901234285Sdim } else { 902234285Sdim StringRef::iterator i = skip_nb_char(Current); 903234285Sdim if (i == Current) 904234285Sdim break; 905234285Sdim Current = i; 906234285Sdim ++Column; 907234285Sdim } 908234285Sdim } 909234285Sdim return StringRef(start, Current - start); 910234285Sdim} 911234285Sdim 912234285Sdimbool Scanner::consume(uint32_t Expected) { 913234285Sdim if (Expected >= 0x80) 914234285Sdim report_fatal_error("Not dealing with this yet"); 915234285Sdim if (Current == End) 916234285Sdim return false; 917234285Sdim if (uint8_t(*Current) >= 0x80) 918234285Sdim report_fatal_error("Not dealing with this yet"); 919234285Sdim if (uint8_t(*Current) == Expected) { 920234285Sdim ++Current; 921234285Sdim ++Column; 922234285Sdim return true; 923234285Sdim } 924234285Sdim return false; 925234285Sdim} 926234285Sdim 927234285Sdimvoid Scanner::skip(uint32_t Distance) { 928234285Sdim Current += Distance; 929234285Sdim Column += Distance; 930245431Sdim assert(Current <= End && "Skipped past the end"); 931234285Sdim} 932234285Sdim 933234285Sdimbool Scanner::isBlankOrBreak(StringRef::iterator Position) { 934234285Sdim if (Position == End) 935234285Sdim return false; 936234285Sdim if ( *Position == ' ' || *Position == '\t' 937234285Sdim || *Position == '\r' || *Position == '\n') 938234285Sdim return true; 939234285Sdim return false; 940234285Sdim} 941234285Sdim 942234285Sdimvoid Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 943234285Sdim , unsigned AtColumn 944234285Sdim , bool IsRequired) { 945234285Sdim if (IsSimpleKeyAllowed) { 946234285Sdim SimpleKey SK; 947234285Sdim SK.Tok = Tok; 948234285Sdim SK.Line = Line; 949234285Sdim SK.Column = AtColumn; 950234285Sdim SK.IsRequired = IsRequired; 951234285Sdim SK.FlowLevel = FlowLevel; 952234285Sdim SimpleKeys.push_back(SK); 953234285Sdim } 954234285Sdim} 955234285Sdim 956234285Sdimvoid Scanner::removeStaleSimpleKeyCandidates() { 957234285Sdim for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 958234285Sdim i != SimpleKeys.end();) { 959234285Sdim if (i->Line != Line || i->Column + 1024 < Column) { 960234285Sdim if (i->IsRequired) 961234285Sdim setError( "Could not find expected : for simple key" 962234285Sdim , i->Tok->Range.begin()); 963234285Sdim i = SimpleKeys.erase(i); 964234285Sdim } else 965234285Sdim ++i; 966234285Sdim } 967234285Sdim} 968234285Sdim 969234285Sdimvoid Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 970234285Sdim if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 971234285Sdim SimpleKeys.pop_back(); 972234285Sdim} 973234285Sdim 974234285Sdimbool Scanner::unrollIndent(int ToColumn) { 975234285Sdim Token T; 976234285Sdim // Indentation is ignored in flow. 977234285Sdim if (FlowLevel != 0) 978234285Sdim return true; 979234285Sdim 980234285Sdim while (Indent > ToColumn) { 981234285Sdim T.Kind = Token::TK_BlockEnd; 982234285Sdim T.Range = StringRef(Current, 1); 983234285Sdim TokenQueue.push_back(T); 984234285Sdim Indent = Indents.pop_back_val(); 985234285Sdim } 986234285Sdim 987234285Sdim return true; 988234285Sdim} 989234285Sdim 990234285Sdimbool Scanner::rollIndent( int ToColumn 991234285Sdim , Token::TokenKind Kind 992234285Sdim , TokenQueueT::iterator InsertPoint) { 993234285Sdim if (FlowLevel) 994234285Sdim return true; 995234285Sdim if (Indent < ToColumn) { 996234285Sdim Indents.push_back(Indent); 997234285Sdim Indent = ToColumn; 998234285Sdim 999234285Sdim Token T; 1000234285Sdim T.Kind = Kind; 1001234285Sdim T.Range = StringRef(Current, 0); 1002234285Sdim TokenQueue.insert(InsertPoint, T); 1003234285Sdim } 1004234285Sdim return true; 1005234285Sdim} 1006234285Sdim 1007234285Sdimvoid Scanner::scanToNextToken() { 1008234285Sdim while (true) { 1009234285Sdim while (*Current == ' ' || *Current == '\t') { 1010234285Sdim skip(1); 1011234285Sdim } 1012234285Sdim 1013234285Sdim // Skip comment. 1014234285Sdim if (*Current == '#') { 1015234285Sdim while (true) { 1016234285Sdim // This may skip more than one byte, thus Column is only incremented 1017234285Sdim // for code points. 1018234285Sdim StringRef::iterator i = skip_nb_char(Current); 1019234285Sdim if (i == Current) 1020234285Sdim break; 1021234285Sdim Current = i; 1022234285Sdim ++Column; 1023234285Sdim } 1024234285Sdim } 1025234285Sdim 1026234285Sdim // Skip EOL. 1027234285Sdim StringRef::iterator i = skip_b_break(Current); 1028234285Sdim if (i == Current) 1029234285Sdim break; 1030234285Sdim Current = i; 1031234285Sdim ++Line; 1032234285Sdim Column = 0; 1033234285Sdim // New lines may start a simple key. 1034234285Sdim if (!FlowLevel) 1035234285Sdim IsSimpleKeyAllowed = true; 1036234285Sdim } 1037234285Sdim} 1038234285Sdim 1039234285Sdimbool Scanner::scanStreamStart() { 1040234285Sdim IsStartOfStream = false; 1041234285Sdim 1042234285Sdim EncodingInfo EI = getUnicodeEncoding(currentInput()); 1043234285Sdim 1044234285Sdim Token T; 1045234285Sdim T.Kind = Token::TK_StreamStart; 1046234285Sdim T.Range = StringRef(Current, EI.second); 1047234285Sdim TokenQueue.push_back(T); 1048234285Sdim Current += EI.second; 1049234285Sdim return true; 1050234285Sdim} 1051234285Sdim 1052234285Sdimbool Scanner::scanStreamEnd() { 1053234285Sdim // Force an ending new line if one isn't present. 1054234285Sdim if (Column != 0) { 1055234285Sdim Column = 0; 1056234285Sdim ++Line; 1057234285Sdim } 1058234285Sdim 1059234285Sdim unrollIndent(-1); 1060234285Sdim SimpleKeys.clear(); 1061234285Sdim IsSimpleKeyAllowed = false; 1062234285Sdim 1063234285Sdim Token T; 1064234285Sdim T.Kind = Token::TK_StreamEnd; 1065234285Sdim T.Range = StringRef(Current, 0); 1066234285Sdim TokenQueue.push_back(T); 1067234285Sdim return true; 1068234285Sdim} 1069234285Sdim 1070234285Sdimbool Scanner::scanDirective() { 1071234285Sdim // Reset the indentation level. 1072234285Sdim unrollIndent(-1); 1073234285Sdim SimpleKeys.clear(); 1074234285Sdim IsSimpleKeyAllowed = false; 1075234285Sdim 1076234285Sdim StringRef::iterator Start = Current; 1077234285Sdim consume('%'); 1078234285Sdim StringRef::iterator NameStart = Current; 1079234285Sdim Current = skip_while(&Scanner::skip_ns_char, Current); 1080234285Sdim StringRef Name(NameStart, Current - NameStart); 1081234285Sdim Current = skip_while(&Scanner::skip_s_white, Current); 1082263509Sdim 1083263509Sdim Token T; 1084234285Sdim if (Name == "YAML") { 1085234285Sdim Current = skip_while(&Scanner::skip_ns_char, Current); 1086234285Sdim T.Kind = Token::TK_VersionDirective; 1087234285Sdim T.Range = StringRef(Start, Current - Start); 1088234285Sdim TokenQueue.push_back(T); 1089234285Sdim return true; 1090263509Sdim } else if(Name == "TAG") { 1091263509Sdim Current = skip_while(&Scanner::skip_ns_char, Current); 1092263509Sdim Current = skip_while(&Scanner::skip_s_white, Current); 1093263509Sdim Current = skip_while(&Scanner::skip_ns_char, Current); 1094263509Sdim T.Kind = Token::TK_TagDirective; 1095263509Sdim T.Range = StringRef(Start, Current - Start); 1096263509Sdim TokenQueue.push_back(T); 1097263509Sdim return true; 1098234285Sdim } 1099234285Sdim return false; 1100234285Sdim} 1101234285Sdim 1102234285Sdimbool Scanner::scanDocumentIndicator(bool IsStart) { 1103234285Sdim unrollIndent(-1); 1104234285Sdim SimpleKeys.clear(); 1105234285Sdim IsSimpleKeyAllowed = false; 1106234285Sdim 1107234285Sdim Token T; 1108234285Sdim T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1109234285Sdim T.Range = StringRef(Current, 3); 1110234285Sdim skip(3); 1111234285Sdim TokenQueue.push_back(T); 1112234285Sdim return true; 1113234285Sdim} 1114234285Sdim 1115234285Sdimbool Scanner::scanFlowCollectionStart(bool IsSequence) { 1116234285Sdim Token T; 1117234285Sdim T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1118234285Sdim : Token::TK_FlowMappingStart; 1119234285Sdim T.Range = StringRef(Current, 1); 1120234285Sdim skip(1); 1121234285Sdim TokenQueue.push_back(T); 1122234285Sdim 1123234285Sdim // [ and { may begin a simple key. 1124234285Sdim saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 1125234285Sdim 1126234285Sdim // And may also be followed by a simple key. 1127234285Sdim IsSimpleKeyAllowed = true; 1128234285Sdim ++FlowLevel; 1129234285Sdim return true; 1130234285Sdim} 1131234285Sdim 1132234285Sdimbool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1133234285Sdim removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1134234285Sdim IsSimpleKeyAllowed = false; 1135234285Sdim Token T; 1136234285Sdim T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1137234285Sdim : Token::TK_FlowMappingEnd; 1138234285Sdim T.Range = StringRef(Current, 1); 1139234285Sdim skip(1); 1140234285Sdim TokenQueue.push_back(T); 1141234285Sdim if (FlowLevel) 1142234285Sdim --FlowLevel; 1143234285Sdim return true; 1144234285Sdim} 1145234285Sdim 1146234285Sdimbool Scanner::scanFlowEntry() { 1147234285Sdim removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1148234285Sdim IsSimpleKeyAllowed = true; 1149234285Sdim Token T; 1150234285Sdim T.Kind = Token::TK_FlowEntry; 1151234285Sdim T.Range = StringRef(Current, 1); 1152234285Sdim skip(1); 1153234285Sdim TokenQueue.push_back(T); 1154234285Sdim return true; 1155234285Sdim} 1156234285Sdim 1157234285Sdimbool Scanner::scanBlockEntry() { 1158234285Sdim rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1159234285Sdim removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1160234285Sdim IsSimpleKeyAllowed = true; 1161234285Sdim Token T; 1162234285Sdim T.Kind = Token::TK_BlockEntry; 1163234285Sdim T.Range = StringRef(Current, 1); 1164234285Sdim skip(1); 1165234285Sdim TokenQueue.push_back(T); 1166234285Sdim return true; 1167234285Sdim} 1168234285Sdim 1169234285Sdimbool Scanner::scanKey() { 1170234285Sdim if (!FlowLevel) 1171234285Sdim rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1172234285Sdim 1173234285Sdim removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1174234285Sdim IsSimpleKeyAllowed = !FlowLevel; 1175234285Sdim 1176234285Sdim Token T; 1177234285Sdim T.Kind = Token::TK_Key; 1178234285Sdim T.Range = StringRef(Current, 1); 1179234285Sdim skip(1); 1180234285Sdim TokenQueue.push_back(T); 1181234285Sdim return true; 1182234285Sdim} 1183234285Sdim 1184234285Sdimbool Scanner::scanValue() { 1185234285Sdim // If the previous token could have been a simple key, insert the key token 1186234285Sdim // into the token queue. 1187234285Sdim if (!SimpleKeys.empty()) { 1188234285Sdim SimpleKey SK = SimpleKeys.pop_back_val(); 1189234285Sdim Token T; 1190234285Sdim T.Kind = Token::TK_Key; 1191234285Sdim T.Range = SK.Tok->Range; 1192234285Sdim TokenQueueT::iterator i, e; 1193234285Sdim for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1194234285Sdim if (i == SK.Tok) 1195234285Sdim break; 1196234285Sdim } 1197234285Sdim assert(i != e && "SimpleKey not in token queue!"); 1198234285Sdim i = TokenQueue.insert(i, T); 1199234285Sdim 1200234285Sdim // We may also need to add a Block-Mapping-Start token. 1201234285Sdim rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1202234285Sdim 1203234285Sdim IsSimpleKeyAllowed = false; 1204234285Sdim } else { 1205234285Sdim if (!FlowLevel) 1206234285Sdim rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1207234285Sdim IsSimpleKeyAllowed = !FlowLevel; 1208234285Sdim } 1209234285Sdim 1210234285Sdim Token T; 1211234285Sdim T.Kind = Token::TK_Value; 1212234285Sdim T.Range = StringRef(Current, 1); 1213234285Sdim skip(1); 1214234285Sdim TokenQueue.push_back(T); 1215234285Sdim return true; 1216234285Sdim} 1217234285Sdim 1218234285Sdim// Forbidding inlining improves performance by roughly 20%. 1219234285Sdim// FIXME: Remove once llvm optimizes this to the faster version without hints. 1220234285SdimLLVM_ATTRIBUTE_NOINLINE static bool 1221234285SdimwasEscaped(StringRef::iterator First, StringRef::iterator Position); 1222234285Sdim 1223234285Sdim// Returns whether a character at 'Position' was escaped with a leading '\'. 1224234285Sdim// 'First' specifies the position of the first character in the string. 1225234285Sdimstatic bool wasEscaped(StringRef::iterator First, 1226234285Sdim StringRef::iterator Position) { 1227234285Sdim assert(Position - 1 >= First); 1228234285Sdim StringRef::iterator I = Position - 1; 1229234285Sdim // We calculate the number of consecutive '\'s before the current position 1230234285Sdim // by iterating backwards through our string. 1231234285Sdim while (I >= First && *I == '\\') --I; 1232234285Sdim // (Position - 1 - I) now contains the number of '\'s before the current 1233234285Sdim // position. If it is odd, the character at 'Position' was escaped. 1234234285Sdim return (Position - 1 - I) % 2 == 1; 1235234285Sdim} 1236234285Sdim 1237234285Sdimbool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1238234285Sdim StringRef::iterator Start = Current; 1239234285Sdim unsigned ColStart = Column; 1240234285Sdim if (IsDoubleQuoted) { 1241234285Sdim do { 1242234285Sdim ++Current; 1243234285Sdim while (Current != End && *Current != '"') 1244234285Sdim ++Current; 1245234285Sdim // Repeat until the previous character was not a '\' or was an escaped 1246234285Sdim // backslash. 1247245431Sdim } while ( Current != End 1248245431Sdim && *(Current - 1) == '\\' 1249245431Sdim && wasEscaped(Start + 1, Current)); 1250234285Sdim } else { 1251234285Sdim skip(1); 1252234285Sdim while (true) { 1253234285Sdim // Skip a ' followed by another '. 1254234285Sdim if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1255234285Sdim skip(2); 1256234285Sdim continue; 1257234285Sdim } else if (*Current == '\'') 1258234285Sdim break; 1259234285Sdim StringRef::iterator i = skip_nb_char(Current); 1260234285Sdim if (i == Current) { 1261234285Sdim i = skip_b_break(Current); 1262234285Sdim if (i == Current) 1263234285Sdim break; 1264234285Sdim Current = i; 1265234285Sdim Column = 0; 1266234285Sdim ++Line; 1267234285Sdim } else { 1268234285Sdim if (i == End) 1269234285Sdim break; 1270234285Sdim Current = i; 1271234285Sdim ++Column; 1272234285Sdim } 1273234285Sdim } 1274234285Sdim } 1275245431Sdim 1276245431Sdim if (Current == End) { 1277245431Sdim setError("Expected quote at end of scalar", Current); 1278245431Sdim return false; 1279245431Sdim } 1280245431Sdim 1281234285Sdim skip(1); // Skip ending quote. 1282234285Sdim Token T; 1283234285Sdim T.Kind = Token::TK_Scalar; 1284234285Sdim T.Range = StringRef(Start, Current - Start); 1285234285Sdim TokenQueue.push_back(T); 1286234285Sdim 1287234285Sdim saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1288234285Sdim 1289234285Sdim IsSimpleKeyAllowed = false; 1290234285Sdim 1291234285Sdim return true; 1292234285Sdim} 1293234285Sdim 1294234285Sdimbool Scanner::scanPlainScalar() { 1295234285Sdim StringRef::iterator Start = Current; 1296234285Sdim unsigned ColStart = Column; 1297234285Sdim unsigned LeadingBlanks = 0; 1298234285Sdim assert(Indent >= -1 && "Indent must be >= -1 !"); 1299234285Sdim unsigned indent = static_cast<unsigned>(Indent + 1); 1300234285Sdim while (true) { 1301234285Sdim if (*Current == '#') 1302234285Sdim break; 1303234285Sdim 1304234285Sdim while (!isBlankOrBreak(Current)) { 1305234285Sdim if ( FlowLevel && *Current == ':' 1306234285Sdim && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1307234285Sdim setError("Found unexpected ':' while scanning a plain scalar", Current); 1308234285Sdim return false; 1309234285Sdim } 1310234285Sdim 1311234285Sdim // Check for the end of the plain scalar. 1312234285Sdim if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1313234285Sdim || ( FlowLevel 1314234285Sdim && (StringRef(Current, 1).find_first_of(",:?[]{}") 1315234285Sdim != StringRef::npos))) 1316234285Sdim break; 1317234285Sdim 1318234285Sdim StringRef::iterator i = skip_nb_char(Current); 1319234285Sdim if (i == Current) 1320234285Sdim break; 1321234285Sdim Current = i; 1322234285Sdim ++Column; 1323234285Sdim } 1324234285Sdim 1325234285Sdim // Are we at the end? 1326234285Sdim if (!isBlankOrBreak(Current)) 1327234285Sdim break; 1328234285Sdim 1329234285Sdim // Eat blanks. 1330234285Sdim StringRef::iterator Tmp = Current; 1331234285Sdim while (isBlankOrBreak(Tmp)) { 1332234285Sdim StringRef::iterator i = skip_s_white(Tmp); 1333234285Sdim if (i != Tmp) { 1334234285Sdim if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1335234285Sdim setError("Found invalid tab character in indentation", Tmp); 1336234285Sdim return false; 1337234285Sdim } 1338234285Sdim Tmp = i; 1339234285Sdim ++Column; 1340234285Sdim } else { 1341234285Sdim i = skip_b_break(Tmp); 1342234285Sdim if (!LeadingBlanks) 1343234285Sdim LeadingBlanks = 1; 1344234285Sdim Tmp = i; 1345234285Sdim Column = 0; 1346234285Sdim ++Line; 1347234285Sdim } 1348234285Sdim } 1349234285Sdim 1350234285Sdim if (!FlowLevel && Column < indent) 1351234285Sdim break; 1352234285Sdim 1353234285Sdim Current = Tmp; 1354234285Sdim } 1355234285Sdim if (Start == Current) { 1356234285Sdim setError("Got empty plain scalar", Start); 1357234285Sdim return false; 1358234285Sdim } 1359234285Sdim Token T; 1360234285Sdim T.Kind = Token::TK_Scalar; 1361234285Sdim T.Range = StringRef(Start, Current - Start); 1362234285Sdim TokenQueue.push_back(T); 1363234285Sdim 1364234285Sdim // Plain scalars can be simple keys. 1365234285Sdim saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1366234285Sdim 1367234285Sdim IsSimpleKeyAllowed = false; 1368234285Sdim 1369234285Sdim return true; 1370234285Sdim} 1371234285Sdim 1372234285Sdimbool Scanner::scanAliasOrAnchor(bool IsAlias) { 1373234285Sdim StringRef::iterator Start = Current; 1374234285Sdim unsigned ColStart = Column; 1375234285Sdim skip(1); 1376234285Sdim while(true) { 1377234285Sdim if ( *Current == '[' || *Current == ']' 1378234285Sdim || *Current == '{' || *Current == '}' 1379234285Sdim || *Current == ',' 1380234285Sdim || *Current == ':') 1381234285Sdim break; 1382234285Sdim StringRef::iterator i = skip_ns_char(Current); 1383234285Sdim if (i == Current) 1384234285Sdim break; 1385234285Sdim Current = i; 1386234285Sdim ++Column; 1387234285Sdim } 1388234285Sdim 1389234285Sdim if (Start == Current) { 1390234285Sdim setError("Got empty alias or anchor", Start); 1391234285Sdim return false; 1392234285Sdim } 1393234285Sdim 1394234285Sdim Token T; 1395234285Sdim T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1396234285Sdim T.Range = StringRef(Start, Current - Start); 1397234285Sdim TokenQueue.push_back(T); 1398234285Sdim 1399234285Sdim // Alias and anchors can be simple keys. 1400234285Sdim saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1401234285Sdim 1402234285Sdim IsSimpleKeyAllowed = false; 1403234285Sdim 1404234285Sdim return true; 1405234285Sdim} 1406234285Sdim 1407234285Sdimbool Scanner::scanBlockScalar(bool IsLiteral) { 1408234285Sdim StringRef::iterator Start = Current; 1409234285Sdim skip(1); // Eat | or > 1410234285Sdim while(true) { 1411234285Sdim StringRef::iterator i = skip_nb_char(Current); 1412234285Sdim if (i == Current) { 1413234285Sdim if (Column == 0) 1414234285Sdim break; 1415234285Sdim i = skip_b_break(Current); 1416234285Sdim if (i != Current) { 1417234285Sdim // We got a line break. 1418234285Sdim Column = 0; 1419234285Sdim ++Line; 1420234285Sdim Current = i; 1421234285Sdim continue; 1422234285Sdim } else { 1423234285Sdim // There was an error, which should already have been printed out. 1424234285Sdim return false; 1425234285Sdim } 1426234285Sdim } 1427234285Sdim Current = i; 1428234285Sdim ++Column; 1429234285Sdim } 1430234285Sdim 1431234285Sdim if (Start == Current) { 1432234285Sdim setError("Got empty block scalar", Start); 1433234285Sdim return false; 1434234285Sdim } 1435234285Sdim 1436234285Sdim Token T; 1437234285Sdim T.Kind = Token::TK_Scalar; 1438234285Sdim T.Range = StringRef(Start, Current - Start); 1439234285Sdim TokenQueue.push_back(T); 1440234285Sdim return true; 1441234285Sdim} 1442234285Sdim 1443234285Sdimbool Scanner::scanTag() { 1444234285Sdim StringRef::iterator Start = Current; 1445234285Sdim unsigned ColStart = Column; 1446234285Sdim skip(1); // Eat !. 1447234285Sdim if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1448234285Sdim else if (*Current == '<') { 1449234285Sdim skip(1); 1450234285Sdim scan_ns_uri_char(); 1451234285Sdim if (!consume('>')) 1452234285Sdim return false; 1453234285Sdim } else { 1454234285Sdim // FIXME: Actually parse the c-ns-shorthand-tag rule. 1455234285Sdim Current = skip_while(&Scanner::skip_ns_char, Current); 1456234285Sdim } 1457234285Sdim 1458234285Sdim Token T; 1459234285Sdim T.Kind = Token::TK_Tag; 1460234285Sdim T.Range = StringRef(Start, Current - Start); 1461234285Sdim TokenQueue.push_back(T); 1462234285Sdim 1463234285Sdim // Tags can be simple keys. 1464234285Sdim saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1465234285Sdim 1466234285Sdim IsSimpleKeyAllowed = false; 1467234285Sdim 1468234285Sdim return true; 1469234285Sdim} 1470234285Sdim 1471234285Sdimbool Scanner::fetchMoreTokens() { 1472234285Sdim if (IsStartOfStream) 1473234285Sdim return scanStreamStart(); 1474234285Sdim 1475234285Sdim scanToNextToken(); 1476234285Sdim 1477234285Sdim if (Current == End) 1478234285Sdim return scanStreamEnd(); 1479234285Sdim 1480234285Sdim removeStaleSimpleKeyCandidates(); 1481234285Sdim 1482234285Sdim unrollIndent(Column); 1483234285Sdim 1484234285Sdim if (Column == 0 && *Current == '%') 1485234285Sdim return scanDirective(); 1486234285Sdim 1487234285Sdim if (Column == 0 && Current + 4 <= End 1488234285Sdim && *Current == '-' 1489234285Sdim && *(Current + 1) == '-' 1490234285Sdim && *(Current + 2) == '-' 1491234285Sdim && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1492234285Sdim return scanDocumentIndicator(true); 1493234285Sdim 1494234285Sdim if (Column == 0 && Current + 4 <= End 1495234285Sdim && *Current == '.' 1496234285Sdim && *(Current + 1) == '.' 1497234285Sdim && *(Current + 2) == '.' 1498234285Sdim && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1499234285Sdim return scanDocumentIndicator(false); 1500234285Sdim 1501234285Sdim if (*Current == '[') 1502234285Sdim return scanFlowCollectionStart(true); 1503234285Sdim 1504234285Sdim if (*Current == '{') 1505234285Sdim return scanFlowCollectionStart(false); 1506234285Sdim 1507234285Sdim if (*Current == ']') 1508234285Sdim return scanFlowCollectionEnd(true); 1509234285Sdim 1510234285Sdim if (*Current == '}') 1511234285Sdim return scanFlowCollectionEnd(false); 1512234285Sdim 1513234285Sdim if (*Current == ',') 1514234285Sdim return scanFlowEntry(); 1515234285Sdim 1516234285Sdim if (*Current == '-' && isBlankOrBreak(Current + 1)) 1517234285Sdim return scanBlockEntry(); 1518234285Sdim 1519234285Sdim if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1520234285Sdim return scanKey(); 1521234285Sdim 1522234285Sdim if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1523234285Sdim return scanValue(); 1524234285Sdim 1525234285Sdim if (*Current == '*') 1526234285Sdim return scanAliasOrAnchor(true); 1527234285Sdim 1528234285Sdim if (*Current == '&') 1529234285Sdim return scanAliasOrAnchor(false); 1530234285Sdim 1531234285Sdim if (*Current == '!') 1532234285Sdim return scanTag(); 1533234285Sdim 1534234285Sdim if (*Current == '|' && !FlowLevel) 1535234285Sdim return scanBlockScalar(true); 1536234285Sdim 1537234285Sdim if (*Current == '>' && !FlowLevel) 1538234285Sdim return scanBlockScalar(false); 1539234285Sdim 1540234285Sdim if (*Current == '\'') 1541234285Sdim return scanFlowScalar(false); 1542234285Sdim 1543234285Sdim if (*Current == '"') 1544234285Sdim return scanFlowScalar(true); 1545234285Sdim 1546234285Sdim // Get a plain scalar. 1547234285Sdim StringRef FirstChar(Current, 1); 1548234285Sdim if (!(isBlankOrBreak(Current) 1549234285Sdim || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1550234285Sdim || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1551234285Sdim || (!FlowLevel && (*Current == '?' || *Current == ':') 1552234285Sdim && isBlankOrBreak(Current + 1)) 1553234285Sdim || (!FlowLevel && *Current == ':' 1554234285Sdim && Current + 2 < End 1555234285Sdim && *(Current + 1) == ':' 1556234285Sdim && !isBlankOrBreak(Current + 2))) 1557234285Sdim return scanPlainScalar(); 1558234285Sdim 1559234285Sdim setError("Unrecognized character while tokenizing."); 1560234285Sdim return false; 1561234285Sdim} 1562234285Sdim 1563234285SdimStream::Stream(StringRef Input, SourceMgr &SM) 1564234285Sdim : scanner(new Scanner(Input, SM)) 1565234285Sdim , CurrentDoc(0) {} 1566234285Sdim 1567252723SdimStream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM) 1568252723Sdim : scanner(new Scanner(InputBuffer, SM)) 1569252723Sdim , CurrentDoc(0) {} 1570252723Sdim 1571234285SdimStream::~Stream() {} 1572234285Sdim 1573234285Sdimbool Stream::failed() { return scanner->failed(); } 1574234285Sdim 1575234285Sdimvoid Stream::printError(Node *N, const Twine &Msg) { 1576234285Sdim SmallVector<SMRange, 1> Ranges; 1577234285Sdim Ranges.push_back(N->getSourceRange()); 1578234285Sdim scanner->printError( N->getSourceRange().Start 1579234285Sdim , SourceMgr::DK_Error 1580234285Sdim , Msg 1581234285Sdim , Ranges); 1582234285Sdim} 1583234285Sdim 1584234285Sdimdocument_iterator Stream::begin() { 1585234285Sdim if (CurrentDoc) 1586234285Sdim report_fatal_error("Can only iterate over the stream once"); 1587234285Sdim 1588234285Sdim // Skip Stream-Start. 1589234285Sdim scanner->getNext(); 1590234285Sdim 1591234285Sdim CurrentDoc.reset(new Document(*this)); 1592234285Sdim return document_iterator(CurrentDoc); 1593234285Sdim} 1594234285Sdim 1595234285Sdimdocument_iterator Stream::end() { 1596234285Sdim return document_iterator(); 1597234285Sdim} 1598234285Sdim 1599234285Sdimvoid Stream::skip() { 1600234285Sdim for (document_iterator i = begin(), e = end(); i != e; ++i) 1601234285Sdim i->skip(); 1602234285Sdim} 1603234285Sdim 1604263509SdimNode::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A, StringRef T) 1605234285Sdim : Doc(D) 1606234285Sdim , TypeID(Type) 1607263509Sdim , Anchor(A) 1608263509Sdim , Tag(T) { 1609234285Sdim SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1610234285Sdim SourceRange = SMRange(Start, Start); 1611234285Sdim} 1612234285Sdim 1613263509Sdimstd::string Node::getVerbatimTag() const { 1614263509Sdim StringRef Raw = getRawTag(); 1615263509Sdim if (!Raw.empty() && Raw != "!") { 1616263509Sdim std::string Ret; 1617263509Sdim if (Raw.find_last_of('!') == 0) { 1618263509Sdim Ret = Doc->getTagMap().find("!")->second; 1619263509Sdim Ret += Raw.substr(1); 1620263509Sdim return llvm_move(Ret); 1621263509Sdim } else if (Raw.startswith("!!")) { 1622263509Sdim Ret = Doc->getTagMap().find("!!")->second; 1623263509Sdim Ret += Raw.substr(2); 1624263509Sdim return llvm_move(Ret); 1625263509Sdim } else { 1626263509Sdim StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1627263509Sdim std::map<StringRef, StringRef>::const_iterator It = 1628263509Sdim Doc->getTagMap().find(TagHandle); 1629263509Sdim if (It != Doc->getTagMap().end()) 1630263509Sdim Ret = It->second; 1631263509Sdim else { 1632263509Sdim Token T; 1633263509Sdim T.Kind = Token::TK_Tag; 1634263509Sdim T.Range = TagHandle; 1635263509Sdim setError(Twine("Unknown tag handle ") + TagHandle, T); 1636263509Sdim } 1637263509Sdim Ret += Raw.substr(Raw.find_last_of('!') + 1); 1638263509Sdim return llvm_move(Ret); 1639263509Sdim } 1640263509Sdim } 1641263509Sdim 1642263509Sdim switch (getType()) { 1643263509Sdim case NK_Null: 1644263509Sdim return "tag:yaml.org,2002:null"; 1645263509Sdim case NK_Scalar: 1646263509Sdim // TODO: Tag resolution. 1647263509Sdim return "tag:yaml.org,2002:str"; 1648263509Sdim case NK_Mapping: 1649263509Sdim return "tag:yaml.org,2002:map"; 1650263509Sdim case NK_Sequence: 1651263509Sdim return "tag:yaml.org,2002:seq"; 1652263509Sdim } 1653263509Sdim 1654263509Sdim return ""; 1655263509Sdim} 1656263509Sdim 1657234285SdimToken &Node::peekNext() { 1658234285Sdim return Doc->peekNext(); 1659234285Sdim} 1660234285Sdim 1661234285SdimToken Node::getNext() { 1662234285Sdim return Doc->getNext(); 1663234285Sdim} 1664234285Sdim 1665234285SdimNode *Node::parseBlockNode() { 1666234285Sdim return Doc->parseBlockNode(); 1667234285Sdim} 1668234285Sdim 1669234285SdimBumpPtrAllocator &Node::getAllocator() { 1670234285Sdim return Doc->NodeAllocator; 1671234285Sdim} 1672234285Sdim 1673234285Sdimvoid Node::setError(const Twine &Msg, Token &Tok) const { 1674234285Sdim Doc->setError(Msg, Tok); 1675234285Sdim} 1676234285Sdim 1677234285Sdimbool Node::failed() const { 1678234285Sdim return Doc->failed(); 1679234285Sdim} 1680234285Sdim 1681234285Sdim 1682234285Sdim 1683234285SdimStringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1684234285Sdim // TODO: Handle newlines properly. We need to remove leading whitespace. 1685234285Sdim if (Value[0] == '"') { // Double quoted. 1686234285Sdim // Pull off the leading and trailing "s. 1687234285Sdim StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1688234285Sdim // Search for characters that would require unescaping the value. 1689234285Sdim StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1690234285Sdim if (i != StringRef::npos) 1691234285Sdim return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1692234285Sdim return UnquotedValue; 1693234285Sdim } else if (Value[0] == '\'') { // Single quoted. 1694234285Sdim // Pull off the leading and trailing 's. 1695234285Sdim StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1696234285Sdim StringRef::size_type i = UnquotedValue.find('\''); 1697234285Sdim if (i != StringRef::npos) { 1698234285Sdim // We're going to need Storage. 1699234285Sdim Storage.clear(); 1700234285Sdim Storage.reserve(UnquotedValue.size()); 1701234285Sdim for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1702234285Sdim StringRef Valid(UnquotedValue.begin(), i); 1703234285Sdim Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1704234285Sdim Storage.push_back('\''); 1705234285Sdim UnquotedValue = UnquotedValue.substr(i + 2); 1706234285Sdim } 1707234285Sdim Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1708234285Sdim return StringRef(Storage.begin(), Storage.size()); 1709234285Sdim } 1710234285Sdim return UnquotedValue; 1711234285Sdim } 1712234285Sdim // Plain or block. 1713245431Sdim return Value.rtrim(" "); 1714234285Sdim} 1715234285Sdim 1716234285SdimStringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1717234285Sdim , StringRef::size_type i 1718234285Sdim , SmallVectorImpl<char> &Storage) 1719234285Sdim const { 1720234285Sdim // Use Storage to build proper value. 1721234285Sdim Storage.clear(); 1722234285Sdim Storage.reserve(UnquotedValue.size()); 1723234285Sdim for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1724234285Sdim // Insert all previous chars into Storage. 1725234285Sdim StringRef Valid(UnquotedValue.begin(), i); 1726234285Sdim Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1727234285Sdim // Chop off inserted chars. 1728234285Sdim UnquotedValue = UnquotedValue.substr(i); 1729234285Sdim 1730234285Sdim assert(!UnquotedValue.empty() && "Can't be empty!"); 1731234285Sdim 1732234285Sdim // Parse escape or line break. 1733234285Sdim switch (UnquotedValue[0]) { 1734234285Sdim case '\r': 1735234285Sdim case '\n': 1736234285Sdim Storage.push_back('\n'); 1737234285Sdim if ( UnquotedValue.size() > 1 1738234285Sdim && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1739234285Sdim UnquotedValue = UnquotedValue.substr(1); 1740234285Sdim UnquotedValue = UnquotedValue.substr(1); 1741234285Sdim break; 1742234285Sdim default: 1743234285Sdim if (UnquotedValue.size() == 1) 1744234285Sdim // TODO: Report error. 1745234285Sdim break; 1746234285Sdim UnquotedValue = UnquotedValue.substr(1); 1747234285Sdim switch (UnquotedValue[0]) { 1748234285Sdim default: { 1749234285Sdim Token T; 1750234285Sdim T.Range = StringRef(UnquotedValue.begin(), 1); 1751234285Sdim setError("Unrecognized escape code!", T); 1752234285Sdim return ""; 1753234285Sdim } 1754234285Sdim case '\r': 1755234285Sdim case '\n': 1756234285Sdim // Remove the new line. 1757234285Sdim if ( UnquotedValue.size() > 1 1758234285Sdim && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1759234285Sdim UnquotedValue = UnquotedValue.substr(1); 1760234285Sdim // If this was just a single byte newline, it will get skipped 1761234285Sdim // below. 1762234285Sdim break; 1763234285Sdim case '0': 1764234285Sdim Storage.push_back(0x00); 1765234285Sdim break; 1766234285Sdim case 'a': 1767234285Sdim Storage.push_back(0x07); 1768234285Sdim break; 1769234285Sdim case 'b': 1770234285Sdim Storage.push_back(0x08); 1771234285Sdim break; 1772234285Sdim case 't': 1773234285Sdim case 0x09: 1774234285Sdim Storage.push_back(0x09); 1775234285Sdim break; 1776234285Sdim case 'n': 1777234285Sdim Storage.push_back(0x0A); 1778234285Sdim break; 1779234285Sdim case 'v': 1780234285Sdim Storage.push_back(0x0B); 1781234285Sdim break; 1782234285Sdim case 'f': 1783234285Sdim Storage.push_back(0x0C); 1784234285Sdim break; 1785234285Sdim case 'r': 1786234285Sdim Storage.push_back(0x0D); 1787234285Sdim break; 1788234285Sdim case 'e': 1789234285Sdim Storage.push_back(0x1B); 1790234285Sdim break; 1791234285Sdim case ' ': 1792234285Sdim Storage.push_back(0x20); 1793234285Sdim break; 1794234285Sdim case '"': 1795234285Sdim Storage.push_back(0x22); 1796234285Sdim break; 1797234285Sdim case '/': 1798234285Sdim Storage.push_back(0x2F); 1799234285Sdim break; 1800234285Sdim case '\\': 1801234285Sdim Storage.push_back(0x5C); 1802234285Sdim break; 1803234285Sdim case 'N': 1804234285Sdim encodeUTF8(0x85, Storage); 1805234285Sdim break; 1806234285Sdim case '_': 1807234285Sdim encodeUTF8(0xA0, Storage); 1808234285Sdim break; 1809234285Sdim case 'L': 1810234285Sdim encodeUTF8(0x2028, Storage); 1811234285Sdim break; 1812234285Sdim case 'P': 1813234285Sdim encodeUTF8(0x2029, Storage); 1814234285Sdim break; 1815234285Sdim case 'x': { 1816234285Sdim if (UnquotedValue.size() < 3) 1817234285Sdim // TODO: Report error. 1818234285Sdim break; 1819245431Sdim unsigned int UnicodeScalarValue; 1820245431Sdim if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1821245431Sdim // TODO: Report error. 1822245431Sdim UnicodeScalarValue = 0xFFFD; 1823234285Sdim encodeUTF8(UnicodeScalarValue, Storage); 1824234285Sdim UnquotedValue = UnquotedValue.substr(2); 1825234285Sdim break; 1826234285Sdim } 1827234285Sdim case 'u': { 1828234285Sdim if (UnquotedValue.size() < 5) 1829234285Sdim // TODO: Report error. 1830234285Sdim break; 1831245431Sdim unsigned int UnicodeScalarValue; 1832245431Sdim if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1833245431Sdim // TODO: Report error. 1834245431Sdim UnicodeScalarValue = 0xFFFD; 1835234285Sdim encodeUTF8(UnicodeScalarValue, Storage); 1836234285Sdim UnquotedValue = UnquotedValue.substr(4); 1837234285Sdim break; 1838234285Sdim } 1839234285Sdim case 'U': { 1840234285Sdim if (UnquotedValue.size() < 9) 1841234285Sdim // TODO: Report error. 1842234285Sdim break; 1843245431Sdim unsigned int UnicodeScalarValue; 1844245431Sdim if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 1845245431Sdim // TODO: Report error. 1846245431Sdim UnicodeScalarValue = 0xFFFD; 1847234285Sdim encodeUTF8(UnicodeScalarValue, Storage); 1848234285Sdim UnquotedValue = UnquotedValue.substr(8); 1849234285Sdim break; 1850234285Sdim } 1851234285Sdim } 1852234285Sdim UnquotedValue = UnquotedValue.substr(1); 1853234285Sdim } 1854234285Sdim } 1855234285Sdim Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1856234285Sdim return StringRef(Storage.begin(), Storage.size()); 1857234285Sdim} 1858234285Sdim 1859234285SdimNode *KeyValueNode::getKey() { 1860234285Sdim if (Key) 1861234285Sdim return Key; 1862234285Sdim // Handle implicit null keys. 1863234285Sdim { 1864234285Sdim Token &t = peekNext(); 1865234285Sdim if ( t.Kind == Token::TK_BlockEnd 1866234285Sdim || t.Kind == Token::TK_Value 1867234285Sdim || t.Kind == Token::TK_Error) { 1868234285Sdim return Key = new (getAllocator()) NullNode(Doc); 1869234285Sdim } 1870234285Sdim if (t.Kind == Token::TK_Key) 1871234285Sdim getNext(); // skip TK_Key. 1872234285Sdim } 1873234285Sdim 1874234285Sdim // Handle explicit null keys. 1875234285Sdim Token &t = peekNext(); 1876234285Sdim if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 1877234285Sdim return Key = new (getAllocator()) NullNode(Doc); 1878234285Sdim } 1879234285Sdim 1880234285Sdim // We've got a normal key. 1881234285Sdim return Key = parseBlockNode(); 1882234285Sdim} 1883234285Sdim 1884234285SdimNode *KeyValueNode::getValue() { 1885234285Sdim if (Value) 1886234285Sdim return Value; 1887234285Sdim getKey()->skip(); 1888234285Sdim if (failed()) 1889234285Sdim return Value = new (getAllocator()) NullNode(Doc); 1890234285Sdim 1891234285Sdim // Handle implicit null values. 1892234285Sdim { 1893234285Sdim Token &t = peekNext(); 1894234285Sdim if ( t.Kind == Token::TK_BlockEnd 1895234285Sdim || t.Kind == Token::TK_FlowMappingEnd 1896234285Sdim || t.Kind == Token::TK_Key 1897234285Sdim || t.Kind == Token::TK_FlowEntry 1898234285Sdim || t.Kind == Token::TK_Error) { 1899234285Sdim return Value = new (getAllocator()) NullNode(Doc); 1900234285Sdim } 1901234285Sdim 1902234285Sdim if (t.Kind != Token::TK_Value) { 1903234285Sdim setError("Unexpected token in Key Value.", t); 1904234285Sdim return Value = new (getAllocator()) NullNode(Doc); 1905234285Sdim } 1906234285Sdim getNext(); // skip TK_Value. 1907234285Sdim } 1908234285Sdim 1909234285Sdim // Handle explicit null values. 1910234285Sdim Token &t = peekNext(); 1911234285Sdim if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 1912234285Sdim return Value = new (getAllocator()) NullNode(Doc); 1913234285Sdim } 1914234285Sdim 1915234285Sdim // We got a normal value. 1916234285Sdim return Value = parseBlockNode(); 1917234285Sdim} 1918234285Sdim 1919234285Sdimvoid MappingNode::increment() { 1920234285Sdim if (failed()) { 1921234285Sdim IsAtEnd = true; 1922234285Sdim CurrentEntry = 0; 1923234285Sdim return; 1924234285Sdim } 1925234285Sdim if (CurrentEntry) { 1926234285Sdim CurrentEntry->skip(); 1927234285Sdim if (Type == MT_Inline) { 1928234285Sdim IsAtEnd = true; 1929234285Sdim CurrentEntry = 0; 1930234285Sdim return; 1931234285Sdim } 1932234285Sdim } 1933234285Sdim Token T = peekNext(); 1934234285Sdim if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 1935234285Sdim // KeyValueNode eats the TK_Key. That way it can detect null keys. 1936234285Sdim CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 1937234285Sdim } else if (Type == MT_Block) { 1938234285Sdim switch (T.Kind) { 1939234285Sdim case Token::TK_BlockEnd: 1940234285Sdim getNext(); 1941234285Sdim IsAtEnd = true; 1942234285Sdim CurrentEntry = 0; 1943234285Sdim break; 1944234285Sdim default: 1945234285Sdim setError("Unexpected token. Expected Key or Block End", T); 1946234285Sdim case Token::TK_Error: 1947234285Sdim IsAtEnd = true; 1948234285Sdim CurrentEntry = 0; 1949234285Sdim } 1950234285Sdim } else { 1951234285Sdim switch (T.Kind) { 1952234285Sdim case Token::TK_FlowEntry: 1953234285Sdim // Eat the flow entry and recurse. 1954234285Sdim getNext(); 1955234285Sdim return increment(); 1956234285Sdim case Token::TK_FlowMappingEnd: 1957234285Sdim getNext(); 1958234285Sdim case Token::TK_Error: 1959234285Sdim // Set this to end iterator. 1960234285Sdim IsAtEnd = true; 1961234285Sdim CurrentEntry = 0; 1962234285Sdim break; 1963234285Sdim default: 1964234285Sdim setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 1965234285Sdim "Mapping End." 1966234285Sdim , T); 1967234285Sdim IsAtEnd = true; 1968234285Sdim CurrentEntry = 0; 1969234285Sdim } 1970234285Sdim } 1971234285Sdim} 1972234285Sdim 1973234285Sdimvoid SequenceNode::increment() { 1974234285Sdim if (failed()) { 1975234285Sdim IsAtEnd = true; 1976234285Sdim CurrentEntry = 0; 1977234285Sdim return; 1978234285Sdim } 1979234285Sdim if (CurrentEntry) 1980234285Sdim CurrentEntry->skip(); 1981234285Sdim Token T = peekNext(); 1982234285Sdim if (SeqType == ST_Block) { 1983234285Sdim switch (T.Kind) { 1984234285Sdim case Token::TK_BlockEntry: 1985234285Sdim getNext(); 1986234285Sdim CurrentEntry = parseBlockNode(); 1987234285Sdim if (CurrentEntry == 0) { // An error occurred. 1988234285Sdim IsAtEnd = true; 1989234285Sdim CurrentEntry = 0; 1990234285Sdim } 1991234285Sdim break; 1992234285Sdim case Token::TK_BlockEnd: 1993234285Sdim getNext(); 1994234285Sdim IsAtEnd = true; 1995234285Sdim CurrentEntry = 0; 1996234285Sdim break; 1997234285Sdim default: 1998234285Sdim setError( "Unexpected token. Expected Block Entry or Block End." 1999234285Sdim , T); 2000234285Sdim case Token::TK_Error: 2001234285Sdim IsAtEnd = true; 2002234285Sdim CurrentEntry = 0; 2003234285Sdim } 2004234285Sdim } else if (SeqType == ST_Indentless) { 2005234285Sdim switch (T.Kind) { 2006234285Sdim case Token::TK_BlockEntry: 2007234285Sdim getNext(); 2008234285Sdim CurrentEntry = parseBlockNode(); 2009234285Sdim if (CurrentEntry == 0) { // An error occurred. 2010234285Sdim IsAtEnd = true; 2011234285Sdim CurrentEntry = 0; 2012234285Sdim } 2013234285Sdim break; 2014234285Sdim default: 2015234285Sdim case Token::TK_Error: 2016234285Sdim IsAtEnd = true; 2017234285Sdim CurrentEntry = 0; 2018234285Sdim } 2019234285Sdim } else if (SeqType == ST_Flow) { 2020234285Sdim switch (T.Kind) { 2021234285Sdim case Token::TK_FlowEntry: 2022234285Sdim // Eat the flow entry and recurse. 2023234285Sdim getNext(); 2024234285Sdim WasPreviousTokenFlowEntry = true; 2025234285Sdim return increment(); 2026234285Sdim case Token::TK_FlowSequenceEnd: 2027234285Sdim getNext(); 2028234285Sdim case Token::TK_Error: 2029234285Sdim // Set this to end iterator. 2030234285Sdim IsAtEnd = true; 2031234285Sdim CurrentEntry = 0; 2032234285Sdim break; 2033234285Sdim case Token::TK_StreamEnd: 2034234285Sdim case Token::TK_DocumentEnd: 2035234285Sdim case Token::TK_DocumentStart: 2036234285Sdim setError("Could not find closing ]!", T); 2037234285Sdim // Set this to end iterator. 2038234285Sdim IsAtEnd = true; 2039234285Sdim CurrentEntry = 0; 2040234285Sdim break; 2041234285Sdim default: 2042234285Sdim if (!WasPreviousTokenFlowEntry) { 2043234285Sdim setError("Expected , between entries!", T); 2044234285Sdim IsAtEnd = true; 2045234285Sdim CurrentEntry = 0; 2046234285Sdim break; 2047234285Sdim } 2048234285Sdim // Otherwise it must be a flow entry. 2049234285Sdim CurrentEntry = parseBlockNode(); 2050234285Sdim if (!CurrentEntry) { 2051234285Sdim IsAtEnd = true; 2052234285Sdim } 2053234285Sdim WasPreviousTokenFlowEntry = false; 2054234285Sdim break; 2055234285Sdim } 2056234285Sdim } 2057234285Sdim} 2058234285Sdim 2059234285SdimDocument::Document(Stream &S) : stream(S), Root(0) { 2060263509Sdim // Tag maps starts with two default mappings. 2061263509Sdim TagMap["!"] = "!"; 2062263509Sdim TagMap["!!"] = "tag:yaml.org,2002:"; 2063263509Sdim 2064234285Sdim if (parseDirectives()) 2065234285Sdim expectToken(Token::TK_DocumentStart); 2066234285Sdim Token &T = peekNext(); 2067234285Sdim if (T.Kind == Token::TK_DocumentStart) 2068234285Sdim getNext(); 2069234285Sdim} 2070234285Sdim 2071234285Sdimbool Document::skip() { 2072234285Sdim if (stream.scanner->failed()) 2073234285Sdim return false; 2074234285Sdim if (!Root) 2075234285Sdim getRoot(); 2076234285Sdim Root->skip(); 2077234285Sdim Token &T = peekNext(); 2078234285Sdim if (T.Kind == Token::TK_StreamEnd) 2079234285Sdim return false; 2080234285Sdim if (T.Kind == Token::TK_DocumentEnd) { 2081234285Sdim getNext(); 2082234285Sdim return skip(); 2083234285Sdim } 2084234285Sdim return true; 2085234285Sdim} 2086234285Sdim 2087234285SdimToken &Document::peekNext() { 2088234285Sdim return stream.scanner->peekNext(); 2089234285Sdim} 2090234285Sdim 2091234285SdimToken Document::getNext() { 2092234285Sdim return stream.scanner->getNext(); 2093234285Sdim} 2094234285Sdim 2095234285Sdimvoid Document::setError(const Twine &Message, Token &Location) const { 2096234285Sdim stream.scanner->setError(Message, Location.Range.begin()); 2097234285Sdim} 2098234285Sdim 2099234285Sdimbool Document::failed() const { 2100234285Sdim return stream.scanner->failed(); 2101234285Sdim} 2102234285Sdim 2103234285SdimNode *Document::parseBlockNode() { 2104234285Sdim Token T = peekNext(); 2105234285Sdim // Handle properties. 2106234285Sdim Token AnchorInfo; 2107263509Sdim Token TagInfo; 2108234285Sdimparse_property: 2109234285Sdim switch (T.Kind) { 2110234285Sdim case Token::TK_Alias: 2111234285Sdim getNext(); 2112234285Sdim return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2113234285Sdim case Token::TK_Anchor: 2114234285Sdim if (AnchorInfo.Kind == Token::TK_Anchor) { 2115234285Sdim setError("Already encountered an anchor for this node!", T); 2116234285Sdim return 0; 2117234285Sdim } 2118234285Sdim AnchorInfo = getNext(); // Consume TK_Anchor. 2119234285Sdim T = peekNext(); 2120234285Sdim goto parse_property; 2121234285Sdim case Token::TK_Tag: 2122263509Sdim if (TagInfo.Kind == Token::TK_Tag) { 2123263509Sdim setError("Already encountered a tag for this node!", T); 2124263509Sdim return 0; 2125263509Sdim } 2126263509Sdim TagInfo = getNext(); // Consume TK_Tag. 2127234285Sdim T = peekNext(); 2128234285Sdim goto parse_property; 2129234285Sdim default: 2130234285Sdim break; 2131234285Sdim } 2132234285Sdim 2133234285Sdim switch (T.Kind) { 2134234285Sdim case Token::TK_BlockEntry: 2135234285Sdim // We got an unindented BlockEntry sequence. This is not terminated with 2136234285Sdim // a BlockEnd. 2137234285Sdim // Don't eat the TK_BlockEntry, SequenceNode needs it. 2138234285Sdim return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2139234285Sdim , AnchorInfo.Range.substr(1) 2140263509Sdim , TagInfo.Range 2141234285Sdim , SequenceNode::ST_Indentless); 2142234285Sdim case Token::TK_BlockSequenceStart: 2143234285Sdim getNext(); 2144234285Sdim return new (NodeAllocator) 2145234285Sdim SequenceNode( stream.CurrentDoc 2146234285Sdim , AnchorInfo.Range.substr(1) 2147263509Sdim , TagInfo.Range 2148234285Sdim , SequenceNode::ST_Block); 2149234285Sdim case Token::TK_BlockMappingStart: 2150234285Sdim getNext(); 2151234285Sdim return new (NodeAllocator) 2152234285Sdim MappingNode( stream.CurrentDoc 2153234285Sdim , AnchorInfo.Range.substr(1) 2154263509Sdim , TagInfo.Range 2155234285Sdim , MappingNode::MT_Block); 2156234285Sdim case Token::TK_FlowSequenceStart: 2157234285Sdim getNext(); 2158234285Sdim return new (NodeAllocator) 2159234285Sdim SequenceNode( stream.CurrentDoc 2160234285Sdim , AnchorInfo.Range.substr(1) 2161263509Sdim , TagInfo.Range 2162234285Sdim , SequenceNode::ST_Flow); 2163234285Sdim case Token::TK_FlowMappingStart: 2164234285Sdim getNext(); 2165234285Sdim return new (NodeAllocator) 2166234285Sdim MappingNode( stream.CurrentDoc 2167234285Sdim , AnchorInfo.Range.substr(1) 2168263509Sdim , TagInfo.Range 2169234285Sdim , MappingNode::MT_Flow); 2170234285Sdim case Token::TK_Scalar: 2171234285Sdim getNext(); 2172234285Sdim return new (NodeAllocator) 2173234285Sdim ScalarNode( stream.CurrentDoc 2174234285Sdim , AnchorInfo.Range.substr(1) 2175263509Sdim , TagInfo.Range 2176234285Sdim , T.Range); 2177234285Sdim case Token::TK_Key: 2178234285Sdim // Don't eat the TK_Key, KeyValueNode expects it. 2179234285Sdim return new (NodeAllocator) 2180234285Sdim MappingNode( stream.CurrentDoc 2181234285Sdim , AnchorInfo.Range.substr(1) 2182263509Sdim , TagInfo.Range 2183234285Sdim , MappingNode::MT_Inline); 2184234285Sdim case Token::TK_DocumentStart: 2185234285Sdim case Token::TK_DocumentEnd: 2186234285Sdim case Token::TK_StreamEnd: 2187234285Sdim default: 2188234285Sdim // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2189234285Sdim // !!null null. 2190234285Sdim return new (NodeAllocator) NullNode(stream.CurrentDoc); 2191234285Sdim case Token::TK_Error: 2192234285Sdim return 0; 2193234285Sdim } 2194234285Sdim llvm_unreachable("Control flow shouldn't reach here."); 2195234285Sdim return 0; 2196234285Sdim} 2197234285Sdim 2198234285Sdimbool Document::parseDirectives() { 2199234285Sdim bool isDirective = false; 2200234285Sdim while (true) { 2201234285Sdim Token T = peekNext(); 2202234285Sdim if (T.Kind == Token::TK_TagDirective) { 2203263509Sdim parseTAGDirective(); 2204234285Sdim isDirective = true; 2205234285Sdim } else if (T.Kind == Token::TK_VersionDirective) { 2206263509Sdim parseYAMLDirective(); 2207234285Sdim isDirective = true; 2208234285Sdim } else 2209234285Sdim break; 2210234285Sdim } 2211234285Sdim return isDirective; 2212234285Sdim} 2213234285Sdim 2214263509Sdimvoid Document::parseYAMLDirective() { 2215263509Sdim getNext(); // Eat %YAML <version> 2216263509Sdim} 2217263509Sdim 2218263509Sdimvoid Document::parseTAGDirective() { 2219263509Sdim Token Tag = getNext(); // %TAG <handle> <prefix> 2220263509Sdim StringRef T = Tag.Range; 2221263509Sdim // Strip %TAG 2222263509Sdim T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2223263509Sdim std::size_t HandleEnd = T.find_first_of(" \t"); 2224263509Sdim StringRef TagHandle = T.substr(0, HandleEnd); 2225263509Sdim StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2226263509Sdim TagMap[TagHandle] = TagPrefix; 2227263509Sdim} 2228263509Sdim 2229234285Sdimbool Document::expectToken(int TK) { 2230234285Sdim Token T = getNext(); 2231234285Sdim if (T.Kind != TK) { 2232234285Sdim setError("Unexpected token", T); 2233234285Sdim return false; 2234234285Sdim } 2235234285Sdim return true; 2236234285Sdim} 2237