1247738Sbapt 2247738Sbapt#include "yaml_private.h" 3247738Sbapt 4247738Sbapt/* 5247738Sbapt * Declarations. 6247738Sbapt */ 7247738Sbapt 8247738Sbaptstatic int 9247738Sbaptyaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, 10247738Sbapt size_t offset, int value); 11247738Sbapt 12247738Sbaptstatic int 13247738Sbaptyaml_parser_update_raw_buffer(yaml_parser_t *parser); 14247738Sbapt 15247738Sbaptstatic int 16247738Sbaptyaml_parser_determine_encoding(yaml_parser_t *parser); 17247738Sbapt 18247738SbaptYAML_DECLARE(int) 19247738Sbaptyaml_parser_update_buffer(yaml_parser_t *parser, size_t length); 20247738Sbapt 21247738Sbapt/* 22247738Sbapt * Set the reader error and return 0. 23247738Sbapt */ 24247738Sbapt 25247738Sbaptstatic int 26247738Sbaptyaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, 27247738Sbapt size_t offset, int value) 28247738Sbapt{ 29247738Sbapt parser->error = YAML_READER_ERROR; 30247738Sbapt parser->problem = problem; 31247738Sbapt parser->problem_offset = offset; 32247738Sbapt parser->problem_value = value; 33247738Sbapt 34247738Sbapt return 0; 35247738Sbapt} 36247738Sbapt 37247738Sbapt/* 38247738Sbapt * Byte order marks. 39247738Sbapt */ 40247738Sbapt 41247738Sbapt#define BOM_UTF8 "\xef\xbb\xbf" 42247738Sbapt#define BOM_UTF16LE "\xff\xfe" 43247738Sbapt#define BOM_UTF16BE "\xfe\xff" 44247738Sbapt 45247738Sbapt/* 46247738Sbapt * Determine the input stream encoding by checking the BOM symbol. If no BOM is 47247738Sbapt * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure. 48247738Sbapt */ 49247738Sbapt 50247738Sbaptstatic int 51247738Sbaptyaml_parser_determine_encoding(yaml_parser_t *parser) 52247738Sbapt{ 53247738Sbapt /* Ensure that we had enough bytes in the raw buffer. */ 54247738Sbapt 55247738Sbapt while (!parser->eof 56247738Sbapt && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) { 57247738Sbapt if (!yaml_parser_update_raw_buffer(parser)) { 58247738Sbapt return 0; 59247738Sbapt } 60247738Sbapt } 61247738Sbapt 62247738Sbapt /* Determine the encoding. */ 63247738Sbapt 64247738Sbapt if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 65247738Sbapt && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) { 66247738Sbapt parser->encoding = YAML_UTF16LE_ENCODING; 67247738Sbapt parser->raw_buffer.pointer += 2; 68247738Sbapt parser->offset += 2; 69247738Sbapt } 70247738Sbapt else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 71247738Sbapt && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) { 72247738Sbapt parser->encoding = YAML_UTF16BE_ENCODING; 73247738Sbapt parser->raw_buffer.pointer += 2; 74247738Sbapt parser->offset += 2; 75247738Sbapt } 76247738Sbapt else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3 77247738Sbapt && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) { 78247738Sbapt parser->encoding = YAML_UTF8_ENCODING; 79247738Sbapt parser->raw_buffer.pointer += 3; 80247738Sbapt parser->offset += 3; 81247738Sbapt } 82247738Sbapt else { 83247738Sbapt parser->encoding = YAML_UTF8_ENCODING; 84247738Sbapt } 85247738Sbapt 86247738Sbapt return 1; 87247738Sbapt} 88247738Sbapt 89247738Sbapt/* 90247738Sbapt * Update the raw buffer. 91247738Sbapt */ 92247738Sbapt 93247738Sbaptstatic int 94247738Sbaptyaml_parser_update_raw_buffer(yaml_parser_t *parser) 95247738Sbapt{ 96247738Sbapt size_t size_read = 0; 97247738Sbapt 98247738Sbapt /* Return if the raw buffer is full. */ 99247738Sbapt 100247738Sbapt if (parser->raw_buffer.start == parser->raw_buffer.pointer 101247738Sbapt && parser->raw_buffer.last == parser->raw_buffer.end) 102247738Sbapt return 1; 103247738Sbapt 104247738Sbapt /* Return on EOF. */ 105247738Sbapt 106247738Sbapt if (parser->eof) return 1; 107247738Sbapt 108247738Sbapt /* Move the remaining bytes in the raw buffer to the beginning. */ 109247738Sbapt 110247738Sbapt if (parser->raw_buffer.start < parser->raw_buffer.pointer 111247738Sbapt && parser->raw_buffer.pointer < parser->raw_buffer.last) { 112247738Sbapt memmove(parser->raw_buffer.start, parser->raw_buffer.pointer, 113247738Sbapt parser->raw_buffer.last - parser->raw_buffer.pointer); 114247738Sbapt } 115247738Sbapt parser->raw_buffer.last -= 116247738Sbapt parser->raw_buffer.pointer - parser->raw_buffer.start; 117247738Sbapt parser->raw_buffer.pointer = parser->raw_buffer.start; 118247738Sbapt 119247738Sbapt /* Call the read handler to fill the buffer. */ 120247738Sbapt 121247738Sbapt if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last, 122247738Sbapt parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) { 123247738Sbapt return yaml_parser_set_reader_error(parser, "input error", 124247738Sbapt parser->offset, -1); 125247738Sbapt } 126247738Sbapt parser->raw_buffer.last += size_read; 127247738Sbapt if (!size_read) { 128247738Sbapt parser->eof = 1; 129247738Sbapt } 130247738Sbapt 131247738Sbapt return 1; 132247738Sbapt} 133247738Sbapt 134247738Sbapt/* 135247738Sbapt * Ensure that the buffer contains at least `length` characters. 136247738Sbapt * Return 1 on success, 0 on failure. 137247738Sbapt * 138247738Sbapt * The length is supposed to be significantly less that the buffer size. 139247738Sbapt */ 140247738Sbapt 141247738SbaptYAML_DECLARE(int) 142247738Sbaptyaml_parser_update_buffer(yaml_parser_t *parser, size_t length) 143247738Sbapt{ 144247738Sbapt int first = 1; 145247738Sbapt 146247738Sbapt assert(parser->read_handler); /* Read handler must be set. */ 147247738Sbapt 148247738Sbapt /* If the EOF flag is set and the raw buffer is empty, do nothing. */ 149247738Sbapt 150247738Sbapt if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last) 151247738Sbapt return 1; 152247738Sbapt 153247738Sbapt /* Return if the buffer contains enough characters. */ 154247738Sbapt 155247738Sbapt if (parser->unread >= length) 156247738Sbapt return 1; 157247738Sbapt 158247738Sbapt /* Determine the input encoding if it is not known yet. */ 159247738Sbapt 160247738Sbapt if (!parser->encoding) { 161247738Sbapt if (!yaml_parser_determine_encoding(parser)) 162247738Sbapt return 0; 163247738Sbapt } 164247738Sbapt 165247738Sbapt /* Move the unread characters to the beginning of the buffer. */ 166247738Sbapt 167247738Sbapt if (parser->buffer.start < parser->buffer.pointer 168247738Sbapt && parser->buffer.pointer < parser->buffer.last) { 169247738Sbapt size_t size = parser->buffer.last - parser->buffer.pointer; 170247738Sbapt memmove(parser->buffer.start, parser->buffer.pointer, size); 171247738Sbapt parser->buffer.pointer = parser->buffer.start; 172247738Sbapt parser->buffer.last = parser->buffer.start + size; 173247738Sbapt } 174247738Sbapt else if (parser->buffer.pointer == parser->buffer.last) { 175247738Sbapt parser->buffer.pointer = parser->buffer.start; 176247738Sbapt parser->buffer.last = parser->buffer.start; 177247738Sbapt } 178247738Sbapt 179247738Sbapt /* Fill the buffer until it has enough characters. */ 180247738Sbapt 181247738Sbapt while (parser->unread < length) 182247738Sbapt { 183247738Sbapt /* Fill the raw buffer if necessary. */ 184247738Sbapt 185247738Sbapt if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) { 186247738Sbapt if (!yaml_parser_update_raw_buffer(parser)) return 0; 187247738Sbapt } 188247738Sbapt first = 0; 189247738Sbapt 190247738Sbapt /* Decode the raw buffer. */ 191247738Sbapt 192247738Sbapt while (parser->raw_buffer.pointer != parser->raw_buffer.last) 193247738Sbapt { 194247738Sbapt unsigned int value = 0, value2 = 0; 195247738Sbapt int incomplete = 0; 196247738Sbapt unsigned char octet; 197247738Sbapt unsigned int width = 0; 198247738Sbapt int low, high; 199247738Sbapt size_t k; 200247738Sbapt size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer; 201247738Sbapt 202247738Sbapt /* Decode the next character. */ 203247738Sbapt 204247738Sbapt switch (parser->encoding) 205247738Sbapt { 206247738Sbapt case YAML_UTF8_ENCODING: 207247738Sbapt 208247738Sbapt /* 209247738Sbapt * Decode a UTF-8 character. Check RFC 3629 210247738Sbapt * (http://www.ietf.org/rfc/rfc3629.txt) for more details. 211247738Sbapt * 212247738Sbapt * The following table (taken from the RFC) is used for 213247738Sbapt * decoding. 214247738Sbapt * 215247738Sbapt * Char. number range | UTF-8 octet sequence 216247738Sbapt * (hexadecimal) | (binary) 217247738Sbapt * --------------------+------------------------------------ 218247738Sbapt * 0000 0000-0000 007F | 0xxxxxxx 219247738Sbapt * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 220247738Sbapt * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 221247738Sbapt * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 222247738Sbapt * 223247738Sbapt * Additionally, the characters in the range 0xD800-0xDFFF 224247738Sbapt * are prohibited as they are reserved for use with UTF-16 225247738Sbapt * surrogate pairs. 226247738Sbapt */ 227247738Sbapt 228247738Sbapt /* Determine the length of the UTF-8 sequence. */ 229247738Sbapt 230247738Sbapt octet = parser->raw_buffer.pointer[0]; 231247738Sbapt width = (octet & 0x80) == 0x00 ? 1 : 232247738Sbapt (octet & 0xE0) == 0xC0 ? 2 : 233247738Sbapt (octet & 0xF0) == 0xE0 ? 3 : 234247738Sbapt (octet & 0xF8) == 0xF0 ? 4 : 0; 235247738Sbapt 236247738Sbapt /* Check if the leading octet is valid. */ 237247738Sbapt 238247738Sbapt if (!width) 239247738Sbapt return yaml_parser_set_reader_error(parser, 240247738Sbapt "invalid leading UTF-8 octet", 241247738Sbapt parser->offset, octet); 242247738Sbapt 243247738Sbapt /* Check if the raw buffer contains an incomplete character. */ 244247738Sbapt 245247738Sbapt if (width > raw_unread) { 246247738Sbapt if (parser->eof) { 247247738Sbapt return yaml_parser_set_reader_error(parser, 248247738Sbapt "incomplete UTF-8 octet sequence", 249247738Sbapt parser->offset, -1); 250247738Sbapt } 251247738Sbapt incomplete = 1; 252247738Sbapt break; 253247738Sbapt } 254247738Sbapt 255247738Sbapt /* Decode the leading octet. */ 256247738Sbapt 257247738Sbapt value = (octet & 0x80) == 0x00 ? octet & 0x7F : 258247738Sbapt (octet & 0xE0) == 0xC0 ? octet & 0x1F : 259247738Sbapt (octet & 0xF0) == 0xE0 ? octet & 0x0F : 260247738Sbapt (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0; 261247738Sbapt 262247738Sbapt /* Check and decode the trailing octets. */ 263247738Sbapt 264247738Sbapt for (k = 1; k < width; k ++) 265247738Sbapt { 266247738Sbapt octet = parser->raw_buffer.pointer[k]; 267247738Sbapt 268247738Sbapt /* Check if the octet is valid. */ 269247738Sbapt 270247738Sbapt if ((octet & 0xC0) != 0x80) 271247738Sbapt return yaml_parser_set_reader_error(parser, 272247738Sbapt "invalid trailing UTF-8 octet", 273247738Sbapt parser->offset+k, octet); 274247738Sbapt 275247738Sbapt /* Decode the octet. */ 276247738Sbapt 277247738Sbapt value = (value << 6) + (octet & 0x3F); 278247738Sbapt } 279247738Sbapt 280247738Sbapt /* Check the length of the sequence against the value. */ 281247738Sbapt 282247738Sbapt if (!((width == 1) || 283247738Sbapt (width == 2 && value >= 0x80) || 284247738Sbapt (width == 3 && value >= 0x800) || 285247738Sbapt (width == 4 && value >= 0x10000))) 286247738Sbapt return yaml_parser_set_reader_error(parser, 287247738Sbapt "invalid length of a UTF-8 sequence", 288247738Sbapt parser->offset, -1); 289247738Sbapt 290247738Sbapt /* Check the range of the value. */ 291247738Sbapt 292247738Sbapt if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) 293247738Sbapt return yaml_parser_set_reader_error(parser, 294247738Sbapt "invalid Unicode character", 295247738Sbapt parser->offset, value); 296247738Sbapt 297247738Sbapt break; 298247738Sbapt 299247738Sbapt case YAML_UTF16LE_ENCODING: 300247738Sbapt case YAML_UTF16BE_ENCODING: 301247738Sbapt 302247738Sbapt low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1); 303247738Sbapt high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0); 304247738Sbapt 305247738Sbapt /* 306247738Sbapt * The UTF-16 encoding is not as simple as one might 307247738Sbapt * naively think. Check RFC 2781 308247738Sbapt * (http://www.ietf.org/rfc/rfc2781.txt). 309247738Sbapt * 310247738Sbapt * Normally, two subsequent bytes describe a Unicode 311247738Sbapt * character. However a special technique (called a 312247738Sbapt * surrogate pair) is used for specifying character 313247738Sbapt * values larger than 0xFFFF. 314247738Sbapt * 315247738Sbapt * A surrogate pair consists of two pseudo-characters: 316247738Sbapt * high surrogate area (0xD800-0xDBFF) 317247738Sbapt * low surrogate area (0xDC00-0xDFFF) 318247738Sbapt * 319247738Sbapt * The following formulas are used for decoding 320247738Sbapt * and encoding characters using surrogate pairs: 321247738Sbapt * 322247738Sbapt * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF) 323247738Sbapt * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF) 324247738Sbapt * W1 = 110110yyyyyyyyyy 325247738Sbapt * W2 = 110111xxxxxxxxxx 326247738Sbapt * 327247738Sbapt * where U is the character value, W1 is the high surrogate 328247738Sbapt * area, W2 is the low surrogate area. 329247738Sbapt */ 330247738Sbapt 331247738Sbapt /* Check for incomplete UTF-16 character. */ 332247738Sbapt 333247738Sbapt if (raw_unread < 2) { 334247738Sbapt if (parser->eof) { 335247738Sbapt return yaml_parser_set_reader_error(parser, 336247738Sbapt "incomplete UTF-16 character", 337247738Sbapt parser->offset, -1); 338247738Sbapt } 339247738Sbapt incomplete = 1; 340247738Sbapt break; 341247738Sbapt } 342247738Sbapt 343247738Sbapt /* Get the character. */ 344247738Sbapt 345247738Sbapt value = parser->raw_buffer.pointer[low] 346247738Sbapt + (parser->raw_buffer.pointer[high] << 8); 347247738Sbapt 348247738Sbapt /* Check for unexpected low surrogate area. */ 349247738Sbapt 350247738Sbapt if ((value & 0xFC00) == 0xDC00) 351247738Sbapt return yaml_parser_set_reader_error(parser, 352247738Sbapt "unexpected low surrogate area", 353247738Sbapt parser->offset, value); 354247738Sbapt 355247738Sbapt /* Check for a high surrogate area. */ 356247738Sbapt 357247738Sbapt if ((value & 0xFC00) == 0xD800) { 358247738Sbapt 359247738Sbapt width = 4; 360247738Sbapt 361247738Sbapt /* Check for incomplete surrogate pair. */ 362247738Sbapt 363247738Sbapt if (raw_unread < 4) { 364247738Sbapt if (parser->eof) { 365247738Sbapt return yaml_parser_set_reader_error(parser, 366247738Sbapt "incomplete UTF-16 surrogate pair", 367247738Sbapt parser->offset, -1); 368247738Sbapt } 369247738Sbapt incomplete = 1; 370247738Sbapt break; 371247738Sbapt } 372247738Sbapt 373247738Sbapt /* Get the next character. */ 374247738Sbapt 375247738Sbapt value2 = parser->raw_buffer.pointer[low+2] 376247738Sbapt + (parser->raw_buffer.pointer[high+2] << 8); 377247738Sbapt 378247738Sbapt /* Check for a low surrogate area. */ 379247738Sbapt 380247738Sbapt if ((value2 & 0xFC00) != 0xDC00) 381247738Sbapt return yaml_parser_set_reader_error(parser, 382247738Sbapt "expected low surrogate area", 383247738Sbapt parser->offset+2, value2); 384247738Sbapt 385247738Sbapt /* Generate the value of the surrogate pair. */ 386247738Sbapt 387247738Sbapt value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); 388247738Sbapt } 389247738Sbapt 390247738Sbapt else { 391247738Sbapt width = 2; 392247738Sbapt } 393247738Sbapt 394247738Sbapt break; 395247738Sbapt 396247738Sbapt default: 397247738Sbapt assert(1); /* Impossible. */ 398247738Sbapt } 399247738Sbapt 400247738Sbapt /* Check if the raw buffer contains enough bytes to form a character. */ 401247738Sbapt 402247738Sbapt if (incomplete) break; 403247738Sbapt 404247738Sbapt /* 405247738Sbapt * Check if the character is in the allowed range: 406247738Sbapt * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) 407247738Sbapt * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) 408247738Sbapt * | [#x10000-#x10FFFF] (32 bit) 409247738Sbapt */ 410247738Sbapt 411247738Sbapt if (! (value == 0x09 || value == 0x0A || value == 0x0D 412247738Sbapt || (value >= 0x20 && value <= 0x7E) 413247738Sbapt || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) 414247738Sbapt || (value >= 0xE000 && value <= 0xFFFD) 415247738Sbapt || (value >= 0x10000 && value <= 0x10FFFF))) 416247738Sbapt return yaml_parser_set_reader_error(parser, 417247738Sbapt "control characters are not allowed", 418247738Sbapt parser->offset, value); 419247738Sbapt 420247738Sbapt /* Move the raw pointers. */ 421247738Sbapt 422247738Sbapt parser->raw_buffer.pointer += width; 423247738Sbapt parser->offset += width; 424247738Sbapt 425247738Sbapt /* Finally put the character into the buffer. */ 426247738Sbapt 427247738Sbapt /* 0000 0000-0000 007F -> 0xxxxxxx */ 428247738Sbapt if (value <= 0x7F) { 429247738Sbapt *(parser->buffer.last++) = value; 430247738Sbapt } 431247738Sbapt /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ 432247738Sbapt else if (value <= 0x7FF) { 433247738Sbapt *(parser->buffer.last++) = 0xC0 + (value >> 6); 434247738Sbapt *(parser->buffer.last++) = 0x80 + (value & 0x3F); 435247738Sbapt } 436247738Sbapt /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ 437247738Sbapt else if (value <= 0xFFFF) { 438247738Sbapt *(parser->buffer.last++) = 0xE0 + (value >> 12); 439247738Sbapt *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); 440247738Sbapt *(parser->buffer.last++) = 0x80 + (value & 0x3F); 441247738Sbapt } 442247738Sbapt /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 443247738Sbapt else { 444247738Sbapt *(parser->buffer.last++) = 0xF0 + (value >> 18); 445247738Sbapt *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F); 446247738Sbapt *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); 447247738Sbapt *(parser->buffer.last++) = 0x80 + (value & 0x3F); 448247738Sbapt } 449247738Sbapt 450247738Sbapt parser->unread ++; 451247738Sbapt } 452247738Sbapt 453247738Sbapt /* On EOF, put NUL into the buffer and return. */ 454247738Sbapt 455247738Sbapt if (parser->eof) { 456247738Sbapt *(parser->buffer.last++) = '\0'; 457247738Sbapt parser->unread ++; 458247738Sbapt return 1; 459247738Sbapt } 460247738Sbapt 461247738Sbapt } 462247738Sbapt 463247738Sbapt return 1; 464247738Sbapt} 465247738Sbapt 466