1 2#include "yaml_private.h" 3 4/* 5 * Declarations. 6 */ 7 8static int 9yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, 10 size_t offset, int value); 11 12static int 13yaml_parser_update_raw_buffer(yaml_parser_t *parser); 14 15static int 16yaml_parser_determine_encoding(yaml_parser_t *parser); 17 18YAML_DECLARE(int) 19yaml_parser_update_buffer(yaml_parser_t *parser, size_t length); 20 21/* 22 * Set the reader error and return 0. 23 */ 24 25static int 26yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, 27 size_t offset, int value) 28{ 29 parser->error = YAML_READER_ERROR; 30 parser->problem = problem; 31 parser->problem_offset = offset; 32 parser->problem_value = value; 33 34 return 0; 35} 36 37/* 38 * Byte order marks. 39 */ 40 41#define BOM_UTF8 "\xef\xbb\xbf" 42#define BOM_UTF16LE "\xff\xfe" 43#define BOM_UTF16BE "\xfe\xff" 44 45/* 46 * Determine the input stream encoding by checking the BOM symbol. If no BOM is 47 * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure. 48 */ 49 50static int 51yaml_parser_determine_encoding(yaml_parser_t *parser) 52{ 53 /* Ensure that we had enough bytes in the raw buffer. */ 54 55 while (!parser->eof 56 && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) { 57 if (!yaml_parser_update_raw_buffer(parser)) { 58 return 0; 59 } 60 } 61 62 /* Determine the encoding. */ 63 64 if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 65 && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) { 66 parser->encoding = YAML_UTF16LE_ENCODING; 67 parser->raw_buffer.pointer += 2; 68 parser->offset += 2; 69 } 70 else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 71 && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) { 72 parser->encoding = YAML_UTF16BE_ENCODING; 73 parser->raw_buffer.pointer += 2; 74 parser->offset += 2; 75 } 76 else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3 77 && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) { 78 parser->encoding = YAML_UTF8_ENCODING; 79 parser->raw_buffer.pointer += 3; 80 parser->offset += 3; 81 } 82 else { 83 parser->encoding = YAML_UTF8_ENCODING; 84 } 85 86 return 1; 87} 88 89/* 90 * Update the raw buffer. 91 */ 92 93static int 94yaml_parser_update_raw_buffer(yaml_parser_t *parser) 95{ 96 size_t size_read = 0; 97 98 /* Return if the raw buffer is full. */ 99 100 if (parser->raw_buffer.start == parser->raw_buffer.pointer 101 && parser->raw_buffer.last == parser->raw_buffer.end) 102 return 1; 103 104 /* Return on EOF. */ 105 106 if (parser->eof) return 1; 107 108 /* Move the remaining bytes in the raw buffer to the beginning. */ 109 110 if (parser->raw_buffer.start < parser->raw_buffer.pointer 111 && parser->raw_buffer.pointer < parser->raw_buffer.last) { 112 memmove(parser->raw_buffer.start, parser->raw_buffer.pointer, 113 parser->raw_buffer.last - parser->raw_buffer.pointer); 114 } 115 parser->raw_buffer.last -= 116 parser->raw_buffer.pointer - parser->raw_buffer.start; 117 parser->raw_buffer.pointer = parser->raw_buffer.start; 118 119 /* Call the read handler to fill the buffer. */ 120 121 if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last, 122 parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) { 123 return yaml_parser_set_reader_error(parser, "input error", 124 parser->offset, -1); 125 } 126 parser->raw_buffer.last += size_read; 127 if (!size_read) { 128 parser->eof = 1; 129 } 130 131 return 1; 132} 133 134/* 135 * Ensure that the buffer contains at least `length` characters. 136 * Return 1 on success, 0 on failure. 137 * 138 * The length is supposed to be significantly less that the buffer size. 139 */ 140 141YAML_DECLARE(int) 142yaml_parser_update_buffer(yaml_parser_t *parser, size_t length) 143{ 144 int first = 1; 145 146 assert(parser->read_handler); /* Read handler must be set. */ 147 148 /* If the EOF flag is set and the raw buffer is empty, do nothing. */ 149 150 if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last) 151 return 1; 152 153 /* Return if the buffer contains enough characters. */ 154 155 if (parser->unread >= length) 156 return 1; 157 158 /* Determine the input encoding if it is not known yet. */ 159 160 if (!parser->encoding) { 161 if (!yaml_parser_determine_encoding(parser)) 162 return 0; 163 } 164 165 /* Move the unread characters to the beginning of the buffer. */ 166 167 if (parser->buffer.start < parser->buffer.pointer 168 && parser->buffer.pointer < parser->buffer.last) { 169 size_t size = parser->buffer.last - parser->buffer.pointer; 170 memmove(parser->buffer.start, parser->buffer.pointer, size); 171 parser->buffer.pointer = parser->buffer.start; 172 parser->buffer.last = parser->buffer.start + size; 173 } 174 else if (parser->buffer.pointer == parser->buffer.last) { 175 parser->buffer.pointer = parser->buffer.start; 176 parser->buffer.last = parser->buffer.start; 177 } 178 179 /* Fill the buffer until it has enough characters. */ 180 181 while (parser->unread < length) 182 { 183 /* Fill the raw buffer if necessary. */ 184 185 if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) { 186 if (!yaml_parser_update_raw_buffer(parser)) return 0; 187 } 188 first = 0; 189 190 /* Decode the raw buffer. */ 191 192 while (parser->raw_buffer.pointer != parser->raw_buffer.last) 193 { 194 unsigned int value = 0, value2 = 0; 195 int incomplete = 0; 196 unsigned char octet; 197 unsigned int width = 0; 198 int low, high; 199 size_t k; 200 size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer; 201 202 /* Decode the next character. */ 203 204 switch (parser->encoding) 205 { 206 case YAML_UTF8_ENCODING: 207 208 /* 209 * Decode a UTF-8 character. Check RFC 3629 210 * (http://www.ietf.org/rfc/rfc3629.txt) for more details. 211 * 212 * The following table (taken from the RFC) is used for 213 * decoding. 214 * 215 * Char. number range | UTF-8 octet sequence 216 * (hexadecimal) | (binary) 217 * --------------------+------------------------------------ 218 * 0000 0000-0000 007F | 0xxxxxxx 219 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 220 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 221 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 222 * 223 * Additionally, the characters in the range 0xD800-0xDFFF 224 * are prohibited as they are reserved for use with UTF-16 225 * surrogate pairs. 226 */ 227 228 /* Determine the length of the UTF-8 sequence. */ 229 230 octet = parser->raw_buffer.pointer[0]; 231 width = (octet & 0x80) == 0x00 ? 1 : 232 (octet & 0xE0) == 0xC0 ? 2 : 233 (octet & 0xF0) == 0xE0 ? 3 : 234 (octet & 0xF8) == 0xF0 ? 4 : 0; 235 236 /* Check if the leading octet is valid. */ 237 238 if (!width) 239 return yaml_parser_set_reader_error(parser, 240 "invalid leading UTF-8 octet", 241 parser->offset, octet); 242 243 /* Check if the raw buffer contains an incomplete character. */ 244 245 if (width > raw_unread) { 246 if (parser->eof) { 247 return yaml_parser_set_reader_error(parser, 248 "incomplete UTF-8 octet sequence", 249 parser->offset, -1); 250 } 251 incomplete = 1; 252 break; 253 } 254 255 /* Decode the leading octet. */ 256 257 value = (octet & 0x80) == 0x00 ? octet & 0x7F : 258 (octet & 0xE0) == 0xC0 ? octet & 0x1F : 259 (octet & 0xF0) == 0xE0 ? octet & 0x0F : 260 (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0; 261 262 /* Check and decode the trailing octets. */ 263 264 for (k = 1; k < width; k ++) 265 { 266 octet = parser->raw_buffer.pointer[k]; 267 268 /* Check if the octet is valid. */ 269 270 if ((octet & 0xC0) != 0x80) 271 return yaml_parser_set_reader_error(parser, 272 "invalid trailing UTF-8 octet", 273 parser->offset+k, octet); 274 275 /* Decode the octet. */ 276 277 value = (value << 6) + (octet & 0x3F); 278 } 279 280 /* Check the length of the sequence against the value. */ 281 282 if (!((width == 1) || 283 (width == 2 && value >= 0x80) || 284 (width == 3 && value >= 0x800) || 285 (width == 4 && value >= 0x10000))) 286 return yaml_parser_set_reader_error(parser, 287 "invalid length of a UTF-8 sequence", 288 parser->offset, -1); 289 290 /* Check the range of the value. */ 291 292 if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) 293 return yaml_parser_set_reader_error(parser, 294 "invalid Unicode character", 295 parser->offset, value); 296 297 break; 298 299 case YAML_UTF16LE_ENCODING: 300 case YAML_UTF16BE_ENCODING: 301 302 low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1); 303 high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0); 304 305 /* 306 * The UTF-16 encoding is not as simple as one might 307 * naively think. Check RFC 2781 308 * (http://www.ietf.org/rfc/rfc2781.txt). 309 * 310 * Normally, two subsequent bytes describe a Unicode 311 * character. However a special technique (called a 312 * surrogate pair) is used for specifying character 313 * values larger than 0xFFFF. 314 * 315 * A surrogate pair consists of two pseudo-characters: 316 * high surrogate area (0xD800-0xDBFF) 317 * low surrogate area (0xDC00-0xDFFF) 318 * 319 * The following formulas are used for decoding 320 * and encoding characters using surrogate pairs: 321 * 322 * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF) 323 * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF) 324 * W1 = 110110yyyyyyyyyy 325 * W2 = 110111xxxxxxxxxx 326 * 327 * where U is the character value, W1 is the high surrogate 328 * area, W2 is the low surrogate area. 329 */ 330 331 /* Check for incomplete UTF-16 character. */ 332 333 if (raw_unread < 2) { 334 if (parser->eof) { 335 return yaml_parser_set_reader_error(parser, 336 "incomplete UTF-16 character", 337 parser->offset, -1); 338 } 339 incomplete = 1; 340 break; 341 } 342 343 /* Get the character. */ 344 345 value = parser->raw_buffer.pointer[low] 346 + (parser->raw_buffer.pointer[high] << 8); 347 348 /* Check for unexpected low surrogate area. */ 349 350 if ((value & 0xFC00) == 0xDC00) 351 return yaml_parser_set_reader_error(parser, 352 "unexpected low surrogate area", 353 parser->offset, value); 354 355 /* Check for a high surrogate area. */ 356 357 if ((value & 0xFC00) == 0xD800) { 358 359 width = 4; 360 361 /* Check for incomplete surrogate pair. */ 362 363 if (raw_unread < 4) { 364 if (parser->eof) { 365 return yaml_parser_set_reader_error(parser, 366 "incomplete UTF-16 surrogate pair", 367 parser->offset, -1); 368 } 369 incomplete = 1; 370 break; 371 } 372 373 /* Get the next character. */ 374 375 value2 = parser->raw_buffer.pointer[low+2] 376 + (parser->raw_buffer.pointer[high+2] << 8); 377 378 /* Check for a low surrogate area. */ 379 380 if ((value2 & 0xFC00) != 0xDC00) 381 return yaml_parser_set_reader_error(parser, 382 "expected low surrogate area", 383 parser->offset+2, value2); 384 385 /* Generate the value of the surrogate pair. */ 386 387 value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); 388 } 389 390 else { 391 width = 2; 392 } 393 394 break; 395 396 default: 397 assert(1); /* Impossible. */ 398 } 399 400 /* Check if the raw buffer contains enough bytes to form a character. */ 401 402 if (incomplete) break; 403 404 /* 405 * Check if the character is in the allowed range: 406 * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) 407 * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) 408 * | [#x10000-#x10FFFF] (32 bit) 409 */ 410 411 if (! (value == 0x09 || value == 0x0A || value == 0x0D 412 || (value >= 0x20 && value <= 0x7E) 413 || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) 414 || (value >= 0xE000 && value <= 0xFFFD) 415 || (value >= 0x10000 && value <= 0x10FFFF))) 416 return yaml_parser_set_reader_error(parser, 417 "control characters are not allowed", 418 parser->offset, value); 419 420 /* Move the raw pointers. */ 421 422 parser->raw_buffer.pointer += width; 423 parser->offset += width; 424 425 /* Finally put the character into the buffer. */ 426 427 /* 0000 0000-0000 007F -> 0xxxxxxx */ 428 if (value <= 0x7F) { 429 *(parser->buffer.last++) = value; 430 } 431 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ 432 else if (value <= 0x7FF) { 433 *(parser->buffer.last++) = 0xC0 + (value >> 6); 434 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 435 } 436 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ 437 else if (value <= 0xFFFF) { 438 *(parser->buffer.last++) = 0xE0 + (value >> 12); 439 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); 440 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 441 } 442 /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 443 else { 444 *(parser->buffer.last++) = 0xF0 + (value >> 18); 445 *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F); 446 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); 447 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 448 } 449 450 parser->unread ++; 451 } 452 453 /* On EOF, put NUL into the buffer and return. */ 454 455 if (parser->eof) { 456 *(parser->buffer.last++) = '\0'; 457 parser->unread ++; 458 return 1; 459 } 460 461 } 462 463 if (parser->offset >= PTRDIFF_MAX) 464 return yaml_parser_set_reader_error(parser, "input is too long", 465 PTRDIFF_MAX, -1); 466 467 return 1; 468} 469 470