1247738Sbapt
2247738Sbapt#include "yaml_private.h"
3247738Sbapt
4247738Sbapt/*
5247738Sbapt * Declarations.
6247738Sbapt */
7247738Sbapt
8247738Sbaptstatic int
9247738Sbaptyaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
10247738Sbapt        size_t offset, int value);
11247738Sbapt
12247738Sbaptstatic int
13247738Sbaptyaml_parser_update_raw_buffer(yaml_parser_t *parser);
14247738Sbapt
15247738Sbaptstatic int
16247738Sbaptyaml_parser_determine_encoding(yaml_parser_t *parser);
17247738Sbapt
18247738SbaptYAML_DECLARE(int)
19247738Sbaptyaml_parser_update_buffer(yaml_parser_t *parser, size_t length);
20247738Sbapt
21247738Sbapt/*
22247738Sbapt * Set the reader error and return 0.
23247738Sbapt */
24247738Sbapt
25247738Sbaptstatic int
26247738Sbaptyaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
27247738Sbapt        size_t offset, int value)
28247738Sbapt{
29247738Sbapt    parser->error = YAML_READER_ERROR;
30247738Sbapt    parser->problem = problem;
31247738Sbapt    parser->problem_offset = offset;
32247738Sbapt    parser->problem_value = value;
33247738Sbapt
34247738Sbapt    return 0;
35247738Sbapt}
36247738Sbapt
37247738Sbapt/*
38247738Sbapt * Byte order marks.
39247738Sbapt */
40247738Sbapt
41247738Sbapt#define BOM_UTF8    "\xef\xbb\xbf"
42247738Sbapt#define BOM_UTF16LE "\xff\xfe"
43247738Sbapt#define BOM_UTF16BE "\xfe\xff"
44247738Sbapt
45247738Sbapt/*
46247738Sbapt * Determine the input stream encoding by checking the BOM symbol. If no BOM is
47247738Sbapt * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
48247738Sbapt */
49247738Sbapt
50247738Sbaptstatic int
51247738Sbaptyaml_parser_determine_encoding(yaml_parser_t *parser)
52247738Sbapt{
53247738Sbapt    /* Ensure that we had enough bytes in the raw buffer. */
54247738Sbapt
55247738Sbapt    while (!parser->eof
56247738Sbapt            && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) {
57247738Sbapt        if (!yaml_parser_update_raw_buffer(parser)) {
58247738Sbapt            return 0;
59247738Sbapt        }
60247738Sbapt    }
61247738Sbapt
62247738Sbapt    /* Determine the encoding. */
63247738Sbapt
64247738Sbapt    if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
65247738Sbapt            && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) {
66247738Sbapt        parser->encoding = YAML_UTF16LE_ENCODING;
67247738Sbapt        parser->raw_buffer.pointer += 2;
68247738Sbapt        parser->offset += 2;
69247738Sbapt    }
70247738Sbapt    else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
71247738Sbapt            && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) {
72247738Sbapt        parser->encoding = YAML_UTF16BE_ENCODING;
73247738Sbapt        parser->raw_buffer.pointer += 2;
74247738Sbapt        parser->offset += 2;
75247738Sbapt    }
76247738Sbapt    else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3
77247738Sbapt            && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) {
78247738Sbapt        parser->encoding = YAML_UTF8_ENCODING;
79247738Sbapt        parser->raw_buffer.pointer += 3;
80247738Sbapt        parser->offset += 3;
81247738Sbapt    }
82247738Sbapt    else {
83247738Sbapt        parser->encoding = YAML_UTF8_ENCODING;
84247738Sbapt    }
85247738Sbapt
86247738Sbapt    return 1;
87247738Sbapt}
88247738Sbapt
89247738Sbapt/*
90247738Sbapt * Update the raw buffer.
91247738Sbapt */
92247738Sbapt
93247738Sbaptstatic int
94247738Sbaptyaml_parser_update_raw_buffer(yaml_parser_t *parser)
95247738Sbapt{
96247738Sbapt    size_t size_read = 0;
97247738Sbapt
98247738Sbapt    /* Return if the raw buffer is full. */
99247738Sbapt
100247738Sbapt    if (parser->raw_buffer.start == parser->raw_buffer.pointer
101247738Sbapt            && parser->raw_buffer.last == parser->raw_buffer.end)
102247738Sbapt        return 1;
103247738Sbapt
104247738Sbapt    /* Return on EOF. */
105247738Sbapt
106247738Sbapt    if (parser->eof) return 1;
107247738Sbapt
108247738Sbapt    /* Move the remaining bytes in the raw buffer to the beginning. */
109247738Sbapt
110247738Sbapt    if (parser->raw_buffer.start < parser->raw_buffer.pointer
111247738Sbapt            && parser->raw_buffer.pointer < parser->raw_buffer.last) {
112247738Sbapt        memmove(parser->raw_buffer.start, parser->raw_buffer.pointer,
113247738Sbapt                parser->raw_buffer.last - parser->raw_buffer.pointer);
114247738Sbapt    }
115247738Sbapt    parser->raw_buffer.last -=
116247738Sbapt        parser->raw_buffer.pointer - parser->raw_buffer.start;
117247738Sbapt    parser->raw_buffer.pointer = parser->raw_buffer.start;
118247738Sbapt
119247738Sbapt    /* Call the read handler to fill the buffer. */
120247738Sbapt
121247738Sbapt    if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last,
122247738Sbapt                parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) {
123247738Sbapt        return yaml_parser_set_reader_error(parser, "input error",
124247738Sbapt                parser->offset, -1);
125247738Sbapt    }
126247738Sbapt    parser->raw_buffer.last += size_read;
127247738Sbapt    if (!size_read) {
128247738Sbapt        parser->eof = 1;
129247738Sbapt    }
130247738Sbapt
131247738Sbapt    return 1;
132247738Sbapt}
133247738Sbapt
134247738Sbapt/*
135247738Sbapt * Ensure that the buffer contains at least `length` characters.
136247738Sbapt * Return 1 on success, 0 on failure.
137247738Sbapt *
138247738Sbapt * The length is supposed to be significantly less that the buffer size.
139247738Sbapt */
140247738Sbapt
141247738SbaptYAML_DECLARE(int)
142247738Sbaptyaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
143247738Sbapt{
144247738Sbapt    int first = 1;
145247738Sbapt
146247738Sbapt    assert(parser->read_handler);   /* Read handler must be set. */
147247738Sbapt
148247738Sbapt    /* If the EOF flag is set and the raw buffer is empty, do nothing. */
149247738Sbapt
150247738Sbapt    if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last)
151247738Sbapt        return 1;
152247738Sbapt
153247738Sbapt    /* Return if the buffer contains enough characters. */
154247738Sbapt
155247738Sbapt    if (parser->unread >= length)
156247738Sbapt        return 1;
157247738Sbapt
158247738Sbapt    /* Determine the input encoding if it is not known yet. */
159247738Sbapt
160247738Sbapt    if (!parser->encoding) {
161247738Sbapt        if (!yaml_parser_determine_encoding(parser))
162247738Sbapt            return 0;
163247738Sbapt    }
164247738Sbapt
165247738Sbapt    /* Move the unread characters to the beginning of the buffer. */
166247738Sbapt
167247738Sbapt    if (parser->buffer.start < parser->buffer.pointer
168247738Sbapt            && parser->buffer.pointer < parser->buffer.last) {
169247738Sbapt        size_t size = parser->buffer.last - parser->buffer.pointer;
170247738Sbapt        memmove(parser->buffer.start, parser->buffer.pointer, size);
171247738Sbapt        parser->buffer.pointer = parser->buffer.start;
172247738Sbapt        parser->buffer.last = parser->buffer.start + size;
173247738Sbapt    }
174247738Sbapt    else if (parser->buffer.pointer == parser->buffer.last) {
175247738Sbapt        parser->buffer.pointer = parser->buffer.start;
176247738Sbapt        parser->buffer.last = parser->buffer.start;
177247738Sbapt    }
178247738Sbapt
179247738Sbapt    /* Fill the buffer until it has enough characters. */
180247738Sbapt
181247738Sbapt    while (parser->unread < length)
182247738Sbapt    {
183247738Sbapt        /* Fill the raw buffer if necessary. */
184247738Sbapt
185247738Sbapt        if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) {
186247738Sbapt            if (!yaml_parser_update_raw_buffer(parser)) return 0;
187247738Sbapt        }
188247738Sbapt        first = 0;
189247738Sbapt
190247738Sbapt        /* Decode the raw buffer. */
191247738Sbapt
192247738Sbapt        while (parser->raw_buffer.pointer != parser->raw_buffer.last)
193247738Sbapt        {
194247738Sbapt            unsigned int value = 0, value2 = 0;
195247738Sbapt            int incomplete = 0;
196247738Sbapt            unsigned char octet;
197247738Sbapt            unsigned int width = 0;
198247738Sbapt            int low, high;
199247738Sbapt            size_t k;
200247738Sbapt            size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer;
201247738Sbapt
202247738Sbapt            /* Decode the next character. */
203247738Sbapt
204247738Sbapt            switch (parser->encoding)
205247738Sbapt            {
206247738Sbapt                case YAML_UTF8_ENCODING:
207247738Sbapt
208247738Sbapt                    /*
209247738Sbapt                     * Decode a UTF-8 character.  Check RFC 3629
210247738Sbapt                     * (http://www.ietf.org/rfc/rfc3629.txt) for more details.
211247738Sbapt                     *
212247738Sbapt                     * The following table (taken from the RFC) is used for
213247738Sbapt                     * decoding.
214247738Sbapt                     *
215247738Sbapt                     *    Char. number range |        UTF-8 octet sequence
216247738Sbapt                     *      (hexadecimal)    |              (binary)
217247738Sbapt                     *   --------------------+------------------------------------
218247738Sbapt                     *   0000 0000-0000 007F | 0xxxxxxx
219247738Sbapt                     *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
220247738Sbapt                     *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
221247738Sbapt                     *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
222247738Sbapt                     *
223247738Sbapt                     * Additionally, the characters in the range 0xD800-0xDFFF
224247738Sbapt                     * are prohibited as they are reserved for use with UTF-16
225247738Sbapt                     * surrogate pairs.
226247738Sbapt                     */
227247738Sbapt
228247738Sbapt                    /* Determine the length of the UTF-8 sequence. */
229247738Sbapt
230247738Sbapt                    octet = parser->raw_buffer.pointer[0];
231247738Sbapt                    width = (octet & 0x80) == 0x00 ? 1 :
232247738Sbapt                            (octet & 0xE0) == 0xC0 ? 2 :
233247738Sbapt                            (octet & 0xF0) == 0xE0 ? 3 :
234247738Sbapt                            (octet & 0xF8) == 0xF0 ? 4 : 0;
235247738Sbapt
236247738Sbapt                    /* Check if the leading octet is valid. */
237247738Sbapt
238247738Sbapt                    if (!width)
239247738Sbapt                        return yaml_parser_set_reader_error(parser,
240247738Sbapt                                "invalid leading UTF-8 octet",
241247738Sbapt                                parser->offset, octet);
242247738Sbapt
243247738Sbapt                    /* Check if the raw buffer contains an incomplete character. */
244247738Sbapt
245247738Sbapt                    if (width > raw_unread) {
246247738Sbapt                        if (parser->eof) {
247247738Sbapt                            return yaml_parser_set_reader_error(parser,
248247738Sbapt                                    "incomplete UTF-8 octet sequence",
249247738Sbapt                                    parser->offset, -1);
250247738Sbapt                        }
251247738Sbapt                        incomplete = 1;
252247738Sbapt                        break;
253247738Sbapt                    }
254247738Sbapt
255247738Sbapt                    /* Decode the leading octet. */
256247738Sbapt
257247738Sbapt                    value = (octet & 0x80) == 0x00 ? octet & 0x7F :
258247738Sbapt                            (octet & 0xE0) == 0xC0 ? octet & 0x1F :
259247738Sbapt                            (octet & 0xF0) == 0xE0 ? octet & 0x0F :
260247738Sbapt                            (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
261247738Sbapt
262247738Sbapt                    /* Check and decode the trailing octets. */
263247738Sbapt
264247738Sbapt                    for (k = 1; k < width; k ++)
265247738Sbapt                    {
266247738Sbapt                        octet = parser->raw_buffer.pointer[k];
267247738Sbapt
268247738Sbapt                        /* Check if the octet is valid. */
269247738Sbapt
270247738Sbapt                        if ((octet & 0xC0) != 0x80)
271247738Sbapt                            return yaml_parser_set_reader_error(parser,
272247738Sbapt                                    "invalid trailing UTF-8 octet",
273247738Sbapt                                    parser->offset+k, octet);
274247738Sbapt
275247738Sbapt                        /* Decode the octet. */
276247738Sbapt
277247738Sbapt                        value = (value << 6) + (octet & 0x3F);
278247738Sbapt                    }
279247738Sbapt
280247738Sbapt                    /* Check the length of the sequence against the value. */
281247738Sbapt
282247738Sbapt                    if (!((width == 1) ||
283247738Sbapt                            (width == 2 && value >= 0x80) ||
284247738Sbapt                            (width == 3 && value >= 0x800) ||
285247738Sbapt                            (width == 4 && value >= 0x10000)))
286247738Sbapt                        return yaml_parser_set_reader_error(parser,
287247738Sbapt                                "invalid length of a UTF-8 sequence",
288247738Sbapt                                parser->offset, -1);
289247738Sbapt
290247738Sbapt                    /* Check the range of the value. */
291247738Sbapt
292247738Sbapt                    if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF)
293247738Sbapt                        return yaml_parser_set_reader_error(parser,
294247738Sbapt                                "invalid Unicode character",
295247738Sbapt                                parser->offset, value);
296247738Sbapt
297247738Sbapt                    break;
298247738Sbapt
299247738Sbapt                case YAML_UTF16LE_ENCODING:
300247738Sbapt                case YAML_UTF16BE_ENCODING:
301247738Sbapt
302247738Sbapt                    low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1);
303247738Sbapt                    high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0);
304247738Sbapt
305247738Sbapt                    /*
306247738Sbapt                     * The UTF-16 encoding is not as simple as one might
307247738Sbapt                     * naively think.  Check RFC 2781
308247738Sbapt                     * (http://www.ietf.org/rfc/rfc2781.txt).
309247738Sbapt                     *
310247738Sbapt                     * Normally, two subsequent bytes describe a Unicode
311247738Sbapt                     * character.  However a special technique (called a
312247738Sbapt                     * surrogate pair) is used for specifying character
313247738Sbapt                     * values larger than 0xFFFF.
314247738Sbapt                     *
315247738Sbapt                     * A surrogate pair consists of two pseudo-characters:
316247738Sbapt                     *      high surrogate area (0xD800-0xDBFF)
317247738Sbapt                     *      low surrogate area (0xDC00-0xDFFF)
318247738Sbapt                     *
319247738Sbapt                     * The following formulas are used for decoding
320247738Sbapt                     * and encoding characters using surrogate pairs:
321247738Sbapt                     *
322247738Sbapt                     *  U  = U' + 0x10000   (0x01 00 00 <= U <= 0x10 FF FF)
323247738Sbapt                     *  U' = yyyyyyyyyyxxxxxxxxxx   (0 <= U' <= 0x0F FF FF)
324247738Sbapt                     *  W1 = 110110yyyyyyyyyy
325247738Sbapt                     *  W2 = 110111xxxxxxxxxx
326247738Sbapt                     *
327247738Sbapt                     * where U is the character value, W1 is the high surrogate
328247738Sbapt                     * area, W2 is the low surrogate area.
329247738Sbapt                     */
330247738Sbapt
331247738Sbapt                    /* Check for incomplete UTF-16 character. */
332247738Sbapt
333247738Sbapt                    if (raw_unread < 2) {
334247738Sbapt                        if (parser->eof) {
335247738Sbapt                            return yaml_parser_set_reader_error(parser,
336247738Sbapt                                    "incomplete UTF-16 character",
337247738Sbapt                                    parser->offset, -1);
338247738Sbapt                        }
339247738Sbapt                        incomplete = 1;
340247738Sbapt                        break;
341247738Sbapt                    }
342247738Sbapt
343247738Sbapt                    /* Get the character. */
344247738Sbapt
345247738Sbapt                    value = parser->raw_buffer.pointer[low]
346247738Sbapt                        + (parser->raw_buffer.pointer[high] << 8);
347247738Sbapt
348247738Sbapt                    /* Check for unexpected low surrogate area. */
349247738Sbapt
350247738Sbapt                    if ((value & 0xFC00) == 0xDC00)
351247738Sbapt                        return yaml_parser_set_reader_error(parser,
352247738Sbapt                                "unexpected low surrogate area",
353247738Sbapt                                parser->offset, value);
354247738Sbapt
355247738Sbapt                    /* Check for a high surrogate area. */
356247738Sbapt
357247738Sbapt                    if ((value & 0xFC00) == 0xD800) {
358247738Sbapt
359247738Sbapt                        width = 4;
360247738Sbapt
361247738Sbapt                        /* Check for incomplete surrogate pair. */
362247738Sbapt
363247738Sbapt                        if (raw_unread < 4) {
364247738Sbapt                            if (parser->eof) {
365247738Sbapt                                return yaml_parser_set_reader_error(parser,
366247738Sbapt                                        "incomplete UTF-16 surrogate pair",
367247738Sbapt                                        parser->offset, -1);
368247738Sbapt                            }
369247738Sbapt                            incomplete = 1;
370247738Sbapt                            break;
371247738Sbapt                        }
372247738Sbapt
373247738Sbapt                        /* Get the next character. */
374247738Sbapt
375247738Sbapt                        value2 = parser->raw_buffer.pointer[low+2]
376247738Sbapt                            + (parser->raw_buffer.pointer[high+2] << 8);
377247738Sbapt
378247738Sbapt                        /* Check for a low surrogate area. */
379247738Sbapt
380247738Sbapt                        if ((value2 & 0xFC00) != 0xDC00)
381247738Sbapt                            return yaml_parser_set_reader_error(parser,
382247738Sbapt                                    "expected low surrogate area",
383247738Sbapt                                    parser->offset+2, value2);
384247738Sbapt
385247738Sbapt                        /* Generate the value of the surrogate pair. */
386247738Sbapt
387247738Sbapt                        value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
388247738Sbapt                    }
389247738Sbapt
390247738Sbapt                    else {
391247738Sbapt                        width = 2;
392247738Sbapt                    }
393247738Sbapt
394247738Sbapt                    break;
395247738Sbapt
396247738Sbapt                default:
397247738Sbapt                    assert(1);      /* Impossible. */
398247738Sbapt            }
399247738Sbapt
400247738Sbapt            /* Check if the raw buffer contains enough bytes to form a character. */
401247738Sbapt
402247738Sbapt            if (incomplete) break;
403247738Sbapt
404247738Sbapt            /*
405247738Sbapt             * Check if the character is in the allowed range:
406247738Sbapt             *      #x9 | #xA | #xD | [#x20-#x7E]               (8 bit)
407247738Sbapt             *      | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD]    (16 bit)
408247738Sbapt             *      | [#x10000-#x10FFFF]                        (32 bit)
409247738Sbapt             */
410247738Sbapt
411247738Sbapt            if (! (value == 0x09 || value == 0x0A || value == 0x0D
412247738Sbapt                        || (value >= 0x20 && value <= 0x7E)
413247738Sbapt                        || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF)
414247738Sbapt                        || (value >= 0xE000 && value <= 0xFFFD)
415247738Sbapt                        || (value >= 0x10000 && value <= 0x10FFFF)))
416247738Sbapt                return yaml_parser_set_reader_error(parser,
417247738Sbapt                        "control characters are not allowed",
418247738Sbapt                        parser->offset, value);
419247738Sbapt
420247738Sbapt            /* Move the raw pointers. */
421247738Sbapt
422247738Sbapt            parser->raw_buffer.pointer += width;
423247738Sbapt            parser->offset += width;
424247738Sbapt
425247738Sbapt            /* Finally put the character into the buffer. */
426247738Sbapt
427247738Sbapt            /* 0000 0000-0000 007F -> 0xxxxxxx */
428247738Sbapt            if (value <= 0x7F) {
429247738Sbapt                *(parser->buffer.last++) = value;
430247738Sbapt            }
431247738Sbapt            /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
432247738Sbapt            else if (value <= 0x7FF) {
433247738Sbapt                *(parser->buffer.last++) = 0xC0 + (value >> 6);
434247738Sbapt                *(parser->buffer.last++) = 0x80 + (value & 0x3F);
435247738Sbapt            }
436247738Sbapt            /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
437247738Sbapt            else if (value <= 0xFFFF) {
438247738Sbapt                *(parser->buffer.last++) = 0xE0 + (value >> 12);
439247738Sbapt                *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
440247738Sbapt                *(parser->buffer.last++) = 0x80 + (value & 0x3F);
441247738Sbapt            }
442247738Sbapt            /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
443247738Sbapt            else {
444247738Sbapt                *(parser->buffer.last++) = 0xF0 + (value >> 18);
445247738Sbapt                *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);
446247738Sbapt                *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
447247738Sbapt                *(parser->buffer.last++) = 0x80 + (value & 0x3F);
448247738Sbapt            }
449247738Sbapt
450247738Sbapt            parser->unread ++;
451247738Sbapt        }
452247738Sbapt
453247738Sbapt        /* On EOF, put NUL into the buffer and return. */
454247738Sbapt
455247738Sbapt        if (parser->eof) {
456247738Sbapt            *(parser->buffer.last++) = '\0';
457247738Sbapt            parser->unread ++;
458247738Sbapt            return 1;
459247738Sbapt        }
460247738Sbapt
461247738Sbapt    }
462247738Sbapt
463247738Sbapt    return 1;
464247738Sbapt}
465247738Sbapt
466