1/*
2                            __  __            _
3                         ___\ \/ /_ __   __ _| |_
4                        / _ \\  /| '_ \ / _` | __|
5                       |  __//  \| |_) | (_| | |_
6                        \___/_/\_\ .__/ \__,_|\__|
7                                 |_| XML parser
8
9   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14   Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15   Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
16   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17   Copyright (c) 2016      Don Lewis <truckman@apache.org>
18   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23   Copyright (c) 2021      Donghee Na <donghee.na@python.org>
24   Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
25   Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
   Copyright (c) 2023      Hanno Böck <hanno@gentoo.org>
27   Licensed under the MIT license:
28
29   Permission is  hereby granted,  free of charge,  to any  person obtaining
30   a  copy  of  this  software   and  associated  documentation  files  (the
31   "Software"),  to  deal in  the  Software  without restriction,  including
32   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
33   distribute, sublicense, and/or sell copies of the Software, and to permit
34   persons  to whom  the Software  is  furnished to  do so,  subject to  the
35   following conditions:
36
37   The above copyright  notice and this permission notice  shall be included
38   in all copies or substantial portions of the Software.
39
40   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
41   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
42   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
43   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
44   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
45   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
46   USE OR OTHER DEALINGS IN THE SOFTWARE.
47*/
48
49#include "expat_config.h"
50
51#include <stddef.h>
52#include <string.h> /* memcpy */
53#include <stdbool.h>
54
55#ifdef _WIN32
56#  include "winconfig.h"
57#endif
58
59#include "expat_external.h"
60#include "internal.h"
61#include "xmltok.h"
62#include "nametab.h"
63
/* Extra entry for the ignore-section tokenizer.  It is spliced into the end
   of the first brace group of VTABLE1 when XML_DTD is defined and expands to
   nothing otherwise. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment used at the start of each ENCODING initializer in this
   file: a brace group of prolog/content/CDATA(/ignore-section) tokenizers, a
   brace group of attribute-value/entity-value tokenizers, then the remaining
   per-encoding helper functions.  PREFIX is expanded at the point of use, so
   the functions named depend on the PREFIX definition in effect there. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX-named UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
79
/* Look up the naming bit of the UCS-2 character whose high byte is `hi` and
   whose low byte is `lo`.  `pages` maps the high byte to a page number; a
   page covers eight consecutive words of namingBitmap, the top three bits of
   `lo` pick the word and the bottom five bits pick the bit inside it.
   Evaluates to non-zero when the bit is set.
   All arguments are parenthesized so that expression arguments (for example
   `table + n`) expand as intended. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   `byte` must point at unsigned bytes; it is evaluated more than once.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.  `byte` must point at unsigned bytes; it is evaluated more than
   once.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
103
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to unsigned bytes positioned at the lead byte
   and evaluates to non-zero when the sequence is invalid.  The argument is
   evaluated several times and is fully parenthesized, so pointer
   expressions such as `buf + 1` are safe to pass.
*/

/* 2-byte sequence: lead must be >= 0xC2 (no overlong forms) and the second
   byte must be a continuation byte (0x80..0xBF). */
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3-byte sequence: rejects overlong forms (E0 followed by < A0), UTF-16
   surrogates (ED followed by > 9F) and the code points U+FFFE / U+FFFF. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* 4-byte sequence: rejects overlong forms (F0 followed by < 90) and values
   above U+10FFFF (F4 followed by > 8F). */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
133
134static int PTRFASTCALL
135isNever(const ENCODING *enc, const char *p) {
136  UNUSED_P(enc);
137  UNUSED_P(p);
138  return 0;
139}
140
141static int PTRFASTCALL
142utf8_isName2(const ENCODING *enc, const char *p) {
143  UNUSED_P(enc);
144  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
145}
146
147static int PTRFASTCALL
148utf8_isName3(const ENCODING *enc, const char *p) {
149  UNUSED_P(enc);
150  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
151}
152
153#define utf8_isName4 isNever
154
155static int PTRFASTCALL
156utf8_isNmstrt2(const ENCODING *enc, const char *p) {
157  UNUSED_P(enc);
158  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
159}
160
161static int PTRFASTCALL
162utf8_isNmstrt3(const ENCODING *enc, const char *p) {
163  UNUSED_P(enc);
164  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
165}
166
167#define utf8_isNmstrt4 isNever
168
169static int PTRFASTCALL
170utf8_isInvalid2(const ENCODING *enc, const char *p) {
171  UNUSED_P(enc);
172  return UTF8_INVALID2((const unsigned char *)p);
173}
174
175static int PTRFASTCALL
176utf8_isInvalid3(const ENCODING *enc, const char *p) {
177  UNUSED_P(enc);
178  return UTF8_INVALID3((const unsigned char *)p);
179}
180
181static int PTRFASTCALL
182utf8_isInvalid4(const ENCODING *enc, const char *p) {
183  UNUSED_P(enc);
184  return UTF8_INVALID4((const unsigned char *)p);
185}
186
/* An ENCODING extended with a per-byte classification table and a set of
   character-predicate hooks.  `enc` is the first member, which is what lets
   AS_NORMAL_ENCODING cast an ENCODING pointer to this type.  Every instance
   in this file is initialized positionally, so the member order must not
   change. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type (a BT_* value) for each possible byte value; read through
     SB_BYTE_TYPE. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Accessors that are called through pointers in XML_MIN_SIZE builds and
     replaced by macros otherwise. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name-character predicates for 2-, 3- and 4-byte sequences. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  /* Name-start-character predicates for 2-, 3- and 4-byte sequences. */
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  /* Invalid-sequence predicates for 2-, 3- and 4-byte sequences.  These are
     NULL in encodings initialized with NULL_VTABLE. */
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
207
/* View an ENCODING pointer as the struct normal_encoding that contains it. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

#ifdef XML_MIN_SIZE

/* Initializers for the five XML_MIN_SIZE-only members of struct
   normal_encoding, named by prefix E.  Note the trailing comma: the macro is
   written directly in front of NORMAL_VTABLE / NULL_VTABLE. */
#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte predicate members, named by prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine members, all left as NULL. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL

/* Forward declaration; the definition is not in this part of the file. */
static int FASTCALL checkCharRefNumber(int result);
231
232#include "xmltok_impl.h"
233#include "ascii.h"
234
/* Settings for the byte-oriented ("normal") tokenizer instantiation.
   With XML_MIN_SIZE the minimum character width is read from the encoding
   and the single-byte MINBPC name predicates are aliases of isNever;
   otherwise the minimum bytes per character is the constant 1. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
#  define MINBPC(enc) 1
#endif
246
/* Byte type of the byte at p, taken from the encoding's 256-entry table. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)])

/* BYTE_TYPE is the table lookup itself in regular builds and an indirect
   call through the encoding's byteType hook in XML_MIN_SIZE builds. */
#ifndef XML_MIN_SIZE
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#else
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#endif
259
/* BYTE_TO_ASCII yields the byte at p unchanged for single-byte encodings;
   XML_MIN_SIZE builds reach it through the encoding's byteToAscii hook. */
#ifndef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (*(p))
#else
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return p[0];
}
#endif
270
/* Multi-byte character predicates: n (2, 3 or 4) selects the matching hook
   of the normal_encoding, which is then called on the sequence at p. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))

#ifndef XML_MIN_SIZE
/* Regular build: the isInvalid hook is called unconditionally and the
   MINBPC-width predicates are constant false. */
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#else
/* XML_MIN_SIZE build: a NULL isInvalid hook counts as "not invalid", and the
   MINBPC-width predicates go through the encoding's hooks. */
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#endif
291
/* CHAR_MATCHES compares the character at p with the ASCII character c.
   Regular builds compare the byte directly; XML_MIN_SIZE builds call the
   encoding's charMatches hook. */
#ifndef XML_MIN_SIZE
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#else
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return p[0] == c;
}
#endif
304
/* Instantiate the tokenizer implementation for byte-oriented encodings.
   PREFIX gives every generated identifier the normal_ prefix, and the
   accessor macros defined above (MINBPC, BYTE_TYPE, ...) are in effect
   while xmltok_impl.c is included.
   NOTE(review): XML_TOK_IMPL_C presumably acts as an include guard inside
   xmltok_impl.c -- confirm there. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Remove the accessor macros so they can be redefined for the next
   instantiation.  PREFIX itself is left defined here. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
319
/* UTF8_cvalN is the value of the masked first byte of an N byte UTF-8
   sequence, i.e. the marker bits that get OR-ed into the lead byte. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xC0, /* 110xxxxx */
  UTF8_cval3 = 0xE0, /* 1110xxxx */
  UTF8_cval4 = 0xF0  /* 11110xxx */
};
326
/* Move *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards `from`:
   - an ASCII byte ends the scan, everything up to it is kept;
   - a lead byte whose sequence is already complete ends the scan with the
     limit placed right after that sequence;
   - a lead byte whose sequence is cut short is dropped and the scan goes on;
   - any other byte (continuation bytes included) is stepped over.
   If the scan reaches `from`, the limit ends up equal to `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t trailing = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen; /* length announced by a lead byte, 0 for non-lead bytes */

    if ((last & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((last & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead byte 0b11110xxx */
    } else if ((last & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead byte 0b1110xxxx */
    } else if ((last & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead byte 0b110xxxxx */
    } else {
      seqLen = 0; /* not a lead byte */
    }

    if (seqLen != 0) {
      if (trailing + 1 >= seqLen) {
        /* The whole character is present: keep it and stop. */
        end += seqLen - 1;
        break;
      }
      /* Truncated character: forget the bytes counted so far. */
      trailing = 0;
    }

    end--;
    trailing++;
  }

  *fromLimRef = end;
}
365
366static enum XML_Convert_Result PTRCALL
367utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
368            char **toP, const char *toLim) {
369  bool input_incomplete = false;
370  bool output_exhausted = false;
371
372  /* Avoid copying partial characters (due to limited space). */
373  const ptrdiff_t bytesAvailable = fromLim - *fromP;
374  const ptrdiff_t bytesStorable = toLim - *toP;
375  UNUSED_P(enc);
376  if (bytesAvailable > bytesStorable) {
377    fromLim = *fromP + bytesStorable;
378    output_exhausted = true;
379  }
380
381  /* Avoid copying partial characters (from incomplete input). */
382  {
383    const char *const fromLimBefore = fromLim;
384    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
385    if (fromLim < fromLimBefore) {
386      input_incomplete = true;
387    }
388  }
389
390  {
391    const ptrdiff_t bytesToCopy = fromLim - *fromP;
392    memcpy(*toP, *fromP, bytesToCopy);
393    *fromP += bytesToCopy;
394    *toP += bytesToCopy;
395  }
396
397  if (output_exhausted) /* needs to go first */
398    return XML_CONVERT_OUTPUT_EXHAUSTED;
399  else if (input_incomplete)
400    return XML_CONVERT_INPUT_INCOMPLETE;
401  else
402    return XML_CONVERT_COMPLETED;
403}
404
405static enum XML_Convert_Result PTRCALL
406utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
407             unsigned short **toP, const unsigned short *toLim) {
408  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
409  unsigned short *to = *toP;
410  const char *from = *fromP;
411  while (from < fromLim && to < toLim) {
412    switch (SB_BYTE_TYPE(enc, from)) {
413    case BT_LEAD2:
414      if (fromLim - from < 2) {
415        res = XML_CONVERT_INPUT_INCOMPLETE;
416        goto after;
417      }
418      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
419      from += 2;
420      break;
421    case BT_LEAD3:
422      if (fromLim - from < 3) {
423        res = XML_CONVERT_INPUT_INCOMPLETE;
424        goto after;
425      }
426      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
427                               | (from[2] & 0x3f));
428      from += 3;
429      break;
430    case BT_LEAD4: {
431      unsigned long n;
432      if (toLim - to < 2) {
433        res = XML_CONVERT_OUTPUT_EXHAUSTED;
434        goto after;
435      }
436      if (fromLim - from < 4) {
437        res = XML_CONVERT_INPUT_INCOMPLETE;
438        goto after;
439      }
440      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
441          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
442      n -= 0x10000;
443      to[0] = (unsigned short)((n >> 10) | 0xD800);
444      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
445      to += 2;
446      from += 4;
447    } break;
448    default:
449      *to++ = *from++;
450      break;
451    }
452  }
453  if (from < fromLim)
454    res = XML_CONVERT_OUTPUT_EXHAUSTED;
455after:
456  *fromP = from;
457  *toP = to;
458  return res;
459}
460
/* UTF-8 encoding objects.  Each one combines the normal_ tokenizer functions
   (VTABLE1), the UTF-8 converters, three trailing ENCODING values (1, 1, 0),
   a byte-type table assembled from included fragments, and the utf8_
   multi-byte predicates.
   NOTE(review): the three trailing values are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against the ENCODING declaration.
   The non-_ns variants include the ASCII table with BT_COLON redefined to
   BT_NMSTRT, so entries the table labels BT_COLON become BT_NMSTRT. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

/* The internal_ variants differ only in taking the first half of the
   byte-type table from iasciitab.h instead of asciitab.h. */
#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
502
503static enum XML_Convert_Result PTRCALL
504latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
505              char **toP, const char *toLim) {
506  UNUSED_P(enc);
507  for (;;) {
508    unsigned char c;
509    if (*fromP == fromLim)
510      return XML_CONVERT_COMPLETED;
511    c = (unsigned char)**fromP;
512    if (c & 0x80) {
513      if (toLim - *toP < 2)
514        return XML_CONVERT_OUTPUT_EXHAUSTED;
515      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
516      *(*toP)++ = (char)((c & 0x3f) | 0x80);
517      (*fromP)++;
518    } else {
519      if (*toP == toLim)
520        return XML_CONVERT_OUTPUT_EXHAUSTED;
521      *(*toP)++ = *(*fromP)++;
522    }
523  }
524}
525
526static enum XML_Convert_Result PTRCALL
527latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
528               unsigned short **toP, const unsigned short *toLim) {
529  UNUSED_P(enc);
530  while (*fromP < fromLim && *toP < toLim)
531    *(*toP)++ = (unsigned char)*(*fromP)++;
532
533  if ((*toP == toLim) && (*fromP < fromLim))
534    return XML_CONVERT_OUTPUT_EXHAUSTED;
535  else
536    return XML_CONVERT_COMPLETED;
537}
538
/* Latin-1 encoding objects: normal_ tokenizer functions, the Latin-1
   converters, trailing ENCODING values (1, 0, 0), a byte-type table built
   from asciitab.h plus latin1tab.h, and no multi-byte predicates
   (NULL_VTABLE).  The non-_ns variant includes the ASCII table with BT_COLON
   redefined to BT_NMSTRT. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
560
561static enum XML_Convert_Result PTRCALL
562ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
563             char **toP, const char *toLim) {
564  UNUSED_P(enc);
565  while (*fromP < fromLim && *toP < toLim)
566    *(*toP)++ = *(*fromP)++;
567
568  if ((*toP == toLim) && (*fromP < fromLim))
569    return XML_CONVERT_OUTPUT_EXHAUSTED;
570  else
571    return XML_CONVERT_COMPLETED;
572}
573
/* ASCII encoding objects: normal_ tokenizer functions, ascii_toUtf8 together
   with latin1_toUtf16, trailing ENCODING values (1, 1, 0), and no multi-byte
   predicates.  Only the first half of the byte-type table is supplied (from
   asciitab.h); the remaining entries are zero-initialized, which the
   in-table comment equates with BT_NONXML. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
595
596static int PTRFASTCALL
597unicode_byte_type(char hi, char lo) {
598  switch ((unsigned char)hi) {
599  /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
600  case 0xD8:
601  case 0xD9:
602  case 0xDA:
603  case 0xDB:
604    return BT_LEAD4;
605  /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
606  case 0xDC:
607  case 0xDD:
608  case 0xDE:
609  case 0xDF:
610    return BT_TRAIL;
611  case 0xFF:
612    switch ((unsigned char)lo) {
613    case 0xFF: /* noncharacter-FFFF */
614    case 0xFE: /* noncharacter-FFFE */
615      return BT_NONXML;
616    }
617    break;
618  }
619  return BT_NONASCII;
620}
621
/* Define the function E##toUtf8, which converts UTF-16 input to UTF-8.
   The byte order is supplied by the GET_LO / GET_HI macros in effect where
   this macro is expanded.

   The generated function first rounds the input length down to a whole
   number of 16-bit units, then handles one unit per iteration, choosing the
   output length from the unit's high byte:
     - hi == 0 and lo < 0x80:  1 byte;
     - hi in 0x00..0x07:       2 bytes;
     - hi in 0xD8..0xDB:       4 bytes, built from this unit and the next one
                               (high surrogate followed by its partner);
     - anything else:          3 bytes.
   Output is written through *toP, which advances as bytes are stored.
   Before a character is emitted the remaining output space is checked; when
   it is too small, *fromP is set to the unit that could not be converted and
   XML_CONVERT_OUTPUT_EXHAUSTED is returned.  A high surrogate without a
   following unit inside the (rounded) input yields
   XML_CONVERT_INPUT_INCOMPLETE in the same way; note that the output-space
   check comes first.  Otherwise *fromP is set to the end of the converted
   input and XML_CONVERT_COMPLETED is returned. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
698
/* Define the function E##toUtf16, which copies UTF-16 input bytes into an
   array of unsigned short code units, assembling each unit from GET_HI and
   GET_LO as defined where this macro is expanded.

   The generated function rounds the input length down to a whole number of
   units.  If the input holds more units than the output can take and the
   last input unit has a high byte in 0xD8..0xDF, that unit is left
   unconverted and the pending result becomes XML_CONVERT_INPUT_INCOMPLETE.
   Units are then copied until input or output runs out, advancing *fromP and
   *toP.  Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full with
   input remaining, otherwise the pending result (XML_CONVERT_COMPLETED
   unless set as described). */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
719
/* Little-endian UTF-16: the low byte of a code unit comes first.
   Generates little2_toUtf8 and little2_toUtf16. */
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef GET_LO
#undef GET_HI

/* Big-endian UTF-16: the high byte of a code unit comes first.
   Generates big2_toUtf8 and big2_toUtf16. */
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef GET_LO
#undef GET_HI
737
/* Accessors for little-endian UTF-16, where p points at the low byte of a
   code unit and p[1] is its high byte.  The argument p is evaluated more
   than once and is parenthesized everywhere, so pointer expressions such as
   `buf + 2` may be passed. */

/* Byte type of the unit at p: the encoding's table for units whose high byte
   is zero, unicode_byte_type() for all others. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, -1 otherwise. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the unit at p has a zero high byte and low byte equal to c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name / name-start bitmap lookups for the unit at p. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
746
/* Two ways of building the little-endian UTF-16 tokenizer.

   XML_MIN_SIZE: no second copy of the tokenizer is generated.  The accessors
   are provided as functions (to be stored in the encoding via
   STANDARD_VTABLE(little2_)), and VTABLE is redefined to pair VTABLE1 --
   still expanded with the PREFIX in effect -- with the little2_ converters.

   Otherwise: xmltok_impl.c is included again with PREFIX set to little2_ and
   the accessor macros bound to the LITTLE2_* macros. */
#ifdef XML_MIN_SIZE

/* Function form of LITTLE2_BYTE_TYPE. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Function form of LITTLE2_BYTE_TO_ASCII. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* Function form of LITTLE2_CHAR_MATCHES. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* Multi-byte predicates are constant false for this instantiation; name
   checks go through the MINBPC variants instead. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
/* NOTE(review): IS_INVALID_CHAR is not defined in this branch; presumably
   xmltok_impl.c supplies a default when it is missing -- confirm there. */
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
810
#ifdef XML_NS

/* Little-endian UTF-16 encoding object for namespace-aware parsing (only
   built when XML_NS is defined).  The byte-type table is asciitab.h followed
   by latin1tab.h, included without remapping BT_COLON.  The last scalar of
   the inner initializer is 1 only when BYTEORDER is 1234.
   NOTE(review): by position the scalars after VTABLE look like
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration, which is not visible here. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
828
/* Little-endian UTF-16 encoding object for parsing without namespace
   processing.  It differs from the _ns variant only in the byte-type table:
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included, so every
   entry that header spells BT_COLON becomes BT_NMSTRT here. */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
844
/* "Internal" little-endian UTF-16 encoding objects, compiled unless the
   platform is known to be big-endian (BYTEORDER == 4321).  Compared with the
   external tables above they take their ASCII byte types from iasciitab.h
   instead of asciitab.h and always pass 1 as the last scalar of the inner
   initializer. */
#if BYTEORDER != 4321

#  ifdef XML_NS

/* Namespace-aware variant: BT_COLON entries are kept as they are. */
static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while iasciitab.h
   is included. */
static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
870
/* Character primitives for big-endian UTF-16.  p addresses a 2-byte code
   unit whose high-order byte is p[0] and whose low-order byte is p[1].

   BIG2_BYTE_TYPE             byte type: single-byte lookup on the low byte
                              when the high byte is 0, unicode_byte_type()
                              otherwise.
   BIG2_BYTE_TO_ASCII         p[1] when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES          non-zero iff high byte is 0 and p[1] == c.
   BIG2_IS_NAME_CHAR_MINBPC   namePages lookup on (high, low).
   BIG2_IS_NMSTRT_CHAR_MINBPC nmstrtPages lookup on (high, low).

   Every use of the macro argument is parenthesized so that an expression
   argument such as `ptr + 2` or `a ? b : c` expands correctly. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, (p) + 1)                                    \
               : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
879
#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: the big-endian UTF-16 primitives are emitted as real
   functions wrapping the BIG2_* macros, and xmltok_impl.c is not included
   for this encoding (that happens only in the #else branch). */

/* Byte type of the 2-byte character at p. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

/* Low byte (p[1]) of the character at p when its high byte is zero,
   otherwise -1. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

/* Non-zero iff the character at p has a zero high byte and low byte c. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

/* Name-character test for the 2-byte character at p (namePages lookup). */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

/* Name-start-character test for the 2-byte character at p (nmstrtPages
   lookup). */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* Point VTABLE at the big-endian converters for the encoding tables that
   follow.  VTABLE1 is defined outside this chunk. */
#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Full-size build: include xmltok_impl.c with PREFIX and the character
   primitives configured for big-endian UTF-16.
   NOTE(review): presumably the included file generates the big2_* scanner
   functions from these macros -- it is not visible here. */
#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The n-byte name tests are constant 0 for this encoding; only the *_MINBPC
   variants do a real lookup. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Remove the per-encoding configuration again. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
943
#ifdef XML_NS

/* Big-endian UTF-16 encoding object for namespace-aware parsing (only built
   when XML_NS is defined).  The byte-type table is asciitab.h followed by
   latin1tab.h, included without remapping BT_COLON.  The last scalar of the
   inner initializer is 1 only when BYTEORDER is 4321.
   NOTE(review): by position the scalars after VTABLE look like
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration, which is not visible here. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
961
/* Big-endian UTF-16 encoding object for parsing without namespace
   processing.  It differs from the _ns variant only in the byte-type table:
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included, so every
   entry that header spells BT_COLON becomes BT_NMSTRT here. */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
977
/* "Internal" big-endian UTF-16 encoding objects, compiled unless the
   platform is known to be little-endian (BYTEORDER == 1234).  Compared with
   the external tables above they take their ASCII byte types from
   iasciitab.h instead of asciitab.h and always pass 1 as the last scalar of
   the inner initializer. */
#if BYTEORDER != 1234

#  ifdef XML_NS

/* Namespace-aware variant: BT_COLON entries are kept as they are. */
static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while iasciitab.h
   is included. */
static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

/* No further per-encoding template instantiations follow in this part of
   the file, so the PREFIX helper is dropped. */
#undef PREFIX
1005
1006static int FASTCALL
1007streqci(const char *s1, const char *s2) {
1008  for (;;) {
1009    char c1 = *s1++;
1010    char c2 = *s2++;
1011    if (ASCII_a <= c1 && c1 <= ASCII_z)
1012      c1 += ASCII_A - ASCII_a;
1013    if (ASCII_a <= c2 && c2 <= ASCII_z)
1014      /* The following line will never get executed.  streqci() is
1015       * only called from two places, both of which guarantee to put
1016       * upper-case strings into s2.
1017       */
1018      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1019    if (c1 != c2)
1020      return 0;
1021    if (! c1)
1022      break;
1023  }
1024  return 1;
1025}
1026
/* Position-update routine that ignores the encoding it is handed and always
   delegates to normal_updatePosition() with the built-in UTF-8 encoding.
   NOTE(review): judging by the name this serves the initial, not yet
   detected encoding -- confirm where it is installed. */
static void PTRCALL
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
                   POSITION *pos) {
  UNUSED_P(enc);
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
1033
1034static int
1035toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1036  char buf[1];
1037  char *p = buf;
1038  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1039  if (p == buf)
1040    return -1;
1041  else
1042    return buf[0];
1043}
1044
1045static int FASTCALL
1046isSpace(int c) {
1047  switch (c) {
1048  case 0x20:
1049  case 0xD:
1050  case 0xA:
1051  case 0x9:
1052    return 1;
1053  }
1054  return 0;
1055}
1056
1057/* Return 1 if there's just optional white space or there's an S
1058   followed by name=val.
1059*/
1060static int
1061parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1062                     const char **namePtr, const char **nameEndPtr,
1063                     const char **valPtr, const char **nextTokPtr) {
1064  int c;
1065  char open;
1066  if (ptr == end) {
1067    *namePtr = NULL;
1068    return 1;
1069  }
1070  if (! isSpace(toAscii(enc, ptr, end))) {
1071    *nextTokPtr = ptr;
1072    return 0;
1073  }
1074  do {
1075    ptr += enc->minBytesPerChar;
1076  } while (isSpace(toAscii(enc, ptr, end)));
1077  if (ptr == end) {
1078    *namePtr = NULL;
1079    return 1;
1080  }
1081  *namePtr = ptr;
1082  for (;;) {
1083    c = toAscii(enc, ptr, end);
1084    if (c == -1) {
1085      *nextTokPtr = ptr;
1086      return 0;
1087    }
1088    if (c == ASCII_EQUALS) {
1089      *nameEndPtr = ptr;
1090      break;
1091    }
1092    if (isSpace(c)) {
1093      *nameEndPtr = ptr;
1094      do {
1095        ptr += enc->minBytesPerChar;
1096      } while (isSpace(c = toAscii(enc, ptr, end)));
1097      if (c != ASCII_EQUALS) {
1098        *nextTokPtr = ptr;
1099        return 0;
1100      }
1101      break;
1102    }
1103    ptr += enc->minBytesPerChar;
1104  }
1105  if (ptr == *namePtr) {
1106    *nextTokPtr = ptr;
1107    return 0;
1108  }
1109  ptr += enc->minBytesPerChar;
1110  c = toAscii(enc, ptr, end);
1111  while (isSpace(c)) {
1112    ptr += enc->minBytesPerChar;
1113    c = toAscii(enc, ptr, end);
1114  }
1115  if (c != ASCII_QUOT && c != ASCII_APOS) {
1116    *nextTokPtr = ptr;
1117    return 0;
1118  }
1119  open = (char)c;
1120  ptr += enc->minBytesPerChar;
1121  *valPtr = ptr;
1122  for (;; ptr += enc->minBytesPerChar) {
1123    c = toAscii(enc, ptr, end);
1124    if (c == open)
1125      break;
1126    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1127        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1128        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1129      *nextTokPtr = ptr;
1130      return 0;
1131    }
1132  }
1133  *nextTokPtr = ptr + enc->minBytesPerChar;
1134  return 1;
1135}
1136
/* Pseudo-attribute names and values that doParseXmlDecl() compares against.
   They are spelled with ASCII_* constants rather than string literals;
   NOTE(review): presumably so the bytes are ASCII regardless of the
   compiler's execution character set -- confirm against the ASCII_*
   definitions. */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1150
/* Parse the pseudo-attributes of an XML declaration or text declaration.

   encodingFinder       maps an encoding name, given as a [start, end) range
                        in enc, to an ENCODING; only called when `encoding`
                        is non-NULL.
   isGeneralTextEntity  non-zero selects text-declaration rules: "version"
                        may be omitted, "encoding" is mandatory and
                        "standalone" is rejected.
   enc, ptr, end        the declaration.  Before parsing, ptr is advanced by
                        5 characters and end pulled back by 2 (presumably to
                        skip "<?xml" and "?>" -- the caller is not visible
                        here).
   badPtr               receives the offending position on failure; it is
                        written unconditionally, so it must not be NULL.
   versionPtr, versionEndPtr, encodingName, encoding, standalone
                        optional outputs; each is written only when non-NULL
                        and only when the matching pseudo-attribute is seen.

   Returns 1 on success and 0 if the declaration is malformed. */
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
                                                 const char *),
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
               const char *end, const char **badPtr, const char **versionPtr,
               const char **versionEndPtr, const char **encodingName,
               const ENCODING **encoding, int *standalone) {
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is required. */
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || ! name) {
    *badPtr = ptr;
    return 0;
  }
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a text declaration may start with something other than
       "version"; the name is then checked against "encoding" below. */
    if (! isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  } else {
    if (versionPtr)
      *versionPtr = val;
    /* ptr is now just past the closing quote of the version value. */
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* An encoding name has to begin with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr - minBytesPerChar is the closing quote, i.e. the end of the name. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name)
      return 1;
  }
  /* Whatever is left must be "standalone", which a text declaration does
     not allow. */
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  } else {
    *badPtr = val;
    return 0;
  }
  /* Only white space may follow the standalone pseudo-attribute. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
1231
1232static int FASTCALL
1233checkCharRefNumber(int result) {
1234  switch (result >> 8) {
1235  case 0xD8:
1236  case 0xD9:
1237  case 0xDA:
1238  case 0xDB:
1239  case 0xDC:
1240  case 0xDD:
1241  case 0xDE:
1242  case 0xDF:
1243    return -1;
1244  case 0:
1245    if (latin1_encoding.type[result] == BT_NONXML)
1246      return -1;
1247    break;
1248  case 0xFF:
1249    if (result == 0xFFFE || result == 0xFFFF)
1250      return -1;
1251    break;
1252  }
1253  return result;
1254}
1255
1256int FASTCALL
1257XmlUtf8Encode(int c, char *buf) {
1258  enum {
1259    /* minN is minimum legal resulting value for N byte sequence */
1260    min2 = 0x80,
1261    min3 = 0x800,
1262    min4 = 0x10000
1263  };
1264
1265  if (c < 0)
1266    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1267  if (c < min2) {
1268    buf[0] = (char)(c | UTF8_cval1);
1269    return 1;
1270  }
1271  if (c < min3) {
1272    buf[0] = (char)((c >> 6) | UTF8_cval2);
1273    buf[1] = (char)((c & 0x3f) | 0x80);
1274    return 2;
1275  }
1276  if (c < min4) {
1277    buf[0] = (char)((c >> 12) | UTF8_cval3);
1278    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1279    buf[2] = (char)((c & 0x3f) | 0x80);
1280    return 3;
1281  }
1282  if (c < 0x110000) {
1283    buf[0] = (char)((c >> 18) | UTF8_cval4);
1284    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1285    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1286    buf[3] = (char)((c & 0x3f) | 0x80);
1287    return 4;
1288  }
1289  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1290}
1291
1292int FASTCALL
1293XmlUtf16Encode(int charNum, unsigned short *buf) {
1294  if (charNum < 0)
1295    return 0;
1296  if (charNum < 0x10000) {
1297    buf[0] = (unsigned short)charNum;
1298    return 1;
1299  }
1300  if (charNum < 0x110000) {
1301    charNum -= 0x10000;
1302    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1303    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1304    return 2;
1305  }
1306  return 0;
1307}
1308
/* An encoding described at run time by a 256-entry byte map plus an optional
   converter callback; filled in by XmlInitUnknownEncoding(). */
struct unknown_encoding {
  /* Starts out as a copy of latin1_encoding and is then patched. */
  struct normal_encoding normal;
  /* Callback used for bytes that start a multi-byte sequence. */
  CONVERTER convert;
  /* Passed as the first argument to convert. */
  void *userData;
  /* Per byte value: the UTF-16 unit to emit, or 0 to call convert. */
  unsigned short utf16[256];
  /* Per byte value: [0] is the UTF-8 length (0 means call convert), the
     following bytes are the UTF-8 sequence. */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding it belongs to.
   NOTE(review): relies on the ENCODING sitting at offset 0 of
   struct unknown_encoding -- confirm against struct normal_encoding. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1318
1319int
1320XmlSizeOfUnknownEncoding(void) {
1321  return sizeof(struct unknown_encoding);
1322}
1323
1324static int PTRFASTCALL
1325unknown_isName(const ENCODING *enc, const char *p) {
1326  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327  int c = uenc->convert(uenc->userData, p);
1328  if (c & ~0xFFFF)
1329    return 0;
1330  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1331}
1332
1333static int PTRFASTCALL
1334unknown_isNmstrt(const ENCODING *enc, const char *p) {
1335  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336  int c = uenc->convert(uenc->userData, p);
1337  if (c & ~0xFFFF)
1338    return 0;
1339  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1340}
1341
1342static int PTRFASTCALL
1343unknown_isInvalid(const ENCODING *enc, const char *p) {
1344  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345  int c = uenc->convert(uenc->userData, p);
1346  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1347}
1348
1349static enum XML_Convert_Result PTRCALL
1350unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1351               char **toP, const char *toLim) {
1352  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1353  char buf[XML_UTF8_ENCODE_MAX];
1354  for (;;) {
1355    const char *utf8;
1356    int n;
1357    if (*fromP == fromLim)
1358      return XML_CONVERT_COMPLETED;
1359    utf8 = uenc->utf8[(unsigned char)**fromP];
1360    n = *utf8++;
1361    if (n == 0) {
1362      int c = uenc->convert(uenc->userData, *fromP);
1363      n = XmlUtf8Encode(c, buf);
1364      if (n > toLim - *toP)
1365        return XML_CONVERT_OUTPUT_EXHAUSTED;
1366      utf8 = buf;
1367      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1368                 - (BT_LEAD2 - 2));
1369    } else {
1370      if (n > toLim - *toP)
1371        return XML_CONVERT_OUTPUT_EXHAUSTED;
1372      (*fromP)++;
1373    }
1374    memcpy(*toP, utf8, n);
1375    *toP += n;
1376  }
1377}
1378
1379static enum XML_Convert_Result PTRCALL
1380unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1381                unsigned short **toP, const unsigned short *toLim) {
1382  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1383  while (*fromP < fromLim && *toP < toLim) {
1384    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1385    if (c == 0) {
1386      c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1387      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1388                 - (BT_LEAD2 - 2));
1389    } else
1390      (*fromP)++;
1391    *(*toP)++ = c;
1392  }
1393
1394  if ((*toP == toLim) && (*fromP < fromLim))
1395    return XML_CONVERT_OUTPUT_EXHAUSTED;
1396  else
1397    return XML_CONVERT_COMPLETED;
1398}
1399
1400ENCODING *
1401XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1402                       void *userData) {
1403  int i;
1404  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1405  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1406  for (i = 0; i < 128; i++)
1407    if (latin1_encoding.type[i] != BT_OTHER
1408        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1409      return 0;
1410  for (i = 0; i < 256; i++) {
1411    int c = table[i];
1412    if (c == -1) {
1413      e->normal.type[i] = BT_MALFORM;
1414      /* This shouldn't really get used. */
1415      e->utf16[i] = 0xFFFF;
1416      e->utf8[i][0] = 1;
1417      e->utf8[i][1] = 0;
1418    } else if (c < 0) {
1419      if (c < -4)
1420        return 0;
1421      /* Multi-byte sequences need a converter function */
1422      if (! convert)
1423        return 0;
1424      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1425      e->utf8[i][0] = 0;
1426      e->utf16[i] = 0;
1427    } else if (c < 0x80) {
1428      if (latin1_encoding.type[c] != BT_OTHER
1429          && latin1_encoding.type[c] != BT_NONXML && c != i)
1430        return 0;
1431      e->normal.type[i] = latin1_encoding.type[c];
1432      e->utf8[i][0] = 1;
1433      e->utf8[i][1] = (char)c;
1434      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1435    } else if (checkCharRefNumber(c) < 0) {
1436      e->normal.type[i] = BT_NONXML;
1437      /* This shouldn't really get used. */
1438      e->utf16[i] = 0xFFFF;
1439      e->utf8[i][0] = 1;
1440      e->utf8[i][1] = 0;
1441    } else {
1442      if (c > 0xFFFF)
1443        return 0;
1444      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1445        e->normal.type[i] = BT_NMSTRT;
1446      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1447        e->normal.type[i] = BT_NAME;
1448      else
1449        e->normal.type[i] = BT_OTHER;
1450      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1451      e->utf16[i] = (unsigned short)c;
1452    }
1453  }
1454  e->userData = userData;
1455  e->convert = convert;
1456  if (convert) {
1457    e->normal.isName2 = unknown_isName;
1458    e->normal.isName3 = unknown_isName;
1459    e->normal.isName4 = unknown_isName;
1460    e->normal.isNmstrt2 = unknown_isNmstrt;
1461    e->normal.isNmstrt3 = unknown_isNmstrt;
1462    e->normal.isNmstrt4 = unknown_isNmstrt;
1463    e->normal.isInvalid2 = unknown_isInvalid;
1464    e->normal.isInvalid3 = unknown_isInvalid;
1465    e->normal.isInvalid4 = unknown_isInvalid;
1466  }
1467  e->normal.enc.utf8Convert = unknown_toUtf8;
1468  e->normal.enc.utf16Convert = unknown_toUtf16;
1469  return &(e->normal.enc);
1470}
1471
/* Indices of the built-in encodings.  getEncodingIndex() returns its table
   position as one of these values, so the order from ISO_8859_1_ENC to
   UTF_16LE_ENC has to match its name table; initScan() uses the same values
   to index the encoding table it is given.

   If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  UNKNOWN_ENC = -1, /* name not recognized by getEncodingIndex() */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC /* returned by getEncodingIndex() for a NULL name */
};
1485
/* Upper-case names of the built-in encodings, used by getEncodingIndex().
   They are spelled with ASCII_* constants rather than string literals. */
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1502
1503static int FASTCALL
1504getEncodingIndex(const char *name) {
1505  static const char *const encodingNames[] = {
1506      KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1507  };
1508  int i;
1509  if (name == NULL)
1510    return NO_ENC;
1511  for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1512    if (streqci(name, encodingNames[i]))
1513      return i;
1514  return UNKNOWN_ENC;
1515}
1516
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX stores
   i, narrowed to char.  The argument i is parenthesized so that the cast
   applies to the whole expression and not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1523
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The first one to three bytes of [ptr, end) are inspected.  Returns
   XML_TOK_NONE if there is no input, XML_TOK_PARTIAL if more bytes are
   needed to decide, and XML_TOK_BOM (with *nextTokPtr just past it) if a
   byte order mark was consumed.  Otherwise the chosen encoding is stored
   through enc->encPtr and the result of tokenizing with it is returned.
*/

static int
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
  const ENCODING **encPtr;

  if (ptr >= end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  } else {
    /* Two or more bytes: decide on the first two taken as a big-endian
       16-bit value. */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* UTF-16 big-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* '<' followed by a zero byte: little-endian UTF-16 without a BOM */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* UTF-16 little-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
            || e == UTF_16_ENC)
          break;
      }
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      } else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* Nothing was detected: use the encoding that was specified at
     initialization. */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1642
/* First inclusion of xmltok_ns.c: NS() and ns() leave identifiers unchanged,
   so the included definitions keep their plain names.
   NOTE(review): presumably these are the non-namespace variants of the
   public entry points -- the included file is not visible here. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
1650
1651#ifdef XML_NS
1652
/* Second inclusion of xmltok_ns.c, only with XML_NS: NS() appends "NS" and
   ns() appends "_ns" to identifiers, so the same source yields a second,
   distinctly named set of definitions. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns
1662
1663ENCODING *
1664XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1665                         void *userData) {
1666  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1667  if (enc)
1668    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1669  return enc;
1670}
1671
1672#endif /* XML_NS */
1673