1/*
2                            __  __            _
3                         ___\ \/ /_ __   __ _| |_
4                        / _ \\  /| '_ \ / _` | __|
5                       |  __//  \| |_) | (_| | |_
6                        \___/_/\_\ .__/ \__,_|\__|
7                                 |_| XML parser
8
9   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14   Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15   Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
16   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17   Copyright (c) 2016      Don Lewis <truckman@apache.org>
18   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23   Copyright (c) 2021      Dong-hee Na <donghee.na@python.org>
24   Licensed under the MIT license:
25
26   Permission is  hereby granted,  free of charge,  to any  person obtaining
27   a  copy  of  this  software   and  associated  documentation  files  (the
28   "Software"),  to  deal in  the  Software  without restriction,  including
29   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
30   distribute, sublicense, and/or sell copies of the Software, and to permit
31   persons  to whom  the Software  is  furnished to  do so,  subject to  the
32   following conditions:
33
34   The above copyright  notice and this permission notice  shall be included
35   in all copies or substantial portions of the Software.
36
37   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
38   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
39   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
40   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
41   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
42   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
43   USE OR OTHER DEALINGS IN THE SOFTWARE.
44*/
45
46#include <expat_config.h>
47
48#include <stddef.h>
49#include <string.h> /* memcpy */
50#include <stdbool.h>
51
52#ifdef _WIN32
53#  include "winconfig.h"
54#endif
55
56#include "expat_external.h"
57#include "internal.h"
58#include "xmltok.h"
59#include "nametab.h"
60
/* PREFIX(ignoreSectionTok) is appended to the first scanner group of
   VTABLE1 only when XML_DTD is defined; otherwise this expands to nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer list of PREFIX()-qualified tokenizer functions: two
   brace-enclosed groups of scanners followed by the individual helpers.
   PREFIX is expanded where VTABLE1 is used, so the list picks up whichever
   prefix is in effect at that point. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-qualified UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
76
/* Tests a UCS-2 character against namingBitmap.  pages[hi] selects a group
   of eight bitmap words, the upper three bits of lo select the word inside
   that group and the lower five bits of lo select the bit inside the word.
   Evaluates to non-zero when that bit is set.
   Every macro argument is parenthesized so that arbitrary expressions
   (for example `table + 1`) can be passed without precedence surprises. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
79
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.

   `byte` is indexed as byte[0] and byte[1]; the result is non-zero when
   the selected bit of namingBitmap is set.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.

   `byte` is indexed as byte[0] .. byte[2]; the result is non-zero when
   the selected bit of namingBitmap is set.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
100
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to the lead byte of a 2-, 3- or 4-byte
   sequence and evaluates to non-zero when the sequence is invalid.  The
   comparisons against values such as 0xC2 only work on unsigned bytes;
   the callers in this file pass a `const unsigned char *`.

   The argument is always accessed as (p)[i], so any pointer expression
   (for example `buf + 1`) may be passed.
*/

#define UTF8_INVALID2(p)                                                       \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((p)[0] == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((p)[0] == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
130
131static int PTRFASTCALL
132isNever(const ENCODING *enc, const char *p) {
133  UNUSED_P(enc);
134  UNUSED_P(p);
135  return 0;
136}
137
138static int PTRFASTCALL
139utf8_isName2(const ENCODING *enc, const char *p) {
140  UNUSED_P(enc);
141  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
142}
143
144static int PTRFASTCALL
145utf8_isName3(const ENCODING *enc, const char *p) {
146  UNUSED_P(enc);
147  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
148}
149
/* The 4-byte name-character test is isNever, which always returns 0. */
#define utf8_isName4 isNever
151
152static int PTRFASTCALL
153utf8_isNmstrt2(const ENCODING *enc, const char *p) {
154  UNUSED_P(enc);
155  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
156}
157
158static int PTRFASTCALL
159utf8_isNmstrt3(const ENCODING *enc, const char *p) {
160  UNUSED_P(enc);
161  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
162}
163
/* The 4-byte name-start test is isNever, which always returns 0. */
#define utf8_isNmstrt4 isNever
165
166static int PTRFASTCALL
167utf8_isInvalid2(const ENCODING *enc, const char *p) {
168  UNUSED_P(enc);
169  return UTF8_INVALID2((const unsigned char *)p);
170}
171
172static int PTRFASTCALL
173utf8_isInvalid3(const ENCODING *enc, const char *p) {
174  UNUSED_P(enc);
175  return UTF8_INVALID3((const unsigned char *)p);
176}
177
178static int PTRFASTCALL
179utf8_isInvalid4(const ENCODING *enc, const char *p) {
180  UNUSED_P(enc);
181  return UTF8_INVALID4((const unsigned char *)p);
182}
183
/* An ENCODING followed by a per-byte classification table and a set of
   function-pointer hooks.  `enc` is the first member; AS_NORMAL_ENCODING
   casts an ENCODING pointer to this type to reach the other members. */
struct normal_encoding {
  ENCODING enc;
  /* Byte-type code for each of the 256 possible byte values. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Present only in XML_MIN_SIZE builds: function-pointer forms of the
     BYTE_TYPE, IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII
     and CHAR_MATCHES operations. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name-character tests for 2-, 3- and 4-byte sequences. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  /* Name-start-character tests for 2-, 3- and 4-byte sequences. */
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  /* Validity tests for 2-, 3- and 4-byte sequences; non-zero means
     invalid.  Encodings initialized with NULL_VTABLE leave all nine of
     these hooks NULL. */
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
204
/* Views an ENCODING pointer as a pointer to struct normal_encoding. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

#ifdef XML_MIN_SIZE

/* Initializers for the five XML_MIN_SIZE-only members of struct
   normal_encoding, using the functions whose names start with prefix E.
   The expansion ends in a comma so that another initializer list can
   follow it directly. */
#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

/* Without XML_MIN_SIZE those members do not exist, so nothing is emitted. */
#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine isName / isNmstrt / isInvalid hooks, using the
   functions whose names start with prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Initializers that set all nine multi-byte hooks to NULL. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
226
/* Forward declaration, placed ahead of the inclusion of xmltok_impl.c
   below; the definition is not in this part of the file. */
static int FASTCALL checkCharRefNumber(int);
228
229#include "xmltok_impl.h"
230#include "ascii.h"
231
/* Configuration macros for the "normal_" tokenizer that is generated by
   including xmltok_impl.c further down.  In XML_MIN_SIZE builds most
   operations dispatch through function pointers stored in struct
   normal_encoding; otherwise they are fixed for one byte per character. */

/* In XML_MIN_SIZE builds the sb_ isNameMin / isNmstrtMin slots are filled
   with isNever, which always returns 0. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif

/* Byte type of the single byte at p, read from the encoding's type table. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, for use as a byteType hook. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
/* byteToAscii hook for single-byte encodings: returns the byte at p
   unchanged. */
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif

/* Multi-byte tests always go through the hooks in struct normal_encoding;
   n is the sequence length (2, 3 or 4) and is pasted into the member name. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
#ifdef XML_MIN_SIZE
/* In XML_MIN_SIZE builds the isInvalid hook is checked for NULL before it
   is called; a NULL hook makes the expression 0. */
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
/* Constant 0 in this configuration. */
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
/* charMatches hook for single-byte encodings: compares the byte at p
   with c. */
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == c)
#endif
301
/* Instantiate the tokenizer implementation with the "normal_" name prefix,
   using the configuration macros defined above.  XML_TOK_IMPL_C is defined
   only for the duration of the inclusion. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the configuration macros so that they can be defined again for a
   later instantiation.  PREFIX itself is left defined here. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
316
/* Lead-byte marker bits for UTF-8 sequences of length N.  The converters
   in this file OR UTF8_cval2, UTF8_cval3 and UTF8_cval4 into the payload
   bits of the first output byte. */
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};
323
/* Moves *fromLimRef backwards, if needed, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards `from`, counting the bytes
   stepped over.  On reaching a lead byte, the limit is placed right after
   the complete character if enough bytes follow it; otherwise the partial
   character is dropped and the scan continues.  A single-byte (ASCII)
   character stops the scan with the limit right behind it.  If no stopping
   point is found, the limit ends up equal to `from`.

   from        start of the buffer; never read before this address.
   fromLimRef  in: one past the last candidate byte; out: trimmed limit. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t stepped = 0; /* bytes passed since the last restart */

  while (end > from) {
    const unsigned char byte = (unsigned char)end[-1];
    size_t seqLen = 0; /* stays 0 for anything that is not a lead byte */

    if ((byte & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((byte & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead byte 0b11110xxx */
    } else if ((byte & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead byte 0b1110xxxx */
    } else if ((byte & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead byte 0b110xxxxx */
    }

    if (seqLen != 0) {
      if (stepped + 1 >= seqLen) {
        /* The character is complete: end right behind its last byte. */
        end += seqLen - 1;
        break;
      }
      /* Incomplete character: drop it and start counting afresh. */
      stepped = 0;
    }

    end--;
    stepped++;
  }

  *fromLimRef = end;
}
362
363static enum XML_Convert_Result PTRCALL
364utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
365            char **toP, const char *toLim) {
366  bool input_incomplete = false;
367  bool output_exhausted = false;
368
369  /* Avoid copying partial characters (due to limited space). */
370  const ptrdiff_t bytesAvailable = fromLim - *fromP;
371  const ptrdiff_t bytesStorable = toLim - *toP;
372  UNUSED_P(enc);
373  if (bytesAvailable > bytesStorable) {
374    fromLim = *fromP + bytesStorable;
375    output_exhausted = true;
376  }
377
378  /* Avoid copying partial characters (from incomplete input). */
379  {
380    const char *const fromLimBefore = fromLim;
381    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
382    if (fromLim < fromLimBefore) {
383      input_incomplete = true;
384    }
385  }
386
387  {
388    const ptrdiff_t bytesToCopy = fromLim - *fromP;
389    memcpy(*toP, *fromP, bytesToCopy);
390    *fromP += bytesToCopy;
391    *toP += bytesToCopy;
392  }
393
394  if (output_exhausted) /* needs to go first */
395    return XML_CONVERT_OUTPUT_EXHAUSTED;
396  else if (input_incomplete)
397    return XML_CONVERT_INPUT_INCOMPLETE;
398  else
399    return XML_CONVERT_COMPLETED;
400}
401
402static enum XML_Convert_Result PTRCALL
403utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
404             unsigned short **toP, const unsigned short *toLim) {
405  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
406  unsigned short *to = *toP;
407  const char *from = *fromP;
408  while (from < fromLim && to < toLim) {
409    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
410    case BT_LEAD2:
411      if (fromLim - from < 2) {
412        res = XML_CONVERT_INPUT_INCOMPLETE;
413        goto after;
414      }
415      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
416      from += 2;
417      break;
418    case BT_LEAD3:
419      if (fromLim - from < 3) {
420        res = XML_CONVERT_INPUT_INCOMPLETE;
421        goto after;
422      }
423      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
424                               | (from[2] & 0x3f));
425      from += 3;
426      break;
427    case BT_LEAD4: {
428      unsigned long n;
429      if (toLim - to < 2) {
430        res = XML_CONVERT_OUTPUT_EXHAUSTED;
431        goto after;
432      }
433      if (fromLim - from < 4) {
434        res = XML_CONVERT_INPUT_INCOMPLETE;
435        goto after;
436      }
437      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
438          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
439      n -= 0x10000;
440      to[0] = (unsigned short)((n >> 10) | 0xD800);
441      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
442      to += 2;
443      from += 4;
444    } break;
445    default:
446      *to++ = *from++;
447      break;
448    }
449  }
450  if (from < fromLim)
451    res = XML_CONVERT_OUTPUT_EXHAUSTED;
452after:
453  *fromP = from;
454  *toP = to;
455  return res;
456}
457
/* UTF-8 encoding objects.  Each one combines:
     - the ENCODING part: the normal_ tokenizer functions (VTABLE1), the
       two UTF-8 converters and three integers (presumably
       minBytesPerChar and two encoding flags -- confirm against the
       ENCODING declaration in xmltok.h);
     - the 256-entry byte-type table, filled by including an ASCII table
       header followed by utf8tab.h;
     - the sb_ standard hooks (XML_MIN_SIZE builds only) and the utf8_
       multi-byte hooks.
   The *_ns variants exist only when XML_NS is defined.  The non-_ns
   variants include the ASCII table with BT_COLON defined as BT_NMSTRT, so
   any BT_COLON entry in that header becomes BT_NMSTRT.  The "internal_"
   variants use iasciitab.h instead of asciitab.h. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
499
500static enum XML_Convert_Result PTRCALL
501latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
502              char **toP, const char *toLim) {
503  UNUSED_P(enc);
504  for (;;) {
505    unsigned char c;
506    if (*fromP == fromLim)
507      return XML_CONVERT_COMPLETED;
508    c = (unsigned char)**fromP;
509    if (c & 0x80) {
510      if (toLim - *toP < 2)
511        return XML_CONVERT_OUTPUT_EXHAUSTED;
512      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
513      *(*toP)++ = (char)((c & 0x3f) | 0x80);
514      (*fromP)++;
515    } else {
516      if (*toP == toLim)
517        return XML_CONVERT_OUTPUT_EXHAUSTED;
518      *(*toP)++ = *(*fromP)++;
519    }
520  }
521}
522
523static enum XML_Convert_Result PTRCALL
524latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
525               unsigned short **toP, const unsigned short *toLim) {
526  UNUSED_P(enc);
527  while (*fromP < fromLim && *toP < toLim)
528    *(*toP)++ = (unsigned char)*(*fromP)++;
529
530  if ((*toP == toLim) && (*fromP < fromLim))
531    return XML_CONVERT_OUTPUT_EXHAUSTED;
532  else
533    return XML_CONVERT_COMPLETED;
534}
535
/* Latin-1 encoding objects: the normal_ tokenizer functions (VTABLE1) with
   the latin1_ converters, a byte-type table built from asciitab.h followed
   by latin1tab.h, the sb_ standard hooks (XML_MIN_SIZE builds only) and
   NULL for all multi-byte hooks.  The _ns variant exists only when XML_NS
   is defined; the other variant includes asciitab.h with BT_COLON defined
   as BT_NMSTRT. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
557
558static enum XML_Convert_Result PTRCALL
559ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
560             char **toP, const char *toLim) {
561  UNUSED_P(enc);
562  while (*fromP < fromLim && *toP < toLim)
563    *(*toP)++ = *(*fromP)++;
564
565  if ((*toP == toLim) && (*fromP < fromLim))
566    return XML_CONVERT_OUTPUT_EXHAUSTED;
567  else
568    return XML_CONVERT_COMPLETED;
569}
570
/* ASCII encoding objects: the normal_ tokenizer functions (VTABLE1) with
   ascii_toUtf8 and latin1_toUtf16 as converters.  Only asciitab.h
   contributes to the byte-type table; the remaining entries are left to
   zero-initialization, which the in-line comment equates with BT_NONXML.
   All multi-byte hooks are NULL.  The _ns variant exists only when XML_NS
   is defined; the other variant includes asciitab.h with BT_COLON defined
   as BT_NMSTRT. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
592
593static int PTRFASTCALL
594unicode_byte_type(char hi, char lo) {
595  switch ((unsigned char)hi) {
596  /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
597  case 0xD8:
598  case 0xD9:
599  case 0xDA:
600  case 0xDB:
601    return BT_LEAD4;
602  /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
603  case 0xDC:
604  case 0xDD:
605  case 0xDE:
606  case 0xDF:
607    return BT_TRAIL;
608  case 0xFF:
609    switch ((unsigned char)lo) {
610    case 0xFF: /* noncharacter-FFFF */
611    case 0xFE: /* noncharacter-FFFE */
612      return BT_NONXML;
613    }
614    break;
615  }
616  return BT_NONASCII;
617}
618
/* Generates the function E##toUtf8, which converts UTF-16 input from
   [*fromP, fromLim) to UTF-8 in [*toP, toLim).  GET_LO and GET_HI must be
   defined where the macro is expanded; they select the low and high byte
   of each 2-byte code unit and thereby fix the byte order.

   fromLim is first rounded down so that an even number of bytes is
   processed.  Each code unit is then encoded depending on its high byte:
     - 0 with a low byte below 0x80: 1 output byte;
     - 0 (otherwise) to 7: 2 output bytes;
     - 0xD8 to 0xDB: together with the following code unit, 4 output bytes;
     - anything else: 3 output bytes.

   *toP is advanced as bytes are written.  When the next character does
   not fit into the output, *fromP is set to that character and
   XML_CONVERT_OUTPUT_EXHAUSTED is returned.  When a 0xD8-0xDB unit is not
   followed by another unit within fromLim, *fromP is set to it and
   XML_CONVERT_INPUT_INCOMPLETE is returned; note that the output-space
   check for this case comes first.  enc is unused. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
695
/* Generates the function E##toUtf16, which copies UTF-16 code units from
   the byte range [*fromP, fromLim) into [*toP, toLim), assembling each
   unsigned short as (GET_HI << 8) | GET_LO.  GET_LO and GET_HI must be
   defined where the macro is expanded.

   fromLim is first rounded down so that an even number of bytes is
   processed.  If the input holds more bytes than twice the number of free
   output units and the high byte of the last input unit is in the range
   0xD8-0xDF, that last unit is excluded and the provisional result
   becomes XML_CONVERT_INPUT_INCOMPLETE.

   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while input
   remains; otherwise the provisional result (XML_CONVERT_COMPLETED or
   XML_CONVERT_INPUT_INCOMPLETE).  enc is unused. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
716
/* Little-endian byte access: the low byte of a code unit is stored first.
   SET2 stores the 16-bit value ch at ptr; GET_LO / GET_HI read the low and
   high byte. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Defines little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian byte access: the high byte of a code unit is stored first. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Defines big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
738
/* Character helpers for little-endian 2-byte encodings, where p[0] is the
   low byte and p[1] the high byte of a code unit.  All macro arguments are
   parenthesized so that arbitrary expressions may be passed. */

/* Byte type of the code unit at p: a lookup in the encoding's type table
   when the high byte is 0, unicode_byte_type() otherwise. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]  \
               : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is 0, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the code unit at p has high byte 0 and low byte c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Name and name-start tests for the code unit at p via UCS2_GET_NAMING. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
748
/* Two ways of providing the little2_ tokenizer:
     - XML_MIN_SIZE: xmltok_impl.c is not included again.  Function forms
       of the LITTLE2_* macros are defined as hooks for STANDARD_VTABLE,
       and VTABLE is redefined to name the little2_ converters explicitly
       (PREFIX is left unchanged in this branch).
     - otherwise: xmltok_impl.c is included with PREFIX set to little2_
       and the configuration macros mapped onto the LITTLE2_* macros. */
#ifdef XML_MIN_SIZE

/* byteType hook: see LITTLE2_BYTE_TYPE. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* byteToAscii hook: the low byte when the high byte is 0, otherwise -1. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* charMatches hook: non-zero when the code unit at p equals c. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* isNameMin hook: name-character test for the code unit at p. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* isNmstrtMin hook: name-start-character test for the code unit at p. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* The multi-byte tests are constant 0 for this instantiation; the
   per-character tests use the *_MINBPC forms instead. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
812
/* Little-endian two-byte encoding objects.
   NOTE(review): the three values after VTABLE presumably initialise
   minBytesPerChar, isUtf8 and isUtf16 - confirm against struct encoding.
   The last one is 1 only when BYTEORDER == 1234. */
813#ifdef XML_NS
814
/* Namespace variant: asciitab.h is included with BT_COLON left untouched. */
815static const struct normal_encoding little2_encoding_ns
816    = {{VTABLE, 2, 0,
817#  if BYTEORDER == 1234
818        1
819#  else
820        0
821#  endif
822       },
823       {
824#  include "asciitab.h"
825#  include "latin1tab.h"
826       },
827       STANDARD_VTABLE(little2_) NULL_VTABLE};
828
829#endif
830
/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while asciitab.h
   is included, so the colon is typed as an ordinary name-start character. */
831static const struct normal_encoding little2_encoding
832    = {{VTABLE, 2, 0,
833#if BYTEORDER == 1234
834        1
835#else
836        0
837#endif
838       },
839       {
840#define BT_COLON BT_NMSTRT
841#include "asciitab.h"
842#undef BT_COLON
843#include "latin1tab.h"
844       },
845       STANDARD_VTABLE(little2_) NULL_VTABLE};
846
/* "Internal" little-endian encodings, compiled unless BYTEORDER == 4321.
   They differ from little2_encoding(_ns) in using iasciitab.h for the
   ASCII half of the type table and in always passing 1 as the last
   header value. */
847#if BYTEORDER != 4321
848
849#  ifdef XML_NS
850
851static const struct normal_encoding internal_little2_encoding_ns
852    = {{VTABLE, 2, 0, 1},
853       {
854#    include "iasciitab.h"
855#    include "latin1tab.h"
856       },
857       STANDARD_VTABLE(little2_) NULL_VTABLE};
858
859#  endif
860
/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT for the table. */
861static const struct normal_encoding internal_little2_encoding
862    = {{VTABLE, 2, 0, 1},
863       {
864#  define BT_COLON BT_NMSTRT
865#  include "iasciitab.h"
866#  undef BT_COLON
867#  include "latin1tab.h"
868       },
869       STANDARD_VTABLE(little2_) NULL_VTABLE};
870
871#endif
872
/* Accessors for big-endian two-byte input: p points at a two-byte unit
   whose high-order byte is p[0] and whose low-order byte is p[1].  Every
   macro argument is parenthesized so that expression arguments (such as
   `q + 2` or `a ? b : c`) expand with the intended precedence. */

/* Byte type of the unit at p: when the high byte is zero the low byte is
   looked up in the encoding's type table, otherwise the unit is classified
   by unicode_byte_type(high, low). */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]        \
       : unicode_byte_type((p)[0], (p)[1]))
/* Yields p[1] when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p has a zero high byte and a low byte equal
   to c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Name-character test via the namePages table (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
/* Name-start-character test via the nmstrtPages table. */
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
883
/* Two build variants for the big-endian two-byte tokenizer.
   With XML_MIN_SIZE the BIG2_* accessors are wrapped in small static
   functions and VTABLE is redefined to VTABLE1 plus the big2_ converters.
   Otherwise xmltok_impl.c is included with PREFIX(ident) expanding to
   big2_##ident and the generic accessor macros bound directly to the
   BIG2_* macros. */
884#ifdef XML_MIN_SIZE
885
/* Function form of BIG2_BYTE_TYPE. */
886static int PTRFASTCALL
887big2_byteType(const ENCODING *enc, const char *p) {
888  return BIG2_BYTE_TYPE(enc, p);
889}
890
/* Function form of BIG2_BYTE_TO_ASCII; enc is not consulted. */
891static int PTRFASTCALL
892big2_byteToAscii(const ENCODING *enc, const char *p) {
893  UNUSED_P(enc);
894  return BIG2_BYTE_TO_ASCII(p);
895}
896
/* Function form of BIG2_CHAR_MATCHES; enc is not consulted. */
897static int PTRCALL
898big2_charMatches(const ENCODING *enc, const char *p, int c) {
899  UNUSED_P(enc);
900  return BIG2_CHAR_MATCHES(p, c);
901}
902
/* Function form of BIG2_IS_NAME_CHAR_MINBPC; enc is not consulted. */
903static int PTRFASTCALL
904big2_isNameMin(const ENCODING *enc, const char *p) {
905  UNUSED_P(enc);
906  return BIG2_IS_NAME_CHAR_MINBPC(p);
907}
908
/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC; enc is not consulted. */
909static int PTRFASTCALL
910big2_isNmstrtMin(const ENCODING *enc, const char *p) {
911  UNUSED_P(enc);
912  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
913}
914
/* VTABLE1 is defined outside this chunk. */
915#  undef VTABLE
916#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
917
918#else /* not XML_MIN_SIZE */
919
920#  undef PREFIX
921#  define PREFIX(ident) big2_##ident
922#  define MINBPC(enc) 2
923/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
924#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
925#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
926#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The n-byte name tests are constant 0 for this encoding; only the
   MINBPC variants consult the naming tables. */
927#  define IS_NAME_CHAR(enc, p, n) 0
928#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
929#  define IS_NMSTRT_CHAR(enc, p, n) (0)
930#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
931
932#  define XML_TOK_IMPL_C
933#  include "xmltok_impl.c"
934#  undef XML_TOK_IMPL_C
935
/* Remove the per-encoding bindings again.
   NOTE(review): IS_INVALID_CHAR is not defined in this section; presumably
   it is undefined defensively - confirm. */
936#  undef MINBPC
937#  undef BYTE_TYPE
938#  undef BYTE_TO_ASCII
939#  undef CHAR_MATCHES
940#  undef IS_NAME_CHAR
941#  undef IS_NAME_CHAR_MINBPC
942#  undef IS_NMSTRT_CHAR
943#  undef IS_NMSTRT_CHAR_MINBPC
944#  undef IS_INVALID_CHAR
945
946#endif /* not XML_MIN_SIZE */
947
/* Big-endian two-byte encoding objects.
   NOTE(review): the three values after VTABLE presumably initialise
   minBytesPerChar, isUtf8 and isUtf16 - confirm against struct encoding.
   The last one is 1 only when BYTEORDER == 4321. */
948#ifdef XML_NS
949
/* Namespace variant: asciitab.h is included with BT_COLON left untouched. */
950static const struct normal_encoding big2_encoding_ns
951    = {{VTABLE, 2, 0,
952#  if BYTEORDER == 4321
953        1
954#  else
955        0
956#  endif
957       },
958       {
959#  include "asciitab.h"
960#  include "latin1tab.h"
961       },
962       STANDARD_VTABLE(big2_) NULL_VTABLE};
963
964#endif
965
/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while asciitab.h
   is included, so the colon is typed as an ordinary name-start character. */
966static const struct normal_encoding big2_encoding
967    = {{VTABLE, 2, 0,
968#if BYTEORDER == 4321
969        1
970#else
971        0
972#endif
973       },
974       {
975#define BT_COLON BT_NMSTRT
976#include "asciitab.h"
977#undef BT_COLON
978#include "latin1tab.h"
979       },
980       STANDARD_VTABLE(big2_) NULL_VTABLE};
981
/* "Internal" big-endian encodings, compiled unless BYTEORDER == 1234.
   They differ from big2_encoding(_ns) in using iasciitab.h for the ASCII
   half of the type table and in always passing 1 as the last header
   value. */
982#if BYTEORDER != 1234
983
984#  ifdef XML_NS
985
986static const struct normal_encoding internal_big2_encoding_ns
987    = {{VTABLE, 2, 0, 1},
988       {
989#    include "iasciitab.h"
990#    include "latin1tab.h"
991       },
992       STANDARD_VTABLE(big2_) NULL_VTABLE};
993
994#  endif
995
/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT for the table. */
996static const struct normal_encoding internal_big2_encoding
997    = {{VTABLE, 2, 0, 1},
998       {
999#  define BT_COLON BT_NMSTRT
1000#  include "iasciitab.h"
1001#  undef BT_COLON
1002#  include "latin1tab.h"
1003       },
1004       STANDARD_VTABLE(big2_) NULL_VTABLE};
1005
1006#endif
1007
/* No further tokenizer instantiations follow in this section. */
1008#undef PREFIX
1009
1010static int FASTCALL
1011streqci(const char *s1, const char *s2) {
1012  for (;;) {
1013    char c1 = *s1++;
1014    char c2 = *s2++;
1015    if (ASCII_a <= c1 && c1 <= ASCII_z)
1016      c1 += ASCII_A - ASCII_a;
1017    if (ASCII_a <= c2 && c2 <= ASCII_z)
1018      /* The following line will never get executed.  streqci() is
1019       * only called from two places, both of which guarantee to put
1020       * upper-case strings into s2.
1021       */
1022      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1023    if (c1 != c2)
1024      return 0;
1025    if (! c1)
1026      break;
1027  }
1028  return 1;
1029}
1030
1031static void PTRCALL
1032initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1033                   POSITION *pos) {
1034  UNUSED_P(enc);
1035  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1036}
1037
1038static int
1039toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1040  char buf[1];
1041  char *p = buf;
1042  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1043  if (p == buf)
1044    return -1;
1045  else
1046    return buf[0];
1047}
1048
1049static int FASTCALL
1050isSpace(int c) {
1051  switch (c) {
1052  case 0x20:
1053  case 0xD:
1054  case 0xA:
1055  case 0x9:
1056    return 1;
1057  }
1058  return 0;
1059}
1060
1061/* Return 1 if there's just optional white space or there's an S
1062   followed by name=val.
1063*/
1064static int
1065parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1066                     const char **namePtr, const char **nameEndPtr,
1067                     const char **valPtr, const char **nextTokPtr) {
1068  int c;
1069  char open;
1070  if (ptr == end) {
1071    *namePtr = NULL;
1072    return 1;
1073  }
1074  if (! isSpace(toAscii(enc, ptr, end))) {
1075    *nextTokPtr = ptr;
1076    return 0;
1077  }
1078  do {
1079    ptr += enc->minBytesPerChar;
1080  } while (isSpace(toAscii(enc, ptr, end)));
1081  if (ptr == end) {
1082    *namePtr = NULL;
1083    return 1;
1084  }
1085  *namePtr = ptr;
1086  for (;;) {
1087    c = toAscii(enc, ptr, end);
1088    if (c == -1) {
1089      *nextTokPtr = ptr;
1090      return 0;
1091    }
1092    if (c == ASCII_EQUALS) {
1093      *nameEndPtr = ptr;
1094      break;
1095    }
1096    if (isSpace(c)) {
1097      *nameEndPtr = ptr;
1098      do {
1099        ptr += enc->minBytesPerChar;
1100      } while (isSpace(c = toAscii(enc, ptr, end)));
1101      if (c != ASCII_EQUALS) {
1102        *nextTokPtr = ptr;
1103        return 0;
1104      }
1105      break;
1106    }
1107    ptr += enc->minBytesPerChar;
1108  }
1109  if (ptr == *namePtr) {
1110    *nextTokPtr = ptr;
1111    return 0;
1112  }
1113  ptr += enc->minBytesPerChar;
1114  c = toAscii(enc, ptr, end);
1115  while (isSpace(c)) {
1116    ptr += enc->minBytesPerChar;
1117    c = toAscii(enc, ptr, end);
1118  }
1119  if (c != ASCII_QUOT && c != ASCII_APOS) {
1120    *nextTokPtr = ptr;
1121    return 0;
1122  }
1123  open = (char)c;
1124  ptr += enc->minBytesPerChar;
1125  *valPtr = ptr;
1126  for (;; ptr += enc->minBytesPerChar) {
1127    c = toAscii(enc, ptr, end);
1128    if (c == open)
1129      break;
1130    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1131        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1132        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1133      *nextTokPtr = ptr;
1134      return 0;
1135    }
1136  }
1137  *nextTokPtr = ptr + enc->minBytesPerChar;
1138  return 1;
1139}
1140
1141static const char KW_version[]
1142    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1143
1144static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1145                                   ASCII_i, ASCII_n, ASCII_g, '\0'};
1146
1147static const char KW_standalone[]
1148    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1149       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1150
1151static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1152
1153static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1154
1155static int
1156doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1157                                                 const char *),
1158               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1159               const char *end, const char **badPtr, const char **versionPtr,
1160               const char **versionEndPtr, const char **encodingName,
1161               const ENCODING **encoding, int *standalone) {
1162  const char *val = NULL;
1163  const char *name = NULL;
1164  const char *nameEnd = NULL;
1165  ptr += 5 * enc->minBytesPerChar;
1166  end -= 2 * enc->minBytesPerChar;
1167  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1168      || ! name) {
1169    *badPtr = ptr;
1170    return 0;
1171  }
1172  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1173    if (! isGeneralTextEntity) {
1174      *badPtr = name;
1175      return 0;
1176    }
1177  } else {
1178    if (versionPtr)
1179      *versionPtr = val;
1180    if (versionEndPtr)
1181      *versionEndPtr = ptr;
1182    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1183      *badPtr = ptr;
1184      return 0;
1185    }
1186    if (! name) {
1187      if (isGeneralTextEntity) {
1188        /* a TextDecl must have an EncodingDecl */
1189        *badPtr = ptr;
1190        return 0;
1191      }
1192      return 1;
1193    }
1194  }
1195  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1196    int c = toAscii(enc, val, end);
1197    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1198      *badPtr = val;
1199      return 0;
1200    }
1201    if (encodingName)
1202      *encodingName = val;
1203    if (encoding)
1204      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1205    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1206      *badPtr = ptr;
1207      return 0;
1208    }
1209    if (! name)
1210      return 1;
1211  }
1212  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1213      || isGeneralTextEntity) {
1214    *badPtr = name;
1215    return 0;
1216  }
1217  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1218    if (standalone)
1219      *standalone = 1;
1220  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1221    if (standalone)
1222      *standalone = 0;
1223  } else {
1224    *badPtr = val;
1225    return 0;
1226  }
1227  while (isSpace(toAscii(enc, ptr, end)))
1228    ptr += enc->minBytesPerChar;
1229  if (ptr != end) {
1230    *badPtr = ptr;
1231    return 0;
1232  }
1233  return 1;
1234}
1235
1236static int FASTCALL
1237checkCharRefNumber(int result) {
1238  switch (result >> 8) {
1239  case 0xD8:
1240  case 0xD9:
1241  case 0xDA:
1242  case 0xDB:
1243  case 0xDC:
1244  case 0xDD:
1245  case 0xDE:
1246  case 0xDF:
1247    return -1;
1248  case 0:
1249    if (latin1_encoding.type[result] == BT_NONXML)
1250      return -1;
1251    break;
1252  case 0xFF:
1253    if (result == 0xFFFE || result == 0xFFFF)
1254      return -1;
1255    break;
1256  }
1257  return result;
1258}
1259
1260int FASTCALL
1261XmlUtf8Encode(int c, char *buf) {
1262  enum {
1263    /* minN is minimum legal resulting value for N byte sequence */
1264    min2 = 0x80,
1265    min3 = 0x800,
1266    min4 = 0x10000
1267  };
1268
1269  if (c < 0)
1270    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1271  if (c < min2) {
1272    buf[0] = (char)(c | UTF8_cval1);
1273    return 1;
1274  }
1275  if (c < min3) {
1276    buf[0] = (char)((c >> 6) | UTF8_cval2);
1277    buf[1] = (char)((c & 0x3f) | 0x80);
1278    return 2;
1279  }
1280  if (c < min4) {
1281    buf[0] = (char)((c >> 12) | UTF8_cval3);
1282    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1283    buf[2] = (char)((c & 0x3f) | 0x80);
1284    return 3;
1285  }
1286  if (c < 0x110000) {
1287    buf[0] = (char)((c >> 18) | UTF8_cval4);
1288    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1289    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1290    buf[3] = (char)((c & 0x3f) | 0x80);
1291    return 4;
1292  }
1293  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1294}
1295
1296int FASTCALL
1297XmlUtf16Encode(int charNum, unsigned short *buf) {
1298  if (charNum < 0)
1299    return 0;
1300  if (charNum < 0x10000) {
1301    buf[0] = (unsigned short)charNum;
1302    return 1;
1303  }
1304  if (charNum < 0x110000) {
1305    charNum -= 0x10000;
1306    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1307    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1308    return 2;
1309  }
1310  return 0;
1311}
1312
/* Encoding object filled in at run time by XmlInitUnknownEncoding() from
   a caller-supplied 256-entry table. */
1313struct unknown_encoding {
1314  struct normal_encoding normal; /* starts as a copy of latin1_encoding */
1315  CONVERTER convert;             /* called for bytes with no cached value */
1316  void *userData;                /* passed as first argument to convert */
  /* Per-byte UTF-16 value; 0 means "call convert" (see unknown_toUtf16). */
1317  unsigned short utf16[256];
  /* Per-byte UTF-8 form: [0] is the byte count and the bytes follow; a
     count of 0 means "call convert" (see unknown_toUtf8). */
1318  char utf8[256][4];
1319};
1320
/* View a generic encoding pointer as a read-only unknown_encoding. */
1321#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1322
1323int
1324XmlSizeOfUnknownEncoding(void) {
1325  return sizeof(struct unknown_encoding);
1326}
1327
1328static int PTRFASTCALL
1329unknown_isName(const ENCODING *enc, const char *p) {
1330  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1331  int c = uenc->convert(uenc->userData, p);
1332  if (c & ~0xFFFF)
1333    return 0;
1334  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1335}
1336
1337static int PTRFASTCALL
1338unknown_isNmstrt(const ENCODING *enc, const char *p) {
1339  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1340  int c = uenc->convert(uenc->userData, p);
1341  if (c & ~0xFFFF)
1342    return 0;
1343  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1344}
1345
1346static int PTRFASTCALL
1347unknown_isInvalid(const ENCODING *enc, const char *p) {
1348  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1349  int c = uenc->convert(uenc->userData, p);
1350  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1351}
1352
1353static enum XML_Convert_Result PTRCALL
1354unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1355               char **toP, const char *toLim) {
1356  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1357  char buf[XML_UTF8_ENCODE_MAX];
1358  for (;;) {
1359    const char *utf8;
1360    int n;
1361    if (*fromP == fromLim)
1362      return XML_CONVERT_COMPLETED;
1363    utf8 = uenc->utf8[(unsigned char)**fromP];
1364    n = *utf8++;
1365    if (n == 0) {
1366      int c = uenc->convert(uenc->userData, *fromP);
1367      n = XmlUtf8Encode(c, buf);
1368      if (n > toLim - *toP)
1369        return XML_CONVERT_OUTPUT_EXHAUSTED;
1370      utf8 = buf;
1371      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1372                 - (BT_LEAD2 - 2));
1373    } else {
1374      if (n > toLim - *toP)
1375        return XML_CONVERT_OUTPUT_EXHAUSTED;
1376      (*fromP)++;
1377    }
1378    memcpy(*toP, utf8, n);
1379    *toP += n;
1380  }
1381}
1382
1383static enum XML_Convert_Result PTRCALL
1384unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1385                unsigned short **toP, const unsigned short *toLim) {
1386  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1387  while (*fromP < fromLim && *toP < toLim) {
1388    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1389    if (c == 0) {
1390      c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1391      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1392                 - (BT_LEAD2 - 2));
1393    } else
1394      (*fromP)++;
1395    *(*toP)++ = c;
1396  }
1397
1398  if ((*toP == toLim) && (*fromP < fromLim))
1399    return XML_CONVERT_OUTPUT_EXHAUSTED;
1400  else
1401    return XML_CONVERT_COMPLETED;
1402}
1403
1404ENCODING *
1405XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1406                       void *userData) {
1407  int i;
1408  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1409  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1410  for (i = 0; i < 128; i++)
1411    if (latin1_encoding.type[i] != BT_OTHER
1412        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1413      return 0;
1414  for (i = 0; i < 256; i++) {
1415    int c = table[i];
1416    if (c == -1) {
1417      e->normal.type[i] = BT_MALFORM;
1418      /* This shouldn't really get used. */
1419      e->utf16[i] = 0xFFFF;
1420      e->utf8[i][0] = 1;
1421      e->utf8[i][1] = 0;
1422    } else if (c < 0) {
1423      if (c < -4)
1424        return 0;
1425      /* Multi-byte sequences need a converter function */
1426      if (! convert)
1427        return 0;
1428      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1429      e->utf8[i][0] = 0;
1430      e->utf16[i] = 0;
1431    } else if (c < 0x80) {
1432      if (latin1_encoding.type[c] != BT_OTHER
1433          && latin1_encoding.type[c] != BT_NONXML && c != i)
1434        return 0;
1435      e->normal.type[i] = latin1_encoding.type[c];
1436      e->utf8[i][0] = 1;
1437      e->utf8[i][1] = (char)c;
1438      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1439    } else if (checkCharRefNumber(c) < 0) {
1440      e->normal.type[i] = BT_NONXML;
1441      /* This shouldn't really get used. */
1442      e->utf16[i] = 0xFFFF;
1443      e->utf8[i][0] = 1;
1444      e->utf8[i][1] = 0;
1445    } else {
1446      if (c > 0xFFFF)
1447        return 0;
1448      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1449        e->normal.type[i] = BT_NMSTRT;
1450      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1451        e->normal.type[i] = BT_NAME;
1452      else
1453        e->normal.type[i] = BT_OTHER;
1454      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1455      e->utf16[i] = (unsigned short)c;
1456    }
1457  }
1458  e->userData = userData;
1459  e->convert = convert;
1460  if (convert) {
1461    e->normal.isName2 = unknown_isName;
1462    e->normal.isName3 = unknown_isName;
1463    e->normal.isName4 = unknown_isName;
1464    e->normal.isNmstrt2 = unknown_isNmstrt;
1465    e->normal.isNmstrt3 = unknown_isNmstrt;
1466    e->normal.isNmstrt4 = unknown_isNmstrt;
1467    e->normal.isInvalid2 = unknown_isInvalid;
1468    e->normal.isInvalid3 = unknown_isInvalid;
1469    e->normal.isInvalid4 = unknown_isInvalid;
1470  }
1471  e->normal.enc.utf8Convert = unknown_toUtf8;
1472  e->normal.enc.utf16Convert = unknown_toUtf16;
1473  return &(e->normal.enc);
1474}
1475
1476/* If this enumeration is changed, getEncodingIndex and encodings
   must also be changed.

   Indices of the built-in encodings.  ISO_8859_1_ENC through UTF_16LE_ENC
   are the positions of the matching names in getEncodingIndex()'s
   encodingNames array.  getEncodingIndex() returns NO_ENC for a NULL name
   and UNKNOWN_ENC for a name it does not recognise.  (`encodings` is
   defined outside this chunk.) */
1478enum {
1479  UNKNOWN_ENC = -1,
1480  ISO_8859_1_ENC = 0,
1481  US_ASCII_ENC,
1482  UTF_8_ENC,
1483  UTF_16_ENC,
1484  UTF_16BE_ENC,
1485  UTF_16LE_ENC,
1486  /* must match encodingNames up to here */
1487  NO_ENC
1488};
1489
1490static const char KW_ISO_8859_1[]
1491    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1492       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1493static const char KW_US_ASCII[]
1494    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1495       ASCII_C, ASCII_I, ASCII_I,     '\0'};
1496static const char KW_UTF_8[]
1497    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1498static const char KW_UTF_16[]
1499    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1500static const char KW_UTF_16BE[]
1501    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1502       ASCII_6, ASCII_B, ASCII_E, '\0'};
1503static const char KW_UTF_16LE[]
1504    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1505       ASCII_6, ASCII_L, ASCII_E, '\0'};
1506
1507static int FASTCALL
1508getEncodingIndex(const char *name) {
1509  static const char *const encodingNames[] = {
1510      KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1511  };
1512  int i;
1513  if (name == NULL)
1514    return NO_ENC;
1515  for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1516    if (streqci(name, encodingNames[i]))
1517      return i;
1518  return UNKNOWN_ENC;
1519}
1520
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int.  SET_INIT_ENC_INDEX
   stores i converted to char; i is parenthesized so that the cast applies
   to the whole argument expression rather than to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1527
1528/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The first one or two bytes of [ptr, end) are inspected.  Once a
   decision is made, the chosen encoding is stored through enc->encPtr.
   The function then either returns XML_TOK_BOM, with *nextTokPtr set just
   past the byte order mark, or hands the input to XmlTok() with the chosen
   encoding and returns its result.  Returns XML_TOK_NONE for empty input
   and XML_TOK_PARTIAL when more bytes are needed to decide.
*/
1534
1535static int
1536initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1537         int state, const char *ptr, const char *end, const char **nextTokPtr) {
1538  const ENCODING **encPtr;
1539
1540  if (ptr >= end)
1541    return XML_TOK_NONE;
1542  encPtr = enc->encPtr;
1543  if (ptr + 1 == end) {
1544    /* only a single byte available for auto-detection */
1545#ifndef XML_DTD /* FIXME */
1546    /* a well-formed document entity must have more than one byte */
1547    if (state != XML_CONTENT_STATE)
1548      return XML_TOK_PARTIAL;
1549#endif
1550    /* so we're parsing an external text entity... */
1551    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1552    switch (INIT_ENC_INDEX(enc)) {
1553    case UTF_16_ENC:
1554    case UTF_16LE_ENC:
1555    case UTF_16BE_ENC:
1556      return XML_TOK_PARTIAL;
1557    }
    /* Bytes that could start a BOM, a UTF-16 unit or '<' need a second
       byte before anything can be decided. */
1558    switch ((unsigned char)*ptr) {
1559    case 0xFE:
1560    case 0xFF:
1561    case 0xEF: /* possibly first byte of UTF-8 BOM */
1562      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1563        break;
1564      /* fall through */
1565    case 0x00:
1566    case 0x3C:
1567      return XML_TOK_PARTIAL;
1568    }
1569  } else {
    /* Dispatch on the first two bytes, combined most significant first. */
1570    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1571    case 0xFEFF:
      /* Bytes FE FF: big-endian byte order mark. */
1572      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1573        break;
1574      *nextTokPtr = ptr + 2;
1575      *encPtr = encodingTable[UTF_16BE_ENC];
1576      return XML_TOK_BOM;
1577    /* 00 3C is handled in the default case */
1578    case 0x3C00:
      /* Bytes 3C 00: '<' as a little-endian two-byte unit. */
1579      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1580           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1581          && state == XML_CONTENT_STATE)
1582        break;
1583      *encPtr = encodingTable[UTF_16LE_ENC];
1584      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1585    case 0xFFFE:
      /* Bytes FF FE: little-endian byte order mark. */
1586      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1587        break;
1588      *nextTokPtr = ptr + 2;
1589      *encPtr = encodingTable[UTF_16LE_ENC];
1590      return XML_TOK_BOM;
1591    case 0xEFBB:
1592      /* Maybe a UTF-8 BOM (EF BB BF) */
1593      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
1599      if (state == XML_CONTENT_STATE) {
1600        int e = INIT_ENC_INDEX(enc);
1601        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1602            || e == UTF_16_ENC)
1603          break;
1604      }
      /* The third BOM byte has not arrived yet. */
1605      if (ptr + 2 == end)
1606        return XML_TOK_PARTIAL;
1607      if ((unsigned char)ptr[2] == 0xBF) {
1608        *nextTokPtr = ptr + 3;
1609        *encPtr = encodingTable[UTF_8_ENC];
1610        return XML_TOK_BOM;
1611      }
1612      break;
1613    default:
1614      if (ptr[0] == '\0') {
1615        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
1621        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1622          break;
1623        *encPtr = encodingTable[UTF_16BE_ENC];
1624        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1625      } else if (ptr[1] == '\0') {
1626        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
1635        if (state == XML_CONTENT_STATE)
1636          break;
1637        *encPtr = encodingTable[UTF_16LE_ENC];
1638        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1639      }
1640      break;
1641    }
1642  }
  /* Nothing detected: use the encoding given at initialisation. */
1643  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1644  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1645}
1646
/* Instantiate xmltok_ns.c, which is written in terms of the NS() and ns()
   name-decorating macros.  The first inclusion leaves names undecorated. */
1647#define NS(x) x
1648#define ns(x) x
1649#define XML_TOK_NS_C
1650#include "xmltok_ns.c"
1651#undef XML_TOK_NS_C
1652#undef NS
1653#undef ns
1654
/* With XML_NS the template is instantiated a second time: NS(x) appends
   "NS" and ns(x) appends "_ns" to each name.  This conditional is closed
   after XmlInitUnknownEncodingNS below. */
1655#ifdef XML_NS
1656
1657#  define NS(x) x##NS
1658#  define ns(x) x##_ns
1659
1660#  define XML_TOK_NS_C
1661#  include "xmltok_ns.c"
1662#  undef XML_TOK_NS_C
1663
1664#  undef NS
1665#  undef ns
1666
1667ENCODING *
1668XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1669                         void *userData) {
1670  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1671  if (enc)
1672    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1673  return enc;
1674}
1675
1676#endif /* XML_NS */
1677