1/*
2                            __  __            _
3                         ___\ \/ /_ __   __ _| |_
4                        / _ \\  /| '_ \ / _` | __|
5                       |  __//  \| |_) | (_| | |_
6                        \___/_/\_\ .__/ \__,_|\__|
7                                 |_| XML parser
8
9   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10   Copyright (c) 2000-2017 Expat development team
11   Licensed under the MIT license:
12
13   Permission is  hereby granted,  free of charge,  to any  person obtaining
14   a  copy  of  this  software   and  associated  documentation  files  (the
15   "Software"),  to  deal in  the  Software  without restriction,  including
16   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
17   distribute, sublicense, and/or sell copies of the Software, and to permit
18   persons  to whom  the Software  is  furnished to  do so,  subject to  the
19   following conditions:
20
21   The above copyright  notice and this permission notice  shall be included
22   in all copies or substantial portions of the Software.
23
24   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
25   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
26   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
29   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30   USE OR OTHER DEALINGS IN THE SOFTWARE.
31*/
32
33#include <stddef.h>
34#include <string.h> /* memcpy */
35
36#if defined(_MSC_VER) && (_MSC_VER <= 1700)
37/* for vs2012/11.0/1700 and earlier Visual Studio compilers */
38#  define bool int
39#  define false 0
40#  define true 1
41#else
42#  include <stdbool.h>
43#endif
44
45#ifdef _WIN32
46#  include "winconfig.h"
47#else
48#  ifdef HAVE_EXPAT_CONFIG_H
49#    include <expat_config.h>
50#  endif
51#endif /* ndef _WIN32 */
52
53#include "expat_external.h"
54#include "internal.h"
55#include "xmltok.h"
56#include "nametab.h"
57
/* Optional extra scanner slot: with XML_DTD defined this appends the
   PREFIX()-named ignoreSectionTok function to the scanner list in VTABLE1;
   without it the macro expands to nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment made of PREFIX()-named functions: a braced group of
   content scanners (prologTok, contentTok, cdataSectionTok and, optionally,
   ignoreSectionTok), a braced group of literal scanners (attributeValueTok,
   entityValueTok), then the remaining per-encoding helper functions.
   PREFIX must be defined at the point where this macro is expanded. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-named toUtf8 and toUtf16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
73
/* Naming-table lookup for a 16-bit character given as high byte hi and low
   byte lo: pages[hi] selects a group of 8 words in namingBitmap, the top
   3 bits of lo select the word within the group and the bottom 5 bits of lo
   select the bit.  Evaluates to non-zero when that bit is set.
   All arguments are parenthesized so that expressions may be passed safely;
   note that lo is evaluated twice.
*/
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   byte must point to unsigned char; it is evaluated several times.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.  byte must point to unsigned char; it is evaluated several times.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
   in the naming table (p is reinterpreted as unsigned bytes); any other
   length yields 0.
*/
#define UTF8_GET_NAMING(pages, p, n)                                           \
  ((n) == 2                                                                    \
       ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p))                   \
       : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))
102
103/* Detection of invalid UTF-8 sequences is based on Table 3.1B
104   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
105   with the additional restriction of not allowing the Unicode
106   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
107   Implementation details:
108     (A & 0x80) == 0     means A < 0x80
109   and
110     (A & 0xC0) == 0xC0  means A > 0xBF
111*/
112
/* Each macro takes a pointer to unsigned char positioned at the lead byte
   and evaluates to non-zero when the sequence is NOT acceptable.  The
   argument is always used as (p)[i], so pointer expressions such as
   "buf + 1" are handled correctly; it is evaluated more than once. */

/* 2-byte sequence: lead byte must be >= 0xC2 (no overlong forms) and the
   second byte must be a continuation byte (0x80..0xBF). */
#define UTF8_INVALID2(p)                                                       \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3-byte sequence: third byte must be a continuation byte, and EF,BF,BE /
   EF,BF,BF (U+FFFE / U+FFFF) are rejected; the second byte must be
   0xA0..0xBF after E0 (no overlong forms), 0x80..0x9F after ED (no
   surrogates) and 0x80..0xBF otherwise. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((p)[0] == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* 4-byte sequence: third and fourth bytes must be continuation bytes; the
   second byte must be 0x90..0xBF after F0 (no overlong forms), 0x80..0x8F
   after F4 (nothing above U+10FFFF) and 0x80..0xBF otherwise. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((p)[0] == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((p)[0] == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
132
133static int PTRFASTCALL
134isNever(const ENCODING *enc, const char *p) {
135  UNUSED_P(enc);
136  UNUSED_P(p);
137  return 0;
138}
139
140static int PTRFASTCALL
141utf8_isName2(const ENCODING *enc, const char *p) {
142  UNUSED_P(enc);
143  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
144}
145
146static int PTRFASTCALL
147utf8_isName3(const ENCODING *enc, const char *p) {
148  UNUSED_P(enc);
149  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
150}
151
152#define utf8_isName4 isNever
153
154static int PTRFASTCALL
155utf8_isNmstrt2(const ENCODING *enc, const char *p) {
156  UNUSED_P(enc);
157  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
158}
159
160static int PTRFASTCALL
161utf8_isNmstrt3(const ENCODING *enc, const char *p) {
162  UNUSED_P(enc);
163  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
164}
165
166#define utf8_isNmstrt4 isNever
167
168static int PTRFASTCALL
169utf8_isInvalid2(const ENCODING *enc, const char *p) {
170  UNUSED_P(enc);
171  return UTF8_INVALID2((const unsigned char *)p);
172}
173
174static int PTRFASTCALL
175utf8_isInvalid3(const ENCODING *enc, const char *p) {
176  UNUSED_P(enc);
177  return UTF8_INVALID3((const unsigned char *)p);
178}
179
180static int PTRFASTCALL
181utf8_isInvalid4(const ENCODING *enc, const char *p) {
182  UNUSED_P(enc);
183  return UTF8_INVALID4((const unsigned char *)p);
184}
185
/* An ENCODING extended with a per-byte classification table and the
   multi-byte character predicates used by the tokenizer.  The ENCODING is
   the first member, which is what lets AS_NORMAL_ENCODING() cast an
   ENCODING pointer to this type. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type (a BT_* value) for each possible byte value. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* With XML_MIN_SIZE the byte-level operations are reached through these
     pointers (see BYTE_TYPE, IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC,
     BYTE_TO_ASCII and CHAR_MATCHES) instead of per-encoding macros. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name character tests for 2, 3 and 4 byte sequences (IS_NAME_CHAR). */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  /* Name start character tests for 2, 3 and 4 byte sequences
     (IS_NMSTRT_CHAR). */
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  /* Malformed-sequence tests for 2, 3 and 4 byte sequences
     (IS_INVALID_CHAR); non-zero means invalid. */
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
206
/* View an ENCODING pointer as the (read-only) normal_encoding that
   contains it. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

#ifdef XML_MIN_SIZE

/* Initializers for the five XML_MIN_SIZE-only function pointers of
   struct normal_encoding, named with the prefix E; note the trailing comma,
   so that NORMAL_VTABLE/NULL_VTABLE can follow directly. */
#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte predicate pointers of
   struct normal_encoding, named with the prefix E. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots, all left NULL; used by the encoding tables that pass no
   multi-byte predicates. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL

/* Forward declaration; the definition is not in this part of the file. */
static int FASTCALL checkCharRefNumber(int);
230
231#include "xmltok_impl.h"
232#include "ascii.h"
233
/* In XML_MIN_SIZE builds the single-byte ("sb_") encodings fill the
   isNameMin/isNmstrtMin slots with isNever, i.e. they always answer 0. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

/* MINBPC: read from the encoding at run time in XML_MIN_SIZE builds,
   otherwise the compile-time constant 1 for this instantiation. */
#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif
245
/* Byte type of the single byte at p: the byte value (taken as unsigned)
   indexes the type[] table of the normal_encoding that enc points into.
   The table is only read, so the encoding pointer keeps its const
   qualification (consistent with AS_NORMAL_ENCODING). */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, installed in the byteType slot of the
   single-byte encodings. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
/* XML_MIN_SIZE: classify through the encoding's byteType pointer. */
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
/* Otherwise: classify with a direct table lookup. */
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
258
/* BYTE_TO_ASCII: value of the character at p for comparison against ASCII
   constants.  For this single-byte instantiation that is simply the byte
   itself; XML_MIN_SIZE builds reach it through the byteToAscii pointer. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
/* Single-byte implementation for the byteToAscii slot: returns *p. */
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
269
/* Multi-byte predicates: n (2, 3 or 4) is pasted onto the member name, so
   e.g. IS_NAME_CHAR(enc, p, 2) calls the encoding's isName2 pointer. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
#define IS_INVALID_CHAR(enc, p, n)                                             \
  (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))

/* Predicates for a character of exactly MINBPC bytes: dispatched through
   the encoding in XML_MIN_SIZE builds, constant 0 for this instantiation
   otherwise. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
284
/* CHAR_MATCHES(enc, p, c): non-zero when the character at p equals the
   ASCII character c.  XML_MIN_SIZE builds dispatch through the encoding's
   charMatches pointer; otherwise the byte is compared directly. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
/* Single-byte implementation for the charMatches slot. */
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character; it is parenthesized so that an expression
   argument cannot regroup with the comparison. */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
297
/* Instantiate the tokenizer template for the single-byte table-driven
   encodings: xmltok_impl.c is included with PREFIX() producing normal_*
   names and with the byte-level macros defined above in effect.
   XML_TOK_IMPL_C is defined only for the duration of the include. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the per-instantiation macros so that a later instantiation can
   define its own versions.  PREFIX is intentionally left defined. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
312
/* UTF8_cvalN is the value of the masked first byte of an N byte UTF-8
   sequence, i.e. the marker bits that are OR-ed into the lead byte. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xc0, /* 110xxxxx */
  UTF8_cval3 = 0xe0, /* 1110xxxx */
  UTF8_cval4 = 0xf0  /* 11110xxx */
};
319
/* Move *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards from.  An ASCII byte stops the
   scan immediately.  A lead byte announcing an N byte character stops the
   scan as well if enough bytes follow it, in which case the limit is put
   back right after that character; otherwise the lead byte is cut off and
   the scan carries on with the byte before it.  Continuation bytes (and
   bytes 0xF8..0xFF) are merely stepped over.

   from        start of the input range
   fromLimRef  in: end of the input range; out: the possibly reduced end */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t trailing = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen = 0; /* stays 0 for anything that is not a lead byte */

    if ((last & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((last & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead byte 0b11110xxx */
    } else if ((last & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead byte 0b1110xxxx */
    } else if ((last & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead byte 0b110xxxxx */
    }

    if (seqLen != 0) {
      if (trailing + 1 >= seqLen) {
        /* Character is complete: keep it in the range. */
        end += seqLen - 1;
        break;
      }
      /* Character is incomplete: restart the count; the lead byte itself
         is dropped by the step below. */
      trailing = 0;
    }

    end--;
    trailing++;
  }

  *fromLimRef = end;
}
358
359static enum XML_Convert_Result PTRCALL
360utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
361            char **toP, const char *toLim) {
362  bool input_incomplete = false;
363  bool output_exhausted = false;
364
365  /* Avoid copying partial characters (due to limited space). */
366  const ptrdiff_t bytesAvailable = fromLim - *fromP;
367  const ptrdiff_t bytesStorable = toLim - *toP;
368  UNUSED_P(enc);
369  if (bytesAvailable > bytesStorable) {
370    fromLim = *fromP + bytesStorable;
371    output_exhausted = true;
372  }
373
374  /* Avoid copying partial characters (from incomplete input). */
375  {
376    const char *const fromLimBefore = fromLim;
377    _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
378    if (fromLim < fromLimBefore) {
379      input_incomplete = true;
380    }
381  }
382
383  {
384    const ptrdiff_t bytesToCopy = fromLim - *fromP;
385    memcpy(*toP, *fromP, bytesToCopy);
386    *fromP += bytesToCopy;
387    *toP += bytesToCopy;
388  }
389
390  if (output_exhausted) /* needs to go first */
391    return XML_CONVERT_OUTPUT_EXHAUSTED;
392  else if (input_incomplete)
393    return XML_CONVERT_INPUT_INCOMPLETE;
394  else
395    return XML_CONVERT_COMPLETED;
396}
397
398static enum XML_Convert_Result PTRCALL
399utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
400             unsigned short **toP, const unsigned short *toLim) {
401  enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
402  unsigned short *to = *toP;
403  const char *from = *fromP;
404  while (from < fromLim && to < toLim) {
405    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
406    case BT_LEAD2:
407      if (fromLim - from < 2) {
408        res = XML_CONVERT_INPUT_INCOMPLETE;
409        goto after;
410      }
411      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
412      from += 2;
413      break;
414    case BT_LEAD3:
415      if (fromLim - from < 3) {
416        res = XML_CONVERT_INPUT_INCOMPLETE;
417        goto after;
418      }
419      *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
420                               | (from[2] & 0x3f));
421      from += 3;
422      break;
423    case BT_LEAD4: {
424      unsigned long n;
425      if (toLim - to < 2) {
426        res = XML_CONVERT_OUTPUT_EXHAUSTED;
427        goto after;
428      }
429      if (fromLim - from < 4) {
430        res = XML_CONVERT_INPUT_INCOMPLETE;
431        goto after;
432      }
433      n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
434          | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
435      n -= 0x10000;
436      to[0] = (unsigned short)((n >> 10) | 0xD800);
437      to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
438      to += 2;
439      from += 4;
440    } break;
441    default:
442      *to++ = *from++;
443      break;
444    }
445  }
446  if (from < fromLim)
447    res = XML_CONVERT_OUTPUT_EXHAUSTED;
448after:
449  *fromP = from;
450  *toP = to;
451  return res;
452}
453
/* Encoding tables for UTF-8.  Each one combines the normal_* tokenizer
   functions (VTABLE1) with the UTF-8 converters, a 256-entry byte type
   table assembled from two included headers, and the utf8_* multi-byte
   predicates.
   NOTE(review): the three integers after the converters are presumably the
   trailing ENCODING fields declared in xmltok.h (minBytesPerChar and two
   flags) -- confirm against that header. */

/* Namespace-aware variant: the table from asciitab.h is used as is. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while asciitab.h
   is included, so a colon is classified like a name start character. */
static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#ifdef XML_NS

/* "internal" namespace-aware variant: same as utf8_encoding_ns but built
   from iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

/* "internal" non-namespace variant: iasciitab.h with BT_COLON mapped to
   BT_NMSTRT. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
495
496static enum XML_Convert_Result PTRCALL
497latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
498              char **toP, const char *toLim) {
499  UNUSED_P(enc);
500  for (;;) {
501    unsigned char c;
502    if (*fromP == fromLim)
503      return XML_CONVERT_COMPLETED;
504    c = (unsigned char)**fromP;
505    if (c & 0x80) {
506      if (toLim - *toP < 2)
507        return XML_CONVERT_OUTPUT_EXHAUSTED;
508      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
509      *(*toP)++ = (char)((c & 0x3f) | 0x80);
510      (*fromP)++;
511    } else {
512      if (*toP == toLim)
513        return XML_CONVERT_OUTPUT_EXHAUSTED;
514      *(*toP)++ = *(*fromP)++;
515    }
516  }
517}
518
519static enum XML_Convert_Result PTRCALL
520latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
521               unsigned short **toP, const unsigned short *toLim) {
522  UNUSED_P(enc);
523  while (*fromP < fromLim && *toP < toLim)
524    *(*toP)++ = (unsigned char)*(*fromP)++;
525
526  if ((*toP == toLim) && (*fromP < fromLim))
527    return XML_CONVERT_OUTPUT_EXHAUSTED;
528  else
529    return XML_CONVERT_COMPLETED;
530}
531
/* Encoding tables for Latin-1: normal_* tokenizer functions, the Latin-1
   converters, a byte type table built from asciitab.h plus latin1tab.h, and
   no multi-byte predicates (NULL_VTABLE).
   NOTE(review): the three integers after the converters are presumably the
   trailing ENCODING fields declared in xmltok.h -- confirm. */
#ifdef XML_NS

/* Namespace-aware variant: asciitab.h is used as is. */
static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while asciitab.h
   is included. */
static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
553
554static enum XML_Convert_Result PTRCALL
555ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
556             char **toP, const char *toLim) {
557  UNUSED_P(enc);
558  while (*fromP < fromLim && *toP < toLim)
559    *(*toP)++ = *(*fromP)++;
560
561  if ((*toP == toLim) && (*fromP < fromLim))
562    return XML_CONVERT_OUTPUT_EXHAUSTED;
563  else
564    return XML_CONVERT_COMPLETED;
565}
566
/* Encoding tables for US-ASCII: normal_* tokenizer functions, ascii_toUtf8
   together with latin1_toUtf16 as converters, and no multi-byte predicates.
   Only asciitab.h contributes to the byte type table; the remaining
   entries are left zero-initialized, which is BT_NONXML.
   NOTE(review): the three integers after the converters are presumably the
   trailing ENCODING fields declared in xmltok.h -- confirm. */
#ifdef XML_NS

/* Namespace-aware variant: asciitab.h is used as is. */
static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while asciitab.h
   is included. */
static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
588
589static int PTRFASTCALL
590unicode_byte_type(char hi, char lo) {
591  switch ((unsigned char)hi) {
592  /* 0xD800���0xDBFF first 16-bit code unit or high surrogate (W1) */
593  case 0xD8:
594  case 0xD9:
595  case 0xDA:
596  case 0xDB:
597    return BT_LEAD4;
598  /* 0xDC00���0xDFFF second 16-bit code unit or low surrogate (W2) */
599  case 0xDC:
600  case 0xDD:
601  case 0xDE:
602  case 0xDF:
603    return BT_TRAIL;
604  case 0xFF:
605    switch ((unsigned char)lo) {
606    case 0xFF: /* noncharacter-FFFF */
607    case 0xFE: /* noncharacter-FFFE */
608      return BT_NONXML;
609    }
610    break;
611  }
612  return BT_NONASCII;
613}
614
/* Generates the function E##toUtf8, which converts UTF-16 code units from
   [*fromP, fromLim) into UTF-8 at [*toP, toLim).  GET_LO and GET_HI must be
   defined for the wanted byte order where the macro is expanded.

   The input length is first rounded down to a whole number of 2-byte
   units.  Each unit is then encoded according to its high byte:
     - 0x00 with a low byte below 0x80: 1 byte
     - 0x00..0x07 otherwise:            2 bytes
     - 0xD8..0xDB (high surrogate):     4 bytes, consuming the next unit too
     - anything else:                   3 bytes
   *toP is advanced as bytes are written.

   When the next character does not fit into the output, *fromP is set to
   the unit that could not be converted and XML_CONVERT_OUTPUT_EXHAUSTED is
   returned; a high surrogate without a following unit yields
   XML_CONVERT_INPUT_INCOMPLETE in the same way (the output check comes
   first).  Otherwise *fromP is set to the end of the consumed input. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
691
/* Generates the function E##toUtf16, which copies UTF-16 code units from
   the byte range [*fromP, fromLim) into unsigned shorts at [*toP, toLim),
   assembling each unit as (GET_HI << 8) | GET_LO.  GET_LO and GET_HI must
   be defined for the wanted byte order where the macro is expanded.

   The input length is first rounded down to a whole number of 2-byte
   units.  If the input holds more units than the output has room for and
   the last input unit has a high byte in 0xD8..0xDF, that unit is held
   back and the result becomes XML_CONVERT_INPUT_INCOMPLETE.  When the
   output fills up while input remains, XML_CONVERT_OUTPUT_EXHAUSTED is
   returned instead.  Both cursors are advanced past what was copied. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
712
/* Little-endian byte order: the low byte of a 16-bit unit is stored first.
   SET2 stores the unit ch at ptr, GET_LO/GET_HI read its two bytes. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Defines little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian byte order: the high byte of a 16-bit unit is stored first. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Defines big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
734
/* Byte-level helpers for little-endian UTF-16: (p)[0] is the low byte and
   (p)[1] the high byte of the code unit at p.  Every use of a macro
   argument is parenthesized so that pointer expressions and expression
   arguments are handled correctly; p is evaluated more than once. */

/* Byte type of the unit at p: units with a zero high byte are looked up in
   the encoding's type[] table (read-only access), all others are
   classified by unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0                                                                 \
       ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)]    \
       : unicode_byte_type((p)[1], (p)[0]))
/* Low byte of the unit when its high byte is zero, -1 otherwise. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the unit at p is exactly the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-table lookups (name / name start) for the unit at p. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
744
/* Tokenizer support for little-endian UTF-16.
   With XML_MIN_SIZE, no second copy of the tokenizer is generated: the
   LITTLE2_* macros are wrapped in functions (used to fill the
   STANDARD_VTABLE(little2_) slots) and VTABLE is redefined to pair the
   current PREFIX() tokenizer functions with the little2 converters.
   Without XML_MIN_SIZE, xmltok_impl.c is included again with PREFIX()
   producing little2_* names and the byte-level macros bound to the
   LITTLE2_* versions. */
#ifdef XML_MIN_SIZE

/* Function form of LITTLE2_BYTE_TYPE. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Function form of LITTLE2_BYTE_TO_ASCII; enc is not consulted. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* Function form of LITTLE2_CHAR_MATCHES; enc is not consulted. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC; enc is not consulted. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC; enc is not consulted. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
/* Characters longer than MINBPC bytes are never name characters here. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the per-instantiation macros again. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
808
809#ifdef XML_NS
810
/* little2 encoding table built only when XML_NS is defined.  asciitab.h is
   included without redefining BT_COLON, unlike the non-_ns table.
   NOTE(review): the three values after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration; the last one is 1 only when BYTEORDER is 1234. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
824
825#endif
826
/* little2 encoding table that is always built.  BT_COLON is defined as
   BT_NMSTRT only while asciitab.h is included, so any entry that header
   spells BT_COLON ends up as BT_NMSTRT in this table.
   NOTE(review): the three values after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration; the last one is 1 only when BYTEORDER is 1234. */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
842
843#if BYTEORDER != 4321
844
845#  ifdef XML_NS
846
/* "Internal" little2 table for XML_NS builds; only compiled when BYTEORDER
   is not 4321.  It differs from little2_encoding_ns in using iasciitab.h
   instead of asciitab.h and in hard-coding the final header value to 1. */
static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
854
855#  endif
856
/* "Internal" little2 table; only compiled when BYTEORDER is not 4321.
   Uses iasciitab.h with BT_COLON temporarily defined as BT_NMSTRT, and
   hard-codes the final header value to 1. */
static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
866
867#endif
868
/* Helpers for big-endian two-byte code units: (p)[0] is the high byte and
   (p)[1] is the low byte.  Every macro parameter is parenthesized so that
   an arbitrary expression can be passed without changing how it parses. */

/* Byte type: table lookup when the high byte is zero, otherwise
   unicode_byte_type(high, low). */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]        \
       : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the code unit has a zero high byte and low byte equal to c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Naming-bitmap lookups keyed by (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
879
880#ifdef XML_MIN_SIZE
881
882static int PTRFASTCALL
883big2_byteType(const ENCODING *enc, const char *p) {
884  return BIG2_BYTE_TYPE(enc, p);
885}
886
887static int PTRFASTCALL
888big2_byteToAscii(const ENCODING *enc, const char *p) {
889  UNUSED_P(enc);
890  return BIG2_BYTE_TO_ASCII(p);
891}
892
893static int PTRCALL
894big2_charMatches(const ENCODING *enc, const char *p, int c) {
895  UNUSED_P(enc);
896  return BIG2_CHAR_MATCHES(p, c);
897}
898
899static int PTRFASTCALL
900big2_isNameMin(const ENCODING *enc, const char *p) {
901  UNUSED_P(enc);
902  return BIG2_IS_NAME_CHAR_MINBPC(p);
903}
904
905static int PTRFASTCALL
906big2_isNmstrtMin(const ENCODING *enc, const char *p) {
907  UNUSED_P(enc);
908  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
909}
910
911#  undef VTABLE
912#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
913
914#else /* not XML_MIN_SIZE */
915
916#  undef PREFIX
917#  define PREFIX(ident) big2_##ident
918#  define MINBPC(enc) 2
919/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
920#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
921#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
922#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
923#  define IS_NAME_CHAR(enc, p, n) 0
924#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
925#  define IS_NMSTRT_CHAR(enc, p, n) (0)
926#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
927
928#  define XML_TOK_IMPL_C
929#  include "xmltok_impl.c"
930#  undef XML_TOK_IMPL_C
931
932#  undef MINBPC
933#  undef BYTE_TYPE
934#  undef BYTE_TO_ASCII
935#  undef CHAR_MATCHES
936#  undef IS_NAME_CHAR
937#  undef IS_NAME_CHAR_MINBPC
938#  undef IS_NMSTRT_CHAR
939#  undef IS_NMSTRT_CHAR_MINBPC
940#  undef IS_INVALID_CHAR
941
942#endif /* not XML_MIN_SIZE */
943
944#ifdef XML_NS
945
/* big2 encoding table built only when XML_NS is defined.  asciitab.h is
   included without redefining BT_COLON, unlike the non-_ns table.
   NOTE(review): the three values after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration; the last one is 1 only when BYTEORDER is 4321. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
959
960#endif
961
/* big2 encoding table that is always built.  BT_COLON is defined as
   BT_NMSTRT only while asciitab.h is included, so any entry that header
   spells BT_COLON ends up as BT_NMSTRT in this table.
   NOTE(review): the three values after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration; the last one is 1 only when BYTEORDER is 4321. */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
977
978#if BYTEORDER != 1234
979
980#  ifdef XML_NS
981
/* "Internal" big2 table for XML_NS builds; only compiled when BYTEORDER is
   not 1234.  It differs from big2_encoding_ns in using iasciitab.h instead
   of asciitab.h and in hard-coding the final header value to 1. */
static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
989
990#  endif
991
/* "Internal" big2 table; only compiled when BYTEORDER is not 1234.  Uses
   iasciitab.h with BT_COLON temporarily defined as BT_NMSTRT, and
   hard-codes the final header value to 1. */
static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
1001
1002#endif
1003
1004#undef PREFIX
1005
1006static int FASTCALL
1007streqci(const char *s1, const char *s2) {
1008  for (;;) {
1009    char c1 = *s1++;
1010    char c2 = *s2++;
1011    if (ASCII_a <= c1 && c1 <= ASCII_z)
1012      c1 += ASCII_A - ASCII_a;
1013    if (ASCII_a <= c2 && c2 <= ASCII_z)
1014      /* The following line will never get executed.  streqci() is
1015       * only called from two places, both of which guarantee to put
1016       * upper-case strings into s2.
1017       */
1018      c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1019    if (c1 != c2)
1020      return 0;
1021    if (! c1)
1022      break;
1023  }
1024  return 1;
1025}
1026
1027static void PTRCALL
1028initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1029                   POSITION *pos) {
1030  UNUSED_P(enc);
1031  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1032}
1033
1034static int
1035toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1036  char buf[1];
1037  char *p = buf;
1038  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1039  if (p == buf)
1040    return -1;
1041  else
1042    return buf[0];
1043}
1044
1045static int FASTCALL
1046isSpace(int c) {
1047  switch (c) {
1048  case 0x20:
1049  case 0xD:
1050  case 0xA:
1051  case 0x9:
1052    return 1;
1053  }
1054  return 0;
1055}
1056
1057/* Return 1 if there's just optional white space or there's an S
1058   followed by name=val.
1059*/
1060static int
1061parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1062                     const char **namePtr, const char **nameEndPtr,
1063                     const char **valPtr, const char **nextTokPtr) {
1064  int c;
1065  char open;
1066  if (ptr == end) {
1067    *namePtr = NULL;
1068    return 1;
1069  }
1070  if (! isSpace(toAscii(enc, ptr, end))) {
1071    *nextTokPtr = ptr;
1072    return 0;
1073  }
1074  do {
1075    ptr += enc->minBytesPerChar;
1076  } while (isSpace(toAscii(enc, ptr, end)));
1077  if (ptr == end) {
1078    *namePtr = NULL;
1079    return 1;
1080  }
1081  *namePtr = ptr;
1082  for (;;) {
1083    c = toAscii(enc, ptr, end);
1084    if (c == -1) {
1085      *nextTokPtr = ptr;
1086      return 0;
1087    }
1088    if (c == ASCII_EQUALS) {
1089      *nameEndPtr = ptr;
1090      break;
1091    }
1092    if (isSpace(c)) {
1093      *nameEndPtr = ptr;
1094      do {
1095        ptr += enc->minBytesPerChar;
1096      } while (isSpace(c = toAscii(enc, ptr, end)));
1097      if (c != ASCII_EQUALS) {
1098        *nextTokPtr = ptr;
1099        return 0;
1100      }
1101      break;
1102    }
1103    ptr += enc->minBytesPerChar;
1104  }
1105  if (ptr == *namePtr) {
1106    *nextTokPtr = ptr;
1107    return 0;
1108  }
1109  ptr += enc->minBytesPerChar;
1110  c = toAscii(enc, ptr, end);
1111  while (isSpace(c)) {
1112    ptr += enc->minBytesPerChar;
1113    c = toAscii(enc, ptr, end);
1114  }
1115  if (c != ASCII_QUOT && c != ASCII_APOS) {
1116    *nextTokPtr = ptr;
1117    return 0;
1118  }
1119  open = (char)c;
1120  ptr += enc->minBytesPerChar;
1121  *valPtr = ptr;
1122  for (;; ptr += enc->minBytesPerChar) {
1123    c = toAscii(enc, ptr, end);
1124    if (c == open)
1125      break;
1126    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1127        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1128        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1129      *nextTokPtr = ptr;
1130      return 0;
1131    }
1132  }
1133  *nextTokPtr = ptr + enc->minBytesPerChar;
1134  return 1;
1135}
1136
/* NUL-terminated keywords spelled with ASCII_* constants.  doParseXmlDecl
   compares them against pseudo-attribute names and values through
   XmlNameMatchesAscii. */

/* Pseudo-attribute name "version". */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

/* Pseudo-attribute name "encoding". */
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

/* Pseudo-attribute name "standalone". */
static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

/* Accepted value "yes" for standalone. */
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

/* Accepted value "no" for standalone. */
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1150
1151static int
1152doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1153                                                 const char *),
1154               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1155               const char *end, const char **badPtr, const char **versionPtr,
1156               const char **versionEndPtr, const char **encodingName,
1157               const ENCODING **encoding, int *standalone) {
1158  const char *val = NULL;
1159  const char *name = NULL;
1160  const char *nameEnd = NULL;
1161  ptr += 5 * enc->minBytesPerChar;
1162  end -= 2 * enc->minBytesPerChar;
1163  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1164      || ! name) {
1165    *badPtr = ptr;
1166    return 0;
1167  }
1168  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1169    if (! isGeneralTextEntity) {
1170      *badPtr = name;
1171      return 0;
1172    }
1173  } else {
1174    if (versionPtr)
1175      *versionPtr = val;
1176    if (versionEndPtr)
1177      *versionEndPtr = ptr;
1178    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1179      *badPtr = ptr;
1180      return 0;
1181    }
1182    if (! name) {
1183      if (isGeneralTextEntity) {
1184        /* a TextDecl must have an EncodingDecl */
1185        *badPtr = ptr;
1186        return 0;
1187      }
1188      return 1;
1189    }
1190  }
1191  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1192    int c = toAscii(enc, val, end);
1193    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1194      *badPtr = val;
1195      return 0;
1196    }
1197    if (encodingName)
1198      *encodingName = val;
1199    if (encoding)
1200      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1201    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1202      *badPtr = ptr;
1203      return 0;
1204    }
1205    if (! name)
1206      return 1;
1207  }
1208  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1209      || isGeneralTextEntity) {
1210    *badPtr = name;
1211    return 0;
1212  }
1213  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1214    if (standalone)
1215      *standalone = 1;
1216  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1217    if (standalone)
1218      *standalone = 0;
1219  } else {
1220    *badPtr = val;
1221    return 0;
1222  }
1223  while (isSpace(toAscii(enc, ptr, end)))
1224    ptr += enc->minBytesPerChar;
1225  if (ptr != end) {
1226    *badPtr = ptr;
1227    return 0;
1228  }
1229  return 1;
1230}
1231
1232static int FASTCALL
1233checkCharRefNumber(int result) {
1234  switch (result >> 8) {
1235  case 0xD8:
1236  case 0xD9:
1237  case 0xDA:
1238  case 0xDB:
1239  case 0xDC:
1240  case 0xDD:
1241  case 0xDE:
1242  case 0xDF:
1243    return -1;
1244  case 0:
1245    if (latin1_encoding.type[result] == BT_NONXML)
1246      return -1;
1247    break;
1248  case 0xFF:
1249    if (result == 0xFFFE || result == 0xFFFF)
1250      return -1;
1251    break;
1252  }
1253  return result;
1254}
1255
1256int FASTCALL
1257XmlUtf8Encode(int c, char *buf) {
1258  enum {
1259    /* minN is minimum legal resulting value for N byte sequence */
1260    min2 = 0x80,
1261    min3 = 0x800,
1262    min4 = 0x10000
1263  };
1264
1265  if (c < 0)
1266    return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1267  if (c < min2) {
1268    buf[0] = (char)(c | UTF8_cval1);
1269    return 1;
1270  }
1271  if (c < min3) {
1272    buf[0] = (char)((c >> 6) | UTF8_cval2);
1273    buf[1] = (char)((c & 0x3f) | 0x80);
1274    return 2;
1275  }
1276  if (c < min4) {
1277    buf[0] = (char)((c >> 12) | UTF8_cval3);
1278    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1279    buf[2] = (char)((c & 0x3f) | 0x80);
1280    return 3;
1281  }
1282  if (c < 0x110000) {
1283    buf[0] = (char)((c >> 18) | UTF8_cval4);
1284    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1285    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1286    buf[3] = (char)((c & 0x3f) | 0x80);
1287    return 4;
1288  }
1289  return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1290}
1291
1292int FASTCALL
1293XmlUtf16Encode(int charNum, unsigned short *buf) {
1294  if (charNum < 0)
1295    return 0;
1296  if (charNum < 0x10000) {
1297    buf[0] = (unsigned short)charNum;
1298    return 1;
1299  }
1300  if (charNum < 0x110000) {
1301    charNum -= 0x10000;
1302    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1303    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1304    return 2;
1305  }
1306  return 0;
1307}
1308
/* State for a caller-defined encoding, filled in by XmlInitUnknownEncoding.
   The embedded normal_encoding comes first so that a pointer to its ENCODING
   can be cast back to this struct (see AS_UNKNOWN_ENCODING). */
struct unknown_encoding {
  struct normal_encoding normal;
  /* Caller-supplied converter for multi-byte sequences; invoked as
     convert(userData, p). */
  CONVERTER convert;
  /* Opaque pointer passed to convert. */
  void *userData;
  /* Per-byte UTF-16 unit; 0 means "call convert" (see unknown_toUtf16). */
  unsigned short utf16[256];
  /* Per-byte UTF-8 form: [0] is the byte count, followed by the bytes;
     a count of 0 means "call convert" (see unknown_toUtf8). */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1318
1319int
1320XmlSizeOfUnknownEncoding(void) {
1321  return sizeof(struct unknown_encoding);
1322}
1323
1324static int PTRFASTCALL
1325unknown_isName(const ENCODING *enc, const char *p) {
1326  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327  int c = uenc->convert(uenc->userData, p);
1328  if (c & ~0xFFFF)
1329    return 0;
1330  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1331}
1332
1333static int PTRFASTCALL
1334unknown_isNmstrt(const ENCODING *enc, const char *p) {
1335  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336  int c = uenc->convert(uenc->userData, p);
1337  if (c & ~0xFFFF)
1338    return 0;
1339  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1340}
1341
1342static int PTRFASTCALL
1343unknown_isInvalid(const ENCODING *enc, const char *p) {
1344  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345  int c = uenc->convert(uenc->userData, p);
1346  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1347}
1348
1349static enum XML_Convert_Result PTRCALL
1350unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1351               char **toP, const char *toLim) {
1352  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1353  char buf[XML_UTF8_ENCODE_MAX];
1354  for (;;) {
1355    const char *utf8;
1356    int n;
1357    if (*fromP == fromLim)
1358      return XML_CONVERT_COMPLETED;
1359    utf8 = uenc->utf8[(unsigned char)**fromP];
1360    n = *utf8++;
1361    if (n == 0) {
1362      int c = uenc->convert(uenc->userData, *fromP);
1363      n = XmlUtf8Encode(c, buf);
1364      if (n > toLim - *toP)
1365        return XML_CONVERT_OUTPUT_EXHAUSTED;
1366      utf8 = buf;
1367      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1368                 - (BT_LEAD2 - 2));
1369    } else {
1370      if (n > toLim - *toP)
1371        return XML_CONVERT_OUTPUT_EXHAUSTED;
1372      (*fromP)++;
1373    }
1374    memcpy(*toP, utf8, n);
1375    *toP += n;
1376  }
1377}
1378
1379static enum XML_Convert_Result PTRCALL
1380unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1381                unsigned short **toP, const unsigned short *toLim) {
1382  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1383  while (*fromP < fromLim && *toP < toLim) {
1384    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1385    if (c == 0) {
1386      c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1387      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1388                 - (BT_LEAD2 - 2));
1389    } else
1390      (*fromP)++;
1391    *(*toP)++ = c;
1392  }
1393
1394  if ((*toP == toLim) && (*fromP < fromLim))
1395    return XML_CONVERT_OUTPUT_EXHAUSTED;
1396  else
1397    return XML_CONVERT_COMPLETED;
1398}
1399
1400ENCODING *
1401XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1402                       void *userData) {
1403  int i;
1404  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1405  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1406  for (i = 0; i < 128; i++)
1407    if (latin1_encoding.type[i] != BT_OTHER
1408        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1409      return 0;
1410  for (i = 0; i < 256; i++) {
1411    int c = table[i];
1412    if (c == -1) {
1413      e->normal.type[i] = BT_MALFORM;
1414      /* This shouldn't really get used. */
1415      e->utf16[i] = 0xFFFF;
1416      e->utf8[i][0] = 1;
1417      e->utf8[i][1] = 0;
1418    } else if (c < 0) {
1419      if (c < -4)
1420        return 0;
1421      /* Multi-byte sequences need a converter function */
1422      if (! convert)
1423        return 0;
1424      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1425      e->utf8[i][0] = 0;
1426      e->utf16[i] = 0;
1427    } else if (c < 0x80) {
1428      if (latin1_encoding.type[c] != BT_OTHER
1429          && latin1_encoding.type[c] != BT_NONXML && c != i)
1430        return 0;
1431      e->normal.type[i] = latin1_encoding.type[c];
1432      e->utf8[i][0] = 1;
1433      e->utf8[i][1] = (char)c;
1434      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1435    } else if (checkCharRefNumber(c) < 0) {
1436      e->normal.type[i] = BT_NONXML;
1437      /* This shouldn't really get used. */
1438      e->utf16[i] = 0xFFFF;
1439      e->utf8[i][0] = 1;
1440      e->utf8[i][1] = 0;
1441    } else {
1442      if (c > 0xFFFF)
1443        return 0;
1444      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1445        e->normal.type[i] = BT_NMSTRT;
1446      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1447        e->normal.type[i] = BT_NAME;
1448      else
1449        e->normal.type[i] = BT_OTHER;
1450      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1451      e->utf16[i] = (unsigned short)c;
1452    }
1453  }
1454  e->userData = userData;
1455  e->convert = convert;
1456  if (convert) {
1457    e->normal.isName2 = unknown_isName;
1458    e->normal.isName3 = unknown_isName;
1459    e->normal.isName4 = unknown_isName;
1460    e->normal.isNmstrt2 = unknown_isNmstrt;
1461    e->normal.isNmstrt3 = unknown_isNmstrt;
1462    e->normal.isNmstrt4 = unknown_isNmstrt;
1463    e->normal.isInvalid2 = unknown_isInvalid;
1464    e->normal.isInvalid3 = unknown_isInvalid;
1465    e->normal.isInvalid4 = unknown_isInvalid;
1466  }
1467  e->normal.enc.utf8Convert = unknown_toUtf8;
1468  e->normal.enc.utf16Convert = unknown_toUtf16;
1469  return &(e->normal.enc);
1470}
1471
/* Indices of the built-in encodings.

   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed.  The values from ISO_8859_1_ENC through
   UTF_16LE_ENC are array indices into encodingNames (in getEncodingIndex);
   UNKNOWN_ENC is returned for an unrecognised name and NO_ENC when no name
   was given. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1485
/* NUL-terminated, upper-case names of the built-in encodings, spelled with
   ASCII_* constants.  getEncodingIndex matches them case-insensitively via
   streqci. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
/* "US-ASCII" */
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
/* "UTF-8" */
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
/* "UTF-16" */
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
/* "UTF-16BE" */
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
/* "UTF-16LE" */
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1502
1503static int FASTCALL
1504getEncodingIndex(const char *name) {
1505  static const char *const encodingNames[] = {
1506      KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1507  };
1508  int i;
1509  if (name == NULL)
1510    return NO_ENC;
1511  for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1512    if (streqci(name, encodingNames[i]))
1513      return i;
1514  return UNKNOWN_ENC;
1515}
1516
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX stores
   it.  The index argument is parenthesized before the cast so that an
   expression argument is converted as a whole.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1523
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The first one to three bytes of [ptr, end) are inspected.  When an
   encoding can be decided, it is stored through enc->encPtr and either
   XML_TOK_BOM is returned (with *nextTokPtr just past the byte order mark)
   or tokenizing is handed to XmlTok with the chosen encoding.  Returns
   XML_TOK_NONE for empty input and XML_TOK_PARTIAL when more bytes are
   needed to decide.
*/

static int
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
  const ENCODING **encPtr;

  if (ptr >= end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  } else {
    /* Two or more bytes: dispatch on the first two as a 16-bit value. */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
            || e == UTF_16_ENC)
          break;
      }
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      } else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* Nothing was detected: use the encoding selected at initialization. */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1642
/* Include xmltok_ns.c with NS() and ns() expanding to their argument
   unchanged, i.e. with the plain, unsuffixed identifiers. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
1650
1651#ifdef XML_NS
1652
/* Include xmltok_ns.c a second time (XML_NS builds only), now with NS()
   appending "NS" and ns() appending "_ns" to each identifier. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns
1662
1663ENCODING *
1664XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1665                         void *userData) {
1666  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1667  if (enc)
1668    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1669  return enc;
1670}
1671
1672#endif /* XML_NS */
1673