1/*
2The contents of this file are subject to the Mozilla Public License
3Version 1.0 (the "License"); you may not use this file except in
4compliance with the License. You may obtain a copy of the License at
5http://www.mozilla.org/MPL/
6
7Software distributed under the License is distributed on an "AS IS"
8basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
9License for the specific language governing rights and limitations
10under the License.
11
12The Original Code is expat.
13
14The Initial Developer of the Original Code is James Clark.
15Portions created by James Clark are Copyright (C) 1998
16James Clark. All Rights Reserved.
17
18Contributor(s):
19*/
20
21#include <tcl.h> /*for size_t */
22#include "xmldef.h"
23#include "xmltok.h"
24#include "nametab.h"
25
/* Initializer fragments for the leading members of an ENCODING.
   PREFIX is redefined for each encoding family before these are expanded,
   so every entry names that family's implementation of the routine.
   VTABLE1 stops short of the converters so that an encoding can list its
   own toUtf8/toUtf16 functions after it; VTABLE appends the PREFIX-named
   converters instead. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
40
/* Test the naming bit of the 16-bit character whose high byte is hi and
   whose low byte is lo.  pages[hi] selects a group of 8 namingBitmap
   entries, the top 3 bits of lo pick the entry within the group and the
   bottom 5 bits of lo pick the bit.  The result is nonzero when the bit is
   set. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits
between the bottom 5 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask.
byte must point to unsigned char so that the shifts and masks see values
in the range 0..255. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1 << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits
between the bottom 4, 6 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
		       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1 << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
   in the bitmap, any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
72
/* Nonzero when the 3 byte sequence at p must be rejected: a lead byte of
   0xED whose second byte has bit 0x20 set (the encodings of U+D800..U+DFFF),
   or the exact sequences EF BF BE / EF BF BF (U+FFFE and U+FFFF).
   p must point to unsigned char for the comparisons against 0xED/0xEF to
   work; the callers in this file cast before expanding the macro. */
#define UTF8_INVALID3(p) \
  ((*p) == 0xED \
  ? (((p)[1] & 0x20) != 0) \
  : ((*p) == 0xEF \
     ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
     : 0))

/* Nonzero when the 4 byte sequence at p starts with 0xF4 and its second
   byte has either of the bits 0x30 set, i.e. it encodes a value above
   U+10FFFF. */
#define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
81
82static
83int isNever(const ENCODING *enc, const char *p)
84{
85  return 0;
86}
87
88static
89int utf8_isName2(const ENCODING *enc, const char *p)
90{
91  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
92}
93
94static
95int utf8_isName3(const ENCODING *enc, const char *p)
96{
97  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
98}
99
100#define utf8_isName4 isNever
101
102static
103int utf8_isNmstrt2(const ENCODING *enc, const char *p)
104{
105  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
106}
107
108static
109int utf8_isNmstrt3(const ENCODING *enc, const char *p)
110{
111  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
112}
113
114#define utf8_isNmstrt4 isNever
115
116#define utf8_isInvalid2 isNever
117
118static
119int utf8_isInvalid3(const ENCODING *enc, const char *p)
120{
121  return UTF8_INVALID3((const unsigned char *)p);
122}
123
124static
125int utf8_isInvalid4(const ENCODING *enc, const char *p)
126{
127  return UTF8_INVALID4((const unsigned char *)p);
128}
129
/* An ENCODING extended with a per-byte classification table and the
   predicates used for multi-byte characters.  enc is the first member, and
   code in this file casts an ENCODING pointer to a struct normal_encoding
   pointer to reach the other members (see BYTE_TYPE and IS_NAME_CHAR). */
struct normal_encoding {
  ENCODING enc;
  /* Byte class (a BT_* value) indexed by byte value. */
  unsigned char type[256];
  /* Name-character predicates, reached through IS_NAME_CHAR(enc, p, n)
     with n pasted onto the member name. */
  int (*isName2)(const ENCODING *, const char *);
  int (*isName3)(const ENCODING *, const char *);
  int (*isName4)(const ENCODING *, const char *);
  /* Name-start predicates, reached through IS_NMSTRT_CHAR. */
  int (*isNmstrt2)(const ENCODING *, const char *);
  int (*isNmstrt3)(const ENCODING *, const char *);
  int (*isNmstrt4)(const ENCODING *, const char *);
  /* Invalid-sequence predicates, reached through IS_INVALID_CHAR. */
  int (*isInvalid2)(const ENCODING *, const char *);
  int (*isInvalid3)(const ENCODING *, const char *);
  int (*isInvalid4)(const ENCODING *, const char *);
};
143
/* Initializer for the nine predicate members of struct normal_encoding,
   in declaration order.  E is the name prefix of the predicate family,
   e.g. NORMAL_VTABLE(utf8_) lists utf8_isName2 ... utf8_isInvalid4. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4

/* Defined near the end of this file; declared here because it is needed
   before that point. */
static int checkCharRefNumber(int);
156
157#include "xmltok_impl.h"
158
/* Instantiate the tokenizer template for encodings whose smallest
   character is one byte.  The macros below are presumably the hooks that
   xmltok_impl.c is written against (confirm in that file); they are
   undefined again afterwards so the 2 byte encodings can redefine them. */

/* minimum bytes per character */
#define MINBPC 1
/* Byte class of the byte at p, taken from the encoding's type table. */
#define BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
#define BYTE_TO_ASCII(enc, p) (*p)

/* Multi-byte predicates: n (2, 3 or 4) is pasted onto the member name, so
   the call goes through the matching function pointer of the encoding. */
#define IS_NAME_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

/* The MINBPC variants always yield 0 for this family. */
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)

/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == c)

/* Identifiers wrapped in PREFIX() get the normal_ prefix, e.g. the
   normal_updatePosition used by initUpdatePosition. */
#define PREFIX(ident) normal_ ## ident
#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
190
/* Marker bits OR-ed into the first byte of a UTF-8 sequence by the
   encoders in this file (latin1_toUtf8, the UTF-16 converters and
   XmlUtf8Encode). */
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
197
198static
199void utf8_toUtf8(const ENCODING *enc,
200		 const char **fromP, const char *fromLim,
201		 char **toP, const char *toLim)
202{
203  char *to;
204  const char *from;
205  if (fromLim - *fromP > toLim - *toP) {
206    /* Avoid copying partial characters. */
207    for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
208      if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
209	break;
210  }
211  for (to = *toP, from = *fromP; from != fromLim; from++, to++)
212    *to = *from;
213  *fromP = from;
214  *toP = to;
215}
216
217static
218void utf8_toUtf16(const ENCODING *enc,
219		  const char **fromP, const char *fromLim,
220		  unsigned short **toP, const unsigned short *toLim)
221{
222  unsigned short *to = *toP;
223  const char *from = *fromP;
224  while (from != fromLim && to != toLim) {
225    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
226    case BT_LEAD2:
227      *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
228      from += 2;
229      break;
230    case BT_LEAD3:
231      *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
232      from += 3;
233      break;
234    case BT_LEAD4:
235      {
236	unsigned long n;
237	if (to + 1 == toLim)
238	  break;
239	n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
240	n -= 0x10000;
241	to[0] = (unsigned short)((n >> 10) | 0xD800);
242	to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
243	to += 2;
244	from += 4;
245      }
246      break;
247    default:
248      *to++ = *from++;
249      break;
250    }
251  }
252  *fromP = from;
253  *toP = to;
254}
255
/* UTF-8 encoding selected for document input (see initScan, XmlInitEncoding
   and findEncoding): the normal_ scanners with the UTF-8 converters, a
   byte-class table assembled from asciitab.h and utf8tab.h, and the utf8_
   predicates.  The trailing 1, 1, 0 of the ENCODING initializer are
   presumably minBytesPerChar and two flags -- confirm the member order in
   xmltok.h. */
static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  NORMAL_VTABLE(utf8_)
};
264
/* UTF-8 encoding returned by XmlGetUtf8InternalEncoding.  It differs from
   utf8_encoding only in taking the first part of its byte-class table from
   iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  NORMAL_VTABLE(utf8_)
};
273
274static
275void latin1_toUtf8(const ENCODING *enc,
276		   const char **fromP, const char *fromLim,
277		   char **toP, const char *toLim)
278{
279  for (;;) {
280    unsigned char c;
281    if (*fromP == fromLim)
282      break;
283    c = (unsigned char)**fromP;
284    if (c & 0x80) {
285      if (toLim - *toP < 2)
286	break;
287      *(*toP)++ = ((c >> 6) | UTF8_cval2);
288      *(*toP)++ = ((c & 0x3f) | 0x80);
289      (*fromP)++;
290    }
291    else {
292      if (*toP == toLim)
293	break;
294      *(*toP)++ = *(*fromP)++;
295    }
296  }
297}
298
299static
300void latin1_toUtf16(const ENCODING *enc,
301		    const char **fromP, const char *fromLim,
302		    unsigned short **toP, const unsigned short *toLim)
303{
304  while (*fromP != fromLim && *toP != toLim)
305    *(*toP)++ = (unsigned char)*(*fromP)++;
306}
307
/* ISO-8859-1 encoding: the normal_ scanners with the latin1 converters and
   a byte-class table assembled from asciitab.h and latin1tab.h.  The
   predicate members are not listed and are therefore zero-initialized.
   Its type table also serves as the reference classification in
   checkCharRefNumber and XmlInitUnknownEncoding. */
static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  }
};
315
316static
317void ascii_toUtf8(const ENCODING *enc,
318		  const char **fromP, const char *fromLim,
319		  char **toP, const char *toLim)
320{
321  while (*fromP != fromLim && *toP != toLim)
322    *(*toP)++ = *(*fromP)++;
323}
324
/* US-ASCII encoding: the normal_ scanners, a plain byte copy to UTF-8 and
   the latin1 widening to UTF-16.  Only the asciitab.h part of the
   byte-class table is listed; the remaining entries are zero-initialized,
   which per the note below is BT_NONXML. */
static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  }
};

/* End of the normal_ family; PREFIX is redefined for the 2 byte families. */
#undef PREFIX
334
335static int unicode_byte_type(char hi, char lo)
336{
337  switch ((unsigned char)hi) {
338  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
339    return BT_LEAD4;
340  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
341    return BT_TRAIL;
342  case 0xFF:
343    switch ((unsigned char)lo) {
344    case 0xFF:
345    case 0xFE:
346      return BT_NONXML;
347    }
348    break;
349  }
350  return BT_NONASCII;
351}
352
/* Expands to the definition of PREFIX(toUtf8), the UTF-16 to UTF-8
   converter of a 2 byte encoding family.  GET_LO and GET_HI must be
   defined for the family's byte order before the macro is expanded.

   The input is walked two bytes at a time and dispatched on the high byte:
     - 0 with a low byte below 0x80: one output byte;
     - 0 (low byte 0x80 or above) through 7: two output bytes;
     - 0xD8..0xDB: four output bytes, consuming the following unit as well;
     - anything else: three output bytes.
   The position of the default label before the 0xD8..0xDB labels does not
   affect which branch is taken.

   When the output lacks room for a whole sequence, the current input
   position is stored in *fromP and the function returns; *toP is advanced
   as bytes are written.

   NOTE(review): the 0xD8..0xDB branch reads the next unit without
   comparing against fromLim, so it appears to rely on the caller never
   ending the input between the two halves of a pair -- confirm. */
#define DEFINE_UTF16_TO_UTF8 \
static \
void PREFIX(toUtf8)(const ENCODING *enc, \
		    const char **fromP, const char *fromLim, \
		    char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
	  return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
	return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
	return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
	*fromP = from; \
	return; \
      } \
      /* plane: the 4 bits above the low 6 of this unit's 10 bit payload, */ \
      /* plus 1 to undo the 0x10000 offset of the pair encoding */ \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
	           | ((GET_HI(from) & 0x3) << 2) \
		   | (lo2 >> 6) \
		   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
415
/* Expands to the definition of PREFIX(toUtf16), which copies 2 byte units
   from [*fromP, fromLim) into [*toP, toLim) as unsigned short values built
   from GET_HI and GET_LO, until either range is exhausted.  *fromP and
   *toP are advanced in place.

   Before copying, if the input holds more bytes than twice the number of
   free output units and the high byte of the last input unit satisfies
   (hi & 0xF8) == 0xD8, fromLim is moved back by one unit.

   NOTE(review): that test inspects the last unit of the input rather than
   the unit at the point where the output fills up -- confirm that this is
   what the callers need. */
#define DEFINE_UTF16_TO_UTF16 \
static \
void PREFIX(toUtf16)(const ENCODING *enc, \
		     const char **fromP, const char *fromLim, \
		     unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
429
/* Instantiate the tokenizer template and the UTF-16 converters for 2 byte
   characters stored low byte first: p[0] is the low byte, p[1] the high
   byte. */
#define PREFIX(ident) little2_ ## ident
#define MINBPC 2
/* With a zero high byte the class comes from the encoding's type table,
   otherwise from unicode_byte_type. */
#define BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* -1 when the character has a nonzero high byte. */
#define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
/* The multi-byte predicates always yield 0 for this family; naming checks
   go through the MINBPC variants, which consult the bitmap directly. */
#define IS_NAME_CHAR(enc, p, n) (0)
#define IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])

#include "xmltok_impl.c"

/* Byte accessors for this byte order, used by the converter macros. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8
DEFINE_UTF16_TO_UTF16

#undef SET2
#undef GET_LO
#undef GET_HI
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
/* IS_INVALID_CHAR is not defined in this section; undefining it is
   harmless. */
#undef IS_INVALID_CHAR
467
/* Low-byte-first 2 byte encoding selected by initScan.  The ENCODING part
   ends with 2, 0 and a flag that is 1 only when BYTE_ORDER == 12 --
   presumably "matches the host byte order"; confirm the member meaning in
   xmltok.h.  The type table follows without its own braces and the
   predicate members are left zero-initialized. */
static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if BYTE_ORDER == 12
    1
#else
    0
#endif
  },
#include "asciitab.h"
#include "latin1tab.h"
};
479
/* Low-byte-first encoding returned by XmlGetUtf16InternalEncoding.  It is
   compiled out when BYTE_ORDER == 21, the one configuration in which that
   function never refers to it.  It uses iasciitab.h where little2_encoding
   uses asciitab.h, and its last ENCODING flag is always 1. */
#if BYTE_ORDER != 21

static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
#include "iasciitab.h"
#include "latin1tab.h"
};

#endif

/* End of the little2_ family. */
#undef PREFIX
491
/* Instantiate the tokenizer template and the UTF-16 converters for 2 byte
   characters stored high byte first: p[0] is the high byte, p[1] the low
   byte. */
#define PREFIX(ident) big2_ ## ident
#define MINBPC 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
/* With a zero high byte the class comes from the encoding's type table,
   otherwise from unicode_byte_type. */
#define BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* -1 when the character has a nonzero high byte. */
#define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
/* The multi-byte predicates always yield 0 for this family; naming checks
   go through the MINBPC variants, which consult the bitmap directly. */
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

#include "xmltok_impl.c"

/* Byte accessors for this byte order, used by the converter macros. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8
DEFINE_UTF16_TO_UTF16

#undef SET2
#undef GET_LO
#undef GET_HI
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
/* IS_INVALID_CHAR is not defined in this section; undefining it is
   harmless. */
#undef IS_INVALID_CHAR
530
/* High-byte-first 2 byte encoding selected by initScan and used by
   findEncoding for "UTF-16" when the current encoding is not a 2 byte one.
   The ENCODING part ends with 2, 0 and a flag that is 1 only when
   BYTE_ORDER == 21 -- presumably "matches the host byte order"; confirm
   the member meaning in xmltok.h.  The type table follows without its own
   braces and the predicate members are left zero-initialized. */
static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if BYTE_ORDER == 21
  1
#else
  0
#endif
  },
#include "asciitab.h"
#include "latin1tab.h"
};
542
/* High-byte-first encoding returned by XmlGetUtf16InternalEncoding.  It is
   compiled out when BYTE_ORDER == 12, the one configuration in which that
   function never refers to it.  It uses iasciitab.h where big2_encoding
   uses asciitab.h, and its last ENCODING flag is always 1. */
#if BYTE_ORDER != 12

static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
#include "iasciitab.h"
#include "latin1tab.h"
};

#endif

/* End of the big2_ family. */
#undef PREFIX
554
/* Compare two NUL-terminated strings, treating the letters a-z and A-Z as
   equal.  Returns 1 when the strings match over their whole length and 0
   otherwise. */
static
int streqci(const char *s1, const char *s2)
{
  char a;
  char b;

  do {
    a = *s1++;
    b = *s2++;
    /* Fold lower case to upper case before comparing. */
    if (a >= 'a' && a <= 'z')
      a = (char)(a + ('A' - 'a'));
    if (b >= 'a' && b <= 'z')
      b = (char)(b + ('A' - 'a'));
    if (a != b)
      return 0;
  } while (a != '\0');
  return 1;
}
572
573static
574int initScan(const ENCODING *enc, int state, const char *ptr, const char *end,
575	     const char **nextTokPtr)
576{
577  const ENCODING **encPtr;
578
579  if (ptr == end)
580    return XML_TOK_NONE;
581  encPtr = ((const INIT_ENCODING *)enc)->encPtr;
582  if (ptr + 1 == end) {
583    switch ((unsigned char)*ptr) {
584    case 0xFE:
585    case 0xFF:
586    case 0x00:
587    case 0x3C:
588      return XML_TOK_PARTIAL;
589    }
590  }
591  else {
592    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
593    case 0x003C:
594      *encPtr = &big2_encoding.enc;
595      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
596    case 0xFEFF:
597      *nextTokPtr = ptr + 2;
598      *encPtr = &big2_encoding.enc;
599      return XML_TOK_BOM;
600    case 0x3C00:
601      *encPtr = &little2_encoding.enc;
602      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
603    case 0xFFFE:
604      *nextTokPtr = ptr + 2;
605      *encPtr = &little2_encoding.enc;
606      return XML_TOK_BOM;
607    }
608  }
609  *encPtr = &utf8_encoding.enc;
610  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
611}
612
613static
614int initScanProlog(const ENCODING *enc, const char *ptr, const char *end,
615		   const char **nextTokPtr)
616{
617  return initScan(enc, XML_PROLOG_STATE, ptr, end, nextTokPtr);
618}
619
620static
621int initScanContent(const ENCODING *enc, const char *ptr, const char *end,
622		    const char **nextTokPtr)
623{
624  return initScan(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr);
625}
626
627static
628void initUpdatePosition(const ENCODING *enc, const char *ptr,
629			const char *end, POSITION *pos)
630{
631  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
632}
633
634const ENCODING *XmlGetUtf8InternalEncoding()
635{
636  return &internal_utf8_encoding.enc;
637}
638
639const ENCODING *XmlGetUtf16InternalEncoding()
640{
641#if BYTE_ORDER == 12
642  return &internal_little2_encoding.enc;
643#elif BYTE_ORDER == 21
644  return &internal_big2_encoding.enc;
645#else
646  const short n = 1;
647  return *(const char *)&n ? &internal_little2_encoding.enc : &internal_big2_encoding.enc;
648#endif
649}
650
651int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name)
652{
653  if (name) {
654    if (streqci(name, "ISO-8859-1")) {
655      *encPtr = &latin1_encoding.enc;
656      return 1;
657    }
658    if (streqci(name, "UTF-8")) {
659      *encPtr = &utf8_encoding.enc;
660      return 1;
661    }
662    if (streqci(name, "US-ASCII")) {
663      *encPtr = &ascii_encoding.enc;
664      return 1;
665    }
666    if (!streqci(name, "UTF-16"))
667      return 0;
668  }
669  p->initEnc.scanners[XML_PROLOG_STATE] = initScanProlog;
670  p->initEnc.scanners[XML_CONTENT_STATE] = initScanContent;
671  p->initEnc.updatePosition = initUpdatePosition;
672  p->initEnc.minBytesPerChar = 1;
673  p->encPtr = encPtr;
674  *encPtr = &(p->initEnc);
675  return 1;
676}
677
678static
679int toAscii(const ENCODING *enc, const char *ptr, const char *end)
680{
681  char buf[1];
682  char *p = buf;
683  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
684  if (p == buf)
685    return -1;
686  else
687    return buf[0];
688}
689
/* Returns 1 when c is a space, carriage return, line feed or tab, and 0
   for every other value (including the -1 produced by toAscii). */
static
int isSpace(int c)
{
  return c == ' ' || c == '\r' || c == '\n' || c == '\t';
}
702
/* Return 1 if there's just optional white space
or there's an S followed by name=val.

Parses one pseudo-attribute from [ptr, end) using enc.

On success with nothing left to parse, *namePtr is set to 0.  On success
with a pseudo-attribute, *namePtr points at the first character of the
name, *valPtr at the first character after the opening quote, and
*nextTokPtr just past the closing quote.

Returns 0 on a syntax error with *nextTokPtr pointing at the offending
position.  Errors are: input that does not start with white space, a
character toAscii cannot represent inside the name, an empty name, a
missing '=', a value not opened by a single or double quote, and a value
character other than a letter, digit, '.', '-' or '_' before the matching
closing quote.  *nextTokPtr is not written when the input is empty on
entry. */
static
int parsePseudoAttribute(const ENCODING *enc,
			 const char *ptr,
			 const char *end,
			 const char **namePtr,
			 const char **valPtr,
			 const char **nextTokPtr)
{
  int c;
  char open;
  if (ptr == end) {
    *namePtr = 0;
    return 1;
  }
  /* Anything that follows must be introduced by white space. */
  if (!isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Skip the white space; toAscii yields -1 at end, which ends the loop. */
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  if (ptr == end) {
    *namePtr = 0;
    return 1;
  }
  *namePtr = ptr;
  /* Scan the name up to '=' (optionally preceded by white space). */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == '=')
      break;
    if (isSpace(c)) {
      do {
	ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      if (c != '=') {
	*nextTokPtr = ptr;
	return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* '=' found immediately: the name is empty. */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Step over '=' and any white space before the opening quote. */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  if (c != '"' && c != '\'') {
    *nextTokPtr = ptr;
    return 0;
  }
  open = c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  /* Scan the value up to the quote that opened it; running off the end
     yields -1, which fails the character test below. */
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    if (!('a' <= c && c <= 'z')
	&& !('A' <= c && c <= 'Z')
	&& !('0' <= c && c <= '9')
	&& c != '.'
	&& c != '-'
	&& c != '_') {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}
785
786static
787const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *end)
788{
789#define ENCODING_MAX 128
790  char buf[ENCODING_MAX];
791  char *p = buf;
792  int i;
793  XmlUtf8Convert(enc, &ptr, end, &p, p + ENCODING_MAX - 1);
794  if (ptr != end)
795    return 0;
796  *p = 0;
797  for (i = 0; buf[i]; i++) {
798    if ('a' <= buf[i] && buf[i] <= 'z')
799      buf[i] +=  'A' - 'a';
800  }
801  if (streqci(buf, "UTF-8"))
802    return &utf8_encoding.enc;
803  if (streqci(buf, "ISO-8859-1"))
804    return &latin1_encoding.enc;
805  if (streqci(buf, "US-ASCII"))
806    return &ascii_encoding.enc;
807  if (streqci(buf, "UTF-16")) {
808    if (enc->minBytesPerChar == 2)
809      return enc;
810    return &big2_encoding.enc;
811  }
812  return 0;
813}
814
/* Parse the pseudo-attributes of an XML or text declaration held in
   [ptr, end) in encoding enc.

   The first 5 characters and the last 2 characters of the range are
   skipped before parsing (presumably the "<?xml" and "?>" delimiters --
   confirm against the caller).

   Accepted pseudo-attributes, in order: version, encoding, standalone.
   version may be missing only when isGeneralTextEntity is nonzero;
   standalone is rejected when isGeneralTextEntity is nonzero.

   Outputs are written only for the pseudo-attributes that are present and
   only through non-null pointers:
     *versionPtr    start of the version value
     *encodingName  start of the encoding value
     *encoding      result of findEncoding for that value (may be 0)
     *standalone    1 for "yes", 0 for "no"

   Returns 1 on success.  Returns 0 on error with *badPtr pointing at the
   offending position. */
int XmlParseXmlDecl(int isGeneralTextEntity,
		    const ENCODING *enc,
		    const char *ptr,
		    const char *end,
		    const char **badPtr,
		    const char **versionPtr,
		    const char **encodingName,
		    const ENCODING **encoding,
		    int *standalone)
{
  const char *val = 0;
  const char *name = 0;
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is required. */
  if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
    *badPtr = ptr;
    return 0;
  }
  if (!XmlNameMatchesAscii(enc, name, "version")) {
    if (!isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  }
  else {
    if (versionPtr)
      *versionPtr = val;
    if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  if (XmlNameMatchesAscii(enc, name, "encoding")) {
    /* The encoding name must begin with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so the value ends one character
       before it. */
    if (encoding)
      *encoding = findEncoding(enc, val, ptr - enc->minBytesPerChar);
    if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, "yes")) {
    if (standalone)
      *standalone = 1;
  }
  else if (XmlNameMatchesAscii(enc, val, "no")) {
    if (standalone)
      *standalone = 0;
  }
  else {
    *badPtr = val;
    return 0;
  }
  /* Only white space may follow the standalone pseudo-attribute. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
890
891static
892int checkCharRefNumber(int result)
893{
894  switch (result >> 8) {
895  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
896  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
897    return -1;
898  case 0:
899    if (latin1_encoding.type[result] == BT_NONXML)
900      return -1;
901    break;
902  case 0xFF:
903    if (result == 0xFFFE || result == 0xFFFF)
904      return -1;
905    break;
906  }
907  return result;
908}
909
910size_t XmlUtf8Encode(int c, char *buf)
911{
912  enum {
913    /* minN is minimum legal resulting value for N byte sequence */
914    min2 = 0x80,
915    min3 = 0x800,
916    min4 = 0x10000
917  };
918
919  if (c < 0)
920    return 0;
921  if (c < min2) {
922    buf[0] = (c | UTF8_cval1);
923    return 1;
924  }
925  if (c < min3) {
926    buf[0] = ((c >> 6) | UTF8_cval2);
927    buf[1] = ((c & 0x3f) | 0x80);
928    return 2;
929  }
930  if (c < min4) {
931    buf[0] = ((c >> 12) | UTF8_cval3);
932    buf[1] = (((c >> 6) & 0x3f) | 0x80);
933    buf[2] = ((c & 0x3f) | 0x80);
934    return 3;
935  }
936  if (c < 0x110000) {
937    buf[0] = ((c >> 18) | UTF8_cval4);
938    buf[1] = (((c >> 12) & 0x3f) | 0x80);
939    buf[2] = (((c >> 6) & 0x3f) | 0x80);
940    buf[3] = ((c & 0x3f) | 0x80);
941    return 4;
942  }
943  return 0;
944}
945
/* Encode the character number charNum as UTF-16 into buf, which must have
   room for up to 2 units.  Values below 0x10000 are stored as a single
   unit; values below 0x110000 as a high/low surrogate pair.  Returns the
   number of units written, or 0 without writing anything when charNum is
   negative or not below 0x110000. */
size_t XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;

  if (charNum < 0 || charNum >= 0x110000)
    return 0;
  if (charNum < 0x10000) {
    buf[0] = charNum;
    return 1;
  }
  /* 20 bit offset split into two 10 bit halves. */
  offset = charNum - 0x10000;
  buf[0] = 0xD800 + (offset >> 10);
  buf[1] = 0xDC00 + (offset & 0x3FF);
  return 2;
}
962
/* Table-driven encoding built by XmlInitUnknownEncoding.  normal is the
   first member, so the structure can be used wherever a
   struct normal_encoding or ENCODING pointer is expected. */
struct unknown_encoding {
  struct normal_encoding normal;
  /* Caller-supplied decoder for multi-byte sequences; called with
     userData and a pointer to the sequence, returns the character
     number. */
  int (*convert)(void *userData, const char *p);
  void *userData;
  /* UTF-16 unit for each single byte; 0 marks a byte that has to go
     through convert. */
  unsigned short utf16[256];
  /* UTF-8 form of each single byte: element 0 is the length, followed by
     the bytes; a length of 0 marks a byte that has to go through
     convert. */
  char utf8[256][4];
};
970
971int XmlSizeOfUnknownEncoding()
972{
973  return sizeof(struct unknown_encoding);
974}
975
976static
977int unknown_isName(const ENCODING *enc, const char *p)
978{
979  int c = ((const struct unknown_encoding *)enc)
980	  ->convert(((const struct unknown_encoding *)enc)->userData, p);
981  if (c & ~0xFFFF)
982    return 0;
983  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
984}
985
986static
987int unknown_isNmstrt(const ENCODING *enc, const char *p)
988{
989  int c = ((const struct unknown_encoding *)enc)
990	  ->convert(((const struct unknown_encoding *)enc)->userData, p);
991  if (c & ~0xFFFF)
992    return 0;
993  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
994}
995
996static
997int unknown_isInvalid(const ENCODING *enc, const char *p)
998{
999  int c = ((const struct unknown_encoding *)enc)
1000	   ->convert(((const struct unknown_encoding *)enc)->userData, p);
1001  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1002}
1003
1004static
1005void unknown_toUtf8(const ENCODING *enc,
1006		    const char **fromP, const char *fromLim,
1007		    char **toP, const char *toLim)
1008{
1009  char buf[XML_UTF8_ENCODE_MAX];
1010  for (;;) {
1011    const char *utf8;
1012    int n;
1013    if (*fromP == fromLim)
1014      break;
1015    utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1016    n = *utf8++;
1017    if (n == 0) {
1018      int c = ((const struct unknown_encoding *)enc)
1019	      ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1020      n = XmlUtf8Encode(c, buf);
1021      if (n > toLim - *toP)
1022	break;
1023      utf8 = buf;
1024      *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1025	         - (BT_LEAD2 - 2);
1026    }
1027    else {
1028      if (n > toLim - *toP)
1029	break;
1030      (*fromP)++;
1031    }
1032    do {
1033      *(*toP)++ = *utf8++;
1034    } while (--n != 0);
1035  }
1036}
1037
1038static
1039void unknown_toUtf16(const ENCODING *enc,
1040		     const char **fromP, const char *fromLim,
1041		     unsigned short **toP, const unsigned short *toLim)
1042{
1043  while (*fromP != fromLim && *toP != toLim) {
1044    unsigned short c
1045      = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1046    if (c == 0) {
1047      c = (unsigned short)((const struct unknown_encoding *)enc)
1048	   ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1049      *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1050	         - (BT_LEAD2 - 2);
1051    }
1052    else
1053      (*fromP)++;
1054    *(*toP)++ = c;
1055  }
1056}
1057
1058ENCODING *
1059XmlInitUnknownEncoding(void *mem,
1060		       int *table,
1061		       int (*convert)(void *userData, const char *p),
1062		       void *userData)
1063{
1064  int i;
1065  struct unknown_encoding *e = mem;
1066  for (i = 0; i < sizeof(struct normal_encoding); i++)
1067    ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1068  for (i = 0; i < 128; i++)
1069    if (latin1_encoding.type[i] != BT_OTHER
1070        && latin1_encoding.type[i] != BT_NONXML
1071	&& table[i] != i)
1072      return 0;
1073  for (i = 0; i < 256; i++) {
1074    int c = table[i];
1075    if (c == -1) {
1076      e->normal.type[i] = BT_MALFORM;
1077      /* This shouldn't really get used. */
1078      e->utf16[i] = 0xFFFF;
1079      e->utf8[i][0] = 1;
1080      e->utf8[i][1] = 0;
1081    }
1082    else if (c < 0) {
1083      if (c < -4)
1084	return 0;
1085      e->normal.type[i] = BT_LEAD2 - (c + 2);
1086      e->utf8[i][0] = 0;
1087      e->utf16[i] = 0;
1088    }
1089    else if (c < 0x80) {
1090      if (latin1_encoding.type[c] != BT_OTHER
1091	  && latin1_encoding.type[c] != BT_NONXML
1092	  && c != i)
1093	return 0;
1094      e->normal.type[i] = latin1_encoding.type[c];
1095      e->utf8[i][0] = 1;
1096      e->utf8[i][1] = (char)c;
1097      e->utf16[i] = c == 0 ? 0xFFFF : c;
1098    }
1099    else if (checkCharRefNumber(c) < 0) {
1100      e->normal.type[i] = BT_NONXML;
1101      /* This shouldn't really get used. */
1102      e->utf16[i] = 0xFFFF;
1103      e->utf8[i][0] = 1;
1104      e->utf8[i][1] = 0;
1105    }
1106    else {
1107      if (c > 0xFFFF)
1108	return 0;
1109      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1110	e->normal.type[i] = BT_NMSTRT;
1111      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1112	e->normal.type[i] = BT_NAME;
1113      else
1114	e->normal.type[i] = BT_OTHER;
1115      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1116      e->utf16[i] = c;
1117    }
1118  }
1119  e->userData = userData;
1120  e->convert = convert;
1121  if (convert) {
1122    e->normal.isName2 = unknown_isName;
1123    e->normal.isName3 = unknown_isName;
1124    e->normal.isName4 = unknown_isName;
1125    e->normal.isNmstrt2 = unknown_isNmstrt;
1126    e->normal.isNmstrt3 = unknown_isNmstrt;
1127    e->normal.isNmstrt4 = unknown_isNmstrt;
1128    e->normal.isInvalid2 = unknown_isInvalid;
1129    e->normal.isInvalid3 = unknown_isInvalid;
1130    e->normal.isInvalid4 = unknown_isInvalid;
1131  }
1132  e->normal.enc.utf8Convert = unknown_toUtf8;
1133  e->normal.enc.utf16Convert = unknown_toUtf16;
1134  return &(e->normal.enc);
1135}
1136