/*
The contents of this file are subject to the Mozilla Public License
Version 1.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://www.mozilla.org/MPL/

Software distributed under the License is distributed on an "AS IS"
basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific language governing rights and limitations
under the License.

The Original Code is expat.

The Initial Developer of the Original Code is James Clark.
Portions created by James Clark are Copyright (C) 1998
James Clark. All Rights Reserved.

Contributor(s):
*/
20
/* Fallback used when nothing has defined IS_INVALID_CHAR before this point:
every n-byte sequence is then reported as valid. */
#if !defined(IS_INVALID_CHAR)
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
24
/* Switch case for the lead byte of an n-byte character.  Returns
XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before end, returns
XML_TOK_INVALID (with *nextTokPtr set to ptr) when IS_INVALID_CHAR rejects
the sequence, and otherwise steps ptr over the character and leaves the
switch.  Uses the identifiers enc and end from the expansion site. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - ptr < n) { \
      return XML_TOK_PARTIAL_CHAR; \
    } \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Switch cases shared by scanners that accept arbitrary character data:
the 2-, 3- and 4-byte lead cases above, plus an immediate XML_TOK_INVALID
(with *nextTokPtr set to ptr) for BT_NONXML, BT_MALFORM and BT_TRAIL. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    }
45
/* Switch case for an n-byte character inside a name.  Returns
XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, XML_TOK_INVALID (with
*nextTokPtr set to ptr) when IS_NAME_CHAR rejects the character, and
otherwise steps ptr over it and leaves the switch. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - ptr < n) { \
      return XML_TOK_PARTIAL_CHAR; \
    } \
    if (!IS_NAME_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Switch cases that consume one name character.  BT_NONASCII is first
checked with IS_NAME_CHAR_MINBPC and then deliberately falls through to the
single-unit cases (BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME, BT_MINUS), which
advance ptr by MINBPC.  Multi-byte characters go through CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC; \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
73
/* Switch case for an n-byte character at the start of a name.  Returns
XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, XML_TOK_INVALID (with
*nextTokPtr set to ptr) when IS_NMSTRT_CHAR rejects the character, and
otherwise steps ptr over it and leaves the switch. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - ptr < n) { \
      return XML_TOK_PARTIAL_CHAR; \
    } \
    if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    ptr += n; \
    break;

/* Switch cases that consume one name-start character.  BT_NONASCII is first
checked with IS_NMSTRT_CHAR_MINBPC and then deliberately falls through to
BT_NMSTRT / BT_HEX, which advance ptr by MINBPC.  Multi-byte characters go
through CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC; \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
98
/* Fallback used when nothing has defined PREFIX before this point: the
function names below are then used unchanged. */
#if !defined(PREFIX)
#define PREFIX(ident) ident
#endif
102
103/* ptr points to character following "<!-" */
104
105static
106int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
107			const char **nextTokPtr)
108{
109  if (ptr != end) {
110    if (!CHAR_MATCHES(enc, ptr, '-')) {
111      *nextTokPtr = ptr;
112      return XML_TOK_INVALID;
113    }
114    ptr += MINBPC;
115    while (ptr != end) {
116      switch (BYTE_TYPE(enc, ptr)) {
117      INVALID_CASES(ptr, nextTokPtr)
118      case BT_MINUS:
119	if ((ptr += MINBPC) == end)
120	  return XML_TOK_PARTIAL;
121	if (CHAR_MATCHES(enc, ptr, '-')) {
122	  if ((ptr += MINBPC) == end)
123	    return XML_TOK_PARTIAL;
124	  if (!CHAR_MATCHES(enc, ptr, '>')) {
125	    *nextTokPtr = ptr;
126	    return XML_TOK_INVALID;
127	  }
128	  *nextTokPtr = ptr + MINBPC;
129	  return XML_TOK_COMMENT;
130	}
131	/* fall through */
132      default:
133	ptr += MINBPC;
134	break;
135      }
136    }
137  }
138  return XML_TOK_PARTIAL;
139}
140
141/* ptr points to character following "<!" */
142
143static
144int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
145		     const char **nextTokPtr)
146{
147  if (ptr == end)
148    return XML_TOK_PARTIAL;
149  switch (BYTE_TYPE(enc, ptr)) {
150  case BT_MINUS:
151    return PREFIX(scanComment)(enc, ptr + MINBPC, end, nextTokPtr);
152  case BT_LSQB:
153    *nextTokPtr = ptr + MINBPC;
154    return XML_TOK_COND_SECT_OPEN;
155  case BT_NMSTRT:
156  case BT_HEX:
157    ptr += MINBPC;
158    break;
159  default:
160    *nextTokPtr = ptr;
161    return XML_TOK_INVALID;
162  }
163  while (ptr != end) {
164    switch (BYTE_TYPE(enc, ptr)) {
165    case BT_PERCNT:
166      if (ptr + MINBPC == end)
167	return XML_TOK_PARTIAL;
168      /* don't allow <!ENTITY% foo "whatever"> */
169      switch (BYTE_TYPE(enc, ptr + MINBPC)) {
170      case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
171	*nextTokPtr = ptr;
172	return XML_TOK_INVALID;
173      }
174      /* fall through */
175    case BT_S: case BT_CR: case BT_LF:
176      *nextTokPtr = ptr;
177      return XML_TOK_DECL_OPEN;
178    case BT_NMSTRT:
179    case BT_HEX:
180      ptr += MINBPC;
181      break;
182    default:
183      *nextTokPtr = ptr;
184      return XML_TOK_INVALID;
185    }
186  }
187  return XML_TOK_PARTIAL;
188}
189
190static
191int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
192{
193  int upper = 0;
194  *tokPtr = XML_TOK_PI;
195  if (end - ptr != MINBPC*3)
196    return 1;
197  switch (BYTE_TO_ASCII(enc, ptr)) {
198  case 'x':
199    break;
200  case 'X':
201    upper = 1;
202    break;
203  default:
204    return 1;
205  }
206  ptr += MINBPC;
207  switch (BYTE_TO_ASCII(enc, ptr)) {
208  case 'm':
209    break;
210  case 'M':
211    upper = 1;
212    break;
213  default:
214    return 1;
215  }
216  ptr += MINBPC;
217  switch (BYTE_TO_ASCII(enc, ptr)) {
218  case 'l':
219    break;
220  case 'L':
221    upper = 1;
222    break;
223  default:
224    return 1;
225  }
226  if (upper)
227    return 0;
228  *tokPtr = XML_TOK_XML_DECL;
229  return 1;
230}
231
232/* ptr points to character following "<?" */
233
234static
235int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
236		   const char **nextTokPtr)
237{
238  int tok;
239  const char *target = ptr;
240  if (ptr == end)
241    return XML_TOK_PARTIAL;
242  switch (BYTE_TYPE(enc, ptr)) {
243  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
244  default:
245    *nextTokPtr = ptr;
246    return XML_TOK_INVALID;
247  }
248  while (ptr != end) {
249    switch (BYTE_TYPE(enc, ptr)) {
250    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
251    case BT_S: case BT_CR: case BT_LF:
252      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
253	*nextTokPtr = ptr;
254	return XML_TOK_INVALID;
255      }
256      ptr += MINBPC;
257      while (ptr != end) {
258        switch (BYTE_TYPE(enc, ptr)) {
259        INVALID_CASES(ptr, nextTokPtr)
260	case BT_QUEST:
261	  ptr += MINBPC;
262	  if (ptr == end)
263	    return XML_TOK_PARTIAL;
264	  if (CHAR_MATCHES(enc, ptr, '>')) {
265	    *nextTokPtr = ptr + MINBPC;
266	    return tok;
267	  }
268	  break;
269	default:
270	  ptr += MINBPC;
271	  break;
272	}
273      }
274      return XML_TOK_PARTIAL;
275    case BT_QUEST:
276      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
277	*nextTokPtr = ptr;
278	return XML_TOK_INVALID;
279      }
280      ptr += MINBPC;
281      if (ptr == end)
282	return XML_TOK_PARTIAL;
283      if (CHAR_MATCHES(enc, ptr, '>')) {
284	*nextTokPtr = ptr + MINBPC;
285	return tok;
286      }
287      /* fall through */
288    default:
289      *nextTokPtr = ptr;
290      return XML_TOK_INVALID;
291    }
292  }
293  return XML_TOK_PARTIAL;
294}
295
296
297static
298int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
299			     const char **nextTokPtr)
300{
301  int i;
302  /* CDATA[ */
303  if (end - ptr < 6 * MINBPC)
304    return XML_TOK_PARTIAL;
305  for (i = 0; i < 6; i++, ptr += MINBPC) {
306    if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) {
307      *nextTokPtr = ptr;
308      return XML_TOK_INVALID;
309    }
310  }
311  *nextTokPtr = ptr;
312  return XML_TOK_CDATA_SECT_OPEN;
313}
314
315static
316int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
317			    const char **nextTokPtr)
318{
319  if (ptr == end)
320    return XML_TOK_NONE;
321#if MINBPC > 1
322  {
323    size_t n = end - ptr;
324    if (n & (MINBPC - 1)) {
325      n &= ~(MINBPC - 1);
326      if (n == 0)
327	return XML_TOK_PARTIAL;
328      end = ptr + n;
329    }
330  }
331#endif
332  switch (BYTE_TYPE(enc, ptr)) {
333  case BT_RSQB:
334    ptr += MINBPC;
335    if (ptr == end)
336      return XML_TOK_PARTIAL;
337    if (!CHAR_MATCHES(enc, ptr, ']'))
338      break;
339    ptr += MINBPC;
340    if (ptr == end)
341      return XML_TOK_PARTIAL;
342    if (!CHAR_MATCHES(enc, ptr, '>')) {
343      ptr -= MINBPC;
344      break;
345    }
346    *nextTokPtr = ptr + MINBPC;
347    return XML_TOK_CDATA_SECT_CLOSE;
348  case BT_CR:
349    ptr += MINBPC;
350    if (ptr == end)
351      return XML_TOK_PARTIAL;
352    if (BYTE_TYPE(enc, ptr) == BT_LF)
353      ptr += MINBPC;
354    *nextTokPtr = ptr;
355    return XML_TOK_DATA_NEWLINE;
356  case BT_LF:
357    *nextTokPtr = ptr + MINBPC;
358    return XML_TOK_DATA_NEWLINE;
359  INVALID_CASES(ptr, nextTokPtr)
360  default:
361    ptr += MINBPC;
362    break;
363  }
364  while (ptr != end) {
365    switch (BYTE_TYPE(enc, ptr)) {
366#define LEAD_CASE(n) \
367    case BT_LEAD ## n: \
368      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
369	*nextTokPtr = ptr; \
370	return XML_TOK_DATA_CHARS; \
371      } \
372      ptr += n; \
373      break;
374    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
375#undef LEAD_CASE
376    case BT_NONXML:
377    case BT_MALFORM:
378    case BT_TRAIL:
379    case BT_CR:
380    case BT_LF:
381    case BT_RSQB:
382      *nextTokPtr = ptr;
383      return XML_TOK_DATA_CHARS;
384    default:
385      ptr += MINBPC;
386      break;
387    }
388  }
389  *nextTokPtr = ptr;
390  return XML_TOK_DATA_CHARS;
391}
392
393/* ptr points to character following "</" */
394
395static
396int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
397		       const char **nextTokPtr)
398{
399  if (ptr == end)
400    return XML_TOK_PARTIAL;
401  switch (BYTE_TYPE(enc, ptr)) {
402  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
403  default:
404    *nextTokPtr = ptr;
405    return XML_TOK_INVALID;
406  }
407  while (ptr != end) {
408    switch (BYTE_TYPE(enc, ptr)) {
409    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
410    case BT_S: case BT_CR: case BT_LF:
411      for (ptr += MINBPC; ptr != end; ptr += MINBPC) {
412	switch (BYTE_TYPE(enc, ptr)) {
413	case BT_S: case BT_CR: case BT_LF:
414	  break;
415	case BT_GT:
416	  *nextTokPtr = ptr + MINBPC;
417          return XML_TOK_END_TAG;
418	default:
419	  *nextTokPtr = ptr;
420	  return XML_TOK_INVALID;
421	}
422      }
423      return XML_TOK_PARTIAL;
424    case BT_GT:
425      *nextTokPtr = ptr + MINBPC;
426      return XML_TOK_END_TAG;
427    default:
428      *nextTokPtr = ptr;
429      return XML_TOK_INVALID;
430    }
431  }
432  return XML_TOK_PARTIAL;
433}
434
435/* ptr points to character following "&#X" */
436
437static
438int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
439			   const char **nextTokPtr)
440{
441  if (ptr != end) {
442    switch (BYTE_TYPE(enc, ptr)) {
443    case BT_DIGIT:
444    case BT_HEX:
445      break;
446    default:
447      *nextTokPtr = ptr;
448      return XML_TOK_INVALID;
449    }
450    for (ptr += MINBPC; ptr != end; ptr += MINBPC) {
451      switch (BYTE_TYPE(enc, ptr)) {
452      case BT_DIGIT:
453      case BT_HEX:
454	break;
455      case BT_SEMI:
456	*nextTokPtr = ptr + MINBPC;
457	return XML_TOK_CHAR_REF;
458      default:
459	*nextTokPtr = ptr;
460	return XML_TOK_INVALID;
461      }
462    }
463  }
464  return XML_TOK_PARTIAL;
465}
466
467/* ptr points to character following "&#" */
468
469static
470int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
471			const char **nextTokPtr)
472{
473  if (ptr != end) {
474    if (CHAR_MATCHES(enc, ptr, 'x'))
475      return PREFIX(scanHexCharRef)(enc, ptr + MINBPC, end, nextTokPtr);
476    switch (BYTE_TYPE(enc, ptr)) {
477    case BT_DIGIT:
478      break;
479    default:
480      *nextTokPtr = ptr;
481      return XML_TOK_INVALID;
482    }
483    for (ptr += MINBPC; ptr != end; ptr += MINBPC) {
484      switch (BYTE_TYPE(enc, ptr)) {
485      case BT_DIGIT:
486	break;
487      case BT_SEMI:
488	*nextTokPtr = ptr + MINBPC;
489	return XML_TOK_CHAR_REF;
490      default:
491	*nextTokPtr = ptr;
492	return XML_TOK_INVALID;
493      }
494    }
495  }
496  return XML_TOK_PARTIAL;
497}
498
499/* ptr points to character following "&" */
500
501static
502int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
503		    const char **nextTokPtr)
504{
505  if (ptr == end)
506    return XML_TOK_PARTIAL;
507  switch (BYTE_TYPE(enc, ptr)) {
508  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
509  case BT_NUM:
510    return PREFIX(scanCharRef)(enc, ptr + MINBPC, end, nextTokPtr);
511  default:
512    *nextTokPtr = ptr;
513    return XML_TOK_INVALID;
514  }
515  while (ptr != end) {
516    switch (BYTE_TYPE(enc, ptr)) {
517    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
518    case BT_SEMI:
519      *nextTokPtr = ptr + MINBPC;
520      return XML_TOK_ENTITY_REF;
521    default:
522      *nextTokPtr = ptr;
523      return XML_TOK_INVALID;
524    }
525  }
526  return XML_TOK_PARTIAL;
527}
528
529/* ptr points to character following first character of attribute name */
530
531static
532int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
533		     const char **nextTokPtr)
534{
535  while (ptr != end) {
536    switch (BYTE_TYPE(enc, ptr)) {
537    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
538    case BT_S: case BT_CR: case BT_LF:
539      for (;;) {
540	int t;
541
542	ptr += MINBPC;
543	if (ptr == end)
544	  return XML_TOK_PARTIAL;
545	t = BYTE_TYPE(enc, ptr);
546	if (t == BT_EQUALS)
547	  break;
548	switch (t) {
549	case BT_S:
550	case BT_LF:
551	case BT_CR:
552	  break;
553	default:
554	  *nextTokPtr = ptr;
555	  return XML_TOK_INVALID;
556	}
557      }
558    /* fall through */
559    case BT_EQUALS:
560      {
561	int open;
562	for (;;) {
563
564	  ptr += MINBPC;
565	  if (ptr == end)
566	    return XML_TOK_PARTIAL;
567	  open = BYTE_TYPE(enc, ptr);
568	  if (open == BT_QUOT || open == BT_APOS)
569	    break;
570	  switch (open) {
571	  case BT_S:
572	  case BT_LF:
573	  case BT_CR:
574	    break;
575	  default:
576	    *nextTokPtr = ptr;
577	    return XML_TOK_INVALID;
578	  }
579	}
580	ptr += MINBPC;
581	/* in attribute value */
582	for (;;) {
583	  int t;
584	  if (ptr == end)
585	    return XML_TOK_PARTIAL;
586	  t = BYTE_TYPE(enc, ptr);
587	  if (t == open)
588	    break;
589	  switch (t) {
590	  INVALID_CASES(ptr, nextTokPtr)
591	  case BT_AMP:
592	    {
593	      int tok = PREFIX(scanRef)(enc, ptr + MINBPC, end, &ptr);
594	      if (tok <= 0) {
595		if (tok == XML_TOK_INVALID)
596		  *nextTokPtr = ptr;
597		return tok;
598	      }
599	      break;
600	    }
601	  case BT_LT:
602	    *nextTokPtr = ptr;
603	    return XML_TOK_INVALID;
604	  default:
605	    ptr += MINBPC;
606	    break;
607	  }
608	}
609	ptr += MINBPC;
610	if (ptr == end)
611	  return XML_TOK_PARTIAL;
612	switch (BYTE_TYPE(enc, ptr)) {
613	case BT_S:
614	case BT_CR:
615	case BT_LF:
616	  break;
617	case BT_SOL:
618	  goto sol;
619	case BT_GT:
620	  goto gt;
621	default:
622	  *nextTokPtr = ptr;
623	  return XML_TOK_INVALID;
624	}
625	/* ptr points to closing quote */
626	for (;;) {
627	  ptr += MINBPC;
628	  if (ptr == end)
629	    return XML_TOK_PARTIAL;
630	  switch (BYTE_TYPE(enc, ptr)) {
631	  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
632	  case BT_S: case BT_CR: case BT_LF:
633	    continue;
634	  case BT_GT:
635          gt:
636	    *nextTokPtr = ptr + MINBPC;
637	    return XML_TOK_START_TAG_WITH_ATTS;
638	  case BT_SOL:
639          sol:
640	    ptr += MINBPC;
641	    if (ptr == end)
642	      return XML_TOK_PARTIAL;
643	    if (!CHAR_MATCHES(enc, ptr, '>')) {
644	      *nextTokPtr = ptr;
645	      return XML_TOK_INVALID;
646	    }
647	    *nextTokPtr = ptr + MINBPC;
648	    return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
649	  default:
650	    *nextTokPtr = ptr;
651	    return XML_TOK_INVALID;
652	  }
653	  break;
654	}
655	break;
656      }
657    default:
658      *nextTokPtr = ptr;
659      return XML_TOK_INVALID;
660    }
661  }
662  return XML_TOK_PARTIAL;
663}
664
665/* ptr points to character following "<" */
666
667static
668int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
669		   const char **nextTokPtr)
670{
671  if (ptr == end)
672    return XML_TOK_PARTIAL;
673  switch (BYTE_TYPE(enc, ptr)) {
674  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
675  case BT_EXCL:
676    if ((ptr += MINBPC) == end)
677      return XML_TOK_PARTIAL;
678    switch (BYTE_TYPE(enc, ptr)) {
679    case BT_MINUS:
680      return PREFIX(scanComment)(enc, ptr + MINBPC, end, nextTokPtr);
681    case BT_LSQB:
682      return PREFIX(scanCdataSection)(enc, ptr + MINBPC, end, nextTokPtr);
683    }
684    *nextTokPtr = ptr;
685    return XML_TOK_INVALID;
686  case BT_QUEST:
687    return PREFIX(scanPi)(enc, ptr + MINBPC, end, nextTokPtr);
688  case BT_SOL:
689    return PREFIX(scanEndTag)(enc, ptr + MINBPC, end, nextTokPtr);
690  default:
691    *nextTokPtr = ptr;
692    return XML_TOK_INVALID;
693  }
694  /* we have a start-tag */
695  while (ptr != end) {
696    switch (BYTE_TYPE(enc, ptr)) {
697    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
698    case BT_S: case BT_CR: case BT_LF:
699      {
700        ptr += MINBPC;
701	while (ptr != end) {
702	  switch (BYTE_TYPE(enc, ptr)) {
703	  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
704	  case BT_GT:
705	    goto gt;
706	  case BT_SOL:
707	    goto sol;
708	  case BT_S: case BT_CR: case BT_LF:
709	    ptr += MINBPC;
710	    continue;
711	  default:
712	    *nextTokPtr = ptr;
713	    return XML_TOK_INVALID;
714	  }
715	  return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
716	}
717	return XML_TOK_PARTIAL;
718      }
719    case BT_GT:
720    gt:
721      *nextTokPtr = ptr + MINBPC;
722      return XML_TOK_START_TAG_NO_ATTS;
723    case BT_SOL:
724    sol:
725      ptr += MINBPC;
726      if (ptr == end)
727	return XML_TOK_PARTIAL;
728      if (!CHAR_MATCHES(enc, ptr, '>')) {
729	*nextTokPtr = ptr;
730	return XML_TOK_INVALID;
731      }
732      *nextTokPtr = ptr + MINBPC;
733      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
734    default:
735      *nextTokPtr = ptr;
736      return XML_TOK_INVALID;
737    }
738  }
739  return XML_TOK_PARTIAL;
740}
741
742static
743int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
744		       const char **nextTokPtr)
745{
746  if (ptr == end)
747    return XML_TOK_NONE;
748#if MINBPC > 1
749  {
750    size_t n = end - ptr;
751    if (n & (MINBPC - 1)) {
752      n &= ~(MINBPC - 1);
753      if (n == 0)
754	return XML_TOK_PARTIAL;
755      end = ptr + n;
756    }
757  }
758#endif
759  switch (BYTE_TYPE(enc, ptr)) {
760  case BT_LT:
761    return PREFIX(scanLt)(enc, ptr + MINBPC, end, nextTokPtr);
762  case BT_AMP:
763    return PREFIX(scanRef)(enc, ptr + MINBPC, end, nextTokPtr);
764  case BT_CR:
765    ptr += MINBPC;
766    if (ptr == end)
767      return XML_TOK_TRAILING_CR;
768    if (BYTE_TYPE(enc, ptr) == BT_LF)
769      ptr += MINBPC;
770    *nextTokPtr = ptr;
771    return XML_TOK_DATA_NEWLINE;
772  case BT_LF:
773    *nextTokPtr = ptr + MINBPC;
774    return XML_TOK_DATA_NEWLINE;
775  case BT_RSQB:
776    ptr += MINBPC;
777    if (ptr == end)
778      return XML_TOK_TRAILING_RSQB;
779    if (!CHAR_MATCHES(enc, ptr, ']'))
780      break;
781    ptr += MINBPC;
782    if (ptr == end)
783      return XML_TOK_TRAILING_RSQB;
784    if (!CHAR_MATCHES(enc, ptr, '>')) {
785      ptr -= MINBPC;
786      break;
787    }
788    *nextTokPtr = ptr;
789    return XML_TOK_INVALID;
790  INVALID_CASES(ptr, nextTokPtr)
791  default:
792    ptr += MINBPC;
793    break;
794  }
795  while (ptr != end) {
796    switch (BYTE_TYPE(enc, ptr)) {
797#define LEAD_CASE(n) \
798    case BT_LEAD ## n: \
799      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
800	*nextTokPtr = ptr; \
801	return XML_TOK_DATA_CHARS; \
802      } \
803      ptr += n; \
804      break;
805    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
806#undef LEAD_CASE
807    case BT_RSQB:
808      if (ptr + MINBPC != end) {
809	 if (!CHAR_MATCHES(enc, ptr + MINBPC, ']')) {
810	   ptr += MINBPC;
811	   break;
812	 }
813	 if (ptr + 2*MINBPC != end) {
814	   if (!CHAR_MATCHES(enc, ptr + 2*MINBPC, '>')) {
815	     ptr += MINBPC;
816	     break;
817	   }
818	   *nextTokPtr = ptr + 2*MINBPC;
819	   return XML_TOK_INVALID;
820	 }
821      }
822      /* fall through */
823    case BT_AMP:
824    case BT_LT:
825    case BT_NONXML:
826    case BT_MALFORM:
827    case BT_TRAIL:
828    case BT_CR:
829    case BT_LF:
830      *nextTokPtr = ptr;
831      return XML_TOK_DATA_CHARS;
832    default:
833      ptr += MINBPC;
834      break;
835    }
836  }
837  *nextTokPtr = ptr;
838  return XML_TOK_DATA_CHARS;
839}
840
841/* ptr points to character following "%" */
842
843static
844int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
845			const char **nextTokPtr)
846{
847  if (ptr == end)
848    return XML_TOK_PARTIAL;
849  switch (BYTE_TYPE(enc, ptr)) {
850  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
851  case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
852    *nextTokPtr = ptr;
853    return XML_TOK_PERCENT;
854  default:
855    *nextTokPtr = ptr;
856    return XML_TOK_INVALID;
857  }
858  while (ptr != end) {
859    switch (BYTE_TYPE(enc, ptr)) {
860    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
861    case BT_SEMI:
862      *nextTokPtr = ptr + MINBPC;
863      return XML_TOK_PARAM_ENTITY_REF;
864    default:
865      *nextTokPtr = ptr;
866      return XML_TOK_INVALID;
867    }
868  }
869  return XML_TOK_PARTIAL;
870}
871
872static
873int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
874			  const char **nextTokPtr)
875{
876  if (ptr == end)
877    return XML_TOK_PARTIAL;
878  switch (BYTE_TYPE(enc, ptr)) {
879  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
880  default:
881    *nextTokPtr = ptr;
882    return XML_TOK_INVALID;
883  }
884  while (ptr != end) {
885    switch (BYTE_TYPE(enc, ptr)) {
886    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
887    case BT_CR: case BT_LF: case BT_S:
888    case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
889      *nextTokPtr = ptr;
890      return XML_TOK_POUND_NAME;
891    default:
892      *nextTokPtr = ptr;
893      return XML_TOK_INVALID;
894    }
895  }
896  return XML_TOK_PARTIAL;
897}
898
899static
900int PREFIX(scanLit)(int open, const ENCODING *enc,
901		    const char *ptr, const char *end,
902		    const char **nextTokPtr)
903{
904  while (ptr != end) {
905    int t = BYTE_TYPE(enc, ptr);
906    switch (t) {
907    INVALID_CASES(ptr, nextTokPtr)
908    case BT_QUOT:
909    case BT_APOS:
910      ptr += MINBPC;
911      if (t != open)
912	break;
913      if (ptr == end)
914	return XML_TOK_PARTIAL;
915      *nextTokPtr = ptr;
916      switch (BYTE_TYPE(enc, ptr)) {
917      case BT_S: case BT_CR: case BT_LF:
918      case BT_GT: case BT_PERCNT: case BT_LSQB:
919	return XML_TOK_LITERAL;
920      default:
921	return XML_TOK_INVALID;
922      }
923    default:
924      ptr += MINBPC;
925      break;
926    }
927  }
928  return XML_TOK_PARTIAL;
929}
930
931static
932int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
933		      const char **nextTokPtr)
934{
935  int tok;
936  if (ptr == end)
937    return XML_TOK_NONE;
938#if MINBPC > 1
939  {
940    size_t n = end - ptr;
941    if (n & (MINBPC - 1)) {
942      n &= ~(MINBPC - 1);
943      if (n == 0)
944	return XML_TOK_PARTIAL;
945      end = ptr + n;
946    }
947  }
948#endif
949  switch (BYTE_TYPE(enc, ptr)) {
950  case BT_QUOT:
951    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC, end, nextTokPtr);
952  case BT_APOS:
953    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC, end, nextTokPtr);
954  case BT_LT:
955    {
956      ptr += MINBPC;
957      if (ptr == end)
958	return XML_TOK_PARTIAL;
959      switch (BYTE_TYPE(enc, ptr)) {
960      case BT_EXCL:
961	return PREFIX(scanDecl)(enc, ptr + MINBPC, end, nextTokPtr);
962      case BT_QUEST:
963	return PREFIX(scanPi)(enc, ptr + MINBPC, end, nextTokPtr);
964      case BT_NMSTRT:
965      case BT_HEX:
966      case BT_NONASCII:
967      case BT_LEAD2:
968      case BT_LEAD3:
969      case BT_LEAD4:
970	*nextTokPtr = ptr - MINBPC;
971	return XML_TOK_INSTANCE_START;
972      }
973      *nextTokPtr = ptr;
974      return XML_TOK_INVALID;
975    }
976  case BT_CR:
977    if (ptr + MINBPC == end)
978      return XML_TOK_TRAILING_CR;
979    /* fall through */
980  case BT_S: case BT_LF:
981    for (;;) {
982      ptr += MINBPC;
983      if (ptr == end)
984	break;
985      switch (BYTE_TYPE(enc, ptr)) {
986      case BT_S: case BT_LF:
987	break;
988      case BT_CR:
989	/* don't split CR/LF pair */
990	if (ptr + MINBPC != end)
991	  break;
992	/* fall through */
993      default:
994	*nextTokPtr = ptr;
995	return XML_TOK_PROLOG_S;
996      }
997    }
998    *nextTokPtr = ptr;
999    return XML_TOK_PROLOG_S;
1000  case BT_PERCNT:
1001    return PREFIX(scanPercent)(enc, ptr + MINBPC, end, nextTokPtr);
1002  case BT_COMMA:
1003    *nextTokPtr = ptr + MINBPC;
1004    return XML_TOK_COMMA;
1005  case BT_LSQB:
1006    *nextTokPtr = ptr + MINBPC;
1007    return XML_TOK_OPEN_BRACKET;
1008  case BT_RSQB:
1009    ptr += MINBPC;
1010    if (ptr == end)
1011      return XML_TOK_PARTIAL;
1012    if (CHAR_MATCHES(enc, ptr, ']')) {
1013      if (ptr + MINBPC == end)
1014	return XML_TOK_PARTIAL;
1015      if (CHAR_MATCHES(enc, ptr + MINBPC, '>')) {
1016	*nextTokPtr = ptr + 2*MINBPC;
1017	return XML_TOK_COND_SECT_CLOSE;
1018      }
1019    }
1020    *nextTokPtr = ptr;
1021    return XML_TOK_CLOSE_BRACKET;
1022  case BT_LPAR:
1023    *nextTokPtr = ptr + MINBPC;
1024    return XML_TOK_OPEN_PAREN;
1025  case BT_RPAR:
1026    ptr += MINBPC;
1027    if (ptr == end)
1028      return XML_TOK_PARTIAL;
1029    switch (BYTE_TYPE(enc, ptr)) {
1030    case BT_AST:
1031      *nextTokPtr = ptr + MINBPC;
1032      return XML_TOK_CLOSE_PAREN_ASTERISK;
1033    case BT_QUEST:
1034      *nextTokPtr = ptr + MINBPC;
1035      return XML_TOK_CLOSE_PAREN_QUESTION;
1036    case BT_PLUS:
1037      *nextTokPtr = ptr + MINBPC;
1038      return XML_TOK_CLOSE_PAREN_PLUS;
1039    case BT_CR: case BT_LF: case BT_S:
1040    case BT_GT: case BT_COMMA: case BT_VERBAR:
1041    case BT_RPAR:
1042      *nextTokPtr = ptr;
1043      return XML_TOK_CLOSE_PAREN;
1044    }
1045    *nextTokPtr = ptr;
1046    return XML_TOK_INVALID;
1047  case BT_VERBAR:
1048    *nextTokPtr = ptr + MINBPC;
1049    return XML_TOK_OR;
1050  case BT_GT:
1051    *nextTokPtr = ptr + MINBPC;
1052    return XML_TOK_DECL_CLOSE;
1053  case BT_NUM:
1054    return PREFIX(scanPoundName)(enc, ptr + MINBPC, end, nextTokPtr);
1055#define LEAD_CASE(n) \
1056  case BT_LEAD ## n: \
1057    if (end - ptr < n) \
1058      return XML_TOK_PARTIAL_CHAR; \
1059    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1060      ptr += n; \
1061      tok = XML_TOK_NAME; \
1062      break; \
1063    } \
1064    if (IS_NAME_CHAR(enc, ptr, n)) { \
1065      ptr += n; \
1066      tok = XML_TOK_NMTOKEN; \
1067      break; \
1068    } \
1069    *nextTokPtr = ptr; \
1070    return XML_TOK_INVALID;
1071    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1072#undef LEAD_CASE
1073  case BT_NMSTRT:
1074  case BT_HEX:
1075    tok = XML_TOK_NAME;
1076    ptr += MINBPC;
1077    break;
1078  case BT_DIGIT:
1079  case BT_NAME:
1080  case BT_MINUS:
1081    tok = XML_TOK_NMTOKEN;
1082    ptr += MINBPC;
1083    break;
1084  case BT_NONASCII:
1085    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1086      ptr += MINBPC;
1087      tok = XML_TOK_NAME;
1088      break;
1089    }
1090    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1091      ptr += MINBPC;
1092      tok = XML_TOK_NMTOKEN;
1093      break;
1094    }
1095    /* fall through */
1096  default:
1097    *nextTokPtr = ptr;
1098    return XML_TOK_INVALID;
1099  }
1100  while (ptr != end) {
1101    switch (BYTE_TYPE(enc, ptr)) {
1102    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1103    case BT_GT: case BT_RPAR: case BT_COMMA:
1104    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1105    case BT_S: case BT_CR: case BT_LF:
1106      *nextTokPtr = ptr;
1107      return tok;
1108    case BT_PLUS:
1109      if (tok != XML_TOK_NAME)  {
1110	*nextTokPtr = ptr;
1111	return XML_TOK_INVALID;
1112      }
1113      *nextTokPtr = ptr + MINBPC;
1114      return XML_TOK_NAME_PLUS;
1115    case BT_AST:
1116      if (tok != XML_TOK_NAME)  {
1117	*nextTokPtr = ptr;
1118	return XML_TOK_INVALID;
1119      }
1120      *nextTokPtr = ptr + MINBPC;
1121      return XML_TOK_NAME_ASTERISK;
1122    case BT_QUEST:
1123      if (tok != XML_TOK_NAME)  {
1124	*nextTokPtr = ptr;
1125	return XML_TOK_INVALID;
1126      }
1127      *nextTokPtr = ptr + MINBPC;
1128      return XML_TOK_NAME_QUESTION;
1129    default:
1130      *nextTokPtr = ptr;
1131      return XML_TOK_INVALID;
1132    }
1133  }
1134  return XML_TOK_PARTIAL;
1135}
1136
1137static
1138int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1139			      const char **nextTokPtr)
1140{
1141  const char *start;
1142  if (ptr == end)
1143    return XML_TOK_NONE;
1144  start = ptr;
1145  while (ptr != end) {
1146    switch (BYTE_TYPE(enc, ptr)) {
1147#define LEAD_CASE(n) \
1148    case BT_LEAD ## n: ptr += n; break;
1149    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1150#undef LEAD_CASE
1151    case BT_AMP:
1152      if (ptr == start)
1153	return PREFIX(scanRef)(enc, ptr + MINBPC, end, nextTokPtr);
1154      *nextTokPtr = ptr;
1155      return XML_TOK_DATA_CHARS;
1156    case BT_LT:
1157      /* this is for inside entity references */
1158      *nextTokPtr = ptr;
1159      return XML_TOK_INVALID;
1160    case BT_LF:
1161      if (ptr == start) {
1162	*nextTokPtr = ptr + MINBPC;
1163	return XML_TOK_DATA_NEWLINE;
1164      }
1165      *nextTokPtr = ptr;
1166      return XML_TOK_DATA_CHARS;
1167    case BT_CR:
1168      if (ptr == start) {
1169	ptr += MINBPC;
1170	if (ptr == end)
1171	  return XML_TOK_TRAILING_CR;
1172	if (BYTE_TYPE(enc, ptr) == BT_LF)
1173	  ptr += MINBPC;
1174	*nextTokPtr = ptr;
1175	return XML_TOK_DATA_NEWLINE;
1176      }
1177      *nextTokPtr = ptr;
1178      return XML_TOK_DATA_CHARS;
1179    case BT_S:
1180      if (ptr == start) {
1181	*nextTokPtr = ptr + MINBPC;
1182	return XML_TOK_ATTRIBUTE_VALUE_S;
1183      }
1184      *nextTokPtr = ptr;
1185      return XML_TOK_DATA_CHARS;
1186    default:
1187      ptr += MINBPC;
1188      break;
1189    }
1190  }
1191  *nextTokPtr = ptr;
1192  return XML_TOK_DATA_CHARS;
1193}
1194
1195static
1196int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1197			   const char **nextTokPtr)
1198{
1199  const char *start;
1200  if (ptr == end)
1201    return XML_TOK_NONE;
1202  start = ptr;
1203  while (ptr != end) {
1204    switch (BYTE_TYPE(enc, ptr)) {
1205#define LEAD_CASE(n) \
1206    case BT_LEAD ## n: ptr += n; break;
1207    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1208#undef LEAD_CASE
1209    case BT_AMP:
1210      if (ptr == start)
1211	return PREFIX(scanRef)(enc, ptr + MINBPC, end, nextTokPtr);
1212      *nextTokPtr = ptr;
1213      return XML_TOK_DATA_CHARS;
1214    case BT_PERCNT:
1215      if (ptr == start)
1216	return PREFIX(scanPercent)(enc, ptr + MINBPC, end, nextTokPtr);
1217      *nextTokPtr = ptr;
1218      return XML_TOK_DATA_CHARS;
1219    case BT_LF:
1220      if (ptr == start) {
1221	*nextTokPtr = ptr + MINBPC;
1222	return XML_TOK_DATA_NEWLINE;
1223      }
1224      *nextTokPtr = ptr;
1225      return XML_TOK_DATA_CHARS;
1226    case BT_CR:
1227      if (ptr == start) {
1228	ptr += MINBPC;
1229	if (ptr == end)
1230	  return XML_TOK_TRAILING_CR;
1231	if (BYTE_TYPE(enc, ptr) == BT_LF)
1232	  ptr += MINBPC;
1233	*nextTokPtr = ptr;
1234	return XML_TOK_DATA_NEWLINE;
1235      }
1236      *nextTokPtr = ptr;
1237      return XML_TOK_DATA_CHARS;
1238    default:
1239      ptr += MINBPC;
1240      break;
1241    }
1242  }
1243  *nextTokPtr = ptr;
1244  return XML_TOK_DATA_CHARS;
1245}
1246
1247static
1248int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1249		       const char **badPtr)
1250{
1251  ptr += MINBPC;
1252  end -= MINBPC;
1253  for (; ptr != end; ptr += MINBPC) {
1254    switch (BYTE_TYPE(enc, ptr)) {
1255    case BT_DIGIT:
1256    case BT_HEX:
1257    case BT_MINUS:
1258    case BT_APOS:
1259    case BT_LPAR:
1260    case BT_RPAR:
1261    case BT_PLUS:
1262    case BT_COMMA:
1263    case BT_SOL:
1264    case BT_EQUALS:
1265    case BT_QUEST:
1266    case BT_CR:
1267    case BT_LF:
1268    case BT_SEMI:
1269    case BT_EXCL:
1270    case BT_AST:
1271    case BT_PERCNT:
1272    case BT_NUM:
1273      break;
1274    case BT_S:
1275      if (CHAR_MATCHES(enc, ptr, '\t')) {
1276	*badPtr = ptr;
1277	return 0;
1278      }
1279      break;
1280    case BT_NAME:
1281    case BT_NMSTRT:
1282      if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1283	break;
1284    default:
1285      switch (BYTE_TO_ASCII(enc, ptr)) {
1286      case 0x24: /* $ */
1287      case 0x40: /* @ */
1288	break;
1289      default:
1290	*badPtr = ptr;
1291	return 0;
1292      }
1293      break;
1294    }
1295  }
1296  return 1;
1297}
1298
1299/* This must only be called for a well-formed start-tag or empty element tag.
1300Returns the number of attributes.  Pointers to the first attsMax attributes
1301are stored in atts. */
1302
1303static
1304int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1305		    int attsMax, ATTRIBUTE *atts)
1306{
1307  enum { other, inName, inValue } state = inName;
1308  int nAtts = 0;
1309  int open = 0;
1310
1311  for (ptr += MINBPC;; ptr += MINBPC) {
1312    switch (BYTE_TYPE(enc, ptr)) {
1313#define START_NAME \
1314      if (state == other) { \
1315	if (nAtts < attsMax) { \
1316	  atts[nAtts].name = ptr; \
1317	  atts[nAtts].normalized = 1; \
1318	} \
1319	state = inName; \
1320      }
1321#define LEAD_CASE(n) \
1322    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC); break;
1323    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1324#undef LEAD_CASE
1325    case BT_NONASCII:
1326    case BT_NMSTRT:
1327    case BT_HEX:
1328      START_NAME
1329      break;
1330#undef START_NAME
1331    case BT_QUOT:
1332      if (state != inValue) {
1333	atts[nAtts].valuePtr = ptr + MINBPC;
1334        state = inValue;
1335        open = BT_QUOT;
1336      }
1337      else if (open == BT_QUOT) {
1338        state = other;
1339	atts[nAtts++].valueEnd = ptr;
1340      }
1341      break;
1342    case BT_APOS:
1343      if (state != inValue) {
1344	atts[nAtts].valuePtr = ptr + MINBPC;
1345        state = inValue;
1346        open = BT_APOS;
1347      }
1348      else if (open == BT_APOS) {
1349        state = other;
1350	atts[nAtts++].valueEnd = ptr;
1351      }
1352      break;
1353    case BT_AMP:
1354      atts[nAtts].normalized = 0;
1355      break;
1356    case BT_S:
1357      if (state == inName)
1358        state = other;
1359      else if (state == inValue
1360	       && atts[nAtts].normalized
1361	       && (ptr == atts[nAtts].valuePtr
1362		   || BYTE_TO_ASCII(enc, ptr) != ' '
1363		   || BYTE_TO_ASCII(enc, ptr + MINBPC) == ' '
1364	           || BYTE_TYPE(enc, ptr + MINBPC) == open))
1365	atts[nAtts].normalized = 0;
1366      break;
1367    case BT_CR: case BT_LF:
1368      /* This case ensures that the first attribute name is counted
1369         Apart from that we could just change state on the quote. */
1370      if (state == inName)
1371        state = other;
1372      else if (state == inValue)
1373	atts[nAtts].normalized = 0;
1374      break;
1375    case BT_GT:
1376    case BT_SOL:
1377      if (state != inValue)
1378	return nAtts;
1379      break;
1380    default:
1381      break;
1382    }
1383  }
1384  /* not reached */
1385}
1386
1387static
1388int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1389{
1390  int result = 0;
1391  /* skip &# */
1392  ptr += 2*MINBPC;
1393  if (CHAR_MATCHES(enc, ptr, 'x')) {
1394    for (ptr += MINBPC; !CHAR_MATCHES(enc, ptr, ';'); ptr += MINBPC) {
1395      int c = BYTE_TO_ASCII(enc, ptr);
1396      switch (c) {
1397      case '0': case '1': case '2': case '3': case '4':
1398      case '5': case '6': case '7': case '8': case '9':
1399	result <<= 4;
1400	result |= (c - '0');
1401	break;
1402      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1403	result <<= 4;
1404	result += 10 + (c - 'A');
1405	break;
1406      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1407	result <<= 4;
1408	result += 10 + (c - 'a');
1409	break;
1410      }
1411      if (result >= 0x110000)
1412	return -1;
1413    }
1414  }
1415  else {
1416    for (; !CHAR_MATCHES(enc, ptr, ';'); ptr += MINBPC) {
1417      int c = BYTE_TO_ASCII(enc, ptr);
1418      result *= 10;
1419      result += (c - '0');
1420      if (result >= 0x110000)
1421	return -1;
1422    }
1423  }
1424  return checkCharRefNumber(result);
1425}
1426
1427static
1428int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
1429{
1430  switch (end - ptr) {
1431  case 2 * MINBPC:
1432    if (CHAR_MATCHES(enc, ptr + MINBPC, 't')) {
1433      switch (BYTE_TO_ASCII(enc, ptr)) {
1434      case 'l':
1435	return '<';
1436      case 'g':
1437	return '>';
1438      }
1439    }
1440    break;
1441  case 3 * MINBPC:
1442    if (CHAR_MATCHES(enc, ptr, 'a')) {
1443      ptr += MINBPC;
1444      if (CHAR_MATCHES(enc, ptr, 'm')) {
1445	ptr += MINBPC;
1446	if (CHAR_MATCHES(enc, ptr, 'p'))
1447	  return '&';
1448      }
1449    }
1450    break;
1451  case 4 * MINBPC:
1452    switch (BYTE_TO_ASCII(enc, ptr)) {
1453    case 'q':
1454      ptr += MINBPC;
1455      if (CHAR_MATCHES(enc, ptr, 'u')) {
1456	ptr += MINBPC;
1457	if (CHAR_MATCHES(enc, ptr, 'o')) {
1458	  ptr += MINBPC;
1459  	  if (CHAR_MATCHES(enc, ptr, 't'))
1460	    return '"';
1461	}
1462      }
1463      break;
1464    case 'a':
1465      ptr += MINBPC;
1466      if (CHAR_MATCHES(enc, ptr, 'p')) {
1467	ptr += MINBPC;
1468	if (CHAR_MATCHES(enc, ptr, 'o')) {
1469	  ptr += MINBPC;
1470  	  if (CHAR_MATCHES(enc, ptr, 's'))
1471	    return '\'';
1472	}
1473      }
1474      break;
1475    }
1476  }
1477  return 0;
1478}
1479
1480static
1481int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1482{
1483  for (;;) {
1484    switch (BYTE_TYPE(enc, ptr1)) {
1485#define LEAD_CASE(n) \
1486    case BT_LEAD ## n: \
1487      if (*ptr1++ != *ptr2++) \
1488	return 0;
1489    LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1490#undef LEAD_CASE
1491      /* fall through */
1492      if (*ptr1++ != *ptr2++)
1493	return 0;
1494      break;
1495    case BT_NONASCII:
1496    case BT_NMSTRT:
1497    case BT_HEX:
1498    case BT_DIGIT:
1499    case BT_NAME:
1500    case BT_MINUS:
1501      if (*ptr2++ != *ptr1++)
1502	return 0;
1503#if MINBPC > 1
1504      if (*ptr2++ != *ptr1++)
1505	return 0;
1506#if MINBPC > 2
1507      if (*ptr2++ != *ptr1++)
1508	return 0;
1509#if MINBPC > 3
1510      if (*ptr2++ != *ptr1++)
1511	return 0;
1512#endif
1513#endif
1514#endif
1515      break;
1516    default:
1517#if MINBPC == 1
1518      if (*ptr1 == *ptr2)
1519	return 1;
1520#endif
1521      switch (BYTE_TYPE(enc, ptr2)) {
1522      case BT_LEAD2:
1523      case BT_LEAD3:
1524      case BT_LEAD4:
1525      case BT_NONASCII:
1526      case BT_NMSTRT:
1527      case BT_HEX:
1528      case BT_DIGIT:
1529      case BT_NAME:
1530      case BT_MINUS:
1531	return 0;
1532      default:
1533	return 1;
1534      }
1535    }
1536  }
1537  /* not reached */
1538}
1539
1540static
1541int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1542{
1543  for (; *ptr2; ptr1 += MINBPC, ptr2++) {
1544    if (!CHAR_MATCHES(end, ptr1, *ptr2))
1545      return 0;
1546  }
1547  switch (BYTE_TYPE(enc, ptr1)) {
1548  case BT_LEAD2:
1549  case BT_LEAD3:
1550  case BT_LEAD4:
1551  case BT_NONASCII:
1552  case BT_NMSTRT:
1553  case BT_HEX:
1554  case BT_DIGIT:
1555  case BT_NAME:
1556  case BT_MINUS:
1557    return 0;
1558  default:
1559    return 1;
1560  }
1561}
1562
1563static
1564int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1565{
1566  const char *start = ptr;
1567  for (;;) {
1568    switch (BYTE_TYPE(enc, ptr)) {
1569#define LEAD_CASE(n) \
1570    case BT_LEAD ## n: ptr += n; break;
1571    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1572#undef LEAD_CASE
1573    case BT_NONASCII:
1574    case BT_NMSTRT:
1575    case BT_HEX:
1576    case BT_DIGIT:
1577    case BT_NAME:
1578    case BT_MINUS:
1579      ptr += MINBPC;
1580      break;
1581    default:
1582      return ptr - start;
1583    }
1584  }
1585}
1586
1587static
1588const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1589{
1590  for (;;) {
1591    switch (BYTE_TYPE(enc, ptr)) {
1592    case BT_LF:
1593    case BT_CR:
1594    case BT_S:
1595      ptr += MINBPC;
1596      break;
1597    default:
1598      return ptr;
1599    }
1600  }
1601}
1602
1603static
1604void PREFIX(updatePosition)(const ENCODING *enc,
1605			    const char *ptr,
1606			    const char *end,
1607			    POSITION *pos)
1608{
1609  while (ptr != end) {
1610    switch (BYTE_TYPE(enc, ptr)) {
1611#define LEAD_CASE(n) \
1612    case BT_LEAD ## n: \
1613      ptr += n; \
1614      break;
1615    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1616#undef LEAD_CASE
1617    case BT_LF:
1618      pos->columnNumber = (unsigned)-1;
1619      pos->lineNumber++;
1620      ptr += MINBPC;
1621      break;
1622    case BT_CR:
1623      pos->lineNumber++;
1624      ptr += MINBPC;
1625      if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1626	ptr += MINBPC;
1627      pos->columnNumber = (unsigned)-1;
1628      break;
1629    default:
1630      ptr += MINBPC;
1631      break;
1632    }
1633    pos->columnNumber++;
1634  }
1635}
1636
1637#undef DO_LEAD_CASE
1638#undef MULTIBYTE_CASES
1639#undef INVALID_CASES
1640#undef CHECK_NAME_CASE
1641#undef CHECK_NAME_CASES
1642#undef CHECK_NMSTRT_CASE
1643#undef CHECK_NMSTRT_CASES
1644