1/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2   See the file COPYING for copying permission.
3*/
4
/* Fallback used when the including file supplies no validity predicate of
   its own: with this definition no character is ever reported invalid. */
#if !defined(IS_INVALID_CHAR)
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
8
/* Expands to the switch case for the lead byte of an n-byte character
   (BT_LEAD<n>).  Relies on `enc` and `end` being in scope at the point of
   expansion.
     - fewer than n bytes left before end: returns XML_TOK_PARTIAL_CHAR;
     - IS_INVALID_CHAR accepts the sequence: ptr advances by n bytes and
       the case breaks out of the switch;
     - otherwise: stores ptr in *nextTokPtr and returns XML_TOK_INVALID. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if ((end) - (ptr) < (n)) \
        return XML_TOK_PARTIAL_CHAR; \
      if (!IS_INVALID_CHAR(enc, ptr, n)) { \
        (ptr) += (n); \
        break; \
      } \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID;
19
/* Switch cases shared by the scanners that only care whether a character
   is well formed: the 2-, 3- and 4-byte lead cases are validated through
   INVALID_LEAD_CASE, and a BT_NONXML, BT_MALFORM or BT_TRAIL byte stores
   ptr in *nextTokPtr and returns XML_TOK_INVALID. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_TRAIL: \
  case BT_MALFORM: \
  case BT_NONXML: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
29
/* Expands to the switch case for the lead byte of an n-byte character
   (BT_LEAD<n>) at a position where a name character is required.
     - fewer than n bytes left before end: returns XML_TOK_PARTIAL_CHAR;
     - the byte sequence is rejected by IS_INVALID_CHAR, or the character
       is not a name character: stores ptr in *nextTokPtr and returns
       XML_TOK_INVALID;
     - otherwise ptr advances by n bytes and the case breaks.
   The IS_INVALID_CHAR test must come first: IS_NAME_CHAR classifies the
   character and must not be handed a sequence that has not been checked
   for validity. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < (n)) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;
40
/* Switch cases that accept one name character at ptr and step over it.
   A BT_NONASCII unit is first checked with IS_NAME_CHAR_MINBPC (failure
   stores ptr in *nextTokPtr and returns XML_TOK_INVALID) and then shares
   the single-unit advance with the other name byte types.  Multi-byte
   lead bytes are delegated to CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_MINUS: \
  case BT_NAME: \
  case BT_DIGIT: \
  case BT_HEX: \
  case BT_NMSTRT: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
57
/* Expands to the switch case for the lead byte of an n-byte character
   (BT_LEAD<n>) at a position where a name-start character is required.
     - fewer than n bytes left before end: returns XML_TOK_PARTIAL_CHAR;
     - the byte sequence is rejected by IS_INVALID_CHAR, or the character
       is not a name-start character: stores ptr in *nextTokPtr and
       returns XML_TOK_INVALID;
     - otherwise ptr advances by n bytes and the case breaks.
   The IS_INVALID_CHAR test must come first: IS_NMSTRT_CHAR classifies the
   character and must not be handed a sequence that has not been checked
   for validity. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < (n)) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;
68
/* Switch cases that accept one name-start character at ptr and step over
   it.  A BT_NONASCII unit is first checked with IS_NMSTRT_CHAR_MINBPC
   (failure stores ptr in *nextTokPtr and returns XML_TOK_INVALID) and then
   shares the single-unit advance with BT_NMSTRT and BT_HEX.  Multi-byte
   lead bytes are delegated to CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_HEX: \
  case BT_NMSTRT: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
82
/* Wraps the name of every function defined below.  Unless the including
   file has already defined PREFIX, the names are used unchanged. */
#if !defined(PREFIX)
#define PREFIX(ident) ident
#endif
86
87/* ptr points to character following "<!-" */
88
89static int PTRCALL
90PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
91                    const char *end, const char **nextTokPtr)
92{
93  if (ptr != end) {
94    if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
95      *nextTokPtr = ptr;
96      return XML_TOK_INVALID;
97    }
98    ptr += MINBPC(enc);
99    while (ptr != end) {
100      switch (BYTE_TYPE(enc, ptr)) {
101      INVALID_CASES(ptr, nextTokPtr)
102      case BT_MINUS:
103        if ((ptr += MINBPC(enc)) == end)
104          return XML_TOK_PARTIAL;
105        if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
106          if ((ptr += MINBPC(enc)) == end)
107            return XML_TOK_PARTIAL;
108          if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
109            *nextTokPtr = ptr;
110            return XML_TOK_INVALID;
111          }
112          *nextTokPtr = ptr + MINBPC(enc);
113          return XML_TOK_COMMENT;
114        }
115        break;
116      default:
117        ptr += MINBPC(enc);
118        break;
119      }
120    }
121  }
122  return XML_TOK_PARTIAL;
123}
124
125/* ptr points to character following "<!" */
126
127static int PTRCALL
128PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
129                 const char *end, const char **nextTokPtr)
130{
131  if (ptr == end)
132    return XML_TOK_PARTIAL;
133  switch (BYTE_TYPE(enc, ptr)) {
134  case BT_MINUS:
135    return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
136  case BT_LSQB:
137    *nextTokPtr = ptr + MINBPC(enc);
138    return XML_TOK_COND_SECT_OPEN;
139  case BT_NMSTRT:
140  case BT_HEX:
141    ptr += MINBPC(enc);
142    break;
143  default:
144    *nextTokPtr = ptr;
145    return XML_TOK_INVALID;
146  }
147  while (ptr != end) {
148    switch (BYTE_TYPE(enc, ptr)) {
149    case BT_PERCNT:
150      if (ptr + MINBPC(enc) == end)
151        return XML_TOK_PARTIAL;
152      /* don't allow <!ENTITY% foo "whatever"> */
153      switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
154      case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
155        *nextTokPtr = ptr;
156        return XML_TOK_INVALID;
157      }
158      /* fall through */
159    case BT_S: case BT_CR: case BT_LF:
160      *nextTokPtr = ptr;
161      return XML_TOK_DECL_OPEN;
162    case BT_NMSTRT:
163    case BT_HEX:
164      ptr += MINBPC(enc);
165      break;
166    default:
167      *nextTokPtr = ptr;
168      return XML_TOK_INVALID;
169    }
170  }
171  return XML_TOK_PARTIAL;
172}
173
174static int PTRCALL
175PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
176                      const char *end, int *tokPtr)
177{
178  int upper = 0;
179  *tokPtr = XML_TOK_PI;
180  if (end - ptr != MINBPC(enc)*3)
181    return 1;
182  switch (BYTE_TO_ASCII(enc, ptr)) {
183  case ASCII_x:
184    break;
185  case ASCII_X:
186    upper = 1;
187    break;
188  default:
189    return 1;
190  }
191  ptr += MINBPC(enc);
192  switch (BYTE_TO_ASCII(enc, ptr)) {
193  case ASCII_m:
194    break;
195  case ASCII_M:
196    upper = 1;
197    break;
198  default:
199    return 1;
200  }
201  ptr += MINBPC(enc);
202  switch (BYTE_TO_ASCII(enc, ptr)) {
203  case ASCII_l:
204    break;
205  case ASCII_L:
206    upper = 1;
207    break;
208  default:
209    return 1;
210  }
211  if (upper)
212    return 0;
213  *tokPtr = XML_TOK_XML_DECL;
214  return 1;
215}
216
217/* ptr points to character following "<?" */
218
219static int PTRCALL
220PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
221               const char *end, const char **nextTokPtr)
222{
223  int tok;
224  const char *target = ptr;
225  if (ptr == end)
226    return XML_TOK_PARTIAL;
227  switch (BYTE_TYPE(enc, ptr)) {
228  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
229  default:
230    *nextTokPtr = ptr;
231    return XML_TOK_INVALID;
232  }
233  while (ptr != end) {
234    switch (BYTE_TYPE(enc, ptr)) {
235    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
236    case BT_S: case BT_CR: case BT_LF:
237      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
238        *nextTokPtr = ptr;
239        return XML_TOK_INVALID;
240      }
241      ptr += MINBPC(enc);
242      while (ptr != end) {
243        switch (BYTE_TYPE(enc, ptr)) {
244        INVALID_CASES(ptr, nextTokPtr)
245        case BT_QUEST:
246          ptr += MINBPC(enc);
247          if (ptr == end)
248            return XML_TOK_PARTIAL;
249          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
250            *nextTokPtr = ptr + MINBPC(enc);
251            return tok;
252          }
253          break;
254        default:
255          ptr += MINBPC(enc);
256          break;
257        }
258      }
259      return XML_TOK_PARTIAL;
260    case BT_QUEST:
261      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
262        *nextTokPtr = ptr;
263        return XML_TOK_INVALID;
264      }
265      ptr += MINBPC(enc);
266      if (ptr == end)
267        return XML_TOK_PARTIAL;
268      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
269        *nextTokPtr = ptr + MINBPC(enc);
270        return tok;
271      }
272      /* fall through */
273    default:
274      *nextTokPtr = ptr;
275      return XML_TOK_INVALID;
276    }
277  }
278  return XML_TOK_PARTIAL;
279}
280
281static int PTRCALL
282PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
283                         const char *end, const char **nextTokPtr)
284{
285  static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
286                                     ASCII_T, ASCII_A, ASCII_LSQB };
287  int i;
288  /* CDATA[ */
289  if (end - ptr < 6 * MINBPC(enc))
290    return XML_TOK_PARTIAL;
291  for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
292    if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
293      *nextTokPtr = ptr;
294      return XML_TOK_INVALID;
295    }
296  }
297  *nextTokPtr = ptr;
298  return XML_TOK_CDATA_SECT_OPEN;
299}
300
301static int PTRCALL
302PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
303                        const char *end, const char **nextTokPtr)
304{
305  if (ptr == end)
306    return XML_TOK_NONE;
307  if (MINBPC(enc) > 1) {
308    size_t n = end - ptr;
309    if (n & (MINBPC(enc) - 1)) {
310      n &= ~(MINBPC(enc) - 1);
311      if (n == 0)
312        return XML_TOK_PARTIAL;
313      end = ptr + n;
314    }
315  }
316  switch (BYTE_TYPE(enc, ptr)) {
317  case BT_RSQB:
318    ptr += MINBPC(enc);
319    if (ptr == end)
320      return XML_TOK_PARTIAL;
321    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
322      break;
323    ptr += MINBPC(enc);
324    if (ptr == end)
325      return XML_TOK_PARTIAL;
326    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
327      ptr -= MINBPC(enc);
328      break;
329    }
330    *nextTokPtr = ptr + MINBPC(enc);
331    return XML_TOK_CDATA_SECT_CLOSE;
332  case BT_CR:
333    ptr += MINBPC(enc);
334    if (ptr == end)
335      return XML_TOK_PARTIAL;
336    if (BYTE_TYPE(enc, ptr) == BT_LF)
337      ptr += MINBPC(enc);
338    *nextTokPtr = ptr;
339    return XML_TOK_DATA_NEWLINE;
340  case BT_LF:
341    *nextTokPtr = ptr + MINBPC(enc);
342    return XML_TOK_DATA_NEWLINE;
343  INVALID_CASES(ptr, nextTokPtr)
344  default:
345    ptr += MINBPC(enc);
346    break;
347  }
348  while (ptr != end) {
349    switch (BYTE_TYPE(enc, ptr)) {
350#define LEAD_CASE(n) \
351    case BT_LEAD ## n: \
352      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
353        *nextTokPtr = ptr; \
354        return XML_TOK_DATA_CHARS; \
355      } \
356      ptr += n; \
357      break;
358    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
359#undef LEAD_CASE
360    case BT_NONXML:
361    case BT_MALFORM:
362    case BT_TRAIL:
363    case BT_CR:
364    case BT_LF:
365    case BT_RSQB:
366      *nextTokPtr = ptr;
367      return XML_TOK_DATA_CHARS;
368    default:
369      ptr += MINBPC(enc);
370      break;
371    }
372  }
373  *nextTokPtr = ptr;
374  return XML_TOK_DATA_CHARS;
375}
376
377/* ptr points to character following "</" */
378
379static int PTRCALL
380PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
381                   const char *end, const char **nextTokPtr)
382{
383  if (ptr == end)
384    return XML_TOK_PARTIAL;
385  switch (BYTE_TYPE(enc, ptr)) {
386  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
387  default:
388    *nextTokPtr = ptr;
389    return XML_TOK_INVALID;
390  }
391  while (ptr != end) {
392    switch (BYTE_TYPE(enc, ptr)) {
393    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
394    case BT_S: case BT_CR: case BT_LF:
395      for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
396        switch (BYTE_TYPE(enc, ptr)) {
397        case BT_S: case BT_CR: case BT_LF:
398          break;
399        case BT_GT:
400          *nextTokPtr = ptr + MINBPC(enc);
401          return XML_TOK_END_TAG;
402        default:
403          *nextTokPtr = ptr;
404          return XML_TOK_INVALID;
405        }
406      }
407      return XML_TOK_PARTIAL;
408#ifdef XML_NS
409    case BT_COLON:
410      /* no need to check qname syntax here,
411         since end-tag must match exactly */
412      ptr += MINBPC(enc);
413      break;
414#endif
415    case BT_GT:
416      *nextTokPtr = ptr + MINBPC(enc);
417      return XML_TOK_END_TAG;
418    default:
419      *nextTokPtr = ptr;
420      return XML_TOK_INVALID;
421    }
422  }
423  return XML_TOK_PARTIAL;
424}
425
426/* ptr points to character following "&#X" */
427
428static int PTRCALL
429PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
430                       const char *end, const char **nextTokPtr)
431{
432  if (ptr != end) {
433    switch (BYTE_TYPE(enc, ptr)) {
434    case BT_DIGIT:
435    case BT_HEX:
436      break;
437    default:
438      *nextTokPtr = ptr;
439      return XML_TOK_INVALID;
440    }
441    for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
442      switch (BYTE_TYPE(enc, ptr)) {
443      case BT_DIGIT:
444      case BT_HEX:
445        break;
446      case BT_SEMI:
447        *nextTokPtr = ptr + MINBPC(enc);
448        return XML_TOK_CHAR_REF;
449      default:
450        *nextTokPtr = ptr;
451        return XML_TOK_INVALID;
452      }
453    }
454  }
455  return XML_TOK_PARTIAL;
456}
457
458/* ptr points to character following "&#" */
459
460static int PTRCALL
461PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
462                    const char *end, const char **nextTokPtr)
463{
464  if (ptr != end) {
465    if (CHAR_MATCHES(enc, ptr, ASCII_x))
466      return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
467    switch (BYTE_TYPE(enc, ptr)) {
468    case BT_DIGIT:
469      break;
470    default:
471      *nextTokPtr = ptr;
472      return XML_TOK_INVALID;
473    }
474    for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
475      switch (BYTE_TYPE(enc, ptr)) {
476      case BT_DIGIT:
477        break;
478      case BT_SEMI:
479        *nextTokPtr = ptr + MINBPC(enc);
480        return XML_TOK_CHAR_REF;
481      default:
482        *nextTokPtr = ptr;
483        return XML_TOK_INVALID;
484      }
485    }
486  }
487  return XML_TOK_PARTIAL;
488}
489
490/* ptr points to character following "&" */
491
492static int PTRCALL
493PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
494                const char **nextTokPtr)
495{
496  if (ptr == end)
497    return XML_TOK_PARTIAL;
498  switch (BYTE_TYPE(enc, ptr)) {
499  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
500  case BT_NUM:
501    return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
502  default:
503    *nextTokPtr = ptr;
504    return XML_TOK_INVALID;
505  }
506  while (ptr != end) {
507    switch (BYTE_TYPE(enc, ptr)) {
508    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
509    case BT_SEMI:
510      *nextTokPtr = ptr + MINBPC(enc);
511      return XML_TOK_ENTITY_REF;
512    default:
513      *nextTokPtr = ptr;
514      return XML_TOK_INVALID;
515    }
516  }
517  return XML_TOK_PARTIAL;
518}
519
/* Scans the attributes of a start tag through to the end of the tag.
   ptr points to character following first character of attribute name.

   Returns XML_TOK_START_TAG_WITH_ATTS (tag closed by ">") or
   XML_TOK_EMPTY_ELEMENT_WITH_ATTS (tag closed by "/>") with *nextTokPtr
   just past the ">"; XML_TOK_INVALID with *nextTokPtr at the offending
   character; XML_TOK_PARTIAL when the input ends first.  A non-positive
   result from scanRef inside an attribute value is returned unchanged. */

static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once the current attribute name has contained a ':' */
  int hadColon = 0;
#endif
  /* each pass of this loop consumes one more piece of an attribute name,
     or - via the BT_EQUALS case - a complete "= value" plus whatever
     separates it from the next attribute */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      /* white space after the name: only more white space or '=' may
         follow; the loop exits with ptr on the '=' */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
    /* fall through */
    case BT_EQUALS:
      {
        /* byte type of the opening quote (BT_QUOT or BT_APOS); the value
           ends at the next character of the same type */
        int open;
#ifdef XML_NS
        hadColon = 0;
#endif
        /* skip white space between '=' and the opening quote */
        for (;;) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          open = BYTE_TYPE(enc, ptr);
          if (open == BT_QUOT || open == BT_APOS)
            break;
          switch (open) {
          case BT_S:
          case BT_LF:
          case BT_CR:
            break;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
        }
        ptr += MINBPC(enc);
        /* in attribute value */
        for (;;) {
          int t;
          if (ptr == end)
            return XML_TOK_PARTIAL;
          t = BYTE_TYPE(enc, ptr);
          if (t == open)
            break;
          switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
          case BT_AMP:
            {
              /* scanRef reports its stopping position through &ptr, so a
                 successful scan leaves ptr where scanRef put it */
              int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
              if (tok <= 0) {
                if (tok == XML_TOK_INVALID)
                  *nextTokPtr = ptr;
                return tok;
              }
              break;
            }
          case BT_LT:
            /* '<' is not allowed inside an attribute value */
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          default:
            ptr += MINBPC(enc);
            break;
          }
        }
        /* step over the closing quote; it must be followed by white
           space, "/" or ">" */
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S:
        case BT_CR:
        case BT_LF:
          break;
        case BT_SOL:
          goto sol;
        case BT_GT:
          goto gt;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        /* ptr points to the white space character that follows the
           closing quote */
        for (;;) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character begins the next attribute: the macro
             cases consume it and break out of this switch */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_S: case BT_CR: case BT_LF:
            continue;
          case BT_GT:
          gt:
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_START_TAG_WITH_ATTS;
          case BT_SOL:
          sol:
            ptr += MINBPC(enc);
            if (ptr == end)
              return XML_TOK_PARTIAL;
            if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
              *nextTokPtr = ptr;
              return XML_TOK_INVALID;
            }
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          /* leaves the for loop; reached only after the first character
             of the next attribute name has been consumed */
          break;
        }
        /* back to the outer loop for the rest of the next name */
        break;
      }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
678
/* Scans markup that begins with "<" in content.
   ptr points to character following "<".

   "!" followed by "-" or "[" hands over to scanComment or
   scanCdataSection, "?" to scanPi and "/" to scanEndTag.  A name-start
   character begins a start tag: the result is XML_TOK_START_TAG_NO_ATTS
   or XML_TOK_EMPTY_ELEMENT_NO_ATTS with *nextTokPtr just past ">", or
   whatever scanAtts returns once an attribute name begins.
   XML_TOK_INVALID is returned with *nextTokPtr at the offending
   character, XML_TOK_PARTIAL when the input ends first. */

static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once the element name has contained a ':' */
  int hadColon;
#endif
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  /* a name-start character is consumed here and control continues after
     the switch with the start-tag scan */
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    if ((ptr += MINBPC(enc)) == end)
      return XML_TOK_PARTIAL;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
                                      end, nextTokPtr);
    }
    /* "<!" followed by anything else is rejected */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#ifdef XML_NS
  hadColon = 0;
#endif
  /* we have a start-tag */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      {
        /* element name finished: after white space comes ">", "/>" or
           the first attribute */
        ptr += MINBPC(enc);
        while (ptr != end) {
          switch (BYTE_TYPE(enc, ptr)) {
          /* first character of an attribute name: the macro cases
             consume it and break, reaching the scanAtts call below */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_GT:
            goto gt;
          case BT_SOL:
            goto sol;
          case BT_S: case BT_CR: case BT_LF:
            ptr += MINBPC(enc);
            continue;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
        }
        return XML_TOK_PARTIAL;
      }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      /* "/" must be immediately followed by ">" */
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
780
781static int PTRCALL
782PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
783                   const char **nextTokPtr)
784{
785  if (ptr == end)
786    return XML_TOK_NONE;
787  if (MINBPC(enc) > 1) {
788    size_t n = end - ptr;
789    if (n & (MINBPC(enc) - 1)) {
790      n &= ~(MINBPC(enc) - 1);
791      if (n == 0)
792        return XML_TOK_PARTIAL;
793      end = ptr + n;
794    }
795  }
796  switch (BYTE_TYPE(enc, ptr)) {
797  case BT_LT:
798    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
799  case BT_AMP:
800    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
801  case BT_CR:
802    ptr += MINBPC(enc);
803    if (ptr == end)
804      return XML_TOK_TRAILING_CR;
805    if (BYTE_TYPE(enc, ptr) == BT_LF)
806      ptr += MINBPC(enc);
807    *nextTokPtr = ptr;
808    return XML_TOK_DATA_NEWLINE;
809  case BT_LF:
810    *nextTokPtr = ptr + MINBPC(enc);
811    return XML_TOK_DATA_NEWLINE;
812  case BT_RSQB:
813    ptr += MINBPC(enc);
814    if (ptr == end)
815      return XML_TOK_TRAILING_RSQB;
816    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
817      break;
818    ptr += MINBPC(enc);
819    if (ptr == end)
820      return XML_TOK_TRAILING_RSQB;
821    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
822      ptr -= MINBPC(enc);
823      break;
824    }
825    *nextTokPtr = ptr;
826    return XML_TOK_INVALID;
827  INVALID_CASES(ptr, nextTokPtr)
828  default:
829    ptr += MINBPC(enc);
830    break;
831  }
832  while (ptr != end) {
833    switch (BYTE_TYPE(enc, ptr)) {
834#define LEAD_CASE(n) \
835    case BT_LEAD ## n: \
836      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
837        *nextTokPtr = ptr; \
838        return XML_TOK_DATA_CHARS; \
839      } \
840      ptr += n; \
841      break;
842    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
843#undef LEAD_CASE
844    case BT_RSQB:
845      if (ptr + MINBPC(enc) != end) {
846         if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
847           ptr += MINBPC(enc);
848           break;
849         }
850         if (ptr + 2*MINBPC(enc) != end) {
851           if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
852             ptr += MINBPC(enc);
853             break;
854           }
855           *nextTokPtr = ptr + 2*MINBPC(enc);
856           return XML_TOK_INVALID;
857         }
858      }
859      /* fall through */
860    case BT_AMP:
861    case BT_LT:
862    case BT_NONXML:
863    case BT_MALFORM:
864    case BT_TRAIL:
865    case BT_CR:
866    case BT_LF:
867      *nextTokPtr = ptr;
868      return XML_TOK_DATA_CHARS;
869    default:
870      ptr += MINBPC(enc);
871      break;
872    }
873  }
874  *nextTokPtr = ptr;
875  return XML_TOK_DATA_CHARS;
876}
877
878/* ptr points to character following "%" */
879
880static int PTRCALL
881PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
882                    const char **nextTokPtr)
883{
884  if (ptr == end)
885    return -XML_TOK_PERCENT;
886  switch (BYTE_TYPE(enc, ptr)) {
887  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
888  case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
889    *nextTokPtr = ptr;
890    return XML_TOK_PERCENT;
891  default:
892    *nextTokPtr = ptr;
893    return XML_TOK_INVALID;
894  }
895  while (ptr != end) {
896    switch (BYTE_TYPE(enc, ptr)) {
897    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
898    case BT_SEMI:
899      *nextTokPtr = ptr + MINBPC(enc);
900      return XML_TOK_PARAM_ENTITY_REF;
901    default:
902      *nextTokPtr = ptr;
903      return XML_TOK_INVALID;
904    }
905  }
906  return XML_TOK_PARTIAL;
907}
908
909static int PTRCALL
910PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
911                      const char **nextTokPtr)
912{
913  if (ptr == end)
914    return XML_TOK_PARTIAL;
915  switch (BYTE_TYPE(enc, ptr)) {
916  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
917  default:
918    *nextTokPtr = ptr;
919    return XML_TOK_INVALID;
920  }
921  while (ptr != end) {
922    switch (BYTE_TYPE(enc, ptr)) {
923    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
924    case BT_CR: case BT_LF: case BT_S:
925    case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
926      *nextTokPtr = ptr;
927      return XML_TOK_POUND_NAME;
928    default:
929      *nextTokPtr = ptr;
930      return XML_TOK_INVALID;
931    }
932  }
933  return -XML_TOK_POUND_NAME;
934}
935
936static int PTRCALL
937PREFIX(scanLit)(int open, const ENCODING *enc,
938                const char *ptr, const char *end,
939                const char **nextTokPtr)
940{
941  while (ptr != end) {
942    int t = BYTE_TYPE(enc, ptr);
943    switch (t) {
944    INVALID_CASES(ptr, nextTokPtr)
945    case BT_QUOT:
946    case BT_APOS:
947      ptr += MINBPC(enc);
948      if (t != open)
949        break;
950      if (ptr == end)
951        return -XML_TOK_LITERAL;
952      *nextTokPtr = ptr;
953      switch (BYTE_TYPE(enc, ptr)) {
954      case BT_S: case BT_CR: case BT_LF:
955      case BT_GT: case BT_PERCNT: case BT_LSQB:
956        return XML_TOK_LITERAL;
957      default:
958        return XML_TOK_INVALID;
959      }
960    default:
961      ptr += MINBPC(enc);
962      break;
963    }
964  }
965  return XML_TOK_PARTIAL;
966}
967
968static int PTRCALL
969PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
970                  const char **nextTokPtr)
971{
972  int tok;
973  if (ptr == end)
974    return XML_TOK_NONE;
975  if (MINBPC(enc) > 1) {
976    size_t n = end - ptr;
977    if (n & (MINBPC(enc) - 1)) {
978      n &= ~(MINBPC(enc) - 1);
979      if (n == 0)
980        return XML_TOK_PARTIAL;
981      end = ptr + n;
982    }
983  }
984  switch (BYTE_TYPE(enc, ptr)) {
985  case BT_QUOT:
986    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
987  case BT_APOS:
988    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
989  case BT_LT:
990    {
991      ptr += MINBPC(enc);
992      if (ptr == end)
993        return XML_TOK_PARTIAL;
994      switch (BYTE_TYPE(enc, ptr)) {
995      case BT_EXCL:
996        return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
997      case BT_QUEST:
998        return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
999      case BT_NMSTRT:
1000      case BT_HEX:
1001      case BT_NONASCII:
1002      case BT_LEAD2:
1003      case BT_LEAD3:
1004      case BT_LEAD4:
1005        *nextTokPtr = ptr - MINBPC(enc);
1006        return XML_TOK_INSTANCE_START;
1007      }
1008      *nextTokPtr = ptr;
1009      return XML_TOK_INVALID;
1010    }
1011  case BT_CR:
1012    if (ptr + MINBPC(enc) == end) {
1013      *nextTokPtr = end;
1014      /* indicate that this might be part of a CR/LF pair */
1015      return -XML_TOK_PROLOG_S;
1016    }
1017    /* fall through */
1018  case BT_S: case BT_LF:
1019    for (;;) {
1020      ptr += MINBPC(enc);
1021      if (ptr == end)
1022        break;
1023      switch (BYTE_TYPE(enc, ptr)) {
1024      case BT_S: case BT_LF:
1025        break;
1026      case BT_CR:
1027        /* don't split CR/LF pair */
1028        if (ptr + MINBPC(enc) != end)
1029          break;
1030        /* fall through */
1031      default:
1032        *nextTokPtr = ptr;
1033        return XML_TOK_PROLOG_S;
1034      }
1035    }
1036    *nextTokPtr = ptr;
1037    return XML_TOK_PROLOG_S;
1038  case BT_PERCNT:
1039    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1040  case BT_COMMA:
1041    *nextTokPtr = ptr + MINBPC(enc);
1042    return XML_TOK_COMMA;
1043  case BT_LSQB:
1044    *nextTokPtr = ptr + MINBPC(enc);
1045    return XML_TOK_OPEN_BRACKET;
1046  case BT_RSQB:
1047    ptr += MINBPC(enc);
1048    if (ptr == end)
1049      return -XML_TOK_CLOSE_BRACKET;
1050    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1051      if (ptr + MINBPC(enc) == end)
1052        return XML_TOK_PARTIAL;
1053      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1054        *nextTokPtr = ptr + 2*MINBPC(enc);
1055        return XML_TOK_COND_SECT_CLOSE;
1056      }
1057    }
1058    *nextTokPtr = ptr;
1059    return XML_TOK_CLOSE_BRACKET;
1060  case BT_LPAR:
1061    *nextTokPtr = ptr + MINBPC(enc);
1062    return XML_TOK_OPEN_PAREN;
1063  case BT_RPAR:
1064    ptr += MINBPC(enc);
1065    if (ptr == end)
1066      return -XML_TOK_CLOSE_PAREN;
1067    switch (BYTE_TYPE(enc, ptr)) {
1068    case BT_AST:
1069      *nextTokPtr = ptr + MINBPC(enc);
1070      return XML_TOK_CLOSE_PAREN_ASTERISK;
1071    case BT_QUEST:
1072      *nextTokPtr = ptr + MINBPC(enc);
1073      return XML_TOK_CLOSE_PAREN_QUESTION;
1074    case BT_PLUS:
1075      *nextTokPtr = ptr + MINBPC(enc);
1076      return XML_TOK_CLOSE_PAREN_PLUS;
1077    case BT_CR: case BT_LF: case BT_S:
1078    case BT_GT: case BT_COMMA: case BT_VERBAR:
1079    case BT_RPAR:
1080      *nextTokPtr = ptr;
1081      return XML_TOK_CLOSE_PAREN;
1082    }
1083    *nextTokPtr = ptr;
1084    return XML_TOK_INVALID;
1085  case BT_VERBAR:
1086    *nextTokPtr = ptr + MINBPC(enc);
1087    return XML_TOK_OR;
1088  case BT_GT:
1089    *nextTokPtr = ptr + MINBPC(enc);
1090    return XML_TOK_DECL_CLOSE;
1091  case BT_NUM:
1092    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1093#define LEAD_CASE(n) \
1094  case BT_LEAD ## n: \
1095    if (end - ptr < n) \
1096      return XML_TOK_PARTIAL_CHAR; \
1097    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1098      ptr += n; \
1099      tok = XML_TOK_NAME; \
1100      break; \
1101    } \
1102    if (IS_NAME_CHAR(enc, ptr, n)) { \
1103      ptr += n; \
1104      tok = XML_TOK_NMTOKEN; \
1105      break; \
1106    } \
1107    *nextTokPtr = ptr; \
1108    return XML_TOK_INVALID;
1109    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1110#undef LEAD_CASE
1111  case BT_NMSTRT:
1112  case BT_HEX:
1113    tok = XML_TOK_NAME;
1114    ptr += MINBPC(enc);
1115    break;
1116  case BT_DIGIT:
1117  case BT_NAME:
1118  case BT_MINUS:
1119#ifdef XML_NS
1120  case BT_COLON:
1121#endif
1122    tok = XML_TOK_NMTOKEN;
1123    ptr += MINBPC(enc);
1124    break;
1125  case BT_NONASCII:
1126    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1127      ptr += MINBPC(enc);
1128      tok = XML_TOK_NAME;
1129      break;
1130    }
1131    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1132      ptr += MINBPC(enc);
1133      tok = XML_TOK_NMTOKEN;
1134      break;
1135    }
1136    /* fall through */
1137  default:
1138    *nextTokPtr = ptr;
1139    return XML_TOK_INVALID;
1140  }
1141  while (ptr != end) {
1142    switch (BYTE_TYPE(enc, ptr)) {
1143    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1144    case BT_GT: case BT_RPAR: case BT_COMMA:
1145    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1146    case BT_S: case BT_CR: case BT_LF:
1147      *nextTokPtr = ptr;
1148      return tok;
1149#ifdef XML_NS
1150    case BT_COLON:
1151      ptr += MINBPC(enc);
1152      switch (tok) {
1153      case XML_TOK_NAME:
1154        if (ptr == end)
1155          return XML_TOK_PARTIAL;
1156        tok = XML_TOK_PREFIXED_NAME;
1157        switch (BYTE_TYPE(enc, ptr)) {
1158        CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1159        default:
1160          tok = XML_TOK_NMTOKEN;
1161          break;
1162        }
1163        break;
1164      case XML_TOK_PREFIXED_NAME:
1165        tok = XML_TOK_NMTOKEN;
1166        break;
1167      }
1168      break;
1169#endif
1170    case BT_PLUS:
1171      if (tok == XML_TOK_NMTOKEN)  {
1172        *nextTokPtr = ptr;
1173        return XML_TOK_INVALID;
1174      }
1175      *nextTokPtr = ptr + MINBPC(enc);
1176      return XML_TOK_NAME_PLUS;
1177    case BT_AST:
1178      if (tok == XML_TOK_NMTOKEN)  {
1179        *nextTokPtr = ptr;
1180        return XML_TOK_INVALID;
1181      }
1182      *nextTokPtr = ptr + MINBPC(enc);
1183      return XML_TOK_NAME_ASTERISK;
1184    case BT_QUEST:
1185      if (tok == XML_TOK_NMTOKEN)  {
1186        *nextTokPtr = ptr;
1187        return XML_TOK_INVALID;
1188      }
1189      *nextTokPtr = ptr + MINBPC(enc);
1190      return XML_TOK_NAME_QUESTION;
1191    default:
1192      *nextTokPtr = ptr;
1193      return XML_TOK_INVALID;
1194    }
1195  }
1196  return -tok;
1197}
1198
1199static int PTRCALL
1200PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1201                          const char *end, const char **nextTokPtr)
1202{
1203  const char *start;
1204  if (ptr == end)
1205    return XML_TOK_NONE;
1206  start = ptr;
1207  while (ptr != end) {
1208    switch (BYTE_TYPE(enc, ptr)) {
1209#define LEAD_CASE(n) \
1210    case BT_LEAD ## n: ptr += n; break;
1211    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1212#undef LEAD_CASE
1213    case BT_AMP:
1214      if (ptr == start)
1215        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1216      *nextTokPtr = ptr;
1217      return XML_TOK_DATA_CHARS;
1218    case BT_LT:
1219      /* this is for inside entity references */
1220      *nextTokPtr = ptr;
1221      return XML_TOK_INVALID;
1222    case BT_LF:
1223      if (ptr == start) {
1224        *nextTokPtr = ptr + MINBPC(enc);
1225        return XML_TOK_DATA_NEWLINE;
1226      }
1227      *nextTokPtr = ptr;
1228      return XML_TOK_DATA_CHARS;
1229    case BT_CR:
1230      if (ptr == start) {
1231        ptr += MINBPC(enc);
1232        if (ptr == end)
1233          return XML_TOK_TRAILING_CR;
1234        if (BYTE_TYPE(enc, ptr) == BT_LF)
1235          ptr += MINBPC(enc);
1236        *nextTokPtr = ptr;
1237        return XML_TOK_DATA_NEWLINE;
1238      }
1239      *nextTokPtr = ptr;
1240      return XML_TOK_DATA_CHARS;
1241    case BT_S:
1242      if (ptr == start) {
1243        *nextTokPtr = ptr + MINBPC(enc);
1244        return XML_TOK_ATTRIBUTE_VALUE_S;
1245      }
1246      *nextTokPtr = ptr;
1247      return XML_TOK_DATA_CHARS;
1248    default:
1249      ptr += MINBPC(enc);
1250      break;
1251    }
1252  }
1253  *nextTokPtr = ptr;
1254  return XML_TOK_DATA_CHARS;
1255}
1256
1257static int PTRCALL
1258PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1259                       const char *end, const char **nextTokPtr)
1260{
1261  const char *start;
1262  if (ptr == end)
1263    return XML_TOK_NONE;
1264  start = ptr;
1265  while (ptr != end) {
1266    switch (BYTE_TYPE(enc, ptr)) {
1267#define LEAD_CASE(n) \
1268    case BT_LEAD ## n: ptr += n; break;
1269    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1270#undef LEAD_CASE
1271    case BT_AMP:
1272      if (ptr == start)
1273        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1274      *nextTokPtr = ptr;
1275      return XML_TOK_DATA_CHARS;
1276    case BT_PERCNT:
1277      if (ptr == start) {
1278        int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1279                                       end, nextTokPtr);
1280        return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1281      }
1282      *nextTokPtr = ptr;
1283      return XML_TOK_DATA_CHARS;
1284    case BT_LF:
1285      if (ptr == start) {
1286        *nextTokPtr = ptr + MINBPC(enc);
1287        return XML_TOK_DATA_NEWLINE;
1288      }
1289      *nextTokPtr = ptr;
1290      return XML_TOK_DATA_CHARS;
1291    case BT_CR:
1292      if (ptr == start) {
1293        ptr += MINBPC(enc);
1294        if (ptr == end)
1295          return XML_TOK_TRAILING_CR;
1296        if (BYTE_TYPE(enc, ptr) == BT_LF)
1297          ptr += MINBPC(enc);
1298        *nextTokPtr = ptr;
1299        return XML_TOK_DATA_NEWLINE;
1300      }
1301      *nextTokPtr = ptr;
1302      return XML_TOK_DATA_CHARS;
1303    default:
1304      ptr += MINBPC(enc);
1305      break;
1306    }
1307  }
1308  *nextTokPtr = ptr;
1309  return XML_TOK_DATA_CHARS;
1310}
1311
1312#ifdef XML_DTD
1313
1314static int PTRCALL
1315PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1316                         const char *end, const char **nextTokPtr)
1317{
1318  int level = 0;
1319  if (MINBPC(enc) > 1) {
1320    size_t n = end - ptr;
1321    if (n & (MINBPC(enc) - 1)) {
1322      n &= ~(MINBPC(enc) - 1);
1323      end = ptr + n;
1324    }
1325  }
1326  while (ptr != end) {
1327    switch (BYTE_TYPE(enc, ptr)) {
1328    INVALID_CASES(ptr, nextTokPtr)
1329    case BT_LT:
1330      if ((ptr += MINBPC(enc)) == end)
1331        return XML_TOK_PARTIAL;
1332      if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1333        if ((ptr += MINBPC(enc)) == end)
1334          return XML_TOK_PARTIAL;
1335        if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1336          ++level;
1337          ptr += MINBPC(enc);
1338        }
1339      }
1340      break;
1341    case BT_RSQB:
1342      if ((ptr += MINBPC(enc)) == end)
1343        return XML_TOK_PARTIAL;
1344      if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1345        if ((ptr += MINBPC(enc)) == end)
1346          return XML_TOK_PARTIAL;
1347        if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1348          ptr += MINBPC(enc);
1349          if (level == 0) {
1350            *nextTokPtr = ptr;
1351            return XML_TOK_IGNORE_SECT;
1352          }
1353          --level;
1354        }
1355      }
1356      break;
1357    default:
1358      ptr += MINBPC(enc);
1359      break;
1360    }
1361  }
1362  return XML_TOK_PARTIAL;
1363}
1364
1365#endif /* XML_DTD */
1366
1367static int PTRCALL
1368PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1369                   const char **badPtr)
1370{
1371  ptr += MINBPC(enc);
1372  end -= MINBPC(enc);
1373  for (; ptr != end; ptr += MINBPC(enc)) {
1374    switch (BYTE_TYPE(enc, ptr)) {
1375    case BT_DIGIT:
1376    case BT_HEX:
1377    case BT_MINUS:
1378    case BT_APOS:
1379    case BT_LPAR:
1380    case BT_RPAR:
1381    case BT_PLUS:
1382    case BT_COMMA:
1383    case BT_SOL:
1384    case BT_EQUALS:
1385    case BT_QUEST:
1386    case BT_CR:
1387    case BT_LF:
1388    case BT_SEMI:
1389    case BT_EXCL:
1390    case BT_AST:
1391    case BT_PERCNT:
1392    case BT_NUM:
1393#ifdef XML_NS
1394    case BT_COLON:
1395#endif
1396      break;
1397    case BT_S:
1398      if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1399        *badPtr = ptr;
1400        return 0;
1401      }
1402      break;
1403    case BT_NAME:
1404    case BT_NMSTRT:
1405      if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1406        break;
1407    default:
1408      switch (BYTE_TO_ASCII(enc, ptr)) {
1409      case 0x24: /* $ */
1410      case 0x40: /* @ */
1411        break;
1412      default:
1413        *badPtr = ptr;
1414        return 0;
1415      }
1416      break;
1417    }
1418  }
1419  return 1;
1420}
1421
1422/* This must only be called for a well-formed start-tag or empty
1423   element tag.  Returns the number of attributes.  Pointers to the
1424   first attsMax attributes are stored in atts.
1425*/
1426
1427static int PTRCALL
1428PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1429                int attsMax, ATTRIBUTE *atts)
1430{
1431  enum { other, inName, inValue } state = inName;
1432  int nAtts = 0;
1433  int open = 0; /* defined when state == inValue;
1434                   initialization just to shut up compilers */
1435
1436  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1437    switch (BYTE_TYPE(enc, ptr)) {
1438#define START_NAME \
1439      if (state == other) { \
1440        if (nAtts < attsMax) { \
1441          atts[nAtts].name = ptr; \
1442          atts[nAtts].normalized = 1; \
1443        } \
1444        state = inName; \
1445      }
1446#define LEAD_CASE(n) \
1447    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1448    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1449#undef LEAD_CASE
1450    case BT_NONASCII:
1451    case BT_NMSTRT:
1452    case BT_HEX:
1453      START_NAME
1454      break;
1455#undef START_NAME
1456    case BT_QUOT:
1457      if (state != inValue) {
1458        if (nAtts < attsMax)
1459          atts[nAtts].valuePtr = ptr + MINBPC(enc);
1460        state = inValue;
1461        open = BT_QUOT;
1462      }
1463      else if (open == BT_QUOT) {
1464        state = other;
1465        if (nAtts < attsMax)
1466          atts[nAtts].valueEnd = ptr;
1467        nAtts++;
1468      }
1469      break;
1470    case BT_APOS:
1471      if (state != inValue) {
1472        if (nAtts < attsMax)
1473          atts[nAtts].valuePtr = ptr + MINBPC(enc);
1474        state = inValue;
1475        open = BT_APOS;
1476      }
1477      else if (open == BT_APOS) {
1478        state = other;
1479        if (nAtts < attsMax)
1480          atts[nAtts].valueEnd = ptr;
1481        nAtts++;
1482      }
1483      break;
1484    case BT_AMP:
1485      if (nAtts < attsMax)
1486        atts[nAtts].normalized = 0;
1487      break;
1488    case BT_S:
1489      if (state == inName)
1490        state = other;
1491      else if (state == inValue
1492               && nAtts < attsMax
1493               && atts[nAtts].normalized
1494               && (ptr == atts[nAtts].valuePtr
1495                   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1496                   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1497                   || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1498        atts[nAtts].normalized = 0;
1499      break;
1500    case BT_CR: case BT_LF:
1501      /* This case ensures that the first attribute name is counted
1502         Apart from that we could just change state on the quote. */
1503      if (state == inName)
1504        state = other;
1505      else if (state == inValue && nAtts < attsMax)
1506        atts[nAtts].normalized = 0;
1507      break;
1508    case BT_GT:
1509    case BT_SOL:
1510      if (state != inValue)
1511        return nAtts;
1512      break;
1513    default:
1514      break;
1515    }
1516  }
1517  /* not reached */
1518}
1519
1520static int PTRFASTCALL
1521PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1522{
1523  int result = 0;
1524  /* skip &# */
1525  ptr += 2*MINBPC(enc);
1526  if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1527    for (ptr += MINBPC(enc);
1528         !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1529         ptr += MINBPC(enc)) {
1530      int c = BYTE_TO_ASCII(enc, ptr);
1531      switch (c) {
1532      case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1533      case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1534        result <<= 4;
1535        result |= (c - ASCII_0);
1536        break;
1537      case ASCII_A: case ASCII_B: case ASCII_C:
1538      case ASCII_D: case ASCII_E: case ASCII_F:
1539        result <<= 4;
1540        result += 10 + (c - ASCII_A);
1541        break;
1542      case ASCII_a: case ASCII_b: case ASCII_c:
1543      case ASCII_d: case ASCII_e: case ASCII_f:
1544        result <<= 4;
1545        result += 10 + (c - ASCII_a);
1546        break;
1547      }
1548      if (result >= 0x110000)
1549        return -1;
1550    }
1551  }
1552  else {
1553    for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1554      int c = BYTE_TO_ASCII(enc, ptr);
1555      result *= 10;
1556      result += (c - ASCII_0);
1557      if (result >= 0x110000)
1558        return -1;
1559    }
1560  }
1561  return checkCharRefNumber(result);
1562}
1563
1564static int PTRCALL
1565PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1566                             const char *end)
1567{
1568  switch ((end - ptr)/MINBPC(enc)) {
1569  case 2:
1570    if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1571      switch (BYTE_TO_ASCII(enc, ptr)) {
1572      case ASCII_l:
1573        return ASCII_LT;
1574      case ASCII_g:
1575        return ASCII_GT;
1576      }
1577    }
1578    break;
1579  case 3:
1580    if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1581      ptr += MINBPC(enc);
1582      if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1583        ptr += MINBPC(enc);
1584        if (CHAR_MATCHES(enc, ptr, ASCII_p))
1585          return ASCII_AMP;
1586      }
1587    }
1588    break;
1589  case 4:
1590    switch (BYTE_TO_ASCII(enc, ptr)) {
1591    case ASCII_q:
1592      ptr += MINBPC(enc);
1593      if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1594        ptr += MINBPC(enc);
1595        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1596          ptr += MINBPC(enc);
1597          if (CHAR_MATCHES(enc, ptr, ASCII_t))
1598            return ASCII_QUOT;
1599        }
1600      }
1601      break;
1602    case ASCII_a:
1603      ptr += MINBPC(enc);
1604      if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1605        ptr += MINBPC(enc);
1606        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1607          ptr += MINBPC(enc);
1608          if (CHAR_MATCHES(enc, ptr, ASCII_s))
1609            return ASCII_APOS;
1610        }
1611      }
1612      break;
1613    }
1614  }
1615  return 0;
1616}
1617
1618static int PTRCALL
1619PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1620{
1621  for (;;) {
1622    switch (BYTE_TYPE(enc, ptr1)) {
1623#define LEAD_CASE(n) \
1624    case BT_LEAD ## n: \
1625      if (*ptr1++ != *ptr2++) \
1626        return 0;
1627    LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1628#undef LEAD_CASE
1629      /* fall through */
1630      if (*ptr1++ != *ptr2++)
1631        return 0;
1632      break;
1633    case BT_NONASCII:
1634    case BT_NMSTRT:
1635#ifdef XML_NS
1636    case BT_COLON:
1637#endif
1638    case BT_HEX:
1639    case BT_DIGIT:
1640    case BT_NAME:
1641    case BT_MINUS:
1642      if (*ptr2++ != *ptr1++)
1643        return 0;
1644      if (MINBPC(enc) > 1) {
1645        if (*ptr2++ != *ptr1++)
1646          return 0;
1647        if (MINBPC(enc) > 2) {
1648          if (*ptr2++ != *ptr1++)
1649            return 0;
1650          if (MINBPC(enc) > 3) {
1651            if (*ptr2++ != *ptr1++)
1652              return 0;
1653          }
1654        }
1655      }
1656      break;
1657    default:
1658      if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1659        return 1;
1660      switch (BYTE_TYPE(enc, ptr2)) {
1661      case BT_LEAD2:
1662      case BT_LEAD3:
1663      case BT_LEAD4:
1664      case BT_NONASCII:
1665      case BT_NMSTRT:
1666#ifdef XML_NS
1667      case BT_COLON:
1668#endif
1669      case BT_HEX:
1670      case BT_DIGIT:
1671      case BT_NAME:
1672      case BT_MINUS:
1673        return 0;
1674      default:
1675        return 1;
1676      }
1677    }
1678  }
1679  /* not reached */
1680}
1681
1682static int PTRCALL
1683PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1684                         const char *end1, const char *ptr2)
1685{
1686  for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1687    if (ptr1 == end1)
1688      return 0;
1689    if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1690      return 0;
1691  }
1692  return ptr1 == end1;
1693}
1694
1695static int PTRFASTCALL
1696PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1697{
1698  const char *start = ptr;
1699  for (;;) {
1700    switch (BYTE_TYPE(enc, ptr)) {
1701#define LEAD_CASE(n) \
1702    case BT_LEAD ## n: ptr += n; break;
1703    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1704#undef LEAD_CASE
1705    case BT_NONASCII:
1706    case BT_NMSTRT:
1707#ifdef XML_NS
1708    case BT_COLON:
1709#endif
1710    case BT_HEX:
1711    case BT_DIGIT:
1712    case BT_NAME:
1713    case BT_MINUS:
1714      ptr += MINBPC(enc);
1715      break;
1716    default:
1717      return ptr - start;
1718    }
1719  }
1720}
1721
1722static const char * PTRFASTCALL
1723PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1724{
1725  for (;;) {
1726    switch (BYTE_TYPE(enc, ptr)) {
1727    case BT_LF:
1728    case BT_CR:
1729    case BT_S:
1730      ptr += MINBPC(enc);
1731      break;
1732    default:
1733      return ptr;
1734    }
1735  }
1736}
1737
1738static void PTRCALL
1739PREFIX(updatePosition)(const ENCODING *enc,
1740                       const char *ptr,
1741                       const char *end,
1742                       POSITION *pos)
1743{
1744  while (ptr < end) {
1745    switch (BYTE_TYPE(enc, ptr)) {
1746#define LEAD_CASE(n) \
1747    case BT_LEAD ## n: \
1748      ptr += n; \
1749      break;
1750    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1751#undef LEAD_CASE
1752    case BT_LF:
1753      pos->columnNumber = (unsigned)-1;
1754      pos->lineNumber++;
1755      ptr += MINBPC(enc);
1756      break;
1757    case BT_CR:
1758      pos->lineNumber++;
1759      ptr += MINBPC(enc);
1760      if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1761        ptr += MINBPC(enc);
1762      pos->columnNumber = (unsigned)-1;
1763      break;
1764    default:
1765      ptr += MINBPC(enc);
1766      break;
1767    }
1768    pos->columnNumber++;
1769  }
1770}
1771
1772#undef DO_LEAD_CASE
1773#undef MULTIBYTE_CASES
1774#undef INVALID_CASES
1775#undef CHECK_NAME_CASE
1776#undef CHECK_NAME_CASES
1777#undef CHECK_NMSTRT_CASE
1778#undef CHECK_NMSTRT_CASES
1779
1780