1/* This file is included!
2                            __  __            _
3                         ___\ \/ /_ __   __ _| |_
4                        / _ \\  /| '_ \ / _` | __|
5                       |  __//  \| |_) | (_| | |_
6                        \___/_/\_\ .__/ \__,_|\__|
7                                 |_| XML parser
8
9   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10   Copyright (c) 2000-2017 Expat development team
11   Licensed under the MIT license:
12
13   Permission is  hereby granted,  free of charge,  to any  person obtaining
14   a  copy  of  this  software   and  associated  documentation  files  (the
15   "Software"),  to  deal in  the  Software  without restriction,  including
16   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
17   distribute, sublicense, and/or sell copies of the Software, and to permit
18   persons  to whom  the Software  is  furnished to  do so,  subject to  the
19   following conditions:
20
21   The above copyright  notice and this permission notice  shall be included
22   in all copies or substantial portions of the Software.
23
24   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
25   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
26   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
29   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30   USE OR OTHER DEALINGS IN THE SOFTWARE.
31*/
32
33#ifdef XML_TOK_IMPL_C
34
#  ifndef IS_INVALID_CHAR
/* Fallback used when the including encoding supplies no validity predicate:
   no multi-byte sequence is ever reported as invalid. */
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif

/* Expands to the `case BT_LEAD<n>:` arm of a byte-type switch for an n-byte
   character.  Relies on `enc` and `end` being in scope at the expansion
   site.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain,
   returns XML_TOK_INVALID (with *nextTokPtr set to ptr) when
   IS_INVALID_CHAR rejects the sequence, and otherwise advances ptr by n and
   leaves the switch. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                                \
  case BT_LEAD##n:                                                             \
    if (end - (ptr) < (n)) {                                                   \
      return XML_TOK_PARTIAL_CHAR;                                             \
    }                                                                          \
    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    (ptr) += (n);                                                              \
    break;

/* Expands to every switch arm that deals with invalid input: the 2-, 3- and
   4-byte lead arms above, plus BT_NONXML, BT_MALFORM and BT_TRAIL, which
   always yield XML_TOK_INVALID at ptr. */
#  define INVALID_CASES(ptr, nextTokPtr)                                       \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                      \
  case BT_NONXML:                                                              \
  case BT_MALFORM:                                                             \
  case BT_TRAIL:                                                               \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;
59
/* Expands to the `case BT_LEAD<n>:` arm used while scanning the body of a
   name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.  The
   sequence must be accepted by the encoding (IS_INVALID_CHAR) *and* be a
   name character (IS_NAME_CHAR); otherwise XML_TOK_INVALID is returned with
   *nextTokPtr at ptr.  The validity check comes first so that malformed
   sequences are never handed to the name tables.  On success ptr advances
   by n and the switch is left. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                        \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;

/* Expands to all switch arms that accept one name character: BT_NONASCII
   (checked with IS_NAME_CHAR_MINBPC), the single-unit byte types
   BT_NMSTRT/BT_HEX/BT_DIGIT/BT_NAME/BT_MINUS, and the multi-byte lead
   arms.  Each accepting arm advances ptr past the character. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                          \
  case BT_NONASCII:                                                            \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                     \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
  case BT_DIGIT:                                                               \
  case BT_NAME:                                                                \
  case BT_MINUS:                                                               \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
88
/* Expands to the `case BT_LEAD<n>:` arm used for the first character of a
   name.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain.  The
   sequence must be accepted by the encoding (IS_INVALID_CHAR) *and* be a
   name-start character (IS_NMSTRT_CHAR); otherwise XML_TOK_INVALID is
   returned with *nextTokPtr at ptr.  The validity check comes first so that
   malformed sequences are never handed to the name tables.  On success ptr
   advances by n and the switch is left. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                      \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;

/* Expands to all switch arms that accept one name-start character:
   BT_NONASCII (checked with IS_NMSTRT_CHAR_MINBPC), BT_NMSTRT, BT_HEX and
   the multi-byte lead arms.  Each accepting arm advances ptr past the
   character. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                        \
  case BT_NONASCII:                                                            \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                   \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
114
/* Name-mangling hook: the including file may define PREFIX so that each
   inclusion produces distinctly named functions.  Default: identity. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif

/* True when at least `count` minimum-size characters (count * MINBPC(enc)
   bytes) lie between ptr and end.  Every parameter is parenthesized so that
   expression arguments such as `p + 1` or `a + b` are evaluated as a
   unit. */
#  define HAS_CHARS(enc, ptr, end, count)                                      \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-size character remains. */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Returns XML_TOK_PARTIAL from the enclosing function unless `count`
   characters are available.  Kept as a brace block (not do/while) so that
   existing expansion sites keep compiling unchanged. */
#  define REQUIRE_CHARS(enc, ptr, end, count)                                  \
    {                                                                          \
      if (! HAS_CHARS(enc, ptr, end, count)) {                                 \
        return XML_TOK_PARTIAL;                                                \
      }                                                                        \
    }

/* Single-character form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
131
132/* ptr points to character following "<!-" */
133
134static int PTRCALL
135PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
136                    const char **nextTokPtr) {
137  if (HAS_CHAR(enc, ptr, end)) {
138    if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
139      *nextTokPtr = ptr;
140      return XML_TOK_INVALID;
141    }
142    ptr += MINBPC(enc);
143    while (HAS_CHAR(enc, ptr, end)) {
144      switch (BYTE_TYPE(enc, ptr)) {
145        INVALID_CASES(ptr, nextTokPtr)
146      case BT_MINUS:
147        ptr += MINBPC(enc);
148        REQUIRE_CHAR(enc, ptr, end);
149        if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
150          ptr += MINBPC(enc);
151          REQUIRE_CHAR(enc, ptr, end);
152          if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
153            *nextTokPtr = ptr;
154            return XML_TOK_INVALID;
155          }
156          *nextTokPtr = ptr + MINBPC(enc);
157          return XML_TOK_COMMENT;
158        }
159        break;
160      default:
161        ptr += MINBPC(enc);
162        break;
163      }
164    }
165  }
166  return XML_TOK_PARTIAL;
167}
168
169/* ptr points to character following "<!" */
170
171static int PTRCALL
172PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
173                 const char **nextTokPtr) {
174  REQUIRE_CHAR(enc, ptr, end);
175  switch (BYTE_TYPE(enc, ptr)) {
176  case BT_MINUS:
177    return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
178  case BT_LSQB:
179    *nextTokPtr = ptr + MINBPC(enc);
180    return XML_TOK_COND_SECT_OPEN;
181  case BT_NMSTRT:
182  case BT_HEX:
183    ptr += MINBPC(enc);
184    break;
185  default:
186    *nextTokPtr = ptr;
187    return XML_TOK_INVALID;
188  }
189  while (HAS_CHAR(enc, ptr, end)) {
190    switch (BYTE_TYPE(enc, ptr)) {
191    case BT_PERCNT:
192      REQUIRE_CHARS(enc, ptr, end, 2);
193      /* don't allow <!ENTITY% foo "whatever"> */
194      switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
195      case BT_S:
196      case BT_CR:
197      case BT_LF:
198      case BT_PERCNT:
199        *nextTokPtr = ptr;
200        return XML_TOK_INVALID;
201      }
202      /* fall through */
203    case BT_S:
204    case BT_CR:
205    case BT_LF:
206      *nextTokPtr = ptr;
207      return XML_TOK_DECL_OPEN;
208    case BT_NMSTRT:
209    case BT_HEX:
210      ptr += MINBPC(enc);
211      break;
212    default:
213      *nextTokPtr = ptr;
214      return XML_TOK_INVALID;
215    }
216  }
217  return XML_TOK_PARTIAL;
218}
219
220static int PTRCALL
221PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
222                      int *tokPtr) {
223  int upper = 0;
224  UNUSED_P(enc);
225  *tokPtr = XML_TOK_PI;
226  if (end - ptr != MINBPC(enc) * 3)
227    return 1;
228  switch (BYTE_TO_ASCII(enc, ptr)) {
229  case ASCII_x:
230    break;
231  case ASCII_X:
232    upper = 1;
233    break;
234  default:
235    return 1;
236  }
237  ptr += MINBPC(enc);
238  switch (BYTE_TO_ASCII(enc, ptr)) {
239  case ASCII_m:
240    break;
241  case ASCII_M:
242    upper = 1;
243    break;
244  default:
245    return 1;
246  }
247  ptr += MINBPC(enc);
248  switch (BYTE_TO_ASCII(enc, ptr)) {
249  case ASCII_l:
250    break;
251  case ASCII_L:
252    upper = 1;
253    break;
254  default:
255    return 1;
256  }
257  if (upper)
258    return 0;
259  *tokPtr = XML_TOK_XML_DECL;
260  return 1;
261}
262
263/* ptr points to character following "<?" */
264
265static int PTRCALL
266PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
267               const char **nextTokPtr) {
268  int tok;
269  const char *target = ptr;
270  REQUIRE_CHAR(enc, ptr, end);
271  switch (BYTE_TYPE(enc, ptr)) {
272    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
273  default:
274    *nextTokPtr = ptr;
275    return XML_TOK_INVALID;
276  }
277  while (HAS_CHAR(enc, ptr, end)) {
278    switch (BYTE_TYPE(enc, ptr)) {
279      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
280    case BT_S:
281    case BT_CR:
282    case BT_LF:
283      if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
284        *nextTokPtr = ptr;
285        return XML_TOK_INVALID;
286      }
287      ptr += MINBPC(enc);
288      while (HAS_CHAR(enc, ptr, end)) {
289        switch (BYTE_TYPE(enc, ptr)) {
290          INVALID_CASES(ptr, nextTokPtr)
291        case BT_QUEST:
292          ptr += MINBPC(enc);
293          REQUIRE_CHAR(enc, ptr, end);
294          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
295            *nextTokPtr = ptr + MINBPC(enc);
296            return tok;
297          }
298          break;
299        default:
300          ptr += MINBPC(enc);
301          break;
302        }
303      }
304      return XML_TOK_PARTIAL;
305    case BT_QUEST:
306      if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
307        *nextTokPtr = ptr;
308        return XML_TOK_INVALID;
309      }
310      ptr += MINBPC(enc);
311      REQUIRE_CHAR(enc, ptr, end);
312      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
313        *nextTokPtr = ptr + MINBPC(enc);
314        return tok;
315      }
316      /* fall through */
317    default:
318      *nextTokPtr = ptr;
319      return XML_TOK_INVALID;
320    }
321  }
322  return XML_TOK_PARTIAL;
323}
324
325static int PTRCALL
326PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
327                         const char **nextTokPtr) {
328  static const char CDATA_LSQB[]
329      = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
330  int i;
331  UNUSED_P(enc);
332  /* CDATA[ */
333  REQUIRE_CHARS(enc, ptr, end, 6);
334  for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
335    if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
336      *nextTokPtr = ptr;
337      return XML_TOK_INVALID;
338    }
339  }
340  *nextTokPtr = ptr;
341  return XML_TOK_CDATA_SECT_OPEN;
342}
343
344static int PTRCALL
345PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
346                        const char **nextTokPtr) {
347  if (ptr >= end)
348    return XML_TOK_NONE;
349  if (MINBPC(enc) > 1) {
350    size_t n = end - ptr;
351    if (n & (MINBPC(enc) - 1)) {
352      n &= ~(MINBPC(enc) - 1);
353      if (n == 0)
354        return XML_TOK_PARTIAL;
355      end = ptr + n;
356    }
357  }
358  switch (BYTE_TYPE(enc, ptr)) {
359  case BT_RSQB:
360    ptr += MINBPC(enc);
361    REQUIRE_CHAR(enc, ptr, end);
362    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
363      break;
364    ptr += MINBPC(enc);
365    REQUIRE_CHAR(enc, ptr, end);
366    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
367      ptr -= MINBPC(enc);
368      break;
369    }
370    *nextTokPtr = ptr + MINBPC(enc);
371    return XML_TOK_CDATA_SECT_CLOSE;
372  case BT_CR:
373    ptr += MINBPC(enc);
374    REQUIRE_CHAR(enc, ptr, end);
375    if (BYTE_TYPE(enc, ptr) == BT_LF)
376      ptr += MINBPC(enc);
377    *nextTokPtr = ptr;
378    return XML_TOK_DATA_NEWLINE;
379  case BT_LF:
380    *nextTokPtr = ptr + MINBPC(enc);
381    return XML_TOK_DATA_NEWLINE;
382    INVALID_CASES(ptr, nextTokPtr)
383  default:
384    ptr += MINBPC(enc);
385    break;
386  }
387  while (HAS_CHAR(enc, ptr, end)) {
388    switch (BYTE_TYPE(enc, ptr)) {
389#  define LEAD_CASE(n)                                                         \
390  case BT_LEAD##n:                                                             \
391    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
392      *nextTokPtr = ptr;                                                       \
393      return XML_TOK_DATA_CHARS;                                               \
394    }                                                                          \
395    ptr += n;                                                                  \
396    break;
397      LEAD_CASE(2)
398      LEAD_CASE(3)
399      LEAD_CASE(4)
400#  undef LEAD_CASE
401    case BT_NONXML:
402    case BT_MALFORM:
403    case BT_TRAIL:
404    case BT_CR:
405    case BT_LF:
406    case BT_RSQB:
407      *nextTokPtr = ptr;
408      return XML_TOK_DATA_CHARS;
409    default:
410      ptr += MINBPC(enc);
411      break;
412    }
413  }
414  *nextTokPtr = ptr;
415  return XML_TOK_DATA_CHARS;
416}
417
418/* ptr points to character following "</" */
419
420static int PTRCALL
421PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
422                   const char **nextTokPtr) {
423  REQUIRE_CHAR(enc, ptr, end);
424  switch (BYTE_TYPE(enc, ptr)) {
425    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
426  default:
427    *nextTokPtr = ptr;
428    return XML_TOK_INVALID;
429  }
430  while (HAS_CHAR(enc, ptr, end)) {
431    switch (BYTE_TYPE(enc, ptr)) {
432      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
433    case BT_S:
434    case BT_CR:
435    case BT_LF:
436      for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
437        switch (BYTE_TYPE(enc, ptr)) {
438        case BT_S:
439        case BT_CR:
440        case BT_LF:
441          break;
442        case BT_GT:
443          *nextTokPtr = ptr + MINBPC(enc);
444          return XML_TOK_END_TAG;
445        default:
446          *nextTokPtr = ptr;
447          return XML_TOK_INVALID;
448        }
449      }
450      return XML_TOK_PARTIAL;
451#  ifdef XML_NS
452    case BT_COLON:
453      /* no need to check qname syntax here,
454         since end-tag must match exactly */
455      ptr += MINBPC(enc);
456      break;
457#  endif
458    case BT_GT:
459      *nextTokPtr = ptr + MINBPC(enc);
460      return XML_TOK_END_TAG;
461    default:
462      *nextTokPtr = ptr;
463      return XML_TOK_INVALID;
464    }
465  }
466  return XML_TOK_PARTIAL;
467}
468
469/* ptr points to character following "&#X" */
470
471static int PTRCALL
472PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
473                       const char **nextTokPtr) {
474  if (HAS_CHAR(enc, ptr, end)) {
475    switch (BYTE_TYPE(enc, ptr)) {
476    case BT_DIGIT:
477    case BT_HEX:
478      break;
479    default:
480      *nextTokPtr = ptr;
481      return XML_TOK_INVALID;
482    }
483    for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
484      switch (BYTE_TYPE(enc, ptr)) {
485      case BT_DIGIT:
486      case BT_HEX:
487        break;
488      case BT_SEMI:
489        *nextTokPtr = ptr + MINBPC(enc);
490        return XML_TOK_CHAR_REF;
491      default:
492        *nextTokPtr = ptr;
493        return XML_TOK_INVALID;
494      }
495    }
496  }
497  return XML_TOK_PARTIAL;
498}
499
500/* ptr points to character following "&#" */
501
502static int PTRCALL
503PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
504                    const char **nextTokPtr) {
505  if (HAS_CHAR(enc, ptr, end)) {
506    if (CHAR_MATCHES(enc, ptr, ASCII_x))
507      return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
508    switch (BYTE_TYPE(enc, ptr)) {
509    case BT_DIGIT:
510      break;
511    default:
512      *nextTokPtr = ptr;
513      return XML_TOK_INVALID;
514    }
515    for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
516      switch (BYTE_TYPE(enc, ptr)) {
517      case BT_DIGIT:
518        break;
519      case BT_SEMI:
520        *nextTokPtr = ptr + MINBPC(enc);
521        return XML_TOK_CHAR_REF;
522      default:
523        *nextTokPtr = ptr;
524        return XML_TOK_INVALID;
525      }
526    }
527  }
528  return XML_TOK_PARTIAL;
529}
530
531/* ptr points to character following "&" */
532
533static int PTRCALL
534PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
535                const char **nextTokPtr) {
536  REQUIRE_CHAR(enc, ptr, end);
537  switch (BYTE_TYPE(enc, ptr)) {
538    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
539  case BT_NUM:
540    return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
541  default:
542    *nextTokPtr = ptr;
543    return XML_TOK_INVALID;
544  }
545  while (HAS_CHAR(enc, ptr, end)) {
546    switch (BYTE_TYPE(enc, ptr)) {
547      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
548    case BT_SEMI:
549      *nextTokPtr = ptr + MINBPC(enc);
550      return XML_TOK_ENTITY_REF;
551    default:
552      *nextTokPtr = ptr;
553      return XML_TOK_INVALID;
554    }
555  }
556  return XML_TOK_PARTIAL;
557}
558
559/* ptr points to character following first character of attribute name */
560
561static int PTRCALL
562PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
563                 const char **nextTokPtr) {
564#  ifdef XML_NS
565  int hadColon = 0;
566#  endif
567  while (HAS_CHAR(enc, ptr, end)) {
568    switch (BYTE_TYPE(enc, ptr)) {
569      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
570#  ifdef XML_NS
571    case BT_COLON:
572      if (hadColon) {
573        *nextTokPtr = ptr;
574        return XML_TOK_INVALID;
575      }
576      hadColon = 1;
577      ptr += MINBPC(enc);
578      REQUIRE_CHAR(enc, ptr, end);
579      switch (BYTE_TYPE(enc, ptr)) {
580        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
581      default:
582        *nextTokPtr = ptr;
583        return XML_TOK_INVALID;
584      }
585      break;
586#  endif
587    case BT_S:
588    case BT_CR:
589    case BT_LF:
590      for (;;) {
591        int t;
592
593        ptr += MINBPC(enc);
594        REQUIRE_CHAR(enc, ptr, end);
595        t = BYTE_TYPE(enc, ptr);
596        if (t == BT_EQUALS)
597          break;
598        switch (t) {
599        case BT_S:
600        case BT_LF:
601        case BT_CR:
602          break;
603        default:
604          *nextTokPtr = ptr;
605          return XML_TOK_INVALID;
606        }
607      }
608      /* fall through */
609    case BT_EQUALS: {
610      int open;
611#  ifdef XML_NS
612      hadColon = 0;
613#  endif
614      for (;;) {
615        ptr += MINBPC(enc);
616        REQUIRE_CHAR(enc, ptr, end);
617        open = BYTE_TYPE(enc, ptr);
618        if (open == BT_QUOT || open == BT_APOS)
619          break;
620        switch (open) {
621        case BT_S:
622        case BT_LF:
623        case BT_CR:
624          break;
625        default:
626          *nextTokPtr = ptr;
627          return XML_TOK_INVALID;
628        }
629      }
630      ptr += MINBPC(enc);
631      /* in attribute value */
632      for (;;) {
633        int t;
634        REQUIRE_CHAR(enc, ptr, end);
635        t = BYTE_TYPE(enc, ptr);
636        if (t == open)
637          break;
638        switch (t) {
639          INVALID_CASES(ptr, nextTokPtr)
640        case BT_AMP: {
641          int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
642          if (tok <= 0) {
643            if (tok == XML_TOK_INVALID)
644              *nextTokPtr = ptr;
645            return tok;
646          }
647          break;
648        }
649        case BT_LT:
650          *nextTokPtr = ptr;
651          return XML_TOK_INVALID;
652        default:
653          ptr += MINBPC(enc);
654          break;
655        }
656      }
657      ptr += MINBPC(enc);
658      REQUIRE_CHAR(enc, ptr, end);
659      switch (BYTE_TYPE(enc, ptr)) {
660      case BT_S:
661      case BT_CR:
662      case BT_LF:
663        break;
664      case BT_SOL:
665        goto sol;
666      case BT_GT:
667        goto gt;
668      default:
669        *nextTokPtr = ptr;
670        return XML_TOK_INVALID;
671      }
672      /* ptr points to closing quote */
673      for (;;) {
674        ptr += MINBPC(enc);
675        REQUIRE_CHAR(enc, ptr, end);
676        switch (BYTE_TYPE(enc, ptr)) {
677          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
678        case BT_S:
679        case BT_CR:
680        case BT_LF:
681          continue;
682        case BT_GT:
683        gt:
684          *nextTokPtr = ptr + MINBPC(enc);
685          return XML_TOK_START_TAG_WITH_ATTS;
686        case BT_SOL:
687        sol:
688          ptr += MINBPC(enc);
689          REQUIRE_CHAR(enc, ptr, end);
690          if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
691            *nextTokPtr = ptr;
692            return XML_TOK_INVALID;
693          }
694          *nextTokPtr = ptr + MINBPC(enc);
695          return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
696        default:
697          *nextTokPtr = ptr;
698          return XML_TOK_INVALID;
699        }
700        break;
701      }
702      break;
703    }
704    default:
705      *nextTokPtr = ptr;
706      return XML_TOK_INVALID;
707    }
708  }
709  return XML_TOK_PARTIAL;
710}
711
712/* ptr points to character following "<" */
713
714static int PTRCALL
715PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
716               const char **nextTokPtr) {
717#  ifdef XML_NS
718  int hadColon;
719#  endif
720  REQUIRE_CHAR(enc, ptr, end);
721  switch (BYTE_TYPE(enc, ptr)) {
722    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
723  case BT_EXCL:
724    ptr += MINBPC(enc);
725    REQUIRE_CHAR(enc, ptr, end);
726    switch (BYTE_TYPE(enc, ptr)) {
727    case BT_MINUS:
728      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
729    case BT_LSQB:
730      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
731    }
732    *nextTokPtr = ptr;
733    return XML_TOK_INVALID;
734  case BT_QUEST:
735    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
736  case BT_SOL:
737    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
738  default:
739    *nextTokPtr = ptr;
740    return XML_TOK_INVALID;
741  }
742#  ifdef XML_NS
743  hadColon = 0;
744#  endif
745  /* we have a start-tag */
746  while (HAS_CHAR(enc, ptr, end)) {
747    switch (BYTE_TYPE(enc, ptr)) {
748      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
749#  ifdef XML_NS
750    case BT_COLON:
751      if (hadColon) {
752        *nextTokPtr = ptr;
753        return XML_TOK_INVALID;
754      }
755      hadColon = 1;
756      ptr += MINBPC(enc);
757      REQUIRE_CHAR(enc, ptr, end);
758      switch (BYTE_TYPE(enc, ptr)) {
759        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
760      default:
761        *nextTokPtr = ptr;
762        return XML_TOK_INVALID;
763      }
764      break;
765#  endif
766    case BT_S:
767    case BT_CR:
768    case BT_LF: {
769      ptr += MINBPC(enc);
770      while (HAS_CHAR(enc, ptr, end)) {
771        switch (BYTE_TYPE(enc, ptr)) {
772          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
773        case BT_GT:
774          goto gt;
775        case BT_SOL:
776          goto sol;
777        case BT_S:
778        case BT_CR:
779        case BT_LF:
780          ptr += MINBPC(enc);
781          continue;
782        default:
783          *nextTokPtr = ptr;
784          return XML_TOK_INVALID;
785        }
786        return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
787      }
788      return XML_TOK_PARTIAL;
789    }
790    case BT_GT:
791    gt:
792      *nextTokPtr = ptr + MINBPC(enc);
793      return XML_TOK_START_TAG_NO_ATTS;
794    case BT_SOL:
795    sol:
796      ptr += MINBPC(enc);
797      REQUIRE_CHAR(enc, ptr, end);
798      if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
799        *nextTokPtr = ptr;
800        return XML_TOK_INVALID;
801      }
802      *nextTokPtr = ptr + MINBPC(enc);
803      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
804    default:
805      *nextTokPtr = ptr;
806      return XML_TOK_INVALID;
807    }
808  }
809  return XML_TOK_PARTIAL;
810}
811
812static int PTRCALL
813PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
814                   const char **nextTokPtr) {
815  if (ptr >= end)
816    return XML_TOK_NONE;
817  if (MINBPC(enc) > 1) {
818    size_t n = end - ptr;
819    if (n & (MINBPC(enc) - 1)) {
820      n &= ~(MINBPC(enc) - 1);
821      if (n == 0)
822        return XML_TOK_PARTIAL;
823      end = ptr + n;
824    }
825  }
826  switch (BYTE_TYPE(enc, ptr)) {
827  case BT_LT:
828    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
829  case BT_AMP:
830    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
831  case BT_CR:
832    ptr += MINBPC(enc);
833    if (! HAS_CHAR(enc, ptr, end))
834      return XML_TOK_TRAILING_CR;
835    if (BYTE_TYPE(enc, ptr) == BT_LF)
836      ptr += MINBPC(enc);
837    *nextTokPtr = ptr;
838    return XML_TOK_DATA_NEWLINE;
839  case BT_LF:
840    *nextTokPtr = ptr + MINBPC(enc);
841    return XML_TOK_DATA_NEWLINE;
842  case BT_RSQB:
843    ptr += MINBPC(enc);
844    if (! HAS_CHAR(enc, ptr, end))
845      return XML_TOK_TRAILING_RSQB;
846    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
847      break;
848    ptr += MINBPC(enc);
849    if (! HAS_CHAR(enc, ptr, end))
850      return XML_TOK_TRAILING_RSQB;
851    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
852      ptr -= MINBPC(enc);
853      break;
854    }
855    *nextTokPtr = ptr;
856    return XML_TOK_INVALID;
857    INVALID_CASES(ptr, nextTokPtr)
858  default:
859    ptr += MINBPC(enc);
860    break;
861  }
862  while (HAS_CHAR(enc, ptr, end)) {
863    switch (BYTE_TYPE(enc, ptr)) {
864#  define LEAD_CASE(n)                                                         \
865  case BT_LEAD##n:                                                             \
866    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
867      *nextTokPtr = ptr;                                                       \
868      return XML_TOK_DATA_CHARS;                                               \
869    }                                                                          \
870    ptr += n;                                                                  \
871    break;
872      LEAD_CASE(2)
873      LEAD_CASE(3)
874      LEAD_CASE(4)
875#  undef LEAD_CASE
876    case BT_RSQB:
877      if (HAS_CHARS(enc, ptr, end, 2)) {
878        if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
879          ptr += MINBPC(enc);
880          break;
881        }
882        if (HAS_CHARS(enc, ptr, end, 3)) {
883          if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
884            ptr += MINBPC(enc);
885            break;
886          }
887          *nextTokPtr = ptr + 2 * MINBPC(enc);
888          return XML_TOK_INVALID;
889        }
890      }
891      /* fall through */
892    case BT_AMP:
893    case BT_LT:
894    case BT_NONXML:
895    case BT_MALFORM:
896    case BT_TRAIL:
897    case BT_CR:
898    case BT_LF:
899      *nextTokPtr = ptr;
900      return XML_TOK_DATA_CHARS;
901    default:
902      ptr += MINBPC(enc);
903      break;
904    }
905  }
906  *nextTokPtr = ptr;
907  return XML_TOK_DATA_CHARS;
908}
909
910/* ptr points to character following "%" */
911
912static int PTRCALL
913PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
914                    const char **nextTokPtr) {
915  REQUIRE_CHAR(enc, ptr, end);
916  switch (BYTE_TYPE(enc, ptr)) {
917    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
918  case BT_S:
919  case BT_LF:
920  case BT_CR:
921  case BT_PERCNT:
922    *nextTokPtr = ptr;
923    return XML_TOK_PERCENT;
924  default:
925    *nextTokPtr = ptr;
926    return XML_TOK_INVALID;
927  }
928  while (HAS_CHAR(enc, ptr, end)) {
929    switch (BYTE_TYPE(enc, ptr)) {
930      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
931    case BT_SEMI:
932      *nextTokPtr = ptr + MINBPC(enc);
933      return XML_TOK_PARAM_ENTITY_REF;
934    default:
935      *nextTokPtr = ptr;
936      return XML_TOK_INVALID;
937    }
938  }
939  return XML_TOK_PARTIAL;
940}
941
942static int PTRCALL
943PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
944                      const char **nextTokPtr) {
945  REQUIRE_CHAR(enc, ptr, end);
946  switch (BYTE_TYPE(enc, ptr)) {
947    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
948  default:
949    *nextTokPtr = ptr;
950    return XML_TOK_INVALID;
951  }
952  while (HAS_CHAR(enc, ptr, end)) {
953    switch (BYTE_TYPE(enc, ptr)) {
954      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
955    case BT_CR:
956    case BT_LF:
957    case BT_S:
958    case BT_RPAR:
959    case BT_GT:
960    case BT_PERCNT:
961    case BT_VERBAR:
962      *nextTokPtr = ptr;
963      return XML_TOK_POUND_NAME;
964    default:
965      *nextTokPtr = ptr;
966      return XML_TOK_INVALID;
967    }
968  }
969  return -XML_TOK_POUND_NAME;
970}
971
972static int PTRCALL
973PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
974                const char **nextTokPtr) {
975  while (HAS_CHAR(enc, ptr, end)) {
976    int t = BYTE_TYPE(enc, ptr);
977    switch (t) {
978      INVALID_CASES(ptr, nextTokPtr)
979    case BT_QUOT:
980    case BT_APOS:
981      ptr += MINBPC(enc);
982      if (t != open)
983        break;
984      if (! HAS_CHAR(enc, ptr, end))
985        return -XML_TOK_LITERAL;
986      *nextTokPtr = ptr;
987      switch (BYTE_TYPE(enc, ptr)) {
988      case BT_S:
989      case BT_CR:
990      case BT_LF:
991      case BT_GT:
992      case BT_PERCNT:
993      case BT_LSQB:
994        return XML_TOK_LITERAL;
995      default:
996        return XML_TOK_INVALID;
997      }
998    default:
999      ptr += MINBPC(enc);
1000      break;
1001    }
1002  }
1003  return XML_TOK_PARTIAL;
1004}
1005
/* Tokenizes one token starting at ptr, dispatching on the byte type of the
   first character: literals, "<!"/"<?" constructs, white space, "%",
   punctuation, "#name", and names / name tokens with an optional
   "+", "*" or "?" suffix.

   enc        - encoding; used through the BYTE_TYPE/MINBPC/... macros
   ptr, end   - input range [ptr, end)
   nextTokPtr - receives the position where scanning stopped (set on most,
                but not all, return paths)

   Returns XML_TOK_NONE when the range is empty, XML_TOK_PARTIAL when not
   even one complete MINBPC unit is available, XML_TOK_INVALID (with
   *nextTokPtr at the offending character) for unexpected input, and
   otherwise a token code.  A negated token code is returned when the input
   ends immediately after a lone CR, a "]", a ")" or in the middle of a
   name, i.e. where more input could still change the token.
*/
static int PTRCALL
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
                  const char **nextTokPtr) {
  int tok;
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* Trim `end` so that the range holds a whole number of MINBPC units.
       NOTE(review): the mask arithmetic presumes MINBPC(enc) is a power
       of two. */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_QUOT:
    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_APOS:
    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_LT: {
    /* "<": the following character selects declaration, PI, or the start
       of the document instance */
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_EXCL:
      return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_QUEST:
      return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_NMSTRT:
    case BT_HEX:
    case BT_NONASCII:
    case BT_LEAD2:
    case BT_LEAD3:
    case BT_LEAD4:
      /* the "<" itself is not consumed: *nextTokPtr points back at it */
      *nextTokPtr = ptr - MINBPC(enc);
      return XML_TOK_INSTANCE_START;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  case BT_CR:
    if (ptr + MINBPC(enc) == end) {
      *nextTokPtr = end;
      /* indicate that this might be part of a CR/LF pair */
      return -XML_TOK_PROLOG_S;
    }
    /* fall through */
  case BT_S:
  case BT_LF:
    /* consume a run of white space as a single XML_TOK_PROLOG_S token */
    for (;;) {
      ptr += MINBPC(enc);
      if (! HAS_CHAR(enc, ptr, end))
        break;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S:
      case BT_LF:
        break;
      case BT_CR:
        /* don't split CR/LF pair */
        if (ptr + MINBPC(enc) != end)
          break;
        /* fall through */
      default:
        *nextTokPtr = ptr;
        return XML_TOK_PROLOG_S;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_PROLOG_S;
  case BT_PERCNT:
    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_COMMA:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_COMMA;
  case BT_LSQB:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_BRACKET;
  case BT_RSQB:
    /* "]" alone is a close bracket; "]]>" closes a conditional section */
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return -XML_TOK_CLOSE_BRACKET;
    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
      REQUIRE_CHARS(enc, ptr, end, 2);
      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
        *nextTokPtr = ptr + 2 * MINBPC(enc);
        return XML_TOK_COND_SECT_CLOSE;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_CLOSE_BRACKET;
  case BT_LPAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_PAREN;
  case BT_RPAR:
    /* ")" may carry a "*", "?" or "+" suffix; otherwise it must be
       followed by one of the delimiters listed below */
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return -XML_TOK_CLOSE_PAREN;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_AST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_ASTERISK;
    case BT_QUEST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_QUESTION;
    case BT_PLUS:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_PLUS;
    case BT_CR:
    case BT_LF:
    case BT_S:
    case BT_GT:
    case BT_COMMA:
    case BT_VERBAR:
    case BT_RPAR:
      *nextTokPtr = ptr;
      return XML_TOK_CLOSE_PAREN;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_VERBAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OR;
  case BT_GT:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DECL_CLOSE;
  case BT_NUM:
    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
/* Multi-byte lead byte of an n-byte character: needs all n bytes present;
   a name-start character begins an XML_TOK_NAME, any other name character
   begins an XML_TOK_NMTOKEN, anything else is invalid. */
#  define LEAD_CASE(n)                                                         \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_NMSTRT_CHAR(enc, ptr, n)) {                                         \
      ptr += n;                                                                \
      tok = XML_TOK_NAME;                                                      \
      break;                                                                   \
    }                                                                          \
    if (IS_NAME_CHAR(enc, ptr, n)) {                                           \
      ptr += n;                                                                \
      tok = XML_TOK_NMTOKEN;                                                   \
      break;                                                                   \
    }                                                                          \
    *nextTokPtr = ptr;                                                         \
    return XML_TOK_INVALID;
    LEAD_CASE(2)
    LEAD_CASE(3)
    LEAD_CASE(4)
#  undef LEAD_CASE
  case BT_NMSTRT:
  case BT_HEX:
    tok = XML_TOK_NAME;
    ptr += MINBPC(enc);
    break;
  case BT_DIGIT:
  case BT_NAME:
  case BT_MINUS:
#  ifdef XML_NS
  case BT_COLON:
#  endif
    tok = XML_TOK_NMTOKEN;
    ptr += MINBPC(enc);
    break;
  case BT_NONASCII:
    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NAME;
      break;
    }
    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NMTOKEN;
      break;
    }
    /* fall through */
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Only the name / name-token cases above break out of the switch; `tok`
     holds the kind seen so far.  Consume the rest of the name until a
     delimiter or suffix character is found. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_GT:
    case BT_RPAR:
    case BT_COMMA:
    case BT_VERBAR:
    case BT_LSQB:
    case BT_PERCNT:
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* delimiter: not consumed, the token ends just before it */
      *nextTokPtr = ptr;
      return tok;
#  ifdef XML_NS
    case BT_COLON:
      /* First colon in a name: becomes a prefixed name if a name
         character follows, otherwise degrades to a name token.  A second
         colon degrades a prefixed name to a name token. */
      ptr += MINBPC(enc);
      switch (tok) {
      case XML_TOK_NAME:
        REQUIRE_CHAR(enc, ptr, end);
        tok = XML_TOK_PREFIXED_NAME;
        switch (BYTE_TYPE(enc, ptr)) {
          CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
        default:
          tok = XML_TOK_NMTOKEN;
          break;
        }
        break;
      case XML_TOK_PREFIXED_NAME:
        tok = XML_TOK_NMTOKEN;
        break;
      }
      break;
#  endif
    /* Occurrence suffixes are rejected after a name token; otherwise the
       suffix character is consumed as part of the token. */
    case BT_PLUS:
      if (tok == XML_TOK_NMTOKEN) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_PLUS;
    case BT_AST:
      if (tok == XML_TOK_NMTOKEN) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_ASTERISK;
    case BT_QUEST:
      if (tok == XML_TOK_NMTOKEN) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_QUESTION;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  /* input ended inside the name: report the token kind negated */
  return -tok;
}
1245
1246static int PTRCALL
1247PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1248                          const char **nextTokPtr) {
1249  const char *start;
1250  if (ptr >= end)
1251    return XML_TOK_NONE;
1252  else if (! HAS_CHAR(enc, ptr, end)) {
1253    /* This line cannot be executed.  The incoming data has already
1254     * been tokenized once, so incomplete characters like this have
1255     * already been eliminated from the input.  Retaining the paranoia
1256     * check is still valuable, however.
1257     */
1258    return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1259  }
1260  start = ptr;
1261  while (HAS_CHAR(enc, ptr, end)) {
1262    switch (BYTE_TYPE(enc, ptr)) {
1263#  define LEAD_CASE(n)                                                         \
1264  case BT_LEAD##n:                                                             \
1265    ptr += n;                                                                  \
1266    break;
1267      LEAD_CASE(2)
1268      LEAD_CASE(3)
1269      LEAD_CASE(4)
1270#  undef LEAD_CASE
1271    case BT_AMP:
1272      if (ptr == start)
1273        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1274      *nextTokPtr = ptr;
1275      return XML_TOK_DATA_CHARS;
1276    case BT_LT:
1277      /* this is for inside entity references */
1278      *nextTokPtr = ptr;
1279      return XML_TOK_INVALID;
1280    case BT_LF:
1281      if (ptr == start) {
1282        *nextTokPtr = ptr + MINBPC(enc);
1283        return XML_TOK_DATA_NEWLINE;
1284      }
1285      *nextTokPtr = ptr;
1286      return XML_TOK_DATA_CHARS;
1287    case BT_CR:
1288      if (ptr == start) {
1289        ptr += MINBPC(enc);
1290        if (! HAS_CHAR(enc, ptr, end))
1291          return XML_TOK_TRAILING_CR;
1292        if (BYTE_TYPE(enc, ptr) == BT_LF)
1293          ptr += MINBPC(enc);
1294        *nextTokPtr = ptr;
1295        return XML_TOK_DATA_NEWLINE;
1296      }
1297      *nextTokPtr = ptr;
1298      return XML_TOK_DATA_CHARS;
1299    case BT_S:
1300      if (ptr == start) {
1301        *nextTokPtr = ptr + MINBPC(enc);
1302        return XML_TOK_ATTRIBUTE_VALUE_S;
1303      }
1304      *nextTokPtr = ptr;
1305      return XML_TOK_DATA_CHARS;
1306    default:
1307      ptr += MINBPC(enc);
1308      break;
1309    }
1310  }
1311  *nextTokPtr = ptr;
1312  return XML_TOK_DATA_CHARS;
1313}
1314
1315static int PTRCALL
1316PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1317                       const char **nextTokPtr) {
1318  const char *start;
1319  if (ptr >= end)
1320    return XML_TOK_NONE;
1321  else if (! HAS_CHAR(enc, ptr, end)) {
1322    /* This line cannot be executed.  The incoming data has already
1323     * been tokenized once, so incomplete characters like this have
1324     * already been eliminated from the input.  Retaining the paranoia
1325     * check is still valuable, however.
1326     */
1327    return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1328  }
1329  start = ptr;
1330  while (HAS_CHAR(enc, ptr, end)) {
1331    switch (BYTE_TYPE(enc, ptr)) {
1332#  define LEAD_CASE(n)                                                         \
1333  case BT_LEAD##n:                                                             \
1334    ptr += n;                                                                  \
1335    break;
1336      LEAD_CASE(2)
1337      LEAD_CASE(3)
1338      LEAD_CASE(4)
1339#  undef LEAD_CASE
1340    case BT_AMP:
1341      if (ptr == start)
1342        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1343      *nextTokPtr = ptr;
1344      return XML_TOK_DATA_CHARS;
1345    case BT_PERCNT:
1346      if (ptr == start) {
1347        int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1348        return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1349      }
1350      *nextTokPtr = ptr;
1351      return XML_TOK_DATA_CHARS;
1352    case BT_LF:
1353      if (ptr == start) {
1354        *nextTokPtr = ptr + MINBPC(enc);
1355        return XML_TOK_DATA_NEWLINE;
1356      }
1357      *nextTokPtr = ptr;
1358      return XML_TOK_DATA_CHARS;
1359    case BT_CR:
1360      if (ptr == start) {
1361        ptr += MINBPC(enc);
1362        if (! HAS_CHAR(enc, ptr, end))
1363          return XML_TOK_TRAILING_CR;
1364        if (BYTE_TYPE(enc, ptr) == BT_LF)
1365          ptr += MINBPC(enc);
1366        *nextTokPtr = ptr;
1367        return XML_TOK_DATA_NEWLINE;
1368      }
1369      *nextTokPtr = ptr;
1370      return XML_TOK_DATA_CHARS;
1371    default:
1372      ptr += MINBPC(enc);
1373      break;
1374    }
1375  }
1376  *nextTokPtr = ptr;
1377  return XML_TOK_DATA_CHARS;
1378}
1379
1380#  ifdef XML_DTD
1381
1382static int PTRCALL
1383PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1384                         const char **nextTokPtr) {
1385  int level = 0;
1386  if (MINBPC(enc) > 1) {
1387    size_t n = end - ptr;
1388    if (n & (MINBPC(enc) - 1)) {
1389      n &= ~(MINBPC(enc) - 1);
1390      end = ptr + n;
1391    }
1392  }
1393  while (HAS_CHAR(enc, ptr, end)) {
1394    switch (BYTE_TYPE(enc, ptr)) {
1395      INVALID_CASES(ptr, nextTokPtr)
1396    case BT_LT:
1397      ptr += MINBPC(enc);
1398      REQUIRE_CHAR(enc, ptr, end);
1399      if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1400        ptr += MINBPC(enc);
1401        REQUIRE_CHAR(enc, ptr, end);
1402        if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1403          ++level;
1404          ptr += MINBPC(enc);
1405        }
1406      }
1407      break;
1408    case BT_RSQB:
1409      ptr += MINBPC(enc);
1410      REQUIRE_CHAR(enc, ptr, end);
1411      if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1412        ptr += MINBPC(enc);
1413        REQUIRE_CHAR(enc, ptr, end);
1414        if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1415          ptr += MINBPC(enc);
1416          if (level == 0) {
1417            *nextTokPtr = ptr;
1418            return XML_TOK_IGNORE_SECT;
1419          }
1420          --level;
1421        }
1422      }
1423      break;
1424    default:
1425      ptr += MINBPC(enc);
1426      break;
1427    }
1428  }
1429  return XML_TOK_PARTIAL;
1430}
1431
1432#  endif /* XML_DTD */
1433
1434static int PTRCALL
1435PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1436                   const char **badPtr) {
1437  ptr += MINBPC(enc);
1438  end -= MINBPC(enc);
1439  for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1440    switch (BYTE_TYPE(enc, ptr)) {
1441    case BT_DIGIT:
1442    case BT_HEX:
1443    case BT_MINUS:
1444    case BT_APOS:
1445    case BT_LPAR:
1446    case BT_RPAR:
1447    case BT_PLUS:
1448    case BT_COMMA:
1449    case BT_SOL:
1450    case BT_EQUALS:
1451    case BT_QUEST:
1452    case BT_CR:
1453    case BT_LF:
1454    case BT_SEMI:
1455    case BT_EXCL:
1456    case BT_AST:
1457    case BT_PERCNT:
1458    case BT_NUM:
1459#  ifdef XML_NS
1460    case BT_COLON:
1461#  endif
1462      break;
1463    case BT_S:
1464      if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1465        *badPtr = ptr;
1466        return 0;
1467      }
1468      break;
1469    case BT_NAME:
1470    case BT_NMSTRT:
1471      if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1472        break;
1473      /* fall through */
1474    default:
1475      switch (BYTE_TO_ASCII(enc, ptr)) {
1476      case 0x24: /* $ */
1477      case 0x40: /* @ */
1478        break;
1479      default:
1480        *badPtr = ptr;
1481        return 0;
1482      }
1483      break;
1484    }
1485  }
1486  return 1;
1487}
1488
/* This must only be called for a well-formed start-tag or empty
   element tag.  Returns the number of attributes.  Pointers to the
   first attsMax attributes are stored in atts.

   ptr points at the start of the tag; the first character is skipped and
   scanning begins in the inName state (i.e. inside the element name).
   For each attribute whose index is below attsMax the fields name,
   valuePtr, valueEnd and normalized of atts[] are filled in; attributes
   beyond attsMax are still counted but not stored.  There is no end
   pointer: the loop stops at the first ">" or "/" found outside an
   attribute value, which is why the input must be well-formed.
*/

static int PTRCALL
PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
                ATTRIBUTE *atts) {
  /* other   = between names/values,
     inName  = inside the element name or an attribute name,
     inValue = between an attribute value's opening and closing quote */
  enum { other, inName, inValue } state = inName;
  int nAtts = 0;
  int open = 0; /* defined when state == inValue;
                   initialization just to shut up compilers */

  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* START_NAME: a name character seen in state `other` begins the next
   attribute name; record its position and start out assuming the value
   is normalized. */
#  define START_NAME                                                           \
    if (state == other) {                                                      \
      if (nAtts < attsMax) {                                                   \
        atts[nAtts].name = ptr;                                                \
        atts[nAtts].normalized = 1;                                            \
      }                                                                        \
      state = inName;                                                          \
    }
/* LEAD_CASE: n-byte character; the loop increment adds MINBPC(enc), so only
   the remaining n - MINBPC(enc) bytes are skipped here. */
#  define LEAD_CASE(n)                                                         \
  case BT_LEAD##n:                                                             \
    START_NAME ptr += (n - MINBPC(enc));                                       \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_NONASCII:
    case BT_NMSTRT:
    case BT_HEX:
      START_NAME
      break;
#  undef START_NAME
    case BT_QUOT:
      if (state != inValue) {
        /* opening quote: the value starts at the next character */
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_QUOT;
      } else if (open == BT_QUOT) {
        /* matching closing quote: the attribute is complete */
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      /* a '"' inside a value opened with "'" is plain content */
      break;
    case BT_APOS:
      if (state != inValue) {
        /* opening quote: the value starts at the next character */
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_APOS;
      } else if (open == BT_APOS) {
        /* matching closing quote: the attribute is complete */
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      /* a "'" inside a value opened with '"' is plain content */
      break;
    case BT_AMP:
      /* any "&" clears the normalized flag of the current attribute */
      if (nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_S:
      /* White space ends a name.  Inside a value it clears the normalized
         flag when it is the first character of the value, is not a plain
         space character, is followed by another space, or is followed by
         the closing quote. */
      if (state == inName)
        state = other;
      else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
               && (ptr == atts[nAtts].valuePtr
                   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
                   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
                   || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
        atts[nAtts].normalized = 0;
      break;
    case BT_CR:
    case BT_LF:
      /* This case ensures that the first attribute name is counted
         Apart from that we could just change state on the quote. */
      if (state == inName)
        state = other;
      else if (state == inValue && nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_GT:
    case BT_SOL:
      /* end of the tag, unless the character is part of a value */
      if (state != inValue)
        return nAtts;
      break;
    default:
      break;
    }
  }
  /* not reached */
}
1586
1587static int PTRFASTCALL
1588PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1589  int result = 0;
1590  /* skip &# */
1591  UNUSED_P(enc);
1592  ptr += 2 * MINBPC(enc);
1593  if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1594    for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1595         ptr += MINBPC(enc)) {
1596      int c = BYTE_TO_ASCII(enc, ptr);
1597      switch (c) {
1598      case ASCII_0:
1599      case ASCII_1:
1600      case ASCII_2:
1601      case ASCII_3:
1602      case ASCII_4:
1603      case ASCII_5:
1604      case ASCII_6:
1605      case ASCII_7:
1606      case ASCII_8:
1607      case ASCII_9:
1608        result <<= 4;
1609        result |= (c - ASCII_0);
1610        break;
1611      case ASCII_A:
1612      case ASCII_B:
1613      case ASCII_C:
1614      case ASCII_D:
1615      case ASCII_E:
1616      case ASCII_F:
1617        result <<= 4;
1618        result += 10 + (c - ASCII_A);
1619        break;
1620      case ASCII_a:
1621      case ASCII_b:
1622      case ASCII_c:
1623      case ASCII_d:
1624      case ASCII_e:
1625      case ASCII_f:
1626        result <<= 4;
1627        result += 10 + (c - ASCII_a);
1628        break;
1629      }
1630      if (result >= 0x110000)
1631        return -1;
1632    }
1633  } else {
1634    for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1635      int c = BYTE_TO_ASCII(enc, ptr);
1636      result *= 10;
1637      result += (c - ASCII_0);
1638      if (result >= 0x110000)
1639        return -1;
1640    }
1641  }
1642  return checkCharRefNumber(result);
1643}
1644
1645static int PTRCALL
1646PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1647                             const char *end) {
1648  UNUSED_P(enc);
1649  switch ((end - ptr) / MINBPC(enc)) {
1650  case 2:
1651    if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1652      switch (BYTE_TO_ASCII(enc, ptr)) {
1653      case ASCII_l:
1654        return ASCII_LT;
1655      case ASCII_g:
1656        return ASCII_GT;
1657      }
1658    }
1659    break;
1660  case 3:
1661    if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1662      ptr += MINBPC(enc);
1663      if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1664        ptr += MINBPC(enc);
1665        if (CHAR_MATCHES(enc, ptr, ASCII_p))
1666          return ASCII_AMP;
1667      }
1668    }
1669    break;
1670  case 4:
1671    switch (BYTE_TO_ASCII(enc, ptr)) {
1672    case ASCII_q:
1673      ptr += MINBPC(enc);
1674      if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1675        ptr += MINBPC(enc);
1676        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1677          ptr += MINBPC(enc);
1678          if (CHAR_MATCHES(enc, ptr, ASCII_t))
1679            return ASCII_QUOT;
1680        }
1681      }
1682      break;
1683    case ASCII_a:
1684      ptr += MINBPC(enc);
1685      if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1686        ptr += MINBPC(enc);
1687        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1688          ptr += MINBPC(enc);
1689          if (CHAR_MATCHES(enc, ptr, ASCII_s))
1690            return ASCII_APOS;
1691        }
1692      }
1693      break;
1694    }
1695  }
1696  return 0;
1697}
1698
1699static int PTRCALL
1700PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1701                         const char *end1, const char *ptr2) {
1702  UNUSED_P(enc);
1703  for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1704    if (end1 - ptr1 < MINBPC(enc)) {
1705      /* This line cannot be executed.  The incoming data has already
1706       * been tokenized once, so incomplete characters like this have
1707       * already been eliminated from the input.  Retaining the
1708       * paranoia check is still valuable, however.
1709       */
1710      return 0; /* LCOV_EXCL_LINE */
1711    }
1712    if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1713      return 0;
1714  }
1715  return ptr1 == end1;
1716}
1717
1718static int PTRFASTCALL
1719PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1720  const char *start = ptr;
1721  for (;;) {
1722    switch (BYTE_TYPE(enc, ptr)) {
1723#  define LEAD_CASE(n)                                                         \
1724  case BT_LEAD##n:                                                             \
1725    ptr += n;                                                                  \
1726    break;
1727      LEAD_CASE(2)
1728      LEAD_CASE(3)
1729      LEAD_CASE(4)
1730#  undef LEAD_CASE
1731    case BT_NONASCII:
1732    case BT_NMSTRT:
1733#  ifdef XML_NS
1734    case BT_COLON:
1735#  endif
1736    case BT_HEX:
1737    case BT_DIGIT:
1738    case BT_NAME:
1739    case BT_MINUS:
1740      ptr += MINBPC(enc);
1741      break;
1742    default:
1743      return (int)(ptr - start);
1744    }
1745  }
1746}
1747
1748static const char *PTRFASTCALL
1749PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1750  for (;;) {
1751    switch (BYTE_TYPE(enc, ptr)) {
1752    case BT_LF:
1753    case BT_CR:
1754    case BT_S:
1755      ptr += MINBPC(enc);
1756      break;
1757    default:
1758      return ptr;
1759    }
1760  }
1761}
1762
1763static void PTRCALL
1764PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1765                       POSITION *pos) {
1766  while (HAS_CHAR(enc, ptr, end)) {
1767    switch (BYTE_TYPE(enc, ptr)) {
1768#  define LEAD_CASE(n)                                                         \
1769  case BT_LEAD##n:                                                             \
1770    ptr += n;                                                                  \
1771    break;
1772      LEAD_CASE(2)
1773      LEAD_CASE(3)
1774      LEAD_CASE(4)
1775#  undef LEAD_CASE
1776    case BT_LF:
1777      pos->columnNumber = (XML_Size)-1;
1778      pos->lineNumber++;
1779      ptr += MINBPC(enc);
1780      break;
1781    case BT_CR:
1782      pos->lineNumber++;
1783      ptr += MINBPC(enc);
1784      if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1785        ptr += MINBPC(enc);
1786      pos->columnNumber = (XML_Size)-1;
1787      break;
1788    default:
1789      ptr += MINBPC(enc);
1790      break;
1791    }
1792    pos->columnNumber++;
1793  }
1794}
1795
1796#  undef DO_LEAD_CASE
1797#  undef MULTIBYTE_CASES
1798#  undef INVALID_CASES
1799#  undef CHECK_NAME_CASE
1800#  undef CHECK_NAME_CASES
1801#  undef CHECK_NMSTRT_CASE
1802#  undef CHECK_NMSTRT_CASES
1803
1804#endif /* XML_TOK_IMPL_C */
1805