/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
   See the file COPYING for copying permission.
*/
4104349Sphk
/* This file is meant to be #included, not compiled on its own. */
6178848Scokane#ifdef XML_TOK_IMPL_C
7178848Scokane
/* Fallback used when the including file supplies no IS_INVALID_CHAR:
   under this definition no character is ever reported as invalid. */
#if !defined(IS_INVALID_CHAR)
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
11104349Sphk
/* Case label for an n-byte lead byte in free-form content.
   - fewer than n bytes left before `end`: return XML_TOK_PARTIAL_CHAR;
   - IS_INVALID_CHAR flags the sequence: store ptr in *nextTokPtr and
     return XML_TOK_INVALID;
   - otherwise: step ptr over the n bytes and leave the switch.
   `end` and `enc` are taken from the scope of the expansion site. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                            \
    case BT_LEAD ## n:                                                   \
      if (end - (ptr) < (n))                                             \
        return XML_TOK_PARTIAL_CHAR;                                     \
      if (IS_INVALID_CHAR(enc, ptr, n)) {                                \
        *(nextTokPtr) = (ptr);                                           \
        return XML_TOK_INVALID;                                          \
      }                                                                  \
      (ptr) += (n);                                                      \
      break;
22104349Sphk
/* Case labels shared by the free-form scanners: the three multi-byte
   lead-byte cases from INVALID_LEAD_CASE, followed by the byte types that
   are always rejected (trail, malformed, non-XML), for which ptr is stored
   in *nextTokPtr and XML_TOK_INVALID is returned. */
#define INVALID_CASES(ptr, nextTokPtr)                                   \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                  \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                  \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                  \
  case BT_TRAIL:                                                         \
  case BT_MALFORM:                                                       \
  case BT_NONXML:                                                        \
    *(nextTokPtr) = (ptr);                                               \
    return XML_TOK_INVALID;
32104349Sphk
/* Case label for an n-byte lead byte inside a name.
   - fewer than n bytes left before end: return XML_TOK_PARTIAL_CHAR;
   - the sequence is flagged by IS_INVALID_CHAR, or is not accepted by
     IS_NAME_CHAR: store ptr in *nextTokPtr and return XML_TOK_INVALID;
   - otherwise: step ptr over the n bytes and leave the switch.
   IS_INVALID_CHAR is tested first so that a sequence the encoding reports
   as malformed is never accepted on the strength of IS_NAME_CHAR alone. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;
43104349Sphk
/* Case labels that consume one name character.
   BT_NONASCII is first checked with IS_NAME_CHAR_MINBPC (failure stores
   ptr in *nextTokPtr and returns XML_TOK_INVALID) and then deliberately
   falls into the single-unit group, which advances ptr by MINBPC(enc).
   Multi-byte lead bytes are handled by CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                      \
  case BT_NONASCII:                                                      \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) {                                \
      *nextTokPtr = ptr;                                                 \
      return XML_TOK_INVALID;                                            \
    }                                                                    \
    /* fall through */                                                   \
  case BT_NMSTRT:                                                        \
  case BT_HEX:                                                           \
  case BT_DIGIT:                                                         \
  case BT_NAME:                                                          \
  case BT_MINUS:                                                         \
    ptr += MINBPC(enc);                                                  \
    break;                                                               \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                          \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                          \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
60104349Sphk
/* Case label for an n-byte lead byte at the start of a name.
   - fewer than n bytes left before end: return XML_TOK_PARTIAL_CHAR;
   - the sequence is flagged by IS_INVALID_CHAR, or is not accepted by
     IS_NMSTRT_CHAR: store ptr in *nextTokPtr and return XML_TOK_INVALID;
   - otherwise: step ptr over the n bytes and leave the switch.
   IS_INVALID_CHAR is tested first so that a sequence the encoding reports
   as malformed is never accepted on the strength of IS_NMSTRT_CHAR alone. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;
71104349Sphk
/* Case labels that consume one name-start character.
   BT_NONASCII is first checked with IS_NMSTRT_CHAR_MINBPC (failure stores
   ptr in *nextTokPtr and returns XML_TOK_INVALID) and then deliberately
   falls into the single-unit group, which advances ptr by MINBPC(enc).
   Multi-byte lead bytes are handled by CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                    \
  case BT_NONASCII:                                                      \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                              \
      *nextTokPtr = ptr;                                                 \
      return XML_TOK_INVALID;                                            \
    }                                                                    \
    /* fall through */                                                   \
  case BT_NMSTRT:                                                        \
  case BT_HEX:                                                           \
    ptr += MINBPC(enc);                                                  \
    break;                                                               \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                        \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                        \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
85104349Sphk
/* Fallback: when the including file defines no PREFIX, function names are
   used unchanged. */
#if !defined(PREFIX)
#define PREFIX(ident) ident
#endif
89104349Sphk
90302385Sdelphij
/* True when at least `count` minimum-size characters (count * MINBPC(enc)
   bytes) lie between ptr and end.  Every argument is parenthesized so that
   compound expressions such as `n + 1` are evaluated as a unit. */
#define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-size character lies between ptr and end. */
#define HAS_CHAR(enc, ptr, end) \
    HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless HAS_CHARS
   holds for `count` characters. */
#define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, count)) { \
        return XML_TOK_PARTIAL; \
      } \
    }

/* Makes the enclosing function return XML_TOK_PARTIAL unless at least one
   character is available. */
#define REQUIRE_CHAR(enc, ptr, end) \
    REQUIRE_CHARS(enc, ptr, end, 1)
107302385Sdelphij
108104349Sphk/* ptr points to character following "<!-" */
109104349Sphk
110178848Scokanestatic int PTRCALL
111104349SphkPREFIX(scanComment)(const ENCODING *enc, const char *ptr,
112104349Sphk                    const char *end, const char **nextTokPtr)
113104349Sphk{
114302385Sdelphij  if (HAS_CHAR(enc, ptr, end)) {
115104349Sphk    if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
116104349Sphk      *nextTokPtr = ptr;
117104349Sphk      return XML_TOK_INVALID;
118104349Sphk    }
119104349Sphk    ptr += MINBPC(enc);
120302385Sdelphij    while (HAS_CHAR(enc, ptr, end)) {
121104349Sphk      switch (BYTE_TYPE(enc, ptr)) {
122104349Sphk      INVALID_CASES(ptr, nextTokPtr)
123104349Sphk      case BT_MINUS:
124302385Sdelphij        ptr += MINBPC(enc);
125302385Sdelphij        REQUIRE_CHAR(enc, ptr, end);
126104349Sphk        if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
127302385Sdelphij          ptr += MINBPC(enc);
128302385Sdelphij          REQUIRE_CHAR(enc, ptr, end);
129104349Sphk          if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
130104349Sphk            *nextTokPtr = ptr;
131104349Sphk            return XML_TOK_INVALID;
132104349Sphk          }
133104349Sphk          *nextTokPtr = ptr + MINBPC(enc);
134104349Sphk          return XML_TOK_COMMENT;
135104349Sphk        }
136104349Sphk        break;
137104349Sphk      default:
138104349Sphk        ptr += MINBPC(enc);
139104349Sphk        break;
140104349Sphk      }
141104349Sphk    }
142104349Sphk  }
143104349Sphk  return XML_TOK_PARTIAL;
144104349Sphk}
145104349Sphk
146104349Sphk/* ptr points to character following "<!" */
147104349Sphk
148178848Scokanestatic int PTRCALL
149104349SphkPREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
150104349Sphk                 const char *end, const char **nextTokPtr)
151104349Sphk{
152302385Sdelphij  REQUIRE_CHAR(enc, ptr, end);
153104349Sphk  switch (BYTE_TYPE(enc, ptr)) {
154104349Sphk  case BT_MINUS:
155104349Sphk    return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
156104349Sphk  case BT_LSQB:
157104349Sphk    *nextTokPtr = ptr + MINBPC(enc);
158104349Sphk    return XML_TOK_COND_SECT_OPEN;
159104349Sphk  case BT_NMSTRT:
160104349Sphk  case BT_HEX:
161104349Sphk    ptr += MINBPC(enc);
162104349Sphk    break;
163104349Sphk  default:
164104349Sphk    *nextTokPtr = ptr;
165104349Sphk    return XML_TOK_INVALID;
166104349Sphk  }
167302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
168104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
169104349Sphk    case BT_PERCNT:
170302385Sdelphij      REQUIRE_CHARS(enc, ptr, end, 2);
171104349Sphk      /* don't allow <!ENTITY% foo "whatever"> */
172104349Sphk      switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
173104349Sphk      case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
174104349Sphk        *nextTokPtr = ptr;
175104349Sphk        return XML_TOK_INVALID;
176104349Sphk      }
177104349Sphk      /* fall through */
178104349Sphk    case BT_S: case BT_CR: case BT_LF:
179104349Sphk      *nextTokPtr = ptr;
180104349Sphk      return XML_TOK_DECL_OPEN;
181104349Sphk    case BT_NMSTRT:
182104349Sphk    case BT_HEX:
183104349Sphk      ptr += MINBPC(enc);
184104349Sphk      break;
185104349Sphk    default:
186104349Sphk      *nextTokPtr = ptr;
187104349Sphk      return XML_TOK_INVALID;
188104349Sphk    }
189104349Sphk  }
190104349Sphk  return XML_TOK_PARTIAL;
191104349Sphk}
192104349Sphk
193178848Scokanestatic int PTRCALL
194302385SdelphijPREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
195104349Sphk                      const char *end, int *tokPtr)
196104349Sphk{
197104349Sphk  int upper = 0;
198104349Sphk  *tokPtr = XML_TOK_PI;
199104349Sphk  if (end - ptr != MINBPC(enc)*3)
200104349Sphk    return 1;
201104349Sphk  switch (BYTE_TO_ASCII(enc, ptr)) {
202104349Sphk  case ASCII_x:
203104349Sphk    break;
204104349Sphk  case ASCII_X:
205104349Sphk    upper = 1;
206104349Sphk    break;
207104349Sphk  default:
208104349Sphk    return 1;
209104349Sphk  }
210104349Sphk  ptr += MINBPC(enc);
211104349Sphk  switch (BYTE_TO_ASCII(enc, ptr)) {
212104349Sphk  case ASCII_m:
213104349Sphk    break;
214104349Sphk  case ASCII_M:
215104349Sphk    upper = 1;
216104349Sphk    break;
217104349Sphk  default:
218104349Sphk    return 1;
219104349Sphk  }
220104349Sphk  ptr += MINBPC(enc);
221104349Sphk  switch (BYTE_TO_ASCII(enc, ptr)) {
222104349Sphk  case ASCII_l:
223104349Sphk    break;
224104349Sphk  case ASCII_L:
225104349Sphk    upper = 1;
226104349Sphk    break;
227104349Sphk  default:
228104349Sphk    return 1;
229104349Sphk  }
230104349Sphk  if (upper)
231104349Sphk    return 0;
232104349Sphk  *tokPtr = XML_TOK_XML_DECL;
233104349Sphk  return 1;
234104349Sphk}
235104349Sphk
236104349Sphk/* ptr points to character following "<?" */
237104349Sphk
238178848Scokanestatic int PTRCALL
239104349SphkPREFIX(scanPi)(const ENCODING *enc, const char *ptr,
240104349Sphk               const char *end, const char **nextTokPtr)
241104349Sphk{
242104349Sphk  int tok;
243104349Sphk  const char *target = ptr;
244302385Sdelphij  REQUIRE_CHAR(enc, ptr, end);
245104349Sphk  switch (BYTE_TYPE(enc, ptr)) {
246104349Sphk  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
247104349Sphk  default:
248104349Sphk    *nextTokPtr = ptr;
249104349Sphk    return XML_TOK_INVALID;
250104349Sphk  }
251302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
252104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
253104349Sphk    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
254104349Sphk    case BT_S: case BT_CR: case BT_LF:
255104349Sphk      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
256104349Sphk        *nextTokPtr = ptr;
257104349Sphk        return XML_TOK_INVALID;
258104349Sphk      }
259104349Sphk      ptr += MINBPC(enc);
260302385Sdelphij      while (HAS_CHAR(enc, ptr, end)) {
261104349Sphk        switch (BYTE_TYPE(enc, ptr)) {
262104349Sphk        INVALID_CASES(ptr, nextTokPtr)
263104349Sphk        case BT_QUEST:
264104349Sphk          ptr += MINBPC(enc);
265302385Sdelphij          REQUIRE_CHAR(enc, ptr, end);
266104349Sphk          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
267104349Sphk            *nextTokPtr = ptr + MINBPC(enc);
268104349Sphk            return tok;
269104349Sphk          }
270104349Sphk          break;
271104349Sphk        default:
272104349Sphk          ptr += MINBPC(enc);
273104349Sphk          break;
274104349Sphk        }
275104349Sphk      }
276104349Sphk      return XML_TOK_PARTIAL;
277104349Sphk    case BT_QUEST:
278104349Sphk      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
279104349Sphk        *nextTokPtr = ptr;
280104349Sphk        return XML_TOK_INVALID;
281104349Sphk      }
282104349Sphk      ptr += MINBPC(enc);
283302385Sdelphij      REQUIRE_CHAR(enc, ptr, end);
284104349Sphk      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
285104349Sphk        *nextTokPtr = ptr + MINBPC(enc);
286104349Sphk        return tok;
287104349Sphk      }
288104349Sphk      /* fall through */
289104349Sphk    default:
290104349Sphk      *nextTokPtr = ptr;
291104349Sphk      return XML_TOK_INVALID;
292104349Sphk    }
293104349Sphk  }
294104349Sphk  return XML_TOK_PARTIAL;
295104349Sphk}
296104349Sphk
297178848Scokanestatic int PTRCALL
298302385SdelphijPREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
299104349Sphk                         const char *end, const char **nextTokPtr)
300104349Sphk{
301104349Sphk  static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
302104349Sphk                                     ASCII_T, ASCII_A, ASCII_LSQB };
303104349Sphk  int i;
304104349Sphk  /* CDATA[ */
305302385Sdelphij  REQUIRE_CHARS(enc, ptr, end, 6);
306104349Sphk  for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
307104349Sphk    if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
308104349Sphk      *nextTokPtr = ptr;
309104349Sphk      return XML_TOK_INVALID;
310104349Sphk    }
311104349Sphk  }
312104349Sphk  *nextTokPtr = ptr;
313104349Sphk  return XML_TOK_CDATA_SECT_OPEN;
314104349Sphk}
315104349Sphk
316178848Scokanestatic int PTRCALL
317104349SphkPREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
318104349Sphk                        const char *end, const char **nextTokPtr)
319104349Sphk{
320302385Sdelphij  if (ptr >= end)
321104349Sphk    return XML_TOK_NONE;
322104349Sphk  if (MINBPC(enc) > 1) {
323104349Sphk    size_t n = end - ptr;
324104349Sphk    if (n & (MINBPC(enc) - 1)) {
325104349Sphk      n &= ~(MINBPC(enc) - 1);
326104349Sphk      if (n == 0)
327104349Sphk        return XML_TOK_PARTIAL;
328104349Sphk      end = ptr + n;
329104349Sphk    }
330104349Sphk  }
331104349Sphk  switch (BYTE_TYPE(enc, ptr)) {
332104349Sphk  case BT_RSQB:
333104349Sphk    ptr += MINBPC(enc);
334302385Sdelphij    REQUIRE_CHAR(enc, ptr, end);
335104349Sphk    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
336104349Sphk      break;
337104349Sphk    ptr += MINBPC(enc);
338302385Sdelphij    REQUIRE_CHAR(enc, ptr, end);
339104349Sphk    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
340104349Sphk      ptr -= MINBPC(enc);
341104349Sphk      break;
342104349Sphk    }
343104349Sphk    *nextTokPtr = ptr + MINBPC(enc);
344104349Sphk    return XML_TOK_CDATA_SECT_CLOSE;
345104349Sphk  case BT_CR:
346104349Sphk    ptr += MINBPC(enc);
347302385Sdelphij    REQUIRE_CHAR(enc, ptr, end);
348104349Sphk    if (BYTE_TYPE(enc, ptr) == BT_LF)
349104349Sphk      ptr += MINBPC(enc);
350104349Sphk    *nextTokPtr = ptr;
351104349Sphk    return XML_TOK_DATA_NEWLINE;
352104349Sphk  case BT_LF:
353104349Sphk    *nextTokPtr = ptr + MINBPC(enc);
354104349Sphk    return XML_TOK_DATA_NEWLINE;
355104349Sphk  INVALID_CASES(ptr, nextTokPtr)
356104349Sphk  default:
357104349Sphk    ptr += MINBPC(enc);
358104349Sphk    break;
359104349Sphk  }
360302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
361104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
362104349Sphk#define LEAD_CASE(n) \
363104349Sphk    case BT_LEAD ## n: \
364104349Sphk      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
365104349Sphk        *nextTokPtr = ptr; \
366104349Sphk        return XML_TOK_DATA_CHARS; \
367104349Sphk      } \
368104349Sphk      ptr += n; \
369104349Sphk      break;
370104349Sphk    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
371104349Sphk#undef LEAD_CASE
372104349Sphk    case BT_NONXML:
373104349Sphk    case BT_MALFORM:
374104349Sphk    case BT_TRAIL:
375104349Sphk    case BT_CR:
376104349Sphk    case BT_LF:
377104349Sphk    case BT_RSQB:
378104349Sphk      *nextTokPtr = ptr;
379104349Sphk      return XML_TOK_DATA_CHARS;
380104349Sphk    default:
381104349Sphk      ptr += MINBPC(enc);
382104349Sphk      break;
383104349Sphk    }
384104349Sphk  }
385104349Sphk  *nextTokPtr = ptr;
386104349Sphk  return XML_TOK_DATA_CHARS;
387104349Sphk}
388104349Sphk
389104349Sphk/* ptr points to character following "</" */
390104349Sphk
391178848Scokanestatic int PTRCALL
392104349SphkPREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
393104349Sphk                   const char *end, const char **nextTokPtr)
394104349Sphk{
395302385Sdelphij  REQUIRE_CHAR(enc, ptr, end);
396104349Sphk  switch (BYTE_TYPE(enc, ptr)) {
397104349Sphk  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
398104349Sphk  default:
399104349Sphk    *nextTokPtr = ptr;
400104349Sphk    return XML_TOK_INVALID;
401104349Sphk  }
402302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
403104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
404104349Sphk    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
405104349Sphk    case BT_S: case BT_CR: case BT_LF:
406302385Sdelphij      for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
407104349Sphk        switch (BYTE_TYPE(enc, ptr)) {
408104349Sphk        case BT_S: case BT_CR: case BT_LF:
409104349Sphk          break;
410104349Sphk        case BT_GT:
411104349Sphk          *nextTokPtr = ptr + MINBPC(enc);
412104349Sphk          return XML_TOK_END_TAG;
413104349Sphk        default:
414104349Sphk          *nextTokPtr = ptr;
415104349Sphk          return XML_TOK_INVALID;
416104349Sphk        }
417104349Sphk      }
418104349Sphk      return XML_TOK_PARTIAL;
419104349Sphk#ifdef XML_NS
420104349Sphk    case BT_COLON:
421104349Sphk      /* no need to check qname syntax here,
422104349Sphk         since end-tag must match exactly */
423104349Sphk      ptr += MINBPC(enc);
424104349Sphk      break;
425104349Sphk#endif
426104349Sphk    case BT_GT:
427104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
428104349Sphk      return XML_TOK_END_TAG;
429104349Sphk    default:
430104349Sphk      *nextTokPtr = ptr;
431104349Sphk      return XML_TOK_INVALID;
432104349Sphk    }
433104349Sphk  }
434104349Sphk  return XML_TOK_PARTIAL;
435104349Sphk}
436104349Sphk
437104349Sphk/* ptr points to character following "&#X" */
438104349Sphk
439178848Scokanestatic int PTRCALL
440104349SphkPREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
441104349Sphk                       const char *end, const char **nextTokPtr)
442104349Sphk{
443302385Sdelphij  if (HAS_CHAR(enc, ptr, end)) {
444104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
445104349Sphk    case BT_DIGIT:
446104349Sphk    case BT_HEX:
447104349Sphk      break;
448104349Sphk    default:
449104349Sphk      *nextTokPtr = ptr;
450104349Sphk      return XML_TOK_INVALID;
451104349Sphk    }
452302385Sdelphij    for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
453104349Sphk      switch (BYTE_TYPE(enc, ptr)) {
454104349Sphk      case BT_DIGIT:
455104349Sphk      case BT_HEX:
456104349Sphk        break;
457104349Sphk      case BT_SEMI:
458104349Sphk        *nextTokPtr = ptr + MINBPC(enc);
459104349Sphk        return XML_TOK_CHAR_REF;
460104349Sphk      default:
461104349Sphk        *nextTokPtr = ptr;
462104349Sphk        return XML_TOK_INVALID;
463104349Sphk      }
464104349Sphk    }
465104349Sphk  }
466104349Sphk  return XML_TOK_PARTIAL;
467104349Sphk}
468104349Sphk
469104349Sphk/* ptr points to character following "&#" */
470104349Sphk
471178848Scokanestatic int PTRCALL
472104349SphkPREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
473104349Sphk                    const char *end, const char **nextTokPtr)
474104349Sphk{
475302385Sdelphij  if (HAS_CHAR(enc, ptr, end)) {
476104349Sphk    if (CHAR_MATCHES(enc, ptr, ASCII_x))
477104349Sphk      return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
478104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
479104349Sphk    case BT_DIGIT:
480104349Sphk      break;
481104349Sphk    default:
482104349Sphk      *nextTokPtr = ptr;
483104349Sphk      return XML_TOK_INVALID;
484104349Sphk    }
485302385Sdelphij    for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
486104349Sphk      switch (BYTE_TYPE(enc, ptr)) {
487104349Sphk      case BT_DIGIT:
488104349Sphk        break;
489104349Sphk      case BT_SEMI:
490104349Sphk        *nextTokPtr = ptr + MINBPC(enc);
491104349Sphk        return XML_TOK_CHAR_REF;
492104349Sphk      default:
493104349Sphk        *nextTokPtr = ptr;
494104349Sphk        return XML_TOK_INVALID;
495104349Sphk      }
496104349Sphk    }
497104349Sphk  }
498104349Sphk  return XML_TOK_PARTIAL;
499104349Sphk}
500104349Sphk
501104349Sphk/* ptr points to character following "&" */
502104349Sphk
503178848Scokanestatic int PTRCALL
504104349SphkPREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
505104349Sphk                const char **nextTokPtr)
506104349Sphk{
507302385Sdelphij  REQUIRE_CHAR(enc, ptr, end);
508104349Sphk  switch (BYTE_TYPE(enc, ptr)) {
509104349Sphk  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
510104349Sphk  case BT_NUM:
511104349Sphk    return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
512104349Sphk  default:
513104349Sphk    *nextTokPtr = ptr;
514104349Sphk    return XML_TOK_INVALID;
515104349Sphk  }
516302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
517104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
518104349Sphk    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
519104349Sphk    case BT_SEMI:
520104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
521104349Sphk      return XML_TOK_ENTITY_REF;
522104349Sphk    default:
523104349Sphk      *nextTokPtr = ptr;
524104349Sphk      return XML_TOK_INVALID;
525104349Sphk    }
526104349Sphk  }
527104349Sphk  return XML_TOK_PARTIAL;
528104349Sphk}
529104349Sphk
530104349Sphk/* ptr points to character following first character of attribute name */
531104349Sphk
532178848Scokanestatic int PTRCALL
533104349SphkPREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
534104349Sphk                 const char **nextTokPtr)
535104349Sphk{
536104349Sphk#ifdef XML_NS
537104349Sphk  int hadColon = 0;
538104349Sphk#endif
539302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
540104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
541104349Sphk    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
542104349Sphk#ifdef XML_NS
543104349Sphk    case BT_COLON:
544104349Sphk      if (hadColon) {
545104349Sphk        *nextTokPtr = ptr;
546104349Sphk        return XML_TOK_INVALID;
547104349Sphk      }
548104349Sphk      hadColon = 1;
549104349Sphk      ptr += MINBPC(enc);
550302385Sdelphij      REQUIRE_CHAR(enc, ptr, end);
551104349Sphk      switch (BYTE_TYPE(enc, ptr)) {
552104349Sphk      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
553104349Sphk      default:
554104349Sphk        *nextTokPtr = ptr;
555104349Sphk        return XML_TOK_INVALID;
556104349Sphk      }
557104349Sphk      break;
558104349Sphk#endif
559104349Sphk    case BT_S: case BT_CR: case BT_LF:
560104349Sphk      for (;;) {
561104349Sphk        int t;
562104349Sphk
563104349Sphk        ptr += MINBPC(enc);
564302385Sdelphij        REQUIRE_CHAR(enc, ptr, end);
565104349Sphk        t = BYTE_TYPE(enc, ptr);
566104349Sphk        if (t == BT_EQUALS)
567104349Sphk          break;
568104349Sphk        switch (t) {
569104349Sphk        case BT_S:
570104349Sphk        case BT_LF:
571104349Sphk        case BT_CR:
572104349Sphk          break;
573104349Sphk        default:
574104349Sphk          *nextTokPtr = ptr;
575104349Sphk          return XML_TOK_INVALID;
576104349Sphk        }
577104349Sphk      }
578104349Sphk    /* fall through */
579104349Sphk    case BT_EQUALS:
580104349Sphk      {
581104349Sphk        int open;
582104349Sphk#ifdef XML_NS
583104349Sphk        hadColon = 0;
584104349Sphk#endif
585104349Sphk        for (;;) {
586104349Sphk          ptr += MINBPC(enc);
587302385Sdelphij          REQUIRE_CHAR(enc, ptr, end);
588104349Sphk          open = BYTE_TYPE(enc, ptr);
589104349Sphk          if (open == BT_QUOT || open == BT_APOS)
590104349Sphk            break;
591104349Sphk          switch (open) {
592104349Sphk          case BT_S:
593104349Sphk          case BT_LF:
594104349Sphk          case BT_CR:
595104349Sphk            break;
596104349Sphk          default:
597104349Sphk            *nextTokPtr = ptr;
598104349Sphk            return XML_TOK_INVALID;
599104349Sphk          }
600104349Sphk        }
601104349Sphk        ptr += MINBPC(enc);
602104349Sphk        /* in attribute value */
603104349Sphk        for (;;) {
604104349Sphk          int t;
605302385Sdelphij          REQUIRE_CHAR(enc, ptr, end);
606104349Sphk          t = BYTE_TYPE(enc, ptr);
607104349Sphk          if (t == open)
608104349Sphk            break;
609104349Sphk          switch (t) {
610104349Sphk          INVALID_CASES(ptr, nextTokPtr)
611104349Sphk          case BT_AMP:
612104349Sphk            {
613104349Sphk              int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
614104349Sphk              if (tok <= 0) {
615104349Sphk                if (tok == XML_TOK_INVALID)
616104349Sphk                  *nextTokPtr = ptr;
617104349Sphk                return tok;
618104349Sphk              }
619104349Sphk              break;
620104349Sphk            }
621104349Sphk          case BT_LT:
622104349Sphk            *nextTokPtr = ptr;
623104349Sphk            return XML_TOK_INVALID;
624104349Sphk          default:
625104349Sphk            ptr += MINBPC(enc);
626104349Sphk            break;
627104349Sphk          }
628104349Sphk        }
629104349Sphk        ptr += MINBPC(enc);
630302385Sdelphij        REQUIRE_CHAR(enc, ptr, end);
631104349Sphk        switch (BYTE_TYPE(enc, ptr)) {
632104349Sphk        case BT_S:
633104349Sphk        case BT_CR:
634104349Sphk        case BT_LF:
635104349Sphk          break;
636104349Sphk        case BT_SOL:
637104349Sphk          goto sol;
638104349Sphk        case BT_GT:
639104349Sphk          goto gt;
640104349Sphk        default:
641104349Sphk          *nextTokPtr = ptr;
642104349Sphk          return XML_TOK_INVALID;
643104349Sphk        }
644104349Sphk        /* ptr points to closing quote */
645104349Sphk        for (;;) {
646104349Sphk          ptr += MINBPC(enc);
647302385Sdelphij          REQUIRE_CHAR(enc, ptr, end);
648104349Sphk          switch (BYTE_TYPE(enc, ptr)) {
649104349Sphk          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
650104349Sphk          case BT_S: case BT_CR: case BT_LF:
651104349Sphk            continue;
652104349Sphk          case BT_GT:
653104349Sphk          gt:
654104349Sphk            *nextTokPtr = ptr + MINBPC(enc);
655104349Sphk            return XML_TOK_START_TAG_WITH_ATTS;
656104349Sphk          case BT_SOL:
657104349Sphk          sol:
658104349Sphk            ptr += MINBPC(enc);
659302385Sdelphij            REQUIRE_CHAR(enc, ptr, end);
660104349Sphk            if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
661104349Sphk              *nextTokPtr = ptr;
662104349Sphk              return XML_TOK_INVALID;
663104349Sphk            }
664104349Sphk            *nextTokPtr = ptr + MINBPC(enc);
665104349Sphk            return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
666104349Sphk          default:
667104349Sphk            *nextTokPtr = ptr;
668104349Sphk            return XML_TOK_INVALID;
669104349Sphk          }
670104349Sphk          break;
671104349Sphk        }
672104349Sphk        break;
673104349Sphk      }
674104349Sphk    default:
675104349Sphk      *nextTokPtr = ptr;
676104349Sphk      return XML_TOK_INVALID;
677104349Sphk    }
678104349Sphk  }
679104349Sphk  return XML_TOK_PARTIAL;
680104349Sphk}
681104349Sphk
682104349Sphk/* ptr points to character following "<" */
683104349Sphk
684178848Scokanestatic int PTRCALL
685104349SphkPREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
686104349Sphk               const char **nextTokPtr)
687104349Sphk{
688104349Sphk#ifdef XML_NS
689104349Sphk  int hadColon;
690104349Sphk#endif
691302385Sdelphij  REQUIRE_CHAR(enc, ptr, end);
692104349Sphk  switch (BYTE_TYPE(enc, ptr)) {
693104349Sphk  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
694104349Sphk  case BT_EXCL:
695302385Sdelphij    ptr += MINBPC(enc);
696302385Sdelphij    REQUIRE_CHAR(enc, ptr, end);
697104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
698104349Sphk    case BT_MINUS:
699104349Sphk      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
700104349Sphk    case BT_LSQB:
701104349Sphk      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
702104349Sphk                                      end, nextTokPtr);
703104349Sphk    }
704104349Sphk    *nextTokPtr = ptr;
705104349Sphk    return XML_TOK_INVALID;
706104349Sphk  case BT_QUEST:
707104349Sphk    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
708104349Sphk  case BT_SOL:
709104349Sphk    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
710104349Sphk  default:
711104349Sphk    *nextTokPtr = ptr;
712104349Sphk    return XML_TOK_INVALID;
713104349Sphk  }
714104349Sphk#ifdef XML_NS
715104349Sphk  hadColon = 0;
716104349Sphk#endif
717104349Sphk  /* we have a start-tag */
718302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
719104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
720104349Sphk    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
721104349Sphk#ifdef XML_NS
722104349Sphk    case BT_COLON:
723104349Sphk      if (hadColon) {
724104349Sphk        *nextTokPtr = ptr;
725104349Sphk        return XML_TOK_INVALID;
726104349Sphk      }
727104349Sphk      hadColon = 1;
728104349Sphk      ptr += MINBPC(enc);
729302385Sdelphij      REQUIRE_CHAR(enc, ptr, end);
730104349Sphk      switch (BYTE_TYPE(enc, ptr)) {
731104349Sphk      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
732104349Sphk      default:
733104349Sphk        *nextTokPtr = ptr;
734104349Sphk        return XML_TOK_INVALID;
735104349Sphk      }
736104349Sphk      break;
737104349Sphk#endif
738104349Sphk    case BT_S: case BT_CR: case BT_LF:
739104349Sphk      {
740104349Sphk        ptr += MINBPC(enc);
741302385Sdelphij        while (HAS_CHAR(enc, ptr, end)) {
742104349Sphk          switch (BYTE_TYPE(enc, ptr)) {
743104349Sphk          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
744104349Sphk          case BT_GT:
745104349Sphk            goto gt;
746104349Sphk          case BT_SOL:
747104349Sphk            goto sol;
748104349Sphk          case BT_S: case BT_CR: case BT_LF:
749104349Sphk            ptr += MINBPC(enc);
750104349Sphk            continue;
751104349Sphk          default:
752104349Sphk            *nextTokPtr = ptr;
753104349Sphk            return XML_TOK_INVALID;
754104349Sphk          }
755104349Sphk          return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
756104349Sphk        }
757104349Sphk        return XML_TOK_PARTIAL;
758104349Sphk      }
759104349Sphk    case BT_GT:
760104349Sphk    gt:
761104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
762104349Sphk      return XML_TOK_START_TAG_NO_ATTS;
763104349Sphk    case BT_SOL:
764104349Sphk    sol:
765104349Sphk      ptr += MINBPC(enc);
766302385Sdelphij      REQUIRE_CHAR(enc, ptr, end);
767104349Sphk      if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
768104349Sphk        *nextTokPtr = ptr;
769104349Sphk        return XML_TOK_INVALID;
770104349Sphk      }
771104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
772104349Sphk      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
773104349Sphk    default:
774104349Sphk      *nextTokPtr = ptr;
775104349Sphk      return XML_TOK_INVALID;
776104349Sphk    }
777104349Sphk  }
778104349Sphk  return XML_TOK_PARTIAL;
779104349Sphk}
780104349Sphk
/* Content tokenizer: returns the token that starts at ptr inside element
   content.  Most results also store the position where scanning stopped
   in *nextTokPtr.

   - XML_TOK_NONE is returned when ptr >= end.
   - When MINBPC(enc) > 1, end is first rounded down so that the input
     holds a whole number of minimum-size units; XML_TOK_PARTIAL is
     returned if not even one whole unit is available.
   - '<' and '&' are delegated to scanLt / scanRef.
   - CR, LF and CR LF yield XML_TOK_DATA_NEWLINE; a CR that is the last
     available character yields XML_TOK_TRAILING_CR.
   - "]]>" yields XML_TOK_INVALID; "]" or "]]" at the very end of the
     input yields XML_TOK_TRAILING_RSQB.
   - Anything else starts a run of character data that is returned as
     XML_TOK_DATA_CHARS.
*/
781178848Scokanestatic int PTRCALL
782104349SphkPREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
783104349Sphk                   const char **nextTokPtr)
784104349Sphk{
785302385Sdelphij  if (ptr >= end)
786104349Sphk    return XML_TOK_NONE;
  /* Drop a trailing fragment that is smaller than one minimum-size unit. */
787104349Sphk  if (MINBPC(enc) > 1) {
788104349Sphk    size_t n = end - ptr;
789104349Sphk    if (n & (MINBPC(enc) - 1)) {
790104349Sphk      n &= ~(MINBPC(enc) - 1);
791104349Sphk      if (n == 0)
792104349Sphk        return XML_TOK_PARTIAL;
793104349Sphk      end = ptr + n;
794104349Sphk    }
795104349Sphk  }
  /* The first character decides what kind of token this is. */
796104349Sphk  switch (BYTE_TYPE(enc, ptr)) {
797104349Sphk  case BT_LT:
798104349Sphk    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
799104349Sphk  case BT_AMP:
800104349Sphk    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
801104349Sphk  case BT_CR:
802104349Sphk    ptr += MINBPC(enc);
    /* Cannot yet tell whether an LF follows this CR. */
803302385Sdelphij    if (! HAS_CHAR(enc, ptr, end))
804104349Sphk      return XML_TOK_TRAILING_CR;
    /* CR LF counts as a single newline token. */
805104349Sphk    if (BYTE_TYPE(enc, ptr) == BT_LF)
806104349Sphk      ptr += MINBPC(enc);
807104349Sphk    *nextTokPtr = ptr;
808104349Sphk    return XML_TOK_DATA_NEWLINE;
809104349Sphk  case BT_LF:
810104349Sphk    *nextTokPtr = ptr + MINBPC(enc);
811104349Sphk    return XML_TOK_DATA_NEWLINE;
812104349Sphk  case BT_RSQB:
    /* ptr is at ']': check whether it begins "]]>". */
813104349Sphk    ptr += MINBPC(enc);
814302385Sdelphij    if (! HAS_CHAR(enc, ptr, end))
815104349Sphk      return XML_TOK_TRAILING_RSQB;
    /* A single ']' is ordinary data; continue with the data loop. */
816104349Sphk    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
817104349Sphk      break;
818104349Sphk    ptr += MINBPC(enc);
819302385Sdelphij    if (! HAS_CHAR(enc, ptr, end))
820104349Sphk      return XML_TOK_TRAILING_RSQB;
821104349Sphk    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" not followed by '>': step back so that only the first ']'
         is consumed here and the second one is re-examined by the data
         loop below. */
822104349Sphk      ptr -= MINBPC(enc);
823104349Sphk      break;
824104349Sphk    }
    /* "]]>" found: reported as invalid, with *nextTokPtr at the '>'. */
825104349Sphk    *nextTokPtr = ptr;
826104349Sphk    return XML_TOK_INVALID;
  /* Multi-byte lead bytes are length-checked and validated; non-XML,
     malformed and stray trail bytes are rejected (see INVALID_CASES). */
827104349Sphk  INVALID_CASES(ptr, nextTokPtr)
828104349Sphk  default:
829104349Sphk    ptr += MINBPC(enc);
830104349Sphk    break;
831104349Sphk  }
  /* Extend the run of character data until a character that has to start
     a token of its own, or until the input is exhausted. */
832302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
833104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
    /* A truncated or invalid multi-byte character ends the data run just
       before it; the problem is left for the next call to report. */
834104349Sphk#define LEAD_CASE(n) \
835104349Sphk    case BT_LEAD ## n: \
836104349Sphk      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
837104349Sphk        *nextTokPtr = ptr; \
838104349Sphk        return XML_TOK_DATA_CHARS; \
839104349Sphk      } \
840104349Sphk      ptr += n; \
841104349Sphk      break;
842104349Sphk    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
843104349Sphk#undef LEAD_CASE
844104349Sphk    case BT_RSQB:
      /* Look ahead (without consuming) for "]]>".  HAS_CHARS presumably
         tests that the given number of characters is available; its
         definition is outside this section. */
845302385Sdelphij      if (HAS_CHARS(enc, ptr, end, 2)) {
846104349Sphk         if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
847104349Sphk           ptr += MINBPC(enc);
848104349Sphk           break;
849104349Sphk         }
850302385Sdelphij         if (HAS_CHARS(enc, ptr, end, 3)) {
851104349Sphk           if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
852104349Sphk             ptr += MINBPC(enc);
853104349Sphk             break;
854104349Sphk           }
           /* "]]>" inside character data: invalid, *nextTokPtr at '>'. */
855104349Sphk           *nextTokPtr = ptr + 2*MINBPC(enc);
856104349Sphk           return XML_TOK_INVALID;
857104349Sphk         }
858104349Sphk      }
      /* Not enough input to decide: end the data run before the ']'. */
859104349Sphk      /* fall through */
860104349Sphk    case BT_AMP:
861104349Sphk    case BT_LT:
862104349Sphk    case BT_NONXML:
863104349Sphk    case BT_MALFORM:
864104349Sphk    case BT_TRAIL:
865104349Sphk    case BT_CR:
866104349Sphk    case BT_LF:
      /* These characters are never part of a data run. */
867104349Sphk      *nextTokPtr = ptr;
868104349Sphk      return XML_TOK_DATA_CHARS;
869104349Sphk    default:
870104349Sphk      ptr += MINBPC(enc);
871104349Sphk      break;
872104349Sphk    }
873104349Sphk  }
  /* Input exhausted: everything scanned so far is character data. */
874104349Sphk  *nextTokPtr = ptr;
875104349Sphk  return XML_TOK_DATA_CHARS;
876104349Sphk}
877104349Sphk
878104349Sphk/* ptr points to character following "%" */
879104349Sphk
/* Scans what follows a '%'.

   - If the next character is whitespace (BT_S, BT_LF, BT_CR) or another
     '%', the '%' stands on its own: XML_TOK_PERCENT, *nextTokPtr = ptr.
   - If it is accepted by CHECK_NMSTRT_CASES (a macro defined outside
     this section; presumably it accepts name-start characters), the
     characters accepted by CHECK_NAME_CASES are consumed up to a ';',
     giving XML_TOK_PARAM_ENTITY_REF with *nextTokPtr just past the ';'.
   - Any other character gives XML_TOK_INVALID with *nextTokPtr at it.
   - XML_TOK_PARTIAL is returned when the input ends before the ';'.

   NOTE(review): REQUIRE_CHAR is defined outside this section; it
   presumably returns early when no complete character is available.
*/
880178848Scokanestatic int PTRCALL
881104349SphkPREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
882104349Sphk                    const char **nextTokPtr)
883104349Sphk{
884302385Sdelphij  REQUIRE_CHAR(enc, ptr, end);
  /* First character after the '%'. */
885104349Sphk  switch (BYTE_TYPE(enc, ptr)) {
886104349Sphk  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
887104349Sphk  case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
888104349Sphk    *nextTokPtr = ptr;
889104349Sphk    return XML_TOK_PERCENT;
890104349Sphk  default:
891104349Sphk    *nextTokPtr = ptr;
892104349Sphk    return XML_TOK_INVALID;
893104349Sphk  }
  /* Remaining characters of the name, terminated by ';'. */
894302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
895104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
896104349Sphk    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
897104349Sphk    case BT_SEMI:
898104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
899104349Sphk      return XML_TOK_PARAM_ENTITY_REF;
900104349Sphk    default:
901104349Sphk      *nextTokPtr = ptr;
902104349Sphk      return XML_TOK_INVALID;
903104349Sphk    }
904104349Sphk  }
905104349Sphk  return XML_TOK_PARTIAL;
906104349Sphk}
907104349Sphk
/* Scans the name that follows a '#' (prologTok calls this with ptr just
   past a BT_NUM character).

   - The first character must be accepted by CHECK_NMSTRT_CASES (defined
     outside this section); anything else gives XML_TOK_INVALID.
   - Characters accepted by CHECK_NAME_CASES are then consumed.  The name
     ends at whitespace, ')', '>', '%' or '|', giving XML_TOK_POUND_NAME
     with *nextTokPtr at that delimiter (the delimiter is not consumed).
   - Any other character gives XML_TOK_INVALID.
   - If the input ends while still inside the name, the negated value
     -XML_TOK_POUND_NAME is returned and *nextTokPtr is not written here.
     NOTE(review): the negative code presumably tells the caller that the
     token may continue in later input -- confirm against the callers.
*/
908178848Scokanestatic int PTRCALL
909104349SphkPREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
910104349Sphk                      const char **nextTokPtr)
911104349Sphk{
912302385Sdelphij  REQUIRE_CHAR(enc, ptr, end);
  /* First character of the name. */
913104349Sphk  switch (BYTE_TYPE(enc, ptr)) {
914104349Sphk  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
915104349Sphk  default:
916104349Sphk    *nextTokPtr = ptr;
917104349Sphk    return XML_TOK_INVALID;
918104349Sphk  }
  /* Rest of the name, up to a delimiter. */
919302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
920104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
921104349Sphk    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
922104349Sphk    case BT_CR: case BT_LF: case BT_S:
923104349Sphk    case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
924104349Sphk      *nextTokPtr = ptr;
925104349Sphk      return XML_TOK_POUND_NAME;
926104349Sphk    default:
927104349Sphk      *nextTokPtr = ptr;
928104349Sphk      return XML_TOK_INVALID;
929104349Sphk    }
930104349Sphk  }
931104349Sphk  return -XML_TOK_POUND_NAME;
932104349Sphk}
933104349Sphk
934178848Scokanestatic int PTRCALL
935104349SphkPREFIX(scanLit)(int open, const ENCODING *enc,
936104349Sphk                const char *ptr, const char *end,
937104349Sphk                const char **nextTokPtr)
938104349Sphk{
939302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
940104349Sphk    int t = BYTE_TYPE(enc, ptr);
941104349Sphk    switch (t) {
942104349Sphk    INVALID_CASES(ptr, nextTokPtr)
943104349Sphk    case BT_QUOT:
944104349Sphk    case BT_APOS:
945104349Sphk      ptr += MINBPC(enc);
946104349Sphk      if (t != open)
947104349Sphk        break;
948302385Sdelphij      if (! HAS_CHAR(enc, ptr, end))
949104349Sphk        return -XML_TOK_LITERAL;
950104349Sphk      *nextTokPtr = ptr;
951104349Sphk      switch (BYTE_TYPE(enc, ptr)) {
952104349Sphk      case BT_S: case BT_CR: case BT_LF:
953104349Sphk      case BT_GT: case BT_PERCNT: case BT_LSQB:
954104349Sphk        return XML_TOK_LITERAL;
955104349Sphk      default:
956104349Sphk        return XML_TOK_INVALID;
957104349Sphk      }
958104349Sphk    default:
959104349Sphk      ptr += MINBPC(enc);
960104349Sphk      break;
961104349Sphk    }
962104349Sphk  }
963104349Sphk  return XML_TOK_PARTIAL;
964104349Sphk}
965104349Sphk
/* Prolog tokenizer: returns the token that starts at ptr.

   XML_TOK_NONE is returned when ptr >= end.  When MINBPC(enc) > 1, end
   is first rounded down to a whole number of minimum-size units
   (XML_TOK_PARTIAL if not even one unit is available).  The first
   character then selects the token:

   - quotes, "<!", "<?", '%' and '#' are delegated to scanLit, scanDecl,
     scanPi, scanPercent and scanPoundName respectively;
   - '<' followed by a name-start byte type gives XML_TOK_INSTANCE_START
     with *nextTokPtr pointing back at the '<';
   - whitespace runs give XML_TOK_PROLOG_S;
   - punctuation (',', '[', ']', "]]>", '(', ')', ")*", ")?", ")+", '|',
     '>') gives the matching single token;
   - name / name-token characters start a name that is extended by the
     loop at the end and returned as XML_TOK_NAME, XML_TOK_NMTOKEN,
     XML_TOK_PREFIXED_NAME or one of the XML_TOK_NAME_* suffix forms.

   Several branches return a negated token code when the input ends
   where the token could still continue (for example -tok at the end).
*/
966178848Scokanestatic int PTRCALL
967104349SphkPREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
968104349Sphk                  const char **nextTokPtr)
969104349Sphk{
970104349Sphk  int tok;
971302385Sdelphij  if (ptr >= end)
972104349Sphk    return XML_TOK_NONE;
  /* Drop a trailing fragment that is smaller than one minimum-size unit. */
973104349Sphk  if (MINBPC(enc) > 1) {
974104349Sphk    size_t n = end - ptr;
975104349Sphk    if (n & (MINBPC(enc) - 1)) {
976104349Sphk      n &= ~(MINBPC(enc) - 1);
977104349Sphk      if (n == 0)
978104349Sphk        return XML_TOK_PARTIAL;
979104349Sphk      end = ptr + n;
980104349Sphk    }
981104349Sphk  }
982104349Sphk  switch (BYTE_TYPE(enc, ptr)) {
983104349Sphk  case BT_QUOT:
984104349Sphk    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
985104349Sphk  case BT_APOS:
986104349Sphk    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
987104349Sphk  case BT_LT:
988104349Sphk    {
      /* Decide on the character after '<'. */
989104349Sphk      ptr += MINBPC(enc);
990302385Sdelphij      REQUIRE_CHAR(enc, ptr, end);
991104349Sphk      switch (BYTE_TYPE(enc, ptr)) {
992104349Sphk      case BT_EXCL:
993104349Sphk        return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
994104349Sphk      case BT_QUEST:
995104349Sphk        return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
996104349Sphk      case BT_NMSTRT:
997104349Sphk      case BT_HEX:
998104349Sphk      case BT_NONASCII:
999104349Sphk      case BT_LEAD2:
1000104349Sphk      case BT_LEAD3:
1001104349Sphk      case BT_LEAD4:
        /* Looks like a start-tag: nothing is consumed, *nextTokPtr is
           set back to the '<'. */
1002104349Sphk        *nextTokPtr = ptr - MINBPC(enc);
1003104349Sphk        return XML_TOK_INSTANCE_START;
1004104349Sphk      }
1005104349Sphk      *nextTokPtr = ptr;
1006104349Sphk      return XML_TOK_INVALID;
1007104349Sphk    }
1008104349Sphk  case BT_CR:
1009104349Sphk    if (ptr + MINBPC(enc) == end) {
1010104349Sphk      *nextTokPtr = end;
1011104349Sphk      /* indicate that this might be part of a CR/LF pair */
1012104349Sphk      return -XML_TOK_PROLOG_S;
1013104349Sphk    }
1014104349Sphk    /* fall through */
1015104349Sphk  case BT_S: case BT_LF:
    /* Consume the whole run of whitespace. */
1016104349Sphk    for (;;) {
1017104349Sphk      ptr += MINBPC(enc);
1018302385Sdelphij      if (! HAS_CHAR(enc, ptr, end))
1019104349Sphk        break;
1020104349Sphk      switch (BYTE_TYPE(enc, ptr)) {
1021104349Sphk      case BT_S: case BT_LF:
1022104349Sphk        break;
1023104349Sphk      case BT_CR:
1024104349Sphk        /* don't split CR/LF pair */
1025104349Sphk        if (ptr + MINBPC(enc) != end)
1026104349Sphk          break;
1027104349Sphk        /* fall through */
1028104349Sphk      default:
        /* Non-whitespace, or a CR that is the last available character:
           the whitespace token ends here. */
1029104349Sphk        *nextTokPtr = ptr;
1030104349Sphk        return XML_TOK_PROLOG_S;
1031104349Sphk      }
1032104349Sphk    }
1033104349Sphk    *nextTokPtr = ptr;
1034104349Sphk    return XML_TOK_PROLOG_S;
1035104349Sphk  case BT_PERCNT:
1036104349Sphk    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1037104349Sphk  case BT_COMMA:
1038104349Sphk    *nextTokPtr = ptr + MINBPC(enc);
1039104349Sphk    return XML_TOK_COMMA;
1040104349Sphk  case BT_LSQB:
1041104349Sphk    *nextTokPtr = ptr + MINBPC(enc);
1042104349Sphk    return XML_TOK_OPEN_BRACKET;
1043104349Sphk  case BT_RSQB:
    /* Either a lone ']' or the start of "]]>". */
1044104349Sphk    ptr += MINBPC(enc);
1045302385Sdelphij    if (! HAS_CHAR(enc, ptr, end))
1046104349Sphk      return -XML_TOK_CLOSE_BRACKET;
1047104349Sphk    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
      /* NOTE(review): REQUIRE_CHARS is defined outside this section; it
         presumably returns early unless two more characters are
         available. */
1048302385Sdelphij      REQUIRE_CHARS(enc, ptr, end, 2);
1049104349Sphk      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1050104349Sphk        *nextTokPtr = ptr + 2*MINBPC(enc);
1051104349Sphk        return XML_TOK_COND_SECT_CLOSE;
1052104349Sphk      }
1053104349Sphk    }
    /* Just the single ']'; *nextTokPtr is right after it. */
1054104349Sphk    *nextTokPtr = ptr;
1055104349Sphk    return XML_TOK_CLOSE_BRACKET;
1056104349Sphk  case BT_LPAR:
1057104349Sphk    *nextTokPtr = ptr + MINBPC(enc);
1058104349Sphk    return XML_TOK_OPEN_PAREN;
1059104349Sphk  case BT_RPAR:
    /* ')' may be combined with a following '*', '?' or '+'. */
1060104349Sphk    ptr += MINBPC(enc);
1061302385Sdelphij    if (! HAS_CHAR(enc, ptr, end))
1062104349Sphk      return -XML_TOK_CLOSE_PAREN;
1063104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
1064104349Sphk    case BT_AST:
1065104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
1066104349Sphk      return XML_TOK_CLOSE_PAREN_ASTERISK;
1067104349Sphk    case BT_QUEST:
1068104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
1069104349Sphk      return XML_TOK_CLOSE_PAREN_QUESTION;
1070104349Sphk    case BT_PLUS:
1071104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
1072104349Sphk      return XML_TOK_CLOSE_PAREN_PLUS;
1073104349Sphk    case BT_CR: case BT_LF: case BT_S:
1074104349Sphk    case BT_GT: case BT_COMMA: case BT_VERBAR:
1075104349Sphk    case BT_RPAR:
      /* Plain ')': the following delimiter is not consumed. */
1076104349Sphk      *nextTokPtr = ptr;
1077104349Sphk      return XML_TOK_CLOSE_PAREN;
1078104349Sphk    }
1079104349Sphk    *nextTokPtr = ptr;
1080104349Sphk    return XML_TOK_INVALID;
1081104349Sphk  case BT_VERBAR:
1082104349Sphk    *nextTokPtr = ptr + MINBPC(enc);
1083104349Sphk    return XML_TOK_OR;
1084104349Sphk  case BT_GT:
1085104349Sphk    *nextTokPtr = ptr + MINBPC(enc);
1086104349Sphk    return XML_TOK_DECL_CLOSE;
1087104349Sphk  case BT_NUM:
1088104349Sphk    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  /* Multi-byte first character: a name-start character begins a NAME, any
     other name character begins an NMTOKEN, anything else is invalid. */
1089104349Sphk#define LEAD_CASE(n) \
1090104349Sphk  case BT_LEAD ## n: \
1091104349Sphk    if (end - ptr < n) \
1092104349Sphk      return XML_TOK_PARTIAL_CHAR; \
1093104349Sphk    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1094104349Sphk      ptr += n; \
1095104349Sphk      tok = XML_TOK_NAME; \
1096104349Sphk      break; \
1097104349Sphk    } \
1098104349Sphk    if (IS_NAME_CHAR(enc, ptr, n)) { \
1099104349Sphk      ptr += n; \
1100104349Sphk      tok = XML_TOK_NMTOKEN; \
1101104349Sphk      break; \
1102104349Sphk    } \
1103104349Sphk    *nextTokPtr = ptr; \
1104104349Sphk    return XML_TOK_INVALID;
1105104349Sphk    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1106104349Sphk#undef LEAD_CASE
1107104349Sphk  case BT_NMSTRT:
1108104349Sphk  case BT_HEX:
1109104349Sphk    tok = XML_TOK_NAME;
1110104349Sphk    ptr += MINBPC(enc);
1111104349Sphk    break;
1112104349Sphk  case BT_DIGIT:
1113104349Sphk  case BT_NAME:
1114104349Sphk  case BT_MINUS:
1115104349Sphk#ifdef XML_NS
1116104349Sphk  case BT_COLON:
1117104349Sphk#endif
    /* Name characters that may not start a name: this is an NMTOKEN. */
1118104349Sphk    tok = XML_TOK_NMTOKEN;
1119104349Sphk    ptr += MINBPC(enc);
1120104349Sphk    break;
1121104349Sphk  case BT_NONASCII:
1122104349Sphk    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1123104349Sphk      ptr += MINBPC(enc);
1124104349Sphk      tok = XML_TOK_NAME;
1125104349Sphk      break;
1126104349Sphk    }
1127104349Sphk    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1128104349Sphk      ptr += MINBPC(enc);
1129104349Sphk      tok = XML_TOK_NMTOKEN;
1130104349Sphk      break;
1131104349Sphk    }
1132104349Sphk    /* fall through */
1133104349Sphk  default:
1134104349Sphk    *nextTokPtr = ptr;
1135104349Sphk    return XML_TOK_INVALID;
1136104349Sphk  }
  /* Only the name / name-token branches reach this point.  Extend the
     token until a delimiter; tok holds the kind decided so far. */
1137302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
1138104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
1139104349Sphk    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1140104349Sphk    case BT_GT: case BT_RPAR: case BT_COMMA:
1141104349Sphk    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1142104349Sphk    case BT_S: case BT_CR: case BT_LF:
      /* Delimiter: the token ends before it. */
1143104349Sphk      *nextTokPtr = ptr;
1144104349Sphk      return tok;
1145104349Sphk#ifdef XML_NS
1146104349Sphk    case BT_COLON:
      /* The first ':' in a NAME turns it into a PREFIXED_NAME, provided
         the next character is accepted by CHECK_NAME_CASES; otherwise,
         and for a second ':', the token is downgraded to NMTOKEN.  An
         NMTOKEN is left unchanged. */
1147104349Sphk      ptr += MINBPC(enc);
1148104349Sphk      switch (tok) {
1149104349Sphk      case XML_TOK_NAME:
1150302385Sdelphij        REQUIRE_CHAR(enc, ptr, end);
1151104349Sphk        tok = XML_TOK_PREFIXED_NAME;
1152104349Sphk        switch (BYTE_TYPE(enc, ptr)) {
1153104349Sphk        CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1154104349Sphk        default:
1155104349Sphk          tok = XML_TOK_NMTOKEN;
1156104349Sphk          break;
1157104349Sphk        }
1158104349Sphk        break;
1159104349Sphk      case XML_TOK_PREFIXED_NAME:
1160104349Sphk        tok = XML_TOK_NMTOKEN;
1161104349Sphk        break;
1162104349Sphk      }
1163104349Sphk      break;
1164104349Sphk#endif
    /* '+', '*' and '?' directly after a name are folded into the token;
       they are rejected after an NMTOKEN. */
1165104349Sphk    case BT_PLUS:
1166104349Sphk      if (tok == XML_TOK_NMTOKEN)  {
1167104349Sphk        *nextTokPtr = ptr;
1168104349Sphk        return XML_TOK_INVALID;
1169104349Sphk      }
1170104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
1171104349Sphk      return XML_TOK_NAME_PLUS;
1172104349Sphk    case BT_AST:
1173104349Sphk      if (tok == XML_TOK_NMTOKEN)  {
1174104349Sphk        *nextTokPtr = ptr;
1175104349Sphk        return XML_TOK_INVALID;
1176104349Sphk      }
1177104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
1178104349Sphk      return XML_TOK_NAME_ASTERISK;
1179104349Sphk    case BT_QUEST:
1180104349Sphk      if (tok == XML_TOK_NMTOKEN)  {
1181104349Sphk        *nextTokPtr = ptr;
1182104349Sphk        return XML_TOK_INVALID;
1183104349Sphk      }
1184104349Sphk      *nextTokPtr = ptr + MINBPC(enc);
1185104349Sphk      return XML_TOK_NAME_QUESTION;
1186104349Sphk    default:
1187104349Sphk      *nextTokPtr = ptr;
1188104349Sphk      return XML_TOK_INVALID;
1189104349Sphk    }
1190104349Sphk  }
  /* Input ended inside the name: return the negated token kind. */
1191104349Sphk  return -tok;
1192104349Sphk}
1193104349Sphk
1194178848Scokanestatic int PTRCALL
1195104349SphkPREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1196104349Sphk                          const char *end, const char **nextTokPtr)
1197104349Sphk{
1198104349Sphk  const char *start;
1199302385Sdelphij  if (ptr >= end)
1200104349Sphk    return XML_TOK_NONE;
1201302385Sdelphij  else if (! HAS_CHAR(enc, ptr, end))
1202302385Sdelphij    return XML_TOK_PARTIAL;
1203104349Sphk  start = ptr;
1204302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
1205104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
1206104349Sphk#define LEAD_CASE(n) \
1207104349Sphk    case BT_LEAD ## n: ptr += n; break;
1208104349Sphk    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1209104349Sphk#undef LEAD_CASE
1210104349Sphk    case BT_AMP:
1211104349Sphk      if (ptr == start)
1212104349Sphk        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1213104349Sphk      *nextTokPtr = ptr;
1214104349Sphk      return XML_TOK_DATA_CHARS;
1215104349Sphk    case BT_LT:
1216104349Sphk      /* this is for inside entity references */
1217104349Sphk      *nextTokPtr = ptr;
1218104349Sphk      return XML_TOK_INVALID;
1219104349Sphk    case BT_LF:
1220104349Sphk      if (ptr == start) {
1221104349Sphk        *nextTokPtr = ptr + MINBPC(enc);
1222104349Sphk        return XML_TOK_DATA_NEWLINE;
1223104349Sphk      }
1224104349Sphk      *nextTokPtr = ptr;
1225104349Sphk      return XML_TOK_DATA_CHARS;
1226104349Sphk    case BT_CR:
1227104349Sphk      if (ptr == start) {
1228104349Sphk        ptr += MINBPC(enc);
1229302385Sdelphij        if (! HAS_CHAR(enc, ptr, end))
1230104349Sphk          return XML_TOK_TRAILING_CR;
1231104349Sphk        if (BYTE_TYPE(enc, ptr) == BT_LF)
1232104349Sphk          ptr += MINBPC(enc);
1233104349Sphk        *nextTokPtr = ptr;
1234104349Sphk        return XML_TOK_DATA_NEWLINE;
1235104349Sphk      }
1236104349Sphk      *nextTokPtr = ptr;
1237104349Sphk      return XML_TOK_DATA_CHARS;
1238104349Sphk    case BT_S:
1239104349Sphk      if (ptr == start) {
1240104349Sphk        *nextTokPtr = ptr + MINBPC(enc);
1241104349Sphk        return XML_TOK_ATTRIBUTE_VALUE_S;
1242104349Sphk      }
1243104349Sphk      *nextTokPtr = ptr;
1244104349Sphk      return XML_TOK_DATA_CHARS;
1245104349Sphk    default:
1246104349Sphk      ptr += MINBPC(enc);
1247104349Sphk      break;
1248104349Sphk    }
1249104349Sphk  }
1250104349Sphk  *nextTokPtr = ptr;
1251104349Sphk  return XML_TOK_DATA_CHARS;
1252104349Sphk}
1253104349Sphk
1254178848Scokanestatic int PTRCALL
1255104349SphkPREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1256104349Sphk                       const char *end, const char **nextTokPtr)
1257104349Sphk{
1258104349Sphk  const char *start;
1259302385Sdelphij  if (ptr >= end)
1260104349Sphk    return XML_TOK_NONE;
1261302385Sdelphij  else if (! HAS_CHAR(enc, ptr, end))
1262302385Sdelphij    return XML_TOK_PARTIAL;
1263104349Sphk  start = ptr;
1264302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
1265104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
1266104349Sphk#define LEAD_CASE(n) \
1267104349Sphk    case BT_LEAD ## n: ptr += n; break;
1268104349Sphk    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1269104349Sphk#undef LEAD_CASE
1270104349Sphk    case BT_AMP:
1271104349Sphk      if (ptr == start)
1272104349Sphk        return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1273104349Sphk      *nextTokPtr = ptr;
1274104349Sphk      return XML_TOK_DATA_CHARS;
1275104349Sphk    case BT_PERCNT:
1276104349Sphk      if (ptr == start) {
1277104349Sphk        int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1278104349Sphk                                       end, nextTokPtr);
1279104349Sphk        return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1280104349Sphk      }
1281104349Sphk      *nextTokPtr = ptr;
1282104349Sphk      return XML_TOK_DATA_CHARS;
1283104349Sphk    case BT_LF:
1284104349Sphk      if (ptr == start) {
1285104349Sphk        *nextTokPtr = ptr + MINBPC(enc);
1286104349Sphk        return XML_TOK_DATA_NEWLINE;
1287104349Sphk      }
1288104349Sphk      *nextTokPtr = ptr;
1289104349Sphk      return XML_TOK_DATA_CHARS;
1290104349Sphk    case BT_CR:
1291104349Sphk      if (ptr == start) {
1292104349Sphk        ptr += MINBPC(enc);
1293302385Sdelphij        if (! HAS_CHAR(enc, ptr, end))
1294104349Sphk          return XML_TOK_TRAILING_CR;
1295104349Sphk        if (BYTE_TYPE(enc, ptr) == BT_LF)
1296104349Sphk          ptr += MINBPC(enc);
1297104349Sphk        *nextTokPtr = ptr;
1298104349Sphk        return XML_TOK_DATA_NEWLINE;
1299104349Sphk      }
1300104349Sphk      *nextTokPtr = ptr;
1301104349Sphk      return XML_TOK_DATA_CHARS;
1302104349Sphk    default:
1303104349Sphk      ptr += MINBPC(enc);
1304104349Sphk      break;
1305104349Sphk    }
1306104349Sphk  }
1307104349Sphk  *nextTokPtr = ptr;
1308104349Sphk  return XML_TOK_DATA_CHARS;
1309104349Sphk}
1310104349Sphk
1311104349Sphk#ifdef XML_DTD
1312104349Sphk
1313178848Scokanestatic int PTRCALL
1314104349SphkPREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1315104349Sphk                         const char *end, const char **nextTokPtr)
1316104349Sphk{
1317104349Sphk  int level = 0;
1318104349Sphk  if (MINBPC(enc) > 1) {
1319104349Sphk    size_t n = end - ptr;
1320104349Sphk    if (n & (MINBPC(enc) - 1)) {
1321104349Sphk      n &= ~(MINBPC(enc) - 1);
1322104349Sphk      end = ptr + n;
1323104349Sphk    }
1324104349Sphk  }
1325302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
1326104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
1327104349Sphk    INVALID_CASES(ptr, nextTokPtr)
1328104349Sphk    case BT_LT:
1329302385Sdelphij      ptr += MINBPC(enc);
1330302385Sdelphij      REQUIRE_CHAR(enc, ptr, end);
1331104349Sphk      if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1332302385Sdelphij        ptr += MINBPC(enc);
1333302385Sdelphij        REQUIRE_CHAR(enc, ptr, end);
1334104349Sphk        if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1335104349Sphk          ++level;
1336104349Sphk          ptr += MINBPC(enc);
1337104349Sphk        }
1338104349Sphk      }
1339104349Sphk      break;
1340104349Sphk    case BT_RSQB:
1341302385Sdelphij      ptr += MINBPC(enc);
1342302385Sdelphij      REQUIRE_CHAR(enc, ptr, end);
1343104349Sphk      if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1344302385Sdelphij        ptr += MINBPC(enc);
1345302385Sdelphij        REQUIRE_CHAR(enc, ptr, end);
1346104349Sphk        if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1347104349Sphk          ptr += MINBPC(enc);
1348104349Sphk          if (level == 0) {
1349104349Sphk            *nextTokPtr = ptr;
1350104349Sphk            return XML_TOK_IGNORE_SECT;
1351104349Sphk          }
1352104349Sphk          --level;
1353104349Sphk        }
1354104349Sphk      }
1355104349Sphk      break;
1356104349Sphk    default:
1357104349Sphk      ptr += MINBPC(enc);
1358104349Sphk      break;
1359104349Sphk    }
1360104349Sphk  }
1361104349Sphk  return XML_TOK_PARTIAL;
1362104349Sphk}
1363104349Sphk
1364104349Sphk#endif /* XML_DTD */
1365104349Sphk
1366178848Scokanestatic int PTRCALL
1367104349SphkPREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1368104349Sphk                   const char **badPtr)
1369104349Sphk{
1370104349Sphk  ptr += MINBPC(enc);
1371104349Sphk  end -= MINBPC(enc);
1372302385Sdelphij  for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1373104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
1374104349Sphk    case BT_DIGIT:
1375104349Sphk    case BT_HEX:
1376104349Sphk    case BT_MINUS:
1377104349Sphk    case BT_APOS:
1378104349Sphk    case BT_LPAR:
1379104349Sphk    case BT_RPAR:
1380104349Sphk    case BT_PLUS:
1381104349Sphk    case BT_COMMA:
1382104349Sphk    case BT_SOL:
1383104349Sphk    case BT_EQUALS:
1384104349Sphk    case BT_QUEST:
1385104349Sphk    case BT_CR:
1386104349Sphk    case BT_LF:
1387104349Sphk    case BT_SEMI:
1388104349Sphk    case BT_EXCL:
1389104349Sphk    case BT_AST:
1390104349Sphk    case BT_PERCNT:
1391104349Sphk    case BT_NUM:
1392104349Sphk#ifdef XML_NS
1393104349Sphk    case BT_COLON:
1394104349Sphk#endif
1395104349Sphk      break;
1396104349Sphk    case BT_S:
1397104349Sphk      if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1398104349Sphk        *badPtr = ptr;
1399104349Sphk        return 0;
1400104349Sphk      }
1401104349Sphk      break;
1402104349Sphk    case BT_NAME:
1403104349Sphk    case BT_NMSTRT:
1404104349Sphk      if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1405104349Sphk        break;
1406104349Sphk    default:
1407104349Sphk      switch (BYTE_TO_ASCII(enc, ptr)) {
1408104349Sphk      case 0x24: /* $ */
1409104349Sphk      case 0x40: /* @ */
1410104349Sphk        break;
1411104349Sphk      default:
1412104349Sphk        *badPtr = ptr;
1413104349Sphk        return 0;
1414104349Sphk      }
1415104349Sphk      break;
1416104349Sphk    }
1417104349Sphk  }
1418104349Sphk  return 1;
1419104349Sphk}
1420104349Sphk
1421104349Sphk/* This must only be called for a well-formed start-tag or empty
1422104349Sphk   element tag.  Returns the number of attributes.  Pointers to the
1423104349Sphk   first attsMax attributes are stored in atts.
1424104349Sphk*/
1425104349Sphk
/* Implementation: a single pass over the tag with three states.
     inName  - inside the element name or an attribute name
     other   - between names and values
     inValue - inside a quoted attribute value; `open` is the byte type
               of the quote that started it
   For each attribute, name / valuePtr / valueEnd / normalized are only
   written while nAtts < attsMax, but nAtts keeps counting beyond that.
   The loop has no end test of its own: it returns at the first '>' or
   '/' met outside a value.
*/
1426178848Scokanestatic int PTRCALL
1427104349SphkPREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1428104349Sphk                int attsMax, ATTRIBUTE *atts)
1429104349Sphk{
  /* Scanning begins inside the element name, hence inName. */
1430104349Sphk  enum { other, inName, inValue } state = inName;
1431104349Sphk  int nAtts = 0;
1432104349Sphk  int open = 0; /* defined when state == inValue;
1433104349Sphk                   initialization just to shut up compilers */
1434104349Sphk
  /* The initial increment skips the first character of the tag
     (presumably the '<'). */
1435104349Sphk  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1436104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
    /* START_NAME: a name character seen in state `other` begins a new
       attribute name; the attribute is assumed normalized until shown
       otherwise. */
1437104349Sphk#define START_NAME \
1438104349Sphk      if (state == other) { \
1439104349Sphk        if (nAtts < attsMax) { \
1440104349Sphk          atts[nAtts].name = ptr; \
1441104349Sphk          atts[nAtts].normalized = 1; \
1442104349Sphk        } \
1443104349Sphk        state = inName; \
1444104349Sphk      }
    /* Multi-byte character: advance by n - MINBPC here; the loop
       increment supplies the remaining MINBPC. */
1445104349Sphk#define LEAD_CASE(n) \
1446104349Sphk    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1447104349Sphk    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1448104349Sphk#undef LEAD_CASE
1449104349Sphk    case BT_NONASCII:
1450104349Sphk    case BT_NMSTRT:
1451104349Sphk    case BT_HEX:
1452104349Sphk      START_NAME
1453104349Sphk      break;
1454104349Sphk#undef START_NAME
1455104349Sphk    case BT_QUOT:
      /* A '"' outside a value opens one; inside a value it closes it only
         if the value was opened with '"'. */
1456104349Sphk      if (state != inValue) {
1457104349Sphk        if (nAtts < attsMax)
1458104349Sphk          atts[nAtts].valuePtr = ptr + MINBPC(enc);
1459104349Sphk        state = inValue;
1460104349Sphk        open = BT_QUOT;
1461104349Sphk      }
1462104349Sphk      else if (open == BT_QUOT) {
1463104349Sphk        state = other;
1464104349Sphk        if (nAtts < attsMax)
1465104349Sphk          atts[nAtts].valueEnd = ptr;
1466104349Sphk        nAtts++;
1467104349Sphk      }
1468104349Sphk      break;
1469104349Sphk    case BT_APOS:
      /* Same as above, for values delimited by "'". */
1470104349Sphk      if (state != inValue) {
1471104349Sphk        if (nAtts < attsMax)
1472104349Sphk          atts[nAtts].valuePtr = ptr + MINBPC(enc);
1473104349Sphk        state = inValue;
1474104349Sphk        open = BT_APOS;
1475104349Sphk      }
1476104349Sphk      else if (open == BT_APOS) {
1477104349Sphk        state = other;
1478104349Sphk        if (nAtts < attsMax)
1479104349Sphk          atts[nAtts].valueEnd = ptr;
1480104349Sphk        nAtts++;
1481104349Sphk      }
1482104349Sphk      break;
1483104349Sphk    case BT_AMP:
      /* An '&' clears the normalized flag of the current attribute. */
1484104349Sphk      if (nAtts < attsMax)
1485104349Sphk        atts[nAtts].normalized = 0;
1486104349Sphk      break;
1487104349Sphk    case BT_S:
      /* Whitespace ends a name.  Inside a value it clears the normalized
         flag when it is the first character of the value, is not a plain
         space, is followed by another space, or is followed by the
         closing quote. */
1488104349Sphk      if (state == inName)
1489104349Sphk        state = other;
1490104349Sphk      else if (state == inValue
1491104349Sphk               && nAtts < attsMax
1492104349Sphk               && atts[nAtts].normalized
1493104349Sphk               && (ptr == atts[nAtts].valuePtr
1494104349Sphk                   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1495104349Sphk                   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1496104349Sphk                   || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1497104349Sphk        atts[nAtts].normalized = 0;
1498104349Sphk      break;
1499104349Sphk    case BT_CR: case BT_LF:
1500104349Sphk      /* This case ensures that the first attribute name is counted
1501104349Sphk         Apart from that we could just change state on the quote. */
1502104349Sphk      if (state == inName)
1503104349Sphk        state = other;
1504104349Sphk      else if (state == inValue && nAtts < attsMax)
1505104349Sphk        atts[nAtts].normalized = 0;
1506104349Sphk      break;
1507104349Sphk    case BT_GT:
1508104349Sphk    case BT_SOL:
      /* '>' or '/' outside a value ends the tag: return the count. */
1509104349Sphk      if (state != inValue)
1510104349Sphk        return nAtts;
1511104349Sphk      break;
1512104349Sphk    default:
1513104349Sphk      break;
1514104349Sphk    }
1515104349Sphk  }
1516104349Sphk  /* not reached */
1517104349Sphk}
1518104349Sphk
1519178848Scokanestatic int PTRFASTCALL
1520302385SdelphijPREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1521104349Sphk{
1522104349Sphk  int result = 0;
1523104349Sphk  /* skip &# */
1524104349Sphk  ptr += 2*MINBPC(enc);
1525104349Sphk  if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1526104349Sphk    for (ptr += MINBPC(enc);
1527104349Sphk         !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1528104349Sphk         ptr += MINBPC(enc)) {
1529104349Sphk      int c = BYTE_TO_ASCII(enc, ptr);
1530104349Sphk      switch (c) {
1531104349Sphk      case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1532104349Sphk      case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1533104349Sphk        result <<= 4;
1534104349Sphk        result |= (c - ASCII_0);
1535104349Sphk        break;
1536104349Sphk      case ASCII_A: case ASCII_B: case ASCII_C:
1537104349Sphk      case ASCII_D: case ASCII_E: case ASCII_F:
1538104349Sphk        result <<= 4;
1539104349Sphk        result += 10 + (c - ASCII_A);
1540104349Sphk        break;
1541104349Sphk      case ASCII_a: case ASCII_b: case ASCII_c:
1542104349Sphk      case ASCII_d: case ASCII_e: case ASCII_f:
1543104349Sphk        result <<= 4;
1544104349Sphk        result += 10 + (c - ASCII_a);
1545104349Sphk        break;
1546104349Sphk      }
1547104349Sphk      if (result >= 0x110000)
1548104349Sphk        return -1;
1549104349Sphk    }
1550104349Sphk  }
1551104349Sphk  else {
1552104349Sphk    for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1553104349Sphk      int c = BYTE_TO_ASCII(enc, ptr);
1554104349Sphk      result *= 10;
1555104349Sphk      result += (c - ASCII_0);
1556104349Sphk      if (result >= 0x110000)
1557104349Sphk        return -1;
1558104349Sphk    }
1559104349Sphk  }
1560104349Sphk  return checkCharRefNumber(result);
1561104349Sphk}
1562104349Sphk
1563178848Scokanestatic int PTRCALL
1564302385SdelphijPREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1565104349Sphk                             const char *end)
1566104349Sphk{
1567104349Sphk  switch ((end - ptr)/MINBPC(enc)) {
1568104349Sphk  case 2:
1569104349Sphk    if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1570104349Sphk      switch (BYTE_TO_ASCII(enc, ptr)) {
1571104349Sphk      case ASCII_l:
1572104349Sphk        return ASCII_LT;
1573104349Sphk      case ASCII_g:
1574104349Sphk        return ASCII_GT;
1575104349Sphk      }
1576104349Sphk    }
1577104349Sphk    break;
1578104349Sphk  case 3:
1579104349Sphk    if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1580104349Sphk      ptr += MINBPC(enc);
1581104349Sphk      if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1582104349Sphk        ptr += MINBPC(enc);
1583104349Sphk        if (CHAR_MATCHES(enc, ptr, ASCII_p))
1584104349Sphk          return ASCII_AMP;
1585104349Sphk      }
1586104349Sphk    }
1587104349Sphk    break;
1588104349Sphk  case 4:
1589104349Sphk    switch (BYTE_TO_ASCII(enc, ptr)) {
1590104349Sphk    case ASCII_q:
1591104349Sphk      ptr += MINBPC(enc);
1592104349Sphk      if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1593104349Sphk        ptr += MINBPC(enc);
1594104349Sphk        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1595104349Sphk          ptr += MINBPC(enc);
1596104349Sphk          if (CHAR_MATCHES(enc, ptr, ASCII_t))
1597104349Sphk            return ASCII_QUOT;
1598104349Sphk        }
1599104349Sphk      }
1600104349Sphk      break;
1601104349Sphk    case ASCII_a:
1602104349Sphk      ptr += MINBPC(enc);
1603104349Sphk      if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1604104349Sphk        ptr += MINBPC(enc);
1605104349Sphk        if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1606104349Sphk          ptr += MINBPC(enc);
1607104349Sphk          if (CHAR_MATCHES(enc, ptr, ASCII_s))
1608104349Sphk            return ASCII_APOS;
1609104349Sphk        }
1610104349Sphk      }
1611104349Sphk      break;
1612104349Sphk    }
1613104349Sphk  }
1614104349Sphk  return 0;
1615104349Sphk}
1616104349Sphk
/* Compares the names that start at ptr1 and ptr2.  Returns 1 when they
   are the same name and 0 otherwise.

   The loop is driven by the byte type at ptr1 and has no length
   arguments: it runs until ptr1 reaches a character that is not one of
   the name byte types handled below, so both inputs must be followed by
   such a character.
*/
1617178848Scokanestatic int PTRCALL
1618104349SphkPREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1619104349Sphk{
1620104349Sphk  for (;;) {
1621104349Sphk    switch (BYTE_TYPE(enc, ptr1)) {
    /* Multi-byte character: the cases are listed 4, 3, 2 and fall through
       into one another, each comparing one byte; together with the final
       comparison after the macros, a BT_LEADn character has n bytes
       compared. */
1622104349Sphk#define LEAD_CASE(n) \
1623104349Sphk    case BT_LEAD ## n: \
1624104349Sphk      if (*ptr1++ != *ptr2++) \
1625104349Sphk        return 0;
1626104349Sphk    LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1627104349Sphk#undef LEAD_CASE
1628104349Sphk      /* fall through */
1629104349Sphk      if (*ptr1++ != *ptr2++)
1630104349Sphk        return 0;
1631104349Sphk      break;
1632104349Sphk    case BT_NONASCII:
1633104349Sphk    case BT_NMSTRT:
1634104349Sphk#ifdef XML_NS
1635104349Sphk    case BT_COLON:
1636104349Sphk#endif
1637104349Sphk    case BT_HEX:
1638104349Sphk    case BT_DIGIT:
1639104349Sphk    case BT_NAME:
1640104349Sphk    case BT_MINUS:
      /* Name character of minimum size: compare MINBPC(enc) bytes (the
         nested tests cover sizes up to 4). */
1641104349Sphk      if (*ptr2++ != *ptr1++)
1642104349Sphk        return 0;
1643104349Sphk      if (MINBPC(enc) > 1) {
1644104349Sphk        if (*ptr2++ != *ptr1++)
1645104349Sphk          return 0;
1646104349Sphk        if (MINBPC(enc) > 2) {
1647104349Sphk          if (*ptr2++ != *ptr1++)
1648104349Sphk            return 0;
1649104349Sphk          if (MINBPC(enc) > 3) {
1650104349Sphk            if (*ptr2++ != *ptr1++)
1651104349Sphk              return 0;
1652104349Sphk          }
1653104349Sphk        }
1654104349Sphk      }
1655104349Sphk      break;
1656104349Sphk    default:
      /* ptr1 is past the end of its name.  With one-byte units, identical
         bytes at both positions settle it at once: the second name ends
         here too. */
1657104349Sphk      if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1658104349Sphk        return 1;
      /* Otherwise the names match only if the second name ends here as
         well, i.e. ptr2 is not at another name character. */
1659104349Sphk      switch (BYTE_TYPE(enc, ptr2)) {
1660104349Sphk      case BT_LEAD2:
1661104349Sphk      case BT_LEAD3:
1662104349Sphk      case BT_LEAD4:
1663104349Sphk      case BT_NONASCII:
1664104349Sphk      case BT_NMSTRT:
1665104349Sphk#ifdef XML_NS
1666104349Sphk      case BT_COLON:
1667104349Sphk#endif
1668104349Sphk      case BT_HEX:
1669104349Sphk      case BT_DIGIT:
1670104349Sphk      case BT_NAME:
1671104349Sphk      case BT_MINUS:
1672104349Sphk        return 0;
1673104349Sphk      default:
1674104349Sphk        return 1;
1675104349Sphk      }
1676104349Sphk    }
1677104349Sphk  }
1678104349Sphk  /* not reached */
1679104349Sphk}
1680104349Sphk
1681178848Scokanestatic int PTRCALL
1682302385SdelphijPREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1683104349Sphk                         const char *end1, const char *ptr2)
1684104349Sphk{
1685104349Sphk  for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1686302385Sdelphij    if (end1 - ptr1 < MINBPC(enc))
1687104349Sphk      return 0;
1688104349Sphk    if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1689104349Sphk      return 0;
1690104349Sphk  }
1691104349Sphk  return ptr1 == end1;
1692104349Sphk}
1693104349Sphk
1694178848Scokanestatic int PTRFASTCALL
1695104349SphkPREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1696104349Sphk{
1697104349Sphk  const char *start = ptr;
1698104349Sphk  for (;;) {
1699104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
1700104349Sphk#define LEAD_CASE(n) \
1701104349Sphk    case BT_LEAD ## n: ptr += n; break;
1702104349Sphk    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1703104349Sphk#undef LEAD_CASE
1704104349Sphk    case BT_NONASCII:
1705104349Sphk    case BT_NMSTRT:
1706104349Sphk#ifdef XML_NS
1707104349Sphk    case BT_COLON:
1708104349Sphk#endif
1709104349Sphk    case BT_HEX:
1710104349Sphk    case BT_DIGIT:
1711104349Sphk    case BT_NAME:
1712104349Sphk    case BT_MINUS:
1713104349Sphk      ptr += MINBPC(enc);
1714104349Sphk      break;
1715104349Sphk    default:
1716178848Scokane      return (int)(ptr - start);
1717104349Sphk    }
1718104349Sphk  }
1719104349Sphk}
1720104349Sphk
1721178848Scokanestatic const char * PTRFASTCALL
1722104349SphkPREFIX(skipS)(const ENCODING *enc, const char *ptr)
1723104349Sphk{
1724104349Sphk  for (;;) {
1725104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
1726104349Sphk    case BT_LF:
1727104349Sphk    case BT_CR:
1728104349Sphk    case BT_S:
1729104349Sphk      ptr += MINBPC(enc);
1730104349Sphk      break;
1731104349Sphk    default:
1732104349Sphk      return ptr;
1733104349Sphk    }
1734104349Sphk  }
1735104349Sphk}
1736104349Sphk
1737178848Scokanestatic void PTRCALL
1738104349SphkPREFIX(updatePosition)(const ENCODING *enc,
1739104349Sphk                       const char *ptr,
1740104349Sphk                       const char *end,
1741104349Sphk                       POSITION *pos)
1742104349Sphk{
1743302385Sdelphij  while (HAS_CHAR(enc, ptr, end)) {
1744104349Sphk    switch (BYTE_TYPE(enc, ptr)) {
1745104349Sphk#define LEAD_CASE(n) \
1746104349Sphk    case BT_LEAD ## n: \
1747104349Sphk      ptr += n; \
1748104349Sphk      break;
1749104349Sphk    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1750104349Sphk#undef LEAD_CASE
1751104349Sphk    case BT_LF:
1752178848Scokane      pos->columnNumber = (XML_Size)-1;
1753104349Sphk      pos->lineNumber++;
1754104349Sphk      ptr += MINBPC(enc);
1755104349Sphk      break;
1756104349Sphk    case BT_CR:
1757104349Sphk      pos->lineNumber++;
1758104349Sphk      ptr += MINBPC(enc);
1759302385Sdelphij      if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1760104349Sphk        ptr += MINBPC(enc);
1761178848Scokane      pos->columnNumber = (XML_Size)-1;
1762104349Sphk      break;
1763104349Sphk    default:
1764104349Sphk      ptr += MINBPC(enc);
1765104349Sphk      break;
1766104349Sphk    }
1767104349Sphk    pos->columnNumber++;
1768104349Sphk  }
1769104349Sphk}
1770104349Sphk
1771104349Sphk#undef DO_LEAD_CASE
1772104349Sphk#undef MULTIBYTE_CASES
1773104349Sphk#undef INVALID_CASES
1774104349Sphk#undef CHECK_NAME_CASE
1775104349Sphk#undef CHECK_NAME_CASES
1776104349Sphk#undef CHECK_NMSTRT_CASE
1777104349Sphk#undef CHECK_NMSTRT_CASES
1778178848Scokane
1779178848Scokane#endif /* XML_TOK_IMPL_C */
1780