1104349Sphk/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 2104349Sphk See the file COPYING for copying permission. 3104349Sphk*/ 4104349Sphk 5178848Scokane/* This file is included! */ 6178848Scokane#ifdef XML_TOK_IMPL_C 7178848Scokane 8104349Sphk#ifndef IS_INVALID_CHAR 9104349Sphk#define IS_INVALID_CHAR(enc, ptr, n) (0) 10104349Sphk#endif 11104349Sphk 12104349Sphk#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \ 13104349Sphk case BT_LEAD ## n: \ 14104349Sphk if (end - ptr < n) \ 15104349Sphk return XML_TOK_PARTIAL_CHAR; \ 16104349Sphk if (IS_INVALID_CHAR(enc, ptr, n)) { \ 17104349Sphk *(nextTokPtr) = (ptr); \ 18104349Sphk return XML_TOK_INVALID; \ 19104349Sphk } \ 20104349Sphk ptr += n; \ 21104349Sphk break; 22104349Sphk 23104349Sphk#define INVALID_CASES(ptr, nextTokPtr) \ 24104349Sphk INVALID_LEAD_CASE(2, ptr, nextTokPtr) \ 25104349Sphk INVALID_LEAD_CASE(3, ptr, nextTokPtr) \ 26104349Sphk INVALID_LEAD_CASE(4, ptr, nextTokPtr) \ 27104349Sphk case BT_NONXML: \ 28104349Sphk case BT_MALFORM: \ 29104349Sphk case BT_TRAIL: \ 30104349Sphk *(nextTokPtr) = (ptr); \ 31104349Sphk return XML_TOK_INVALID; 32104349Sphk 33104349Sphk#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \ 34104349Sphk case BT_LEAD ## n: \ 35104349Sphk if (end - ptr < n) \ 36104349Sphk return XML_TOK_PARTIAL_CHAR; \ 37104349Sphk if (!IS_NAME_CHAR(enc, ptr, n)) { \ 38104349Sphk *nextTokPtr = ptr; \ 39104349Sphk return XML_TOK_INVALID; \ 40104349Sphk } \ 41104349Sphk ptr += n; \ 42104349Sphk break; 43104349Sphk 44104349Sphk#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ 45104349Sphk case BT_NONASCII: \ 46104349Sphk if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \ 47104349Sphk *nextTokPtr = ptr; \ 48104349Sphk return XML_TOK_INVALID; \ 49104349Sphk } \ 50104349Sphk case BT_NMSTRT: \ 51104349Sphk case BT_HEX: \ 52104349Sphk case BT_DIGIT: \ 53104349Sphk case BT_NAME: \ 54104349Sphk case BT_MINUS: \ 55104349Sphk ptr += MINBPC(enc); \ 56104349Sphk break; \ 57104349Sphk CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \ 58104349Sphk CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \ 59104349Sphk CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr) 60104349Sphk 61104349Sphk#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \ 62104349Sphk case BT_LEAD ## n: \ 63104349Sphk if (end - ptr < n) \ 64104349Sphk return XML_TOK_PARTIAL_CHAR; \ 65104349Sphk if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \ 66104349Sphk *nextTokPtr = ptr; \ 67104349Sphk return XML_TOK_INVALID; \ 68104349Sphk } \ 69104349Sphk ptr += n; \ 70104349Sphk break; 71104349Sphk 72104349Sphk#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ 73104349Sphk case BT_NONASCII: \ 74104349Sphk if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ 75104349Sphk *nextTokPtr = ptr; \ 76104349Sphk return XML_TOK_INVALID; \ 77104349Sphk } \ 78104349Sphk case BT_NMSTRT: \ 79104349Sphk case BT_HEX: \ 80104349Sphk ptr += MINBPC(enc); \ 81104349Sphk break; \ 82104349Sphk CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \ 83104349Sphk CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \ 84104349Sphk CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr) 85104349Sphk 86104349Sphk#ifndef PREFIX 87104349Sphk#define PREFIX(ident) ident 88104349Sphk#endif 89104349Sphk 90302385Sdelphij 91302385Sdelphij#define HAS_CHARS(enc, ptr, end, count) \ 92302385Sdelphij (end - ptr >= count * MINBPC(enc)) 93302385Sdelphij 94302385Sdelphij#define HAS_CHAR(enc, ptr, end) \ 95302385Sdelphij HAS_CHARS(enc, ptr, end, 1) 96302385Sdelphij 97302385Sdelphij#define REQUIRE_CHARS(enc, ptr, end, count) \ 98302385Sdelphij { \ 99302385Sdelphij if (! HAS_CHARS(enc, ptr, end, count)) { \ 100302385Sdelphij return XML_TOK_PARTIAL; \ 101302385Sdelphij } \ 102302385Sdelphij } 103302385Sdelphij 104302385Sdelphij#define REQUIRE_CHAR(enc, ptr, end) \ 105302385Sdelphij REQUIRE_CHARS(enc, ptr, end, 1) 106302385Sdelphij 107302385Sdelphij 108104349Sphk/* ptr points to character following "<!-" */ 109104349Sphk 110178848Scokanestatic int PTRCALL 111104349SphkPREFIX(scanComment)(const ENCODING *enc, const char *ptr, 112104349Sphk const char *end, const char **nextTokPtr) 113104349Sphk{ 114302385Sdelphij if (HAS_CHAR(enc, ptr, end)) { 115104349Sphk if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 116104349Sphk *nextTokPtr = ptr; 117104349Sphk return XML_TOK_INVALID; 118104349Sphk } 119104349Sphk ptr += MINBPC(enc); 120302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 121104349Sphk switch (BYTE_TYPE(enc, ptr)) { 122104349Sphk INVALID_CASES(ptr, nextTokPtr) 123104349Sphk case BT_MINUS: 124302385Sdelphij ptr += MINBPC(enc); 125302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 126104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { 127302385Sdelphij ptr += MINBPC(enc); 128302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 129104349Sphk if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 130104349Sphk *nextTokPtr = ptr; 131104349Sphk return XML_TOK_INVALID; 132104349Sphk } 133104349Sphk *nextTokPtr = ptr + MINBPC(enc); 134104349Sphk return XML_TOK_COMMENT; 135104349Sphk } 136104349Sphk break; 137104349Sphk default: 138104349Sphk ptr += MINBPC(enc); 139104349Sphk break; 140104349Sphk } 141104349Sphk } 142104349Sphk } 143104349Sphk return XML_TOK_PARTIAL; 144104349Sphk} 145104349Sphk 146104349Sphk/* ptr points to character following "<!" */ 147104349Sphk 148178848Scokanestatic int PTRCALL 149104349SphkPREFIX(scanDecl)(const ENCODING *enc, const char *ptr, 150104349Sphk const char *end, const char **nextTokPtr) 151104349Sphk{ 152302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 153104349Sphk switch (BYTE_TYPE(enc, ptr)) { 154104349Sphk case BT_MINUS: 155104349Sphk return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 156104349Sphk case BT_LSQB: 157104349Sphk *nextTokPtr = ptr + MINBPC(enc); 158104349Sphk return XML_TOK_COND_SECT_OPEN; 159104349Sphk case BT_NMSTRT: 160104349Sphk case BT_HEX: 161104349Sphk ptr += MINBPC(enc); 162104349Sphk break; 163104349Sphk default: 164104349Sphk *nextTokPtr = ptr; 165104349Sphk return XML_TOK_INVALID; 166104349Sphk } 167302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 168104349Sphk switch (BYTE_TYPE(enc, ptr)) { 169104349Sphk case BT_PERCNT: 170302385Sdelphij REQUIRE_CHARS(enc, ptr, end, 2); 171104349Sphk /* don't allow <!ENTITY% foo "whatever"> */ 172104349Sphk switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) { 173104349Sphk case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: 174104349Sphk *nextTokPtr = ptr; 175104349Sphk return XML_TOK_INVALID; 176104349Sphk } 177104349Sphk /* fall through */ 178104349Sphk case BT_S: case BT_CR: case BT_LF: 179104349Sphk *nextTokPtr = ptr; 180104349Sphk return XML_TOK_DECL_OPEN; 181104349Sphk case BT_NMSTRT: 182104349Sphk case BT_HEX: 183104349Sphk ptr += MINBPC(enc); 184104349Sphk break; 185104349Sphk default: 186104349Sphk *nextTokPtr = ptr; 187104349Sphk return XML_TOK_INVALID; 188104349Sphk } 189104349Sphk } 190104349Sphk return XML_TOK_PARTIAL; 191104349Sphk} 192104349Sphk 193178848Scokanestatic int PTRCALL 194302385SdelphijPREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr, 195104349Sphk const char *end, int *tokPtr) 196104349Sphk{ 197104349Sphk int upper = 0; 198104349Sphk *tokPtr = XML_TOK_PI; 199104349Sphk if (end - ptr != MINBPC(enc)*3) 200104349Sphk return 1; 201104349Sphk switch (BYTE_TO_ASCII(enc, ptr)) { 202104349Sphk case ASCII_x: 203104349Sphk break; 204104349Sphk case ASCII_X: 205104349Sphk upper = 1; 206104349Sphk break; 207104349Sphk default: 208104349Sphk return 1; 209104349Sphk } 210104349Sphk ptr += MINBPC(enc); 211104349Sphk switch (BYTE_TO_ASCII(enc, ptr)) { 212104349Sphk case ASCII_m: 213104349Sphk break; 214104349Sphk case ASCII_M: 215104349Sphk upper = 1; 216104349Sphk break; 217104349Sphk default: 218104349Sphk return 1; 219104349Sphk } 220104349Sphk ptr += MINBPC(enc); 221104349Sphk switch (BYTE_TO_ASCII(enc, ptr)) { 222104349Sphk case ASCII_l: 223104349Sphk break; 224104349Sphk case ASCII_L: 225104349Sphk upper = 1; 226104349Sphk break; 227104349Sphk default: 228104349Sphk return 1; 229104349Sphk } 230104349Sphk if (upper) 231104349Sphk return 0; 232104349Sphk *tokPtr = XML_TOK_XML_DECL; 233104349Sphk return 1; 234104349Sphk} 235104349Sphk 236104349Sphk/* ptr points to character following "<?" */ 237104349Sphk 238178848Scokanestatic int PTRCALL 239104349SphkPREFIX(scanPi)(const ENCODING *enc, const char *ptr, 240104349Sphk const char *end, const char **nextTokPtr) 241104349Sphk{ 242104349Sphk int tok; 243104349Sphk const char *target = ptr; 244302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 245104349Sphk switch (BYTE_TYPE(enc, ptr)) { 246104349Sphk CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 247104349Sphk default: 248104349Sphk *nextTokPtr = ptr; 249104349Sphk return XML_TOK_INVALID; 250104349Sphk } 251302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 252104349Sphk switch (BYTE_TYPE(enc, ptr)) { 253104349Sphk CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 254104349Sphk case BT_S: case BT_CR: case BT_LF: 255104349Sphk if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { 256104349Sphk *nextTokPtr = ptr; 257104349Sphk return XML_TOK_INVALID; 258104349Sphk } 259104349Sphk ptr += MINBPC(enc); 260302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 261104349Sphk switch (BYTE_TYPE(enc, ptr)) { 262104349Sphk INVALID_CASES(ptr, nextTokPtr) 263104349Sphk case BT_QUEST: 264104349Sphk ptr += MINBPC(enc); 265302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 266104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 267104349Sphk *nextTokPtr = ptr + MINBPC(enc); 268104349Sphk return tok; 269104349Sphk } 270104349Sphk break; 271104349Sphk default: 272104349Sphk ptr += MINBPC(enc); 273104349Sphk break; 274104349Sphk } 275104349Sphk } 276104349Sphk return XML_TOK_PARTIAL; 277104349Sphk case BT_QUEST: 278104349Sphk if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { 279104349Sphk *nextTokPtr = ptr; 280104349Sphk return XML_TOK_INVALID; 281104349Sphk } 282104349Sphk ptr += MINBPC(enc); 283302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 284104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 285104349Sphk *nextTokPtr = ptr + MINBPC(enc); 286104349Sphk return tok; 287104349Sphk } 288104349Sphk /* fall through */ 289104349Sphk default: 290104349Sphk *nextTokPtr = ptr; 291104349Sphk return XML_TOK_INVALID; 292104349Sphk } 293104349Sphk } 294104349Sphk return XML_TOK_PARTIAL; 295104349Sphk} 296104349Sphk 297178848Scokanestatic int PTRCALL 298302385SdelphijPREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr, 299104349Sphk const char *end, const char **nextTokPtr) 300104349Sphk{ 301104349Sphk static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, 302104349Sphk ASCII_T, ASCII_A, ASCII_LSQB }; 303104349Sphk int i; 304104349Sphk /* CDATA[ */ 305302385Sdelphij REQUIRE_CHARS(enc, ptr, end, 6); 306104349Sphk for (i = 0; i < 6; i++, ptr += MINBPC(enc)) { 307104349Sphk if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) { 308104349Sphk *nextTokPtr = ptr; 309104349Sphk return XML_TOK_INVALID; 310104349Sphk } 311104349Sphk } 312104349Sphk *nextTokPtr = ptr; 313104349Sphk return XML_TOK_CDATA_SECT_OPEN; 314104349Sphk} 315104349Sphk 316178848Scokanestatic int PTRCALL 317104349SphkPREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, 318104349Sphk const char *end, const char **nextTokPtr) 319104349Sphk{ 320302385Sdelphij if (ptr >= end) 321104349Sphk return XML_TOK_NONE; 322104349Sphk if (MINBPC(enc) > 1) { 323104349Sphk size_t n = end - ptr; 324104349Sphk if (n & (MINBPC(enc) - 1)) { 325104349Sphk n &= ~(MINBPC(enc) - 1); 326104349Sphk if (n == 0) 327104349Sphk return XML_TOK_PARTIAL; 328104349Sphk end = ptr + n; 329104349Sphk } 330104349Sphk } 331104349Sphk switch (BYTE_TYPE(enc, ptr)) { 332104349Sphk case BT_RSQB: 333104349Sphk ptr += MINBPC(enc); 334302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 335104349Sphk if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) 336104349Sphk break; 337104349Sphk ptr += MINBPC(enc); 338302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 339104349Sphk if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 340104349Sphk ptr -= MINBPC(enc); 341104349Sphk break; 342104349Sphk } 343104349Sphk *nextTokPtr = ptr + MINBPC(enc); 344104349Sphk return XML_TOK_CDATA_SECT_CLOSE; 345104349Sphk case BT_CR: 346104349Sphk ptr += MINBPC(enc); 347302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 348104349Sphk if (BYTE_TYPE(enc, ptr) == BT_LF) 349104349Sphk ptr += MINBPC(enc); 350104349Sphk *nextTokPtr = ptr; 351104349Sphk return XML_TOK_DATA_NEWLINE; 352104349Sphk case BT_LF: 353104349Sphk *nextTokPtr = ptr + MINBPC(enc); 354104349Sphk return XML_TOK_DATA_NEWLINE; 355104349Sphk INVALID_CASES(ptr, nextTokPtr) 356104349Sphk default: 357104349Sphk ptr += MINBPC(enc); 358104349Sphk break; 359104349Sphk } 360302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 361104349Sphk switch (BYTE_TYPE(enc, ptr)) { 362104349Sphk#define LEAD_CASE(n) \ 363104349Sphk case BT_LEAD ## n: \ 364104349Sphk if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 365104349Sphk *nextTokPtr = ptr; \ 366104349Sphk return XML_TOK_DATA_CHARS; \ 367104349Sphk } \ 368104349Sphk ptr += n; \ 369104349Sphk break; 370104349Sphk LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 371104349Sphk#undef LEAD_CASE 372104349Sphk case BT_NONXML: 373104349Sphk case BT_MALFORM: 374104349Sphk case BT_TRAIL: 375104349Sphk case BT_CR: 376104349Sphk case BT_LF: 377104349Sphk case BT_RSQB: 378104349Sphk *nextTokPtr = ptr; 379104349Sphk return XML_TOK_DATA_CHARS; 380104349Sphk default: 381104349Sphk ptr += MINBPC(enc); 382104349Sphk break; 383104349Sphk } 384104349Sphk } 385104349Sphk *nextTokPtr = ptr; 386104349Sphk return XML_TOK_DATA_CHARS; 387104349Sphk} 388104349Sphk 389104349Sphk/* ptr points to character following "</" */ 390104349Sphk 391178848Scokanestatic int PTRCALL 392104349SphkPREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, 393104349Sphk const char *end, const char **nextTokPtr) 394104349Sphk{ 395302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 396104349Sphk switch (BYTE_TYPE(enc, ptr)) { 397104349Sphk CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 398104349Sphk default: 399104349Sphk *nextTokPtr = ptr; 400104349Sphk return XML_TOK_INVALID; 401104349Sphk } 402302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 403104349Sphk switch (BYTE_TYPE(enc, ptr)) { 404104349Sphk CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 405104349Sphk case BT_S: case BT_CR: case BT_LF: 406302385Sdelphij for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 407104349Sphk switch (BYTE_TYPE(enc, ptr)) { 408104349Sphk case BT_S: case BT_CR: case BT_LF: 409104349Sphk break; 410104349Sphk case BT_GT: 411104349Sphk *nextTokPtr = ptr + MINBPC(enc); 412104349Sphk return XML_TOK_END_TAG; 413104349Sphk default: 414104349Sphk *nextTokPtr = ptr; 415104349Sphk return XML_TOK_INVALID; 416104349Sphk } 417104349Sphk } 418104349Sphk return XML_TOK_PARTIAL; 419104349Sphk#ifdef XML_NS 420104349Sphk case BT_COLON: 421104349Sphk /* no need to check qname syntax here, 422104349Sphk since end-tag must match exactly */ 423104349Sphk ptr += MINBPC(enc); 424104349Sphk break; 425104349Sphk#endif 426104349Sphk case BT_GT: 427104349Sphk *nextTokPtr = ptr + MINBPC(enc); 428104349Sphk return XML_TOK_END_TAG; 429104349Sphk default: 430104349Sphk *nextTokPtr = ptr; 431104349Sphk return XML_TOK_INVALID; 432104349Sphk } 433104349Sphk } 434104349Sphk return XML_TOK_PARTIAL; 435104349Sphk} 436104349Sphk 437104349Sphk/* ptr points to character following "&#X" */ 438104349Sphk 439178848Scokanestatic int PTRCALL 440104349SphkPREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, 441104349Sphk const char *end, const char **nextTokPtr) 442104349Sphk{ 443302385Sdelphij if (HAS_CHAR(enc, ptr, end)) { 444104349Sphk switch (BYTE_TYPE(enc, ptr)) { 445104349Sphk case BT_DIGIT: 446104349Sphk case BT_HEX: 447104349Sphk break; 448104349Sphk default: 449104349Sphk *nextTokPtr = ptr; 450104349Sphk return XML_TOK_INVALID; 451104349Sphk } 452302385Sdelphij for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 453104349Sphk switch (BYTE_TYPE(enc, ptr)) { 454104349Sphk case BT_DIGIT: 455104349Sphk case BT_HEX: 456104349Sphk break; 457104349Sphk case BT_SEMI: 458104349Sphk *nextTokPtr = ptr + MINBPC(enc); 459104349Sphk return XML_TOK_CHAR_REF; 460104349Sphk default: 461104349Sphk *nextTokPtr = ptr; 462104349Sphk return XML_TOK_INVALID; 463104349Sphk } 464104349Sphk } 465104349Sphk } 466104349Sphk return XML_TOK_PARTIAL; 467104349Sphk} 468104349Sphk 469104349Sphk/* ptr points to character following "&#" */ 470104349Sphk 471178848Scokanestatic int PTRCALL 472104349SphkPREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, 473104349Sphk const char *end, const char **nextTokPtr) 474104349Sphk{ 475302385Sdelphij if (HAS_CHAR(enc, ptr, end)) { 476104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_x)) 477104349Sphk return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 478104349Sphk switch (BYTE_TYPE(enc, ptr)) { 479104349Sphk case BT_DIGIT: 480104349Sphk break; 481104349Sphk default: 482104349Sphk *nextTokPtr = ptr; 483104349Sphk return XML_TOK_INVALID; 484104349Sphk } 485302385Sdelphij for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 486104349Sphk switch (BYTE_TYPE(enc, ptr)) { 487104349Sphk case BT_DIGIT: 488104349Sphk break; 489104349Sphk case BT_SEMI: 490104349Sphk *nextTokPtr = ptr + MINBPC(enc); 491104349Sphk return XML_TOK_CHAR_REF; 492104349Sphk default: 493104349Sphk *nextTokPtr = ptr; 494104349Sphk return XML_TOK_INVALID; 495104349Sphk } 496104349Sphk } 497104349Sphk } 498104349Sphk return XML_TOK_PARTIAL; 499104349Sphk} 500104349Sphk 501104349Sphk/* ptr points to character following "&" */ 502104349Sphk 503178848Scokanestatic int PTRCALL 504104349SphkPREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, 505104349Sphk const char **nextTokPtr) 506104349Sphk{ 507302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 508104349Sphk switch (BYTE_TYPE(enc, ptr)) { 509104349Sphk CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 510104349Sphk case BT_NUM: 511104349Sphk return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 512104349Sphk default: 513104349Sphk *nextTokPtr = ptr; 514104349Sphk return XML_TOK_INVALID; 515104349Sphk } 516302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 517104349Sphk switch (BYTE_TYPE(enc, ptr)) { 518104349Sphk CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 519104349Sphk case BT_SEMI: 520104349Sphk *nextTokPtr = ptr + MINBPC(enc); 521104349Sphk return XML_TOK_ENTITY_REF; 522104349Sphk default: 523104349Sphk *nextTokPtr = ptr; 524104349Sphk return XML_TOK_INVALID; 525104349Sphk } 526104349Sphk } 527104349Sphk return XML_TOK_PARTIAL; 528104349Sphk} 529104349Sphk 530104349Sphk/* ptr points to character following first character of attribute name */ 531104349Sphk 532178848Scokanestatic int PTRCALL 533104349SphkPREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, 534104349Sphk const char **nextTokPtr) 535104349Sphk{ 536104349Sphk#ifdef XML_NS 537104349Sphk int hadColon = 0; 538104349Sphk#endif 539302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 540104349Sphk switch (BYTE_TYPE(enc, ptr)) { 541104349Sphk CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 542104349Sphk#ifdef XML_NS 543104349Sphk case BT_COLON: 544104349Sphk if (hadColon) { 545104349Sphk *nextTokPtr = ptr; 546104349Sphk return XML_TOK_INVALID; 547104349Sphk } 548104349Sphk hadColon = 1; 549104349Sphk ptr += MINBPC(enc); 550302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 551104349Sphk switch (BYTE_TYPE(enc, ptr)) { 552104349Sphk CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 553104349Sphk default: 554104349Sphk *nextTokPtr = ptr; 555104349Sphk return XML_TOK_INVALID; 556104349Sphk } 557104349Sphk break; 558104349Sphk#endif 559104349Sphk case BT_S: case BT_CR: case BT_LF: 560104349Sphk for (;;) { 561104349Sphk int t; 562104349Sphk 563104349Sphk ptr += MINBPC(enc); 564302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 565104349Sphk t = BYTE_TYPE(enc, ptr); 566104349Sphk if (t == BT_EQUALS) 567104349Sphk break; 568104349Sphk switch (t) { 569104349Sphk case BT_S: 570104349Sphk case BT_LF: 571104349Sphk case BT_CR: 572104349Sphk break; 573104349Sphk default: 574104349Sphk *nextTokPtr = ptr; 575104349Sphk return XML_TOK_INVALID; 576104349Sphk } 577104349Sphk } 578104349Sphk /* fall through */ 579104349Sphk case BT_EQUALS: 580104349Sphk { 581104349Sphk int open; 582104349Sphk#ifdef XML_NS 583104349Sphk hadColon = 0; 584104349Sphk#endif 585104349Sphk for (;;) { 586104349Sphk ptr += MINBPC(enc); 587302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 588104349Sphk open = BYTE_TYPE(enc, ptr); 589104349Sphk if (open == BT_QUOT || open == BT_APOS) 590104349Sphk break; 591104349Sphk switch (open) { 592104349Sphk case BT_S: 593104349Sphk case BT_LF: 594104349Sphk case BT_CR: 595104349Sphk break; 596104349Sphk default: 597104349Sphk *nextTokPtr = ptr; 598104349Sphk return XML_TOK_INVALID; 599104349Sphk } 600104349Sphk } 601104349Sphk ptr += MINBPC(enc); 602104349Sphk /* in attribute value */ 603104349Sphk for (;;) { 604104349Sphk int t; 605302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 606104349Sphk t = BYTE_TYPE(enc, ptr); 607104349Sphk if (t == open) 608104349Sphk break; 609104349Sphk switch (t) { 610104349Sphk INVALID_CASES(ptr, nextTokPtr) 611104349Sphk case BT_AMP: 612104349Sphk { 613104349Sphk int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr); 614104349Sphk if (tok <= 0) { 615104349Sphk if (tok == XML_TOK_INVALID) 616104349Sphk *nextTokPtr = ptr; 617104349Sphk return tok; 618104349Sphk } 619104349Sphk break; 620104349Sphk } 621104349Sphk case BT_LT: 622104349Sphk *nextTokPtr = ptr; 623104349Sphk return XML_TOK_INVALID; 624104349Sphk default: 625104349Sphk ptr += MINBPC(enc); 626104349Sphk break; 627104349Sphk } 628104349Sphk } 629104349Sphk ptr += MINBPC(enc); 630302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 631104349Sphk switch (BYTE_TYPE(enc, ptr)) { 632104349Sphk case BT_S: 633104349Sphk case BT_CR: 634104349Sphk case BT_LF: 635104349Sphk break; 636104349Sphk case BT_SOL: 637104349Sphk goto sol; 638104349Sphk case BT_GT: 639104349Sphk goto gt; 640104349Sphk default: 641104349Sphk *nextTokPtr = ptr; 642104349Sphk return XML_TOK_INVALID; 643104349Sphk } 644104349Sphk /* ptr points to closing quote */ 645104349Sphk for (;;) { 646104349Sphk ptr += MINBPC(enc); 647302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 648104349Sphk switch (BYTE_TYPE(enc, ptr)) { 649104349Sphk CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 650104349Sphk case BT_S: case BT_CR: case BT_LF: 651104349Sphk continue; 652104349Sphk case BT_GT: 653104349Sphk gt: 654104349Sphk *nextTokPtr = ptr + MINBPC(enc); 655104349Sphk return XML_TOK_START_TAG_WITH_ATTS; 656104349Sphk case BT_SOL: 657104349Sphk sol: 658104349Sphk ptr += MINBPC(enc); 659302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 660104349Sphk if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 661104349Sphk *nextTokPtr = ptr; 662104349Sphk return XML_TOK_INVALID; 663104349Sphk } 664104349Sphk *nextTokPtr = ptr + MINBPC(enc); 665104349Sphk return XML_TOK_EMPTY_ELEMENT_WITH_ATTS; 666104349Sphk default: 667104349Sphk *nextTokPtr = ptr; 668104349Sphk return XML_TOK_INVALID; 669104349Sphk } 670104349Sphk break; 671104349Sphk } 672104349Sphk break; 673104349Sphk } 674104349Sphk default: 675104349Sphk *nextTokPtr = ptr; 676104349Sphk return XML_TOK_INVALID; 677104349Sphk } 678104349Sphk } 679104349Sphk return XML_TOK_PARTIAL; 680104349Sphk} 681104349Sphk 682104349Sphk/* ptr points to character following "<" */ 683104349Sphk 684178848Scokanestatic int PTRCALL 685104349SphkPREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, 686104349Sphk const char **nextTokPtr) 687104349Sphk{ 688104349Sphk#ifdef XML_NS 689104349Sphk int hadColon; 690104349Sphk#endif 691302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 692104349Sphk switch (BYTE_TYPE(enc, ptr)) { 693104349Sphk CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 694104349Sphk case BT_EXCL: 695302385Sdelphij ptr += MINBPC(enc); 696302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 697104349Sphk switch (BYTE_TYPE(enc, ptr)) { 698104349Sphk case BT_MINUS: 699104349Sphk return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr); 700104349Sphk case BT_LSQB: 701104349Sphk return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), 702104349Sphk end, nextTokPtr); 703104349Sphk } 704104349Sphk *nextTokPtr = ptr; 705104349Sphk return XML_TOK_INVALID; 706104349Sphk case BT_QUEST: 707104349Sphk return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); 708104349Sphk case BT_SOL: 709104349Sphk return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr); 710104349Sphk default: 711104349Sphk *nextTokPtr = ptr; 712104349Sphk return XML_TOK_INVALID; 713104349Sphk } 714104349Sphk#ifdef XML_NS 715104349Sphk hadColon = 0; 716104349Sphk#endif 717104349Sphk /* we have a start-tag */ 718302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 719104349Sphk switch (BYTE_TYPE(enc, ptr)) { 720104349Sphk CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 721104349Sphk#ifdef XML_NS 722104349Sphk case BT_COLON: 723104349Sphk if (hadColon) { 724104349Sphk *nextTokPtr = ptr; 725104349Sphk return XML_TOK_INVALID; 726104349Sphk } 727104349Sphk hadColon = 1; 728104349Sphk ptr += MINBPC(enc); 729302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 730104349Sphk switch (BYTE_TYPE(enc, ptr)) { 731104349Sphk CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 732104349Sphk default: 733104349Sphk *nextTokPtr = ptr; 734104349Sphk return XML_TOK_INVALID; 735104349Sphk } 736104349Sphk break; 737104349Sphk#endif 738104349Sphk case BT_S: case BT_CR: case BT_LF: 739104349Sphk { 740104349Sphk ptr += MINBPC(enc); 741302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 742104349Sphk switch (BYTE_TYPE(enc, ptr)) { 743104349Sphk CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 744104349Sphk case BT_GT: 745104349Sphk goto gt; 746104349Sphk case BT_SOL: 747104349Sphk goto sol; 748104349Sphk case BT_S: case BT_CR: case BT_LF: 749104349Sphk ptr += MINBPC(enc); 750104349Sphk continue; 751104349Sphk default: 752104349Sphk *nextTokPtr = ptr; 753104349Sphk return XML_TOK_INVALID; 754104349Sphk } 755104349Sphk return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr); 756104349Sphk } 757104349Sphk return XML_TOK_PARTIAL; 758104349Sphk } 759104349Sphk case BT_GT: 760104349Sphk gt: 761104349Sphk *nextTokPtr = ptr + MINBPC(enc); 762104349Sphk return XML_TOK_START_TAG_NO_ATTS; 763104349Sphk case BT_SOL: 764104349Sphk sol: 765104349Sphk ptr += MINBPC(enc); 766302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 767104349Sphk if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 768104349Sphk *nextTokPtr = ptr; 769104349Sphk return XML_TOK_INVALID; 770104349Sphk } 771104349Sphk *nextTokPtr = ptr + MINBPC(enc); 772104349Sphk return XML_TOK_EMPTY_ELEMENT_NO_ATTS; 773104349Sphk default: 774104349Sphk *nextTokPtr = ptr; 775104349Sphk return XML_TOK_INVALID; 776104349Sphk } 777104349Sphk } 778104349Sphk return XML_TOK_PARTIAL; 779104349Sphk} 780104349Sphk 781178848Scokanestatic int PTRCALL 782104349SphkPREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, 783104349Sphk const char **nextTokPtr) 784104349Sphk{ 785302385Sdelphij if (ptr >= end) 786104349Sphk return XML_TOK_NONE; 787104349Sphk if (MINBPC(enc) > 1) { 788104349Sphk size_t n = end - ptr; 789104349Sphk if (n & (MINBPC(enc) - 1)) { 790104349Sphk n &= ~(MINBPC(enc) - 1); 791104349Sphk if (n == 0) 792104349Sphk return XML_TOK_PARTIAL; 793104349Sphk end = ptr + n; 794104349Sphk } 795104349Sphk } 796104349Sphk switch (BYTE_TYPE(enc, ptr)) { 797104349Sphk case BT_LT: 798104349Sphk return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr); 799104349Sphk case BT_AMP: 800104349Sphk return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 801104349Sphk case BT_CR: 802104349Sphk ptr += MINBPC(enc); 803302385Sdelphij if (! HAS_CHAR(enc, ptr, end)) 804104349Sphk return XML_TOK_TRAILING_CR; 805104349Sphk if (BYTE_TYPE(enc, ptr) == BT_LF) 806104349Sphk ptr += MINBPC(enc); 807104349Sphk *nextTokPtr = ptr; 808104349Sphk return XML_TOK_DATA_NEWLINE; 809104349Sphk case BT_LF: 810104349Sphk *nextTokPtr = ptr + MINBPC(enc); 811104349Sphk return XML_TOK_DATA_NEWLINE; 812104349Sphk case BT_RSQB: 813104349Sphk ptr += MINBPC(enc); 814302385Sdelphij if (! HAS_CHAR(enc, ptr, end)) 815104349Sphk return XML_TOK_TRAILING_RSQB; 816104349Sphk if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB)) 817104349Sphk break; 818104349Sphk ptr += MINBPC(enc); 819302385Sdelphij if (! HAS_CHAR(enc, ptr, end)) 820104349Sphk return XML_TOK_TRAILING_RSQB; 821104349Sphk if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { 822104349Sphk ptr -= MINBPC(enc); 823104349Sphk break; 824104349Sphk } 825104349Sphk *nextTokPtr = ptr; 826104349Sphk return XML_TOK_INVALID; 827104349Sphk INVALID_CASES(ptr, nextTokPtr) 828104349Sphk default: 829104349Sphk ptr += MINBPC(enc); 830104349Sphk break; 831104349Sphk } 832302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 833104349Sphk switch (BYTE_TYPE(enc, ptr)) { 834104349Sphk#define LEAD_CASE(n) \ 835104349Sphk case BT_LEAD ## n: \ 836104349Sphk if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 837104349Sphk *nextTokPtr = ptr; \ 838104349Sphk return XML_TOK_DATA_CHARS; \ 839104349Sphk } \ 840104349Sphk ptr += n; \ 841104349Sphk break; 842104349Sphk LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 843104349Sphk#undef LEAD_CASE 844104349Sphk case BT_RSQB: 845302385Sdelphij if (HAS_CHARS(enc, ptr, end, 2)) { 846104349Sphk if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) { 847104349Sphk ptr += MINBPC(enc); 848104349Sphk break; 849104349Sphk } 850302385Sdelphij if (HAS_CHARS(enc, ptr, end, 3)) { 851104349Sphk if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) { 852104349Sphk ptr += MINBPC(enc); 853104349Sphk break; 854104349Sphk } 855104349Sphk *nextTokPtr = ptr + 2*MINBPC(enc); 856104349Sphk return XML_TOK_INVALID; 857104349Sphk } 858104349Sphk } 859104349Sphk /* fall through */ 860104349Sphk case BT_AMP: 861104349Sphk case BT_LT: 862104349Sphk case BT_NONXML: 863104349Sphk case BT_MALFORM: 864104349Sphk case BT_TRAIL: 865104349Sphk case BT_CR: 866104349Sphk case BT_LF: 867104349Sphk *nextTokPtr = ptr; 868104349Sphk return XML_TOK_DATA_CHARS; 869104349Sphk default: 870104349Sphk ptr += MINBPC(enc); 871104349Sphk break; 872104349Sphk } 873104349Sphk } 874104349Sphk *nextTokPtr = ptr; 875104349Sphk return XML_TOK_DATA_CHARS; 876104349Sphk} 877104349Sphk 878104349Sphk/* ptr points to character following "%" */ 879104349Sphk 880178848Scokanestatic int PTRCALL 881104349SphkPREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, 882104349Sphk const char **nextTokPtr) 883104349Sphk{ 884302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 885104349Sphk switch (BYTE_TYPE(enc, ptr)) { 886104349Sphk CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 887104349Sphk case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: 888104349Sphk *nextTokPtr = ptr; 889104349Sphk return XML_TOK_PERCENT; 890104349Sphk default: 891104349Sphk *nextTokPtr = ptr; 892104349Sphk return XML_TOK_INVALID; 893104349Sphk } 894302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 895104349Sphk switch (BYTE_TYPE(enc, ptr)) { 896104349Sphk CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 897104349Sphk case BT_SEMI: 898104349Sphk *nextTokPtr = ptr + MINBPC(enc); 899104349Sphk return XML_TOK_PARAM_ENTITY_REF; 900104349Sphk default: 901104349Sphk *nextTokPtr = ptr; 902104349Sphk return XML_TOK_INVALID; 903104349Sphk } 904104349Sphk } 905104349Sphk return XML_TOK_PARTIAL; 906104349Sphk} 907104349Sphk 908178848Scokanestatic int PTRCALL 909104349SphkPREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, 910104349Sphk const char **nextTokPtr) 911104349Sphk{ 912302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 913104349Sphk switch (BYTE_TYPE(enc, ptr)) { 914104349Sphk CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 915104349Sphk default: 916104349Sphk *nextTokPtr = ptr; 917104349Sphk return XML_TOK_INVALID; 918104349Sphk } 919302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 920104349Sphk switch (BYTE_TYPE(enc, ptr)) { 921104349Sphk CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 922104349Sphk case BT_CR: case BT_LF: case BT_S: 923104349Sphk case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR: 924104349Sphk *nextTokPtr = ptr; 925104349Sphk return XML_TOK_POUND_NAME; 926104349Sphk default: 927104349Sphk *nextTokPtr = ptr; 928104349Sphk return XML_TOK_INVALID; 929104349Sphk } 930104349Sphk } 931104349Sphk return -XML_TOK_POUND_NAME; 932104349Sphk} 933104349Sphk 934178848Scokanestatic int PTRCALL 935104349SphkPREFIX(scanLit)(int open, const ENCODING *enc, 936104349Sphk const char *ptr, const char *end, 937104349Sphk const char **nextTokPtr) 938104349Sphk{ 939302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 940104349Sphk int t = BYTE_TYPE(enc, ptr); 941104349Sphk switch (t) { 942104349Sphk INVALID_CASES(ptr, nextTokPtr) 943104349Sphk case BT_QUOT: 944104349Sphk case BT_APOS: 945104349Sphk ptr += MINBPC(enc); 946104349Sphk if (t != open) 947104349Sphk break; 948302385Sdelphij if (! HAS_CHAR(enc, ptr, end)) 949104349Sphk return -XML_TOK_LITERAL; 950104349Sphk *nextTokPtr = ptr; 951104349Sphk switch (BYTE_TYPE(enc, ptr)) { 952104349Sphk case BT_S: case BT_CR: case BT_LF: 953104349Sphk case BT_GT: case BT_PERCNT: case BT_LSQB: 954104349Sphk return XML_TOK_LITERAL; 955104349Sphk default: 956104349Sphk return XML_TOK_INVALID; 957104349Sphk } 958104349Sphk default: 959104349Sphk ptr += MINBPC(enc); 960104349Sphk break; 961104349Sphk } 962104349Sphk } 963104349Sphk return XML_TOK_PARTIAL; 964104349Sphk} 965104349Sphk 966178848Scokanestatic int PTRCALL 967104349SphkPREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, 968104349Sphk const char **nextTokPtr) 969104349Sphk{ 970104349Sphk int tok; 971302385Sdelphij if (ptr >= end) 972104349Sphk return XML_TOK_NONE; 973104349Sphk if (MINBPC(enc) > 1) { 974104349Sphk size_t n = end - ptr; 975104349Sphk if (n & (MINBPC(enc) - 1)) { 976104349Sphk n &= ~(MINBPC(enc) - 1); 977104349Sphk if (n == 0) 978104349Sphk return XML_TOK_PARTIAL; 979104349Sphk end = ptr + n; 980104349Sphk } 981104349Sphk } 982104349Sphk switch (BYTE_TYPE(enc, ptr)) { 983104349Sphk case BT_QUOT: 984104349Sphk return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr); 985104349Sphk case BT_APOS: 986104349Sphk return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr); 987104349Sphk case BT_LT: 988104349Sphk { 989104349Sphk ptr += MINBPC(enc); 990302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 991104349Sphk switch (BYTE_TYPE(enc, ptr)) { 992104349Sphk case BT_EXCL: 993104349Sphk return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr); 994104349Sphk case BT_QUEST: 995104349Sphk return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr); 996104349Sphk case BT_NMSTRT: 997104349Sphk case BT_HEX: 998104349Sphk case BT_NONASCII: 999104349Sphk case BT_LEAD2: 1000104349Sphk case BT_LEAD3: 1001104349Sphk case BT_LEAD4: 1002104349Sphk *nextTokPtr = ptr - MINBPC(enc); 1003104349Sphk return XML_TOK_INSTANCE_START; 1004104349Sphk } 1005104349Sphk *nextTokPtr = ptr; 1006104349Sphk return XML_TOK_INVALID; 1007104349Sphk } 1008104349Sphk case BT_CR: 1009104349Sphk if (ptr + MINBPC(enc) == end) { 1010104349Sphk *nextTokPtr = end; 1011104349Sphk /* indicate that this might be part of a CR/LF pair */ 1012104349Sphk return -XML_TOK_PROLOG_S; 1013104349Sphk } 1014104349Sphk /* fall through */ 1015104349Sphk case BT_S: case BT_LF: 1016104349Sphk for (;;) { 1017104349Sphk ptr += MINBPC(enc); 1018302385Sdelphij if (! HAS_CHAR(enc, ptr, end)) 1019104349Sphk break; 1020104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1021104349Sphk case BT_S: case BT_LF: 1022104349Sphk break; 1023104349Sphk case BT_CR: 1024104349Sphk /* don't split CR/LF pair */ 1025104349Sphk if (ptr + MINBPC(enc) != end) 1026104349Sphk break; 1027104349Sphk /* fall through */ 1028104349Sphk default: 1029104349Sphk *nextTokPtr = ptr; 1030104349Sphk return XML_TOK_PROLOG_S; 1031104349Sphk } 1032104349Sphk } 1033104349Sphk *nextTokPtr = ptr; 1034104349Sphk return XML_TOK_PROLOG_S; 1035104349Sphk case BT_PERCNT: 1036104349Sphk return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1037104349Sphk case BT_COMMA: 1038104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1039104349Sphk return XML_TOK_COMMA; 1040104349Sphk case BT_LSQB: 1041104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1042104349Sphk return XML_TOK_OPEN_BRACKET; 1043104349Sphk case BT_RSQB: 1044104349Sphk ptr += MINBPC(enc); 1045302385Sdelphij if (! HAS_CHAR(enc, ptr, end)) 1046104349Sphk return -XML_TOK_CLOSE_BRACKET; 1047104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { 1048302385Sdelphij REQUIRE_CHARS(enc, ptr, end, 2); 1049104349Sphk if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) { 1050104349Sphk *nextTokPtr = ptr + 2*MINBPC(enc); 1051104349Sphk return XML_TOK_COND_SECT_CLOSE; 1052104349Sphk } 1053104349Sphk } 1054104349Sphk *nextTokPtr = ptr; 1055104349Sphk return XML_TOK_CLOSE_BRACKET; 1056104349Sphk case BT_LPAR: 1057104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1058104349Sphk return XML_TOK_OPEN_PAREN; 1059104349Sphk case BT_RPAR: 1060104349Sphk ptr += MINBPC(enc); 1061302385Sdelphij if (! HAS_CHAR(enc, ptr, end)) 1062104349Sphk return -XML_TOK_CLOSE_PAREN; 1063104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1064104349Sphk case BT_AST: 1065104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1066104349Sphk return XML_TOK_CLOSE_PAREN_ASTERISK; 1067104349Sphk case BT_QUEST: 1068104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1069104349Sphk return XML_TOK_CLOSE_PAREN_QUESTION; 1070104349Sphk case BT_PLUS: 1071104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1072104349Sphk return XML_TOK_CLOSE_PAREN_PLUS; 1073104349Sphk case BT_CR: case BT_LF: case BT_S: 1074104349Sphk case BT_GT: case BT_COMMA: case BT_VERBAR: 1075104349Sphk case BT_RPAR: 1076104349Sphk *nextTokPtr = ptr; 1077104349Sphk return XML_TOK_CLOSE_PAREN; 1078104349Sphk } 1079104349Sphk *nextTokPtr = ptr; 1080104349Sphk return XML_TOK_INVALID; 1081104349Sphk case BT_VERBAR: 1082104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1083104349Sphk return XML_TOK_OR; 1084104349Sphk case BT_GT: 1085104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1086104349Sphk return XML_TOK_DECL_CLOSE; 1087104349Sphk case BT_NUM: 1088104349Sphk return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1089104349Sphk#define LEAD_CASE(n) \ 1090104349Sphk case BT_LEAD ## n: \ 1091104349Sphk if (end - ptr < n) \ 1092104349Sphk return XML_TOK_PARTIAL_CHAR; \ 1093104349Sphk if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ 1094104349Sphk ptr += n; \ 1095104349Sphk tok = XML_TOK_NAME; \ 1096104349Sphk break; \ 1097104349Sphk } \ 1098104349Sphk if (IS_NAME_CHAR(enc, ptr, n)) { \ 1099104349Sphk ptr += n; \ 1100104349Sphk tok = XML_TOK_NMTOKEN; \ 1101104349Sphk break; \ 1102104349Sphk } \ 1103104349Sphk *nextTokPtr = ptr; \ 1104104349Sphk return XML_TOK_INVALID; 1105104349Sphk LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1106104349Sphk#undef LEAD_CASE 1107104349Sphk case BT_NMSTRT: 1108104349Sphk case BT_HEX: 1109104349Sphk tok = XML_TOK_NAME; 1110104349Sphk ptr += MINBPC(enc); 1111104349Sphk break; 1112104349Sphk case BT_DIGIT: 1113104349Sphk case BT_NAME: 1114104349Sphk case BT_MINUS: 1115104349Sphk#ifdef XML_NS 1116104349Sphk case BT_COLON: 1117104349Sphk#endif 1118104349Sphk tok = XML_TOK_NMTOKEN; 1119104349Sphk ptr += MINBPC(enc); 1120104349Sphk break; 1121104349Sphk case BT_NONASCII: 1122104349Sphk if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { 1123104349Sphk ptr += MINBPC(enc); 1124104349Sphk tok = XML_TOK_NAME; 1125104349Sphk break; 1126104349Sphk } 1127104349Sphk if (IS_NAME_CHAR_MINBPC(enc, ptr)) { 1128104349Sphk ptr += MINBPC(enc); 1129104349Sphk tok = XML_TOK_NMTOKEN; 1130104349Sphk break; 1131104349Sphk } 1132104349Sphk /* fall through */ 1133104349Sphk default: 1134104349Sphk *nextTokPtr = ptr; 1135104349Sphk return XML_TOK_INVALID; 1136104349Sphk } 1137302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 1138104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1139104349Sphk CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1140104349Sphk case BT_GT: case BT_RPAR: case BT_COMMA: 1141104349Sphk case BT_VERBAR: case BT_LSQB: case BT_PERCNT: 1142104349Sphk case BT_S: case BT_CR: case BT_LF: 1143104349Sphk *nextTokPtr = ptr; 1144104349Sphk return tok; 1145104349Sphk#ifdef XML_NS 1146104349Sphk case BT_COLON: 1147104349Sphk ptr += MINBPC(enc); 1148104349Sphk switch (tok) { 1149104349Sphk case XML_TOK_NAME: 1150302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 1151104349Sphk tok = XML_TOK_PREFIXED_NAME; 1152104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1153104349Sphk CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1154104349Sphk default: 1155104349Sphk tok = XML_TOK_NMTOKEN; 1156104349Sphk break; 1157104349Sphk } 1158104349Sphk break; 1159104349Sphk case XML_TOK_PREFIXED_NAME: 1160104349Sphk tok = XML_TOK_NMTOKEN; 1161104349Sphk break; 1162104349Sphk } 1163104349Sphk break; 1164104349Sphk#endif 1165104349Sphk case BT_PLUS: 1166104349Sphk if (tok == XML_TOK_NMTOKEN) { 1167104349Sphk *nextTokPtr = ptr; 1168104349Sphk return XML_TOK_INVALID; 1169104349Sphk } 1170104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1171104349Sphk return XML_TOK_NAME_PLUS; 1172104349Sphk case BT_AST: 1173104349Sphk if (tok == XML_TOK_NMTOKEN) { 1174104349Sphk *nextTokPtr = ptr; 1175104349Sphk return XML_TOK_INVALID; 1176104349Sphk } 1177104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1178104349Sphk return XML_TOK_NAME_ASTERISK; 1179104349Sphk case BT_QUEST: 1180104349Sphk if (tok == XML_TOK_NMTOKEN) { 1181104349Sphk *nextTokPtr = ptr; 1182104349Sphk return XML_TOK_INVALID; 1183104349Sphk } 1184104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1185104349Sphk return XML_TOK_NAME_QUESTION; 1186104349Sphk default: 1187104349Sphk *nextTokPtr = ptr; 1188104349Sphk return XML_TOK_INVALID; 1189104349Sphk } 1190104349Sphk } 1191104349Sphk return -tok; 1192104349Sphk} 1193104349Sphk 1194178848Scokanestatic int PTRCALL 1195104349SphkPREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, 1196104349Sphk const char *end, const char **nextTokPtr) 1197104349Sphk{ 1198104349Sphk const char *start; 1199302385Sdelphij if (ptr >= end) 1200104349Sphk return XML_TOK_NONE; 1201302385Sdelphij else if (! HAS_CHAR(enc, ptr, end)) 1202302385Sdelphij return XML_TOK_PARTIAL; 1203104349Sphk start = ptr; 1204302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 1205104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1206104349Sphk#define LEAD_CASE(n) \ 1207104349Sphk case BT_LEAD ## n: ptr += n; break; 1208104349Sphk LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1209104349Sphk#undef LEAD_CASE 1210104349Sphk case BT_AMP: 1211104349Sphk if (ptr == start) 1212104349Sphk return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1213104349Sphk *nextTokPtr = ptr; 1214104349Sphk return XML_TOK_DATA_CHARS; 1215104349Sphk case BT_LT: 1216104349Sphk /* this is for inside entity references */ 1217104349Sphk *nextTokPtr = ptr; 1218104349Sphk return XML_TOK_INVALID; 1219104349Sphk case BT_LF: 1220104349Sphk if (ptr == start) { 1221104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1222104349Sphk return XML_TOK_DATA_NEWLINE; 1223104349Sphk } 1224104349Sphk *nextTokPtr = ptr; 1225104349Sphk return XML_TOK_DATA_CHARS; 1226104349Sphk case BT_CR: 1227104349Sphk if (ptr == start) { 1228104349Sphk ptr += MINBPC(enc); 1229302385Sdelphij if (! HAS_CHAR(enc, ptr, end)) 1230104349Sphk return XML_TOK_TRAILING_CR; 1231104349Sphk if (BYTE_TYPE(enc, ptr) == BT_LF) 1232104349Sphk ptr += MINBPC(enc); 1233104349Sphk *nextTokPtr = ptr; 1234104349Sphk return XML_TOK_DATA_NEWLINE; 1235104349Sphk } 1236104349Sphk *nextTokPtr = ptr; 1237104349Sphk return XML_TOK_DATA_CHARS; 1238104349Sphk case BT_S: 1239104349Sphk if (ptr == start) { 1240104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1241104349Sphk return XML_TOK_ATTRIBUTE_VALUE_S; 1242104349Sphk } 1243104349Sphk *nextTokPtr = ptr; 1244104349Sphk return XML_TOK_DATA_CHARS; 1245104349Sphk default: 1246104349Sphk ptr += MINBPC(enc); 1247104349Sphk break; 1248104349Sphk } 1249104349Sphk } 1250104349Sphk *nextTokPtr = ptr; 1251104349Sphk return XML_TOK_DATA_CHARS; 1252104349Sphk} 1253104349Sphk 1254178848Scokanestatic int PTRCALL 1255104349SphkPREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, 1256104349Sphk const char *end, const char **nextTokPtr) 1257104349Sphk{ 1258104349Sphk const char *start; 1259302385Sdelphij if (ptr >= end) 1260104349Sphk return XML_TOK_NONE; 1261302385Sdelphij else if (! HAS_CHAR(enc, ptr, end)) 1262302385Sdelphij return XML_TOK_PARTIAL; 1263104349Sphk start = ptr; 1264302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 1265104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1266104349Sphk#define LEAD_CASE(n) \ 1267104349Sphk case BT_LEAD ## n: ptr += n; break; 1268104349Sphk LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1269104349Sphk#undef LEAD_CASE 1270104349Sphk case BT_AMP: 1271104349Sphk if (ptr == start) 1272104349Sphk return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr); 1273104349Sphk *nextTokPtr = ptr; 1274104349Sphk return XML_TOK_DATA_CHARS; 1275104349Sphk case BT_PERCNT: 1276104349Sphk if (ptr == start) { 1277104349Sphk int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), 1278104349Sphk end, nextTokPtr); 1279104349Sphk return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok; 1280104349Sphk } 1281104349Sphk *nextTokPtr = ptr; 1282104349Sphk return XML_TOK_DATA_CHARS; 1283104349Sphk case BT_LF: 1284104349Sphk if (ptr == start) { 1285104349Sphk *nextTokPtr = ptr + MINBPC(enc); 1286104349Sphk return XML_TOK_DATA_NEWLINE; 1287104349Sphk } 1288104349Sphk *nextTokPtr = ptr; 1289104349Sphk return XML_TOK_DATA_CHARS; 1290104349Sphk case BT_CR: 1291104349Sphk if (ptr == start) { 1292104349Sphk ptr += MINBPC(enc); 1293302385Sdelphij if (! HAS_CHAR(enc, ptr, end)) 1294104349Sphk return XML_TOK_TRAILING_CR; 1295104349Sphk if (BYTE_TYPE(enc, ptr) == BT_LF) 1296104349Sphk ptr += MINBPC(enc); 1297104349Sphk *nextTokPtr = ptr; 1298104349Sphk return XML_TOK_DATA_NEWLINE; 1299104349Sphk } 1300104349Sphk *nextTokPtr = ptr; 1301104349Sphk return XML_TOK_DATA_CHARS; 1302104349Sphk default: 1303104349Sphk ptr += MINBPC(enc); 1304104349Sphk break; 1305104349Sphk } 1306104349Sphk } 1307104349Sphk *nextTokPtr = ptr; 1308104349Sphk return XML_TOK_DATA_CHARS; 1309104349Sphk} 1310104349Sphk 1311104349Sphk#ifdef XML_DTD 1312104349Sphk 1313178848Scokanestatic int PTRCALL 1314104349SphkPREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, 1315104349Sphk const char *end, const char **nextTokPtr) 1316104349Sphk{ 1317104349Sphk int level = 0; 1318104349Sphk if (MINBPC(enc) > 1) { 1319104349Sphk size_t n = end - ptr; 1320104349Sphk if (n & (MINBPC(enc) - 1)) { 1321104349Sphk n &= ~(MINBPC(enc) - 1); 1322104349Sphk end = ptr + n; 1323104349Sphk } 1324104349Sphk } 1325302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 1326104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1327104349Sphk INVALID_CASES(ptr, nextTokPtr) 1328104349Sphk case BT_LT: 1329302385Sdelphij ptr += MINBPC(enc); 1330302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 1331104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) { 1332302385Sdelphij ptr += MINBPC(enc); 1333302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 1334104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) { 1335104349Sphk ++level; 1336104349Sphk ptr += MINBPC(enc); 1337104349Sphk } 1338104349Sphk } 1339104349Sphk break; 1340104349Sphk case BT_RSQB: 1341302385Sdelphij ptr += MINBPC(enc); 1342302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 1343104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) { 1344302385Sdelphij ptr += MINBPC(enc); 1345302385Sdelphij REQUIRE_CHAR(enc, ptr, end); 1346104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_GT)) { 1347104349Sphk ptr += MINBPC(enc); 1348104349Sphk if (level == 0) { 1349104349Sphk *nextTokPtr = ptr; 1350104349Sphk return XML_TOK_IGNORE_SECT; 1351104349Sphk } 1352104349Sphk --level; 1353104349Sphk } 1354104349Sphk } 1355104349Sphk break; 1356104349Sphk default: 1357104349Sphk ptr += MINBPC(enc); 1358104349Sphk break; 1359104349Sphk } 1360104349Sphk } 1361104349Sphk return XML_TOK_PARTIAL; 1362104349Sphk} 1363104349Sphk 1364104349Sphk#endif /* XML_DTD */ 1365104349Sphk 1366178848Scokanestatic int PTRCALL 1367104349SphkPREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, 1368104349Sphk const char **badPtr) 1369104349Sphk{ 1370104349Sphk ptr += MINBPC(enc); 1371104349Sphk end -= MINBPC(enc); 1372302385Sdelphij for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) { 1373104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1374104349Sphk case BT_DIGIT: 1375104349Sphk case BT_HEX: 1376104349Sphk case BT_MINUS: 1377104349Sphk case BT_APOS: 1378104349Sphk case BT_LPAR: 1379104349Sphk case BT_RPAR: 1380104349Sphk case BT_PLUS: 1381104349Sphk case BT_COMMA: 1382104349Sphk case BT_SOL: 1383104349Sphk case BT_EQUALS: 1384104349Sphk case BT_QUEST: 1385104349Sphk case BT_CR: 1386104349Sphk case BT_LF: 1387104349Sphk case BT_SEMI: 1388104349Sphk case BT_EXCL: 1389104349Sphk case BT_AST: 1390104349Sphk case BT_PERCNT: 1391104349Sphk case BT_NUM: 1392104349Sphk#ifdef XML_NS 1393104349Sphk case BT_COLON: 1394104349Sphk#endif 1395104349Sphk break; 1396104349Sphk case BT_S: 1397104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) { 1398104349Sphk *badPtr = ptr; 1399104349Sphk return 0; 1400104349Sphk } 1401104349Sphk break; 1402104349Sphk case BT_NAME: 1403104349Sphk case BT_NMSTRT: 1404104349Sphk if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f)) 1405104349Sphk break; 1406104349Sphk default: 1407104349Sphk switch (BYTE_TO_ASCII(enc, ptr)) { 1408104349Sphk case 0x24: /* $ */ 1409104349Sphk case 0x40: /* @ */ 1410104349Sphk break; 1411104349Sphk default: 1412104349Sphk *badPtr = ptr; 1413104349Sphk return 0; 1414104349Sphk } 1415104349Sphk break; 1416104349Sphk } 1417104349Sphk } 1418104349Sphk return 1; 1419104349Sphk} 1420104349Sphk 1421104349Sphk/* This must only be called for a well-formed start-tag or empty 1422104349Sphk element tag. Returns the number of attributes. Pointers to the 1423104349Sphk first attsMax attributes are stored in atts. 1424104349Sphk*/ 1425104349Sphk 1426178848Scokanestatic int PTRCALL 1427104349SphkPREFIX(getAtts)(const ENCODING *enc, const char *ptr, 1428104349Sphk int attsMax, ATTRIBUTE *atts) 1429104349Sphk{ 1430104349Sphk enum { other, inName, inValue } state = inName; 1431104349Sphk int nAtts = 0; 1432104349Sphk int open = 0; /* defined when state == inValue; 1433104349Sphk initialization just to shut up compilers */ 1434104349Sphk 1435104349Sphk for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) { 1436104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1437104349Sphk#define START_NAME \ 1438104349Sphk if (state == other) { \ 1439104349Sphk if (nAtts < attsMax) { \ 1440104349Sphk atts[nAtts].name = ptr; \ 1441104349Sphk atts[nAtts].normalized = 1; \ 1442104349Sphk } \ 1443104349Sphk state = inName; \ 1444104349Sphk } 1445104349Sphk#define LEAD_CASE(n) \ 1446104349Sphk case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break; 1447104349Sphk LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1448104349Sphk#undef LEAD_CASE 1449104349Sphk case BT_NONASCII: 1450104349Sphk case BT_NMSTRT: 1451104349Sphk case BT_HEX: 1452104349Sphk START_NAME 1453104349Sphk break; 1454104349Sphk#undef START_NAME 1455104349Sphk case BT_QUOT: 1456104349Sphk if (state != inValue) { 1457104349Sphk if (nAtts < attsMax) 1458104349Sphk atts[nAtts].valuePtr = ptr + MINBPC(enc); 1459104349Sphk state = inValue; 1460104349Sphk open = BT_QUOT; 1461104349Sphk } 1462104349Sphk else if (open == BT_QUOT) { 1463104349Sphk state = other; 1464104349Sphk if (nAtts < attsMax) 1465104349Sphk atts[nAtts].valueEnd = ptr; 1466104349Sphk nAtts++; 1467104349Sphk } 1468104349Sphk break; 1469104349Sphk case BT_APOS: 1470104349Sphk if (state != inValue) { 1471104349Sphk if (nAtts < attsMax) 1472104349Sphk atts[nAtts].valuePtr = ptr + MINBPC(enc); 1473104349Sphk state = inValue; 1474104349Sphk open = BT_APOS; 1475104349Sphk } 1476104349Sphk else if (open == BT_APOS) { 1477104349Sphk state = other; 1478104349Sphk if (nAtts < attsMax) 1479104349Sphk atts[nAtts].valueEnd = ptr; 1480104349Sphk nAtts++; 1481104349Sphk } 1482104349Sphk break; 1483104349Sphk case BT_AMP: 1484104349Sphk if (nAtts < attsMax) 1485104349Sphk atts[nAtts].normalized = 0; 1486104349Sphk break; 1487104349Sphk case BT_S: 1488104349Sphk if (state == inName) 1489104349Sphk state = other; 1490104349Sphk else if (state == inValue 1491104349Sphk && nAtts < attsMax 1492104349Sphk && atts[nAtts].normalized 1493104349Sphk && (ptr == atts[nAtts].valuePtr 1494104349Sphk || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE 1495104349Sphk || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE 1496104349Sphk || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open)) 1497104349Sphk atts[nAtts].normalized = 0; 1498104349Sphk break; 1499104349Sphk case BT_CR: case BT_LF: 1500104349Sphk /* This case ensures that the first attribute name is counted 1501104349Sphk Apart from that we could just change state on the quote. */ 1502104349Sphk if (state == inName) 1503104349Sphk state = other; 1504104349Sphk else if (state == inValue && nAtts < attsMax) 1505104349Sphk atts[nAtts].normalized = 0; 1506104349Sphk break; 1507104349Sphk case BT_GT: 1508104349Sphk case BT_SOL: 1509104349Sphk if (state != inValue) 1510104349Sphk return nAtts; 1511104349Sphk break; 1512104349Sphk default: 1513104349Sphk break; 1514104349Sphk } 1515104349Sphk } 1516104349Sphk /* not reached */ 1517104349Sphk} 1518104349Sphk 1519178848Scokanestatic int PTRFASTCALL 1520302385SdelphijPREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr) 1521104349Sphk{ 1522104349Sphk int result = 0; 1523104349Sphk /* skip &# */ 1524104349Sphk ptr += 2*MINBPC(enc); 1525104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_x)) { 1526104349Sphk for (ptr += MINBPC(enc); 1527104349Sphk !CHAR_MATCHES(enc, ptr, ASCII_SEMI); 1528104349Sphk ptr += MINBPC(enc)) { 1529104349Sphk int c = BYTE_TO_ASCII(enc, ptr); 1530104349Sphk switch (c) { 1531104349Sphk case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4: 1532104349Sphk case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9: 1533104349Sphk result <<= 4; 1534104349Sphk result |= (c - ASCII_0); 1535104349Sphk break; 1536104349Sphk case ASCII_A: case ASCII_B: case ASCII_C: 1537104349Sphk case ASCII_D: case ASCII_E: case ASCII_F: 1538104349Sphk result <<= 4; 1539104349Sphk result += 10 + (c - ASCII_A); 1540104349Sphk break; 1541104349Sphk case ASCII_a: case ASCII_b: case ASCII_c: 1542104349Sphk case ASCII_d: case ASCII_e: case ASCII_f: 1543104349Sphk result <<= 4; 1544104349Sphk result += 10 + (c - ASCII_a); 1545104349Sphk break; 1546104349Sphk } 1547104349Sphk if (result >= 0x110000) 1548104349Sphk return -1; 1549104349Sphk } 1550104349Sphk } 1551104349Sphk else { 1552104349Sphk for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) { 1553104349Sphk int c = BYTE_TO_ASCII(enc, ptr); 1554104349Sphk result *= 10; 1555104349Sphk result += (c - ASCII_0); 1556104349Sphk if (result >= 0x110000) 1557104349Sphk return -1; 1558104349Sphk } 1559104349Sphk } 1560104349Sphk return checkCharRefNumber(result); 1561104349Sphk} 1562104349Sphk 1563178848Scokanestatic int PTRCALL 1564302385SdelphijPREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr, 1565104349Sphk const char *end) 1566104349Sphk{ 1567104349Sphk switch ((end - ptr)/MINBPC(enc)) { 1568104349Sphk case 2: 1569104349Sphk if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) { 1570104349Sphk switch (BYTE_TO_ASCII(enc, ptr)) { 1571104349Sphk case ASCII_l: 1572104349Sphk return ASCII_LT; 1573104349Sphk case ASCII_g: 1574104349Sphk return ASCII_GT; 1575104349Sphk } 1576104349Sphk } 1577104349Sphk break; 1578104349Sphk case 3: 1579104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_a)) { 1580104349Sphk ptr += MINBPC(enc); 1581104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_m)) { 1582104349Sphk ptr += MINBPC(enc); 1583104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_p)) 1584104349Sphk return ASCII_AMP; 1585104349Sphk } 1586104349Sphk } 1587104349Sphk break; 1588104349Sphk case 4: 1589104349Sphk switch (BYTE_TO_ASCII(enc, ptr)) { 1590104349Sphk case ASCII_q: 1591104349Sphk ptr += MINBPC(enc); 1592104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_u)) { 1593104349Sphk ptr += MINBPC(enc); 1594104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1595104349Sphk ptr += MINBPC(enc); 1596104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_t)) 1597104349Sphk return ASCII_QUOT; 1598104349Sphk } 1599104349Sphk } 1600104349Sphk break; 1601104349Sphk case ASCII_a: 1602104349Sphk ptr += MINBPC(enc); 1603104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_p)) { 1604104349Sphk ptr += MINBPC(enc); 1605104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_o)) { 1606104349Sphk ptr += MINBPC(enc); 1607104349Sphk if (CHAR_MATCHES(enc, ptr, ASCII_s)) 1608104349Sphk return ASCII_APOS; 1609104349Sphk } 1610104349Sphk } 1611104349Sphk break; 1612104349Sphk } 1613104349Sphk } 1614104349Sphk return 0; 1615104349Sphk} 1616104349Sphk 1617178848Scokanestatic int PTRCALL 1618104349SphkPREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) 1619104349Sphk{ 1620104349Sphk for (;;) { 1621104349Sphk switch (BYTE_TYPE(enc, ptr1)) { 1622104349Sphk#define LEAD_CASE(n) \ 1623104349Sphk case BT_LEAD ## n: \ 1624104349Sphk if (*ptr1++ != *ptr2++) \ 1625104349Sphk return 0; 1626104349Sphk LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2) 1627104349Sphk#undef LEAD_CASE 1628104349Sphk /* fall through */ 1629104349Sphk if (*ptr1++ != *ptr2++) 1630104349Sphk return 0; 1631104349Sphk break; 1632104349Sphk case BT_NONASCII: 1633104349Sphk case BT_NMSTRT: 1634104349Sphk#ifdef XML_NS 1635104349Sphk case BT_COLON: 1636104349Sphk#endif 1637104349Sphk case BT_HEX: 1638104349Sphk case BT_DIGIT: 1639104349Sphk case BT_NAME: 1640104349Sphk case BT_MINUS: 1641104349Sphk if (*ptr2++ != *ptr1++) 1642104349Sphk return 0; 1643104349Sphk if (MINBPC(enc) > 1) { 1644104349Sphk if (*ptr2++ != *ptr1++) 1645104349Sphk return 0; 1646104349Sphk if (MINBPC(enc) > 2) { 1647104349Sphk if (*ptr2++ != *ptr1++) 1648104349Sphk return 0; 1649104349Sphk if (MINBPC(enc) > 3) { 1650104349Sphk if (*ptr2++ != *ptr1++) 1651104349Sphk return 0; 1652104349Sphk } 1653104349Sphk } 1654104349Sphk } 1655104349Sphk break; 1656104349Sphk default: 1657104349Sphk if (MINBPC(enc) == 1 && *ptr1 == *ptr2) 1658104349Sphk return 1; 1659104349Sphk switch (BYTE_TYPE(enc, ptr2)) { 1660104349Sphk case BT_LEAD2: 1661104349Sphk case BT_LEAD3: 1662104349Sphk case BT_LEAD4: 1663104349Sphk case BT_NONASCII: 1664104349Sphk case BT_NMSTRT: 1665104349Sphk#ifdef XML_NS 1666104349Sphk case BT_COLON: 1667104349Sphk#endif 1668104349Sphk case BT_HEX: 1669104349Sphk case BT_DIGIT: 1670104349Sphk case BT_NAME: 1671104349Sphk case BT_MINUS: 1672104349Sphk return 0; 1673104349Sphk default: 1674104349Sphk return 1; 1675104349Sphk } 1676104349Sphk } 1677104349Sphk } 1678104349Sphk /* not reached */ 1679104349Sphk} 1680104349Sphk 1681178848Scokanestatic int PTRCALL 1682302385SdelphijPREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1, 1683104349Sphk const char *end1, const char *ptr2) 1684104349Sphk{ 1685104349Sphk for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) { 1686302385Sdelphij if (end1 - ptr1 < MINBPC(enc)) 1687104349Sphk return 0; 1688104349Sphk if (!CHAR_MATCHES(enc, ptr1, *ptr2)) 1689104349Sphk return 0; 1690104349Sphk } 1691104349Sphk return ptr1 == end1; 1692104349Sphk} 1693104349Sphk 1694178848Scokanestatic int PTRFASTCALL 1695104349SphkPREFIX(nameLength)(const ENCODING *enc, const char *ptr) 1696104349Sphk{ 1697104349Sphk const char *start = ptr; 1698104349Sphk for (;;) { 1699104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1700104349Sphk#define LEAD_CASE(n) \ 1701104349Sphk case BT_LEAD ## n: ptr += n; break; 1702104349Sphk LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1703104349Sphk#undef LEAD_CASE 1704104349Sphk case BT_NONASCII: 1705104349Sphk case BT_NMSTRT: 1706104349Sphk#ifdef XML_NS 1707104349Sphk case BT_COLON: 1708104349Sphk#endif 1709104349Sphk case BT_HEX: 1710104349Sphk case BT_DIGIT: 1711104349Sphk case BT_NAME: 1712104349Sphk case BT_MINUS: 1713104349Sphk ptr += MINBPC(enc); 1714104349Sphk break; 1715104349Sphk default: 1716178848Scokane return (int)(ptr - start); 1717104349Sphk } 1718104349Sphk } 1719104349Sphk} 1720104349Sphk 1721178848Scokanestatic const char * PTRFASTCALL 1722104349SphkPREFIX(skipS)(const ENCODING *enc, const char *ptr) 1723104349Sphk{ 1724104349Sphk for (;;) { 1725104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1726104349Sphk case BT_LF: 1727104349Sphk case BT_CR: 1728104349Sphk case BT_S: 1729104349Sphk ptr += MINBPC(enc); 1730104349Sphk break; 1731104349Sphk default: 1732104349Sphk return ptr; 1733104349Sphk } 1734104349Sphk } 1735104349Sphk} 1736104349Sphk 1737178848Scokanestatic void PTRCALL 1738104349SphkPREFIX(updatePosition)(const ENCODING *enc, 1739104349Sphk const char *ptr, 1740104349Sphk const char *end, 1741104349Sphk POSITION *pos) 1742104349Sphk{ 1743302385Sdelphij while (HAS_CHAR(enc, ptr, end)) { 1744104349Sphk switch (BYTE_TYPE(enc, ptr)) { 1745104349Sphk#define LEAD_CASE(n) \ 1746104349Sphk case BT_LEAD ## n: \ 1747104349Sphk ptr += n; \ 1748104349Sphk break; 1749104349Sphk LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1750104349Sphk#undef LEAD_CASE 1751104349Sphk case BT_LF: 1752178848Scokane pos->columnNumber = (XML_Size)-1; 1753104349Sphk pos->lineNumber++; 1754104349Sphk ptr += MINBPC(enc); 1755104349Sphk break; 1756104349Sphk case BT_CR: 1757104349Sphk pos->lineNumber++; 1758104349Sphk ptr += MINBPC(enc); 1759302385Sdelphij if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF) 1760104349Sphk ptr += MINBPC(enc); 1761178848Scokane pos->columnNumber = (XML_Size)-1; 1762104349Sphk break; 1763104349Sphk default: 1764104349Sphk ptr += MINBPC(enc); 1765104349Sphk break; 1766104349Sphk } 1767104349Sphk pos->columnNumber++; 1768104349Sphk } 1769104349Sphk} 1770104349Sphk 1771104349Sphk#undef DO_LEAD_CASE 1772104349Sphk#undef MULTIBYTE_CASES 1773104349Sphk#undef INVALID_CASES 1774104349Sphk#undef CHECK_NAME_CASE 1775104349Sphk#undef CHECK_NAME_CASES 1776104349Sphk#undef CHECK_NMSTRT_CASE 1777104349Sphk#undef CHECK_NMSTRT_CASES 1778178848Scokane 1779178848Scokane#endif /* XML_TOK_IMPL_C */ 1780