1/* 2The contents of this file are subject to the Mozilla Public License 3Version 1.0 (the "License"); you may not use this file except in 4compliance with the License. You may obtain a copy of the License at 5http://www.mozilla.org/MPL/ 6 7Software distributed under the License is distributed on an "AS IS" 8basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 9License for the specific language governing rights and limitations 10under the License. 11 12The Original Code is expat. 13 14The Initial Developer of the Original Code is James Clark. 15Portions created by James Clark are Copyright (C) 1998 16James Clark. All Rights Reserved. 17 18Contributor(s): 19*/ 20 21#ifndef IS_INVALID_CHAR 22#define IS_INVALID_CHAR(enc, ptr, n) (0) 23#endif 24 25#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \ 26 case BT_LEAD ## n: \ 27 if (end - ptr < n) \ 28 return XML_TOK_PARTIAL_CHAR; \ 29 if (IS_INVALID_CHAR(enc, ptr, n)) { \ 30 *(nextTokPtr) = (ptr); \ 31 return XML_TOK_INVALID; \ 32 } \ 33 ptr += n; \ 34 break; 35 36#define INVALID_CASES(ptr, nextTokPtr) \ 37 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \ 38 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \ 39 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \ 40 case BT_NONXML: \ 41 case BT_MALFORM: \ 42 case BT_TRAIL: \ 43 *(nextTokPtr) = (ptr); \ 44 return XML_TOK_INVALID; 45 46#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \ 47 case BT_LEAD ## n: \ 48 if (end - ptr < n) \ 49 return XML_TOK_PARTIAL_CHAR; \ 50 if (!IS_NAME_CHAR(enc, ptr, n)) { \ 51 *nextTokPtr = ptr; \ 52 return XML_TOK_INVALID; \ 53 } \ 54 ptr += n; \ 55 break; 56 57#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ 58 case BT_NONASCII: \ 59 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \ 60 *nextTokPtr = ptr; \ 61 return XML_TOK_INVALID; \ 62 } \ 63 case BT_NMSTRT: \ 64 case BT_HEX: \ 65 case BT_DIGIT: \ 66 case BT_NAME: \ 67 case BT_MINUS: \ 68 ptr += MINBPC; \ 69 break; \ 70 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \ 71 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \ 72 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr) 73 74#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \ 75 case BT_LEAD ## n: \ 76 if (end - ptr < n) \ 77 return XML_TOK_PARTIAL_CHAR; \ 78 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \ 79 *nextTokPtr = ptr; \ 80 return XML_TOK_INVALID; \ 81 } \ 82 ptr += n; \ 83 break; 84 85#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ 86 case BT_NONASCII: \ 87 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ 88 *nextTokPtr = ptr; \ 89 return XML_TOK_INVALID; \ 90 } \ 91 case BT_NMSTRT: \ 92 case BT_HEX: \ 93 ptr += MINBPC; \ 94 break; \ 95 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \ 96 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \ 97 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr) 98 99#ifndef PREFIX 100#define PREFIX(ident) ident 101#endif 102 103/* ptr points to character following "<!-" */ 104 105static 106int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end, 107 const char **nextTokPtr) 108{ 109 if (ptr != end) { 110 if (!CHAR_MATCHES(enc, ptr, '-')) { 111 *nextTokPtr = ptr; 112 return XML_TOK_INVALID; 113 } 114 ptr += MINBPC; 115 while (ptr != end) { 116 switch (BYTE_TYPE(enc, ptr)) { 117 INVALID_CASES(ptr, nextTokPtr) 118 case BT_MINUS: 119 if ((ptr += MINBPC) == end) 120 return XML_TOK_PARTIAL; 121 if (CHAR_MATCHES(enc, ptr, '-')) { 122 if ((ptr += MINBPC) == end) 123 return XML_TOK_PARTIAL; 124 if (!CHAR_MATCHES(enc, ptr, '>')) { 125 *nextTokPtr = ptr; 126 return XML_TOK_INVALID; 127 } 128 *nextTokPtr = ptr + MINBPC; 129 return XML_TOK_COMMENT; 130 } 131 /* fall through */ 132 default: 133 ptr += MINBPC; 134 break; 135 } 136 } 137 } 138 return XML_TOK_PARTIAL; 139} 140 141/* ptr points to character following "<!" */ 142 143static 144int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end, 145 const char **nextTokPtr) 146{ 147 if (ptr == end) 148 return XML_TOK_PARTIAL; 149 switch (BYTE_TYPE(enc, ptr)) { 150 case BT_MINUS: 151 return PREFIX(scanComment)(enc, ptr + MINBPC, end, nextTokPtr); 152 case BT_LSQB: 153 *nextTokPtr = ptr + MINBPC; 154 return XML_TOK_COND_SECT_OPEN; 155 case BT_NMSTRT: 156 case BT_HEX: 157 ptr += MINBPC; 158 break; 159 default: 160 *nextTokPtr = ptr; 161 return XML_TOK_INVALID; 162 } 163 while (ptr != end) { 164 switch (BYTE_TYPE(enc, ptr)) { 165 case BT_PERCNT: 166 if (ptr + MINBPC == end) 167 return XML_TOK_PARTIAL; 168 /* don't allow <!ENTITY% foo "whatever"> */ 169 switch (BYTE_TYPE(enc, ptr + MINBPC)) { 170 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT: 171 *nextTokPtr = ptr; 172 return XML_TOK_INVALID; 173 } 174 /* fall through */ 175 case BT_S: case BT_CR: case BT_LF: 176 *nextTokPtr = ptr; 177 return XML_TOK_DECL_OPEN; 178 case BT_NMSTRT: 179 case BT_HEX: 180 ptr += MINBPC; 181 break; 182 default: 183 *nextTokPtr = ptr; 184 return XML_TOK_INVALID; 185 } 186 } 187 return XML_TOK_PARTIAL; 188} 189 190static 191int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr) 192{ 193 int upper = 0; 194 *tokPtr = XML_TOK_PI; 195 if (end - ptr != MINBPC*3) 196 return 1; 197 switch (BYTE_TO_ASCII(enc, ptr)) { 198 case 'x': 199 break; 200 case 'X': 201 upper = 1; 202 break; 203 default: 204 return 1; 205 } 206 ptr += MINBPC; 207 switch (BYTE_TO_ASCII(enc, ptr)) { 208 case 'm': 209 break; 210 case 'M': 211 upper = 1; 212 break; 213 default: 214 return 1; 215 } 216 ptr += MINBPC; 217 switch (BYTE_TO_ASCII(enc, ptr)) { 218 case 'l': 219 break; 220 case 'L': 221 upper = 1; 222 break; 223 default: 224 return 1; 225 } 226 if (upper) 227 return 0; 228 *tokPtr = XML_TOK_XML_DECL; 229 return 1; 230} 231 232/* ptr points to character following "<?" */ 233 234static 235int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end, 236 const char **nextTokPtr) 237{ 238 int tok; 239 const char *target = ptr; 240 if (ptr == end) 241 return XML_TOK_PARTIAL; 242 switch (BYTE_TYPE(enc, ptr)) { 243 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 244 default: 245 *nextTokPtr = ptr; 246 return XML_TOK_INVALID; 247 } 248 while (ptr != end) { 249 switch (BYTE_TYPE(enc, ptr)) { 250 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 251 case BT_S: case BT_CR: case BT_LF: 252 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { 253 *nextTokPtr = ptr; 254 return XML_TOK_INVALID; 255 } 256 ptr += MINBPC; 257 while (ptr != end) { 258 switch (BYTE_TYPE(enc, ptr)) { 259 INVALID_CASES(ptr, nextTokPtr) 260 case BT_QUEST: 261 ptr += MINBPC; 262 if (ptr == end) 263 return XML_TOK_PARTIAL; 264 if (CHAR_MATCHES(enc, ptr, '>')) { 265 *nextTokPtr = ptr + MINBPC; 266 return tok; 267 } 268 break; 269 default: 270 ptr += MINBPC; 271 break; 272 } 273 } 274 return XML_TOK_PARTIAL; 275 case BT_QUEST: 276 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) { 277 *nextTokPtr = ptr; 278 return XML_TOK_INVALID; 279 } 280 ptr += MINBPC; 281 if (ptr == end) 282 return XML_TOK_PARTIAL; 283 if (CHAR_MATCHES(enc, ptr, '>')) { 284 *nextTokPtr = ptr + MINBPC; 285 return tok; 286 } 287 /* fall through */ 288 default: 289 *nextTokPtr = ptr; 290 return XML_TOK_INVALID; 291 } 292 } 293 return XML_TOK_PARTIAL; 294} 295 296 297static 298int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end, 299 const char **nextTokPtr) 300{ 301 int i; 302 /* CDATA[ */ 303 if (end - ptr < 6 * MINBPC) 304 return XML_TOK_PARTIAL; 305 for (i = 0; i < 6; i++, ptr += MINBPC) { 306 if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) { 307 *nextTokPtr = ptr; 308 return XML_TOK_INVALID; 309 } 310 } 311 *nextTokPtr = ptr; 312 return XML_TOK_CDATA_SECT_OPEN; 313} 314 315static 316int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end, 317 const char **nextTokPtr) 318{ 319 if (ptr == end) 320 return XML_TOK_NONE; 321#if MINBPC > 1 322 { 323 size_t n = end - ptr; 324 if (n & (MINBPC - 1)) { 325 n &= ~(MINBPC - 1); 326 if (n == 0) 327 return XML_TOK_PARTIAL; 328 end = ptr + n; 329 } 330 } 331#endif 332 switch (BYTE_TYPE(enc, ptr)) { 333 case BT_RSQB: 334 ptr += MINBPC; 335 if (ptr == end) 336 return XML_TOK_PARTIAL; 337 if (!CHAR_MATCHES(enc, ptr, ']')) 338 break; 339 ptr += MINBPC; 340 if (ptr == end) 341 return XML_TOK_PARTIAL; 342 if (!CHAR_MATCHES(enc, ptr, '>')) { 343 ptr -= MINBPC; 344 break; 345 } 346 *nextTokPtr = ptr + MINBPC; 347 return XML_TOK_CDATA_SECT_CLOSE; 348 case BT_CR: 349 ptr += MINBPC; 350 if (ptr == end) 351 return XML_TOK_PARTIAL; 352 if (BYTE_TYPE(enc, ptr) == BT_LF) 353 ptr += MINBPC; 354 *nextTokPtr = ptr; 355 return XML_TOK_DATA_NEWLINE; 356 case BT_LF: 357 *nextTokPtr = ptr + MINBPC; 358 return XML_TOK_DATA_NEWLINE; 359 INVALID_CASES(ptr, nextTokPtr) 360 default: 361 ptr += MINBPC; 362 break; 363 } 364 while (ptr != end) { 365 switch (BYTE_TYPE(enc, ptr)) { 366#define LEAD_CASE(n) \ 367 case BT_LEAD ## n: \ 368 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 369 *nextTokPtr = ptr; \ 370 return XML_TOK_DATA_CHARS; \ 371 } \ 372 ptr += n; \ 373 break; 374 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 375#undef LEAD_CASE 376 case BT_NONXML: 377 case BT_MALFORM: 378 case BT_TRAIL: 379 case BT_CR: 380 case BT_LF: 381 case BT_RSQB: 382 *nextTokPtr = ptr; 383 return XML_TOK_DATA_CHARS; 384 default: 385 ptr += MINBPC; 386 break; 387 } 388 } 389 *nextTokPtr = ptr; 390 return XML_TOK_DATA_CHARS; 391} 392 393/* ptr points to character following "</" */ 394 395static 396int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end, 397 const char **nextTokPtr) 398{ 399 if (ptr == end) 400 return XML_TOK_PARTIAL; 401 switch (BYTE_TYPE(enc, ptr)) { 402 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 403 default: 404 *nextTokPtr = ptr; 405 return XML_TOK_INVALID; 406 } 407 while (ptr != end) { 408 switch (BYTE_TYPE(enc, ptr)) { 409 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 410 case BT_S: case BT_CR: case BT_LF: 411 for (ptr += MINBPC; ptr != end; ptr += MINBPC) { 412 switch (BYTE_TYPE(enc, ptr)) { 413 case BT_S: case BT_CR: case BT_LF: 414 break; 415 case BT_GT: 416 *nextTokPtr = ptr + MINBPC; 417 return XML_TOK_END_TAG; 418 default: 419 *nextTokPtr = ptr; 420 return XML_TOK_INVALID; 421 } 422 } 423 return XML_TOK_PARTIAL; 424 case BT_GT: 425 *nextTokPtr = ptr + MINBPC; 426 return XML_TOK_END_TAG; 427 default: 428 *nextTokPtr = ptr; 429 return XML_TOK_INVALID; 430 } 431 } 432 return XML_TOK_PARTIAL; 433} 434 435/* ptr points to character following "&#X" */ 436 437static 438int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end, 439 const char **nextTokPtr) 440{ 441 if (ptr != end) { 442 switch (BYTE_TYPE(enc, ptr)) { 443 case BT_DIGIT: 444 case BT_HEX: 445 break; 446 default: 447 *nextTokPtr = ptr; 448 return XML_TOK_INVALID; 449 } 450 for (ptr += MINBPC; ptr != end; ptr += MINBPC) { 451 switch (BYTE_TYPE(enc, ptr)) { 452 case BT_DIGIT: 453 case BT_HEX: 454 break; 455 case BT_SEMI: 456 *nextTokPtr = ptr + MINBPC; 457 return XML_TOK_CHAR_REF; 458 default: 459 *nextTokPtr = ptr; 460 return XML_TOK_INVALID; 461 } 462 } 463 } 464 return XML_TOK_PARTIAL; 465} 466 467/* ptr points to character following "&#" */ 468 469static 470int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end, 471 const char **nextTokPtr) 472{ 473 if (ptr != end) { 474 if (CHAR_MATCHES(enc, ptr, 'x')) 475 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC, end, nextTokPtr); 476 switch (BYTE_TYPE(enc, ptr)) { 477 case BT_DIGIT: 478 break; 479 default: 480 *nextTokPtr = ptr; 481 return XML_TOK_INVALID; 482 } 483 for (ptr += MINBPC; ptr != end; ptr += MINBPC) { 484 switch (BYTE_TYPE(enc, ptr)) { 485 case BT_DIGIT: 486 break; 487 case BT_SEMI: 488 *nextTokPtr = ptr + MINBPC; 489 return XML_TOK_CHAR_REF; 490 default: 491 *nextTokPtr = ptr; 492 return XML_TOK_INVALID; 493 } 494 } 495 } 496 return XML_TOK_PARTIAL; 497} 498 499/* ptr points to character following "&" */ 500 501static 502int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end, 503 const char **nextTokPtr) 504{ 505 if (ptr == end) 506 return XML_TOK_PARTIAL; 507 switch (BYTE_TYPE(enc, ptr)) { 508 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 509 case BT_NUM: 510 return PREFIX(scanCharRef)(enc, ptr + MINBPC, end, nextTokPtr); 511 default: 512 *nextTokPtr = ptr; 513 return XML_TOK_INVALID; 514 } 515 while (ptr != end) { 516 switch (BYTE_TYPE(enc, ptr)) { 517 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 518 case BT_SEMI: 519 *nextTokPtr = ptr + MINBPC; 520 return XML_TOK_ENTITY_REF; 521 default: 522 *nextTokPtr = ptr; 523 return XML_TOK_INVALID; 524 } 525 } 526 return XML_TOK_PARTIAL; 527} 528 529/* ptr points to character following first character of attribute name */ 530 531static 532int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end, 533 const char **nextTokPtr) 534{ 535 while (ptr != end) { 536 switch (BYTE_TYPE(enc, ptr)) { 537 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 538 case BT_S: case BT_CR: case BT_LF: 539 for (;;) { 540 int t; 541 542 ptr += MINBPC; 543 if (ptr == end) 544 return XML_TOK_PARTIAL; 545 t = BYTE_TYPE(enc, ptr); 546 if (t == BT_EQUALS) 547 break; 548 switch (t) { 549 case BT_S: 550 case BT_LF: 551 case BT_CR: 552 break; 553 default: 554 *nextTokPtr = ptr; 555 return XML_TOK_INVALID; 556 } 557 } 558 /* fall through */ 559 case BT_EQUALS: 560 { 561 int open; 562 for (;;) { 563 564 ptr += MINBPC; 565 if (ptr == end) 566 return XML_TOK_PARTIAL; 567 open = BYTE_TYPE(enc, ptr); 568 if (open == BT_QUOT || open == BT_APOS) 569 break; 570 switch (open) { 571 case BT_S: 572 case BT_LF: 573 case BT_CR: 574 break; 575 default: 576 *nextTokPtr = ptr; 577 return XML_TOK_INVALID; 578 } 579 } 580 ptr += MINBPC; 581 /* in attribute value */ 582 for (;;) { 583 int t; 584 if (ptr == end) 585 return XML_TOK_PARTIAL; 586 t = BYTE_TYPE(enc, ptr); 587 if (t == open) 588 break; 589 switch (t) { 590 INVALID_CASES(ptr, nextTokPtr) 591 case BT_AMP: 592 { 593 int tok = PREFIX(scanRef)(enc, ptr + MINBPC, end, &ptr); 594 if (tok <= 0) { 595 if (tok == XML_TOK_INVALID) 596 *nextTokPtr = ptr; 597 return tok; 598 } 599 break; 600 } 601 case BT_LT: 602 *nextTokPtr = ptr; 603 return XML_TOK_INVALID; 604 default: 605 ptr += MINBPC; 606 break; 607 } 608 } 609 ptr += MINBPC; 610 if (ptr == end) 611 return XML_TOK_PARTIAL; 612 switch (BYTE_TYPE(enc, ptr)) { 613 case BT_S: 614 case BT_CR: 615 case BT_LF: 616 break; 617 case BT_SOL: 618 goto sol; 619 case BT_GT: 620 goto gt; 621 default: 622 *nextTokPtr = ptr; 623 return XML_TOK_INVALID; 624 } 625 /* ptr points to closing quote */ 626 for (;;) { 627 ptr += MINBPC; 628 if (ptr == end) 629 return XML_TOK_PARTIAL; 630 switch (BYTE_TYPE(enc, ptr)) { 631 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 632 case BT_S: case BT_CR: case BT_LF: 633 continue; 634 case BT_GT: 635 gt: 636 *nextTokPtr = ptr + MINBPC; 637 return XML_TOK_START_TAG_WITH_ATTS; 638 case BT_SOL: 639 sol: 640 ptr += MINBPC; 641 if (ptr == end) 642 return XML_TOK_PARTIAL; 643 if (!CHAR_MATCHES(enc, ptr, '>')) { 644 *nextTokPtr = ptr; 645 return XML_TOK_INVALID; 646 } 647 *nextTokPtr = ptr + MINBPC; 648 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS; 649 default: 650 *nextTokPtr = ptr; 651 return XML_TOK_INVALID; 652 } 653 break; 654 } 655 break; 656 } 657 default: 658 *nextTokPtr = ptr; 659 return XML_TOK_INVALID; 660 } 661 } 662 return XML_TOK_PARTIAL; 663} 664 665/* ptr points to character following "<" */ 666 667static 668int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end, 669 const char **nextTokPtr) 670{ 671 if (ptr == end) 672 return XML_TOK_PARTIAL; 673 switch (BYTE_TYPE(enc, ptr)) { 674 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 675 case BT_EXCL: 676 if ((ptr += MINBPC) == end) 677 return XML_TOK_PARTIAL; 678 switch (BYTE_TYPE(enc, ptr)) { 679 case BT_MINUS: 680 return PREFIX(scanComment)(enc, ptr + MINBPC, end, nextTokPtr); 681 case BT_LSQB: 682 return PREFIX(scanCdataSection)(enc, ptr + MINBPC, end, nextTokPtr); 683 } 684 *nextTokPtr = ptr; 685 return XML_TOK_INVALID; 686 case BT_QUEST: 687 return PREFIX(scanPi)(enc, ptr + MINBPC, end, nextTokPtr); 688 case BT_SOL: 689 return PREFIX(scanEndTag)(enc, ptr + MINBPC, end, nextTokPtr); 690 default: 691 *nextTokPtr = ptr; 692 return XML_TOK_INVALID; 693 } 694 /* we have a start-tag */ 695 while (ptr != end) { 696 switch (BYTE_TYPE(enc, ptr)) { 697 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 698 case BT_S: case BT_CR: case BT_LF: 699 { 700 ptr += MINBPC; 701 while (ptr != end) { 702 switch (BYTE_TYPE(enc, ptr)) { 703 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 704 case BT_GT: 705 goto gt; 706 case BT_SOL: 707 goto sol; 708 case BT_S: case BT_CR: case BT_LF: 709 ptr += MINBPC; 710 continue; 711 default: 712 *nextTokPtr = ptr; 713 return XML_TOK_INVALID; 714 } 715 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr); 716 } 717 return XML_TOK_PARTIAL; 718 } 719 case BT_GT: 720 gt: 721 *nextTokPtr = ptr + MINBPC; 722 return XML_TOK_START_TAG_NO_ATTS; 723 case BT_SOL: 724 sol: 725 ptr += MINBPC; 726 if (ptr == end) 727 return XML_TOK_PARTIAL; 728 if (!CHAR_MATCHES(enc, ptr, '>')) { 729 *nextTokPtr = ptr; 730 return XML_TOK_INVALID; 731 } 732 *nextTokPtr = ptr + MINBPC; 733 return XML_TOK_EMPTY_ELEMENT_NO_ATTS; 734 default: 735 *nextTokPtr = ptr; 736 return XML_TOK_INVALID; 737 } 738 } 739 return XML_TOK_PARTIAL; 740} 741 742static 743int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end, 744 const char **nextTokPtr) 745{ 746 if (ptr == end) 747 return XML_TOK_NONE; 748#if MINBPC > 1 749 { 750 size_t n = end - ptr; 751 if (n & (MINBPC - 1)) { 752 n &= ~(MINBPC - 1); 753 if (n == 0) 754 return XML_TOK_PARTIAL; 755 end = ptr + n; 756 } 757 } 758#endif 759 switch (BYTE_TYPE(enc, ptr)) { 760 case BT_LT: 761 return PREFIX(scanLt)(enc, ptr + MINBPC, end, nextTokPtr); 762 case BT_AMP: 763 return PREFIX(scanRef)(enc, ptr + MINBPC, end, nextTokPtr); 764 case BT_CR: 765 ptr += MINBPC; 766 if (ptr == end) 767 return XML_TOK_TRAILING_CR; 768 if (BYTE_TYPE(enc, ptr) == BT_LF) 769 ptr += MINBPC; 770 *nextTokPtr = ptr; 771 return XML_TOK_DATA_NEWLINE; 772 case BT_LF: 773 *nextTokPtr = ptr + MINBPC; 774 return XML_TOK_DATA_NEWLINE; 775 case BT_RSQB: 776 ptr += MINBPC; 777 if (ptr == end) 778 return XML_TOK_TRAILING_RSQB; 779 if (!CHAR_MATCHES(enc, ptr, ']')) 780 break; 781 ptr += MINBPC; 782 if (ptr == end) 783 return XML_TOK_TRAILING_RSQB; 784 if (!CHAR_MATCHES(enc, ptr, '>')) { 785 ptr -= MINBPC; 786 break; 787 } 788 *nextTokPtr = ptr; 789 return XML_TOK_INVALID; 790 INVALID_CASES(ptr, nextTokPtr) 791 default: 792 ptr += MINBPC; 793 break; 794 } 795 while (ptr != end) { 796 switch (BYTE_TYPE(enc, ptr)) { 797#define LEAD_CASE(n) \ 798 case BT_LEAD ## n: \ 799 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ 800 *nextTokPtr = ptr; \ 801 return XML_TOK_DATA_CHARS; \ 802 } \ 803 ptr += n; \ 804 break; 805 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 806#undef LEAD_CASE 807 case BT_RSQB: 808 if (ptr + MINBPC != end) { 809 if (!CHAR_MATCHES(enc, ptr + MINBPC, ']')) { 810 ptr += MINBPC; 811 break; 812 } 813 if (ptr + 2*MINBPC != end) { 814 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC, '>')) { 815 ptr += MINBPC; 816 break; 817 } 818 *nextTokPtr = ptr + 2*MINBPC; 819 return XML_TOK_INVALID; 820 } 821 } 822 /* fall through */ 823 case BT_AMP: 824 case BT_LT: 825 case BT_NONXML: 826 case BT_MALFORM: 827 case BT_TRAIL: 828 case BT_CR: 829 case BT_LF: 830 *nextTokPtr = ptr; 831 return XML_TOK_DATA_CHARS; 832 default: 833 ptr += MINBPC; 834 break; 835 } 836 } 837 *nextTokPtr = ptr; 838 return XML_TOK_DATA_CHARS; 839} 840 841/* ptr points to character following "%" */ 842 843static 844int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, 845 const char **nextTokPtr) 846{ 847 if (ptr == end) 848 return XML_TOK_PARTIAL; 849 switch (BYTE_TYPE(enc, ptr)) { 850 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 851 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: 852 *nextTokPtr = ptr; 853 return XML_TOK_PERCENT; 854 default: 855 *nextTokPtr = ptr; 856 return XML_TOK_INVALID; 857 } 858 while (ptr != end) { 859 switch (BYTE_TYPE(enc, ptr)) { 860 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 861 case BT_SEMI: 862 *nextTokPtr = ptr + MINBPC; 863 return XML_TOK_PARAM_ENTITY_REF; 864 default: 865 *nextTokPtr = ptr; 866 return XML_TOK_INVALID; 867 } 868 } 869 return XML_TOK_PARTIAL; 870} 871 872static 873int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, 874 const char **nextTokPtr) 875{ 876 if (ptr == end) 877 return XML_TOK_PARTIAL; 878 switch (BYTE_TYPE(enc, ptr)) { 879 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) 880 default: 881 *nextTokPtr = ptr; 882 return XML_TOK_INVALID; 883 } 884 while (ptr != end) { 885 switch (BYTE_TYPE(enc, ptr)) { 886 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 887 case BT_CR: case BT_LF: case BT_S: 888 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR: 889 *nextTokPtr = ptr; 890 return XML_TOK_POUND_NAME; 891 default: 892 *nextTokPtr = ptr; 893 return XML_TOK_INVALID; 894 } 895 } 896 return XML_TOK_PARTIAL; 897} 898 899static 900int PREFIX(scanLit)(int open, const ENCODING *enc, 901 const char *ptr, const char *end, 902 const char **nextTokPtr) 903{ 904 while (ptr != end) { 905 int t = BYTE_TYPE(enc, ptr); 906 switch (t) { 907 INVALID_CASES(ptr, nextTokPtr) 908 case BT_QUOT: 909 case BT_APOS: 910 ptr += MINBPC; 911 if (t != open) 912 break; 913 if (ptr == end) 914 return XML_TOK_PARTIAL; 915 *nextTokPtr = ptr; 916 switch (BYTE_TYPE(enc, ptr)) { 917 case BT_S: case BT_CR: case BT_LF: 918 case BT_GT: case BT_PERCNT: case BT_LSQB: 919 return XML_TOK_LITERAL; 920 default: 921 return XML_TOK_INVALID; 922 } 923 default: 924 ptr += MINBPC; 925 break; 926 } 927 } 928 return XML_TOK_PARTIAL; 929} 930 931static 932int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, 933 const char **nextTokPtr) 934{ 935 int tok; 936 if (ptr == end) 937 return XML_TOK_NONE; 938#if MINBPC > 1 939 { 940 size_t n = end - ptr; 941 if (n & (MINBPC - 1)) { 942 n &= ~(MINBPC - 1); 943 if (n == 0) 944 return XML_TOK_PARTIAL; 945 end = ptr + n; 946 } 947 } 948#endif 949 switch (BYTE_TYPE(enc, ptr)) { 950 case BT_QUOT: 951 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC, end, nextTokPtr); 952 case BT_APOS: 953 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC, end, nextTokPtr); 954 case BT_LT: 955 { 956 ptr += MINBPC; 957 if (ptr == end) 958 return XML_TOK_PARTIAL; 959 switch (BYTE_TYPE(enc, ptr)) { 960 case BT_EXCL: 961 return PREFIX(scanDecl)(enc, ptr + MINBPC, end, nextTokPtr); 962 case BT_QUEST: 963 return PREFIX(scanPi)(enc, ptr + MINBPC, end, nextTokPtr); 964 case BT_NMSTRT: 965 case BT_HEX: 966 case BT_NONASCII: 967 case BT_LEAD2: 968 case BT_LEAD3: 969 case BT_LEAD4: 970 *nextTokPtr = ptr - MINBPC; 971 return XML_TOK_INSTANCE_START; 972 } 973 *nextTokPtr = ptr; 974 return XML_TOK_INVALID; 975 } 976 case BT_CR: 977 if (ptr + MINBPC == end) 978 return XML_TOK_TRAILING_CR; 979 /* fall through */ 980 case BT_S: case BT_LF: 981 for (;;) { 982 ptr += MINBPC; 983 if (ptr == end) 984 break; 985 switch (BYTE_TYPE(enc, ptr)) { 986 case BT_S: case BT_LF: 987 break; 988 case BT_CR: 989 /* don't split CR/LF pair */ 990 if (ptr + MINBPC != end) 991 break; 992 /* fall through */ 993 default: 994 *nextTokPtr = ptr; 995 return XML_TOK_PROLOG_S; 996 } 997 } 998 *nextTokPtr = ptr; 999 return XML_TOK_PROLOG_S; 1000 case BT_PERCNT: 1001 return PREFIX(scanPercent)(enc, ptr + MINBPC, end, nextTokPtr); 1002 case BT_COMMA: 1003 *nextTokPtr = ptr + MINBPC; 1004 return XML_TOK_COMMA; 1005 case BT_LSQB: 1006 *nextTokPtr = ptr + MINBPC; 1007 return XML_TOK_OPEN_BRACKET; 1008 case BT_RSQB: 1009 ptr += MINBPC; 1010 if (ptr == end) 1011 return XML_TOK_PARTIAL; 1012 if (CHAR_MATCHES(enc, ptr, ']')) { 1013 if (ptr + MINBPC == end) 1014 return XML_TOK_PARTIAL; 1015 if (CHAR_MATCHES(enc, ptr + MINBPC, '>')) { 1016 *nextTokPtr = ptr + 2*MINBPC; 1017 return XML_TOK_COND_SECT_CLOSE; 1018 } 1019 } 1020 *nextTokPtr = ptr; 1021 return XML_TOK_CLOSE_BRACKET; 1022 case BT_LPAR: 1023 *nextTokPtr = ptr + MINBPC; 1024 return XML_TOK_OPEN_PAREN; 1025 case BT_RPAR: 1026 ptr += MINBPC; 1027 if (ptr == end) 1028 return XML_TOK_PARTIAL; 1029 switch (BYTE_TYPE(enc, ptr)) { 1030 case BT_AST: 1031 *nextTokPtr = ptr + MINBPC; 1032 return XML_TOK_CLOSE_PAREN_ASTERISK; 1033 case BT_QUEST: 1034 *nextTokPtr = ptr + MINBPC; 1035 return XML_TOK_CLOSE_PAREN_QUESTION; 1036 case BT_PLUS: 1037 *nextTokPtr = ptr + MINBPC; 1038 return XML_TOK_CLOSE_PAREN_PLUS; 1039 case BT_CR: case BT_LF: case BT_S: 1040 case BT_GT: case BT_COMMA: case BT_VERBAR: 1041 case BT_RPAR: 1042 *nextTokPtr = ptr; 1043 return XML_TOK_CLOSE_PAREN; 1044 } 1045 *nextTokPtr = ptr; 1046 return XML_TOK_INVALID; 1047 case BT_VERBAR: 1048 *nextTokPtr = ptr + MINBPC; 1049 return XML_TOK_OR; 1050 case BT_GT: 1051 *nextTokPtr = ptr + MINBPC; 1052 return XML_TOK_DECL_CLOSE; 1053 case BT_NUM: 1054 return PREFIX(scanPoundName)(enc, ptr + MINBPC, end, nextTokPtr); 1055#define LEAD_CASE(n) \ 1056 case BT_LEAD ## n: \ 1057 if (end - ptr < n) \ 1058 return XML_TOK_PARTIAL_CHAR; \ 1059 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ 1060 ptr += n; \ 1061 tok = XML_TOK_NAME; \ 1062 break; \ 1063 } \ 1064 if (IS_NAME_CHAR(enc, ptr, n)) { \ 1065 ptr += n; \ 1066 tok = XML_TOK_NMTOKEN; \ 1067 break; \ 1068 } \ 1069 *nextTokPtr = ptr; \ 1070 return XML_TOK_INVALID; 1071 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1072#undef LEAD_CASE 1073 case BT_NMSTRT: 1074 case BT_HEX: 1075 tok = XML_TOK_NAME; 1076 ptr += MINBPC; 1077 break; 1078 case BT_DIGIT: 1079 case BT_NAME: 1080 case BT_MINUS: 1081 tok = XML_TOK_NMTOKEN; 1082 ptr += MINBPC; 1083 break; 1084 case BT_NONASCII: 1085 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { 1086 ptr += MINBPC; 1087 tok = XML_TOK_NAME; 1088 break; 1089 } 1090 if (IS_NAME_CHAR_MINBPC(enc, ptr)) { 1091 ptr += MINBPC; 1092 tok = XML_TOK_NMTOKEN; 1093 break; 1094 } 1095 /* fall through */ 1096 default: 1097 *nextTokPtr = ptr; 1098 return XML_TOK_INVALID; 1099 } 1100 while (ptr != end) { 1101 switch (BYTE_TYPE(enc, ptr)) { 1102 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) 1103 case BT_GT: case BT_RPAR: case BT_COMMA: 1104 case BT_VERBAR: case BT_LSQB: case BT_PERCNT: 1105 case BT_S: case BT_CR: case BT_LF: 1106 *nextTokPtr = ptr; 1107 return tok; 1108 case BT_PLUS: 1109 if (tok != XML_TOK_NAME) { 1110 *nextTokPtr = ptr; 1111 return XML_TOK_INVALID; 1112 } 1113 *nextTokPtr = ptr + MINBPC; 1114 return XML_TOK_NAME_PLUS; 1115 case BT_AST: 1116 if (tok != XML_TOK_NAME) { 1117 *nextTokPtr = ptr; 1118 return XML_TOK_INVALID; 1119 } 1120 *nextTokPtr = ptr + MINBPC; 1121 return XML_TOK_NAME_ASTERISK; 1122 case BT_QUEST: 1123 if (tok != XML_TOK_NAME) { 1124 *nextTokPtr = ptr; 1125 return XML_TOK_INVALID; 1126 } 1127 *nextTokPtr = ptr + MINBPC; 1128 return XML_TOK_NAME_QUESTION; 1129 default: 1130 *nextTokPtr = ptr; 1131 return XML_TOK_INVALID; 1132 } 1133 } 1134 return XML_TOK_PARTIAL; 1135} 1136 1137static 1138int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end, 1139 const char **nextTokPtr) 1140{ 1141 const char *start; 1142 if (ptr == end) 1143 return XML_TOK_NONE; 1144 start = ptr; 1145 while (ptr != end) { 1146 switch (BYTE_TYPE(enc, ptr)) { 1147#define LEAD_CASE(n) \ 1148 case BT_LEAD ## n: ptr += n; break; 1149 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1150#undef LEAD_CASE 1151 case BT_AMP: 1152 if (ptr == start) 1153 return PREFIX(scanRef)(enc, ptr + MINBPC, end, nextTokPtr); 1154 *nextTokPtr = ptr; 1155 return XML_TOK_DATA_CHARS; 1156 case BT_LT: 1157 /* this is for inside entity references */ 1158 *nextTokPtr = ptr; 1159 return XML_TOK_INVALID; 1160 case BT_LF: 1161 if (ptr == start) { 1162 *nextTokPtr = ptr + MINBPC; 1163 return XML_TOK_DATA_NEWLINE; 1164 } 1165 *nextTokPtr = ptr; 1166 return XML_TOK_DATA_CHARS; 1167 case BT_CR: 1168 if (ptr == start) { 1169 ptr += MINBPC; 1170 if (ptr == end) 1171 return XML_TOK_TRAILING_CR; 1172 if (BYTE_TYPE(enc, ptr) == BT_LF) 1173 ptr += MINBPC; 1174 *nextTokPtr = ptr; 1175 return XML_TOK_DATA_NEWLINE; 1176 } 1177 *nextTokPtr = ptr; 1178 return XML_TOK_DATA_CHARS; 1179 case BT_S: 1180 if (ptr == start) { 1181 *nextTokPtr = ptr + MINBPC; 1182 return XML_TOK_ATTRIBUTE_VALUE_S; 1183 } 1184 *nextTokPtr = ptr; 1185 return XML_TOK_DATA_CHARS; 1186 default: 1187 ptr += MINBPC; 1188 break; 1189 } 1190 } 1191 *nextTokPtr = ptr; 1192 return XML_TOK_DATA_CHARS; 1193} 1194 1195static 1196int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end, 1197 const char **nextTokPtr) 1198{ 1199 const char *start; 1200 if (ptr == end) 1201 return XML_TOK_NONE; 1202 start = ptr; 1203 while (ptr != end) { 1204 switch (BYTE_TYPE(enc, ptr)) { 1205#define LEAD_CASE(n) \ 1206 case BT_LEAD ## n: ptr += n; break; 1207 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1208#undef LEAD_CASE 1209 case BT_AMP: 1210 if (ptr == start) 1211 return PREFIX(scanRef)(enc, ptr + MINBPC, end, nextTokPtr); 1212 *nextTokPtr = ptr; 1213 return XML_TOK_DATA_CHARS; 1214 case BT_PERCNT: 1215 if (ptr == start) 1216 return PREFIX(scanPercent)(enc, ptr + MINBPC, end, nextTokPtr); 1217 *nextTokPtr = ptr; 1218 return XML_TOK_DATA_CHARS; 1219 case BT_LF: 1220 if (ptr == start) { 1221 *nextTokPtr = ptr + MINBPC; 1222 return XML_TOK_DATA_NEWLINE; 1223 } 1224 *nextTokPtr = ptr; 1225 return XML_TOK_DATA_CHARS; 1226 case BT_CR: 1227 if (ptr == start) { 1228 ptr += MINBPC; 1229 if (ptr == end) 1230 return XML_TOK_TRAILING_CR; 1231 if (BYTE_TYPE(enc, ptr) == BT_LF) 1232 ptr += MINBPC; 1233 *nextTokPtr = ptr; 1234 return XML_TOK_DATA_NEWLINE; 1235 } 1236 *nextTokPtr = ptr; 1237 return XML_TOK_DATA_CHARS; 1238 default: 1239 ptr += MINBPC; 1240 break; 1241 } 1242 } 1243 *nextTokPtr = ptr; 1244 return XML_TOK_DATA_CHARS; 1245} 1246 1247static 1248int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end, 1249 const char **badPtr) 1250{ 1251 ptr += MINBPC; 1252 end -= MINBPC; 1253 for (; ptr != end; ptr += MINBPC) { 1254 switch (BYTE_TYPE(enc, ptr)) { 1255 case BT_DIGIT: 1256 case BT_HEX: 1257 case BT_MINUS: 1258 case BT_APOS: 1259 case BT_LPAR: 1260 case BT_RPAR: 1261 case BT_PLUS: 1262 case BT_COMMA: 1263 case BT_SOL: 1264 case BT_EQUALS: 1265 case BT_QUEST: 1266 case BT_CR: 1267 case BT_LF: 1268 case BT_SEMI: 1269 case BT_EXCL: 1270 case BT_AST: 1271 case BT_PERCNT: 1272 case BT_NUM: 1273 break; 1274 case BT_S: 1275 if (CHAR_MATCHES(enc, ptr, '\t')) { 1276 *badPtr = ptr; 1277 return 0; 1278 } 1279 break; 1280 case BT_NAME: 1281 case BT_NMSTRT: 1282 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f)) 1283 break; 1284 default: 1285 switch (BYTE_TO_ASCII(enc, ptr)) { 1286 case 0x24: /* $ */ 1287 case 0x40: /* @ */ 1288 break; 1289 default: 1290 *badPtr = ptr; 1291 return 0; 1292 } 1293 break; 1294 } 1295 } 1296 return 1; 1297} 1298 1299/* This must only be called for a well-formed start-tag or empty element tag. 1300Returns the number of attributes. Pointers to the first attsMax attributes 1301are stored in atts. */ 1302 1303static 1304int PREFIX(getAtts)(const ENCODING *enc, const char *ptr, 1305 int attsMax, ATTRIBUTE *atts) 1306{ 1307 enum { other, inName, inValue } state = inName; 1308 int nAtts = 0; 1309 int open = 0; 1310 1311 for (ptr += MINBPC;; ptr += MINBPC) { 1312 switch (BYTE_TYPE(enc, ptr)) { 1313#define START_NAME \ 1314 if (state == other) { \ 1315 if (nAtts < attsMax) { \ 1316 atts[nAtts].name = ptr; \ 1317 atts[nAtts].normalized = 1; \ 1318 } \ 1319 state = inName; \ 1320 } 1321#define LEAD_CASE(n) \ 1322 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC); break; 1323 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1324#undef LEAD_CASE 1325 case BT_NONASCII: 1326 case BT_NMSTRT: 1327 case BT_HEX: 1328 START_NAME 1329 break; 1330#undef START_NAME 1331 case BT_QUOT: 1332 if (state != inValue) { 1333 atts[nAtts].valuePtr = ptr + MINBPC; 1334 state = inValue; 1335 open = BT_QUOT; 1336 } 1337 else if (open == BT_QUOT) { 1338 state = other; 1339 atts[nAtts++].valueEnd = ptr; 1340 } 1341 break; 1342 case BT_APOS: 1343 if (state != inValue) { 1344 atts[nAtts].valuePtr = ptr + MINBPC; 1345 state = inValue; 1346 open = BT_APOS; 1347 } 1348 else if (open == BT_APOS) { 1349 state = other; 1350 atts[nAtts++].valueEnd = ptr; 1351 } 1352 break; 1353 case BT_AMP: 1354 atts[nAtts].normalized = 0; 1355 break; 1356 case BT_S: 1357 if (state == inName) 1358 state = other; 1359 else if (state == inValue 1360 && atts[nAtts].normalized 1361 && (ptr == atts[nAtts].valuePtr 1362 || BYTE_TO_ASCII(enc, ptr) != ' ' 1363 || BYTE_TO_ASCII(enc, ptr + MINBPC) == ' ' 1364 || BYTE_TYPE(enc, ptr + MINBPC) == open)) 1365 atts[nAtts].normalized = 0; 1366 break; 1367 case BT_CR: case BT_LF: 1368 /* This case ensures that the first attribute name is counted 1369 Apart from that we could just change state on the quote. */ 1370 if (state == inName) 1371 state = other; 1372 else if (state == inValue) 1373 atts[nAtts].normalized = 0; 1374 break; 1375 case BT_GT: 1376 case BT_SOL: 1377 if (state != inValue) 1378 return nAtts; 1379 break; 1380 default: 1381 break; 1382 } 1383 } 1384 /* not reached */ 1385} 1386 1387static 1388int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) 1389{ 1390 int result = 0; 1391 /* skip &# */ 1392 ptr += 2*MINBPC; 1393 if (CHAR_MATCHES(enc, ptr, 'x')) { 1394 for (ptr += MINBPC; !CHAR_MATCHES(enc, ptr, ';'); ptr += MINBPC) { 1395 int c = BYTE_TO_ASCII(enc, ptr); 1396 switch (c) { 1397 case '0': case '1': case '2': case '3': case '4': 1398 case '5': case '6': case '7': case '8': case '9': 1399 result <<= 4; 1400 result |= (c - '0'); 1401 break; 1402 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1403 result <<= 4; 1404 result += 10 + (c - 'A'); 1405 break; 1406 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1407 result <<= 4; 1408 result += 10 + (c - 'a'); 1409 break; 1410 } 1411 if (result >= 0x110000) 1412 return -1; 1413 } 1414 } 1415 else { 1416 for (; !CHAR_MATCHES(enc, ptr, ';'); ptr += MINBPC) { 1417 int c = BYTE_TO_ASCII(enc, ptr); 1418 result *= 10; 1419 result += (c - '0'); 1420 if (result >= 0x110000) 1421 return -1; 1422 } 1423 } 1424 return checkCharRefNumber(result); 1425} 1426 1427static 1428int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end) 1429{ 1430 switch (end - ptr) { 1431 case 2 * MINBPC: 1432 if (CHAR_MATCHES(enc, ptr + MINBPC, 't')) { 1433 switch (BYTE_TO_ASCII(enc, ptr)) { 1434 case 'l': 1435 return '<'; 1436 case 'g': 1437 return '>'; 1438 } 1439 } 1440 break; 1441 case 3 * MINBPC: 1442 if (CHAR_MATCHES(enc, ptr, 'a')) { 1443 ptr += MINBPC; 1444 if (CHAR_MATCHES(enc, ptr, 'm')) { 1445 ptr += MINBPC; 1446 if (CHAR_MATCHES(enc, ptr, 'p')) 1447 return '&'; 1448 } 1449 } 1450 break; 1451 case 4 * MINBPC: 1452 switch (BYTE_TO_ASCII(enc, ptr)) { 1453 case 'q': 1454 ptr += MINBPC; 1455 if (CHAR_MATCHES(enc, ptr, 'u')) { 1456 ptr += MINBPC; 1457 if (CHAR_MATCHES(enc, ptr, 'o')) { 1458 ptr += MINBPC; 1459 if (CHAR_MATCHES(enc, ptr, 't')) 1460 return '"'; 1461 } 1462 } 1463 break; 1464 case 'a': 1465 ptr += MINBPC; 1466 if (CHAR_MATCHES(enc, ptr, 'p')) { 1467 ptr += MINBPC; 1468 if (CHAR_MATCHES(enc, ptr, 'o')) { 1469 ptr += MINBPC; 1470 if (CHAR_MATCHES(enc, ptr, 's')) 1471 return '\''; 1472 } 1473 } 1474 break; 1475 } 1476 } 1477 return 0; 1478} 1479 1480static 1481int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) 1482{ 1483 for (;;) { 1484 switch (BYTE_TYPE(enc, ptr1)) { 1485#define LEAD_CASE(n) \ 1486 case BT_LEAD ## n: \ 1487 if (*ptr1++ != *ptr2++) \ 1488 return 0; 1489 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2) 1490#undef LEAD_CASE 1491 /* fall through */ 1492 if (*ptr1++ != *ptr2++) 1493 return 0; 1494 break; 1495 case BT_NONASCII: 1496 case BT_NMSTRT: 1497 case BT_HEX: 1498 case BT_DIGIT: 1499 case BT_NAME: 1500 case BT_MINUS: 1501 if (*ptr2++ != *ptr1++) 1502 return 0; 1503#if MINBPC > 1 1504 if (*ptr2++ != *ptr1++) 1505 return 0; 1506#if MINBPC > 2 1507 if (*ptr2++ != *ptr1++) 1508 return 0; 1509#if MINBPC > 3 1510 if (*ptr2++ != *ptr1++) 1511 return 0; 1512#endif 1513#endif 1514#endif 1515 break; 1516 default: 1517#if MINBPC == 1 1518 if (*ptr1 == *ptr2) 1519 return 1; 1520#endif 1521 switch (BYTE_TYPE(enc, ptr2)) { 1522 case BT_LEAD2: 1523 case BT_LEAD3: 1524 case BT_LEAD4: 1525 case BT_NONASCII: 1526 case BT_NMSTRT: 1527 case BT_HEX: 1528 case BT_DIGIT: 1529 case BT_NAME: 1530 case BT_MINUS: 1531 return 0; 1532 default: 1533 return 1; 1534 } 1535 } 1536 } 1537 /* not reached */ 1538} 1539 1540static 1541int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1, const char *ptr2) 1542{ 1543 for (; *ptr2; ptr1 += MINBPC, ptr2++) { 1544 if (!CHAR_MATCHES(end, ptr1, *ptr2)) 1545 return 0; 1546 } 1547 switch (BYTE_TYPE(enc, ptr1)) { 1548 case BT_LEAD2: 1549 case BT_LEAD3: 1550 case BT_LEAD4: 1551 case BT_NONASCII: 1552 case BT_NMSTRT: 1553 case BT_HEX: 1554 case BT_DIGIT: 1555 case BT_NAME: 1556 case BT_MINUS: 1557 return 0; 1558 default: 1559 return 1; 1560 } 1561} 1562 1563static 1564int PREFIX(nameLength)(const ENCODING *enc, const char *ptr) 1565{ 1566 const char *start = ptr; 1567 for (;;) { 1568 switch (BYTE_TYPE(enc, ptr)) { 1569#define LEAD_CASE(n) \ 1570 case BT_LEAD ## n: ptr += n; break; 1571 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1572#undef LEAD_CASE 1573 case BT_NONASCII: 1574 case BT_NMSTRT: 1575 case BT_HEX: 1576 case BT_DIGIT: 1577 case BT_NAME: 1578 case BT_MINUS: 1579 ptr += MINBPC; 1580 break; 1581 default: 1582 return ptr - start; 1583 } 1584 } 1585} 1586 1587static 1588const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr) 1589{ 1590 for (;;) { 1591 switch (BYTE_TYPE(enc, ptr)) { 1592 case BT_LF: 1593 case BT_CR: 1594 case BT_S: 1595 ptr += MINBPC; 1596 break; 1597 default: 1598 return ptr; 1599 } 1600 } 1601} 1602 1603static 1604void PREFIX(updatePosition)(const ENCODING *enc, 1605 const char *ptr, 1606 const char *end, 1607 POSITION *pos) 1608{ 1609 while (ptr != end) { 1610 switch (BYTE_TYPE(enc, ptr)) { 1611#define LEAD_CASE(n) \ 1612 case BT_LEAD ## n: \ 1613 ptr += n; \ 1614 break; 1615 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) 1616#undef LEAD_CASE 1617 case BT_LF: 1618 pos->columnNumber = (unsigned)-1; 1619 pos->lineNumber++; 1620 ptr += MINBPC; 1621 break; 1622 case BT_CR: 1623 pos->lineNumber++; 1624 ptr += MINBPC; 1625 if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF) 1626 ptr += MINBPC; 1627 pos->columnNumber = (unsigned)-1; 1628 break; 1629 default: 1630 ptr += MINBPC; 1631 break; 1632 } 1633 pos->columnNumber++; 1634 } 1635} 1636 1637#undef DO_LEAD_CASE 1638#undef MULTIBYTE_CASES 1639#undef INVALID_CASES 1640#undef CHECK_NAME_CASE 1641#undef CHECK_NAME_CASES 1642#undef CHECK_NMSTRT_CASE 1643#undef CHECK_NMSTRT_CASES 1644