/*
 * Copyright (c) 2012, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jdk.internal.util.xml.impl;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import jdk.internal.org.xml.sax.InputSource;
import jdk.internal.org.xml.sax.SAXException;

/**
 * XML non-validating parser engine.
 */
public abstract class Parser {

    /** Message handed to {@code panic(String)} wherever this class gives up. */
    public static final String FAULT = "";
    /** Size passed to {@code new Input(int)} when an external input is pushed. */
    protected static final int BUFFSIZE_READER = 512;
    /** Initial length of the parser buffer {@link #mBuff}. */
    protected static final int BUFFSIZE_PARSER = 128;
    /**
     * The end of stream character.
     */
    public static final char EOS = 0xffff;
    private Pair mNoNS; // there is no namespace
    private Pair mXml;  // the xml namespace
    private Map<String, Input> mEnt;  // the entities look up table
    private Map<String, Input> mPEnt; // the parameter entities look up table
    protected boolean mIsSAlone;     // xml decl standalone flag
    protected boolean mIsSAloneSet;  // standalone is explicitly set
    protected boolean mIsNSAware;    // if true - namespace aware mode
    protected int mPh;  // current phase of document processing
    protected static final int PH_BEFORE_DOC = -1;  // before parsing
    protected static final int PH_DOC_START = 0;    // document start
    protected static final int PH_MISC_DTD = 1;     // misc before DTD
    protected static final int PH_DTD = 2;          // DTD
    protected static final int PH_DTD_MISC = 3;     // misc after DTD
    protected static final int PH_DOCELM = 4;       // document's element
    protected static final int PH_DOCELM_MISC = 5;  // misc after element
    protected static final int PH_AFTER_DOC = 6;    // after parsing
    protected int mEvt;  // current event type
    protected static final int EV_NULL = 0;   // unknown
    protected static final int EV_ELM = 1;    // empty element
    protected static final int EV_ELMS = 2;   // start element
    protected static final int EV_ELME = 3;   // end element
    protected static final int EV_TEXT = 4;   // textual content
    protected static final int EV_WSPC = 5;   // white space content
    protected static final int EV_PI = 6;     // processing instruction
    protected static final int EV_CDAT = 7;   // character data
    protected static final int EV_COMM = 8;   // comment
    protected static final int EV_DTD = 9;    // document type definition
    protected static final int EV_ENT = 10;   // skipped entity
    private char mESt; // built-in entity recognizer state
    // mESt values:
    //   0x100   : the initial state
    //   > 0x100 : unrecognized name
    //   < 0x100 : replacement character
    protected char[] mBuff;  // parser buffer
    protected int mBuffIdx;  // index of the last char
    protected Pair mPref;    // stack of prefixes
    protected Pair mElm;     // stack of elements
    // mAttL.chars - element qname
    // mAttL.next  - next element
    // mAttL.list  - list of attributes defined on this element
    // mAttL.list.chars - attribute qname
    // mAttL.list.id    - a char representing attribute's type see below
    // mAttL.list.next  - next attribute defined on the element
    // mAttL.list.list  - default value structure or null
    // mAttL.list.list.chars - "name='value' " chars array for Input
    //
    // Attribute type character values:
    // 'i' - "ID"
    // 'r' - "IDREF"
    // 'R' - "IDREFS"
    // 'n' - "ENTITY"
    // 'N' - "ENTITIES"
    // 't' - "NMTOKEN"
    // 'T' - "NMTOKENS"
    // 'u' - enumeration type
    // 'o' - "NOTATION"
    // 'c' - "CDATA"
    // see also: bkeyword() and atype()
    //
    protected Pair mAttL;    // list of defined attrs by element name
    protected Input mDoc;    // document entity
    protected Input mInp;    // stack of entities
    private char[] mChars;   // reading buffer
    private int mChLen;      // current capacity
    private int mChIdx;      // index to the next char
    protected Attrs mAttrs;  // attributes of the curr. element
    private String[] mItems; // attributes array of the curr. element
    private char mAttrIdx;   // attributes counter/index
    private String mUnent;   // unresolved entity name
    private Pair mDltd;      // deleted objects for reuse
    /**
     * Default prefixes
     */
    // In each of the three arrays below slot 0 carries a number rather than a
    // name character (0 for NONS, 4 for XML, 6 for XMLNS); the name characters
    // start at index 1.
    // NOTE(review): the meaning of slot 0 is defined by the code that builds
    // and reads qname arrays, which is not part of this section - confirm there.
    private static final char NONS[];
    private static final char XML[];
    private static final char XMLNS[];

    static {
        NONS = new char[1];
        NONS[0] = (char) 0;

        XML = new char[4];
        XML[0] = (char) 4;
        XML[1] = 'x';
        XML[2] = 'm';
        XML[3] = 'l';

        XMLNS = new char[6];
        XMLNS[0] = (char) 6;
        XMLNS[1] = 'x';
        XMLNS[2] = 'm';
        XMLNS[3] = 'l';
        XMLNS[4] = 'n';
        XMLNS[5] = 's';
    }
    /**
     * ASCII character type array.
     *
     * This array maps an ASCII (7 bit) character to the character type.<br>
     * Possible character type values are:<br> - ' ' for any kind of white
     * space character;<br> - 'a' for any lower case alphabetical character
     * value;<br> - 'A' for any upper case alphabetical character value;<br>
     * - 'd' for any decimal digit character value;<br> - 'z' for any
     * character less than ' ' except '\t', '\n', '\r';<br> An ASCII (7 bit)
     * character which does not fall in any category listed above is mapped to
     * itself.
     */
    private static final byte asctyp[];
    /**
     * NMTOKEN character type array.
     *
     * This array maps an ASCII (7 bit) character to the character type.<br>
     * Possible character type values are:<br> - 0 for underscore ('_') or any
     * lower and upper case alphabetical character value;<br> - 1 for colon
     * (':') character;<br> - 2 for dash ('-') and dot ('.') or any decimal
     * digit character value;<br> - 3 for any kind of white space character<br>
     * An ASCII (7 bit) character which does not fall in any category listed
     * above is mapped to 0xff.
     */
    private static final byte nmttyp[];

    /**
     * Static constructor.
     *
     * Sets up the ASCII character type array {@link #asctyp asctyp} and the
     * NMTOKEN character type array {@link #nmttyp nmttyp}. Both arrays have
     * 0x80 entries, one per 7 bit character.
     */
    static {
        // A single running index walks the ASCII range from 0 upwards; each
        // loop below picks up where the previous one stopped.
        short i = 0;

        asctyp = new byte[0x80];
        while (i < ' ') {
            asctyp[i++] = (byte) 'z';
        }
        // The three white space controls overwrite the 'z' written above.
        asctyp['\t'] = (byte) ' ';
        asctyp['\r'] = (byte) ' ';
        asctyp['\n'] = (byte) ' ';
        while (i < '0') {
            // The array index is evaluated before the right-hand side, so the
            // slot for the current i receives i itself and i advances after.
            asctyp[i] = (byte) i++;
        }
        while (i <= '9') {
            asctyp[i++] = (byte) 'd';
        }
        while (i < 'A') {
            asctyp[i] = (byte) i++;
        }
        while (i <= 'Z') {
            asctyp[i++] = (byte) 'A';
        }
        while (i < 'a') {
            asctyp[i] = (byte) i++;
        }
        while (i <= 'z') {
            asctyp[i++] = (byte) 'a';
        }
        while (i < 0x80) {
            asctyp[i] = (byte) i++;
        }

        nmttyp = new byte[0x80];
        for (i = 0; i < '0'; i++) {
            nmttyp[i] = (byte) 0xff;
        }
        while (i <= '9') {
            nmttyp[i++] = (byte) 2;  // digits
        }
        while (i < 'A') {
            nmttyp[i++] = (byte) 0xff;
        }
        // skipped upper case alphabetical characters are already 0
        for (i = '['; i < 'a'; i++) {
            nmttyp[i] = (byte) 0xff;
        }
        // skipped lower case alphabetical characters are already 0
        for (i = '{'; i < 0x80; i++) {
            nmttyp[i] = (byte) 0xff;
        }
        // Individual overrides of the 0xff defaults written by the loops above.
        nmttyp['_'] = 0;
        nmttyp[':'] = 1;
        nmttyp['.'] = 2;
        nmttyp['-'] = 2;
        nmttyp[' '] = 3;
        nmttyp['\t'] = 3;
        nmttyp['\r'] = 3;
        nmttyp['\n'] = 3;
    }

    /**
     * Constructor.
239 */ 240 protected Parser() { 241 mPh = PH_BEFORE_DOC; // before parsing 242 243 // Initialize the parser 244 mBuff = new char[BUFFSIZE_PARSER]; 245 mAttrs = new Attrs(); 246 247 // Default namespace 248 mPref = pair(mPref); 249 mPref.name = ""; 250 mPref.value = ""; 251 mPref.chars = NONS; 252 mNoNS = mPref; // no namespace 253 // XML namespace 254 mPref = pair(mPref); 255 mPref.name = "xml"; 256 mPref.value = "http://www.w3.org/XML/1998/namespace"; 257 mPref.chars = XML; 258 mXml = mPref; // XML namespace 259 } 260 261 /** 262 * Initializes parser's internals. Note, current input has to be set before 263 * this method is called. 264 */ 265 protected void init() { 266 mUnent = null; 267 mElm = null; 268 mPref = mXml; 269 mAttL = null; 270 mPEnt = new HashMap<>(); 271 mEnt = new HashMap<>(); 272 mDoc = mInp; // current input is document entity 273 mChars = mInp.chars; // use document entity buffer 274 mPh = PH_DOC_START; // the begining of the document 275 } 276 277 /** 278 * Cleans up parser internal resources. 279 */ 280 protected void cleanup() { 281 // Default attributes 282 while (mAttL != null) { 283 while (mAttL.list != null) { 284 if (mAttL.list.list != null) { 285 del(mAttL.list.list); 286 } 287 mAttL.list = del(mAttL.list); 288 } 289 mAttL = del(mAttL); 290 } 291 // Element stack 292 while (mElm != null) { 293 mElm = del(mElm); 294 } 295 // Namespace prefixes 296 while (mPref != mXml) { 297 mPref = del(mPref); 298 } 299 // Inputs 300 while (mInp != null) { 301 pop(); 302 } 303 // Document reader 304 if ((mDoc != null) && (mDoc.src != null)) { 305 try { 306 mDoc.src.close(); 307 } catch (IOException ioe) { 308 } 309 } 310 mPEnt = null; 311 mEnt = null; 312 mDoc = null; 313 mPh = PH_AFTER_DOC; // before documnet processing 314 } 315 316 /** 317 * Processes a portion of document. This method returns one of EV_* 318 * constants as an identifier of the portion of document have been read. 319 * 320 * @return Identifier of processed document portion. 
* @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    protected int step() throws Exception {
        // Small state machine that runs until an event type has been stored
        // in mEvt:
        //   st == 0 : dispatch on markup ('<' followed by '/', '!', '?' or a name)
        //   st == 1 : accumulate leading white space
        //   st == 2 : accumulate text content
        mEvt = EV_NULL;
        int st = 0;
        while (mEvt == EV_NULL) {
            // Take the next char straight from the reading buffer while one
            // is available there, otherwise go through getch().
            char ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
            switch (st) {
                case 0:     // all sorts of markup (dispatcher)
                    if (ch != '<') {
                        bkch();
                        mBuffIdx = -1;  // clean parser buffer
                        st = 1;
                        break;
                    }
                    switch (getch()) {
                        case '/':   // the end of the element content
                            mEvt = EV_ELME;
                            if (mElm == null) {
                                panic(FAULT);
                            }
                            // Check element's open/close tags balance
                            mBuffIdx = -1;  // clean parser buffer
                            bname(mIsNSAware);
                            char[] chars = mElm.chars;
                            // The comparison starts at index 1; slot 0 of
                            // both arrays is not compared.
                            if (chars.length == (mBuffIdx + 1)) {
                                for (char i = 1; i <= mBuffIdx; i += 1) {
                                    if (chars[i] != mBuff[i]) {
                                        panic(FAULT);
                                    }
                                }
                            } else {
                                panic(FAULT);
                            }
                            // Skip white spaces before '>'
                            if (wsskip() != '>') {
                                panic(FAULT);
                            }
                            getch();  // read '>'
                            break;

                        case '!':   // a comment or a CDATA
                            // Peek at the next char and push it back so the
                            // called method sees it again.
                            ch = getch();
                            bkch();
                            switch (ch) {
                                case '-':   // must be a comment
                                    mEvt = EV_COMM;
                                    comm();
                                    break;

                                case '[':   // must be a CDATA section
                                    mEvt = EV_CDAT;
                                    cdat();
                                    break;

                                default:    // must be 'DOCTYPE'
                                    mEvt = EV_DTD;
                                    dtd();
                                    break;
                            }
                            break;

                        case '?':   // processing instruction
                            mEvt = EV_PI;
                            pi();
                            break;

                        default:    // must be the first char of an xml name
                            bkch();
                            // Read an element name and put it on top of the
                            // element stack
                            mElm = pair(mElm);  // add new element to the stack
                            mElm.chars = qname(mIsNSAware);
                            mElm.name = mElm.local();
                            mElm.id = (mElm.next != null) ? mElm.next.id : 0;  // flags
                            mElm.num = 0;  // namespace counter
                            // Find the list of defined attributes of the current
                            // element
                            Pair elm = find(mAttL, mElm.chars);
                            mElm.list = (elm != null) ? elm.list : null;
                            // Read attributes till the end of the element tag
                            mAttrIdx = 0;
                            Pair att = pair(null);
                            att.num = 0;  // clear attribute's flags
                            attr(att);    // get all attributes inc. defaults
                            del(att);
                            mElm.value = (mIsNSAware) ? rslv(mElm.chars) : null;
                            // Skip white spaces before '>'
                            switch (wsskip()) {
                                case '>':
                                    getch();  // read '>'
                                    mEvt = EV_ELMS;
                                    break;

                                case '/':
                                    getch();  // read '/'
                                    if (getch() != '>') // read '>'
                                    {
                                        panic(FAULT);
                                    }
                                    mEvt = EV_ELM;
                                    break;

                                default:
                                    panic(FAULT);
                            }
                            break;
                    }
                    break;

                case 1:     // read white space
                    switch (ch) {
                        case ' ':
                        case '\t':
                        case '\n':
                            bappend(ch);
                            break;

                        case '\r':  // EOL processing [#2.11]
                            // "\r\n" and a lone "\r" are both appended as '\n'.
                            if (getch() != '\n') {
                                bkch();
                            }
                            bappend('\n');
                            break;

                        case '<':
                            mEvt = EV_WSPC;
                            bkch();
                            bflash_ws();
                            break;

                        default:
                            // Not white space after all: re-read this char as text.
                            bkch();
                            st = 2;
                            break;
                    }
                    break;

                case 2:     // read the text content of the element
                    switch (ch) {
                        case '&':
                            if (mUnent == null) {
                                // There was no unresolved entity on previous step.
                                if ((mUnent = ent('x')) != null) {
                                    mEvt = EV_TEXT;
                                    bkch();      // move back to ';' after entity name
                                    setch('&');  // parser must be back on next step
                                    bflash();
                                }
                            } else {
                                // There was unresolved entity on previous step.
                                mEvt = EV_ENT;
                                skippedEnt(mUnent);
                                mUnent = null;
                            }
                            break;

                        case '<':
                            mEvt = EV_TEXT;
                            bkch();
                            bflash();
                            break;

                        case '\r':  // EOL processing [#2.11]
                            if (getch() != '\n') {
                                bkch();
                            }
                            bappend('\n');
                            break;

                        case EOS:
                            // No break here: if panic() returns, control falls
                            // through into the default branch.
                            panic(FAULT);

                        default:
                            bappend(ch);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }

        return mEvt;
    }

    /**
     * Parses the document type declaration.
     *
     * The method first reads the 'DOCTYPE' keyword and sets the phase to
     * <code>PH_DTD</code>, then runs a state machine: 0 - document type
     * name; 1 - external identifier, '[' or '>'; 2 - white space before the
     * internal subset; 3 - white space before the closing '>'. The document
     * type is reported through <code>docType</code>. When an external
     * identifier was read, <code>resolveEnt</code> is asked for the external
     * subset on the closing '>'.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    private void dtd() throws Exception {
        char ch;
        // NOTE(review): str is never read or written again in this method.
        String str = null;
        String name = null;
        Pair psid = null;
        // read 'DOCTYPE'
        if ("DOCTYPE".equals(name(false)) != true) {
            panic(FAULT);
        }
        mPh = PH_DTD;  // DTD
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0:     // read the document type name
                    if (chtyp(ch) != ' ') {
                        bkch();
                        name = name(mIsNSAware);
                        wsskip();
                        st = 1;  // read 'PUBLIC' or 'SYSTEM'
                    }
                    break;

                case 1:     // read 'PUBLIC' or 'SYSTEM'
                    switch (chtyp(ch)) {
                        case 'A':
                            bkch();
                            psid = pubsys(' ');
                            st = 2;  // skip spaces before internal subset
                            docType(name, psid.name, psid.value);
                            break;

                        case '[':
                            bkch();
                            st = 2;  // skip spaces before internal subset
                            docType(name, null, null);
                            break;

                        case '>':
                            bkch();
                            st = 3;  // skip spaces after internal subset
                            docType(name, null, null);
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 2:     // skip spaces before internal subset
                    switch (chtyp(ch)) {
                        case '[':
                            // Process internal subset
                            dtdsub();
                            st = 3;  // skip spaces after internal subset
                            break;

                        case '>':
                            // There is no internal subset
                            bkch();
                            st = 3;  // skip spaces after internal subset
                            break;

                        case ' ':
                            // skip white spaces
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 3:     // skip spaces after internal subset
                    switch (chtyp(ch)) {
                        case '>':
                            if (psid != null) {
                                // Report the DTD external subset
                                InputSource is = resolveEnt(name, psid.name, psid.value);
                                if (is != null) {
                                    if (mIsSAlone == false) {
                                        // Set the end of DTD external subset char
                                        bkch();
                                        setch(']');
                                        // Set the DTD external subset InputSource
                                        push(new Input(BUFFSIZE_READER));
                                        setinp(is);
                                        mInp.pubid = psid.name;
                                        mInp.sysid = psid.value;
                                        // Parse the DTD external subset
                                        dtdsub();
                                    } else {
                                        // Unresolved DTD external subset
                                        skippedEnt("[dtd]");
                                        // Release reader and stream; a
                                        // failure to close is ignored.
                                        if (is.getCharacterStream() != null) {
                                            try {
                                                is.getCharacterStream().close();
                                            } catch (IOException ioe) {
                                            }
                                        }
                                        if (is.getByteStream() != null) {
                                            try {
                                                is.getByteStream().close();
                                            } catch (IOException ioe) {
                                            }
                                        }
                                    }
                                } else {
                                    // Unresolved DTD external subset
                                    skippedEnt("[dtd]");
                                }
                                del(psid);
                            }
                            st = -1;  // end of DTD
                            break;

                        case ' ':
                            // skip white spaces
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Parses the document type declaration subset.
     *
     * @exception Exception is parser specific exception from panic method.
650 * @exception IOException 651 */ 652 private void dtdsub() throws Exception { 653 char ch; 654 for (short st = 0; st >= 0;) { 655 ch = getch(); 656 switch (st) { 657 case 0: // skip white spaces before a declaration 658 switch (chtyp(ch)) { 659 case '<': 660 ch = getch(); 661 switch (ch) { 662 case '?': 663 pi(); 664 break; 665 666 case '!': 667 ch = getch(); 668 bkch(); 669 if (ch == '-') { 670 comm(); 671 break; 672 } 673 // A markup or an entity declaration 674 bntok(); 675 switch (bkeyword()) { 676 case 'n': 677 dtdent(); 678 break; 679 680 case 'a': 681 dtdattl(); // parse attributes declaration 682 break; 683 684 case 'e': 685 dtdelm(); // parse element declaration 686 break; 687 688 case 'o': 689 dtdnot(); // parse notation declaration 690 break; 691 692 default: 693 panic(FAULT); // unsupported markup declaration 694 break; 695 } 696 st = 1; // read the end of declaration 697 break; 698 699 default: 700 panic(FAULT); 701 break; 702 } 703 break; 704 705 case '%': 706 // A parameter entity reference 707 pent(' '); 708 break; 709 710 case ']': 711 // End of DTD subset 712 st = -1; 713 break; 714 715 case ' ': 716 // Skip white spaces 717 break; 718 719 case 'Z': 720 // End of stream 721 if (getch() != ']') { 722 panic(FAULT); 723 } 724 st = -1; 725 break; 726 727 default: 728 panic(FAULT); 729 } 730 break; 731 732 case 1: // read the end of declaration 733 switch (ch) { 734 case '>': // there is no notation 735 st = 0; // skip white spaces before a declaration 736 break; 737 738 case ' ': 739 case '\n': 740 case '\r': 741 case '\t': 742 // Skip white spaces 743 break; 744 745 default: 746 panic(FAULT); 747 break; 748 } 749 break; 750 751 default: 752 panic(FAULT); 753 } 754 } 755 } 756 757 /** 758 * Parses an entity declaration. This method fills the general ( 759 * <code>mEnt</code>) and parameter 760 * ( 761 * <code>mPEnt</code>) entity look up table. 762 * 763 * @exception Exception is parser specific exception form panic method. 
* @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private void dtdent() throws Exception {
        // State machine:
        //   st == 0 : before the entity name; a '%' followed by white space
        //             starts a parameter entity declaration, which is handled
        //             completely inside this state
        //   st == 1 : the value of a general entity (quoted string or
        //             external identifier)
        // An entity name that is already in the table is not replaced.
        String str = null;   // entity name
        char[] val = null;   // entity value characters
        Input inp = null;    // table entry being created
        Pair ids = null;     // external identifier returned by pubsys()
        char ch;
        for (short st = 0; st >= 0;) {
            ch = getch();
            switch (st) {
                case 0:     // skip white spaces before entity name
                    switch (chtyp(ch)) {
                        case ' ':
                            // Skip white spaces
                            break;

                        case '%':
                            // Parameter entity or parameter entity declaration.
                            ch = getch();
                            bkch();
                            if (chtyp(ch) == ' ') {
                                // Parameter entity declaration.
                                wsskip();
                                str = name(false);
                                switch (chtyp(wsskip())) {
                                    case 'A':
                                        // Read the external identifier
                                        ids = pubsys(' ');
                                        if (wsskip() == '>') {
                                            // External parsed entity
                                            if (mPEnt.containsKey(str) == false) {  // [#4.2]
                                                inp = new Input();
                                                inp.pubid = ids.name;
                                                inp.sysid = ids.value;
                                                mPEnt.put(str, inp);
                                            }
                                        } else {
                                            panic(FAULT);
                                        }
                                        del(ids);
                                        st = -1;  // the end of declaration
                                        break;

                                    case '\"':
                                    case '\'':
                                        // Read the parameter entity value
                                        bqstr('d');
                                        // Create the parameter entity value
                                        val = new char[mBuffIdx + 1];
                                        System.arraycopy(mBuff, 1, val, 1, val.length - 1);
                                        // Add surrounding spaces [#4.4.8]
                                        // NOTE(review): only the leading slot
                                        // val[0] is set to a space here; the
                                        // array has no extra trailing slot.
                                        val[0] = ' ';
                                        // Add the entity to the entity look up table
                                        if (mPEnt.containsKey(str) == false) {  // [#4.2]
                                            inp = new Input(val);
                                            inp.pubid = mInp.pubid;
                                            inp.sysid = mInp.sysid;
                                            inp.xmlenc = mInp.xmlenc;
                                            inp.xmlver = mInp.xmlver;
                                            mPEnt.put(str, inp);
                                        }
                                        st = -1;  // the end of declaration
                                        break;

                                    default:
                                        panic(FAULT);
                                        break;
                                }
                            } else {
                                // Parameter entity reference.
                                pent(' ');
                            }
                            break;

                        default:
                            bkch();
                            str = name(false);
                            st = 1;  // read entity declaration value
                            break;
                    }
                    break;

                case 1:     // read entity declaration value
                    switch (chtyp(ch)) {
                        case '\"':  // internal entity
                        case '\'':
                            bkch();
                            bqstr('d');  // read a string into the buffer
                            // NOTE(review): the containsKey() test below can
                            // only differ from this get() test if the table
                            // holds a null value - confirm whether both
                            // checks are needed.
                            if (mEnt.get(str) == null) {
                                // Create general entity value
                                val = new char[mBuffIdx];
                                System.arraycopy(mBuff, 1, val, 0, val.length);
                                // Add the entity to the entity look up table
                                if (mEnt.containsKey(str) == false) {  // [#4.2]
                                    inp = new Input(val);
                                    inp.pubid = mInp.pubid;
                                    inp.sysid = mInp.sysid;
                                    inp.xmlenc = mInp.xmlenc;
                                    inp.xmlver = mInp.xmlver;
                                    mEnt.put(str, inp);
                                }
                            }
                            st = -1;  // the end of declaration
                            break;

                        case 'A':  // external entity
                            bkch();
                            ids = pubsys(' ');
                            switch (wsskip()) {
                                case '>':  // external parsed entity
                                    if (mEnt.containsKey(str) == false) {  // [#4.2]
                                        inp = new Input();
                                        inp.pubid = ids.name;
                                        inp.sysid = ids.value;
                                        mEnt.put(str, inp);
                                    }
                                    break;

                                case 'N':  // external general unparsed entity
                                    if ("NDATA".equals(name(false)) == true) {
                                        wsskip();
                                        unparsedEntDecl(str, ids.name, ids.value, name(false));
                                        break;
                                    }
                                    // A name other than "NDATA" falls through
                                    // into the default branch.
                                default:
                                    panic(FAULT);
                                    break;
                            }
                            del(ids);
                            st = -1;  // the end of declaration
                            break;

                        case ' ':
                            // Skip white spaces
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Parses an element declaration.
     *
     * This method parses the declaration up to the closing angle bracket.
     *
     * @exception Exception is parser specific exception from panic method.
920 * @exception IOException 921 */ 922 @SuppressWarnings("fallthrough") 923 private void dtdelm() throws Exception { 924 // This is stub implementation which skips an element 925 // declaration. 926 wsskip(); 927 name(mIsNSAware); 928 929 char ch; 930 while (true) { 931 ch = getch(); 932 switch (ch) { 933 case '>': 934 bkch(); 935 return; 936 937 case EOS: 938 panic(FAULT); 939 940 default: 941 break; 942 } 943 } 944 } 945 946 /** 947 * Parses an attribute list declaration. 948 * 949 * This method parses the declaration up to the closing angle bracket. 950 * 951 * @exception Exception is parser specific exception form panic method. 952 * @exception IOException 953 */ 954 private void dtdattl() throws Exception { 955 char elmqn[] = null; 956 Pair elm = null; 957 char ch; 958 for (short st = 0; st >= 0;) { 959 ch = getch(); 960 switch (st) { 961 case 0: // read the element name 962 switch (chtyp(ch)) { 963 case 'a': 964 case 'A': 965 case '_': 966 case 'X': 967 case ':': 968 bkch(); 969 // Get the element from the list or add a new one. 970 elmqn = qname(mIsNSAware); 971 elm = find(mAttL, elmqn); 972 if (elm == null) { 973 elm = pair(mAttL); 974 elm.chars = elmqn; 975 mAttL = elm; 976 } 977 st = 1; // read an attribute declaration 978 break; 979 980 case ' ': 981 break; 982 983 case '%': 984 pent(' '); 985 break; 986 987 default: 988 panic(FAULT); 989 break; 990 } 991 break; 992 993 case 1: // read an attribute declaration 994 switch (chtyp(ch)) { 995 case 'a': 996 case 'A': 997 case '_': 998 case 'X': 999 case ':': 1000 bkch(); 1001 dtdatt(elm); 1002 if (wsskip() == '>') { 1003 return; 1004 } 1005 break; 1006 1007 case ' ': 1008 break; 1009 1010 case '%': 1011 pent(' '); 1012 break; 1013 1014 default: 1015 panic(FAULT); 1016 break; 1017 } 1018 break; 1019 1020 default: 1021 panic(FAULT); 1022 break; 1023 } 1024 } 1025 } 1026 1027 /** 1028 * Parses an attribute declaration. 
*
 * The attribute uses the following fields of Pair object: chars - characters
 * of qualified name; id - the type identifier of the attribute; list - a pair
 * which holds the default value (chars field).
 *
 * The method is a state machine: 0 - attribute name, 1 - attribute type,
 * 2 - first item of an enumeration or NOTATION list, 3 - following items of
 * the list, 4 - default declaration, 5 - quoted default value. A negative
 * state ends the loop.
 *
 * @param elm An object which represents all defined attributes on an
 * element.
 * @exception Exception is parser specific exception form panic method.
 * @exception IOException
 */
@SuppressWarnings("fallthrough")
private void dtdatt(Pair elm) throws Exception {
    char attqn[] = null;
    Pair att = null;
    char ch;
    for (short st = 0; st >= 0;) {
        ch = getch();
        switch (st) {
            case 0:  // the attribute name
                switch (chtyp(ch)) {
                    case 'a':
                    case 'A':
                    case '_':
                    case 'X':
                    case ':':
                        bkch();
                        // Get the attribute from the list or add a new one.
                        attqn = qname(mIsNSAware);
                        att = find(elm.list, attqn);
                        if (att == null) {
                            // New attribute declaration
                            att = pair(elm.list);
                            att.chars = attqn;
                            elm.list = att;
                        } else {
                            // Do not override the attribute declaration [#3.3]:
                            // the duplicate is parsed into a pair that is not
                            // linked into elm.list.
                            att = pair(null);
                            att.chars = attqn;
                            att.id = 'c';
                        }
                        wsskip();
                        st = 1;
                        break;

                    case '%':
                        pent(' ');
                        break;

                    case ' ':
                        break;

                    default:
                        panic(FAULT);
                        break;
                }
                break;

            case 1:  // the attribute type
                switch (chtyp(ch)) {
                    case '(':
                        att.id = 'u';  // enumeration type
                        st = 2;  // read the first element of the list
                        break;

                    case '%':
                        pent(' ');
                        break;

                    case ' ':
                        break;

                    default:
                        bkch();
                        bntok();  // read type id
                        att.id = bkeyword();
                        switch (att.id) {
                            case 'o':  // NOTATION
                                if (wsskip() != '(') {
                                    panic(FAULT);
                                }
                                // wsskip() only looked ahead; consume the '('
                                ch = getch();
                                st = 2;  // read the first element of the list
                                break;

                            case 'i':  // ID
                            case 'r':  // IDREF
                            case 'R':  // IDREFS
                            case 'n':  // ENTITY
                            case 'N':  // ENTITIES
                            case 't':  // NMTOKEN
                            case 'T':  // NMTOKENS
                            case 'c':  // CDATA
                                wsskip();
                                st = 4;  // read default declaration
                                break;

                            default:
                                panic(FAULT);
                                break;
                        }
                        break;
                }
                break;

            case 2:  // read the first element of the list
                switch (chtyp(ch)) {
                    case 'a':
                    case 'A':
                    case 'd':
                    case '.':
                    case ':':
                    case '-':
                    case '_':
                    case 'X':
                        bkch();
                        switch (att.id) {
                            case 'u':  // enumeration type
                                bntok();
                                break;

                            case 'o':  // NOTATION
                                mBuffIdx = -1;
                                bname(false);
                                break;

                            default:
                                panic(FAULT);
                                break;
                        }
                        wsskip();
                        st = 3;  // read next element of the list
                        break;

                    case '%':
                        pent(' ');
                        break;

                    case ' ':
                        break;

                    default:
                        panic(FAULT);
                        break;
                }
                break;

            case 3:  // read next element of the list
                switch (ch) {
                    case ')':
                        wsskip();
                        st = 4;  // read default declaration
                        break;

                    case '|':
                        wsskip();
                        switch (att.id) {
                            case 'u':  // enumeration type
                                bntok();
                                break;

                            case 'o':  // NOTATION
                                mBuffIdx = -1;
                                bname(false);
                                break;

                            default:
                                panic(FAULT);
                                break;
                        }
                        wsskip();
                        break;

                    case '%':
                        pent(' ');
                        break;

                    default:
                        panic(FAULT);
                        break;
                }
                break;

            case 4:  // read default declaration
                switch (ch) {
                    case '#':
                        bntok();
                        switch (bkeyword()) {
                            case 'F':  // FIXED
                                switch (wsskip()) {
                                    case '\"':
                                    case '\'':
                                        st = 5;  // read the default value
                                        break;

                                    case EOS:
                                        panic(FAULT);

                                    default:
                                        st = -1;
                                        break;
                                }
                                break;

                            case 'Q':  // REQUIRED
                            case 'I':  // IMPLIED
                                st = -1;
                                break;

                            default:
                                panic(FAULT);
                                break;
                        }
                        break;

                    case '\"':
                    case '\'':
                        bkch();
                        st = 5;  // read the default value
                        break;

                    case ' ':
                    case '\n':
                    case '\r':
                    case '\t':
                        break;

                    case '%':
                        pent(' ');
                        break;

                    default:
                        // Anything else ends the declaration; give the
                        // character back to the caller.
                        bkch();
                        st = -1;
                        break;
                }
                break;

            case 5:  // read the default value
                switch (ch) {
                    case '\"':
                    case '\'':
                        bkch();
                        bqstr('d');  // the value in the mBuff now
                        att.list = pair(null);
                        // Create a string like "attqname='value' ".
                        // att.chars[0] is the colon offset, not part of the
                        // name, so the copy starts at index 1.
                        att.list.chars = new char[att.chars.length + mBuffIdx + 3];
                        System.arraycopy(
                                att.chars, 1, att.list.chars, 0, att.chars.length - 1);
                        att.list.chars[att.chars.length - 1] = '=';
                        att.list.chars[att.chars.length] = ch;
                        System.arraycopy(
                                mBuff, 1, att.list.chars, att.chars.length + 1, mBuffIdx);
                        att.list.chars[att.chars.length + mBuffIdx + 1] = ch;
                        att.list.chars[att.chars.length + mBuffIdx + 2] = ' ';
                        st = -1;
                        break;

                    default:
                        panic(FAULT);
                        break;
                }
                break;

            default:
                panic(FAULT);
                break;
        }
    }
}

/**
 * Parses a notation declaration.
 *
 * This method parses the declaration up to the closing angle bracket.
 * The notation name and its public/system identifiers are read and then
 * reported through <code>notDecl</code>.
 *
 * @exception Exception is parser specific exception form panic method.
 * @exception IOException
 */
private void dtdnot() throws Exception {
    wsskip();
    String name = name(false);
    wsskip();
    // 'N' lets pubsys accept a public id without a system id
    Pair ids = pubsys('N');
    notDecl(name, ids.name, ids.value);
    del(ids);
}

/**
 * Parses an attribute.
 *
 * This recursive method is responsible for prefix addition
 * (<code>mPref</code>) on the way down. The element's start tag end triggers
 * the return process. The method then on its way back resolves prefixes
 * and accumulates attributes.
1324 * 1325 * <p><code>att.num</code> carries attribute flags where: 0x1 - attribute is 1326 * declared in DTD (attribute decalration had been read); 0x2 - attribute's 1327 * default value is used.</p> 1328 * 1329 * @param att An object which reprecents current attribute. 1330 * @exception Exception is parser specific exception form panic method. 1331 * @exception IOException 1332 */ 1333 @SuppressWarnings("fallthrough") 1334 private void attr(Pair att) throws Exception { 1335 switch (wsskip()) { 1336 case '/': 1337 case '>': 1338 if ((att.num & 0x2) == 0) { // all attributes have been read 1339 att.num |= 0x2; // set default attribute flag 1340 Input inp = mInp; 1341 // Go through all attributes defined on current element. 1342 for (Pair def = mElm.list; def != null; def = def.next) { 1343 if (def.list == null) // no default value 1344 { 1345 continue; 1346 } 1347 // Go through all attributes defined on current 1348 // element and add defaults. 1349 Pair act = find(att.next, def.chars); 1350 if (act == null) { 1351 push(new Input(def.list.chars)); 1352 } 1353 } 1354 if (mInp != inp) { // defaults have been added 1355 attr(att); 1356 return; 1357 } 1358 } 1359 // Ensure the attribute string array capacity 1360 mAttrs.setLength(mAttrIdx); 1361 mItems = mAttrs.mItems; 1362 return; 1363 1364 case EOS: 1365 panic(FAULT); 1366 1367 default: 1368 // Read the attribute name and value 1369 att.chars = qname(mIsNSAware); 1370 att.name = att.local(); 1371 String type = atype(att); // sets attribute's type on att.id 1372 wsskip(); 1373 if (getch() != '=') { 1374 panic(FAULT); 1375 } 1376 bqstr((char) att.id); // read the value with normalization. 
1377 String val = new String(mBuff, 1, mBuffIdx); 1378 Pair next = pair(att); 1379 next.num = (att.num & ~0x1); // inherit attribute flags 1380 // Put a namespace declaration on top of the prefix stack 1381 if ((mIsNSAware == false) || (isdecl(att, val) == false)) { 1382 // An ordinary attribute 1383 mAttrIdx++; 1384 attr(next); // recursive call to parse the next attribute 1385 mAttrIdx--; 1386 // Add the attribute to the attributes string array 1387 char idx = (char) (mAttrIdx << 3); 1388 mItems[idx + 1] = att.qname(); // attr qname 1389 mItems[idx + 2] = (mIsNSAware) ? att.name : ""; // attr local name 1390 mItems[idx + 3] = val; // attr value 1391 mItems[idx + 4] = type; // attr type 1392 switch (att.num & 0x3) { 1393 case 0x0: 1394 mItems[idx + 5] = null; 1395 break; 1396 1397 case 0x1: // declared attribute 1398 mItems[idx + 5] = "d"; 1399 break; 1400 1401 default: // 0x2, 0x3 - default attribute always declared 1402 mItems[idx + 5] = "D"; 1403 break; 1404 } 1405 // Resolve the prefix if any and report the attribute 1406 // NOTE: The attribute does not accept the default namespace. 1407 mItems[idx + 0] = (att.chars[0] != 0) ? rslv(att.chars) : ""; 1408 } else { 1409 // A namespace declaration. mPref.name contains prefix and 1410 // mPref.value contains namespace URI set by isdecl method. 1411 // Report a start of the new mapping 1412 newPrefix(); 1413 // Recursive call to parse the next attribute 1414 attr(next); 1415 // NOTE: The namespace declaration is not reported. 1416 } 1417 del(next); 1418 break; 1419 } 1420 } 1421 1422 /** 1423 * Retrieves attribute type. 1424 * 1425 * This method sets the type of normalization in the attribute 1426 * <code>id</code> field and returns the name of attribute type. 1427 * 1428 * @param att An object which represents current attribute. 1429 * @return The name of the attribute type. 1430 * @exception Exception is parser specific exception form panic method. 
1431 */ 1432 private String atype(Pair att) 1433 throws Exception { 1434 Pair attr; 1435 1436 // CDATA-type normalization by default [#3.3.3] 1437 att.id = 'c'; 1438 if (mElm.list == null || (attr = find(mElm.list, att.chars)) == null) { 1439 return "CDATA"; 1440 } 1441 1442 att.num |= 0x1; // attribute is declared 1443 1444 // Non-CDATA normalization except when the attribute type is CDATA. 1445 att.id = 'i'; 1446 switch (attr.id) { 1447 case 'i': 1448 return "ID"; 1449 1450 case 'r': 1451 return "IDREF"; 1452 1453 case 'R': 1454 return "IDREFS"; 1455 1456 case 'n': 1457 return "ENTITY"; 1458 1459 case 'N': 1460 return "ENTITIES"; 1461 1462 case 't': 1463 return "NMTOKEN"; 1464 1465 case 'T': 1466 return "NMTOKENS"; 1467 1468 case 'u': 1469 return "NMTOKEN"; 1470 1471 case 'o': 1472 return "NOTATION"; 1473 1474 case 'c': 1475 att.id = 'c'; 1476 return "CDATA"; 1477 1478 default: 1479 panic(FAULT); 1480 } 1481 return null; 1482 } 1483 1484 /** 1485 * Parses a comment. 1486 * 1487 * The '<!' part is read in dispatcher so the method starts 1488 * with first '-' after '<!'. 1489 * 1490 * @exception Exception is parser specific exception form panic method. 1491 */ 1492 @SuppressWarnings("fallthrough") 1493 private void comm() throws Exception { 1494 if (mPh == PH_DOC_START) { 1495 mPh = PH_MISC_DTD; // misc before DTD 1496 } // '<!' has been already read by dispetcher. 1497 char ch; 1498 mBuffIdx = -1; 1499 for (short st = 0; st >= 0;) { 1500 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 1501 if (ch == EOS) { 1502 panic(FAULT); 1503 } 1504 switch (st) { 1505 case 0: // first '-' of the comment open 1506 if (ch == '-') { 1507 st = 1; 1508 } else { 1509 panic(FAULT); 1510 } 1511 break; 1512 1513 case 1: // secind '-' of the comment open 1514 if (ch == '-') { 1515 st = 2; 1516 } else { 1517 panic(FAULT); 1518 } 1519 break; 1520 1521 case 2: // skip the comment body 1522 switch (ch) { 1523 case '-': 1524 st = 3; 1525 break; 1526 1527 default: 1528 bappend(ch); 1529 break; 1530 } 1531 break; 1532 1533 case 3: // second '-' of the comment close 1534 switch (ch) { 1535 case '-': 1536 st = 4; 1537 break; 1538 1539 default: 1540 bappend('-'); 1541 bappend(ch); 1542 st = 2; 1543 break; 1544 } 1545 break; 1546 1547 case 4: // '>' of the comment close 1548 if (ch == '>') { 1549 comm(mBuff, mBuffIdx + 1); 1550 st = -1; 1551 break; 1552 } 1553 // else - panic [#2.5 compatibility note] 1554 1555 default: 1556 panic(FAULT); 1557 } 1558 } 1559 } 1560 1561 /** 1562 * Parses a processing instruction. 1563 * 1564 * The '<?' is read in dispatcher so the method starts with 1565 * first character of PI target name after '<?'. 1566 * 1567 * @exception Exception is parser specific exception form panic method. 1568 * @exception IOException 1569 */ 1570 private void pi() throws Exception { 1571 // '<?' has been already read by dispetcher. 
1572 char ch; 1573 String str = null; 1574 mBuffIdx = -1; 1575 for (short st = 0; st >= 0;) { 1576 ch = getch(); 1577 if (ch == EOS) { 1578 panic(FAULT); 1579 } 1580 switch (st) { 1581 case 0: // read the PI target name 1582 switch (chtyp(ch)) { 1583 case 'a': 1584 case 'A': 1585 case '_': 1586 case ':': 1587 case 'X': 1588 bkch(); 1589 str = name(false); 1590 // PI target name may not be empty string [#2.6] 1591 // PI target name 'XML' is reserved [#2.6] 1592 if ((str.length() == 0) 1593 || (mXml.name.equals(str.toLowerCase()) == true)) { 1594 panic(FAULT); 1595 } 1596 // This is processing instruction 1597 if (mPh == PH_DOC_START) // the begining of the document 1598 { 1599 mPh = PH_MISC_DTD; // misc before DTD 1600 } 1601 wsskip(); // skip spaces after the PI target name 1602 st = 1; // accumulate the PI body 1603 mBuffIdx = -1; 1604 break; 1605 1606 default: 1607 panic(FAULT); 1608 } 1609 break; 1610 1611 case 1: // accumulate the PI body 1612 switch (ch) { 1613 case '?': 1614 st = 2; // end of the PI body 1615 break; 1616 1617 default: 1618 bappend(ch); 1619 break; 1620 } 1621 break; 1622 1623 case 2: // end of the PI body 1624 switch (ch) { 1625 case '>': 1626 // PI has been read. 1627 pi(str, new String(mBuff, 0, mBuffIdx + 1)); 1628 st = -1; 1629 break; 1630 1631 case '?': 1632 bappend('?'); 1633 break; 1634 1635 default: 1636 bappend('?'); 1637 bappend(ch); 1638 st = 1; // accumulate the PI body 1639 break; 1640 } 1641 break; 1642 1643 default: 1644 panic(FAULT); 1645 } 1646 } 1647 } 1648 1649 /** 1650 * Parses a character data. 1651 * 1652 * The '<!' part is read in dispatcher so the method starts 1653 * with first '[' after '<!'. 1654 * 1655 * @exception Exception is parser specific exception form panic method. 1656 * @exception IOException 1657 */ 1658 private void cdat() 1659 throws Exception { 1660 // '<!' has been already read by dispetcher. 
1661 char ch; 1662 mBuffIdx = -1; 1663 for (short st = 0; st >= 0;) { 1664 ch = getch(); 1665 switch (st) { 1666 case 0: // the first '[' of the CDATA open 1667 if (ch == '[') { 1668 st = 1; 1669 } else { 1670 panic(FAULT); 1671 } 1672 break; 1673 1674 case 1: // read "CDATA" 1675 if (chtyp(ch) == 'A') { 1676 bappend(ch); 1677 } else { 1678 if ("CDATA".equals( 1679 new String(mBuff, 0, mBuffIdx + 1)) != true) { 1680 panic(FAULT); 1681 } 1682 bkch(); 1683 st = 2; 1684 } 1685 break; 1686 1687 case 2: // the second '[' of the CDATA open 1688 if (ch != '[') { 1689 panic(FAULT); 1690 } 1691 mBuffIdx = -1; 1692 st = 3; 1693 break; 1694 1695 case 3: // read data before the first ']' 1696 if (ch != ']') { 1697 bappend(ch); 1698 } else { 1699 st = 4; 1700 } 1701 break; 1702 1703 case 4: // read the second ']' or continue to read the data 1704 if (ch != ']') { 1705 bappend(']'); 1706 bappend(ch); 1707 st = 3; 1708 } else { 1709 st = 5; 1710 } 1711 break; 1712 1713 case 5: // read '>' or continue to read the data 1714 switch (ch) { 1715 case ']': 1716 bappend(']'); 1717 break; 1718 1719 case '>': 1720 bflash(); 1721 st = -1; 1722 break; 1723 1724 default: 1725 bappend(']'); 1726 bappend(']'); 1727 bappend(ch); 1728 st = 3; 1729 break; 1730 } 1731 break; 1732 1733 default: 1734 panic(FAULT); 1735 } 1736 } 1737 } 1738 1739 /** 1740 * Reads a xml name. 1741 * 1742 * The xml name must conform "Namespaces in XML" specification. Therefore 1743 * the ':' character is not allowed in the name. This method should be used 1744 * for PI and entity names which may not have a namespace according to the 1745 * specification mentioned above. 1746 * 1747 * @param ns The true value turns namespace conformance on. 1748 * @return The name has been read. 1749 * @exception Exception When incorrect character appear in the name. 
1750 * @exception IOException 1751 */ 1752 protected String name(boolean ns) 1753 throws Exception { 1754 mBuffIdx = -1; 1755 bname(ns); 1756 return new String(mBuff, 1, mBuffIdx); 1757 } 1758 1759 /** 1760 * Reads a qualified xml name. 1761 * 1762 * The characters of a qualified name is an array of characters. The first 1763 * (chars[0]) character is the index of the colon character which separates 1764 * the prefix from the local name. If the index is zero, the name does not 1765 * contain separator or the parser works in the namespace unaware mode. The 1766 * length of qualified name is the length of the array minus one. 1767 * 1768 * @param ns The true value turns namespace conformance on. 1769 * @return The characters of a qualified name. 1770 * @exception Exception When incorrect character appear in the name. 1771 * @exception IOException 1772 */ 1773 protected char[] qname(boolean ns) 1774 throws Exception { 1775 mBuffIdx = -1; 1776 bname(ns); 1777 char chars[] = new char[mBuffIdx + 1]; 1778 System.arraycopy(mBuff, 0, chars, 0, mBuffIdx + 1); 1779 return chars; 1780 } 1781 1782 /** 1783 * Reads the public or/and system identifiers. 1784 * 1785 * @param inp The input object. 1786 * @exception Exception is parser specific exception form panic method. 1787 * @exception IOException 1788 */ 1789 private void pubsys(Input inp) 1790 throws Exception { 1791 Pair pair = pubsys(' '); 1792 inp.pubid = pair.name; 1793 inp.sysid = pair.value; 1794 del(pair); 1795 } 1796 1797 /** 1798 * Reads the public or/and system identifiers. 1799 * 1800 * @param flag The 'N' allows public id be without system id. 1801 * @return The public or/and system identifiers pair. 1802 * @exception Exception is parser specific exception form panic method. 
1803 * @exception IOException 1804 */ 1805 @SuppressWarnings("fallthrough") 1806 private Pair pubsys(char flag) throws Exception { 1807 Pair ids = pair(null); 1808 String str = name(false); 1809 if ("PUBLIC".equals(str) == true) { 1810 bqstr('i'); // non-CDATA normalization [#4.2.2] 1811 ids.name = new String(mBuff, 1, mBuffIdx); 1812 switch (wsskip()) { 1813 case '\"': 1814 case '\'': 1815 bqstr(' '); 1816 ids.value = new String(mBuff, 1, mBuffIdx); 1817 break; 1818 1819 case EOS: 1820 panic(FAULT); 1821 1822 default: 1823 if (flag != 'N') // [#4.7] 1824 { 1825 panic(FAULT); 1826 } 1827 ids.value = null; 1828 break; 1829 } 1830 return ids; 1831 } else if ("SYSTEM".equals(str) == true) { 1832 ids.name = null; 1833 bqstr(' '); 1834 ids.value = new String(mBuff, 1, mBuffIdx); 1835 return ids; 1836 } 1837 panic(FAULT); 1838 return null; 1839 } 1840 1841 /** 1842 * Reads an attribute value. 1843 * 1844 * The grammar this method can read is: 1845 * <pre>{@code 1846 * eqstr := S "=" qstr 1847 * qstr := S ("'" string "'") | ('"' string '"') 1848 * }</pre> 1849 * This method resolves entities 1850 * inside a string unless the parser parses DTD. 1851 * 1852 * @param flag The '=' character forces the method to accept the '=' 1853 * character before quoted string and read the following string as not an 1854 * attribute ('-'), 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; 1855 * '-' - not an attribute value; 'd' - in DTD context. 1856 * @return The content of the quoted strign as a string. 1857 * @exception Exception is parser specific exception form panic method. 1858 * @exception IOException 1859 */ 1860 protected String eqstr(char flag) throws Exception { 1861 if (flag == '=') { 1862 wsskip(); 1863 if (getch() != '=') { 1864 panic(FAULT); 1865 } 1866 } 1867 bqstr((flag == '=') ? '-' : flag); 1868 return new String(mBuff, 1, mBuffIdx); 1869 } 1870 1871 /** 1872 * Resoves an entity. 1873 * 1874 * This method resolves built-in and character entity references. 
* It also reports external entities to the application.
 *
 * The method is a state machine: 0 - first character of the entity name,
 * 1 - rest of the name, 2 - decimal character reference, 3 - hexadecimal
 * character reference. A negative state ends the loop.
 *
 * @param flag The 'x' character forces the method to report a skipped
 * entity; 'i' character - indicates non-CDATA normalization.
 * @return Name of unresolved entity or <code>null</code> if entity had been
 * resolved successfully.
 * @exception Exception is parser specific exception form panic method.
 * @exception IOException
 */
@SuppressWarnings("fallthrough")
private String ent(char flag) throws Exception {
    char ch;
    // Buffer position at which the '&' is stored; used to cut the
    // reference text out of the buffer again.
    int idx = mBuffIdx + 1;
    Input inp = null;
    String str = null;
    mESt = 0x100;  // reset the built-in entity recognizer
    bappend('&');
    for (short st = 0; st >= 0;) {
        ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
        switch (st) {
            case 0:  // the first character of the entity name
            case 1:  // read built-in entity name
                switch (chtyp(ch)) {
                    case 'd':
                    case '.':
                    case '-':
                        // Not allowed as the first character of a name
                        if (st != 1) {
                            panic(FAULT);
                        }
                    // intentional fall through
                    case 'a':
                    case 'A':
                    case '_':
                    case 'X':
                        bappend(ch);
                        eappend(ch);
                        st = 1;
                        break;

                    case ':':
                        if (mIsNSAware != false) {
                            panic(FAULT);
                        }
                        bappend(ch);
                        eappend(ch);
                        st = 1;
                        break;

                    case ';':
                        if (mESt < 0x100) {
                            // The entity is a built-in entity
                            mBuffIdx = idx - 1;
                            bappend(mESt);
                            st = -1;
                            break;
                        } else if (mPh == PH_DTD) {
                            // In DTD entity declaration has to resolve character
                            // entities and include "as is" others. [#4.4.7]
                            bappend(';');
                            st = -1;
                            break;
                        }
                        // Convert an entity name to a string
                        str = new String(mBuff, idx + 1, mBuffIdx - idx);
                        inp = mEnt.get(str);
                        // Restore the buffer offset
                        mBuffIdx = idx - 1;
                        if (inp != null) {
                            if (inp.chars == null) {
                                // External entity
                                InputSource is = resolveEnt(str, inp.pubid, inp.sysid);
                                if (is != null) {
                                    push(new Input(BUFFSIZE_READER));
                                    setinp(is);
                                    mInp.pubid = inp.pubid;
                                    mInp.sysid = inp.sysid;
                                    str = null;  // the entity is resolved
                                } else {
                                    // Unresolved external entity
                                    if (flag != 'x') {
                                        panic(FAULT);  // unknown entity within markup
                                    }  // str is name of unresolved entity
                                }
                            } else {
                                // Internal entity
                                push(inp);
                                str = null;  // the entity is resolved
                            }
                        } else {
                            // Unknown or general unparsed entity
                            if (flag != 'x') {
                                panic(FAULT);  // unknown entity within markup
                            }  // str is name of unresolved entity
                        }
                        st = -1;
                        break;

                    case '#':
                        // '#' is only valid directly after the '&'
                        if (st != 0) {
                            panic(FAULT);
                        }
                        st = 2;
                        break;

                    default:
                        panic(FAULT);
                }
                break;

            case 2:  // read character entity
                switch (chtyp(ch)) {
                    case 'd':
                        bappend(ch);
                        break;

                    case ';':
                        // Convert the character entity to a character
                        try {
                            int i = Integer.parseInt(
                                    new String(mBuff, idx + 1, mBuffIdx - idx), 10);
                            // Values from 0xffff (the EOS marker) upwards
                            // are rejected
                            if (i >= 0xffff) {
                                panic(FAULT);
                            }
                            ch = (char) i;
                        } catch (NumberFormatException nfe) {
                            panic(FAULT);
                        }
                        // Restore the buffer offset
                        mBuffIdx = idx - 1;
                        if (ch == ' ' || mInp.next != null) {
                            bappend(ch, flag);
                        } else {
                            bappend(ch);
                        }
                        st = -1;
                        break;

                    case 'a':
                        // If the entity buffer is empty and ch == 'x'
                        if ((mBuffIdx == idx) && (ch == 'x')) {
                            st = 3;
                            break;
                        }
                    // intentional fall through
                    default:
                        panic(FAULT);
                }
                break;

            case 3:  // read hex character entity
                switch (chtyp(ch)) {
                    case 'A':
                    case 'a':
                    case 'd':
                        // Any letter is buffered here; non-hex letters make
                        // parseInt below throw NumberFormatException.
                        bappend(ch);
                        break;

                    case ';':
                        // Convert the character entity to a character
                        try {
                            int i = Integer.parseInt(
                                    new String(mBuff, idx + 1, mBuffIdx - idx), 16);
                            if (i >= 0xffff) {
                                panic(FAULT);
                            }
                            ch = (char) i;
                        } catch (NumberFormatException nfe) {
                            panic(FAULT);
                        }
                        // Restore the buffer offset
                        mBuffIdx = idx - 1;
                        if (ch == ' ' || mInp.next != null) {
                            bappend(ch, flag);
                        } else {
                            bappend(ch);
                        }
                        st = -1;
                        break;

                    default:
                        panic(FAULT);
                }
                break;

            default:
                panic(FAULT);
        }
    }

    return str;
}

/**
 * Resolves a parameter entity.
 *
 * This method resolves a parameter entity reference. It also reports
 * external entities to the application.
 *
 * @param flag The '-' instructs the method not to set up surrounding
 * spaces [#4.4.8].
 * @exception Exception is parser specific exception form panic method.
2074 * @exception IOException 2075 */ 2076 @SuppressWarnings("fallthrough") 2077 private void pent(char flag) throws Exception { 2078 char ch; 2079 int idx = mBuffIdx + 1; 2080 Input inp = null; 2081 String str = null; 2082 bappend('%'); 2083 if (mPh != PH_DTD) // the DTD internal subset 2084 { 2085 return; // Not Recognized [#4.4.1] 2086 } // Read entity name 2087 bname(false); 2088 str = new String(mBuff, idx + 2, mBuffIdx - idx - 1); 2089 if (getch() != ';') { 2090 panic(FAULT); 2091 } 2092 inp = mPEnt.get(str); 2093 // Restore the buffer offset 2094 mBuffIdx = idx - 1; 2095 if (inp != null) { 2096 if (inp.chars == null) { 2097 // External parameter entity 2098 InputSource is = resolveEnt(str, inp.pubid, inp.sysid); 2099 if (is != null) { 2100 if (flag != '-') { 2101 bappend(' '); // tail space 2102 } 2103 push(new Input(BUFFSIZE_READER)); 2104 // BUG: there is no leading space! [#4.4.8] 2105 setinp(is); 2106 mInp.pubid = inp.pubid; 2107 mInp.sysid = inp.sysid; 2108 } else { 2109 // Unresolved external parameter entity 2110 skippedEnt("%" + str); 2111 } 2112 } else { 2113 // Internal parameter entity 2114 if (flag == '-') { 2115 // No surrounding spaces 2116 inp.chIdx = 1; 2117 } else { 2118 // Insert surrounding spaces 2119 bappend(' '); // tail space 2120 inp.chIdx = 0; 2121 } 2122 push(inp); 2123 } 2124 } else { 2125 // Unknown parameter entity 2126 skippedEnt("%" + str); 2127 } 2128 } 2129 2130 /** 2131 * Recognizes and handles a namespace declaration. 2132 * 2133 * This method identifies a type of namespace declaration if any and puts 2134 * new mapping on top of prefix stack. 2135 * 2136 * @param name The attribute qualified name (<code>name.value</code> is a 2137 * <code>String</code> object which represents the attribute prefix). 2138 * @param value The attribute value. 2139 * @return <code>true</code> if a namespace declaration is recognized. 
2140 */ 2141 private boolean isdecl(Pair name, String value) { 2142 if (name.chars[0] == 0) { 2143 if ("xmlns".equals(name.name) == true) { 2144 // New default namespace declaration 2145 mPref = pair(mPref); 2146 mPref.list = mElm; // prefix owner element 2147 mPref.value = value; 2148 mPref.name = ""; 2149 mPref.chars = NONS; 2150 mElm.num++; // namespace counter 2151 return true; 2152 } 2153 } else { 2154 if (name.eqpref(XMLNS) == true) { 2155 // New prefix declaration 2156 int len = name.name.length(); 2157 mPref = pair(mPref); 2158 mPref.list = mElm; // prefix owner element 2159 mPref.value = value; 2160 mPref.name = name.name; 2161 mPref.chars = new char[len + 1]; 2162 mPref.chars[0] = (char) (len + 1); 2163 name.name.getChars(0, len, mPref.chars, 1); 2164 mElm.num++; // namespace counter 2165 return true; 2166 } 2167 } 2168 return false; 2169 } 2170 2171 /** 2172 * Resolves a prefix. 2173 * 2174 * @return The namespace assigned to the prefix. 2175 * @exception Exception When mapping for specified prefix is not found. 2176 */ 2177 private String rslv(char[] qname) 2178 throws Exception { 2179 for (Pair pref = mPref; pref != null; pref = pref.next) { 2180 if (pref.eqpref(qname) == true) { 2181 return pref.value; 2182 } 2183 } 2184 if (qname[0] == 1) { // QNames like ':local' 2185 for (Pair pref = mPref; pref != null; pref = pref.next) { 2186 if (pref.chars[0] == 0) { 2187 return pref.value; 2188 } 2189 } 2190 } 2191 panic(FAULT); 2192 return null; 2193 } 2194 2195 /** 2196 * Skips xml white space characters. 2197 * 2198 * This method skips white space characters (' ', '\t', '\n', '\r') and 2199 * looks ahead not white space character. 2200 * 2201 * @return The first not white space look ahead character. 2202 * @exception IOException 2203 */ 2204 protected char wsskip() 2205 throws IOException { 2206 char ch; 2207 while (true) { 2208 // Read next character 2209 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 2210 if (ch < 0x80) { 2211 if (nmttyp[ch] != 3) // [ \t\n\r] 2212 { 2213 break; 2214 } 2215 } else { 2216 break; 2217 } 2218 } 2219 mChIdx--; // bkch(); 2220 return ch; 2221 } 2222 2223 /** 2224 * Reports document type. 2225 * 2226 * @param name The name of the entity. 2227 * @param pubid The public identifier of the entity or <code>null</code>. 2228 * @param sysid The system identifier of the entity or <code>null</code>. 2229 */ 2230 protected abstract void docType(String name, String pubid, String sysid) 2231 throws SAXException; 2232 2233 /** 2234 * Reports a comment. 2235 * 2236 * @param text The comment text starting from first charcater. 2237 * @param length The number of characters in comment. 2238 */ 2239 protected abstract void comm(char[] text, int length); 2240 2241 /** 2242 * Reports a processing instruction. 2243 * 2244 * @param target The processing instruction target name. 2245 * @param body The processing instruction body text. 2246 */ 2247 protected abstract void pi(String target, String body) 2248 throws Exception; 2249 2250 /** 2251 * Reports new namespace prefix. The Namespace prefix ( 2252 * <code>mPref.name</code>) being declared and the Namespace URI ( 2253 * <code>mPref.value</code>) the prefix is mapped to. An empty string is 2254 * used for the default element namespace, which has no prefix. 2255 */ 2256 protected abstract void newPrefix() 2257 throws Exception; 2258 2259 /** 2260 * Reports skipped entity name. 2261 * 2262 * @param name The entity name. 2263 */ 2264 protected abstract void skippedEnt(String name) 2265 throws Exception; 2266 2267 /** 2268 * Returns an 2269 * <code>InputSource</code> for specified entity or 2270 * <code>null</code>. 2271 * 2272 * @param name The name of the entity. 2273 * @param pubid The public identifier of the entity. 2274 * @param sysid The system identifier of the entity. 
*/
protected abstract InputSource resolveEnt(
        String name, String pubid, String sysid)
        throws Exception;

/**
 * Reports notation declaration.
 *
 * @param name The notation's name.
 * @param pubid The notation's public identifier, or null if none was given.
 * @param sysid The notation's system identifier, or null if none was given.
 * @exception Exception
 */
protected abstract void notDecl(String name, String pubid, String sysid)
        throws Exception;

/**
 * Reports unparsed entity name.
 *
 * @param name The unparsed entity's name.
 * @param pubid The entity's public identifier, or null if none was given.
 * @param sysid The entity's system identifier.
 * @param notation The name of the associated notation.
 * @exception Exception
 */
protected abstract void unparsedEntDecl(
        String name, String pubid, String sysid, String notation)
        throws Exception;

/**
 * Notifies the handler about fatal parsing error.
 *
 * NOTE(review): callers in this class place unreachable-looking code
 * (<code>return null;</code>, <code>break;</code>) after calls to this
 * method, so it is presumably expected to throw - confirm against the
 * implementations.
 *
 * @param msg The problem description message.
 * @exception Exception
 */
protected abstract void panic(String msg)
        throws Exception;

/**
 * Reads a qualified xml name.
 *
 * This is low level routine which leaves a qName in the buffer. The
 * characters of a qualified name is an array of characters. The first
 * (chars[0]) character is the index of the colon character which separates
 * the prefix from the local name. If the index is zero, the name does not
 * contain separator or the parser works in the namespace unaware mode. The
 * length of qualified name is the length of the array minus one.
 *
 * @param ns The true value turns namespace conformance on.
 * @exception Exception is parser specific exception form panic method.
* @exception IOException
 */
private void bname(boolean ns)
        throws Exception {
    char ch;
    char type;
    mBuffIdx++;  // allocate a char for colon offset
    int bqname = mBuffIdx;     // buffer index of the colon-offset slot
    int bcolon = bqname;       // buffer index of the colon; == bqname while none seen
    int bchidx = bqname + 1;   // buffer index the next name character goes to
    int bstart = bchidx;       // buffer index matching cstart
    int cstart = mChIdx;       // first input character not yet passed to bcopy
    // Namespace aware: start with the prefix (0); otherwise go straight to
    // the "suffix" states (2, 3) which treat the whole name as one part.
    short st = (short) ((ns == true) ? 0 : 2);
    while (true) {
        // Read next character
        if (mChIdx >= mChLen) {
            // Input chunk exhausted: hand over what was accepted so far,
            // then let getch() bring in more input and step back onto the
            // character it returned.
            bcopy(cstart, bstart);
            getch();
            mChIdx--;  // bkch();
            cstart = mChIdx;
            bstart = bchidx;
        }
        ch = mChars[mChIdx++];
        type = (char) 0;  // [X]
        if (ch < 0x80) {
            type = (char) nmttyp[ch];
        } else if (ch == EOS) {
            panic(FAULT);
        }
        // Parse QName
        switch (st) {
            case 0:  // read the first char of the prefix
            case 2:  // read the first char of the suffix
                switch (type) {
                    case 0:  // [aA_X]
                        bchidx++;  // append char to the buffer
                        st++;  // (st == 0)? 1: 3;
                        break;

                    case 1:  // [:]
                        // Step back so the colon is handled by state 1/3
                        mChIdx--;  // bkch();
                        st++;  // (st == 0)? 1: 3;
                        break;

                    default:
                        panic(FAULT);
                }
                break;

            case 1:  // read the prefix
            case 3:  // read the suffix
                switch (type) {
                    case 0:  // [aA_X]
                    case 2:  // [.-d]
                        bchidx++;  // append char to the buffer
                        break;

                    case 1:  // [:]
                        bchidx++;  // append char to the buffer
                        if (ns == true) {
                            if (bcolon != bqname) {
                                panic(FAULT);  // it must be only one colon
                            }
                            bcolon = bchidx - 1;
                            if (st == 1) {
                                st = 2;
                            }
                        }
                        break;

                    default:
                        // End of the name: give the character back, hand
                        // over the pending characters and store the colon
                        // offset in the first slot.
                        mChIdx--;  // bkch();
                        bcopy(cstart, bstart);
                        mBuff[bqname] = (char) (bcolon - bqname);
                        return;
                }
                break;

            default:
                panic(FAULT);
        }
    }
}

/**
 * Reads a nmtoken.
 *
 * This is low level routine which leaves a nmtoken in the buffer.
2410 * 2411 * @exception Exception is parser specific exception form panic method. 2412 * @exception IOException 2413 */ 2414 @SuppressWarnings("fallthrough") 2415 private void bntok() throws Exception { 2416 char ch; 2417 mBuffIdx = -1; 2418 bappend((char) 0); // default offset to the colon char 2419 while (true) { 2420 ch = getch(); 2421 switch (chtyp(ch)) { 2422 case 'a': 2423 case 'A': 2424 case 'd': 2425 case '.': 2426 case ':': 2427 case '-': 2428 case '_': 2429 case 'X': 2430 bappend(ch); 2431 break; 2432 2433 case 'Z': 2434 panic(FAULT); 2435 2436 default: 2437 bkch(); 2438 return; 2439 } 2440 } 2441 } 2442 2443 /** 2444 * Recognizes a keyword. 2445 * 2446 * This is low level routine which recognizes one of keywords in the buffer. 2447 * Keyword Id ID - i IDREF - r IDREFS - R ENTITY - n ENTITIES - N NMTOKEN - 2448 * t NMTOKENS - T ELEMENT - e ATTLIST - a NOTATION - o CDATA - c REQUIRED - 2449 * Q IMPLIED - I FIXED - F 2450 * 2451 * @return an id of a keyword or '?'. 2452 * @exception Exception is parser specific exception form panic method. 2453 * @exception IOException 2454 */ 2455 private char bkeyword() 2456 throws Exception { 2457 String str = new String(mBuff, 1, mBuffIdx); 2458 switch (str.length()) { 2459 case 2: // ID 2460 return ("ID".equals(str) == true) ? 'i' : '?'; 2461 2462 case 5: // IDREF, CDATA, FIXED 2463 switch (mBuff[1]) { 2464 case 'I': 2465 return ("IDREF".equals(str) == true) ? 'r' : '?'; 2466 case 'C': 2467 return ("CDATA".equals(str) == true) ? 'c' : '?'; 2468 case 'F': 2469 return ("FIXED".equals(str) == true) ? 'F' : '?'; 2470 default: 2471 break; 2472 } 2473 break; 2474 2475 case 6: // IDREFS, ENTITY 2476 switch (mBuff[1]) { 2477 case 'I': 2478 return ("IDREFS".equals(str) == true) ? 'R' : '?'; 2479 case 'E': 2480 return ("ENTITY".equals(str) == true) ? 
'n' : '?'; 2481 default: 2482 break; 2483 } 2484 break; 2485 2486 case 7: // NMTOKEN, IMPLIED, ATTLIST, ELEMENT 2487 switch (mBuff[1]) { 2488 case 'I': 2489 return ("IMPLIED".equals(str) == true) ? 'I' : '?'; 2490 case 'N': 2491 return ("NMTOKEN".equals(str) == true) ? 't' : '?'; 2492 case 'A': 2493 return ("ATTLIST".equals(str) == true) ? 'a' : '?'; 2494 case 'E': 2495 return ("ELEMENT".equals(str) == true) ? 'e' : '?'; 2496 default: 2497 break; 2498 } 2499 break; 2500 2501 case 8: // ENTITIES, NMTOKENS, NOTATION, REQUIRED 2502 switch (mBuff[2]) { 2503 case 'N': 2504 return ("ENTITIES".equals(str) == true) ? 'N' : '?'; 2505 case 'M': 2506 return ("NMTOKENS".equals(str) == true) ? 'T' : '?'; 2507 case 'O': 2508 return ("NOTATION".equals(str) == true) ? 'o' : '?'; 2509 case 'E': 2510 return ("REQUIRED".equals(str) == true) ? 'Q' : '?'; 2511 default: 2512 break; 2513 } 2514 break; 2515 2516 default: 2517 break; 2518 } 2519 return '?'; 2520 } 2521 2522 /** 2523 * Reads a single or double quotted string in to the buffer. 2524 * 2525 * This method resolves entities inside a string unless the parser parses 2526 * DTD. 2527 * 2528 * @param flag 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; '-' - 2529 * not an attribute value; 'd' - in DTD context. 2530 * @exception Exception is parser specific exception form panic method. 2531 * @exception IOException 2532 */ 2533 @SuppressWarnings("fallthrough") 2534 private void bqstr(char flag) throws Exception { 2535 Input inp = mInp; // remember the original input 2536 mBuffIdx = -1; 2537 bappend((char) 0); // default offset to the colon char 2538 char ch; 2539 for (short st = 0; st >= 0;) { 2540 ch = (mChIdx < mChLen) ? 
mChars[mChIdx++] : getch(); 2541 switch (st) { 2542 case 0: // read a single or double quote 2543 switch (ch) { 2544 case ' ': 2545 case '\n': 2546 case '\r': 2547 case '\t': 2548 break; 2549 2550 case '\'': 2551 st = 2; // read a single quoted string 2552 break; 2553 2554 case '\"': 2555 st = 3; // read a double quoted string 2556 break; 2557 2558 default: 2559 panic(FAULT); 2560 break; 2561 } 2562 break; 2563 2564 case 2: // read a single quoted string 2565 case 3: // read a double quoted string 2566 switch (ch) { 2567 case '\'': 2568 if ((st == 2) && (mInp == inp)) { 2569 st = -1; 2570 } else { 2571 bappend(ch); 2572 } 2573 break; 2574 2575 case '\"': 2576 if ((st == 3) && (mInp == inp)) { 2577 st = -1; 2578 } else { 2579 bappend(ch); 2580 } 2581 break; 2582 2583 case '&': 2584 if (flag != 'd') { 2585 ent(flag); 2586 } else { 2587 bappend(ch); 2588 } 2589 break; 2590 2591 case '%': 2592 if (flag == 'd') { 2593 pent('-'); 2594 } else { 2595 bappend(ch); 2596 } 2597 break; 2598 2599 case '<': 2600 if ((flag == '-') || (flag == 'd')) { 2601 bappend(ch); 2602 } else { 2603 panic(FAULT); 2604 } 2605 break; 2606 2607 case EOS: // EOS before single/double quote 2608 panic(FAULT); 2609 2610 case '\r': // EOL processing [#2.11 & #3.3.3] 2611 if (flag != ' ' && mInp.next == null) { 2612 if (getch() != '\n') { 2613 bkch(); 2614 } 2615 ch = '\n'; 2616 } 2617 default: 2618 bappend(ch, flag); 2619 break; 2620 } 2621 break; 2622 2623 default: 2624 panic(FAULT); 2625 } 2626 } 2627 // There is maximum one space at the end of the string in 2628 // i-mode (non CDATA normalization) and it has to be removed. 2629 if ((flag == 'i') && (mBuff[mBuffIdx] == ' ')) { 2630 mBuffIdx -= 1; 2631 } 2632 } 2633 2634 /** 2635 * Reports characters and empties the parser's buffer. This method is called 2636 * only if parser is going to return control to the main loop. 
This means 2637 * that this method may use parser buffer to report white space without 2638 * copying characters to temporary buffer. 2639 */ 2640 protected abstract void bflash() 2641 throws Exception; 2642 2643 /** 2644 * Reports white space characters and empties the parser's buffer. This 2645 * method is called only if parser is going to return control to the main 2646 * loop. This means that this method may use parser buffer to report white 2647 * space without copying characters to temporary buffer. 2648 */ 2649 protected abstract void bflash_ws() 2650 throws Exception; 2651 2652 /** 2653 * Appends a character to parser's buffer with normalization. 2654 * 2655 * @param ch The character to append to the buffer. 2656 * @param mode The normalization mode. 2657 */ 2658 private void bappend(char ch, char mode) { 2659 // This implements attribute value normalization as 2660 // described in the XML specification [#3.3.3]. 2661 switch (mode) { 2662 case 'i': // non CDATA normalization 2663 switch (ch) { 2664 case ' ': 2665 case '\n': 2666 case '\r': 2667 case '\t': 2668 if ((mBuffIdx > 0) && (mBuff[mBuffIdx] != ' ')) { 2669 bappend(' '); 2670 } 2671 return; 2672 2673 default: 2674 break; 2675 } 2676 break; 2677 2678 case 'c': // CDATA normalization 2679 switch (ch) { 2680 case '\n': 2681 case '\r': 2682 case '\t': 2683 ch = ' '; 2684 break; 2685 2686 default: 2687 break; 2688 } 2689 break; 2690 2691 default: // no normalization 2692 break; 2693 } 2694 mBuffIdx++; 2695 if (mBuffIdx < mBuff.length) { 2696 mBuff[mBuffIdx] = ch; 2697 } else { 2698 mBuffIdx--; 2699 bappend(ch); 2700 } 2701 } 2702 2703 /** 2704 * Appends a character to parser's buffer. 2705 * 2706 * @param ch The character to append to the buffer. 
2707 */ 2708 private void bappend(char ch) { 2709 try { 2710 mBuff[++mBuffIdx] = ch; 2711 } catch (Exception exp) { 2712 // Double the buffer size 2713 char buff[] = new char[mBuff.length << 1]; 2714 System.arraycopy(mBuff, 0, buff, 0, mBuff.length); 2715 mBuff = buff; 2716 mBuff[mBuffIdx] = ch; 2717 } 2718 } 2719 2720 /** 2721 * Appends (mChIdx - cidx) characters from character buffer (mChars) to 2722 * parser's buffer (mBuff). 2723 * 2724 * @param cidx The character buffer (mChars) start index. 2725 * @param bidx The parser buffer (mBuff) start index. 2726 */ 2727 private void bcopy(int cidx, int bidx) { 2728 int length = mChIdx - cidx; 2729 if ((bidx + length + 1) >= mBuff.length) { 2730 // Expand the buffer 2731 char buff[] = new char[mBuff.length + length]; 2732 System.arraycopy(mBuff, 0, buff, 0, mBuff.length); 2733 mBuff = buff; 2734 } 2735 System.arraycopy(mChars, cidx, mBuff, bidx, length); 2736 mBuffIdx += length; 2737 } 2738 2739 /** 2740 * Recognizes the built-in entities <i>lt</i>, <i>gt</i>, <i>amp</i>, 2741 * <i>apos</i>, <i>quot</i>. The initial state is 0x100. Any state belowe 2742 * 0x100 is a built-in entity replacement character. 2743 * 2744 * @param ch the next character of an entity name. 2745 */ 2746 @SuppressWarnings("fallthrough") 2747 private void eappend(char ch) { 2748 switch (mESt) { 2749 case 0x100: // "l" or "g" or "a" or "q" 2750 switch (ch) { 2751 case 'l': 2752 mESt = 0x101; 2753 break; 2754 case 'g': 2755 mESt = 0x102; 2756 break; 2757 case 'a': 2758 mESt = 0x103; 2759 break; 2760 case 'q': 2761 mESt = 0x107; 2762 break; 2763 default: 2764 mESt = 0x200; 2765 break; 2766 } 2767 break; 2768 2769 case 0x101: // "lt" 2770 mESt = (ch == 't') ? '<' : (char) 0x200; 2771 break; 2772 2773 case 0x102: // "gt" 2774 mESt = (ch == 't') ? 
'>' : (char) 0x200; 2775 break; 2776 2777 case 0x103: // "am" or "ap" 2778 switch (ch) { 2779 case 'm': 2780 mESt = 0x104; 2781 break; 2782 case 'p': 2783 mESt = 0x105; 2784 break; 2785 default: 2786 mESt = 0x200; 2787 break; 2788 } 2789 break; 2790 2791 case 0x104: // "amp" 2792 mESt = (ch == 'p') ? '&' : (char) 0x200; 2793 break; 2794 2795 case 0x105: // "apo" 2796 mESt = (ch == 'o') ? (char) 0x106 : (char) 0x200; 2797 break; 2798 2799 case 0x106: // "apos" 2800 mESt = (ch == 's') ? '\'' : (char) 0x200; 2801 break; 2802 2803 case 0x107: // "qu" 2804 mESt = (ch == 'u') ? (char) 0x108 : (char) 0x200; 2805 break; 2806 2807 case 0x108: // "quo" 2808 mESt = (ch == 'o') ? (char) 0x109 : (char) 0x200; 2809 break; 2810 2811 case 0x109: // "quot" 2812 mESt = (ch == 't') ? '\"' : (char) 0x200; 2813 break; 2814 2815 case '<': // "lt" 2816 case '>': // "gt" 2817 case '&': // "amp" 2818 case '\'': // "apos" 2819 case '\"': // "quot" 2820 mESt = 0x200; 2821 default: 2822 break; 2823 } 2824 } 2825 2826 /** 2827 * Sets up a new input source on the top of the input stack. Note, the first 2828 * byte returned by the entity's byte stream has to be the first byte in the 2829 * entity. However, the parser does not expect the byte order mask in both 2830 * cases when encoding is provided by the input source. 2831 * 2832 * @param is A new input source to set up. 2833 * @exception IOException If any IO errors occur. 2834 * @exception Exception is parser specific exception form panic method. 2835 */ 2836 protected void setinp(InputSource is) 2837 throws Exception { 2838 Reader reader = null; 2839 mChIdx = 0; 2840 mChLen = 0; 2841 mChars = mInp.chars; 2842 mInp.src = null; 2843 if (mPh < PH_DOC_START) { 2844 mIsSAlone = false; // default [#2.9] 2845 } 2846 mIsSAloneSet = false; 2847 if (is.getCharacterStream() != null) { 2848 // Ignore encoding in the xml text decl. 
2849 reader = is.getCharacterStream(); 2850 xml(reader); 2851 } else if (is.getByteStream() != null) { 2852 String expenc; 2853 if (is.getEncoding() != null) { 2854 // Ignore encoding in the xml text decl. 2855 expenc = is.getEncoding().toUpperCase(); 2856 if (expenc.equals("UTF-16")) { 2857 reader = bom(is.getByteStream(), 'U'); // UTF-16 [#4.3.3] 2858 } else { 2859 reader = enc(expenc, is.getByteStream()); 2860 } 2861 xml(reader); 2862 } else { 2863 // Get encoding from BOM or the xml text decl. 2864 reader = bom(is.getByteStream(), ' '); 2865 /** 2866 * [#4.3.3] requires BOM for UTF-16, however, it's not uncommon 2867 * that it may be missing. A mature technique exists in Xerces 2868 * to further check for possible UTF-16 encoding 2869 */ 2870 if (reader == null) { 2871 reader = utf16(is.getByteStream()); 2872 } 2873 2874 if (reader == null) { 2875 // Encoding is defined by the xml text decl. 2876 reader = enc("UTF-8", is.getByteStream()); 2877 expenc = xml(reader); 2878 if (!expenc.equals("UTF-8")) { 2879 if (expenc.startsWith("UTF-16")) { 2880 panic(FAULT); // UTF-16 must have BOM [#4.3.3] 2881 } 2882 reader = enc(expenc, is.getByteStream()); 2883 } 2884 } else { 2885 // Encoding is defined by the BOM. 2886 xml(reader); 2887 } 2888 } 2889 } else { 2890 // There is no support for public/system identifiers. 2891 panic(FAULT); 2892 } 2893 mInp.src = reader; 2894 mInp.pubid = is.getPublicId(); 2895 mInp.sysid = is.getSystemId(); 2896 } 2897 2898 /** 2899 * Determines the entity encoding. 2900 * 2901 * This method gets encoding from Byte Order Mask [#4.3.3] if any. Note, the 2902 * first byte returned by the entity's byte stream has to be the first byte 2903 * in the entity. Also, there is no support for UCS-4. 2904 * 2905 * @param is A byte stream of the entity. 2906 * @param hint An encoding hint, character U means UTF-16. 2907 * @return a reader constructed from the BOM or UTF-8 by default. 
2908 * @exception Exception is parser specific exception form panic method. 2909 * @exception IOException 2910 */ 2911 private Reader bom(InputStream is, char hint) 2912 throws Exception { 2913 int val = is.read(); 2914 switch (val) { 2915 case 0xef: // UTF-8 2916 if (hint == 'U') // must be UTF-16 2917 { 2918 panic(FAULT); 2919 } 2920 if (is.read() != 0xbb) { 2921 panic(FAULT); 2922 } 2923 if (is.read() != 0xbf) { 2924 panic(FAULT); 2925 } 2926 return new ReaderUTF8(is); 2927 2928 case 0xfe: // UTF-16, big-endian 2929 if (is.read() != 0xff) { 2930 panic(FAULT); 2931 } 2932 return new ReaderUTF16(is, 'b'); 2933 2934 case 0xff: // UTF-16, little-endian 2935 if (is.read() != 0xfe) { 2936 panic(FAULT); 2937 } 2938 return new ReaderUTF16(is, 'l'); 2939 2940 case -1: 2941 mChars[mChIdx++] = EOS; 2942 return new ReaderUTF8(is); 2943 2944 default: 2945 if (hint == 'U') // must be UTF-16 2946 { 2947 panic(FAULT); 2948 } 2949 // Read the rest of UTF-8 character 2950 switch (val & 0xf0) { 2951 case 0xc0: 2952 case 0xd0: 2953 mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f)); 2954 break; 2955 2956 case 0xe0: 2957 mChars[mChIdx++] = (char) (((val & 0x0f) << 12) 2958 | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f)); 2959 break; 2960 2961 case 0xf0: // UCS-4 character 2962 throw new UnsupportedEncodingException(); 2963 2964 default: 2965 mChars[mChIdx++] = (char) val; 2966 break; 2967 } 2968 return null; 2969 } 2970 } 2971 2972 2973 /** 2974 * Using a mature technique from Xerces, this method checks further after 2975 * the bom method above to see if the encoding is UTF-16 2976 * 2977 * @param is A byte stream of the entity. 2978 * @return a reader, may be null 2979 * @exception Exception is parser specific exception form panic method. 2980 * @exception IOException 2981 */ 2982 private Reader utf16(InputStream is) 2983 throws Exception { 2984 if (mChIdx != 0) { 2985 //The bom method has read ONE byte into the buffer. 
2986 byte b0 = (byte)mChars[0]; 2987 if (b0 == 0x00 || b0 == 0x3C) { 2988 int b1 = is.read(); 2989 int b2 = is.read(); 2990 int b3 = is.read(); 2991 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 2992 // UTF-16, big-endian, no BOM 2993 mChars[0] = (char)(b1); 2994 mChars[mChIdx++] = (char)(b3); 2995 return new ReaderUTF16(is, 'b'); 2996 } else if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 2997 // UTF-16, little-endian, no BOM 2998 mChars[0] = (char)(b0); 2999 mChars[mChIdx++] = (char)(b2); 3000 return new ReaderUTF16(is, 'l'); 3001 } else { 3002 /**not every InputStream supports reset, so we have to remember 3003 * the state for further parsing 3004 **/ 3005 mChars[0] = (char)(b0); 3006 mChars[mChIdx++] = (char)(b1); 3007 mChars[mChIdx++] = (char)(b2); 3008 mChars[mChIdx++] = (char)(b3); 3009 } 3010 3011 } 3012 } 3013 return null; 3014 } 3015 /** 3016 * Parses the xml text declaration. 3017 * 3018 * This method gets encoding from the xml text declaration [#4.3.1] if any. 3019 * The method assumes the buffer (mChars) is big enough to accommodate whole 3020 * xml text declaration. 3021 * 3022 * @param reader is entity reader. 3023 * @return The xml text declaration encoding or default UTF-8 encoding. 3024 * @exception Exception is parser specific exception form panic method. 3025 * @exception IOException 3026 */ 3027 private String xml(Reader reader) 3028 throws Exception { 3029 String str = null; 3030 String enc = "UTF-8"; 3031 char ch; 3032 int val; 3033 short st = 0; 3034 int byteRead = mChIdx; //number of bytes read prior to entering this method 3035 3036 while (st >= 0 && mChIdx < mChars.length) { 3037 if (st < byteRead) { 3038 ch = mChars[st]; 3039 } else { 3040 ch = ((val = reader.read()) >= 0) ? 
(char) val : EOS; 3041 mChars[mChIdx++] = ch; 3042 } 3043 3044 switch (st) { 3045 case 0: // read '<' of xml declaration 3046 switch (ch) { 3047 case '<': 3048 st = 1; 3049 break; 3050 3051 case 0xfeff: // the byte order mask 3052 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 3053 mChars[mChIdx - 1] = ch; 3054 st = (short) ((ch == '<') ? 1 : -1); 3055 break; 3056 3057 default: 3058 st = -1; 3059 break; 3060 } 3061 break; 3062 3063 case 1: // read '?' of xml declaration [#4.3.1] 3064 st = (short) ((ch == '?') ? 2 : -1); 3065 break; 3066 3067 case 2: // read 'x' of xml declaration [#4.3.1] 3068 st = (short) ((ch == 'x') ? 3 : -1); 3069 break; 3070 3071 case 3: // read 'm' of xml declaration [#4.3.1] 3072 st = (short) ((ch == 'm') ? 4 : -1); 3073 break; 3074 3075 case 4: // read 'l' of xml declaration [#4.3.1] 3076 st = (short) ((ch == 'l') ? 5 : -1); 3077 break; 3078 3079 case 5: // read white space after 'xml' 3080 switch (ch) { 3081 case ' ': 3082 case '\t': 3083 case '\r': 3084 case '\n': 3085 st = 6; 3086 break; 3087 3088 default: 3089 st = -1; 3090 break; 3091 } 3092 break; 3093 3094 case 6: // read content of xml declaration 3095 switch (ch) { 3096 case '?': 3097 st = 7; 3098 break; 3099 3100 case EOS: 3101 st = -2; 3102 break; 3103 3104 default: 3105 break; 3106 } 3107 break; 3108 3109 case 7: // read '>' after '?' of xml declaration 3110 switch (ch) { 3111 case '>': 3112 case EOS: 3113 st = -2; 3114 break; 3115 3116 default: 3117 st = 6; 3118 break; 3119 } 3120 break; 3121 3122 default: 3123 panic(FAULT); 3124 break; 3125 } 3126 } 3127 mChLen = mChIdx; 3128 mChIdx = 0; 3129 // If there is no xml text declaration, the encoding is default. 
3130 if (st == -1) { 3131 return enc; 3132 } 3133 mChIdx = 5; // the first white space after "<?xml" 3134 // Parse the xml text declaration 3135 for (st = 0; st >= 0;) { 3136 ch = getch(); 3137 switch (st) { 3138 case 0: // skip spaces after the xml declaration name 3139 if (chtyp(ch) != ' ') { 3140 bkch(); 3141 st = 1; 3142 } 3143 break; 3144 3145 case 1: // read xml declaration version 3146 case 2: // read xml declaration encoding or standalone 3147 case 3: // read xml declaration standalone 3148 switch (chtyp(ch)) { 3149 case 'a': 3150 case 'A': 3151 case '_': 3152 bkch(); 3153 str = name(false).toLowerCase(); 3154 if ("version".equals(str) == true) { 3155 if (st != 1) { 3156 panic(FAULT); 3157 } 3158 if ("1.0".equals(eqstr('=')) != true) { 3159 panic(FAULT); 3160 } 3161 mInp.xmlver = 0x0100; 3162 st = 2; 3163 } else if ("encoding".equals(str) == true) { 3164 if (st != 2) { 3165 panic(FAULT); 3166 } 3167 mInp.xmlenc = eqstr('=').toUpperCase(); 3168 enc = mInp.xmlenc; 3169 st = 3; 3170 } else if ("standalone".equals(str) == true) { 3171 if ((st == 1) || (mPh >= PH_DOC_START)) // [#4.3.1] 3172 { 3173 panic(FAULT); 3174 } 3175 str = eqstr('=').toLowerCase(); 3176 // Check the 'standalone' value and use it [#5.1] 3177 if (str.equals("yes") == true) { 3178 mIsSAlone = true; 3179 } else if (str.equals("no") == true) { 3180 mIsSAlone = false; 3181 } else { 3182 panic(FAULT); 3183 } 3184 mIsSAloneSet = true; 3185 st = 4; 3186 } else { 3187 panic(FAULT); 3188 } 3189 break; 3190 3191 case ' ': 3192 break; 3193 3194 case '?': 3195 if (st == 1) { 3196 panic(FAULT); 3197 } 3198 bkch(); 3199 st = 4; 3200 break; 3201 3202 default: 3203 panic(FAULT); 3204 } 3205 break; 3206 3207 case 4: // end of xml declaration 3208 switch (chtyp(ch)) { 3209 case '?': 3210 if (getch() != '>') { 3211 panic(FAULT); 3212 } 3213 if (mPh <= PH_DOC_START) { 3214 mPh = PH_MISC_DTD; // misc before DTD 3215 } 3216 st = -1; 3217 break; 3218 3219 case ' ': 3220 break; 3221 3222 default: 3223 
panic(FAULT); 3224 } 3225 break; 3226 3227 default: 3228 panic(FAULT); 3229 } 3230 } 3231 return enc; 3232 } 3233 3234 /** 3235 * Sets up the document reader. 3236 * 3237 * @param name an encoding name. 3238 * @param is the document byte input stream. 3239 * @return a reader constructed from encoding name and input stream. 3240 * @exception UnsupportedEncodingException 3241 */ 3242 private Reader enc(String name, InputStream is) 3243 throws UnsupportedEncodingException { 3244 // DO NOT CLOSE current reader if any! 3245 if (name.equals("UTF-8")) { 3246 return new ReaderUTF8(is); 3247 } else if (name.equals("UTF-16LE")) { 3248 return new ReaderUTF16(is, 'l'); 3249 } else if (name.equals("UTF-16BE")) { 3250 return new ReaderUTF16(is, 'b'); 3251 } else { 3252 return new InputStreamReader(is, name); 3253 } 3254 } 3255 3256 /** 3257 * Sets up current input on the top of the input stack. 3258 * 3259 * @param inp A new input to set up. 3260 */ 3261 protected void push(Input inp) { 3262 mInp.chLen = mChLen; 3263 mInp.chIdx = mChIdx; 3264 inp.next = mInp; 3265 mInp = inp; 3266 mChars = inp.chars; 3267 mChLen = inp.chLen; 3268 mChIdx = inp.chIdx; 3269 } 3270 3271 /** 3272 * Restores previous input on the top of the input stack. 3273 */ 3274 protected void pop() { 3275 if (mInp.src != null) { 3276 try { 3277 mInp.src.close(); 3278 } catch (IOException ioe) { 3279 } 3280 mInp.src = null; 3281 } 3282 mInp = mInp.next; 3283 if (mInp != null) { 3284 mChars = mInp.chars; 3285 mChLen = mInp.chLen; 3286 mChIdx = mInp.chIdx; 3287 } else { 3288 mChars = null; 3289 mChLen = 0; 3290 mChIdx = 0; 3291 } 3292 } 3293 3294 /** 3295 * Maps a character to its type. 
3296 * 3297 * Possible character type values are: 3298 * <ul> 3299 * <li>' ' - for any kind of whitespace character;</li> 3300 * <li>'a' - for any lower case alphabetical character value;</li> 3301 * <li>'A' - for any upper case alphabetical character value;</li> 3302 * <li>'d' - for any decimal digit character value;</li> 3303 * <li>'z' - for any character less than ' ' except '\t', '\n', '\r';</li> 3304 * <li>'X' - for any not ASCII character;</li> 3305 * <li>'Z' - for EOS character.</li> 3306 * </ul> 3307 * An ASCII (7 bit) character which does not fall in any category 3308 * listed above is mapped to itself. 3309 * 3310 * @param ch The character to map. 3311 * @return The type of character. 3312 */ 3313 protected char chtyp(char ch) { 3314 if (ch < 0x80) { 3315 return (char) asctyp[ch]; 3316 } 3317 return (ch != EOS) ? 'X' : 'Z'; 3318 } 3319 3320 /** 3321 * Retrives the next character in the document. 3322 * 3323 * @return The next character in the document. 3324 */ 3325 protected char getch() 3326 throws IOException { 3327 if (mChIdx >= mChLen) { 3328 if (mInp.src == null) { 3329 pop(); // remove internal entity 3330 return getch(); 3331 } 3332 // Read new portion of the document characters 3333 int Num = mInp.src.read(mChars, 0, mChars.length); 3334 if (Num < 0) { 3335 if (mInp != mDoc) { 3336 pop(); // restore the previous input 3337 return getch(); 3338 } else { 3339 mChars[0] = EOS; 3340 mChLen = 1; 3341 } 3342 } else { 3343 mChLen = Num; 3344 } 3345 mChIdx = 0; 3346 } 3347 return mChars[mChIdx++]; 3348 } 3349 3350 /** 3351 * Puts back the last read character. 3352 * 3353 * This method <strong>MUST NOT</strong> be called more then once after each 3354 * call of {@link #getch getch} method. 3355 */ 3356 protected void bkch() 3357 throws Exception { 3358 if (mChIdx <= 0) { 3359 panic(FAULT); 3360 } 3361 mChIdx--; 3362 } 3363 3364 /** 3365 * Sets the current character. 3366 * 3367 * @param ch The character to set. 
3368 */ 3369 protected void setch(char ch) { 3370 mChars[mChIdx] = ch; 3371 } 3372 3373 /** 3374 * Finds a pair in the pair chain by a qualified name. 3375 * 3376 * @param chain The first element of the chain of pairs. 3377 * @param qname The qualified name. 3378 * @return A pair with the specified qualified name or null. 3379 */ 3380 protected Pair find(Pair chain, char[] qname) { 3381 for (Pair pair = chain; pair != null; pair = pair.next) { 3382 if (pair.eqname(qname) == true) { 3383 return pair; 3384 } 3385 } 3386 return null; 3387 } 3388 3389 /** 3390 * Provedes an instance of a pair. 3391 * 3392 * @param next The reference to a next pair. 3393 * @return An instance of a pair. 3394 */ 3395 protected Pair pair(Pair next) { 3396 Pair pair; 3397 3398 if (mDltd != null) { 3399 pair = mDltd; 3400 mDltd = pair.next; 3401 } else { 3402 pair = new Pair(); 3403 } 3404 pair.next = next; 3405 3406 return pair; 3407 } 3408 3409 /** 3410 * Deletes an instance of a pair. 3411 * 3412 * @param pair The pair to delete. 3413 * @return A reference to the next pair in a chain. 3414 */ 3415 protected Pair del(Pair pair) { 3416 Pair next = pair.next; 3417 3418 pair.name = null; 3419 pair.value = null; 3420 pair.chars = null; 3421 pair.list = null; 3422 pair.next = mDltd; 3423 mDltd = pair; 3424 3425 return next; 3426 } 3427} 3428