Parser.java revision 16993:79e099ab284b
1/* 2 * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26package javax.swing.text.html.parser; 27 28import javax.swing.text.SimpleAttributeSet; 29import javax.swing.text.html.HTML; 30import javax.swing.text.ChangedCharSetException; 31import java.io.*; 32import java.util.Hashtable; 33import java.util.Properties; 34import java.util.Vector; 35import java.util.Enumeration; 36import java.net.URL; 37 38/** 39 * A simple DTD-driven HTML parser. The parser reads an 40 * HTML file from an InputStream and calls various methods 41 * (which should be overridden in a subclass) when tags and 42 * data are encountered. 43 * <p> 44 * Unfortunately there are many badly implemented HTML parsers 45 * out there, and as a result there are many badly formatted 46 * HTML files. 
This parser attempts to parse most HTML files. 47 * This means that the implementation sometimes deviates from 48 * the SGML specification in favor of HTML. 49 * <p> 50 * The parser treats \r and \r\n as \n. Newlines after starttags 51 * and before end tags are ignored just as specified in the SGML/HTML 52 * specification. 53 * <p> 54 * The html spec does not specify how spaces are to be coalesced very well. 55 * Specifically, the following scenarios are not discussed (note that a 56 * space should be used here, but I am using &nbsp to force the space to 57 * be displayed): 58 * <p> 59 * '<b>blah <i> <strike> foo' which can be treated as: 60 * '<b>blah <i><strike>foo' 61 * <p>as well as: 62 * '<p><a href="xx"> <em>Using</em></a></p>' 63 * which appears to be treated as: 64 * '<p><a href="xx"><em>Using</em></a></p>' 65 * <p> 66 * If <code>strict</code> is false, when a tag that breaks flow, 67 * (<code>TagElement.breaksFlows</code>) or trailing whitespace is 68 * encountered, all whitespace will be ignored until a non whitespace 69 * character is encountered. This appears to give behavior closer to 70 * the popular browsers. 71 * 72 * @see DTD 73 * @see TagElement 74 * @see SimpleAttributeSet 75 * @author Arthur van Hoff 76 * @author Sunita Mani 77 */ 78public 79class Parser implements DTDConstants { 80 81 private char text[] = new char[1024]; 82 private int textpos = 0; 83 private TagElement last; 84 private boolean space; 85 86 private char str[] = new char[128]; 87 private int strpos = 0; 88 89 /** 90 * The dtd. 91 */ 92 protected DTD dtd = null; 93 94 private int ch; 95 private int ln; 96 private Reader in; 97 98 private Element recent; 99 private TagStack stack; 100 private boolean skipTag = false; 101 private TagElement lastFormSent = null; 102 private SimpleAttributeSet attributes = new SimpleAttributeSet(); 103 104 // State for <html>, <head> and <body>. 
Since people like to slap 105 // together HTML documents without thinking, occasionally they 106 // have multiple instances of these tags. These booleans track 107 // the first sightings of these tags so they can be safely ignored 108 // by the parser if repeated. 109 private boolean seenHtml = false; 110 private boolean seenHead = false; 111 private boolean seenBody = false; 112 113 /** 114 * The html spec does not specify how spaces are coalesced very well. 115 * If strict == false, ignoreSpace is used to try and mimic the behavior 116 * of the popular browsers. 117 * <p> 118 * The problematic scenarios are: 119 * '<b>blah <i> <strike> foo' which can be treated as: 120 * '<b>blah <i><strike>foo' 121 * as well as: 122 * '<p><a href="xx"> <em>Using</em></a></p>' 123 * which appears to be treated as: 124 * '<p><a href="xx"><em>Using</em></a></p>' 125 * <p> 126 * When a tag that breaks flow, or trailing whitespace is encountered 127 * ignoreSpace is set to true. From then on, all whitespace will be 128 * ignored. 129 * ignoreSpace will be set back to false the first time a 130 * non whitespace character is encountered. This appears to give 131 * behavior closer to the popular browsers. 132 */ 133 private boolean ignoreSpace; 134 135 /** 136 * This flag determines whether or not the Parser will be strict 137 * in enforcing SGML compatibility. If false, it will be lenient 138 * with certain common classes of erroneous HTML constructs. 139 * Strict or not, in either case an error will be recorded. 140 * 141 */ 142 protected boolean strict = false; 143 144 145 /** Number of \r\n's encountered. */ 146 private int crlfCount; 147 /** Number of \r's encountered. A \r\n will not increment this. */ 148 private int crCount; 149 /** Number of \n's encountered. A \r\n will not increment this. */ 150 private int lfCount; 151 152 // 153 // To correctly identify the start of a tag/comment/text we need two 154 // ivars. 
Two are needed as handleText isn't invoked until the tag 155 // after the text has been parsed, that is the parser parses the text, 156 // then a tag, then invokes handleText followed by handleStart. 157 // 158 /** The start position of the current block. Block is overloaded here, 159 * it really means the current start position for the current comment, 160 * tag, text. Use getBlockStartPosition to access this. */ 161 private int currentBlockStartPos; 162 /** Start position of the last block. */ 163 private int lastBlockStartPos; 164 165 /** 166 * array for mapping numeric references in range 167 * 130-159 to displayable Unicode characters. 168 */ 169 private static final char[] cp1252Map = { 170 8218, // ‚ 171 402, // ƒ 172 8222, // „ 173 8230, // … 174 8224, // † 175 8225, // ‡ 176 710, // ˆ 177 8240, // ‰ 178 352, // Š 179 8249, // ‹ 180 338, // Œ 181 141, //  182 142, // Ž 183 143, //  184 144, //  185 8216, // ‘ 186 8217, // ’ 187 8220, // “ 188 8221, // ” 189 8226, // • 190 8211, // – 191 8212, // — 192 732, // ˜ 193 8482, // ™ 194 353, // š 195 8250, // › 196 339, // œ 197 157, //  198 158, // ž 199 376 // Ÿ 200 }; 201 202 /** 203 * Creates parser with the specified {@code dtd}. 204 * 205 * @param dtd the dtd. 206 */ 207 public Parser(DTD dtd) { 208 this.dtd = dtd; 209 } 210 211 212 /** 213 * @return the line number of the line currently being parsed 214 */ 215 protected int getCurrentLine() { 216 return ln; 217 } 218 219 /** 220 * Returns the start position of the current block. Block is 221 * overloaded here, it really means the current start position for 222 * the current comment tag, text, block.... This is provided for 223 * subclassers that wish to know the start of the current block when 224 * called with one of the handleXXX methods. 225 * 226 * @return the start position of the current block 227 */ 228 int getBlockStartPosition() { 229 return Math.max(0, lastBlockStartPos - 1); 230 } 231 232 /** 233 * Makes a TagElement. 
234 * 235 * @param elem the element storing the tag definition 236 * @param fictional the value of the flag "{@code fictional}" to be set for the tag 237 * 238 * @return the created {@code TagElement} 239 */ 240 protected TagElement makeTag(Element elem, boolean fictional) { 241 return new TagElement(elem, fictional); 242 } 243 244 /** 245 * Makes a TagElement. 246 * 247 * @param elem the element storing the tag definition 248 * 249 * @return the created {@code TagElement} 250 */ 251 protected TagElement makeTag(Element elem) { 252 return makeTag(elem, false); 253 } 254 255 /** 256 * Returns attributes for the current tag. 257 * 258 * @return {@code SimpleAttributeSet} containing the attributes 259 */ 260 protected SimpleAttributeSet getAttributes() { 261 return attributes; 262 } 263 264 /** 265 * Removes the current attributes. 266 */ 267 protected void flushAttributes() { 268 attributes.removeAttributes(attributes); 269 } 270 271 /** 272 * Called when PCDATA is encountered. 273 * 274 * @param text the section text 275 */ 276 protected void handleText(char text[]) { 277 } 278 279 /** 280 * Called when an HTML title tag is encountered. 281 * 282 * @param text the title text 283 */ 284 protected void handleTitle(char text[]) { 285 // default behavior is to call handleText. Subclasses 286 // can override if necessary. 287 handleText(text); 288 } 289 290 /** 291 * Called when an HTML comment is encountered. 292 * 293 * @param text the comment being handled 294 */ 295 protected void handleComment(char text[]) { 296 } 297 298 /** 299 * Called when the content terminates without closing the HTML comment. 300 */ 301 protected void handleEOFInComment() { 302 // We've reached EOF. Our recovery strategy is to 303 // see if we have more than one line in the comment; 304 // if so, we pretend that the comment was an unterminated 305 // single line comment, and reparse the lines after the 306 // first line as normal HTML content. 
307 308 int commentEndPos = strIndexOf('\n'); 309 if (commentEndPos >= 0) { 310 handleComment(getChars(0, commentEndPos)); 311 try { 312 in.close(); 313 in = new CharArrayReader(getChars(commentEndPos + 1)); 314 ch = '>'; 315 } catch (IOException e) { 316 error("ioexception"); 317 } 318 319 resetStrBuffer(); 320 } else { 321 // no newline, so signal an error 322 error("eof.comment"); 323 } 324 } 325 326 /** 327 * Called when an empty tag is encountered. 328 * 329 * @param tag the tag being handled 330 * @throws ChangedCharSetException if the document charset was changed 331 */ 332 protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { 333 } 334 335 /** 336 * Called when a start tag is encountered. 337 * 338 * @param tag the tag being handled 339 */ 340 protected void handleStartTag(TagElement tag) { 341 } 342 343 /** 344 * Called when an end tag is encountered. 345 * 346 * @param tag the tag being handled 347 */ 348 protected void handleEndTag(TagElement tag) { 349 } 350 351 /** 352 * An error has occurred. 353 * 354 * @param ln the number of line containing the error 355 * @param msg the error message 356 */ 357 protected void handleError(int ln, String msg) { 358 /* 359 Thread.dumpStack(); 360 System.out.println("**** " + stack); 361 System.out.println("line " + ln + ": error: " + msg); 362 System.out.println(); 363 */ 364 } 365 366 /** 367 * Output text. 
368 */ 369 void handleText(TagElement tag) { 370 if (tag.breaksFlow()) { 371 space = false; 372 if (!strict) { 373 ignoreSpace = true; 374 } 375 } 376 if (textpos == 0) { 377 if ((!space) || (stack == null) || last.breaksFlow() || 378 !stack.advance(dtd.pcdata)) { 379 last = tag; 380 space = false; 381 lastBlockStartPos = currentBlockStartPos; 382 return; 383 } 384 } 385 if (space) { 386 if (!ignoreSpace) { 387 // enlarge buffer if needed 388 if (textpos + 1 > text.length) { 389 char newtext[] = new char[text.length + 200]; 390 System.arraycopy(text, 0, newtext, 0, text.length); 391 text = newtext; 392 } 393 394 // output pending space 395 text[textpos++] = ' '; 396 if (!strict && !tag.getElement().isEmpty()) { 397 ignoreSpace = true; 398 } 399 } 400 space = false; 401 } 402 char newtext[] = new char[textpos]; 403 System.arraycopy(text, 0, newtext, 0, textpos); 404 // Handles cases of bad html where the title tag 405 // was getting lost when we did error recovery. 406 if (tag.getElement().getName().equals("title")) { 407 handleTitle(newtext); 408 } else { 409 handleText(newtext); 410 } 411 lastBlockStartPos = currentBlockStartPos; 412 textpos = 0; 413 last = tag; 414 space = false; 415 } 416 417 /** 418 * Invokes the error handler. 419 * 420 * @param err the error type 421 * @param arg1 the 1st error message argument 422 * @param arg2 the 2nd error message argument 423 * @param arg3 the 3rd error message argument 424 */ 425 protected void error(String err, String arg1, String arg2, 426 String arg3) { 427 handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3); 428 } 429 430 /** 431 * Invokes the error handler with the 3rd error message argument "?". 
432 * 433 * @param err the error type 434 * @param arg1 the 1st error message argument 435 * @param arg2 the 2nd error message argument 436 */ 437 protected void error(String err, String arg1, String arg2) { 438 error(err, arg1, arg2, "?"); 439 } 440 441 /** 442 * Invokes the error handler with the 2nd and 3rd error message argument "?". 443 * 444 * @param err the error type 445 * @param arg1 the 1st error message argument 446 */ 447 protected void error(String err, String arg1) { 448 error(err, arg1, "?", "?"); 449 } 450 451 /** 452 * Invokes the error handler with the 1st, 2nd and 3rd error message argument "?". 453 * 454 * @param err the error type 455 */ 456 protected void error(String err) { 457 error(err, "?", "?", "?"); 458 } 459 460 461 /** 462 * Handle a start tag. The new tag is pushed 463 * onto the tag stack. The attribute list is 464 * checked for required attributes. 465 * 466 * @param tag the tag 467 * @throws ChangedCharSetException if the document charset was changed 468 */ 469 protected void startTag(TagElement tag) throws ChangedCharSetException { 470 Element elem = tag.getElement(); 471 472 // If the tag is an empty tag and texpos != 0 473 // this implies that there is text before the 474 // start tag that needs to be processed before 475 // handling the tag. 476 // 477 if (!elem.isEmpty() || 478 ((last != null) && !last.breaksFlow()) || 479 (textpos != 0)) { 480 handleText(tag); 481 } else { 482 // this variable gets updated in handleText(). 483 // Since in this case we do not call handleText() 484 // we need to update it here. 485 // 486 last = tag; 487 // Note that we should really check last.breakFlows before 488 // assuming this should be false. 
489 space = false; 490 } 491 lastBlockStartPos = currentBlockStartPos; 492 493 // check required attributes 494 for (AttributeList a = elem.atts ; a != null ; a = a.next) { 495 if ((a.modifier == REQUIRED) && 496 ((attributes.isEmpty()) || 497 ((!attributes.isDefined(a.name)) && 498 (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) { 499 error("req.att ", a.getName(), elem.getName()); 500 } 501 } 502 503 if (elem.isEmpty()) { 504 handleEmptyTag(tag); 505 /* 506 } else if (elem.getName().equals("form")) { 507 handleStartTag(tag); 508 */ 509 } else { 510 recent = elem; 511 stack = new TagStack(tag, stack); 512 handleStartTag(tag); 513 } 514 } 515 516 /** 517 * Handle an end tag. The end tag is popped 518 * from the tag stack. 519 * 520 * @param omitted {@code true} if the tag is no actually present in the 521 * document, but is supposed by the parser 522 */ 523 protected void endTag(boolean omitted) { 524 handleText(stack.tag); 525 526 if (omitted && !stack.elem.omitEnd()) { 527 error("end.missing", stack.elem.getName()); 528 } else if (!stack.terminate()) { 529 error("end.unexpected", stack.elem.getName()); 530 } 531 532 // handle the tag 533 handleEndTag(stack.tag); 534 stack = stack.next; 535 recent = (stack != null) ? stack.elem : null; 536 } 537 538 539 boolean ignoreElement(Element elem) { 540 541 String stackElement = stack.elem.getName(); 542 String elemName = elem.getName(); 543 /* We ignore all elements that are not valid in the context of 544 a table except <td>, <th> (these we handle in 545 legalElementContext()) and #pcdata. We also ignore the 546 <font> tag in the context of <ul> and <ol> We additonally 547 ignore the <meta> and the <style> tag if the body tag has 548 been seen. 
**/ 549 if ((elemName.equals("html") && seenHtml) || 550 (elemName.equals("head") && seenHead) || 551 (elemName.equals("body") && seenBody)) { 552 return true; 553 } 554 if (elemName.equals("dt") || elemName.equals("dd")) { 555 TagStack s = stack; 556 while (s != null && !s.elem.getName().equals("dl")) { 557 s = s.next; 558 } 559 if (s == null) { 560 return true; 561 } 562 } 563 564 if (((stackElement.equals("table")) && 565 (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) || 566 ((elemName.equals("font")) && 567 (stackElement.equals("ul") || stackElement.equals("ol"))) || 568 (elemName.equals("meta") && stack != null) || 569 (elemName.equals("style") && seenBody) || 570 (stackElement.equals("table") && elemName.equals("a"))) { 571 return true; 572 } 573 return false; 574 } 575 576 577 /** 578 * Marks the first time a tag has been seen in a document 579 * 580 * @param elem the element represented by the tag 581 */ 582 583 protected void markFirstTime(Element elem) { 584 String elemName = elem.getName(); 585 if (elemName.equals("html")) { 586 seenHtml = true; 587 } else if (elemName.equals("head")) { 588 seenHead = true; 589 } else if (elemName.equals("body")) { 590 if (buf.length == 1) { 591 // Refer to note in definition of buf for details on this. 592 char[] newBuf = new char[256]; 593 594 newBuf[0] = buf[0]; 595 buf = newBuf; 596 } 597 seenBody = true; 598 } 599 } 600 601 /** 602 * Create a legal content for an element. 
603 */ 604 boolean legalElementContext(Element elem) throws ChangedCharSetException { 605 606 // System.out.println("-- legalContext -- " + elem); 607 608 // Deal with the empty stack 609 if (stack == null) { 610 // System.out.println("-- stack is empty"); 611 if (elem != dtd.html) { 612 // System.out.println("-- pushing html"); 613 startTag(makeTag(dtd.html, true)); 614 return legalElementContext(elem); 615 } 616 return true; 617 } 618 619 // Is it allowed in the current context 620 if (stack.advance(elem)) { 621 // System.out.println("-- legal context"); 622 markFirstTime(elem); 623 return true; 624 } 625 boolean insertTag = false; 626 627 // The use of all error recovery strategies are contingent 628 // on the value of the strict property. 629 // 630 // These are commonly occurring errors. if insertTag is true, 631 // then we want to adopt an error recovery strategy that 632 // involves attempting to insert an additional tag to 633 // legalize the context. The two errors addressed here 634 // are: 635 // 1) when a <td> or <th> is seen soon after a <table> tag. 636 // In this case we insert a <tr>. 637 // 2) when any other tag apart from a <tr> is seen 638 // in the context of a <tr>. In this case we would 639 // like to add a <td>. If a <tr> is seen within a 640 // <tr> context, then we will close out the current 641 // <tr>. 642 // 643 // This insertion strategy is handled later in the method. 644 // The reason for checking this now, is that in other cases 645 // we would like to apply other error recovery strategies for example 646 // ignoring tags. 647 // 648 // In certain cases it is better to ignore a tag than try to 649 // fix the situation. So the first test is to see if this 650 // is what we need to do. 
651 // 652 String stackElemName = stack.elem.getName(); 653 String elemName = elem.getName(); 654 655 656 if (!strict && 657 ((stackElemName.equals("table") && elemName.equals("td")) || 658 (stackElemName.equals("table") && elemName.equals("th")) || 659 (stackElemName.equals("tr") && !elemName.equals("tr")))){ 660 insertTag = true; 661 } 662 663 664 if (!strict && !insertTag && (stack.elem.getName() != elem.getName() || 665 elem.getName().equals("body"))) { 666 if (skipTag = ignoreElement(elem)) { 667 error("tag.ignore", elem.getName()); 668 return skipTag; 669 } 670 } 671 672 // Check for anything after the start of the table besides tr, td, th 673 // or caption, and if those aren't there, insert the <tr> and call 674 // legalElementContext again. 675 if (!strict && stackElemName.equals("table") && 676 !elemName.equals("tr") && !elemName.equals("td") && 677 !elemName.equals("th") && !elemName.equals("caption")) { 678 Element e = dtd.getElement("tr"); 679 TagElement t = makeTag(e, true); 680 legalTagContext(t); 681 startTag(t); 682 error("start.missing", elem.getName()); 683 return legalElementContext(elem); 684 } 685 686 // They try to find a legal context by checking if the current 687 // tag is valid in an enclosing context. If so 688 // close out the tags by outputing end tags and then 689 // insert the current tag. If the tags that are 690 // being closed out do not have an optional end tag 691 // specification in the DTD then an html error is 692 // reported. 693 // 694 if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) { 695 for (TagStack s = stack.next ; s != null ; s = s.next) { 696 if (s.advance(elem)) { 697 while (stack != s) { 698 endTag(true); 699 } 700 return true; 701 } 702 if (!s.terminate() || (strict && !s.elem.omitEnd())) { 703 break; 704 } 705 } 706 } 707 708 // Check if we know what tag is expected next. 709 // If so insert the tag. Report an error if the 710 // tag does not have its start tag spec in the DTD as optional. 
711 // 712 Element next = stack.first(); 713 if (next != null && (!strict || next.omitStart()) && 714 !(next==dtd.head && elem==dtd.pcdata) ) { 715 // System.out.println("-- omitting start tag: " + next); 716 TagElement t = makeTag(next, true); 717 legalTagContext(t); 718 startTag(t); 719 if (!next.omitStart()) { 720 error("start.missing", elem.getName()); 721 } 722 return legalElementContext(elem); 723 } 724 725 726 // Traverse the list of expected elements and determine if adding 727 // any of these elements would make for a legal context. 728 // 729 730 if (!strict) { 731 ContentModel content = stack.contentModel(); 732 Vector<Element> elemVec = new Vector<Element>(); 733 if (content != null) { 734 content.getElements(elemVec); 735 for (Element e : elemVec) { 736 // Ensure that this element has not been included as 737 // part of the exclusions in the DTD. 738 // 739 if (stack.excluded(e.getIndex())) { 740 continue; 741 } 742 743 boolean reqAtts = false; 744 745 for (AttributeList a = e.getAttributes(); a != null ; a = a.next) { 746 if (a.modifier == REQUIRED) { 747 reqAtts = true; 748 break; 749 } 750 } 751 // Ensure that no tag that has required attributes 752 // gets inserted. 753 // 754 if (reqAtts) { 755 continue; 756 } 757 758 ContentModel m = e.getContent(); 759 if (m != null && m.first(elem)) { 760 // System.out.println("-- adding a legal tag: " + e); 761 TagElement t = makeTag(e, true); 762 legalTagContext(t); 763 startTag(t); 764 error("start.missing", e.getName()); 765 return legalElementContext(elem); 766 } 767 } 768 } 769 } 770 771 // Check if the stack can be terminated. If so add the appropriate 772 // end tag. Report an error if the tag being ended does not have its 773 // end tag spec in the DTD as optional. 
774 // 775 if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) { 776 // System.out.println("-- omitting end tag: " + stack.elem); 777 if (!stack.elem.omitEnd()) { 778 error("end.missing", elem.getName()); 779 } 780 781 endTag(true); 782 return legalElementContext(elem); 783 } 784 785 // At this point we know that something is screwed up. 786 return false; 787 } 788 789 /** 790 * Create a legal context for a tag. 791 */ 792 void legalTagContext(TagElement tag) throws ChangedCharSetException { 793 if (legalElementContext(tag.getElement())) { 794 markFirstTime(tag.getElement()); 795 return; 796 } 797 798 // Avoid putting a block tag in a flow tag. 799 if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) { 800 endTag(true); 801 legalTagContext(tag); 802 return; 803 } 804 805 // Avoid putting something wierd in the head of the document. 806 for (TagStack s = stack ; s != null ; s = s.next) { 807 if (s.tag.getElement() == dtd.head) { 808 while (stack != s) { 809 endTag(true); 810 } 811 endTag(true); 812 legalTagContext(tag); 813 return; 814 } 815 } 816 817 // Everything failed 818 error("tag.unexpected", tag.getElement().getName()); 819 } 820 821 /** 822 * Error context. Something went wrong, make sure we are in 823 * the document's body context 824 */ 825 void errorContext() throws ChangedCharSetException { 826 for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) { 827 handleEndTag(stack.tag); 828 } 829 if (stack == null) { 830 legalElementContext(dtd.body); 831 startTag(makeTag(dtd.body, true)); 832 } 833 } 834 835 /** 836 * Add a char to the string buffer. 837 */ 838 void addString(int c) { 839 if (strpos == str.length) { 840 char newstr[] = new char[str.length + 128]; 841 System.arraycopy(str, 0, newstr, 0, str.length); 842 str = newstr; 843 } 844 str[strpos++] = (char)c; 845 } 846 847 /** 848 * Get the string that's been accumulated. 
849 */ 850 String getString(int pos) { 851 char newStr[] = new char[strpos - pos]; 852 System.arraycopy(str, pos, newStr, 0, strpos - pos); 853 strpos = pos; 854 return new String(newStr); 855 } 856 857 char[] getChars(int pos) { 858 char newStr[] = new char[strpos - pos]; 859 System.arraycopy(str, pos, newStr, 0, strpos - pos); 860 strpos = pos; 861 return newStr; 862 } 863 864 char[] getChars(int pos, int endPos) { 865 char newStr[] = new char[endPos - pos]; 866 System.arraycopy(str, pos, newStr, 0, endPos - pos); 867 // REMIND: it's not clear whether this version should set strpos or not 868 // strpos = pos; 869 return newStr; 870 } 871 872 void resetStrBuffer() { 873 strpos = 0; 874 } 875 876 int strIndexOf(char target) { 877 for (int i = 0; i < strpos; i++) { 878 if (str[i] == target) { 879 return i; 880 } 881 } 882 883 return -1; 884 } 885 886 /** 887 * Skip space. 888 * [5] 297:5 889 */ 890 void skipSpace() throws IOException { 891 while (true) { 892 switch (ch) { 893 case '\n': 894 ln++; 895 ch = readCh(); 896 lfCount++; 897 break; 898 899 case '\r': 900 ln++; 901 if ((ch = readCh()) == '\n') { 902 ch = readCh(); 903 crlfCount++; 904 } 905 else { 906 crCount++; 907 } 908 break; 909 case ' ': 910 case '\t': 911 ch = readCh(); 912 break; 913 914 default: 915 return; 916 } 917 } 918 } 919 920 /** 921 * Parse identifier. Uppercase characters are folded 922 * to lowercase when lower is true. Returns falsed if 923 * no identifier is found. 
[55] 346:17 924 */ 925 boolean parseIdentifier(boolean lower) throws IOException { 926 switch (ch) { 927 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 928 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 929 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 930 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 931 case 'Y': case 'Z': 932 if (lower) { 933 ch = 'a' + (ch - 'A'); 934 } 935 break; 936 937 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 938 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 939 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 940 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 941 case 'y': case 'z': 942 break; 943 944 default: 945 return false; 946 } 947 948 while (true) { 949 addString(ch); 950 951 switch (ch = readCh()) { 952 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 953 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 954 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 955 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 956 case 'Y': case 'Z': 957 if (lower) { 958 ch = 'a' + (ch - 'A'); 959 } 960 break; 961 962 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 963 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 964 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 965 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 966 case 'y': case 'z': 967 968 case '0': case '1': case '2': case '3': case '4': 969 case '5': case '6': case '7': case '8': case '9': 970 971 case '.': case '-': 972 973 case '_': // not officially allowed 974 break; 975 976 default: 977 return true; 978 } 979 } 980 } 981 982 /** 983 * Parse an entity reference. 
[59] 350:17 984 */ 985 private char[] parseEntityReference() throws IOException { 986 int pos = strpos; 987 988 if ((ch = readCh()) == '#') { 989 int n = 0; 990 ch = readCh(); 991 if ((ch >= '0') && (ch <= '9') || 992 ch == 'x' || ch == 'X') { 993 994 if ((ch >= '0') && (ch <= '9')) { 995 // parse decimal reference 996 while ((ch >= '0') && (ch <= '9')) { 997 n = (n * 10) + ch - '0'; 998 ch = readCh(); 999 } 1000 } else { 1001 // parse hexadecimal reference 1002 ch = readCh(); 1003 char lch = (char) Character.toLowerCase(ch); 1004 while ((lch >= '0') && (lch <= '9') || 1005 (lch >= 'a') && (lch <= 'f')) { 1006 if (lch >= '0' && lch <= '9') { 1007 n = (n * 16) + lch - '0'; 1008 } else { 1009 n = (n * 16) + lch - 'a' + 10; 1010 } 1011 ch = readCh(); 1012 lch = (char) Character.toLowerCase(ch); 1013 } 1014 } 1015 switch (ch) { 1016 case '\n': 1017 ln++; 1018 ch = readCh(); 1019 lfCount++; 1020 break; 1021 1022 case '\r': 1023 ln++; 1024 if ((ch = readCh()) == '\n') { 1025 ch = readCh(); 1026 crlfCount++; 1027 } 1028 else { 1029 crCount++; 1030 } 1031 break; 1032 1033 case ';': 1034 ch = readCh(); 1035 break; 1036 } 1037 char data[] = mapNumericReference(n); 1038 return data; 1039 } 1040 addString('#'); 1041 if (!parseIdentifier(false)) { 1042 error("ident.expected"); 1043 strpos = pos; 1044 char data[] = {'&', '#'}; 1045 return data; 1046 } 1047 } else if (!parseIdentifier(false)) { 1048 char data[] = {'&'}; 1049 return data; 1050 } 1051 1052 boolean semicolon = false; 1053 1054 switch (ch) { 1055 case '\n': 1056 ln++; 1057 ch = readCh(); 1058 lfCount++; 1059 break; 1060 1061 case '\r': 1062 ln++; 1063 if ((ch = readCh()) == '\n') { 1064 ch = readCh(); 1065 crlfCount++; 1066 } 1067 else { 1068 crCount++; 1069 } 1070 break; 1071 1072 case ';': 1073 semicolon = true; 1074 1075 ch = readCh(); 1076 break; 1077 } 1078 1079 String nm = getString(pos); 1080 Entity ent = dtd.getEntity(nm); 1081 1082 // entities are case sensitive - however if strict 1083 // is false then we 
will try to make a match by 1084 // converting the string to all lowercase. 1085 // 1086 if (!strict && (ent == null)) { 1087 ent = dtd.getEntity(nm.toLowerCase()); 1088 } 1089 if ((ent == null) || !ent.isGeneral()) { 1090 1091 if (nm.length() == 0) { 1092 error("invalid.entref", nm); 1093 return new char[0]; 1094 } 1095 /* given that there is not a match restore the entity reference */ 1096 String str = "&" + nm + (semicolon ? ";" : ""); 1097 1098 char b[] = new char[str.length()]; 1099 str.getChars(0, b.length, b, 0); 1100 return b; 1101 } 1102 return ent.getData(); 1103 } 1104 1105 /** 1106 * Converts numeric character reference to char array. 1107 * 1108 * Normally the code in a reference should be always converted 1109 * to the Unicode character with the same code, but due to 1110 * wide usage of Cp1252 charset most browsers map numeric references 1111 * in the range 130-159 (which are control chars in Unicode set) 1112 * to displayable characters with other codes. 1113 * 1114 * @param c the code of numeric character reference. 1115 * @return a char array corresponding to the reference code. 1116 */ 1117 private char[] mapNumericReference(int c) { 1118 char[] data; 1119 if (c >= 0xffff) { // outside unicode BMP. 1120 try { 1121 data = Character.toChars(c); 1122 } catch (IllegalArgumentException e) { 1123 data = new char[0]; 1124 } 1125 } else { 1126 data = new char[1]; 1127 data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130]; 1128 } 1129 return data; 1130 } 1131 1132 /** 1133 * Parse a comment. [92] 391:7 1134 */ 1135 void parseComment() throws IOException { 1136 1137 while (true) { 1138 int c = ch; 1139 switch (c) { 1140 case '-': 1141 /** Presuming that the start string of a comment "<!--" has 1142 already been parsed, the '-' character is valid only as 1143 part of a comment termination and further more it must 1144 be present in even numbers. Hence if strict is true, we 1145 presume the comment has been terminated and return. 
1146 However if strict is false, then there is no even number 1147 requirement and this character can appear anywhere in the 1148 comment. The parser reads on until it sees the following 1149 pattern: "-->" or "--!>". 1150 **/ 1151 if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) { 1152 if ((ch = readCh()) == '>') { 1153 return; 1154 } 1155 if (ch == '!') { 1156 if ((ch = readCh()) == '>') { 1157 return; 1158 } else { 1159 /* to account for extra read()'s that happened */ 1160 addString('-'); 1161 addString('!'); 1162 continue; 1163 } 1164 } 1165 break; 1166 } 1167 1168 if ((ch = readCh()) == '-') { 1169 ch = readCh(); 1170 if (strict || ch == '>') { 1171 return; 1172 } 1173 if (ch == '!') { 1174 if ((ch = readCh()) == '>') { 1175 return; 1176 } else { 1177 /* to account for extra read()'s that happened */ 1178 addString('-'); 1179 addString('!'); 1180 continue; 1181 } 1182 } 1183 /* to account for the extra read() */ 1184 addString('-'); 1185 } 1186 break; 1187 1188 case -1: 1189 handleEOFInComment(); 1190 return; 1191 1192 case '\n': 1193 ln++; 1194 ch = readCh(); 1195 lfCount++; 1196 break; 1197 1198 case '>': 1199 ch = readCh(); 1200 break; 1201 1202 case '\r': 1203 ln++; 1204 if ((ch = readCh()) == '\n') { 1205 ch = readCh(); 1206 crlfCount++; 1207 } 1208 else { 1209 crCount++; 1210 } 1211 c = '\n'; 1212 break; 1213 default: 1214 ch = readCh(); 1215 break; 1216 } 1217 1218 addString(c); 1219 } 1220 } 1221 1222 /** 1223 * Parse literal content. 
[46] 343:1 and [47] 344:1 1224 */ 1225 void parseLiteral(boolean replace) throws IOException { 1226 while (true) { 1227 int c = ch; 1228 switch (c) { 1229 case -1: 1230 error("eof.literal", stack.elem.getName()); 1231 endTag(true); 1232 return; 1233 1234 case '>': 1235 ch = readCh(); 1236 int i = textpos - (stack.elem.name.length() + 2), j = 0; 1237 1238 // match end tag 1239 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) { 1240 while ((++i < textpos) && 1241 (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++))); 1242 if (i == textpos) { 1243 textpos -= (stack.elem.name.length() + 2); 1244 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1245 textpos--; 1246 } 1247 endTag(false); 1248 return; 1249 } 1250 } 1251 break; 1252 1253 case '&': 1254 char data[] = parseEntityReference(); 1255 if (textpos + data.length > text.length) { 1256 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 1257 System.arraycopy(text, 0, newtext, 0, text.length); 1258 text = newtext; 1259 } 1260 System.arraycopy(data, 0, text, textpos, data.length); 1261 textpos += data.length; 1262 continue; 1263 1264 case '\n': 1265 ln++; 1266 ch = readCh(); 1267 lfCount++; 1268 break; 1269 1270 case '\r': 1271 ln++; 1272 if ((ch = readCh()) == '\n') { 1273 ch = readCh(); 1274 crlfCount++; 1275 } 1276 else { 1277 crCount++; 1278 } 1279 c = '\n'; 1280 break; 1281 default: 1282 ch = readCh(); 1283 break; 1284 } 1285 1286 // output character 1287 if (textpos == text.length) { 1288 char newtext[] = new char[text.length + 128]; 1289 System.arraycopy(text, 0, newtext, 0, text.length); 1290 text = newtext; 1291 } 1292 text[textpos++] = (char)c; 1293 } 1294 } 1295 1296 /** 1297 * Parse attribute value. 
[33] 331:1 1298 */ 1299 @SuppressWarnings("fallthrough") 1300 String parseAttributeValue(boolean lower) throws IOException { 1301 int delim = -1; 1302 1303 // Check for a delimiter 1304 switch(ch) { 1305 case '\'': 1306 case '"': 1307 delim = ch; 1308 ch = readCh(); 1309 break; 1310 } 1311 1312 // Parse the rest of the value 1313 while (true) { 1314 int c = ch; 1315 1316 switch (c) { 1317 case '\n': 1318 ln++; 1319 ch = readCh(); 1320 lfCount++; 1321 if (delim < 0) { 1322 return getString(0); 1323 } 1324 break; 1325 1326 case '\r': 1327 ln++; 1328 1329 if ((ch = readCh()) == '\n') { 1330 ch = readCh(); 1331 crlfCount++; 1332 } 1333 else { 1334 crCount++; 1335 } 1336 if (delim < 0) { 1337 return getString(0); 1338 } 1339 break; 1340 1341 case '\t': 1342 if (delim < 0) 1343 c = ' '; 1344 // Fall through 1345 case ' ': 1346 ch = readCh(); 1347 if (delim < 0) { 1348 return getString(0); 1349 } 1350 break; 1351 1352 case '>': 1353 case '<': 1354 if (delim < 0) { 1355 return getString(0); 1356 } 1357 ch = readCh(); 1358 break; 1359 1360 case '\'': 1361 case '"': 1362 ch = readCh(); 1363 if (c == delim) { 1364 return getString(0); 1365 } else if (delim == -1) { 1366 error("attvalerr"); 1367 if (strict || ch == ' ') { 1368 return getString(0); 1369 } else { 1370 continue; 1371 } 1372 } 1373 break; 1374 1375 case '=': 1376 if (delim < 0) { 1377 /* In SGML a construct like <img src=/cgi-bin/foo?x=1> 1378 is considered invalid since an = sign can only be contained 1379 in an attributes value if the string is quoted. 1380 */ 1381 error("attvalerr"); 1382 /* If strict is true then we return with the string we have thus far. 1383 Otherwise we accept the = sign as part of the attribute's value and 1384 process the rest of the img tag. 
*/ 1385 if (strict) { 1386 return getString(0); 1387 } 1388 } 1389 ch = readCh(); 1390 break; 1391 1392 case '&': 1393 if (strict && delim < 0) { 1394 ch = readCh(); 1395 break; 1396 } 1397 1398 char data[] = parseEntityReference(); 1399 for (int i = 0 ; i < data.length ; i++) { 1400 c = data[i]; 1401 addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c); 1402 } 1403 continue; 1404 1405 case -1: 1406 return getString(0); 1407 1408 default: 1409 if (lower && (c >= 'A') && (c <= 'Z')) { 1410 c = 'a' + c - 'A'; 1411 } 1412 ch = readCh(); 1413 break; 1414 } 1415 addString(c); 1416 } 1417 } 1418 1419 1420 /** 1421 * Parse attribute specification List. [31] 327:17 1422 */ 1423 void parseAttributeSpecificationList(Element elem) throws IOException { 1424 1425 while (true) { 1426 skipSpace(); 1427 1428 switch (ch) { 1429 case '/': 1430 case '>': 1431 case '<': 1432 case -1: 1433 return; 1434 1435 case '-': 1436 if ((ch = readCh()) == '-') { 1437 ch = readCh(); 1438 parseComment(); 1439 strpos = 0; 1440 } else { 1441 error("invalid.tagchar", "-", elem.getName()); 1442 ch = readCh(); 1443 } 1444 continue; 1445 } 1446 1447 AttributeList att; 1448 String attname; 1449 String attvalue; 1450 1451 if (parseIdentifier(true)) { 1452 attname = getString(0); 1453 skipSpace(); 1454 if (ch == '=') { 1455 ch = readCh(); 1456 skipSpace(); 1457 att = elem.getAttribute(attname); 1458// Bug ID 4102750 1459// Load the NAME of an Attribute Case Sensitive 1460// The case of the NAME must be intact 1461// MG 021898 1462 attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME)); 1463// attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION)); 1464 } else { 1465 attvalue = attname; 1466 att = elem.getAttributeByValue(attvalue); 1467 if (att == null) { 1468 att = elem.getAttribute(attname); 1469 if (att != null) { 1470 attvalue = att.getValue(); 1471 } 1472 else { 1473 // Make it null so 
that NULL_ATTRIBUTE_VALUE is 1474 // used 1475 attvalue = null; 1476 } 1477 } 1478 } 1479 } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs 1480 ch = readCh(); 1481 continue; 1482 } else if (!strict && ch == '"') { // allows for quoted attributes 1483 ch = readCh(); 1484 skipSpace(); 1485 if (parseIdentifier(true)) { 1486 attname = getString(0); 1487 if (ch == '"') { 1488 ch = readCh(); 1489 } 1490 skipSpace(); 1491 if (ch == '=') { 1492 ch = readCh(); 1493 skipSpace(); 1494 att = elem.getAttribute(attname); 1495 attvalue = parseAttributeValue((att != null) && 1496 (att.type != CDATA) && 1497 (att.type != NOTATION)); 1498 } else { 1499 attvalue = attname; 1500 att = elem.getAttributeByValue(attvalue); 1501 if (att == null) { 1502 att = elem.getAttribute(attname); 1503 if (att != null) { 1504 attvalue = att.getValue(); 1505 } 1506 } 1507 } 1508 } else { 1509 char str[] = {(char)ch}; 1510 error("invalid.tagchar", new String(str), elem.getName()); 1511 ch = readCh(); 1512 continue; 1513 } 1514 } else if (!strict && (attributes.isEmpty()) && (ch == '=')) { 1515 ch = readCh(); 1516 skipSpace(); 1517 attname = elem.getName(); 1518 att = elem.getAttribute(attname); 1519 attvalue = parseAttributeValue((att != null) && 1520 (att.type != CDATA) && 1521 (att.type != NOTATION)); 1522 } else if (!strict && (ch == '=')) { 1523 ch = readCh(); 1524 skipSpace(); 1525 attvalue = parseAttributeValue(true); 1526 error("attvalerr"); 1527 return; 1528 } else { 1529 char str[] = {(char)ch}; 1530 error("invalid.tagchar", new String(str), elem.getName()); 1531 if (!strict) { 1532 ch = readCh(); 1533 continue; 1534 } else { 1535 return; 1536 } 1537 } 1538 1539 if (att != null) { 1540 attname = att.getName(); 1541 } else { 1542 error("invalid.tagatt", attname, elem.getName()); 1543 } 1544 1545 // Check out the value 1546 if (attributes.isDefined(attname)) { 1547 error("multi.tagatt", attname, elem.getName()); 1548 } 1549 if (attvalue == null) { 1550 
attvalue = ((att != null) && (att.value != null)) ? att.value : 1551 HTML.NULL_ATTRIBUTE_VALUE; 1552 } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) { 1553 error("invalid.tagattval", attname, elem.getName()); 1554 } 1555 HTML.Attribute attkey = HTML.getAttributeKey(attname); 1556 if (attkey == null) { 1557 attributes.addAttribute(attname, attvalue); 1558 } else { 1559 attributes.addAttribute(attkey, attvalue); 1560 } 1561 } 1562 } 1563 1564 /** 1565 * Parses the Document Type Declaration markup declaration. 1566 * Currently ignores it. 1567 * 1568 * @return the string representation of the markup declaration 1569 * @throws IOException if an I/O error occurs 1570 */ 1571 public String parseDTDMarkup() throws IOException { 1572 1573 StringBuilder strBuff = new StringBuilder(); 1574 ch = readCh(); 1575 while(true) { 1576 switch (ch) { 1577 case '>': 1578 ch = readCh(); 1579 return strBuff.toString(); 1580 case -1: 1581 error("invalid.markup"); 1582 return strBuff.toString(); 1583 case '\n': 1584 ln++; 1585 ch = readCh(); 1586 lfCount++; 1587 break; 1588 case '"': 1589 ch = readCh(); 1590 break; 1591 case '\r': 1592 ln++; 1593 if ((ch = readCh()) == '\n') { 1594 ch = readCh(); 1595 crlfCount++; 1596 } 1597 else { 1598 crCount++; 1599 } 1600 break; 1601 default: 1602 strBuff.append((char)(ch & 0xFF)); 1603 ch = readCh(); 1604 break; 1605 } 1606 } 1607 } 1608 1609 /** 1610 * Parse markup declarations. 1611 * Currently only handles the Document Type Declaration markup. 1612 * Returns true if it is a markup declaration false otherwise. 
1613 * 1614 * @param strBuff the markup declaration 1615 * @return {@code true} if this is a valid markup declaration; 1616 * otherwise {@code false} 1617 * @throws IOException if an I/O error occurs 1618 */ 1619 protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException { 1620 1621 /* Currently handles only the DOCTYPE */ 1622 if ((strBuff.length() == "DOCTYPE".length()) && 1623 (strBuff.toString().toUpperCase().equals("DOCTYPE"))) { 1624 parseDTDMarkup(); 1625 return true; 1626 } 1627 return false; 1628 } 1629 1630 /** 1631 * Parse an invalid tag. 1632 */ 1633 void parseInvalidTag() throws IOException { 1634 // ignore all data upto the close bracket '>' 1635 while (true) { 1636 skipSpace(); 1637 switch (ch) { 1638 case '>': 1639 case -1: 1640 ch = readCh(); 1641 return; 1642 case '<': 1643 return; 1644 default: 1645 ch = readCh(); 1646 1647 } 1648 } 1649 } 1650 1651 /** 1652 * Parse a start or end tag. 1653 */ 1654 @SuppressWarnings("fallthrough") 1655 void parseTag() throws IOException { 1656 Element elem; 1657 boolean net = false; 1658 boolean warned = false; 1659 boolean unknown = false; 1660 1661 switch (ch = readCh()) { 1662 case '!': 1663 switch (ch = readCh()) { 1664 case '-': 1665 // Parse comment. 
[92] 391:7 1666 while (true) { 1667 if (ch == '-') { 1668 if (!strict || ((ch = readCh()) == '-')) { 1669 ch = readCh(); 1670 if (!strict && ch == '-') { 1671 ch = readCh(); 1672 } 1673 // send over any text you might see 1674 // before parsing and sending the 1675 // comment 1676 if (textpos != 0) { 1677 char newtext[] = new char[textpos]; 1678 System.arraycopy(text, 0, newtext, 0, textpos); 1679 handleText(newtext); 1680 lastBlockStartPos = currentBlockStartPos; 1681 textpos = 0; 1682 } 1683 parseComment(); 1684 last = makeTag(dtd.getElement("comment"), true); 1685 handleComment(getChars(0)); 1686 continue; 1687 } else if (!warned) { 1688 warned = true; 1689 error("invalid.commentchar", "-"); 1690 } 1691 } 1692 skipSpace(); 1693 switch (ch) { 1694 case '-': 1695 continue; 1696 case '>': 1697 ch = readCh(); 1698 return; 1699 case -1: 1700 return; 1701 default: 1702 ch = readCh(); 1703 if (!warned) { 1704 warned = true; 1705 error("invalid.commentchar", 1706 String.valueOf((char)ch)); 1707 } 1708 break; 1709 } 1710 } 1711 1712 default: 1713 // deal with marked sections 1714 StringBuffer strBuff = new StringBuffer(); 1715 while (true) { 1716 strBuff.append((char)ch); 1717 if (parseMarkupDeclarations(strBuff)) { 1718 return; 1719 } 1720 switch(ch) { 1721 case '>': 1722 ch = readCh(); 1723 // Fall through 1724 case -1: 1725 error("invalid.markup"); 1726 return; 1727 case '\n': 1728 ln++; 1729 ch = readCh(); 1730 lfCount++; 1731 break; 1732 case '\r': 1733 ln++; 1734 if ((ch = readCh()) == '\n') { 1735 ch = readCh(); 1736 crlfCount++; 1737 } 1738 else { 1739 crCount++; 1740 } 1741 break; 1742 1743 default: 1744 ch = readCh(); 1745 break; 1746 } 1747 } 1748 } 1749 1750 case '/': 1751 // parse end tag [19] 317:4 1752 switch (ch = readCh()) { 1753 case '>': 1754 ch = readCh(); 1755 // Fall through 1756 case '<': 1757 // empty end tag. 
either </> or </< 1758 if (recent == null) { 1759 error("invalid.shortend"); 1760 return; 1761 } 1762 elem = recent; 1763 break; 1764 1765 default: 1766 if (!parseIdentifier(true)) { 1767 error("expected.endtagname"); 1768 return; 1769 } 1770 skipSpace(); 1771 switch (ch) { 1772 case '>': 1773 ch = readCh(); 1774 break; 1775 case '<': 1776 break; 1777 1778 default: 1779 error("expected", "'>'"); 1780 while ((ch != -1) && (ch != '\n') && (ch != '>')) { 1781 ch = readCh(); 1782 } 1783 if (ch == '>') { 1784 ch = readCh(); 1785 } 1786 break; 1787 } 1788 String elemStr = getString(0); 1789 if (!dtd.elementExists(elemStr)) { 1790 error("end.unrecognized", elemStr); 1791 // Ignore RE before end tag 1792 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1793 textpos--; 1794 } 1795 elem = dtd.getElement("unknown"); 1796 elem.name = elemStr; 1797 unknown = true; 1798 } else { 1799 elem = dtd.getElement(elemStr); 1800 } 1801 break; 1802 } 1803 1804 1805 // If the stack is null, we're seeing end tags without any begin 1806 // tags. Ignore them. 1807 1808 if (stack == null) { 1809 error("end.extra.tag", elem.getName()); 1810 return; 1811 } 1812 1813 // Ignore RE before end tag 1814 if ((textpos > 0) && (text[textpos-1] == '\n')) { 1815 // In a pre tag, if there are blank lines 1816 // we do not want to remove the newline 1817 // before the end tag. Hence this code. 1818 // 1819 if (stack.pre) { 1820 if ((textpos > 1) && (text[textpos-2] != '\n')) { 1821 textpos--; 1822 } 1823 } else { 1824 textpos--; 1825 } 1826 } 1827 1828 // If the end tag is a form, since we did not put it 1829 // on the tag stack, there is no corresponding start 1830 // start tag to find. Hence do not touch the tag stack. 1831 // 1832 1833 /* 1834 if (!strict && elem.getName().equals("form")) { 1835 if (lastFormSent != null) { 1836 handleEndTag(lastFormSent); 1837 return; 1838 } else { 1839 // do nothing. 
1840 return; 1841 } 1842 } 1843 */ 1844 1845 if (unknown) { 1846 // we will not see a corresponding start tag 1847 // on the stack. If we are seeing an 1848 // end tag, lets send this on as an empty 1849 // tag with the end tag attribute set to 1850 // true. 1851 TagElement t = makeTag(elem); 1852 handleText(t); 1853 attributes.addAttribute(HTML.Attribute.ENDTAG, "true"); 1854 handleEmptyTag(makeTag(elem)); 1855 unknown = false; 1856 return; 1857 } 1858 1859 // find the corresponding start tag 1860 1861 // A commonly occurring error appears to be the insertion 1862 // of extra end tags in a table. The intent here is ignore 1863 // such extra end tags. 1864 // 1865 if (!strict) { 1866 String stackElem = stack.elem.getName(); 1867 1868 if (stackElem.equals("table")) { 1869 // If it is not a valid end tag ignore it and return 1870 // 1871 if (!elem.getName().equals(stackElem)) { 1872 error("tag.ignore", elem.getName()); 1873 return; 1874 } 1875 } 1876 1877 1878 1879 if (stackElem.equals("tr") || 1880 stackElem.equals("td")) { 1881 if ((!elem.getName().equals("table")) && 1882 (!elem.getName().equals(stackElem))) { 1883 error("tag.ignore", elem.getName()); 1884 return; 1885 } 1886 } 1887 } 1888 TagStack sp = stack; 1889 1890 while ((sp != null) && (elem != sp.elem)) { 1891 sp = sp.next; 1892 } 1893 if (sp == null) { 1894 error("unmatched.endtag", elem.getName()); 1895 return; 1896 } 1897 1898 // People put font ending tags in the darndest places. 1899 // Don't close other contexts based on them being between 1900 // a font tag and the corresponding end tag. Instead, 1901 // ignore the end tag like it doesn't exist and allow the end 1902 // of the document to close us out. 
1903 String elemName = elem.getName(); 1904 if (stack != sp && 1905 (elemName.equals("font") || 1906 elemName.equals("center"))) { 1907 1908 // Since closing out a center tag can have real wierd 1909 // effects on the formatting, make sure that tags 1910 // for which omitting an end tag is legimitate 1911 // get closed out. 1912 // 1913 if (elemName.equals("center")) { 1914 while(stack.elem.omitEnd() && stack != sp) { 1915 endTag(true); 1916 } 1917 if (stack.elem == elem) { 1918 endTag(false); 1919 } 1920 } 1921 return; 1922 } 1923 // People do the same thing with center tags. In this 1924 // case we would like to close off the center tag but 1925 // not necessarily all enclosing tags. 1926 1927 1928 1929 // end tags 1930 while (stack != sp) { 1931 endTag(true); 1932 } 1933 1934 endTag(false); 1935 return; 1936 1937 case -1: 1938 error("eof"); 1939 return; 1940 } 1941 1942 // start tag [14] 314:1 1943 if (!parseIdentifier(true)) { 1944 elem = recent; 1945 if ((ch != '>') || (elem == null)) { 1946 error("expected.tagname"); 1947 return; 1948 } 1949 } else { 1950 String elemStr = getString(0); 1951 1952 if (elemStr.equals("image")) { 1953 elemStr = "img"; 1954 } 1955 1956 /* determine if this element is part of the dtd. 
*/ 1957 1958 if (!dtd.elementExists(elemStr)) { 1959 // parseInvalidTag(); 1960 error("tag.unrecognized ", elemStr); 1961 elem = dtd.getElement("unknown"); 1962 elem.name = elemStr; 1963 unknown = true; 1964 } else { 1965 elem = dtd.getElement(elemStr); 1966 } 1967 } 1968 1969 // Parse attributes 1970 parseAttributeSpecificationList(elem); 1971 1972 switch (ch) { 1973 case '/': 1974 net = true; 1975 // Fall through 1976 case '>': 1977 ch = readCh(); 1978 if (ch == '>' && net) { 1979 ch = readCh(); 1980 } 1981 case '<': 1982 break; 1983 1984 default: 1985 error("expected", "'>'"); 1986 break; 1987 } 1988 1989 if (!strict) { 1990 if (elem.getName().equals("script")) { 1991 error("javascript.unsupported"); 1992 } 1993 } 1994 1995 // ignore RE after start tag 1996 // 1997 if (!elem.isEmpty()) { 1998 if (ch == '\n') { 1999 ln++; 2000 lfCount++; 2001 ch = readCh(); 2002 } else if (ch == '\r') { 2003 ln++; 2004 if ((ch = readCh()) == '\n') { 2005 ch = readCh(); 2006 crlfCount++; 2007 } 2008 else { 2009 crCount++; 2010 } 2011 } 2012 } 2013 2014 // ensure a legal context for the tag 2015 TagElement tag = makeTag(elem, false); 2016 2017 2018 /** In dealing with forms, we have decided to treat 2019 them as legal in any context. Also, even though 2020 they do have a start and an end tag, we will 2021 not put this tag on the stack. This is to deal 2022 several pages in the web oasis that choose to 2023 start and end forms in any possible location. **/ 2024 2025 /* 2026 if (!strict && elem.getName().equals("form")) { 2027 if (lastFormSent == null) { 2028 lastFormSent = tag; 2029 } else { 2030 handleEndTag(lastFormSent); 2031 lastFormSent = tag; 2032 } 2033 } else { 2034 */ 2035 // Smlly, if a tag is unknown, we will apply 2036 // no legalTagContext logic to it. 2037 // 2038 if (!unknown) { 2039 legalTagContext(tag); 2040 2041 // If skip tag is true, this implies that 2042 // the tag was illegal and that the error 2043 // recovery strategy adopted is to ignore 2044 // the tag. 
2045 if (!strict && skipTag) { 2046 skipTag = false; 2047 return; 2048 } 2049 } 2050 /* 2051 } 2052 */ 2053 2054 startTag(tag); 2055 2056 if (!elem.isEmpty()) { 2057 switch (elem.getType()) { 2058 case CDATA: 2059 parseLiteral(false); 2060 break; 2061 case RCDATA: 2062 parseLiteral(true); 2063 break; 2064 default: 2065 if (stack != null) { 2066 stack.net = net; 2067 } 2068 break; 2069 } 2070 } 2071 } 2072 2073 private static final String START_COMMENT = "<!--"; 2074 private static final String END_COMMENT = "-->"; 2075 private static final char[] SCRIPT_END_TAG = "</script>".toCharArray(); 2076 private static final char[] SCRIPT_END_TAG_UPPER_CASE = 2077 "</SCRIPT>".toCharArray(); 2078 2079 void parseScript() throws IOException { 2080 char[] charsToAdd = new char[SCRIPT_END_TAG.length]; 2081 boolean insideComment = false; 2082 2083 /* Here, ch should be the first character after <script> */ 2084 while (true) { 2085 int i = 0; 2086 while (!insideComment && i < SCRIPT_END_TAG.length 2087 && (SCRIPT_END_TAG[i] == ch 2088 || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) { 2089 charsToAdd[i] = (char) ch; 2090 ch = readCh(); 2091 i++; 2092 } 2093 if (i == SCRIPT_END_TAG.length) { 2094 return; 2095 } 2096 2097 if (!insideComment && i == 1 && charsToAdd[0] == START_COMMENT.charAt(0)) { 2098 // it isn't end script tag, but may be it's start comment tag? 
2099 while (i < START_COMMENT.length() 2100 && START_COMMENT.charAt(i) == ch) { 2101 charsToAdd[i] = (char) ch; 2102 ch = readCh(); 2103 i++; 2104 } 2105 if (i == START_COMMENT.length()) { 2106 insideComment = true; 2107 } 2108 } 2109 if (insideComment) { 2110 while (i < END_COMMENT.length() 2111 && END_COMMENT.charAt(i) == ch) { 2112 charsToAdd[i] = (char) ch; 2113 ch = readCh(); 2114 i++; 2115 } 2116 if (i == END_COMMENT.length()) { 2117 insideComment = false; 2118 } 2119 } 2120 2121 /* To account for extra read()'s that happened */ 2122 if (i > 0) { 2123 for (int j = 0; j < i; j++) { 2124 addString(charsToAdd[j]); 2125 } 2126 continue; 2127 } 2128 switch (ch) { 2129 case -1: 2130 error("eof.script"); 2131 return; 2132 case '\n': 2133 ln++; 2134 ch = readCh(); 2135 lfCount++; 2136 addString('\n'); 2137 break; 2138 case '\r': 2139 ln++; 2140 if ((ch = readCh()) == '\n') { 2141 ch = readCh(); 2142 crlfCount++; 2143 } else { 2144 crCount++; 2145 } 2146 addString('\n'); 2147 break; 2148 default: 2149 addString(ch); 2150 ch = readCh(); 2151 break; 2152 } // switch 2153 } // while 2154 } 2155 2156 /** 2157 * Parse Content. 
[24] 320:1 2158 */ 2159 void parseContent() throws IOException { 2160 Thread curThread = Thread.currentThread(); 2161 2162 for (;;) { 2163 if (curThread.isInterrupted()) { 2164 curThread.interrupt(); // resignal the interrupt 2165 break; 2166 } 2167 2168 int c = ch; 2169 currentBlockStartPos = currentPosition; 2170 2171 if (recent == dtd.script) { // means: if after starting <script> tag 2172 2173 /* Here, ch has to be the first character after <script> */ 2174 parseScript(); 2175 last = makeTag(dtd.getElement("comment"), true); 2176 2177 /* Remove leading and trailing HTML comment declarations */ 2178 String str = new String(getChars(0)).trim(); 2179 int minLength = START_COMMENT.length() + END_COMMENT.length(); 2180 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT) 2181 && str.length() >= (minLength)) { 2182 str = str.substring(START_COMMENT.length(), 2183 str.length() - END_COMMENT.length()); 2184 } 2185 2186 /* Handle resulting chars as comment */ 2187 handleComment(str.toCharArray()); 2188 endTag(false); 2189 lastBlockStartPos = currentPosition; 2190 2191 continue; 2192 } else { 2193 switch (c) { 2194 case '<': 2195 parseTag(); 2196 lastBlockStartPos = currentPosition; 2197 continue; 2198 2199 case '/': 2200 ch = readCh(); 2201 if ((stack != null) && stack.net) { 2202 // null end tag. 
2203 endTag(false); 2204 continue; 2205 } else if (textpos == 0) { 2206 if (!legalElementContext(dtd.pcdata)) { 2207 error("unexpected.pcdata"); 2208 } 2209 if (last.breaksFlow()) { 2210 space = false; 2211 } 2212 } 2213 break; 2214 2215 case -1: 2216 return; 2217 2218 case '&': 2219 if (textpos == 0) { 2220 if (!legalElementContext(dtd.pcdata)) { 2221 error("unexpected.pcdata"); 2222 } 2223 if (last.breaksFlow()) { 2224 space = false; 2225 } 2226 } 2227 char data[] = parseEntityReference(); 2228 if (textpos + data.length + 1 > text.length) { 2229 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; 2230 System.arraycopy(text, 0, newtext, 0, text.length); 2231 text = newtext; 2232 } 2233 if (space) { 2234 space = false; 2235 text[textpos++] = ' '; 2236 } 2237 System.arraycopy(data, 0, text, textpos, data.length); 2238 textpos += data.length; 2239 ignoreSpace = false; 2240 continue; 2241 2242 case '\n': 2243 ln++; 2244 lfCount++; 2245 ch = readCh(); 2246 if ((stack != null) && stack.pre) { 2247 break; 2248 } 2249 if (textpos == 0) { 2250 lastBlockStartPos = currentPosition; 2251 } 2252 if (!ignoreSpace) { 2253 space = true; 2254 } 2255 continue; 2256 2257 case '\r': 2258 ln++; 2259 c = '\n'; 2260 if ((ch = readCh()) == '\n') { 2261 ch = readCh(); 2262 crlfCount++; 2263 } 2264 else { 2265 crCount++; 2266 } 2267 if ((stack != null) && stack.pre) { 2268 break; 2269 } 2270 if (textpos == 0) { 2271 lastBlockStartPos = currentPosition; 2272 } 2273 if (!ignoreSpace) { 2274 space = true; 2275 } 2276 continue; 2277 2278 2279 case '\t': 2280 case ' ': 2281 ch = readCh(); 2282 if ((stack != null) && stack.pre) { 2283 break; 2284 } 2285 if (textpos == 0) { 2286 lastBlockStartPos = currentPosition; 2287 } 2288 if (!ignoreSpace) { 2289 space = true; 2290 } 2291 continue; 2292 2293 default: 2294 if (textpos == 0) { 2295 if (!legalElementContext(dtd.pcdata)) { 2296 error("unexpected.pcdata"); 2297 } 2298 if (last.breaksFlow()) { 2299 space = false; 
2300 } 2301 } 2302 ch = readCh(); 2303 break; 2304 } 2305 } 2306 2307 // enlarge buffer if needed 2308 if (textpos + 2 > text.length) { 2309 char newtext[] = new char[text.length + 128]; 2310 System.arraycopy(text, 0, newtext, 0, text.length); 2311 text = newtext; 2312 } 2313 2314 // output pending space 2315 if (space) { 2316 if (textpos == 0) { 2317 lastBlockStartPos--; 2318 } 2319 text[textpos++] = ' '; 2320 space = false; 2321 } 2322 text[textpos++] = (char)c; 2323 ignoreSpace = false; 2324 } 2325 } 2326 2327 /** 2328 * Returns the end of line string. This will return the end of line 2329 * string that has been encountered the most, one of \r, \n or \r\n. 2330 */ 2331 String getEndOfLineString() { 2332 if (crlfCount >= crCount) { 2333 if (lfCount >= crlfCount) { 2334 return "\n"; 2335 } 2336 else { 2337 return "\r\n"; 2338 } 2339 } 2340 else { 2341 if (crCount > lfCount) { 2342 return "\r"; 2343 } 2344 else { 2345 return "\n"; 2346 } 2347 } 2348 } 2349 2350 /** 2351 * Parse an HTML stream, given a DTD. 2352 * 2353 * @param in the reader to read the source from 2354 * @throws IOException if an I/O error occurs 2355 */ 2356 public synchronized void parse(Reader in) throws IOException { 2357 this.in = in; 2358 2359 this.ln = 1; 2360 2361 seenHtml = false; 2362 seenHead = false; 2363 seenBody = false; 2364 2365 crCount = lfCount = crlfCount = 0; 2366 2367 try { 2368 ch = readCh(); 2369 text = new char[1024]; 2370 str = new char[128]; 2371 2372 parseContent(); 2373 // NOTE: interruption may have occurred. Control flows out 2374 // of here normally. 
2375 while (stack != null) { 2376 endTag(true); 2377 } 2378 in.close(); 2379 } catch (IOException e) { 2380 errorContext(); 2381 error("ioexception"); 2382 throw e; 2383 } catch (Exception e) { 2384 errorContext(); 2385 error("exception", e.getClass().getName(), e.getMessage()); 2386 e.printStackTrace(); 2387 } catch (ThreadDeath e) { 2388 errorContext(); 2389 error("terminated"); 2390 e.printStackTrace(); 2391 throw e; 2392 } finally { 2393 for (; stack != null ; stack = stack.next) { 2394 handleEndTag(stack.tag); 2395 } 2396 2397 text = null; 2398 str = null; 2399 } 2400 2401 } 2402 2403 2404 /* 2405 * Input cache. This is much faster than calling down to a synchronized 2406 * method of BufferedReader for each byte. Measurements done 5/30/97 2407 * show that there's no point in having a bigger buffer: Increasing 2408 * the buffer to 8192 had no measurable impact for a program discarding 2409 * one character at a time (reading from an http URL to a local machine). 2410 * NOTE: If the current encoding is bogus, and we read too much 2411 * (past the content-type) we may suffer a MalformedInputException. For 2412 * this reason the initial size is 1 and when the body is encountered the 2413 * size is adjusted to 256. 2414 */ 2415 private char buf[] = new char[1]; 2416 private int pos; 2417 private int len; 2418 /* 2419 tracks position relative to the beginning of the 2420 document. 2421 */ 2422 private int currentPosition; 2423 2424 2425 private final int readCh() throws IOException { 2426 2427 if (pos >= len) { 2428 2429 // This loop allows us to ignore interrupts if the flag 2430 // says so 2431 for (;;) { 2432 try { 2433 len = in.read(buf); 2434 break; 2435 } catch (InterruptedIOException ex) { 2436 throw ex; 2437 } 2438 } 2439 2440 if (len <= 0) { 2441 return -1; // eof 2442 } 2443 pos = 0; 2444 } 2445 ++currentPosition; 2446 2447 return buf[pos++]; 2448 } 2449 2450 2451 /** 2452 * Returns the current position. 
*
     * @return the current position
     */
    protected int getCurrentPos() {
        return this.currentPosition;
    }
}