1/*
2 * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package javax.swing.text.html.parser;
27
28import javax.swing.text.SimpleAttributeSet;
29import javax.swing.text.html.HTML;
30import javax.swing.text.ChangedCharSetException;
31import java.io.*;
32import java.util.Hashtable;
33import java.util.Properties;
34import java.util.Vector;
35import java.util.Enumeration;
36import java.net.URL;
37
38/**
39 * A simple DTD-driven HTML parser. The parser reads an
40 * HTML file from an InputStream and calls various methods
41 * (which should be overridden in a subclass) when tags and
42 * data are encountered.
43 * <p>
44 * Unfortunately there are many badly implemented HTML parsers
45 * out there, and as a result there are many badly formatted
46 * HTML files. This parser attempts to parse most HTML files.
47 * This means that the implementation sometimes deviates from
48 * the SGML specification in favor of HTML.
49 * <p>
50 * The parser treats \r and \r\n as \n. Newlines after starttags
51 * and before end tags are ignored just as specified in the SGML/HTML
52 * specification.
53 * <p>
54 * The html spec does not specify how spaces are to be coalesced very well.
55 * Specifically, the following scenarios are not discussed (note that a
56 * space should be used here, but I am using &amp;nbsp to force the space to
57 * be displayed):
58 * <p>
59 * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
60 * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
61 * <p>as well as:
62 * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
63 * which appears to be treated as:
64 * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
65 * <p>
66 * If <code>strict</code> is false, when a tag that breaks flow,
67 * (<code>TagElement.breaksFlows</code>) or trailing whitespace is
68 * encountered, all whitespace will be ignored until a non whitespace
69 * character is encountered. This appears to give behavior closer to
70 * the popular browsers.
71 *
72 * @see DTD
73 * @see TagElement
74 * @see SimpleAttributeSet
75 * @author Arthur van Hoff
76 * @author Sunita Mani
77 */
78public
79class Parser implements DTDConstants {
80
81    private char text[] = new char[1024];
82    private int textpos = 0;
83    private TagElement last;
84    private boolean space;
85
86    private char str[] = new char[128];
87    private int strpos = 0;
88
89    /**
90     * The dtd.
91     */
92    protected DTD dtd = null;
93
94    private int ch;
95    private int ln;
96    private Reader in;
97
98    private Element recent;
99    private TagStack stack;
100    private boolean skipTag = false;
101    private TagElement lastFormSent = null;
102    private SimpleAttributeSet attributes = new SimpleAttributeSet();
103
104    // State for <html>, <head> and <body>.  Since people like to slap
105    // together HTML documents without thinking, occasionally they
106    // have multiple instances of these tags.  These booleans track
107    // the first sightings of these tags so they can be safely ignored
108    // by the parser if repeated.
109    private boolean seenHtml = false;
110    private boolean seenHead = false;
111    private boolean seenBody = false;
112
113    /**
114     * The html spec does not specify how spaces are coalesced very well.
115     * If strict == false, ignoreSpace is used to try and mimic the behavior
116     * of the popular browsers.
117     * <p>
118     * The problematic scenarios are:
119     * '&lt;b>blah &lt;i> &lt;strike> foo' which can be treated as:
120     * '&lt;b>blah &lt;i>&lt;strike>foo'
121     * as well as:
122     * '&lt;p>&lt;a href="xx"> &lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
123     * which appears to be treated as:
124     * '&lt;p>&lt;a href="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
125     * <p>
126     * When a tag that breaks flow, or trailing whitespace is encountered
127     * ignoreSpace is set to true. From then on, all whitespace will be
128     * ignored.
129     * ignoreSpace will be set back to false the first time a
130     * non whitespace character is encountered. This appears to give
131     * behavior closer to the popular browsers.
132     */
133    private boolean ignoreSpace;
134
135    /**
136     * This flag determines whether or not the Parser will be strict
137     * in enforcing SGML compatibility.  If false, it will be lenient
138     * with certain common classes of erroneous HTML constructs.
139     * Strict or not, in either case an error will be recorded.
140     *
141     */
142    protected boolean strict = false;
143
144
145    /** Number of \r\n's encountered. */
146    private int crlfCount;
147    /** Number of \r's encountered. A \r\n will not increment this. */
148    private int crCount;
149    /** Number of \n's encountered. A \r\n will not increment this. */
150    private int lfCount;
151
152    //
153    // To correctly identify the start of a tag/comment/text we need two
154    // ivars. Two are needed as handleText isn't invoked until the tag
155    // after the text has been parsed, that is the parser parses the text,
156    // then a tag, then invokes handleText followed by handleStart.
157    //
158    /** The start position of the current block. Block is overloaded here,
159     * it really means the current start position for the current comment,
160     * tag, text. Use getBlockStartPosition to access this. */
161    private int currentBlockStartPos;
162    /** Start position of the last block. */
163    private int lastBlockStartPos;
164
165    /**
166     * array for mapping numeric references in range
167     * 130-159 to displayable Unicode characters.
168     */
169    private static final char[] cp1252Map = {
170        8218,  // &#130;
171        402,   // &#131;
172        8222,  // &#132;
173        8230,  // &#133;
174        8224,  // &#134;
175        8225,  // &#135;
176        710,   // &#136;
177        8240,  // &#137;
178        352,   // &#138;
179        8249,  // &#139;
180        338,   // &#140;
181        141,   // &#141;
182        142,   // &#142;
183        143,   // &#143;
184        144,   // &#144;
185        8216,  // &#145;
186        8217,  // &#146;
187        8220,  // &#147;
188        8221,  // &#148;
189        8226,  // &#149;
190        8211,  // &#150;
191        8212,  // &#151;
192        732,   // &#152;
193        8482,  // &#153;
194        353,   // &#154;
195        8250,  // &#155;
196        339,   // &#156;
197        157,   // &#157;
198        158,   // &#158;
199        376    // &#159;
200    };
201
202    /**
203     * Creates parser with the specified {@code dtd}.
204     *
205     * @param dtd the dtd.
206     */
207    public Parser(DTD dtd) {
208        this.dtd = dtd;
209    }
210
211
212    /**
213     * @return the line number of the line currently being parsed
214     */
215    protected int getCurrentLine() {
216        return ln;
217    }
218
219    /**
220     * Returns the start position of the current block. Block is
221     * overloaded here, it really means the current start position for
222     * the current comment tag, text, block.... This is provided for
223     * subclassers that wish to know the start of the current block when
224     * called with one of the handleXXX methods.
225     *
226     * @return the start position of the current block
227     */
228    int getBlockStartPosition() {
229        return Math.max(0, lastBlockStartPos - 1);
230    }
231
232    /**
233     * Makes a TagElement.
234     *
235     * @param elem       the element storing the tag definition
236     * @param fictional  the value of the flag "{@code fictional}" to be set for the tag
237     *
238     * @return the created {@code TagElement}
239     */
240    protected TagElement makeTag(Element elem, boolean fictional) {
241        return new TagElement(elem, fictional);
242    }
243
244    /**
245     * Makes a TagElement.
246     *
247     * @param elem  the element storing the tag definition
248     *
249     * @return the created {@code TagElement}
250     */
251    protected TagElement makeTag(Element elem) {
252        return makeTag(elem, false);
253    }
254
255    /**
256     * Returns attributes for the current tag.
257     *
258     * @return {@code SimpleAttributeSet} containing the attributes
259     */
260    protected SimpleAttributeSet getAttributes() {
261        return attributes;
262    }
263
264    /**
265     * Removes the current attributes.
266     */
267    protected void flushAttributes() {
268        attributes.removeAttributes(attributes);
269    }
270
271    /**
272     * Called when PCDATA is encountered.
273     *
274     * @param text  the section text
275     */
276    protected void handleText(char text[]) {
277    }
278
279    /**
280     * Called when an HTML title tag is encountered.
281     *
282     * @param text  the title text
283     */
284    protected void handleTitle(char text[]) {
285        // default behavior is to call handleText. Subclasses
286        // can override if necessary.
287        handleText(text);
288    }
289
290    /**
291     * Called when an HTML comment is encountered.
292     *
293     * @param text  the comment being handled
294     */
295    protected void handleComment(char text[]) {
296    }
297
298    /**
299     * Called when the content terminates without closing the HTML comment.
300     */
301    protected void handleEOFInComment() {
302        // We've reached EOF.  Our recovery strategy is to
303        // see if we have more than one line in the comment;
304        // if so, we pretend that the comment was an unterminated
305        // single line comment, and reparse the lines after the
306        // first line as normal HTML content.
307
308        int commentEndPos = strIndexOf('\n');
309        if (commentEndPos >= 0) {
310            handleComment(getChars(0, commentEndPos));
311            try {
312                in.close();
313                in = new CharArrayReader(getChars(commentEndPos + 1));
314                ch = '>';
315            } catch (IOException e) {
316                error("ioexception");
317            }
318
319            resetStrBuffer();
320        } else {
321            // no newline, so signal an error
322            error("eof.comment");
323        }
324    }
325
326    /**
327     * Called when an empty tag is encountered.
328     *
329     * @param tag  the tag being handled
330     * @throws ChangedCharSetException if the document charset was changed
331     */
332    protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
333    }
334
335    /**
336     * Called when a start tag is encountered.
337     *
338     * @param tag  the tag being handled
339     */
340    protected void handleStartTag(TagElement tag) {
341    }
342
343    /**
344     * Called when an end tag is encountered.
345     *
346     * @param tag  the tag being handled
347     */
348    protected void handleEndTag(TagElement tag) {
349    }
350
351    /**
352     * An error has occurred.
353     *
354     * @param ln   the number of line containing the error
355     * @param msg  the error message
356     */
357    protected void handleError(int ln, String msg) {
358        /*
359        Thread.dumpStack();
360        System.out.println("**** " + stack);
361        System.out.println("line " + ln + ": error: " + msg);
362        System.out.println();
363        */
364    }
365
366    /**
367     * Output text.
368     */
369    void handleText(TagElement tag) {
370        if (tag.breaksFlow()) {
371            space = false;
372            if (!strict) {
373                ignoreSpace = true;
374            }
375        }
376        if (textpos == 0) {
377            if ((!space) || (stack == null) || last.breaksFlow() ||
378                !stack.advance(dtd.pcdata)) {
379                last = tag;
380                space = false;
381                lastBlockStartPos = currentBlockStartPos;
382                return;
383            }
384        }
385        if (space) {
386            if (!ignoreSpace) {
387                // enlarge buffer if needed
388                if (textpos + 1 > text.length) {
389                    char newtext[] = new char[text.length + 200];
390                    System.arraycopy(text, 0, newtext, 0, text.length);
391                    text = newtext;
392                }
393
394                // output pending space
395                text[textpos++] = ' ';
396                if (!strict && !tag.getElement().isEmpty()) {
397                    ignoreSpace = true;
398                }
399            }
400            space = false;
401        }
402        char newtext[] = new char[textpos];
403        System.arraycopy(text, 0, newtext, 0, textpos);
404        // Handles cases of bad html where the title tag
405        // was getting lost when we did error recovery.
406        if (tag.getElement().getName().equals("title")) {
407            handleTitle(newtext);
408        } else {
409            handleText(newtext);
410        }
411        lastBlockStartPos = currentBlockStartPos;
412        textpos = 0;
413        last = tag;
414        space = false;
415    }
416
417    /**
418     * Invokes the error handler.
419     *
420     * @param err   the error type
421     * @param arg1  the 1st error message argument
422     * @param arg2  the 2nd error message argument
423     * @param arg3  the 3rd error message argument
424     */
425    protected void error(String err, String arg1, String arg2,
426        String arg3) {
427        handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
428    }
429
430    /**
431     * Invokes the error handler with the 3rd error message argument "?".
432     *
433     * @param err   the error type
434     * @param arg1  the 1st error message argument
435     * @param arg2  the 2nd error message argument
436     */
437    protected void error(String err, String arg1, String arg2) {
438        error(err, arg1, arg2, "?");
439    }
440
441    /**
442     * Invokes the error handler with the 2nd and 3rd error message argument "?".
443     *
444     * @param err   the error type
445     * @param arg1  the 1st error message argument
446     */
447    protected void error(String err, String arg1) {
448        error(err, arg1, "?", "?");
449    }
450
451    /**
452     * Invokes the error handler with the 1st, 2nd and 3rd error message argument "?".
453     *
454     * @param err   the error type
455     */
456    protected void error(String err) {
457        error(err, "?", "?", "?");
458    }
459
460
461    /**
462     * Handle a start tag. The new tag is pushed
463     * onto the tag stack. The attribute list is
464     * checked for required attributes.
465     *
466     * @param tag  the tag
467     * @throws ChangedCharSetException if the document charset was changed
468     */
469    protected void startTag(TagElement tag) throws ChangedCharSetException {
470        Element elem = tag.getElement();
471
472        // If the tag is an empty tag and texpos != 0
473        // this implies that there is text before the
474        // start tag that needs to be processed before
475        // handling the tag.
476        //
477        if (!elem.isEmpty() ||
478                    ((last != null) && !last.breaksFlow()) ||
479                    (textpos != 0)) {
480            handleText(tag);
481        } else {
482            // this variable gets updated in handleText().
483            // Since in this case we do not call handleText()
484            // we need to update it here.
485            //
486            last = tag;
487            // Note that we should really check last.breakFlows before
488            // assuming this should be false.
489            space = false;
490        }
491        lastBlockStartPos = currentBlockStartPos;
492
493        // check required attributes
494        for (AttributeList a = elem.atts ; a != null ; a = a.next) {
495            if ((a.modifier == REQUIRED) &&
496                ((attributes.isEmpty()) ||
497                 ((!attributes.isDefined(a.name)) &&
498                  (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) {
499                error("req.att ", a.getName(), elem.getName());
500            }
501        }
502
503        if (elem.isEmpty()) {
504            handleEmptyTag(tag);
505            /*
506        } else if (elem.getName().equals("form")) {
507            handleStartTag(tag);
508            */
509        } else {
510            recent = elem;
511            stack = new TagStack(tag, stack);
512            handleStartTag(tag);
513        }
514    }
515
516    /**
517     * Handle an end tag. The end tag is popped
518     * from the tag stack.
519     *
520     * @param omitted  {@code true} if the tag is no actually present in the
521     *                 document, but is supposed by the parser
522     */
523    protected void endTag(boolean omitted) {
524        handleText(stack.tag);
525
526        if (omitted && !stack.elem.omitEnd()) {
527            error("end.missing", stack.elem.getName());
528        } else if (!stack.terminate()) {
529            error("end.unexpected", stack.elem.getName());
530        }
531
532        // handle the tag
533        handleEndTag(stack.tag);
534        stack = stack.next;
535        recent = (stack != null) ? stack.elem : null;
536    }
537
538
539    boolean ignoreElement(Element elem) {
540
541        String stackElement = stack.elem.getName();
542        String elemName = elem.getName();
543        /* We ignore all elements that are not valid in the context of
544           a table except <td>, <th> (these we handle in
545           legalElementContext()) and #pcdata.  We also ignore the
546           <font> tag in the context of <ul> and <ol> We additonally
547           ignore the <meta> and the <style> tag if the body tag has
548           been seen. **/
549        if ((elemName.equals("html") && seenHtml) ||
550            (elemName.equals("head") && seenHead) ||
551            (elemName.equals("body") && seenBody)) {
552            return true;
553        }
554        if (elemName.equals("dt") || elemName.equals("dd")) {
555            TagStack s = stack;
556            while (s != null && !s.elem.getName().equals("dl")) {
557                s = s.next;
558            }
559            if (s == null) {
560                return true;
561            }
562        }
563
564        if (((stackElement.equals("table")) &&
565             (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
566            ((elemName.equals("font")) &&
567             (stackElement.equals("ul") || stackElement.equals("ol"))) ||
568            (elemName.equals("meta") && stack != null) ||
569            (elemName.equals("style") && seenBody) ||
570            (stackElement.equals("table") && elemName.equals("a"))) {
571            return true;
572        }
573        return false;
574    }
575
576
577    /**
578     * Marks the first time a tag has been seen in a document
579     *
580     * @param elem  the element represented by the tag
581     */
582
583    protected void markFirstTime(Element elem) {
584        String elemName = elem.getName();
585        if (elemName.equals("html")) {
586            seenHtml = true;
587        } else if (elemName.equals("head")) {
588            seenHead = true;
589        } else if (elemName.equals("body")) {
590            if (buf.length == 1) {
591                // Refer to note in definition of buf for details on this.
592                char[] newBuf = new char[256];
593
594                newBuf[0] = buf[0];
595                buf = newBuf;
596            }
597            seenBody = true;
598        }
599    }
600
601    /**
602     * Create a legal content for an element.
603     */
604    boolean legalElementContext(Element elem) throws ChangedCharSetException {
605
606        // System.out.println("-- legalContext -- " + elem);
607
608        // Deal with the empty stack
609        if (stack == null) {
610            // System.out.println("-- stack is empty");
611            if (elem != dtd.html) {
612                // System.out.println("-- pushing html");
613                startTag(makeTag(dtd.html, true));
614                return legalElementContext(elem);
615            }
616            return true;
617        }
618
619        // Is it allowed in the current context
620        if (stack.advance(elem)) {
621            // System.out.println("-- legal context");
622            markFirstTime(elem);
623            return true;
624        }
625        boolean insertTag = false;
626
627        // The use of all error recovery strategies are contingent
628        // on the value of the strict property.
629        //
630        // These are commonly occurring errors.  if insertTag is true,
631        // then we want to adopt an error recovery strategy that
632        // involves attempting to insert an additional tag to
633        // legalize the context.  The two errors addressed here
634        // are:
635        // 1) when a <td> or <th> is seen soon after a <table> tag.
636        //    In this case we insert a <tr>.
637        // 2) when any other tag apart from a <tr> is seen
638        //    in the context of a <tr>.  In this case we would
639        //    like to add a <td>.  If a <tr> is seen within a
640        //    <tr> context, then we will close out the current
641        //    <tr>.
642        //
643        // This insertion strategy is handled later in the method.
644        // The reason for checking this now, is that in other cases
645        // we would like to apply other error recovery strategies for example
646        // ignoring tags.
647        //
648        // In certain cases it is better to ignore a tag than try to
649        // fix the situation.  So the first test is to see if this
650        // is what we need to do.
651        //
652        String stackElemName = stack.elem.getName();
653        String elemName = elem.getName();
654
655
656        if (!strict &&
657            ((stackElemName.equals("table") && elemName.equals("td")) ||
658             (stackElemName.equals("table") && elemName.equals("th")) ||
659             (stackElemName.equals("tr") && !elemName.equals("tr")))){
660             insertTag = true;
661        }
662
663
664        if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
665                                      elem.getName().equals("body"))) {
666            if (skipTag = ignoreElement(elem)) {
667                error("tag.ignore", elem.getName());
668                return skipTag;
669            }
670        }
671
672        // Check for anything after the start of the table besides tr, td, th
673        // or caption, and if those aren't there, insert the <tr> and call
674        // legalElementContext again.
675        if (!strict && stackElemName.equals("table") &&
676            !elemName.equals("tr") && !elemName.equals("td") &&
677            !elemName.equals("th") && !elemName.equals("caption")) {
678            Element e = dtd.getElement("tr");
679            TagElement t = makeTag(e, true);
680            legalTagContext(t);
681            startTag(t);
682            error("start.missing", elem.getName());
683            return legalElementContext(elem);
684        }
685
686        // They try to find a legal context by checking if the current
687        // tag is valid in an enclosing context.  If so
688        // close out the tags by outputing end tags and then
689        // insert the current tag.  If the tags that are
690        // being closed out do not have an optional end tag
691        // specification in the DTD then an html error is
692        // reported.
693        //
694        if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
695            for (TagStack s = stack.next ; s != null ; s = s.next) {
696                if (s.advance(elem)) {
697                    while (stack != s) {
698                        endTag(true);
699                    }
700                    return true;
701                }
702                if (!s.terminate() || (strict && !s.elem.omitEnd())) {
703                    break;
704                }
705            }
706        }
707
708        // Check if we know what tag is expected next.
709        // If so insert the tag.  Report an error if the
710        // tag does not have its start tag spec in the DTD as optional.
711        //
712        Element next = stack.first();
713        if (next != null && (!strict || next.omitStart()) &&
714           !(next==dtd.head && elem==dtd.pcdata) ) {
715            // System.out.println("-- omitting start tag: " + next);
716            TagElement t = makeTag(next, true);
717            legalTagContext(t);
718            startTag(t);
719            if (!next.omitStart()) {
720                error("start.missing", elem.getName());
721            }
722            return legalElementContext(elem);
723        }
724
725
726        // Traverse the list of expected elements and determine if adding
727        // any of these elements would make for a legal context.
728        //
729
730        if (!strict) {
731            ContentModel content = stack.contentModel();
732            Vector<Element> elemVec = new Vector<Element>();
733            if (content != null) {
734                content.getElements(elemVec);
735                for (Element e : elemVec) {
736                    // Ensure that this element has not been included as
737                    // part of the exclusions in the DTD.
738                    //
739                    if (stack.excluded(e.getIndex())) {
740                        continue;
741                    }
742
743                    boolean reqAtts = false;
744
745                    for (AttributeList a = e.getAttributes(); a != null ; a = a.next) {
746                        if (a.modifier == REQUIRED) {
747                            reqAtts = true;
748                            break;
749                        }
750                    }
751                    // Ensure that no tag that has required attributes
752                    // gets inserted.
753                    //
754                    if (reqAtts) {
755                        continue;
756                    }
757
758                    ContentModel m = e.getContent();
759                    if (m != null && m.first(elem)) {
760                        // System.out.println("-- adding a legal tag: " + e);
761                        TagElement t = makeTag(e, true);
762                        legalTagContext(t);
763                        startTag(t);
764                        error("start.missing", e.getName());
765                        return legalElementContext(elem);
766                    }
767                }
768            }
769        }
770
771        // Check if the stack can be terminated.  If so add the appropriate
772        // end tag.  Report an error if the tag being ended does not have its
773        // end tag spec in the DTD as optional.
774        //
775        if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
776            // System.out.println("-- omitting end tag: " + stack.elem);
777            if (!stack.elem.omitEnd()) {
778                error("end.missing", elem.getName());
779            }
780
781            endTag(true);
782            return legalElementContext(elem);
783        }
784
785        // At this point we know that something is screwed up.
786        return false;
787    }
788
789    /**
790     * Create a legal context for a tag.
791     */
792    void legalTagContext(TagElement tag) throws ChangedCharSetException {
793        if (legalElementContext(tag.getElement())) {
794            markFirstTime(tag.getElement());
795            return;
796        }
797
798        // Avoid putting a block tag in a flow tag.
799        if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
800            endTag(true);
801            legalTagContext(tag);
802            return;
803        }
804
805        // Avoid putting something wierd in the head of the document.
806        for (TagStack s = stack ; s != null ; s = s.next) {
807            if (s.tag.getElement() == dtd.head) {
808                while (stack != s) {
809                    endTag(true);
810                }
811                endTag(true);
812                legalTagContext(tag);
813                return;
814            }
815        }
816
817        // Everything failed
818        error("tag.unexpected", tag.getElement().getName());
819    }
820
821    /**
822     * Error context. Something went wrong, make sure we are in
823     * the document's body context
824     */
825    void errorContext() throws ChangedCharSetException {
826        for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
827            handleEndTag(stack.tag);
828        }
829        if (stack == null) {
830            legalElementContext(dtd.body);
831            startTag(makeTag(dtd.body, true));
832        }
833    }
834
835    /**
836     * Add a char to the string buffer.
837     */
838    void addString(int c) {
839        if (strpos  == str.length) {
840            char newstr[] = new char[str.length + 128];
841            System.arraycopy(str, 0, newstr, 0, str.length);
842            str = newstr;
843        }
844        str[strpos++] = (char)c;
845    }
846
847    /**
848     * Get the string that's been accumulated.
849     */
850    String getString(int pos) {
851        char newStr[] = new char[strpos - pos];
852        System.arraycopy(str, pos, newStr, 0, strpos - pos);
853        strpos = pos;
854        return new String(newStr);
855    }
856
857    char[] getChars(int pos) {
858        char newStr[] = new char[strpos - pos];
859        System.arraycopy(str, pos, newStr, 0, strpos - pos);
860        strpos = pos;
861        return newStr;
862    }
863
864    char[] getChars(int pos, int endPos) {
865        char newStr[] = new char[endPos - pos];
866        System.arraycopy(str, pos, newStr, 0, endPos - pos);
867        // REMIND: it's not clear whether this version should set strpos or not
868        // strpos = pos;
869        return newStr;
870    }
871
872    void resetStrBuffer() {
873        strpos = 0;
874    }
875
876    int strIndexOf(char target) {
877        for (int i = 0; i < strpos; i++) {
878            if (str[i] == target) {
879                return i;
880            }
881        }
882
883        return -1;
884    }
885
886    /**
887     * Skip space.
888     * [5] 297:5
889     */
890    void skipSpace() throws IOException {
891        while (true) {
892            switch (ch) {
893              case '\n':
894                ln++;
895                ch = readCh();
896                lfCount++;
897                break;
898
899              case '\r':
900                ln++;
901                if ((ch = readCh()) == '\n') {
902                    ch = readCh();
903                    crlfCount++;
904                }
905                else {
906                    crCount++;
907                }
908                break;
909              case ' ':
910              case '\t':
911                ch = readCh();
912                break;
913
914              default:
915                return;
916            }
917        }
918    }
919
920    /**
921     * Parse identifier. Uppercase characters are folded
922     * to lowercase when lower is true. Returns falsed if
923     * no identifier is found. [55] 346:17
924     */
925    boolean parseIdentifier(boolean lower) throws IOException {
926        switch (ch) {
927          case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
928          case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
929          case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
930          case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
931          case 'Y': case 'Z':
932            if (lower) {
933                ch = 'a' + (ch - 'A');
934            }
935            break;
936
937          case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
938          case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
939          case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
940          case 's': case 't': case 'u': case 'v': case 'w': case 'x':
941          case 'y': case 'z':
942            break;
943
944          default:
945            return false;
946        }
947
948        while (true) {
949            addString(ch);
950
951            switch (ch = readCh()) {
952              case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
953              case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
954              case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
955              case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
956              case 'Y': case 'Z':
957                if (lower) {
958                    ch = 'a' + (ch - 'A');
959                }
960                break;
961
962              case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
963              case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
964              case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
965              case 's': case 't': case 'u': case 'v': case 'w': case 'x':
966              case 'y': case 'z':
967
968              case '0': case '1': case '2': case '3': case '4':
969              case '5': case '6': case '7': case '8': case '9':
970
971              case '.': case '-':
972
973              case '_': // not officially allowed
974                break;
975
976              default:
977                return true;
978            }
979        }
980    }
981
982    /**
983     * Parse an entity reference. [59] 350:17
984     */
985    private char[] parseEntityReference() throws IOException {
986        int pos = strpos;
987
988        if ((ch = readCh()) == '#') {
989            int n = 0;
990            ch = readCh();
991            if ((ch >= '0') && (ch <= '9') ||
992                    ch == 'x' || ch == 'X') {
993
994                if ((ch >= '0') && (ch <= '9')) {
995                    // parse decimal reference
996                    while ((ch >= '0') && (ch <= '9')) {
997                        n = (n * 10) + ch - '0';
998                        ch = readCh();
999                    }
1000                } else {
1001                    // parse hexadecimal reference
1002                    ch = readCh();
1003                    char lch = (char) Character.toLowerCase(ch);
1004                    while ((lch >= '0') && (lch <= '9') ||
1005                            (lch >= 'a') && (lch <= 'f')) {
1006                        if (lch >= '0' && lch <= '9') {
1007                            n = (n * 16) + lch - '0';
1008                        } else {
1009                            n = (n * 16) + lch - 'a' + 10;
1010                        }
1011                        ch = readCh();
1012                        lch = (char) Character.toLowerCase(ch);
1013                    }
1014                }
1015                switch (ch) {
1016                    case '\n':
1017                        ln++;
1018                        ch = readCh();
1019                        lfCount++;
1020                        break;
1021
1022                    case '\r':
1023                        ln++;
1024                        if ((ch = readCh()) == '\n') {
1025                            ch = readCh();
1026                            crlfCount++;
1027                        }
1028                        else {
1029                            crCount++;
1030                        }
1031                        break;
1032
1033                    case ';':
1034                        ch = readCh();
1035                        break;
1036                }
1037                char data[] = mapNumericReference(n);
1038                return data;
1039            }
1040            addString('#');
1041            if (!parseIdentifier(false)) {
1042                error("ident.expected");
1043                strpos = pos;
1044                char data[] = {'&', '#'};
1045                return data;
1046            }
1047        } else if (!parseIdentifier(false)) {
1048            char data[] = {'&'};
1049            return data;
1050        }
1051
1052        boolean semicolon = false;
1053
1054        switch (ch) {
1055          case '\n':
1056            ln++;
1057            ch = readCh();
1058            lfCount++;
1059            break;
1060
1061          case '\r':
1062            ln++;
1063            if ((ch = readCh()) == '\n') {
1064                ch = readCh();
1065                crlfCount++;
1066            }
1067            else {
1068                crCount++;
1069            }
1070            break;
1071
1072          case ';':
1073            semicolon = true;
1074
1075            ch = readCh();
1076            break;
1077        }
1078
1079        String nm = getString(pos);
1080        Entity ent = dtd.getEntity(nm);
1081
1082        // entities are case sensitive - however if strict
1083        // is false then we will try to make a match by
1084        // converting the string to all lowercase.
1085        //
1086        if (!strict && (ent == null)) {
1087            ent = dtd.getEntity(nm.toLowerCase());
1088        }
1089        if ((ent == null) || !ent.isGeneral()) {
1090
1091            if (nm.length() == 0) {
1092                error("invalid.entref", nm);
1093                return new char[0];
1094            }
1095            /* given that there is not a match restore the entity reference */
1096            String str = "&" + nm + (semicolon ? ";" : "");
1097
1098            char b[] = new char[str.length()];
1099            str.getChars(0, b.length, b, 0);
1100            return b;
1101        }
1102        return ent.getData();
1103    }
1104
1105    /**
1106     * Converts numeric character reference to char array.
1107     *
1108     * Normally the code in a reference should be always converted
1109     * to the Unicode character with the same code, but due to
1110     * wide usage of Cp1252 charset most browsers map numeric references
1111     * in the range 130-159 (which are control chars in Unicode set)
1112     * to displayable characters with other codes.
1113     *
1114     * @param c the code of numeric character reference.
1115     * @return a char array corresponding to the reference code.
1116     */
1117    private char[] mapNumericReference(int c) {
1118        char[] data;
1119        if (c >= 0xffff) { // outside unicode BMP.
1120            try {
1121                data = Character.toChars(c);
1122            } catch (IllegalArgumentException e) {
1123                data = new char[0];
1124            }
1125        } else {
1126            data = new char[1];
1127            data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130];
1128        }
1129        return data;
1130    }
1131
1132    /**
1133     * Parse a comment. [92] 391:7
1134     */
1135    void parseComment() throws IOException {
1136
1137        while (true) {
1138            int c = ch;
1139            switch (c) {
1140              case '-':
1141                  /** Presuming that the start string of a comment "<!--" has
1142                      already been parsed, the '-' character is valid only as
1143                      part of a comment termination and further more it must
1144                      be present in even numbers. Hence if strict is true, we
1145                      presume the comment has been terminated and return.
1146                      However if strict is false, then there is no even number
1147                      requirement and this character can appear anywhere in the
1148                      comment.  The parser reads on until it sees the following
1149                      pattern: "-->" or "--!>".
1150                   **/
1151                if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
1152                    if ((ch = readCh()) == '>') {
1153                        return;
1154                    }
1155                    if (ch == '!') {
1156                        if ((ch = readCh()) == '>') {
1157                            return;
1158                        } else {
1159                            /* to account for extra read()'s that happened */
1160                            addString('-');
1161                            addString('!');
1162                            continue;
1163                        }
1164                    }
1165                    break;
1166                }
1167
1168                if ((ch = readCh()) == '-') {
1169                    ch = readCh();
1170                    if (strict || ch == '>') {
1171                        return;
1172                    }
1173                    if (ch == '!') {
1174                        if ((ch = readCh()) == '>') {
1175                            return;
1176                        } else {
1177                            /* to account for extra read()'s that happened */
1178                            addString('-');
1179                            addString('!');
1180                            continue;
1181                        }
1182                    }
1183                    /* to account for the extra read() */
1184                    addString('-');
1185                }
1186                break;
1187
1188              case -1:
1189                  handleEOFInComment();
1190                  return;
1191
1192              case '\n':
1193                ln++;
1194                ch = readCh();
1195                lfCount++;
1196                break;
1197
1198              case '>':
1199                ch = readCh();
1200                break;
1201
1202              case '\r':
1203                ln++;
1204                if ((ch = readCh()) == '\n') {
1205                    ch = readCh();
1206                    crlfCount++;
1207                }
1208                else {
1209                    crCount++;
1210                }
1211                c = '\n';
1212                break;
1213              default:
1214                ch = readCh();
1215                break;
1216            }
1217
1218            addString(c);
1219        }
1220    }
1221
1222    /**
1223     * Parse literal content. [46] 343:1 and [47] 344:1
1224     */
1225    void parseLiteral(boolean replace) throws IOException {
1226        while (true) {
1227            int c = ch;
1228            switch (c) {
1229              case -1:
1230                error("eof.literal", stack.elem.getName());
1231                endTag(true);
1232                return;
1233
1234              case '>':
1235                ch = readCh();
1236                int i = textpos - (stack.elem.name.length() + 2), j = 0;
1237
1238                // match end tag
1239                if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1240                    while ((++i < textpos) &&
1241                           (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1242                    if (i == textpos) {
1243                        textpos -= (stack.elem.name.length() + 2);
1244                        if ((textpos > 0) && (text[textpos-1] == '\n')) {
1245                            textpos--;
1246                        }
1247                        endTag(false);
1248                        return;
1249                    }
1250                }
1251                break;
1252
1253              case '&':
1254                char data[] = parseEntityReference();
1255                if (textpos + data.length > text.length) {
1256                    char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1257                    System.arraycopy(text, 0, newtext, 0, text.length);
1258                    text = newtext;
1259                }
1260                System.arraycopy(data, 0, text, textpos, data.length);
1261                textpos += data.length;
1262                continue;
1263
1264              case '\n':
1265                ln++;
1266                ch = readCh();
1267                lfCount++;
1268                break;
1269
1270              case '\r':
1271                ln++;
1272                if ((ch = readCh()) == '\n') {
1273                    ch = readCh();
1274                    crlfCount++;
1275                }
1276                else {
1277                    crCount++;
1278                }
1279                c = '\n';
1280                break;
1281              default:
1282                ch = readCh();
1283                break;
1284            }
1285
1286            // output character
1287            if (textpos == text.length) {
1288                char newtext[] = new char[text.length + 128];
1289                System.arraycopy(text, 0, newtext, 0, text.length);
1290                text = newtext;
1291            }
1292            text[textpos++] = (char)c;
1293        }
1294    }
1295
1296    /**
1297     * Parse attribute value. [33] 331:1
1298     */
1299    @SuppressWarnings("fallthrough")
1300    String parseAttributeValue(boolean lower) throws IOException {
1301        int delim = -1;
1302
1303        // Check for a delimiter
1304        switch(ch) {
1305          case '\'':
1306          case '"':
1307            delim = ch;
1308            ch = readCh();
1309            break;
1310        }
1311
1312        // Parse the rest of the value
1313        while (true) {
1314            int c = ch;
1315
1316            switch (c) {
1317              case '\n':
1318                ln++;
1319                ch = readCh();
1320                lfCount++;
1321                if (delim < 0) {
1322                    return getString(0);
1323                }
1324                break;
1325
1326              case '\r':
1327                ln++;
1328
1329                if ((ch = readCh()) == '\n') {
1330                    ch = readCh();
1331                    crlfCount++;
1332                }
1333                else {
1334                    crCount++;
1335                }
1336                if (delim < 0) {
1337                    return getString(0);
1338                }
1339                break;
1340
1341              case '\t':
1342                  if (delim < 0)
1343                      c = ' ';
1344                  // Fall through
1345              case ' ':
1346                ch = readCh();
1347                if (delim < 0) {
1348                    return getString(0);
1349                }
1350                break;
1351
1352              case '>':
1353              case '<':
1354                if (delim < 0) {
1355                    return getString(0);
1356                }
1357                ch = readCh();
1358                break;
1359
1360              case '\'':
1361              case '"':
1362                ch = readCh();
1363                if (c == delim) {
1364                    return getString(0);
1365                } else if (delim == -1) {
1366                    error("attvalerr");
1367                    if (strict || ch == ' ') {
1368                        return getString(0);
1369                    } else {
1370                        continue;
1371                    }
1372                }
1373                break;
1374
1375            case '=':
1376                if (delim < 0) {
1377                    /* In SGML a construct like <img src=/cgi-bin/foo?x=1>
1378                       is considered invalid since an = sign can only be contained
1379                       in an attributes value if the string is quoted.
1380                       */
1381                    error("attvalerr");
1382                    /* If strict is true then we return with the string we have thus far.
1383                       Otherwise we accept the = sign as part of the attribute's value and
1384                       process the rest of the img tag. */
1385                    if (strict) {
1386                        return getString(0);
1387                    }
1388                }
1389                ch = readCh();
1390                break;
1391
1392              case '&':
1393                if (strict && delim < 0) {
1394                    ch = readCh();
1395                    break;
1396                }
1397
1398                char data[] = parseEntityReference();
1399                for (int i = 0 ; i < data.length ; i++) {
1400                    c = data[i];
1401                    addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
1402                }
1403                continue;
1404
1405              case -1:
1406                return getString(0);
1407
1408              default:
1409                if (lower && (c >= 'A') && (c <= 'Z')) {
1410                    c = 'a' + c - 'A';
1411                }
1412                ch = readCh();
1413                break;
1414            }
1415            addString(c);
1416        }
1417    }
1418
1419
1420    /**
1421     * Parse attribute specification List. [31] 327:17
1422     */
1423    void parseAttributeSpecificationList(Element elem) throws IOException {
1424
1425        while (true) {
1426            skipSpace();
1427
1428            switch (ch) {
1429              case '/':
1430              case '>':
1431              case '<':
1432              case -1:
1433                return;
1434
1435              case '-':
1436                if ((ch = readCh()) == '-') {
1437                    ch = readCh();
1438                    parseComment();
1439                    strpos = 0;
1440                } else {
1441                    error("invalid.tagchar", "-", elem.getName());
1442                    ch = readCh();
1443                }
1444                continue;
1445            }
1446
1447            AttributeList att;
1448            String attname;
1449            String attvalue;
1450
1451            if (parseIdentifier(true)) {
1452                attname = getString(0);
1453                skipSpace();
1454                if (ch == '=') {
1455                    ch = readCh();
1456                    skipSpace();
1457                    att = elem.getAttribute(attname);
1458//  Bug ID 4102750
1459//  Load the NAME of an Attribute Case Sensitive
1460//  The case of the NAME  must be intact
1461//  MG 021898
1462                    attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
1463//                  attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
1464                } else {
1465                    attvalue = attname;
1466                    att = elem.getAttributeByValue(attvalue);
1467                    if (att == null) {
1468                        att = elem.getAttribute(attname);
1469                        if (att != null) {
1470                            attvalue = att.getValue();
1471                        }
1472                        else {
1473                            // Make it null so that NULL_ATTRIBUTE_VALUE is
1474                            // used
1475                            attvalue = null;
1476                        }
1477                    }
1478                }
1479            } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
1480                ch = readCh();
1481                continue;
1482            } else if (!strict && ch == '"') { // allows for quoted attributes
1483                ch = readCh();
1484                skipSpace();
1485                if (parseIdentifier(true)) {
1486                    attname = getString(0);
1487                    if (ch == '"') {
1488                        ch = readCh();
1489                    }
1490                    skipSpace();
1491                    if (ch == '=') {
1492                        ch = readCh();
1493                        skipSpace();
1494                        att = elem.getAttribute(attname);
1495                        attvalue = parseAttributeValue((att != null) &&
1496                                                (att.type != CDATA) &&
1497                                                (att.type != NOTATION));
1498                    } else {
1499                        attvalue = attname;
1500                        att = elem.getAttributeByValue(attvalue);
1501                        if (att == null) {
1502                            att = elem.getAttribute(attname);
1503                            if (att != null) {
1504                                attvalue = att.getValue();
1505                            }
1506                        }
1507                    }
1508                } else {
1509                    char str[] = {(char)ch};
1510                    error("invalid.tagchar", new String(str), elem.getName());
1511                    ch = readCh();
1512                    continue;
1513                }
1514            } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
1515                ch = readCh();
1516                skipSpace();
1517                attname = elem.getName();
1518                att = elem.getAttribute(attname);
1519                attvalue = parseAttributeValue((att != null) &&
1520                                               (att.type != CDATA) &&
1521                                               (att.type != NOTATION));
1522            } else if (!strict && (ch == '=')) {
1523                ch = readCh();
1524                skipSpace();
1525                attvalue = parseAttributeValue(true);
1526                error("attvalerr");
1527                return;
1528            } else {
1529                char str[] = {(char)ch};
1530                error("invalid.tagchar", new String(str), elem.getName());
1531                if (!strict) {
1532                    ch = readCh();
1533                    continue;
1534                } else {
1535                    return;
1536                }
1537            }
1538
1539            if (att != null) {
1540                attname = att.getName();
1541            } else {
1542                error("invalid.tagatt", attname, elem.getName());
1543            }
1544
1545            // Check out the value
1546            if (attributes.isDefined(attname)) {
1547                error("multi.tagatt", attname, elem.getName());
1548            }
1549            if (attvalue == null) {
1550                attvalue = ((att != null) && (att.value != null)) ? att.value :
1551                    HTML.NULL_ATTRIBUTE_VALUE;
1552            } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
1553                error("invalid.tagattval", attname, elem.getName());
1554            }
1555            HTML.Attribute attkey = HTML.getAttributeKey(attname);
1556            if (attkey == null) {
1557                attributes.addAttribute(attname, attvalue);
1558            } else {
1559                attributes.addAttribute(attkey, attvalue);
1560            }
1561        }
1562    }
1563
1564    /**
1565     * Parses the Document Type Declaration markup declaration.
1566     * Currently ignores it.
1567     *
1568     * @return the string representation of the markup declaration
1569     * @throws IOException if an I/O error occurs
1570     */
1571    public String parseDTDMarkup() throws IOException {
1572
1573        StringBuilder strBuff = new StringBuilder();
1574        ch = readCh();
1575        while(true) {
1576            switch (ch) {
1577            case '>':
1578                ch = readCh();
1579                return strBuff.toString();
1580            case -1:
1581                error("invalid.markup");
1582                return strBuff.toString();
1583            case '\n':
1584                ln++;
1585                ch = readCh();
1586                lfCount++;
1587                break;
1588            case '"':
1589                ch = readCh();
1590                break;
1591            case '\r':
1592                ln++;
1593                if ((ch = readCh()) == '\n') {
1594                    ch = readCh();
1595                    crlfCount++;
1596                }
1597                else {
1598                    crCount++;
1599                }
1600                break;
1601            default:
1602                strBuff.append((char)(ch & 0xFF));
1603                ch = readCh();
1604                break;
1605            }
1606        }
1607    }
1608
1609    /**
1610     * Parse markup declarations.
1611     * Currently only handles the Document Type Declaration markup.
1612     * Returns true if it is a markup declaration false otherwise.
1613     *
1614     * @param strBuff  the markup declaration
1615     * @return {@code true} if this is a valid markup declaration;
1616     *         otherwise {@code false}
1617     * @throws IOException if an I/O error occurs
1618     */
1619    protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
1620
1621        /* Currently handles only the DOCTYPE */
1622        if ((strBuff.length() == "DOCTYPE".length()) &&
1623            (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
1624            parseDTDMarkup();
1625            return true;
1626        }
1627        return false;
1628    }
1629
1630    /**
1631     * Parse an invalid tag.
1632     */
1633    void parseInvalidTag() throws IOException {
1634        // ignore all data upto the close bracket '>'
1635        while (true) {
1636            skipSpace();
1637            switch (ch) {
1638              case '>':
1639              case -1:
1640                  ch = readCh();
1641                return;
1642              case '<':
1643                  return;
1644              default:
1645                  ch = readCh();
1646
1647            }
1648        }
1649    }
1650
1651    /**
1652     * Parse a start or end tag.
1653     */
1654    @SuppressWarnings("fallthrough")
1655    void parseTag() throws IOException {
1656        Element elem;
1657        boolean net = false;
1658        boolean warned = false;
1659        boolean unknown = false;
1660
1661        switch (ch = readCh()) {
1662          case '!':
1663            switch (ch = readCh()) {
1664              case '-':
1665                // Parse comment. [92] 391:7
1666                while (true) {
1667                    if (ch == '-') {
1668                        if (!strict || ((ch = readCh()) == '-')) {
1669                            ch = readCh();
1670                            if (!strict && ch == '-') {
1671                                ch = readCh();
1672                            }
1673                            // send over any text you might see
1674                            // before parsing and sending the
1675                            // comment
1676                            if (textpos != 0) {
1677                                char newtext[] = new char[textpos];
1678                                System.arraycopy(text, 0, newtext, 0, textpos);
1679                                handleText(newtext);
1680                                lastBlockStartPos = currentBlockStartPos;
1681                                textpos = 0;
1682                            }
1683                            parseComment();
1684                            last = makeTag(dtd.getElement("comment"), true);
1685                            handleComment(getChars(0));
1686                            continue;
1687                        } else if (!warned) {
1688                            warned = true;
1689                            error("invalid.commentchar", "-");
1690                        }
1691                    }
1692                    skipSpace();
1693                    switch (ch) {
1694                      case '-':
1695                        continue;
1696                      case '>':
1697                        ch = readCh();
1698                        return;
1699                      case -1:
1700                        return;
1701                      default:
1702                        ch = readCh();
1703                        if (!warned) {
1704                            warned = true;
1705                            error("invalid.commentchar",
1706                                  String.valueOf((char)ch));
1707                        }
1708                        break;
1709                    }
1710                }
1711
1712              default:
1713                // deal with marked sections
1714                StringBuffer strBuff = new StringBuffer();
1715                while (true) {
1716                    strBuff.append((char)ch);
1717                    if (parseMarkupDeclarations(strBuff)) {
1718                        return;
1719                    }
1720                    switch(ch) {
1721                      case '>':
1722                        ch = readCh();
1723                        // Fall through
1724                      case -1:
1725                        error("invalid.markup");
1726                        return;
1727                      case '\n':
1728                        ln++;
1729                        ch = readCh();
1730                        lfCount++;
1731                        break;
1732                      case '\r':
1733                        ln++;
1734                        if ((ch = readCh()) == '\n') {
1735                            ch = readCh();
1736                            crlfCount++;
1737                        }
1738                        else {
1739                            crCount++;
1740                        }
1741                        break;
1742
1743                      default:
1744                        ch = readCh();
1745                        break;
1746                    }
1747                }
1748            }
1749
1750          case '/':
1751            // parse end tag [19] 317:4
1752            switch (ch = readCh()) {
1753              case '>':
1754                ch = readCh();
1755                // Fall through
1756              case '<':
1757                // empty end tag. either </> or </<
1758                if (recent == null) {
1759                    error("invalid.shortend");
1760                    return;
1761                }
1762                elem = recent;
1763                break;
1764
1765              default:
1766                if (!parseIdentifier(true)) {
1767                    error("expected.endtagname");
1768                    return;
1769                }
1770                skipSpace();
1771                switch (ch) {
1772                  case '>':
1773                    ch = readCh();
1774                    break;
1775                  case '<':
1776                    break;
1777
1778                  default:
1779                    error("expected", "'>'");
1780                    while ((ch != -1) && (ch != '\n') && (ch != '>')) {
1781                        ch = readCh();
1782                    }
1783                    if (ch == '>') {
1784                        ch = readCh();
1785                    }
1786                    break;
1787                }
1788                String elemStr = getString(0);
1789                if (!dtd.elementExists(elemStr)) {
1790                    error("end.unrecognized", elemStr);
1791                    // Ignore RE before end tag
1792                    if ((textpos > 0) && (text[textpos-1] == '\n')) {
1793                        textpos--;
1794                    }
1795                    elem = dtd.getElement("unknown");
1796                    elem.name = elemStr;
1797                    unknown = true;
1798                } else {
1799                    elem = dtd.getElement(elemStr);
1800                }
1801                break;
1802            }
1803
1804
1805            // If the stack is null, we're seeing end tags without any begin
1806            // tags.  Ignore them.
1807
1808            if (stack == null) {
1809                error("end.extra.tag", elem.getName());
1810                return;
1811            }
1812
1813            // Ignore RE before end tag
1814            if ((textpos > 0) && (text[textpos-1] == '\n')) {
1815                // In a pre tag, if there are blank lines
1816                // we do not want to remove the newline
1817                // before the end tag.  Hence this code.
1818                //
1819                if (stack.pre) {
1820                    if ((textpos > 1) && (text[textpos-2] != '\n')) {
1821                        textpos--;
1822                    }
1823                } else {
1824                    textpos--;
1825                }
1826            }
1827
1828            // If the end tag is a form, since we did not put it
1829            // on the tag stack, there is no corresponding start
1830            // start tag to find. Hence do not touch the tag stack.
1831            //
1832
1833            /*
1834            if (!strict && elem.getName().equals("form")) {
1835                if (lastFormSent != null) {
1836                    handleEndTag(lastFormSent);
1837                    return;
1838                } else {
1839                    // do nothing.
1840                    return;
1841                }
1842            }
1843            */
1844
1845            if (unknown) {
1846                // we will not see a corresponding start tag
1847                // on the stack.  If we are seeing an
1848                // end tag, lets send this on as an empty
1849                // tag with the end tag attribute set to
1850                // true.
1851                TagElement t = makeTag(elem);
1852                handleText(t);
1853                attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
1854                handleEmptyTag(makeTag(elem));
1855                unknown = false;
1856                return;
1857            }
1858
1859            // find the corresponding start tag
1860
1861            // A commonly occurring error appears to be the insertion
1862            // of extra end tags in a table.  The intent here is ignore
1863            // such extra end tags.
1864            //
1865            if (!strict) {
1866                String stackElem = stack.elem.getName();
1867
1868                if (stackElem.equals("table")) {
1869                    // If it is not a valid end tag ignore it and return
1870                    //
1871                    if (!elem.getName().equals(stackElem)) {
1872                        error("tag.ignore", elem.getName());
1873                        return;
1874                    }
1875                }
1876
1877
1878
1879                if (stackElem.equals("tr") ||
1880                    stackElem.equals("td")) {
1881                    if ((!elem.getName().equals("table")) &&
1882                        (!elem.getName().equals(stackElem))) {
1883                        error("tag.ignore", elem.getName());
1884                        return;
1885                    }
1886                }
1887            }
1888            TagStack sp = stack;
1889
1890            while ((sp != null) && (elem != sp.elem)) {
1891                sp = sp.next;
1892            }
1893            if (sp == null) {
1894                error("unmatched.endtag", elem.getName());
1895                return;
1896            }
1897
1898            // People put font ending tags in the darndest places.
1899            // Don't close other contexts based on them being between
1900            // a font tag and the corresponding end tag.  Instead,
1901            // ignore the end tag like it doesn't exist and allow the end
1902            // of the document to close us out.
1903            String elemName = elem.getName();
1904            if (stack != sp &&
1905                (elemName.equals("font") ||
1906                 elemName.equals("center"))) {
1907
1908                // Since closing out a center tag can have real wierd
1909                // effects on the formatting,  make sure that tags
1910                // for which omitting an end tag is legimitate
1911                // get closed out.
1912                //
1913                if (elemName.equals("center")) {
1914                    while(stack.elem.omitEnd() && stack != sp) {
1915                        endTag(true);
1916                    }
1917                    if (stack.elem == elem) {
1918                        endTag(false);
1919                    }
1920                }
1921                return;
1922            }
1923            // People do the same thing with center tags.  In this
1924            // case we would like to close off the center tag but
1925            // not necessarily all enclosing tags.
1926
1927
1928
1929            // end tags
1930            while (stack != sp) {
1931                endTag(true);
1932            }
1933
1934            endTag(false);
1935            return;
1936
1937          case -1:
1938            error("eof");
1939            return;
1940        }
1941
1942        // start tag [14] 314:1
1943        if (!parseIdentifier(true)) {
1944            elem = recent;
1945            if ((ch != '>') || (elem == null)) {
1946                error("expected.tagname");
1947                return;
1948            }
1949        } else {
1950            String elemStr = getString(0);
1951
1952            if (elemStr.equals("image")) {
1953                elemStr = "img";
1954            }
1955
1956            /* determine if this element is part of the dtd. */
1957
1958            if (!dtd.elementExists(elemStr)) {
1959                //              parseInvalidTag();
1960                error("tag.unrecognized ", elemStr);
1961                elem = dtd.getElement("unknown");
1962                elem.name = elemStr;
1963                unknown = true;
1964            } else {
1965                elem = dtd.getElement(elemStr);
1966            }
1967        }
1968
1969        // Parse attributes
1970        parseAttributeSpecificationList(elem);
1971
1972        switch (ch) {
1973          case '/':
1974            net = true;
1975            // Fall through
1976          case '>':
1977            ch = readCh();
1978            if (ch == '>' && net) {
1979                ch = readCh();
1980            }
1981          case '<':
1982            break;
1983
1984          default:
1985            error("expected", "'>'");
1986            break;
1987        }
1988
1989        if (!strict) {
1990          if (elem.getName().equals("script")) {
1991            error("javascript.unsupported");
1992          }
1993        }
1994
1995        // ignore RE after start tag
1996        //
1997        if (!elem.isEmpty())  {
1998            if (ch == '\n') {
1999                ln++;
2000                lfCount++;
2001                ch = readCh();
2002            } else if (ch == '\r') {
2003                ln++;
2004                if ((ch = readCh()) == '\n') {
2005                    ch = readCh();
2006                    crlfCount++;
2007                }
2008                else {
2009                    crCount++;
2010                }
2011            }
2012        }
2013
2014        // ensure a legal context for the tag
2015        TagElement tag = makeTag(elem, false);
2016
2017
2018        /** In dealing with forms, we have decided to treat
2019            them as legal in any context.  Also, even though
2020            they do have a start and an end tag, we will
2021            not put this tag on the stack.  This is to deal
2022            several pages in the web oasis that choose to
2023            start and end forms in any possible location. **/
2024
2025        /*
2026        if (!strict && elem.getName().equals("form")) {
2027            if (lastFormSent == null) {
2028                lastFormSent = tag;
2029            } else {
2030                handleEndTag(lastFormSent);
2031                lastFormSent = tag;
2032            }
2033        } else {
2034        */
2035            // Smlly, if a tag is unknown, we will apply
2036            // no legalTagContext logic to it.
2037            //
2038            if (!unknown) {
2039                legalTagContext(tag);
2040
2041                // If skip tag is true,  this implies that
2042                // the tag was illegal and that the error
2043                // recovery strategy adopted is to ignore
2044                // the tag.
2045                if (!strict && skipTag) {
2046                    skipTag = false;
2047                    return;
2048                }
2049            }
2050            /*
2051        }
2052            */
2053
2054        startTag(tag);
2055
2056        if (!elem.isEmpty()) {
2057            switch (elem.getType()) {
2058              case CDATA:
2059                parseLiteral(false);
2060                break;
2061              case RCDATA:
2062                parseLiteral(true);
2063                break;
2064              default:
2065                if (stack != null) {
2066                    stack.net = net;
2067                }
2068                break;
2069            }
2070        }
2071    }
2072
2073    private static final String START_COMMENT = "<!--";
2074    private static final String END_COMMENT = "-->";
2075    private static final char[] SCRIPT_END_TAG = "</script>".toCharArray();
2076    private static final char[] SCRIPT_END_TAG_UPPER_CASE =
2077                                        "</SCRIPT>".toCharArray();
2078
2079    void parseScript() throws IOException {
2080        char[] charsToAdd = new char[SCRIPT_END_TAG.length];
2081        boolean insideComment = false;
2082
2083        /* Here, ch should be the first character after <script> */
2084        while (true) {
2085            int i = 0;
2086            while (!insideComment && i < SCRIPT_END_TAG.length
2087                    && (SCRIPT_END_TAG[i] == ch
2088                    || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) {
2089                charsToAdd[i] = (char) ch;
2090                ch = readCh();
2091                i++;
2092            }
2093            if (i == SCRIPT_END_TAG.length) {
2094                return;
2095            }
2096
2097            if (!insideComment && i == 1 && charsToAdd[0] == START_COMMENT.charAt(0)) {
2098                // it isn't end script tag, but may be it's start comment tag?
2099                while (i < START_COMMENT.length()
2100                        && START_COMMENT.charAt(i) == ch) {
2101                    charsToAdd[i] = (char) ch;
2102                    ch = readCh();
2103                    i++;
2104                }
2105                if (i == START_COMMENT.length()) {
2106                    insideComment = true;
2107                }
2108            }
2109            if (insideComment) {
2110                while (i < END_COMMENT.length()
2111                        && END_COMMENT.charAt(i) == ch) {
2112                    charsToAdd[i] = (char) ch;
2113                    ch = readCh();
2114                    i++;
2115                }
2116                if (i == END_COMMENT.length()) {
2117                    insideComment = false;
2118                }
2119            }
2120
2121            /* To account for extra read()'s that happened */
2122            if (i > 0) {
2123                for (int j = 0; j < i; j++) {
2124                    addString(charsToAdd[j]);
2125                }
2126                continue;
2127            }
2128            switch (ch) {
2129            case -1:
2130                error("eof.script");
2131                return;
2132            case '\n':
2133                ln++;
2134                ch = readCh();
2135                lfCount++;
2136                addString('\n');
2137                break;
2138            case '\r':
2139                ln++;
2140                if ((ch = readCh()) == '\n') {
2141                    ch = readCh();
2142                    crlfCount++;
2143                } else {
2144                    crCount++;
2145                }
2146                addString('\n');
2147                break;
2148            default:
2149                addString(ch);
2150                ch = readCh();
2151                break;
2152            } // switch
2153        } // while
2154    }
2155
2156    /**
2157     * Parse Content. [24] 320:1
2158     */
2159    void parseContent() throws IOException {
2160        Thread curThread = Thread.currentThread();
2161
2162        for (;;) {
2163            if (curThread.isInterrupted()) {
2164                curThread.interrupt(); // resignal the interrupt
2165                break;
2166            }
2167
2168            int c = ch;
2169            currentBlockStartPos = currentPosition;
2170
2171            if (recent == dtd.script) { // means: if after starting <script> tag
2172
2173                /* Here, ch has to be the first character after <script> */
2174                parseScript();
2175                last = makeTag(dtd.getElement("comment"), true);
2176
2177                /* Remove leading and trailing HTML comment declarations */
2178                String str = new String(getChars(0)).trim();
2179                int minLength = START_COMMENT.length() + END_COMMENT.length();
2180                if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT)
2181                       && str.length() >= (minLength)) {
2182                    str = str.substring(START_COMMENT.length(),
2183                                      str.length() - END_COMMENT.length());
2184                }
2185
2186                /* Handle resulting chars as comment */
2187                handleComment(str.toCharArray());
2188                endTag(false);
2189                lastBlockStartPos = currentPosition;
2190
2191                continue;
2192            } else {
2193                switch (c) {
2194                  case '<':
2195                    parseTag();
2196                    lastBlockStartPos = currentPosition;
2197                    continue;
2198
2199                  case '/':
2200                    ch = readCh();
2201                    if ((stack != null) && stack.net) {
2202                        // null end tag.
2203                        endTag(false);
2204                        continue;
2205                    } else if (textpos == 0) {
2206                        if (!legalElementContext(dtd.pcdata)) {
2207                            error("unexpected.pcdata");
2208                        }
2209                        if (last.breaksFlow()) {
2210                            space = false;
2211                        }
2212                    }
2213                    break;
2214
2215                  case -1:
2216                    return;
2217
2218                  case '&':
2219                    if (textpos == 0) {
2220                        if (!legalElementContext(dtd.pcdata)) {
2221                            error("unexpected.pcdata");
2222                        }
2223                        if (last.breaksFlow()) {
2224                            space = false;
2225                        }
2226                    }
2227                    char data[] = parseEntityReference();
2228                    if (textpos + data.length + 1 > text.length) {
2229                        char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2230                        System.arraycopy(text, 0, newtext, 0, text.length);
2231                        text = newtext;
2232                    }
2233                    if (space) {
2234                        space = false;
2235                        text[textpos++] = ' ';
2236                    }
2237                    System.arraycopy(data, 0, text, textpos, data.length);
2238                    textpos += data.length;
2239                    ignoreSpace = false;
2240                    continue;
2241
2242                  case '\n':
2243                    ln++;
2244                    lfCount++;
2245                    ch = readCh();
2246                    if ((stack != null) && stack.pre) {
2247                        break;
2248                    }
2249                    if (textpos == 0) {
2250                        lastBlockStartPos = currentPosition;
2251                    }
2252                    if (!ignoreSpace) {
2253                        space = true;
2254                    }
2255                    continue;
2256
2257                  case '\r':
2258                    ln++;
2259                    c = '\n';
2260                    if ((ch = readCh()) == '\n') {
2261                        ch = readCh();
2262                        crlfCount++;
2263                    }
2264                    else {
2265                        crCount++;
2266                    }
2267                    if ((stack != null) && stack.pre) {
2268                        break;
2269                    }
2270                    if (textpos == 0) {
2271                        lastBlockStartPos = currentPosition;
2272                    }
2273                    if (!ignoreSpace) {
2274                        space = true;
2275                    }
2276                    continue;
2277
2278
2279                  case '\t':
2280                  case ' ':
2281                    ch = readCh();
2282                    if ((stack != null) && stack.pre) {
2283                        break;
2284                    }
2285                    if (textpos == 0) {
2286                        lastBlockStartPos = currentPosition;
2287                    }
2288                    if (!ignoreSpace) {
2289                        space = true;
2290                    }
2291                    continue;
2292
2293                  default:
2294                    if (textpos == 0) {
2295                        if (!legalElementContext(dtd.pcdata)) {
2296                            error("unexpected.pcdata");
2297                        }
2298                        if (last.breaksFlow()) {
2299                            space = false;
2300                        }
2301                    }
2302                    ch = readCh();
2303                    break;
2304                }
2305            }
2306
2307            // enlarge buffer if needed
2308            if (textpos + 2 > text.length) {
2309                char newtext[] = new char[text.length + 128];
2310                System.arraycopy(text, 0, newtext, 0, text.length);
2311                text = newtext;
2312            }
2313
2314            // output pending space
2315            if (space) {
2316                if (textpos == 0) {
2317                    lastBlockStartPos--;
2318                }
2319                text[textpos++] = ' ';
2320                space = false;
2321            }
2322            text[textpos++] = (char)c;
2323            ignoreSpace = false;
2324        }
2325    }
2326
2327    /**
2328     * Returns the end of line string. This will return the end of line
2329     * string that has been encountered the most, one of \r, \n or \r\n.
2330     */
2331    String getEndOfLineString() {
2332        if (crlfCount >= crCount) {
2333            if (lfCount >= crlfCount) {
2334                return "\n";
2335            }
2336            else {
2337                return "\r\n";
2338            }
2339        }
2340        else {
2341            if (crCount > lfCount) {
2342                return "\r";
2343            }
2344            else {
2345                return "\n";
2346            }
2347        }
2348    }
2349
2350    /**
2351     * Parse an HTML stream, given a DTD.
2352     *
2353     * @param in  the reader to read the source from
2354     * @throws IOException if an I/O error occurs
2355     */
2356    public synchronized void parse(Reader in) throws IOException {
2357        this.in = in;
2358
2359        this.ln = 1;
2360
2361        seenHtml = false;
2362        seenHead = false;
2363        seenBody = false;
2364
2365        crCount = lfCount = crlfCount = 0;
2366
2367        try {
2368            ch = readCh();
2369            text = new char[1024];
2370            str = new char[128];
2371
2372            parseContent();
2373            // NOTE: interruption may have occurred.  Control flows out
2374            // of here normally.
2375            while (stack != null) {
2376                endTag(true);
2377            }
2378            in.close();
2379        } catch (IOException e) {
2380            errorContext();
2381            error("ioexception");
2382            throw e;
2383        } catch (Exception e) {
2384            errorContext();
2385            error("exception", e.getClass().getName(), e.getMessage());
2386            e.printStackTrace();
2387        } catch (ThreadDeath e) {
2388            errorContext();
2389            error("terminated");
2390            e.printStackTrace();
2391            throw e;
2392        } finally {
2393            for (; stack != null ; stack = stack.next) {
2394                handleEndTag(stack.tag);
2395            }
2396
2397            text = null;
2398            str = null;
2399        }
2400
2401    }
2402
2403
2404    /*
2405     * Input cache.  This is much faster than calling down to a synchronized
2406     * method of BufferedReader for each byte.  Measurements done 5/30/97
2407     * show that there's no point in having a bigger buffer:  Increasing
2408     * the buffer to 8192 had no measurable impact for a program discarding
2409     * one character at a time (reading from an http URL to a local machine).
2410     * NOTE: If the current encoding is bogus, and we read too much
2411     * (past the content-type) we may suffer a MalformedInputException. For
2412     * this reason the initial size is 1 and when the body is encountered the
2413     * size is adjusted to 256.
2414     */
2415    private char buf[] = new char[1];
2416    private int pos;
2417    private int len;
2418    /*
2419        tracks position relative to the beginning of the
2420        document.
2421    */
2422    private int currentPosition;
2423
2424
2425    private final int readCh() throws IOException {
2426
2427        if (pos >= len) {
2428
2429            // This loop allows us to ignore interrupts if the flag
2430            // says so
2431            for (;;) {
2432                try {
2433                    len = in.read(buf);
2434                    break;
2435                } catch (InterruptedIOException ex) {
2436                    throw ex;
2437                }
2438            }
2439
2440            if (len <= 0) {
2441                return -1;      // eof
2442            }
2443            pos = 0;
2444        }
2445        ++currentPosition;
2446
2447        return buf[pos++];
2448    }
2449
2450
2451    /**
2452     * Returns the current position.
2453     *
2454     * @return the current position
2455     */
2456    protected int getCurrentPos() {
2457        return currentPosition;
2458    }
2459}
2460