1/*
2 * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package javax.swing.text.html.parser;
27
28import javax.swing.text.SimpleAttributeSet;
29import javax.swing.text.html.HTMLEditorKit;
30import javax.swing.text.html.HTML;
31import javax.swing.text.ChangedCharSetException;
32
33import java.util.*;
34import java.io.*;
35import java.net.*;
36
37/**
38 * A Parser for HTML Documents (actually, you can specify a DTD, but
39 * you should really only use this class with the html dtd in swing).
40 * Reads an InputStream of HTML and
41 * invokes the appropriate methods in the ParserCallback class. This
42 * is the default parser used by HTMLEditorKit to parse HTML url's.
43 * <p>This will message the callback for all valid tags, as well as
44 * tags that are implied but not explicitly specified. For example, the
45 * html string (&lt;p&gt;blah) only has a p tag defined. The callback
46 * will see the following methods:
47 * <ol><li><i>handleStartTag(html, ...)</i></li>
48 *     <li><i>handleStartTag(head, ...)</i></li>
49 *     <li><i>handleEndTag(head)</i></li>
50 *     <li><i>handleStartTag(body, ...)</i></li>
51 *     <li><i>handleStartTag(p, ...)</i></li>
52 *     <li><i>handleText(...)</i></li>
53 *     <li><i>handleEndTag(p)</i></li>
54 *     <li><i>handleEndTag(body)</i></li>
55 *     <li><i>handleEndTag(html)</i></li>
56 * </ol>
57 * The items in <i>italic</i> are implied, that is, although they were not
58 * explicitly specified, to be correct html they should have been present
59 * (head isn't necessary, but it is still generated). For tags that
60 * are implied, the AttributeSet argument will have a value of
61 * <code>Boolean.TRUE</code> for the key
62 * <code>HTMLEditorKit.ParserCallback.IMPLIED</code>.
 * <p>HTML.Attribute defines a type safe enumeration of html attributes.
64 * If an attribute key of a tag is defined in HTML.Attribute, the
65 * HTML.Attribute will be used as the key, otherwise a String will be used.
66 * For example &lt;p foo=bar class=neat&gt; has two attributes. foo is
 * not defined in HTML.Attribute, whereas class is, therefore the
68 * AttributeSet will have two values in it, HTML.Attribute.CLASS with
69 * a String value of 'neat' and the String key 'foo' with a String value of
70 * 'bar'.
71 * <p>The position argument will indicate the start of the tag, comment
72 * or text. Similar to arrays, the first character in the stream has a
73 * position of 0. For tags that are
74 * implied the position will indicate
75 * the location of the next encountered tag. In the first example,
76 * the implied start body and html tags will have the same position as the
77 * p tag, and the implied end p, html and body tags will all have the same
78 * position.
79 * <p>As html skips whitespace the position for text will be the position
80 * of the first valid character, eg in the string '\n\n\nblah'
81 * the text 'blah' will have a position of 3, the newlines are skipped.
82 * <p>
83 * For attributes that do not have a value, eg in the html
84 * string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code>
85 * does not have a value, there are two possible values that will be
86 * placed in the AttributeSet's value:
87 * <ul>
 * <li>If the DTD does not contain a definition for the element, or the
89 *     definition does not have an explicit value then the value in the
90 *     AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>.
91 * <li>If the DTD contains an explicit value, as in:
92 *     <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code>
93 *     this value from the dtd (in this case selected) will be used.
94 * </ul>
95 * <p>
96 * Once the stream has been parsed, the callback is notified of the most
97 * likely end of line string. The end of line string will be one of
 * \n, \r or \r\n, whichever is encountered the most in parsing the
99 * stream.
100 *
101 * @author      Sunita Mani
102 */
103public class DocumentParser extends javax.swing.text.html.parser.Parser {
104
105    private int inbody;
106    private int intitle;
107    private int inhead;
108    private int instyle;
109    private int inscript;
110    private boolean seentitle;
111    private HTMLEditorKit.ParserCallback callback = null;
112    private boolean ignoreCharSet = false;
113    private static final boolean debugFlag = false;
114
115    /**
116     * Creates document parser with the specified {@code dtd}.
117     *
118     * @param dtd the dtd.
119     */
120    public DocumentParser(DTD dtd) {
121        super(dtd);
122    }
123
124    /**
125     * Parse an HTML stream, given a DTD.
126     *
127     * @param in the reader to read the source from
128     * @param callback the callback
129     * @param ignoreCharSet if {@code true} the charset is ignored
130     * @throws IOException if an I/O error occurs
131     */
132    public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException {
133        this.ignoreCharSet = ignoreCharSet;
134        this.callback = callback;
135        parse(in);
136        // end of line
137        callback.handleEndOfLineString(getEndOfLineString());
138    }
139
140    /**
141     * Handle Start Tag.
142     */
143    protected void handleStartTag(TagElement tag) {
144
145        Element elem = tag.getElement();
146        if (elem == dtd.body) {
147            inbody++;
148        } else if (elem == dtd.html) {
149        } else if (elem == dtd.head) {
150            inhead++;
151        } else if (elem == dtd.title) {
152            intitle++;
153        } else if (elem == dtd.style) {
154            instyle++;
155        } else if (elem == dtd.script) {
156            inscript++;
157        }
158        if (debugFlag) {
159            if (tag.fictional()) {
160                debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
161            } else {
162                debug("Start Tag: " + tag.getHTMLTag() + " attributes: " +
163                      getAttributes() + " pos: " + getCurrentPos());
164            }
165        }
166        if (tag.fictional()) {
167            SimpleAttributeSet attrs = new SimpleAttributeSet();
168            attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
169                               Boolean.TRUE);
170            callback.handleStartTag(tag.getHTMLTag(), attrs,
171                                    getBlockStartPosition());
172        } else {
173            callback.handleStartTag(tag.getHTMLTag(), getAttributes(),
174                                    getBlockStartPosition());
175            flushAttributes();
176        }
177    }
178
179
180    protected void handleComment(char text[]) {
181        if (debugFlag) {
182            debug("comment: ->" + new String(text) + "<-"
183                  + " pos: " + getCurrentPos());
184        }
185        callback.handleComment(text, getBlockStartPosition());
186    }
187
188    /**
189     * Handle Empty Tag.
190     */
191    protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
192
193        Element elem = tag.getElement();
194        if (elem == dtd.meta && !ignoreCharSet) {
195            SimpleAttributeSet atts = getAttributes();
196            if (atts != null) {
197                String content = (String)atts.getAttribute(HTML.Attribute.CONTENT);
198                if (content != null) {
199                    if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
200                        if (!content.equalsIgnoreCase("text/html") &&
201                                !content.equalsIgnoreCase("text/plain")) {
202                            throw new ChangedCharSetException(content, false);
203                        }
204                    } else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
205                        throw new ChangedCharSetException(content, true);
206                    }
207                }
208            }
209        }
210        if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) {
211            if (debugFlag) {
212                if (tag.fictional()) {
213                    debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
214                } else {
215                    debug("Empty Tag: " + tag.getHTMLTag() + " attributes: "
216                          + getAttributes() + " pos: " + getCurrentPos());
217                }
218            }
219            if (tag.fictional()) {
220                SimpleAttributeSet attrs = new SimpleAttributeSet();
221                attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
222                                   Boolean.TRUE);
223                callback.handleSimpleTag(tag.getHTMLTag(), attrs,
224                                         getBlockStartPosition());
225            } else {
226                callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
227                                         getBlockStartPosition());
228                flushAttributes();
229            }
230        }
231    }
232
233    /**
234     * Handle End Tag.
235     */
236    protected void handleEndTag(TagElement tag) {
237        Element elem = tag.getElement();
238        if (elem == dtd.body) {
239            inbody--;
240        } else if (elem == dtd.title) {
241            intitle--;
242            seentitle = true;
243        } else if (elem == dtd.head) {
244            inhead--;
245        } else if (elem == dtd.style) {
246            instyle--;
247        } else if (elem == dtd.script) {
248            inscript--;
249        }
250        if (debugFlag) {
251            debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
252        }
253        callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());
254
255    }
256
257    /**
258     * Handle Text.
259     */
260    protected void handleText(char data[]) {
261        if (data != null) {
262            if (inscript != 0) {
263                callback.handleComment(data, getBlockStartPosition());
264                return;
265            }
266            if (inbody != 0 || ((instyle != 0) ||
267                                ((intitle != 0) && !seentitle))) {
268                if (debugFlag) {
269                    debug("text:  ->" + new String(data) + "<-" + " pos: " + getCurrentPos());
270                }
271                callback.handleText(data, getBlockStartPosition());
272            }
273        }
274    }
275
276    /*
277     * Error handling.
278     */
279    protected void handleError(int ln, String errorMsg) {
280        if (debugFlag) {
281            debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());
282        }
283        /* PENDING: need to improve the error string. */
284        callback.handleError(errorMsg, getCurrentPos());
285    }
286
287
288    /*
289     * debug messages
290     */
291    private void debug(String msg) {
292        System.out.println(msg);
293    }
294}
295