1/*
2 * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package com.sun.xml.internal.dtdparser;
27
28import org.xml.sax.EntityResolver;
29import org.xml.sax.InputSource;
30
31import java.io.File;
32import java.io.FileInputStream;
33import java.io.IOException;
34import java.io.InputStream;
35import java.net.URL;
36import java.net.URLConnection;
37import java.util.Hashtable;
38import java.util.Locale;
39
40/**
41 * This entity resolver class provides a number of utilities which can help
42 * managment of external parsed entities in XML.  These are commonly used
43 * to hold markup declarations that are to be used as part of a Document
44 * Type Declaration (DTD), or to hold text marked up with XML.
45 * <p>
46 * <P> Features include: <UL>
47 *
48 * <LI> Static factory methods are provided for constructing SAX InputSource
49 * objects from Files, URLs, or MIME objects.  This eliminates a class of
50 * error-prone coding in applications.</LI>
51 *
52 * <LI> Character encodings for XML documents are correctly supported:<UL>
53 *
54 * <LI> The encodings defined in the RFCs for MIME content types
55 * (2046 for general MIME, and 2376 for XML in particular), are
56 * supported, handling <em>charset=...</em> attributes and accepting
57 * content types which are known to be safe for use with XML;</LI>
58 *
59 * <LI> The character encoding autodetection algorithm identified
60 * in the XML specification is used, and leverages all of
61 * the JDK 1.1 (and later) character encoding support.</LI>
62 *
63 * <LI> The use of MIME typing may optionally be disabled, forcing the
64 * use of autodetection, to support web servers which don't correctly
65 * report MIME types for XML.  For example, they may report text that
66 * is encoded in EUC-JP as being US-ASCII text, leading to fatal
67 * errors during parsing.</LI>
68 *
69 * <LI> The InputSource objects returned by this class always
70 * have a <code>java.io.Reader</code> available as the "character
71 * stream" property.</LI>
72 *
73 * </UL></LI>
74 *
75 * <LI> Catalog entries can map public identifiers to Java resources or
76 * to local URLs.  These are used to reduce network dependencies and loads,
77 * and will often be used for external DTD components.  For example, packages
78 * shipping DTD files as resources in JAR files can eliminate network traffic
79 * when accessing them, and sites may provide local caches of common DTDs.
80 * Note that no particular catalog syntax is supported by this class, only
81 * the notion of a set of entries.</LI>
82 *
83 * </UL>
84 * <p>
85 * <P> Subclasses can perform tasks such as supporting new URI schemes for
86 * URIs which are not URLs, such as URNs (see RFC 2396) or for accessing
87 * MIME entities which are part of a <em>multipart/related</em> group
88 * (see RFC 2387).  They may also be used to support particular catalog
89 * syntaxes, such as the <a href="http://www.oasis-open.org/html/a401.htm">
90 * SGML/Open Catalog (SOCAT)</a> which supports the SGML notion of "Formal
91 * Public Identifiers (FPIs).
92 *
93 * @author David Brownell
94 * @author Janet Koenig
95 * @version 1.3 00/02/24
96 */
97public class Resolver implements EntityResolver {
98    private boolean ignoringMIME;
99
100    // table mapping public IDs to (local) URIs
101    private Hashtable id2uri;
102
103    // tables mapping public IDs to resources and classloaders
104    private Hashtable id2resource;
105    private Hashtable id2loader;
106
107    //
108    // table of MIME content types (less attributes!) known
109    // to be mostly "OK" to use with XML MIME entities.  the
110    // idea is to rule out obvious braindamage ("image/jpg")
111    // not the subtle stuff ("text/html") that might actually
112    // be (or become) safe.
113    //
114    private static final String types [] = {
115        "application/xml",
116        "text/xml",
117        "text/plain",
118        "text/html", // commonly mis-inferred
119        "application/x-netcdf", // this is often illegal XML
120        "content/unknown"
121    };
122
123    /**
124     * Constructs a resolver.
125     */
126    public Resolver() {
127    }
128
129    /**
130     * <p>Returns an input source, using the MIME type information and URL
131     * scheme to statically determine the correct character encoding if
132     * possible and otherwise autodetecting it.  MIME carefully specifies
133     * the character encoding defaults, and how attributes of the content
134     * type can change it.  XML further specifies two mandatory encodings
135     * (UTF-8 and UTF-16), and includes an XML declaration which can be
136     * used to internally label most documents encoded using US-ASCII
137     * supersets (such as Shift_JIS, EUC-JP, ISO-2022-*, ISO-8859-*, and
138     * more).</p>
139     *
140     * <p> This method can be used to access XML documents which do not
141     * have URIs (such as servlet input streams, or most JavaMail message
142     * entities) and to support access methods such as HTTP POST or PUT.
143     * (URLs normally return content using the GET method.)</p>
144     *
145     * <p> <em> The caller should set the system ID in order for relative URIs
146     * found in this document to be interpreted correctly.</em> In some cases,
147     * a custom resolver will need to be used; for example, documents
148     * may be grouped in a single MIME "multipart/related" bundle, and
149     * relative URLs would refer to other documents in that bundle.</p>
150     *
151     * @param contentType The MIME content type for the source for which
152     *                    an InputSource is desired, such as <em>text/xml;charset=utf-8</em>.
153     * @param stream      The input byte stream for the input source.
154     * @param checkType   If true, this verifies that the content type is known
155     *                    to support XML documents, such as <em>application/xml</em>.
156     * @param scheme      Unless this is "file", unspecified MIME types
157     *                    default to US-ASCII.  Files are always autodetected since most
158     *                    file systems discard character encoding information.
159     */
160    public static InputSource createInputSource(String contentType,
161                                                InputStream stream,
162                                                boolean checkType,
163                                                String scheme) throws IOException {
164        InputSource retval;
165        String charset = null;
166
167        if (contentType != null) {
168            int index;
169
170            contentType = contentType.toLowerCase(Locale.ENGLISH);
171            index = contentType.indexOf(';');
172            if (index != -1) {
173                String attributes;
174
175                attributes = contentType.substring(index + 1);
176                contentType = contentType.substring(0, index);
177
178                // use "charset=..." if it's available
179                index = attributes.indexOf("charset");
180                if (index != -1) {
181                    attributes = attributes.substring(index + 7);
182                    // strip out subsequent attributes
183                    if ((index = attributes.indexOf(';')) != -1)
184                        attributes = attributes.substring(0, index);
185                    // find start of value
186                    if ((index = attributes.indexOf('=')) != -1) {
187                        attributes = attributes.substring(index + 1);
188                        // strip out rfc822 comments
189                        if ((index = attributes.indexOf('(')) != -1)
190                            attributes = attributes.substring(0, index);
191                        // double quotes are optional
192                        if ((index = attributes.indexOf('"')) != -1) {
193                            attributes = attributes.substring(index + 1);
194                            attributes = attributes.substring(0,
195                                    attributes.indexOf('"'));
196                        }
197                        charset = attributes.trim();
198                        // XXX "\;", "\)" etc were mishandled above
199                    }
200                }
201            }
202
203            //
204            // Check MIME type.
205            //
206            if (checkType) {
207                boolean isOK = false;
208                for (int i = 0; i < types.length; i++)
209                    if (types[i].equals(contentType)) {
210                        isOK = true;
211                        break;
212                    }
213                if (!isOK)
214                    throw new IOException("Not XML: " + contentType);
215            }
216
217            //
218            // "text/*" MIME types have hard-wired character set
219            // defaults, as specified in the RFCs.  For XML, we
220            // ignore the system "file.encoding" property since
221            // autodetection is more correct.
222            //
223            if (charset == null) {
224                contentType = contentType.trim();
225                if (contentType.startsWith("text/")) {
226                    if (!"file".equalsIgnoreCase(scheme))
227                        charset = "US-ASCII";
228                }
229                // "application/*" has no default
230            }
231        }
232
233        retval = new InputSource(XmlReader.createReader(stream, charset));
234        retval.setByteStream(stream);
235        retval.setEncoding(charset);
236        return retval;
237    }
238
239
240    /**
241     * Creates an input source from a given URI.
242     *
243     * @param uri       the URI (system ID) for the entity
244     * @param checkType if true, the MIME content type for the entity
245     *                  is checked for document type and character set encoding.
246     */
247    static public InputSource createInputSource(URL uri, boolean checkType)
248            throws IOException {
249
250        URLConnection conn = uri.openConnection();
251        InputSource retval;
252
253        if (checkType) {
254            String contentType = conn.getContentType();
255            retval = createInputSource(contentType, conn.getInputStream(),
256                    false, uri.getProtocol());
257        } else {
258            retval = new InputSource(XmlReader.createReader(conn.getInputStream()));
259        }
260        retval.setSystemId(conn.getURL().toString());
261        return retval;
262    }
263
264
265    /**
266     * Creates an input source from a given file, autodetecting
267     * the character encoding.
268     */
269    static public InputSource createInputSource(File file)
270            throws IOException {
271        InputSource retval;
272        String path;
273
274        retval = new InputSource(XmlReader.createReader(new FileInputStream(file)));
275
276        // On JDK 1.2 and later, simplify this:
277        //    "path = file.toURL ().toString ()".
278        path = file.getAbsolutePath();
279        if (File.separatorChar != '/')
280            path = path.replace(File.separatorChar, '/');
281        if (!path.startsWith("/"))
282            path = "/" + path;
283        if (!path.endsWith("/") && file.isDirectory())
284            path = path + "/";
285
286        retval.setSystemId("file:" + path);
287        return retval;
288    }
289
290
291    /**
292     * <b>SAX:</b>
293     * Resolve the given entity into an input source.  If the name can't
294     * be mapped to a preferred form of the entity, the URI is used.  To
295     * resolve the entity, first a local catalog mapping names to URIs is
296     * consulted.  If no mapping is found there, a catalog mapping names
297     * to java resources is consulted.  Finally, if neither mapping found
298     * a copy of the entity, the specified URI is used.
299     * <p>
300     * <P> When a URI is used, <a href="#createInputSource">
301     * createInputSource</a> is used to correctly deduce the character
302     * encoding used by this entity.  No MIME type checking is done.
303     *
304     * @param name Used to find alternate copies of the entity, when
305     *             this value is non-null; this is the XML "public ID".
306     * @param uri  Used when no alternate copy of the entity is found;
307     *             this is the XML "system ID", normally a URI.
308     */
309    @Override
310    public InputSource resolveEntity(String name, String uri)
311            throws IOException {
312        InputSource retval;
313        String mappedURI = name2uri(name);
314        InputStream stream;
315
316        // prefer explicit URI mappings, then bundled resources...
317        if (mappedURI == null && (stream = mapResource(name)) != null && id2resource != null) {
318            uri = "java:resource:" + (String) id2resource.get(name);
319            retval = new InputSource(XmlReader.createReader(stream));
320
321            // ...and treat all URIs the same (as URLs for now).
322        } else {
323            URL url;
324            URLConnection conn;
325
326            if (mappedURI != null)
327                uri = mappedURI;
328            else if (uri == null)
329                return null;
330
331            url = new URL(uri);
332            conn = url.openConnection();
333            uri = conn.getURL().toString();
334            // System.out.println ("++ URI: " + url);
335            if (ignoringMIME)
336                retval = new InputSource(XmlReader.createReader(conn.getInputStream()));
337            else {
338                String contentType = conn.getContentType();
339                retval = createInputSource(contentType,
340                        conn.getInputStream(),
341                        false, url.getProtocol());
342            }
343        }
344        retval.setSystemId(uri);
345        retval.setPublicId(name);
346        return retval;
347    }
348
349
350    /**
351     * Returns true if this resolver is ignoring MIME types in the documents
352     * it returns, to work around bugs in how servers have reported the
353     * documents' MIME types.
354     */
355    public boolean isIgnoringMIME() {
356        return ignoringMIME;
357    }
358
359    /**
360     * Tells the resolver whether to ignore MIME types in the documents it
361     * retrieves.  Many web servers incorrectly assign text documents a
362     * default character encoding, even when that is incorrect.  For example,
363     * all HTTP text documents default to use ISO-8859-1 (used for Western
364     * European languages), and other MIME sources default text documents
365     * to use US-ASCII (a seven bit encoding).  For XML documents which
366     * include text encoding declarations (as most should do), these server
367     * bugs can be worked around by ignoring the MIME type entirely.
368     */
369    public void setIgnoringMIME(boolean value) {
370        ignoringMIME = value;
371    }
372
373
374    // maps the public ID to an alternate URI, if one is registered
375    private String name2uri(String publicId) {
376        if (publicId == null || id2uri == null)
377            return null;
378        return (String) id2uri.get(publicId);
379    }
380
381
382    /**
383     * Registers the given public ID as corresponding to a particular
384     * URI, typically a local copy.  This URI will be used in preference
385     * to ones provided as system IDs in XML entity declarations.  This
386     * mechanism would most typically be used for Document Type Definitions
387     * (DTDs), where the public IDs are formally managed and versioned.
388     *
389     * @param publicId The managed public ID being mapped
390     * @param uri      The URI of the preferred copy of that entity
391     */
392    public void registerCatalogEntry(String publicId,
393                                     String uri) {
394        if (id2uri == null)
395            id2uri = new Hashtable(17);
396        id2uri.put(publicId, uri);
397    }
398
399
400    // return the resource as a stream
401    private InputStream mapResource(String publicId) {
402        // System.out.println ("++ PUBLIC: " + publicId);
403        if (publicId == null || id2resource == null)
404            return null;
405
406        String resourceName = (String) id2resource.get(publicId);
407        ClassLoader loader = null;
408
409        if (resourceName == null)
410            return null;
411        // System.out.println ("++ Resource: " + resourceName);
412
413        if (id2loader != null)
414            loader = (ClassLoader) id2loader.get(publicId);
415        // System.out.println ("++ Loader: " + loader);
416        if (loader == null)
417            return ClassLoader.getSystemResourceAsStream(resourceName);
418        return loader.getResourceAsStream(resourceName);
419    }
420
421    /**
422     * Registers a given public ID as corresponding to a particular Java
423     * resource in a given class loader, typically distributed with a
424     * software package.  This resource will be preferred over system IDs
425     * included in XML documents.  This mechanism should most typically be
426     * used for Document Type Definitions (DTDs), where the public IDs are
427     * formally managed and versioned.
428     * <p>
429     * <P> If a mapping to a URI has been provided, that mapping takes
430     * precedence over this one.
431     *
432     * @param publicId     The managed public ID being mapped
433     * @param resourceName The name of the Java resource
434     * @param loader       The class loader holding the resource, or null if
435     *                     it is a system resource.
436     */
437    public void registerCatalogEntry(String publicId,
438                                     String resourceName,
439                                     ClassLoader loader) {
440        if (id2resource == null)
441            id2resource = new Hashtable(17);
442        id2resource.put(publicId, resourceName);
443
444        if (loader != null) {
445            if (id2loader == null)
446                id2loader = new Hashtable(17);
447            id2loader.put(publicId, loader);
448        }
449    }
450}
451