XIncludeTextReader.java revision 761:e4bc32cbffad
1/*
2 * reserved comment block
3 * DO NOT REMOVE OR ALTER!
4 */
5/*
6 * Copyright 2003-2005 The Apache Software Foundation.
7 *
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 *      http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20package com.sun.org.apache.xerces.internal.xinclude;
21
22import java.io.BufferedInputStream;
23import java.io.IOException;
24import java.io.InputStream;
25import java.io.InputStreamReader;
26import java.io.Reader;
27import java.net.HttpURLConnection;
28import java.net.URL;
29import java.net.URLConnection;
30import java.util.Iterator;
31import java.util.Locale;
32import java.util.Map;
33
34import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
35import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
36import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
37import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
38import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
39import com.sun.org.apache.xerces.internal.util.EncodingMap;
40import com.sun.org.apache.xerces.internal.util.HTTPInputSource;
41import com.sun.org.apache.xerces.internal.util.MessageFormatter;
42import com.sun.org.apache.xerces.internal.util.XMLChar;
43import com.sun.org.apache.xerces.internal.xni.XMLString;
44import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;
45
/**
 * This class is used for reading resources requested in &lt;include&gt; elements,
 * when the parse attribute of the &lt;include&gt; element is "text".  Using this
 * class will open the location, detect the encoding, and discard the byte order
 * mark, if applicable.
 *
 * REVISIT:
 * Much of the code in this class is taken from XMLEntityManager.  It would be nice
 * if this code could be shared in some way.  However, since XMLEntityManager is used
 * for reading files as XML, and this needs to read files as text, there would need
 * to be some refactoring done.
 *
 * @author Michael Glavassevich, IBM
 * @author Peter McCracken, IBM
 * @author Ankit Pasricha, IBM
 * @author Arun Yadav, Sun Microsystems Inc.
 *
 *
 * @see XIncludeHandler
 */
66public class XIncludeTextReader {
67
68    private Reader fReader;
69    private XIncludeHandler fHandler;
70    private XMLInputSource fSource;
71    private XMLErrorReporter fErrorReporter;
72    private XMLString fTempString = new XMLString();
73
74    /**
75     * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
76     *
77     * @param source The XMLInputSource to use.
78     * @param handler The XIncludeHandler to use.
79     * @param bufferSize The size of this text reader's buffer.
80     */
81    public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize)
82        throws IOException {
83        fHandler = handler;
84        fSource = source;
85        fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
86    }
87
88    /**
89     * Sets the XMLErrorReporter used for reporting errors while
90     * reading the text include.
91     *
92     * @param errorReporter the XMLErrorReporter to be used for
93     * reporting errors.
94     */
95    public void setErrorReporter(XMLErrorReporter errorReporter) {
96        fErrorReporter = errorReporter;
97    }
98
99    /**
100     * Return the Reader for given XMLInputSource.
101     *
102     * @param source The XMLInputSource to use.
103     */
104    protected Reader getReader(XMLInputSource source) throws IOException {
105        if (source.getCharacterStream() != null) {
106            return source.getCharacterStream();
107        }
108        else {
109            InputStream stream = null;
110
111            String encoding = source.getEncoding();
112            if (encoding == null) {
113                encoding = "UTF-8";
114            }
115            if (source.getByteStream() != null) {
116                stream = source.getByteStream();
117                // Wrap the InputStream so that it is possible to rewind it.
118                if (!(stream instanceof BufferedInputStream)) {
119                    stream = new BufferedInputStream(stream, fTempString.ch.length);
120                }
121            }
122            else {
123                String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
124
125                URL url = new URL(expandedSystemId);
126                URLConnection urlCon = url.openConnection();
127
128                // If this is an HTTP connection attach any request properties to the request.
129                if (urlCon instanceof HttpURLConnection && source instanceof HTTPInputSource) {
130                    final HttpURLConnection urlConnection = (HttpURLConnection) urlCon;
131                    final HTTPInputSource httpInputSource = (HTTPInputSource) source;
132
133                    // set request properties
134                    Iterator propIter = httpInputSource.getHTTPRequestProperties();
135                    while (propIter.hasNext()) {
136                        Map.Entry entry = (Map.Entry) propIter.next();
137                        urlConnection.setRequestProperty((String) entry.getKey(), (String) entry.getValue());
138                    }
139
140                    // set preference for redirection
141                    boolean followRedirects = httpInputSource.getFollowHTTPRedirects();
142                    if (!followRedirects) {
143                        urlConnection.setInstanceFollowRedirects(followRedirects);
144                    }
145                }
146
147                // Wrap the InputStream so that it is possible to rewind it.
148                stream = new BufferedInputStream(urlCon.getInputStream());
149
150                // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
151                String rawContentType = urlCon.getContentType();
152
153                // text/xml and application/xml offer only one optional parameter
154                int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
155
156                String contentType = null;
157                String charset = null;
158                if (index != -1) {
159                    // this should be something like "text/xml"
160                    contentType = rawContentType.substring(0, index).trim();
161
162                    // this should be something like "charset=UTF-8", but we want to
163                    // strip it down to just "UTF-8"
164                    charset = rawContentType.substring(index + 1).trim();
165                    if (charset.startsWith("charset=")) {
166                        // 8 is the length of "charset="
167                        charset = charset.substring(8).trim();
168                        // strip quotes, if present
169                        if ((charset.charAt(0) == '"'
170                            && charset.charAt(charset.length() - 1) == '"')
171                            || (charset.charAt(0) == '\''
172                                && charset.charAt(charset.length() - 1)
173                                    == '\'')) {
174                            charset =
175                                charset.substring(1, charset.length() - 1);
176                        }
177                    }
178                    else {
179                        charset = null;
180                    }
181                }
182                else {
183                    contentType = rawContentType.trim();
184                }
185
186                String detectedEncoding = null;
187                /**  The encoding of such a resource is determined by:
188                    1 external encoding information, if available, otherwise
189                         -- the most common type of external information is the "charset" parameter of a MIME package
190                    2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
191                    3 the value of the encoding attribute if one exists, otherwise
192                    4 UTF-8.
193                 **/
194                if (contentType.equals("text/xml")) {
195                    if (charset != null) {
196                        detectedEncoding = charset;
197                    }
198                    else {
199                        // see RFC2376 or 3023, section 3.1
200                        detectedEncoding = "US-ASCII";
201                    }
202                }
203                else if (contentType.equals("application/xml")) {
204                    if (charset != null) {
205                        detectedEncoding = charset;
206                    }
207                    else {
208                        // see RFC2376 or 3023, section 3.2
209                        detectedEncoding = getEncodingName(stream);
210                    }
211                }
212                else if (contentType.endsWith("+xml")) {
213                    detectedEncoding = getEncodingName(stream);
214                }
215
216                if (detectedEncoding != null) {
217                    encoding = detectedEncoding;
218                }
219                // else 3 or 4.
220            }
221
222            encoding = encoding.toUpperCase(Locale.ENGLISH);
223
224            // eat the Byte Order Mark
225            encoding = consumeBOM(stream, encoding);
226
227            // If the document is UTF-8 or US-ASCII use
228            // the Xerces readers for these encodings. For
229            // US-ASCII consult the encoding map since
230            // this encoding has many aliases.
231            if (encoding.equals("UTF-8")) {
232                return new UTF8Reader(stream,
233                    fTempString.ch.length,
234                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
235                    fErrorReporter.getLocale() );
236            }
237
238            // Try to use a Java reader.
239            String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
240
241            // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
242            // The XIncludeHandler will report this as a ResourceError and then will
243            // attempt to include a fallback if there is one.
244            if (javaEncoding == null) {
245                MessageFormatter aFormatter =
246                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
247                Locale aLocale = fErrorReporter.getLocale();
248                throw new IOException( aFormatter.formatMessage( aLocale,
249                    "EncodingDeclInvalid",
250                    new Object[] {encoding} ) );
251            }
252            else if (javaEncoding.equals("ASCII")) {
253                return new ASCIIReader(stream,
254                    fTempString.ch.length,
255                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
256                    fErrorReporter.getLocale() );
257            }
258
259            return new InputStreamReader(stream, javaEncoding);
260        }
261    }
262
263    /**
264     * XMLEntityManager cares about endian-ness, since it creates its own optimized
265     * readers. Since we're just using generic Java readers for now, we're not caring
266     * about endian-ness.  If this changes, even more code needs to be copied from
267     * XMLEntity manager. -- PJM
268     */
269    protected String getEncodingName(InputStream stream) throws IOException {
270        final byte[] b4 = new byte[4];
271        String encoding = null;
272
273        // this has the potential to throw an exception
274        // it will be fixed when we ensure the stream is rewindable (see note above)
275        stream.mark(4);
276        int count = stream.read(b4, 0, 4);
277        stream.reset();
278        if (count == 4) {
279            encoding = getEncodingName(b4);
280        }
281
282        return encoding;
283    }
284
285    /**
286     * Removes the byte order mark from the stream, if
287     * it exists and returns the encoding name.
288     *
289     * @param stream
290     * @param encoding
291     * @throws IOException
292     */
293    protected String consumeBOM(InputStream stream, String encoding)
294        throws IOException {
295
296        byte[] b = new byte[3];
297        int count = 0;
298        stream.mark(3);
299        if (encoding.equals("UTF-8")) {
300            count = stream.read(b, 0, 3);
301            if (count == 3) {
302                final int b0 = b[0] & 0xFF;
303                final int b1 = b[1] & 0xFF;
304                final int b2 = b[2] & 0xFF;
305                if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
306                    // First three bytes are not BOM, so reset.
307                    stream.reset();
308                }
309            }
310            else {
311                stream.reset();
312            }
313        }
314        else if (encoding.startsWith("UTF-16")) {
315            count = stream.read(b, 0, 2);
316            if (count == 2) {
317                final int b0 = b[0] & 0xFF;
318                final int b1 = b[1] & 0xFF;
319                if (b0 == 0xFE && b1 == 0xFF) {
320                    return "UTF-16BE";
321                }
322                else if (b0 == 0xFF && b1 == 0xFE) {
323                    return "UTF-16LE";
324                }
325            }
326            // First two bytes are not BOM, so reset.
327            stream.reset();
328        }
329        // We could do UTF-32, but since the getEncodingName() doesn't support that
330        // we won't support it here.
331        // To implement UTF-32, look for:  00 00 FE FF for big-endian
332        //                             or  FF FE 00 00 for little-endian
333        return encoding;
334    }
335
336    /**
337     * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
338     *          Is there any way we can share the code, without having it implemented twice?
339     *          I think we should make it public and static in XMLEntityManager. --PJM
340     *
341     * Returns the IANA encoding name that is auto-detected from
342     * the bytes specified, with the endian-ness of that encoding where appropriate.
343     *
344     * @param b4    The first four bytes of the input.
345     * @return the encoding name, or null if no encoding could be detected
346     */
347    protected String getEncodingName(byte[] b4) {
348
349        // UTF-16, with BOM
350        int b0 = b4[0] & 0xFF;
351        int b1 = b4[1] & 0xFF;
352        if (b0 == 0xFE && b1 == 0xFF) {
353            // UTF-16, big-endian
354            return "UTF-16BE";
355        }
356        if (b0 == 0xFF && b1 == 0xFE) {
357            // UTF-16, little-endian
358            return "UTF-16LE";
359        }
360
361        // UTF-8 with a BOM
362        int b2 = b4[2] & 0xFF;
363        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
364            return "UTF-8";
365        }
366
367        // other encodings
368        int b3 = b4[3] & 0xFF;
369        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
370            // UCS-4, big endian (1234)
371            return "ISO-10646-UCS-4";
372        }
373        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
374            // UCS-4, little endian (4321)
375            return "ISO-10646-UCS-4";
376        }
377        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
378            // UCS-4, unusual octet order (2143)
379            return "ISO-10646-UCS-4";
380        }
381        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
382            // UCS-4, unusual octect order (3412)
383            return "ISO-10646-UCS-4";
384        }
385        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
386            // UTF-16, big-endian, no BOM
387            // (or could turn out to be UCS-2...
388            return "UTF-16BE";
389        }
390        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
391            // UTF-16, little-endian, no BOM
392            // (or could turn out to be UCS-2...
393            return "UTF-16LE";
394        }
395        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
396            // EBCDIC
397            // a la xerces1, return CP037 instead of EBCDIC here
398            return "CP037";
399        }
400
401        // this signals us to use the value from the encoding attribute
402        return null;
403
404    } // getEncodingName(byte[]):Object[]
405
406    /**
407     * Read the input stream as text, and pass the text on to the XIncludeHandler
408     * using calls to characters().  This will read all of the text it can from the
409     * resource.
410     *
411     * @throws IOException
412     */
413    public void parse() throws IOException {
414
415        fReader = getReader(fSource);
416        fSource = null;
417        int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
418        while (readSize != -1) {
419            for (int i = 0; i < readSize; ++i) {
420                char ch = fTempString.ch[i];
421                if (!isValid(ch)) {
422                    if (XMLChar.isHighSurrogate(ch)) {
423                        int ch2;
424                        // retrieve next character
425                        if (++i < readSize) {
426                            ch2 = fTempString.ch[i];
427                        }
428                        // handle rare boundary case
429                        else {
430                            ch2 = fReader.read();
431                            if (ch2 != -1) {
432                                fTempString.ch[readSize++] = (char) ch2;
433                            }
434                        }
435                        if (XMLChar.isLowSurrogate(ch2)) {
436                            // convert surrogates to a supplemental character
437                            int sup = XMLChar.supplemental(ch, (char)ch2);
438                            if (!isValid(sup)) {
439                                fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
440                                                           "InvalidCharInContent",
441                                                           new Object[] { Integer.toString(sup, 16) },
442                                                           XMLErrorReporter.SEVERITY_FATAL_ERROR);
443                            }
444                        }
445                        else {
446                            fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
447                                                       "InvalidCharInContent",
448                                                       new Object[] { Integer.toString(ch2, 16) },
449                                                       XMLErrorReporter.SEVERITY_FATAL_ERROR);
450                        }
451                    }
452                    else {
453                        fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
454                                                   "InvalidCharInContent",
455                                                   new Object[] { Integer.toString(ch, 16) },
456                                                   XMLErrorReporter.SEVERITY_FATAL_ERROR);
457                    }
458                }
459            }
460            if (fHandler != null && readSize > 0) {
461                fTempString.offset = 0;
462                fTempString.length = readSize;
463                fHandler.characters(
464                    fTempString,
465                    fHandler.modifyAugmentations(null, true));
466            }
467            readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
468        }
469
470    }
471
472    /**
473     * Sets the input source on this text reader.
474     *
475     * @param source The XMLInputSource to use.
476     */
477    public void setInputSource(XMLInputSource source) {
478        fSource = source;
479    }
480
481    /**
482     * Closes the stream.  Call this after parse(), or when there is no longer any need
483     * for this object.
484     *
485     * @throws IOException
486     */
487    public void close() throws IOException {
488        if (fReader != null) {
489            fReader.close();
490            fReader = null;
491        }
492    }
493
494    /**
495     * Returns true if the specified character is a valid XML character
496     * as per the rules of XML 1.0.
497     *
498     * @param ch The character to check.
499     */
500    protected boolean isValid(int ch) {
501        return XMLChar.isValid(ch);
502    }
503
504    /**
505     * Sets the buffer size property for the reader which decides the chunk sizes that are parsed
506     * by the reader at a time and passed to the handler
507     *
508     * @param bufferSize The size of the buffer desired
509     */
510    protected void setBufferSize(int bufferSize) {
511        if (fTempString.ch.length != ++bufferSize) {
512            fTempString.ch = new char[bufferSize];
513        }
514    }
515
516}
517