1/*
2 * reserved comment block
3 * DO NOT REMOVE OR ALTER!
4 */
5/*
6 * Licensed to the Apache Software Foundation (ASF) under one or more
7 * contributor license agreements.  See the NOTICE file distributed with
8 * this work for additional information regarding copyright ownership.
9 * The ASF licenses this file to You under the Apache License, Version 2.0
10 * (the "License"); you may not use this file except in compliance with
11 * the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 * Unless required by applicable law or agreed to in writing, software
16 * distributed under the License is distributed on an "AS IS" BASIS,
17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 * See the License for the specific language governing permissions and
19 * limitations under the License.
20 */
21
22package com.sun.org.apache.xerces.internal.xinclude;
23
24import java.io.BufferedInputStream;
25import java.io.IOException;
26import java.io.InputStream;
27import java.io.InputStreamReader;
28import java.io.Reader;
29import java.net.HttpURLConnection;
30import java.net.URL;
31import java.net.URLConnection;
32import java.util.Iterator;
33import java.util.Locale;
34import java.util.Map;
35
36import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
37import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
38import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
39import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
40import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
41import com.sun.org.apache.xerces.internal.util.EncodingMap;
42import com.sun.org.apache.xerces.internal.util.HTTPInputSource;
43import com.sun.org.apache.xerces.internal.util.MessageFormatter;
44import com.sun.org.apache.xerces.internal.util.XMLChar;
45import com.sun.org.apache.xerces.internal.xni.XMLString;
46import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;
47
48/**
49 * This class is used for reading resources requested in <include> elements,
50 * when the parse attribute of the <include> element is "text".  Using this
51 * class will open the location, detect the encoding, and discard the byte order
52 * mark, if applicable.
53 *
54 * REVISIT:
55 * Much of the code in this class is taken from XMLEntityManager.  It would be nice
56 * if this code could be shared in some way.  However, since XMLEntityManager is used
57 * for reading files as XML, and this needs to read files as text, there would need
58 * to be some refactoring done.
59 *
60 * @author Michael Glavassevich, IBM
61 * @author Peter McCracken, IBM
62 * @author Ankit Pasricha, IBM
63 * @author Arun Yadav, Sun Microsystems Inc.
64 *
65 *
66 * @see XIncludeHandler
67 */
68public class XIncludeTextReader {
69
70    private Reader fReader;
71    private XIncludeHandler fHandler;
72    private XMLInputSource fSource;
73    private XMLErrorReporter fErrorReporter;
74    private XMLString fTempString = new XMLString();
75
76    /**
77     * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
78     *
79     * @param source The XMLInputSource to use.
80     * @param handler The XIncludeHandler to use.
81     * @param bufferSize The size of this text reader's buffer.
82     */
83    public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize)
84        throws IOException {
85        fHandler = handler;
86        fSource = source;
87        fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
88    }
89
90    /**
91     * Sets the XMLErrorReporter used for reporting errors while
92     * reading the text include.
93     *
94     * @param errorReporter the XMLErrorReporter to be used for
95     * reporting errors.
96     */
97    public void setErrorReporter(XMLErrorReporter errorReporter) {
98        fErrorReporter = errorReporter;
99    }
100
101    /**
102     * Return the Reader for given XMLInputSource.
103     *
104     * @param source The XMLInputSource to use.
105     */
106    protected Reader getReader(XMLInputSource source) throws IOException {
107        if (source.getCharacterStream() != null) {
108            return source.getCharacterStream();
109        }
110        else {
111            InputStream stream = null;
112
113            String encoding = source.getEncoding();
114            if (encoding == null) {
115                encoding = "UTF-8";
116            }
117            if (source.getByteStream() != null) {
118                stream = source.getByteStream();
119                // Wrap the InputStream so that it is possible to rewind it.
120                if (!(stream instanceof BufferedInputStream)) {
121                    stream = new BufferedInputStream(stream, fTempString.ch.length);
122                }
123            }
124            else {
125                String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
126
127                URL url = new URL(expandedSystemId);
128                URLConnection urlCon = url.openConnection();
129
130                // If this is an HTTP connection attach any request properties to the request.
131                if (urlCon instanceof HttpURLConnection && source instanceof HTTPInputSource) {
132                    final HttpURLConnection urlConnection = (HttpURLConnection) urlCon;
133                    final HTTPInputSource httpInputSource = (HTTPInputSource) source;
134
135                    // set request properties
136                    Iterator propIter = httpInputSource.getHTTPRequestProperties();
137                    while (propIter.hasNext()) {
138                        Map.Entry entry = (Map.Entry) propIter.next();
139                        urlConnection.setRequestProperty((String) entry.getKey(), (String) entry.getValue());
140                    }
141
142                    // set preference for redirection
143                    boolean followRedirects = httpInputSource.getFollowHTTPRedirects();
144                    if (!followRedirects) {
145                        urlConnection.setInstanceFollowRedirects(followRedirects);
146                    }
147                }
148
149                // Wrap the InputStream so that it is possible to rewind it.
150                stream = new BufferedInputStream(urlCon.getInputStream());
151
152                // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
153                String rawContentType = urlCon.getContentType();
154
155                // text/xml and application/xml offer only one optional parameter
156                int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
157
158                String contentType = null;
159                String charset = null;
160                if (index != -1) {
161                    // this should be something like "text/xml"
162                    contentType = rawContentType.substring(0, index).trim();
163
164                    // this should be something like "charset=UTF-8", but we want to
165                    // strip it down to just "UTF-8"
166                    charset = rawContentType.substring(index + 1).trim();
167                    if (charset.startsWith("charset=")) {
168                        // 8 is the length of "charset="
169                        charset = charset.substring(8).trim();
170                        // strip quotes, if present
171                        if ((charset.charAt(0) == '"'
172                            && charset.charAt(charset.length() - 1) == '"')
173                            || (charset.charAt(0) == '\''
174                                && charset.charAt(charset.length() - 1)
175                                    == '\'')) {
176                            charset =
177                                charset.substring(1, charset.length() - 1);
178                        }
179                    }
180                    else {
181                        charset = null;
182                    }
183                }
184                else {
185                    contentType = rawContentType.trim();
186                }
187
188                String detectedEncoding = null;
189                /**  The encoding of such a resource is determined by:
190                    1 external encoding information, if available, otherwise
191                         -- the most common type of external information is the "charset" parameter of a MIME package
192                    2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
193                    3 the value of the encoding attribute if one exists, otherwise
194                    4 UTF-8.
195                 **/
196                if (contentType.equals("text/xml")) {
197                    if (charset != null) {
198                        detectedEncoding = charset;
199                    }
200                    else {
201                        // see RFC2376 or 3023, section 3.1
202                        detectedEncoding = "US-ASCII";
203                    }
204                }
205                else if (contentType.equals("application/xml")) {
206                    if (charset != null) {
207                        detectedEncoding = charset;
208                    }
209                    else {
210                        // see RFC2376 or 3023, section 3.2
211                        detectedEncoding = getEncodingName(stream);
212                    }
213                }
214                else if (contentType.endsWith("+xml")) {
215                    detectedEncoding = getEncodingName(stream);
216                }
217
218                if (detectedEncoding != null) {
219                    encoding = detectedEncoding;
220                }
221                // else 3 or 4.
222            }
223
224            encoding = encoding.toUpperCase(Locale.ENGLISH);
225
226            // eat the Byte Order Mark
227            encoding = consumeBOM(stream, encoding);
228
229            // If the document is UTF-8 or US-ASCII use
230            // the Xerces readers for these encodings. For
231            // US-ASCII consult the encoding map since
232            // this encoding has many aliases.
233            if (encoding.equals("UTF-8")) {
234                return new UTF8Reader(stream,
235                    fTempString.ch.length,
236                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
237                    fErrorReporter.getLocale() );
238            }
239
240            // Try to use a Java reader.
241            String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
242
243            // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
244            // The XIncludeHandler will report this as a ResourceError and then will
245            // attempt to include a fallback if there is one.
246            if (javaEncoding == null) {
247                MessageFormatter aFormatter =
248                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
249                Locale aLocale = fErrorReporter.getLocale();
250                throw new IOException( aFormatter.formatMessage( aLocale,
251                    "EncodingDeclInvalid",
252                    new Object[] {encoding} ) );
253            }
254            else if (javaEncoding.equals("ASCII")) {
255                return new ASCIIReader(stream,
256                    fTempString.ch.length,
257                    fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
258                    fErrorReporter.getLocale() );
259            }
260
261            return new InputStreamReader(stream, javaEncoding);
262        }
263    }
264
265    /**
266     * XMLEntityManager cares about endian-ness, since it creates its own optimized
267     * readers. Since we're just using generic Java readers for now, we're not caring
268     * about endian-ness.  If this changes, even more code needs to be copied from
269     * XMLEntity manager. -- PJM
270     */
271    protected String getEncodingName(InputStream stream) throws IOException {
272        final byte[] b4 = new byte[4];
273        String encoding = null;
274
275        // this has the potential to throw an exception
276        // it will be fixed when we ensure the stream is rewindable (see note above)
277        stream.mark(4);
278        int count = stream.read(b4, 0, 4);
279        stream.reset();
280        if (count == 4) {
281            encoding = getEncodingName(b4);
282        }
283
284        return encoding;
285    }
286
287    /**
288     * Removes the byte order mark from the stream, if
289     * it exists and returns the encoding name.
290     *
291     * @param stream
292     * @param encoding
293     * @throws IOException
294     */
295    protected String consumeBOM(InputStream stream, String encoding)
296        throws IOException {
297
298        byte[] b = new byte[3];
299        int count = 0;
300        stream.mark(3);
301        if (encoding.equals("UTF-8")) {
302            count = stream.read(b, 0, 3);
303            if (count == 3) {
304                final int b0 = b[0] & 0xFF;
305                final int b1 = b[1] & 0xFF;
306                final int b2 = b[2] & 0xFF;
307                if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
308                    // First three bytes are not BOM, so reset.
309                    stream.reset();
310                }
311            }
312            else {
313                stream.reset();
314            }
315        }
316        else if (encoding.startsWith("UTF-16")) {
317            count = stream.read(b, 0, 2);
318            if (count == 2) {
319                final int b0 = b[0] & 0xFF;
320                final int b1 = b[1] & 0xFF;
321                if (b0 == 0xFE && b1 == 0xFF) {
322                    return "UTF-16BE";
323                }
324                else if (b0 == 0xFF && b1 == 0xFE) {
325                    return "UTF-16LE";
326                }
327            }
328            // First two bytes are not BOM, so reset.
329            stream.reset();
330        }
331        // We could do UTF-32, but since the getEncodingName() doesn't support that
332        // we won't support it here.
333        // To implement UTF-32, look for:  00 00 FE FF for big-endian
334        //                             or  FF FE 00 00 for little-endian
335        return encoding;
336    }
337
338    /**
339     * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
340     *          Is there any way we can share the code, without having it implemented twice?
341     *          I think we should make it public and static in XMLEntityManager. --PJM
342     *
343     * Returns the IANA encoding name that is auto-detected from
344     * the bytes specified, with the endian-ness of that encoding where appropriate.
345     *
346     * @param b4    The first four bytes of the input.
347     * @return the encoding name, or null if no encoding could be detected
348     */
349    protected String getEncodingName(byte[] b4) {
350
351        // UTF-16, with BOM
352        int b0 = b4[0] & 0xFF;
353        int b1 = b4[1] & 0xFF;
354        if (b0 == 0xFE && b1 == 0xFF) {
355            // UTF-16, big-endian
356            return "UTF-16BE";
357        }
358        if (b0 == 0xFF && b1 == 0xFE) {
359            // UTF-16, little-endian
360            return "UTF-16LE";
361        }
362
363        // UTF-8 with a BOM
364        int b2 = b4[2] & 0xFF;
365        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
366            return "UTF-8";
367        }
368
369        // other encodings
370        int b3 = b4[3] & 0xFF;
371        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
372            // UCS-4, big endian (1234)
373            return "ISO-10646-UCS-4";
374        }
375        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
376            // UCS-4, little endian (4321)
377            return "ISO-10646-UCS-4";
378        }
379        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
380            // UCS-4, unusual octet order (2143)
381            return "ISO-10646-UCS-4";
382        }
383        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
384            // UCS-4, unusual octect order (3412)
385            return "ISO-10646-UCS-4";
386        }
387        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
388            // UTF-16, big-endian, no BOM
389            // (or could turn out to be UCS-2...
390            return "UTF-16BE";
391        }
392        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
393            // UTF-16, little-endian, no BOM
394            // (or could turn out to be UCS-2...
395            return "UTF-16LE";
396        }
397        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
398            // EBCDIC
399            // a la xerces1, return CP037 instead of EBCDIC here
400            return "CP037";
401        }
402
403        // this signals us to use the value from the encoding attribute
404        return null;
405
406    } // getEncodingName(byte[]):Object[]
407
408    /**
409     * Read the input stream as text, and pass the text on to the XIncludeHandler
410     * using calls to characters().  This will read all of the text it can from the
411     * resource.
412     *
413     * @throws IOException
414     */
415    public void parse() throws IOException {
416
417        fReader = getReader(fSource);
418        fSource = null;
419        int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
420        while (readSize != -1) {
421            for (int i = 0; i < readSize; ++i) {
422                char ch = fTempString.ch[i];
423                if (!isValid(ch)) {
424                    if (XMLChar.isHighSurrogate(ch)) {
425                        int ch2;
426                        // retrieve next character
427                        if (++i < readSize) {
428                            ch2 = fTempString.ch[i];
429                        }
430                        // handle rare boundary case
431                        else {
432                            ch2 = fReader.read();
433                            if (ch2 != -1) {
434                                fTempString.ch[readSize++] = (char) ch2;
435                            }
436                        }
437                        if (XMLChar.isLowSurrogate(ch2)) {
438                            // convert surrogates to a supplemental character
439                            int sup = XMLChar.supplemental(ch, (char)ch2);
440                            if (!isValid(sup)) {
441                                fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
442                                                           "InvalidCharInContent",
443                                                           new Object[] { Integer.toString(sup, 16) },
444                                                           XMLErrorReporter.SEVERITY_FATAL_ERROR);
445                            }
446                        }
447                        else {
448                            fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
449                                                       "InvalidCharInContent",
450                                                       new Object[] { Integer.toString(ch2, 16) },
451                                                       XMLErrorReporter.SEVERITY_FATAL_ERROR);
452                        }
453                    }
454                    else {
455                        fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
456                                                   "InvalidCharInContent",
457                                                   new Object[] { Integer.toString(ch, 16) },
458                                                   XMLErrorReporter.SEVERITY_FATAL_ERROR);
459                    }
460                }
461            }
462            if (fHandler != null && readSize > 0) {
463                fTempString.offset = 0;
464                fTempString.length = readSize;
465                fHandler.characters(
466                    fTempString,
467                    fHandler.modifyAugmentations(null, true));
468            }
469            readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
470        }
471
472    }
473
474    /**
475     * Sets the input source on this text reader.
476     *
477     * @param source The XMLInputSource to use.
478     */
479    public void setInputSource(XMLInputSource source) {
480        fSource = source;
481    }
482
483    /**
484     * Closes the stream.  Call this after parse(), or when there is no longer any need
485     * for this object.
486     *
487     * @throws IOException
488     */
489    public void close() throws IOException {
490        if (fReader != null) {
491            fReader.close();
492            fReader = null;
493        }
494    }
495
496    /**
497     * Returns true if the specified character is a valid XML character
498     * as per the rules of XML 1.0.
499     *
500     * @param ch The character to check.
501     */
502    protected boolean isValid(int ch) {
503        return XMLChar.isValid(ch);
504    }
505
506    /**
507     * Sets the buffer size property for the reader which decides the chunk sizes that are parsed
508     * by the reader at a time and passed to the handler
509     *
510     * @param bufferSize The size of the buffer desired
511     */
512    protected void setBufferSize(int bufferSize) {
513        if (fTempString.ch.length != ++bufferSize) {
514            fTempString.ch = new char[bufferSize];
515        }
516    }
517
518}
519