/*
 * reserved comment block
 * DO NOT REMOVE OR ALTER!
 */
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.sun.org.apache.xerces.internal.xinclude;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;

import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
import com.sun.org.apache.xerces.internal.util.EncodingMap;
import com.sun.org.apache.xerces.internal.util.HTTPInputSource;
import com.sun.org.apache.xerces.internal.util.MessageFormatter;
import com.sun.org.apache.xerces.internal.util.XMLChar;
import com.sun.org.apache.xerces.internal.xni.XMLString;
import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;

/**
 * This class is used for reading resources requested in {@code <include>} elements,
 * when the parse attribute of the {@code <include>} element is "text".  Using this
 * class will open the location, detect the encoding, and discard the byte order
 * mark, if applicable.
 *
 * REVISIT:
 * Much of the code in this class is taken from XMLEntityManager.  It would be nice
 * if this code could be shared in some way.  However, since XMLEntityManager is used
 * for reading files as XML, and this needs to read files as text, there would need
 * to be some refactoring done.
 *
 * @author Michael Glavassevich, IBM
 * @author Peter McCracken, IBM
 * @author Ankit Pasricha, IBM
 * @author Arun Yadav, Sun Microsystems Inc.
 *
 *
 * @see XIncludeHandler
 */
public class XIncludeTextReader {

    /** Prefix of the charset parameter inside a Content-Type value. */
    private static final String CHARSET_PREFIX = "charset=";

    private Reader fReader;
    private final XIncludeHandler fHandler;
    private XMLInputSource fSource;
    private XMLErrorReporter fErrorReporter;

    /**
     * Chunk buffer handed to the handler. Its array is one char longer than the
     * requested buffer size; parse() uses the spare slot to pull in the second
     * half of a surrogate pair that straddles two reads.
     */
    private final XMLString fTempString;

    /**
     * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
     *
     * @param source The XMLInputSource to use.
     * @param handler The XIncludeHandler to use.
     * @param bufferSize The size of this text reader's buffer.
     * @throws IOException never thrown by this constructor; the clause is kept
     *         so that existing callers continue to compile.
     */
    public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize)
        throws IOException {
        fHandler = handler;
        fSource = source;
        fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
    }

    /**
     * Sets the XMLErrorReporter used for reporting errors while
     * reading the text include.
     *
     * @param errorReporter the XMLErrorReporter to be used for
     * reporting errors.
     */
    public void setErrorReporter(XMLErrorReporter errorReporter) {
        fErrorReporter = errorReporter;
    }

    /**
     * Return the Reader for given XMLInputSource.
     *
     * The source's character stream is used as-is when present. Otherwise the
     * source's byte stream is used, or, failing that, a connection is opened
     * to the source's expanded system id, and the bytes are decoded with the
     * encoding chosen from the Content-Type header, the stream's first bytes,
     * the source's declared encoding, or UTF-8 as the last resort.
     *
     * @param source The XMLInputSource to use.
     * @return a Reader over the text of the resource.
     * @throws IOException if the resource cannot be opened or its encoding is
     *         not a recognized IANA encoding.
     */
    protected Reader getReader(XMLInputSource source) throws IOException {
        final Reader characterStream = source.getCharacterStream();
        if (characterStream != null) {
            return characterStream;
        }

        String encoding = source.getEncoding();
        if (encoding == null) {
            encoding = "UTF-8";
        }

        final InputStream suppliedStream = source.getByteStream();
        if (suppliedStream != null) {
            // Wrap the InputStream so that it is possible to rewind it.
            InputStream rewindable = suppliedStream;
            if (!(rewindable instanceof BufferedInputStream)) {
                rewindable = new BufferedInputStream(rewindable, fTempString.ch.length);
            }
            // The caller owns this stream, so it is not closed here on failure.
            return createReader(rewindable, encoding);
        }

        final URLConnection urlCon = openConnection(source);

        // Wrap the InputStream so that it is possible to rewind it.
        final InputStream stream = new BufferedInputStream(urlCon.getInputStream());
        boolean succeeded = false;
        try {
            final String detectedEncoding = detectEncoding(urlCon.getContentType(), stream);
            if (detectedEncoding != null) {
                encoding = detectedEncoding;
            }
            // else fall back to the encoding attribute or UTF-8.
            final Reader reader = createReader(stream, encoding);
            succeeded = true;
            return reader;
        }
        finally {
            // This stream was opened here and nobody else holds a reference to it,
            // so it must be released if no Reader could be built on top of it.
            if (!succeeded) {
                closeQuietly(stream);
            }
        }
    }

    /**
     * Opens a connection to the expanded system id of the source, applying the
     * HTTP request properties and redirect preference of an HTTPInputSource.
     */
    private URLConnection openConnection(XMLInputSource source) throws IOException {
        final String expandedSystemId =
            XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
        final URLConnection urlCon = new URL(expandedSystemId).openConnection();

        // If this is an HTTP connection attach any request properties to the request.
        if (urlCon instanceof HttpURLConnection && source instanceof HTTPInputSource) {
            final HttpURLConnection httpConnection = (HttpURLConnection) urlCon;
            final HTTPInputSource httpInputSource = (HTTPInputSource) source;

            // set request properties
            final Iterator<?> propIter = httpInputSource.getHTTPRequestProperties();
            while (propIter.hasNext()) {
                final Map.Entry<?, ?> entry = (Map.Entry<?, ?>) propIter.next();
                httpConnection.setRequestProperty((String) entry.getKey(), (String) entry.getValue());
            }

            // set preference for redirection
            if (!httpInputSource.getFollowHTTPRedirects()) {
                httpConnection.setInstanceFollowRedirects(false);
            }
        }
        return urlCon;
    }

    /**
     * Determines the encoding implied by a Content-Type value, consulting the
     * first bytes of the stream where the media type calls for it.
     *
     * The encoding of such a resource is determined by:
     * 1 external encoding information, if available, otherwise
     *   -- the most common type of external information is the "charset"
     *      parameter of a MIME package
     * 2 if the media type of the resource is text/xml, application/xml, or
     *   matches the conventions text/*+xml or application/*+xml as described
     *   in XML Media Types [IETF RFC 3023], the encoding is recognized as
     *   specified in XML 1.0, otherwise
     * 3 the value of the encoding attribute if one exists, otherwise
     * 4 UTF-8.
     *
     * @param rawContentType a string like "text/xml; charset=UTF-8" or "text/xml";
     *        may be null when the connection reports no content type.
     * @param stream the rewindable stream to sniff when required.
     * @return the detected encoding, or null to signal case 3 or 4.
     */
    private String detectEncoding(String rawContentType, InputStream stream) throws IOException {
        if (rawContentType == null) {
            // No content type was reported, so there is nothing to base a decision on.
            return null;
        }

        // text/xml and application/xml offer only one optional parameter
        final int index = rawContentType.indexOf(';');
        final String contentType;
        String charset = null;
        if (index != -1) {
            // this should be something like "text/xml"
            contentType = rawContentType.substring(0, index).trim();
            // this should be something like "charset=UTF-8"
            charset = extractCharset(rawContentType.substring(index + 1).trim());
        }
        else {
            contentType = rawContentType.trim();
        }

        if (contentType.equals("text/xml")) {
            // see RFC2376 or 3023, section 3.1
            return (charset != null) ? charset : "US-ASCII";
        }
        if (contentType.equals("application/xml")) {
            // see RFC2376 or 3023, section 3.2
            return (charset != null) ? charset : getEncodingName(stream);
        }
        if (contentType.endsWith("+xml")) {
            return getEncodingName(stream);
        }
        return null;
    }

    /**
     * Strips a "charset=UTF-8" style parameter down to just "UTF-8", removing
     * one pair of matching single or double quotes if present.
     *
     * @param parameter the trimmed text following the ';' of a Content-Type value.
     * @return the charset value, or null if the parameter is not a charset
     *         parameter. An empty or malformed value is returned as-is so that
     *         it is later rejected as an unrecognized encoding with an
     *         IOException instead of failing here with an unchecked exception.
     */
    private static String extractCharset(String parameter) {
        if (!parameter.startsWith(CHARSET_PREFIX)) {
            return null;
        }
        final String value = parameter.substring(CHARSET_PREFIX.length()).trim();
        final int length = value.length();
        // Two characters are the minimum for an opening and a closing quote.
        if (length >= 2) {
            final char first = value.charAt(0);
            final char last = value.charAt(length - 1);
            if ((first == '"' || first == '\'') && first == last) {
                return value.substring(1, length - 1);
            }
        }
        return value;
    }

    /**
     * Consumes any byte order mark and wraps the stream in a reader that
     * decodes the given encoding.
     *
     * @throws IOException if the encoding is not a recognized IANA encoding.
     *         The XIncludeHandler will report this as a ResourceError and then
     *         will attempt to include a fallback if there is one.
     */
    private Reader createReader(InputStream stream, String encoding) throws IOException {
        // eat the Byte Order Mark
        final String ianaEncoding = consumeBOM(stream, encoding.toUpperCase(Locale.ENGLISH));

        // If the document is UTF-8 or US-ASCII use
        // the Xerces readers for these encodings. For
        // US-ASCII consult the encoding map since
        // this encoding has many aliases.
        if (ianaEncoding.equals("UTF-8")) {
            return new UTF8Reader(stream,
                fTempString.ch.length,
                fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
                fErrorReporter.getLocale());
        }

        // Try to use a Java reader.
        final String javaEncoding = EncodingMap.getIANA2JavaMapping(ianaEncoding);
        if (javaEncoding == null) {
            final MessageFormatter aFormatter =
                fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
            final Locale aLocale = fErrorReporter.getLocale();
            throw new IOException(aFormatter.formatMessage(aLocale,
                "EncodingDeclInvalid",
                new Object[] {ianaEncoding}));
        }
        if (javaEncoding.equals("ASCII")) {
            return new ASCIIReader(stream,
                fTempString.ch.length,
                fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
                fErrorReporter.getLocale());
        }
        return new InputStreamReader(stream, javaEncoding);
    }

    /** Closes a stream while an earlier failure is already propagating. */
    private static void closeQuietly(InputStream stream) {
        try {
            stream.close();
        }
        catch (IOException ignored) {
            // The failure that triggered the cleanup is the one the caller must see.
        }
    }

    /**
     * Peeks at the first four bytes of the stream and auto-detects the
     * encoding from them, leaving the stream positioned where it started.
     *
     * XMLEntityManager cares about endian-ness, since it creates its own optimized
     * readers. Since we're just using generic Java readers for now, we're not caring
     * about endian-ness.  If this changes, even more code needs to be copied from
     * XMLEntity manager. -- PJM
     *
     * @param stream a stream supporting mark/reset.
     * @return the detected encoding name, or null if fewer than four bytes
     *         were read or no encoding could be detected.
     * @throws IOException if reading or resetting the stream fails.
     */
    protected String getEncodingName(InputStream stream) throws IOException {
        final byte[] b4 = new byte[4];

        stream.mark(4);
        final int count = stream.read(b4, 0, 4);
        stream.reset();

        return (count == 4) ? getEncodingName(b4) : null;
    }

    /**
     * Removes the byte order mark from the stream, if
     * it exists and returns the encoding name.
     *
     * @param stream a stream supporting mark/reset.
     * @param encoding the upper-cased encoding name the stream is expected to use.
     * @return "UTF-16BE" or "UTF-16LE" when the encoding starts with "UTF-16"
     *         and the corresponding byte order mark was found; otherwise the
     *         encoding that was passed in.
     * @throws IOException if reading or resetting the stream fails.
     */
    protected String consumeBOM(InputStream stream, String encoding)
        throws IOException {

        final byte[] b = new byte[3];
        stream.mark(3);
        if (encoding.equals("UTF-8")) {
            final int count = stream.read(b, 0, 3);
            final boolean hasBOM = count == 3
                && (b[0] & 0xFF) == 0xEF
                && (b[1] & 0xFF) == 0xBB
                && (b[2] & 0xFF) == 0xBF;
            if (!hasBOM) {
                // First three bytes are not BOM, so reset.
                stream.reset();
            }
        }
        else if (encoding.startsWith("UTF-16")) {
            final int count = stream.read(b, 0, 2);
            if (count == 2) {
                final int b0 = b[0] & 0xFF;
                final int b1 = b[1] & 0xFF;
                if (b0 == 0xFE && b1 == 0xFF) {
                    return "UTF-16BE";
                }
                if (b0 == 0xFF && b1 == 0xFE) {
                    return "UTF-16LE";
                }
            }
            // First two bytes are not BOM, so reset.
            stream.reset();
        }
        // We could do UTF-32, but since the getEncodingName() doesn't support that
        // we won't support it here.
        // To implement UTF-32, look for:  00 00 FE FF for big-endian
        //                             or  FF FE 00 00 for little-endian
        return encoding;
    }

    /**
     * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
     *          Is there any way we can share the code, without having it implemented twice?
     *          I think we should make it public and static in XMLEntityManager. --PJM
     *
     * Returns the IANA encoding name that is auto-detected from
     * the bytes specified, with the endian-ness of that encoding where appropriate.
     *
     * @param b4 The first four bytes of the input.
     * @return the encoding name, or null if no encoding could be detected
     */
    protected String getEncodingName(byte[] b4) {

        // UTF-16, with BOM
        final int b0 = b4[0] & 0xFF;
        final int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return "UTF-16BE";
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return "UTF-16LE";
        }

        // UTF-8 with a BOM
        final int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return "UTF-8";
        }

        // other encodings
        final int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octet order (3412)
            return "ISO-10646-UCS-4";
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...
            return "UTF-16BE";
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...
            return "UTF-16LE";
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return "CP037";
        }

        // this signals us to use the value from the encoding attribute
        return null;

    } // getEncodingName(byte[]):String

    /**
     * Read the input stream as text, and pass the text on to the XIncludeHandler
     * using calls to characters().  This will read all of the text it can from the
     * resource. Characters rejected by {@link #isValid(int)} are reported to the
     * error reporter as fatal "InvalidCharInContent" errors.
     *
     * @throws IOException if the resource cannot be opened or read.
     */
    public void parse() throws IOException {

        fReader = getReader(fSource);
        fSource = null;
        // One slot of the buffer is always left free for the surrogate boundary case.
        int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
        while (readSize != -1) {
            for (int i = 0; i < readSize; ++i) {
                final char ch = fTempString.ch[i];
                if (isValid(ch)) {
                    continue;
                }
                if (!XMLChar.isHighSurrogate(ch)) {
                    reportInvalidChar(ch);
                    continue;
                }

                final int ch2;
                // retrieve next character
                if (++i < readSize) {
                    ch2 = fTempString.ch[i];
                }
                // handle rare boundary case: the pair is split across two reads
                else {
                    ch2 = fReader.read();
                    if (ch2 != -1) {
                        fTempString.ch[readSize++] = (char) ch2;
                    }
                }

                // At end of input ch2 is -1, which is not a low surrogate and
                // is therefore reported below.
                if (XMLChar.isLowSurrogate(ch2)) {
                    // convert surrogates to a supplemental character
                    final int sup = XMLChar.supplemental(ch, (char) ch2);
                    if (!isValid(sup)) {
                        reportInvalidChar(sup);
                    }
                }
                else {
                    reportInvalidChar(ch2);
                }
            }
            if (fHandler != null && readSize > 0) {
                fTempString.offset = 0;
                fTempString.length = readSize;
                fHandler.characters(
                    fTempString,
                    fHandler.modifyAugmentations(null, true));
            }
            readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
        }

    }

    /** Reports the given code point, rendered in hexadecimal, as a fatal invalid-character error. */
    private void reportInvalidChar(int codePoint) {
        fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
            "InvalidCharInContent",
            new Object[] { Integer.toString(codePoint, 16) },
            XMLErrorReporter.SEVERITY_FATAL_ERROR);
    }

    /**
     * Sets the input source on this text reader.
     *
     * @param source The XMLInputSource to use.
     */
    public void setInputSource(XMLInputSource source) {
        fSource = source;
    }

    /**
     * Closes the stream.  Call this after parse(), or when there is no longer any need
     * for this object.
     *
     * @throws IOException if closing the underlying reader fails.
     */
    public void close() throws IOException {
        if (fReader != null) {
            fReader.close();
            fReader = null;
        }
    }

    /**
     * Returns true if the specified character is a valid XML character
     * as per the rules of XML 1.0.
     *
     * @param ch The character to check.
     * @return true if the character is valid.
     */
    protected boolean isValid(int ch) {
        return XMLChar.isValid(ch);
    }

    /**
     * Sets the buffer size property for the reader which decides the chunk sizes that are parsed
     * by the reader at a time and passed to the handler
     *
     * @param bufferSize The size of the buffer desired
     */
    protected void setBufferSize(int bufferSize) {
        // One extra slot is reserved for the surrogate boundary case in parse().
        final int capacity = bufferSize + 1;
        if (fTempString.ch.length != capacity) {
            fTempString.ch = new char[capacity];
        }
    }

}