URI.java revision 12745:f068a4ffddd2
1/* 2 * Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26package java.net; 27 28import java.io.IOException; 29import java.io.InvalidObjectException; 30import java.io.ObjectInputStream; 31import java.io.ObjectOutputStream; 32import java.io.Serializable; 33import java.nio.ByteBuffer; 34import java.nio.CharBuffer; 35import java.nio.charset.CharsetDecoder; 36import java.nio.charset.CharsetEncoder; 37import java.nio.charset.CoderResult; 38import java.nio.charset.CodingErrorAction; 39import java.nio.charset.CharacterCodingException; 40import java.text.Normalizer; 41import sun.nio.cs.ThreadLocalCoders; 42 43import java.lang.Character; // for javadoc 44import java.lang.NullPointerException; // for javadoc 45 46 47/** 48 * Represents a Uniform Resource Identifier (URI) reference. 
49 * 50 * <p> Aside from some minor deviations noted below, an instance of this 51 * class represents a URI reference as defined by 52 * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 53 * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a 54 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 55 * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format 56 * also supports scope_ids. The syntax and usage of scope_ids is described 57 * <a href="Inet6Address.html#scoped">here</a>. 58 * This class provides constructors for creating URI instances from 59 * their components or by parsing their string forms, methods for accessing the 60 * various components of an instance, and methods for normalizing, resolving, 61 * and relativizing URI instances. Instances of this class are immutable. 62 * 63 * 64 * <h3> URI syntax and components </h3> 65 * 66 * At the highest level a URI reference (hereinafter simply "URI") in string 67 * form has the syntax 68 * 69 * <blockquote> 70 * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>] 71 * </blockquote> 72 * 73 * where square brackets [...] delineate optional components and the characters 74 * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves. 75 * 76 * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is 77 * said to be <i>relative</i>. URIs are also classified according to whether 78 * they are <i>opaque</i> or <i>hierarchical</i>. 79 * 80 * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does 81 * not begin with a slash character ({@code '/'}). Opaque URIs are not 82 * subject to further parsing. 
Some examples of opaque URIs are: 83 * 84 * <blockquote><table cellpadding=0 cellspacing=0 summary="layout"> 85 * <tr><td>{@code mailto:java-net@java.sun.com}</td></tr> 86 * <tr><td>{@code news:comp.lang.java}</td></tr> 87 * <tr><td>{@code urn:isbn:096139210x}</td></tr> 88 * </table></blockquote> 89 * 90 * <p> A <i>hierarchical</i> URI is either an absolute URI whose 91 * scheme-specific part begins with a slash character, or a relative URI, that 92 * is, a URI that does not specify a scheme. Some examples of hierarchical 93 * URIs are: 94 * 95 * <blockquote> 96 * {@code http://java.sun.com/j2se/1.3/}<br> 97 * {@code docs/guide/collections/designfaq.html#28}<br> 98 * {@code ../../../demo/jfc/SwingSet2/src/SwingSet2.java}<br> 99 * {@code file:///~/calendar} 100 * </blockquote> 101 * 102 * <p> A hierarchical URI is subject to further parsing according to the syntax 103 * 104 * <blockquote> 105 * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>] 106 * </blockquote> 107 * 108 * where the characters <b>{@code :}</b>, <b>{@code /}</b>, 109 * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves. The 110 * scheme-specific part of a hierarchical URI consists of the characters 111 * between the scheme and fragment components. 112 * 113 * <p> The authority component of a hierarchical URI is, if specified, either 114 * <i>server-based</i> or <i>registry-based</i>. A server-based authority 115 * parses according to the familiar syntax 116 * 117 * <blockquote> 118 * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>] 119 * </blockquote> 120 * 121 * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for 122 * themselves. Nearly all URI schemes currently in use are server-based. An 123 * authority component that does not parse in this way is considered to be 124 * registry-based.
125 * 126 * <p> The path component of a hierarchical URI is itself said to be absolute 127 * if it begins with a slash character ({@code '/'}); otherwise it is 128 * relative. The path of a hierarchical URI that is either absolute or 129 * specifies an authority is always absolute. 130 * 131 * <p> All told, then, a URI instance has the following nine components: 132 * 133 * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment"> 134 * <tr><th><i>Component</i></th><th><i>Type</i></th></tr> 135 * <tr><td>scheme</td><td>{@code String}</td></tr> 136 * <tr><td>scheme-specific-part </td><td>{@code String}</td></tr> 137 * <tr><td>authority</td><td>{@code String}</td></tr> 138 * <tr><td>user-info</td><td>{@code String}</td></tr> 139 * <tr><td>host</td><td>{@code String}</td></tr> 140 * <tr><td>port</td><td>{@code int}</td></tr> 141 * <tr><td>path</td><td>{@code String}</td></tr> 142 * <tr><td>query</td><td>{@code String}</td></tr> 143 * <tr><td>fragment</td><td>{@code String}</td></tr> 144 * </table></blockquote> 145 * 146 * In a given instance any particular component is either <i>undefined</i> or 147 * <i>defined</i> with a distinct value. Undefined string components are 148 * represented by {@code null}, while undefined integer components are 149 * represented by {@code -1}. A string component may be defined to have the 150 * empty string as its value; this is not equivalent to that component being 151 * undefined. 152 * 153 * <p> Whether a particular component is or is not defined in an instance 154 * depends upon the type of the URI being represented. An absolute URI has a 155 * scheme component. An opaque URI has a scheme, a scheme-specific part, and 156 * possibly a fragment, but has no other components. A hierarchical URI always 157 * has a path (though it may be empty) and a scheme-specific-part (which at 158 * least contains the path), and may have any of the other components. 
If the 159 * authority component is present and is server-based then the host component 160 * will be defined and the user-information and port components may be defined. 161 * 162 * 163 * <h4> Operations on URI instances </h4> 164 * 165 * The key operations supported by this class are those of 166 * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>. 167 * 168 * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."} 169 * and {@code ".."} segments from the path component of a hierarchical URI. 170 * Each {@code "."} segment is simply removed. A {@code ".."} segment is 171 * removed only if it is preceded by a non-{@code ".."} segment. 172 * Normalization has no effect upon opaque URIs. 173 * 174 * <p> <i>Resolution</i> is the process of resolving one URI against another, 175 * <i>base</i> URI. The resulting URI is constructed from components of both 176 * URIs in the manner specified by RFC 2396, taking components from the 177 * base URI for those not specified in the original. For hierarchical URIs, 178 * the path of the original is resolved against the path of the base and then 179 * normalized. 
The result, for example, of resolving 180 * 181 * <blockquote> 182 * {@code docs/guide/collections/designfaq.html#28} 183 * 184 * (1) 185 * </blockquote> 186 * 187 * against the base URI {@code http://java.sun.com/j2se/1.3/} is the result 188 * URI 189 * 190 * <blockquote> 191 * {@code http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28} 192 * </blockquote> 193 * 194 * Resolving the relative URI 195 * 196 * <blockquote> 197 * {@code ../../../demo/jfc/SwingSet2/src/SwingSet2.java} (2) 198 * </blockquote> 199 * 200 * against this result yields, in turn, 201 * 202 * <blockquote> 203 * {@code http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java} 204 * </blockquote> 205 * 206 * Resolution of both absolute and relative URIs, and of both absolute and 207 * relative paths in the case of hierarchical URIs, is supported. Resolving 208 * the URI {@code file:///~calendar} against any other URI simply yields the 209 * original URI, since it is absolute. Resolving the relative URI (2) above 210 * against the relative base URI (1) yields the normalized, but still relative, 211 * URI 212 * 213 * <blockquote> 214 * {@code demo/jfc/SwingSet2/src/SwingSet2.java} 215 * </blockquote> 216 * 217 * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any 218 * two normalized URIs <i>u</i> and <i>v</i>, 219 * 220 * <blockquote> 221 * <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} and<br> 222 * <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} .<br> 223 * </blockquote> 224 * 225 * This operation is often useful when constructing a document containing URIs 226 * that must be made relative to the base URI of the document wherever 227 * possible.
For example, relativizing the URI 228 * 229 * <blockquote> 230 * {@code http://java.sun.com/j2se/1.3/docs/guide/index.html} 231 * </blockquote> 232 * 233 * against the base URI 234 * 235 * <blockquote> 236 * {@code http://java.sun.com/j2se/1.3} 237 * </blockquote> 238 * 239 * yields the relative URI {@code docs/guide/index.html}. 240 * 241 * 242 * <h4> Character categories </h4> 243 * 244 * RFC 2396 specifies precisely which characters are permitted in the 245 * various components of a URI reference. The following categories, most of 246 * which are taken from that specification, are used below to describe these 247 * constraints: 248 * 249 * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other"> 250 * <tr><th valign=top><i>alpha</i></th> 251 * <td>The US-ASCII alphabetic characters, 252 * {@code 'A'} through {@code 'Z'} 253 * and {@code 'a'} through {@code 'z'}</td></tr> 254 * <tr><th valign=top><i>digit</i></th> 255 * <td>The US-ASCII decimal digit characters, 256 * {@code '0'} through {@code '9'}</td></tr> 257 * <tr><th valign=top><i>alphanum</i></th> 258 * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr> 259 * <tr><th valign=top><i>unreserved</i> </th> 260 * <td>All <i>alphanum</i> characters together with those in the string 261 * {@code "_-!.~'()*"}</td></tr> 262 * <tr><th valign=top><i>punct</i></th> 263 * <td>The characters in the string {@code ",;:$&+="}</td></tr> 264 * <tr><th valign=top><i>reserved</i></th> 265 * <td>All <i>punct</i> characters together with those in the string 266 * {@code "?/[]@"}</td></tr> 267 * <tr><th valign=top><i>escaped</i></th> 268 * <td>Escaped octets, that is, triplets consisting of the percent 269 * character ({@code '%'}) followed by two hexadecimal digits 270 * ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and 271 * {@code 'a'}-{@code 'f'})</td></tr> 272 * <tr><th valign=top><i>other</i></th> 273 * <td>The Unicode characters that
are not in the US-ASCII character set, 274 * are not control characters (according to the {@link 275 * java.lang.Character#isISOControl(char) Character.isISOControl} 276 * method), and are not space characters (according to the {@link 277 * java.lang.Character#isSpaceChar(char) Character.isSpaceChar} 278 * method) <i>(<b>Deviation from RFC 2396</b>, which is 279 * limited to US-ASCII)</i></td></tr> 280 * </table></blockquote> 281 * 282 * <p><a name="legal-chars"></a> The set of all legal URI characters consists of 283 * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i> 284 * characters. 285 * 286 * 287 * <h4> Escaped octets, quotation, encoding, and decoding </h4> 288 * 289 * RFC 2396 allows escaped octets to appear in the user-info, path, query, and 290 * fragment components. Escaping serves two purposes in URIs: 291 * 292 * <ul> 293 * 294 * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to 295 * conform strictly to RFC 2396 by not containing any <i>other</i> 296 * characters. </p></li> 297 * 298 * <li><p> To <i>quote</i> characters that are otherwise illegal in a 299 * component. The user-info, path, query, and fragment components differ 300 * slightly in terms of which characters are considered legal and illegal. 301 * </p></li> 302 * 303 * </ul> 304 * 305 * These purposes are served in this class by three related operations: 306 * 307 * <ul> 308 * 309 * <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it 310 * with the sequence of escaped octets that represent that character in the 311 * UTF-8 character set. The Euro currency symbol ({@code '\u005Cu20AC'}), 312 * for example, is encoded as {@code "%E2%82%AC"}. <i>(<b>Deviation from 313 * RFC 2396</b>, which does not specify any particular character 314 * set.)</i> </p></li> 315 * 316 * <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by 317 * encoding it. 
The space character, for example, is quoted by replacing it 318 * with {@code "%20"}. UTF-8 contains US-ASCII, hence for US-ASCII 319 * characters this transformation has exactly the effect required by 320 * RFC 2396. </p></li> 321 * 322 * <li><p><a name="decode"></a> 323 * A sequence of escaped octets is <i>decoded</i> by 324 * replacing it with the sequence of characters that it represents in the 325 * UTF-8 character set. UTF-8 contains US-ASCII, hence decoding has the 326 * effect of de-quoting any quoted US-ASCII characters as well as that of 327 * decoding any encoded non-US-ASCII characters. If a <a 328 * href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs 329 * when decoding the escaped octets then the erroneous octets are replaced by 330 * {@code '\u005CuFFFD'}, the Unicode replacement character. </p></li> 331 * 332 * </ul> 333 * 334 * These operations are exposed in the constructors and methods of this class 335 * as follows: 336 * 337 * <ul> 338 * 339 * <li><p> The {@linkplain #URI(java.lang.String) single-argument 340 * constructor} requires any illegal characters in its argument to be 341 * quoted and preserves any escaped octets and <i>other</i> characters that 342 * are present. </p></li> 343 * 344 * <li><p> The {@linkplain 345 * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String) 346 * multi-argument constructors} quote illegal characters as 347 * required by the components in which they appear. The percent character 348 * ({@code '%'}) is always quoted by these constructors. Any <i>other</i> 349 * characters are preserved. 
</p></li> 350 * 351 * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath() 352 * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment() 353 * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link 354 * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the 355 * values of their corresponding components in raw form, without interpreting 356 * any escaped octets. The strings returned by these methods may contain 357 * both escaped octets and <i>other</i> characters, and will not contain any 358 * illegal characters. </p></li> 359 * 360 * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath() 361 * getPath}, {@link #getQuery() getQuery}, {@link #getFragment() 362 * getFragment}, {@link #getAuthority() getAuthority}, and {@link 363 * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped 364 * octets in their corresponding components. The strings returned by these 365 * methods may contain both <i>other</i> characters and illegal characters, 366 * and will not contain any escaped octets. </p></li> 367 * 368 * <li><p> The {@link #toString() toString} method returns a URI string with 369 * all necessary quotation but which may contain <i>other</i> characters. 370 * </p></li> 371 * 372 * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully 373 * quoted and encoded URI string that does not contain any <i>other</i> 374 * characters. </p></li> 375 * 376 * </ul> 377 * 378 * 379 * <h4> Identities </h4> 380 * 381 * For any URI <i>u</i>, it is always the case that 382 * 383 * <blockquote> 384 * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )} . 
385 * </blockquote> 386 * 387 * For any URI <i>u</i> that does not contain redundant syntax such as two 388 * slashes before an empty authority (as in {@code file:///tmp/} ) or a 389 * colon following a host name but no port (as in 390 * {@code http://java.sun.com:} ), and that does not encode characters 391 * except those that must be quoted, the following identities also hold: 392 * <pre> 393 * new URI(<i>u</i>.getScheme(), 394 * <i>u</i>.getSchemeSpecificPart(), 395 * <i>u</i>.getFragment()) 396 * .equals(<i>u</i>)</pre> 397 * in all cases, 398 * <pre> 399 * new URI(<i>u</i>.getScheme(), 400 * <i>u</i>.getAuthority(), 401 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 402 * <i>u</i>.getFragment()) 403 * .equals(<i>u</i>)</pre> 404 * if <i>u</i> is hierarchical, and 405 * <pre> 406 * new URI(<i>u</i>.getScheme(), 407 * <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(), 408 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 409 * <i>u</i>.getFragment()) 410 * .equals(<i>u</i>)</pre> 411 * if <i>u</i> is hierarchical and has either no authority or a server-based 412 * authority. 413 * 414 * 415 * <h4> URIs, URLs, and URNs </h4> 416 * 417 * A URI is a uniform resource <i>identifier</i> while a URL is a uniform 418 * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but 419 * not every URI is a URL. This is because there is another subcategory of 420 * URIs, uniform resource <i>names</i> (URNs), which name resources but do not 421 * specify how to locate them. The {@code mailto}, {@code news}, and 422 * {@code isbn} URIs shown above are examples of URNs. 423 * 424 * <p> The conceptual distinction between URIs and URLs is reflected in the 425 * differences between this class and the {@link URL} class. 426 * 427 * <p> An instance of this class represents a URI reference in the syntactic 428 * sense defined by RFC 2396. A URI may be either absolute or relative. 
429 * A URI string is parsed according to the generic syntax without regard to the 430 * scheme, if any, that it specifies. No lookup of the host, if any, is 431 * performed, and no scheme-dependent stream handler is constructed. Equality, 432 * hashing, and comparison are defined strictly in terms of the character 433 * content of the instance. In other words, a URI instance is little more than 434 * a structured string that supports the syntactic, scheme-independent 435 * operations of comparison, normalization, resolution, and relativization. 436 * 437 * <p> An instance of the {@link URL} class, by contrast, represents the 438 * syntactic components of a URL together with some of the information required 439 * to access the resource that it describes. A URL must be absolute, that is, 440 * it must always specify a scheme. A URL string is parsed according to its 441 * scheme. A stream handler is always established for a URL, and in fact it is 442 * impossible to create a URL instance for a scheme for which no handler is 443 * available. Equality and hashing depend upon both the scheme and the 444 * Internet address of the host, if any; comparison is not defined. In other 445 * words, a URL is a structured string that supports the syntactic operation of 446 * resolution as well as the network I/O operations of looking up the host and 447 * opening a connection to the specified resource. 
448 * 449 * 450 * @author Mark Reinhold 451 * @since 1.4 452 * 453 * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279: UTF-8, a 454 * transformation format of ISO 10646</i></a>, <br><a 455 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 Addressing 456 * Architecture</i></a>, <br><a 457 * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 458 * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a 459 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 460 * Literal IPv6 Addresses in URLs</i></a>, <br><a 461 * href="URISyntaxException.html">URISyntaxException</a> 462 */ 463 464public final class URI 465 implements Comparable<URI>, Serializable 466{ 467 468 // Note: Comments containing the word "ASSERT" indicate places where a 469 // throw of an InternalError should be replaced by an appropriate assertion 470 // statement once asserts are enabled in the build. 471 472 static final long serialVersionUID = -6052424284110960213L; 473 474 475 // -- Properties and components of this instance -- 476 477 // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>] 478 private transient String scheme; // null ==> relative URI 479 private transient String fragment; 480 481 // Hierarchical URI components: [//<authority>]<path>[?<query>] 482 private transient String authority; // Registry or server 483 484 // Server-based authority: [<userInfo>@]<host>[:<port>] 485 private transient String userInfo; 486 private transient String host; // null ==> registry-based 487 private transient int port = -1; // -1 ==> undefined 488 489 // Remaining components of hierarchical URIs 490 private transient String path; // null ==> opaque 491 private transient String query; 492 493 // The remaining fields may be computed on demand 494 495 private transient volatile String schemeSpecificPart; 496 private transient volatile int hash; // Zero ==> undefined 497 498 private transient volatile String decodedUserInfo = null; 
499 private transient volatile String decodedAuthority = null; 500 private transient volatile String decodedPath = null; 501 private transient volatile String decodedQuery = null; 502 private transient volatile String decodedFragment = null; 503 private transient volatile String decodedSchemeSpecificPart = null; 504 505 /** 506 * The string form of this URI. 507 * 508 * @serial 509 */ 510 private volatile String string; // The only serializable field 511 512 513 514 // -- Constructors and factories -- 515 516 private URI() { } // Used internally 517 518 /** 519 * Constructs a URI by parsing the given string. 520 * 521 * <p> This constructor parses the given string exactly as specified by the 522 * grammar in <a 523 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 524 * Appendix A, <b><i>except for the following deviations:</i></b> </p> 525 * 526 * <ul> 527 * 528 * <li><p> An empty authority component is permitted as long as it is 529 * followed by a non-empty path, a query component, or a fragment 530 * component. This allows the parsing of URIs such as 531 * {@code "file:///foo/bar"}, which seems to be the intent of 532 * RFC 2396 although the grammar does not permit it. If the 533 * authority component is empty then the user-information, host, and port 534 * components are undefined. </p></li> 535 * 536 * <li><p> Empty relative paths are permitted; this seems to be the 537 * intent of RFC 2396 although the grammar does not permit it. The 538 * primary consequence of this deviation is that a standalone fragment 539 * such as {@code "#foo"} parses as a relative URI with an empty path 540 * and the given fragment, and can be usefully <a 541 * href="#resolve-frag">resolved</a> against a base URI. </p></li> 542 * 543 * <li><p> IPv4 addresses in host components are parsed rigorously, as 544 * specified by <a 545 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>: Each 546 * element of a dotted-quad address must contain no more than three 547 * decimal digits.
Each element is further constrained to have a value 548 * no greater than 255. </p></li> 549 * 550 * <li> <p> Hostnames in host components that comprise only a single 551 * domain label are permitted to start with an <i>alphanum</i> 552 * character. This seems to be the intent of <a 553 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 554 * section 3.2.2 although the grammar does not permit it. The 555 * consequence of this deviation is that the authority component of a 556 * hierarchical URI such as {@code s://123}, will parse as a server-based 557 * authority. </p></li> 558 * 559 * <li><p> IPv6 addresses are permitted for the host component. An IPv6 560 * address must be enclosed in square brackets ({@code '['} and 561 * {@code ']'}) as specified by <a 562 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>. The 563 * IPv6 address itself must parse according to <a 564 * href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373</a>. IPv6 565 * addresses are further constrained to describe no more than sixteen 566 * bytes of address information, a constraint implicit in RFC 2373 567 * but not expressible in the grammar. </p></li> 568 * 569 * <li><p> Characters in the <i>other</i> category are permitted wherever 570 * RFC 2396 permits <i>escaped</i> octets, that is, in the 571 * user-information, path, query, and fragment components, as well as in 572 * the authority component if the authority is registry-based. This 573 * allows URIs to contain Unicode characters beyond those in the US-ASCII 574 * character set.
</p></li> 575 * 576 * </ul> 577 * 578 * @param str The string to be parsed into a URI 579 * 580 * @throws NullPointerException 581 * If {@code str} is {@code null} 582 * 583 * @throws URISyntaxException 584 * If the given string violates RFC 2396, as augmented 585 * by the above deviations 586 */ 587 public URI(String str) throws URISyntaxException { 588 /* The flag is false here, as in the five- and three-argument constructors; only the seven-argument constructor passes true. */ new Parser(str).parse(false); 589 } 590 591 /** 592 * Constructs a hierarchical URI from the given components. 593 * 594 * <p> If a scheme is given then the path, if also given, must either be 595 * empty or begin with a slash character ({@code '/'}). Otherwise a 596 * component of the new URI may be left undefined by passing {@code null} 597 * for the corresponding parameter or, in the case of the {@code port} 598 * parameter, by passing {@code -1}. 599 * 600 * <p> This constructor first builds a URI string from the given components 601 * according to the rules specified in <a 602 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 603 * section 5.2, step 7: </p> 604 * 605 * <ol> 606 * 607 * <li><p> Initially, the result string is empty. </p></li> 608 * 609 * <li><p> If a scheme is given then it is appended to the result, 610 * followed by a colon character ({@code ':'}). </p></li> 611 * 612 * <li><p> If user information, a host, or a port are given then the 613 * string {@code "//"} is appended. </p></li> 614 * 615 * <li><p> If user information is given then it is appended, followed by 616 * a commercial-at character ({@code '@'}). Any character not in the 617 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 618 * categories is <a href="#quote">quoted</a>. </p></li> 619 * 620 * <li><p> If a host is given then it is appended. If the host is a 621 * literal IPv6 address but is not enclosed in square brackets 622 * ({@code '['} and {@code ']'}) then the square brackets are added.
623 * </p></li> 624 * 625 * <li><p> If a port number is given then a colon character 626 * ({@code ':'}) is appended, followed by the port number in decimal. 627 * </p></li> 628 * 629 * <li><p> If a path is given then it is appended. Any character not in 630 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 631 * categories, and not equal to the slash character ({@code '/'}) or the 632 * commercial-at character ({@code '@'}), is quoted. </p></li> 633 * 634 * <li><p> If a query is given then a question-mark character 635 * ({@code '?'}) is appended, followed by the query. Any character that 636 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 637 * </p></li> 638 * 639 * <li><p> Finally, if a fragment is given then a hash character 640 * ({@code '#'}) is appended, followed by the fragment. Any character 641 * that is not a legal URI character is quoted. </p></li> 642 * 643 * </ol> 644 * 645 * <p> The resulting URI string is then parsed as if by invoking the {@link 646 * #URI(String)} constructor and then invoking the {@link 647 * #parseServerAuthority()} method upon the result; this may cause a {@link 648 * URISyntaxException} to be thrown.
</p> 649 * 650 * @param scheme Scheme name 651 * @param userInfo User name and authorization information 652 * @param host Host name 653 * @param port Port number 654 * @param path Path 655 * @param query Query 656 * @param fragment Fragment 657 * 658 * @throws URISyntaxException 659 * If both a scheme and a path are given but the path is relative, 660 * if the URI string constructed from the given components violates 661 * RFC 2396, or if the authority component of the string is 662 * present but cannot be parsed as a server-based authority 663 */ 664 public URI(String scheme, 665 String userInfo, String host, int port, 666 String path, String query, String fragment) 667 throws URISyntaxException 668 { 669 /* The two null arguments after scheme occupy the positions where the three-argument constructor passes ssp and the five-argument constructor passes authority. */ String s = toString(scheme, null, 670 null, userInfo, host, port, 671 path, query, fragment); 672 checkPath(s, scheme, path); 673 /* Only this constructor passes true; presumably that enables the server-based authority check described in the Javadoc above (TODO confirm against Parser.parse). */ new Parser(s).parse(true); 674 } 675 676 /** 677 * Constructs a hierarchical URI from the given components. 678 * 679 * <p> If a scheme is given then the path, if also given, must either be 680 * empty or begin with a slash character ({@code '/'}). Otherwise a 681 * component of the new URI may be left undefined by passing {@code null} 682 * for the corresponding parameter. 683 * 684 * <p> This constructor first builds a URI string from the given components 685 * according to the rules specified in <a 686 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 687 * section 5.2, step 7: </p> 688 * 689 * <ol> 690 * 691 * <li><p> Initially, the result string is empty. </p></li> 692 * 693 * <li><p> If a scheme is given then it is appended to the result, 694 * followed by a colon character ({@code ':'}). </p></li> 695 * 696 * <li><p> If an authority is given then the string {@code "//"} is 697 * appended, followed by the authority. If the authority contains a 698 * literal IPv6 address then the address must be enclosed in square 699 * brackets ({@code '['} and {@code ']'}).
Any character not in the 700 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 701 * categories, and not equal to the commercial-at character 702 * ({@code '@'}), is <a href="#quote">quoted</a>. </p></li> 703 * 704 * <li><p> If a path is given then it is appended. Any character not in 705 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 706 * categories, and not equal to the slash character ({@code '/'}) or the 707 * commercial-at character ({@code '@'}), is quoted. </p></li> 708 * 709 * <li><p> If a query is given then a question-mark character 710 * ({@code '?'}) is appended, followed by the query. Any character that 711 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 712 * </p></li> 713 * 714 * <li><p> Finally, if a fragment is given then a hash character 715 * ({@code '#'}) is appended, followed by the fragment. Any character 716 * that is not a legal URI character is quoted. </p></li> 717 * 718 * </ol> 719 * 720 * <p> The resulting URI string is then parsed as if by invoking the {@link 721 * #URI(String)} constructor and then invoking the {@link 722 * #parseServerAuthority()} method upon the result; this may cause a {@link 723 * URISyntaxException} to be thrown.
</p> 724 * 725 * @param scheme Scheme name 726 * @param authority Authority 727 * @param path Path 728 * @param query Query 729 * @param fragment Fragment 730 * 731 * @throws URISyntaxException 732 * If both a scheme and a path are given but the path is relative, 733 * if the URI string constructed from the given components violates 734 * RFC 2396, or if the authority component of the string is 735 * present but cannot be parsed as a server-based authority 736 */ 737 public URI(String scheme, 738 String authority, 739 String path, String query, String fragment) 740 throws URISyntaxException 741 { 742 String s = toString(scheme, null, 743 authority, null, null, -1, 744 path, query, fragment); 745 checkPath(s, scheme, path); 746 /* NOTE(review): the Javadoc above describes parsing as if parseServerAuthority() were then invoked, yet this passes false where the seven-argument constructor passes true; confirm against Parser.parse whether the documentation or the flag reflects the intent. */ new Parser(s).parse(false); 747 } 748 749 /** 750 * Constructs a hierarchical URI from the given components. 751 * 752 * <p> A component may be left undefined by passing {@code null}. 753 * 754 * <p> This convenience constructor works as if by invoking the 755 * seven-argument constructor as follows: 756 * 757 * <blockquote> 758 * {@code new} {@link #URI(String, String, String, int, String, String, String) 759 * URI}{@code (scheme, null, host, -1, path, null, fragment);} 760 * </blockquote> 761 * 762 * @param scheme Scheme name 763 * @param host Host name 764 * @param path Path 765 * @param fragment Fragment 766 * 767 * @throws URISyntaxException 768 * If the URI string constructed from the given components 769 * violates RFC 2396 770 */ 771 public URI(String scheme, String host, String path, String fragment) 772 throws URISyntaxException 773 { 774 this(scheme, null, host, -1, path, null, fragment); 775 } 776 777 /** 778 * Constructs a URI from the given components. 779 * 780 * <p> A component may be left undefined by passing {@code null}. 781 * 782 * <p> This constructor first builds a URI in string form using the given 783 * components as follows: </p> 784 * 785 * <ol> 786 * 787 * <li><p> Initially, the result string is empty.
</p></li> 788 * 789 * <li><p> If a scheme is given then it is appended to the result, 790 * followed by a colon character ({@code ':'}). </p></li> 791 * 792 * <li><p> If a scheme-specific part is given then it is appended. Any 793 * character that is not a <a href="#legal-chars">legal URI character</a> 794 * is <a href="#quote">quoted</a>. </p></li> 795 * 796 * <li><p> Finally, if a fragment is given then a hash character 797 * ({@code '#'}) is appended to the string, followed by the fragment. 798 * Any character that is not a legal URI character is quoted. </p></li> 799 * 800 * </ol> 801 * 802 * <p> The resulting URI string is then parsed in order to create the new 803 * URI instance as if by invoking the {@link #URI(String)} constructor; 804 * this may cause a {@link URISyntaxException} to be thrown. </p> 805 * 806 * @param scheme Scheme name 807 * @param ssp Scheme-specific part 808 * @param fragment Fragment 809 * 810 * @throws URISyntaxException 811 * If the URI string constructed from the given components 812 * violates RFC 2396 813 */ 814 public URI(String scheme, String ssp, String fragment) 815 throws URISyntaxException 816 { 817 new Parser(toString(scheme, ssp, 818 null, null, null, -1, 819 null, null, fragment)) 820 .parse(false); 821 } 822 823 /** 824 * Creates a URI by parsing the given string. 825 * 826 * <p> This convenience factory method works as if by invoking the {@link 827 * #URI(String)} constructor; any {@link URISyntaxException} thrown by the 828 * constructor is caught and wrapped in a new {@link 829 * IllegalArgumentException} object, which is then thrown. 830 * 831 * <p> This method is provided for use in situations where it is known that 832 * the given string is a legal URI, for example for URI constants declared 833 * within a program, and so it would be considered a programming error 834 * for the string not to parse as such.
The constructors, which throw 835 * {@link URISyntaxException} directly, should be used situations where a 836 * URI is being constructed from user input or from some other source that 837 * may be prone to errors. </p> 838 * 839 * @param str The string to be parsed into a URI 840 * @return The new URI 841 * 842 * @throws NullPointerException 843 * If {@code str} is {@code null} 844 * 845 * @throws IllegalArgumentException 846 * If the given string violates RFC 2396 847 */ 848 public static URI create(String str) { 849 try { 850 return new URI(str); 851 } catch (URISyntaxException x) { 852 throw new IllegalArgumentException(x.getMessage(), x); 853 } 854 } 855 856 857 // -- Operations -- 858 859 /** 860 * Attempts to parse this URI's authority component, if defined, into 861 * user-information, host, and port components. 862 * 863 * <p> If this URI's authority component has already been recognized as 864 * being server-based then it will already have been parsed into 865 * user-information, host, and port components. In this case, or if this 866 * URI has no authority component, this method simply returns this URI. 867 * 868 * <p> Otherwise this method attempts once more to parse the authority 869 * component into user-information, host, and port components, and throws 870 * an exception describing why the authority component could not be parsed 871 * in that way. 872 * 873 * <p> This method is provided because the generic URI syntax specified in 874 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 875 * cannot always distinguish a malformed server-based authority from a 876 * legitimate registry-based authority. It must therefore treat some 877 * instances of the former as instances of the latter. The authority 878 * component in the URI string {@code "//foo:bar"}, for example, is not a 879 * legal server-based authority but it is legal as a registry-based 880 * authority. 
881 * 882 * <p> In many common situations, for example when working URIs that are 883 * known to be either URNs or URLs, the hierarchical URIs being used will 884 * always be server-based. They therefore must either be parsed as such or 885 * treated as an error. In these cases a statement such as 886 * 887 * <blockquote> 888 * {@code URI }<i>u</i>{@code = new URI(str).parseServerAuthority();} 889 * </blockquote> 890 * 891 * <p> can be used to ensure that <i>u</i> always refers to a URI that, if 892 * it has an authority component, has a server-based authority with proper 893 * user-information, host, and port components. Invoking this method also 894 * ensures that if the authority could not be parsed in that way then an 895 * appropriate diagnostic message can be issued based upon the exception 896 * that is thrown. </p> 897 * 898 * @return A URI whose authority field has been parsed 899 * as a server-based authority 900 * 901 * @throws URISyntaxException 902 * If the authority component of this URI is defined 903 * but cannot be parsed as a server-based authority 904 * according to RFC 2396 905 */ 906 public URI parseServerAuthority() 907 throws URISyntaxException 908 { 909 // We could be clever and cache the error message and index from the 910 // exception thrown during the original parse, but that would require 911 // either more fields or a more-obscure representation. 912 if ((host != null) || (authority == null)) 913 return this; 914 defineString(); 915 new Parser(string).parse(true); 916 return this; 917 } 918 919 /** 920 * Normalizes this URI's path. 921 * 922 * <p> If this URI is opaque, or if its path is already in normal form, 923 * then this URI is returned. 
Otherwise a new URI is constructed that is 924 * identical to this URI except that its path is computed by normalizing 925 * this URI's path in a manner consistent with <a 926 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 927 * section 5.2, step 6, sub-steps c through f; that is: 928 * </p> 929 * 930 * <ol> 931 * 932 * <li><p> All {@code "."} segments are removed. </p></li> 933 * 934 * <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."} 935 * segment then both of these segments are removed. This step is 936 * repeated until it is no longer applicable. </p></li> 937 * 938 * <li><p> If the path is relative, and if its first segment contains a 939 * colon character ({@code ':'}), then a {@code "."} segment is 940 * prepended. This prevents a relative URI with a path such as 941 * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a 942 * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. 943 * <b><i>(Deviation from RFC 2396)</i></b> </p></li> 944 * 945 * </ol> 946 * 947 * <p> A normalized path will begin with one or more {@code ".."} segments 948 * if there were insufficient non-{@code ".."} segments preceding them to 949 * allow their removal. A normalized path will begin with a {@code "."} 950 * segment if one was inserted by step 3 above. Otherwise, a normalized 951 * path will not contain any {@code "."} or {@code ".."} segments. </p> 952 * 953 * @return A URI equivalent to this URI, 954 * but whose path is in normal form 955 */ 956 public URI normalize() { 957 return normalize(this); 958 } 959 960 /** 961 * Resolves the given URI against this URI. 962 * 963 * <p> If the given URI is already absolute, or if this URI is opaque, then 964 * the given URI is returned. 
*
     * <p><a name="resolve-frag"></a> If the given URI's fragment component is
     * defined, its path component is empty, and its scheme, authority, and
     * query components are undefined, then a URI with the given fragment but
     * with all other components equal to those of this URI is returned. This
     * allows a URI representing a standalone fragment reference, such as
     * {@code "#foo"}, to be usefully resolved against a base URI.
     *
     * <p> Otherwise this method constructs a new hierarchical URI in a manner
     * consistent with <a
     * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
     * section 5.2; that is: </p>
     *
     * <ol>
     *
     *   <li><p> A new URI is constructed with this URI's scheme and the given
     *   URI's query and fragment components. </p></li>
     *
     *   <li><p> If the given URI has an authority component then the new URI's
     *   authority and path are taken from the given URI. </p></li>
     *
     *   <li><p> Otherwise the new URI's authority component is copied from
     *   this URI, and its path is computed as follows: </p>
     *
     *   <ol>
     *
     *     <li><p> If the given URI's path is absolute then the new URI's path
     *     is taken from the given URI. </p></li>
     *
     *     <li><p> Otherwise the given URI's path is relative, and so the new
     *     URI's path is computed by resolving the path of the given URI
     *     against the path of this URI. This is done by concatenating all but
     *     the last segment of this URI's path, if any, with the given URI's
     *     path and then normalizing the result as if by invoking the {@link
     *     #normalize() normalize} method. </p></li>
     *
     *   </ol></li>
     *
     * </ol>
     *
     * <p> The result of this method is absolute if, and only if, either this
     * URI is absolute or the given URI is absolute.
</p> 1007 * 1008 * @param uri The URI to be resolved against this URI 1009 * @return The resulting URI 1010 * 1011 * @throws NullPointerException 1012 * If {@code uri} is {@code null} 1013 */ 1014 public URI resolve(URI uri) { 1015 return resolve(this, uri); 1016 } 1017 1018 /** 1019 * Constructs a new URI by parsing the given string and then resolving it 1020 * against this URI. 1021 * 1022 * <p> This convenience method works as if invoking it were equivalent to 1023 * evaluating the expression {@link #resolve(java.net.URI) 1024 * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p> 1025 * 1026 * @param str The string to be parsed into a URI 1027 * @return The resulting URI 1028 * 1029 * @throws NullPointerException 1030 * If {@code str} is {@code null} 1031 * 1032 * @throws IllegalArgumentException 1033 * If the given string violates RFC 2396 1034 */ 1035 public URI resolve(String str) { 1036 return resolve(URI.create(str)); 1037 } 1038 1039 /** 1040 * Relativizes the given URI against this URI. 1041 * 1042 * <p> The relativization of the given URI against this URI is computed as 1043 * follows: </p> 1044 * 1045 * <ol> 1046 * 1047 * <li><p> If either this URI or the given URI are opaque, or if the 1048 * scheme and authority components of the two URIs are not identical, or 1049 * if the path of this URI is not a prefix of the path of the given URI, 1050 * then the given URI is returned. </p></li> 1051 * 1052 * <li><p> Otherwise a new relative hierarchical URI is constructed with 1053 * query and fragment components taken from the given URI and with a path 1054 * component computed by removing this URI's path from the beginning of 1055 * the given URI's path. 
</p></li> 1056 * 1057 * </ol> 1058 * 1059 * @param uri The URI to be relativized against this URI 1060 * @return The resulting URI 1061 * 1062 * @throws NullPointerException 1063 * If {@code uri} is {@code null} 1064 */ 1065 public URI relativize(URI uri) { 1066 return relativize(this, uri); 1067 } 1068 1069 /** 1070 * Constructs a URL from this URI. 1071 * 1072 * <p> This convenience method works as if invoking it were equivalent to 1073 * evaluating the expression {@code new URL(this.toString())} after 1074 * first checking that this URI is absolute. </p> 1075 * 1076 * @return A URL constructed from this URI 1077 * 1078 * @throws IllegalArgumentException 1079 * If this URL is not absolute 1080 * 1081 * @throws MalformedURLException 1082 * If a protocol handler for the URL could not be found, 1083 * or if some other error occurred while constructing the URL 1084 */ 1085 public URL toURL() 1086 throws MalformedURLException { 1087 if (!isAbsolute()) 1088 throw new IllegalArgumentException("URI is not absolute"); 1089 return new URL(toString()); 1090 } 1091 1092 // -- Component access methods -- 1093 1094 /** 1095 * Returns the scheme component of this URI. 1096 * 1097 * <p> The scheme component of a URI, if defined, only contains characters 1098 * in the <i>alphanum</i> category and in the string {@code "-.+"}. A 1099 * scheme always starts with an <i>alpha</i> character. <p> 1100 * 1101 * The scheme component of a URI cannot contain escaped octets, hence this 1102 * method does not perform any decoding. 1103 * 1104 * @return The scheme component of this URI, 1105 * or {@code null} if the scheme is undefined 1106 */ 1107 public String getScheme() { 1108 return scheme; 1109 } 1110 1111 /** 1112 * Tells whether or not this URI is absolute. 1113 * 1114 * <p> A URI is absolute if, and only if, it has a scheme component. 
</p> 1115 * 1116 * @return {@code true} if, and only if, this URI is absolute 1117 */ 1118 public boolean isAbsolute() { 1119 return scheme != null; 1120 } 1121 1122 /** 1123 * Tells whether or not this URI is opaque. 1124 * 1125 * <p> A URI is opaque if, and only if, it is absolute and its 1126 * scheme-specific part does not begin with a slash character ('/'). 1127 * An opaque URI has a scheme, a scheme-specific part, and possibly 1128 * a fragment; all other components are undefined. </p> 1129 * 1130 * @return {@code true} if, and only if, this URI is opaque 1131 */ 1132 public boolean isOpaque() { 1133 return path == null; 1134 } 1135 1136 /** 1137 * Returns the raw scheme-specific part of this URI. The scheme-specific 1138 * part is never undefined, though it may be empty. 1139 * 1140 * <p> The scheme-specific part of a URI only contains legal URI 1141 * characters. </p> 1142 * 1143 * @return The raw scheme-specific part of this URI 1144 * (never {@code null}) 1145 */ 1146 public String getRawSchemeSpecificPart() { 1147 defineSchemeSpecificPart(); 1148 return schemeSpecificPart; 1149 } 1150 1151 /** 1152 * Returns the decoded scheme-specific part of this URI. 1153 * 1154 * <p> The string returned by this method is equal to that returned by the 1155 * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method 1156 * except that all sequences of escaped octets are <a 1157 * href="#decode">decoded</a>. </p> 1158 * 1159 * @return The decoded scheme-specific part of this URI 1160 * (never {@code null}) 1161 */ 1162 public String getSchemeSpecificPart() { 1163 if (decodedSchemeSpecificPart == null) 1164 decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart()); 1165 return decodedSchemeSpecificPart; 1166 } 1167 1168 /** 1169 * Returns the raw authority component of this URI. 
1170 * 1171 * <p> The authority component of a URI, if defined, only contains the 1172 * commercial-at character ({@code '@'}) and characters in the 1173 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i> 1174 * categories. If the authority is server-based then it is further 1175 * constrained to have valid user-information, host, and port 1176 * components. </p> 1177 * 1178 * @return The raw authority component of this URI, 1179 * or {@code null} if the authority is undefined 1180 */ 1181 public String getRawAuthority() { 1182 return authority; 1183 } 1184 1185 /** 1186 * Returns the decoded authority component of this URI. 1187 * 1188 * <p> The string returned by this method is equal to that returned by the 1189 * {@link #getRawAuthority() getRawAuthority} method except that all 1190 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1191 * 1192 * @return The decoded authority component of this URI, 1193 * or {@code null} if the authority is undefined 1194 */ 1195 public String getAuthority() { 1196 if (decodedAuthority == null) 1197 decodedAuthority = decode(authority); 1198 return decodedAuthority; 1199 } 1200 1201 /** 1202 * Returns the raw user-information component of this URI. 1203 * 1204 * <p> The user-information component of a URI, if defined, only contains 1205 * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and 1206 * <i>other</i> categories. </p> 1207 * 1208 * @return The raw user-information component of this URI, 1209 * or {@code null} if the user information is undefined 1210 */ 1211 public String getRawUserInfo() { 1212 return userInfo; 1213 } 1214 1215 /** 1216 * Returns the decoded user-information component of this URI. 1217 * 1218 * <p> The string returned by this method is equal to that returned by the 1219 * {@link #getRawUserInfo() getRawUserInfo} method except that all 1220 * sequences of escaped octets are <a href="#decode">decoded</a>. 
</p> 1221 * 1222 * @return The decoded user-information component of this URI, 1223 * or {@code null} if the user information is undefined 1224 */ 1225 public String getUserInfo() { 1226 if ((decodedUserInfo == null) && (userInfo != null)) 1227 decodedUserInfo = decode(userInfo); 1228 return decodedUserInfo; 1229 } 1230 1231 /** 1232 * Returns the host component of this URI. 1233 * 1234 * <p> The host component of a URI, if defined, will have one of the 1235 * following forms: </p> 1236 * 1237 * <ul> 1238 * 1239 * <li><p> A domain name consisting of one or more <i>labels</i> 1240 * separated by period characters ({@code '.'}), optionally followed by 1241 * a period character. Each label consists of <i>alphanum</i> characters 1242 * as well as hyphen characters ({@code '-'}), though hyphens never 1243 * occur as the first or last characters in a label. The rightmost 1244 * label of a domain name consisting of two or more labels, begins 1245 * with an <i>alpha</i> character. </li> 1246 * 1247 * <li><p> A dotted-quad IPv4 address of the form 1248 * <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +}, 1249 * where no <i>digit</i> sequence is longer than three characters and no 1250 * sequence has a value larger than 255. </p></li> 1251 * 1252 * <li><p> An IPv6 address enclosed in square brackets ({@code '['} and 1253 * {@code ']'}) and consisting of hexadecimal digits, colon characters 1254 * ({@code ':'}), and possibly an embedded IPv4 address. The full 1255 * syntax of IPv6 addresses is specified in <a 1256 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 1257 * Addressing Architecture</i></a>. </p></li> 1258 * 1259 * </ul> 1260 * 1261 * The host component of a URI cannot contain escaped octets, hence this 1262 * method does not perform any decoding. 
1263 * 1264 * @return The host component of this URI, 1265 * or {@code null} if the host is undefined 1266 */ 1267 public String getHost() { 1268 return host; 1269 } 1270 1271 /** 1272 * Returns the port number of this URI. 1273 * 1274 * <p> The port component of a URI, if defined, is a non-negative 1275 * integer. </p> 1276 * 1277 * @return The port component of this URI, 1278 * or {@code -1} if the port is undefined 1279 */ 1280 public int getPort() { 1281 return port; 1282 } 1283 1284 /** 1285 * Returns the raw path component of this URI. 1286 * 1287 * <p> The path component of a URI, if defined, only contains the slash 1288 * character ({@code '/'}), the commercial-at character ({@code '@'}), 1289 * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, 1290 * and <i>other</i> categories. </p> 1291 * 1292 * @return The path component of this URI, 1293 * or {@code null} if the path is undefined 1294 */ 1295 public String getRawPath() { 1296 return path; 1297 } 1298 1299 /** 1300 * Returns the decoded path component of this URI. 1301 * 1302 * <p> The string returned by this method is equal to that returned by the 1303 * {@link #getRawPath() getRawPath} method except that all sequences of 1304 * escaped octets are <a href="#decode">decoded</a>. </p> 1305 * 1306 * @return The decoded path component of this URI, 1307 * or {@code null} if the path is undefined 1308 */ 1309 public String getPath() { 1310 if ((decodedPath == null) && (path != null)) 1311 decodedPath = decode(path); 1312 return decodedPath; 1313 } 1314 1315 /** 1316 * Returns the raw query component of this URI. 1317 * 1318 * <p> The query component of a URI, if defined, only contains legal URI 1319 * characters. </p> 1320 * 1321 * @return The raw query component of this URI, 1322 * or {@code null} if the query is undefined 1323 */ 1324 public String getRawQuery() { 1325 return query; 1326 } 1327 1328 /** 1329 * Returns the decoded query component of this URI. 
1330 * 1331 * <p> The string returned by this method is equal to that returned by the 1332 * {@link #getRawQuery() getRawQuery} method except that all sequences of 1333 * escaped octets are <a href="#decode">decoded</a>. </p> 1334 * 1335 * @return The decoded query component of this URI, 1336 * or {@code null} if the query is undefined 1337 */ 1338 public String getQuery() { 1339 if ((decodedQuery == null) && (query != null)) 1340 decodedQuery = decode(query, false); 1341 return decodedQuery; 1342 } 1343 1344 /** 1345 * Returns the raw fragment component of this URI. 1346 * 1347 * <p> The fragment component of a URI, if defined, only contains legal URI 1348 * characters. </p> 1349 * 1350 * @return The raw fragment component of this URI, 1351 * or {@code null} if the fragment is undefined 1352 */ 1353 public String getRawFragment() { 1354 return fragment; 1355 } 1356 1357 /** 1358 * Returns the decoded fragment component of this URI. 1359 * 1360 * <p> The string returned by this method is equal to that returned by the 1361 * {@link #getRawFragment() getRawFragment} method except that all 1362 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1363 * 1364 * @return The decoded fragment component of this URI, 1365 * or {@code null} if the fragment is undefined 1366 */ 1367 public String getFragment() { 1368 if ((decodedFragment == null) && (fragment != null)) 1369 decodedFragment = decode(fragment, false); 1370 return decodedFragment; 1371 } 1372 1373 1374 // -- Equality, comparison, hash code, toString, and serialization -- 1375 1376 /** 1377 * Tests this URI for equality with another object. 1378 * 1379 * <p> If the given object is not a URI then this method immediately 1380 * returns {@code false}. 1381 * 1382 * <p> For two URIs to be considered equal requires that either both are 1383 * opaque or both are hierarchical. Their schemes must either both be 1384 * undefined or else be equal without regard to case. 
Their fragments
     * must either both be undefined or else be equal.
     *
     * <p> For two opaque URIs to be considered equal, their scheme-specific
     * parts must be equal.
     *
     * <p> For two hierarchical URIs to be considered equal, their paths must
     * be equal and their queries must either both be undefined or else be
     * equal. Their authorities must either both be undefined, or both be
     * registry-based, or both be server-based. If their authorities are
     * defined and are registry-based, then they must be equal. If their
     * authorities are defined and are server-based, then their hosts must be
     * equal without regard to case, their port numbers must be equal, and
     * their user-information components must be equal.
     *
     * <p> When testing the user-information, path, query, fragment, authority,
     * or scheme-specific parts of two URIs for equality, the raw forms rather
     * than the encoded forms of these components are compared and the
     * hexadecimal digits of escaped octets are compared without regard to
     * case.
     *
     * <p> This method satisfies the general contract of the {@link
     * java.lang.Object#equals(Object) Object.equals} method.
</p> 1407 * 1408 * @param ob The object to which this object is to be compared 1409 * 1410 * @return {@code true} if, and only if, the given object is a URI that 1411 * is identical to this URI 1412 */ 1413 public boolean equals(Object ob) { 1414 if (ob == this) 1415 return true; 1416 if (!(ob instanceof URI)) 1417 return false; 1418 URI that = (URI)ob; 1419 if (this.isOpaque() != that.isOpaque()) return false; 1420 if (!equalIgnoringCase(this.scheme, that.scheme)) return false; 1421 if (!equal(this.fragment, that.fragment)) return false; 1422 1423 // Opaque 1424 if (this.isOpaque()) 1425 return equal(this.schemeSpecificPart, that.schemeSpecificPart); 1426 1427 // Hierarchical 1428 if (!equal(this.path, that.path)) return false; 1429 if (!equal(this.query, that.query)) return false; 1430 1431 // Authorities 1432 if (this.authority == that.authority) return true; 1433 if (this.host != null) { 1434 // Server-based 1435 if (!equal(this.userInfo, that.userInfo)) return false; 1436 if (!equalIgnoringCase(this.host, that.host)) return false; 1437 if (this.port != that.port) return false; 1438 } else if (this.authority != null) { 1439 // Registry-based 1440 if (!equal(this.authority, that.authority)) return false; 1441 } else if (this.authority != that.authority) { 1442 return false; 1443 } 1444 1445 return true; 1446 } 1447 1448 /** 1449 * Returns a hash-code value for this URI. The hash code is based upon all 1450 * of the URI's components, and satisfies the general contract of the 1451 * {@link java.lang.Object#hashCode() Object.hashCode} method. 
1452 * 1453 * @return A hash-code value for this URI 1454 */ 1455 public int hashCode() { 1456 if (hash != 0) 1457 return hash; 1458 int h = hashIgnoringCase(0, scheme); 1459 h = hash(h, fragment); 1460 if (isOpaque()) { 1461 h = hash(h, schemeSpecificPart); 1462 } else { 1463 h = hash(h, path); 1464 h = hash(h, query); 1465 if (host != null) { 1466 h = hash(h, userInfo); 1467 h = hashIgnoringCase(h, host); 1468 h += 1949 * port; 1469 } else { 1470 h = hash(h, authority); 1471 } 1472 } 1473 hash = h; 1474 return h; 1475 } 1476 1477 /** 1478 * Compares this URI to another object, which must be a URI. 1479 * 1480 * <p> When comparing corresponding components of two URIs, if one 1481 * component is undefined but the other is defined then the first is 1482 * considered to be less than the second. Unless otherwise noted, string 1483 * components are ordered according to their natural, case-sensitive 1484 * ordering as defined by the {@link java.lang.String#compareTo(Object) 1485 * String.compareTo} method. String components that are subject to 1486 * encoding are compared by comparing their raw forms rather than their 1487 * encoded forms. 1488 * 1489 * <p> The ordering of URIs is defined as follows: </p> 1490 * 1491 * <ul> 1492 * 1493 * <li><p> Two URIs with different schemes are ordered according the 1494 * ordering of their schemes, without regard to case. </p></li> 1495 * 1496 * <li><p> A hierarchical URI is considered to be less than an opaque URI 1497 * with an identical scheme. </p></li> 1498 * 1499 * <li><p> Two opaque URIs with identical schemes are ordered according 1500 * to the ordering of their scheme-specific parts. </p></li> 1501 * 1502 * <li><p> Two opaque URIs with identical schemes and scheme-specific 1503 * parts are ordered according to the ordering of their 1504 * fragments. 
</p></li>
     *
     *   <li><p> Two hierarchical URIs with identical schemes are ordered
     *   according to the ordering of their authority components: </p>
     *
     *   <ul>
     *
     *     <li><p> If both authority components are server-based then the URIs
     *     are ordered according to their user-information components; if these
     *     components are identical then the URIs are ordered according to the
     *     ordering of their hosts, without regard to case; if the hosts are
     *     identical then the URIs are ordered according to the ordering of
     *     their ports. </p></li>
     *
     *     <li><p> If one or both authority components are registry-based then
     *     the URIs are ordered according to the ordering of their authority
     *     components. </p></li>
     *
     *   </ul></li>
     *
     *   <li><p> Finally, two hierarchical URIs with identical schemes and
     *   authority components are ordered according to the ordering of their
     *   paths; if their paths are identical then they are ordered according to
     *   the ordering of their queries; if the queries are identical then they
     *   are ordered according to the order of their fragments. </p></li>
     *
     * </ul>
     *
     * <p> This method satisfies the general contract of the {@link
     * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
     * method.
</p> 1535 * 1536 * @param that 1537 * The object to which this URI is to be compared 1538 * 1539 * @return A negative integer, zero, or a positive integer as this URI is 1540 * less than, equal to, or greater than the given URI 1541 * 1542 * @throws ClassCastException 1543 * If the given object is not a URI 1544 */ 1545 public int compareTo(URI that) { 1546 int c; 1547 1548 if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0) 1549 return c; 1550 1551 if (this.isOpaque()) { 1552 if (that.isOpaque()) { 1553 // Both opaque 1554 if ((c = compare(this.schemeSpecificPart, 1555 that.schemeSpecificPart)) != 0) 1556 return c; 1557 return compare(this.fragment, that.fragment); 1558 } 1559 return +1; // Opaque > hierarchical 1560 } else if (that.isOpaque()) { 1561 return -1; // Hierarchical < opaque 1562 } 1563 1564 // Hierarchical 1565 if ((this.host != null) && (that.host != null)) { 1566 // Both server-based 1567 if ((c = compare(this.userInfo, that.userInfo)) != 0) 1568 return c; 1569 if ((c = compareIgnoringCase(this.host, that.host)) != 0) 1570 return c; 1571 if ((c = this.port - that.port) != 0) 1572 return c; 1573 } else { 1574 // If one or both authorities are registry-based then we simply 1575 // compare them in the usual, case-sensitive way. If one is 1576 // registry-based and one is server-based then the strings are 1577 // guaranteed to be unequal, hence the comparison will never return 1578 // zero and the compareTo and equals methods will remain 1579 // consistent. 1580 if ((c = compare(this.authority, that.authority)) != 0) return c; 1581 } 1582 1583 if ((c = compare(this.path, that.path)) != 0) return c; 1584 if ((c = compare(this.query, that.query)) != 0) return c; 1585 return compare(this.fragment, that.fragment); 1586 } 1587 1588 /** 1589 * Returns the content of this URI as a string. 
1590 * 1591 * <p> If this URI was created by invoking one of the constructors in this 1592 * class then a string equivalent to the original input string, or to the 1593 * string computed from the originally-given components, as appropriate, is 1594 * returned. Otherwise this URI was created by normalization, resolution, 1595 * or relativization, and so a string is constructed from this URI's 1596 * components according to the rules specified in <a 1597 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 1598 * section 5.2, step 7. </p> 1599 * 1600 * @return The string form of this URI 1601 */ 1602 public String toString() { 1603 defineString(); 1604 return string; 1605 } 1606 1607 /** 1608 * Returns the content of this URI as a US-ASCII string. 1609 * 1610 * <p> If this URI does not contain any characters in the <i>other</i> 1611 * category then an invocation of this method will return the same value as 1612 * an invocation of the {@link #toString() toString} method. Otherwise 1613 * this method works as if by invoking that method and then <a 1614 * href="#encode">encoding</a> the result. </p> 1615 * 1616 * @return The string form of this URI, encoded as needed 1617 * so that it only contains characters in the US-ASCII 1618 * charset 1619 */ 1620 public String toASCIIString() { 1621 defineString(); 1622 return encode(string); 1623 } 1624 1625 1626 // -- Serialization support -- 1627 1628 /** 1629 * Saves the content of this URI to the given serial stream. 1630 * 1631 * <p> The only serializable field of a URI instance is its {@code string} 1632 * field. That field is given a value, if it does not have one already, 1633 * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()} 1634 * method of the given object-output stream is invoked. 
</p>
     *
     * @param  os  The object-output stream to which this object
     *             is to be written
     */
    private void writeObject(ObjectOutputStream os)
        throws IOException
    {
        // The string form must exist before the default mechanism runs
        this.defineString();
        os.defaultWriteObject();        // Writes the string field only
    }

    /**
     * Reconstitutes a URI from the given serial stream.
     *
     * <p> The value of the {@code string} field is read by invoking the
     * {@link java.io.ObjectInputStream#defaultReadObject()} method, and that
     * string is then parsed in the usual way.  A string that does not parse
     * is reported as an {@link java.io.InvalidObjectException} whose cause is
     * the underlying {@link URISyntaxException}.  </p>
     *
     * @param  is  The object-input stream from which this object
     *             is being read
     */
    private void readObject(ObjectInputStream is)
        throws ClassNotFoundException, IOException
    {
        // Reset the port to the "undefined" value before parsing.
        // NOTE(review): presumably needed because deserialization does not
        // run the field initializer -- confirm against the field declaration.
        this.port = -1;
        is.defaultReadObject();
        try {
            new Parser(this.string).parse(false);
        } catch (URISyntaxException cause) {
            // Chain the cause explicitly via initCause
            IOException failure = new InvalidObjectException("Invalid URI");
            failure.initCause(cause);
            throw failure;
        }
    }


    // -- End of public methods --


    // -- Utility methods for string-field comparison and hashing --

    // These methods return appropriate values for null string arguments,
    // thereby simplifying the equals, hashCode, and compareTo methods.
    //
    // The case-ignoring methods should only be applied to strings whose
    // characters are all known to be US-ASCII.  Because of this restriction,
    // these methods are faster than the similar methods in the String class.
1682 1683 // US-ASCII only 1684 private static int toLower(char c) { 1685 if ((c >= 'A') && (c <= 'Z')) 1686 return c + ('a' - 'A'); 1687 return c; 1688 } 1689 1690 // US-ASCII only 1691 private static int toUpper(char c) { 1692 if ((c >= 'a') && (c <= 'z')) 1693 return c - ('a' - 'A'); 1694 return c; 1695 } 1696 1697 private static boolean equal(String s, String t) { 1698 if (s == t) return true; 1699 if ((s != null) && (t != null)) { 1700 if (s.length() != t.length()) 1701 return false; 1702 if (s.indexOf('%') < 0) 1703 return s.equals(t); 1704 int n = s.length(); 1705 for (int i = 0; i < n;) { 1706 char c = s.charAt(i); 1707 char d = t.charAt(i); 1708 if (c != '%') { 1709 if (c != d) 1710 return false; 1711 i++; 1712 continue; 1713 } 1714 if (d != '%') 1715 return false; 1716 i++; 1717 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1718 return false; 1719 i++; 1720 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1721 return false; 1722 i++; 1723 } 1724 return true; 1725 } 1726 return false; 1727 } 1728 1729 // US-ASCII only 1730 private static boolean equalIgnoringCase(String s, String t) { 1731 if (s == t) return true; 1732 if ((s != null) && (t != null)) { 1733 int n = s.length(); 1734 if (t.length() != n) 1735 return false; 1736 for (int i = 0; i < n; i++) { 1737 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1738 return false; 1739 } 1740 return true; 1741 } 1742 return false; 1743 } 1744 1745 private static int hash(int hash, String s) { 1746 if (s == null) return hash; 1747 return s.indexOf('%') < 0 ? 
hash * 127 + s.hashCode() 1748 : normalizedHash(hash, s); 1749 } 1750 1751 1752 private static int normalizedHash(int hash, String s) { 1753 int h = 0; 1754 for (int index = 0; index < s.length(); index++) { 1755 char ch = s.charAt(index); 1756 h = 31 * h + ch; 1757 if (ch == '%') { 1758 /* 1759 * Process the next two encoded characters 1760 */ 1761 for (int i = index + 1; i < index + 3; i++) 1762 h = 31 * h + toUpper(s.charAt(i)); 1763 index += 2; 1764 } 1765 } 1766 return hash * 127 + h; 1767 } 1768 1769 // US-ASCII only 1770 private static int hashIgnoringCase(int hash, String s) { 1771 if (s == null) return hash; 1772 int h = hash; 1773 int n = s.length(); 1774 for (int i = 0; i < n; i++) 1775 h = 31 * h + toLower(s.charAt(i)); 1776 return h; 1777 } 1778 1779 private static int compare(String s, String t) { 1780 if (s == t) return 0; 1781 if (s != null) { 1782 if (t != null) 1783 return s.compareTo(t); 1784 else 1785 return +1; 1786 } else { 1787 return -1; 1788 } 1789 } 1790 1791 // US-ASCII only 1792 private static int compareIgnoringCase(String s, String t) { 1793 if (s == t) return 0; 1794 if (s != null) { 1795 if (t != null) { 1796 int sn = s.length(); 1797 int tn = t.length(); 1798 int n = sn < tn ? 
sn : tn; 1799 for (int i = 0; i < n; i++) { 1800 int c = toLower(s.charAt(i)) - toLower(t.charAt(i)); 1801 if (c != 0) 1802 return c; 1803 } 1804 return sn - tn; 1805 } 1806 return +1; 1807 } else { 1808 return -1; 1809 } 1810 } 1811 1812 1813 // -- String construction -- 1814 1815 // If a scheme is given then the path, if given, must be absolute 1816 // 1817 private static void checkPath(String s, String scheme, String path) 1818 throws URISyntaxException 1819 { 1820 if (scheme != null) { 1821 if ((path != null) 1822 && ((path.length() > 0) && (path.charAt(0) != '/'))) 1823 throw new URISyntaxException(s, 1824 "Relative path in absolute URI"); 1825 } 1826 } 1827 1828 private void appendAuthority(StringBuffer sb, 1829 String authority, 1830 String userInfo, 1831 String host, 1832 int port) 1833 { 1834 if (host != null) { 1835 sb.append("//"); 1836 if (userInfo != null) { 1837 sb.append(quote(userInfo, L_USERINFO, H_USERINFO)); 1838 sb.append('@'); 1839 } 1840 boolean needBrackets = ((host.indexOf(':') >= 0) 1841 && !host.startsWith("[") 1842 && !host.endsWith("]")); 1843 if (needBrackets) sb.append('['); 1844 sb.append(host); 1845 if (needBrackets) sb.append(']'); 1846 if (port != -1) { 1847 sb.append(':'); 1848 sb.append(port); 1849 } 1850 } else if (authority != null) { 1851 sb.append("//"); 1852 if (authority.startsWith("[")) { 1853 // authority should (but may not) contain an embedded IPv6 address 1854 int end = authority.indexOf(']'); 1855 String doquote = authority, dontquote = ""; 1856 if (end != -1 && authority.indexOf(':') != -1) { 1857 // the authority contains an IPv6 address 1858 if (end == authority.length()) { 1859 dontquote = authority; 1860 doquote = ""; 1861 } else { 1862 dontquote = authority.substring(0 , end + 1); 1863 doquote = authority.substring(end + 1); 1864 } 1865 } 1866 sb.append(dontquote); 1867 sb.append(quote(doquote, 1868 L_REG_NAME | L_SERVER, 1869 H_REG_NAME | H_SERVER)); 1870 } else { 1871 sb.append(quote(authority, 1872 L_REG_NAME 
| L_SERVER, 1873 H_REG_NAME | H_SERVER)); 1874 } 1875 } 1876 } 1877 1878 private void appendSchemeSpecificPart(StringBuffer sb, 1879 String opaquePart, 1880 String authority, 1881 String userInfo, 1882 String host, 1883 int port, 1884 String path, 1885 String query) 1886 { 1887 if (opaquePart != null) { 1888 /* check if SSP begins with an IPv6 address 1889 * because we must not quote a literal IPv6 address 1890 */ 1891 if (opaquePart.startsWith("//[")) { 1892 int end = opaquePart.indexOf(']'); 1893 if (end != -1 && opaquePart.indexOf(':')!=-1) { 1894 String doquote, dontquote; 1895 if (end == opaquePart.length()) { 1896 dontquote = opaquePart; 1897 doquote = ""; 1898 } else { 1899 dontquote = opaquePart.substring(0,end+1); 1900 doquote = opaquePart.substring(end+1); 1901 } 1902 sb.append (dontquote); 1903 sb.append(quote(doquote, L_URIC, H_URIC)); 1904 } 1905 } else { 1906 sb.append(quote(opaquePart, L_URIC, H_URIC)); 1907 } 1908 } else { 1909 appendAuthority(sb, authority, userInfo, host, port); 1910 if (path != null) 1911 sb.append(quote(path, L_PATH, H_PATH)); 1912 if (query != null) { 1913 sb.append('?'); 1914 sb.append(quote(query, L_URIC, H_URIC)); 1915 } 1916 } 1917 } 1918 1919 private void appendFragment(StringBuffer sb, String fragment) { 1920 if (fragment != null) { 1921 sb.append('#'); 1922 sb.append(quote(fragment, L_URIC, H_URIC)); 1923 } 1924 } 1925 1926 private String toString(String scheme, 1927 String opaquePart, 1928 String authority, 1929 String userInfo, 1930 String host, 1931 int port, 1932 String path, 1933 String query, 1934 String fragment) 1935 { 1936 StringBuffer sb = new StringBuffer(); 1937 if (scheme != null) { 1938 sb.append(scheme); 1939 sb.append(':'); 1940 } 1941 appendSchemeSpecificPart(sb, opaquePart, 1942 authority, userInfo, host, port, 1943 path, query); 1944 appendFragment(sb, fragment); 1945 return sb.toString(); 1946 } 1947 1948 private void defineSchemeSpecificPart() { 1949 if (schemeSpecificPart != null) return; 1950 
StringBuffer sb = new StringBuffer(); 1951 appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(), 1952 host, port, getPath(), getQuery()); 1953 if (sb.length() == 0) return; 1954 schemeSpecificPart = sb.toString(); 1955 } 1956 1957 private void defineString() { 1958 if (string != null) return; 1959 1960 StringBuilder sb = new StringBuilder(); 1961 if (scheme != null) { 1962 sb.append(scheme); 1963 sb.append(':'); 1964 } 1965 if (isOpaque()) { 1966 sb.append(schemeSpecificPart); 1967 } else { 1968 if (host != null) { 1969 sb.append("//"); 1970 if (userInfo != null) { 1971 sb.append(userInfo); 1972 sb.append('@'); 1973 } 1974 boolean needBrackets = ((host.indexOf(':') >= 0) 1975 && !host.startsWith("[") 1976 && !host.endsWith("]")); 1977 if (needBrackets) sb.append('['); 1978 sb.append(host); 1979 if (needBrackets) sb.append(']'); 1980 if (port != -1) { 1981 sb.append(':'); 1982 sb.append(port); 1983 } 1984 } else if (authority != null) { 1985 sb.append("//"); 1986 sb.append(authority); 1987 } 1988 if (path != null) 1989 sb.append(path); 1990 if (query != null) { 1991 sb.append('?'); 1992 sb.append(query); 1993 } 1994 } 1995 if (fragment != null) { 1996 sb.append('#'); 1997 sb.append(fragment); 1998 } 1999 string = sb.toString(); 2000 } 2001 2002 2003 // -- Normalization, resolution, and relativization -- 2004 2005 // RFC2396 5.2 (6) 2006 private static String resolvePath(String base, String child, 2007 boolean absolute) 2008 { 2009 int i = base.lastIndexOf('/'); 2010 int cn = child.length(); 2011 String path = ""; 2012 2013 if (cn == 0) { 2014 // 5.2 (6a) 2015 if (i >= 0) 2016 path = base.substring(0, i + 1); 2017 } else { 2018 StringBuilder sb = new StringBuilder(base.length() + cn); 2019 // 5.2 (6a) 2020 if (i >= 0) 2021 sb.append(base, 0, i + 1); 2022 // 5.2 (6b) 2023 sb.append(child); 2024 path = sb.toString(); 2025 } 2026 2027 // 5.2 (6c-f) 2028 String np = normalize(path); 2029 2030 // 5.2 (6g): If the result is absolute but the path begins with 
"../", 2031 // then we simply leave the path as-is 2032 2033 return np; 2034 } 2035 2036 // RFC2396 5.2 2037 private static URI resolve(URI base, URI child) { 2038 // check if child if opaque first so that NPE is thrown 2039 // if child is null. 2040 if (child.isOpaque() || base.isOpaque()) 2041 return child; 2042 2043 // 5.2 (2): Reference to current document (lone fragment) 2044 if ((child.scheme == null) && (child.authority == null) 2045 && child.path.equals("") && (child.fragment != null) 2046 && (child.query == null)) { 2047 if ((base.fragment != null) 2048 && child.fragment.equals(base.fragment)) { 2049 return base; 2050 } 2051 URI ru = new URI(); 2052 ru.scheme = base.scheme; 2053 ru.authority = base.authority; 2054 ru.userInfo = base.userInfo; 2055 ru.host = base.host; 2056 ru.port = base.port; 2057 ru.path = base.path; 2058 ru.fragment = child.fragment; 2059 ru.query = base.query; 2060 return ru; 2061 } 2062 2063 // 5.2 (3): Child is absolute 2064 if (child.scheme != null) 2065 return child; 2066 2067 URI ru = new URI(); // Resolved URI 2068 ru.scheme = base.scheme; 2069 ru.query = child.query; 2070 ru.fragment = child.fragment; 2071 2072 // 5.2 (4): Authority 2073 if (child.authority == null) { 2074 ru.authority = base.authority; 2075 ru.host = base.host; 2076 ru.userInfo = base.userInfo; 2077 ru.port = base.port; 2078 2079 String cp = (child.path == null) ? 
"" : child.path; 2080 if ((cp.length() > 0) && (cp.charAt(0) == '/')) { 2081 // 5.2 (5): Child path is absolute 2082 ru.path = child.path; 2083 } else { 2084 // 5.2 (6): Resolve relative path 2085 ru.path = resolvePath(base.path, cp, base.isAbsolute()); 2086 } 2087 } else { 2088 ru.authority = child.authority; 2089 ru.host = child.host; 2090 ru.userInfo = child.userInfo; 2091 ru.host = child.host; 2092 ru.port = child.port; 2093 ru.path = child.path; 2094 } 2095 2096 // 5.2 (7): Recombine (nothing to do here) 2097 return ru; 2098 } 2099 2100 // If the given URI's path is normal then return the URI; 2101 // o.w., return a new URI containing the normalized path. 2102 // 2103 private static URI normalize(URI u) { 2104 if (u.isOpaque() || (u.path == null) || (u.path.length() == 0)) 2105 return u; 2106 2107 String np = normalize(u.path); 2108 if (np == u.path) 2109 return u; 2110 2111 URI v = new URI(); 2112 v.scheme = u.scheme; 2113 v.fragment = u.fragment; 2114 v.authority = u.authority; 2115 v.userInfo = u.userInfo; 2116 v.host = u.host; 2117 v.port = u.port; 2118 v.path = np; 2119 v.query = u.query; 2120 return v; 2121 } 2122 2123 // If both URIs are hierarchical, their scheme and authority components are 2124 // identical, and the base path is a prefix of the child's path, then 2125 // return a relative URI that, when resolved against the base, yields the 2126 // child; otherwise, return the child. 2127 // 2128 private static URI relativize(URI base, URI child) { 2129 // check if child if opaque first so that NPE is thrown 2130 // if child is null. 
2131 if (child.isOpaque() || base.isOpaque()) 2132 return child; 2133 if (!equalIgnoringCase(base.scheme, child.scheme) 2134 || !equal(base.authority, child.authority)) 2135 return child; 2136 2137 String bp = normalize(base.path); 2138 String cp = normalize(child.path); 2139 if (!bp.equals(cp)) { 2140 if (!bp.endsWith("/")) 2141 bp = bp + "/"; 2142 if (!cp.startsWith(bp)) 2143 return child; 2144 } 2145 2146 URI v = new URI(); 2147 v.path = cp.substring(bp.length()); 2148 v.query = child.query; 2149 v.fragment = child.fragment; 2150 return v; 2151 } 2152 2153 2154 2155 // -- Path normalization -- 2156 2157 // The following algorithm for path normalization avoids the creation of a 2158 // string object for each segment, as well as the use of a string buffer to 2159 // compute the final result, by using a single char array and editing it in 2160 // place. The array is first split into segments, replacing each slash 2161 // with '\0' and creating a segment-index array, each element of which is 2162 // the index of the first char in the corresponding segment. We then walk 2163 // through both arrays, removing ".", "..", and other segments as necessary 2164 // by setting their entries in the index array to -1. Finally, the two 2165 // arrays are used to rejoin the segments and compute the final result. 2166 // 2167 // This code is based upon src/solaris/native/java/io/canonicalize_md.c 2168 2169 2170 // Check the given path to see if it might need normalization. A path 2171 // might need normalization if it contains duplicate slashes, a "." 2172 // segment, or a ".." segment. Return -1 if no further normalization is 2173 // possible, otherwise return the number of segments found. 2174 // 2175 // This method takes a string argument rather than a char array so that 2176 // this test can be performed without invoking path.toCharArray(). 
2177 // 2178 private static int needsNormalization(String path) { 2179 boolean normal = true; 2180 int ns = 0; // Number of segments 2181 int end = path.length() - 1; // Index of last char in path 2182 int p = 0; // Index of next char in path 2183 2184 // Skip initial slashes 2185 while (p <= end) { 2186 if (path.charAt(p) != '/') break; 2187 p++; 2188 } 2189 if (p > 1) normal = false; 2190 2191 // Scan segments 2192 while (p <= end) { 2193 2194 // Looking at "." or ".." ? 2195 if ((path.charAt(p) == '.') 2196 && ((p == end) 2197 || ((path.charAt(p + 1) == '/') 2198 || ((path.charAt(p + 1) == '.') 2199 && ((p + 1 == end) 2200 || (path.charAt(p + 2) == '/')))))) { 2201 normal = false; 2202 } 2203 ns++; 2204 2205 // Find beginning of next segment 2206 while (p <= end) { 2207 if (path.charAt(p++) != '/') 2208 continue; 2209 2210 // Skip redundant slashes 2211 while (p <= end) { 2212 if (path.charAt(p) != '/') break; 2213 normal = false; 2214 p++; 2215 } 2216 2217 break; 2218 } 2219 } 2220 2221 return normal ? -1 : ns; 2222 } 2223 2224 2225 // Split the given path into segments, replacing slashes with nulls and 2226 // filling in the given segment-index array. 
2227 // 2228 // Preconditions: 2229 // segs.length == Number of segments in path 2230 // 2231 // Postconditions: 2232 // All slashes in path replaced by '\0' 2233 // segs[i] == Index of first char in segment i (0 <= i < segs.length) 2234 // 2235 private static void split(char[] path, int[] segs) { 2236 int end = path.length - 1; // Index of last char in path 2237 int p = 0; // Index of next char in path 2238 int i = 0; // Index of current segment 2239 2240 // Skip initial slashes 2241 while (p <= end) { 2242 if (path[p] != '/') break; 2243 path[p] = '\0'; 2244 p++; 2245 } 2246 2247 while (p <= end) { 2248 2249 // Note start of segment 2250 segs[i++] = p++; 2251 2252 // Find beginning of next segment 2253 while (p <= end) { 2254 if (path[p++] != '/') 2255 continue; 2256 path[p - 1] = '\0'; 2257 2258 // Skip redundant slashes 2259 while (p <= end) { 2260 if (path[p] != '/') break; 2261 path[p++] = '\0'; 2262 } 2263 break; 2264 } 2265 } 2266 2267 if (i != segs.length) 2268 throw new InternalError(); // ASSERT 2269 } 2270 2271 2272 // Join the segments in the given path according to the given segment-index 2273 // array, ignoring those segments whose index entries have been set to -1, 2274 // and inserting slashes as needed. Return the length of the resulting 2275 // path. 2276 // 2277 // Preconditions: 2278 // segs[i] == -1 implies segment i is to be ignored 2279 // path computed by split, as above, with '\0' having replaced '/' 2280 // 2281 // Postconditions: 2282 // path[0] .. 
path[return value] == Resulting path 2283 // 2284 private static int join(char[] path, int[] segs) { 2285 int ns = segs.length; // Number of segments 2286 int end = path.length - 1; // Index of last char in path 2287 int p = 0; // Index of next path char to write 2288 2289 if (path[p] == '\0') { 2290 // Restore initial slash for absolute paths 2291 path[p++] = '/'; 2292 } 2293 2294 for (int i = 0; i < ns; i++) { 2295 int q = segs[i]; // Current segment 2296 if (q == -1) 2297 // Ignore this segment 2298 continue; 2299 2300 if (p == q) { 2301 // We're already at this segment, so just skip to its end 2302 while ((p <= end) && (path[p] != '\0')) 2303 p++; 2304 if (p <= end) { 2305 // Preserve trailing slash 2306 path[p++] = '/'; 2307 } 2308 } else if (p < q) { 2309 // Copy q down to p 2310 while ((q <= end) && (path[q] != '\0')) 2311 path[p++] = path[q++]; 2312 if (q <= end) { 2313 // Preserve trailing slash 2314 path[p++] = '/'; 2315 } 2316 } else 2317 throw new InternalError(); // ASSERT false 2318 } 2319 2320 return p; 2321 } 2322 2323 2324 // Remove "." segments from the given path, and remove segment pairs 2325 // consisting of a non-".." segment followed by a ".." segment. 2326 // 2327 private static void removeDots(char[] path, int[] segs) { 2328 int ns = segs.length; 2329 int end = path.length - 1; 2330 2331 for (int i = 0; i < ns; i++) { 2332 int dots = 0; // Number of dots found (0, 1, or 2) 2333 2334 // Find next occurrence of "." or ".." 2335 do { 2336 int p = segs[i]; 2337 if (path[p] == '.') { 2338 if (p == end) { 2339 dots = 1; 2340 break; 2341 } else if (path[p + 1] == '\0') { 2342 dots = 1; 2343 break; 2344 } else if ((path[p + 1] == '.') 2345 && ((p + 1 == end) 2346 || (path[p + 2] == '\0'))) { 2347 dots = 2; 2348 break; 2349 } 2350 } 2351 i++; 2352 } while (i < ns); 2353 if ((i > ns) || (dots == 0)) 2354 break; 2355 2356 if (dots == 1) { 2357 // Remove this occurrence of "." 2358 segs[i] = -1; 2359 } else { 2360 // If there is a preceding non-".." 
segment, remove both that 2361 // segment and this occurrence of ".."; otherwise, leave this 2362 // ".." segment as-is. 2363 int j; 2364 for (j = i - 1; j >= 0; j--) { 2365 if (segs[j] != -1) break; 2366 } 2367 if (j >= 0) { 2368 int q = segs[j]; 2369 if (!((path[q] == '.') 2370 && (path[q + 1] == '.') 2371 && (path[q + 2] == '\0'))) { 2372 segs[i] = -1; 2373 segs[j] = -1; 2374 } 2375 } 2376 } 2377 } 2378 } 2379 2380 2381 // DEVIATION: If the normalized path is relative, and if the first 2382 // segment could be parsed as a scheme name, then prepend a "." segment 2383 // 2384 private static void maybeAddLeadingDot(char[] path, int[] segs) { 2385 2386 if (path[0] == '\0') 2387 // The path is absolute 2388 return; 2389 2390 int ns = segs.length; 2391 int f = 0; // Index of first segment 2392 while (f < ns) { 2393 if (segs[f] >= 0) 2394 break; 2395 f++; 2396 } 2397 if ((f >= ns) || (f == 0)) 2398 // The path is empty, or else the original first segment survived, 2399 // in which case we already know that no leading "." is needed 2400 return; 2401 2402 int p = segs[f]; 2403 while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++; 2404 if (p >= path.length || path[p] == '\0') 2405 // No colon in first segment, so no "." needed 2406 return; 2407 2408 // At this point we know that the first segment is unused, 2409 // hence we can insert a "." segment at that position 2410 path[0] = '.'; 2411 path[1] = '\0'; 2412 segs[0] = 0; 2413 } 2414 2415 2416 // Normalize the given path string. A normal path string has no empty 2417 // segments (i.e., occurrences of "//"), no segments equal to ".", and no 2418 // segments equal to ".." that are preceded by a segment not equal to "..". 2419 // In contrast to Unix-style pathname normalization, for URI paths we 2420 // always retain trailing slashes. 2421 // 2422 private static String normalize(String ps) { 2423 2424 // Does this path need normalization? 
2425 int ns = needsNormalization(ps); // Number of segments 2426 if (ns < 0) 2427 // Nope -- just return it 2428 return ps; 2429 2430 char[] path = ps.toCharArray(); // Path in char-array form 2431 2432 // Split path into segments 2433 int[] segs = new int[ns]; // Segment-index array 2434 split(path, segs); 2435 2436 // Remove dots 2437 removeDots(path, segs); 2438 2439 // Prevent scheme-name confusion 2440 maybeAddLeadingDot(path, segs); 2441 2442 // Join the remaining segments and return the result 2443 String s = new String(path, 0, join(path, segs)); 2444 if (s.equals(ps)) { 2445 // string was already normalized 2446 return ps; 2447 } 2448 return s; 2449 } 2450 2451 2452 2453 // -- Character classes for parsing -- 2454 2455 // RFC2396 precisely specifies which characters in the US-ASCII charset are 2456 // permissible in the various components of a URI reference. We here 2457 // define a set of mask pairs to aid in enforcing these restrictions. Each 2458 // mask pair consists of two longs, a low mask and a high mask. Taken 2459 // together they represent a 128-bit mask, where bit i is set iff the 2460 // character with value i is permitted. 2461 // 2462 // This approach is more efficient than sequentially searching arrays of 2463 // permitted characters. It could be made still more efficient by 2464 // precompiling the mask information so that a character's presence in a 2465 // given mask could be determined by a single table lookup. 
2466 2467 // Compute the low-order mask for the characters in the given string 2468 private static long lowMask(String chars) { 2469 int n = chars.length(); 2470 long m = 0; 2471 for (int i = 0; i < n; i++) { 2472 char c = chars.charAt(i); 2473 if (c < 64) 2474 m |= (1L << c); 2475 } 2476 return m; 2477 } 2478 2479 // Compute the high-order mask for the characters in the given string 2480 private static long highMask(String chars) { 2481 int n = chars.length(); 2482 long m = 0; 2483 for (int i = 0; i < n; i++) { 2484 char c = chars.charAt(i); 2485 if ((c >= 64) && (c < 128)) 2486 m |= (1L << (c - 64)); 2487 } 2488 return m; 2489 } 2490 2491 // Compute a low-order mask for the characters 2492 // between first and last, inclusive 2493 private static long lowMask(char first, char last) { 2494 long m = 0; 2495 int f = Math.max(Math.min(first, 63), 0); 2496 int l = Math.max(Math.min(last, 63), 0); 2497 for (int i = f; i <= l; i++) 2498 m |= 1L << i; 2499 return m; 2500 } 2501 2502 // Compute a high-order mask for the characters 2503 // between first and last, inclusive 2504 private static long highMask(char first, char last) { 2505 long m = 0; 2506 int f = Math.max(Math.min(first, 127), 64) - 64; 2507 int l = Math.max(Math.min(last, 127), 64) - 64; 2508 for (int i = f; i <= l; i++) 2509 m |= 1L << i; 2510 return m; 2511 } 2512 2513 // Tell whether the given character is permitted by the given mask pair 2514 private static boolean match(char c, long lowMask, long highMask) { 2515 if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches. 2516 return false; 2517 if (c < 64) 2518 return ((1L << c) & lowMask) != 0; 2519 if (c < 128) 2520 return ((1L << (c - 64)) & highMask) != 0; 2521 return false; 2522 } 2523 2524 // Character-class masks, in reverse order from RFC2396 because 2525 // initializers for static fields cannot make forward references. 

    // Each character class below is a pair of constants: L_X holds the bits
    // for characters 0..63 and H_X the bits for characters 64..127, in the
    // form consumed by the match method.  A class made up solely of
    // characters from one half has 0L for the other half.

    // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
    //            "8" | "9"
    private static final long L_DIGIT = lowMask('0', '9');
    private static final long H_DIGIT = 0L;

    // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
    //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
    //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
    private static final long L_UPALPHA = 0L;
    private static final long H_UPALPHA = highMask('A', 'Z');

    // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
    //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
    //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
    private static final long L_LOWALPHA = 0L;
    private static final long H_LOWALPHA = highMask('a', 'z');

    // alpha         = lowalpha | upalpha
    private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
    private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

    // alphanum      = alpha | digit
    private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
    private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

    // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
    //                         "a" | "b" | "c" | "d" | "e" | "f"
    private static final long L_HEX = L_DIGIT;
    private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');

    // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
    //                 "(" | ")"
    private static final long L_MARK = lowMask("-_.!~*'()");
    private static final long H_MARK = highMask("-_.!~*'()");

    // unreserved    = alphanum | mark
    private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
    private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

    // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    //                 "$" | "," | "[" | "]"
    // Added per RFC2732: "[", "]"
    private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
    private static final long H_RESERVED = highMask(";/?:@&=+$,[]");

    // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
    // characters are allowed; this is handled by the scanEscape method below.
    // (Bit zero is free for this purpose because the match method never
    // matches the character with value 0.)
    private static final long L_ESCAPED = 1L;
    private static final long H_ESCAPED = 0L;

    // uric          = reserved | unreserved | escaped
    private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
    private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

    // pchar         = unreserved | escaped |
    //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
    private static final long L_PCHAR
        = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
    private static final long H_PCHAR
        = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");

    // All valid path characters
    // (pchar plus the parameter and segment separators ";" and "/")
    private static final long L_PATH = L_PCHAR | lowMask(";/");
    private static final long H_PATH = H_PCHAR | highMask(";/");

    // Dash, for use in domainlabel and toplabel
    private static final long L_DASH = lowMask("-");
    private static final long H_DASH = highMask("-");

    // Dot, for use in hostnames
    private static final long L_DOT = lowMask(".");
    private static final long H_DOT = highMask(".");

    // userinfo      = *( unreserved | escaped |
    //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
    private static final long L_USERINFO
        = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
    private static final long H_USERINFO
        = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");

    // reg_name      = 1*( unreserved | escaped | "$" | "," |
    //                     ";" | ":" | "@" | "&" | "=" | "+" )
    private static final long L_REG_NAME
        = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
    private static final long H_REG_NAME
        = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");

    // All valid characters for server-based authorities
    private static final long L_SERVER
        = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
    private static final long H_SERVER
        = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");

    // Special case of server authority that represents an IPv6 address
    // In this case, a % does not signify an escape sequence
    private static final long L_SERVER_PERCENT
        = L_SERVER | lowMask("%");
    private static final long H_SERVER_PERCENT
        = H_SERVER | highMask("%");
    // The single character "["
    private static final long L_LEFT_BRACKET = lowMask("[");
    private static final long H_LEFT_BRACKET = highMask("[");

    // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
    // (the mask covers every character a scheme may contain; it does not by
    // itself express the rule that the first character must be alpha)
    private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
    private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");

    // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
    //                 "&" | "=" | "+" | "$" | ","
    private static final long L_URIC_NO_SLASH
        = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");
    private static final long H_URIC_NO_SLASH
        = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");

    // scope_id = alpha | digit | "_" | "."
2641 private static final long L_SCOPE_ID 2642 = L_ALPHANUM | lowMask("_."); 2643 private static final long H_SCOPE_ID 2644 = H_ALPHANUM | highMask("_."); 2645 2646 // -- Escaping and encoding -- 2647 2648 private static final char[] hexDigits = { 2649 '0', '1', '2', '3', '4', '5', '6', '7', 2650 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' 2651 }; 2652 2653 private static void appendEscape(StringBuffer sb, byte b) { 2654 sb.append('%'); 2655 sb.append(hexDigits[(b >> 4) & 0x0f]); 2656 sb.append(hexDigits[(b >> 0) & 0x0f]); 2657 } 2658 2659 private static void appendEncoded(StringBuffer sb, char c) { 2660 ByteBuffer bb = null; 2661 try { 2662 bb = ThreadLocalCoders.encoderFor("UTF-8") 2663 .encode(CharBuffer.wrap("" + c)); 2664 } catch (CharacterCodingException x) { 2665 assert false; 2666 } 2667 while (bb.hasRemaining()) { 2668 int b = bb.get() & 0xff; 2669 if (b >= 0x80) 2670 appendEscape(sb, (byte)b); 2671 else 2672 sb.append((char)b); 2673 } 2674 } 2675 2676 // Quote any characters in s that are not permitted 2677 // by the given mask pair 2678 // 2679 private static String quote(String s, long lowMask, long highMask) { 2680 int n = s.length(); 2681 StringBuffer sb = null; 2682 boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0); 2683 for (int i = 0; i < s.length(); i++) { 2684 char c = s.charAt(i); 2685 if (c < '\u0080') { 2686 if (!match(c, lowMask, highMask)) { 2687 if (sb == null) { 2688 sb = new StringBuffer(); 2689 sb.append(s, 0, i); 2690 } 2691 appendEscape(sb, (byte)c); 2692 } else { 2693 if (sb != null) 2694 sb.append(c); 2695 } 2696 } else if (allowNonASCII 2697 && (Character.isSpaceChar(c) 2698 || Character.isISOControl(c))) { 2699 if (sb == null) { 2700 sb = new StringBuffer(); 2701 sb.append(s, 0, i); 2702 } 2703 appendEncoded(sb, c); 2704 } else { 2705 if (sb != null) 2706 sb.append(c); 2707 } 2708 } 2709 return (sb == null) ? 
s : sb.toString(); 2710 } 2711 2712 // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets, 2713 // assuming that s is otherwise legal 2714 // 2715 private static String encode(String s) { 2716 int n = s.length(); 2717 if (n == 0) 2718 return s; 2719 2720 // First check whether we actually need to encode 2721 for (int i = 0;;) { 2722 if (s.charAt(i) >= '\u0080') 2723 break; 2724 if (++i >= n) 2725 return s; 2726 } 2727 2728 String ns = Normalizer.normalize(s, Normalizer.Form.NFC); 2729 ByteBuffer bb = null; 2730 try { 2731 bb = ThreadLocalCoders.encoderFor("UTF-8") 2732 .encode(CharBuffer.wrap(ns)); 2733 } catch (CharacterCodingException x) { 2734 assert false; 2735 } 2736 2737 StringBuffer sb = new StringBuffer(); 2738 while (bb.hasRemaining()) { 2739 int b = bb.get() & 0xff; 2740 if (b >= 0x80) 2741 appendEscape(sb, (byte)b); 2742 else 2743 sb.append((char)b); 2744 } 2745 return sb.toString(); 2746 } 2747 2748 private static int decode(char c) { 2749 if ((c >= '0') && (c <= '9')) 2750 return c - '0'; 2751 if ((c >= 'a') && (c <= 'f')) 2752 return c - 'a' + 10; 2753 if ((c >= 'A') && (c <= 'F')) 2754 return c - 'A' + 10; 2755 assert false; 2756 return -1; 2757 } 2758 2759 private static byte decode(char c1, char c2) { 2760 return (byte)( ((decode(c1) & 0xf) << 4) 2761 | ((decode(c2) & 0xf) << 0)); 2762 } 2763 2764 // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes 2765 // that escapes are well-formed syntactically, i.e., of the form %XX. If a 2766 // sequence of escaped octets is not valid UTF-8 then the erroneous octets 2767 // are replaced with '\uFFFD'. 2768 // Exception: any "%" found between "[]" is left alone. 
It is an IPv6 literal 2769 // with a scope_id 2770 // 2771 private static String decode(String s) { 2772 return decode(s, true); 2773 } 2774 2775 // This method was introduced as a generalization of URI.decode method 2776 // to provide a fix for JDK-8037396 2777 private static String decode(String s, boolean ignorePercentInBrackets) { 2778 if (s == null) 2779 return s; 2780 int n = s.length(); 2781 if (n == 0) 2782 return s; 2783 if (s.indexOf('%') < 0) 2784 return s; 2785 2786 StringBuilder sb = new StringBuilder(n); 2787 ByteBuffer bb = ByteBuffer.allocate(n); 2788 CharBuffer cb = CharBuffer.allocate(n); 2789 CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8") 2790 .onMalformedInput(CodingErrorAction.REPLACE) 2791 .onUnmappableCharacter(CodingErrorAction.REPLACE); 2792 2793 // This is not horribly efficient, but it will do for now 2794 char c = s.charAt(0); 2795 boolean betweenBrackets = false; 2796 2797 for (int i = 0; i < n;) { 2798 assert c == s.charAt(i); // Loop invariant 2799 if (c == '[') { 2800 betweenBrackets = true; 2801 } else if (betweenBrackets && c == ']') { 2802 betweenBrackets = false; 2803 } 2804 if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) { 2805 sb.append(c); 2806 if (++i >= n) 2807 break; 2808 c = s.charAt(i); 2809 continue; 2810 } 2811 bb.clear(); 2812 int ui = i; 2813 for (;;) { 2814 assert (n - i >= 2); 2815 bb.put(decode(s.charAt(++i), s.charAt(++i))); 2816 if (++i >= n) 2817 break; 2818 c = s.charAt(i); 2819 if (c != '%') 2820 break; 2821 } 2822 bb.flip(); 2823 cb.clear(); 2824 dec.reset(); 2825 CoderResult cr = dec.decode(bb, cb, true); 2826 assert cr.isUnderflow(); 2827 cr = dec.flush(cb); 2828 assert cr.isUnderflow(); 2829 sb.append(cb.flip().toString()); 2830 } 2831 2832 return sb.toString(); 2833 } 2834 2835 2836 // -- Parsing -- 2837 2838 // For convenience we wrap the input URI string in a new instance of the 2839 // following internal class. 
This saves always having to pass the input 2840 // string as an argument to each internal scan/parse method. 2841 2842 private class Parser { 2843 2844 private String input; // URI input string 2845 private boolean requireServerAuthority = false; 2846 2847 Parser(String s) { 2848 input = s; 2849 string = s; 2850 } 2851 2852 // -- Methods for throwing URISyntaxException in various ways -- 2853 2854 private void fail(String reason) throws URISyntaxException { 2855 throw new URISyntaxException(input, reason); 2856 } 2857 2858 private void fail(String reason, int p) throws URISyntaxException { 2859 throw new URISyntaxException(input, reason, p); 2860 } 2861 2862 private void failExpecting(String expected, int p) 2863 throws URISyntaxException 2864 { 2865 fail("Expected " + expected, p); 2866 } 2867 2868 private void failExpecting(String expected, String prior, int p) 2869 throws URISyntaxException 2870 { 2871 fail("Expected " + expected + " following " + prior, p); 2872 } 2873 2874 2875 // -- Simple access to the input string -- 2876 2877 // Return a substring of the input string 2878 // 2879 private String substring(int start, int end) { 2880 return input.substring(start, end); 2881 } 2882 2883 // Return the char at position p, 2884 // assuming that p < input.length() 2885 // 2886 private char charAt(int p) { 2887 return input.charAt(p); 2888 } 2889 2890 // Tells whether start < end and, if so, whether charAt(start) == c 2891 // 2892 private boolean at(int start, int end, char c) { 2893 return (start < end) && (charAt(start) == c); 2894 } 2895 2896 // Tells whether start + s.length() < end and, if so, 2897 // whether the chars at the start position match s exactly 2898 // 2899 private boolean at(int start, int end, String s) { 2900 int p = start; 2901 int sn = s.length(); 2902 if (sn > end - p) 2903 return false; 2904 int i = 0; 2905 while (i < sn) { 2906 if (charAt(p++) != s.charAt(i)) { 2907 break; 2908 } 2909 i++; 2910 } 2911 return (i == sn); 2912 } 2913 2914 2915 
// -- Scanning -- 2916 2917 // The various scan and parse methods that follow use a uniform 2918 // convention of taking the current start position and end index as 2919 // their first two arguments. The start is inclusive while the end is 2920 // exclusive, just as in the String class, i.e., a start/end pair 2921 // denotes the left-open interval [start, end) of the input string. 2922 // 2923 // These methods never proceed past the end position. They may return 2924 // -1 to indicate outright failure, but more often they simply return 2925 // the position of the first char after the last char scanned. Thus 2926 // a typical idiom is 2927 // 2928 // int p = start; 2929 // int q = scan(p, end, ...); 2930 // if (q > p) 2931 // // We scanned something 2932 // ...; 2933 // else if (q == p) 2934 // // We scanned nothing 2935 // ...; 2936 // else if (q == -1) 2937 // // Something went wrong 2938 // ...; 2939 2940 2941 // Scan a specific char: If the char at the given start position is 2942 // equal to c, return the index of the next char; otherwise, return the 2943 // start position. 2944 // 2945 private int scan(int start, int end, char c) { 2946 if ((start < end) && (charAt(start) == c)) 2947 return start + 1; 2948 return start; 2949 } 2950 2951 // Scan forward from the given start position. Stop at the first char 2952 // in the err string (in which case -1 is returned), or the first char 2953 // in the stop string (in which case the index of the preceding char is 2954 // returned), or the end of the input string (in which case the length 2955 // of the input string is returned). May return the start position if 2956 // nothing matches. 
2957 // 2958 private int scan(int start, int end, String err, String stop) { 2959 int p = start; 2960 while (p < end) { 2961 char c = charAt(p); 2962 if (err.indexOf(c) >= 0) 2963 return -1; 2964 if (stop.indexOf(c) >= 0) 2965 break; 2966 p++; 2967 } 2968 return p; 2969 } 2970 2971 // Scan a potential escape sequence, starting at the given position, 2972 // with the given first char (i.e., charAt(start) == c). 2973 // 2974 // This method assumes that if escapes are allowed then visible 2975 // non-US-ASCII chars are also allowed. 2976 // 2977 private int scanEscape(int start, int n, char first) 2978 throws URISyntaxException 2979 { 2980 int p = start; 2981 char c = first; 2982 if (c == '%') { 2983 // Process escape pair 2984 if ((p + 3 <= n) 2985 && match(charAt(p + 1), L_HEX, H_HEX) 2986 && match(charAt(p + 2), L_HEX, H_HEX)) { 2987 return p + 3; 2988 } 2989 fail("Malformed escape pair", p); 2990 } else if ((c > 128) 2991 && !Character.isSpaceChar(c) 2992 && !Character.isISOControl(c)) { 2993 // Allow unescaped but visible non-US-ASCII chars 2994 return p + 1; 2995 } 2996 return p; 2997 } 2998 2999 // Scan chars that match the given mask pair 3000 // 3001 private int scan(int start, int n, long lowMask, long highMask) 3002 throws URISyntaxException 3003 { 3004 int p = start; 3005 while (p < n) { 3006 char c = charAt(p); 3007 if (match(c, lowMask, highMask)) { 3008 p++; 3009 continue; 3010 } 3011 if ((lowMask & L_ESCAPED) != 0) { 3012 int q = scanEscape(p, n, c); 3013 if (q > p) { 3014 p = q; 3015 continue; 3016 } 3017 } 3018 break; 3019 } 3020 return p; 3021 } 3022 3023 // Check that each of the chars in [start, end) matches the given mask 3024 // 3025 private void checkChars(int start, int end, 3026 long lowMask, long highMask, 3027 String what) 3028 throws URISyntaxException 3029 { 3030 int p = scan(start, end, lowMask, highMask); 3031 if (p < end) 3032 fail("Illegal character in " + what, p); 3033 } 3034 3035 // Check that the char at position p matches the 
given mask 3036 // 3037 private void checkChar(int p, 3038 long lowMask, long highMask, 3039 String what) 3040 throws URISyntaxException 3041 { 3042 checkChars(p, p + 1, lowMask, highMask, what); 3043 } 3044 3045 3046 // -- Parsing -- 3047 3048 // [<scheme>:]<scheme-specific-part>[#<fragment>] 3049 // 3050 void parse(boolean rsa) throws URISyntaxException { 3051 requireServerAuthority = rsa; 3052 int ssp; // Start of scheme-specific part 3053 int n = input.length(); 3054 int p = scan(0, n, "/?#", ":"); 3055 if ((p >= 0) && at(p, n, ':')) { 3056 if (p == 0) 3057 failExpecting("scheme name", 0); 3058 checkChar(0, L_ALPHA, H_ALPHA, "scheme name"); 3059 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name"); 3060 scheme = substring(0, p); 3061 p++; // Skip ':' 3062 ssp = p; 3063 if (at(p, n, '/')) { 3064 p = parseHierarchical(p, n); 3065 } else { 3066 int q = scan(p, n, "", "#"); 3067 if (q <= p) 3068 failExpecting("scheme-specific part", p); 3069 checkChars(p, q, L_URIC, H_URIC, "opaque part"); 3070 p = q; 3071 } 3072 } else { 3073 ssp = 0; 3074 p = parseHierarchical(0, n); 3075 } 3076 schemeSpecificPart = substring(ssp, p); 3077 if (at(p, n, '#')) { 3078 checkChars(p + 1, n, L_URIC, H_URIC, "fragment"); 3079 fragment = substring(p + 1, n); 3080 p = n; 3081 } 3082 if (p < n) 3083 fail("end of URI", p); 3084 } 3085 3086 // [//authority]<path>[?<query>] 3087 // 3088 // DEVIATION from RFC2396: We allow an empty authority component as 3089 // long as it's followed by a non-empty path, query component, or 3090 // fragment component. This is so that URIs such as "file:///foo/bar" 3091 // will parse. This seems to be the intent of RFC2396, though the 3092 // grammar does not permit it. If the authority is empty then the 3093 // userInfo, host, and port components are undefined. 3094 // 3095 // DEVIATION from RFC2396: We allow empty relative paths. This seems 3096 // to be the intent of RFC2396, but the grammar does not permit it. 
3097 // The primary consequence of this deviation is that "#f" parses as a 3098 // relative URI with an empty path. 3099 // 3100 private int parseHierarchical(int start, int n) 3101 throws URISyntaxException 3102 { 3103 int p = start; 3104 if (at(p, n, '/') && at(p + 1, n, '/')) { 3105 p += 2; 3106 int q = scan(p, n, "", "/?#"); 3107 if (q > p) { 3108 p = parseAuthority(p, q); 3109 } else if (q < n) { 3110 // DEVIATION: Allow empty authority prior to non-empty 3111 // path, query component or fragment identifier 3112 } else 3113 failExpecting("authority", p); 3114 } 3115 int q = scan(p, n, "", "?#"); // DEVIATION: May be empty 3116 checkChars(p, q, L_PATH, H_PATH, "path"); 3117 path = substring(p, q); 3118 p = q; 3119 if (at(p, n, '?')) { 3120 p++; 3121 q = scan(p, n, "", "#"); 3122 checkChars(p, q, L_URIC, H_URIC, "query"); 3123 query = substring(p, q); 3124 p = q; 3125 } 3126 return p; 3127 } 3128 3129 // authority = server | reg_name 3130 // 3131 // Ambiguity: An authority that is a registry name rather than a server 3132 // might have a prefix that parses as a server. We use the fact that 3133 // the authority component is always followed by '/' or the end of the 3134 // input string to resolve this: If the complete authority did not 3135 // parse as a server then we try to parse it as a registry name. 
3136 // 3137 private int parseAuthority(int start, int n) 3138 throws URISyntaxException 3139 { 3140 int p = start; 3141 int q = p; 3142 URISyntaxException ex = null; 3143 3144 boolean serverChars; 3145 boolean regChars; 3146 3147 if (scan(p, n, "", "]") > p) { 3148 // contains a literal IPv6 address, therefore % is allowed 3149 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n); 3150 } else { 3151 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n); 3152 } 3153 regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n); 3154 3155 if (regChars && !serverChars) { 3156 // Must be a registry-based authority 3157 authority = substring(p, n); 3158 return n; 3159 } 3160 3161 if (serverChars) { 3162 // Might be (probably is) a server-based authority, so attempt 3163 // to parse it as such. If the attempt fails, try to treat it 3164 // as a registry-based authority. 3165 try { 3166 q = parseServer(p, n); 3167 if (q < n) 3168 failExpecting("end of authority", q); 3169 authority = substring(p, n); 3170 } catch (URISyntaxException x) { 3171 // Undo results of failed parse 3172 userInfo = null; 3173 host = null; 3174 port = -1; 3175 if (requireServerAuthority) { 3176 // If we're insisting upon a server-based authority, 3177 // then just re-throw the exception 3178 throw x; 3179 } else { 3180 // Save the exception in case it doesn't parse as a 3181 // registry either 3182 ex = x; 3183 q = p; 3184 } 3185 } 3186 } 3187 3188 if (q < n) { 3189 if (regChars) { 3190 // Registry-based authority 3191 authority = substring(p, n); 3192 } else if (ex != null) { 3193 // Re-throw exception; it was probably due to 3194 // a malformed IPv6 address 3195 throw ex; 3196 } else { 3197 fail("Illegal character in authority", q); 3198 } 3199 } 3200 3201 return n; 3202 } 3203 3204 3205 // [<userinfo>@]<host>[:<port>] 3206 // 3207 private int parseServer(int start, int n) 3208 throws URISyntaxException 3209 { 3210 int p = start; 3211 int q; 3212 3213 // userinfo 3214 q = scan(p, n, "/?#", 
"@"); 3215 if ((q >= p) && at(q, n, '@')) { 3216 checkChars(p, q, L_USERINFO, H_USERINFO, "user info"); 3217 userInfo = substring(p, q); 3218 p = q + 1; // Skip '@' 3219 } 3220 3221 // hostname, IPv4 address, or IPv6 address 3222 if (at(p, n, '[')) { 3223 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732 3224 p++; 3225 q = scan(p, n, "/?#", "]"); 3226 if ((q > p) && at(q, n, ']')) { 3227 // look for a "%" scope id 3228 int r = scan (p, q, "", "%"); 3229 if (r > p) { 3230 parseIPv6Reference(p, r); 3231 if (r+1 == q) { 3232 fail ("scope id expected"); 3233 } 3234 checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID, 3235 "scope id"); 3236 } else { 3237 parseIPv6Reference(p, q); 3238 } 3239 host = substring(p-1, q+1); 3240 p = q + 1; 3241 } else { 3242 failExpecting("closing bracket for IPv6 address", q); 3243 } 3244 } else { 3245 q = parseIPv4Address(p, n); 3246 if (q <= p) 3247 q = parseHostname(p, n); 3248 p = q; 3249 } 3250 3251 // port 3252 if (at(p, n, ':')) { 3253 p++; 3254 q = scan(p, n, "", "/"); 3255 if (q > p) { 3256 checkChars(p, q, L_DIGIT, H_DIGIT, "port number"); 3257 try { 3258 port = Integer.parseInt(input, p, q, 10); 3259 } catch (NumberFormatException x) { 3260 fail("Malformed port number", p); 3261 } 3262 p = q; 3263 } 3264 } 3265 if (p < n) 3266 failExpecting("port number", p); 3267 3268 return p; 3269 } 3270 3271 // Scan a string of decimal digits whose value fits in a byte 3272 // 3273 private int scanByte(int start, int n) 3274 throws URISyntaxException 3275 { 3276 int p = start; 3277 int q = scan(p, n, L_DIGIT, H_DIGIT); 3278 if (q <= p) return q; 3279 if (Integer.parseInt(input, p, q, 10) > 255) return p; 3280 return q; 3281 } 3282 3283 // Scan an IPv4 address. 3284 // 3285 // If the strict argument is true then we require that the given 3286 // interval contain nothing besides an IPv4 address; if it is false 3287 // then we only require that it start with an IPv4 address. 
//
        // If the interval does not contain or start with (depending upon the
        // strict argument) legal IPv4 address characters then we return -1
        // immediately; otherwise we insist that these characters parse as a
        // legal IPv4 address and throw an exception on failure.
        //
        // We assume that any string of decimal digits and dots must be an IPv4
        // address.  It won't parse as a hostname anyway, so making that
        // assumption here allows more meaningful exceptions to be thrown.
        //
        // On success the return value is the position after the address.
        //
        private int scanIPv4Address(int start, int n, boolean strict)
            throws URISyntaxException
        {
            int p = start;
            int q;
            // m bounds the leading run of digits and dots; only that run is
            // examined below
            int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
            if ((m <= p) || (strict && (m != n)))
                return -1;
            // Single-pass "loop": used only so that each failed step can
            // break out to the common failure report below
            for (;;) {
                // Per RFC2732: At most three digits per byte
                // Further constraint: Each element fits in a byte
                // (the digit count itself is not checked here; scanByte
                // bounds the value)
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                if ((q = scan(p, m, '.')) <= p) break;  p = q;
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                if ((q = scan(p, m, '.')) <= p) break;  p = q;
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                if ((q = scan(p, m, '.')) <= p) break;  p = q;
                if ((q = scanByte(p, m)) <= p) break;   p = q;
                // Anything left in the digit/dot run is an error
                if (q < m) break;
                return q;
            }
            fail("Malformed IPv4 address", q);
            return -1;
        }

        // Take an IPv4 address: Throw an exception if the given interval
        // contains anything except an IPv4 address
        //
        // The expected argument is used in the "Expected ..." failure
        // message.  Returns the position after the address.
        //
        private int takeIPv4Address(int start, int n, String expected)
            throws URISyntaxException
        {
            int p = scanIPv4Address(start, n, true);
            if (p <= start)
                failExpecting(expected, start);
            return p;
        }

        // Attempt to parse an IPv4 address, returning -1 on failure but
        // allowing the given interval to contain [:<characters>] after
        // the IPv4 address.
3338 // 3339 private int parseIPv4Address(int start, int n) { 3340 int p; 3341 3342 try { 3343 p = scanIPv4Address(start, n, false); 3344 } catch (URISyntaxException x) { 3345 return -1; 3346 } catch (NumberFormatException nfe) { 3347 return -1; 3348 } 3349 3350 if (p > start && p < n) { 3351 // IPv4 address is followed by something - check that 3352 // it's a ":" as this is the only valid character to 3353 // follow an address. 3354 if (charAt(p) != ':') { 3355 p = -1; 3356 } 3357 } 3358 3359 if (p > start) 3360 host = substring(start, p); 3361 3362 return p; 3363 } 3364 3365 // hostname = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ] 3366 // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum 3367 // toplabel = alpha | alpha *( alphanum | "-" ) alphanum 3368 // 3369 private int parseHostname(int start, int n) 3370 throws URISyntaxException 3371 { 3372 int p = start; 3373 int q; 3374 int l = -1; // Start of last parsed label 3375 3376 do { 3377 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ] 3378 q = scan(p, n, L_ALPHANUM, H_ALPHANUM); 3379 if (q <= p) 3380 break; 3381 l = p; 3382 if (q > p) { 3383 p = q; 3384 q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH); 3385 if (q > p) { 3386 if (charAt(q - 1) == '-') 3387 fail("Illegal character in hostname", q - 1); 3388 p = q; 3389 } 3390 } 3391 q = scan(p, n, '.'); 3392 if (q <= p) 3393 break; 3394 p = q; 3395 } while (p < n); 3396 3397 if ((p < n) && !at(p, n, ':')) 3398 fail("Illegal character in hostname", p); 3399 3400 if (l < 0) 3401 failExpecting("hostname", start); 3402 3403 // for a fully qualified hostname check that the rightmost 3404 // label starts with an alpha character. 
3405 if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) { 3406 fail("Illegal character in hostname", l); 3407 } 3408 3409 host = substring(start, p); 3410 return p; 3411 } 3412 3413 3414 // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture 3415 // 3416 // Bug: The grammar in RFC2373 Appendix B does not allow addresses of 3417 // the form ::12.34.56.78, which are clearly shown in the examples 3418 // earlier in the document. Here is the original grammar: 3419 // 3420 // IPv6address = hexpart [ ":" IPv4address ] 3421 // hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ] 3422 // hexseq = hex4 *( ":" hex4) 3423 // hex4 = 1*4HEXDIG 3424 // 3425 // We therefore use the following revised grammar: 3426 // 3427 // IPv6address = hexseq [ ":" IPv4address ] 3428 // | hexseq [ "::" [ hexpost ] ] 3429 // | "::" [ hexpost ] 3430 // hexpost = hexseq | hexseq ":" IPv4address | IPv4address 3431 // hexseq = hex4 *( ":" hex4) 3432 // hex4 = 1*4HEXDIG 3433 // 3434 // This covers all and only the following cases: 3435 // 3436 // hexseq 3437 // hexseq : IPv4address 3438 // hexseq :: 3439 // hexseq :: hexseq 3440 // hexseq :: hexseq : IPv4address 3441 // hexseq :: IPv4address 3442 // :: hexseq 3443 // :: hexseq : IPv4address 3444 // :: IPv4address 3445 // :: 3446 // 3447 // Additionally we constrain the IPv6 address as follows :- 3448 // 3449 // i. IPv6 addresses without compressed zeros should contain 3450 // exactly 16 bytes. 3451 // 3452 // ii. IPv6 addresses with compressed zeros should contain 3453 // less than 16 bytes. 
// Running count of the address bytes recognised so far: scanHexSeq adds
        // 2 for each hex group, and 4 is added for each embedded IPv4 address.
        // It is not reset by any of the methods below.
        private int ipv6byteCount = 0;

        // Parse the text in [start, n) as an IPv6 address (without brackets)
        // according to the revised grammar and the byte-count constraints.
        // Returns n on success and throws URISyntaxException otherwise.
        //
        private int parseIPv6Reference(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q;
            boolean compressedZeros = false;

            q = scanHexSeq(p, n);

            if (q > p) {
                // Leading hexseq, optionally followed by "::" [ hexpost ]
                // or by ":" IPv4address
                p = q;
                if (at(p, n, "::")) {
                    compressedZeros = true;
                    p = scanHexPost(p + 2, n);
                } else if (at(p, n, ':')) {
                    p = takeIPv4Address(p + 1,  n, "IPv4 address");
                    ipv6byteCount += 4;
                }
            } else if (at(p, n, "::")) {
                // No leading hexseq: "::" [ hexpost ]
                compressedZeros = true;
                p = scanHexPost(p + 2, n);
            }
            // Trailing junk, then the length constraints
            if (p < n)
                fail("Malformed IPv6 address", start);
            if (ipv6byteCount > 16)
                fail("IPv6 address too long", start);
            if (!compressedZeros && ipv6byteCount < 16)
                fail("IPv6 address too short", start);
            if (compressedZeros && ipv6byteCount == 16)
                fail("Malformed IPv6 address", start);

            return p;
        }

        // Scan what may follow a "::", that is
        //   [ hexseq | hexseq ":" IPv4address | IPv4address ]
        // and return the position after it.  An empty remainder is accepted.
        //
        private int scanHexPost(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q;

            if (p == n)
                return p;

            q = scanHexSeq(p, n);
            if (q > p) {
                p = q;
                if (at(p, n, ':')) {
                    p++;
                    p = takeIPv4Address(p, n, "hex digits or IPv4 address");
                    ipv6byteCount += 4;
                }
            } else {
                // No hex sequence here, so the rest must be an IPv4 address
                p = takeIPv4Address(p, n, "hex digits or IPv4 address");
                ipv6byteCount += 4;
            }
            return p;
        }

        // Scan a hex sequence; return -1 if one could not be scanned
        //
        // Each group of hex digits scanned adds 2 to ipv6byteCount.  Scanning
        // stops before a "::" and before the ':' that precedes an embedded
        // IPv4 address.
        //
        private int scanHexSeq(int start, int n)
            throws URISyntaxException
        {
            int p = start;
            int q;

            q = scan(p, n, L_HEX, H_HEX);
            if (q <= p)
                return -1;
            if (at(q, n, '.'))          // Beginning of IPv4 address
                return -1;
            if (q > p + 4)
                fail("IPv6 hexadecimal digit sequence too long", p);
            ipv6byteCount += 2;
            p = q;
            while (p < n) {
                if (!at(p, n, ':'))
                    break;
                if (at(p + 1, n, ':'))
                    break;              // "::"
                p++;
                q = scan(p, n, L_HEX, H_HEX);
                if (q <= p)
                    failExpecting("digits for an IPv6 address", p);
                if (at(q, n, '.')) {    // Beginning of IPv4 address
                    // Step back onto the ':' so the caller sees it
                    p--;
                    break;
                }
                if (q > p + 4)
                    fail("IPv6 hexadecimal digit sequence too long", p);
                ipv6byteCount += 2;
                p = q;
            }

            return p;
        }

    }

}