1/* Licensed to the Apache Software Foundation (ASF) under one or more 2 * contributor license agreements. See the NOTICE file distributed with 3 * this work for additional information regarding copyright ownership. 4 * The ASF licenses this file to You under the Apache License, Version 2.0 5 * (the "License"); you may not use this file except in compliance with 6 * the License. You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17/* 18 * apr_uri.c: URI related utility things 19 * 20 */ 21 22#include <stdlib.h> 23 24#include "apu.h" 25#include "apr.h" 26#include "apr_general.h" 27#include "apr_strings.h" 28 29#define APR_WANT_STRFUNC 30#include "apr_want.h" 31 32#include "apr_uri.h" 33 34typedef struct schemes_t schemes_t; 35 36/** Structure to store various schemes and their default ports */ 37struct schemes_t { 38 /** The name of the scheme */ 39 const char *name; 40 /** The default port for the scheme */ 41 apr_port_t default_port; 42}; 43 44/* Some WWW schemes and their default ports; this is basically /etc/services */ 45/* This will become global when the protocol abstraction comes */ 46/* As the schemes are searched by a linear search, */ 47/* they are sorted by their expected frequency */ 48static schemes_t schemes[] = 49{ 50 {"http", APR_URI_HTTP_DEFAULT_PORT}, 51 {"ftp", APR_URI_FTP_DEFAULT_PORT}, 52 {"https", APR_URI_HTTPS_DEFAULT_PORT}, 53 {"gopher", APR_URI_GOPHER_DEFAULT_PORT}, 54 {"ldap", APR_URI_LDAP_DEFAULT_PORT}, 55 {"nntp", APR_URI_NNTP_DEFAULT_PORT}, 56 {"snews", APR_URI_SNEWS_DEFAULT_PORT}, 57 {"imap", APR_URI_IMAP_DEFAULT_PORT}, 58 {"pop", APR_URI_POP_DEFAULT_PORT}, 59 {"sip", APR_URI_SIP_DEFAULT_PORT}, 60 {"rtsp", APR_URI_RTSP_DEFAULT_PORT}, 61 {"wais", APR_URI_WAIS_DEFAULT_PORT}, 62 {"z39.50r", APR_URI_WAIS_DEFAULT_PORT}, 63 {"z39.50s", APR_URI_WAIS_DEFAULT_PORT}, 64 {"prospero", APR_URI_PROSPERO_DEFAULT_PORT}, 65 {"nfs", APR_URI_NFS_DEFAULT_PORT}, 66 {"tip", APR_URI_TIP_DEFAULT_PORT}, 67 {"acap", APR_URI_ACAP_DEFAULT_PORT}, 68 {"telnet", APR_URI_TELNET_DEFAULT_PORT}, 69 {"ssh", APR_URI_SSH_DEFAULT_PORT}, 70 { NULL, 0xFFFF } /* unknown port */ 71}; 72 73APU_DECLARE(apr_port_t) apr_uri_port_of_scheme(const char *scheme_str) 74{ 75 schemes_t *scheme; 76 77 if (scheme_str) { 78 for (scheme = schemes; scheme->name != NULL; ++scheme) { 79 if (strcasecmp(scheme_str, scheme->name) == 0) { 80 return scheme->default_port; 81 } 82 } 83 } 84 return 0; 85} 86 87/* Unparse a apr_uri_t structure to an URI string. 88 * Optionally suppress the password for security reasons. 89 */ 90APU_DECLARE(char *) apr_uri_unparse(apr_pool_t *p, 91 const apr_uri_t *uptr, 92 unsigned flags) 93{ 94 char *ret = ""; 95 96 /* If suppressing the site part, omit both user name & scheme://hostname */ 97 if (!(flags & APR_URI_UNP_OMITSITEPART)) { 98 99 /* Construct a "user:password@" string, honoring the passed 100 * APR_URI_UNP_ flags: */ 101 if (uptr->user || uptr->password) { 102 ret = apr_pstrcat(p, 103 (uptr->user && !(flags & APR_URI_UNP_OMITUSER)) 104 ? uptr->user : "", 105 (uptr->password && !(flags & APR_URI_UNP_OMITPASSWORD)) 106 ? ":" : "", 107 (uptr->password && !(flags & APR_URI_UNP_OMITPASSWORD)) 108 ? ((flags & APR_URI_UNP_REVEALPASSWORD) 109 ? uptr->password : "XXXXXXXX") 110 : "", 111 ((uptr->user && !(flags & APR_URI_UNP_OMITUSER)) || 112 (uptr->password && !(flags & APR_URI_UNP_OMITPASSWORD))) 113 ? "@" : "", 114 NULL); 115 } 116 117 /* Construct scheme://site string */ 118 if (uptr->hostname) { 119 int is_default_port; 120 const char *lbrk = "", *rbrk = ""; 121 122 if (strchr(uptr->hostname, ':')) { /* v6 literal */ 123 lbrk = "["; 124 rbrk = "]"; 125 } 126 127 is_default_port = 128 (uptr->port_str == NULL || 129 uptr->port == 0 || 130 uptr->port == apr_uri_port_of_scheme(uptr->scheme)); 131 132 ret = apr_pstrcat(p, "//", ret, lbrk, uptr->hostname, rbrk, 133 is_default_port ? "" : ":", 134 is_default_port ? "" : uptr->port_str, 135 NULL); 136 } 137 if (uptr->scheme) { 138 ret = apr_pstrcat(p, uptr->scheme, ":", ret, NULL); 139 } 140 } 141 142 /* Should we suppress all path info? */ 143 if (!(flags & APR_URI_UNP_OMITPATHINFO)) { 144 /* Append path, query and fragment strings: */ 145 ret = apr_pstrcat(p, 146 ret, 147 (uptr->path) 148 ? uptr->path : "", 149 (uptr->query && !(flags & APR_URI_UNP_OMITQUERY)) 150 ? "?" : "", 151 (uptr->query && !(flags & APR_URI_UNP_OMITQUERY)) 152 ? uptr->query : "", 153 (uptr->fragment && !(flags & APR_URI_UNP_OMITQUERY)) 154 ? "#" : NULL, 155 (uptr->fragment && !(flags & APR_URI_UNP_OMITQUERY)) 156 ? uptr->fragment : NULL, 157 NULL); 158 } 159 return ret; 160} 161 162/* Here is the hand-optimized parse_uri_components(). There are some wild 163 * tricks we could pull in assembly language that we don't pull here... like we 164 * can do word-at-time scans for delimiter characters using the same technique 165 * that fast memchr()s use. But that would be way non-portable. -djg 166 */ 167 168/* We have a apr_table_t that we can index by character and it tells us if the 169 * character is one of the interesting delimiters. Note that we even get 170 * compares for NUL for free -- it's just another delimiter. 171 */ 172 173#define T_COLON 0x01 /* ':' */ 174#define T_SLASH 0x02 /* '/' */ 175#define T_QUESTION 0x04 /* '?' */ 176#define T_HASH 0x08 /* '#' */ 177#define T_NUL 0x80 /* '\0' */ 178 179#if APR_CHARSET_EBCDIC 180/* Delimiter table for the EBCDIC character set */ 181static const unsigned char uri_delims[256] = { 182 T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 183 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 184 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 185 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 186 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 187 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 188 0,T_SLASH,0,0,0,0,0,0,0,0,0,0,0,0,0,T_QUESTION, 189 0,0,0,0,0,0,0,0,0,0,T_COLON,T_HASH,0,0,0,0, 190 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 191 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 192 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 193 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 194 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 195 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 196 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 197 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 198}; 199#else 200/* Delimiter table for the ASCII character set */ 201static const unsigned char uri_delims[256] = { 202 T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 203 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 204 0,0,0,T_HASH,0,0,0,0,0,0,0,0,0,0,0,T_SLASH, 205 0,0,0,0,0,0,0,0,0,0,T_COLON,0,0,0,0,T_QUESTION, 206 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 207 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 208 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 209 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 210 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 211 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 212 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 213 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 214 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 215 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 216 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 217 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 218}; 219#endif 220 221 222/* it works like this: 223 if (uri_delims[ch] & NOTEND_foobar) { 224 then we're not at a delimiter for foobar 225 } 226*/ 227 228/* Note that we optimize the scheme scanning here, we cheat and let the 229 * compiler know that it doesn't have to do the & masking. 230 */ 231#define NOTEND_SCHEME (0xff) 232#define NOTEND_HOSTINFO (T_SLASH | T_QUESTION | T_HASH | T_NUL) 233#define NOTEND_PATH (T_QUESTION | T_HASH | T_NUL) 234 235/* parse_uri_components(): 236 * Parse a given URI, fill in all supplied fields of a uri_components 237 * structure. This eliminates the necessity of extracting host, port, 238 * path, query info repeatedly in the modules. 239 * Side effects: 240 * - fills in fields of uri_components *uptr 241 * - none on any of the r->* fields 242 */ 243APU_DECLARE(apr_status_t) apr_uri_parse(apr_pool_t *p, const char *uri, 244 apr_uri_t *uptr) 245{ 246 const char *s; 247 const char *s1; 248 const char *hostinfo; 249 char *endstr; 250 int port; 251 int v6_offset1 = 0, v6_offset2 = 0; 252 253 /* Initialize the structure. parse_uri() and parse_uri_components() 254 * can be called more than once per request. 255 */ 256 memset (uptr, '\0', sizeof(*uptr)); 257 uptr->is_initialized = 1; 258 259 /* We assume the processor has a branch predictor like most -- 260 * it assumes forward branches are untaken and backwards are taken. That's 261 * the reason for the gotos. -djg 262 */ 263 if (uri[0] == '/') { 264 /* RFC2396 #4.3 says that two leading slashes mean we have an 265 * authority component, not a path! Fixing this looks scary 266 * with the gotos here. But if the existing logic is valid, 267 * then presumably a goto pointing to deal_with_authority works. 268 * 269 * RFC2396 describes this as resolving an ambiguity. In the 270 * case of three or more slashes there would seem to be no 271 * ambiguity, so it is a path after all. 272 */ 273 if (uri[1] == '/' && uri[2] != '/') { 274 s = uri + 2 ; 275 goto deal_with_authority ; 276 } 277 278deal_with_path: 279 /* we expect uri to point to first character of path ... remember 280 * that the path could be empty -- http://foobar?query for example 281 */ 282 s = uri; 283 while ((uri_delims[*(unsigned char *)s] & NOTEND_PATH) == 0) { 284 ++s; 285 } 286 if (s != uri) { 287 uptr->path = apr_pstrmemdup(p, uri, s - uri); 288 } 289 if (*s == 0) { 290 return APR_SUCCESS; 291 } 292 if (*s == '?') { 293 ++s; 294 s1 = strchr(s, '#'); 295 if (s1) { 296 uptr->fragment = apr_pstrdup(p, s1 + 1); 297 uptr->query = apr_pstrmemdup(p, s, s1 - s); 298 } 299 else { 300 uptr->query = apr_pstrdup(p, s); 301 } 302 return APR_SUCCESS; 303 } 304 /* otherwise it's a fragment */ 305 uptr->fragment = apr_pstrdup(p, s + 1); 306 return APR_SUCCESS; 307 } 308 309 /* find the scheme: */ 310 s = uri; 311 while ((uri_delims[*(unsigned char *)s] & NOTEND_SCHEME) == 0) { 312 ++s; 313 } 314 /* scheme must be non-empty and followed by : */ 315 if (s == uri || s[0] != ':') { 316 goto deal_with_path; /* backwards predicted taken! */ 317 } 318 319 uptr->scheme = apr_pstrmemdup(p, uri, s - uri); 320 if (s[1] != '/' || s[2] != '/') { 321 uri = s + 1; 322 goto deal_with_path; 323 } 324 325 s += 3; 326 327deal_with_authority: 328 hostinfo = s; 329 while ((uri_delims[*(unsigned char *)s] & NOTEND_HOSTINFO) == 0) { 330 ++s; 331 } 332 uri = s; /* whatever follows hostinfo is start of uri */ 333 uptr->hostinfo = apr_pstrmemdup(p, hostinfo, uri - hostinfo); 334 335 /* If there's a username:password@host:port, the @ we want is the last @... 336 * too bad there's no memrchr()... For the C purists, note that hostinfo 337 * is definately not the first character of the original uri so therefore 338 * &hostinfo[-1] < &hostinfo[0] ... and this loop is valid C. 339 */ 340 do { 341 --s; 342 } while (s >= hostinfo && *s != '@'); 343 if (s < hostinfo) { 344 /* again we want the common case to be fall through */ 345deal_with_host: 346 /* We expect hostinfo to point to the first character of 347 * the hostname. If there's a port it is the first colon, 348 * except with IPv6. 349 */ 350 if (*hostinfo == '[') { 351 v6_offset1 = 1; 352 v6_offset2 = 2; 353 s = memchr(hostinfo, ']', uri - hostinfo); 354 if (s == NULL) { 355 return APR_EGENERAL; 356 } 357 if (*++s != ':') { 358 s = NULL; /* no port */ 359 } 360 } 361 else { 362 s = memchr(hostinfo, ':', uri - hostinfo); 363 } 364 if (s == NULL) { 365 /* we expect the common case to have no port */ 366 uptr->hostname = apr_pstrmemdup(p, 367 hostinfo + v6_offset1, 368 uri - hostinfo - v6_offset2); 369 goto deal_with_path; 370 } 371 uptr->hostname = apr_pstrmemdup(p, 372 hostinfo + v6_offset1, 373 s - hostinfo - v6_offset2); 374 ++s; 375 uptr->port_str = apr_pstrmemdup(p, s, uri - s); 376 if (uri != s) { 377 port = strtol(uptr->port_str, &endstr, 10); 378 uptr->port = port; 379 if (*endstr == '\0') { 380 goto deal_with_path; 381 } 382 /* Invalid characters after ':' found */ 383 return APR_EGENERAL; 384 } 385 uptr->port = apr_uri_port_of_scheme(uptr->scheme); 386 goto deal_with_path; 387 } 388 389 /* first colon delimits username:password */ 390 s1 = memchr(hostinfo, ':', s - hostinfo); 391 if (s1) { 392 uptr->user = apr_pstrmemdup(p, hostinfo, s1 - hostinfo); 393 ++s1; 394 uptr->password = apr_pstrmemdup(p, s1, s - s1); 395 } 396 else { 397 uptr->user = apr_pstrmemdup(p, hostinfo, s - hostinfo); 398 } 399 hostinfo = s + 1; 400 goto deal_with_host; 401} 402 403/* Special case for CONNECT parsing: it comes with the hostinfo part only */ 404/* See the INTERNET-DRAFT document "Tunneling SSL Through a WWW Proxy" 405 * currently at http://www.mcom.com/newsref/std/tunneling_ssl.html 406 * for the format of the "CONNECT host:port HTTP/1.0" request 407 */ 408APU_DECLARE(apr_status_t) apr_uri_parse_hostinfo(apr_pool_t *p, 409 const char *hostinfo, 410 apr_uri_t *uptr) 411{ 412 const char *s; 413 char *endstr; 414 const char *rsb; 415 int v6_offset1 = 0; 416 417 /* Initialize the structure. parse_uri() and parse_uri_components() 418 * can be called more than once per request. 419 */ 420 memset(uptr, '\0', sizeof(*uptr)); 421 uptr->is_initialized = 1; 422 uptr->hostinfo = apr_pstrdup(p, hostinfo); 423 424 /* We expect hostinfo to point to the first character of 425 * the hostname. There must be a port, separated by a colon 426 */ 427 if (*hostinfo == '[') { 428 if ((rsb = strchr(hostinfo, ']')) == NULL || 429 *(rsb + 1) != ':') { 430 return APR_EGENERAL; 431 } 432 /* literal IPv6 address */ 433 s = rsb + 1; 434 ++hostinfo; 435 v6_offset1 = 1; 436 } 437 else { 438 s = strchr(hostinfo, ':'); 439 } 440 if (s == NULL) { 441 return APR_EGENERAL; 442 } 443 uptr->hostname = apr_pstrndup(p, hostinfo, s - hostinfo - v6_offset1); 444 ++s; 445 uptr->port_str = apr_pstrdup(p, s); 446 if (*s != '\0') { 447 uptr->port = (unsigned short) strtol(uptr->port_str, &endstr, 10); 448 if (*endstr == '\0') { 449 return APR_SUCCESS; 450 } 451 /* Invalid characters after ':' found */ 452 } 453 return APR_EGENERAL; 454} 455