1/* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements.  See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License.  You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * apr_uri.c: URI related utility things
19 *
20 */
21
22#include <stdlib.h>
23
24#include "apu.h"
25#include "apr.h"
26#include "apr_general.h"
27#include "apr_strings.h"
28
29#define APR_WANT_STRFUNC
30#include "apr_want.h"
31
32#include "apr_uri.h"
33
34typedef struct schemes_t schemes_t;
35
36/** Structure to store various schemes and their default ports */
37struct schemes_t {
38    /** The name of the scheme */
39    const char *name;
40    /** The default port for the scheme */
41    apr_port_t default_port;
42};
43
44/* Some WWW schemes and their default ports; this is basically /etc/services */
45/* This will become global when the protocol abstraction comes */
46/* As the schemes are searched by a linear search, */
47/* they are sorted by their expected frequency */
48static schemes_t schemes[] =
49{
50    {"http",     APR_URI_HTTP_DEFAULT_PORT},
51    {"ftp",      APR_URI_FTP_DEFAULT_PORT},
52    {"https",    APR_URI_HTTPS_DEFAULT_PORT},
53    {"gopher",   APR_URI_GOPHER_DEFAULT_PORT},
54    {"ldap",     APR_URI_LDAP_DEFAULT_PORT},
55    {"nntp",     APR_URI_NNTP_DEFAULT_PORT},
56    {"snews",    APR_URI_SNEWS_DEFAULT_PORT},
57    {"imap",     APR_URI_IMAP_DEFAULT_PORT},
58    {"pop",      APR_URI_POP_DEFAULT_PORT},
59    {"sip",      APR_URI_SIP_DEFAULT_PORT},
60    {"rtsp",     APR_URI_RTSP_DEFAULT_PORT},
61    {"wais",     APR_URI_WAIS_DEFAULT_PORT},
62    {"z39.50r",  APR_URI_WAIS_DEFAULT_PORT},
63    {"z39.50s",  APR_URI_WAIS_DEFAULT_PORT},
64    {"prospero", APR_URI_PROSPERO_DEFAULT_PORT},
65    {"nfs",      APR_URI_NFS_DEFAULT_PORT},
66    {"tip",      APR_URI_TIP_DEFAULT_PORT},
67    {"acap",     APR_URI_ACAP_DEFAULT_PORT},
68    {"telnet",   APR_URI_TELNET_DEFAULT_PORT},
69    {"ssh",      APR_URI_SSH_DEFAULT_PORT},
70    { NULL, 0xFFFF }     /* unknown port */
71};
72
73APU_DECLARE(apr_port_t) apr_uri_port_of_scheme(const char *scheme_str)
74{
75    schemes_t *scheme;
76
77    if (scheme_str) {
78        for (scheme = schemes; scheme->name != NULL; ++scheme) {
79            if (strcasecmp(scheme_str, scheme->name) == 0) {
80                return scheme->default_port;
81            }
82        }
83    }
84    return 0;
85}
86
87/* Unparse a apr_uri_t structure to an URI string.
88 * Optionally suppress the password for security reasons.
89 */
90APU_DECLARE(char *) apr_uri_unparse(apr_pool_t *p,
91                                    const apr_uri_t *uptr,
92                                    unsigned flags)
93{
94    char *ret = "";
95
96    /* If suppressing the site part, omit both user name & scheme://hostname */
97    if (!(flags & APR_URI_UNP_OMITSITEPART)) {
98
99        /* Construct a "user:password@" string, honoring the passed
100         * APR_URI_UNP_ flags: */
101        if (uptr->user || uptr->password) {
102            ret = apr_pstrcat(p,
103                      (uptr->user     && !(flags & APR_URI_UNP_OMITUSER))
104                          ? uptr->user : "",
105                      (uptr->password && !(flags & APR_URI_UNP_OMITPASSWORD))
106                          ? ":" : "",
107                      (uptr->password && !(flags & APR_URI_UNP_OMITPASSWORD))
108                          ? ((flags & APR_URI_UNP_REVEALPASSWORD)
109                              ? uptr->password : "XXXXXXXX")
110                          : "",
111                      ((uptr->user     && !(flags & APR_URI_UNP_OMITUSER)) ||
112                       (uptr->password && !(flags & APR_URI_UNP_OMITPASSWORD)))
113                          ? "@" : "",
114                      NULL);
115        }
116
117        /* Construct scheme://site string */
118        if (uptr->hostname) {
119            int is_default_port;
120            const char *lbrk = "", *rbrk = "";
121
122            if (strchr(uptr->hostname, ':')) { /* v6 literal */
123                lbrk = "[";
124                rbrk = "]";
125            }
126
127            is_default_port =
128                (uptr->port_str == NULL ||
129                 uptr->port == 0 ||
130                 uptr->port == apr_uri_port_of_scheme(uptr->scheme));
131
132            ret = apr_pstrcat(p, "//", ret, lbrk, uptr->hostname, rbrk,
133                        is_default_port ? "" : ":",
134                        is_default_port ? "" : uptr->port_str,
135                        NULL);
136        }
137	if (uptr->scheme) {
138	    ret = apr_pstrcat(p, uptr->scheme, ":", ret, NULL);
139	}
140    }
141
142    /* Should we suppress all path info? */
143    if (!(flags & APR_URI_UNP_OMITPATHINFO)) {
144        /* Append path, query and fragment strings: */
145        ret = apr_pstrcat(p,
146                          ret,
147                          (uptr->path)
148                              ? uptr->path : "",
149                          (uptr->query    && !(flags & APR_URI_UNP_OMITQUERY))
150                              ? "?" : "",
151                          (uptr->query    && !(flags & APR_URI_UNP_OMITQUERY))
152                              ? uptr->query : "",
153                          (uptr->fragment && !(flags & APR_URI_UNP_OMITQUERY))
154                              ? "#" : NULL,
155                          (uptr->fragment && !(flags & APR_URI_UNP_OMITQUERY))
156                              ? uptr->fragment : NULL,
157                          NULL);
158    }
159    return ret;
160}
161
162/* Here is the hand-optimized parse_uri_components().  There are some wild
163 * tricks we could pull in assembly language that we don't pull here... like we
164 * can do word-at-time scans for delimiter characters using the same technique
165 * that fast memchr()s use.  But that would be way non-portable. -djg
166 */
167
/* We have a lookup table that we can index by character and it tells us if
 * the character is one of the interesting delimiters.  Note that we even get
 * compares for NUL for free -- it's just another delimiter.
 */
172
/* Flag bits stored in uri_delims[]; each delimiter character has its own. */
#define T_COLON           0x01        /* ':' */
#define T_SLASH           0x02        /* '/' */
#define T_QUESTION        0x04        /* '?' */
#define T_HASH            0x08        /* '#' */
#define T_NUL             0x80        /* '\0' */

/* uri_delims[c] holds the T_* flag of character c, or 0 when c is not a
 * delimiter.  Only the code points of the five delimiters differ between
 * the two character sets; each row below covers sixteen code points.
 */
static const unsigned char uri_delims[256] = {
#if APR_CHARSET_EBCDIC
    /* EBCDIC: '/' = 0x61, '?' = 0x6F, ':' = 0x7A, '#' = 0x7B */
    /* 0x00 */ T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,T_SLASH,0,0,0,0,0,0,0,0,0,0,0,0,0,T_QUESTION,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,T_COLON,T_HASH,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xD0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xE0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xF0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
#else
    /* ASCII: '#' = 0x23, '/' = 0x2F, ':' = 0x3A, '?' = 0x3F */
    /* 0x00 */ T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,T_HASH,0,0,0,0,0,0,0,0,0,0,0,T_SLASH,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,T_COLON,0,0,0,0,T_QUESTION,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xD0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xE0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xF0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
#endif
};


/* it works like this:
    if ((uri_delims[ch] & NOTEND_foobar) == 0) {
        then we're not at a delimiter for foobar, keep scanning
    }
   i.e. each NOTEND_foobar mask lists the delimiters that end a foobar.
*/

/* Note that we optimize the scheme scanning here, we cheat and let the
 * compiler know that it doesn't have to do the & masking: every flag bit
 * is set in the mask, so any delimiter at all ends the scheme.
 */
#define NOTEND_SCHEME     (0xff)
#define NOTEND_HOSTINFO   (T_SLASH | T_QUESTION | T_HASH | T_NUL)
#define NOTEND_PATH       (T_QUESTION | T_HASH | T_NUL)
234
235/* parse_uri_components():
236 * Parse a given URI, fill in all supplied fields of a uri_components
237 * structure. This eliminates the necessity of extracting host, port,
238 * path, query info repeatedly in the modules.
239 * Side effects:
240 *  - fills in fields of uri_components *uptr
241 *  - none on any of the r->* fields
242 */
243APU_DECLARE(apr_status_t) apr_uri_parse(apr_pool_t *p, const char *uri,
244                                        apr_uri_t *uptr)
245{
246    const char *s;
247    const char *s1;
248    const char *hostinfo;
249    char *endstr;
250    int port;
251    int v6_offset1 = 0, v6_offset2 = 0;
252
253    /* Initialize the structure. parse_uri() and parse_uri_components()
254     * can be called more than once per request.
255     */
256    memset (uptr, '\0', sizeof(*uptr));
257    uptr->is_initialized = 1;
258
259    /* We assume the processor has a branch predictor like most --
260     * it assumes forward branches are untaken and backwards are taken.  That's
261     * the reason for the gotos.  -djg
262     */
263    if (uri[0] == '/') {
264        /* RFC2396 #4.3 says that two leading slashes mean we have an
265         * authority component, not a path!  Fixing this looks scary
266         * with the gotos here.  But if the existing logic is valid,
267         * then presumably a goto pointing to deal_with_authority works.
268         *
269         * RFC2396 describes this as resolving an ambiguity.  In the
270         * case of three or more slashes there would seem to be no
271         * ambiguity, so it is a path after all.
272         */
273        if (uri[1] == '/' && uri[2] != '/') {
274            s = uri + 2 ;
275            goto deal_with_authority ;
276        }
277
278deal_with_path:
279        /* we expect uri to point to first character of path ... remember
280         * that the path could be empty -- http://foobar?query for example
281         */
282        s = uri;
283        while ((uri_delims[*(unsigned char *)s] & NOTEND_PATH) == 0) {
284            ++s;
285        }
286        if (s != uri) {
287            uptr->path = apr_pstrmemdup(p, uri, s - uri);
288        }
289        if (*s == 0) {
290            return APR_SUCCESS;
291        }
292        if (*s == '?') {
293            ++s;
294            s1 = strchr(s, '#');
295            if (s1) {
296                uptr->fragment = apr_pstrdup(p, s1 + 1);
297                uptr->query = apr_pstrmemdup(p, s, s1 - s);
298            }
299            else {
300                uptr->query = apr_pstrdup(p, s);
301            }
302            return APR_SUCCESS;
303        }
304        /* otherwise it's a fragment */
305        uptr->fragment = apr_pstrdup(p, s + 1);
306        return APR_SUCCESS;
307    }
308
309    /* find the scheme: */
310    s = uri;
311    while ((uri_delims[*(unsigned char *)s] & NOTEND_SCHEME) == 0) {
312        ++s;
313    }
314    /* scheme must be non-empty and followed by : */
315    if (s == uri || s[0] != ':') {
316        goto deal_with_path;        /* backwards predicted taken! */
317    }
318
319    uptr->scheme = apr_pstrmemdup(p, uri, s - uri);
320    if (s[1] != '/' || s[2] != '/') {
321        uri = s + 1;
322        goto deal_with_path;
323    }
324
325    s += 3;
326
327deal_with_authority:
328    hostinfo = s;
329    while ((uri_delims[*(unsigned char *)s] & NOTEND_HOSTINFO) == 0) {
330        ++s;
331    }
332    uri = s;        /* whatever follows hostinfo is start of uri */
333    uptr->hostinfo = apr_pstrmemdup(p, hostinfo, uri - hostinfo);
334
335    /* If there's a username:password@host:port, the @ we want is the last @...
336     * too bad there's no memrchr()... For the C purists, note that hostinfo
337     * is definately not the first character of the original uri so therefore
338     * &hostinfo[-1] < &hostinfo[0] ... and this loop is valid C.
339     */
340    do {
341        --s;
342    } while (s >= hostinfo && *s != '@');
343    if (s < hostinfo) {
344        /* again we want the common case to be fall through */
345deal_with_host:
346        /* We expect hostinfo to point to the first character of
347         * the hostname.  If there's a port it is the first colon,
348         * except with IPv6.
349         */
350        if (*hostinfo == '[') {
351            v6_offset1 = 1;
352            v6_offset2 = 2;
353            s = memchr(hostinfo, ']', uri - hostinfo);
354            if (s == NULL) {
355                return APR_EGENERAL;
356            }
357            if (*++s != ':') {
358                s = NULL; /* no port */
359            }
360        }
361        else {
362            s = memchr(hostinfo, ':', uri - hostinfo);
363        }
364        if (s == NULL) {
365            /* we expect the common case to have no port */
366            uptr->hostname = apr_pstrmemdup(p,
367                                            hostinfo + v6_offset1,
368                                            uri - hostinfo - v6_offset2);
369            goto deal_with_path;
370        }
371        uptr->hostname = apr_pstrmemdup(p,
372                                        hostinfo + v6_offset1,
373                                        s - hostinfo - v6_offset2);
374        ++s;
375        uptr->port_str = apr_pstrmemdup(p, s, uri - s);
376        if (uri != s) {
377            port = strtol(uptr->port_str, &endstr, 10);
378            uptr->port = port;
379            if (*endstr == '\0') {
380                goto deal_with_path;
381            }
382            /* Invalid characters after ':' found */
383            return APR_EGENERAL;
384        }
385        uptr->port = apr_uri_port_of_scheme(uptr->scheme);
386        goto deal_with_path;
387    }
388
389    /* first colon delimits username:password */
390    s1 = memchr(hostinfo, ':', s - hostinfo);
391    if (s1) {
392        uptr->user = apr_pstrmemdup(p, hostinfo, s1 - hostinfo);
393        ++s1;
394        uptr->password = apr_pstrmemdup(p, s1, s - s1);
395    }
396    else {
397        uptr->user = apr_pstrmemdup(p, hostinfo, s - hostinfo);
398    }
399    hostinfo = s + 1;
400    goto deal_with_host;
401}
402
403/* Special case for CONNECT parsing: it comes with the hostinfo part only */
404/* See the INTERNET-DRAFT document "Tunneling SSL Through a WWW Proxy"
405 * currently at http://www.mcom.com/newsref/std/tunneling_ssl.html
406 * for the format of the "CONNECT host:port HTTP/1.0" request
407 */
408APU_DECLARE(apr_status_t) apr_uri_parse_hostinfo(apr_pool_t *p,
409                                                 const char *hostinfo,
410                                                 apr_uri_t *uptr)
411{
412    const char *s;
413    char *endstr;
414    const char *rsb;
415    int v6_offset1 = 0;
416
417    /* Initialize the structure. parse_uri() and parse_uri_components()
418     * can be called more than once per request.
419     */
420    memset(uptr, '\0', sizeof(*uptr));
421    uptr->is_initialized = 1;
422    uptr->hostinfo = apr_pstrdup(p, hostinfo);
423
424    /* We expect hostinfo to point to the first character of
425     * the hostname.  There must be a port, separated by a colon
426     */
427    if (*hostinfo == '[') {
428        if ((rsb = strchr(hostinfo, ']')) == NULL ||
429            *(rsb + 1) != ':') {
430            return APR_EGENERAL;
431        }
432        /* literal IPv6 address */
433        s = rsb + 1;
434        ++hostinfo;
435        v6_offset1 = 1;
436    }
437    else {
438        s = strchr(hostinfo, ':');
439    }
440    if (s == NULL) {
441        return APR_EGENERAL;
442    }
443    uptr->hostname = apr_pstrndup(p, hostinfo, s - hostinfo - v6_offset1);
444    ++s;
445    uptr->port_str = apr_pstrdup(p, s);
446    if (*s != '\0') {
447        uptr->port = (unsigned short) strtol(uptr->port_str, &endstr, 10);
448        if (*endstr == '\0') {
449            return APR_SUCCESS;
450        }
451        /* Invalid characters after ':' found */
452    }
453    return APR_EGENERAL;
454}
455