1/* URL handling.
2   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
3   2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
4   Inc.
5
6This file is part of GNU Wget.
7
8GNU Wget is free software; you can redistribute it and/or modify
9it under the terms of the GNU General Public License as published by
10the Free Software Foundation; either version 3 of the License, or (at
11your option) any later version.
12
13GNU Wget is distributed in the hope that it will be useful,
14but WITHOUT ANY WARRANTY; without even the implied warranty of
15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16GNU General Public License for more details.
17
18You should have received a copy of the GNU General Public License
19along with Wget.  If not, see <http://www.gnu.org/licenses/>.
20
21Additional permission under GNU GPL version 3 section 7
22
23If you modify this program, or any covered work, by linking or
24combining it with the OpenSSL project's OpenSSL library (or a
25modified version of that library), containing parts covered by the
26terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
27grants you additional permission to convey the resulting work.
28Corresponding Source for a non-source form of such a combination
29shall include the source code for the parts of OpenSSL used as well
30as that of the covered work.  */
31
32#include "wget.h"
33
34#include <stdio.h>
35#include <stdlib.h>
36#include <string.h>
37#include <unistd.h>
38#include <errno.h>
39#include <assert.h>
40
41#include "utils.h"
42#include "url.h"
43#include "host.h"  /* for is_valid_ipv6_address */
44
45#ifdef __VMS
46#include "vms.h"
47#endif /* def __VMS */
48
49#ifdef TESTING
50#include "test.h"
51#endif
52
/* Per-scheme flag bits stored in struct scheme_data.flags.  Each
   constant occupies its own bit so they can be OR-ed together.  */
enum {
  /* Scheme is recognized but switched off (e.g. https when OpenSSL
     fails to init).  */
  scm_disabled     = 1 << 0,
  /* URLs of this scheme may carry a ";params" part.  */
  scm_has_params   = 1 << 1,
  /* URLs of this scheme may carry a "?query" part.  */
  scm_has_query    = 1 << 2,
  /* URLs of this scheme may carry a "#fragment" part.  */
  scm_has_fragment = 1 << 3
};
59
/* Static description of one URL scheme.  Instances live in the
   supported_schemes table below and are initialized positionally, so
   the member order here must not change.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://".
     url_scheme matches it case-insensitively against the start of a
     URL, and url_parse skips this many characters before parsing.  */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Various flags: a bitwise OR of the scm_* constants.  */
  int flags;
};
71
/* Supported schemes.

   url_scheme returns the index of the matching entry cast to
   enum url_scheme, and other functions index this table by
   enum url_scheme, so the entry order has to agree with that
   enumeration.  The table is not const because scheme_disable sets
   scm_disabled in an entry's flags at run time.  The final entry has
   a NULL leading_string, which terminates the search loop in
   url_scheme.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },

  /* SCHEME_INVALID */
  { NULL,       NULL,       -1,                 0 }
};
84
85/* Forward declarations: */
86
87static bool path_simplify (enum url_scheme, char *);
88
89/* Support for escaping and unescaping of URL strings.  */
90
91/* Table of "reserved" and "unsafe" characters.  Those terms are
92   rfc1738-speak, as such largely obsoleted by rfc2396 and later
93   specs, but the general idea remains.
94
95   A reserved character is the one that you can't decode without
96   changing the meaning of the URL.  For example, you can't decode
97   "/foo/%2f/bar" into "/foo///bar" because the number and contents of
98   path components is different.  Non-reserved characters can be
99   changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
100   unsafe characters are loosely based on rfc1738, plus "$" and ",",
101   as recommended by rfc2396, and minus "~", which is very frequently
102   used (and sometimes unrecognized as %7E by broken servers).
103
104   An unsafe character is the one that should be encoded when URLs are
105   placed in foreign environments.  E.g. space and newline are unsafe
106   in HTTP contexts because HTTP uses them as separator and line
107   terminator, so they must be encoded to %20 and %0A respectively.
108   "*" is unsafe in shell context, etc.
109
110   We determine whether a character is unsafe through static table
111   lookup.  This code assumes ASCII character set and 8-bit chars.  */
112
/* Classification bits stored per character in urlchr_table.  */
enum {
  /* Character is "reserved": rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1 << 0,

  /* Character is "unsafe": rfc1738 unsafe chars, plus
     non-printables.  */
  urlchr_unsafe   = 1 << 1
};
120
/* Look up character C in urlchr_table and test it against MASK.  C is
   cast to unsigned char so that chars with the high bit set index the
   upper half of the 256-entry table instead of producing a negative
   index.  The result is non-zero when any bit of MASK is set.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table (undefined again right after it): */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* One entry per 8-bit character value.  The first 128 entries are
   annotated with the ASCII character they describe; the remaining 128
   (values 0x80-0xFF) are all marked unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
162
/* Decode the %HH escapes of the string S in place.

   Every "%HH" sequence, where HH are two hexadecimal digits, is
   replaced by the single character with that value.  A '%' that is
   not followed by two hex digits is copied literally, and so is
   "%00", since a decoded NUL would truncate the C string.

   S is overwritten; the result is never longer than the input.  Make
   a copy first if the original is still needed.  */

void
url_unescape (char *s)
{
  const char *src = s;          /* read position */
  char *dst = s;                /* write position, never ahead of src */

  while (*src)
    {
      /* Stays '\0' unless a complete, non-NUL escape is found. */
      char decoded = '\0';

      if (src[0] == '%' && src[1] && src[2]
          && c_isxdigit (src[1]) && c_isxdigit (src[2]))
        decoded = X2DIGITS_TO_NUM (src[1], src[2]);

      if (decoded != '\0')
        {
          /* Three input characters collapse into one. */
          *dst++ = decoded;
          src += 3;
        }
      else
        *dst++ = *src++;
    }
  *dst = '\0';
}
202
/* Shared worker for the url_escape_* functions.  Every character of S
   whose urlchr_table entry intersects MASK is written as "%XX"; all
   other characters are copied unchanged.

   When nothing needs escaping, the result depends on
   ALLOW_PASSTHROUGH: if true, S itself is returned (cast to non-const)
   and the caller must not free it independently; if false, a fresh
   copy is returned.  When something is escaped, the result is always
   freshly allocated.  */

static char *
url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
{
  const char *src;
  char *out, *dst;
  int extra = 0;
  int outlen;

  /* Sizing pass: each escaped character grows by two hex digits.  */
  for (src = s; *src; src++)
    if (urlchr_test (*src, mask))
      extra += 2;

  if (extra == 0)
    {
      if (allow_passthrough)
        return (char *) s;
      return xstrdup (s);
    }

  /* SRC now rests on the terminating NUL, so SRC - S is the length.  */
  outlen = (src - s) + extra;
  out = xmalloc (outlen + 1);

  /* Copy pass.  */
  dst = out;
  for (src = s; *src; src++)
    {
      unsigned char ch = *src;
      if (urlchr_test (ch, mask))
        {
          *dst++ = '%';
          *dst++ = XNUM_TO_DIGIT (ch >> 4);
          *dst++ = XNUM_TO_DIGIT (ch & 0xf);
        }
      else
        *dst++ = *src;
    }
  assert (dst - out == outlen);
  *dst = '\0';

  return out;
}
248
249/* URL-escape the unsafe characters (see urlchr_table) in a given
250   string, returning a freshly allocated string.  */
251
252char *
253url_escape (const char *s)
254{
255  return url_escape_1 (s, urlchr_unsafe, false);
256}
257
258/* URL-escape the unsafe and reserved characters (see urlchr_table) in
259   a given string, returning a freshly allocated string.  */
260
261char *
262url_escape_unsafe_and_reserved (const char *s)
263{
264  return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
265}
266
267/* URL-escape the unsafe characters (see urlchr_table) in a given
268   string.  If no characters are unsafe, S is returned.  */
269
270static char *
271url_escape_allow_passthrough (const char *s)
272{
273  return url_escape_1 (s, urlchr_unsafe, true);
274}
275
/* Tell whether the character P points at has to be written as %XX.
   A pointer is taken rather than a plain char because the answer for
   '%' depends on the two characters that follow it.

   Returns true when the character must be escaped, false when it can
   be copied as-is.  */

static inline bool
char_needs_escaping (const char *p)
{
  if (*p != '%')
    /* Ordinary character: escape it when it is unsafe, unless it is
       also reserved.  */
    return URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p);

  /* A '%' stays only when it starts a well-formed %HH escape.  If
     p[1] is the terminating NUL the first test fails and p[2] is
     never read.  */
  return !(c_isxdigit (p[1]) && c_isxdigit (p[2]));
}
298
299/* Translate a %-escaped (but possibly non-conformant) input string S
300   into a %-escaped (and conformant) output string.  If no characters
301   are encoded or decoded, return the same string S; otherwise, return
302   a freshly allocated string with the new contents.
303
304   After a URL has been run through this function, the protocols that
305   use `%' as the quote character can use the resulting string as-is,
306   while those that don't can use url_unescape to get to the intended
307   data.  This function is stable: once the input is transformed,
308   further transformations of the result yield the same output.
309
310   Let's discuss why this function is needed.
311
312   Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
313   a raw space character would mess up the HTTP request, it needs to
314   be quoted, like this:
315
316       GET /abc%20def HTTP/1.0
317
318   It would appear that the unsafe chars need to be quoted, for
319   example with url_escape.  But what if we're requested to download
320   `abc%20def'?  url_escape transforms "%" to "%25", which would leave
321   us with `abc%2520def'.  This is incorrect -- since %-escapes are
322   part of URL syntax, "%20" is the correct way to denote a literal
323   space on the Wget command line.  This leads to the conclusion that
324   in that case Wget should not call url_escape, but leave the `%20'
325   as is.  This is clearly contradictory, but it only gets worse.
326
327   What if the requested URI is `abc%20 def'?  If we call url_escape,
328   we end up with `/abc%2520%20def', which is almost certainly not
329   intended.  If we don't call url_escape, we are left with the
330   embedded space and cannot complete the request.  What the user
331   meant was for Wget to request `/abc%20%20def', and this is where
332   reencode_escapes kicks in.
333
334   Wget used to solve this by first decoding %-quotes, and then
335   encoding all the "unsafe" characters found in the resulting string.
336   This was wrong because it didn't preserve certain URL special
337   (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
338   == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
339   whether we considered `+' reserved (it is).  One of these results
340   is inevitable because by the second step we would lose information
341   on whether the `+' was originally encoded or not.  Both results
342   were wrong because in CGI parameters + means space, while %2B means
343   literal plus.  reencode_escapes correctly translates the above to
344   "a%2B+b", i.e. returns the original string.
345
346   This function uses a modified version of the algorithm originally
347   proposed by Anon Sricharoenchai:
348
349   * Encode all "unsafe" characters, except those that are also
350     "reserved", to %XX.  See urlchr_table for which characters are
351     unsafe and reserved.
352
353   * Encode the "%" characters not followed by two hex digits to
354     "%25".
355
356   * Pass through all other characters and %XX escapes as-is.  (Up to
357     Wget 1.10 this decoded %XX escapes corresponding to "safe"
358     characters, but that was obtrusive and broke some servers.)
359
360   Anon's test case:
361
362   "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
363   ->
364   "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
365
366   Simpler test cases:
367
368   "foo bar"         -> "foo%20bar"
369   "foo%20bar"       -> "foo%20bar"
370   "foo %20bar"      -> "foo%20%20bar"
371   "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
372   "foo%25%20bar"    -> "foo%25%20bar"
373   "foo%2%20bar"     -> "foo%252%20bar"
374   "foo+bar"         -> "foo+bar"            (plus is reserved!)
375   "foo%2b+bar"      -> "foo%2b+bar"  */
376
static char *
reencode_escapes (const char *s)
{
  const char *src;
  char *result, *dst;
  int srclen, dstlen;
  int to_encode = 0;

  /* Pass one: count the characters that must become %XX.  */
  for (src = s; *src; src++)
    to_encode += char_needs_escaping (src) ? 1 : 0;

  /* Nothing to change: hand back the input itself, not a copy.  */
  if (to_encode == 0)
    return (char *) s;

  /* SRC rests on the terminating NUL.  Each escape replaces one
     character with three, i.e. adds two.  */
  srclen = src - s;
  dstlen = srclen + 2 * to_encode;
  result = xmalloc (dstlen + 1);

  /* Pass two: copy, expanding as we go.  The decision is made on the
     source pointer, so the context after a '%' is still visible.  */
  dst = result;
  for (src = s; *src; src++)
    {
      unsigned char ch;

      if (!char_needs_escaping (src))
        {
          *dst++ = *src;
          continue;
        }

      ch = *src;
      *dst++ = '%';
      *dst++ = XNUM_TO_DIGIT (ch >> 4);
      *dst++ = XNUM_TO_DIGIT (ch & 0xf);
    }

  *dst = '\0';
  assert (dst - result == dstlen);
  return result;
}
421
422/* Returns the scheme type if the scheme is supported, or
423   SCHEME_INVALID if not.  */
424
425enum url_scheme
426url_scheme (const char *url)
427{
428  int i;
429
430  for (i = 0; supported_schemes[i].leading_string; i++)
431    if (0 == strncasecmp (url, supported_schemes[i].leading_string,
432                          strlen (supported_schemes[i].leading_string)))
433      {
434        if (!(supported_schemes[i].flags & scm_disabled))
435          return (enum url_scheme) i;
436        else
437          return SCHEME_INVALID;
438      }
439
440  return SCHEME_INVALID;
441}
442
#define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')

/* Return true if URL begins with something that looks like a scheme,
   false otherwise.  "Looks like a scheme" means a non-empty run of
   [-+a-zA-Z0-9] immediately followed by ':'.  Whether the scheme is
   one Wget supports is not checked here.  */

bool
url_has_scheme (const char *url)
{
  const char *p = url;

  /* Walk over the run of scheme characters.  The NUL terminator is
     not a scheme character, but is tested explicitly anyway.  */
  while (*p && SCHEME_CHAR (*p))
    ++p;

  /* The run must be non-empty and end in ':'.  */
  return p != url && *p == ':';
}
464
465bool
466url_valid_scheme (const char *url)
467{
468  enum url_scheme scheme = url_scheme (url);
469  return scheme != SCHEME_INVALID;
470}
471
472int
473scheme_default_port (enum url_scheme scheme)
474{
475  return supported_schemes[scheme].default_port;
476}
477
478void
479scheme_disable (enum url_scheme scheme)
480{
481  supported_schemes[scheme].flags |= scm_disabled;
482}
483
/* Step over a "user[:password]@" prefix, if URL has one.  URL must be
   the part of the address that follows the scheme, not the complete
   URL.

   Returns a pointer just past the '@', or URL itself when no
   credentials are present.  */

static const char *
url_skip_credentials (const char *url)
{
  /* An '@' only delimits credentials when it appears before any of
     the terminators '/', '?', '#' and ';', so find whichever of these
     characters comes first.  */
  const char *stop = strpbrk (url, "@/?#;");

  return (stop != NULL && *stop == '@') ? stop + 1 : url;
}
500
/* Split the credentials found in [BEG, END) into user name and
   password.  The region comes from a URL; both parts are URL-unescaped
   after being copied.

   On success returns true, stores a freshly allocated user name in
   *USER, and stores in *PASSWD either a freshly allocated password or
   NULL when the region contains no ':'.  Returns false, leaving *USER
   and *PASSWD untouched, when the user name is empty.  */

static bool
parse_credentials (const char *beg, const char *end, char **user, char **passwd)
{
  const char *sep;
  const char *user_end = end;

  if (beg == end)
    return false;               /* nothing at all */

  sep = memchr (beg, ':', end - beg);
  if (sep == beg)
    return false;               /* ":password" with no user name */

  if (sep == NULL)
    *passwd = NULL;
  else
    {
      /* The user name stops at the first colon; the rest is the
         password.  */
      user_end = sep;
      *passwd = strdupdelim (sep + 1, end);
      url_unescape (*passwd);
    }

  *user = strdupdelim (beg, user_end);
  url_unescape (*user);
  return true;
}
532
533/* Used by main.c: detect URLs written using the "shorthand" URL forms
534   originally popularized by Netscape and NcFTP.  HTTP shorthands look
535   like this:
536
537   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
538   www.foo.com[:port]            -> http://www.foo.com[:port]
539
540   FTP shorthands look like this:
541
542   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
543   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
544
   If the URL need not or cannot be rewritten, return NULL.  */
546
547char *
548rewrite_shorthand_url (const char *url)
549{
550  const char *p;
551  char *ret;
552
553  if (url_scheme (url) != SCHEME_INVALID)
554    return NULL;
555
556  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
557     latter Netscape.  */
558  p = strpbrk (url, ":/");
559  if (p == url)
560    return NULL;
561
562  /* If we're looking at "://", it means the URL uses a scheme we
563     don't support, which may include "https" when compiled without
564     SSL support.  Don't bogusly rewrite such URLs.  */
565  if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
566    return NULL;
567
568  if (p && *p == ':')
569    {
570      /* Colon indicates ftp, as in foo.bar.com:path.  Check for
571         special case of http port number ("localhost:10000").  */
572      int digits = strspn (p + 1, "0123456789");
573      if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
574        goto http;
575
576      /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
577      ret = aprintf ("ftp://%s", url);
578      ret[6 + (p - url)] = '/';
579    }
580  else
581    {
582    http:
583      /* Just prepend "http://" to URL. */
584      ret = aprintf ("http://%s", url);
585    }
586  return ret;
587}
588
589static void split_path (const char *, char **, char **);
590
/* Find the first character of S that also occurs in ACCEPT.  Unlike
   strpbrk, which returns NULL when there is no such character, this
   returns a pointer to the terminating zero of S (end-of-string,
   "eos"), so the result is never NULL.  */

static inline char *
strpbrk_or_eos (const char *s, const char *accept)
{
  /* strcspn measures the leading run of S that contains no ACCEPT
     characters; that run ends at the first match or at the NUL.  */
  return (char *) s + strcspn (s, accept);
}
603
/* Convert the upper-case characters of STR to lower case, in place.
   Returns true if at least one character was changed, false if STR
   was left as it was.  */

static bool
lowercase_str (char *str)
{
  bool any_changed = false;
  char *p = str;

  while (*p)
    {
      if (c_isupper (*p))
        {
          *p = c_tolower (*p);
          any_changed = true;
        }
      p++;
    }
  return any_changed;
}
619
620static const char *
621init_seps (enum url_scheme scheme)
622{
623  static char seps[8] = ":/";
624  char *p = seps + 2;
625  int flags = supported_schemes[scheme].flags;
626
627  if (flags & scm_has_params)
628    *p++ = ';';
629  if (flags & scm_has_query)
630    *p++ = '?';
631  if (flags & scm_has_fragment)
632    *p++ = '#';
633  *p = '\0';
634  return seps;
635}
636
/* Messages for the error codes url_parse reports.  Each PE_* macro is
   defined right before the message it indexes, so the macro values
   and the array positions have to stay in step.  url_error looks the
   message up by code and passes it through _() for translation.  Only
   the PE_UNSUPPORTED_SCHEME message carries a format token; url_error
   formats that one with the scheme name.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme %s"), /* support for format token only here */
#define PE_MISSING_SCHEME               2
  N_("Scheme missing"),
#define PE_INVALID_HOST_NAME            3
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              4
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            5
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    6
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           7
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         8
  N_("Invalid IPv6 numeric address")
};
657
658/* Parse a URL.
659
660   Return a new struct url if successful, NULL on error.  In case of
661   error, and if ERROR is not NULL, also set *ERROR to the appropriate
662   error code. */
663struct url *
664url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
665{
666  struct url *u;
667  const char *p;
668  bool path_modified, host_modified;
669
670  enum url_scheme scheme;
671  const char *seps;
672
673  const char *uname_b,     *uname_e;
674  const char *host_b,      *host_e;
675  const char *path_b,      *path_e;
676  const char *params_b,    *params_e;
677  const char *query_b,     *query_e;
678  const char *fragment_b,  *fragment_e;
679
680  int port;
681  char *user = NULL, *passwd = NULL;
682
683  const char *url_encoded = NULL;
684
685  int error_code;
686
687  scheme = url_scheme (url);
688  if (scheme == SCHEME_INVALID)
689    {
690      if (url_has_scheme (url))
691        error_code = PE_UNSUPPORTED_SCHEME;
692      else
693        error_code = PE_MISSING_SCHEME;
694      goto error;
695    }
696
697  url_encoded = url;
698
699  if (iri && iri->utf8_encode)
700    {
701      char *new_url = NULL;
702
703      iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
704      if (!iri->utf8_encode)
705        new_url = NULL;
706      else
707        {
708          iri->orig_url = xstrdup (url);
709          url_encoded = reencode_escapes (new_url);
710          if (url_encoded != new_url)
711            xfree (new_url);
712          percent_encode = false;
713        }
714    }
715
716  if (percent_encode)
717    url_encoded = reencode_escapes (url);
718
719  p = url_encoded;
720  p += strlen (supported_schemes[scheme].leading_string);
721  uname_b = p;
722  p = url_skip_credentials (p);
723  uname_e = p;
724
725  /* scheme://user:pass@host[:port]... */
726  /*                    ^              */
727
728  /* We attempt to break down the URL into the components path,
729     params, query, and fragment.  They are ordered like this:
730
731       scheme://host[:port][/path][;params][?query][#fragment]  */
732
733  path_b     = path_e     = NULL;
734  params_b   = params_e   = NULL;
735  query_b    = query_e    = NULL;
736  fragment_b = fragment_e = NULL;
737
738  /* Initialize separators for optional parts of URL, depending on the
739     scheme.  For example, FTP has params, and HTTP and HTTPS have
740     query string and fragment. */
741  seps = init_seps (scheme);
742
743  host_b = p;
744
745  if (*p == '[')
746    {
747      /* Handle IPv6 address inside square brackets.  Ideally we'd
748         just look for the terminating ']', but rfc2732 mandates
749         rejecting invalid IPv6 addresses.  */
750
751      /* The address begins after '['. */
752      host_b = p + 1;
753      host_e = strchr (host_b, ']');
754
755      if (!host_e)
756        {
757          error_code = PE_UNTERMINATED_IPV6_ADDRESS;
758          goto error;
759        }
760
761#ifdef ENABLE_IPV6
762      /* Check if the IPv6 address is valid. */
763      if (!is_valid_ipv6_address(host_b, host_e))
764        {
765          error_code = PE_INVALID_IPV6_ADDRESS;
766          goto error;
767        }
768
769      /* Continue parsing after the closing ']'. */
770      p = host_e + 1;
771#else
772      error_code = PE_IPV6_NOT_SUPPORTED;
773      goto error;
774#endif
775
776      /* The closing bracket must be followed by a separator or by the
777         null char.  */
778      /* http://[::1]... */
779      /*             ^   */
780      if (!strchr (seps, *p))
781        {
782          /* Trailing garbage after []-delimited IPv6 address. */
783          error_code = PE_INVALID_HOST_NAME;
784          goto error;
785        }
786    }
787  else
788    {
789      p = strpbrk_or_eos (p, seps);
790      host_e = p;
791    }
792  ++seps;                       /* advance to '/' */
793
794  if (host_b == host_e)
795    {
796      error_code = PE_INVALID_HOST_NAME;
797      goto error;
798    }
799
800  port = scheme_default_port (scheme);
801  if (*p == ':')
802    {
803      const char *port_b, *port_e, *pp;
804
805      /* scheme://host:port/tralala */
806      /*              ^             */
807      ++p;
808      port_b = p;
809      p = strpbrk_or_eos (p, seps);
810      port_e = p;
811
812      /* Allow empty port, as per rfc2396. */
813      if (port_b != port_e)
814        for (port = 0, pp = port_b; pp < port_e; pp++)
815          {
816            if (!c_isdigit (*pp))
817              {
818                /* http://host:12randomgarbage/blah */
819                /*               ^                  */
820                error_code = PE_BAD_PORT_NUMBER;
821                goto error;
822              }
823            port = 10 * port + (*pp - '0');
824            /* Check for too large port numbers here, before we have
825               a chance to overflow on bogus port values.  */
826            if (port > 0xffff)
827              {
828                error_code = PE_BAD_PORT_NUMBER;
829                goto error;
830              }
831          }
832    }
833  /* Advance to the first separator *after* '/' (either ';' or '?',
834     depending on the scheme).  */
835  ++seps;
836
837  /* Get the optional parts of URL, each part being delimited by
838     current location and the position of the next separator.  */
839#define GET_URL_PART(sepchar, var) do {                         \
840  if (*p == sepchar)                                            \
841    var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
842  ++seps;                                                       \
843} while (0)
844
845  GET_URL_PART ('/', path);
846  if (supported_schemes[scheme].flags & scm_has_params)
847    GET_URL_PART (';', params);
848  if (supported_schemes[scheme].flags & scm_has_query)
849    GET_URL_PART ('?', query);
850  if (supported_schemes[scheme].flags & scm_has_fragment)
851    GET_URL_PART ('#', fragment);
852
853#undef GET_URL_PART
854  assert (*p == 0);
855
856  if (uname_b != uname_e)
857    {
858      /* http://user:pass@host */
859      /*        ^         ^    */
860      /*     uname_b   uname_e */
861      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
862        {
863          error_code = PE_INVALID_USER_NAME;
864          goto error;
865        }
866    }
867
868  u = xnew0 (struct url);
869  u->scheme = scheme;
870  u->host   = strdupdelim (host_b, host_e);
871  u->port   = port;
872  u->user   = user;
873  u->passwd = passwd;
874
875  u->path = strdupdelim (path_b, path_e);
876  path_modified = path_simplify (scheme, u->path);
877  split_path (u->path, &u->dir, &u->file);
878
879  host_modified = lowercase_str (u->host);
880
881  /* Decode %HH sequences in host name.  This is important not so much
882     to support %HH sequences in host names (which other browser
883     don't), but to support binary characters (which will have been
884     converted to %HH by reencode_escapes).  */
885  if (strchr (u->host, '%'))
886    {
887      url_unescape (u->host);
888      host_modified = true;
889
890      /* Apply IDNA regardless of iri->utf8_encode status */
891      if (opt.enable_iri && iri)
892        {
893          char *new = idn_encode (iri, u->host);
894          if (new)
895            {
896              xfree (u->host);
897              u->host = new;
898              host_modified = true;
899            }
900        }
901    }
902
903  if (params_b)
904    u->params = strdupdelim (params_b, params_e);
905  if (query_b)
906    u->query = strdupdelim (query_b, query_e);
907  if (fragment_b)
908    u->fragment = strdupdelim (fragment_b, fragment_e);
909
910  if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
911    {
912      /* If we suspect that a transformation has rendered what
913         url_string might return different from URL_ENCODED, rebuild
914         u->url using url_string.  */
915      u->url = url_string (u, URL_AUTH_SHOW);
916
917      if (url_encoded != url)
918        xfree ((char *) url_encoded);
919    }
920  else
921    {
922      if (url_encoded == url)
923        u->url = xstrdup (url);
924      else
925        u->url = (char *) url_encoded;
926    }
927
928  return u;
929
930 error:
931  /* Cleanup in case of error: */
932  if (url_encoded && url_encoded != url)
933    xfree ((char *) url_encoded);
934
935  /* Transmit the error code to the caller, if the caller wants to
936     know.  */
937  if (error)
938    *error = error_code;
939  return NULL;
940}
941
942/* Return the error message string from ERROR_CODE, which should have
943   been retrieved from url_parse.  The error message is translated.  */
944
945char *
946url_error (const char *url, int error_code)
947{
948  assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
949
950  if (error_code == PE_UNSUPPORTED_SCHEME)
951    {
952      char *error, *p;
953      char *scheme = xstrdup (url);
954      assert (url_has_scheme (url));
955
956      if ((p = strchr (scheme, ':')))
957        *p = '\0';
958      if (!strcasecmp (scheme, "https"))
959        error = aprintf (_("HTTPS support not compiled in"));
960      else
961        error = aprintf (_(parse_errors[error_code]), quote (scheme));
962      xfree (scheme);
963
964      return error;
965    }
966  else
967    return xstrdup (_(parse_errors[error_code]));
968}
969
/* Break PATH, the still URL-escaped path of a URL, into its directory
   and file parts and store freshly allocated copies in *DIR and *FILE.

   The cut is made at the last slash: what precedes it is the
   directory, what follows it is the file.  Both halves are
   URL-unescaped only after the cut, so an escaped slash never acts as
   a separator.  Examples:

   PATH                 DIR           FILE
   "foo/bar/baz"        "foo/bar"     "baz"
   "foo/bar/"           "foo/bar"     ""
   "foo"                ""            "foo"
   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)  */

static void
split_path (const char *path, char **dir, char **file)
{
  const char *sep = strrchr (path, '/');

  if (sep)
    {
      /* Directory is [path, sep); file is everything after SEP.  */
      *dir = strdupdelim (path, sep);
      *file = xstrdup (sep + 1);
    }
  else
    {
      /* No slash at all: the whole path is the file name.  */
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }

  url_unescape (*dir);
  url_unescape (*file);
}
1002
1003/* Note: URL's "full path" is the path with the query string and
1004   params appended.  The "fragment" (#foo) is intentionally ignored,
1005   but that might be changed.  For example, if the original URL was
1006   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
1007   the full path will be "/foo/bar/baz;bullshit?querystring".  */
1008
1009/* Return the length of the full path, without the terminating
1010   zero.  */
1011
1012static int
1013full_path_length (const struct url *url)
1014{
1015  int len = 0;
1016
1017#define FROB(el) if (url->el) len += 1 + strlen (url->el)
1018
1019  FROB (path);
1020  FROB (params);
1021  FROB (query);
1022
1023#undef FROB
1024
1025  return len;
1026}
1027
1028/* Write out the full path. */
1029
1030static void
1031full_path_write (const struct url *url, char *where)
1032{
1033#define FROB(el, chr) do {                      \
1034  char *f_el = url->el;                         \
1035  if (f_el) {                                   \
1036    int l = strlen (f_el);                      \
1037    *where++ = chr;                             \
1038    memcpy (where, f_el, l);                    \
1039    where += l;                                 \
1040  }                                             \
1041} while (0)
1042
1043  FROB (path, '/');
1044  FROB (params, ';');
1045  FROB (query, '?');
1046
1047#undef FROB
1048}
1049
/* Public entry point for obtaining the "full path" of URL as a newly
   allocated, zero-terminated string.  For instance, with u->path
   "foo/bar" and u->query "param=value", the result is
   "/foo/bar?param=value". */

char *
url_full_path (const struct url *url)
{
  const int len = full_path_length (url);
  char *result = xmalloc (len + 1);

  /* full_path_write does not terminate the string; do it here.  */
  full_path_write (url, result);
  result[len] = '\0';

  return result;
}
1065
/* Decode, in place, every "%XY" sequence in STR whose two hex digits
   are the ones XNUM_TO_DIGIT yields for CHR; all other text, including
   other escapes, is copied through untouched.  Used to selectively
   undo the escaping of certain characters, such as "/" and ":".
   Nothing is returned; STR can only get shorter.  */

static void
unescape_single_char (char *str, char chr)
{
  const char hi = XNUM_TO_DIGIT (chr >> 4);
  const char lo = XNUM_TO_DIGIT (chr & 0xf);
  char *src = str;              /* read position */
  char *dst = str;              /* write position, never ahead of SRC */

  while (*src)
    {
      if (src[0] == '%' && src[1] == hi && src[2] == lo)
        {
          /* Collapse the three-character escape into CHR itself.  */
          *dst++ = chr;
          src += 3;
        }
      else
        *dst++ = *src++;
    }
  *dst = '\0';
}
1089
1090/* Escape unsafe and reserved characters, except for the slash
1091   characters.  */
1092
1093static char *
1094url_escape_dir (const char *dir)
1095{
1096  char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1097  if (newdir == dir)
1098    return (char *)dir;
1099
1100  unescape_single_char (newdir, '/');
1101  return newdir;
1102}
1103
1104/* Sync u->path and u->url with u->dir and u->file.  Called after
1105   u->file or u->dir have been changed, typically by the FTP code.  */
1106
1107static void
1108sync_path (struct url *u)
1109{
1110  char *newpath, *efile, *edir;
1111
1112  xfree (u->path);
1113
1114  /* u->dir and u->file are not escaped.  URL-escape them before
1115     reassembling them into u->path.  That way, if they contain
1116     separators like '?' or even if u->file contains slashes, the
1117     path will be correctly assembled.  (u->file can contain slashes
1118     if the URL specifies it with %2f, or if an FTP server returns
1119     it.)  */
1120  edir = url_escape_dir (u->dir);
1121  efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1122
1123  if (!*edir)
1124    newpath = xstrdup (efile);
1125  else
1126    {
1127      int dirlen = strlen (edir);
1128      int filelen = strlen (efile);
1129
1130      /* Copy "DIR/FILE" to newpath. */
1131      char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1132      memcpy (p, edir, dirlen);
1133      p += dirlen;
1134      *p++ = '/';
1135      memcpy (p, efile, filelen);
1136      p += filelen;
1137      *p = '\0';
1138    }
1139
1140  u->path = newpath;
1141
1142  if (edir != u->dir)
1143    xfree (edir);
1144  if (efile != u->file)
1145    xfree (efile);
1146
1147  /* Regenerate u->url as well.  */
1148  xfree (u->url);
1149  u->url = url_string (u, URL_AUTH_SHOW);
1150}
1151
1152/* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1153   This way we can sync u->path and u->url when they get changed.  */
1154
1155void
1156url_set_dir (struct url *url, const char *newdir)
1157{
1158  xfree (url->dir);
1159  url->dir = xstrdup (newdir);
1160  sync_path (url);
1161}
1162
1163void
1164url_set_file (struct url *url, const char *newfile)
1165{
1166  xfree (url->file);
1167  url->file = xstrdup (newfile);
1168  sync_path (url);
1169}
1170
1171void
1172url_free (struct url *url)
1173{
1174  xfree (url->host);
1175  xfree (url->path);
1176  xfree (url->url);
1177
1178  xfree_null (url->params);
1179  xfree_null (url->query);
1180  xfree_null (url->fragment);
1181  xfree_null (url->user);
1182  xfree_null (url->passwd);
1183
1184  xfree (url->dir);
1185  xfree (url->file);
1186
1187  xfree (url);
1188}
1189
1190/* Create all the necessary directories for PATH (a file).  Calls
1191   make_directory internally.  */
1192int
1193mkalldirs (const char *path)
1194{
1195  const char *p;
1196  char *t;
1197  struct_stat st;
1198  int res;
1199
1200  p = path + strlen (path);
1201  for (; *p != '/' && p != path; p--)
1202    ;
1203
1204  /* Don't create if it's just a file.  */
1205  if ((p == path) && (*p != '/'))
1206    return 0;
1207  t = strdupdelim (path, p);
1208
1209  /* Check whether the directory exists.  */
1210  if ((stat (t, &st) == 0))
1211    {
1212      if (S_ISDIR (st.st_mode))
1213        {
1214          xfree (t);
1215          return 0;
1216        }
1217      else
1218        {
1219          /* If the dir exists as a file name, remove it first.  This
1220             is *only* for Wget to work with buggy old CERN http
1221             servers.  Here is the scenario: When Wget tries to
1222             retrieve a directory without a slash, e.g.
1223             http://foo/bar (bar being a directory), CERN server will
1224             not redirect it too http://foo/bar/ -- it will generate a
1225             directory listing containing links to bar/file1,
1226             bar/file2, etc.  Wget will lose because it saves this
1227             HTML listing to a file `bar', so it cannot create the
1228             directory.  To work around this, if the file of the same
1229             name exists, we just remove it and create the directory
1230             anyway.  */
1231          DEBUGP (("Removing %s because of directory danger!\n", t));
1232          unlink (t);
1233        }
1234    }
1235  res = make_directory (t);
1236  if (res != 0)
1237    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1238  xfree (t);
1239  return res;
1240}
1241
1242/* Functions for constructing the file name out of URL components.  */
1243
1244/* A growable string structure, used by url_file_name and friends.
1245   This should perhaps be moved to utils.c.
1246
1247   The idea is to have a convenient and efficient way to construct a
1248   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass a pointer to this struct.
1251
1252   Functions that write to the members in this struct must make sure
1253   that base remains null terminated by calling append_null().
1254   */
1255
struct growable {
  char *base; /* start of the buffer holding the string */
  int size;   /* memory allocated for BASE */
  int tail;   /* string length; base + tail is where the next append goes */
};
1261
/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.

   G is evaluated once.  APPEND_SIZE is parenthesized so that an
   argument containing low-precedence operators (e.g. "a ? b : c")
   is added to the tail as a whole.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string, i.e. a pointer to the
   character just past its current contents. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters.  Note that R is
   not protected against multiple evaluation by TAIL and TAIL_INCR
   taken together, so pass a side-effect-free expression. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1276
1277
1278/* Append NULL to DEST. */
1279static void
1280append_null (struct growable *dest)
1281{
1282  GROW (dest, 1);
1283  *TAIL (dest) = 0;
1284}
1285
1286/* Append CH to DEST. */
1287static void
1288append_char (char ch, struct growable *dest)
1289{
1290  if (ch)
1291    {
1292      GROW (dest, 1);
1293      *TAIL (dest) = ch;
1294      TAIL_INCR (dest, 1);
1295    }
1296
1297  append_null (dest);
1298}
1299
1300/* Append the string STR to DEST. */
1301static void
1302append_string (const char *str, struct growable *dest)
1303{
1304  int l = strlen (str);
1305
1306  if (l)
1307    {
1308      GROW (dest, l);
1309      memcpy (TAIL (dest), str, l);
1310      TAIL_INCR (dest, l);
1311    }
1312
1313  append_null (dest);
1314}
1315
1316
/* Bit flags recorded per character in filechr_table and combined into
   the MASK argument of FILE_CHAR_TEST.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character; the table flags
                                   0-31, 127 (DEL) and 128-159 */
};
1322
/* Non-zero if character C has to be quoted in a file name: either
   opt.restrict_files_nonascii is set and C is not an ASCII character,
   or filechr_table flags C with at least one of the bits in MASK.  C
   is converted to unsigned char before use, so plain (possibly
   signed) chars are fine.  C is evaluated more than once.  */
#define FILE_CHAR_TEST(c, mask) \
    ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
    (filechr_table[(unsigned char)(c)] & (mask)))
1326
/* Shorthands for the table; they are #undef'd right after it. */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above),
   indexed by the character's value as an unsigned char.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 160-175 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 176-191 */

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 192-207 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 208-223 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 224-239 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 240-255 */
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1376
/* FN_PORT_SEP is the separator between host and port in file names
   for non-standard port numbers.  On Unix this is normally ':', as in
   "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
   because Windows can't handle ':' in file names.  The choice is made
   at run time from opt.restrict_files_os.  */
#define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')

/* FN_QUERY_SEP is the separator between the file name and the URL
   query, normally '?'.  Since Windows cannot handle '?' as part of
   file name, we use '@' instead there.  */
#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@')
/* FN_QUERY_SEP_STR is the same separator as a one-character string,
   for callers that concatenate strings.  */
#define FN_QUERY_SEP_STR (opt.restrict_files_os != restrict_windows ? "?" : "@")
1388
1389/* Quote path element, characters in [b, e), as file name, and append
1390   the quoted string to DEST.  Each character is quoted as per
1391   file_unsafe_char and the corresponding table.
1392
1393   If ESCAPED is true, the path element is considered to be
1394   URL-escaped and will be unescaped prior to inspection.  */
1395
1396static void
1397append_uri_pathel (const char *b, const char *e, bool escaped,
1398                   struct growable *dest)
1399{
1400  const char *p;
1401  int quoted, outlen;
1402
1403  int mask;
1404  if (opt.restrict_files_os == restrict_unix)
1405    mask = filechr_not_unix;
1406  else
1407    mask = filechr_not_windows;
1408  if (opt.restrict_files_ctrl)
1409    mask |= filechr_control;
1410
1411  /* Copy [b, e) to PATHEL and URL-unescape it. */
1412  if (escaped)
1413    {
1414      char *unescaped;
1415      BOUNDED_TO_ALLOCA (b, e, unescaped);
1416      url_unescape (unescaped);
1417      b = unescaped;
1418      e = unescaped + strlen (unescaped);
1419    }
1420
1421  /* Defang ".." when found as component of path.  Remember that path
1422     comes from the URL and might contain malicious input.  */
1423  if (e - b == 2 && b[0] == '.' && b[1] == '.')
1424    {
1425      b = "%2E%2E";
1426      e = b + 6;
1427    }
1428
1429  /* Walk the PATHEL string and check how many characters we'll need
1430     to quote.  */
1431  quoted = 0;
1432  for (p = b; p < e; p++)
1433    if (FILE_CHAR_TEST (*p, mask))
1434      ++quoted;
1435
1436  /* Calculate the length of the output string.  e-b is the input
1437     string length.  Each quoted char introduces two additional
1438     characters in the string, hence 2*quoted.  */
1439  outlen = (e - b) + (2 * quoted);
1440  GROW (dest, outlen);
1441
1442  if (!quoted)
1443    {
1444      /* If there's nothing to quote, we can simply append the string
1445         without processing it again.  */
1446      memcpy (TAIL (dest), b, outlen);
1447    }
1448  else
1449    {
1450      char *q = TAIL (dest);
1451      for (p = b; p < e; p++)
1452        {
1453          if (!FILE_CHAR_TEST (*p, mask))
1454            *q++ = *p;
1455          else
1456            {
1457              unsigned char ch = *p;
1458              *q++ = '%';
1459              *q++ = XNUM_TO_DIGIT (ch >> 4);
1460              *q++ = XNUM_TO_DIGIT (ch & 0xf);
1461            }
1462        }
1463      assert (q - TAIL (dest) == outlen);
1464    }
1465
1466  /* Perform inline case transformation if required.  */
1467  if (opt.restrict_files_case == restrict_lowercase
1468      || opt.restrict_files_case == restrict_uppercase)
1469    {
1470      char *q;
1471      for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1472        {
1473          if (opt.restrict_files_case == restrict_lowercase)
1474            *q = c_tolower (*q);
1475          else
1476            *q = c_toupper (*q);
1477        }
1478    }
1479
1480  TAIL_INCR (dest, outlen);
1481  append_null (dest);
1482}
1483
1484/* Append to DEST the directory structure that corresponds the
1485   directory part of URL's path.  For example, if the URL is
1486   http://server/dir1/dir2/file, this appends "/dir1/dir2".
1487
1488   Each path element ("dir1" and "dir2" in the above example) is
1489   examined, url-unescaped, and re-escaped as file name element.
1490
1491   Additionally, it cuts as many directories from the path as
1492   specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1493   will produce "bar" for the above example.  For 2 or more, it will
1494   produce "".
1495
1496   Each component of the path is quoted for use as file name.  */
1497
1498static void
1499append_dir_structure (const struct url *u, struct growable *dest)
1500{
1501  char *pathel, *next;
1502  int cut = opt.cut_dirs;
1503
1504  /* Go through the path components, de-URL-quote them, and quote them
1505     (if necessary) as file names.  */
1506
1507  pathel = u->path;
1508  for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1509    {
1510      if (cut-- > 0)
1511        continue;
1512      if (pathel == next)
1513        /* Ignore empty pathels.  */
1514        continue;
1515
1516      if (dest->tail)
1517        append_char ('/', dest);
1518      append_uri_pathel (pathel, next, true, dest);
1519    }
1520}
1521
1522/* Return a unique file name that matches the given URL as well as
1523   possible.  Does not create directories on the file system.  */
1524
1525char *
1526url_file_name (const struct url *u, char *replaced_filename)
1527{
1528  struct growable fnres;        /* stands for "file name result" */
1529  struct growable temp_fnres;
1530
1531  const char *u_file;
1532  char *fname, *unique, *fname_len_check;
1533  const char *index_filename = "index.html"; /* The default index file is index.html */
1534  size_t max_length;
1535
1536  fnres.base = NULL;
1537  fnres.size = 0;
1538  fnres.tail = 0;
1539
1540  temp_fnres.base = NULL;
1541  temp_fnres.size = 0;
1542  temp_fnres.tail = 0;
1543
1544  /* If an alternative index file was defined, change index_filename */
1545  if (opt.default_page)
1546    index_filename = opt.default_page;
1547
1548
1549  /* Start with the directory prefix, if specified. */
1550  if (opt.dir_prefix)
1551    append_string (opt.dir_prefix, &fnres);
1552
1553  /* If "dirstruct" is turned on (typically the case with -r), add
1554     the host and port (unless those have been turned off) and
1555     directory structure.  */
1556  if (opt.dirstruct)
1557    {
1558      if (opt.protocol_directories)
1559        {
1560          if (fnres.tail)
1561            append_char ('/', &fnres);
1562          append_string (supported_schemes[u->scheme].name, &fnres);
1563        }
1564      if (opt.add_hostdir)
1565        {
1566          if (fnres.tail)
1567            append_char ('/', &fnres);
1568          if (0 != strcmp (u->host, ".."))
1569            append_string (u->host, &fnres);
1570          else
1571            /* Host name can come from the network; malicious DNS may
1572               allow ".." to be resolved, causing us to write to
1573               "../<file>".  Defang such host names.  */
1574            append_string ("%2E%2E", &fnres);
1575          if (u->port != scheme_default_port (u->scheme))
1576            {
1577              char portstr[24];
1578              number_to_string (portstr, u->port);
1579              append_char (FN_PORT_SEP, &fnres);
1580              append_string (portstr, &fnres);
1581            }
1582        }
1583
1584      append_dir_structure (u, &fnres);
1585    }
1586
1587  if (!replaced_filename)
1588    {
1589      /* Create the filename. */
1590      u_file = *u->file ? u->file : index_filename;
1591
1592      /* Append "?query" to the file name, even if empty,
1593       * and create fname_len_check. */
1594      if (u->query)
1595        fname_len_check = concat_strings (u_file, FN_QUERY_SEP_STR, u->query, NULL);
1596      else
1597        fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
1598    }
1599  else
1600    {
1601      u_file = replaced_filename;
1602      fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
1603    }
1604
1605  append_uri_pathel (fname_len_check,
1606    fname_len_check + strlen (fname_len_check), false, &temp_fnres);
1607
1608  /* Zero-terminate the temporary file name. */
1609  append_char ('\0', &temp_fnres);
1610
1611  /* Check that the length of the file name is acceptable. */
1612#ifdef WINDOWS
1613  if (MAX_PATH > (fnres.tail + CHOMP_BUFFER + 2))
1614    {
1615      max_length = MAX_PATH - (fnres.tail + CHOMP_BUFFER + 2);
1616      /* FIXME: In Windows a filename is usually limited to 255 characters.
1617      To really be accurate you could call GetVolumeInformation() to get
1618      lpMaximumComponentLength
1619      */
1620      if (max_length > 255)
1621        {
1622          max_length = 255;
1623        }
1624    }
1625  else
1626    {
1627      max_length = 0;
1628    }
1629#else
1630  max_length = get_max_length (fnres.base, fnres.tail, _PC_NAME_MAX) - CHOMP_BUFFER;
1631#endif
1632  if (max_length > 0 && strlen (temp_fnres.base) > max_length)
1633    {
1634      logprintf (LOG_NOTQUIET, "The name is too long, %lu chars total.\n",
1635          (unsigned long) strlen (temp_fnres.base));
1636      logprintf (LOG_NOTQUIET, "Trying to shorten...\n");
1637
1638      /* Shorten the file name. */
1639      temp_fnres.base[max_length] = '\0';
1640
1641      logprintf (LOG_NOTQUIET, "New name is %s.\n", temp_fnres.base);
1642    }
1643
1644  free (fname_len_check);
1645
1646  /* The filename has already been 'cleaned' by append_uri_pathel() above.  So,
1647   * just append it. */
1648  if (fnres.tail)
1649    append_char ('/', &fnres);
1650  append_string (temp_fnres.base, &fnres);
1651
1652  fname = fnres.base;
1653
1654  /* Make a final check that the path length is acceptable? */
1655  /* TODO: check fnres.base for path length problem */
1656
1657  free (temp_fnres.base);
1658
1659  /* Check the cases in which the unique extensions are not used:
1660     1) Clobbering is turned off (-nc).
1661     2) Retrieval with regetting.
1662     3) Timestamping is used.
1663     4) Hierarchy is built.
1664     5) Backups are specified.
1665
1666     The exception is the case when file does exist and is a
1667     directory (see `mkalldirs' for explanation).  */
1668
1669  if (ALLOW_CLOBBER
1670      && !(file_exists_p (fname) && !file_non_directory_p (fname)))
1671    {
1672      unique = fname;
1673    }
1674  else
1675    {
1676      unique = unique_name (fname, true);
1677      if (unique != fname)
1678        xfree (fname);
1679    }
1680
1681/* On VMS, alter the name as required. */
1682#ifdef __VMS
1683  {
1684    char *unique2;
1685
1686    unique2 = ods_conform( unique);
1687    if (unique2 != unique)
1688      {
1689        xfree (unique);
1690        unique = unique2;
1691      }
1692  }
1693#endif /* def __VMS */
1694
1695  return unique;
1696}
1697
1698/* Resolve "." and ".." elements of PATH by destructively modifying
1699   PATH and return true if PATH has been modified, false otherwise.
1700
1701   The algorithm is in spirit similar to the one described in rfc1808,
1702   although implemented differently, in one pass.  To recap, path
1703   elements containing only "." are removed, and ".." is taken to mean
1704   "back up one element".  Single leading and trailing slashes are
1705   preserved.
1706
1707   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
1708   test examples are provided below.  If you change anything in this
1709   function, run test_path_simplify to make sure you haven't broken a
1710   test case.  */
1711
static bool
path_simplify (enum url_scheme scheme, char *path)
{
  char *h = path;               /* hare: read position */
  char *t = path;               /* tortoise: write position */
  char *beg = path;             /* tortoise never retreats before this */
  char *end = strchr (path, '\0');

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./" (or a final "."): advance the hare only, so
             the element is dropped from the output.  */
          h += 2;
        }
      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past beginning.  */
          if (t > beg)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
            }
          else if (scheme == SCHEME_FTP)
            {
              /* If we're at the beginning, copy the "../" literally
                 and move the beginning so a later ".." doesn't remove
                 it.  This violates RFC 3986; but we do it for FTP
                 anyway because there is otherwise no way to get at a
                 parent directory, when the FTP server drops us in a
                 non-root directory (which is not uncommon). */
              beg = t + 3;
              goto regular;
            }
          /* Skip over the "../" (or final "..") in the input.  */
          h += 3;
        }
      else
        {
        regular:
          /* A regular path element.  If H hasn't advanced past T,
             simply skip to the next path element.  Otherwise, copy
             the path element until the next slash.  */
          if (t == h)
            {
              /* Skip the path element, including the slash.  */
              while (h < end && *h != '/')
                t++, h++;
              if (h < end)
                t++, h++;
            }
          else
            {
              /* Copy the path element, including the final slash.  */
              while (h < end && *h != '/')
                *t++ = *h++;
              if (h < end)
                *t++ = *h++;
            }
        }
    }

  /* The two pointers differ only if something was dropped; in that
     case terminate the shortened path.  */
  if (t != h)
    *t = '\0';

  /* True exactly when PATH was modified.  */
  return t != h;
}
1783
1784/* Return the length of URL's path.  Path is considered to be
1785   terminated by one or more of the ?query or ;params or #fragment,
1786   depending on the scheme.  */
1787
1788static const char *
1789path_end (const char *url)
1790{
1791  enum url_scheme scheme = url_scheme (url);
1792  const char *seps;
1793  if (scheme == SCHEME_INVALID)
1794    scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1795  /* +2 to ignore the first two separators ':' and '/' */
1796  seps = init_seps (scheme) + 2;
1797  return strpbrk_or_eos (url, seps);
1798}
1799
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  This is a thin wrapper around memrchr,
   which is not part of ISO C; B is evaluated twice.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))
1803
1804/* Merge BASE with LINK and return the resulting URI.
1805
1806   Either of the URIs may be absolute or relative, complete with the
1807   host name, or path only.  This tries to reasonably handle all
1808   foreseeable cases.  It only employs minimal URL parsing, without
1809   knowledge of the specifics of schemes.
1810
1811   I briefly considered making this function call path_simplify after
1812   the merging process, as rfc1738 seems to suggest.  This is a bad
1813   idea for several reasons: 1) it complexifies the code, and 2)
1814   url_parse has to simplify path anyway, so it's wasteful to boot.  */
1815
1816char *
1817uri_merge (const char *base, const char *link)
1818{
1819  int linklength;
1820  const char *end;
1821  char *merge;
1822
1823  if (url_has_scheme (link))
1824    return xstrdup (link);
1825
1826  /* We may not examine BASE past END. */
1827  end = path_end (base);
1828  linklength = strlen (link);
1829
1830  if (!*link)
1831    {
1832      /* Empty LINK points back to BASE, query string and all. */
1833      return xstrdup (base);
1834    }
1835  else if (*link == '?')
1836    {
1837      /* LINK points to the same location, but changes the query
1838         string.  Examples: */
1839      /* uri_merge("path",         "?new") -> "path?new"     */
1840      /* uri_merge("path?foo",     "?new") -> "path?new"     */
1841      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
1842      /* uri_merge("path#foo",     "?new") -> "path?new"     */
1843      int baselength = end - base;
1844      merge = xmalloc (baselength + linklength + 1);
1845      memcpy (merge, base, baselength);
1846      memcpy (merge + baselength, link, linklength);
1847      merge[baselength + linklength] = '\0';
1848    }
1849  else if (*link == '#')
1850    {
1851      /* uri_merge("path",         "#new") -> "path#new"     */
1852      /* uri_merge("path#foo",     "#new") -> "path#new"     */
1853      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
1854      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
1855      int baselength;
1856      const char *end1 = strchr (base, '#');
1857      if (!end1)
1858        end1 = base + strlen (base);
1859      baselength = end1 - base;
1860      merge = xmalloc (baselength + linklength + 1);
1861      memcpy (merge, base, baselength);
1862      memcpy (merge + baselength, link, linklength);
1863      merge[baselength + linklength] = '\0';
1864    }
1865  else if (*link == '/' && *(link + 1) == '/')
1866    {
1867      /* LINK begins with "//" and so is a net path: we need to
1868         replace everything after (and including) the double slash
1869         with LINK. */
1870
1871      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
1872      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
1873      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
1874
1875      int span;
1876      const char *slash;
1877      const char *start_insert;
1878
1879      /* Look for first slash. */
1880      slash = memchr (base, '/', end - base);
1881      /* If found slash and it is a double slash, then replace
1882         from this point, else default to replacing from the
1883         beginning.  */
1884      if (slash && *(slash + 1) == '/')
1885        start_insert = slash;
1886      else
1887        start_insert = base;
1888
1889      span = start_insert - base;
1890      merge = xmalloc (span + linklength + 1);
1891      if (span)
1892        memcpy (merge, base, span);
1893      memcpy (merge + span, link, linklength);
1894      merge[span + linklength] = '\0';
1895    }
1896  else if (*link == '/')
1897    {
1898      /* LINK is an absolute path: we need to replace everything
1899         after (and including) the FIRST slash with LINK.
1900
1901         So, if BASE is "http://host/whatever/foo/bar", and LINK is
1902         "/qux/xyzzy", our result should be
1903         "http://host/qux/xyzzy".  */
1904      int span;
1905      const char *slash;
1906      const char *start_insert = NULL; /* for gcc to shut up. */
1907      const char *pos = base;
1908      bool seen_slash_slash = false;
1909      /* We're looking for the first slash, but want to ignore
1910         double slash. */
1911    again:
1912      slash = memchr (pos, '/', end - pos);
1913      if (slash && !seen_slash_slash)
1914        if (*(slash + 1) == '/')
1915          {
1916            pos = slash + 2;
1917            seen_slash_slash = true;
1918            goto again;
1919          }
1920
1921      /* At this point, SLASH is the location of the first / after
1922         "//", or the first slash altogether.  START_INSERT is the
1923         pointer to the location where LINK will be inserted.  When
1924         examining the last two examples, keep in mind that LINK
1925         begins with '/'. */
1926
1927      if (!slash && !seen_slash_slash)
1928        /* example: "foo" */
1929        /*           ^    */
1930        start_insert = base;
1931      else if (!slash && seen_slash_slash)
1932        /* example: "http://foo" */
1933        /*                     ^ */
1934        start_insert = end;
1935      else if (slash && !seen_slash_slash)
1936        /* example: "foo/bar" */
1937        /*           ^        */
1938        start_insert = base;
1939      else if (slash && seen_slash_slash)
1940        /* example: "http://something/" */
1941        /*                           ^  */
1942        start_insert = slash;
1943
1944      span = start_insert - base;
1945      merge = xmalloc (span + linklength + 1);
1946      if (span)
1947        memcpy (merge, base, span);
1948      memcpy (merge + span, link, linklength);
1949      merge[span + linklength] = '\0';
1950    }
1951  else
1952    {
1953      /* LINK is a relative URL: we need to replace everything
1954         after last slash (possibly empty) with LINK.
1955
1956         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
1957         our result should be "whatever/foo/qux/xyzzy".  */
1958      bool need_explicit_slash = false;
1959      int span;
1960      const char *start_insert;
1961      const char *last_slash = find_last_char (base, end, '/');
1962      if (!last_slash)
1963        {
1964          /* No slash found at all.  Replace what we have with LINK. */
1965          start_insert = base;
1966        }
1967      else if (last_slash && last_slash >= base + 2
1968               && last_slash[-2] == ':' && last_slash[-1] == '/')
1969        {
1970          /* example: http://host"  */
1971          /*                      ^ */
1972          start_insert = end + 1;
1973          need_explicit_slash = true;
1974        }
1975      else
1976        {
1977          /* example: "whatever/foo/bar" */
1978          /*                        ^    */
1979          start_insert = last_slash + 1;
1980        }
1981
1982      span = start_insert - base;
1983      merge = xmalloc (span + linklength + 1);
1984      if (span)
1985        memcpy (merge, base, span);
1986      if (need_explicit_slash)
1987        merge[span - 1] = '/';
1988      memcpy (merge + span, link, linklength);
1989      merge[span + linklength] = '\0';
1990    }
1991
1992  return merge;
1993}
1994
/* Copy the NUL-terminated string S, without its terminator, to the
   position P points at, then advance P past the copied bytes.

   P must be a modifiable pointer lvalue with enough room behind it;
   it is expanded twice, so it must not have side effects.  S is
   evaluated exactly once.  The temporaries carry a distinctive name
   so that they cannot capture an identifier used in the arguments
   (a plain `len' would shadow a caller's variable of that name).  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)

/* Use this instead of password when the actual password is supposed
   to be hidden.  We intentionally use a generic string without giving
   away the number of characters in the password, like previous
   versions did.  */
#define HIDDEN_PASSWORD "*password*"
2006
2007/* Recreate the URL string from the data in URL.
2008
2009   If HIDE is true (as it is when we're calling this on a URL we plan
2010   to print, but not when calling it to canonicalize a URL for use
2011   within the program), password will be hidden.  Unsafe characters in
2012   the URL will be quoted.  */
2013
2014char *
2015url_string (const struct url *url, enum url_auth_mode auth_mode)
2016{
2017  int size;
2018  char *result, *p;
2019  char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
2020
2021  int scheme_port = supported_schemes[url->scheme].default_port;
2022  const char *scheme_str = supported_schemes[url->scheme].leading_string;
2023  int fplen = full_path_length (url);
2024
2025  bool brackets_around_host;
2026
2027  assert (scheme_str != NULL);
2028
2029  /* Make sure the user name and password are quoted. */
2030  if (url->user)
2031    {
2032      if (auth_mode != URL_AUTH_HIDE)
2033        {
2034          quoted_user = url_escape_allow_passthrough (url->user);
2035          if (url->passwd)
2036            {
2037              if (auth_mode == URL_AUTH_HIDE_PASSWD)
2038                quoted_passwd = (char *) HIDDEN_PASSWORD;
2039              else
2040                quoted_passwd = url_escape_allow_passthrough (url->passwd);
2041            }
2042        }
2043    }
2044
2045  /* In the unlikely event that the host name contains non-printable
2046     characters, quote it for displaying to the user.  */
2047  quoted_host = url_escape_allow_passthrough (url->host);
2048
2049  /* Undo the quoting of colons that URL escaping performs.  IPv6
2050     addresses may legally contain colons, and in that case must be
2051     placed in square brackets.  */
2052  if (quoted_host != url->host)
2053    unescape_single_char (quoted_host, ':');
2054  brackets_around_host = strchr (quoted_host, ':') != NULL;
2055
2056  size = (strlen (scheme_str)
2057          + strlen (quoted_host)
2058          + (brackets_around_host ? 2 : 0)
2059          + fplen
2060          + 1);
2061  if (url->port != scheme_port)
2062    size += 1 + numdigit (url->port);
2063  if (quoted_user)
2064    {
2065      size += 1 + strlen (quoted_user);
2066      if (quoted_passwd)
2067        size += 1 + strlen (quoted_passwd);
2068    }
2069
2070  p = result = xmalloc (size);
2071
2072  APPEND (p, scheme_str);
2073  if (quoted_user)
2074    {
2075      APPEND (p, quoted_user);
2076      if (quoted_passwd)
2077        {
2078          *p++ = ':';
2079          APPEND (p, quoted_passwd);
2080        }
2081      *p++ = '@';
2082    }
2083
2084  if (brackets_around_host)
2085    *p++ = '[';
2086  APPEND (p, quoted_host);
2087  if (brackets_around_host)
2088    *p++ = ']';
2089  if (url->port != scheme_port)
2090    {
2091      *p++ = ':';
2092      p = number_to_string (p, url->port);
2093    }
2094
2095  full_path_write (url, p);
2096  p += fplen;
2097  *p++ = '\0';
2098
2099  assert (p - result == size);
2100
2101  if (quoted_user && quoted_user != url->user)
2102    xfree (quoted_user);
2103  if (quoted_passwd && auth_mode == URL_AUTH_SHOW
2104      && quoted_passwd != url->passwd)
2105    xfree (quoted_passwd);
2106  if (quoted_host != url->host)
2107    xfree (quoted_host);
2108
2109  return result;
2110}
2111
2112/* Return true if scheme a is similar to scheme b.
2113
2114   Schemes are similar if they are equal.  If SSL is supported, schemes
2115   are also similar if one is http (SCHEME_HTTP) and the other is https
2116   (SCHEME_HTTPS).  */
2117bool
2118schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2119{
2120  if (a == b)
2121    return true;
2122#ifdef HAVE_SSL
2123  if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2124      || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2125    return true;
2126#endif
2127  return false;
2128}
2129
/* Read one logical character from the start of the non-empty string
   STR, store it in *C, and return how many bytes of STR it occupies.

   - A "%XX" sequence with two hexadecimal digits whose decoded value
     is not a URL_RESERVED_CHAR yields the decoded character and
     consumes 3 bytes.
   - Anything else yields STR[0] itself and consumes 1 byte.  This
     covers ordinary characters, a '%' not followed by two hex digits
     (including one cut short by the end of the string), and an escape
     of a reserved character, which is deliberately left encoded so
     that e.g. "%2f" does not compare equal to "/".

   The return value is therefore always 1 or 3.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  /* The && chain stops at the first non-hex byte, so the terminating
     NUL of a truncated escape is never read past.  */
  if (str[0] == '%' && c_isxdigit (str[1]) && c_isxdigit (str[2]))
    {
      char decoded = X2DIGITS_TO_NUM (str[1], str[2]);

      if (!URL_RESERVED_CHAR (decoded))
        {
          *c = decoded;
          return 3;
        }
    }

  *c = str[0];
  return 1;
}
2167
/* Compare the URL strings U1 and U2, which must both be non-NULL.

   The strings are walked in parallel one logical character at a
   time, as produced by getchar_from_escaped_string, and characters
   are compared after folding with c_tolower.  The walk stops at the
   end of either string, at the first mismatch, or if the helper
   reports a length of zero.  The URLs are equal only if both strings
   were consumed completely.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left = u1;
  const char *right = u2;

  assert (u1 && u2);

  for (;;)
    {
      char left_ch, right_ch;
      int left_len, right_len;

      if (*left == '\0' || *right == '\0')
        break;

      left_len = getchar_from_escaped_string (left, &left_ch);
      if (left_len == 0)
        break;

      right_len = getchar_from_escaped_string (right, &right_ch);
      if (right_len == 0)
        break;

      if (c_tolower (left_ch) != c_tolower (right_ch))
        break;

      left += left_len;
      right += right_len;
    }

  return *left == '\0' && *right == '\0';
}
2190
2191#ifdef TESTING
2192/* Debugging and testing support for path_simplify. */
2193
#if 0
/* Debug: run path_simplify on a copy of PATH and return the copy,
   leaving PATH itself untouched.  Useful for calling from the
   debugger.

   path_simplify is called with the scheme as its first argument, the
   way run_test calls it.  SCHEME_HTTP is used here; the test table in
   test_path_simplify shows that the outcome for leading ".."
   components depends on the scheme, so change it when debugging FTP
   paths.  The returned string comes from xstrdup.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (SCHEME_HTTP, copy);
  return copy;
}
#endif
2205
2206static const char *
2207run_test (const char *test, const char *expected_result, enum url_scheme scheme,
2208          bool expected_change)
2209{
2210  char *test_copy = xstrdup (test);
2211  bool modified = path_simplify (scheme, test_copy);
2212
2213  if (0 != strcmp (test_copy, expected_result))
2214    {
2215      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2216              test, expected_result, test_copy);
2217      mu_assert ("", 0);
2218    }
2219  if (modified != expected_change)
2220    {
2221      if (expected_change)
2222        printf ("Expected modification with path_simplify(\"%s\").\n",
2223                test);
2224      else
2225        printf ("Expected no modification with path_simplify(\"%s\").\n",
2226                test);
2227    }
2228  xfree (test_copy);
2229  mu_assert ("", modified == expected_change);
2230  return NULL;
2231}
2232
2233const char *
2234test_path_simplify (void)
2235{
2236  static const struct {
2237    const char *test, *result;
2238    enum url_scheme scheme;
2239    bool should_modify;
2240  } tests[] = {
2241    { "",                       "",             SCHEME_HTTP, false },
2242    { ".",                      "",             SCHEME_HTTP, true },
2243    { "./",                     "",             SCHEME_HTTP, true },
2244    { "..",                     "",             SCHEME_HTTP, true },
2245    { "../",                    "",             SCHEME_HTTP, true },
2246    { "..",                     "..",           SCHEME_FTP,  false },
2247    { "../",                    "../",          SCHEME_FTP,  false },
2248    { "foo",                    "foo",          SCHEME_HTTP, false },
2249    { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2250    { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2251    { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2252    { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2253    { "foo./",                  "foo./",        SCHEME_HTTP, false },
2254    { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2255    { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2256    { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2257    { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2258    { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2259    { "foo/..",                 "",             SCHEME_HTTP, true },
2260    { "foo/../..",              "",             SCHEME_HTTP, true },
2261    { "foo/../../..",           "",             SCHEME_HTTP, true },
2262    { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2263    { "foo/../..",              "..",           SCHEME_FTP,  true },
2264    { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2265    { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2266    { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2267    { "./a/../b",               "b",            SCHEME_HTTP, true }
2268  };
2269  unsigned i;
2270
2271  for (i = 0; i < countof (tests); i++)
2272    {
2273      const char *message;
2274      const char *test = tests[i].test;
2275      const char *expected_result = tests[i].result;
2276      enum url_scheme scheme = tests[i].scheme;
2277      bool  expected_change = tests[i].should_modify;
2278
2279      message = run_test (test, expected_result, scheme, expected_change);
2280      if (message) return message;
2281    }
2282  return NULL;
2283}
2284
2285const char *
2286test_append_uri_pathel(void)
2287{
2288  unsigned i;
2289  static const struct {
2290    const char *original_url;
2291    const char *input;
2292    bool escaped;
2293    const char *expected_result;
2294  } test_array[] = {
2295    { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2296  };
2297
2298  for (i = 0; i < countof(test_array); ++i)
2299    {
2300      struct growable dest;
2301      const char *p = test_array[i].input;
2302
2303      memset (&dest, 0, sizeof (dest));
2304
2305      append_string (test_array[i].original_url, &dest);
2306      append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2307
2308      mu_assert ("test_append_uri_pathel: wrong result",
2309                 strcmp (dest.base, test_array[i].expected_result) == 0);
2310    }
2311
2312  return NULL;
2313}
2314
/* Table-driven test of are_urls_equal.

   Each entry pairs two URL strings with the verdict are_urls_equal
   is expected to give for them.  The table covers identical URLs,
   differing paths and hosts, an escaped unreserved character ("%7e"
   against "~"), URLs of different length, and an escaped slash
   ("%2f") against a literal one.  The first wrong verdict is reported
   through mu_assert; NULL is returned if every entry passes.  */
const char *
test_are_urls_equal(void)
{
  static const struct {
    const char *url1;
    const char *url2;
    bool expected_result;
  } cases[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
  };
  unsigned idx;

  for (idx = 0; idx < countof (cases); ++idx)
    {
      bool verdict = are_urls_equal (cases[idx].url1, cases[idx].url2);

      mu_assert ("test_are_urls_equal: wrong result",
                 verdict == cases[idx].expected_result);
    }

  return NULL;
}
2340
2341#endif /* TESTING */
2342
2343/*
2344 * vim: et ts=2 sw=2
2345 */
2346