1/* URL handling. 2 Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 3 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, 4 Inc. 5 6This file is part of GNU Wget. 7 8GNU Wget is free software; you can redistribute it and/or modify 9it under the terms of the GNU General Public License as published by 10the Free Software Foundation; either version 3 of the License, or (at 11your option) any later version. 12 13GNU Wget is distributed in the hope that it will be useful, 14but WITHOUT ANY WARRANTY; without even the implied warranty of 15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16GNU General Public License for more details. 17 18You should have received a copy of the GNU General Public License 19along with Wget. If not, see <http://www.gnu.org/licenses/>. 20 21Additional permission under GNU GPL version 3 section 7 22 23If you modify this program, or any covered work, by linking or 24combining it with the OpenSSL project's OpenSSL library (or a 25modified version of that library), containing parts covered by the 26terms of the OpenSSL or SSLeay licenses, the Free Software Foundation 27grants you additional permission to convey the resulting work. 28Corresponding Source for a non-source form of such a combination 29shall include the source code for the parts of OpenSSL used as well 30as that of the covered work. */ 31 32#include "wget.h" 33 34#include <stdio.h> 35#include <stdlib.h> 36#include <string.h> 37#include <unistd.h> 38#include <errno.h> 39#include <assert.h> 40 41#include "utils.h" 42#include "url.h" 43#include "host.h" /* for is_valid_ipv6_address */ 44 45#ifdef __VMS 46#include "vms.h" 47#endif /* def __VMS */ 48 49#ifdef TESTING 50#include "test.h" 51#endif 52 53enum { 54 scm_disabled = 1, /* for https when OpenSSL fails to init. */ 55 scm_has_params = 2, /* whether scheme has ;params */ 56 scm_has_query = 4, /* whether scheme has ?query */ 57 scm_has_fragment = 8 /* whether scheme has #fragment */ 58}; 59 60struct scheme_data 61{ 62 /* Short name of the scheme, such as "http" or "ftp". */ 63 const char *name; 64 /* Leading string that identifies the scheme, such as "https://". */ 65 const char *leading_string; 66 /* Default port of the scheme when none is specified. */ 67 int default_port; 68 /* Various flags. */ 69 int flags; 70}; 71 72/* Supported schemes: */ 73static struct scheme_data supported_schemes[] = 74{ 75 { "http", "http://", DEFAULT_HTTP_PORT, scm_has_query|scm_has_fragment }, 76#ifdef HAVE_SSL 77 { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment }, 78#endif 79 { "ftp", "ftp://", DEFAULT_FTP_PORT, scm_has_params|scm_has_fragment }, 80 81 /* SCHEME_INVALID */ 82 { NULL, NULL, -1, 0 } 83}; 84 85/* Forward declarations: */ 86 87static bool path_simplify (enum url_scheme, char *); 88 89/* Support for escaping and unescaping of URL strings. */ 90 91/* Table of "reserved" and "unsafe" characters. Those terms are 92 rfc1738-speak, as such largely obsoleted by rfc2396 and later 93 specs, but the general idea remains. 94 95 A reserved character is the one that you can't decode without 96 changing the meaning of the URL. For example, you can't decode 97 "/foo/%2f/bar" into "/foo///bar" because the number and contents of 98 path components is different. Non-reserved characters can be 99 changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The 100 unsafe characters are loosely based on rfc1738, plus "$" and ",", 101 as recommended by rfc2396, and minus "~", which is very frequently 102 used (and sometimes unrecognized as %7E by broken servers). 103 104 An unsafe character is the one that should be encoded when URLs are 105 placed in foreign environments. E.g. space and newline are unsafe 106 in HTTP contexts because HTTP uses them as separator and line 107 terminator, so they must be encoded to %20 and %0A respectively. 108 "*" is unsafe in shell context, etc. 109 110 We determine whether a character is unsafe through static table 111 lookup. This code assumes ASCII character set and 8-bit chars. */ 112 113enum { 114 /* rfc1738 reserved chars + "$" and ",". */ 115 urlchr_reserved = 1, 116 117 /* rfc1738 unsafe chars, plus non-printables. */ 118 urlchr_unsafe = 2 119}; 120 121#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask)) 122#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved) 123#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe) 124 125/* Shorthands for the table: */ 126#define R urlchr_reserved 127#define U urlchr_unsafe 128#define RU R|U 129 130static const unsigned char urlchr_table[256] = 131{ 132 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */ 133 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */ 134 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ 135 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */ 136 U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */ 137 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */ 138 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */ 139 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */ 140 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */ 141 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */ 142 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */ 143 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */ 144 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ 145 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ 146 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ 147 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */ 148 149 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 150 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 151 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 152 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 153 154 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 155 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 156 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 157 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 158}; 159#undef R 160#undef U 161#undef RU 162 163/* URL-unescape the string S. 164 165 This is done by transforming the sequences "%HH" to the character 166 represented by the hexadecimal digits HH. If % is not followed by 167 two hexadecimal digits, it is inserted literally. 168 169 The transformation is done in place. If you need the original 170 string intact, make a copy before calling this function. */ 171 172void 173url_unescape (char *s) 174{ 175 char *t = s; /* t - tortoise */ 176 char *h = s; /* h - hare */ 177 178 for (; *h; h++, t++) 179 { 180 if (*h != '%') 181 { 182 copychar: 183 *t = *h; 184 } 185 else 186 { 187 char c; 188 /* Do nothing if '%' is not followed by two hex digits. */ 189 if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2]))) 190 goto copychar; 191 c = X2DIGITS_TO_NUM (h[1], h[2]); 192 /* Don't unescape %00 because there is no way to insert it 193 into a C string without effectively truncating it. */ 194 if (c == '\0') 195 goto copychar; 196 *t = c; 197 h += 2; 198 } 199 } 200 *t = '\0'; 201} 202 203/* The core of url_escape_* functions. Escapes the characters that 204 match the provided mask in urlchr_table. 205 206 If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be 207 returned unchanged. If ALLOW_PASSTHROUGH is false, a freshly 208 allocated string will be returned in all cases. */ 209 210static char * 211url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough) 212{ 213 const char *p1; 214 char *p2, *newstr; 215 int newlen; 216 int addition = 0; 217 218 for (p1 = s; *p1; p1++) 219 if (urlchr_test (*p1, mask)) 220 addition += 2; /* Two more characters (hex digits) */ 221 222 if (!addition) 223 return allow_passthrough ? (char *)s : xstrdup (s); 224 225 newlen = (p1 - s) + addition; 226 newstr = xmalloc (newlen + 1); 227 228 p1 = s; 229 p2 = newstr; 230 while (*p1) 231 { 232 /* Quote the characters that match the test mask. */ 233 if (urlchr_test (*p1, mask)) 234 { 235 unsigned char c = *p1++; 236 *p2++ = '%'; 237 *p2++ = XNUM_TO_DIGIT (c >> 4); 238 *p2++ = XNUM_TO_DIGIT (c & 0xf); 239 } 240 else 241 *p2++ = *p1++; 242 } 243 assert (p2 - newstr == newlen); 244 *p2 = '\0'; 245 246 return newstr; 247} 248 249/* URL-escape the unsafe characters (see urlchr_table) in a given 250 string, returning a freshly allocated string. */ 251 252char * 253url_escape (const char *s) 254{ 255 return url_escape_1 (s, urlchr_unsafe, false); 256} 257 258/* URL-escape the unsafe and reserved characters (see urlchr_table) in 259 a given string, returning a freshly allocated string. */ 260 261char * 262url_escape_unsafe_and_reserved (const char *s) 263{ 264 return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false); 265} 266 267/* URL-escape the unsafe characters (see urlchr_table) in a given 268 string. If no characters are unsafe, S is returned. */ 269 270static char * 271url_escape_allow_passthrough (const char *s) 272{ 273 return url_escape_1 (s, urlchr_unsafe, true); 274} 275 276/* Decide whether the char at position P needs to be encoded. (It is 277 not enough to pass a single char *P because the function may need 278 to inspect the surrounding context.) 279 280 Return true if the char should be escaped as %XX, false otherwise. */ 281 282static inline bool 283char_needs_escaping (const char *p) 284{ 285 if (*p == '%') 286 { 287 if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2))) 288 return false; 289 else 290 /* Garbled %.. sequence: encode `%'. */ 291 return true; 292 } 293 else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p)) 294 return true; 295 else 296 return false; 297} 298 299/* Translate a %-escaped (but possibly non-conformant) input string S 300 into a %-escaped (and conformant) output string. If no characters 301 are encoded or decoded, return the same string S; otherwise, return 302 a freshly allocated string with the new contents. 303 304 After a URL has been run through this function, the protocols that 305 use `%' as the quote character can use the resulting string as-is, 306 while those that don't can use url_unescape to get to the intended 307 data. This function is stable: once the input is transformed, 308 further transformations of the result yield the same output. 309 310 Let's discuss why this function is needed. 311 312 Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since 313 a raw space character would mess up the HTTP request, it needs to 314 be quoted, like this: 315 316 GET /abc%20def HTTP/1.0 317 318 It would appear that the unsafe chars need to be quoted, for 319 example with url_escape. But what if we're requested to download 320 `abc%20def'? url_escape transforms "%" to "%25", which would leave 321 us with `abc%2520def'. This is incorrect -- since %-escapes are 322 part of URL syntax, "%20" is the correct way to denote a literal 323 space on the Wget command line. This leads to the conclusion that 324 in that case Wget should not call url_escape, but leave the `%20' 325 as is. This is clearly contradictory, but it only gets worse. 326 327 What if the requested URI is `abc%20 def'? If we call url_escape, 328 we end up with `/abc%2520%20def', which is almost certainly not 329 intended. If we don't call url_escape, we are left with the 330 embedded space and cannot complete the request. What the user 331 meant was for Wget to request `/abc%20%20def', and this is where 332 reencode_escapes kicks in. 333 334 Wget used to solve this by first decoding %-quotes, and then 335 encoding all the "unsafe" characters found in the resulting string. 336 This was wrong because it didn't preserve certain URL special 337 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b 338 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on 339 whether we considered `+' reserved (it is). One of these results 340 is inevitable because by the second step we would lose information 341 on whether the `+' was originally encoded or not. Both results 342 were wrong because in CGI parameters + means space, while %2B means 343 literal plus. reencode_escapes correctly translates the above to 344 "a%2B+b", i.e. returns the original string. 345 346 This function uses a modified version of the algorithm originally 347 proposed by Anon Sricharoenchai: 348 349 * Encode all "unsafe" characters, except those that are also 350 "reserved", to %XX. See urlchr_table for which characters are 351 unsafe and reserved. 352 353 * Encode the "%" characters not followed by two hex digits to 354 "%25". 355 356 * Pass through all other characters and %XX escapes as-is. (Up to 357 Wget 1.10 this decoded %XX escapes corresponding to "safe" 358 characters, but that was obtrusive and broke some servers.) 359 360 Anon's test case: 361 362 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc" 363 -> 364 "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc" 365 366 Simpler test cases: 367 368 "foo bar" -> "foo%20bar" 369 "foo%20bar" -> "foo%20bar" 370 "foo %20bar" -> "foo%20%20bar" 371 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%') 372 "foo%25%20bar" -> "foo%25%20bar" 373 "foo%2%20bar" -> "foo%252%20bar" 374 "foo+bar" -> "foo+bar" (plus is reserved!) 375 "foo%2b+bar" -> "foo%2b+bar" */ 376 377static char * 378reencode_escapes (const char *s) 379{ 380 const char *p1; 381 char *newstr, *p2; 382 int oldlen, newlen; 383 384 int encode_count = 0; 385 386 /* First pass: inspect the string to see if there's anything to do, 387 and to calculate the new length. */ 388 for (p1 = s; *p1; p1++) 389 if (char_needs_escaping (p1)) 390 ++encode_count; 391 392 if (!encode_count) 393 /* The string is good as it is. */ 394 return (char *) s; /* C const model sucks. */ 395 396 oldlen = p1 - s; 397 /* Each encoding adds two characters (hex digits). */ 398 newlen = oldlen + 2 * encode_count; 399 newstr = xmalloc (newlen + 1); 400 401 /* Second pass: copy the string to the destination address, encoding 402 chars when needed. */ 403 p1 = s; 404 p2 = newstr; 405 406 while (*p1) 407 if (char_needs_escaping (p1)) 408 { 409 unsigned char c = *p1++; 410 *p2++ = '%'; 411 *p2++ = XNUM_TO_DIGIT (c >> 4); 412 *p2++ = XNUM_TO_DIGIT (c & 0xf); 413 } 414 else 415 *p2++ = *p1++; 416 417 *p2 = '\0'; 418 assert (p2 - newstr == newlen); 419 return newstr; 420} 421 422/* Returns the scheme type if the scheme is supported, or 423 SCHEME_INVALID if not. */ 424 425enum url_scheme 426url_scheme (const char *url) 427{ 428 int i; 429 430 for (i = 0; supported_schemes[i].leading_string; i++) 431 if (0 == strncasecmp (url, supported_schemes[i].leading_string, 432 strlen (supported_schemes[i].leading_string))) 433 { 434 if (!(supported_schemes[i].flags & scm_disabled)) 435 return (enum url_scheme) i; 436 else 437 return SCHEME_INVALID; 438 } 439 440 return SCHEME_INVALID; 441} 442 443#define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+') 444 445/* Return 1 if the URL begins with any "scheme", 0 otherwise. As 446 currently implemented, it returns true if URL begins with 447 [-+a-zA-Z0-9]+: . */ 448 449bool 450url_has_scheme (const char *url) 451{ 452 const char *p = url; 453 454 /* The first char must be a scheme char. */ 455 if (!*p || !SCHEME_CHAR (*p)) 456 return false; 457 ++p; 458 /* Followed by 0 or more scheme chars. */ 459 while (*p && SCHEME_CHAR (*p)) 460 ++p; 461 /* Terminated by ':'. */ 462 return *p == ':'; 463} 464 465bool 466url_valid_scheme (const char *url) 467{ 468 enum url_scheme scheme = url_scheme (url); 469 return scheme != SCHEME_INVALID; 470} 471 472int 473scheme_default_port (enum url_scheme scheme) 474{ 475 return supported_schemes[scheme].default_port; 476} 477 478void 479scheme_disable (enum url_scheme scheme) 480{ 481 supported_schemes[scheme].flags |= scm_disabled; 482} 483 484/* Skip the username and password, if present in the URL. The 485 function should *not* be called with the complete URL, but with the 486 portion after the scheme. 487 488 If no username and password are found, return URL. */ 489 490static const char * 491url_skip_credentials (const char *url) 492{ 493 /* Look for '@' that comes before terminators, such as '/', '?', 494 '#', or ';'. */ 495 const char *p = (const char *)strpbrk (url, "@/?#;"); 496 if (!p || *p != '@') 497 return url; 498 return p + 1; 499} 500 501/* Parse credentials contained in [BEG, END). The region is expected 502 to have come from a URL and is unescaped. */ 503 504static bool 505parse_credentials (const char *beg, const char *end, char **user, char **passwd) 506{ 507 char *colon; 508 const char *userend; 509 510 if (beg == end) 511 return false; /* empty user name */ 512 513 colon = memchr (beg, ':', end - beg); 514 if (colon == beg) 515 return false; /* again empty user name */ 516 517 if (colon) 518 { 519 *passwd = strdupdelim (colon + 1, end); 520 userend = colon; 521 url_unescape (*passwd); 522 } 523 else 524 { 525 *passwd = NULL; 526 userend = end; 527 } 528 *user = strdupdelim (beg, userend); 529 url_unescape (*user); 530 return true; 531} 532 533/* Used by main.c: detect URLs written using the "shorthand" URL forms 534 originally popularized by Netscape and NcFTP. HTTP shorthands look 535 like this: 536 537 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file 538 www.foo.com[:port] -> http://www.foo.com[:port] 539 540 FTP shorthands look like this: 541 542 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file 543 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file 544 545 If the URL needs not or cannot be rewritten, return NULL. */ 546 547char * 548rewrite_shorthand_url (const char *url) 549{ 550 const char *p; 551 char *ret; 552 553 if (url_scheme (url) != SCHEME_INVALID) 554 return NULL; 555 556 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the 557 latter Netscape. */ 558 p = strpbrk (url, ":/"); 559 if (p == url) 560 return NULL; 561 562 /* If we're looking at "://", it means the URL uses a scheme we 563 don't support, which may include "https" when compiled without 564 SSL support. Don't bogusly rewrite such URLs. */ 565 if (p && p[0] == ':' && p[1] == '/' && p[2] == '/') 566 return NULL; 567 568 if (p && *p == ':') 569 { 570 /* Colon indicates ftp, as in foo.bar.com:path. Check for 571 special case of http port number ("localhost:10000"). */ 572 int digits = strspn (p + 1, "0123456789"); 573 if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0')) 574 goto http; 575 576 /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */ 577 ret = aprintf ("ftp://%s", url); 578 ret[6 + (p - url)] = '/'; 579 } 580 else 581 { 582 http: 583 /* Just prepend "http://" to URL. */ 584 ret = aprintf ("http://%s", url); 585 } 586 return ret; 587} 588 589static void split_path (const char *, char **, char **); 590 591/* Like strpbrk, with the exception that it returns the pointer to the 592 terminating zero (end-of-string aka "eos") if no matching character 593 is found. */ 594 595static inline char * 596strpbrk_or_eos (const char *s, const char *accept) 597{ 598 char *p = strpbrk (s, accept); 599 if (!p) 600 p = strchr (s, '\0'); 601 return p; 602} 603 604/* Turn STR into lowercase; return true if a character was actually 605 changed. */ 606 607static bool 608lowercase_str (char *str) 609{ 610 bool changed = false; 611 for (; *str; str++) 612 if (c_isupper (*str)) 613 { 614 changed = true; 615 *str = c_tolower (*str); 616 } 617 return changed; 618} 619 620static const char * 621init_seps (enum url_scheme scheme) 622{ 623 static char seps[8] = ":/"; 624 char *p = seps + 2; 625 int flags = supported_schemes[scheme].flags; 626 627 if (flags & scm_has_params) 628 *p++ = ';'; 629 if (flags & scm_has_query) 630 *p++ = '?'; 631 if (flags & scm_has_fragment) 632 *p++ = '#'; 633 *p = '\0'; 634 return seps; 635} 636 637static const char *parse_errors[] = { 638#define PE_NO_ERROR 0 639 N_("No error"), 640#define PE_UNSUPPORTED_SCHEME 1 641 N_("Unsupported scheme %s"), /* support for format token only here */ 642#define PE_MISSING_SCHEME 2 643 N_("Scheme missing"), 644#define PE_INVALID_HOST_NAME 3 645 N_("Invalid host name"), 646#define PE_BAD_PORT_NUMBER 4 647 N_("Bad port number"), 648#define PE_INVALID_USER_NAME 5 649 N_("Invalid user name"), 650#define PE_UNTERMINATED_IPV6_ADDRESS 6 651 N_("Unterminated IPv6 numeric address"), 652#define PE_IPV6_NOT_SUPPORTED 7 653 N_("IPv6 addresses not supported"), 654#define PE_INVALID_IPV6_ADDRESS 8 655 N_("Invalid IPv6 numeric address") 656}; 657 658/* Parse a URL. 659 660 Return a new struct url if successful, NULL on error. In case of 661 error, and if ERROR is not NULL, also set *ERROR to the appropriate 662 error code. */ 663struct url * 664url_parse (const char *url, int *error, struct iri *iri, bool percent_encode) 665{ 666 struct url *u; 667 const char *p; 668 bool path_modified, host_modified; 669 670 enum url_scheme scheme; 671 const char *seps; 672 673 const char *uname_b, *uname_e; 674 const char *host_b, *host_e; 675 const char *path_b, *path_e; 676 const char *params_b, *params_e; 677 const char *query_b, *query_e; 678 const char *fragment_b, *fragment_e; 679 680 int port; 681 char *user = NULL, *passwd = NULL; 682 683 const char *url_encoded = NULL; 684 685 int error_code; 686 687 scheme = url_scheme (url); 688 if (scheme == SCHEME_INVALID) 689 { 690 if (url_has_scheme (url)) 691 error_code = PE_UNSUPPORTED_SCHEME; 692 else 693 error_code = PE_MISSING_SCHEME; 694 goto error; 695 } 696 697 url_encoded = url; 698 699 if (iri && iri->utf8_encode) 700 { 701 char *new_url = NULL; 702 703 iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url); 704 if (!iri->utf8_encode) 705 new_url = NULL; 706 else 707 { 708 iri->orig_url = xstrdup (url); 709 url_encoded = reencode_escapes (new_url); 710 if (url_encoded != new_url) 711 xfree (new_url); 712 percent_encode = false; 713 } 714 } 715 716 if (percent_encode) 717 url_encoded = reencode_escapes (url); 718 719 p = url_encoded; 720 p += strlen (supported_schemes[scheme].leading_string); 721 uname_b = p; 722 p = url_skip_credentials (p); 723 uname_e = p; 724 725 /* scheme://user:pass@host[:port]... */ 726 /* ^ */ 727 728 /* We attempt to break down the URL into the components path, 729 params, query, and fragment. They are ordered like this: 730 731 scheme://host[:port][/path][;params][?query][#fragment] */ 732 733 path_b = path_e = NULL; 734 params_b = params_e = NULL; 735 query_b = query_e = NULL; 736 fragment_b = fragment_e = NULL; 737 738 /* Initialize separators for optional parts of URL, depending on the 739 scheme. For example, FTP has params, and HTTP and HTTPS have 740 query string and fragment. */ 741 seps = init_seps (scheme); 742 743 host_b = p; 744 745 if (*p == '[') 746 { 747 /* Handle IPv6 address inside square brackets. Ideally we'd 748 just look for the terminating ']', but rfc2732 mandates 749 rejecting invalid IPv6 addresses. */ 750 751 /* The address begins after '['. */ 752 host_b = p + 1; 753 host_e = strchr (host_b, ']'); 754 755 if (!host_e) 756 { 757 error_code = PE_UNTERMINATED_IPV6_ADDRESS; 758 goto error; 759 } 760 761#ifdef ENABLE_IPV6 762 /* Check if the IPv6 address is valid. */ 763 if (!is_valid_ipv6_address(host_b, host_e)) 764 { 765 error_code = PE_INVALID_IPV6_ADDRESS; 766 goto error; 767 } 768 769 /* Continue parsing after the closing ']'. */ 770 p = host_e + 1; 771#else 772 error_code = PE_IPV6_NOT_SUPPORTED; 773 goto error; 774#endif 775 776 /* The closing bracket must be followed by a separator or by the 777 null char. */ 778 /* http://[::1]... */ 779 /* ^ */ 780 if (!strchr (seps, *p)) 781 { 782 /* Trailing garbage after []-delimited IPv6 address. */ 783 error_code = PE_INVALID_HOST_NAME; 784 goto error; 785 } 786 } 787 else 788 { 789 p = strpbrk_or_eos (p, seps); 790 host_e = p; 791 } 792 ++seps; /* advance to '/' */ 793 794 if (host_b == host_e) 795 { 796 error_code = PE_INVALID_HOST_NAME; 797 goto error; 798 } 799 800 port = scheme_default_port (scheme); 801 if (*p == ':') 802 { 803 const char *port_b, *port_e, *pp; 804 805 /* scheme://host:port/tralala */ 806 /* ^ */ 807 ++p; 808 port_b = p; 809 p = strpbrk_or_eos (p, seps); 810 port_e = p; 811 812 /* Allow empty port, as per rfc2396. */ 813 if (port_b != port_e) 814 for (port = 0, pp = port_b; pp < port_e; pp++) 815 { 816 if (!c_isdigit (*pp)) 817 { 818 /* http://host:12randomgarbage/blah */ 819 /* ^ */ 820 error_code = PE_BAD_PORT_NUMBER; 821 goto error; 822 } 823 port = 10 * port + (*pp - '0'); 824 /* Check for too large port numbers here, before we have 825 a chance to overflow on bogus port values. */ 826 if (port > 0xffff) 827 { 828 error_code = PE_BAD_PORT_NUMBER; 829 goto error; 830 } 831 } 832 } 833 /* Advance to the first separator *after* '/' (either ';' or '?', 834 depending on the scheme). */ 835 ++seps; 836 837 /* Get the optional parts of URL, each part being delimited by 838 current location and the position of the next separator. */ 839#define GET_URL_PART(sepchar, var) do { \ 840 if (*p == sepchar) \ 841 var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps); \ 842 ++seps; \ 843} while (0) 844 845 GET_URL_PART ('/', path); 846 if (supported_schemes[scheme].flags & scm_has_params) 847 GET_URL_PART (';', params); 848 if (supported_schemes[scheme].flags & scm_has_query) 849 GET_URL_PART ('?', query); 850 if (supported_schemes[scheme].flags & scm_has_fragment) 851 GET_URL_PART ('#', fragment); 852 853#undef GET_URL_PART 854 assert (*p == 0); 855 856 if (uname_b != uname_e) 857 { 858 /* http://user:pass@host */ 859 /* ^ ^ */ 860 /* uname_b uname_e */ 861 if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd)) 862 { 863 error_code = PE_INVALID_USER_NAME; 864 goto error; 865 } 866 } 867 868 u = xnew0 (struct url); 869 u->scheme = scheme; 870 u->host = strdupdelim (host_b, host_e); 871 u->port = port; 872 u->user = user; 873 u->passwd = passwd; 874 875 u->path = strdupdelim (path_b, path_e); 876 path_modified = path_simplify (scheme, u->path); 877 split_path (u->path, &u->dir, &u->file); 878 879 host_modified = lowercase_str (u->host); 880 881 /* Decode %HH sequences in host name. This is important not so much 882 to support %HH sequences in host names (which other browser 883 don't), but to support binary characters (which will have been 884 converted to %HH by reencode_escapes). */ 885 if (strchr (u->host, '%')) 886 { 887 url_unescape (u->host); 888 host_modified = true; 889 890 /* Apply IDNA regardless of iri->utf8_encode status */ 891 if (opt.enable_iri && iri) 892 { 893 char *new = idn_encode (iri, u->host); 894 if (new) 895 { 896 xfree (u->host); 897 u->host = new; 898 host_modified = true; 899 } 900 } 901 } 902 903 if (params_b) 904 u->params = strdupdelim (params_b, params_e); 905 if (query_b) 906 u->query = strdupdelim (query_b, query_e); 907 if (fragment_b) 908 u->fragment = strdupdelim (fragment_b, fragment_e); 909 910 if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e) 911 { 912 /* If we suspect that a transformation has rendered what 913 url_string might return different from URL_ENCODED, rebuild 914 u->url using url_string. */ 915 u->url = url_string (u, URL_AUTH_SHOW); 916 917 if (url_encoded != url) 918 xfree ((char *) url_encoded); 919 } 920 else 921 { 922 if (url_encoded == url) 923 u->url = xstrdup (url); 924 else 925 u->url = (char *) url_encoded; 926 } 927 928 return u; 929 930 error: 931 /* Cleanup in case of error: */ 932 if (url_encoded && url_encoded != url) 933 xfree ((char *) url_encoded); 934 935 /* Transmit the error code to the caller, if the caller wants to 936 know. */ 937 if (error) 938 *error = error_code; 939 return NULL; 940} 941 942/* Return the error message string from ERROR_CODE, which should have 943 been retrieved from url_parse. The error message is translated. */ 944 945char * 946url_error (const char *url, int error_code) 947{ 948 assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors)); 949 950 if (error_code == PE_UNSUPPORTED_SCHEME) 951 { 952 char *error, *p; 953 char *scheme = xstrdup (url); 954 assert (url_has_scheme (url)); 955 956 if ((p = strchr (scheme, ':'))) 957 *p = '\0'; 958 if (!strcasecmp (scheme, "https")) 959 error = aprintf (_("HTTPS support not compiled in")); 960 else 961 error = aprintf (_(parse_errors[error_code]), quote (scheme)); 962 xfree (scheme); 963 964 return error; 965 } 966 else 967 return xstrdup (_(parse_errors[error_code])); 968} 969 970/* Split PATH into DIR and FILE. PATH comes from the URL and is 971 expected to be URL-escaped. 972 973 The path is split into directory (the part up to the last slash) 974 and file (the part after the last slash), which are subsequently 975 unescaped. Examples: 976 977 PATH DIR FILE 978 "foo/bar/baz" "foo/bar" "baz" 979 "foo/bar/" "foo/bar" "" 980 "foo" "" "foo" 981 "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!) 982 983 DIR and FILE are freshly allocated. */ 984 985static void 986split_path (const char *path, char **dir, char **file) 987{ 988 char *last_slash = strrchr (path, '/'); 989 if (!last_slash) 990 { 991 *dir = xstrdup (""); 992 *file = xstrdup (path); 993 } 994 else 995 { 996 *dir = strdupdelim (path, last_slash); 997 *file = xstrdup (last_slash + 1); 998 } 999 url_unescape (*dir); 1000 url_unescape (*file); 1001} 1002 1003/* Note: URL's "full path" is the path with the query string and 1004 params appended. The "fragment" (#foo) is intentionally ignored, 1005 but that might be changed. For example, if the original URL was 1006 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment", 1007 the full path will be "/foo/bar/baz;bullshit?querystring". */ 1008 1009/* Return the length of the full path, without the terminating 1010 zero. */ 1011 1012static int 1013full_path_length (const struct url *url) 1014{ 1015 int len = 0; 1016 1017#define FROB(el) if (url->el) len += 1 + strlen (url->el) 1018 1019 FROB (path); 1020 FROB (params); 1021 FROB (query); 1022 1023#undef FROB 1024 1025 return len; 1026} 1027 1028/* Write out the full path. */ 1029 1030static void 1031full_path_write (const struct url *url, char *where) 1032{ 1033#define FROB(el, chr) do { \ 1034 char *f_el = url->el; \ 1035 if (f_el) { \ 1036 int l = strlen (f_el); \ 1037 *where++ = chr; \ 1038 memcpy (where, f_el, l); \ 1039 where += l; \ 1040 } \ 1041} while (0) 1042 1043 FROB (path, '/'); 1044 FROB (params, ';'); 1045 FROB (query, '?'); 1046 1047#undef FROB 1048} 1049 1050/* Public function for getting the "full path". E.g. if u->path is 1051 "foo/bar" and u->query is "param=value", full_path will be 1052 "/foo/bar?param=value". */ 1053 1054char * 1055url_full_path (const struct url *url) 1056{ 1057 int length = full_path_length (url); 1058 char *full_path = xmalloc (length + 1); 1059 1060 full_path_write (url, full_path); 1061 full_path[length] = '\0'; 1062 1063 return full_path; 1064} 1065 1066/* Unescape CHR in an otherwise escaped STR. Used to selectively 1067 escaping of certain characters, such as "/" and ":". Returns a 1068 count of unescaped chars. */ 1069 1070static void 1071unescape_single_char (char *str, char chr) 1072{ 1073 const char c1 = XNUM_TO_DIGIT (chr >> 4); 1074 const char c2 = XNUM_TO_DIGIT (chr & 0xf); 1075 char *h = str; /* hare */ 1076 char *t = str; /* tortoise */ 1077 for (; *h; h++, t++) 1078 { 1079 if (h[0] == '%' && h[1] == c1 && h[2] == c2) 1080 { 1081 *t = chr; 1082 h += 2; 1083 } 1084 else 1085 *t = *h; 1086 } 1087 *t = '\0'; 1088} 1089 1090/* Escape unsafe and reserved characters, except for the slash 1091 characters. */ 1092 1093static char * 1094url_escape_dir (const char *dir) 1095{ 1096 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1); 1097 if (newdir == dir) 1098 return (char *)dir; 1099 1100 unescape_single_char (newdir, '/'); 1101 return newdir; 1102} 1103 1104/* Sync u->path and u->url with u->dir and u->file. Called after 1105 u->file or u->dir have been changed, typically by the FTP code. */ 1106 1107static void 1108sync_path (struct url *u) 1109{ 1110 char *newpath, *efile, *edir; 1111 1112 xfree (u->path); 1113 1114 /* u->dir and u->file are not escaped. URL-escape them before 1115 reassembling them into u->path. That way, if they contain 1116 separators like '?' or even if u->file contains slashes, the 1117 path will be correctly assembled. (u->file can contain slashes 1118 if the URL specifies it with %2f, or if an FTP server returns 1119 it.) */ 1120 edir = url_escape_dir (u->dir); 1121 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1); 1122 1123 if (!*edir) 1124 newpath = xstrdup (efile); 1125 else 1126 { 1127 int dirlen = strlen (edir); 1128 int filelen = strlen (efile); 1129 1130 /* Copy "DIR/FILE" to newpath. */ 1131 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1); 1132 memcpy (p, edir, dirlen); 1133 p += dirlen; 1134 *p++ = '/'; 1135 memcpy (p, efile, filelen); 1136 p += filelen; 1137 *p = '\0'; 1138 } 1139 1140 u->path = newpath; 1141 1142 if (edir != u->dir) 1143 xfree (edir); 1144 if (efile != u->file) 1145 xfree (efile); 1146 1147 /* Regenerate u->url as well. */ 1148 xfree (u->url); 1149 u->url = url_string (u, URL_AUTH_SHOW); 1150} 1151 1152/* Mutators. Code in ftp.c insists on changing u->dir and u->file. 1153 This way we can sync u->path and u->url when they get changed. */ 1154 1155void 1156url_set_dir (struct url *url, const char *newdir) 1157{ 1158 xfree (url->dir); 1159 url->dir = xstrdup (newdir); 1160 sync_path (url); 1161} 1162 1163void 1164url_set_file (struct url *url, const char *newfile) 1165{ 1166 xfree (url->file); 1167 url->file = xstrdup (newfile); 1168 sync_path (url); 1169} 1170 1171void 1172url_free (struct url *url) 1173{ 1174 xfree (url->host); 1175 xfree (url->path); 1176 xfree (url->url); 1177 1178 xfree_null (url->params); 1179 xfree_null (url->query); 1180 xfree_null (url->fragment); 1181 xfree_null (url->user); 1182 xfree_null (url->passwd); 1183 1184 xfree (url->dir); 1185 xfree (url->file); 1186 1187 xfree (url); 1188} 1189 1190/* Create all the necessary directories for PATH (a file). Calls 1191 make_directory internally. */ 1192int 1193mkalldirs (const char *path) 1194{ 1195 const char *p; 1196 char *t; 1197 struct_stat st; 1198 int res; 1199 1200 p = path + strlen (path); 1201 for (; *p != '/' && p != path; p--) 1202 ; 1203 1204 /* Don't create if it's just a file. */ 1205 if ((p == path) && (*p != '/')) 1206 return 0; 1207 t = strdupdelim (path, p); 1208 1209 /* Check whether the directory exists. */ 1210 if ((stat (t, &st) == 0)) 1211 { 1212 if (S_ISDIR (st.st_mode)) 1213 { 1214 xfree (t); 1215 return 0; 1216 } 1217 else 1218 { 1219 /* If the dir exists as a file name, remove it first. This 1220 is *only* for Wget to work with buggy old CERN http 1221 servers. Here is the scenario: When Wget tries to 1222 retrieve a directory without a slash, e.g. 1223 http://foo/bar (bar being a directory), CERN server will 1224 not redirect it too http://foo/bar/ -- it will generate a 1225 directory listing containing links to bar/file1, 1226 bar/file2, etc. Wget will lose because it saves this 1227 HTML listing to a file `bar', so it cannot create the 1228 directory. To work around this, if the file of the same 1229 name exists, we just remove it and create the directory 1230 anyway. */ 1231 DEBUGP (("Removing %s because of directory danger!\n", t)); 1232 unlink (t); 1233 } 1234 } 1235 res = make_directory (t); 1236 if (res != 0) 1237 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno)); 1238 xfree (t); 1239 return res; 1240} 1241 1242/* Functions for constructing the file name out of URL components. */ 1243 1244/* A growable string structure, used by url_file_name and friends. 1245 This should perhaps be moved to utils.c. 1246 1247 The idea is to have a convenient and efficient way to construct a 1248 string by having various functions append data to it. Instead of 1249 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the 1250 functions in questions, we pass the pointer to this struct. 1251 1252 Functions that write to the members in this struct must make sure 1253 that base remains null terminated by calling append_null(). 1254 */ 1255 1256struct growable { 1257 char *base; 1258 int size; /* memory allocated */ 1259 int tail; /* string length */ 1260}; 1261 1262/* Ensure that the string can accept APPEND_COUNT more characters past 1263 the current TAIL position. If necessary, this will grow the string 1264 and update its allocated size. If the string is already large 1265 enough to take TAIL+APPEND_COUNT characters, this does nothing. */ 1266#define GROW(g, append_size) do { \ 1267 struct growable *G_ = g; \ 1268 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \ 1269} while (0) 1270 1271/* Return the tail position of the string. */ 1272#define TAIL(r) ((r)->base + (r)->tail) 1273 1274/* Move the tail position by APPEND_COUNT characters. */ 1275#define TAIL_INCR(r, append_count) ((r)->tail += append_count) 1276 1277 1278/* Append NULL to DEST. */ 1279static void 1280append_null (struct growable *dest) 1281{ 1282 GROW (dest, 1); 1283 *TAIL (dest) = 0; 1284} 1285 1286/* Append CH to DEST. */ 1287static void 1288append_char (char ch, struct growable *dest) 1289{ 1290 if (ch) 1291 { 1292 GROW (dest, 1); 1293 *TAIL (dest) = ch; 1294 TAIL_INCR (dest, 1); 1295 } 1296 1297 append_null (dest); 1298} 1299 1300/* Append the string STR to DEST. */ 1301static void 1302append_string (const char *str, struct growable *dest) 1303{ 1304 int l = strlen (str); 1305 1306 if (l) 1307 { 1308 GROW (dest, l); 1309 memcpy (TAIL (dest), str, l); 1310 TAIL_INCR (dest, l); 1311 } 1312 1313 append_null (dest); 1314} 1315 1316 1317enum { 1318 filechr_not_unix = 1, /* unusable on Unix, / and \0 */ 1319 filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */ 1320 filechr_control = 4 /* a control character, e.g. 0-31 */ 1321}; 1322 1323#define FILE_CHAR_TEST(c, mask) \ 1324 ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \ 1325 (filechr_table[(unsigned char)(c)] & (mask))) 1326 1327/* Shorthands for the table: */ 1328#define U filechr_not_unix 1329#define W filechr_not_windows 1330#define C filechr_control 1331 1332#define UW U|W 1333#define UWC U|W|C 1334 1335/* Table of characters unsafe under various conditions (see above). 1336 1337 Arguably we could also claim `%' to be unsafe, since we use it as 1338 the escape character. If we ever want to be able to reliably 1339 translate file name back to URL, this would become important 1340 crucial. Right now, it's better to be minimal in escaping. */ 1341 1342static const unsigned char filechr_table[256] = 1343{ 1344UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */ 1345 C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */ 1346 C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ 1347 C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */ 1348 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */ 1349 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */ 1350 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */ 1351 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */ 1352 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */ 1353 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */ 1354 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */ 1355 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */ 1356 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ 1357 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ 1358 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ 1359 0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */ 1360 1361 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */ 1362 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */ 1363 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1364 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1365 1366 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1367 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1368 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1369 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1370}; 1371#undef U 1372#undef W 1373#undef C 1374#undef UW 1375#undef UWC 1376 1377/* FN_PORT_SEP is the separator between host and port in file names 1378 for non-standard port numbers. On Unix this is normally ':', as in 1379 "www.xemacs.org:4001/index.html". Under Windows, we set it to + 1380 because Windows can't handle ':' in file names. */ 1381#define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+') 1382 1383/* FN_QUERY_SEP is the separator between the file name and the URL 1384 query, normally '?'. Since Windows cannot handle '?' as part of 1385 file name, we use '@' instead there. */ 1386#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@') 1387#define FN_QUERY_SEP_STR (opt.restrict_files_os != restrict_windows ? "?" : "@") 1388 1389/* Quote path element, characters in [b, e), as file name, and append 1390 the quoted string to DEST. Each character is quoted as per 1391 file_unsafe_char and the corresponding table. 1392 1393 If ESCAPED is true, the path element is considered to be 1394 URL-escaped and will be unescaped prior to inspection. */ 1395 1396static void 1397append_uri_pathel (const char *b, const char *e, bool escaped, 1398 struct growable *dest) 1399{ 1400 const char *p; 1401 int quoted, outlen; 1402 1403 int mask; 1404 if (opt.restrict_files_os == restrict_unix) 1405 mask = filechr_not_unix; 1406 else 1407 mask = filechr_not_windows; 1408 if (opt.restrict_files_ctrl) 1409 mask |= filechr_control; 1410 1411 /* Copy [b, e) to PATHEL and URL-unescape it. */ 1412 if (escaped) 1413 { 1414 char *unescaped; 1415 BOUNDED_TO_ALLOCA (b, e, unescaped); 1416 url_unescape (unescaped); 1417 b = unescaped; 1418 e = unescaped + strlen (unescaped); 1419 } 1420 1421 /* Defang ".." when found as component of path. Remember that path 1422 comes from the URL and might contain malicious input. */ 1423 if (e - b == 2 && b[0] == '.' && b[1] == '.') 1424 { 1425 b = "%2E%2E"; 1426 e = b + 6; 1427 } 1428 1429 /* Walk the PATHEL string and check how many characters we'll need 1430 to quote. */ 1431 quoted = 0; 1432 for (p = b; p < e; p++) 1433 if (FILE_CHAR_TEST (*p, mask)) 1434 ++quoted; 1435 1436 /* Calculate the length of the output string. e-b is the input 1437 string length. Each quoted char introduces two additional 1438 characters in the string, hence 2*quoted. */ 1439 outlen = (e - b) + (2 * quoted); 1440 GROW (dest, outlen); 1441 1442 if (!quoted) 1443 { 1444 /* If there's nothing to quote, we can simply append the string 1445 without processing it again. */ 1446 memcpy (TAIL (dest), b, outlen); 1447 } 1448 else 1449 { 1450 char *q = TAIL (dest); 1451 for (p = b; p < e; p++) 1452 { 1453 if (!FILE_CHAR_TEST (*p, mask)) 1454 *q++ = *p; 1455 else 1456 { 1457 unsigned char ch = *p; 1458 *q++ = '%'; 1459 *q++ = XNUM_TO_DIGIT (ch >> 4); 1460 *q++ = XNUM_TO_DIGIT (ch & 0xf); 1461 } 1462 } 1463 assert (q - TAIL (dest) == outlen); 1464 } 1465 1466 /* Perform inline case transformation if required. */ 1467 if (opt.restrict_files_case == restrict_lowercase 1468 || opt.restrict_files_case == restrict_uppercase) 1469 { 1470 char *q; 1471 for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q) 1472 { 1473 if (opt.restrict_files_case == restrict_lowercase) 1474 *q = c_tolower (*q); 1475 else 1476 *q = c_toupper (*q); 1477 } 1478 } 1479 1480 TAIL_INCR (dest, outlen); 1481 append_null (dest); 1482} 1483 1484/* Append to DEST the directory structure that corresponds the 1485 directory part of URL's path. For example, if the URL is 1486 http://server/dir1/dir2/file, this appends "/dir1/dir2". 1487 1488 Each path element ("dir1" and "dir2" in the above example) is 1489 examined, url-unescaped, and re-escaped as file name element. 1490 1491 Additionally, it cuts as many directories from the path as 1492 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it 1493 will produce "bar" for the above example. For 2 or more, it will 1494 produce "". 1495 1496 Each component of the path is quoted for use as file name. */ 1497 1498static void 1499append_dir_structure (const struct url *u, struct growable *dest) 1500{ 1501 char *pathel, *next; 1502 int cut = opt.cut_dirs; 1503 1504 /* Go through the path components, de-URL-quote them, and quote them 1505 (if necessary) as file names. */ 1506 1507 pathel = u->path; 1508 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1) 1509 { 1510 if (cut-- > 0) 1511 continue; 1512 if (pathel == next) 1513 /* Ignore empty pathels. */ 1514 continue; 1515 1516 if (dest->tail) 1517 append_char ('/', dest); 1518 append_uri_pathel (pathel, next, true, dest); 1519 } 1520} 1521 1522/* Return a unique file name that matches the given URL as well as 1523 possible. Does not create directories on the file system. */ 1524 1525char * 1526url_file_name (const struct url *u, char *replaced_filename) 1527{ 1528 struct growable fnres; /* stands for "file name result" */ 1529 struct growable temp_fnres; 1530 1531 const char *u_file; 1532 char *fname, *unique, *fname_len_check; 1533 const char *index_filename = "index.html"; /* The default index file is index.html */ 1534 size_t max_length; 1535 1536 fnres.base = NULL; 1537 fnres.size = 0; 1538 fnres.tail = 0; 1539 1540 temp_fnres.base = NULL; 1541 temp_fnres.size = 0; 1542 temp_fnres.tail = 0; 1543 1544 /* If an alternative index file was defined, change index_filename */ 1545 if (opt.default_page) 1546 index_filename = opt.default_page; 1547 1548 1549 /* Start with the directory prefix, if specified. */ 1550 if (opt.dir_prefix) 1551 append_string (opt.dir_prefix, &fnres); 1552 1553 /* If "dirstruct" is turned on (typically the case with -r), add 1554 the host and port (unless those have been turned off) and 1555 directory structure. */ 1556 if (opt.dirstruct) 1557 { 1558 if (opt.protocol_directories) 1559 { 1560 if (fnres.tail) 1561 append_char ('/', &fnres); 1562 append_string (supported_schemes[u->scheme].name, &fnres); 1563 } 1564 if (opt.add_hostdir) 1565 { 1566 if (fnres.tail) 1567 append_char ('/', &fnres); 1568 if (0 != strcmp (u->host, "..")) 1569 append_string (u->host, &fnres); 1570 else 1571 /* Host name can come from the network; malicious DNS may 1572 allow ".." to be resolved, causing us to write to 1573 "../<file>". Defang such host names. */ 1574 append_string ("%2E%2E", &fnres); 1575 if (u->port != scheme_default_port (u->scheme)) 1576 { 1577 char portstr[24]; 1578 number_to_string (portstr, u->port); 1579 append_char (FN_PORT_SEP, &fnres); 1580 append_string (portstr, &fnres); 1581 } 1582 } 1583 1584 append_dir_structure (u, &fnres); 1585 } 1586 1587 if (!replaced_filename) 1588 { 1589 /* Create the filename. */ 1590 u_file = *u->file ? u->file : index_filename; 1591 1592 /* Append "?query" to the file name, even if empty, 1593 * and create fname_len_check. */ 1594 if (u->query) 1595 fname_len_check = concat_strings (u_file, FN_QUERY_SEP_STR, u->query, NULL); 1596 else 1597 fname_len_check = strdupdelim (u_file, u_file + strlen (u_file)); 1598 } 1599 else 1600 { 1601 u_file = replaced_filename; 1602 fname_len_check = strdupdelim (u_file, u_file + strlen (u_file)); 1603 } 1604 1605 append_uri_pathel (fname_len_check, 1606 fname_len_check + strlen (fname_len_check), false, &temp_fnres); 1607 1608 /* Zero-terminate the temporary file name. */ 1609 append_char ('\0', &temp_fnres); 1610 1611 /* Check that the length of the file name is acceptable. */ 1612#ifdef WINDOWS 1613 if (MAX_PATH > (fnres.tail + CHOMP_BUFFER + 2)) 1614 { 1615 max_length = MAX_PATH - (fnres.tail + CHOMP_BUFFER + 2); 1616 /* FIXME: In Windows a filename is usually limited to 255 characters. 1617 To really be accurate you could call GetVolumeInformation() to get 1618 lpMaximumComponentLength 1619 */ 1620 if (max_length > 255) 1621 { 1622 max_length = 255; 1623 } 1624 } 1625 else 1626 { 1627 max_length = 0; 1628 } 1629#else 1630 max_length = get_max_length (fnres.base, fnres.tail, _PC_NAME_MAX) - CHOMP_BUFFER; 1631#endif 1632 if (max_length > 0 && strlen (temp_fnres.base) > max_length) 1633 { 1634 logprintf (LOG_NOTQUIET, "The name is too long, %lu chars total.\n", 1635 (unsigned long) strlen (temp_fnres.base)); 1636 logprintf (LOG_NOTQUIET, "Trying to shorten...\n"); 1637 1638 /* Shorten the file name. */ 1639 temp_fnres.base[max_length] = '\0'; 1640 1641 logprintf (LOG_NOTQUIET, "New name is %s.\n", temp_fnres.base); 1642 } 1643 1644 free (fname_len_check); 1645 1646 /* The filename has already been 'cleaned' by append_uri_pathel() above. So, 1647 * just append it. */ 1648 if (fnres.tail) 1649 append_char ('/', &fnres); 1650 append_string (temp_fnres.base, &fnres); 1651 1652 fname = fnres.base; 1653 1654 /* Make a final check that the path length is acceptable? */ 1655 /* TODO: check fnres.base for path length problem */ 1656 1657 free (temp_fnres.base); 1658 1659 /* Check the cases in which the unique extensions are not used: 1660 1) Clobbering is turned off (-nc). 1661 2) Retrieval with regetting. 1662 3) Timestamping is used. 1663 4) Hierarchy is built. 1664 5) Backups are specified. 1665 1666 The exception is the case when file does exist and is a 1667 directory (see `mkalldirs' for explanation). */ 1668 1669 if (ALLOW_CLOBBER 1670 && !(file_exists_p (fname) && !file_non_directory_p (fname))) 1671 { 1672 unique = fname; 1673 } 1674 else 1675 { 1676 unique = unique_name (fname, true); 1677 if (unique != fname) 1678 xfree (fname); 1679 } 1680 1681/* On VMS, alter the name as required. */ 1682#ifdef __VMS 1683 { 1684 char *unique2; 1685 1686 unique2 = ods_conform( unique); 1687 if (unique2 != unique) 1688 { 1689 xfree (unique); 1690 unique = unique2; 1691 } 1692 } 1693#endif /* def __VMS */ 1694 1695 return unique; 1696} 1697 1698/* Resolve "." and ".." elements of PATH by destructively modifying 1699 PATH and return true if PATH has been modified, false otherwise. 1700 1701 The algorithm is in spirit similar to the one described in rfc1808, 1702 although implemented differently, in one pass. To recap, path 1703 elements containing only "." are removed, and ".." is taken to mean 1704 "back up one element". Single leading and trailing slashes are 1705 preserved. 1706 1707 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive 1708 test examples are provided below. If you change anything in this 1709 function, run test_path_simplify to make sure you haven't broken a 1710 test case. */ 1711 1712static bool 1713path_simplify (enum url_scheme scheme, char *path) 1714{ 1715 char *h = path; /* hare */ 1716 char *t = path; /* tortoise */ 1717 char *beg = path; 1718 char *end = strchr (path, '\0'); 1719 1720 while (h < end) 1721 { 1722 /* Hare should be at the beginning of a path element. */ 1723 1724 if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) 1725 { 1726 /* Ignore "./". */ 1727 h += 2; 1728 } 1729 else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) 1730 { 1731 /* Handle "../" by retreating the tortoise by one path 1732 element -- but not past beggining. */ 1733 if (t > beg) 1734 { 1735 /* Move backwards until T hits the beginning of the 1736 previous path element or the beginning of path. */ 1737 for (--t; t > beg && t[-1] != '/'; t--) 1738 ; 1739 } 1740 else if (scheme == SCHEME_FTP) 1741 { 1742 /* If we're at the beginning, copy the "../" literally 1743 and move the beginning so a later ".." doesn't remove 1744 it. This violates RFC 3986; but we do it for FTP 1745 anyway because there is otherwise no way to get at a 1746 parent directory, when the FTP server drops us in a 1747 non-root directory (which is not uncommon). */ 1748 beg = t + 3; 1749 goto regular; 1750 } 1751 h += 3; 1752 } 1753 else 1754 { 1755 regular: 1756 /* A regular path element. If H hasn't advanced past T, 1757 simply skip to the next path element. Otherwise, copy 1758 the path element until the next slash. */ 1759 if (t == h) 1760 { 1761 /* Skip the path element, including the slash. */ 1762 while (h < end && *h != '/') 1763 t++, h++; 1764 if (h < end) 1765 t++, h++; 1766 } 1767 else 1768 { 1769 /* Copy the path element, including the final slash. */ 1770 while (h < end && *h != '/') 1771 *t++ = *h++; 1772 if (h < end) 1773 *t++ = *h++; 1774 } 1775 } 1776 } 1777 1778 if (t != h) 1779 *t = '\0'; 1780 1781 return t != h; 1782} 1783 1784/* Return the length of URL's path. Path is considered to be 1785 terminated by one or more of the ?query or ;params or #fragment, 1786 depending on the scheme. */ 1787 1788static const char * 1789path_end (const char *url) 1790{ 1791 enum url_scheme scheme = url_scheme (url); 1792 const char *seps; 1793 if (scheme == SCHEME_INVALID) 1794 scheme = SCHEME_HTTP; /* use http semantics for rel links */ 1795 /* +2 to ignore the first two separators ':' and '/' */ 1796 seps = init_seps (scheme) + 2; 1797 return strpbrk_or_eos (url, seps); 1798} 1799 1800/* Find the last occurrence of character C in the range [b, e), or 1801 NULL, if none are present. */ 1802#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b)) 1803 1804/* Merge BASE with LINK and return the resulting URI. 1805 1806 Either of the URIs may be absolute or relative, complete with the 1807 host name, or path only. This tries to reasonably handle all 1808 foreseeable cases. It only employs minimal URL parsing, without 1809 knowledge of the specifics of schemes. 1810 1811 I briefly considered making this function call path_simplify after 1812 the merging process, as rfc1738 seems to suggest. This is a bad 1813 idea for several reasons: 1) it complexifies the code, and 2) 1814 url_parse has to simplify path anyway, so it's wasteful to boot. */ 1815 1816char * 1817uri_merge (const char *base, const char *link) 1818{ 1819 int linklength; 1820 const char *end; 1821 char *merge; 1822 1823 if (url_has_scheme (link)) 1824 return xstrdup (link); 1825 1826 /* We may not examine BASE past END. */ 1827 end = path_end (base); 1828 linklength = strlen (link); 1829 1830 if (!*link) 1831 { 1832 /* Empty LINK points back to BASE, query string and all. */ 1833 return xstrdup (base); 1834 } 1835 else if (*link == '?') 1836 { 1837 /* LINK points to the same location, but changes the query 1838 string. Examples: */ 1839 /* uri_merge("path", "?new") -> "path?new" */ 1840 /* uri_merge("path?foo", "?new") -> "path?new" */ 1841 /* uri_merge("path?foo#bar", "?new") -> "path?new" */ 1842 /* uri_merge("path#foo", "?new") -> "path?new" */ 1843 int baselength = end - base; 1844 merge = xmalloc (baselength + linklength + 1); 1845 memcpy (merge, base, baselength); 1846 memcpy (merge + baselength, link, linklength); 1847 merge[baselength + linklength] = '\0'; 1848 } 1849 else if (*link == '#') 1850 { 1851 /* uri_merge("path", "#new") -> "path#new" */ 1852 /* uri_merge("path#foo", "#new") -> "path#new" */ 1853 /* uri_merge("path?foo", "#new") -> "path?foo#new" */ 1854 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */ 1855 int baselength; 1856 const char *end1 = strchr (base, '#'); 1857 if (!end1) 1858 end1 = base + strlen (base); 1859 baselength = end1 - base; 1860 merge = xmalloc (baselength + linklength + 1); 1861 memcpy (merge, base, baselength); 1862 memcpy (merge + baselength, link, linklength); 1863 merge[baselength + linklength] = '\0'; 1864 } 1865 else if (*link == '/' && *(link + 1) == '/') 1866 { 1867 /* LINK begins with "//" and so is a net path: we need to 1868 replace everything after (and including) the double slash 1869 with LINK. */ 1870 1871 /* uri_merge("foo", "//new/bar") -> "//new/bar" */ 1872 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */ 1873 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */ 1874 1875 int span; 1876 const char *slash; 1877 const char *start_insert; 1878 1879 /* Look for first slash. */ 1880 slash = memchr (base, '/', end - base); 1881 /* If found slash and it is a double slash, then replace 1882 from this point, else default to replacing from the 1883 beginning. */ 1884 if (slash && *(slash + 1) == '/') 1885 start_insert = slash; 1886 else 1887 start_insert = base; 1888 1889 span = start_insert - base; 1890 merge = xmalloc (span + linklength + 1); 1891 if (span) 1892 memcpy (merge, base, span); 1893 memcpy (merge + span, link, linklength); 1894 merge[span + linklength] = '\0'; 1895 } 1896 else if (*link == '/') 1897 { 1898 /* LINK is an absolute path: we need to replace everything 1899 after (and including) the FIRST slash with LINK. 1900 1901 So, if BASE is "http://host/whatever/foo/bar", and LINK is 1902 "/qux/xyzzy", our result should be 1903 "http://host/qux/xyzzy". */ 1904 int span; 1905 const char *slash; 1906 const char *start_insert = NULL; /* for gcc to shut up. */ 1907 const char *pos = base; 1908 bool seen_slash_slash = false; 1909 /* We're looking for the first slash, but want to ignore 1910 double slash. */ 1911 again: 1912 slash = memchr (pos, '/', end - pos); 1913 if (slash && !seen_slash_slash) 1914 if (*(slash + 1) == '/') 1915 { 1916 pos = slash + 2; 1917 seen_slash_slash = true; 1918 goto again; 1919 } 1920 1921 /* At this point, SLASH is the location of the first / after 1922 "//", or the first slash altogether. START_INSERT is the 1923 pointer to the location where LINK will be inserted. When 1924 examining the last two examples, keep in mind that LINK 1925 begins with '/'. */ 1926 1927 if (!slash && !seen_slash_slash) 1928 /* example: "foo" */ 1929 /* ^ */ 1930 start_insert = base; 1931 else if (!slash && seen_slash_slash) 1932 /* example: "http://foo" */ 1933 /* ^ */ 1934 start_insert = end; 1935 else if (slash && !seen_slash_slash) 1936 /* example: "foo/bar" */ 1937 /* ^ */ 1938 start_insert = base; 1939 else if (slash && seen_slash_slash) 1940 /* example: "http://something/" */ 1941 /* ^ */ 1942 start_insert = slash; 1943 1944 span = start_insert - base; 1945 merge = xmalloc (span + linklength + 1); 1946 if (span) 1947 memcpy (merge, base, span); 1948 memcpy (merge + span, link, linklength); 1949 merge[span + linklength] = '\0'; 1950 } 1951 else 1952 { 1953 /* LINK is a relative URL: we need to replace everything 1954 after last slash (possibly empty) with LINK. 1955 1956 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", 1957 our result should be "whatever/foo/qux/xyzzy". */ 1958 bool need_explicit_slash = false; 1959 int span; 1960 const char *start_insert; 1961 const char *last_slash = find_last_char (base, end, '/'); 1962 if (!last_slash) 1963 { 1964 /* No slash found at all. Replace what we have with LINK. */ 1965 start_insert = base; 1966 } 1967 else if (last_slash && last_slash >= base + 2 1968 && last_slash[-2] == ':' && last_slash[-1] == '/') 1969 { 1970 /* example: http://host" */ 1971 /* ^ */ 1972 start_insert = end + 1; 1973 need_explicit_slash = true; 1974 } 1975 else 1976 { 1977 /* example: "whatever/foo/bar" */ 1978 /* ^ */ 1979 start_insert = last_slash + 1; 1980 } 1981 1982 span = start_insert - base; 1983 merge = xmalloc (span + linklength + 1); 1984 if (span) 1985 memcpy (merge, base, span); 1986 if (need_explicit_slash) 1987 merge[span - 1] = '/'; 1988 memcpy (merge + span, link, linklength); 1989 merge[span + linklength] = '\0'; 1990 } 1991 1992 return merge; 1993} 1994 1995#define APPEND(p, s) do { \ 1996 int len = strlen (s); \ 1997 memcpy (p, s, len); \ 1998 p += len; \ 1999} while (0) 2000 2001/* Use this instead of password when the actual password is supposed 2002 to be hidden. We intentionally use a generic string without giving 2003 away the number of characters in the password, like previous 2004 versions did. */ 2005#define HIDDEN_PASSWORD "*password*" 2006 2007/* Recreate the URL string from the data in URL. 2008 2009 If HIDE is true (as it is when we're calling this on a URL we plan 2010 to print, but not when calling it to canonicalize a URL for use 2011 within the program), password will be hidden. Unsafe characters in 2012 the URL will be quoted. */ 2013 2014char * 2015url_string (const struct url *url, enum url_auth_mode auth_mode) 2016{ 2017 int size; 2018 char *result, *p; 2019 char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL; 2020 2021 int scheme_port = supported_schemes[url->scheme].default_port; 2022 const char *scheme_str = supported_schemes[url->scheme].leading_string; 2023 int fplen = full_path_length (url); 2024 2025 bool brackets_around_host; 2026 2027 assert (scheme_str != NULL); 2028 2029 /* Make sure the user name and password are quoted. */ 2030 if (url->user) 2031 { 2032 if (auth_mode != URL_AUTH_HIDE) 2033 { 2034 quoted_user = url_escape_allow_passthrough (url->user); 2035 if (url->passwd) 2036 { 2037 if (auth_mode == URL_AUTH_HIDE_PASSWD) 2038 quoted_passwd = (char *) HIDDEN_PASSWORD; 2039 else 2040 quoted_passwd = url_escape_allow_passthrough (url->passwd); 2041 } 2042 } 2043 } 2044 2045 /* In the unlikely event that the host name contains non-printable 2046 characters, quote it for displaying to the user. */ 2047 quoted_host = url_escape_allow_passthrough (url->host); 2048 2049 /* Undo the quoting of colons that URL escaping performs. IPv6 2050 addresses may legally contain colons, and in that case must be 2051 placed in square brackets. */ 2052 if (quoted_host != url->host) 2053 unescape_single_char (quoted_host, ':'); 2054 brackets_around_host = strchr (quoted_host, ':') != NULL; 2055 2056 size = (strlen (scheme_str) 2057 + strlen (quoted_host) 2058 + (brackets_around_host ? 2 : 0) 2059 + fplen 2060 + 1); 2061 if (url->port != scheme_port) 2062 size += 1 + numdigit (url->port); 2063 if (quoted_user) 2064 { 2065 size += 1 + strlen (quoted_user); 2066 if (quoted_passwd) 2067 size += 1 + strlen (quoted_passwd); 2068 } 2069 2070 p = result = xmalloc (size); 2071 2072 APPEND (p, scheme_str); 2073 if (quoted_user) 2074 { 2075 APPEND (p, quoted_user); 2076 if (quoted_passwd) 2077 { 2078 *p++ = ':'; 2079 APPEND (p, quoted_passwd); 2080 } 2081 *p++ = '@'; 2082 } 2083 2084 if (brackets_around_host) 2085 *p++ = '['; 2086 APPEND (p, quoted_host); 2087 if (brackets_around_host) 2088 *p++ = ']'; 2089 if (url->port != scheme_port) 2090 { 2091 *p++ = ':'; 2092 p = number_to_string (p, url->port); 2093 } 2094 2095 full_path_write (url, p); 2096 p += fplen; 2097 *p++ = '\0'; 2098 2099 assert (p - result == size); 2100 2101 if (quoted_user && quoted_user != url->user) 2102 xfree (quoted_user); 2103 if (quoted_passwd && auth_mode == URL_AUTH_SHOW 2104 && quoted_passwd != url->passwd) 2105 xfree (quoted_passwd); 2106 if (quoted_host != url->host) 2107 xfree (quoted_host); 2108 2109 return result; 2110} 2111 2112/* Return true if scheme a is similar to scheme b. 2113 2114 Schemes are similar if they are equal. If SSL is supported, schemes 2115 are also similar if one is http (SCHEME_HTTP) and the other is https 2116 (SCHEME_HTTPS). */ 2117bool 2118schemes_are_similar_p (enum url_scheme a, enum url_scheme b) 2119{ 2120 if (a == b) 2121 return true; 2122#ifdef HAVE_SSL 2123 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS) 2124 || (a == SCHEME_HTTPS && b == SCHEME_HTTP)) 2125 return true; 2126#endif 2127 return false; 2128} 2129 2130static int 2131getchar_from_escaped_string (const char *str, char *c) 2132{ 2133 const char *p = str; 2134 2135 assert (str && *str); 2136 assert (c); 2137 2138 if (p[0] == '%') 2139 { 2140 if (!c_isxdigit(p[1]) || !c_isxdigit(p[2])) 2141 { 2142 *c = '%'; 2143 return 1; 2144 } 2145 else 2146 { 2147 if (p[2] == 0) 2148 return 0; /* error: invalid string */ 2149 2150 *c = X2DIGITS_TO_NUM (p[1], p[2]); 2151 if (URL_RESERVED_CHAR(*c)) 2152 { 2153 *c = '%'; 2154 return 1; 2155 } 2156 else 2157 return 3; 2158 } 2159 } 2160 else 2161 { 2162 *c = p[0]; 2163 } 2164 2165 return 1; 2166} 2167 2168bool 2169are_urls_equal (const char *u1, const char *u2) 2170{ 2171 const char *p, *q; 2172 int pp, qq; 2173 char ch1, ch2; 2174 assert(u1 && u2); 2175 2176 p = u1; 2177 q = u2; 2178 2179 while (*p && *q 2180 && (pp = getchar_from_escaped_string (p, &ch1)) 2181 && (qq = getchar_from_escaped_string (q, &ch2)) 2182 && (c_tolower(ch1) == c_tolower(ch2))) 2183 { 2184 p += pp; 2185 q += qq; 2186 } 2187 2188 return (*p == 0 && *q == 0 ? true : false); 2189} 2190 2191#ifdef TESTING 2192/* Debugging and testing support for path_simplify. */ 2193 2194#if 0 2195/* Debug: run path_simplify on PATH and return the result in a new 2196 string. Useful for calling from the debugger. */ 2197static char * 2198ps (char *path) 2199{ 2200 char *copy = xstrdup (path); 2201 path_simplify (copy); 2202 return copy; 2203} 2204#endif 2205 2206static const char * 2207run_test (const char *test, const char *expected_result, enum url_scheme scheme, 2208 bool expected_change) 2209{ 2210 char *test_copy = xstrdup (test); 2211 bool modified = path_simplify (scheme, test_copy); 2212 2213 if (0 != strcmp (test_copy, expected_result)) 2214 { 2215 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n", 2216 test, expected_result, test_copy); 2217 mu_assert ("", 0); 2218 } 2219 if (modified != expected_change) 2220 { 2221 if (expected_change) 2222 printf ("Expected modification with path_simplify(\"%s\").\n", 2223 test); 2224 else 2225 printf ("Expected no modification with path_simplify(\"%s\").\n", 2226 test); 2227 } 2228 xfree (test_copy); 2229 mu_assert ("", modified == expected_change); 2230 return NULL; 2231} 2232 2233const char * 2234test_path_simplify (void) 2235{ 2236 static const struct { 2237 const char *test, *result; 2238 enum url_scheme scheme; 2239 bool should_modify; 2240 } tests[] = { 2241 { "", "", SCHEME_HTTP, false }, 2242 { ".", "", SCHEME_HTTP, true }, 2243 { "./", "", SCHEME_HTTP, true }, 2244 { "..", "", SCHEME_HTTP, true }, 2245 { "../", "", SCHEME_HTTP, true }, 2246 { "..", "..", SCHEME_FTP, false }, 2247 { "../", "../", SCHEME_FTP, false }, 2248 { "foo", "foo", SCHEME_HTTP, false }, 2249 { "foo/bar", "foo/bar", SCHEME_HTTP, false }, 2250 { "foo///bar", "foo///bar", SCHEME_HTTP, false }, 2251 { "foo/.", "foo/", SCHEME_HTTP, true }, 2252 { "foo/./", "foo/", SCHEME_HTTP, true }, 2253 { "foo./", "foo./", SCHEME_HTTP, false }, 2254 { "foo/../bar", "bar", SCHEME_HTTP, true }, 2255 { "foo/../bar/", "bar/", SCHEME_HTTP, true }, 2256 { "foo/bar/..", "foo/", SCHEME_HTTP, true }, 2257 { "foo/bar/../x", "foo/x", SCHEME_HTTP, true }, 2258 { "foo/bar/../x/", "foo/x/", SCHEME_HTTP, true }, 2259 { "foo/..", "", SCHEME_HTTP, true }, 2260 { "foo/../..", "", SCHEME_HTTP, true }, 2261 { "foo/../../..", "", SCHEME_HTTP, true }, 2262 { "foo/../../bar/../../baz", "baz", SCHEME_HTTP, true }, 2263 { "foo/../..", "..", SCHEME_FTP, true }, 2264 { "foo/../../..", "../..", SCHEME_FTP, true }, 2265 { "foo/../../bar/../../baz", "../../baz", SCHEME_FTP, true }, 2266 { "a/b/../../c", "c", SCHEME_HTTP, true }, 2267 { "./a/../b", "b", SCHEME_HTTP, true } 2268 }; 2269 unsigned i; 2270 2271 for (i = 0; i < countof (tests); i++) 2272 { 2273 const char *message; 2274 const char *test = tests[i].test; 2275 const char *expected_result = tests[i].result; 2276 enum url_scheme scheme = tests[i].scheme; 2277 bool expected_change = tests[i].should_modify; 2278 2279 message = run_test (test, expected_result, scheme, expected_change); 2280 if (message) return message; 2281 } 2282 return NULL; 2283} 2284 2285const char * 2286test_append_uri_pathel(void) 2287{ 2288 unsigned i; 2289 static const struct { 2290 const char *original_url; 2291 const char *input; 2292 bool escaped; 2293 const char *expected_result; 2294 } test_array[] = { 2295 { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" }, 2296 }; 2297 2298 for (i = 0; i < countof(test_array); ++i) 2299 { 2300 struct growable dest; 2301 const char *p = test_array[i].input; 2302 2303 memset (&dest, 0, sizeof (dest)); 2304 2305 append_string (test_array[i].original_url, &dest); 2306 append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest); 2307 2308 mu_assert ("test_append_uri_pathel: wrong result", 2309 strcmp (dest.base, test_array[i].expected_result) == 0); 2310 } 2311 2312 return NULL; 2313} 2314 2315const char * 2316test_are_urls_equal(void) 2317{ 2318 unsigned i; 2319 static const struct { 2320 const char *url1; 2321 const char *url2; 2322 bool expected_result; 2323 } test_array[] = { 2324 { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/", true }, 2325 { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false }, 2326 { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/", false }, 2327 { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/", true }, 2328 { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/", false }, 2329 { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/", false }, 2330 }; 2331 2332 for (i = 0; i < countof(test_array); ++i) 2333 { 2334 mu_assert ("test_are_urls_equal: wrong result", 2335 are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result); 2336 } 2337 2338 return NULL; 2339} 2340 2341#endif /* TESTING */ 2342 2343/* 2344 * vim: et ts=2 sw=2 2345 */ 2346