/* Conversion of links to local files.
   Copyright (C) 2003, 2004, 2005, 2006, 2007,
   2008, 2009 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */

#include "wget.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>
#include "convert.h"
#include "url.h"
#include "recur.h"
#include "utils.h"
#include "hash.h"
#include "ptimer.h"
#include "res.h"
#include "html-url.h"
#include "css-url.h"

/* Maps a local file name to the URL it was downloaded from.  Keys and
   values are heap-allocated copies owned by the table.  */
static struct hash_table *dl_file_url_map;

/* Maps a URL to the local file name it was saved as.  Several URLs
   may map to the same file.  Keys and values are heap-allocated
   copies owned by the table.  */
struct hash_table *dl_url_file_map;

/* Set of HTML/CSS files downloaded in this Wget run, used for link
   conversion after Wget is done.
*/ 56struct hash_table *downloaded_html_set; 57struct hash_table *downloaded_css_set; 58 59static void convert_links (const char *, struct urlpos *); 60 61 62void 63convert_links_in_hashtable (struct hash_table *downloaded_set, 64 int is_css, 65 int *file_count) 66{ 67 int i; 68 69 int cnt; 70 char **file_array; 71 72 cnt = 0; 73 if (downloaded_set) 74 cnt = hash_table_count (downloaded_set); 75 if (cnt == 0) 76 return; 77 file_array = alloca_array (char *, cnt); 78 string_set_to_array (downloaded_set, file_array); 79 80 for (i = 0; i < cnt; i++) 81 { 82 struct urlpos *urls, *cur_url; 83 char *url; 84 char *file = file_array[i]; 85 86 /* Determine the URL of the file. get_urls_{html,css} will need 87 it. */ 88 url = hash_table_get (dl_file_url_map, file); 89 if (!url) 90 { 91 DEBUGP (("Apparently %s has been removed.\n", file)); 92 continue; 93 } 94 95 DEBUGP (("Scanning %s (from %s)\n", file, url)); 96 97 /* Parse the file... */ 98 urls = is_css ? get_urls_css_file (file, url) : 99 get_urls_html (file, url, NULL, NULL); 100 101 /* We don't respect meta_disallow_follow here because, even if 102 the file is not followed, we might still want to convert the 103 links that have been followed from other files. */ 104 105 for (cur_url = urls; cur_url; cur_url = cur_url->next) 106 { 107 char *local_name; 108 struct url *u = cur_url->url; 109 110 if (cur_url->link_base_p) 111 { 112 /* Base references have been resolved by our parser, so 113 we turn the base URL into an empty string. (Perhaps 114 we should remove the tag entirely?) */ 115 cur_url->convert = CO_NULLIFY_BASE; 116 continue; 117 } 118 119 /* We decide the direction of conversion according to whether 120 a URL was downloaded. Downloaded URLs will be converted 121 ABS2REL, whereas non-downloaded will be converted REL2ABS. */ 122 local_name = hash_table_get (dl_url_file_map, u->url); 123 124 /* Decide on the conversion type. */ 125 if (local_name) 126 { 127 /* We've downloaded this URL. 
Convert it to relative 128 form. We do this even if the URL already is in 129 relative form, because our directory structure may 130 not be identical to that on the server (think `-nd', 131 `--cut-dirs', etc.) */ 132 cur_url->convert = CO_CONVERT_TO_RELATIVE; 133 cur_url->local_name = xstrdup (local_name); 134 DEBUGP (("will convert url %s to local %s\n", u->url, local_name)); 135 } 136 else 137 { 138 /* We haven't downloaded this URL. If it's not already 139 complete (including a full host name), convert it to 140 that form, so it can be reached while browsing this 141 HTML locally. */ 142 if (!cur_url->link_complete_p) 143 cur_url->convert = CO_CONVERT_TO_COMPLETE; 144 cur_url->local_name = NULL; 145 DEBUGP (("will convert url %s to complete\n", u->url)); 146 } 147 } 148 149 /* Convert the links in the file. */ 150 convert_links (file, urls); 151 ++*file_count; 152 153 /* Free the data. */ 154 free_urlpos (urls); 155 } 156} 157 158/* This function is called when the retrieval is done to convert the 159 links that have been downloaded. It has to be called at the end of 160 the retrieval, because only then does Wget know conclusively which 161 URLs have been downloaded, and which not, so it can tell which 162 direction to convert to. 163 164 The "direction" means that the URLs to the files that have been 165 downloaded get converted to the relative URL which will point to 166 that file. And the other URLs get converted to the remote URL on 167 the server. 168 169 All the downloaded HTMLs are kept in downloaded_html_files, and 170 downloaded URLs in urls_downloaded. All the information is 171 extracted from these two lists. 
*/ 172 173void 174convert_all_links (void) 175{ 176 double secs; 177 int file_count = 0; 178 179 struct ptimer *timer = ptimer_new (); 180 181 convert_links_in_hashtable (downloaded_html_set, 0, &file_count); 182 convert_links_in_hashtable (downloaded_css_set, 1, &file_count); 183 184 secs = ptimer_measure (timer); 185 logprintf (LOG_VERBOSE, _("Converted %d files in %s seconds.\n"), 186 file_count, print_decimal (secs)); 187 188 ptimer_destroy (timer); 189} 190 191static void write_backup_file (const char *, downloaded_file_t); 192static const char *replace_plain (const char*, int, FILE*, const char *); 193static const char *replace_attr (const char *, int, FILE *, const char *); 194static const char *replace_attr_refresh_hack (const char *, int, FILE *, 195 const char *, int); 196static char *local_quote_string (const char *); 197static char *construct_relative (const char *, const char *); 198 199/* Change the links in one file. LINKS is a list of links in the 200 document, along with their positions and the desired direction of 201 the conversion. */ 202static void 203convert_links (const char *file, struct urlpos *links) 204{ 205 struct file_memory *fm; 206 FILE *fp; 207 const char *p; 208 downloaded_file_t downloaded_file_return; 209 210 struct urlpos *link; 211 int to_url_count = 0, to_file_count = 0; 212 213 logprintf (LOG_VERBOSE, _("Converting %s... "), file); 214 215 { 216 /* First we do a "dry run": go through the list L and see whether 217 any URL needs to be converted in the first place. If not, just 218 leave the file alone. 
*/ 219 int dry_count = 0; 220 struct urlpos *dry; 221 for (dry = links; dry; dry = dry->next) 222 if (dry->convert != CO_NOCONVERT) 223 ++dry_count; 224 if (!dry_count) 225 { 226 logputs (LOG_VERBOSE, _("nothing to do.\n")); 227 return; 228 } 229 } 230 231 fm = read_file (file); 232 if (!fm) 233 { 234 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"), 235 file, strerror (errno)); 236 return; 237 } 238 239 downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file); 240 if (opt.backup_converted && downloaded_file_return) 241 write_backup_file (file, downloaded_file_return); 242 243 /* Before opening the file for writing, unlink the file. This is 244 important if the data in FM is mmaped. In such case, nulling the 245 file, which is what fopen() below does, would make us read all 246 zeroes from the mmaped region. */ 247 if (unlink (file) < 0 && errno != ENOENT) 248 { 249 logprintf (LOG_NOTQUIET, _("Unable to delete %s: %s\n"), 250 quote (file), strerror (errno)); 251 read_file_free (fm); 252 return; 253 } 254 /* Now open the file for writing. */ 255 fp = fopen (file, "wb"); 256 if (!fp) 257 { 258 logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"), 259 file, strerror (errno)); 260 read_file_free (fm); 261 return; 262 } 263 264 /* Here we loop through all the URLs in file, replacing those of 265 them that are downloaded with relative references. */ 266 p = fm->content; 267 for (link = links; link; link = link->next) 268 { 269 char *url_start = fm->content + link->pos; 270 271 if (link->pos >= fm->length) 272 { 273 DEBUGP (("Something strange is going on. Please investigate.")); 274 break; 275 } 276 /* If the URL is not to be converted, skip it. */ 277 if (link->convert == CO_NOCONVERT) 278 { 279 DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos)); 280 continue; 281 } 282 283 /* Echo the file contents, up to the offending URL's opening 284 quote, to the outfile. 
*/ 285 fwrite (p, 1, url_start - p, fp); 286 p = url_start; 287 288 switch (link->convert) 289 { 290 case CO_CONVERT_TO_RELATIVE: 291 /* Convert absolute URL to relative. */ 292 { 293 char *newname = construct_relative (file, link->local_name); 294 char *quoted_newname = local_quote_string (newname); 295 296 if (link->link_css_p) 297 p = replace_plain (p, link->size, fp, quoted_newname); 298 else if (!link->link_refresh_p) 299 p = replace_attr (p, link->size, fp, quoted_newname); 300 else 301 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname, 302 link->refresh_timeout); 303 304 DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n", 305 link->url->url, newname, link->pos, file)); 306 xfree (newname); 307 xfree (quoted_newname); 308 ++to_file_count; 309 break; 310 } 311 case CO_CONVERT_TO_COMPLETE: 312 /* Convert the link to absolute URL. */ 313 { 314 char *newlink = link->url->url; 315 char *quoted_newlink = html_quote_string (newlink); 316 317 if (link->link_css_p) 318 p = replace_plain (p, link->size, fp, quoted_newlink); 319 else if (!link->link_refresh_p) 320 p = replace_attr (p, link->size, fp, quoted_newlink); 321 else 322 p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink, 323 link->refresh_timeout); 324 325 DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n", 326 newlink, link->pos, file)); 327 xfree (quoted_newlink); 328 ++to_url_count; 329 break; 330 } 331 case CO_NULLIFY_BASE: 332 /* Change the base href to "". */ 333 p = replace_attr (p, link->size, fp, ""); 334 break; 335 case CO_NOCONVERT: 336 abort (); 337 break; 338 } 339 } 340 341 /* Output the rest of the file. */ 342 if (p - fm->content < fm->length) 343 fwrite (p, 1, fm->length - (p - fm->content), fp); 344 fclose (fp); 345 read_file_free (fm); 346 347 logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count); 348} 349 350/* Construct and return a link that points from BASEFILE to LINKFILE. 
351 Both files should be local file names, BASEFILE of the referrering 352 file, and LINKFILE of the referred file. 353 354 Examples: 355 356 cr("foo", "bar") -> "bar" 357 cr("A/foo", "A/bar") -> "bar" 358 cr("A/foo", "A/B/bar") -> "B/bar" 359 cr("A/X/foo", "A/Y/bar") -> "../Y/bar" 360 cr("X/", "Y/bar") -> "../Y/bar" (trailing slash does matter in BASE) 361 362 Both files should be absolute or relative, otherwise strange 363 results might ensue. The function makes no special efforts to 364 handle "." and ".." in links, so make sure they're not there 365 (e.g. using path_simplify). */ 366 367static char * 368construct_relative (const char *basefile, const char *linkfile) 369{ 370 char *link; 371 int basedirs; 372 const char *b, *l; 373 int i, start; 374 375 /* First, skip the initial directory components common to both 376 files. */ 377 start = 0; 378 for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l) 379 { 380 if (*b == '/') 381 start = (b - basefile) + 1; 382 } 383 basefile += start; 384 linkfile += start; 385 386 /* With common directories out of the way, the situation we have is 387 as follows: 388 b - b1/b2/[...]/bfile 389 l - l1/l2/[...]/lfile 390 391 The link we're constructing needs to be: 392 lnk - ../../l1/l2/[...]/lfile 393 394 Where the number of ".."'s equals the number of bN directory 395 components in B. */ 396 397 /* Count the directory components in B. */ 398 basedirs = 0; 399 for (b = basefile; *b; b++) 400 { 401 if (*b == '/') 402 ++basedirs; 403 } 404 405 /* Construct LINK as explained above. */ 406 link = xmalloc (3 * basedirs + strlen (linkfile) + 1); 407 for (i = 0; i < basedirs; i++) 408 memcpy (link + 3 * i, "../", 3); 409 strcpy (link + 3 * i, linkfile); 410 return link; 411} 412 413/* Used by write_backup_file to remember which files have been 414 written. 
*/ 415static struct hash_table *converted_files; 416 417static void 418write_backup_file (const char *file, downloaded_file_t downloaded_file_return) 419{ 420 /* Rather than just writing over the original .html file with the 421 converted version, save the former to *.orig. Note we only do 422 this for files we've _successfully_ downloaded, so we don't 423 clobber .orig files sitting around from previous invocations. 424 On VMS, use "_orig" instead of ".orig". See "wget.h". */ 425 426 /* Construct the backup filename as the original name plus ".orig". */ 427 size_t filename_len = strlen (file); 428 char* filename_plus_orig_suffix; 429 430 /* TODO: hack this to work with css files */ 431 if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED) 432 { 433 /* Just write "orig" over "html". We need to do it this way 434 because when we're checking to see if we've downloaded the 435 file before (to see if we can skip downloading it), we don't 436 know if it's a text/html file. Therefore we don't know yet 437 at that stage that -E is going to cause us to tack on 438 ".html", so we need to compare vs. the original URL plus 439 ".orig", not the original URL plus ".html.orig". */ 440 filename_plus_orig_suffix = alloca (filename_len + 1); 441 strcpy (filename_plus_orig_suffix, file); 442 strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig"); 443 } 444 else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */ 445 { 446 /* Append ".orig" to the name. */ 447 filename_plus_orig_suffix = alloca (filename_len + sizeof (ORIG_SFX)); 448 strcpy (filename_plus_orig_suffix, file); 449 strcpy (filename_plus_orig_suffix + filename_len, ORIG_SFX); 450 } 451 452 if (!converted_files) 453 converted_files = make_string_hash_table (0); 454 455 /* We can get called twice on the same URL thanks to the 456 convert_all_links() call in main(). 
If we write the .orig file 457 each time in such a case, it'll end up containing the first-pass 458 conversion, not the original file. So, see if we've already been 459 called on this file. */ 460 if (!string_set_contains (converted_files, file)) 461 { 462 /* Rename <file> to <file>.orig before former gets written over. */ 463 if (rename (file, filename_plus_orig_suffix) != 0) 464 logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"), 465 file, filename_plus_orig_suffix, strerror (errno)); 466 467 /* Remember that we've already written a .orig backup for this file. 468 Note that we never free this memory since we need it till the 469 convert_all_links() call, which is one of the last things the 470 program does before terminating. BTW, I'm not sure if it would be 471 safe to just set 'converted_file_ptr->string' to 'file' below, 472 rather than making a copy of the string... Another note is that I 473 thought I could just add a field to the urlpos structure saying 474 that we'd written a .orig file for this URL, but that didn't work, 475 so I had to make this separate list. 476 -- Dan Harkless <wget@harkless.org> 477 478 This [adding a field to the urlpos structure] didn't work 479 because convert_file() is called from convert_all_links at 480 the end of the retrieval with a freshly built new urlpos 481 list. 482 -- Hrvoje Niksic <hniksic@xemacs.org> 483 */ 484 string_set_add (converted_files, file); 485 } 486} 487 488static bool find_fragment (const char *, int, const char **, const char **); 489 490/* Replace a string with NEW_TEXT. Ignore quoting. */ 491static const char * 492replace_plain (const char *p, int size, FILE *fp, const char *new_text) 493{ 494 fputs (new_text, fp); 495 p += size; 496 return p; 497} 498 499/* Replace an attribute's original text with NEW_TEXT. */ 500 501static const char * 502replace_attr (const char *p, int size, FILE *fp, const char *new_text) 503{ 504 bool quote_flag = false; 505 char quote_char = '\"'; /* use "..." 
for quoting, unless the 506 original value is quoted, in which 507 case reuse its quoting char. */ 508 const char *frag_beg, *frag_end; 509 510 /* Structure of our string is: 511 "...old-contents..." 512 <--- size ---> (with quotes) 513 OR: 514 ...old-contents... 515 <--- size --> (no quotes) */ 516 517 if (*p == '\"' || *p == '\'') 518 { 519 quote_char = *p; 520 quote_flag = true; 521 ++p; 522 size -= 2; /* disregard opening and closing quote */ 523 } 524 putc (quote_char, fp); 525 fputs (new_text, fp); 526 527 /* Look for fragment identifier, if any. */ 528 if (find_fragment (p, size, &frag_beg, &frag_end)) 529 fwrite (frag_beg, 1, frag_end - frag_beg, fp); 530 p += size; 531 if (quote_flag) 532 ++p; 533 putc (quote_char, fp); 534 535 return p; 536} 537 538/* The same as REPLACE_ATTR, but used when replacing 539 <meta http-equiv=refresh content="new_text"> because we need to 540 append "timeout_value; URL=" before the next_text. */ 541 542static const char * 543replace_attr_refresh_hack (const char *p, int size, FILE *fp, 544 const char *new_text, int timeout) 545{ 546 /* "0; URL=..." */ 547 char *new_with_timeout = (char *)alloca (numdigit (timeout) 548 + 6 /* "; URL=" */ 549 + strlen (new_text) 550 + 1); 551 sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text); 552 553 return replace_attr (p, size, fp, new_with_timeout); 554} 555 556/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not 557 preceded by '&'. If the character is not found, return zero. If 558 the character is found, return true and set BP and EP to point to 559 the beginning and end of the region. 560 561 This is used for finding the fragment indentifiers in URLs. 
*/ 562 563static bool 564find_fragment (const char *beg, int size, const char **bp, const char **ep) 565{ 566 const char *end = beg + size; 567 bool saw_amp = false; 568 for (; beg < end; beg++) 569 { 570 switch (*beg) 571 { 572 case '&': 573 saw_amp = true; 574 break; 575 case '#': 576 if (!saw_amp) 577 { 578 *bp = beg; 579 *ep = end; 580 return true; 581 } 582 /* fallthrough */ 583 default: 584 saw_amp = false; 585 } 586 } 587 return false; 588} 589 590/* Quote FILE for use as local reference to an HTML file. 591 592 We quote ? as %3F to avoid passing part of the file name as the 593 parameter when browsing the converted file through HTTP. However, 594 it is safe to do this only when `--adjust-extension' is turned on. 595 This is because converting "index.html?foo=bar" to 596 "index.html%3Ffoo=bar" would break local browsing, as the latter 597 isn't even recognized as an HTML file! However, converting 598 "index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be 599 safe for both local and HTTP-served browsing. 600 601 We always quote "#" as "%23", "%" as "%25" and ";" as "%3B" 602 because those characters have special meanings in URLs. */ 603 604static char * 605local_quote_string (const char *file) 606{ 607 const char *from; 608 char *newname, *to; 609 610 char *any = strpbrk (file, "?#%;"); 611 if (!any) 612 return html_quote_string (file); 613 614 /* Allocate space assuming the worst-case scenario, each character 615 having to be quoted. 
*/ 616 to = newname = (char *)alloca (3 * strlen (file) + 1); 617 for (from = file; *from; from++) 618 switch (*from) 619 { 620 case '%': 621 *to++ = '%'; 622 *to++ = '2'; 623 *to++ = '5'; 624 break; 625 case '#': 626 *to++ = '%'; 627 *to++ = '2'; 628 *to++ = '3'; 629 break; 630 case ';': 631 *to++ = '%'; 632 *to++ = '3'; 633 *to++ = 'B'; 634 break; 635 case '?': 636 if (opt.adjust_extension) 637 { 638 *to++ = '%'; 639 *to++ = '3'; 640 *to++ = 'F'; 641 break; 642 } 643 /* fallthrough */ 644 default: 645 *to++ = *from; 646 } 647 *to = '\0'; 648 649 return html_quote_string (newname); 650} 651 652/* Book-keeping code for dl_file_url_map, dl_url_file_map, 653 downloaded_html_list, and downloaded_html_set. Other code calls 654 these functions to let us know that a file has been downloaded. */ 655 656#define ENSURE_TABLES_EXIST do { \ 657 if (!dl_file_url_map) \ 658 dl_file_url_map = make_string_hash_table (0); \ 659 if (!dl_url_file_map) \ 660 dl_url_file_map = make_string_hash_table (0); \ 661} while (0) 662 663/* Return true if S1 and S2 are the same, except for "/index.html". 664 The three cases in which it returns one are (substitute any 665 substring for "foo"): 666 667 m("foo/index.html", "foo/") ==> 1 668 m("foo/", "foo/index.html") ==> 1 669 m("foo", "foo/index.html") ==> 1 670 m("foo", "foo/" ==> 1 671 m("foo", "foo") ==> 1 */ 672 673static bool 674match_except_index (const char *s1, const char *s2) 675{ 676 int i; 677 const char *lng; 678 679 /* Skip common substring. */ 680 for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++) 681 ; 682 if (i == 0) 683 /* Strings differ at the very beginning -- bail out. We need to 684 check this explicitly to avoid `lng - 1' reading outside the 685 array. */ 686 return false; 687 688 if (!*s1 && !*s2) 689 /* Both strings hit EOF -- strings are equal. */ 690 return true; 691 else if (*s1 && *s2) 692 /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux". 
*/ 693 return false; 694 else if (*s1) 695 /* S1 is the longer one. */ 696 lng = s1; 697 else 698 /* S2 is the longer one. */ 699 lng = s2; 700 701 /* foo */ /* foo/ */ 702 /* foo/index.html */ /* or */ /* foo/index.html */ 703 /* ^ */ /* ^ */ 704 705 if (*lng != '/') 706 /* The right-hand case. */ 707 --lng; 708 709 if (*lng == '/' && *(lng + 1) == '\0') 710 /* foo */ 711 /* foo/ */ 712 return true; 713 714 return 0 == strcmp (lng, "/index.html"); 715} 716 717static int 718dissociate_urls_from_file_mapper (void *key, void *value, void *arg) 719{ 720 char *mapping_url = (char *)key; 721 char *mapping_file = (char *)value; 722 char *file = (char *)arg; 723 724 if (0 == strcmp (mapping_file, file)) 725 { 726 hash_table_remove (dl_url_file_map, mapping_url); 727 xfree (mapping_url); 728 xfree (mapping_file); 729 } 730 731 /* Continue mapping. */ 732 return 0; 733} 734 735/* Remove all associations from various URLs to FILE from dl_url_file_map. */ 736 737static void 738dissociate_urls_from_file (const char *file) 739{ 740 /* Can't use hash_table_iter_* because the table mutates while mapping. */ 741 hash_table_for_each (dl_url_file_map, dissociate_urls_from_file_mapper, 742 (char *) file); 743} 744 745/* Register that URL has been successfully downloaded to FILE. This 746 is used by the link conversion code to convert references to URLs 747 to references to local files. It is also being used to check if a 748 URL has already been downloaded. */ 749 750void 751register_download (const char *url, const char *file) 752{ 753 char *old_file, *old_url; 754 755 ENSURE_TABLES_EXIST; 756 757 /* With some forms of retrieval, it is possible, although not likely 758 or particularly desirable. If both are downloaded, the second 759 download will override the first one. When that happens, 760 dissociate the old file name from the URL. 
*/ 761 762 if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url)) 763 { 764 if (0 == strcmp (url, old_url)) 765 /* We have somehow managed to download the same URL twice. 766 Nothing to do. */ 767 return; 768 769 if (match_except_index (url, old_url) 770 && !hash_table_contains (dl_url_file_map, url)) 771 /* The two URLs differ only in the "index.html" ending. For 772 example, one is "http://www.server.com/", and the other is 773 "http://www.server.com/index.html". Don't remove the old 774 one, just add the new one as a non-canonical entry. */ 775 goto url_only; 776 777 hash_table_remove (dl_file_url_map, file); 778 xfree (old_file); 779 xfree (old_url); 780 781 /* Remove all the URLs that point to this file. Yes, there can 782 be more than one such URL, because we store redirections as 783 multiple entries in dl_url_file_map. For example, if URL1 784 redirects to URL2 which gets downloaded to FILE, we map both 785 URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map 786 only points to URL2.) When another URL gets loaded to FILE, 787 we want both URL1 and URL2 dissociated from it. 788 789 This is a relatively expensive operation because it performs 790 a linear search of the whole hash table, but it should be 791 called very rarely, only when two URLs resolve to the same 792 file name, *and* the "<file>.1" extensions are turned off. 793 In other words, almost never. */ 794 dissociate_urls_from_file (file); 795 } 796 797 hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url)); 798 799 url_only: 800 /* A URL->FILE mapping is not possible without a FILE->URL mapping. 801 If the latter were present, it should have been removed by the 802 above `if'. So we could write: 803 804 assert (!hash_table_contains (dl_url_file_map, url)); 805 806 The above is correct when running in recursive mode where the 807 same URL always resolves to the same file. 
But if you do 808 something like: 809 810 wget URL URL 811 812 then the first URL will resolve to "FILE", and the other to 813 "FILE.1". In that case, FILE.1 will not be found in 814 dl_file_url_map, but URL will still point to FILE in 815 dl_url_file_map. */ 816 if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file)) 817 { 818 hash_table_remove (dl_url_file_map, url); 819 xfree (old_url); 820 xfree (old_file); 821 } 822 823 hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file)); 824} 825 826/* Register that FROM has been redirected to TO. This assumes that TO 827 is successfully downloaded and already registered using 828 register_download() above. */ 829 830void 831register_redirection (const char *from, const char *to) 832{ 833 char *file; 834 835 ENSURE_TABLES_EXIST; 836 837 file = hash_table_get (dl_url_file_map, to); 838 assert (file != NULL); 839 if (!hash_table_contains (dl_url_file_map, from)) 840 hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file)); 841} 842 843/* Register that the file has been deleted. */ 844 845void 846register_delete_file (const char *file) 847{ 848 char *old_url, *old_file; 849 850 ENSURE_TABLES_EXIST; 851 852 if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url)) 853 return; 854 855 hash_table_remove (dl_file_url_map, file); 856 xfree (old_file); 857 xfree (old_url); 858 dissociate_urls_from_file (file); 859} 860 861/* Register that FILE is an HTML file that has been downloaded. */ 862 863void 864register_html (const char *url, const char *file) 865{ 866 if (!downloaded_html_set) 867 downloaded_html_set = make_string_hash_table (0); 868 string_set_add (downloaded_html_set, file); 869} 870 871/* Register that FILE is a CSS file that has been downloaded. 
*/ 872 873void 874register_css (const char *url, const char *file) 875{ 876 if (!downloaded_css_set) 877 downloaded_css_set = make_string_hash_table (0); 878 string_set_add (downloaded_css_set, file); 879} 880 881static void downloaded_files_free (void); 882 883/* Cleanup the data structures associated with this file. */ 884 885void 886convert_cleanup (void) 887{ 888 if (dl_file_url_map) 889 { 890 free_keys_and_values (dl_file_url_map); 891 hash_table_destroy (dl_file_url_map); 892 dl_file_url_map = NULL; 893 } 894 if (dl_url_file_map) 895 { 896 free_keys_and_values (dl_url_file_map); 897 hash_table_destroy (dl_url_file_map); 898 dl_url_file_map = NULL; 899 } 900 if (downloaded_html_set) 901 string_set_free (downloaded_html_set); 902 downloaded_files_free (); 903 if (converted_files) 904 string_set_free (converted_files); 905} 906 907/* Book-keeping code for downloaded files that enables extension 908 hacks. */ 909 910/* This table should really be merged with dl_file_url_map and 911 downloaded_html_files. This was originally a list, but I changed 912 it to a hash table beause it was actually taking a lot of time to 913 find things in it. */ 914 915static struct hash_table *downloaded_files_hash; 916 917/* We're storing "modes" of type downloaded_file_t in the hash table. 918 However, our hash tables only accept pointers for keys and values. 919 So when we need a pointer, we use the address of a 920 downloaded_file_t variable of static storage. 
*/

static downloaded_file_t *
downloaded_mode_to_ptr (downloaded_file_t mode)
{
  /* One statically allocated cell per mode; the hash table stores the
     address of the matching cell.  */
  static downloaded_file_t not_downloaded = FILE_NOT_ALREADY_DOWNLOADED;
  static downloaded_file_t normally = FILE_DOWNLOADED_NORMALLY;
  static downloaded_file_t html_added
    = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED;
  static downloaded_file_t check_only = CHECK_FOR_FILE;

  if (mode == FILE_NOT_ALREADY_DOWNLOADED)
    return &not_downloaded;
  if (mode == FILE_DOWNLOADED_NORMALLY)
    return &normally;
  if (mode == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)
    return &html_added;
  if (mode == CHECK_FOR_FILE)
    return &check_only;

  /* Not one of the modes handled above.  */
  return NULL;
}

/* Remembers which files have been downloaded.  In the standard case,
   should be called with mode == FILE_DOWNLOADED_NORMALLY for each
   file we actually download successfully (i.e. not for ones we have
   failures on or that we skip due to -N).

   When we've downloaded a file and tacked on a ".html" extension due
   to -E, call this function with
   FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than
   FILE_DOWNLOADED_NORMALLY.

   If you just want to check if a file has been previously added
   without adding it, call with mode == CHECK_FOR_FILE.  Please be
   sure to call this function with local filenames, not remote
   URLs.
*/ 959 960downloaded_file_t 961downloaded_file (downloaded_file_t mode, const char *file) 962{ 963 downloaded_file_t *ptr; 964 965 if (mode == CHECK_FOR_FILE) 966 { 967 if (!downloaded_files_hash) 968 return FILE_NOT_ALREADY_DOWNLOADED; 969 ptr = hash_table_get (downloaded_files_hash, file); 970 if (!ptr) 971 return FILE_NOT_ALREADY_DOWNLOADED; 972 return *ptr; 973 } 974 975 if (!downloaded_files_hash) 976 downloaded_files_hash = make_string_hash_table (0); 977 978 ptr = hash_table_get (downloaded_files_hash, file); 979 if (ptr) 980 return *ptr; 981 982 ptr = downloaded_mode_to_ptr (mode); 983 hash_table_put (downloaded_files_hash, xstrdup (file), ptr); 984 985 return FILE_NOT_ALREADY_DOWNLOADED; 986} 987 988static void 989downloaded_files_free (void) 990{ 991 if (downloaded_files_hash) 992 { 993 hash_table_iterator iter; 994 for (hash_table_iterate (downloaded_files_hash, &iter); 995 hash_table_iter_next (&iter); 996 ) 997 xfree (iter.key); 998 hash_table_destroy (downloaded_files_hash); 999 downloaded_files_hash = NULL; 1000 } 1001} 1002 1003/* The function returns the pointer to the malloc-ed quoted version of 1004 string s. It will recognize and quote numeric and special graphic 1005 entities, as per RFC1866: 1006 1007 `&' -> `&' 1008 `<' -> `<' 1009 `>' -> `>' 1010 `"' -> `"' 1011 SP -> ` ' 1012 1013 No other entities are recognized or replaced. */ 1014char * 1015html_quote_string (const char *s) 1016{ 1017 const char *b = s; 1018 char *p, *res; 1019 int i; 1020 1021 /* Pass through the string, and count the new size. 
*/ 1022 for (i = 0; *s; s++, i++) 1023 { 1024 if (*s == '&') 1025 i += 4; /* `amp;' */ 1026 else if (*s == '<' || *s == '>') 1027 i += 3; /* `lt;' and `gt;' */ 1028 else if (*s == '\"') 1029 i += 5; /* `quot;' */ 1030 else if (*s == ' ') 1031 i += 4; /* #32; */ 1032 } 1033 res = xmalloc (i + 1); 1034 s = b; 1035 for (p = res; *s; s++) 1036 { 1037 switch (*s) 1038 { 1039 case '&': 1040 *p++ = '&'; 1041 *p++ = 'a'; 1042 *p++ = 'm'; 1043 *p++ = 'p'; 1044 *p++ = ';'; 1045 break; 1046 case '<': case '>': 1047 *p++ = '&'; 1048 *p++ = (*s == '<' ? 'l' : 'g'); 1049 *p++ = 't'; 1050 *p++ = ';'; 1051 break; 1052 case '\"': 1053 *p++ = '&'; 1054 *p++ = 'q'; 1055 *p++ = 'u'; 1056 *p++ = 'o'; 1057 *p++ = 't'; 1058 *p++ = ';'; 1059 break; 1060 case ' ': 1061 *p++ = '&'; 1062 *p++ = '#'; 1063 *p++ = '3'; 1064 *p++ = '2'; 1065 *p++ = ';'; 1066 break; 1067 default: 1068 *p++ = *s; 1069 } 1070 } 1071 *p = '\0'; 1072 return res; 1073} 1074 1075/* 1076 * vim: et ts=2 sw=2 1077 */ 1078 1079