/* Collect URLs from HTML source.
   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   2007, 2008, 2009 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.
*/ 30 31#include "wget.h" 32 33#include <stdio.h> 34#include <string.h> 35#include <stdlib.h> 36#include <errno.h> 37#include <assert.h> 38 39#include "html-parse.h" 40#include "url.h" 41#include "utils.h" 42#include "hash.h" 43#include "convert.h" 44#include "recur.h" 45#include "html-url.h" 46#include "css-url.h" 47 48typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); 49 50#define DECLARE_TAG_HANDLER(fun) \ 51 static void fun (int, struct taginfo *, struct map_context *) 52 53DECLARE_TAG_HANDLER (tag_find_urls); 54DECLARE_TAG_HANDLER (tag_handle_base); 55DECLARE_TAG_HANDLER (tag_handle_form); 56DECLARE_TAG_HANDLER (tag_handle_link); 57DECLARE_TAG_HANDLER (tag_handle_meta); 58 59enum { 60 TAG_A, 61 TAG_APPLET, 62 TAG_AREA, 63 TAG_BASE, 64 TAG_BGSOUND, 65 TAG_BODY, 66 TAG_EMBED, 67 TAG_FIG, 68 TAG_FORM, 69 TAG_FRAME, 70 TAG_IFRAME, 71 TAG_IMG, 72 TAG_INPUT, 73 TAG_LAYER, 74 TAG_LINK, 75 TAG_META, 76 TAG_OBJECT, 77 TAG_OVERLAY, 78 TAG_SCRIPT, 79 TAG_TABLE, 80 TAG_TD, 81 TAG_TH 82}; 83 84/* The list of known tags and functions used for handling them. Most 85 tags are simply harvested for URLs. 
*/ 86static struct known_tag { 87 int tagid; 88 const char *name; 89 tag_handler_t handler; 90} known_tags[] = { 91 { TAG_A, "a", tag_find_urls }, 92 { TAG_APPLET, "applet", tag_find_urls }, 93 { TAG_AREA, "area", tag_find_urls }, 94 { TAG_BASE, "base", tag_handle_base }, 95 { TAG_BGSOUND, "bgsound", tag_find_urls }, 96 { TAG_BODY, "body", tag_find_urls }, 97 { TAG_EMBED, "embed", tag_find_urls }, 98 { TAG_FIG, "fig", tag_find_urls }, 99 { TAG_FORM, "form", tag_handle_form }, 100 { TAG_FRAME, "frame", tag_find_urls }, 101 { TAG_IFRAME, "iframe", tag_find_urls }, 102 { TAG_IMG, "img", tag_find_urls }, 103 { TAG_INPUT, "input", tag_find_urls }, 104 { TAG_LAYER, "layer", tag_find_urls }, 105 { TAG_LINK, "link", tag_handle_link }, 106 { TAG_META, "meta", tag_handle_meta }, 107 { TAG_OBJECT, "object", tag_find_urls }, 108 { TAG_OVERLAY, "overlay", tag_find_urls }, 109 { TAG_SCRIPT, "script", tag_find_urls }, 110 { TAG_TABLE, "table", tag_find_urls }, 111 { TAG_TD, "td", tag_find_urls }, 112 { TAG_TH, "th", tag_find_urls } 113}; 114 115/* tag_url_attributes documents which attributes of which tags contain 116 URLs to harvest. It is used by tag_find_urls. */ 117 118/* Defines for the FLAGS. */ 119 120/* The link is "inline", i.e. needs to be retrieved for this document 121 to be correctly rendered. Inline links include inlined images, 122 stylesheets, children frames, etc. */ 123#define ATTR_INLINE 1 124 125/* The link is expected to yield HTML contents. It's important not to 126 try to follow HTML obtained by following e.g. <img src="..."> 127 regardless of content-type. Doing this causes infinite loops for 128 "images" that return non-404 error pages with links to the same 129 image. */ 130#define ATTR_HTML 2 131 132/* For tags handled by tag_find_urls: attributes that contain URLs to 133 download. 
*/ 134static struct { 135 int tagid; 136 const char *attr_name; 137 int flags; 138} tag_url_attributes[] = { 139 { TAG_A, "href", ATTR_HTML }, 140 { TAG_APPLET, "code", ATTR_INLINE }, 141 { TAG_AREA, "href", ATTR_HTML }, 142 { TAG_BGSOUND, "src", ATTR_INLINE }, 143 { TAG_BODY, "background", ATTR_INLINE }, 144 { TAG_EMBED, "href", ATTR_HTML }, 145 { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML }, 146 { TAG_FIG, "src", ATTR_INLINE }, 147 { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML }, 148 { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML }, 149 { TAG_IMG, "href", ATTR_INLINE }, 150 { TAG_IMG, "lowsrc", ATTR_INLINE }, 151 { TAG_IMG, "src", ATTR_INLINE }, 152 { TAG_INPUT, "src", ATTR_INLINE }, 153 { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML }, 154 { TAG_OBJECT, "data", ATTR_INLINE }, 155 { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML }, 156 { TAG_SCRIPT, "src", ATTR_INLINE }, 157 { TAG_TABLE, "background", ATTR_INLINE }, 158 { TAG_TD, "background", ATTR_INLINE }, 159 { TAG_TH, "background", ATTR_INLINE } 160}; 161 162/* The lists of interesting tags and attributes are built dynamically, 163 from the information above. However, some places in the code refer 164 to the attributes not mentioned here. We add them manually. */ 165static const char *additional_attributes[] = { 166 "rel", /* used by tag_handle_link */ 167 "http-equiv", /* used by tag_handle_meta */ 168 "name", /* used by tag_handle_meta */ 169 "content", /* used by tag_handle_meta */ 170 "action", /* used by tag_handle_form */ 171 "style" /* used by check_style_attr */ 172}; 173 174static struct hash_table *interesting_tags; 175static struct hash_table *interesting_attributes; 176 177/* Will contains the (last) charset found in 'http-equiv=content-type' 178 meta tags */ 179static char *meta_charset; 180 181static void 182init_interesting (void) 183{ 184 /* Init the variables interesting_tags and interesting_attributes 185 that are used by the HTML parser to know which tags and 186 attributes we're interested in. 
We initialize this only once, 187 for performance reasons. 188 189 Here we also make sure that what we put in interesting_tags 190 matches the user's preferences as specified through --ignore-tags 191 and --follow-tags. */ 192 193 size_t i; 194 interesting_tags = make_nocase_string_hash_table (countof (known_tags)); 195 196 /* First, add all the tags we know hot to handle, mapped to their 197 respective entries in known_tags. */ 198 for (i = 0; i < countof (known_tags); i++) 199 hash_table_put (interesting_tags, known_tags[i].name, known_tags + i); 200 201 /* Then remove the tags ignored through --ignore-tags. */ 202 if (opt.ignore_tags) 203 { 204 char **ignored; 205 for (ignored = opt.ignore_tags; *ignored; ignored++) 206 hash_table_remove (interesting_tags, *ignored); 207 } 208 209 /* If --follow-tags is specified, use only those tags. */ 210 if (opt.follow_tags) 211 { 212 /* Create a new table intersecting --follow-tags and known_tags, 213 and use it as interesting_tags. */ 214 struct hash_table *intersect = make_nocase_string_hash_table (0); 215 char **followed; 216 for (followed = opt.follow_tags; *followed; followed++) 217 { 218 struct known_tag *t = hash_table_get (interesting_tags, *followed); 219 if (!t) 220 continue; /* ignore unknown --follow-tags entries. */ 221 hash_table_put (intersect, *followed, t); 222 } 223 hash_table_destroy (interesting_tags); 224 interesting_tags = intersect; 225 } 226 227 /* Add the attributes we care about. */ 228 interesting_attributes = make_nocase_string_hash_table (10); 229 for (i = 0; i < countof (additional_attributes); i++) 230 hash_table_put (interesting_attributes, additional_attributes[i], "1"); 231 for (i = 0; i < countof (tag_url_attributes); i++) 232 hash_table_put (interesting_attributes, 233 tag_url_attributes[i].attr_name, "1"); 234} 235 236/* Find the value of attribute named NAME in the taginfo TAG. If the 237 attribute is not present, return NULL. 
If ATTRIND is non-NULL, the 238 index of the attribute in TAG will be stored there. */ 239 240static char * 241find_attr (struct taginfo *tag, const char *name, int *attrind) 242{ 243 int i; 244 for (i = 0; i < tag->nattrs; i++) 245 if (!strcasecmp (tag->attrs[i].name, name)) 246 { 247 if (attrind) 248 *attrind = i; 249 return tag->attrs[i].value; 250 } 251 return NULL; 252} 253 254/* used for calls to append_url */ 255#define ATTR_POS(tag, attrind, ctx) \ 256 (tag->attrs[attrind].value_raw_beginning - ctx->text) 257#define ATTR_SIZE(tag, attrind) \ 258 (tag->attrs[attrind].value_raw_size) 259 260/* Append LINK_URI to the urlpos structure that is being built. 261 262 LINK_URI will be merged with the current document base. 263*/ 264 265struct urlpos * 266append_url (const char *link_uri, int position, int size, 267 struct map_context *ctx) 268{ 269 int link_has_scheme = url_has_scheme (link_uri); 270 struct urlpos *newel; 271 const char *base = ctx->base ? ctx->base : ctx->parent_base; 272 struct url *url; 273 274 if (!base) 275 { 276 DEBUGP (("%s: no base, merge will use \"%s\".\n", 277 ctx->document_file, link_uri)); 278 279 if (!link_has_scheme) 280 { 281 /* Base URL is unavailable, and the link does not have a 282 location attached to it -- we have to give up. Since 283 this can only happen when using `--force-html -i', print 284 a warning. */ 285 logprintf (LOG_NOTQUIET, 286 _("%s: Cannot resolve incomplete link %s.\n"), 287 ctx->document_file, link_uri); 288 return NULL; 289 } 290 291 url = url_parse (link_uri, NULL, NULL, false); 292 if (!url) 293 { 294 DEBUGP (("%s: link \"%s\" doesn't parse.\n", 295 ctx->document_file, link_uri)); 296 return NULL; 297 } 298 } 299 else 300 { 301 /* Merge BASE with LINK_URI, but also make sure the result is 302 canonicalized, i.e. that "../" have been resolved. 303 (parse_url will do that for us.) 
*/ 304 305 char *complete_uri = uri_merge (base, link_uri); 306 307 DEBUGP (("%s: merge(%s, %s) -> %s\n", 308 quotearg_n_style (0, escape_quoting_style, ctx->document_file), 309 quote_n (1, base), 310 quote_n (2, link_uri), 311 quotearg_n_style (3, escape_quoting_style, complete_uri))); 312 313 url = url_parse (complete_uri, NULL, NULL, false); 314 if (!url) 315 { 316 DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", 317 ctx->document_file, complete_uri)); 318 xfree (complete_uri); 319 return NULL; 320 } 321 xfree (complete_uri); 322 } 323 324 DEBUGP (("appending %s to urlpos.\n", quote (url->url))); 325 326 newel = xnew0 (struct urlpos); 327 newel->url = url; 328 newel->pos = position; 329 newel->size = size; 330 331 /* A URL is relative if the host is not named, and the name does not 332 start with `/'. */ 333 if (!link_has_scheme && *link_uri != '/') 334 newel->link_relative_p = 1; 335 else if (link_has_scheme) 336 newel->link_complete_p = 1; 337 338 if (ctx->tail) 339 { 340 ctx->tail->next = newel; 341 ctx->tail = newel; 342 } 343 else 344 ctx->tail = ctx->head = newel; 345 346 return newel; 347} 348 349static void 350check_style_attr (struct taginfo *tag, struct map_context *ctx) 351{ 352 int attrind; 353 char *style = find_attr (tag, "style", &attrind); 354 if (!style) 355 return; 356 357 /* raw pos and raw size include the quotes, hence the +1 -2 */ 358 get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2); 359} 360 361/* All the tag_* functions are called from collect_tags_mapper, as 362 specified by KNOWN_TAGS. */ 363 364/* Default tag handler: collect URLs from attributes specified for 365 this tag by tag_url_attributes. 
*/ 366 367static void 368tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) 369{ 370 size_t i; 371 int attrind; 372 int first = -1; 373 374 for (i = 0; i < countof (tag_url_attributes); i++) 375 if (tag_url_attributes[i].tagid == tagid) 376 { 377 /* We've found the index of tag_url_attributes where the 378 attributes of our tag begin. */ 379 first = i; 380 break; 381 } 382 assert (first != -1); 383 384 /* Loop over the "interesting" attributes of this tag. In this 385 example, it will loop over "src" and "lowsrc". 386 387 <img src="foo.png" lowsrc="bar.png"> 388 389 This has to be done in the outer loop so that the attributes are 390 processed in the same order in which they appear in the page. 391 This is required when converting links. */ 392 393 for (attrind = 0; attrind < tag->nattrs; attrind++) 394 { 395 /* Find whether TAG/ATTRIND is a combination that contains a 396 URL. */ 397 char *link = tag->attrs[attrind].value; 398 const size_t size = countof (tag_url_attributes); 399 400 /* If you're cringing at the inefficiency of the nested loops, 401 remember that they both iterate over a very small number of 402 items. The worst-case inner loop is for the IMG tag, which 403 has three attributes. */ 404 for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++) 405 { 406 if (0 == strcasecmp (tag->attrs[attrind].name, 407 tag_url_attributes[i].attr_name)) 408 { 409 struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx), 410 ATTR_SIZE(tag,attrind), ctx); 411 if (up) 412 { 413 int flags = tag_url_attributes[i].flags; 414 if (flags & ATTR_INLINE) 415 up->link_inline_p = 1; 416 if (flags & ATTR_HTML) 417 up->link_expect_html = 1; 418 } 419 } 420 } 421 } 422} 423 424/* Handle the BASE tag, for <base href=...>. 
*/ 425 426static void 427tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx) 428{ 429 struct urlpos *base_urlpos; 430 int attrind; 431 char *newbase = find_attr (tag, "href", &attrind); 432 if (!newbase) 433 return; 434 435 base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx), 436 ATTR_SIZE(tag,attrind), ctx); 437 if (!base_urlpos) 438 return; 439 base_urlpos->ignore_when_downloading = 1; 440 base_urlpos->link_base_p = 1; 441 442 if (ctx->base) 443 xfree (ctx->base); 444 if (ctx->parent_base) 445 ctx->base = uri_merge (ctx->parent_base, newbase); 446 else 447 ctx->base = xstrdup (newbase); 448} 449 450/* Mark the URL found in <form action=...> for conversion. */ 451 452static void 453tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx) 454{ 455 int attrind; 456 char *action = find_attr (tag, "action", &attrind); 457 458 if (action) 459 { 460 struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx), 461 ATTR_SIZE(tag,attrind), ctx); 462 if (up) 463 up->ignore_when_downloading = 1; 464 } 465} 466 467/* Handle the LINK tag. It requires special handling because how its 468 links will be followed in -p mode depends on the REL attribute. 
*/ 469 470static void 471tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx) 472{ 473 int attrind; 474 char *href = find_attr (tag, "href", &attrind); 475 476 /* All <link href="..."> link references are external, except those 477 known not to be, such as style sheet and shortcut icon: 478 479 <link rel="stylesheet" href="..."> 480 <link rel="shortcut icon" href="..."> 481 */ 482 if (href) 483 { 484 struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx), 485 ATTR_SIZE(tag,attrind), ctx); 486 if (up) 487 { 488 char *rel = find_attr (tag, "rel", NULL); 489 if (rel) 490 { 491 if (0 == strcasecmp (rel, "stylesheet")) 492 { 493 up->link_inline_p = 1; 494 up->link_expect_css = 1; 495 } 496 else if (0 == strcasecmp (rel, "shortcut icon")) 497 { 498 up->link_inline_p = 1; 499 } 500 } 501 else 502 /* The external ones usually point to HTML pages, such as 503 <link rel="next" href="..."> */ 504 up->link_expect_html = 1; 505 } 506 } 507} 508 509/* Handle the META tag. This requires special handling because of the 510 refresh feature and because of robot exclusion. */ 511 512static void 513tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx) 514{ 515 char *name = find_attr (tag, "name", NULL); 516 char *http_equiv = find_attr (tag, "http-equiv", NULL); 517 518 if (http_equiv && 0 == strcasecmp (http_equiv, "refresh")) 519 { 520 /* Some pages use a META tag to specify that the page be 521 refreshed by a new page after a given number of seconds. The 522 general format for this is: 523 524 <meta http-equiv=Refresh content="NUMBER; URL=index2.html"> 525 526 So we just need to skip past the "NUMBER; URL=" garbage to 527 get to the URL. 
*/ 528 529 struct urlpos *entry; 530 int attrind; 531 int timeout = 0; 532 char *p; 533 534 char *refresh = find_attr (tag, "content", &attrind); 535 if (!refresh) 536 return; 537 538 for (p = refresh; c_isdigit (*p); p++) 539 timeout = 10 * timeout + *p - '0'; 540 if (*p++ != ';') 541 return; 542 543 while (c_isspace (*p)) 544 ++p; 545 if (!( c_toupper (*p) == 'U' 546 && c_toupper (*(p + 1)) == 'R' 547 && c_toupper (*(p + 2)) == 'L' 548 && *(p + 3) == '=')) 549 return; 550 p += 4; 551 while (c_isspace (*p)) 552 ++p; 553 554 entry = append_url (p, ATTR_POS(tag,attrind,ctx), 555 ATTR_SIZE(tag,attrind), ctx); 556 if (entry) 557 { 558 entry->link_refresh_p = 1; 559 entry->refresh_timeout = timeout; 560 entry->link_expect_html = 1; 561 } 562 } 563 else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type")) 564 { 565 /* Handle stuff like: 566 <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */ 567 568 char *mcharset; 569 char *content = find_attr (tag, "content", NULL); 570 if (!content) 571 return; 572 573 mcharset = parse_charset (content); 574 if (!mcharset) 575 return; 576 577 xfree_null (meta_charset); 578 meta_charset = mcharset; 579 } 580 else if (name && 0 == strcasecmp (name, "robots")) 581 { 582 /* Handle stuff like: 583 <meta name="robots" content="index,nofollow"> */ 584 char *content = find_attr (tag, "content", NULL); 585 if (!content) 586 return; 587 if (!strcasecmp (content, "none")) 588 ctx->nofollow = true; 589 else 590 { 591 while (*content) 592 { 593 char *end; 594 /* Skip any initial whitespace. */ 595 content += strspn (content, " \f\n\r\t\v"); 596 /* Find the next occurrence of ',' or whitespace, 597 * or the end of the string. */ 598 end = content + strcspn (content, ", \f\n\r\t\v"); 599 if (!strncasecmp (content, "nofollow", end - content)) 600 ctx->nofollow = true; 601 /* Skip past the next comma, if any. 
*/ 602 if (*end == ',') 603 ++end; 604 else 605 { 606 end = strchr (end, ','); 607 if (end) 608 ++end; 609 else 610 end = content + strlen (content); 611 } 612 content = end; 613 } 614 } 615 } 616} 617 618/* Dispatch the tag handler appropriate for the tag we're mapping 619 over. See known_tags[] for definition of tag handlers. */ 620 621static void 622collect_tags_mapper (struct taginfo *tag, void *arg) 623{ 624 struct map_context *ctx = (struct map_context *)arg; 625 626 /* Find the tag in our table of tags. This must not fail because 627 map_html_tags only returns tags found in interesting_tags. 628 629 I've changed this for now, I'm passing NULL as interesting_tags 630 to map_html_tags. This way we can check all tags for a style 631 attribute. 632 */ 633 struct known_tag *t = hash_table_get (interesting_tags, tag->name); 634 635 if (t != NULL) 636 t->handler (t->tagid, tag, ctx); 637 638 check_style_attr (tag, ctx); 639 640 if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) && 641 tag->contents_begin && tag->contents_end) 642 { 643 /* parse contents */ 644 get_urls_css (ctx, tag->contents_begin - ctx->text, 645 tag->contents_end - tag->contents_begin); 646 } 647} 648 649/* Analyze HTML tags FILE and construct a list of URLs referenced from 650 it. It merges relative links in FILE with URL. It is aware of 651 <base href=...> and does the right thing. */ 652 653struct urlpos * 654get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, 655 struct iri *iri) 656{ 657 struct file_memory *fm; 658 struct map_context ctx; 659 int flags; 660 661 /* Load the file. */ 662 fm = read_file (file); 663 if (!fm) 664 { 665 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); 666 return NULL; 667 } 668 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); 669 670 ctx.text = fm->content; 671 ctx.head = ctx.tail = NULL; 672 ctx.base = NULL; 673 ctx.parent_base = url ? 
url : opt.base_href; 674 ctx.document_file = file; 675 ctx.nofollow = false; 676 677 if (!interesting_tags) 678 init_interesting (); 679 680 /* Specify MHT_TRIM_VALUES because of buggy HTML generators that 681 generate <a href=" foo"> instead of <a href="foo"> (browsers 682 ignore spaces as well.) If you really mean space, use &32; or 683 %20. MHT_TRIM_VALUES also causes squashing of embedded newlines, 684 e.g. in <img src="foo.[newline]html">. Such newlines are also 685 ignored by IE and Mozilla and are presumably introduced by 686 writing HTML with editors that force word wrap. */ 687 flags = MHT_TRIM_VALUES; 688 if (opt.strict_comments) 689 flags |= MHT_STRICT_COMMENTS; 690 691 /* the NULL here used to be interesting_tags */ 692 map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, 693 NULL, interesting_attributes); 694 695 /* If meta charset isn't null, override content encoding */ 696 if (iri && meta_charset) 697 set_content_encoding (iri, meta_charset); 698 699 DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow)); 700 if (meta_disallow_follow) 701 *meta_disallow_follow = ctx.nofollow; 702 703 xfree_null (ctx.base); 704 read_file_free (fm); 705 return ctx.head; 706} 707 708/* This doesn't really have anything to do with HTML, but it's similar 709 to get_urls_html, so we put it here. */ 710 711struct urlpos * 712get_urls_file (const char *file) 713{ 714 struct file_memory *fm; 715 struct urlpos *head, *tail; 716 const char *text, *text_end; 717 718 /* Load the file. 
*/ 719 fm = read_file (file); 720 if (!fm) 721 { 722 logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); 723 return NULL; 724 } 725 DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); 726 727 head = tail = NULL; 728 text = fm->content; 729 text_end = fm->content + fm->length; 730 while (text < text_end) 731 { 732 int up_error_code; 733 char *url_text; 734 struct urlpos *entry; 735 struct url *url; 736 737 const char *line_beg = text; 738 const char *line_end = memchr (text, '\n', text_end - text); 739 if (!line_end) 740 line_end = text_end; 741 else 742 ++line_end; 743 text = line_end; 744 745 /* Strip whitespace from the beginning and end of line. */ 746 while (line_beg < line_end && c_isspace (*line_beg)) 747 ++line_beg; 748 while (line_end > line_beg && c_isspace (*(line_end - 1))) 749 --line_end; 750 751 if (line_beg == line_end) 752 continue; 753 754 /* The URL is in the [line_beg, line_end) region. */ 755 756 /* We must copy the URL to a zero-terminated string, and we 757 can't use alloca because we're in a loop. *sigh*. */ 758 url_text = strdupdelim (line_beg, line_end); 759 760 if (opt.base_href) 761 { 762 /* Merge opt.base_href with URL. */ 763 char *merged = uri_merge (opt.base_href, url_text); 764 xfree (url_text); 765 url_text = merged; 766 } 767 768 url = url_parse (url_text, &up_error_code, NULL, false); 769 if (!url) 770 { 771 char *error = url_error (url_text, up_error_code); 772 logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), 773 file, url_text, error); 774 xfree (url_text); 775 xfree (error); 776 continue; 777 } 778 xfree (url_text); 779 780 entry = xnew0 (struct urlpos); 781 entry->url = url; 782 783 if (!head) 784 head = entry; 785 else 786 tail->next = entry; 787 tail = entry; 788 } 789 read_file_free (fm); 790 return head; 791} 792 793void 794cleanup_html_url (void) 795{ 796 /* Destroy the hash tables. 
The hash table keys and values are not 797 allocated by this code, so we don't need to free them here. */ 798 if (interesting_tags) 799 hash_table_destroy (interesting_tags); 800 if (interesting_attributes) 801 hash_table_destroy (interesting_attributes); 802} 803