1/* Collect URLs from HTML source.
2   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
3   2007, 2008, 2009 Free Software Foundation, Inc.
4
5This file is part of GNU Wget.
6
7GNU Wget is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12GNU Wget is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19
20Additional permission under GNU GPL version 3 section 7
21
22If you modify this program, or any covered work, by linking or
23combining it with the OpenSSL project's OpenSSL library (or a
24modified version of that library), containing parts covered by the
25terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26grants you additional permission to convey the resulting work.
27Corresponding Source for a non-source form of such a combination
28shall include the source code for the parts of OpenSSL used as well
29as that of the covered work.  */
30
31#include "wget.h"
32
33#include <stdio.h>
34#include <string.h>
35#include <stdlib.h>
36#include <errno.h>
37#include <assert.h>
38
39#include "html-parse.h"
40#include "url.h"
41#include "utils.h"
42#include "hash.h"
43#include "convert.h"
44#include "recur.h"
45#include "html-url.h"
46#include "css-url.h"
47
/* Signature shared by all tag handlers.  A handler receives the
   numeric TAG_* id of the tag, the parsed tag, and the context in
   which the collected URLs are accumulated.  */
typedef void (*tag_handler_t) (int tagid, struct taginfo *tag,
                               struct map_context *ctx);

/* Forward-declare a file-local handler with the tag_handler_t
   signature.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int tagid, struct taginfo *tag,              \
                   struct map_context *ctx)

DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
58
/* Numeric identifiers of the tags listed in known_tags and
   tag_url_attributes.  The values are consecutive, starting at 0.  */
enum {
  TAG_A = 0, TAG_APPLET,  TAG_AREA,   TAG_BASE,
  TAG_BGSOUND, TAG_BODY,  TAG_EMBED,  TAG_FIG,
  TAG_FORM,  TAG_FRAME,   TAG_IFRAME, TAG_IMG,
  TAG_INPUT, TAG_LAYER,   TAG_LINK,   TAG_META,
  TAG_OBJECT, TAG_OVERLAY, TAG_SCRIPT, TAG_TABLE,
  TAG_TD,    TAG_TH
};
83
84/* The list of known tags and functions used for handling them.  Most
85   tags are simply harvested for URLs. */
86static struct known_tag {
87  int tagid;
88  const char *name;
89  tag_handler_t handler;
90} known_tags[] = {
91  { TAG_A,       "a",           tag_find_urls },
92  { TAG_APPLET,  "applet",      tag_find_urls },
93  { TAG_AREA,    "area",        tag_find_urls },
94  { TAG_BASE,    "base",        tag_handle_base },
95  { TAG_BGSOUND, "bgsound",     tag_find_urls },
96  { TAG_BODY,    "body",        tag_find_urls },
97  { TAG_EMBED,   "embed",       tag_find_urls },
98  { TAG_FIG,     "fig",         tag_find_urls },
99  { TAG_FORM,    "form",        tag_handle_form },
100  { TAG_FRAME,   "frame",       tag_find_urls },
101  { TAG_IFRAME,  "iframe",      tag_find_urls },
102  { TAG_IMG,     "img",         tag_find_urls },
103  { TAG_INPUT,   "input",       tag_find_urls },
104  { TAG_LAYER,   "layer",       tag_find_urls },
105  { TAG_LINK,    "link",        tag_handle_link },
106  { TAG_META,    "meta",        tag_handle_meta },
107  { TAG_OBJECT,  "object",      tag_find_urls },
108  { TAG_OVERLAY, "overlay",     tag_find_urls },
109  { TAG_SCRIPT,  "script",      tag_find_urls },
110  { TAG_TABLE,   "table",       tag_find_urls },
111  { TAG_TD,      "td",          tag_find_urls },
112  { TAG_TH,      "th",          tag_find_urls }
113};
114
115/* tag_url_attributes documents which attributes of which tags contain
116   URLs to harvest.  It is used by tag_find_urls.  */
117
/* Bit flags stored in the FLAGS member of tag_url_attributes.  */

/* ATTR_INLINE: the link is "inline", i.e. it has to be retrieved for
   the document to render correctly.  Inlined images, stylesheets and
   children frames are examples of inline links.  */
#define ATTR_INLINE     (1 << 0)

/* ATTR_HTML: the link is expected to yield HTML contents.  HTML
   obtained by following e.g. <img src="..."> must not be followed
   regardless of its content-type; doing so causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  */
#define ATTR_HTML       (1 << 1)
131
132/* For tags handled by tag_find_urls: attributes that contain URLs to
133   download. */
134static struct {
135  int tagid;
136  const char *attr_name;
137  int flags;
138} tag_url_attributes[] = {
139  { TAG_A,              "href",         ATTR_HTML },
140  { TAG_APPLET,         "code",         ATTR_INLINE },
141  { TAG_AREA,           "href",         ATTR_HTML },
142  { TAG_BGSOUND,        "src",          ATTR_INLINE },
143  { TAG_BODY,           "background",   ATTR_INLINE },
144  { TAG_EMBED,          "href",         ATTR_HTML },
145  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
146  { TAG_FIG,            "src",          ATTR_INLINE },
147  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
148  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
149  { TAG_IMG,            "href",         ATTR_INLINE },
150  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
151  { TAG_IMG,            "src",          ATTR_INLINE },
152  { TAG_INPUT,          "src",          ATTR_INLINE },
153  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
154  { TAG_OBJECT,         "data",         ATTR_INLINE },
155  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
156  { TAG_SCRIPT,         "src",          ATTR_INLINE },
157  { TAG_TABLE,          "background",   ATTR_INLINE },
158  { TAG_TD,             "background",   ATTR_INLINE },
159  { TAG_TH,             "background",   ATTR_INLINE }
160};
161
/* The sets of interesting tags and attributes are built at run time
   from the tables above.  Some code, however, looks at attributes
   that those tables do not mention; they are listed here by hand.  */
static const char *additional_attributes[] = {
  /* tag_handle_link */
  "rel",
  /* tag_handle_meta */
  "http-equiv",
  "name",
  "content",
  /* tag_handle_form */
  "action",
  /* check_style_attr */
  "style"
};
173
/* Tables built by init_interesting: tag name -> known_tags entry, and
   the set of attribute names we care about.  */
static struct hash_table *interesting_tags = NULL;
static struct hash_table *interesting_attributes = NULL;

/* Holds the (last) charset found in an 'http-equiv=content-type'
   meta tag.  */
static char *meta_charset = NULL;
180
181static void
182init_interesting (void)
183{
184  /* Init the variables interesting_tags and interesting_attributes
185     that are used by the HTML parser to know which tags and
186     attributes we're interested in.  We initialize this only once,
187     for performance reasons.
188
189     Here we also make sure that what we put in interesting_tags
190     matches the user's preferences as specified through --ignore-tags
191     and --follow-tags.  */
192
193  size_t i;
194  interesting_tags = make_nocase_string_hash_table (countof (known_tags));
195
196  /* First, add all the tags we know hot to handle, mapped to their
197     respective entries in known_tags.  */
198  for (i = 0; i < countof (known_tags); i++)
199    hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
200
201  /* Then remove the tags ignored through --ignore-tags.  */
202  if (opt.ignore_tags)
203    {
204      char **ignored;
205      for (ignored = opt.ignore_tags; *ignored; ignored++)
206        hash_table_remove (interesting_tags, *ignored);
207    }
208
209  /* If --follow-tags is specified, use only those tags.  */
210  if (opt.follow_tags)
211    {
212      /* Create a new table intersecting --follow-tags and known_tags,
213         and use it as interesting_tags.  */
214      struct hash_table *intersect = make_nocase_string_hash_table (0);
215      char **followed;
216      for (followed = opt.follow_tags; *followed; followed++)
217        {
218          struct known_tag *t = hash_table_get (interesting_tags, *followed);
219          if (!t)
220            continue;           /* ignore unknown --follow-tags entries. */
221          hash_table_put (intersect, *followed, t);
222        }
223      hash_table_destroy (interesting_tags);
224      interesting_tags = intersect;
225    }
226
227  /* Add the attributes we care about. */
228  interesting_attributes = make_nocase_string_hash_table (10);
229  for (i = 0; i < countof (additional_attributes); i++)
230    hash_table_put (interesting_attributes, additional_attributes[i], "1");
231  for (i = 0; i < countof (tag_url_attributes); i++)
232    hash_table_put (interesting_attributes,
233                    tag_url_attributes[i].attr_name, "1");
234}
235
236/* Find the value of attribute named NAME in the taginfo TAG.  If the
237   attribute is not present, return NULL.  If ATTRIND is non-NULL, the
238   index of the attribute in TAG will be stored there.  */
239
240static char *
241find_attr (struct taginfo *tag, const char *name, int *attrind)
242{
243  int i;
244  for (i = 0; i < tag->nattrs; i++)
245    if (!strcasecmp (tag->attrs[i].name, name))
246      {
247        if (attrind)
248          *attrind = i;
249        return tag->attrs[i].value;
250      }
251  return NULL;
252}
253
/* Used for calls to append_url.

   ATTR_POS is the offset of the raw value of attribute number ATTRIND
   of TAG, measured from the start of the document text held in CTX.
   ATTR_SIZE is the length of that raw value.

   Every macro argument is parenthesized so that expressions such as
   `&tag' or `i + 1' can be passed safely.  */
#define ATTR_POS(tag, attrind, ctx) \
 ((tag)->attrs[(attrind)].value_raw_beginning - (ctx)->text)
#define ATTR_SIZE(tag, attrind) \
 ((tag)->attrs[(attrind)].value_raw_size)
259
260/* Append LINK_URI to the urlpos structure that is being built.
261
262   LINK_URI will be merged with the current document base.
263*/
264
265struct urlpos *
266append_url (const char *link_uri, int position, int size,
267            struct map_context *ctx)
268{
269  int link_has_scheme = url_has_scheme (link_uri);
270  struct urlpos *newel;
271  const char *base = ctx->base ? ctx->base : ctx->parent_base;
272  struct url *url;
273
274  if (!base)
275    {
276      DEBUGP (("%s: no base, merge will use \"%s\".\n",
277               ctx->document_file, link_uri));
278
279      if (!link_has_scheme)
280        {
281          /* Base URL is unavailable, and the link does not have a
282             location attached to it -- we have to give up.  Since
283             this can only happen when using `--force-html -i', print
284             a warning.  */
285          logprintf (LOG_NOTQUIET,
286                     _("%s: Cannot resolve incomplete link %s.\n"),
287                     ctx->document_file, link_uri);
288          return NULL;
289        }
290
291      url = url_parse (link_uri, NULL, NULL, false);
292      if (!url)
293        {
294          DEBUGP (("%s: link \"%s\" doesn't parse.\n",
295                   ctx->document_file, link_uri));
296          return NULL;
297        }
298    }
299  else
300    {
301      /* Merge BASE with LINK_URI, but also make sure the result is
302         canonicalized, i.e. that "../" have been resolved.
303         (parse_url will do that for us.) */
304
305      char *complete_uri = uri_merge (base, link_uri);
306
307      DEBUGP (("%s: merge(%s, %s) -> %s\n",
308               quotearg_n_style (0, escape_quoting_style, ctx->document_file),
309               quote_n (1, base),
310               quote_n (2, link_uri),
311               quotearg_n_style (3, escape_quoting_style, complete_uri)));
312
313      url = url_parse (complete_uri, NULL, NULL, false);
314      if (!url)
315        {
316          DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
317                   ctx->document_file, complete_uri));
318          xfree (complete_uri);
319          return NULL;
320        }
321      xfree (complete_uri);
322    }
323
324  DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
325
326  newel = xnew0 (struct urlpos);
327  newel->url = url;
328  newel->pos = position;
329  newel->size = size;
330
331  /* A URL is relative if the host is not named, and the name does not
332     start with `/'.  */
333  if (!link_has_scheme && *link_uri != '/')
334    newel->link_relative_p = 1;
335  else if (link_has_scheme)
336    newel->link_complete_p = 1;
337
338  if (ctx->tail)
339    {
340      ctx->tail->next = newel;
341      ctx->tail = newel;
342    }
343  else
344    ctx->tail = ctx->head = newel;
345
346  return newel;
347}
348
349static void
350check_style_attr (struct taginfo *tag, struct map_context *ctx)
351{
352  int attrind;
353  char *style = find_attr (tag, "style", &attrind);
354  if (!style)
355    return;
356
357  /* raw pos and raw size include the quotes, hence the +1 -2 */
358  get_urls_css (ctx, ATTR_POS(tag,attrind,ctx)+1, ATTR_SIZE(tag,attrind)-2);
359}
360
361/* All the tag_* functions are called from collect_tags_mapper, as
362   specified by KNOWN_TAGS.  */
363
364/* Default tag handler: collect URLs from attributes specified for
365   this tag by tag_url_attributes.  */
366
367static void
368tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
369{
370  size_t i;
371  int attrind;
372  int first = -1;
373
374  for (i = 0; i < countof (tag_url_attributes); i++)
375    if (tag_url_attributes[i].tagid == tagid)
376      {
377        /* We've found the index of tag_url_attributes where the
378           attributes of our tag begin.  */
379        first = i;
380        break;
381      }
382  assert (first != -1);
383
384  /* Loop over the "interesting" attributes of this tag.  In this
385     example, it will loop over "src" and "lowsrc".
386
387       <img src="foo.png" lowsrc="bar.png">
388
389     This has to be done in the outer loop so that the attributes are
390     processed in the same order in which they appear in the page.
391     This is required when converting links.  */
392
393  for (attrind = 0; attrind < tag->nattrs; attrind++)
394    {
395      /* Find whether TAG/ATTRIND is a combination that contains a
396         URL. */
397      char *link = tag->attrs[attrind].value;
398      const size_t size = countof (tag_url_attributes);
399
400      /* If you're cringing at the inefficiency of the nested loops,
401         remember that they both iterate over a very small number of
402         items.  The worst-case inner loop is for the IMG tag, which
403         has three attributes.  */
404      for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
405        {
406          if (0 == strcasecmp (tag->attrs[attrind].name,
407                               tag_url_attributes[i].attr_name))
408            {
409              struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
410                                              ATTR_SIZE(tag,attrind), ctx);
411              if (up)
412                {
413                  int flags = tag_url_attributes[i].flags;
414                  if (flags & ATTR_INLINE)
415                    up->link_inline_p = 1;
416                  if (flags & ATTR_HTML)
417                    up->link_expect_html = 1;
418                }
419            }
420        }
421    }
422}
423
424/* Handle the BASE tag, for <base href=...>. */
425
426static void
427tag_handle_base (int tagid, struct taginfo *tag, struct map_context *ctx)
428{
429  struct urlpos *base_urlpos;
430  int attrind;
431  char *newbase = find_attr (tag, "href", &attrind);
432  if (!newbase)
433    return;
434
435  base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
436                            ATTR_SIZE(tag,attrind), ctx);
437  if (!base_urlpos)
438    return;
439  base_urlpos->ignore_when_downloading = 1;
440  base_urlpos->link_base_p = 1;
441
442  if (ctx->base)
443    xfree (ctx->base);
444  if (ctx->parent_base)
445    ctx->base = uri_merge (ctx->parent_base, newbase);
446  else
447    ctx->base = xstrdup (newbase);
448}
449
450/* Mark the URL found in <form action=...> for conversion. */
451
452static void
453tag_handle_form (int tagid, struct taginfo *tag, struct map_context *ctx)
454{
455  int attrind;
456  char *action = find_attr (tag, "action", &attrind);
457
458  if (action)
459    {
460      struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
461                                      ATTR_SIZE(tag,attrind), ctx);
462      if (up)
463        up->ignore_when_downloading = 1;
464    }
465}
466
467/* Handle the LINK tag.  It requires special handling because how its
468   links will be followed in -p mode depends on the REL attribute.  */
469
470static void
471tag_handle_link (int tagid, struct taginfo *tag, struct map_context *ctx)
472{
473  int attrind;
474  char *href = find_attr (tag, "href", &attrind);
475
476  /* All <link href="..."> link references are external, except those
477     known not to be, such as style sheet and shortcut icon:
478
479       <link rel="stylesheet" href="...">
480       <link rel="shortcut icon" href="...">
481  */
482  if (href)
483    {
484      struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
485                                      ATTR_SIZE(tag,attrind), ctx);
486      if (up)
487        {
488          char *rel = find_attr (tag, "rel", NULL);
489          if (rel)
490            {
491              if (0 == strcasecmp (rel, "stylesheet"))
492                {
493                  up->link_inline_p = 1;
494                  up->link_expect_css = 1;
495                }
496              else if (0 == strcasecmp (rel, "shortcut icon"))
497                {
498                  up->link_inline_p = 1;
499                }
500            }
501          else
502            /* The external ones usually point to HTML pages, such as
503               <link rel="next" href="..."> */
504            up->link_expect_html = 1;
505        }
506    }
507}
508
509/* Handle the META tag.  This requires special handling because of the
510   refresh feature and because of robot exclusion.  */
511
512static void
513tag_handle_meta (int tagid, struct taginfo *tag, struct map_context *ctx)
514{
515  char *name = find_attr (tag, "name", NULL);
516  char *http_equiv = find_attr (tag, "http-equiv", NULL);
517
518  if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))
519    {
520      /* Some pages use a META tag to specify that the page be
521         refreshed by a new page after a given number of seconds.  The
522         general format for this is:
523
524           <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
525
526         So we just need to skip past the "NUMBER; URL=" garbage to
527         get to the URL.  */
528
529      struct urlpos *entry;
530      int attrind;
531      int timeout = 0;
532      char *p;
533
534      char *refresh = find_attr (tag, "content", &attrind);
535      if (!refresh)
536        return;
537
538      for (p = refresh; c_isdigit (*p); p++)
539        timeout = 10 * timeout + *p - '0';
540      if (*p++ != ';')
541        return;
542
543      while (c_isspace (*p))
544        ++p;
545      if (!(   c_toupper (*p)       == 'U'
546            && c_toupper (*(p + 1)) == 'R'
547            && c_toupper (*(p + 2)) == 'L'
548            &&          *(p + 3)  == '='))
549        return;
550      p += 4;
551      while (c_isspace (*p))
552        ++p;
553
554      entry = append_url (p, ATTR_POS(tag,attrind,ctx),
555                          ATTR_SIZE(tag,attrind), ctx);
556      if (entry)
557        {
558          entry->link_refresh_p = 1;
559          entry->refresh_timeout = timeout;
560          entry->link_expect_html = 1;
561        }
562    }
563  else if (http_equiv && 0 == strcasecmp (http_equiv, "content-type"))
564    {
565      /* Handle stuff like:
566         <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
567
568      char *mcharset;
569      char *content = find_attr (tag, "content", NULL);
570      if (!content)
571        return;
572
573      mcharset = parse_charset (content);
574      if (!mcharset)
575        return;
576
577      xfree_null (meta_charset);
578      meta_charset = mcharset;
579    }
580  else if (name && 0 == strcasecmp (name, "robots"))
581    {
582      /* Handle stuff like:
583         <meta name="robots" content="index,nofollow"> */
584      char *content = find_attr (tag, "content", NULL);
585      if (!content)
586        return;
587      if (!strcasecmp (content, "none"))
588        ctx->nofollow = true;
589      else
590        {
591          while (*content)
592            {
593              char *end;
594              /* Skip any initial whitespace. */
595              content += strspn (content, " \f\n\r\t\v");
596              /* Find the next occurrence of ',' or whitespace,
597               * or the end of the string.  */
598              end = content + strcspn (content, ", \f\n\r\t\v");
599              if (!strncasecmp (content, "nofollow", end - content))
600                ctx->nofollow = true;
601              /* Skip past the next comma, if any. */
602              if (*end == ',')
603                ++end;
604              else
605                {
606                  end = strchr (end, ',');
607                  if (end)
608                    ++end;
609                  else
610                    end = content + strlen (content);
611                }
612              content = end;
613            }
614        }
615    }
616}
617
618/* Dispatch the tag handler appropriate for the tag we're mapping
619   over.  See known_tags[] for definition of tag handlers.  */
620
621static void
622collect_tags_mapper (struct taginfo *tag, void *arg)
623{
624  struct map_context *ctx = (struct map_context *)arg;
625
626  /* Find the tag in our table of tags.  This must not fail because
627     map_html_tags only returns tags found in interesting_tags.
628
629     I've changed this for now, I'm passing NULL as interesting_tags
630     to map_html_tags.  This way we can check all tags for a style
631     attribute.
632  */
633  struct known_tag *t = hash_table_get (interesting_tags, tag->name);
634
635  if (t != NULL)
636    t->handler (t->tagid, tag, ctx);
637
638  check_style_attr (tag, ctx);
639
640  if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) &&
641      tag->contents_begin && tag->contents_end)
642  {
643    /* parse contents */
644    get_urls_css (ctx, tag->contents_begin - ctx->text,
645                  tag->contents_end - tag->contents_begin);
646  }
647}
648
649/* Analyze HTML tags FILE and construct a list of URLs referenced from
650   it.  It merges relative links in FILE with URL.  It is aware of
651   <base href=...> and does the right thing.  */
652
653struct urlpos *
654get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
655               struct iri *iri)
656{
657  struct file_memory *fm;
658  struct map_context ctx;
659  int flags;
660
661  /* Load the file. */
662  fm = read_file (file);
663  if (!fm)
664    {
665      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
666      return NULL;
667    }
668  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
669
670  ctx.text = fm->content;
671  ctx.head = ctx.tail = NULL;
672  ctx.base = NULL;
673  ctx.parent_base = url ? url : opt.base_href;
674  ctx.document_file = file;
675  ctx.nofollow = false;
676
677  if (!interesting_tags)
678    init_interesting ();
679
680  /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
681     generate <a href=" foo"> instead of <a href="foo"> (browsers
682     ignore spaces as well.)  If you really mean space, use &32; or
683     %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
684     e.g. in <img src="foo.[newline]html">.  Such newlines are also
685     ignored by IE and Mozilla and are presumably introduced by
686     writing HTML with editors that force word wrap.  */
687  flags = MHT_TRIM_VALUES;
688  if (opt.strict_comments)
689    flags |= MHT_STRICT_COMMENTS;
690
691  /* the NULL here used to be interesting_tags */
692  map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
693                 NULL, interesting_attributes);
694
695  /* If meta charset isn't null, override content encoding */
696  if (iri && meta_charset)
697    set_content_encoding (iri, meta_charset);
698
699  DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
700  if (meta_disallow_follow)
701    *meta_disallow_follow = ctx.nofollow;
702
703  xfree_null (ctx.base);
704  read_file_free (fm);
705  return ctx.head;
706}
707
708/* This doesn't really have anything to do with HTML, but it's similar
709   to get_urls_html, so we put it here.  */
710
711struct urlpos *
712get_urls_file (const char *file)
713{
714  struct file_memory *fm;
715  struct urlpos *head, *tail;
716  const char *text, *text_end;
717
718  /* Load the file.  */
719  fm = read_file (file);
720  if (!fm)
721    {
722      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
723      return NULL;
724    }
725  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
726
727  head = tail = NULL;
728  text = fm->content;
729  text_end = fm->content + fm->length;
730  while (text < text_end)
731    {
732      int up_error_code;
733      char *url_text;
734      struct urlpos *entry;
735      struct url *url;
736
737      const char *line_beg = text;
738      const char *line_end = memchr (text, '\n', text_end - text);
739      if (!line_end)
740        line_end = text_end;
741      else
742        ++line_end;
743      text = line_end;
744
745      /* Strip whitespace from the beginning and end of line. */
746      while (line_beg < line_end && c_isspace (*line_beg))
747        ++line_beg;
748      while (line_end > line_beg && c_isspace (*(line_end - 1)))
749        --line_end;
750
751      if (line_beg == line_end)
752        continue;
753
754      /* The URL is in the [line_beg, line_end) region. */
755
756      /* We must copy the URL to a zero-terminated string, and we
757         can't use alloca because we're in a loop.  *sigh*.  */
758      url_text = strdupdelim (line_beg, line_end);
759
760      if (opt.base_href)
761        {
762          /* Merge opt.base_href with URL. */
763          char *merged = uri_merge (opt.base_href, url_text);
764          xfree (url_text);
765          url_text = merged;
766        }
767
768      url = url_parse (url_text, &up_error_code, NULL, false);
769      if (!url)
770        {
771          char *error = url_error (url_text, up_error_code);
772          logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
773                     file, url_text, error);
774          xfree (url_text);
775          xfree (error);
776          continue;
777        }
778      xfree (url_text);
779
780      entry = xnew0 (struct urlpos);
781      entry->url = url;
782
783      if (!head)
784        head = entry;
785      else
786        tail->next = entry;
787      tail = entry;
788    }
789  read_file_free (fm);
790  return head;
791}
792
793void
794cleanup_html_url (void)
795{
796  /* Destroy the hash tables.  The hash table keys and values are not
797     allocated by this code, so we don't need to free them here.  */
798  if (interesting_tags)
799    hash_table_destroy (interesting_tags);
800  if (interesting_attributes)
801    hash_table_destroy (interesting_attributes);
802}
803