/* Handling of recursive HTTP retrieving.
   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */

#include "wget.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#include <errno.h>
#include <assert.h>

#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "ftp.h"
#include "host.h"
#include "hash.h"
#include "res.h"
#include "convert.h"
#include "html-url.h"
#include "css-url.h"
#include "spider.h"

/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;              /* the URL to download */
  const char *referer;          /* the referring document */
  int depth;                    /* the depth */
  bool html_allowed;            /* whether the document is allowed to
                                   be treated as HTML. */
  struct iri *iri;              /* internationalized URI (IRI)
                                   information for this URL */
  bool css_allowed;             /* whether the document is allowed to
                                   be treated as CSS. */
  struct queue_element *next;   /* next element in queue */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count;                    /* current number of queued elements */
  int maxcount;                 /* largest count seen so far (debugging) */
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xnew0 (struct url_queue);
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue, struct iri *i,
             const char *url, const char *referer, int depth,
             bool html_allowed, bool css_allowed)
{
  struct queue_element *qel = xnew (struct queue_element);
  qel->iri = i;
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->html_allowed = html_allowed;
  qel->css_allowed = css_allowed;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n",
           quotearg_n_style (0, escape_quoting_style, url), depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (i)
    DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url),
             i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue.  Return true if this operation
   succeeded, or false if the queue is empty.  */

static bool
url_dequeue (struct url_queue *queue, struct iri **i,
             const char **url, const char **referer, int *depth,
             bool *html_allowed, bool *css_allowed)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return false;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *i = qel->iri;
  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;
  *html_allowed = qel->html_allowed;
  *css_allowed = qel->css_allowed;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n",
           quotearg_n_style (0, escape_quoting_style, qel->url), qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return true;
}
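
/* Illustrative sketch (a comment, not compiled into Wget): a minimal
   round trip through the queue API above, mirroring how retrieve_tree
   uses it.  It shows the ownership convention: the URL and referer
   strings handed to url_enqueue are heap-allocated, and whoever
   dequeues them is responsible for freeing them.  The URL below and
   the NULL iri are hypothetical placeholders.

     struct url_queue *q = url_queue_new ();
     struct iri *i;
     char *url, *referer;
     int depth;
     bool html_allowed, css_allowed;

     url_enqueue (q, NULL, xstrdup ("http://example.com/"), NULL, 0,
                  true, false);

     while (url_dequeue (q, &i, (const char **) &url,
                         (const char **) &referer, &depth,
                         &html_allowed, &css_allowed))
       {
         ... act on URL here ...
         xfree (url);
         xfree_null (referer);
       }

     url_queue_delete (q);  */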

static bool download_child_p (const struct urlpos *, struct url *, int,
                              struct url *, struct hash_table *, struct iri *);
static bool descend_redirect_p (const char *, struct url *, int,
                                struct url *, struct hash_table *, struct iri *);


/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree, on
   the other hand, implements breadth-first traversal of the tree,
   which results in a much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML or CSS and its depth does not exceed
        maximum depth, get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */
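
/* Illustrative caller sketch (a comment, not compiled into Wget; the
   real call sites live elsewhere and may differ): the caller parses
   the start URL itself and may pass NULL for PI, in which case
   retrieve_tree falls back to the encoding implied by the locale.
   The URL string below is a hypothetical placeholder.

     int url_err;
     struct url *start = url_parse ("http://example.com/", &url_err,
                                    NULL, true);
     if (start)
       {
         uerr_t status = retrieve_tree (start, NULL);
         url_free (start);
         ... inspect STATUS ...
       }  */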

uerr_t
retrieve_tree (struct url *start_url_parsed, struct iri *pi)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue or have already been downloaded.  */
  struct hash_table *blacklist;

  struct iri *i = iri_new ();

#define COPYSTR(x)  ((x) ? xstrdup (x) : NULL)
  /* Duplicate the PI struct if it is not NULL; otherwise fall back to
     the encoding implied by the locale.  */
  if (pi)
    {
      i->uri_encoding = COPYSTR (pi->uri_encoding);
      i->content_encoding = COPYSTR (pi->content_encoding);
      i->utf8_encode = pi->utf8_encode;
    }
  else
    set_uri_encoding (i, opt.locale, true);
#undef COPYSTR

  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     the original URL string so we enqueue the canonical form of the
     URL.  */
  url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
               false);
  string_set_add (blacklist, start_url_parsed->url);

  while (1)
    {
      bool descend = false;
      char *url, *referer, *file = NULL;
      int depth;
      bool html_allowed, css_allowed;
      bool is_css = false;
      bool dash_p_leaf_HTML = false;

      if (opt.quota && total_downloaded_bytes > opt.quota)
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue... */

      if (!url_dequeue (queue, (struct iri **) &i,
                        (const char **)&url, (const char **)&referer,
                        &depth, &html_allowed, &css_allowed))
        break;

      /* ...and download it.  Note that this download is in most cases
         unconditional, as download_child_p already makes sure a file
         doesn't get enqueued twice -- and yet this check is here, and
         not in download_child_p.  This is so that if you run `wget -r
         URL1 URL2', and a random URL is encountered once under URL1
         and again under URL2, but at a different (possibly smaller)
         depth, the URL's children are taken into account the second
         time.  */
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
        {
          file = xstrdup (hash_table_get (dl_url_file_map, url));

          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                   url, file));

          /* TODO: these two checks should be combined.  */
          if (html_allowed
              && downloaded_html_set
              && string_set_contains (downloaded_html_set, file))
            {
              descend = true;
              is_css = false;
            }
          if (css_allowed
              && downloaded_css_set
              && string_set_contains (downloaded_css_set, file))
            {
              descend = true;
              is_css = true;
            }
        }
      else
        {
          int dt = 0, url_err;
          char *redirected = NULL;
          struct url *url_parsed = url_parse (url, &url_err, i, true);

          status = retrieve_url (url_parsed, url, &file, &redirected, referer,
                                 &dt, false, i, true);

          if (html_allowed && file && status == RETROK
              && (dt & RETROKF) && (dt & TEXTHTML))
            {
              descend = true;
              is_css = false;
            }

          /* Slightly different from the HTML case: css_allowed can
             override the content type, because many web servers serve
             CSS with an incorrect content type.  */
          if (file && status == RETROK
              && (dt & RETROKF) &&
              ((dt & TEXTCSS) || css_allowed))
            {
              descend = true;
              is_css = true;
            }

          if (redirected)
            {
              /* We have been redirected, possibly to another host, or
                 different path, or wherever.  Check whether we really
                 want to follow it.  */
              if (descend)
                {
                  if (!descend_redirect_p (redirected, url_parsed, depth,
                                           start_url_parsed, blacklist, i))
                    descend = false;
                  else
                    /* Make sure that the old pre-redirect form gets
                       blacklisted. */
                    string_set_add (blacklist, url);
                }

              xfree (url);
              url = redirected;
            }
          else
            {
              xfree (url);
              url = xstrdup (url_parsed->url);
            }
          url_free (url_parsed);
        }

      if (opt.spider)
        {
          visited_url (url, referer);
        }

      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites
              && (depth == opt.reclevel || depth == opt.reclevel + 1))
            {
              /* When -p is specified, we are allowed to exceed the
                 maximum depth, but only for the "inline" links,
                 i.e. those that are needed to display the page.
                 Originally this could exceed the depth at most by
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly.  */
              dash_p_leaf_HTML = true;
            }
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already spent the two extra (pseudo-)levels that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = false;
            }
        }
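
      /* A worked example of the above: with `-r -l 2 -p', an HTML page
         at depth 2 is still parsed, but only its inline requisites are
         followed, entering the queue at depth 3; a frame page fetched
         that way at depth 3 is in turn parsed for its own requisites
         at depth 4; documents at depth 4 are downloaded but never
         descended into.  */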

      /* If the downloaded document was HTML or CSS, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          bool meta_disallow_follow = false;
          struct urlpos *children
            = is_css ? get_urls_css_file (file, url) :
                       get_urls_html (file, url, &meta_disallow_follow, i);

          if (opt.use_robots && meta_disallow_follow)
            {
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL, i, true);
              struct iri *ci;
              char *referer_url = url;
              bool strip_auth = (url_parsed != NULL
                                 && url_parsed->user != NULL);
              assert (url_parsed != NULL);

              /* Strip auth info if present */
              if (strip_auth)
                referer_url = url_string (url_parsed, URL_AUTH_HIDE);

              for (; child; child = child->next)
                {
                  if (child->ignore_when_downloading)
                    continue;
                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    continue;
                  if (download_child_p (child, url_parsed, depth, start_url_parsed,
                                        blacklist, i))
                    {
                      ci = iri_new ();
                      set_uri_encoding (ci, i->content_encoding, false);
                      url_enqueue (queue, ci, xstrdup (child->url->url),
                                   xstrdup (referer_url), depth + 1,
                                   child->link_expect_html,
                                   child->link_expect_css);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      string_set_add (blacklist, child->url->url);
                    }
                }

              if (strip_auth)
                xfree (referer_url);
              url_free (url_parsed);
              free_urlpos (children);
            }
        }

      if (file
          && (opt.delete_after
              || opt.spider /* opt.recursive is implicitly true */
              || !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             (otherwise unneeded because of --spider or rejected by -R)
             file just to harvest its hyperlinks -- in either case,
             delete the local file. */
          DEBUGP (("Removing file due to %s in retrieve_tree():\n",
                   opt.delete_after ? "--delete-after" :
                   (opt.spider ? "--spider" :
                    "recursive rejection criteria")));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after || opt.spider
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          logputs (LOG_VERBOSE, "\n");
          register_delete_file (file);
        }

      xfree (url);
      xfree_null (referer);
      xfree_null (file);
      iri_free (i);
    }

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    bool d4, d5;
    struct iri *d6;
    while (url_dequeue (queue, (struct iri **)&d6,
                        (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
      {
        iri_free (d6);
        xfree (d1);
        xfree_null (d2);
      }
  }
  url_queue_delete (queue);

  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}

/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

static bool
download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist,
                  struct iri *iri)
{
  struct url *u = upos->url;
  const char *url = u->url;
  bool u_scheme_like_http;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
    {
      if (opt.spider)
        {
          char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
          DEBUGP (("download_child_p: parent->url is: %s\n", quote (parent->url)));
          visited_url (url, referrer);
          xfree (referrer);
        }
      DEBUGP (("Already on the black list.\n"));
      goto out;
    }

  /* Several things to check for:
     1. whether the scheme is HTTP-like (non-HTTP schemes are normally
        not followed, except FTP with --follow-ftp)
     2. whether the link is relative (if --relative is in effect)
     3. whether the domain is accepted
     4. whether --no-parent would be violated
     5. whether the directory passes the include/exclude lists
     6. whether the suffix passes the accept/reject rules
     7. whether the host is the same as the parent's (unless host
        spanning is enabled), with possible gethostbyname baggage
     8. whether robots.txt allows the URL

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory-consuming tests should be put later on
     the list.  */

  /* Determine whether the URL under consideration has an HTTP-like
     scheme. */
  u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      goto out;
    }

  /* 2. If it is an absolute link and absolute links are not being
     followed, throw it out.  */
  if (u_scheme_like_http)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode.  */
  if (opt.no_parent
      && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!subdir_p (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          goto out;
        }
    }
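
  /* For example (the URLs are hypothetical): with --no-parent and a
     start URL of http://example.com/a/b/, a link to
     http://example.com/a/b/c/d.html passes the subdir_p test above,
     while http://example.com/a/x.html does not and is skipped --
     unless it is an inline page requisite and -p is in effect, in
     which case the whole check is bypassed.  */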

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists.  */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          goto out;
        }
    }

  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for directories (no file name to match) and for non-leaf HTMLs,
     which can lead to other files that do need to be downloaded.  (-p
     automatically implies non-leaf because with -p we can, if
     necessary, overstep the maximum depth to get the page requisites.)  */
  if (u->file[0] != '\0'
      && !(has_html_suffix_p (u->file)
           /* The exception only applies to non-leaf HTMLs (but -p
              always implies non-leaf because we can overstep the
              maximum depth to get the requisites): */
           && (/* non-leaf */
               opt.reclevel == INFINITE_RECURSION
               /* also non-leaf */
               || depth < opt.reclevel - 1
               /* -p, which implies non-leaf (see above) */
               || opt.page_requisites)))
    {
      if (!acceptable (u->file))
        {
          DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                   url, u->file));
          goto out;
        }
    }

  /* 7. Check whether the host is the same as the parent's (unless
     spanning hosts is enabled). */
  if (schemes_are_similar_p (u->scheme, parent->scheme))
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        goto out;
      }

  /* 8. Check whether robots.txt allows the URL. */
  if (opt.use_robots && u_scheme_like_http)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile, iri))
            {
              specs = res_parse_from_file (rfile);

              /* Delete the robots.txt file if we are deleting files
                 after downloading them, or if we are just running as a
                 spider. */
              if (opt.delete_after || opt.spider)
                {
                  logprintf (LOG_VERBOSE, "Removing %s.\n", rfile);
                  if (unlink (rfile))
                    logprintf (LOG_NOTQUIET, "unlink: %s\n",
                               strerror (errno));
                }

              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them.  */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say.  */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
          goto out;
        }
    }

  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  return true;

 out:
  DEBUGP (("Decided NOT to load it.\n"));

  return false;
}

/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p.  */

static bool
descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist,
                    struct iri *iri)
{
  struct url *new_parsed;
  struct urlpos *upos;
  bool success;

  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL, NULL, false);
  assert (new_parsed != NULL);

  upos = xnew0 (struct urlpos);
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
                              start_url_parsed, blacklist, iri);

  url_free (new_parsed);
  xfree (upos);

  if (!success)
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  return success;
}

/* vim:set sts=2 sw=2 cino+={s: */