/* Support for Robot Exclusion Standard (RES).
   Copyright (C) 2001, 2006, 2007, 2008, 2009 Free Software Foundation,
   Inc.

This file is part of Wget.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */

/* This file implements the Robot Exclusion Standard (RES).

   RES is a simple protocol that enables site admins to signal to web
   crawlers that certain parts of the site should not be accessed.
   All the admin needs to do is create a "robots.txt" file in the web
   server root, and use simple commands to allow or disallow access
   to certain parts of the site.

   The first specification was written by Martijn Koster in 1994, and
   is still available at <http://www.robotstxt.org/wc/norobots.html>.
   In 1996, Martijn wrote an Internet Draft specifying an improved
   version of RES; however, that work was apparently abandoned: the
   draft expired in 1997 and has not been replaced.  The draft is
   available at <http://www.robotstxt.org/wc/norobots-rfc.html>.

   This file implements RES as specified by the draft.  Note that this
   only handles the "robots.txt" support.  The META tag that controls
   whether the links should be followed is handled in `html-url.c'.

   Known deviations:

   * The end-of-line comment recognition is more in the spirit of the
     Bourne Shell (as specified by RES-1994).  That means that
     "foo#bar" is taken literally, whereas "foo #bar" is interpreted
     as "foo".  The Draft apparently specifies that both should be
     interpreted as "foo".

   * We don't recognize a sole CR as the line ending.

   * We don't implement an expiry mechanism for /robots.txt specs.  I
     consider it unnecessary for a relatively short-lived application
     such as Wget.  Besides, it is highly questionable whether anyone
     deploys the recommended expiry scheme for robots.txt.

   Entry points are functions res_parse, res_parse_from_file,
   res_match_path, res_register_specs, res_get_specs, and
   res_retrieve_file.  */
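
/* A rough usage sketch, for illustration only.  The variable names,
   the example host and the example paths below are hypothetical; the
   actual call sites live elsewhere in Wget:

       char *rfile = NULL;
       if (res_retrieve_file (url, &rfile, iri))
         {
           struct robot_specs *specs = res_parse_from_file (rfile);
           res_register_specs ("example.com", 80, specs);
         }
       ...
       if (!res_match_path (res_get_specs ("example.com", 80),
                            "cgi-bin/search"))
         ;   // disallowed by robots.txt; skip this URL

   Specs handed to res_register_specs are owned by the registry and
   are eventually released by res_cleanup.  */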

#include "wget.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <assert.h>

#include "utils.h"
#include "hash.h"
#include "url.h"
#include "retr.h"
#include "res.h"

#ifdef TESTING
#include "test.h"
#endif

struct path_info {
  char *path;
  bool allowedp;
  bool user_agent_exact_p;
};

struct robot_specs {
  int count;
  int size;
  struct path_info *paths;
};

/* Parsing the robot spec. */

/* Check whether AGENT (a string of length LENGTH) equals "wget" or
   "*".  If it is either of them, *MATCHES is set to true.  If it is
   "wget", *EXACT_MATCH is also set to true.  */

static void
match_user_agent (const char *agent, int length,
                  bool *matches, bool *exact_match)
{
  if (length == 1 && *agent == '*')
    {
      *matches = true;
      *exact_match = false;
    }
  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
    {
      *matches = true;
      *exact_match = true;
    }
  else
    {
      *matches = false;
      *exact_match = false;
    }
}
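
/* For illustration, given bools m and e (hypothetical caller
   variables):

     match_user_agent ("*", 1, &m, &e)          m = true,  e = false
     match_user_agent ("Wget", 4, &m, &e)       m = true,  e = true
     match_user_agent ("googlebot", 9, &m, &e)  m = false, e = false

   The comparison is case-insensitive, so "WGET" behaves the same as
   "wget".  */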

/* Add a path specification between PATH_B and PATH_E as one of the
   paths in SPECS.  */

static void
add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
          bool allowedp, bool exactp)
{
  struct path_info pp;
  if (path_b < path_e && *path_b == '/')
    /* Our path representation doesn't use a leading slash, so remove
       one from theirs. */
    ++path_b;
  pp.path     = strdupdelim (path_b, path_e);
  pp.allowedp = allowedp;
  pp.user_agent_exact_p = exactp;
  ++specs->count;
  if (specs->count > specs->size)
    {
      if (specs->size == 0)
        specs->size = 1;
      else
        specs->size <<= 1;
      specs->paths = xrealloc (specs->paths,
                               specs->size * sizeof (struct path_info));
    }
  specs->paths[specs->count - 1] = pp;
}
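
/* For example, successive add_path calls grow specs->size as
   0 -> 1 -> 2 -> 4 -> 8 ..., doubling whenever count would exceed
   size, so even a long robots.txt needs only O(log n) calls to
   xrealloc.  */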

/* Recreate SPECS->paths with only those paths that have
   user_agent_exact_p set to true.  */

static void
prune_non_exact (struct robot_specs *specs)
{
  struct path_info *newpaths;
  int i, j, cnt;
  cnt = 0;
  for (i = 0; i < specs->count; i++)
    if (specs->paths[i].user_agent_exact_p)
      ++cnt;
  newpaths = xnew_array (struct path_info, cnt);
  for (i = 0, j = 0; i < specs->count; i++)
    if (specs->paths[i].user_agent_exact_p)
      newpaths[j++] = specs->paths[i];
  assert (j == cnt);
  xfree (specs->paths);
  specs->paths = newpaths;
  specs->count = cnt;
  specs->size  = cnt;
}

#define EOL(p) ((p) >= lineend)

#define SKIP_SPACE(p) do {              \
  while (!EOL (p) && c_isspace (*p))    \
    ++p;                                \
} while (0)

#define FIELD_IS(string_literal)        \
  BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)

/* Parse textual RES specs beginning at SOURCE of length LENGTH.
   Return a specs object ready to be fed to res_match_path.

   The parsing itself is trivial, but creating a correct SPECS object
   is trickier than it seems, because RES is surprisingly byzantine if
   you attempt to implement it correctly.

   A "record" is a block of one or more `User-Agent' lines followed by
   one or more `Allow' or `Disallow' lines.  A record is accepted by
   Wget if one of the `User-Agent' lines was "wget", or if the user
   agent line was "*".

   After all the lines have been read, we examine whether an exact
   ("wget") user-agent field was specified.  If so, we delete all the
   lines read under "User-Agent: *" blocks because we have our own
   Wget-specific blocks.  This enables the admin to say:

       User-Agent: *
       Disallow: /

       User-Agent: google
       User-Agent: wget
       Disallow: /cgi-bin

   This means that to Wget and to Google, /cgi-bin is disallowed,
   whereas for all other crawlers, everything is disallowed.
   res_parse is implemented so that the order of records doesn't
   matter.  In the case above, the "User-Agent: *" record could have
   come after the other one.  */

struct robot_specs *
res_parse (const char *source, int length)
{
  int line_count = 1;

  const char *p   = source;
  const char *end = source + length;

  /* true if last applicable user-agent field matches Wget. */
  bool user_agent_applies = false;

  /* true if last applicable user-agent field *exactly* matches
     Wget.  */
  bool user_agent_exact = false;

  /* whether we ever encountered exact user agent. */
  bool found_exact = false;

  /* count of allow/disallow lines in the current "record", i.e. after
     the last `user-agent' instructions.  */
  int record_count = 0;

  struct robot_specs *specs = xnew0 (struct robot_specs);

  while (1)
    {
      const char *lineend, *lineend_real;
      const char *field_b, *field_e;
      const char *value_b, *value_e;

      if (p == end)
        break;
      lineend_real = memchr (p, '\n', end - p);
      if (lineend_real)
        ++lineend_real;
      else
        lineend_real = end;
      lineend = lineend_real;

      /* Before doing anything else, check whether the line is empty
         or comment-only. */
      SKIP_SPACE (p);
      if (EOL (p) || *p == '#')
        goto next;

      /* Make sure the end-of-line comments are respected by setting
         lineend to a location preceding the first comment.  Real line
         ending remains in lineend_real.  */
      for (lineend = p; lineend < lineend_real; lineend++)
        if ((lineend == p || c_isspace (*(lineend - 1)))
            && *lineend == '#')
          break;

      /* Ignore trailing whitespace in the same way. */
      while (lineend > p && c_isspace (*(lineend - 1)))
        --lineend;

      assert (!EOL (p));

      field_b = p;
      while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
        ++p;
      field_e = p;

      SKIP_SPACE (p);
      if (field_b == field_e || EOL (p) || *p != ':')
        {
          DEBUGP (("Ignoring malformed line %d", line_count));
          goto next;
        }
      ++p;                      /* skip ':' */
      SKIP_SPACE (p);

      value_b = p;
      while (!EOL (p))
        ++p;
      value_e = p;

      /* Finally, we have a syntactically valid line. */
      if (FIELD_IS ("user-agent"))
        {
          /* We have to support several cases:

             --previous records--

             User-Agent: foo
             User-Agent: Wget
             User-Agent: bar
             ... matching record ...

             User-Agent: baz
             User-Agent: qux
             ... non-matching record ...

             User-Agent: *
             ... matching record, but will be pruned later ...

             We have to respect `User-Agent' at the beginning of each
             new record simply because we don't know if we're going to
             encounter "Wget" among the agents or not.  Hence,
             match_user_agent is called when record_count != 0.

             But if record_count is 0, we have to keep calling it
             until it matches, and if that happens, we must not call
             it any more, until the next record.  Hence the other part
             of the condition.  */
          if (record_count != 0 || user_agent_applies == false)
            match_user_agent (value_b, value_e - value_b,
                              &user_agent_applies, &user_agent_exact);
          if (user_agent_exact)
            found_exact = true;
          record_count = 0;
        }
      else if (FIELD_IS ("allow"))
        {
          if (user_agent_applies)
            {
              add_path (specs, value_b, value_e, true, user_agent_exact);
            }
          ++record_count;
        }
      else if (FIELD_IS ("disallow"))
        {
          if (user_agent_applies)
            {
              bool allowed = false;
              if (value_b == value_e)
                /* Empty "disallow" line means everything is *allowed*!  */
                allowed = true;
              add_path (specs, value_b, value_e, allowed, user_agent_exact);
            }
          ++record_count;
        }
      else
        {
          DEBUGP (("Ignoring unknown field at line %d", line_count));
          goto next;
        }

    next:
      p = lineend_real;
      ++line_count;
    }

  if (found_exact)
    {
      /* We've encountered an exactly matching user-agent.  Throw out
         all the stuff with user-agent: *.  */
      prune_non_exact (specs);
    }
  else if (specs->size > specs->count)
    {
      /* add_path normally over-allocates specs->paths.  Reallocate it
         to the correct size in order to conserve some memory.  */
      specs->paths = xrealloc (specs->paths,
                               specs->count * sizeof (struct path_info));
      specs->size = specs->count;
    }

  return specs;
}
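
/* A worked example, for illustration only (the input is hypothetical).
   Feeding res_parse this text:

       User-Agent: *
       Disallow: /

       User-Agent: wget
       Disallow: /private/   # trailing comment, stripped

   yields a specs object whose "User-Agent: *" paths have been pruned
   (an exact "wget" agent was seen), leaving the single disallowed
   path "private/".  Consequently res_match_path (specs,
   "private/data.html") returns false, while res_match_path (specs,
   "index.html") returns true, because no rule matches and retrieval
   is allowed by default.  */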

/* The same as res_parse, but first map FILENAME into memory and then
   parse it.  */

struct robot_specs *
res_parse_from_file (const char *filename)
{
  struct robot_specs *specs;
  struct file_memory *fm = read_file (filename);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
                 filename, strerror (errno));
      return NULL;
    }
  specs = res_parse (fm->content, fm->length);
  read_file_free (fm);
  return specs;
}

static void
free_specs (struct robot_specs *specs)
{
  int i;
  for (i = 0; i < specs->count; i++)
    xfree (specs->paths[i].path);
  xfree_null (specs->paths);
  xfree (specs);
}

/* Matching of a path according to the specs. */

/* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
   that number does not encode '/', decode C and advance the pointer
   by two characters.  */

#define DECODE_MAYBE(c, ptr) do {                               \
  if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2]))   \
    {                                                           \
      char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]);          \
      if (decoded != '/')                                       \
        {                                                       \
          c = decoded;                                          \
          ptr += 2;                                             \
        }                                                       \
    }                                                           \
} while (0)

/* The inner matching engine: return true if RECORD_PATH matches
   URL_PATH.  The rules for matching are described at
   <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2.  */

static bool
matches (const char *record_path, const char *url_path)
{
  const char *rp = record_path;
  const char *up = url_path;

  for (; ; ++rp, ++up)
    {
      char rc = *rp;
      char uc = *up;
      if (!rc)
        return true;
      if (!uc)
        return false;
      DECODE_MAYBE(rc, rp);
      DECODE_MAYBE(uc, up);
      if (rc != uc)
        return false;
    }
}
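
/* For example (illustrative paths only):

     matches ("tmp",    "tmp")         -> true   (exact match)
     matches ("tmp",    "tmp/a.html")  -> true   (prefix match)
     matches ("tmp/",   "tmp")         -> false  (record path is longer)
     matches ("%7Ejoe", "~joe/index")  -> true   (%7E decodes to '~')
     matches ("a%2Fb",  "a/b")         -> false  (%2F never matches '/')

   Matching is a plain octet-by-octet prefix comparison, so "tmp"
   also matches "tmporary".  */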

/* Iterate through all paths in SPECS.  For the first one that
   matches, return its allow/reject status.  If none matches,
   retrieval is by default allowed.  */

bool
res_match_path (const struct robot_specs *specs, const char *path)
{
  int i;
  if (!specs)
    return true;
  for (i = 0; i < specs->count; i++)
    if (matches (specs->paths[i].path, path))
      {
        bool allowedp = specs->paths[i].allowedp;
        DEBUGP (("%s path %s because of rule %s.\n",
                 allowedp ? "Allowing" : "Rejecting",
                 path, quote (specs->paths[i].path)));
        return allowedp;
      }
  return true;
}

/* Registering the specs. */

static struct hash_table *registered_specs;

/* Stolen from cookies.c. */
#define SET_HOSTPORT(host, port, result) do {           \
  int HP_len = strlen (host);                           \
  result = alloca (HP_len + 1 + numdigit (port) + 1);   \
  memcpy (result, host, HP_len);                        \
  result[HP_len] = ':';                                 \
  number_to_string (result + HP_len + 1, port);         \
} while (0)
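
/* For example, SET_HOSTPORT ("www.example.com", 80, hp) makes HP
   point to the stack-allocated string "www.example.com:80", which is
   the form used as the key into the registered_specs hash table
   below.  (The host name is illustrative.)  */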

/* Register the RES specs that belong to the server on HOST:PORT.
   They will later be retrievable using res_get_specs.  */

void
res_register_specs (const char *host, int port, struct robot_specs *specs)
{
  struct robot_specs *old;
  char *hp, *hp_old;
  SET_HOSTPORT (host, port, hp);

  if (!registered_specs)
    registered_specs = make_nocase_string_hash_table (0);

  if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
    {
      if (old)
        free_specs (old);
      hash_table_put (registered_specs, hp_old, specs);
    }
  else
    {
      hash_table_put (registered_specs, xstrdup (hp), specs);
    }
}

/* Get the specs that belong to HOST:PORT. */

struct robot_specs *
res_get_specs (const char *host, int port)
{
  char *hp;
  SET_HOSTPORT (host, port, hp);
  if (!registered_specs)
    return NULL;
  return hash_table_get (registered_specs, hp);
}

/* Loading the robots file.  */

#define RES_SPECS_LOCATION "/robots.txt"

/* Retrieve the robots.txt from the root of the server that serves
   URL.  The file will be named according to the currently active
   rules, and the file name will be returned in *FILE.

   Return true if robots.txt was retrieved OK, false otherwise.  */

bool
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
  struct iri *i = iri_new ();
  uerr_t err;
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  int saved_ts_val = opt.timestamping;
  int saved_sp_val = opt.spider, url_err;
  struct url *url_parsed;

  /* Copy the server URI encoding for a possible IDNA transformation; there
     is no need to encode the full URI in UTF-8 because "robots.txt" is
     plain ASCII.  */
  set_uri_encoding (i, iri->uri_encoding, false);
  i->utf8_encode = false;

  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  *file = NULL;
  opt.timestamping = false;
  opt.spider       = false;

  url_parsed = url_parse (robots_url, &url_err, iri, true);
  if (!url_parsed)
    {
      char *error = url_error (robots_url, url_err);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
      xfree (error);
      err = URLERROR;
    }
  else
    {
      err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
                          false, i, false);
      url_free (url_parsed);
    }

  opt.timestamping = saved_ts_val;
  opt.spider       = saved_sp_val;
  xfree (robots_url);
  iri_free (i);

  if (err != RETROK && *file != NULL)
    {
      /* If the file is not retrieved correctly, but retrieve_url
         allocated the file name, deallocate it here so that the
         caller doesn't have to worry about it.  */
      xfree (*file);
      *file = NULL;
    }
  return err == RETROK;
}

/* Return true if URL names the robots.txt location on its server,
   i.e. if it is what res_retrieve_file would fetch for that
   server.  */

bool
is_robots_txt_url (const char *url)
{
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  bool ret = are_urls_equal (url, robots_url);

  xfree (robots_url);

  return ret;
}

/* Release all registered robot specs and destroy the table that
   holds them.  */

void
res_cleanup (void)
{
  if (registered_specs)
    {
      hash_table_iterator iter;
      for (hash_table_iterate (registered_specs, &iter);
           hash_table_iter_next (&iter);
           )
        {
          xfree (iter.key);
          free_specs (iter.value);
        }
      hash_table_destroy (registered_specs);
      registered_specs = NULL;
    }
}

#ifdef TESTING

const char *
test_is_robots_txt_url()
{
  int i;
  struct {
    char *url;
    bool expected_result;
  } test_array[] = {
    { "http://www.yoyodyne.com/robots.txt", true },
    { "http://www.yoyodyne.com/somepath/", false },
    { "http://www.yoyodyne.com/somepath/robots.txt", false },
  };

  for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
    {
      mu_assert ("test_is_robots_txt_url: wrong result",
                 is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);
    }

  return NULL;
}

#endif /* TESTING */

/*
 * vim: et ts=2 sw=2
 */