1/* Support for Robot Exclusion Standard (RES). 2 Copyright (C) 2001, 2006, 2007, 2008, 2009, 2010, 2011 Free Software 3 Foundation, Inc. 4 5This file is part of Wget. 6 7This program is free software; you can redistribute it and/or modify 8it under the terms of the GNU General Public License as published by 9the Free Software Foundation; either version 3 of the License, or (at 10your option) any later version. 11 12This program is distributed in the hope that it will be useful, but 13WITHOUT ANY WARRANTY; without even the implied warranty of 14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15General Public License for more details. 16 17You should have received a copy of the GNU General Public License 18along with Wget. If not, see <http://www.gnu.org/licenses/>. 19 20Additional permission under GNU GPL version 3 section 7 21 22If you modify this program, or any covered work, by linking or 23combining it with the OpenSSL project's OpenSSL library (or a 24modified version of that library), containing parts covered by the 25terms of the OpenSSL or SSLeay licenses, the Free Software Foundation 26grants you additional permission to convey the resulting work. 27Corresponding Source for a non-source form of such a combination 28shall include the source code for the parts of OpenSSL used as well 29as that of the covered work. */ 30 31/* This file implements the Robot Exclusion Standard (RES). 32 33 RES is a simple protocol that enables site admins to signalize to 34 the web crawlers that certain parts of the site should not be 35 accessed. All the admin needs to do is create a "robots.txt" file 36 in the web server root, and use simple commands to allow or 37 disallow access to certain parts of the site. 38 39 The first specification was written by Martijn Koster in 1994, and 40 is still available at <http://www.robotstxt.org/wc/norobots.html>. 
   In 1996, Martijn wrote an Internet Draft specifying an improved RES
   specification; however, that work was apparently abandoned since
   the draft has expired in 1997 and hasn't been replaced since.  The
   draft is available at
   <http://www.robotstxt.org/wc/norobots-rfc.html>.

   This file implements RES as specified by the draft.  Note that this
   only handles the "robots.txt" support.  The META tag that controls
   whether the links should be followed is handled in `html-url.c'.

   Known deviations:

   * The end-of-line comment recognition is more in the spirit of the
     Bourne Shell (as specified by RES-1994).  That means that
     "foo#bar" is taken literally, whereas "foo #bar" is interpreted
     as "foo".  The Draft apparently specifies that both should be
     interpreted as "foo".

   * We don't recognize sole CR as the line ending.

   * We don't implement expiry mechanism for /robots.txt specs.  I
     consider it non-necessary for a relatively short-lived
     application such as Wget.  Besides, it is highly questionable
     whether anyone deploys the recommended expiry scheme for
     robots.txt.

   Entry points are functions res_parse, res_parse_from_file,
   res_match_path, res_register_specs, res_get_specs, and
   res_retrieve_file.  */

#include "wget.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <assert.h>

#include "utils.h"
#include "hash.h"
#include "url.h"
#include "retr.h"
#include "res.h"

#ifdef TESTING
#include "test.h"
#endif

/* One allow/disallow rule parsed out of robots.txt.  */
struct path_info {
  char *path;                   /* rule path, stored without the leading
                                   slash (see add_path) */
  bool allowedp;                /* true for "Allow", false for "Disallow" */
  bool user_agent_exact_p;      /* true if the rule came from a record whose
                                   User-Agent matched "wget" exactly, as
                                   opposed to the "*" wildcard */
};

/* A parsed robots.txt: a growable array of path rules.  */
struct robot_specs {
  int count;                    /* number of entries in PATHS */
  int size;                     /* allocated capacity of PATHS */
  struct path_info *paths;      /* the rules, in file order */
};

/* Parsing the robot spec. */

/* Check whether AGENT (a string of length LENGTH) equals "wget" or
   "*".  If it is either of them, *matches is set to one.  If it is
   "wget", *exact_match is set to one.
*/ 106 107static void 108match_user_agent (const char *agent, int length, 109 bool *matches, bool *exact_match) 110{ 111 if (length == 1 && *agent == '*') 112 { 113 *matches = true; 114 *exact_match = false; 115 } 116 else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget")) 117 { 118 *matches = true; 119 *exact_match = true; 120 } 121 else 122 { 123 *matches = false; 124 *exact_match = false; 125 } 126} 127 128/* Add a path specification between PATH_B and PATH_E as one of the 129 paths in SPECS. */ 130 131static void 132add_path (struct robot_specs *specs, const char *path_b, const char *path_e, 133 bool allowedp, bool exactp) 134{ 135 struct path_info pp; 136 if (path_b < path_e && *path_b == '/') 137 /* Our path representation doesn't use a leading slash, so remove 138 one from theirs. */ 139 ++path_b; 140 pp.path = strdupdelim (path_b, path_e); 141 pp.allowedp = allowedp; 142 pp.user_agent_exact_p = exactp; 143 ++specs->count; 144 if (specs->count > specs->size) 145 { 146 if (specs->size == 0) 147 specs->size = 1; 148 else 149 specs->size <<= 1; 150 specs->paths = xrealloc (specs->paths, 151 specs->size * sizeof (struct path_info)); 152 } 153 specs->paths[specs->count - 1] = pp; 154} 155 156/* Recreate SPECS->paths with only those paths that have 157 user_agent_exact_p set to true. 
*/ 158 159static void 160prune_non_exact (struct robot_specs *specs) 161{ 162 struct path_info *newpaths; 163 int i, j, cnt; 164 cnt = 0; 165 for (i = 0; i < specs->count; i++) 166 if (specs->paths[i].user_agent_exact_p) 167 ++cnt; 168 newpaths = xnew_array (struct path_info, cnt); 169 for (i = 0, j = 0; i < specs->count; i++) 170 if (specs->paths[i].user_agent_exact_p) 171 newpaths[j++] = specs->paths[i]; 172 assert (j == cnt); 173 xfree (specs->paths); 174 specs->paths = newpaths; 175 specs->count = cnt; 176 specs->size = cnt; 177} 178 179#define EOL(p) ((p) >= lineend) 180 181#define SKIP_SPACE(p) do { \ 182 while (!EOL (p) && c_isspace (*p)) \ 183 ++p; \ 184} while (0) 185 186#define FIELD_IS(string_literal) \ 187 BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal) 188 189/* Parse textual RES specs beginning with SOURCE of length LENGTH. 190 Return a specs objects ready to be fed to res_match_path. 191 192 The parsing itself is trivial, but creating a correct SPECS object 193 is trickier than it seems, because RES is surprisingly byzantine if 194 you attempt to implement it correctly. 195 196 A "record" is a block of one or more `User-Agent' lines followed by 197 one or more `Allow' or `Disallow' lines. Record is accepted by 198 Wget if one of the `User-Agent' lines was "wget", or if the user 199 agent line was "*". 200 201 After all the lines have been read, we examine whether an exact 202 ("wget") user-agent field was specified. If so, we delete all the 203 lines read under "User-Agent: *" blocks because we have our own 204 Wget-specific blocks. This enables the admin to say: 205 206 User-Agent: * 207 Disallow: / 208 209 User-Agent: google 210 User-Agent: wget 211 Disallow: /cgi-bin 212 213 This means that to Wget and to Google, /cgi-bin is disallowed, 214 whereas for all other crawlers, everything is disallowed. 215 res_parse is implemented so that the order of records doesn't 216 matter. 
   In the case above, the "User-Agent: *" could have come
   after the other one. */

struct robot_specs *
res_parse (const char *source, int length)
{
  int line_count = 1;

  const char *p = source;
  const char *end = source + length;

  /* true if last applicable user-agent field matches Wget. */
  bool user_agent_applies = false;

  /* true if last applicable user-agent field *exactly* matches
     Wget. */
  bool user_agent_exact = false;

  /* whether we ever encountered exact user agent. */
  bool found_exact = false;

  /* count of allow/disallow lines in the current "record", i.e. after
     the last `user-agent' instructions. */
  int record_count = 0;

  struct robot_specs *specs = xnew0 (struct robot_specs);

  while (1)
    {
      const char *lineend, *lineend_real;
      const char *field_b, *field_e;
      const char *value_b, *value_e;

      if (p == end)
        break;
      /* Find the real end of the current line; lines without a
         trailing newline end at END. */
      lineend_real = memchr (p, '\n', end - p);
      if (lineend_real)
        ++lineend_real;
      else
        lineend_real = end;
      lineend = lineend_real;

      /* Before doing anything else, check whether the line is empty
         or comment-only. */
      SKIP_SPACE (p);
      if (EOL (p) || *p == '#')
        goto next;

      /* Make sure the end-of-line comments are respected by setting
         lineend to a location preceding the first comment.  Real line
         ending remains in lineend_real.  A '#' only starts a comment
         at line start or after whitespace (Bourne-shell style). */
      for (lineend = p; lineend < lineend_real; lineend++)
        if ((lineend == p || c_isspace (*(lineend - 1)))
            && *lineend == '#')
          break;

      /* Ignore trailing whitespace in the same way. */
      while (lineend > p && c_isspace (*(lineend - 1)))
        --lineend;

      assert (!EOL (p));

      /* Scan the field name: alphanumerics and '-'. */
      field_b = p;
      while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
        ++p;
      field_e = p;

      SKIP_SPACE (p);
      if (field_b == field_e || EOL (p) || *p != ':')
        {
          DEBUGP (("Ignoring malformed line %d\n", line_count));
          goto next;
        }
      ++p;                      /* skip ':' */
      SKIP_SPACE (p);

      /* Everything up to (comment-stripped) end of line is the value. */
      value_b = p;
      while (!EOL (p))
        ++p;
      value_e = p;

      /* Finally, we have a syntactically valid line. */
      if (FIELD_IS ("user-agent"))
        {
          /* We have to support several cases:

             --previous records--

             User-Agent: foo
             User-Agent: Wget
             User-Agent: bar
             ... matching record ...

             User-Agent: baz
             User-Agent: qux
             ... non-matching record ...

             User-Agent: *
             ... matching record, but will be pruned later ...

             We have to respect `User-Agent' at the beginning of each
             new record simply because we don't know if we're going to
             encounter "Wget" among the agents or not.  Hence,
             match_user_agent is called when record_count != 0.

             But if record_count is 0, we have to keep calling it
             until it matches, and if that happens, we must not call
             it any more, until the next record.  Hence the other part
             of the condition. */
          if (record_count != 0 || user_agent_applies == false)
            match_user_agent (value_b, value_e - value_b,
                              &user_agent_applies, &user_agent_exact);
          if (user_agent_exact)
            found_exact = true;
          record_count = 0;
        }
      else if (FIELD_IS ("allow"))
        {
          if (user_agent_applies)
            {
              add_path (specs, value_b, value_e, true, user_agent_exact);
            }
          ++record_count;
        }
      else if (FIELD_IS ("disallow"))
        {
          if (user_agent_applies)
            {
              bool allowed = false;
              if (value_b == value_e)
                /* Empty "disallow" line means everything is *allowed*! */
                allowed = true;
              add_path (specs, value_b, value_e, allowed, user_agent_exact);
            }
          ++record_count;
        }
      else
        {
          DEBUGP (("Ignoring unknown field at line %d\n", line_count));
          goto next;
        }

    next:
      /* Advance past the real line ending, comments included. */
      p = lineend_real;
      ++line_count;
    }

  if (found_exact)
    {
      /* We've encountered an exactly matching user-agent.  Throw out
         all the stuff with user-agent: *. */
      prune_non_exact (specs);
    }
  else if (specs->size > specs->count)
    {
      /* add_path normally over-allocates specs->paths.  Reallocate it
         to the correct size in order to conserve some memory. */
      specs->paths = xrealloc (specs->paths,
                               specs->count * sizeof (struct path_info));
      specs->size = specs->count;
    }

  return specs;
}

/* The same like res_parse, but first map the FILENAME into memory,
   and then parse it.  Returns NULL (with a logged message) if the
   file cannot be read. */

struct robot_specs *
res_parse_from_file (const char *filename)
{
  struct robot_specs *specs;
  struct file_memory *fm = wget_read_file (filename);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, _("Cannot open %s: %s"),
                 filename, strerror (errno));
      return NULL;
    }
  specs = res_parse (fm->content, fm->length);
  wget_read_file_free (fm);
  return specs;
}

/* Release SPECS and everything it owns: each rule's path string and
   the rule array itself. */

static void
free_specs (struct robot_specs *specs)
{
  int i;
  for (i = 0; i < specs->count; i++)
    xfree (specs->paths[i].path);
  xfree_null (specs->paths);
  xfree (specs);
}

/* Matching of a path according to the specs. */

/* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
   that number is not a numerical representation of '/', decode C and
   advance the pointer.
*/ 415 416#define DECODE_MAYBE(c, ptr) do { \ 417 if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2])) \ 418 { \ 419 char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \ 420 if (decoded != '/') \ 421 { \ 422 c = decoded; \ 423 ptr += 2; \ 424 } \ 425 } \ 426} while (0) 427 428/* The inner matching engine: return true if RECORD_PATH matches 429 URL_PATH. The rules for matching are described at 430 <http://www.robotstxt.org/wc/norobots-rfc.txt>, section 3.2.2. */ 431 432static bool 433matches (const char *record_path, const char *url_path) 434{ 435 const char *rp = record_path; 436 const char *up = url_path; 437 438 for (; ; ++rp, ++up) 439 { 440 char rc = *rp; 441 char uc = *up; 442 if (!rc) 443 return true; 444 if (!uc) 445 return false; 446 DECODE_MAYBE(rc, rp); 447 DECODE_MAYBE(uc, up); 448 if (rc != uc) 449 return false; 450 } 451} 452 453/* Iterate through all paths in SPECS. For the first one that 454 matches, return its allow/reject status. If none matches, 455 retrieval is by default allowed. */ 456 457bool 458res_match_path (const struct robot_specs *specs, const char *path) 459{ 460 int i; 461 if (!specs) 462 return true; 463 for (i = 0; i < specs->count; i++) 464 if (matches (specs->paths[i].path, path)) 465 { 466 bool allowedp = specs->paths[i].allowedp; 467 DEBUGP (("%s path %s because of rule %s.\n", 468 allowedp ? "Allowing" : "Rejecting", 469 path, quote (specs->paths[i].path))); 470 return allowedp; 471 } 472 return true; 473} 474 475/* Registering the specs. */ 476 477static struct hash_table *registered_specs; 478 479/* Stolen from cookies.c. */ 480#define SET_HOSTPORT(host, port, result) do { \ 481 int HP_len = strlen (host); \ 482 result = alloca (HP_len + 1 + numdigit (port) + 1); \ 483 memcpy (result, host, HP_len); \ 484 result[HP_len] = ':'; \ 485 number_to_string (result + HP_len + 1, port); \ 486} while (0) 487 488/* Register RES specs that below to server on HOST:PORT. They will 489 later be retrievable using res_get_specs. 
*/ 490 491void 492res_register_specs (const char *host, int port, struct robot_specs *specs) 493{ 494 struct robot_specs *old; 495 char *hp, *hp_old; 496 SET_HOSTPORT (host, port, hp); 497 498 if (!registered_specs) 499 registered_specs = make_nocase_string_hash_table (0); 500 501 if (hash_table_get_pair (registered_specs, hp, &hp_old, &old)) 502 { 503 if (old) 504 free_specs (old); 505 hash_table_put (registered_specs, hp_old, specs); 506 } 507 else 508 { 509 hash_table_put (registered_specs, xstrdup (hp), specs); 510 } 511} 512 513/* Get the specs that belong to HOST:PORT. */ 514 515struct robot_specs * 516res_get_specs (const char *host, int port) 517{ 518 char *hp; 519 SET_HOSTPORT (host, port, hp); 520 if (!registered_specs) 521 return NULL; 522 return hash_table_get (registered_specs, hp); 523} 524 525/* Loading the robots file. */ 526 527#define RES_SPECS_LOCATION "/robots.txt" 528 529/* Retrieve the robots.txt from the server root of the server that 530 serves URL. The file will be named according to the currently 531 active rules, and the file name will be returned in *file. 532 533 Return true if robots were retrieved OK, false otherwise. 
 */

bool
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
  struct iri *i = iri_new ();
  uerr_t err;
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  /* Save the global options we temporarily override below.  */
  int saved_ts_val = opt.timestamping;
  int saved_sp_val = opt.spider, url_err;
  struct url * url_parsed;

  /* Copy server URI encoding for a possible IDNA transformation, no need to
     encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
  set_uri_encoding (i, iri->uri_encoding, false);
  i->utf8_encode = false;

  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  *file = NULL;
  /* Disable timestamping and spider mode for this one fetch; both are
     restored before returning. */
  opt.timestamping = false;
  opt.spider = false;

  url_parsed = url_parse (robots_url, &url_err, i, true);
  if (!url_parsed)
    {
      char *error = url_error (robots_url, url_err);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
      xfree (error);
      err = URLERROR;
    }
  else
    {
      err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
                          false, i, false);
      url_free(url_parsed);
    }

  /* Restore the saved option values. */
  opt.timestamping = saved_ts_val;
  opt.spider = saved_sp_val;
  xfree (robots_url);
  iri_free (i);

  if (err != RETROK && *file != NULL)
    {
      /* If the file is not retrieved correctly, but retrieve_url
         allocated the file name, deallocate is here so that the
         caller doesn't have to worry about it. */
      xfree (*file);
      *file = NULL;
    }
  return err == RETROK;
}

/* Return true if URL is the robots.txt URL of its own server root,
   i.e. merging RES_SPECS_LOCATION into URL reproduces URL itself. */

bool
is_robots_txt_url (const char *url)
{
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  bool ret = are_urls_equal (url, robots_url);

  xfree (robots_url);

  return ret;
}

/* Free all registered specs and the host:port keys that index them,
   and drop the registry itself.  Safe to call when nothing was ever
   registered. */

void
res_cleanup (void)
{
  if (registered_specs)
    {
      hash_table_iterator iter;
      for (hash_table_iterate (registered_specs, &iter);
           hash_table_iter_next (&iter);
           )
        {
          xfree (iter.key);
          free_specs (iter.value);
        }
      hash_table_destroy (registered_specs);
      registered_specs = NULL;
    }
}

#ifdef TESTING

/* Unit test: is_robots_txt_url must accept only the server-root
   robots.txt, not robots.txt in subdirectories. */

const char *
test_is_robots_txt_url(void)
{
  unsigned i;
  static const struct {
    const char *url;
    bool expected_result;
  } test_array[] = {
    { "http://www.yoyodyne.com/robots.txt", true },
    { "http://www.yoyodyne.com/somepath/", false },
    { "http://www.yoyodyne.com/somepath/robots.txt", false },
  };

  for (i = 0; i < countof(test_array); ++i)
    {
      mu_assert ("test_is_robots_txt_url: wrong result",
                 is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);
    }

  return NULL;
}

#endif /* TESTING */

/*
 * vim: et ts=2 sw=2
 */