/* HTTP support.
   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.
*/ 30 31#include "wget.h" 32 33#include <stdio.h> 34#include <stdlib.h> 35#include <string.h> 36#ifdef HAVE_UNISTD_H 37# include <unistd.h> 38#endif 39#include <assert.h> 40#include <errno.h> 41#include <time.h> 42#include <locale.h> 43 44#include "hash.h" 45#include "http.h" 46#include "utils.h" 47#include "url.h" 48#include "host.h" 49#include "retr.h" 50#include "connect.h" 51#include "netrc.h" 52#ifdef HAVE_SSL 53# include "ssl.h" 54#endif 55#ifdef ENABLE_NTLM 56# include "http-ntlm.h" 57#endif 58#include "cookies.h" 59#ifdef ENABLE_DIGEST 60# include "gen-md5.h" 61#endif 62#include "convert.h" 63#include "spider.h" 64 65#ifdef TESTING 66#include "test.h" 67#endif 68 69#ifdef __VMS 70# include "vms.h" 71#endif /* def __VMS */ 72 73extern char *version_string; 74 75/* Forward decls. */ 76struct http_stat; 77static char *create_authorization_line (const char *, const char *, 78 const char *, const char *, 79 const char *, bool *); 80static char *basic_authentication_encode (const char *, const char *); 81static bool known_authentication_scheme_p (const char *, const char *); 82static void ensure_extension (struct http_stat *, const char *, int *); 83static void load_cookies (void); 84 85#ifndef MIN 86# define MIN(x, y) ((x) > (y) ? (y) : (x)) 87#endif 88 89 90static bool cookies_loaded_p; 91static struct cookie_jar *wget_cookie_jar; 92 93#define TEXTHTML_S "text/html" 94#define TEXTXHTML_S "application/xhtml+xml" 95#define TEXTCSS_S "text/css" 96 97/* Some status code validation macros: */ 98#define H_20X(x) (((x) >= 200) && ((x) < 300)) 99#define H_PARTIAL(x) ((x) == HTTP_STATUS_PARTIAL_CONTENTS) 100#define H_REDIRECTED(x) ((x) == HTTP_STATUS_MOVED_PERMANENTLY \ 101 || (x) == HTTP_STATUS_MOVED_TEMPORARILY \ 102 || (x) == HTTP_STATUS_SEE_OTHER \ 103 || (x) == HTTP_STATUS_TEMPORARY_REDIRECT) 104 105/* HTTP/1.0 status codes from RFC1945, provided for reference. */ 106/* Successful 2xx. 
*/ 107#define HTTP_STATUS_OK 200 108#define HTTP_STATUS_CREATED 201 109#define HTTP_STATUS_ACCEPTED 202 110#define HTTP_STATUS_NO_CONTENT 204 111#define HTTP_STATUS_PARTIAL_CONTENTS 206 112 113/* Redirection 3xx. */ 114#define HTTP_STATUS_MULTIPLE_CHOICES 300 115#define HTTP_STATUS_MOVED_PERMANENTLY 301 116#define HTTP_STATUS_MOVED_TEMPORARILY 302 117#define HTTP_STATUS_SEE_OTHER 303 /* from HTTP/1.1 */ 118#define HTTP_STATUS_NOT_MODIFIED 304 119#define HTTP_STATUS_TEMPORARY_REDIRECT 307 /* from HTTP/1.1 */ 120 121/* Client error 4xx. */ 122#define HTTP_STATUS_BAD_REQUEST 400 123#define HTTP_STATUS_UNAUTHORIZED 401 124#define HTTP_STATUS_FORBIDDEN 403 125#define HTTP_STATUS_NOT_FOUND 404 126#define HTTP_STATUS_RANGE_NOT_SATISFIABLE 416 127 128/* Server errors 5xx. */ 129#define HTTP_STATUS_INTERNAL 500 130#define HTTP_STATUS_NOT_IMPLEMENTED 501 131#define HTTP_STATUS_BAD_GATEWAY 502 132#define HTTP_STATUS_UNAVAILABLE 503 133 134enum rp { 135 rel_none, rel_name, rel_value, rel_both 136}; 137 138struct request { 139 const char *method; 140 char *arg; 141 142 struct request_header { 143 char *name, *value; 144 enum rp release_policy; 145 } *headers; 146 int hcount, hcapacity; 147}; 148 149extern int numurls; 150 151/* Create a new, empty request. At least request_set_method must be 152 called before the request can be used. */ 153 154static struct request * 155request_new (void) 156{ 157 struct request *req = xnew0 (struct request); 158 req->hcapacity = 8; 159 req->headers = xnew_array (struct request_header, req->hcapacity); 160 return req; 161} 162 163/* Set the request's method and its arguments. METH should be a 164 literal string (or it should outlive the request) because it will 165 not be freed. ARG will be freed by request_free. 
*/ 166 167static void 168request_set_method (struct request *req, const char *meth, char *arg) 169{ 170 req->method = meth; 171 req->arg = arg; 172} 173 174/* Return the method string passed with the last call to 175 request_set_method. */ 176 177static const char * 178request_method (const struct request *req) 179{ 180 return req->method; 181} 182 183/* Free one header according to the release policy specified with 184 request_set_header. */ 185 186static void 187release_header (struct request_header *hdr) 188{ 189 switch (hdr->release_policy) 190 { 191 case rel_none: 192 break; 193 case rel_name: 194 xfree (hdr->name); 195 break; 196 case rel_value: 197 xfree (hdr->value); 198 break; 199 case rel_both: 200 xfree (hdr->name); 201 xfree (hdr->value); 202 break; 203 } 204} 205 206/* Set the request named NAME to VALUE. Specifically, this means that 207 a "NAME: VALUE\r\n" header line will be used in the request. If a 208 header with the same name previously existed in the request, its 209 value will be replaced by this one. A NULL value means do nothing. 210 211 RELEASE_POLICY determines whether NAME and VALUE should be released 212 (freed) with request_free. Allowed values are: 213 214 - rel_none - don't free NAME or VALUE 215 - rel_name - free NAME when done 216 - rel_value - free VALUE when done 217 - rel_both - free both NAME and VALUE when done 218 219 Setting release policy is useful when arguments come from different 220 sources. For example: 221 222 // Don't free literal strings! 223 request_set_header (req, "Pragma", "no-cache", rel_none); 224 225 // Don't free a global variable, we'll need it later. 226 request_set_header (req, "Referer", opt.referer, rel_none); 227 228 // Value freshly allocated, free it when done. 
229 request_set_header (req, "Range", 230 aprintf ("bytes=%s-", number_to_static_string (hs->restval)), 231 rel_value); 232 */ 233 234static void 235request_set_header (struct request *req, char *name, char *value, 236 enum rp release_policy) 237{ 238 struct request_header *hdr; 239 int i; 240 241 if (!value) 242 { 243 /* A NULL value is a no-op; if freeing the name is requested, 244 free it now to avoid leaks. */ 245 if (release_policy == rel_name || release_policy == rel_both) 246 xfree (name); 247 return; 248 } 249 250 for (i = 0; i < req->hcount; i++) 251 { 252 hdr = &req->headers[i]; 253 if (0 == strcasecmp (name, hdr->name)) 254 { 255 /* Replace existing header. */ 256 release_header (hdr); 257 hdr->name = name; 258 hdr->value = value; 259 hdr->release_policy = release_policy; 260 return; 261 } 262 } 263 264 /* Install new header. */ 265 266 if (req->hcount >= req->hcapacity) 267 { 268 req->hcapacity <<= 1; 269 req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr)); 270 } 271 hdr = &req->headers[req->hcount++]; 272 hdr->name = name; 273 hdr->value = value; 274 hdr->release_policy = release_policy; 275} 276 277/* Like request_set_header, but sets the whole header line, as 278 provided by the user using the `--header' option. For example, 279 request_set_user_header (req, "Foo: bar") works just like 280 request_set_header (req, "Foo", "bar"). */ 281 282static void 283request_set_user_header (struct request *req, const char *header) 284{ 285 char *name; 286 const char *p = strchr (header, ':'); 287 if (!p) 288 return; 289 BOUNDED_TO_ALLOCA (header, p, name); 290 ++p; 291 while (c_isspace (*p)) 292 ++p; 293 request_set_header (req, xstrdup (name), (char *) p, rel_name); 294} 295 296/* Remove the header with specified name from REQ. Returns true if 297 the header was actually removed, false otherwise. 
*/ 298 299static bool 300request_remove_header (struct request *req, char *name) 301{ 302 int i; 303 for (i = 0; i < req->hcount; i++) 304 { 305 struct request_header *hdr = &req->headers[i]; 306 if (0 == strcasecmp (name, hdr->name)) 307 { 308 release_header (hdr); 309 /* Move the remaining headers by one. */ 310 if (i < req->hcount - 1) 311 memmove (hdr, hdr + 1, (req->hcount - i - 1) * sizeof (*hdr)); 312 --req->hcount; 313 return true; 314 } 315 } 316 return false; 317} 318 319#define APPEND(p, str) do { \ 320 int A_len = strlen (str); \ 321 memcpy (p, str, A_len); \ 322 p += A_len; \ 323} while (0) 324 325/* Construct the request and write it to FD using fd_write. */ 326 327static int 328request_send (const struct request *req, int fd) 329{ 330 char *request_string, *p; 331 int i, size, write_error; 332 333 /* Count the request size. */ 334 size = 0; 335 336 /* METHOD " " ARG " " "HTTP/1.0" "\r\n" */ 337 size += strlen (req->method) + 1 + strlen (req->arg) + 1 + 8 + 2; 338 339 for (i = 0; i < req->hcount; i++) 340 { 341 struct request_header *hdr = &req->headers[i]; 342 /* NAME ": " VALUE "\r\n" */ 343 size += strlen (hdr->name) + 2 + strlen (hdr->value) + 2; 344 } 345 346 /* "\r\n\0" */ 347 size += 3; 348 349 p = request_string = alloca_array (char, size); 350 351 /* Generate the request. */ 352 353 APPEND (p, req->method); *p++ = ' '; 354 APPEND (p, req->arg); *p++ = ' '; 355 memcpy (p, "HTTP/1.0\r\n", 10); p += 10; 356 357 for (i = 0; i < req->hcount; i++) 358 { 359 struct request_header *hdr = &req->headers[i]; 360 APPEND (p, hdr->name); 361 *p++ = ':', *p++ = ' '; 362 APPEND (p, hdr->value); 363 *p++ = '\r', *p++ = '\n'; 364 } 365 366 *p++ = '\r', *p++ = '\n', *p++ = '\0'; 367 assert (p - request_string == size); 368 369#undef APPEND 370 371 DEBUGP (("\n---request begin---\n%s---request end---\n", request_string)); 372 373 /* Send the request to the server. 
*/ 374 375 write_error = fd_write (fd, request_string, size - 1, -1); 376 if (write_error < 0) 377 logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"), 378 fd_errstr (fd)); 379 return write_error; 380} 381 382/* Release the resources used by REQ. */ 383 384static void 385request_free (struct request *req) 386{ 387 int i; 388 xfree_null (req->arg); 389 for (i = 0; i < req->hcount; i++) 390 release_header (&req->headers[i]); 391 xfree_null (req->headers); 392 xfree (req); 393} 394 395static struct hash_table *basic_authed_hosts; 396 397/* Find out if this host has issued a Basic challenge yet; if so, give 398 * it the username, password. A temporary measure until we can get 399 * proper authentication in place. */ 400 401static bool 402maybe_send_basic_creds (const char *hostname, const char *user, 403 const char *passwd, struct request *req) 404{ 405 bool do_challenge = false; 406 407 if (opt.auth_without_challenge) 408 { 409 DEBUGP(("Auth-without-challenge set, sending Basic credentials.\n")); 410 do_challenge = true; 411 } 412 else if (basic_authed_hosts 413 && hash_table_contains(basic_authed_hosts, hostname)) 414 { 415 DEBUGP(("Found %s in basic_authed_hosts.\n", quote (hostname))); 416 do_challenge = true; 417 } 418 else 419 { 420 DEBUGP(("Host %s has not issued a general basic challenge.\n", 421 quote (hostname))); 422 } 423 if (do_challenge) 424 { 425 request_set_header (req, "Authorization", 426 basic_authentication_encode (user, passwd), 427 rel_value); 428 } 429 return do_challenge; 430} 431 432static void 433register_basic_auth_host (const char *hostname) 434{ 435 if (!basic_authed_hosts) 436 { 437 basic_authed_hosts = make_nocase_string_hash_table (1); 438 } 439 if (!hash_table_contains(basic_authed_hosts, hostname)) 440 { 441 hash_table_put (basic_authed_hosts, xstrdup(hostname), NULL); 442 DEBUGP(("Inserted %s into basic_authed_hosts\n", quote (hostname))); 443 } 444} 445 446 447/* Send the contents of FILE_NAME to SOCK. 
Make sure that exactly 448 PROMISED_SIZE bytes are sent over the wire -- if the file is 449 longer, read only that much; if the file is shorter, report an error. */ 450 451static int 452post_file (int sock, const char *file_name, wgint promised_size) 453{ 454 static char chunk[8192]; 455 wgint written = 0; 456 int write_error; 457 FILE *fp; 458 459 DEBUGP (("[writing POST file %s ... ", file_name)); 460 461 fp = fopen (file_name, "rb"); 462 if (!fp) 463 return -1; 464 while (!feof (fp) && written < promised_size) 465 { 466 int towrite; 467 int length = fread (chunk, 1, sizeof (chunk), fp); 468 if (length == 0) 469 break; 470 towrite = MIN (promised_size - written, length); 471 write_error = fd_write (sock, chunk, towrite, -1); 472 if (write_error < 0) 473 { 474 fclose (fp); 475 return -1; 476 } 477 written += towrite; 478 } 479 fclose (fp); 480 481 /* If we've written less than was promised, report a (probably 482 nonsensical) error rather than break the promise. */ 483 if (written < promised_size) 484 { 485 errno = EINVAL; 486 return -1; 487 } 488 489 assert (written == promised_size); 490 DEBUGP (("done]\n")); 491 return 0; 492} 493 494/* Determine whether [START, PEEKED + PEEKLEN) contains an empty line. 495 If so, return the pointer to the position after the line, otherwise 496 return NULL. This is used as callback to fd_read_hunk. The data 497 between START and PEEKED has been read and cannot be "unread"; the 498 data after PEEKED has only been peeked. */ 499 500static const char * 501response_head_terminator (const char *start, const char *peeked, int peeklen) 502{ 503 const char *p, *end; 504 505 /* If at first peek, verify whether HUNK starts with "HTTP". If 506 not, this is a HTTP/0.9 request and we must bail out without 507 reading anything. */ 508 if (start == peeked && 0 != memcmp (start, "HTTP", MIN (peeklen, 4))) 509 return start; 510 511 /* Look for "\n[\r]\n", and return the following position if found. 
512 Start two chars before the current to cover the possibility that 513 part of the terminator (e.g. "\n\r") arrived in the previous 514 batch. */ 515 p = peeked - start < 2 ? start : peeked - 2; 516 end = peeked + peeklen; 517 518 /* Check for \n\r\n or \n\n anywhere in [p, end-2). */ 519 for (; p < end - 2; p++) 520 if (*p == '\n') 521 { 522 if (p[1] == '\r' && p[2] == '\n') 523 return p + 3; 524 else if (p[1] == '\n') 525 return p + 2; 526 } 527 /* p==end-2: check for \n\n directly preceding END. */ 528 if (p[0] == '\n' && p[1] == '\n') 529 return p + 2; 530 531 return NULL; 532} 533 534/* The maximum size of a single HTTP response we care to read. Rather 535 than being a limit of the reader implementation, this limit 536 prevents Wget from slurping all available memory upon encountering 537 malicious or buggy server output, thus protecting the user. Define 538 it to 0 to remove the limit. */ 539 540#define HTTP_RESPONSE_MAX_SIZE 65536 541 542/* Read the HTTP request head from FD and return it. The error 543 conditions are the same as with fd_read_hunk. 544 545 To support HTTP/0.9 responses, this function tries to make sure 546 that the data begins with "HTTP". If this is not the case, no data 547 is read and an empty request is returned, so that the remaining 548 data can be treated as body. */ 549 550static char * 551read_http_response_head (int fd) 552{ 553 return fd_read_hunk (fd, response_head_terminator, 512, 554 HTTP_RESPONSE_MAX_SIZE); 555} 556 557struct response { 558 /* The response data. */ 559 const char *data; 560 561 /* The array of pointers that indicate where each header starts. 562 For example, given this HTTP response: 563 564 HTTP/1.0 200 Ok 565 Description: some 566 text 567 Etag: x 568 569 The headers are located like this: 570 571 "HTTP/1.0 200 Ok\r\nDescription: some\r\n text\r\nEtag: x\r\n\r\n" 572 ^ ^ ^ ^ 573 headers[0] headers[1] headers[2] headers[3] 574 575 I.e. 
headers[0] points to the beginning of the request, 576 headers[1] points to the end of the first header and the 577 beginning of the second one, etc. */ 578 579 const char **headers; 580}; 581 582/* Create a new response object from the text of the HTTP response, 583 available in HEAD. That text is automatically split into 584 constituent header lines for fast retrieval using 585 resp_header_*. */ 586 587static struct response * 588resp_new (const char *head) 589{ 590 const char *hdr; 591 int count, size; 592 593 struct response *resp = xnew0 (struct response); 594 resp->data = head; 595 596 if (*head == '\0') 597 { 598 /* Empty head means that we're dealing with a headerless 599 (HTTP/0.9) response. In that case, don't set HEADERS at 600 all. */ 601 return resp; 602 } 603 604 /* Split HEAD into header lines, so that resp_header_* functions 605 don't need to do this over and over again. */ 606 607 size = count = 0; 608 hdr = head; 609 while (1) 610 { 611 DO_REALLOC (resp->headers, size, count + 1, const char *); 612 resp->headers[count++] = hdr; 613 614 /* Break upon encountering an empty line. */ 615 if (!hdr[0] || (hdr[0] == '\r' && hdr[1] == '\n') || hdr[0] == '\n') 616 break; 617 618 /* Find the end of HDR, including continuations. */ 619 do 620 { 621 const char *end = strchr (hdr, '\n'); 622 if (end) 623 hdr = end + 1; 624 else 625 hdr += strlen (hdr); 626 } 627 while (*hdr == ' ' || *hdr == '\t'); 628 } 629 DO_REALLOC (resp->headers, size, count + 1, const char *); 630 resp->headers[count] = NULL; 631 632 return resp; 633} 634 635/* Locate the header named NAME in the request data, starting with 636 position START. This allows the code to loop through the request 637 data, filtering for all requests of a given name. Returns the 638 found position, or -1 for failure. The code that uses this 639 function typically looks like this: 640 641 for (pos = 0; (pos = resp_header_locate (...)) != -1; pos++) 642 ... do something with header ... 
643 644 If you only care about one header, use resp_header_get instead of 645 this function. */ 646 647static int 648resp_header_locate (const struct response *resp, const char *name, int start, 649 const char **begptr, const char **endptr) 650{ 651 int i; 652 const char **headers = resp->headers; 653 int name_len; 654 655 if (!headers || !headers[1]) 656 return -1; 657 658 name_len = strlen (name); 659 if (start > 0) 660 i = start; 661 else 662 i = 1; 663 664 for (; headers[i + 1]; i++) 665 { 666 const char *b = headers[i]; 667 const char *e = headers[i + 1]; 668 if (e - b > name_len 669 && b[name_len] == ':' 670 && 0 == strncasecmp (b, name, name_len)) 671 { 672 b += name_len + 1; 673 while (b < e && c_isspace (*b)) 674 ++b; 675 while (b < e && c_isspace (e[-1])) 676 --e; 677 *begptr = b; 678 *endptr = e; 679 return i; 680 } 681 } 682 return -1; 683} 684 685/* Find and retrieve the header named NAME in the request data. If 686 found, set *BEGPTR to its starting, and *ENDPTR to its ending 687 position, and return true. Otherwise return false. 688 689 This function is used as a building block for resp_header_copy 690 and resp_header_strdup. */ 691 692static bool 693resp_header_get (const struct response *resp, const char *name, 694 const char **begptr, const char **endptr) 695{ 696 int pos = resp_header_locate (resp, name, 0, begptr, endptr); 697 return pos != -1; 698} 699 700/* Copy the response header named NAME to buffer BUF, no longer than 701 BUFSIZE (BUFSIZE includes the terminating 0). If the header 702 exists, true is returned, false otherwise. If there should be no 703 limit on the size of the header, use resp_header_strdup instead. 704 705 If BUFSIZE is 0, no data is copied, but the boolean indication of 706 whether the header is present is still returned. 
*/ 707 708static bool 709resp_header_copy (const struct response *resp, const char *name, 710 char *buf, int bufsize) 711{ 712 const char *b, *e; 713 if (!resp_header_get (resp, name, &b, &e)) 714 return false; 715 if (bufsize) 716 { 717 int len = MIN (e - b, bufsize - 1); 718 memcpy (buf, b, len); 719 buf[len] = '\0'; 720 } 721 return true; 722} 723 724/* Return the value of header named NAME in RESP, allocated with 725 malloc. If such a header does not exist in RESP, return NULL. */ 726 727static char * 728resp_header_strdup (const struct response *resp, const char *name) 729{ 730 const char *b, *e; 731 if (!resp_header_get (resp, name, &b, &e)) 732 return NULL; 733 return strdupdelim (b, e); 734} 735 736/* Parse the HTTP status line, which is of format: 737 738 HTTP-Version SP Status-Code SP Reason-Phrase 739 740 The function returns the status-code, or -1 if the status line 741 appears malformed. The pointer to "reason-phrase" message is 742 returned in *MESSAGE. */ 743 744static int 745resp_status (const struct response *resp, char **message) 746{ 747 int status; 748 const char *p, *end; 749 750 if (!resp->headers) 751 { 752 /* For a HTTP/0.9 response, assume status 200. */ 753 if (message) 754 *message = xstrdup (_("No headers, assuming HTTP/0.9")); 755 return 200; 756 } 757 758 p = resp->headers[0]; 759 end = resp->headers[1]; 760 761 if (!end) 762 return -1; 763 764 /* "HTTP" */ 765 if (end - p < 4 || 0 != strncmp (p, "HTTP", 4)) 766 return -1; 767 p += 4; 768 769 /* Match the HTTP version. This is optional because Gnutella 770 servers have been reported to not specify HTTP version. 
*/ 771 if (p < end && *p == '/') 772 { 773 ++p; 774 while (p < end && c_isdigit (*p)) 775 ++p; 776 if (p < end && *p == '.') 777 ++p; 778 while (p < end && c_isdigit (*p)) 779 ++p; 780 } 781 782 while (p < end && c_isspace (*p)) 783 ++p; 784 if (end - p < 3 || !c_isdigit (p[0]) || !c_isdigit (p[1]) || !c_isdigit (p[2])) 785 return -1; 786 787 status = 100 * (p[0] - '0') + 10 * (p[1] - '0') + (p[2] - '0'); 788 p += 3; 789 790 if (message) 791 { 792 while (p < end && c_isspace (*p)) 793 ++p; 794 while (p < end && c_isspace (end[-1])) 795 --end; 796 *message = strdupdelim (p, end); 797 } 798 799 return status; 800} 801 802/* Release the resources used by RESP. */ 803 804static void 805resp_free (struct response *resp) 806{ 807 xfree_null (resp->headers); 808 xfree (resp); 809} 810 811/* Print a single line of response, the characters [b, e). We tried 812 getting away with 813 logprintf (LOG_VERBOSE, "%s%.*s\n", prefix, (int) (e - b), b); 814 but that failed to escape the non-printable characters and, in fact, 815 caused crashes in UTF-8 locales. */ 816 817static void 818print_response_line(const char *prefix, const char *b, const char *e) 819{ 820 char *copy; 821 BOUNDED_TO_ALLOCA(b, e, copy); 822 logprintf (LOG_ALWAYS, "%s%s\n", prefix, 823 quotearg_style (escape_quoting_style, copy)); 824} 825 826/* Print the server response, line by line, omitting the trailing CRLF 827 from individual header lines, and prefixed with PREFIX. */ 828 829static void 830print_server_response (const struct response *resp, const char *prefix) 831{ 832 int i; 833 if (!resp->headers) 834 return; 835 for (i = 0; resp->headers[i + 1]; i++) 836 { 837 const char *b = resp->headers[i]; 838 const char *e = resp->headers[i + 1]; 839 /* Skip CRLF */ 840 if (b < e && e[-1] == '\n') 841 --e; 842 if (b < e && e[-1] == '\r') 843 --e; 844 print_response_line(prefix, b, e); 845 } 846} 847 848/* Parse the `Content-Range' header and extract the information it 849 contains. 
Returns true if successful, false otherwise. */ 850static bool 851parse_content_range (const char *hdr, wgint *first_byte_ptr, 852 wgint *last_byte_ptr, wgint *entity_length_ptr) 853{ 854 wgint num; 855 856 /* Ancient versions of Netscape proxy server, presumably predating 857 rfc2068, sent out `Content-Range' without the "bytes" 858 specifier. */ 859 if (0 == strncasecmp (hdr, "bytes", 5)) 860 { 861 hdr += 5; 862 /* "JavaWebServer/1.1.1" sends "bytes: x-y/z", contrary to the 863 HTTP spec. */ 864 if (*hdr == ':') 865 ++hdr; 866 while (c_isspace (*hdr)) 867 ++hdr; 868 if (!*hdr) 869 return false; 870 } 871 if (!c_isdigit (*hdr)) 872 return false; 873 for (num = 0; c_isdigit (*hdr); hdr++) 874 num = 10 * num + (*hdr - '0'); 875 if (*hdr != '-' || !c_isdigit (*(hdr + 1))) 876 return false; 877 *first_byte_ptr = num; 878 ++hdr; 879 for (num = 0; c_isdigit (*hdr); hdr++) 880 num = 10 * num + (*hdr - '0'); 881 if (*hdr != '/' || !c_isdigit (*(hdr + 1))) 882 return false; 883 *last_byte_ptr = num; 884 ++hdr; 885 if (*hdr == '*') 886 num = -1; 887 else 888 for (num = 0; c_isdigit (*hdr); hdr++) 889 num = 10 * num + (*hdr - '0'); 890 *entity_length_ptr = num; 891 return true; 892} 893 894/* Read the body of the request, but don't store it anywhere and don't 895 display a progress gauge. This is useful for reading the bodies of 896 administrative responses to which we will soon issue another 897 request. The response is not useful to the user, but reading it 898 allows us to continue using the same connection to the server. 899 900 If reading fails, false is returned, true otherwise. In debug 901 mode, the body is displayed for debugging purposes. 
*/ 902 903static bool 904skip_short_body (int fd, wgint contlen) 905{ 906 enum { 907 SKIP_SIZE = 512, /* size of the download buffer */ 908 SKIP_THRESHOLD = 4096 /* the largest size we read */ 909 }; 910 char dlbuf[SKIP_SIZE + 1]; 911 dlbuf[SKIP_SIZE] = '\0'; /* so DEBUGP can safely print it */ 912 913 /* We shouldn't get here with unknown contlen. (This will change 914 with HTTP/1.1, which supports "chunked" transfer.) */ 915 assert (contlen != -1); 916 917 /* If the body is too large, it makes more sense to simply close the 918 connection than to try to read the body. */ 919 if (contlen > SKIP_THRESHOLD) 920 return false; 921 922 DEBUGP (("Skipping %s bytes of body: [", number_to_static_string (contlen))); 923 924 while (contlen > 0) 925 { 926 int ret = fd_read (fd, dlbuf, MIN (contlen, SKIP_SIZE), -1); 927 if (ret <= 0) 928 { 929 /* Don't normally report the error since this is an 930 optimization that should be invisible to the user. */ 931 DEBUGP (("] aborting (%s).\n", 932 ret < 0 ? fd_errstr (fd) : "EOF received")); 933 return false; 934 } 935 contlen -= ret; 936 /* Safe even if %.*s bogusly expects terminating \0 because 937 we've zero-terminated dlbuf above. */ 938 DEBUGP (("%.*s", ret, dlbuf)); 939 } 940 941 DEBUGP (("] done.\n")); 942 return true; 943} 944 945/* Extract a parameter from the string (typically an HTTP header) at 946 **SOURCE and advance SOURCE to the next parameter. Return false 947 when there are no more parameters to extract. The name of the 948 parameter is returned in NAME, and the value in VALUE. If the 949 parameter has no value, the token's value is zeroed out. 950 951 For example, if *SOURCE points to the string "attachment; 952 filename=\"foo bar\"", the first call to this function will return 953 the token named "attachment" and no value, and the second call will 954 return the token named "filename" and value "foo bar". The third 955 call will return false, indicating no more valid tokens. 
*/ 956 957bool 958extract_param (const char **source, param_token *name, param_token *value, 959 char separator) 960{ 961 const char *p = *source; 962 963 while (c_isspace (*p)) ++p; 964 if (!*p) 965 { 966 *source = p; 967 return false; /* no error; nothing more to extract */ 968 } 969 970 /* Extract name. */ 971 name->b = p; 972 while (*p && !c_isspace (*p) && *p != '=' && *p != separator) ++p; 973 name->e = p; 974 if (name->b == name->e) 975 return false; /* empty name: error */ 976 while (c_isspace (*p)) ++p; 977 if (*p == separator || !*p) /* no value */ 978 { 979 xzero (*value); 980 if (*p == separator) ++p; 981 *source = p; 982 return true; 983 } 984 if (*p != '=') 985 return false; /* error */ 986 987 /* *p is '=', extract value */ 988 ++p; 989 while (c_isspace (*p)) ++p; 990 if (*p == '"') /* quoted */ 991 { 992 value->b = ++p; 993 while (*p && *p != '"') ++p; 994 if (!*p) 995 return false; 996 value->e = p++; 997 /* Currently at closing quote; find the end of param. */ 998 while (c_isspace (*p)) ++p; 999 while (*p && *p != separator) ++p; 1000 if (*p == separator) 1001 ++p; 1002 else if (*p) 1003 /* garbage after closed quote, e.g. foo="bar"baz */ 1004 return false; 1005 } 1006 else /* unquoted */ 1007 { 1008 value->b = p; 1009 while (*p && *p != separator) ++p; 1010 value->e = p; 1011 while (value->e != value->b && c_isspace (value->e[-1])) 1012 --value->e; 1013 if (*p == separator) ++p; 1014 } 1015 *source = p; 1016 return true; 1017} 1018 1019#undef MAX 1020#define MAX(p, q) ((p) > (q) ? (p) : (q)) 1021 1022/* Parse the contents of the `Content-Disposition' header, extracting 1023 the information useful to Wget. Content-Disposition is a header 1024 borrowed from MIME; when used in HTTP, it typically serves for 1025 specifying the desired file name of the resource. 
For example: 1026 1027 Content-Disposition: attachment; filename="flora.jpg" 1028 1029 Wget will skip the tokens it doesn't care about, such as 1030 "attachment" in the previous example; it will also skip other 1031 unrecognized params. If the header is syntactically correct and 1032 contains a file name, a copy of the file name is stored in 1033 *filename and true is returned. Otherwise, the function returns 1034 false. 1035 1036 The file name is stripped of directory components and must not be 1037 empty. */ 1038 1039static bool 1040parse_content_disposition (const char *hdr, char **filename) 1041{ 1042 param_token name, value; 1043 while (extract_param (&hdr, &name, &value, ';')) 1044 if (BOUNDED_EQUAL_NO_CASE (name.b, name.e, "filename") && value.b != NULL) 1045 { 1046 /* Make the file name begin at the last slash or backslash. */ 1047 const char *last_slash = memrchr (value.b, '/', value.e - value.b); 1048 const char *last_bs = memrchr (value.b, '\\', value.e - value.b); 1049 if (last_slash && last_bs) 1050 value.b = 1 + MAX (last_slash, last_bs); 1051 else if (last_slash || last_bs) 1052 value.b = 1 + (last_slash ? last_slash : last_bs); 1053 if (value.b == value.e) 1054 continue; 1055 /* Start with the directory prefix, if specified. */ 1056 if (opt.dir_prefix) 1057 { 1058 int prefix_length = strlen (opt.dir_prefix); 1059 bool add_slash = (opt.dir_prefix[prefix_length - 1] != '/'); 1060 int total_length; 1061 1062 if (add_slash) 1063 ++prefix_length; 1064 total_length = prefix_length + (value.e - value.b); 1065 *filename = xmalloc (total_length + 1); 1066 strcpy (*filename, opt.dir_prefix); 1067 if (add_slash) 1068 (*filename)[prefix_length - 1] = '/'; 1069 memcpy (*filename + prefix_length, value.b, (value.e - value.b)); 1070 (*filename)[total_length] = '\0'; 1071 } 1072 else 1073 *filename = strdupdelim (value.b, value.e); 1074 return true; 1075 } 1076 return false; 1077} 1078 1079/* Persistent connections. 
Currently, we cache the most recently used 1080 connection as persistent, provided that the HTTP server agrees to 1081 make it such. The persistence data is stored in the variables 1082 below. Ideally, it should be possible to cache an arbitrary fixed 1083 number of these connections. */ 1084 1085/* Whether a persistent connection is active. */ 1086static bool pconn_active; 1087 1088static struct { 1089 /* The socket of the connection. */ 1090 int socket; 1091 1092 /* Host and port of the currently active persistent connection. */ 1093 char *host; 1094 int port; 1095 1096 /* Whether a ssl handshake has occoured on this connection. */ 1097 bool ssl; 1098 1099 /* Whether the connection was authorized. This is only done by 1100 NTLM, which authorizes *connections* rather than individual 1101 requests. (That practice is peculiar for HTTP, but it is a 1102 useful optimization.) */ 1103 bool authorized; 1104 1105#ifdef ENABLE_NTLM 1106 /* NTLM data of the current connection. */ 1107 struct ntlmdata ntlm; 1108#endif 1109} pconn; 1110 1111/* Mark the persistent connection as invalid and free the resources it 1112 uses. This is used by the CLOSE_* macros after they forcefully 1113 close a registered persistent connection. */ 1114 1115static void 1116invalidate_persistent (void) 1117{ 1118 DEBUGP (("Disabling further reuse of socket %d.\n", pconn.socket)); 1119 pconn_active = false; 1120 fd_close (pconn.socket); 1121 xfree (pconn.host); 1122 xzero (pconn); 1123} 1124 1125/* Register FD, which should be a TCP/IP connection to HOST:PORT, as 1126 persistent. This will enable someone to use the same connection 1127 later. In the context of HTTP, this must be called only AFTER the 1128 response has been received and the server has promised that the 1129 connection will remain alive. 1130 1131 If a previous connection was persistent, it is closed. 
*/ 1132 1133static void 1134register_persistent (const char *host, int port, int fd, bool ssl) 1135{ 1136 if (pconn_active) 1137 { 1138 if (pconn.socket == fd) 1139 { 1140 /* The connection FD is already registered. */ 1141 return; 1142 } 1143 else 1144 { 1145 /* The old persistent connection is still active; close it 1146 first. This situation arises whenever a persistent 1147 connection exists, but we then connect to a different 1148 host, and try to register a persistent connection to that 1149 one. */ 1150 invalidate_persistent (); 1151 } 1152 } 1153 1154 pconn_active = true; 1155 pconn.socket = fd; 1156 pconn.host = xstrdup (host); 1157 pconn.port = port; 1158 pconn.ssl = ssl; 1159 pconn.authorized = false; 1160 1161 DEBUGP (("Registered socket %d for persistent reuse.\n", fd)); 1162} 1163 1164/* Return true if a persistent connection is available for connecting 1165 to HOST:PORT. */ 1166 1167static bool 1168persistent_available_p (const char *host, int port, bool ssl, 1169 bool *host_lookup_failed) 1170{ 1171 /* First, check whether a persistent connection is active at all. */ 1172 if (!pconn_active) 1173 return false; 1174 1175 /* If we want SSL and the last connection wasn't or vice versa, 1176 don't use it. Checking for host and port is not enough because 1177 HTTP and HTTPS can apparently coexist on the same port. */ 1178 if (ssl != pconn.ssl) 1179 return false; 1180 1181 /* If we're not connecting to the same port, we're not interested. */ 1182 if (port != pconn.port) 1183 return false; 1184 1185 /* If the host is the same, we're in business. If not, there is 1186 still hope -- read below. */ 1187 if (0 != strcasecmp (host, pconn.host)) 1188 { 1189 /* Check if pconn.socket is talking to HOST under another name. 1190 This happens often when both sites are virtual hosts 1191 distinguished only by name and served by the same network 1192 interface, and hence the same web server (possibly set up by 1193 the ISP and serving many different web sites). 
This 1194 admittedly unconventional optimization does not contradict 1195 HTTP and works well with popular server software. */ 1196 1197 bool found; 1198 ip_address ip; 1199 struct address_list *al; 1200 1201 if (ssl) 1202 /* Don't try to talk to two different SSL sites over the same 1203 secure connection! (Besides, it's not clear that 1204 name-based virtual hosting is even possible with SSL.) */ 1205 return false; 1206 1207 /* If pconn.socket's peer is one of the IP addresses HOST 1208 resolves to, pconn.socket is for all intents and purposes 1209 already talking to HOST. */ 1210 1211 if (!socket_ip_address (pconn.socket, &ip, ENDPOINT_PEER)) 1212 { 1213 /* Can't get the peer's address -- something must be very 1214 wrong with the connection. */ 1215 invalidate_persistent (); 1216 return false; 1217 } 1218 al = lookup_host (host, 0); 1219 if (!al) 1220 { 1221 *host_lookup_failed = true; 1222 return false; 1223 } 1224 1225 found = address_list_contains (al, &ip); 1226 address_list_release (al); 1227 1228 if (!found) 1229 return false; 1230 1231 /* The persistent connection's peer address was found among the 1232 addresses HOST resolved to; therefore, pconn.sock is in fact 1233 already talking to HOST -- no need to reconnect. */ 1234 } 1235 1236 /* Finally, check whether the connection is still open. This is 1237 important because most servers implement liberal (short) timeout 1238 on persistent connections. Wget can of course always reconnect 1239 if the connection doesn't work out, but it's nicer to know in 1240 advance. This test is a logical followup of the first test, but 1241 is "expensive" and therefore placed at the end of the list. 1242 1243 (Current implementation of test_socket_open has a nice side 1244 effect that it treats sockets with pending data as "closed". 1245 This is exactly what we want: if a broken server sends message 1246 body in response to HEAD, or if it sends more than conent-length 1247 data, we won't reuse the corrupted connection.) 
*/ 1248 1249 if (!test_socket_open (pconn.socket)) 1250 { 1251 /* Oops, the socket is no longer open. Now that we know that, 1252 let's invalidate the persistent connection before returning 1253 0. */ 1254 invalidate_persistent (); 1255 return false; 1256 } 1257 1258 return true; 1259} 1260 1261/* The idea behind these two CLOSE macros is to distinguish between 1262 two cases: one when the job we've been doing is finished, and we 1263 want to close the connection and leave, and two when something is 1264 seriously wrong and we're closing the connection as part of 1265 cleanup. 1266 1267 In case of keep_alive, CLOSE_FINISH should leave the connection 1268 open, while CLOSE_INVALIDATE should still close it. 1269 1270 Note that the semantics of the flag `keep_alive' is "this 1271 connection *will* be reused (the server has promised not to close 1272 the connection once we're done)", while the semantics of 1273 `pc_active_p && (fd) == pc_last_fd' is "we're *now* using an 1274 active, registered connection". 
*/ 1275 1276#define CLOSE_FINISH(fd) do { \ 1277 if (!keep_alive) \ 1278 { \ 1279 if (pconn_active && (fd) == pconn.socket) \ 1280 invalidate_persistent (); \ 1281 else \ 1282 { \ 1283 fd_close (fd); \ 1284 fd = -1; \ 1285 } \ 1286 } \ 1287} while (0) 1288 1289#define CLOSE_INVALIDATE(fd) do { \ 1290 if (pconn_active && (fd) == pconn.socket) \ 1291 invalidate_persistent (); \ 1292 else \ 1293 fd_close (fd); \ 1294 fd = -1; \ 1295} while (0) 1296 1297struct http_stat 1298{ 1299 wgint len; /* received length */ 1300 wgint contlen; /* expected length */ 1301 wgint restval; /* the restart value */ 1302 int res; /* the result of last read */ 1303 char *rderrmsg; /* error message from read error */ 1304 char *newloc; /* new location (redirection) */ 1305 char *remote_time; /* remote time-stamp string */ 1306 char *error; /* textual HTTP error */ 1307 int statcode; /* status code */ 1308 char *message; /* status message */ 1309 wgint rd_size; /* amount of data read from socket */ 1310 double dltime; /* time it took to download the data */ 1311 const char *referer; /* value of the referer header. */ 1312 char *local_file; /* local file name. */ 1313 bool existence_checked; /* true if we already checked for a file's 1314 existence after having begun to download 1315 (needed in gethttp for when connection is 1316 interrupted/restarted. 
*/ 1317 bool timestamp_checked; /* true if pre-download time-stamping checks 1318 * have already been performed */ 1319 char *orig_file_name; /* name of file to compare for time-stamping 1320 * (might be != local_file if -K is set) */ 1321 wgint orig_file_size; /* size of file to compare for time-stamping */ 1322 time_t orig_file_tstamp; /* time-stamp of file to compare for 1323 * time-stamping */ 1324}; 1325 1326static void 1327free_hstat (struct http_stat *hs) 1328{ 1329 xfree_null (hs->newloc); 1330 xfree_null (hs->remote_time); 1331 xfree_null (hs->error); 1332 xfree_null (hs->rderrmsg); 1333 xfree_null (hs->local_file); 1334 xfree_null (hs->orig_file_name); 1335 xfree_null (hs->message); 1336 1337 /* Guard against being called twice. */ 1338 hs->newloc = NULL; 1339 hs->remote_time = NULL; 1340 hs->error = NULL; 1341} 1342 1343#define BEGINS_WITH(line, string_constant) \ 1344 (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \ 1345 && (c_isspace (line[sizeof (string_constant) - 1]) \ 1346 || !line[sizeof (string_constant) - 1])) 1347 1348#ifdef __VMS 1349#define SET_USER_AGENT(req) do { \ 1350 if (!opt.useragent) \ 1351 request_set_header (req, "User-Agent", \ 1352 aprintf ("Wget/%s (VMS %s %s)", \ 1353 version_string, vms_arch(), vms_vers()), \ 1354 rel_value); \ 1355 else if (*opt.useragent) \ 1356 request_set_header (req, "User-Agent", opt.useragent, rel_none); \ 1357} while (0) 1358#else /* def __VMS */ 1359#define SET_USER_AGENT(req) do { \ 1360 if (!opt.useragent) \ 1361 request_set_header (req, "User-Agent", \ 1362 aprintf ("Wget/%s (%s)", \ 1363 version_string, OS_TYPE), \ 1364 rel_value); \ 1365 else if (*opt.useragent) \ 1366 request_set_header (req, "User-Agent", opt.useragent, rel_none); \ 1367} while (0) 1368#endif /* def __VMS [else] */ 1369 1370/* The flags that allow clobbering the file (opening with "wb"). 1371 Defined here to avoid repetition later. #### This will require 1372 rework. 
*/ 1373#define ALLOW_CLOBBER (opt.noclobber || opt.always_rest || opt.timestamping \ 1374 || opt.dirstruct || opt.output_document) 1375 1376/* Retrieve a document through HTTP protocol. It recognizes status 1377 code, and correctly handles redirections. It closes the network 1378 socket. If it receives an error from the functions below it, it 1379 will print it if there is enough information to do so (almost 1380 always), returning the error to the caller (i.e. http_loop). 1381 1382 Various HTTP parameters are stored to hs. 1383 1384 If PROXY is non-NULL, the connection will be made to the proxy 1385 server, and u->url will be requested. */ 1386static uerr_t 1387gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, 1388 struct iri *iri) 1389{ 1390 struct request *req; 1391 1392 char *type; 1393 char *user, *passwd; 1394 char *proxyauth; 1395 int statcode; 1396 int write_error; 1397 wgint contlen, contrange; 1398 struct url *conn; 1399 FILE *fp; 1400 1401 int sock = -1; 1402 int flags; 1403 1404 /* Set to 1 when the authorization has already been sent and should 1405 not be tried again. */ 1406 bool auth_finished = false; 1407 1408 /* Set to 1 when just globally-set Basic authorization has been sent; 1409 * should prevent further Basic negotiations, but not other 1410 * mechanisms. */ 1411 bool basic_auth_finished = false; 1412 1413 /* Whether NTLM authentication is used for this request. */ 1414 bool ntlm_seen = false; 1415 1416 /* Whether our connection to the remote host is through SSL. */ 1417 bool using_ssl = false; 1418 1419 /* Whether a HEAD request will be issued (as opposed to GET or 1420 POST). */ 1421 bool head_only = !!(*dt & HEAD_ONLY); 1422 1423 char *head; 1424 struct response *resp; 1425 char hdrval[256]; 1426 char *message; 1427 1428 /* Whether this connection will be kept alive after the HTTP request 1429 is done. */ 1430 bool keep_alive; 1431 1432 /* Whether keep-alive should be inhibited. 
1433 1434 RFC 2068 requests that 1.0 clients not send keep-alive requests 1435 to proxies. This is because many 1.0 proxies do not interpret 1436 the Connection header and transfer it to the remote server, 1437 causing it to not close the connection and leave both the proxy 1438 and the client hanging. */ 1439 bool inhibit_keep_alive = 1440 !opt.http_keep_alive || opt.ignore_length || proxy != NULL; 1441 1442 /* Headers sent when using POST. */ 1443 wgint post_data_size = 0; 1444 1445 bool host_lookup_failed = false; 1446 /* Foxconn modify start, Alex Zhang, 02/27/2013 */ 1447 int pid_tag = getpid(); 1448 if(create_mission_folder(pid_tag) == -1) 1449 return FWRITEERR; 1450 update_status_file(u->url, 0, pid_tag);//url 1451 update_status_file(u->file, 1, pid_tag);//filename 1452 update_status_file("2", 3, pid_tag);//status=2 connecting 1453 1454#ifdef HAVE_SSL 1455 if (u->scheme == SCHEME_HTTPS) 1456 { 1457 /* Initialize the SSL context. After this has once been done, 1458 it becomes a no-op. */ 1459 if (!ssl_init ()) 1460 { 1461 scheme_disable (SCHEME_HTTPS); 1462 logprintf (LOG_NOTQUIET, 1463 _("Disabling SSL due to encountered errors.\n")); 1464 return SSLINITFAILED; 1465 } 1466 } 1467#endif /* HAVE_SSL */ 1468 1469 /* Initialize certain elements of struct http_stat. */ 1470 hs->len = 0; 1471 hs->contlen = -1; 1472 hs->res = -1; 1473 hs->rderrmsg = NULL; 1474 hs->newloc = NULL; 1475 hs->remote_time = NULL; 1476 hs->error = NULL; 1477 hs->message = NULL; 1478 1479 conn = u; 1480 1481 /* Prepare the request to send. */ 1482 1483 req = request_new (); 1484 { 1485 char *meth_arg; 1486 const char *meth = "GET"; 1487 if (head_only) 1488 meth = "HEAD"; 1489 else if (opt.post_file_name || opt.post_data) 1490 meth = "POST"; 1491 /* Use the full path, i.e. one that includes the leading slash and 1492 the query string. E.g. if u->path is "foo/bar" and u->query is 1493 "param=value", full_path will be "/foo/bar?param=value". 
*/ 1494 if (proxy 1495#ifdef HAVE_SSL 1496 /* When using SSL over proxy, CONNECT establishes a direct 1497 connection to the HTTPS server. Therefore use the same 1498 argument as when talking to the server directly. */ 1499 && u->scheme != SCHEME_HTTPS 1500#endif 1501 ) 1502 meth_arg = xstrdup (u->url); 1503 else 1504 meth_arg = url_full_path (u); 1505 request_set_method (req, meth, meth_arg); 1506 } 1507 1508 request_set_header (req, "Referer", (char *) hs->referer, rel_none); 1509 if (*dt & SEND_NOCACHE) 1510 request_set_header (req, "Pragma", "no-cache", rel_none); 1511 if (hs->restval) 1512 request_set_header (req, "Range", 1513 aprintf ("bytes=%s-", 1514 number_to_static_string (hs->restval)), 1515 rel_value); 1516 SET_USER_AGENT (req); 1517 request_set_header (req, "Accept", "*/*", rel_none); 1518 1519 /* Find the username and password for authentication. */ 1520 user = u->user; 1521 passwd = u->passwd; 1522 search_netrc (u->host, (const char **)&user, (const char **)&passwd, 0); 1523 user = user ? user : (opt.http_user ? opt.http_user : opt.user); 1524 passwd = passwd ? passwd : (opt.http_passwd ? opt.http_passwd : opt.passwd); 1525 1526 /* We only do "site-wide" authentication with "global" user/password 1527 * values unless --auth-no-challange has been requested; URL user/password 1528 * info overrides. */ 1529 if (user && passwd && (!u->user || opt.auth_without_challenge)) 1530 { 1531 /* If this is a host for which we've already received a Basic 1532 * challenge, we'll go ahead and send Basic authentication creds. */ 1533 basic_auth_finished = maybe_send_basic_creds(u->host, user, passwd, req); 1534 } 1535 1536 /* Generate the Host header, HOST:PORT. Take into account that: 1537 1538 - Broken server-side software often doesn't recognize the PORT 1539 argument, so we must generate "Host: www.server.com" instead of 1540 "Host: www.server.com:80" (and likewise for https port). 
1541 1542 - IPv6 addresses contain ":", so "Host: 3ffe:8100:200:2::2:1234" 1543 becomes ambiguous and needs to be rewritten as "Host: 1544 [3ffe:8100:200:2::2]:1234". */ 1545 { 1546 /* Formats arranged for hfmt[add_port][add_squares]. */ 1547 static const char *hfmt[][2] = { 1548 { "%s", "[%s]" }, { "%s:%d", "[%s]:%d" } 1549 }; 1550 int add_port = u->port != scheme_default_port (u->scheme); 1551 int add_squares = strchr (u->host, ':') != NULL; 1552 request_set_header (req, "Host", 1553 aprintf (hfmt[add_port][add_squares], u->host, u->port), 1554 rel_value); 1555 } 1556 1557 if (!inhibit_keep_alive) 1558 request_set_header (req, "Connection", "Keep-Alive", rel_none); 1559 1560 if (opt.cookies) 1561 request_set_header (req, "Cookie", 1562 cookie_header (wget_cookie_jar, 1563 u->host, u->port, u->path, 1564#ifdef HAVE_SSL 1565 u->scheme == SCHEME_HTTPS 1566#else 1567 0 1568#endif 1569 ), 1570 rel_value); 1571 1572 if (opt.post_data || opt.post_file_name) 1573 { 1574 request_set_header (req, "Content-Type", 1575 "application/x-www-form-urlencoded", rel_none); 1576 if (opt.post_data) 1577 post_data_size = strlen (opt.post_data); 1578 else 1579 { 1580 post_data_size = file_size (opt.post_file_name); 1581 if (post_data_size == -1) 1582 { 1583 logprintf (LOG_NOTQUIET, _("POST data file %s missing: %s\n"), 1584 quote (opt.post_file_name), strerror (errno)); 1585 post_data_size = 0; 1586 } 1587 } 1588 request_set_header (req, "Content-Length", 1589 xstrdup (number_to_static_string (post_data_size)), 1590 rel_value); 1591 } 1592 1593 /* Add the user headers. */ 1594 if (opt.user_headers) 1595 { 1596 int i; 1597 for (i = 0; opt.user_headers[i]; i++) 1598 request_set_user_header (req, opt.user_headers[i]); 1599 } 1600 1601 retry_with_auth: 1602 /* We need to come back here when the initial attempt to retrieve 1603 without authorization header fails. (Expected to happen at least 1604 for the Digest authorization scheme.) 
*/ 1605 1606 proxyauth = NULL; 1607 if (proxy) 1608 { 1609 char *proxy_user, *proxy_passwd; 1610 /* For normal username and password, URL components override 1611 command-line/wgetrc parameters. With proxy 1612 authentication, it's the reverse, because proxy URLs are 1613 normally the "permanent" ones, so command-line args 1614 should take precedence. */ 1615 if (opt.proxy_user && opt.proxy_passwd) 1616 { 1617 proxy_user = opt.proxy_user; 1618 proxy_passwd = opt.proxy_passwd; 1619 } 1620 else 1621 { 1622 proxy_user = proxy->user; 1623 proxy_passwd = proxy->passwd; 1624 } 1625 /* #### This does not appear right. Can't the proxy request, 1626 say, `Digest' authentication? */ 1627 if (proxy_user && proxy_passwd) 1628 proxyauth = basic_authentication_encode (proxy_user, proxy_passwd); 1629 1630 /* If we're using a proxy, we will be connecting to the proxy 1631 server. */ 1632 conn = proxy; 1633 1634 /* Proxy authorization over SSL is handled below. */ 1635#ifdef HAVE_SSL 1636 if (u->scheme != SCHEME_HTTPS) 1637#endif 1638 request_set_header (req, "Proxy-Authorization", proxyauth, rel_value); 1639 } 1640 1641 keep_alive = false; 1642 1643 /* Establish the connection. */ 1644 1645 if (!inhibit_keep_alive) 1646 { 1647 /* Look for a persistent connection to target host, unless a 1648 proxy is used. The exception is when SSL is in use, in which 1649 case the proxy is nothing but a passthrough to the target 1650 host, registered as a connection to the latter. 
*/ 1651 struct url *relevant = conn; 1652#ifdef HAVE_SSL 1653 if (u->scheme == SCHEME_HTTPS) 1654 relevant = u; 1655#endif 1656 1657 if (persistent_available_p (relevant->host, relevant->port, 1658#ifdef HAVE_SSL 1659 relevant->scheme == SCHEME_HTTPS, 1660#else 1661 0, 1662#endif 1663 &host_lookup_failed)) 1664 { 1665 sock = pconn.socket; 1666 using_ssl = pconn.ssl; 1667 logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"), 1668 quotearg_style (escape_quoting_style, pconn.host), 1669 pconn.port); 1670 DEBUGP (("Reusing fd %d.\n", sock)); 1671 if (pconn.authorized) 1672 /* If the connection is already authorized, the "Basic" 1673 authorization added by code above is unnecessary and 1674 only hurts us. */ 1675 request_remove_header (req, "Authorization"); 1676 } 1677 else if (host_lookup_failed) 1678 { 1679 request_free (req); 1680 logprintf(LOG_NOTQUIET, 1681 _("%s: unable to resolve host address %s\n"), 1682 exec_name, quote (relevant->host)); 1683 return HOSTERR; 1684 } 1685 } 1686 1687 if (sock < 0) 1688 { 1689 sock = connect_to_host (conn->host, conn->port); 1690 if (sock == E_HOST) 1691 { 1692 request_free (req); 1693 return HOSTERR; 1694 } 1695 else if (sock < 0) 1696 { 1697 request_free (req); 1698 return (retryable_socket_connect_error (errno) 1699 ? CONERROR : CONIMPOSSIBLE); 1700 } 1701 1702#ifdef HAVE_SSL 1703 if (proxy && u->scheme == SCHEME_HTTPS) 1704 { 1705 /* When requesting SSL URLs through proxies, use the 1706 CONNECT method to request passthrough. */ 1707 struct request *connreq = request_new (); 1708 request_set_method (connreq, "CONNECT", 1709 aprintf ("%s:%d", u->host, u->port)); 1710 SET_USER_AGENT (connreq); 1711 if (proxyauth) 1712 { 1713 request_set_header (connreq, "Proxy-Authorization", 1714 proxyauth, rel_value); 1715 /* Now that PROXYAUTH is part of the CONNECT request, 1716 zero it out so we don't send proxy authorization with 1717 the regular request below. 
*/ 1718 proxyauth = NULL; 1719 } 1720 /* Examples in rfc2817 use the Host header in CONNECT 1721 requests. I don't see how that gains anything, given 1722 that the contents of Host would be exactly the same as 1723 the contents of CONNECT. */ 1724 1725 write_error = request_send (connreq, sock); 1726 request_free (connreq); 1727 if (write_error < 0) 1728 { 1729 CLOSE_INVALIDATE (sock); 1730 return WRITEFAILED; 1731 } 1732 1733 head = read_http_response_head (sock); 1734 if (!head) 1735 { 1736 logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"), 1737 fd_errstr (sock)); 1738 CLOSE_INVALIDATE (sock); 1739 return HERR; 1740 } 1741 message = NULL; 1742 if (!*head) 1743 { 1744 xfree (head); 1745 goto failed_tunnel; 1746 } 1747 DEBUGP (("proxy responded with: [%s]\n", head)); 1748 1749 resp = resp_new (head); 1750 statcode = resp_status (resp, &message); 1751 hs->message = xstrdup (message); 1752 resp_free (resp); 1753 xfree (head); 1754 if (statcode != 200) 1755 { 1756 failed_tunnel: 1757 logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"), 1758 message ? quotearg_style (escape_quoting_style, message) : "?"); 1759 xfree_null (message); 1760 return CONSSLERR; 1761 } 1762 xfree_null (message); 1763 1764 /* SOCK is now *really* connected to u->host, so update CONN 1765 to reflect this. That way register_persistent will 1766 register SOCK as being connected to u->host:u->port. */ 1767 conn = u; 1768 } 1769 1770 if (conn->scheme == SCHEME_HTTPS) 1771 { 1772 if (!ssl_connect_wget (sock)) 1773 { 1774 fd_close (sock); 1775 return CONSSLERR; 1776 } 1777 else if (!ssl_check_certificate (sock, u->host)) 1778 { 1779 fd_close (sock); 1780 return VERIFCERTERR; 1781 } 1782 using_ssl = true; 1783 } 1784#endif /* HAVE_SSL */ 1785 } 1786 1787 /* Send the request to server. 
*/ 1788 write_error = request_send (req, sock); 1789 1790 if (write_error >= 0) 1791 { 1792 if (opt.post_data) 1793 { 1794 DEBUGP (("[POST data: %s]\n", opt.post_data)); 1795 write_error = fd_write (sock, opt.post_data, post_data_size, -1); 1796 } 1797 else if (opt.post_file_name && post_data_size != 0) 1798 write_error = post_file (sock, opt.post_file_name, post_data_size); 1799 } 1800 1801 if (write_error < 0) 1802 { 1803 CLOSE_INVALIDATE (sock); 1804 request_free (req); 1805 return WRITEFAILED; 1806 } 1807 logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "), 1808 proxy ? "Proxy" : "HTTP"); 1809 contlen = -1; 1810 contrange = 0; 1811 *dt &= ~RETROKF; 1812 1813 head = read_http_response_head (sock); 1814 if (!head) 1815 { 1816 if (errno == 0) 1817 { 1818 logputs (LOG_NOTQUIET, _("No data received.\n")); 1819 CLOSE_INVALIDATE (sock); 1820 request_free (req); 1821 return HEOF; 1822 } 1823 else 1824 { 1825 logprintf (LOG_NOTQUIET, _("Read error (%s) in headers.\n"), 1826 fd_errstr (sock)); 1827 CLOSE_INVALIDATE (sock); 1828 request_free (req); 1829 return HERR; 1830 } 1831 } 1832 DEBUGP (("\n---response begin---\n%s---response end---\n", head)); 1833 1834 resp = resp_new (head); 1835 1836 /* Check for status line. */ 1837 message = NULL; 1838 statcode = resp_status (resp, &message); 1839 hs->message = xstrdup (message); 1840 if (!opt.server_response) 1841 logprintf (LOG_VERBOSE, "%2d %s\n", statcode, 1842 message ? quotearg_style (escape_quoting_style, message) : ""); 1843 else 1844 { 1845 logprintf (LOG_VERBOSE, "\n"); 1846 print_server_response (resp, " "); 1847 } 1848 1849 if (!opt.ignore_length 1850 && resp_header_copy (resp, "Content-Length", hdrval, sizeof (hdrval))) 1851 { 1852 wgint parsed; 1853 errno = 0; 1854 parsed = str_to_wgint (hdrval, NULL, 10); 1855 if (parsed == WGINT_MAX && errno == ERANGE) 1856 { 1857 /* Out of range. 
1858 #### If Content-Length is out of range, it most likely 1859 means that the file is larger than 2G and that we're 1860 compiled without LFS. In that case we should probably 1861 refuse to even attempt to download the file. */ 1862 contlen = -1; 1863 } 1864 else if (parsed < 0) 1865 { 1866 /* Negative Content-Length; nonsensical, so we can't 1867 assume any information about the content to receive. */ 1868 contlen = -1; 1869 } 1870 else 1871 contlen = parsed; 1872 } 1873 1874 /* Check for keep-alive related responses. */ 1875 if (!inhibit_keep_alive && contlen != -1) 1876 { 1877 if (resp_header_copy (resp, "Keep-Alive", NULL, 0)) 1878 keep_alive = true; 1879 else if (resp_header_copy (resp, "Connection", hdrval, sizeof (hdrval))) 1880 { 1881 if (0 == strcasecmp (hdrval, "Keep-Alive")) 1882 keep_alive = true; 1883 } 1884 } 1885 1886 /* Handle (possibly multiple instances of) the Set-Cookie header. */ 1887 if (opt.cookies) 1888 { 1889 int scpos; 1890 const char *scbeg, *scend; 1891 /* The jar should have been created by now. */ 1892 assert (wget_cookie_jar != NULL); 1893 for (scpos = 0; 1894 (scpos = resp_header_locate (resp, "Set-Cookie", scpos, 1895 &scbeg, &scend)) != -1; 1896 ++scpos) 1897 { 1898 char *set_cookie; BOUNDED_TO_ALLOCA (scbeg, scend, set_cookie); 1899 cookie_handle_set_cookie (wget_cookie_jar, u->host, u->port, 1900 u->path, set_cookie); 1901 } 1902 } 1903 1904 if (keep_alive) 1905 /* The server has promised that it will not close the connection 1906 when we're done. This means that we can register it. */ 1907 register_persistent (conn->host, conn->port, sock, using_ssl); 1908 1909 if (statcode == HTTP_STATUS_UNAUTHORIZED) 1910 { 1911 /* Authorization is required. 
*/ 1912 if (keep_alive && !head_only && skip_short_body (sock, contlen)) 1913 CLOSE_FINISH (sock); 1914 else 1915 CLOSE_INVALIDATE (sock); 1916 pconn.authorized = false; 1917 if (!auth_finished && (user && passwd)) 1918 { 1919 /* IIS sends multiple copies of WWW-Authenticate, one with 1920 the value "negotiate", and other(s) with data. Loop over 1921 all the occurrences and pick the one we recognize. */ 1922 int wapos; 1923 const char *wabeg, *waend; 1924 char *www_authenticate = NULL; 1925 for (wapos = 0; 1926 (wapos = resp_header_locate (resp, "WWW-Authenticate", wapos, 1927 &wabeg, &waend)) != -1; 1928 ++wapos) 1929 if (known_authentication_scheme_p (wabeg, waend)) 1930 { 1931 BOUNDED_TO_ALLOCA (wabeg, waend, www_authenticate); 1932 break; 1933 } 1934 1935 if (!www_authenticate) 1936 { 1937 /* If the authentication header is missing or 1938 unrecognized, there's no sense in retrying. */ 1939 logputs (LOG_NOTQUIET, _("Unknown authentication scheme.\n")); 1940 } 1941 else if (!basic_auth_finished 1942 || !BEGINS_WITH (www_authenticate, "Basic")) 1943 { 1944 char *pth; 1945 pth = url_full_path (u); 1946 request_set_header (req, "Authorization", 1947 create_authorization_line (www_authenticate, 1948 user, passwd, 1949 request_method (req), 1950 pth, 1951 &auth_finished), 1952 rel_value); 1953 if (BEGINS_WITH (www_authenticate, "NTLM")) 1954 ntlm_seen = true; 1955 else if (!u->user && BEGINS_WITH (www_authenticate, "Basic")) 1956 { 1957 /* Need to register this host as using basic auth, 1958 * so we automatically send creds next time. */ 1959 register_basic_auth_host (u->host); 1960 } 1961 xfree (pth); 1962 xfree_null (message); 1963 resp_free (resp); 1964 xfree (head); 1965 goto retry_with_auth; 1966 } 1967 else 1968 { 1969 /* We already did Basic auth, and it failed. Gotta 1970 * give up. 
*/ 1971 } 1972 } 1973 logputs (LOG_NOTQUIET, _("Authorization failed.\n")); 1974 request_free (req); 1975 xfree_null (message); 1976 resp_free (resp); 1977 xfree (head); 1978 return AUTHFAILED; 1979 } 1980 else /* statcode != HTTP_STATUS_UNAUTHORIZED */ 1981 { 1982 /* Kludge: if NTLM is used, mark the TCP connection as authorized. */ 1983 if (ntlm_seen) 1984 pconn.authorized = true; 1985 } 1986 1987 /* Determine the local filename if needed. Notice that if -O is used 1988 * hstat.local_file is set by http_loop to the argument of -O. */ 1989 if (!hs->local_file) 1990 { 1991 /* Honor Content-Disposition whether possible. */ 1992 if (!opt.content_disposition 1993 || !resp_header_copy (resp, "Content-Disposition", 1994 hdrval, sizeof (hdrval)) 1995 || !parse_content_disposition (hdrval, &hs->local_file)) 1996 { 1997 /* The Content-Disposition header is missing or broken. 1998 * Choose unique file name according to given URL. */ 1999 hs->local_file = url_file_name (u); 2000 } 2001 } 2002 2003 /* TODO: perform this check only once. */ 2004 if (!hs->existence_checked && file_exists_p (hs->local_file)) 2005 { 2006 if (opt.noclobber && !opt.output_document) 2007 { 2008 /* If opt.noclobber is turned on and file already exists, do not 2009 retrieve the file. But if the output_document was given, then this 2010 test was already done and the file didn't exist. Hence the !opt.output_document */ 2011 logprintf (LOG_VERBOSE, _("\ 2012File %s already there; not retrieving.\n\n"), quote (hs->local_file)); 2013 /* If the file is there, we suppose it's retrieved OK. */ 2014 *dt |= RETROKF; 2015 2016 /* #### Bogusness alert. */ 2017 /* If its suffix is "html" or "htm" or similar, assume text/html. 
*/ 2018 if (has_html_suffix_p (hs->local_file)) 2019 *dt |= TEXTHTML; 2020 2021 xfree (head); 2022 xfree_null (message); 2023 return RETRUNNEEDED; 2024 } 2025 else if (!ALLOW_CLOBBER) 2026 { 2027 char *unique = unique_name (hs->local_file, true); 2028 if (unique != hs->local_file) 2029 xfree (hs->local_file); 2030 hs->local_file = unique; 2031 } 2032 } 2033 hs->existence_checked = true; 2034 2035 /* Support timestamping */ 2036 /* TODO: move this code out of gethttp. */ 2037 if (opt.timestamping && !hs->timestamp_checked) 2038 { 2039 size_t filename_len = strlen (hs->local_file); 2040 char *filename_plus_orig_suffix = alloca (filename_len + sizeof (ORIG_SFX)); 2041 bool local_dot_orig_file_exists = false; 2042 char *local_filename = NULL; 2043 struct_stat st; 2044 2045 if (opt.backup_converted) 2046 /* If -K is specified, we'll act on the assumption that it was specified 2047 last time these files were downloaded as well, and instead of just 2048 comparing local file X against server file X, we'll compare local 2049 file X.orig (if extant, else X) against server file X. If -K 2050 _wasn't_ specified last time, or the server contains files called 2051 *.orig, -N will be back to not operating correctly with -k. */ 2052 { 2053 /* Would a single s[n]printf() call be faster? --dan 2054 2055 Definitely not. sprintf() is horribly slow. It's a 2056 different question whether the difference between the two 2057 affects a program. Usually I'd say "no", but at one 2058 point I profiled Wget, and found that a measurable and 2059 non-negligible amount of time was lost calling sprintf() 2060 in url.c. Replacing sprintf with inline calls to 2061 strcpy() and number_to_string() made a difference. 2062 --hniksic */ 2063 memcpy (filename_plus_orig_suffix, hs->local_file, filename_len); 2064 memcpy (filename_plus_orig_suffix + filename_len, 2065 ORIG_SFX, sizeof (ORIG_SFX)); 2066 2067 /* Try to stat() the .orig file. 
*/ 2068 if (stat (filename_plus_orig_suffix, &st) == 0) 2069 { 2070 local_dot_orig_file_exists = true; 2071 local_filename = filename_plus_orig_suffix; 2072 } 2073 } 2074 2075 if (!local_dot_orig_file_exists) 2076 /* Couldn't stat() <file>.orig, so try to stat() <file>. */ 2077 if (stat (hs->local_file, &st) == 0) 2078 local_filename = hs->local_file; 2079 2080 if (local_filename != NULL) 2081 /* There was a local file, so we'll check later to see if the version 2082 the server has is the same version we already have, allowing us to 2083 skip a download. */ 2084 { 2085 hs->orig_file_name = xstrdup (local_filename); 2086 hs->orig_file_size = st.st_size; 2087 hs->orig_file_tstamp = st.st_mtime; 2088#ifdef WINDOWS 2089 /* Modification time granularity is 2 seconds for Windows, so 2090 increase local time by 1 second for later comparison. */ 2091 ++hs->orig_file_tstamp; 2092#endif 2093 } 2094 } 2095 2096 request_free (req); 2097 2098 hs->statcode = statcode; 2099 if (statcode == -1) 2100 hs->error = xstrdup (_("Malformed status line")); 2101 else if (!*message) 2102 hs->error = xstrdup (_("(no description)")); 2103 else 2104 hs->error = xstrdup (message); 2105 xfree_null (message); 2106 2107 type = resp_header_strdup (resp, "Content-Type"); 2108 if (type) 2109 { 2110 char *tmp = strchr (type, ';'); 2111 if (tmp) 2112 { 2113 /* sXXXav: only needed if IRI support is enabled */ 2114 char *tmp2 = tmp + 1; 2115 2116 while (tmp > type && c_isspace (tmp[-1])) 2117 --tmp; 2118 *tmp = '\0'; 2119 2120 /* Try to get remote encoding if needed */ 2121 if (opt.enable_iri && !opt.encoding_remote) 2122 { 2123 tmp = parse_charset (tmp2); 2124 if (tmp) 2125 set_content_encoding (iri, tmp); 2126 } 2127 } 2128 } 2129 hs->newloc = resp_header_strdup (resp, "Location"); 2130 hs->remote_time = resp_header_strdup (resp, "Last-Modified"); 2131 2132 if (resp_header_copy (resp, "Content-Range", hdrval, sizeof (hdrval))) 2133 { 2134 wgint first_byte_pos, last_byte_pos, entity_length; 2135 if 
(parse_content_range (hdrval, &first_byte_pos, &last_byte_pos, 2136 &entity_length)) 2137 { 2138 contrange = first_byte_pos; 2139 contlen = last_byte_pos - first_byte_pos + 1; 2140 } 2141 } 2142 resp_free (resp); 2143 update_status_file(number_to_static_string (contlen + contrange), 2, pid_tag);//write filesize 2144 /* 20x responses are counted among successful by default. */ 2145 if (H_20X (statcode)) 2146 *dt |= RETROKF; 2147 2148 /* Return if redirected. */ 2149 if (H_REDIRECTED (statcode) || statcode == HTTP_STATUS_MULTIPLE_CHOICES) 2150 { 2151 /* RFC2068 says that in case of the 300 (multiple choices) 2152 response, the server can output a preferred URL through 2153 `Location' header; otherwise, the request should be treated 2154 like GET. So, if the location is set, it will be a 2155 redirection; otherwise, just proceed normally. */ 2156 if (statcode == HTTP_STATUS_MULTIPLE_CHOICES && !hs->newloc) 2157 *dt |= RETROKF; 2158 else 2159 { 2160 logprintf (LOG_VERBOSE, 2161 _("Location: %s%s\n"), 2162 hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"), 2163 hs->newloc ? _(" [following]") : ""); 2164 if (keep_alive && !head_only && skip_short_body (sock, contlen)) 2165 CLOSE_FINISH (sock); 2166 else 2167 CLOSE_INVALIDATE (sock); 2168 xfree_null (type); 2169 xfree (head); 2170 return NEWLOCATION; 2171 } 2172 } 2173 2174 /* If content-type is not given, assume text/html. This is because 2175 of the multitude of broken CGI's that "forget" to generate the 2176 content-type. 
*/ 2177 if (!type || 2178 0 == strncasecmp (type, TEXTHTML_S, strlen (TEXTHTML_S)) || 2179 0 == strncasecmp (type, TEXTXHTML_S, strlen (TEXTXHTML_S))) 2180 *dt |= TEXTHTML; 2181 else 2182 *dt &= ~TEXTHTML; 2183 2184 if (type && 2185 0 == strncasecmp (type, TEXTCSS_S, strlen (TEXTCSS_S))) 2186 *dt |= TEXTCSS; 2187 else 2188 *dt &= ~TEXTCSS; 2189 2190 if (opt.adjust_extension) 2191 { 2192 if (*dt & TEXTHTML) 2193 /* -E / --adjust-extension / adjust_extension = on was specified, 2194 and this is a text/html file. If some case-insensitive 2195 variation on ".htm[l]" isn't already the file's suffix, 2196 tack on ".html". */ 2197 { 2198 ensure_extension (hs, ".html", dt); 2199 } 2200 else if (*dt & TEXTCSS) 2201 { 2202 ensure_extension (hs, ".css", dt); 2203 } 2204 } 2205 2206 if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE 2207 || (hs->restval > 0 && statcode == HTTP_STATUS_OK 2208 && contrange == 0 && hs->restval >= contlen) 2209 ) 2210 { 2211 /* If `-c' is in use and the file has been fully downloaded (or 2212 the remote file has shrunk), Wget effectively requests bytes 2213 after the end of file and the server response with 416 2214 (or 200 with a <= Content-Length. */ 2215 logputs (LOG_VERBOSE, _("\ 2216\n The file is already fully retrieved; nothing to do.\n\n")); 2217 /* In case the caller inspects. */ 2218 hs->len = contlen; 2219 hs->res = 0; 2220 /* Mark as successfully retrieved. */ 2221 *dt |= RETROKF; 2222 xfree_null (type); 2223 CLOSE_INVALIDATE (sock); /* would be CLOSE_FINISH, but there 2224 might be more bytes in the body. */ 2225 xfree (head); 2226 return RETRUNNEEDED; 2227 } 2228 if ((contrange != 0 && contrange != hs->restval) 2229 || (H_PARTIAL (statcode) && !contrange)) 2230 { 2231 /* The Range request was somehow misunderstood by the server. 2232 Bail out. 
*/ 2233 xfree_null (type); 2234 CLOSE_INVALIDATE (sock); 2235 xfree (head); 2236 return RANGEERR; 2237 } 2238 if (contlen == -1) 2239 hs->contlen = -1; 2240 else 2241 hs->contlen = contlen + contrange; 2242 /* Foxconn modify end, Alex Zhang, 02/27/2013 */ 2243 if (opt.verbose) 2244 { 2245 if (*dt & RETROKF) 2246 { 2247 /* No need to print this output if the body won't be 2248 downloaded at all, or if the original server response is 2249 printed. */ 2250 logputs (LOG_VERBOSE, _("Length: ")); 2251 if (contlen != -1) 2252 { 2253 logputs (LOG_VERBOSE, number_to_static_string (contlen + contrange)); 2254 if (contlen + contrange >= 1024) 2255 logprintf (LOG_VERBOSE, " (%s)", 2256 human_readable (contlen + contrange)); 2257 if (contrange) 2258 { 2259 if (contlen >= 1024) 2260 logprintf (LOG_VERBOSE, _(", %s (%s) remaining"), 2261 number_to_static_string (contlen), 2262 human_readable (contlen)); 2263 else 2264 logprintf (LOG_VERBOSE, _(", %s remaining"), 2265 number_to_static_string (contlen)); 2266 } 2267 } 2268 else 2269 logputs (LOG_VERBOSE, 2270 opt.ignore_length ? _("ignored") : _("unspecified")); 2271 if (type) 2272 logprintf (LOG_VERBOSE, " [%s]\n", quotearg_style (escape_quoting_style, type)); 2273 else 2274 logputs (LOG_VERBOSE, "\n"); 2275 } 2276 } 2277 xfree_null (type); 2278 type = NULL; /* We don't need it any more. */ 2279 2280 /* Return if we have no intention of further downloading. */ 2281 if (!(*dt & RETROKF) || head_only) 2282 { 2283 /* In case the caller cares to look... */ 2284 hs->len = 0; 2285 hs->res = 0; 2286 xfree_null (type); 2287 if (head_only) 2288 /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the 2289 servers not to send body in response to a HEAD request, and 2290 those that do will likely be caught by test_socket_open. 2291 If not, they can be worked around using 2292 `--no-http-keep-alive'. 
*/ 2293 CLOSE_FINISH (sock); 2294 else if (keep_alive && skip_short_body (sock, contlen)) 2295 /* Successfully skipped the body; also keep using the socket. */ 2296 CLOSE_FINISH (sock); 2297 else 2298 CLOSE_INVALIDATE (sock); 2299 xfree (head); 2300 return RETRFINISHED; 2301 } 2302 2303/* 2005-06-17 SMS. 2304 For VMS, define common fopen() optional arguments. 2305*/ 2306#ifdef __VMS 2307# define FOPEN_OPT_ARGS "fop=sqo", "acc", acc_cb, &open_id 2308# define FOPEN_BIN_FLAG 3 2309#else /* def __VMS */ 2310# define FOPEN_BIN_FLAG true 2311#endif /* def __VMS [else] */ 2312 2313 /* Open the local file. */ 2314 if (!output_stream) 2315 { 2316 mkalldirs (hs->local_file); 2317 if (opt.backups) 2318 rotate_backups (hs->local_file); 2319 if (hs->restval) 2320 { 2321#ifdef __VMS 2322 int open_id; 2323 2324 open_id = 21; 2325 fp = fopen (hs->local_file, "ab", FOPEN_OPT_ARGS); 2326#else /* def __VMS */ 2327 fp = fopen (hs->local_file, "ab"); 2328#endif /* def __VMS [else] */ 2329 } 2330 else if (ALLOW_CLOBBER) 2331 { 2332#ifdef __VMS 2333 int open_id; 2334 2335 open_id = 22; 2336 fp = fopen (hs->local_file, "wb", FOPEN_OPT_ARGS); 2337#else /* def __VMS */ 2338 fp = fopen (hs->local_file, "wb"); 2339#endif /* def __VMS [else] */ 2340 } 2341 else 2342 { 2343 fp = fopen_excl (hs->local_file, FOPEN_BIN_FLAG); 2344 if (!fp && errno == EEXIST) 2345 { 2346 /* We cannot just invent a new name and use it (which is 2347 what functions like unique_create typically do) 2348 because we told the user we'd use this name. 2349 Instead, return and retry the download. 
*/ 2350 logprintf (LOG_NOTQUIET, 2351 _("%s has sprung into existence.\n"), 2352 hs->local_file); 2353 CLOSE_INVALIDATE (sock); 2354 xfree (head); 2355 return FOPEN_EXCL_ERR; 2356 } 2357 } 2358 if (!fp) 2359 { 2360 logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno)); 2361 CLOSE_INVALIDATE (sock); 2362 xfree (head); 2363 return FOPENERR; 2364 } 2365 } 2366 else 2367 fp = output_stream; 2368 2369 /* Print fetch message, if opt.verbose. */ 2370 if (opt.verbose) 2371 { 2372 logprintf (LOG_NOTQUIET, _("Saving to: %s\n"), 2373 HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file)); 2374 } 2375 2376 /* This confuses the timestamping code that checks for file size. 2377 #### The timestamping code should be smarter about file size. */ 2378 if (opt.save_headers && hs->restval == 0) 2379 fwrite (head, 1, strlen (head), fp); 2380 2381 /* Now we no longer need to store the response header. */ 2382 xfree (head); 2383 2384 /* Download the request body. */ 2385 /* Foxconn add start, Alex Zhang, 01/29/2013 */ 2386 if(contlen == 0) 2387 { 2388 CLOSE_FINISH (sock); 2389 if (!output_stream) 2390 fclose (fp); 2391 return RETRFINISHED; 2392 } 2393 /* Foxconn add end, Alex Zhang, 01/29/2013 */ 2394 flags = 0; 2395 if (contlen != -1) 2396 /* If content-length is present, read that much; otherwise, read 2397 until EOF. The HTTP spec doesn't require the server to 2398 actually close the connection when it's done sending data. */ 2399 flags |= rb_read_exactly; 2400 if (hs->restval > 0 && contrange == 0) 2401 /* If the server ignored our range request, instruct fd_read_body 2402 to skip the first RESTVAL bytes of body. */ 2403 flags |= rb_skip_startpos; 2404 hs->len = hs->restval; 2405 hs->rd_size = 0; 2406 hs->res = fd_read_body (sock, fp, contlen != -1 ? 
contlen : 0, 2407 hs->restval, &hs->rd_size, &hs->len, &hs->dltime, 2408 flags); 2409 2410 if (hs->res >= 0) 2411 CLOSE_FINISH (sock); 2412 else 2413 { 2414 if (hs->res < 0) 2415 hs->rderrmsg = xstrdup (fd_errstr (sock)); 2416 CLOSE_INVALIDATE (sock); 2417 } 2418 2419 if (!output_stream) 2420 fclose (fp); 2421 if (hs->res == -2) 2422 return FWRITEERR; 2423 return RETRFINISHED; 2424} 2425 2426/* The genuine HTTP loop! This is the part where the retrieval is 2427 retried, and retried, and retried, and... */ 2428uerr_t 2429http_loop (struct url *u, char **newloc, char **local_file, const char *referer, 2430 int *dt, struct url *proxy, struct iri *iri) 2431{ 2432 int count; 2433 bool got_head = false; /* used for time-stamping and filename detection */ 2434 bool time_came_from_head = false; 2435 bool got_name = false; 2436 char *tms; 2437 const char *tmrate; 2438 uerr_t err, ret = TRYLIMEXC; 2439 time_t tmr = -1; /* remote time-stamp */ 2440 struct http_stat hstat; /* HTTP status */ 2441 struct_stat st; 2442 bool send_head_first = true; 2443 char *file_name; 2444 2445 /* Assert that no value for *LOCAL_FILE was passed. */ 2446 assert (local_file == NULL || *local_file == NULL); 2447 2448 /* Set LOCAL_FILE parameter. */ 2449 if (local_file && opt.output_document) 2450 *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document); 2451 2452 /* Reset NEWLOC parameter. */ 2453 *newloc = NULL; 2454 2455 /* This used to be done in main(), but it's a better idea to do it 2456 here so that we don't go through the hoops if we're just using 2457 FTP or whatever. */ 2458 if (opt.cookies) 2459 load_cookies(); 2460 2461 /* Warn on (likely bogus) wildcard usage in HTTP. */ 2462 if (opt.ftp_glob && has_wildcards_p (u->path)) 2463 logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n")); 2464 2465 /* Setup hstat struct. 
*/ 2466 xzero (hstat); 2467 hstat.referer = referer; 2468 2469 if (opt.output_document) 2470 { 2471 hstat.local_file = xstrdup (opt.output_document); 2472 got_name = true; 2473 } 2474 else if (!opt.content_disposition) 2475 { 2476 hstat.local_file = url_file_name (u); 2477 got_name = true; 2478 } 2479 2480 /* TODO: Ick! This code is now in both gethttp and http_loop, and is 2481 * screaming for some refactoring. */ 2482 if (got_name && file_exists_p (hstat.local_file) && opt.noclobber && !opt.output_document) 2483 { 2484 /* If opt.noclobber is turned on and file already exists, do not 2485 retrieve the file. But if the output_document was given, then this 2486 test was already done and the file didn't exist. Hence the !opt.output_document */ 2487 logprintf (LOG_VERBOSE, _("\ 2488File %s already there; not retrieving.\n\n"), 2489 quote (hstat.local_file)); 2490 /* If the file is there, we suppose it's retrieved OK. */ 2491 *dt |= RETROKF; 2492 2493 /* #### Bogusness alert. */ 2494 /* If its suffix is "html" or "htm" or similar, assume text/html. */ 2495 if (has_html_suffix_p (hstat.local_file)) 2496 *dt |= TEXTHTML; 2497 2498 ret = RETROK; 2499 goto exit; 2500 } 2501 2502 /* Reset the counter. */ 2503 count = 0; 2504 2505 /* Reset the document type. */ 2506 *dt = 0; 2507 2508 /* Skip preliminary HEAD request if we're not in spider mode AND 2509 * if -O was given or HTTP Content-Disposition support is disabled. */ 2510 if (!opt.spider 2511 && (got_name || !opt.content_disposition)) 2512 send_head_first = false; 2513 2514 /* Send preliminary HEAD request if -N is given and we have an existing 2515 * destination file. */ 2516 file_name = url_file_name (u); 2517 if (opt.timestamping 2518 && !opt.content_disposition 2519 && file_exists_p (file_name)) 2520 send_head_first = true; 2521 xfree (file_name); 2522 2523 /* THE loop */ 2524 do 2525 { 2526 /* Increment the pass counter. 
*/ 2527 ++count; 2528 sleep_between_retrievals (count); 2529 2530 /* Get the current time string. */ 2531 tms = datetime_str (time (NULL)); 2532 2533 if (opt.spider && !got_head) 2534 logprintf (LOG_VERBOSE, _("\ 2535Spider mode enabled. Check if remote file exists.\n")); 2536 2537 /* Print fetch message, if opt.verbose. */ 2538 if (opt.verbose) 2539 { 2540 char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD); 2541 2542 if (count > 1) 2543 { 2544 char tmp[256]; 2545 sprintf (tmp, _("(try:%2d)"), count); 2546 logprintf (LOG_NOTQUIET, "--%s-- %s %s\n", 2547 tms, tmp, hurl); 2548 } 2549 else 2550 { 2551 logprintf (LOG_NOTQUIET, "--%s-- %s\n", 2552 tms, hurl); 2553 } 2554 2555#ifdef WINDOWS 2556 ws_changetitle (hurl); 2557#endif 2558 xfree (hurl); 2559 } 2560 2561 /* Default document type is empty. However, if spider mode is 2562 on or time-stamping is employed, HEAD_ONLY commands is 2563 encoded within *dt. */ 2564 if (send_head_first && !got_head) 2565 *dt |= HEAD_ONLY; 2566 else 2567 *dt &= ~HEAD_ONLY; 2568 2569 /* Decide whether or not to restart. */ 2570 if (opt.always_rest 2571 && got_name 2572 && stat (hstat.local_file, &st) == 0 2573 && S_ISREG (st.st_mode)) 2574 /* When -c is used, continue from on-disk size. (Can't use 2575 hstat.len even if count>1 because we don't want a failed 2576 first attempt to clobber existing data.) */ 2577 hstat.restval = st.st_size; 2578 else if (count > 1) 2579 /* otherwise, continue where the previous try left off */ 2580 hstat.restval = hstat.len; 2581 else 2582 hstat.restval = 0; 2583 2584 /* Decide whether to send the no-cache directive. We send it in 2585 two cases: 2586 a) we're using a proxy, and we're past our first retrieval. 2587 Some proxies are notorious for caching incomplete data, so 2588 we require a fresh get. 2589 b) caching is explicitly inhibited. 
*/ 2590 if ((proxy && count > 1) /* a */ 2591 || !opt.allow_cache) /* b */ 2592 *dt |= SEND_NOCACHE; 2593 else 2594 *dt &= ~SEND_NOCACHE; 2595 2596 /* Try fetching the document, or at least its head. */ 2597 err = gethttp (u, &hstat, dt, proxy, iri); 2598 2599 /* Time? */ 2600 tms = datetime_str (time (NULL)); 2601 2602 /* Get the new location (with or without the redirection). */ 2603 if (hstat.newloc) 2604 *newloc = xstrdup (hstat.newloc); 2605 2606 switch (err) 2607 { 2608 case HERR: case HEOF: case CONSOCKERR: case CONCLOSED: 2609 case CONERROR: case READERR: case WRITEFAILED: 2610 case RANGEERR: case FOPEN_EXCL_ERR: 2611 /* Non-fatal errors continue executing the loop, which will 2612 bring them to "while" statement at the end, to judge 2613 whether the number of tries was exceeded. */ 2614 printwhat (count, opt.ntry); 2615 continue; 2616 case FWRITEERR: case FOPENERR: 2617 /* Another fatal error. */ 2618 logputs (LOG_VERBOSE, "\n"); 2619 logprintf (LOG_NOTQUIET, _("Cannot write to %s (%s).\n"), 2620 quote (hstat.local_file), strerror (errno)); 2621 case HOSTERR: case CONIMPOSSIBLE: case PROXERR: case AUTHFAILED: 2622 case SSLINITFAILED: case CONTNOTSUPPORTED: case VERIFCERTERR: 2623 /* Fatal errors just return from the function. */ 2624 ret = err; 2625 goto exit; 2626 case CONSSLERR: 2627 /* Another fatal error. */ 2628 logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n")); 2629 ret = err; 2630 goto exit; 2631 case NEWLOCATION: 2632 /* Return the new location to the caller. */ 2633 if (!*newloc) 2634 { 2635 logprintf (LOG_NOTQUIET, 2636 _("ERROR: Redirection (%d) without location.\n"), 2637 hstat.statcode); 2638 ret = WRONGCODE; 2639 } 2640 else 2641 { 2642 ret = NEWLOCATION; 2643 } 2644 goto exit; 2645 case RETRUNNEEDED: 2646 /* The file was already fully retrieved. */ 2647 ret = RETROK; 2648 goto exit; 2649 case RETRFINISHED: 2650 /* Deal with you later. */ 2651 break; 2652 default: 2653 /* All possibilities should have been exhausted. 
*/ 2654 abort (); 2655 } 2656 2657 if (!(*dt & RETROKF)) 2658 { 2659 char *hurl = NULL; 2660 if (!opt.verbose) 2661 { 2662 /* #### Ugly ugly ugly! */ 2663 hurl = url_string (u, URL_AUTH_HIDE_PASSWD); 2664 logprintf (LOG_NONVERBOSE, "%s:\n", hurl); 2665 } 2666 2667 /* Fall back to GET if HEAD fails with a 500 or 501 error code. */ 2668 if (*dt & HEAD_ONLY 2669 && (hstat.statcode == 500 || hstat.statcode == 501)) 2670 { 2671 got_head = true; 2672 continue; 2673 } 2674 /* Maybe we should always keep track of broken links, not just in 2675 * spider mode. 2676 * Don't log error if it was UTF-8 encoded because we will try 2677 * once unencoded. */ 2678 else if (opt.spider && !iri->utf8_encode) 2679 { 2680 /* #### Again: ugly ugly ugly! */ 2681 if (!hurl) 2682 hurl = url_string (u, URL_AUTH_HIDE_PASSWD); 2683 nonexisting_url (hurl); 2684 logprintf (LOG_NOTQUIET, _("\ 2685Remote file does not exist -- broken link!!!\n")); 2686 } 2687 else 2688 { 2689 logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), 2690 tms, hstat.statcode, 2691 quotearg_style (escape_quoting_style, hstat.error)); 2692 } 2693 logputs (LOG_VERBOSE, "\n"); 2694 ret = WRONGCODE; 2695 xfree_null (hurl); 2696 goto exit; 2697 } 2698 2699 /* Did we get the time-stamp? */ 2700 if (!got_head) 2701 { 2702 got_head = true; /* no more time-stamping */ 2703 2704 if (opt.timestamping && !hstat.remote_time) 2705 { 2706 logputs (LOG_NOTQUIET, _("\ 2707Last-modified header missing -- time-stamps turned off.\n")); 2708 } 2709 else if (hstat.remote_time) 2710 { 2711 /* Convert the date-string into struct tm. */ 2712 tmr = http_atotm (hstat.remote_time); 2713 if (tmr == (time_t) (-1)) 2714 logputs (LOG_VERBOSE, _("\ 2715Last-modified header invalid -- time-stamp ignored.\n")); 2716 if (*dt & HEAD_ONLY) 2717 time_came_from_head = true; 2718 } 2719 2720 if (send_head_first) 2721 { 2722 /* The time-stamping section. 
*/ 2723 if (opt.timestamping) 2724 { 2725 if (hstat.orig_file_name) /* Perform the following 2726 checks only if the file 2727 we're supposed to 2728 download already exists. */ 2729 { 2730 if (hstat.remote_time && 2731 tmr != (time_t) (-1)) 2732 { 2733 /* Now time-stamping can be used validly. 2734 Time-stamping means that if the sizes of 2735 the local and remote file match, and local 2736 file is newer than the remote file, it will 2737 not be retrieved. Otherwise, the normal 2738 download procedure is resumed. */ 2739 if (hstat.orig_file_tstamp >= tmr) 2740 { 2741 if (hstat.contlen == -1 2742 || hstat.orig_file_size == hstat.contlen) 2743 { 2744 logprintf (LOG_VERBOSE, _("\ 2745Server file no newer than local file %s -- not retrieving.\n\n"), 2746 quote (hstat.orig_file_name)); 2747 ret = RETROK; 2748 goto exit; 2749 } 2750 else 2751 { 2752 logprintf (LOG_VERBOSE, _("\ 2753The sizes do not match (local %s) -- retrieving.\n"), 2754 number_to_static_string (hstat.orig_file_size)); 2755 } 2756 } 2757 else 2758 logputs (LOG_VERBOSE, 2759 _("Remote file is newer, retrieving.\n")); 2760 2761 logputs (LOG_VERBOSE, "\n"); 2762 } 2763 } 2764 2765 /* free_hstat (&hstat); */ 2766 hstat.timestamp_checked = true; 2767 } 2768 2769 if (opt.spider) 2770 { 2771 bool finished = true; 2772 if (opt.recursive) 2773 { 2774 if (*dt & TEXTHTML) 2775 { 2776 logputs (LOG_VERBOSE, _("\ 2777Remote file exists and could contain links to other resources -- retrieving.\n\n")); 2778 finished = false; 2779 } 2780 else 2781 { 2782 logprintf (LOG_VERBOSE, _("\ 2783Remote file exists but does not contain any link -- not retrieving.\n\n")); 2784 ret = RETROK; /* RETRUNNEEDED is not for caller. 
*/ 2785 } 2786 } 2787 else 2788 { 2789 if (*dt & TEXTHTML) 2790 { 2791 logprintf (LOG_VERBOSE, _("\ 2792Remote file exists and could contain further links,\n\ 2793but recursion is disabled -- not retrieving.\n\n")); 2794 } 2795 else 2796 { 2797 logprintf (LOG_VERBOSE, _("\ 2798Remote file exists.\n\n")); 2799 } 2800 ret = RETROK; /* RETRUNNEEDED is not for caller. */ 2801 } 2802 2803 if (finished) 2804 { 2805 logprintf (LOG_NONVERBOSE, 2806 _("%s URL: %s %2d %s\n"), 2807 tms, u->url, hstat.statcode, 2808 hstat.message ? quotearg_style (escape_quoting_style, hstat.message) : ""); 2809 goto exit; 2810 } 2811 } 2812 2813 got_name = true; 2814 *dt &= ~HEAD_ONLY; 2815 count = 0; /* the retrieve count for HEAD is reset */ 2816 continue; 2817 } /* send_head_first */ 2818 } /* !got_head */ 2819 2820 if ((tmr != (time_t) (-1)) 2821 && ((hstat.len == hstat.contlen) || 2822 ((hstat.res == 0) && (hstat.contlen == -1)))) 2823 { 2824 const char *fl = NULL; 2825 set_local_file (&fl, hstat.local_file); 2826 if (fl) 2827 { 2828 time_t newtmr = -1; 2829 /* Reparse time header, in case it's changed. */ 2830 if (time_came_from_head 2831 && hstat.remote_time && hstat.remote_time[0]) 2832 { 2833 newtmr = http_atotm (hstat.remote_time); 2834 if (newtmr != (time_t)-1) 2835 tmr = newtmr; 2836 } 2837 touch (fl, tmr); 2838 } 2839 } 2840 /* End of time-stamping section. */ 2841 2842 tmrate = retr_rate (hstat.rd_size, hstat.dltime); 2843 total_download_time += hstat.dltime; 2844 2845 if (hstat.len == hstat.contlen) 2846 { 2847 if (*dt & RETROKF) 2848 { 2849 bool write_to_stdout = (opt.output_document && HYPHENP (opt.output_document)); 2850 2851 logprintf (LOG_VERBOSE, 2852 write_to_stdout 2853 ? _("%s (%s) - written to stdout %s[%s/%s]\n\n") 2854 : _("%s (%s) - %s saved [%s/%s]\n\n"), 2855 tms, tmrate, 2856 write_to_stdout ? 
"" : quote (hstat.local_file), 2857 number_to_static_string (hstat.len), 2858 number_to_static_string (hstat.contlen)); 2859 logprintf (LOG_NONVERBOSE, 2860 "%s URL:%s [%s/%s] -> \"%s\" [%d]\n", 2861 tms, u->url, 2862 number_to_static_string (hstat.len), 2863 number_to_static_string (hstat.contlen), 2864 hstat.local_file, count); 2865 } 2866 ++numurls; 2867 total_downloaded_bytes += hstat.len; 2868 2869 /* Remember that we downloaded the file for later ".orig" code. */ 2870 if (*dt & ADDED_HTML_EXTENSION) 2871 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, hstat.local_file); 2872 else 2873 downloaded_file(FILE_DOWNLOADED_NORMALLY, hstat.local_file); 2874 2875 ret = RETROK; 2876 goto exit; 2877 } 2878 else if (hstat.res == 0) /* No read error */ 2879 { 2880 if (hstat.contlen == -1) /* We don't know how much we were supposed 2881 to get, so assume we succeeded. */ 2882 { 2883 if (*dt & RETROKF) 2884 { 2885 bool write_to_stdout = (opt.output_document && HYPHENP (opt.output_document)); 2886 2887 logprintf (LOG_VERBOSE, 2888 write_to_stdout 2889 ? _("%s (%s) - written to stdout %s[%s]\n\n") 2890 : _("%s (%s) - %s saved [%s]\n\n"), 2891 tms, tmrate, 2892 write_to_stdout ? "" : quote (hstat.local_file), 2893 number_to_static_string (hstat.len)); 2894 logprintf (LOG_NONVERBOSE, 2895 "%s URL:%s [%s] -> \"%s\" [%d]\n", 2896 tms, u->url, number_to_static_string (hstat.len), 2897 hstat.local_file, count); 2898 } 2899 ++numurls; 2900 total_downloaded_bytes += hstat.len; 2901 2902 /* Remember that we downloaded the file for later ".orig" code. */ 2903 if (*dt & ADDED_HTML_EXTENSION) 2904 downloaded_file(FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, hstat.local_file); 2905 else 2906 downloaded_file(FILE_DOWNLOADED_NORMALLY, hstat.local_file); 2907 2908 ret = RETROK; 2909 goto exit; 2910 } 2911 else if (hstat.len < hstat.contlen) /* meaning we lost the 2912 connection too soon */ 2913 { 2914 logprintf (LOG_VERBOSE, 2915 _("%s (%s) - Connection closed at byte %s. 
"), 2916 tms, tmrate, number_to_static_string (hstat.len)); 2917 printwhat (count, opt.ntry); 2918 continue; 2919 } 2920 else if (hstat.len != hstat.restval) 2921 /* Getting here would mean reading more data than 2922 requested with content-length, which we never do. */ 2923 abort (); 2924 else 2925 { 2926 /* Getting here probably means that the content-length was 2927 * _less_ than the original, local size. We should probably 2928 * truncate or re-read, or something. FIXME */ 2929 ret = RETROK; 2930 goto exit; 2931 } 2932 } 2933 else /* from now on hstat.res can only be -1 */ 2934 { 2935 if (hstat.contlen == -1) 2936 { 2937 logprintf (LOG_VERBOSE, 2938 _("%s (%s) - Read error at byte %s (%s)."), 2939 tms, tmrate, number_to_static_string (hstat.len), 2940 hstat.rderrmsg); 2941 printwhat (count, opt.ntry); 2942 continue; 2943 } 2944 else /* hstat.res == -1 and contlen is given */ 2945 { 2946 logprintf (LOG_VERBOSE, 2947 _("%s (%s) - Read error at byte %s/%s (%s). "), 2948 tms, tmrate, 2949 number_to_static_string (hstat.len), 2950 number_to_static_string (hstat.contlen), 2951 hstat.rderrmsg); 2952 printwhat (count, opt.ntry); 2953 continue; 2954 } 2955 } 2956 /* not reached */ 2957 } 2958 while (!opt.ntry || (count < opt.ntry)); 2959 2960exit: 2961 if (ret == RETROK) 2962 *local_file = xstrdup (hstat.local_file); 2963 free_hstat (&hstat); 2964 2965 return ret; 2966} 2967 2968/* Check whether the result of strptime() indicates success. 2969 strptime() returns the pointer to how far it got to in the string. 2970 The processing has been successful if the string is at `GMT' or 2971 `+X', or at the end of the string. 2972 2973 In extended regexp parlance, the function returns 1 if P matches 2974 "^ *(GMT|[+-][0-9]|$)", 0 otherwise. P being NULL (which strptime 2975 can return) is considered a failure and 0 is returned. 
/* Check whether the result of strptime() indicates success.
   strptime() returns the pointer to how far it got to in the string.
   The processing has been successful if the string is at `GMT' or
   `+X', or at the end of the string.

   In extended regexp parlance, the function returns true if P matches
   "^ *(GMT|[+-][0-9]|$)", false otherwise.  P being NULL (which
   strptime can return) is considered a failure and false is
   returned.  */
static bool
check_end (const char *p)
{
  if (!p)
    return false;
  while (c_isspace (*p))
    ++p;
  return (!*p
          || (p[0] == 'G' && p[1] == 'M' && p[2] == 'T')
          || ((p[0] == '+' || p[0] == '-') && c_isdigit (p[1])));
}

/* Convert the textual specification of time in TIME_STRING to the
   number of seconds since the Epoch.

   TIME_STRING can be in any of the three formats RFC2616 allows the
   HTTP servers to emit -- RFC1123-date, RFC850-date or asctime-date,
   as well as the time format used in the Set-Cookie header.
   Timezones are ignored, and should be GMT.

   Return the computed time_t representation, or -1 if the conversion
   fails.

   This function uses strptime with various string formats for parsing
   TIME_STRING.  This results in a parser that is not as lenient in
   interpreting TIME_STRING as I would like it to be.  Being based on
   strptime, it always allows shortened months, one-digit days, etc.,
   but due to the multitude of formats in which time can be
   represented, an ideal HTTP time parser would be even more
   forgiving.  It should completely ignore things like week days and
   concentrate only on the various forms of representing years,
   months, days, hours, minutes, and seconds.  For example, it would
   be nice if it accepted ISO 8601 out of the box.

   I've investigated free and PD code for this purpose, but none was
   usable.  getdate was big and unwieldy, and had potential copyright
   issues, or so I was informed.  Dr. Marcus Hennecke's atotm(),
   distributed with phttpd, is excellent, but we cannot use it because
   it is not assigned to the FSF.  So I stuck it with strptime.  */

time_t
http_atotm (const char *time_string)
{
  /* NOTE: Solaris strptime man page claims that %n and %t match white
     space, but that's not universally available.  Instead, we simply
     use ` ' to mean "skip all WS", which works under all strptime
     implementations I've tested.  */

  static const char *time_formats[] = {
    "%a, %d %b %Y %T",          /* rfc1123: Thu, 29 Jan 1998 22:12:57 */
    "%A, %d-%b-%y %T",          /* rfc850:  Thursday, 29-Jan-98 22:12:57 */
    "%a %b %d %T %Y",           /* asctime: Thu Jan 29 22:12:57 1998 */
    "%a, %d-%b-%Y %T"           /* cookies: Thu, 29-Jan-1998 22:12:57
                                   (used in Set-Cookie, defined in the
                                   Netscape cookie specification.) */
  };
  const char *oldlocale;
  char savedlocale[256];
  size_t i;
  time_t ret = (time_t) -1;

  /* Solaris strptime fails to recognize English month names in
     non-English locales, which we work around by temporarily setting
     locale to C before invoking strptime.  The name of the current
     locale is copied into SAVEDLOCALE first; an empty SAVEDLOCALE
     means "nothing to restore".  */
  oldlocale = setlocale (LC_TIME, NULL);
  if (oldlocale)
    {
      /* Count the terminating NUL: SAVEDLOCALE is uninitialized, so
         the terminator must be copied along with the name, otherwise
         the restoring setlocale() call below would read garbage.  */
      size_t l = strlen (oldlocale) + 1;
      if (l > sizeof savedlocale)
        savedlocale[0] = '\0';
      else
        memcpy (savedlocale, oldlocale, l);
    }
  else
    savedlocale[0] = '\0';

  setlocale (LC_TIME, "C");

  for (i = 0; i < countof (time_formats); i++)
    {
      struct tm t;

      /* Some versions of strptime use the existing contents of struct
         tm to recalculate the date according to format.  Zero it out
         to prevent stack garbage from influencing strptime.  */
      xzero (t);

      if (check_end (strptime (time_string, time_formats[i], &t)))
        {
          ret = timegm (&t);
          break;
        }
    }

  /* Restore the previous locale. */
  if (savedlocale[0])
    setlocale (LC_TIME, savedlocale);

  return ret;
}
/* Authorization support: We support three authorization schemes:

   * `Basic' scheme, consisting of base64-ing USER:PASSWORD string;

   * `Digest' scheme, added by Junio Hamano <junio@twinsun.com>,
   consisting of answering to the server's challenge with the proper
   MD5 digests.

   * `NTLM' ("NT Lan Manager") scheme, based on code written by Daniel
   Stenberg for libcurl.  Like digest, NTLM is based on a
   challenge-response mechanism, but unlike digest, it is non-standard
   (authenticates TCP connections rather than requests), undocumented
   and Microsoft-specific.  */

/* Create the authentication header contents for the `Basic' scheme.
   This is done by encoding the string "USER:PASS" to base64 and
   prepending the string "Basic " in front of it.

   The result comes from concat_strings() and is returned to the
   caller; the two intermediate buffers live on the stack.  */

static char *
basic_authentication_encode (const char *user, const char *passwd)
{
  char *t1, *t2;
  int len1 = strlen (user) + 1 + strlen (passwd);

  /* T1 = "USER:PASSWD" */
  t1 = (char *)alloca (len1 + 1);
  sprintf (t1, "%s:%s", user, passwd);

  /* T2 = base64 (T1) */
  t2 = (char *)alloca (BASE64_LENGTH (len1) + 1);
  base64_encode (t1, len1, t2);

  return concat_strings ("Basic ", t2, (char *) 0);
}

/* Advance the pointer X past any leading whitespace.  */
#define SKIP_WS(x) do {                         \
  while (c_isspace (*(x)))                      \
    ++(x);                                      \
} while (0)

#ifdef ENABLE_DIGEST
/* Dump the hexadecimal representation of HASH to BUF.  HASH should be
   an array of 16 bytes containing the hash keys, and BUF should be a
   buffer of 33 writable characters (32 for hex digits plus one for
   zero termination).  */
static void
dump_hash (char *buf, const unsigned char *hash)
{
  int i;

  for (i = 0; i < MD5_HASHLEN; i++, hash++)
    {
      *buf++ = XNUM_TO_digit (*hash >> 4);
      *buf++ = XNUM_TO_digit (*hash & 0xf);
    }
  *buf = '\0';
}

/* Take the line apart to find the challenge, and compose a digest
   authorization header.  See RFC2069 section 2.1.2.

   AU is the challenge, starting at the scheme name `Digest'.  USER,
   PASSWD, METHOD and PATH are the values the response digest is
   computed from.

   Returns a freshly xmalloc'ed header value, or NULL if the challenge
   lacks `realm' or `nonce' or any of the other arguments is NULL.
   The parameter values extracted from AU are released before
   returning on every path.  */
static char *
digest_authentication_encode (const char *au, const char *user,
                              const char *passwd, const char *method,
                              const char *path)
{
  /* These are static only so that their addresses can be used in the
     initializer of OPTIONS; they hold no state between calls.  */
  static char *realm, *opaque, *nonce;
  static struct {
    const char *name;
    char **variable;
  } options[] = {
    { "realm", &realm },
    { "opaque", &opaque },
    { "nonce", &nonce }
  };
  char *res = NULL;
  param_token name, value;

  realm = opaque = nonce = NULL;

  au += 6;                      /* skip over `Digest' */
  while (extract_param (&au, &name, &value, ','))
    {
      size_t i;
      size_t namelen = name.e - name.b;
      for (i = 0; i < countof (options); i++)
        if (namelen == strlen (options[i].name)
            && 0 == strncmp (name.b, options[i].name,
                             namelen))
          {
            /* A challenge may repeat a parameter; drop the earlier
               value instead of leaking it.  */
            xfree_null (*options[i].variable);
            *options[i].variable = strdupdelim (value.b, value.e);
            break;
          }
    }
  if (!realm || !nonce || !user || !passwd || !path || !method)
    goto cleanup;

  /* Calculate the digest value.  */
  {
    ALLOCA_MD5_CONTEXT (ctx);
    unsigned char hash[MD5_HASHLEN];
    char a1buf[MD5_HASHLEN * 2 + 1], a2buf[MD5_HASHLEN * 2 + 1];
    char response_digest[MD5_HASHLEN * 2 + 1];

    /* A1BUF = H(user ":" realm ":" password) */
    gen_md5_init (ctx);
    gen_md5_update ((unsigned char *)user, strlen (user), ctx);
    gen_md5_update ((unsigned char *)":", 1, ctx);
    gen_md5_update ((unsigned char *)realm, strlen (realm), ctx);
    gen_md5_update ((unsigned char *)":", 1, ctx);
    gen_md5_update ((unsigned char *)passwd, strlen (passwd), ctx);
    gen_md5_finish (ctx, hash);
    dump_hash (a1buf, hash);

    /* A2BUF = H(method ":" path) */
    gen_md5_init (ctx);
    gen_md5_update ((unsigned char *)method, strlen (method), ctx);
    gen_md5_update ((unsigned char *)":", 1, ctx);
    gen_md5_update ((unsigned char *)path, strlen (path), ctx);
    gen_md5_finish (ctx, hash);
    dump_hash (a2buf, hash);

    /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */
    gen_md5_init (ctx);
    gen_md5_update ((unsigned char *)a1buf, MD5_HASHLEN * 2, ctx);
    gen_md5_update ((unsigned char *)":", 1, ctx);
    gen_md5_update ((unsigned char *)nonce, strlen (nonce), ctx);
    gen_md5_update ((unsigned char *)":", 1, ctx);
    gen_md5_update ((unsigned char *)a2buf, MD5_HASHLEN * 2, ctx);
    gen_md5_finish (ctx, hash);
    dump_hash (response_digest, hash);

    /* Every variable-length piece is counted once; the constant 128
       covers the fixed text of the header, which is well under 128
       characters including the optional opaque part and the
       terminating NUL.  */
    res = xmalloc (strlen (user)
                   + strlen (realm)
                   + strlen (nonce)
                   + strlen (path)
                   + 2 * MD5_HASHLEN /*strlen (response_digest)*/
                   + (opaque ? strlen (opaque) : 0)
                   + 128);
    sprintf (res, "Digest \
username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"",
             user, realm, nonce, path, response_digest);
    if (opaque)
      {
        char *p = res + strlen (res);
        strcat (p, ", opaque=\"");
        strcat (p, opaque);
        strcat (p, "\"");
      }
  }

 cleanup:
  /* The extracted parameters are only needed while composing RES.
     Free them on success as well as on failure, and clear the statics
     so no dangling pointers survive the call.  */
  xfree_null (realm);
  xfree_null (opaque);
  xfree_null (nonce);
  realm = opaque = nonce = NULL;
  return res;
}
#endif /* ENABLE_DIGEST */

/* Computing the size of a string literal must take into account that
   value returned by sizeof includes the terminating \0.  */
#define STRSIZE(literal) (sizeof (literal) - 1)

/* Whether chars in [b, e) begin with the literal string provided as
   first argument and are followed by whitespace or terminating \0.
   The comparison is case-insensitive.  */
#define STARTS(literal, b, e)                           \
  (((e) > (b))                                          \
   && ((size_t) ((e) - (b))) >= STRSIZE (literal)       \
   && 0 == strncasecmp ((b), literal, STRSIZE (literal))  \
   && ((size_t) ((e) - (b)) == STRSIZE (literal)        \
       || c_isspace ((b)[STRSIZE (literal)])))

/* Return true if the header value in [HDRBEG, HDREND) starts with the
   name of an authentication scheme this build supports: `Basic'
   always, `Digest' and `NTLM' only when the corresponding support is
   compiled in.  */
static bool
known_authentication_scheme_p (const char *hdrbeg, const char *hdrend)
{
  return STARTS ("Basic", hdrbeg, hdrend)
#ifdef ENABLE_DIGEST
    || STARTS ("Digest", hdrbeg, hdrend)
#endif
#ifdef ENABLE_NTLM
    || STARTS ("NTLM", hdrbeg, hdrend)
#endif
    ;
}

#undef STARTS
*/ 3277 switch (c_toupper (*au)) 3278 { 3279 case 'B': /* Basic */ 3280 *finished = true; 3281 return basic_authentication_encode (user, passwd); 3282#ifdef ENABLE_DIGEST 3283 case 'D': /* Digest */ 3284 *finished = true; 3285 return digest_authentication_encode (au, user, passwd, method, path); 3286#endif 3287#ifdef ENABLE_NTLM 3288 case 'N': /* NTLM */ 3289 if (!ntlm_input (&pconn.ntlm, au)) 3290 { 3291 *finished = true; 3292 return NULL; 3293 } 3294 return ntlm_output (&pconn.ntlm, user, passwd, finished); 3295#endif 3296 default: 3297 /* We shouldn't get here -- this function should be only called 3298 with values approved by known_authentication_scheme_p. */ 3299 abort (); 3300 } 3301} 3302 3303static void 3304load_cookies (void) 3305{ 3306 if (!wget_cookie_jar) 3307 wget_cookie_jar = cookie_jar_new (); 3308 if (opt.cookies_input && !cookies_loaded_p) 3309 { 3310 cookie_jar_load (wget_cookie_jar, opt.cookies_input); 3311 cookies_loaded_p = true; 3312 } 3313} 3314 3315void 3316save_cookies (void) 3317{ 3318 if (wget_cookie_jar) 3319 cookie_jar_save (wget_cookie_jar, opt.cookies_output); 3320} 3321 3322void 3323http_cleanup (void) 3324{ 3325 xfree_null (pconn.host); 3326 if (wget_cookie_jar) 3327 cookie_jar_delete (wget_cookie_jar); 3328} 3329 3330void 3331ensure_extension (struct http_stat *hs, const char *ext, int *dt) 3332{ 3333 char *last_period_in_local_filename = strrchr (hs->local_file, '.'); 3334 char shortext[8]; 3335 int len = strlen (ext); 3336 if (len == 5) 3337 { 3338 strncpy (shortext, ext, len - 1); 3339 shortext[len - 2] = '\0'; 3340 } 3341 3342 if (last_period_in_local_filename == NULL 3343 || !(0 == strcasecmp (last_period_in_local_filename, shortext) 3344 || 0 == strcasecmp (last_period_in_local_filename, ext))) 3345 { 3346 int local_filename_len = strlen (hs->local_file); 3347 /* Resize the local file, allowing for ".html" preceded by 3348 optional ".NUMBER". 
*/ 3349 hs->local_file = xrealloc (hs->local_file, 3350 local_filename_len + 24 + len); 3351 strcpy (hs->local_file + local_filename_len, ext); 3352 /* If clobbering is not allowed and the file, as named, 3353 exists, tack on ".NUMBER.html" instead. */ 3354 if (!ALLOW_CLOBBER && file_exists_p (hs->local_file)) 3355 { 3356 int ext_num = 1; 3357 do 3358 sprintf (hs->local_file + local_filename_len, 3359 ".%d%s", ext_num++, ext); 3360 while (file_exists_p (hs->local_file)); 3361 } 3362 *dt |= ADDED_HTML_EXTENSION; 3363 } 3364} 3365 3366 3367#ifdef TESTING 3368 3369const char * 3370test_parse_content_disposition() 3371{ 3372 int i; 3373 struct { 3374 char *hdrval; 3375 char *opt_dir_prefix; 3376 char *filename; 3377 bool result; 3378 } test_array[] = { 3379 { "filename=\"file.ext\"", NULL, "file.ext", true }, 3380 { "filename=\"file.ext\"", "somedir", "somedir/file.ext", true }, 3381 { "attachment; filename=\"file.ext\"", NULL, "file.ext", true }, 3382 { "attachment; filename=\"file.ext\"", "somedir", "somedir/file.ext", true }, 3383 { "attachment; filename=\"file.ext\"; dummy", NULL, "file.ext", true }, 3384 { "attachment; filename=\"file.ext\"; dummy", "somedir", "somedir/file.ext", true }, 3385 { "attachment", NULL, NULL, false }, 3386 { "attachment", "somedir", NULL, false }, 3387 }; 3388 3389 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) 3390 { 3391 char *filename; 3392 bool res; 3393 3394 opt.dir_prefix = test_array[i].opt_dir_prefix; 3395 res = parse_content_disposition (test_array[i].hdrval, &filename); 3396 3397 mu_assert ("test_parse_content_disposition: wrong result", 3398 res == test_array[i].result 3399 && (res == false 3400 || 0 == strcmp (test_array[i].filename, filename))); 3401 } 3402 3403 return NULL; 3404} 3405 3406#endif /* TESTING */ 3407 3408/* 3409 * vim: et sts=2 sw=2 cino+={s 3410 */ 3411 3412