1/* HTML parser for Wget. 2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 3 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. 4 5This file is part of GNU Wget. 6 7GNU Wget is free software; you can redistribute it and/or modify 8it under the terms of the GNU General Public License as published by 9the Free Software Foundation; either version 3 of the License, or (at 10your option) any later version. 11 12GNU Wget is distributed in the hope that it will be useful, 13but WITHOUT ANY WARRANTY; without even the implied warranty of 14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15GNU General Public License for more details. 16 17You should have received a copy of the GNU General Public License 18along with Wget. If not, see <http://www.gnu.org/licenses/>. 19 20Additional permission under GNU GPL version 3 section 7 21 22If you modify this program, or any covered work, by linking or 23combining it with the OpenSSL project's OpenSSL library (or a 24modified version of that library), containing parts covered by the 25terms of the OpenSSL or SSLeay licenses, the Free Software Foundation 26grants you additional permission to convey the resulting work. 27Corresponding Source for a non-source form of such a combination 28shall include the source code for the parts of OpenSSL used as well 29as that of the covered work. */ 30 31/* The only entry point to this module is map_html_tags(), which see. */ 32 33/* TODO: 34 35 - Allow hooks for callers to process contents outside tags. This 36 is needed to implement handling <style> and <script>. The 37 taginfo structure already carries the information about where the 38 tags are, but this is not enough, because one would also want to 39 skip the comments. (The funny thing is that for <style> and 40 <script> you *don't* want to skip comments!) 41 42 - Create a test suite for regression testing. */ 43 44/* HISTORY: 45 46 This is the third HTML parser written for Wget. The first one was 47 written some time during the Geturl 1.0 beta cycle, and was very 48 inefficient and buggy. It also contained some very complex code to 49 remember a list of parser states, because it was supposed to be 50 reentrant. 51 52 The second HTML parser was written for Wget 1.4 (the first version 53 by the name `Wget'), and was a complete rewrite. Although the new 54 parser behaved much better and made no claims of reentrancy, it 55 still shared many of the fundamental flaws of the old version -- it 56 only regarded HTML in terms tag-attribute pairs, where the 57 attribute's value was a URL to be returned. Any other property of 58 HTML, such as <base href=...>, or strange way to specify a URL, 59 such as <meta http-equiv=Refresh content="0; URL=..."> had to be 60 crudely hacked in -- and the caller had to be aware of these hacks. 61 Like its predecessor, this parser did not support HTML comments. 62 63 After Wget 1.5.1 was released, I set out to write a third HTML 64 parser. The objectives of the new parser were to: (1) provide a 65 clean way to analyze HTML lexically, (2) separate interpretation of 66 the markup from the parsing process, (3) be as correct as possible, 67 e.g. correctly skipping comments and other SGML declarations, (4) 68 understand the most common errors in markup and skip them or be 69 relaxed towrds them, and (5) be reasonably efficient (no regexps, 70 minimum copying and minimum or no heap allocation). 71 72 I believe this parser meets all of the above goals. It is 73 reasonably well structured, and could be relatively easily 74 separated from Wget and used elsewhere. While some of its 75 intrinsic properties limit its value as a general-purpose HTML 76 parser, I believe that, with minimum modifications, it could serve 77 as a backend for one. 78 79 Due to time and other constraints, this parser was not integrated 80 into Wget until the version 1.7. */ 81 82/* DESCRIPTION: 83 84 The single entry point of this parser is map_html_tags(), which 85 works by calling a function you specify for each tag. The function 86 gets called with the pointer to a structure describing the tag and 87 its attributes. */ 88 89/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll 90 still need Wget headers to compile. */ 91 92#include "wget.h" 93 94#ifdef STANDALONE 95# define I_REALLY_WANT_CTYPE_MACROS 96#endif 97 98#include <stdio.h> 99#include <stdlib.h> 100#include <string.h> 101#include <assert.h> 102 103#include "utils.h" 104#include "html-parse.h" 105 106#ifdef STANDALONE 107# undef xmalloc 108# undef xrealloc 109# undef xfree 110# define xmalloc malloc 111# define xrealloc realloc 112# define xfree free 113 114# undef c_isspace 115# undef c_isdigit 116# undef c_isxdigit 117# undef c_isalpha 118# undef c_isalnum 119# undef c_tolower 120# undef c_toupper 121 122# define c_isspace(x) isspace (x) 123# define c_isdigit(x) isdigit (x) 124# define c_isxdigit(x) isxdigit (x) 125# define c_isalpha(x) isalpha (x) 126# define c_isalnum(x) isalnum (x) 127# define c_tolower(x) tolower (x) 128# define c_toupper(x) toupper (x) 129 130struct hash_table { 131 int dummy; 132}; 133static void * 134hash_table_get (const struct hash_table *ht, void *ptr) 135{ 136 return ptr; 137} 138#else /* not STANDALONE */ 139# include "hash.h" 140#endif 141 142/* Pool support. A pool is a resizable chunk of memory. It is first 143 allocated on the stack, and moved to the heap if it needs to be 144 larger than originally expected. map_html_tags() uses it to store 145 the zero-terminated names and values of tags and attributes. 146 147 Thus taginfo->name, and attr->name and attr->value for each 148 attribute, do not point into separately allocated areas, but into 149 different parts of the pool, separated only by terminating zeros. 150 This ensures minimum amount of allocation and, for most tags, no 151 allocation because the entire pool is kept on the stack. */ 152 153struct pool { 154 char *contents; /* pointer to the contents. */ 155 int size; /* size of the pool. */ 156 int tail; /* next available position index. */ 157 bool resized; /* whether the pool has been resized 158 using malloc. */ 159 160 char *orig_contents; /* original pool contents, usually 161 stack-allocated. used by POOL_FREE 162 to restore the pool to the initial 163 state. */ 164 int orig_size; 165}; 166 167/* Initialize the pool to hold INITIAL_SIZE bytes of storage. */ 168 169#define POOL_INIT(p, initial_storage, initial_size) do { \ 170 struct pool *P = (p); \ 171 P->contents = (initial_storage); \ 172 P->size = (initial_size); \ 173 P->tail = 0; \ 174 P->resized = false; \ 175 P->orig_contents = P->contents; \ 176 P->orig_size = P->size; \ 177} while (0) 178 179/* Grow the pool to accomodate at least SIZE new bytes. If the pool 180 already has room to accomodate SIZE bytes of data, this is a no-op. */ 181 182#define POOL_GROW(p, increase) \ 183 GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \ 184 (p)->resized, char) 185 186/* Append text in the range [beg, end) to POOL. No zero-termination 187 is done. */ 188 189#define POOL_APPEND(p, beg, end) do { \ 190 const char *PA_beg = (beg); \ 191 int PA_size = (end) - PA_beg; \ 192 POOL_GROW (p, PA_size); \ 193 memcpy ((p)->contents + (p)->tail, PA_beg, PA_size); \ 194 (p)->tail += PA_size; \ 195} while (0) 196 197/* Append one character to the pool. Can be used to zero-terminate 198 pool strings. */ 199 200#define POOL_APPEND_CHR(p, ch) do { \ 201 char PAC_char = (ch); \ 202 POOL_GROW (p, 1); \ 203 (p)->contents[(p)->tail++] = PAC_char; \ 204} while (0) 205 206/* Forget old pool contents. The allocated memory is not freed. */ 207#define POOL_REWIND(p) (p)->tail = 0 208 209/* Free heap-allocated memory for contents of POOL. This calls 210 xfree() if the memory was allocated through malloc. It also 211 restores `contents' and `size' to their original, pre-malloc 212 values. That way after POOL_FREE, the pool is fully usable, just 213 as if it were freshly initialized with POOL_INIT. */ 214 215#define POOL_FREE(p) do { \ 216 struct pool *P = p; \ 217 if (P->resized) \ 218 xfree (P->contents); \ 219 P->contents = P->orig_contents; \ 220 P->size = P->orig_size; \ 221 P->tail = 0; \ 222 P->resized = false; \ 223} while (0) 224 225/* Used for small stack-allocated memory chunks that might grow. Like 226 DO_REALLOC, this macro grows BASEVAR as necessary to take 227 NEEDED_SIZE items of TYPE. 228 229 The difference is that on the first resize, it will use 230 malloc+memcpy rather than realloc. That way you can stack-allocate 231 the initial chunk, and only resort to heap allocation if you 232 stumble upon large data. 233 234 After the first resize, subsequent ones are performed with realloc, 235 just like DO_REALLOC. */ 236 237#define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do { \ 238 long ga_needed_size = (needed_size); \ 239 long ga_newsize = (sizevar); \ 240 while (ga_newsize < ga_needed_size) \ 241 ga_newsize <<= 1; \ 242 if (ga_newsize != (sizevar)) \ 243 { \ 244 if (resized) \ 245 basevar = xrealloc (basevar, ga_newsize * sizeof (type)); \ 246 else \ 247 { \ 248 void *ga_new = xmalloc (ga_newsize * sizeof (type)); \ 249 memcpy (ga_new, basevar, (sizevar) * sizeof (type)); \ 250 (basevar) = ga_new; \ 251 resized = true; \ 252 } \ 253 (sizevar) = ga_newsize; \ 254 } \ 255} while (0) 256 257/* Test whether n+1-sized entity name fits in P. We don't support 258 IE-style non-terminated entities, e.g. "<foo" -> "<foo". 259 However, "<foo" will work, as will "<!foo", "<", etc. In 260 other words an entity needs to be terminated by either a 261 non-alphanumeric or the end of string. */ 262#define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n]))) 263 264/* Macros that test entity names by returning true if P is followed by 265 the specified characters. */ 266#define ENT1(p, c0) (FITS (p, 1) && p[0] == c0) 267#define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1) 268#define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2) 269 270/* Increment P by INC chars. If P lands at a semicolon, increment it 271 past the semicolon. This ensures that e.g. "<foo" is converted 272 to "<foo", but "<,foo" to "<,foo". */ 273#define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p) 274 275struct tagstack_item { 276 const char *tagname_begin; 277 const char *tagname_end; 278 const char *contents_begin; 279 struct tagstack_item *prev; 280 struct tagstack_item *next; 281}; 282 283static struct tagstack_item * 284tagstack_push (struct tagstack_item **head, struct tagstack_item **tail) 285{ 286 struct tagstack_item *ts = xmalloc(sizeof(struct tagstack_item)); 287 if (*head == NULL) 288 { 289 *head = *tail = ts; 290 ts->prev = ts->next = NULL; 291 } 292 else 293 { 294 (*tail)->next = ts; 295 ts->prev = *tail; 296 *tail = ts; 297 ts->next = NULL; 298 } 299 300 return ts; 301} 302 303/* remove ts and everything after it from the stack */ 304static void 305tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail, 306 struct tagstack_item *ts) 307{ 308 if (*head == NULL) 309 return; 310 311 if (ts == *tail) 312 { 313 if (ts == *head) 314 { 315 xfree (ts); 316 *head = *tail = NULL; 317 } 318 else 319 { 320 ts->prev->next = NULL; 321 *tail = ts->prev; 322 xfree (ts); 323 } 324 } 325 else 326 { 327 if (ts == *head) 328 { 329 *head = NULL; 330 } 331 *tail = ts->prev; 332 333 if (ts->prev) 334 { 335 ts->prev->next = NULL; 336 } 337 while (ts) 338 { 339 struct tagstack_item *p = ts->next; 340 xfree (ts); 341 ts = p; 342 } 343 } 344} 345 346static struct tagstack_item * 347tagstack_find (struct tagstack_item *tail, const char *tagname_begin, 348 const char *tagname_end) 349{ 350 int len = tagname_end - tagname_begin; 351 while (tail) 352 { 353 if (len == (tail->tagname_end - tail->tagname_begin)) 354 { 355 if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len)) 356 return tail; 357 } 358 tail = tail->prev; 359 } 360 return NULL; 361} 362 363/* Decode the HTML character entity at *PTR, considering END to be end 364 of buffer. It is assumed that the "&" character that marks the 365 beginning of the entity has been seen at *PTR-1. If a recognized 366 ASCII entity is seen, it is returned, and *PTR is moved to the end 367 of the entity. Otherwise, -1 is returned and *PTR left unmodified. 368 369 The recognized entities are: <, >, &, &apos, and ". */ 370 371static int 372decode_entity (const char **ptr, const char *end) 373{ 374 const char *p = *ptr; 375 int value = -1; 376 377 if (++p == end) 378 return -1; 379 380 switch (*p++) 381 { 382 case '#': 383 /* Process numeric entities "&#DDD;" and "&#xHH;". */ 384 { 385 int digits = 0; 386 value = 0; 387 if (*p == 'x') 388 for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++) 389 value = (value << 4) + XDIGIT_TO_NUM (*p); 390 else 391 for (; value < 256 && p < end && c_isdigit (*p); p++, digits++) 392 value = (value * 10) + (*p - '0'); 393 if (!digits) 394 return -1; 395 /* Don't interpret 128+ codes and NUL because we cannot 396 portably reinserted them into HTML. */ 397 if (!value || (value & ~0x7f)) 398 return -1; 399 *ptr = SKIP_SEMI (p, 0); 400 return value; 401 } 402 /* Process named ASCII entities. */ 403 case 'g': 404 if (ENT1 (p, 't')) 405 value = '>', *ptr = SKIP_SEMI (p, 1); 406 break; 407 case 'l': 408 if (ENT1 (p, 't')) 409 value = '<', *ptr = SKIP_SEMI (p, 1); 410 break; 411 case 'a': 412 if (ENT2 (p, 'm', 'p')) 413 value = '&', *ptr = SKIP_SEMI (p, 2); 414 else if (ENT3 (p, 'p', 'o', 's')) 415 /* handle &apos for the sake of the XML/XHTML crowd. */ 416 value = '\'', *ptr = SKIP_SEMI (p, 3); 417 break; 418 case 'q': 419 if (ENT3 (p, 'u', 'o', 't')) 420 value = '\"', *ptr = SKIP_SEMI (p, 3); 421 break; 422 } 423 return value; 424} 425#undef ENT1 426#undef ENT2 427#undef ENT3 428#undef FITS 429#undef SKIP_SEMI 430 431enum { 432 AP_DOWNCASE = 1, 433 AP_DECODE_ENTITIES = 2, 434 AP_TRIM_BLANKS = 4 435}; 436 437/* Copy the text in the range [BEG, END) to POOL, optionally 438 performing operations specified by FLAGS. FLAGS may be any 439 combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS 440 with the following meaning: 441 442 * AP_DOWNCASE -- downcase all the letters; 443 444 * AP_DECODE_ENTITIES -- decode the named and numeric entities in 445 the ASCII range when copying the string. 446 447 * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end 448 of text, as well as embedded newlines. */ 449 450static void 451convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags) 452{ 453 int old_tail = pool->tail; 454 455 /* Skip blanks if required. We must do this before entities are 456 processed, so that blanks can still be inserted as, for instance, 457 ` '. */ 458 if (flags & AP_TRIM_BLANKS) 459 { 460 while (beg < end && c_isspace (*beg)) 461 ++beg; 462 while (end > beg && c_isspace (end[-1])) 463 --end; 464 } 465 466 if (flags & AP_DECODE_ENTITIES) 467 { 468 /* Grow the pool, then copy the text to the pool character by 469 character, processing the encountered entities as we go 470 along. 471 472 It's safe (and necessary) to grow the pool in advance because 473 processing the entities can only *shorten* the string, it can 474 never lengthen it. */ 475 const char *from = beg; 476 char *to; 477 bool squash_newlines = !!(flags & AP_TRIM_BLANKS); 478 479 POOL_GROW (pool, end - beg); 480 to = pool->contents + pool->tail; 481 482 while (from < end) 483 { 484 if (*from == '&') 485 { 486 int entity = decode_entity (&from, end); 487 if (entity != -1) 488 *to++ = entity; 489 else 490 *to++ = *from++; 491 } 492 else if ((*from == '\n' || *from == '\r') && squash_newlines) 493 ++from; 494 else 495 *to++ = *from++; 496 } 497 /* Verify that we haven't exceeded the original size. (It 498 shouldn't happen, hence the assert.) */ 499 assert (to - (pool->contents + pool->tail) <= end - beg); 500 501 /* Make POOL's tail point to the position following the string 502 we've written. */ 503 pool->tail = to - pool->contents; 504 POOL_APPEND_CHR (pool, '\0'); 505 } 506 else 507 { 508 /* Just copy the text to the pool. */ 509 POOL_APPEND (pool, beg, end); 510 POOL_APPEND_CHR (pool, '\0'); 511 } 512 513 if (flags & AP_DOWNCASE) 514 { 515 char *p = pool->contents + old_tail; 516 for (; *p; p++) 517 *p = c_tolower (*p); 518 } 519} 520 521/* Originally we used to adhere to rfc 1866 here, and allowed only 522 letters, digits, periods, and hyphens as names (of tags or 523 attributes). However, this broke too many pages which used 524 proprietary or strange attributes, e.g. <img src="a.gif" 525 v:shapes="whatever">. 526 527 So now we allow any character except: 528 * whitespace 529 * 8-bit and control chars 530 * characters that clearly cannot be part of name: 531 '=', '<', '>', '/'. 532 533 This only affects attribute and tag names; attribute values allow 534 an even greater variety of characters. */ 535 536#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \ 537 && (x) != '=' && (x) != '<' && (x) != '>' \ 538 && (x) != '/') 539 540#ifdef STANDALONE 541static int comment_backout_count; 542#endif 543 544/* Advance over an SGML declaration, such as <!DOCTYPE ...>. In 545 strict comments mode, this is used for skipping over comments as 546 well. 547 548 To recap: any SGML declaration may have comments associated with 549 it, e.g. 550 <!MY-DECL -- isn't this fun? -- foo bar> 551 552 An HTML comment is merely an empty declaration (<!>) with a comment 553 attached, like this: 554 <!-- some stuff here --> 555 556 Several comments may be embedded in one comment declaration: 557 <!-- have -- -- fun --> 558 559 Whitespace is allowed between and after the comments, but not 560 before the first comment. Additionally, this function attempts to 561 handle double quotes in SGML declarations correctly. */ 562 563static const char * 564advance_declaration (const char *beg, const char *end) 565{ 566 const char *p = beg; 567 char quote_char = '\0'; /* shut up, gcc! */ 568 char ch; 569 570 enum { 571 AC_S_DONE, 572 AC_S_BACKOUT, 573 AC_S_BANG, 574 AC_S_DEFAULT, 575 AC_S_DCLNAME, 576 AC_S_DASH1, 577 AC_S_DASH2, 578 AC_S_COMMENT, 579 AC_S_DASH3, 580 AC_S_DASH4, 581 AC_S_QUOTE1, 582 AC_S_IN_QUOTE, 583 AC_S_QUOTE2 584 } state = AC_S_BANG; 585 586 if (beg == end) 587 return beg; 588 ch = *p++; 589 590 /* It looked like a good idea to write this as a state machine, but 591 now I wonder... */ 592 593 while (state != AC_S_DONE && state != AC_S_BACKOUT) 594 { 595 if (p == end) 596 state = AC_S_BACKOUT; 597 switch (state) 598 { 599 case AC_S_DONE: 600 case AC_S_BACKOUT: 601 break; 602 case AC_S_BANG: 603 if (ch == '!') 604 { 605 ch = *p++; 606 state = AC_S_DEFAULT; 607 } 608 else 609 state = AC_S_BACKOUT; 610 break; 611 case AC_S_DEFAULT: 612 switch (ch) 613 { 614 case '-': 615 state = AC_S_DASH1; 616 break; 617 case ' ': 618 case '\t': 619 case '\r': 620 case '\n': 621 ch = *p++; 622 break; 623 case '<': 624 case '>': 625 state = AC_S_DONE; 626 break; 627 case '\'': 628 case '\"': 629 state = AC_S_QUOTE1; 630 break; 631 default: 632 if (NAME_CHAR_P (ch)) 633 state = AC_S_DCLNAME; 634 else 635 state = AC_S_BACKOUT; 636 break; 637 } 638 break; 639 case AC_S_DCLNAME: 640 if (ch == '-') 641 state = AC_S_DASH1; 642 else if (NAME_CHAR_P (ch)) 643 ch = *p++; 644 else 645 state = AC_S_DEFAULT; 646 break; 647 case AC_S_QUOTE1: 648 /* We must use 0x22 because broken assert macros choke on 649 '"' and '\"'. */ 650 assert (ch == '\'' || ch == 0x22); 651 quote_char = ch; /* cheating -- I really don't feel like 652 introducing more different states for 653 different quote characters. */ 654 ch = *p++; 655 state = AC_S_IN_QUOTE; 656 break; 657 case AC_S_IN_QUOTE: 658 if (ch == quote_char) 659 state = AC_S_QUOTE2; 660 else 661 ch = *p++; 662 break; 663 case AC_S_QUOTE2: 664 assert (ch == quote_char); 665 ch = *p++; 666 state = AC_S_DEFAULT; 667 break; 668 case AC_S_DASH1: 669 assert (ch == '-'); 670 ch = *p++; 671 state = AC_S_DASH2; 672 break; 673 case AC_S_DASH2: 674 switch (ch) 675 { 676 case '-': 677 ch = *p++; 678 state = AC_S_COMMENT; 679 break; 680 default: 681 state = AC_S_BACKOUT; 682 } 683 break; 684 case AC_S_COMMENT: 685 switch (ch) 686 { 687 case '-': 688 state = AC_S_DASH3; 689 break; 690 default: 691 ch = *p++; 692 break; 693 } 694 break; 695 case AC_S_DASH3: 696 assert (ch == '-'); 697 ch = *p++; 698 state = AC_S_DASH4; 699 break; 700 case AC_S_DASH4: 701 switch (ch) 702 { 703 case '-': 704 ch = *p++; 705 state = AC_S_DEFAULT; 706 break; 707 default: 708 state = AC_S_COMMENT; 709 break; 710 } 711 break; 712 } 713 } 714 715 if (state == AC_S_BACKOUT) 716 { 717#ifdef STANDALONE 718 ++comment_backout_count; 719#endif 720 return beg + 1; 721 } 722 return p; 723} 724 725/* Find the first occurrence of the substring "-->" in [BEG, END) and 726 return the pointer to the character after the substring. If the 727 substring is not found, return NULL. */ 728 729static const char * 730find_comment_end (const char *beg, const char *end) 731{ 732 /* Open-coded Boyer-Moore search for "-->". Examine the third char; 733 if it's not '>' or '-', advance by three characters. Otherwise, 734 look at the preceding characters and try to find a match. */ 735 736 const char *p = beg - 1; 737 738 while ((p += 3) < end) 739 switch (p[0]) 740 { 741 case '>': 742 if (p[-1] == '-' && p[-2] == '-') 743 return p + 1; 744 break; 745 case '-': 746 at_dash: 747 if (p[-1] == '-') 748 { 749 at_dash_dash: 750 if (++p == end) return NULL; 751 switch (p[0]) 752 { 753 case '>': return p + 1; 754 case '-': goto at_dash_dash; 755 } 756 } 757 else 758 { 759 if ((p += 2) >= end) return NULL; 760 switch (p[0]) 761 { 762 case '>': 763 if (p[-1] == '-') 764 return p + 1; 765 break; 766 case '-': 767 goto at_dash; 768 } 769 } 770 } 771 return NULL; 772} 773 774/* Return true if the string containing of characters inside [b, e) is 775 present in hash table HT. */ 776 777static bool 778name_allowed (const struct hash_table *ht, const char *b, const char *e) 779{ 780 char *copy; 781 if (!ht) 782 return true; 783 BOUNDED_TO_ALLOCA (b, e, copy); 784 return hash_table_get (ht, copy) != NULL; 785} 786 787/* Advance P (a char pointer), with the explicit intent of being able 788 to read the next character. If this is not possible, go to finish. */ 789 790#define ADVANCE(p) do { \ 791 ++p; \ 792 if (p >= end) \ 793 goto finish; \ 794} while (0) 795 796/* Skip whitespace, if any. */ 797 798#define SKIP_WS(p) do { \ 799 while (c_isspace (*p)) { \ 800 ADVANCE (p); \ 801 } \ 802} while (0) 803 804/* Skip non-whitespace, if any. */ 805 806#define SKIP_NON_WS(p) do { \ 807 while (!c_isspace (*p)) { \ 808 ADVANCE (p); \ 809 } \ 810} while (0) 811 812#ifdef STANDALONE 813static int tag_backout_count; 814#endif 815 816/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long. 817 MAPFUN will be called with two arguments: pointer to an initialized 818 struct taginfo, and MAPARG. 819 820 ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of 821 which are the tags and attribute names that this function should 822 use. If ALLOWED_TAGS is NULL, all tags are processed; if 823 ALLOWED_ATTRIBUTES is NULL, all attributes are returned. 824 825 (Obviously, the caller can filter out unwanted tags and attributes 826 just as well, but this is just an optimization designed to avoid 827 unnecessary copying of tags/attributes which the caller doesn't 828 care about.) */ 829 830void 831map_html_tags (const char *text, int size, 832 void (*mapfun) (struct taginfo *, void *), void *maparg, 833 int flags, 834 const struct hash_table *allowed_tags, 835 const struct hash_table *allowed_attributes) 836{ 837 /* storage for strings passed to MAPFUN callback; if 256 bytes is 838 too little, POOL_APPEND allocates more with malloc. */ 839 char pool_initial_storage[256]; 840 struct pool pool; 841 842 const char *p = text; 843 const char *end = text + size; 844 845 struct attr_pair attr_pair_initial_storage[8]; 846 int attr_pair_size = countof (attr_pair_initial_storage); 847 bool attr_pair_resized = false; 848 struct attr_pair *pairs = attr_pair_initial_storage; 849 850 struct tagstack_item *head = NULL; 851 struct tagstack_item *tail = NULL; 852 853 if (!size) 854 return; 855 856 POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage)); 857 858 { 859 int nattrs, end_tag; 860 const char *tag_name_begin, *tag_name_end; 861 const char *tag_start_position; 862 bool uninteresting_tag; 863 864 look_for_tag: 865 POOL_REWIND (&pool); 866 867 nattrs = 0; 868 end_tag = 0; 869 870 /* Find beginning of tag. We use memchr() instead of the usual 871 looping with ADVANCE() for speed. */ 872 p = memchr (p, '<', end - p); 873 if (!p) 874 goto finish; 875 876 tag_start_position = p; 877 ADVANCE (p); 878 879 /* Establish the type of the tag (start-tag, end-tag or 880 declaration). */ 881 if (*p == '!') 882 { 883 if (!(flags & MHT_STRICT_COMMENTS) 884 && p < end + 3 && p[1] == '-' && p[2] == '-') 885 { 886 /* If strict comments are not enforced and if we know 887 we're looking at a comment, simply look for the 888 terminating "-->". Non-strict is the default because 889 it works in other browsers and most HTML writers can't 890 be bothered with getting the comments right. */ 891 const char *comment_end = find_comment_end (p + 3, end); 892 if (comment_end) 893 p = comment_end; 894 } 895 else 896 { 897 /* Either in strict comment mode or looking at a non-empty 898 declaration. Real declarations are much less likely to 899 be misused the way comments are, so advance over them 900 properly regardless of strictness. */ 901 p = advance_declaration (p, end); 902 } 903 if (p == end) 904 goto finish; 905 goto look_for_tag; 906 } 907 else if (*p == '/') 908 { 909 end_tag = 1; 910 ADVANCE (p); 911 } 912 tag_name_begin = p; 913 while (NAME_CHAR_P (*p)) 914 ADVANCE (p); 915 if (p == tag_name_begin) 916 goto look_for_tag; 917 tag_name_end = p; 918 SKIP_WS (p); 919 920 if (!end_tag) 921 { 922 struct tagstack_item *ts = tagstack_push (&head, &tail); 923 if (ts) 924 { 925 ts->tagname_begin = tag_name_begin; 926 ts->tagname_end = tag_name_end; 927 ts->contents_begin = NULL; 928 } 929 } 930 931 if (end_tag && *p != '>' && *p != '<') 932 goto backout_tag; 933 934 if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end)) 935 /* We can't just say "goto look_for_tag" here because we need 936 the loop below to properly advance over the tag's attributes. */ 937 uninteresting_tag = true; 938 else 939 { 940 uninteresting_tag = false; 941 convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE); 942 } 943 944 /* Find the attributes. */ 945 while (1) 946 { 947 const char *attr_name_begin, *attr_name_end; 948 const char *attr_value_begin, *attr_value_end; 949 const char *attr_raw_value_begin, *attr_raw_value_end; 950 int operation = AP_DOWNCASE; /* stupid compiler. */ 951 952 SKIP_WS (p); 953 954 if (*p == '/') 955 { 956 /* A slash at this point means the tag is about to be 957 closed. This is legal in XML and has been popularized 958 in HTML via XHTML. */ 959 /* <foo a=b c=d /> */ 960 /* ^ */ 961 ADVANCE (p); 962 SKIP_WS (p); 963 if (*p != '<' && *p != '>') 964 goto backout_tag; 965 } 966 967 /* Check for end of tag definition. */ 968 if (*p == '<' || *p == '>') 969 break; 970 971 /* Establish bounds of attribute name. */ 972 attr_name_begin = p; /* <foo bar ...> */ 973 /* ^ */ 974 while (NAME_CHAR_P (*p)) 975 ADVANCE (p); 976 attr_name_end = p; /* <foo bar ...> */ 977 /* ^ */ 978 if (attr_name_begin == attr_name_end) 979 goto backout_tag; 980 981 /* Establish bounds of attribute value. */ 982 SKIP_WS (p); 983 984 if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>') 985 { 986 /* Minimized attribute syntax allows `=' to be omitted. 987 For example, <UL COMPACT> is a valid shorthand for <UL 988 COMPACT="compact">. Even if such attributes are not 989 useful to Wget, we need to support them, so that the 990 tags containing them can be parsed correctly. */ 991 attr_raw_value_begin = attr_value_begin = attr_name_begin; 992 attr_raw_value_end = attr_value_end = attr_name_end; 993 } 994 else if (*p == '=') 995 { 996 ADVANCE (p); 997 SKIP_WS (p); 998 if (*p == '\"' || *p == '\'') 999 { 1000 bool newline_seen = false; 1001 char quote_char = *p; 1002 attr_raw_value_begin = p; 1003 ADVANCE (p); 1004 attr_value_begin = p; /* <foo bar="baz"> */ 1005 /* ^ */ 1006 while (*p != quote_char) 1007 { 1008 if (!newline_seen && *p == '\n') 1009 { 1010 /* If a newline is seen within the quotes, it 1011 is most likely that someone forgot to close 1012 the quote. In that case, we back out to 1013 the value beginning, and terminate the tag 1014 at either `>' or the delimiter, whichever 1015 comes first. Such a tag terminated at `>' 1016 is discarded. */ 1017 p = attr_value_begin; 1018 newline_seen = true; 1019 continue; 1020 } 1021 else if (newline_seen && (*p == '<' || *p == '>')) 1022 break; 1023 ADVANCE (p); 1024 } 1025 attr_value_end = p; /* <foo bar="baz"> */ 1026 /* ^ */ 1027 if (*p == quote_char) 1028 ADVANCE (p); 1029 else 1030 goto look_for_tag; 1031 attr_raw_value_end = p; /* <foo bar="baz"> */ 1032 /* ^ */ 1033 operation = AP_DECODE_ENTITIES; 1034 if (flags & MHT_TRIM_VALUES) 1035 operation |= AP_TRIM_BLANKS; 1036 } 1037 else 1038 { 1039 attr_value_begin = p; /* <foo bar=baz> */ 1040 /* ^ */ 1041 /* According to SGML, a name token should consist only 1042 of alphanumerics, . and -. However, this is often 1043 violated by, for instance, `%' in `width=75%'. 1044 We'll be liberal and allow just about anything as 1045 an attribute value. */ 1046 while (!c_isspace (*p) && *p != '<' && *p != '>') 1047 ADVANCE (p); 1048 attr_value_end = p; /* <foo bar=baz qux=quix> */ 1049 /* ^ */ 1050 if (attr_value_begin == attr_value_end) 1051 /* <foo bar=> */ 1052 /* ^ */ 1053 goto backout_tag; 1054 attr_raw_value_begin = attr_value_begin; 1055 attr_raw_value_end = attr_value_end; 1056 operation = AP_DECODE_ENTITIES; 1057 } 1058 } 1059 else 1060 { 1061 /* We skipped the whitespace and found something that is 1062 neither `=' nor the beginning of the next attribute's 1063 name. Back out. */ 1064 goto backout_tag; /* <foo bar [... */ 1065 /* ^ */ 1066 } 1067 1068 /* If we're not interested in the tag, don't bother with any 1069 of the attributes. */ 1070 if (uninteresting_tag) 1071 continue; 1072 1073 /* If we aren't interested in the attribute, skip it. We 1074 cannot do this test any sooner, because our text pointer 1075 needs to correctly advance over the attribute. */ 1076 if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end)) 1077 continue; 1078 1079 GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized, 1080 struct attr_pair); 1081 1082 pairs[nattrs].name_pool_index = pool.tail; 1083 convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE); 1084 1085 pairs[nattrs].value_pool_index = pool.tail; 1086 convert_and_copy (&pool, attr_value_begin, attr_value_end, operation); 1087 pairs[nattrs].value_raw_beginning = attr_raw_value_begin; 1088 pairs[nattrs].value_raw_size = (attr_raw_value_end 1089 - attr_raw_value_begin); 1090 ++nattrs; 1091 } 1092 1093 if (!end_tag && tail && (tail->tagname_begin == tag_name_begin)) 1094 { 1095 tail->contents_begin = p+1; 1096 } 1097 1098 if (uninteresting_tag) 1099 { 1100 ADVANCE (p); 1101 goto look_for_tag; 1102 } 1103 1104 /* By now, we have a valid tag with a name and zero or more 1105 attributes. Fill in the data and call the mapper function. */ 1106 { 1107 int i; 1108 struct taginfo taginfo; 1109 struct tagstack_item *ts = NULL; 1110 1111 taginfo.name = pool.contents; 1112 taginfo.end_tag_p = end_tag; 1113 taginfo.nattrs = nattrs; 1114 /* We fill in the char pointers only now, when pool can no 1115 longer get realloc'ed. If we did that above, we could get 1116 hosed by reallocation. Obviously, after this point, the pool 1117 may no longer be grown. */ 1118 for (i = 0; i < nattrs; i++) 1119 { 1120 pairs[i].name = pool.contents + pairs[i].name_pool_index; 1121 pairs[i].value = pool.contents + pairs[i].value_pool_index; 1122 } 1123 taginfo.attrs = pairs; 1124 taginfo.start_position = tag_start_position; 1125 taginfo.end_position = p + 1; 1126 taginfo.contents_begin = NULL; 1127 taginfo.contents_end = NULL; 1128 1129 if (end_tag) 1130 { 1131 ts = tagstack_find (tail, tag_name_begin, tag_name_end); 1132 if (ts) 1133 { 1134 if (ts->contents_begin) 1135 { 1136 taginfo.contents_begin = ts->contents_begin; 1137 taginfo.contents_end = tag_start_position; 1138 } 1139 tagstack_pop (&head, &tail, ts); 1140 } 1141 } 1142 1143 mapfun (&taginfo, maparg); 1144 if (*p != '<') 1145 ADVANCE (p); 1146 } 1147 goto look_for_tag; 1148 1149 backout_tag: 1150#ifdef STANDALONE 1151 ++tag_backout_count; 1152#endif 1153 /* The tag wasn't really a tag. Treat its contents as ordinary 1154 data characters. */ 1155 p = tag_start_position + 1; 1156 goto look_for_tag; 1157 } 1158 1159 finish: 1160 POOL_FREE (&pool); 1161 if (attr_pair_resized) 1162 xfree (pairs); 1163 /* pop any tag stack that's left */ 1164 tagstack_pop (&head, &tail, head); 1165} 1166 1167#undef ADVANCE 1168#undef SKIP_WS 1169#undef SKIP_NON_WS 1170 1171#ifdef STANDALONE 1172static void 1173test_mapper (struct taginfo *taginfo, void *arg) 1174{ 1175 int i; 1176 1177 printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name); 1178 for (i = 0; i < taginfo->nattrs; i++) 1179 printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value); 1180 putchar ('\n'); 1181 ++*(int *)arg; 1182} 1183 1184int main () 1185{ 1186 int size = 256; 1187 char *x = xmalloc (size); 1188 int length = 0; 1189 int read_count; 1190 int tag_counter = 0; 1191 1192#ifdef ENABLE_NLS 1193 /* Set the current locale. */ 1194 setlocale (LC_ALL, ""); 1195 /* Set the text message domain. */ 1196 bindtextdomain ("wget", LOCALEDIR); 1197 textdomain ("wget"); 1198#endif /* ENABLE_NLS */ 1199 1200 while ((read_count = fread (x + length, 1, size - length, stdin))) 1201 { 1202 length += read_count; 1203 size <<= 1; 1204 x = xrealloc (x, size); 1205 } 1206 1207 map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL); 1208 printf ("TAGS: %d\n", tag_counter); 1209 printf ("Tag backouts: %d\n", tag_backout_count); 1210 printf ("Comment backouts: %d\n", comment_backout_count); 1211 return 0; 1212} 1213#endif /* STANDALONE */ 1214