1169695Skan/* HTML parser for Wget. 2169695Skan Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 3169695Skan 2007, 2008, 2009 Free Software Foundation, Inc. 4169695Skan 5169695SkanThis file is part of GNU Wget. 6169695Skan 7169695SkanGNU Wget is free software; you can redistribute it and/or modify 8169695Skanit under the terms of the GNU General Public License as published by 9169695Skanthe Free Software Foundation; either version 3 of the License, or (at 10169695Skanyour option) any later version. 11169695Skan 12169695SkanGNU Wget is distributed in the hope that it will be useful, 13169695Skanbut WITHOUT ANY WARRANTY; without even the implied warranty of 14169695SkanMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15169695SkanGNU General Public License for more details. 16169695Skan 17169695SkanYou should have received a copy of the GNU General Public License 18169695Skanalong with Wget. If not, see <http://www.gnu.org/licenses/>. 19169695Skan 20169695SkanAdditional permission under GNU GPL version 3 section 7 21169695Skan 22169695SkanIf you modify this program, or any covered work, by linking or 23169695Skancombining it with the OpenSSL project's OpenSSL library (or a 24169695Skanmodified version of that library), containing parts covered by the 25169695Skanterms of the OpenSSL or SSLeay licenses, the Free Software Foundation 26169695Skangrants you additional permission to convey the resulting work. 27169695SkanCorresponding Source for a non-source form of such a combination 28169695Skanshall include the source code for the parts of OpenSSL used as well 29169695Skanas that of the covered work. */ 30169695Skan 31169695Skan/* The only entry point to this module is map_html_tags(), which see. */ 32169695Skan 33169695Skan/* TODO: 34169695Skan 35169695Skan - Allow hooks for callers to process contents outside tags. This 36169695Skan is needed to implement handling <style> and <script>. The 37169695Skan taginfo structure already carries the information about where the 38169695Skan tags are, but this is not enough, because one would also want to 39169695Skan skip the comments. (The funny thing is that for <style> and 40169695Skan <script> you *don't* want to skip comments!) 41169695Skan 42169695Skan - Create a test suite for regression testing. */ 43169695Skan 44169695Skan/* HISTORY: 45169695Skan 46169695Skan This is the third HTML parser written for Wget. The first one was 47169695Skan written some time during the Geturl 1.0 beta cycle, and was very 48169695Skan inefficient and buggy. It also contained some very complex code to 49169695Skan remember a list of parser states, because it was supposed to be 50169695Skan reentrant. 51169695Skan 52169695Skan The second HTML parser was written for Wget 1.4 (the first version 53169695Skan by the name `Wget'), and was a complete rewrite. Although the new 54169695Skan parser behaved much better and made no claims of reentrancy, it 55169695Skan still shared many of the fundamental flaws of the old version -- it 56169695Skan only regarded HTML in terms tag-attribute pairs, where the 57169695Skan attribute's value was a URL to be returned. Any other property of 58169695Skan HTML, such as <base href=...>, or strange way to specify a URL, 59169695Skan such as <meta http-equiv=Refresh content="0; URL=..."> had to be 60169695Skan crudely hacked in -- and the caller had to be aware of these hacks. 61169695Skan Like its predecessor, this parser did not support HTML comments. 62169695Skan 63169695Skan After Wget 1.5.1 was released, I set out to write a third HTML 64169695Skan parser. The objectives of the new parser were to: (1) provide a 65169695Skan clean way to analyze HTML lexically, (2) separate interpretation of 66169695Skan the markup from the parsing process, (3) be as correct as possible, 67169695Skan e.g. correctly skipping comments and other SGML declarations, (4) 68169695Skan understand the most common errors in markup and skip them or be 69169695Skan relaxed towrds them, and (5) be reasonably efficient (no regexps, 70169695Skan minimum copying and minimum or no heap allocation). 71169695Skan 72169695Skan I believe this parser meets all of the above goals. It is 73169695Skan reasonably well structured, and could be relatively easily 74169695Skan separated from Wget and used elsewhere. While some of its 75169695Skan intrinsic properties limit its value as a general-purpose HTML 76169695Skan parser, I believe that, with minimum modifications, it could serve 77169695Skan as a backend for one. 78169695Skan 79169695Skan Due to time and other constraints, this parser was not integrated 80169695Skan into Wget until the version 1.7. */ 81169695Skan 82169695Skan/* DESCRIPTION: 83169695Skan 84169695Skan The single entry point of this parser is map_html_tags(), which 85169695Skan works by calling a function you specify for each tag. The function 86169695Skan gets called with the pointer to a structure describing the tag and 87169695Skan its attributes. */ 88169695Skan 89169695Skan/* To test as standalone, compile with `-DSTANDALONE -I.'. You'll 90169695Skan still need Wget headers to compile. */ 91169695Skan 92169695Skan#include "wget.h" 93169695Skan 94169695Skan#ifdef STANDALONE 95169695Skan# define I_REALLY_WANT_CTYPE_MACROS 96169695Skan#endif 97169695Skan 98169695Skan#include <stdio.h> 99169695Skan#include <stdlib.h> 100169695Skan#include <string.h> 101169695Skan#include <assert.h> 102169695Skan 103169695Skan#include "utils.h" 104169695Skan#include "html-parse.h" 105169695Skan 106169695Skan#ifdef STANDALONE 107169695Skan# undef xmalloc 108169695Skan# undef xrealloc 109169695Skan# undef xfree 110169695Skan# define xmalloc malloc 111169695Skan# define xrealloc realloc 112169695Skan# define xfree free 113 114# undef c_isspace 115# undef c_isdigit 116# undef c_isxdigit 117# undef c_isalpha 118# undef c_isalnum 119# undef c_tolower 120# undef c_toupper 121 122# define c_isspace(x) isspace (x) 123# define c_isdigit(x) isdigit (x) 124# define c_isxdigit(x) isxdigit (x) 125# define c_isalpha(x) isalpha (x) 126# define c_isalnum(x) isalnum (x) 127# define c_tolower(x) tolower (x) 128# define c_toupper(x) toupper (x) 129 130struct hash_table { 131 int dummy; 132}; 133static void * 134hash_table_get (const struct hash_table *ht, void *ptr) 135{ 136 return ptr; 137} 138#else /* not STANDALONE */ 139# include "hash.h" 140#endif 141 142/* Pool support. A pool is a resizable chunk of memory. It is first 143 allocated on the stack, and moved to the heap if it needs to be 144 larger than originally expected. map_html_tags() uses it to store 145 the zero-terminated names and values of tags and attributes. 146 147 Thus taginfo->name, and attr->name and attr->value for each 148 attribute, do not point into separately allocated areas, but into 149 different parts of the pool, separated only by terminating zeros. 150 This ensures minimum amount of allocation and, for most tags, no 151 allocation because the entire pool is kept on the stack. */ 152 153struct pool { 154 char *contents; /* pointer to the contents. */ 155 int size; /* size of the pool. */ 156 int tail; /* next available position index. */ 157 bool resized; /* whether the pool has been resized 158 using malloc. */ 159 160 char *orig_contents; /* original pool contents, usually 161 stack-allocated. used by POOL_FREE 162 to restore the pool to the initial 163 state. */ 164 int orig_size; 165}; 166 167/* Initialize the pool to hold INITIAL_SIZE bytes of storage. */ 168 169#define POOL_INIT(p, initial_storage, initial_size) do { \ 170 struct pool *P = (p); \ 171 P->contents = (initial_storage); \ 172 P->size = (initial_size); \ 173 P->tail = 0; \ 174 P->resized = false; \ 175 P->orig_contents = P->contents; \ 176 P->orig_size = P->size; \ 177} while (0) 178 179/* Grow the pool to accomodate at least SIZE new bytes. If the pool 180 already has room to accomodate SIZE bytes of data, this is a no-op. */ 181 182#define POOL_GROW(p, increase) \ 183 GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \ 184 (p)->resized, char) 185 186/* Append text in the range [beg, end) to POOL. No zero-termination 187 is done. */ 188 189#define POOL_APPEND(p, beg, end) do { \ 190 const char *PA_beg = (beg); \ 191 int PA_size = (end) - PA_beg; \ 192 POOL_GROW (p, PA_size); \ 193 memcpy ((p)->contents + (p)->tail, PA_beg, PA_size); \ 194 (p)->tail += PA_size; \ 195} while (0) 196 197/* Append one character to the pool. Can be used to zero-terminate 198 pool strings. */ 199 200#define POOL_APPEND_CHR(p, ch) do { \ 201 char PAC_char = (ch); \ 202 POOL_GROW (p, 1); \ 203 (p)->contents[(p)->tail++] = PAC_char; \ 204} while (0) 205 206/* Forget old pool contents. The allocated memory is not freed. */ 207#define POOL_REWIND(p) (p)->tail = 0 208 209/* Free heap-allocated memory for contents of POOL. This calls 210 xfree() if the memory was allocated through malloc. It also 211 restores `contents' and `size' to their original, pre-malloc 212 values. That way after POOL_FREE, the pool is fully usable, just 213 as if it were freshly initialized with POOL_INIT. */ 214 215#define POOL_FREE(p) do { \ 216 struct pool *P = p; \ 217 if (P->resized) \ 218 xfree (P->contents); \ 219 P->contents = P->orig_contents; \ 220 P->size = P->orig_size; \ 221 P->tail = 0; \ 222 P->resized = false; \ 223} while (0) 224 225/* Used for small stack-allocated memory chunks that might grow. Like 226 DO_REALLOC, this macro grows BASEVAR as necessary to take 227 NEEDED_SIZE items of TYPE. 228 229 The difference is that on the first resize, it will use 230 malloc+memcpy rather than realloc. That way you can stack-allocate 231 the initial chunk, and only resort to heap allocation if you 232 stumble upon large data. 233 234 After the first resize, subsequent ones are performed with realloc, 235 just like DO_REALLOC. */ 236 237#define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do { \ 238 long ga_needed_size = (needed_size); \ 239 long ga_newsize = (sizevar); \ 240 while (ga_newsize < ga_needed_size) \ 241 ga_newsize <<= 1; \ 242 if (ga_newsize != (sizevar)) \ 243 { \ 244 if (resized) \ 245 basevar = xrealloc (basevar, ga_newsize * sizeof (type)); \ 246 else \ 247 { \ 248 void *ga_new = xmalloc (ga_newsize * sizeof (type)); \ 249 memcpy (ga_new, basevar, (sizevar) * sizeof (type)); \ 250 (basevar) = ga_new; \ 251 resized = true; \ 252 } \ 253 (sizevar) = ga_newsize; \ 254 } \ 255} while (0) 256 257/* Test whether n+1-sized entity name fits in P. We don't support 258 IE-style non-terminated entities, e.g. "<foo" -> "<foo". 259 However, "<foo" will work, as will "<!foo", "<", etc. In 260 other words an entity needs to be terminated by either a 261 non-alphanumeric or the end of string. */ 262#define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n]))) 263 264/* Macros that test entity names by returning true if P is followed by 265 the specified characters. */ 266#define ENT1(p, c0) (FITS (p, 1) && p[0] == c0) 267#define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1) 268#define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2) 269 270/* Increment P by INC chars. If P lands at a semicolon, increment it 271 past the semicolon. This ensures that e.g. "<foo" is converted 272 to "<foo", but "<,foo" to "<,foo". */ 273#define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p) 274 275struct tagstack_item { 276 const char *tagname_begin; 277 const char *tagname_end; 278 const char *contents_begin; 279 struct tagstack_item *prev; 280 struct tagstack_item *next; 281}; 282 283struct tagstack_item * 284tagstack_push (struct tagstack_item **head, struct tagstack_item **tail) 285{ 286 struct tagstack_item *ts = xmalloc(sizeof(struct tagstack_item)); 287 if (*head == NULL) 288 { 289 *head = *tail = ts; 290 ts->prev = ts->next = NULL; 291 } 292 else 293 { 294 (*tail)->next = ts; 295 ts->prev = *tail; 296 *tail = ts; 297 ts->next = NULL; 298 } 299 300 return ts; 301} 302 303/* remove ts and everything after it from the stack */ 304void 305tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail, 306 struct tagstack_item *ts) 307{ 308 if (*head == NULL) 309 return; 310 311 if (ts == *tail) 312 { 313 if (ts == *head) 314 { 315 xfree (ts); 316 *head = *tail = NULL; 317 } 318 else 319 { 320 ts->prev->next = NULL; 321 *tail = ts->prev; 322 xfree (ts); 323 } 324 } 325 else 326 { 327 if (ts == *head) 328 { 329 *head = NULL; 330 } 331 *tail = ts->prev; 332 333 if (ts->prev) 334 { 335 ts->prev->next = NULL; 336 } 337 while (ts) 338 { 339 struct tagstack_item *p = ts->next; 340 xfree (ts); 341 ts = p; 342 } 343 } 344} 345 346struct tagstack_item * 347tagstack_find (struct tagstack_item *tail, const char *tagname_begin, 348 const char *tagname_end) 349{ 350 int len = tagname_end - tagname_begin; 351 while (tail) 352 { 353 if (len == (tail->tagname_end - tail->tagname_begin)) 354 { 355 if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len)) 356 return tail; 357 } 358 tail = tail->prev; 359 } 360 return NULL; 361} 362 363/* Decode the HTML character entity at *PTR, considering END to be end 364 of buffer. It is assumed that the "&" character that marks the 365 beginning of the entity has been seen at *PTR-1. If a recognized 366 ASCII entity is seen, it is returned, and *PTR is moved to the end 367 of the entity. Otherwise, -1 is returned and *PTR left unmodified. 368 369 The recognized entities are: <, >, &, &apos, and ". */ 370 371static int 372decode_entity (const char **ptr, const char *end) 373{ 374 const char *p = *ptr; 375 int value = -1; 376 377 if (++p == end) 378 return -1; 379 380 switch (*p++) 381 { 382 case '#': 383 /* Process numeric entities "&#DDD;" and "&#xHH;". */ 384 { 385 int digits = 0; 386 value = 0; 387 if (*p == 'x') 388 for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++) 389 value = (value << 4) + XDIGIT_TO_NUM (*p); 390 else 391 for (; value < 256 && p < end && c_isdigit (*p); p++, digits++) 392 value = (value * 10) + (*p - '0'); 393 if (!digits) 394 return -1; 395 /* Don't interpret 128+ codes and NUL because we cannot 396 portably reinserted them into HTML. */ 397 if (!value || (value & ~0x7f)) 398 return -1; 399 *ptr = SKIP_SEMI (p, 0); 400 return value; 401 } 402 /* Process named ASCII entities. */ 403 case 'g': 404 if (ENT1 (p, 't')) 405 value = '>', *ptr = SKIP_SEMI (p, 1); 406 break; 407 case 'l': 408 if (ENT1 (p, 't')) 409 value = '<', *ptr = SKIP_SEMI (p, 1); 410 break; 411 case 'a': 412 if (ENT2 (p, 'm', 'p')) 413 value = '&', *ptr = SKIP_SEMI (p, 2); 414 else if (ENT3 (p, 'p', 'o', 's')) 415 /* handle &apos for the sake of the XML/XHTML crowd. */ 416 value = '\'', *ptr = SKIP_SEMI (p, 3); 417 break; 418 case 'q': 419 if (ENT3 (p, 'u', 'o', 't')) 420 value = '\"', *ptr = SKIP_SEMI (p, 3); 421 break; 422 } 423 return value; 424} 425#undef ENT1 426#undef ENT2 427#undef ENT3 428#undef FITS 429#undef SKIP_SEMI 430 431enum { 432 AP_DOWNCASE = 1, 433 AP_DECODE_ENTITIES = 2, 434 AP_TRIM_BLANKS = 4 435}; 436 437/* Copy the text in the range [BEG, END) to POOL, optionally 438 performing operations specified by FLAGS. FLAGS may be any 439 combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS 440 with the following meaning: 441 442 * AP_DOWNCASE -- downcase all the letters; 443 444 * AP_DECODE_ENTITIES -- decode the named and numeric entities in 445 the ASCII range when copying the string. 446 447 * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end 448 of text, as well as embedded newlines. */ 449 450static void 451convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags) 452{ 453 int old_tail = pool->tail; 454 455 /* Skip blanks if required. We must do this before entities are 456 processed, so that blanks can still be inserted as, for instance, 457 ` '. */ 458 if (flags & AP_TRIM_BLANKS) 459 { 460 while (beg < end && c_isspace (*beg)) 461 ++beg; 462 while (end > beg && c_isspace (end[-1])) 463 --end; 464 } 465 466 if (flags & AP_DECODE_ENTITIES) 467 { 468 /* Grow the pool, then copy the text to the pool character by 469 character, processing the encountered entities as we go 470 along. 471 472 It's safe (and necessary) to grow the pool in advance because 473 processing the entities can only *shorten* the string, it can 474 never lengthen it. */ 475 const char *from = beg; 476 char *to; 477 bool squash_newlines = !!(flags & AP_TRIM_BLANKS); 478 479 POOL_GROW (pool, end - beg); 480 to = pool->contents + pool->tail; 481 482 while (from < end) 483 { 484 if (*from == '&') 485 { 486 int entity = decode_entity (&from, end); 487 if (entity != -1) 488 *to++ = entity; 489 else 490 *to++ = *from++; 491 } 492 else if ((*from == '\n' || *from == '\r') && squash_newlines) 493 ++from; 494 else 495 *to++ = *from++; 496 } 497 /* Verify that we haven't exceeded the original size. (It 498 shouldn't happen, hence the assert.) */ 499 assert (to - (pool->contents + pool->tail) <= end - beg); 500 501 /* Make POOL's tail point to the position following the string 502 we've written. */ 503 pool->tail = to - pool->contents; 504 POOL_APPEND_CHR (pool, '\0'); 505 } 506 else 507 { 508 /* Just copy the text to the pool. */ 509 POOL_APPEND (pool, beg, end); 510 POOL_APPEND_CHR (pool, '\0'); 511 } 512 513 if (flags & AP_DOWNCASE) 514 { 515 char *p = pool->contents + old_tail; 516 for (; *p; p++) 517 *p = c_tolower (*p); 518 } 519} 520 521/* Originally we used to adhere to rfc 1866 here, and allowed only 522 letters, digits, periods, and hyphens as names (of tags or 523 attributes). However, this broke too many pages which used 524 proprietary or strange attributes, e.g. <img src="a.gif" 525 v:shapes="whatever">. 526 527 So now we allow any character except: 528 * whitespace 529 * 8-bit and control chars 530 * characters that clearly cannot be part of name: 531 '=', '>', '/'. 532 533 This only affects attribute and tag names; attribute values allow 534 an even greater variety of characters. */ 535 536#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127 \ 537 && (x) != '=' && (x) != '>' && (x) != '/') 538 539#ifdef STANDALONE 540static int comment_backout_count; 541#endif 542 543/* Advance over an SGML declaration, such as <!DOCTYPE ...>. In 544 strict comments mode, this is used for skipping over comments as 545 well. 546 547 To recap: any SGML declaration may have comments associated with 548 it, e.g. 549 <!MY-DECL -- isn't this fun? -- foo bar> 550 551 An HTML comment is merely an empty declaration (<!>) with a comment 552 attached, like this: 553 <!-- some stuff here --> 554 555 Several comments may be embedded in one comment declaration: 556 <!-- have -- -- fun --> 557 558 Whitespace is allowed between and after the comments, but not 559 before the first comment. Additionally, this function attempts to 560 handle double quotes in SGML declarations correctly. */ 561 562static const char * 563advance_declaration (const char *beg, const char *end) 564{ 565 const char *p = beg; 566 char quote_char = '\0'; /* shut up, gcc! */ 567 char ch; 568 569 enum { 570 AC_S_DONE, 571 AC_S_BACKOUT, 572 AC_S_BANG, 573 AC_S_DEFAULT, 574 AC_S_DCLNAME, 575 AC_S_DASH1, 576 AC_S_DASH2, 577 AC_S_COMMENT, 578 AC_S_DASH3, 579 AC_S_DASH4, 580 AC_S_QUOTE1, 581 AC_S_IN_QUOTE, 582 AC_S_QUOTE2 583 } state = AC_S_BANG; 584 585 if (beg == end) 586 return beg; 587 ch = *p++; 588 589 /* It looked like a good idea to write this as a state machine, but 590 now I wonder... */ 591 592 while (state != AC_S_DONE && state != AC_S_BACKOUT) 593 { 594 if (p == end) 595 state = AC_S_BACKOUT; 596 switch (state) 597 { 598 case AC_S_DONE: 599 case AC_S_BACKOUT: 600 break; 601 case AC_S_BANG: 602 if (ch == '!') 603 { 604 ch = *p++; 605 state = AC_S_DEFAULT; 606 } 607 else 608 state = AC_S_BACKOUT; 609 break; 610 case AC_S_DEFAULT: 611 switch (ch) 612 { 613 case '-': 614 state = AC_S_DASH1; 615 break; 616 case ' ': 617 case '\t': 618 case '\r': 619 case '\n': 620 ch = *p++; 621 break; 622 case '>': 623 state = AC_S_DONE; 624 break; 625 case '\'': 626 case '\"': 627 state = AC_S_QUOTE1; 628 break; 629 default: 630 if (NAME_CHAR_P (ch)) 631 state = AC_S_DCLNAME; 632 else 633 state = AC_S_BACKOUT; 634 break; 635 } 636 break; 637 case AC_S_DCLNAME: 638 if (ch == '-') 639 state = AC_S_DASH1; 640 else if (NAME_CHAR_P (ch)) 641 ch = *p++; 642 else 643 state = AC_S_DEFAULT; 644 break; 645 case AC_S_QUOTE1: 646 /* We must use 0x22 because broken assert macros choke on 647 '"' and '\"'. */ 648 assert (ch == '\'' || ch == 0x22); 649 quote_char = ch; /* cheating -- I really don't feel like 650 introducing more different states for 651 different quote characters. */ 652 ch = *p++; 653 state = AC_S_IN_QUOTE; 654 break; 655 case AC_S_IN_QUOTE: 656 if (ch == quote_char) 657 state = AC_S_QUOTE2; 658 else 659 ch = *p++; 660 break; 661 case AC_S_QUOTE2: 662 assert (ch == quote_char); 663 ch = *p++; 664 state = AC_S_DEFAULT; 665 break; 666 case AC_S_DASH1: 667 assert (ch == '-'); 668 ch = *p++; 669 state = AC_S_DASH2; 670 break; 671 case AC_S_DASH2: 672 switch (ch) 673 { 674 case '-': 675 ch = *p++; 676 state = AC_S_COMMENT; 677 break; 678 default: 679 state = AC_S_BACKOUT; 680 } 681 break; 682 case AC_S_COMMENT: 683 switch (ch) 684 { 685 case '-': 686 state = AC_S_DASH3; 687 break; 688 default: 689 ch = *p++; 690 break; 691 } 692 break; 693 case AC_S_DASH3: 694 assert (ch == '-'); 695 ch = *p++; 696 state = AC_S_DASH4; 697 break; 698 case AC_S_DASH4: 699 switch (ch) 700 { 701 case '-': 702 ch = *p++; 703 state = AC_S_DEFAULT; 704 break; 705 default: 706 state = AC_S_COMMENT; 707 break; 708 } 709 break; 710 } 711 } 712 713 if (state == AC_S_BACKOUT) 714 { 715#ifdef STANDALONE 716 ++comment_backout_count; 717#endif 718 return beg + 1; 719 } 720 return p; 721} 722 723/* Find the first occurrence of the substring "-->" in [BEG, END) and 724 return the pointer to the character after the substring. If the 725 substring is not found, return NULL. */ 726 727static const char * 728find_comment_end (const char *beg, const char *end) 729{ 730 /* Open-coded Boyer-Moore search for "-->". Examine the third char; 731 if it's not '>' or '-', advance by three characters. Otherwise, 732 look at the preceding characters and try to find a match. */ 733 734 const char *p = beg - 1; 735 736 while ((p += 3) < end) 737 switch (p[0]) 738 { 739 case '>': 740 if (p[-1] == '-' && p[-2] == '-') 741 return p + 1; 742 break; 743 case '-': 744 at_dash: 745 if (p[-1] == '-') 746 { 747 at_dash_dash: 748 if (++p == end) return NULL; 749 switch (p[0]) 750 { 751 case '>': return p + 1; 752 case '-': goto at_dash_dash; 753 } 754 } 755 else 756 { 757 if ((p += 2) >= end) return NULL; 758 switch (p[0]) 759 { 760 case '>': 761 if (p[-1] == '-') 762 return p + 1; 763 break; 764 case '-': 765 goto at_dash; 766 } 767 } 768 } 769 return NULL; 770} 771 772/* Return true if the string containing of characters inside [b, e) is 773 present in hash table HT. */ 774 775static bool 776name_allowed (const struct hash_table *ht, const char *b, const char *e) 777{ 778 char *copy; 779 if (!ht) 780 return true; 781 BOUNDED_TO_ALLOCA (b, e, copy); 782 return hash_table_get (ht, copy) != NULL; 783} 784 785/* Advance P (a char pointer), with the explicit intent of being able 786 to read the next character. If this is not possible, go to finish. */ 787 788#define ADVANCE(p) do { \ 789 ++p; \ 790 if (p >= end) \ 791 goto finish; \ 792} while (0) 793 794/* Skip whitespace, if any. */ 795 796#define SKIP_WS(p) do { \ 797 while (c_isspace (*p)) { \ 798 ADVANCE (p); \ 799 } \ 800} while (0) 801 802/* Skip non-whitespace, if any. */ 803 804#define SKIP_NON_WS(p) do { \ 805 while (!c_isspace (*p)) { \ 806 ADVANCE (p); \ 807 } \ 808} while (0) 809 810#ifdef STANDALONE 811static int tag_backout_count; 812#endif 813 814/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long. 815 MAPFUN will be called with two arguments: pointer to an initialized 816 struct taginfo, and MAPARG. 817 818 ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of 819 which are the tags and attribute names that this function should 820 use. If ALLOWED_TAGS is NULL, all tags are processed; if 821 ALLOWED_ATTRIBUTES is NULL, all attributes are returned. 822 823 (Obviously, the caller can filter out unwanted tags and attributes 824 just as well, but this is just an optimization designed to avoid 825 unnecessary copying of tags/attributes which the caller doesn't 826 care about.) */ 827 828void 829map_html_tags (const char *text, int size, 830 void (*mapfun) (struct taginfo *, void *), void *maparg, 831 int flags, 832 const struct hash_table *allowed_tags, 833 const struct hash_table *allowed_attributes) 834{ 835 /* storage for strings passed to MAPFUN callback; if 256 bytes is 836 too little, POOL_APPEND allocates more with malloc. */ 837 char pool_initial_storage[256]; 838 struct pool pool; 839 840 const char *p = text; 841 const char *end = text + size; 842 843 struct attr_pair attr_pair_initial_storage[8]; 844 int attr_pair_size = countof (attr_pair_initial_storage); 845 bool attr_pair_resized = false; 846 struct attr_pair *pairs = attr_pair_initial_storage; 847 848 struct tagstack_item *head = NULL; 849 struct tagstack_item *tail = NULL; 850 851 if (!size) 852 return; 853 854 POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage)); 855 856 { 857 int nattrs, end_tag; 858 const char *tag_name_begin, *tag_name_end; 859 const char *tag_start_position; 860 bool uninteresting_tag; 861 862 look_for_tag: 863 POOL_REWIND (&pool); 864 865 nattrs = 0; 866 end_tag = 0; 867 868 /* Find beginning of tag. We use memchr() instead of the usual 869 looping with ADVANCE() for speed. */ 870 p = memchr (p, '<', end - p); 871 if (!p) 872 goto finish; 873 874 tag_start_position = p; 875 ADVANCE (p); 876 877 /* Establish the type of the tag (start-tag, end-tag or 878 declaration). */ 879 if (*p == '!') 880 { 881 if (!(flags & MHT_STRICT_COMMENTS) 882 && p < end + 3 && p[1] == '-' && p[2] == '-') 883 { 884 /* If strict comments are not enforced and if we know 885 we're looking at a comment, simply look for the 886 terminating "-->". Non-strict is the default because 887 it works in other browsers and most HTML writers can't 888 be bothered with getting the comments right. */ 889 const char *comment_end = find_comment_end (p + 3, end); 890 if (comment_end) 891 p = comment_end; 892 } 893 else 894 { 895 /* Either in strict comment mode or looking at a non-empty 896 declaration. Real declarations are much less likely to 897 be misused the way comments are, so advance over them 898 properly regardless of strictness. */ 899 p = advance_declaration (p, end); 900 } 901 if (p == end) 902 goto finish; 903 goto look_for_tag; 904 } 905 else if (*p == '/') 906 { 907 end_tag = 1; 908 ADVANCE (p); 909 } 910 tag_name_begin = p; 911 while (NAME_CHAR_P (*p)) 912 ADVANCE (p); 913 if (p == tag_name_begin) 914 goto look_for_tag; 915 tag_name_end = p; 916 SKIP_WS (p); 917 918 if (!end_tag) 919 { 920 struct tagstack_item *ts = tagstack_push (&head, &tail); 921 if (ts) 922 { 923 ts->tagname_begin = tag_name_begin; 924 ts->tagname_end = tag_name_end; 925 ts->contents_begin = NULL; 926 } 927 } 928 929 if (end_tag && *p != '>') 930 goto backout_tag; 931 932 if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end)) 933 /* We can't just say "goto look_for_tag" here because we need 934 the loop below to properly advance over the tag's attributes. */ 935 uninteresting_tag = true; 936 else 937 { 938 uninteresting_tag = false; 939 convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE); 940 } 941 942 /* Find the attributes. */ 943 while (1) 944 { 945 const char *attr_name_begin, *attr_name_end; 946 const char *attr_value_begin, *attr_value_end; 947 const char *attr_raw_value_begin, *attr_raw_value_end; 948 int operation = AP_DOWNCASE; /* stupid compiler. */ 949 950 SKIP_WS (p); 951 952 if (*p == '/') 953 { 954 /* A slash at this point means the tag is about to be 955 closed. This is legal in XML and has been popularized 956 in HTML via XHTML. */ 957 /* <foo a=b c=d /> */ 958 /* ^ */ 959 ADVANCE (p); 960 SKIP_WS (p); 961 if (*p != '>') 962 goto backout_tag; 963 } 964 965 /* Check for end of tag definition. */ 966 if (*p == '>') 967 break; 968 969 /* Establish bounds of attribute name. */ 970 attr_name_begin = p; /* <foo bar ...> */ 971 /* ^ */ 972 while (NAME_CHAR_P (*p)) 973 ADVANCE (p); 974 attr_name_end = p; /* <foo bar ...> */ 975 /* ^ */ 976 if (attr_name_begin == attr_name_end) 977 goto backout_tag; 978 979 /* Establish bounds of attribute value. */ 980 SKIP_WS (p); 981 if (NAME_CHAR_P (*p) || *p == '/' || *p == '>') 982 { 983 /* Minimized attribute syntax allows `=' to be omitted. 984 For example, <UL COMPACT> is a valid shorthand for <UL 985 COMPACT="compact">. Even if such attributes are not 986 useful to Wget, we need to support them, so that the 987 tags containing them can be parsed correctly. */ 988 attr_raw_value_begin = attr_value_begin = attr_name_begin; 989 attr_raw_value_end = attr_value_end = attr_name_end; 990 } 991 else if (*p == '=') 992 { 993 ADVANCE (p); 994 SKIP_WS (p); 995 if (*p == '\"' || *p == '\'') 996 { 997 bool newline_seen = false; 998 char quote_char = *p; 999 attr_raw_value_begin = p; 1000 ADVANCE (p); 1001 attr_value_begin = p; /* <foo bar="baz"> */ 1002 /* ^ */ 1003 while (*p != quote_char) 1004 { 1005 if (!newline_seen && *p == '\n') 1006 { 1007 /* If a newline is seen within the quotes, it 1008 is most likely that someone forgot to close 1009 the quote. In that case, we back out to 1010 the value beginning, and terminate the tag 1011 at either `>' or the delimiter, whichever 1012 comes first. Such a tag terminated at `>' 1013 is discarded. */ 1014 p = attr_value_begin; 1015 newline_seen = true; 1016 continue; 1017 } 1018 else if (newline_seen && *p == '>') 1019 break; 1020 ADVANCE (p); 1021 } 1022 attr_value_end = p; /* <foo bar="baz"> */ 1023 /* ^ */ 1024 if (*p == quote_char) 1025 ADVANCE (p); 1026 else 1027 goto look_for_tag; 1028 attr_raw_value_end = p; /* <foo bar="baz"> */ 1029 /* ^ */ 1030 operation = AP_DECODE_ENTITIES; 1031 if (flags & MHT_TRIM_VALUES) 1032 operation |= AP_TRIM_BLANKS; 1033 } 1034 else 1035 { 1036 attr_value_begin = p; /* <foo bar=baz> */ 1037 /* ^ */ 1038 /* According to SGML, a name token should consist only 1039 of alphanumerics, . and -. However, this is often 1040 violated by, for instance, `%' in `width=75%'. 1041 We'll be liberal and allow just about anything as 1042 an attribute value. */ 1043 while (!c_isspace (*p) && *p != '>') 1044 ADVANCE (p); 1045 attr_value_end = p; /* <foo bar=baz qux=quix> */ 1046 /* ^ */ 1047 if (attr_value_begin == attr_value_end) 1048 /* <foo bar=> */ 1049 /* ^ */ 1050 goto backout_tag; 1051 attr_raw_value_begin = attr_value_begin; 1052 attr_raw_value_end = attr_value_end; 1053 operation = AP_DECODE_ENTITIES; 1054 } 1055 } 1056 else 1057 { 1058 /* We skipped the whitespace and found something that is 1059 neither `=' nor the beginning of the next attribute's 1060 name. Back out. */ 1061 goto backout_tag; /* <foo bar [... */ 1062 /* ^ */ 1063 } 1064 1065 /* If we're not interested in the tag, don't bother with any 1066 of the attributes. */ 1067 if (uninteresting_tag) 1068 continue; 1069 1070 /* If we aren't interested in the attribute, skip it. We 1071 cannot do this test any sooner, because our text pointer 1072 needs to correctly advance over the attribute. */ 1073 if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end)) 1074 continue; 1075 1076 GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized, 1077 struct attr_pair); 1078 1079 pairs[nattrs].name_pool_index = pool.tail; 1080 convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE); 1081 1082 pairs[nattrs].value_pool_index = pool.tail; 1083 convert_and_copy (&pool, attr_value_begin, attr_value_end, operation); 1084 pairs[nattrs].value_raw_beginning = attr_raw_value_begin; 1085 pairs[nattrs].value_raw_size = (attr_raw_value_end 1086 - attr_raw_value_begin); 1087 ++nattrs; 1088 } 1089 1090 if (!end_tag && tail && (tail->tagname_begin == tag_name_begin)) 1091 { 1092 tail->contents_begin = p+1; 1093 } 1094 1095 if (uninteresting_tag) 1096 { 1097 ADVANCE (p); 1098 goto look_for_tag; 1099 } 1100 1101 /* By now, we have a valid tag with a name and zero or more 1102 attributes. Fill in the data and call the mapper function. */ 1103 { 1104 int i; 1105 struct taginfo taginfo; 1106 struct tagstack_item *ts = NULL; 1107 1108 taginfo.name = pool.contents; 1109 taginfo.end_tag_p = end_tag; 1110 taginfo.nattrs = nattrs; 1111 /* We fill in the char pointers only now, when pool can no 1112 longer get realloc'ed. If we did that above, we could get 1113 hosed by reallocation. Obviously, after this point, the pool 1114 may no longer be grown. */ 1115 for (i = 0; i < nattrs; i++) 1116 { 1117 pairs[i].name = pool.contents + pairs[i].name_pool_index; 1118 pairs[i].value = pool.contents + pairs[i].value_pool_index; 1119 } 1120 taginfo.attrs = pairs; 1121 taginfo.start_position = tag_start_position; 1122 taginfo.end_position = p + 1; 1123 taginfo.contents_begin = NULL; 1124 taginfo.contents_end = NULL; 1125 1126 if (end_tag) 1127 { 1128 ts = tagstack_find (tail, tag_name_begin, tag_name_end); 1129 if (ts) 1130 { 1131 if (ts->contents_begin) 1132 { 1133 taginfo.contents_begin = ts->contents_begin; 1134 taginfo.contents_end = tag_start_position; 1135 } 1136 tagstack_pop (&head, &tail, ts); 1137 } 1138 } 1139 1140 mapfun (&taginfo, maparg); 1141 ADVANCE (p); 1142 } 1143 goto look_for_tag; 1144 1145 backout_tag: 1146#ifdef STANDALONE 1147 ++tag_backout_count; 1148#endif 1149 /* The tag wasn't really a tag. Treat its contents as ordinary 1150 data characters. */ 1151 p = tag_start_position + 1; 1152 goto look_for_tag; 1153 } 1154 1155 finish: 1156 POOL_FREE (&pool); 1157 if (attr_pair_resized) 1158 xfree (pairs); 1159 /* pop any tag stack that's left */ 1160 tagstack_pop (&head, &tail, head); 1161} 1162 1163#undef ADVANCE 1164#undef SKIP_WS 1165#undef SKIP_NON_WS 1166 1167#ifdef STANDALONE 1168static void 1169test_mapper (struct taginfo *taginfo, void *arg) 1170{ 1171 int i; 1172 1173 printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name); 1174 for (i = 0; i < taginfo->nattrs; i++) 1175 printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value); 1176 putchar ('\n'); 1177 ++*(int *)arg; 1178} 1179 1180int main () 1181{ 1182 int size = 256; 1183 char *x = xmalloc (size); 1184 int length = 0; 1185 int read_count; 1186 int tag_counter = 0; 1187 1188 while ((read_count = fread (x + length, 1, size - length, stdin))) 1189 { 1190 length += read_count; 1191 size <<= 1; 1192 x = xrealloc (x, size); 1193 } 1194 1195 map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL); 1196 printf ("TAGS: %d\n", tag_counter); 1197 printf ("Tag backouts: %d\n", tag_backout_count); 1198 printf ("Comment backouts: %d\n", comment_backout_count); 1199 return 0; 1200} 1201#endif /* STANDALONE */ 1202