1169695Skan/* HTML parser for Wget.
2169695Skan   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
3169695Skan   2007, 2008, 2009 Free Software Foundation, Inc.
4169695Skan
5169695SkanThis file is part of GNU Wget.
6169695Skan
7169695SkanGNU Wget is free software; you can redistribute it and/or modify
8169695Skanit under the terms of the GNU General Public License as published by
9169695Skanthe Free Software Foundation; either version 3 of the License, or (at
10169695Skanyour option) any later version.
11169695Skan
12169695SkanGNU Wget is distributed in the hope that it will be useful,
13169695Skanbut WITHOUT ANY WARRANTY; without even the implied warranty of
14169695SkanMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15169695SkanGNU General Public License for more details.
16169695Skan
17169695SkanYou should have received a copy of the GNU General Public License
18169695Skanalong with Wget.  If not, see <http://www.gnu.org/licenses/>.
19169695Skan
20169695SkanAdditional permission under GNU GPL version 3 section 7
21169695Skan
22169695SkanIf you modify this program, or any covered work, by linking or
23169695Skancombining it with the OpenSSL project's OpenSSL library (or a
24169695Skanmodified version of that library), containing parts covered by the
25169695Skanterms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26169695Skangrants you additional permission to convey the resulting work.
27169695SkanCorresponding Source for a non-source form of such a combination
28169695Skanshall include the source code for the parts of OpenSSL used as well
29169695Skanas that of the covered work.  */
30169695Skan
31169695Skan/* The only entry point to this module is map_html_tags(), which see.  */
32169695Skan
33169695Skan/* TODO:
34169695Skan
35169695Skan   - Allow hooks for callers to process contents outside tags.  This
36169695Skan     is needed to implement handling <style> and <script>.  The
37169695Skan     taginfo structure already carries the information about where the
38169695Skan     tags are, but this is not enough, because one would also want to
39169695Skan     skip the comments.  (The funny thing is that for <style> and
40169695Skan     <script> you *don't* want to skip comments!)
41169695Skan
42169695Skan   - Create a test suite for regression testing. */
43169695Skan
44169695Skan/* HISTORY:
45169695Skan
46169695Skan   This is the third HTML parser written for Wget.  The first one was
47169695Skan   written some time during the Geturl 1.0 beta cycle, and was very
48169695Skan   inefficient and buggy.  It also contained some very complex code to
49169695Skan   remember a list of parser states, because it was supposed to be
50169695Skan   reentrant.
51169695Skan
52169695Skan   The second HTML parser was written for Wget 1.4 (the first version
53169695Skan   by the name `Wget'), and was a complete rewrite.  Although the new
54169695Skan   parser behaved much better and made no claims of reentrancy, it
55169695Skan   still shared many of the fundamental flaws of the old version -- it
56169695Skan   only regarded HTML in terms tag-attribute pairs, where the
57169695Skan   attribute's value was a URL to be returned.  Any other property of
58169695Skan   HTML, such as <base href=...>, or strange way to specify a URL,
59169695Skan   such as <meta http-equiv=Refresh content="0; URL=..."> had to be
60169695Skan   crudely hacked in -- and the caller had to be aware of these hacks.
61169695Skan   Like its predecessor, this parser did not support HTML comments.
62169695Skan
63169695Skan   After Wget 1.5.1 was released, I set out to write a third HTML
64169695Skan   parser.  The objectives of the new parser were to: (1) provide a
65169695Skan   clean way to analyze HTML lexically, (2) separate interpretation of
66169695Skan   the markup from the parsing process, (3) be as correct as possible,
67169695Skan   e.g. correctly skipping comments and other SGML declarations, (4)
68169695Skan   understand the most common errors in markup and skip them or be
69169695Skan   relaxed towrds them, and (5) be reasonably efficient (no regexps,
70169695Skan   minimum copying and minimum or no heap allocation).
71169695Skan
72169695Skan   I believe this parser meets all of the above goals.  It is
73169695Skan   reasonably well structured, and could be relatively easily
74169695Skan   separated from Wget and used elsewhere.  While some of its
75169695Skan   intrinsic properties limit its value as a general-purpose HTML
76169695Skan   parser, I believe that, with minimum modifications, it could serve
77169695Skan   as a backend for one.
78169695Skan
79169695Skan   Due to time and other constraints, this parser was not integrated
80169695Skan   into Wget until the version 1.7. */
81169695Skan
82169695Skan/* DESCRIPTION:
83169695Skan
84169695Skan   The single entry point of this parser is map_html_tags(), which
85169695Skan   works by calling a function you specify for each tag.  The function
86169695Skan   gets called with the pointer to a structure describing the tag and
87169695Skan   its attributes.  */
88169695Skan
89169695Skan/* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
90169695Skan   still need Wget headers to compile.  */
91169695Skan
92169695Skan#include "wget.h"
93169695Skan
94169695Skan#ifdef STANDALONE
95169695Skan# define I_REALLY_WANT_CTYPE_MACROS
96169695Skan#endif
97169695Skan
98169695Skan#include <stdio.h>
99169695Skan#include <stdlib.h>
100169695Skan#include <string.h>
101169695Skan#include <assert.h>
102169695Skan
103169695Skan#include "utils.h"
104169695Skan#include "html-parse.h"
105169695Skan
106169695Skan#ifdef STANDALONE
107169695Skan# undef xmalloc
108169695Skan# undef xrealloc
109169695Skan# undef xfree
110169695Skan# define xmalloc malloc
111169695Skan# define xrealloc realloc
112169695Skan# define xfree free
113
114# undef c_isspace
115# undef c_isdigit
116# undef c_isxdigit
117# undef c_isalpha
118# undef c_isalnum
119# undef c_tolower
120# undef c_toupper
121
122# define c_isspace(x) isspace (x)
123# define c_isdigit(x) isdigit (x)
124# define c_isxdigit(x) isxdigit (x)
125# define c_isalpha(x) isalpha (x)
126# define c_isalnum(x) isalnum (x)
127# define c_tolower(x) tolower (x)
128# define c_toupper(x) toupper (x)
129
130struct hash_table {
131  int dummy;
132};
133static void *
134hash_table_get (const struct hash_table *ht, void *ptr)
135{
136  return ptr;
137}
138#else  /* not STANDALONE */
139# include "hash.h"
140#endif
141
142/* Pool support.  A pool is a resizable chunk of memory.  It is first
143   allocated on the stack, and moved to the heap if it needs to be
144   larger than originally expected.  map_html_tags() uses it to store
145   the zero-terminated names and values of tags and attributes.
146
147   Thus taginfo->name, and attr->name and attr->value for each
148   attribute, do not point into separately allocated areas, but into
149   different parts of the pool, separated only by terminating zeros.
150   This ensures minimum amount of allocation and, for most tags, no
151   allocation because the entire pool is kept on the stack.  */
152
153struct pool {
154  char *contents;               /* pointer to the contents. */
155  int size;                     /* size of the pool. */
156  int tail;                     /* next available position index. */
157  bool resized;                 /* whether the pool has been resized
158                                   using malloc. */
159
160  char *orig_contents;          /* original pool contents, usually
161                                   stack-allocated.  used by POOL_FREE
162                                   to restore the pool to the initial
163                                   state. */
164  int orig_size;
165};
166
167/* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
168
169#define POOL_INIT(p, initial_storage, initial_size) do {        \
170  struct pool *P = (p);                                         \
171  P->contents = (initial_storage);                              \
172  P->size = (initial_size);                                     \
173  P->tail = 0;                                                  \
174  P->resized = false;                                           \
175  P->orig_contents = P->contents;                               \
176  P->orig_size = P->size;                                       \
177} while (0)
178
179/* Grow the pool to accomodate at least SIZE new bytes.  If the pool
180   already has room to accomodate SIZE bytes of data, this is a no-op.  */
181
182#define POOL_GROW(p, increase)                                  \
183  GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
184              (p)->resized, char)
185
186/* Append text in the range [beg, end) to POOL.  No zero-termination
187   is done.  */
188
189#define POOL_APPEND(p, beg, end) do {                   \
190  const char *PA_beg = (beg);                           \
191  int PA_size = (end) - PA_beg;                         \
192  POOL_GROW (p, PA_size);                               \
193  memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
194  (p)->tail += PA_size;                                 \
195} while (0)
196
197/* Append one character to the pool.  Can be used to zero-terminate
198   pool strings.  */
199
200#define POOL_APPEND_CHR(p, ch) do {             \
201  char PAC_char = (ch);                         \
202  POOL_GROW (p, 1);                             \
203  (p)->contents[(p)->tail++] = PAC_char;        \
204} while (0)
205
206/* Forget old pool contents.  The allocated memory is not freed. */
207#define POOL_REWIND(p) (p)->tail = 0
208
209/* Free heap-allocated memory for contents of POOL.  This calls
210   xfree() if the memory was allocated through malloc.  It also
211   restores `contents' and `size' to their original, pre-malloc
212   values.  That way after POOL_FREE, the pool is fully usable, just
213   as if it were freshly initialized with POOL_INIT.  */
214
215#define POOL_FREE(p) do {                       \
216  struct pool *P = p;                           \
217  if (P->resized)                               \
218    xfree (P->contents);                        \
219  P->contents = P->orig_contents;               \
220  P->size = P->orig_size;                       \
221  P->tail = 0;                                  \
222  P->resized = false;                           \
223} while (0)
224
225/* Used for small stack-allocated memory chunks that might grow.  Like
226   DO_REALLOC, this macro grows BASEVAR as necessary to take
227   NEEDED_SIZE items of TYPE.
228
229   The difference is that on the first resize, it will use
230   malloc+memcpy rather than realloc.  That way you can stack-allocate
231   the initial chunk, and only resort to heap allocation if you
232   stumble upon large data.
233
234   After the first resize, subsequent ones are performed with realloc,
235   just like DO_REALLOC.  */
236
237#define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
238  long ga_needed_size = (needed_size);                                          \
239  long ga_newsize = (sizevar);                                                  \
240  while (ga_newsize < ga_needed_size)                                           \
241    ga_newsize <<= 1;                                                           \
242  if (ga_newsize != (sizevar))                                                  \
243    {                                                                           \
244      if (resized)                                                              \
245        basevar = xrealloc (basevar, ga_newsize * sizeof (type));               \
246      else                                                                      \
247        {                                                                       \
248          void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
249          memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
250          (basevar) = ga_new;                                                   \
251          resized = true;                                                       \
252        }                                                                       \
253      (sizevar) = ga_newsize;                                                   \
254    }                                                                           \
255} while (0)
256
257/* Test whether n+1-sized entity name fits in P.  We don't support
258   IE-style non-terminated entities, e.g. "&ltfoo" -> "<foo".
259   However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
260   other words an entity needs to be terminated by either a
261   non-alphanumeric or the end of string.  */
262#define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n])))
263
264/* Macros that test entity names by returning true if P is followed by
265   the specified characters.  */
266#define ENT1(p, c0) (FITS (p, 1) && p[0] == c0)
267#define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1)
268#define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2)
269
270/* Increment P by INC chars.  If P lands at a semicolon, increment it
271   past the semicolon.  This ensures that e.g. "&lt;foo" is converted
272   to "<foo", but "&lt,foo" to "<,foo".  */
273#define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
274
275struct tagstack_item {
276  const char *tagname_begin;
277  const char *tagname_end;
278  const char *contents_begin;
279  struct tagstack_item *prev;
280  struct tagstack_item *next;
281};
282
283struct tagstack_item *
284tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
285{
286  struct tagstack_item *ts = xmalloc(sizeof(struct tagstack_item));
287  if (*head == NULL)
288    {
289      *head = *tail = ts;
290      ts->prev = ts->next = NULL;
291    }
292  else
293    {
294      (*tail)->next = ts;
295      ts->prev = *tail;
296      *tail = ts;
297      ts->next = NULL;
298    }
299
300  return ts;
301}
302
303/* remove ts and everything after it from the stack */
304void
305tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
306              struct tagstack_item *ts)
307{
308  if (*head == NULL)
309    return;
310
311  if (ts == *tail)
312    {
313      if (ts == *head)
314        {
315          xfree (ts);
316          *head = *tail = NULL;
317        }
318      else
319        {
320          ts->prev->next = NULL;
321          *tail = ts->prev;
322          xfree (ts);
323        }
324    }
325  else
326    {
327      if (ts == *head)
328        {
329          *head = NULL;
330        }
331      *tail = ts->prev;
332
333      if (ts->prev)
334        {
335          ts->prev->next = NULL;
336        }
337      while (ts)
338        {
339          struct tagstack_item *p = ts->next;
340          xfree (ts);
341          ts = p;
342        }
343    }
344}
345
346struct tagstack_item *
347tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
348               const char *tagname_end)
349{
350  int len = tagname_end - tagname_begin;
351  while (tail)
352    {
353      if (len == (tail->tagname_end - tail->tagname_begin))
354        {
355          if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len))
356            return tail;
357        }
358      tail = tail->prev;
359    }
360  return NULL;
361}
362
363/* Decode the HTML character entity at *PTR, considering END to be end
364   of buffer.  It is assumed that the "&" character that marks the
365   beginning of the entity has been seen at *PTR-1.  If a recognized
366   ASCII entity is seen, it is returned, and *PTR is moved to the end
367   of the entity.  Otherwise, -1 is returned and *PTR left unmodified.
368
369   The recognized entities are: &lt, &gt, &amp, &apos, and &quot.  */
370
371static int
372decode_entity (const char **ptr, const char *end)
373{
374  const char *p = *ptr;
375  int value = -1;
376
377  if (++p == end)
378    return -1;
379
380  switch (*p++)
381    {
382    case '#':
383      /* Process numeric entities "&#DDD;" and "&#xHH;".  */
384      {
385        int digits = 0;
386        value = 0;
387        if (*p == 'x')
388          for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++)
389            value = (value << 4) + XDIGIT_TO_NUM (*p);
390        else
391          for (; value < 256 && p < end && c_isdigit (*p); p++, digits++)
392            value = (value * 10) + (*p - '0');
393        if (!digits)
394          return -1;
395        /* Don't interpret 128+ codes and NUL because we cannot
396           portably reinserted them into HTML.  */
397        if (!value || (value & ~0x7f))
398          return -1;
399        *ptr = SKIP_SEMI (p, 0);
400        return value;
401      }
402    /* Process named ASCII entities.  */
403    case 'g':
404      if (ENT1 (p, 't'))
405        value = '>', *ptr = SKIP_SEMI (p, 1);
406      break;
407    case 'l':
408      if (ENT1 (p, 't'))
409        value = '<', *ptr = SKIP_SEMI (p, 1);
410      break;
411    case 'a':
412      if (ENT2 (p, 'm', 'p'))
413        value = '&', *ptr = SKIP_SEMI (p, 2);
414      else if (ENT3 (p, 'p', 'o', 's'))
415        /* handle &apos for the sake of the XML/XHTML crowd. */
416        value = '\'', *ptr = SKIP_SEMI (p, 3);
417      break;
418    case 'q':
419      if (ENT3 (p, 'u', 'o', 't'))
420        value = '\"', *ptr = SKIP_SEMI (p, 3);
421      break;
422    }
423  return value;
424}
425#undef ENT1
426#undef ENT2
427#undef ENT3
428#undef FITS
429#undef SKIP_SEMI
430
431enum {
432  AP_DOWNCASE           = 1,
433  AP_DECODE_ENTITIES    = 2,
434  AP_TRIM_BLANKS        = 4
435};
436
437/* Copy the text in the range [BEG, END) to POOL, optionally
438   performing operations specified by FLAGS.  FLAGS may be any
439   combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
440   with the following meaning:
441
442   * AP_DOWNCASE -- downcase all the letters;
443
444   * AP_DECODE_ENTITIES -- decode the named and numeric entities in
445     the ASCII range when copying the string.
446
447   * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
448     of text, as well as embedded newlines.  */
449
450static void
451convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
452{
453  int old_tail = pool->tail;
454
455  /* Skip blanks if required.  We must do this before entities are
456     processed, so that blanks can still be inserted as, for instance,
457     `&#32;'.  */
458  if (flags & AP_TRIM_BLANKS)
459    {
460      while (beg < end && c_isspace (*beg))
461        ++beg;
462      while (end > beg && c_isspace (end[-1]))
463        --end;
464    }
465
466  if (flags & AP_DECODE_ENTITIES)
467    {
468      /* Grow the pool, then copy the text to the pool character by
469         character, processing the encountered entities as we go
470         along.
471
472         It's safe (and necessary) to grow the pool in advance because
473         processing the entities can only *shorten* the string, it can
474         never lengthen it.  */
475      const char *from = beg;
476      char *to;
477      bool squash_newlines = !!(flags & AP_TRIM_BLANKS);
478
479      POOL_GROW (pool, end - beg);
480      to = pool->contents + pool->tail;
481
482      while (from < end)
483        {
484          if (*from == '&')
485            {
486              int entity = decode_entity (&from, end);
487              if (entity != -1)
488                *to++ = entity;
489              else
490                *to++ = *from++;
491            }
492          else if ((*from == '\n' || *from == '\r') && squash_newlines)
493            ++from;
494          else
495            *to++ = *from++;
496        }
497      /* Verify that we haven't exceeded the original size.  (It
498         shouldn't happen, hence the assert.)  */
499      assert (to - (pool->contents + pool->tail) <= end - beg);
500
501      /* Make POOL's tail point to the position following the string
502         we've written.  */
503      pool->tail = to - pool->contents;
504      POOL_APPEND_CHR (pool, '\0');
505    }
506  else
507    {
508      /* Just copy the text to the pool.  */
509      POOL_APPEND (pool, beg, end);
510      POOL_APPEND_CHR (pool, '\0');
511    }
512
513  if (flags & AP_DOWNCASE)
514    {
515      char *p = pool->contents + old_tail;
516      for (; *p; p++)
517        *p = c_tolower (*p);
518    }
519}
520
521/* Originally we used to adhere to rfc 1866 here, and allowed only
522   letters, digits, periods, and hyphens as names (of tags or
523   attributes).  However, this broke too many pages which used
524   proprietary or strange attributes, e.g. <img src="a.gif"
525   v:shapes="whatever">.
526
527   So now we allow any character except:
528     * whitespace
529     * 8-bit and control chars
530     * characters that clearly cannot be part of name:
531       '=', '>', '/'.
532
533   This only affects attribute and tag names; attribute values allow
534   an even greater variety of characters.  */
535
536#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
537                        && (x) != '=' && (x) != '>' && (x) != '/')
538
539#ifdef STANDALONE
540static int comment_backout_count;
541#endif
542
543/* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
544   strict comments mode, this is used for skipping over comments as
545   well.
546
547   To recap: any SGML declaration may have comments associated with
548   it, e.g.
549       <!MY-DECL -- isn't this fun? -- foo bar>
550
551   An HTML comment is merely an empty declaration (<!>) with a comment
552   attached, like this:
553       <!-- some stuff here -->
554
555   Several comments may be embedded in one comment declaration:
556       <!-- have -- -- fun -->
557
558   Whitespace is allowed between and after the comments, but not
559   before the first comment.  Additionally, this function attempts to
560   handle double quotes in SGML declarations correctly.  */
561
562static const char *
563advance_declaration (const char *beg, const char *end)
564{
565  const char *p = beg;
566  char quote_char = '\0';       /* shut up, gcc! */
567  char ch;
568
569  enum {
570    AC_S_DONE,
571    AC_S_BACKOUT,
572    AC_S_BANG,
573    AC_S_DEFAULT,
574    AC_S_DCLNAME,
575    AC_S_DASH1,
576    AC_S_DASH2,
577    AC_S_COMMENT,
578    AC_S_DASH3,
579    AC_S_DASH4,
580    AC_S_QUOTE1,
581    AC_S_IN_QUOTE,
582    AC_S_QUOTE2
583  } state = AC_S_BANG;
584
585  if (beg == end)
586    return beg;
587  ch = *p++;
588
589  /* It looked like a good idea to write this as a state machine, but
590     now I wonder...  */
591
592  while (state != AC_S_DONE && state != AC_S_BACKOUT)
593    {
594      if (p == end)
595        state = AC_S_BACKOUT;
596      switch (state)
597        {
598        case AC_S_DONE:
599        case AC_S_BACKOUT:
600          break;
601        case AC_S_BANG:
602          if (ch == '!')
603            {
604              ch = *p++;
605              state = AC_S_DEFAULT;
606            }
607          else
608            state = AC_S_BACKOUT;
609          break;
610        case AC_S_DEFAULT:
611          switch (ch)
612            {
613            case '-':
614              state = AC_S_DASH1;
615              break;
616            case ' ':
617            case '\t':
618            case '\r':
619            case '\n':
620              ch = *p++;
621              break;
622            case '>':
623              state = AC_S_DONE;
624              break;
625            case '\'':
626            case '\"':
627              state = AC_S_QUOTE1;
628              break;
629            default:
630              if (NAME_CHAR_P (ch))
631                state = AC_S_DCLNAME;
632              else
633                state = AC_S_BACKOUT;
634              break;
635            }
636          break;
637        case AC_S_DCLNAME:
638          if (ch == '-')
639            state = AC_S_DASH1;
640          else if (NAME_CHAR_P (ch))
641            ch = *p++;
642          else
643            state = AC_S_DEFAULT;
644          break;
645        case AC_S_QUOTE1:
646          /* We must use 0x22 because broken assert macros choke on
647             '"' and '\"'.  */
648          assert (ch == '\'' || ch == 0x22);
649          quote_char = ch;      /* cheating -- I really don't feel like
650                                   introducing more different states for
651                                   different quote characters. */
652          ch = *p++;
653          state = AC_S_IN_QUOTE;
654          break;
655        case AC_S_IN_QUOTE:
656          if (ch == quote_char)
657            state = AC_S_QUOTE2;
658          else
659            ch = *p++;
660          break;
661        case AC_S_QUOTE2:
662          assert (ch == quote_char);
663          ch = *p++;
664          state = AC_S_DEFAULT;
665          break;
666        case AC_S_DASH1:
667          assert (ch == '-');
668          ch = *p++;
669          state = AC_S_DASH2;
670          break;
671        case AC_S_DASH2:
672          switch (ch)
673            {
674            case '-':
675              ch = *p++;
676              state = AC_S_COMMENT;
677              break;
678            default:
679              state = AC_S_BACKOUT;
680            }
681          break;
682        case AC_S_COMMENT:
683          switch (ch)
684            {
685            case '-':
686              state = AC_S_DASH3;
687              break;
688            default:
689              ch = *p++;
690              break;
691            }
692          break;
693        case AC_S_DASH3:
694          assert (ch == '-');
695          ch = *p++;
696          state = AC_S_DASH4;
697          break;
698        case AC_S_DASH4:
699          switch (ch)
700            {
701            case '-':
702              ch = *p++;
703              state = AC_S_DEFAULT;
704              break;
705            default:
706              state = AC_S_COMMENT;
707              break;
708            }
709          break;
710        }
711    }
712
713  if (state == AC_S_BACKOUT)
714    {
715#ifdef STANDALONE
716      ++comment_backout_count;
717#endif
718      return beg + 1;
719    }
720  return p;
721}
722
723/* Find the first occurrence of the substring "-->" in [BEG, END) and
724   return the pointer to the character after the substring.  If the
725   substring is not found, return NULL.  */
726
727static const char *
728find_comment_end (const char *beg, const char *end)
729{
730  /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
731     if it's not '>' or '-', advance by three characters.  Otherwise,
732     look at the preceding characters and try to find a match.  */
733
734  const char *p = beg - 1;
735
736  while ((p += 3) < end)
737    switch (p[0])
738      {
739      case '>':
740        if (p[-1] == '-' && p[-2] == '-')
741          return p + 1;
742        break;
743      case '-':
744      at_dash:
745        if (p[-1] == '-')
746          {
747          at_dash_dash:
748            if (++p == end) return NULL;
749            switch (p[0])
750              {
751              case '>': return p + 1;
752              case '-': goto at_dash_dash;
753              }
754          }
755        else
756          {
757            if ((p += 2) >= end) return NULL;
758            switch (p[0])
759              {
760              case '>':
761                if (p[-1] == '-')
762                  return p + 1;
763                break;
764              case '-':
765                goto at_dash;
766              }
767          }
768      }
769  return NULL;
770}
771
772/* Return true if the string containing of characters inside [b, e) is
773   present in hash table HT.  */
774
775static bool
776name_allowed (const struct hash_table *ht, const char *b, const char *e)
777{
778  char *copy;
779  if (!ht)
780    return true;
781  BOUNDED_TO_ALLOCA (b, e, copy);
782  return hash_table_get (ht, copy) != NULL;
783}
784
785/* Advance P (a char pointer), with the explicit intent of being able
786   to read the next character.  If this is not possible, go to finish.  */
787
788#define ADVANCE(p) do {                         \
789  ++p;                                          \
790  if (p >= end)                                 \
791    goto finish;                                \
792} while (0)
793
794/* Skip whitespace, if any. */
795
796#define SKIP_WS(p) do {                         \
797  while (c_isspace (*p)) {                        \
798    ADVANCE (p);                                \
799  }                                             \
800} while (0)
801
802/* Skip non-whitespace, if any. */
803
804#define SKIP_NON_WS(p) do {                     \
805  while (!c_isspace (*p)) {                       \
806    ADVANCE (p);                                \
807  }                                             \
808} while (0)
809
810#ifdef STANDALONE
811static int tag_backout_count;
812#endif
813
814/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
815   MAPFUN will be called with two arguments: pointer to an initialized
816   struct taginfo, and MAPARG.
817
818   ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
819   which are the tags and attribute names that this function should
820   use.  If ALLOWED_TAGS is NULL, all tags are processed; if
821   ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
822
823   (Obviously, the caller can filter out unwanted tags and attributes
824   just as well, but this is just an optimization designed to avoid
825   unnecessary copying of tags/attributes which the caller doesn't
826   care about.)  */
827
828void
829map_html_tags (const char *text, int size,
830               void (*mapfun) (struct taginfo *, void *), void *maparg,
831               int flags,
832               const struct hash_table *allowed_tags,
833               const struct hash_table *allowed_attributes)
834{
835  /* storage for strings passed to MAPFUN callback; if 256 bytes is
836     too little, POOL_APPEND allocates more with malloc. */
837  char pool_initial_storage[256];
838  struct pool pool;
839
840  const char *p = text;
841  const char *end = text + size;
842
843  struct attr_pair attr_pair_initial_storage[8];
844  int attr_pair_size = countof (attr_pair_initial_storage);
845  bool attr_pair_resized = false;
846  struct attr_pair *pairs = attr_pair_initial_storage;
847
848  struct tagstack_item *head = NULL;
849  struct tagstack_item *tail = NULL;
850
851  if (!size)
852    return;
853
854  POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
855
856  {
857    int nattrs, end_tag;
858    const char *tag_name_begin, *tag_name_end;
859    const char *tag_start_position;
860    bool uninteresting_tag;
861
862  look_for_tag:
863    POOL_REWIND (&pool);
864
865    nattrs = 0;
866    end_tag = 0;
867
868    /* Find beginning of tag.  We use memchr() instead of the usual
869       looping with ADVANCE() for speed. */
870    p = memchr (p, '<', end - p);
871    if (!p)
872      goto finish;
873
874    tag_start_position = p;
875    ADVANCE (p);
876
877    /* Establish the type of the tag (start-tag, end-tag or
878       declaration).  */
879    if (*p == '!')
880      {
881        if (!(flags & MHT_STRICT_COMMENTS)
882            && p < end + 3 && p[1] == '-' && p[2] == '-')
883          {
884            /* If strict comments are not enforced and if we know
885               we're looking at a comment, simply look for the
886               terminating "-->".  Non-strict is the default because
887               it works in other browsers and most HTML writers can't
888               be bothered with getting the comments right.  */
889            const char *comment_end = find_comment_end (p + 3, end);
890            if (comment_end)
891              p = comment_end;
892          }
893        else
894          {
895            /* Either in strict comment mode or looking at a non-empty
896               declaration.  Real declarations are much less likely to
897               be misused the way comments are, so advance over them
898               properly regardless of strictness.  */
899            p = advance_declaration (p, end);
900          }
901        if (p == end)
902          goto finish;
903        goto look_for_tag;
904      }
905    else if (*p == '/')
906      {
907        end_tag = 1;
908        ADVANCE (p);
909      }
910    tag_name_begin = p;
911    while (NAME_CHAR_P (*p))
912      ADVANCE (p);
913    if (p == tag_name_begin)
914      goto look_for_tag;
915    tag_name_end = p;
916    SKIP_WS (p);
917
918    if (!end_tag)
919      {
920        struct tagstack_item *ts = tagstack_push (&head, &tail);
921        if (ts)
922          {
923            ts->tagname_begin  = tag_name_begin;
924            ts->tagname_end    = tag_name_end;
925            ts->contents_begin = NULL;
926          }
927      }
928
929    if (end_tag && *p != '>')
930      goto backout_tag;
931
932    if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
933      /* We can't just say "goto look_for_tag" here because we need
934         the loop below to properly advance over the tag's attributes.  */
935      uninteresting_tag = true;
936    else
937      {
938        uninteresting_tag = false;
939        convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
940      }
941
942    /* Find the attributes. */
943    while (1)
944      {
945        const char *attr_name_begin, *attr_name_end;
946        const char *attr_value_begin, *attr_value_end;
947        const char *attr_raw_value_begin, *attr_raw_value_end;
948        int operation = AP_DOWNCASE; /* stupid compiler. */
949
950        SKIP_WS (p);
951
952        if (*p == '/')
953          {
954            /* A slash at this point means the tag is about to be
955               closed.  This is legal in XML and has been popularized
956               in HTML via XHTML.  */
957            /* <foo a=b c=d /> */
958            /*              ^  */
959            ADVANCE (p);
960            SKIP_WS (p);
961            if (*p != '>')
962              goto backout_tag;
963          }
964
965        /* Check for end of tag definition. */
966        if (*p == '>')
967          break;
968
969        /* Establish bounds of attribute name. */
970        attr_name_begin = p;    /* <foo bar ...> */
971                                /*      ^        */
972        while (NAME_CHAR_P (*p))
973          ADVANCE (p);
974        attr_name_end = p;      /* <foo bar ...> */
975                                /*         ^     */
976        if (attr_name_begin == attr_name_end)
977          goto backout_tag;
978
979        /* Establish bounds of attribute value. */
980        SKIP_WS (p);
981        if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
982          {
983            /* Minimized attribute syntax allows `=' to be omitted.
984               For example, <UL COMPACT> is a valid shorthand for <UL
985               COMPACT="compact">.  Even if such attributes are not
986               useful to Wget, we need to support them, so that the
987               tags containing them can be parsed correctly. */
988            attr_raw_value_begin = attr_value_begin = attr_name_begin;
989            attr_raw_value_end = attr_value_end = attr_name_end;
990          }
991        else if (*p == '=')
992          {
993            ADVANCE (p);
994            SKIP_WS (p);
995            if (*p == '\"' || *p == '\'')
996              {
997                bool newline_seen = false;
998                char quote_char = *p;
999                attr_raw_value_begin = p;
1000                ADVANCE (p);
1001                attr_value_begin = p; /* <foo bar="baz"> */
1002                                      /*           ^     */
1003                while (*p != quote_char)
1004                  {
1005                    if (!newline_seen && *p == '\n')
1006                      {
1007                        /* If a newline is seen within the quotes, it
1008                           is most likely that someone forgot to close
1009                           the quote.  In that case, we back out to
1010                           the value beginning, and terminate the tag
1011                           at either `>' or the delimiter, whichever
1012                           comes first.  Such a tag terminated at `>'
1013                           is discarded.  */
1014                        p = attr_value_begin;
1015                        newline_seen = true;
1016                        continue;
1017                      }
1018                    else if (newline_seen && *p == '>')
1019                      break;
1020                    ADVANCE (p);
1021                  }
1022                attr_value_end = p; /* <foo bar="baz"> */
1023                                    /*              ^  */
1024                if (*p == quote_char)
1025                  ADVANCE (p);
1026                else
1027                  goto look_for_tag;
1028                attr_raw_value_end = p; /* <foo bar="baz"> */
1029                                        /*               ^ */
1030                operation = AP_DECODE_ENTITIES;
1031                if (flags & MHT_TRIM_VALUES)
1032                  operation |= AP_TRIM_BLANKS;
1033              }
1034            else
1035              {
1036                attr_value_begin = p; /* <foo bar=baz> */
1037                                      /*          ^    */
1038                /* According to SGML, a name token should consist only
1039                   of alphanumerics, . and -.  However, this is often
1040                   violated by, for instance, `%' in `width=75%'.
1041                   We'll be liberal and allow just about anything as
1042                   an attribute value.  */
1043                while (!c_isspace (*p) && *p != '>')
1044                  ADVANCE (p);
1045                attr_value_end = p; /* <foo bar=baz qux=quix> */
1046                                    /*             ^          */
1047                if (attr_value_begin == attr_value_end)
1048                  /* <foo bar=> */
1049                  /*          ^ */
1050                  goto backout_tag;
1051                attr_raw_value_begin = attr_value_begin;
1052                attr_raw_value_end = attr_value_end;
1053                operation = AP_DECODE_ENTITIES;
1054              }
1055          }
1056        else
1057          {
1058            /* We skipped the whitespace and found something that is
1059               neither `=' nor the beginning of the next attribute's
1060               name.  Back out.  */
1061            goto backout_tag;   /* <foo bar [... */
1062                                /*          ^    */
1063          }
1064
1065        /* If we're not interested in the tag, don't bother with any
1066           of the attributes.  */
1067        if (uninteresting_tag)
1068          continue;
1069
1070        /* If we aren't interested in the attribute, skip it.  We
1071           cannot do this test any sooner, because our text pointer
1072           needs to correctly advance over the attribute.  */
1073        if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
1074          continue;
1075
1076        GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
1077                    struct attr_pair);
1078
1079        pairs[nattrs].name_pool_index = pool.tail;
1080        convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
1081
1082        pairs[nattrs].value_pool_index = pool.tail;
1083        convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
1084        pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
1085        pairs[nattrs].value_raw_size = (attr_raw_value_end
1086                                        - attr_raw_value_begin);
1087        ++nattrs;
1088      }
1089
1090    if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
1091      {
1092        tail->contents_begin = p+1;
1093      }
1094
1095    if (uninteresting_tag)
1096      {
1097        ADVANCE (p);
1098        goto look_for_tag;
1099      }
1100
1101    /* By now, we have a valid tag with a name and zero or more
1102       attributes.  Fill in the data and call the mapper function.  */
1103    {
1104      int i;
1105      struct taginfo taginfo;
1106      struct tagstack_item *ts = NULL;
1107
1108      taginfo.name      = pool.contents;
1109      taginfo.end_tag_p = end_tag;
1110      taginfo.nattrs    = nattrs;
1111      /* We fill in the char pointers only now, when pool can no
1112         longer get realloc'ed.  If we did that above, we could get
1113         hosed by reallocation.  Obviously, after this point, the pool
1114         may no longer be grown.  */
1115      for (i = 0; i < nattrs; i++)
1116        {
1117          pairs[i].name = pool.contents + pairs[i].name_pool_index;
1118          pairs[i].value = pool.contents + pairs[i].value_pool_index;
1119        }
1120      taginfo.attrs = pairs;
1121      taginfo.start_position = tag_start_position;
1122      taginfo.end_position   = p + 1;
1123      taginfo.contents_begin = NULL;
1124      taginfo.contents_end = NULL;
1125
1126      if (end_tag)
1127        {
1128          ts = tagstack_find (tail, tag_name_begin, tag_name_end);
1129          if (ts)
1130            {
1131              if (ts->contents_begin)
1132                {
1133                  taginfo.contents_begin = ts->contents_begin;
1134                  taginfo.contents_end   = tag_start_position;
1135                }
1136              tagstack_pop (&head, &tail, ts);
1137            }
1138        }
1139
1140      mapfun (&taginfo, maparg);
1141      ADVANCE (p);
1142    }
1143    goto look_for_tag;
1144
1145  backout_tag:
1146#ifdef STANDALONE
1147    ++tag_backout_count;
1148#endif
1149    /* The tag wasn't really a tag.  Treat its contents as ordinary
1150       data characters. */
1151    p = tag_start_position + 1;
1152    goto look_for_tag;
1153  }
1154
1155 finish:
1156  POOL_FREE (&pool);
1157  if (attr_pair_resized)
1158    xfree (pairs);
1159  /* pop any tag stack that's left */
1160  tagstack_pop (&head, &tail, head);
1161}
1162
1163#undef ADVANCE
1164#undef SKIP_WS
1165#undef SKIP_NON_WS
1166
1167#ifdef STANDALONE
1168static void
1169test_mapper (struct taginfo *taginfo, void *arg)
1170{
1171  int i;
1172
1173  printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1174  for (i = 0; i < taginfo->nattrs; i++)
1175    printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1176  putchar ('\n');
1177  ++*(int *)arg;
1178}
1179
1180int main ()
1181{
1182  int size = 256;
1183  char *x = xmalloc (size);
1184  int length = 0;
1185  int read_count;
1186  int tag_counter = 0;
1187
1188  while ((read_count = fread (x + length, 1, size - length, stdin)))
1189    {
1190      length += read_count;
1191      size <<= 1;
1192      x = xrealloc (x, size);
1193    }
1194
1195  map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1196  printf ("TAGS: %d\n", tag_counter);
1197  printf ("Tag backouts:     %d\n", tag_backout_count);
1198  printf ("Comment backouts: %d\n", comment_backout_count);
1199  return 0;
1200}
1201#endif /* STANDALONE */
1202