1/* HTML parser for Wget.
2   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
3   2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
4
5This file is part of GNU Wget.
6
7GNU Wget is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 3 of the License, or (at
10your option) any later version.
11
12GNU Wget is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19
20Additional permission under GNU GPL version 3 section 7
21
22If you modify this program, or any covered work, by linking or
23combining it with the OpenSSL project's OpenSSL library (or a
24modified version of that library), containing parts covered by the
25terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26grants you additional permission to convey the resulting work.
27Corresponding Source for a non-source form of such a combination
28shall include the source code for the parts of OpenSSL used as well
29as that of the covered work.  */
30
31/* The only entry point to this module is map_html_tags(), which see.  */
32
33/* TODO:
34
35   - Allow hooks for callers to process contents outside tags.  This
36     is needed to implement handling <style> and <script>.  The
37     taginfo structure already carries the information about where the
38     tags are, but this is not enough, because one would also want to
39     skip the comments.  (The funny thing is that for <style> and
40     <script> you *don't* want to skip comments!)
41
42   - Create a test suite for regression testing. */
43
44/* HISTORY:
45
46   This is the third HTML parser written for Wget.  The first one was
47   written some time during the Geturl 1.0 beta cycle, and was very
48   inefficient and buggy.  It also contained some very complex code to
49   remember a list of parser states, because it was supposed to be
50   reentrant.
51
52   The second HTML parser was written for Wget 1.4 (the first version
53   by the name `Wget'), and was a complete rewrite.  Although the new
54   parser behaved much better and made no claims of reentrancy, it
55   still shared many of the fundamental flaws of the old version -- it
56   only regarded HTML in terms tag-attribute pairs, where the
57   attribute's value was a URL to be returned.  Any other property of
58   HTML, such as <base href=...>, or strange way to specify a URL,
59   such as <meta http-equiv=Refresh content="0; URL=..."> had to be
60   crudely hacked in -- and the caller had to be aware of these hacks.
61   Like its predecessor, this parser did not support HTML comments.
62
63   After Wget 1.5.1 was released, I set out to write a third HTML
64   parser.  The objectives of the new parser were to: (1) provide a
65   clean way to analyze HTML lexically, (2) separate interpretation of
66   the markup from the parsing process, (3) be as correct as possible,
67   e.g. correctly skipping comments and other SGML declarations, (4)
68   understand the most common errors in markup and skip them or be
69   relaxed towrds them, and (5) be reasonably efficient (no regexps,
70   minimum copying and minimum or no heap allocation).
71
72   I believe this parser meets all of the above goals.  It is
73   reasonably well structured, and could be relatively easily
74   separated from Wget and used elsewhere.  While some of its
75   intrinsic properties limit its value as a general-purpose HTML
76   parser, I believe that, with minimum modifications, it could serve
77   as a backend for one.
78
79   Due to time and other constraints, this parser was not integrated
80   into Wget until the version 1.7. */
81
82/* DESCRIPTION:
83
84   The single entry point of this parser is map_html_tags(), which
85   works by calling a function you specify for each tag.  The function
86   gets called with the pointer to a structure describing the tag and
87   its attributes.  */
88
89/* To test as standalone, compile with `-DSTANDALONE -I.'.  You'll
90   still need Wget headers to compile.  */
91
92#include "wget.h"
93
94#ifdef STANDALONE
95# define I_REALLY_WANT_CTYPE_MACROS
96#endif
97
98#include <stdio.h>
99#include <stdlib.h>
100#include <string.h>
101#include <assert.h>
102
103#include "utils.h"
104#include "html-parse.h"
105
106#ifdef STANDALONE
107# undef xmalloc
108# undef xrealloc
109# undef xfree
110# define xmalloc malloc
111# define xrealloc realloc
112# define xfree free
113
114# undef c_isspace
115# undef c_isdigit
116# undef c_isxdigit
117# undef c_isalpha
118# undef c_isalnum
119# undef c_tolower
120# undef c_toupper
121
122# define c_isspace(x) isspace (x)
123# define c_isdigit(x) isdigit (x)
124# define c_isxdigit(x) isxdigit (x)
125# define c_isalpha(x) isalpha (x)
126# define c_isalnum(x) isalnum (x)
127# define c_tolower(x) tolower (x)
128# define c_toupper(x) toupper (x)
129
130struct hash_table {
131  int dummy;
132};
133static void *
134hash_table_get (const struct hash_table *ht, void *ptr)
135{
136  return ptr;
137}
138#else  /* not STANDALONE */
139# include "hash.h"
140#endif
141
142/* Pool support.  A pool is a resizable chunk of memory.  It is first
143   allocated on the stack, and moved to the heap if it needs to be
144   larger than originally expected.  map_html_tags() uses it to store
145   the zero-terminated names and values of tags and attributes.
146
147   Thus taginfo->name, and attr->name and attr->value for each
148   attribute, do not point into separately allocated areas, but into
149   different parts of the pool, separated only by terminating zeros.
150   This ensures minimum amount of allocation and, for most tags, no
151   allocation because the entire pool is kept on the stack.  */
152
153struct pool {
154  char *contents;               /* pointer to the contents. */
155  int size;                     /* size of the pool. */
156  int tail;                     /* next available position index. */
157  bool resized;                 /* whether the pool has been resized
158                                   using malloc. */
159
160  char *orig_contents;          /* original pool contents, usually
161                                   stack-allocated.  used by POOL_FREE
162                                   to restore the pool to the initial
163                                   state. */
164  int orig_size;
165};
166
167/* Initialize the pool to hold INITIAL_SIZE bytes of storage. */
168
169#define POOL_INIT(p, initial_storage, initial_size) do {        \
170  struct pool *P = (p);                                         \
171  P->contents = (initial_storage);                              \
172  P->size = (initial_size);                                     \
173  P->tail = 0;                                                  \
174  P->resized = false;                                           \
175  P->orig_contents = P->contents;                               \
176  P->orig_size = P->size;                                       \
177} while (0)
178
179/* Grow the pool to accomodate at least SIZE new bytes.  If the pool
180   already has room to accomodate SIZE bytes of data, this is a no-op.  */
181
182#define POOL_GROW(p, increase)                                  \
183  GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
184              (p)->resized, char)
185
186/* Append text in the range [beg, end) to POOL.  No zero-termination
187   is done.  */
188
189#define POOL_APPEND(p, beg, end) do {                   \
190  const char *PA_beg = (beg);                           \
191  int PA_size = (end) - PA_beg;                         \
192  POOL_GROW (p, PA_size);                               \
193  memcpy ((p)->contents + (p)->tail, PA_beg, PA_size);  \
194  (p)->tail += PA_size;                                 \
195} while (0)
196
197/* Append one character to the pool.  Can be used to zero-terminate
198   pool strings.  */
199
200#define POOL_APPEND_CHR(p, ch) do {             \
201  char PAC_char = (ch);                         \
202  POOL_GROW (p, 1);                             \
203  (p)->contents[(p)->tail++] = PAC_char;        \
204} while (0)
205
206/* Forget old pool contents.  The allocated memory is not freed. */
207#define POOL_REWIND(p) (p)->tail = 0
208
209/* Free heap-allocated memory for contents of POOL.  This calls
210   xfree() if the memory was allocated through malloc.  It also
211   restores `contents' and `size' to their original, pre-malloc
212   values.  That way after POOL_FREE, the pool is fully usable, just
213   as if it were freshly initialized with POOL_INIT.  */
214
215#define POOL_FREE(p) do {                       \
216  struct pool *P = p;                           \
217  if (P->resized)                               \
218    xfree (P->contents);                        \
219  P->contents = P->orig_contents;               \
220  P->size = P->orig_size;                       \
221  P->tail = 0;                                  \
222  P->resized = false;                           \
223} while (0)
224
225/* Used for small stack-allocated memory chunks that might grow.  Like
226   DO_REALLOC, this macro grows BASEVAR as necessary to take
227   NEEDED_SIZE items of TYPE.
228
229   The difference is that on the first resize, it will use
230   malloc+memcpy rather than realloc.  That way you can stack-allocate
231   the initial chunk, and only resort to heap allocation if you
232   stumble upon large data.
233
234   After the first resize, subsequent ones are performed with realloc,
235   just like DO_REALLOC.  */
236
237#define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
238  long ga_needed_size = (needed_size);                                          \
239  long ga_newsize = (sizevar);                                                  \
240  while (ga_newsize < ga_needed_size)                                           \
241    ga_newsize <<= 1;                                                           \
242  if (ga_newsize != (sizevar))                                                  \
243    {                                                                           \
244      if (resized)                                                              \
245        basevar = xrealloc (basevar, ga_newsize * sizeof (type));               \
246      else                                                                      \
247        {                                                                       \
248          void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
249          memcpy (ga_new, basevar, (sizevar) * sizeof (type));                  \
250          (basevar) = ga_new;                                                   \
251          resized = true;                                                       \
252        }                                                                       \
253      (sizevar) = ga_newsize;                                                   \
254    }                                                                           \
255} while (0)
256
257/* Test whether n+1-sized entity name fits in P.  We don't support
258   IE-style non-terminated entities, e.g. "&ltfoo" -> "<foo".
259   However, "&lt;foo" will work, as will "&lt!foo", "&lt", etc.  In
260   other words an entity needs to be terminated by either a
261   non-alphanumeric or the end of string.  */
262#define FITS(p, n) (p + n == end || (p + n < end && !c_isalnum (p[n])))
263
264/* Macros that test entity names by returning true if P is followed by
265   the specified characters.  */
266#define ENT1(p, c0) (FITS (p, 1) && p[0] == c0)
267#define ENT2(p, c0, c1) (FITS (p, 2) && p[0] == c0 && p[1] == c1)
268#define ENT3(p, c0, c1, c2) (FITS (p, 3) && p[0]==c0 && p[1]==c1 && p[2]==c2)
269
270/* Increment P by INC chars.  If P lands at a semicolon, increment it
271   past the semicolon.  This ensures that e.g. "&lt;foo" is converted
272   to "<foo", but "&lt,foo" to "<,foo".  */
273#define SKIP_SEMI(p, inc) (p += inc, p < end && *p == ';' ? ++p : p)
274
275struct tagstack_item {
276  const char *tagname_begin;
277  const char *tagname_end;
278  const char *contents_begin;
279  struct tagstack_item *prev;
280  struct tagstack_item *next;
281};
282
283static struct tagstack_item *
284tagstack_push (struct tagstack_item **head, struct tagstack_item **tail)
285{
286  struct tagstack_item *ts = xmalloc(sizeof(struct tagstack_item));
287  if (*head == NULL)
288    {
289      *head = *tail = ts;
290      ts->prev = ts->next = NULL;
291    }
292  else
293    {
294      (*tail)->next = ts;
295      ts->prev = *tail;
296      *tail = ts;
297      ts->next = NULL;
298    }
299
300  return ts;
301}
302
303/* remove ts and everything after it from the stack */
304static void
305tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail,
306              struct tagstack_item *ts)
307{
308  if (*head == NULL)
309    return;
310
311  if (ts == *tail)
312    {
313      if (ts == *head)
314        {
315          xfree (ts);
316          *head = *tail = NULL;
317        }
318      else
319        {
320          ts->prev->next = NULL;
321          *tail = ts->prev;
322          xfree (ts);
323        }
324    }
325  else
326    {
327      if (ts == *head)
328        {
329          *head = NULL;
330        }
331      *tail = ts->prev;
332
333      if (ts->prev)
334        {
335          ts->prev->next = NULL;
336        }
337      while (ts)
338        {
339          struct tagstack_item *p = ts->next;
340          xfree (ts);
341          ts = p;
342        }
343    }
344}
345
346static struct tagstack_item *
347tagstack_find (struct tagstack_item *tail, const char *tagname_begin,
348               const char *tagname_end)
349{
350  int len = tagname_end - tagname_begin;
351  while (tail)
352    {
353      if (len == (tail->tagname_end - tail->tagname_begin))
354        {
355          if (0 == strncasecmp (tail->tagname_begin, tagname_begin, len))
356            return tail;
357        }
358      tail = tail->prev;
359    }
360  return NULL;
361}
362
363/* Decode the HTML character entity at *PTR, considering END to be end
364   of buffer.  It is assumed that the "&" character that marks the
365   beginning of the entity has been seen at *PTR-1.  If a recognized
366   ASCII entity is seen, it is returned, and *PTR is moved to the end
367   of the entity.  Otherwise, -1 is returned and *PTR left unmodified.
368
369   The recognized entities are: &lt, &gt, &amp, &apos, and &quot.  */
370
371static int
372decode_entity (const char **ptr, const char *end)
373{
374  const char *p = *ptr;
375  int value = -1;
376
377  if (++p == end)
378    return -1;
379
380  switch (*p++)
381    {
382    case '#':
383      /* Process numeric entities "&#DDD;" and "&#xHH;".  */
384      {
385        int digits = 0;
386        value = 0;
387        if (*p == 'x')
388          for (++p; value < 256 && p < end && c_isxdigit (*p); p++, digits++)
389            value = (value << 4) + XDIGIT_TO_NUM (*p);
390        else
391          for (; value < 256 && p < end && c_isdigit (*p); p++, digits++)
392            value = (value * 10) + (*p - '0');
393        if (!digits)
394          return -1;
395        /* Don't interpret 128+ codes and NUL because we cannot
396           portably reinserted them into HTML.  */
397        if (!value || (value & ~0x7f))
398          return -1;
399        *ptr = SKIP_SEMI (p, 0);
400        return value;
401      }
402    /* Process named ASCII entities.  */
403    case 'g':
404      if (ENT1 (p, 't'))
405        value = '>', *ptr = SKIP_SEMI (p, 1);
406      break;
407    case 'l':
408      if (ENT1 (p, 't'))
409        value = '<', *ptr = SKIP_SEMI (p, 1);
410      break;
411    case 'a':
412      if (ENT2 (p, 'm', 'p'))
413        value = '&', *ptr = SKIP_SEMI (p, 2);
414      else if (ENT3 (p, 'p', 'o', 's'))
415        /* handle &apos for the sake of the XML/XHTML crowd. */
416        value = '\'', *ptr = SKIP_SEMI (p, 3);
417      break;
418    case 'q':
419      if (ENT3 (p, 'u', 'o', 't'))
420        value = '\"', *ptr = SKIP_SEMI (p, 3);
421      break;
422    }
423  return value;
424}
425#undef ENT1
426#undef ENT2
427#undef ENT3
428#undef FITS
429#undef SKIP_SEMI
430
431enum {
432  AP_DOWNCASE           = 1,
433  AP_DECODE_ENTITIES    = 2,
434  AP_TRIM_BLANKS        = 4
435};
436
437/* Copy the text in the range [BEG, END) to POOL, optionally
438   performing operations specified by FLAGS.  FLAGS may be any
439   combination of AP_DOWNCASE, AP_DECODE_ENTITIES and AP_TRIM_BLANKS
440   with the following meaning:
441
442   * AP_DOWNCASE -- downcase all the letters;
443
444   * AP_DECODE_ENTITIES -- decode the named and numeric entities in
445     the ASCII range when copying the string.
446
447   * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
448     of text, as well as embedded newlines.  */
449
450static void
451convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
452{
453  int old_tail = pool->tail;
454
455  /* Skip blanks if required.  We must do this before entities are
456     processed, so that blanks can still be inserted as, for instance,
457     `&#32;'.  */
458  if (flags & AP_TRIM_BLANKS)
459    {
460      while (beg < end && c_isspace (*beg))
461        ++beg;
462      while (end > beg && c_isspace (end[-1]))
463        --end;
464    }
465
466  if (flags & AP_DECODE_ENTITIES)
467    {
468      /* Grow the pool, then copy the text to the pool character by
469         character, processing the encountered entities as we go
470         along.
471
472         It's safe (and necessary) to grow the pool in advance because
473         processing the entities can only *shorten* the string, it can
474         never lengthen it.  */
475      const char *from = beg;
476      char *to;
477      bool squash_newlines = !!(flags & AP_TRIM_BLANKS);
478
479      POOL_GROW (pool, end - beg);
480      to = pool->contents + pool->tail;
481
482      while (from < end)
483        {
484          if (*from == '&')
485            {
486              int entity = decode_entity (&from, end);
487              if (entity != -1)
488                *to++ = entity;
489              else
490                *to++ = *from++;
491            }
492          else if ((*from == '\n' || *from == '\r') && squash_newlines)
493            ++from;
494          else
495            *to++ = *from++;
496        }
497      /* Verify that we haven't exceeded the original size.  (It
498         shouldn't happen, hence the assert.)  */
499      assert (to - (pool->contents + pool->tail) <= end - beg);
500
501      /* Make POOL's tail point to the position following the string
502         we've written.  */
503      pool->tail = to - pool->contents;
504      POOL_APPEND_CHR (pool, '\0');
505    }
506  else
507    {
508      /* Just copy the text to the pool.  */
509      POOL_APPEND (pool, beg, end);
510      POOL_APPEND_CHR (pool, '\0');
511    }
512
513  if (flags & AP_DOWNCASE)
514    {
515      char *p = pool->contents + old_tail;
516      for (; *p; p++)
517        *p = c_tolower (*p);
518    }
519}
520
521/* Originally we used to adhere to rfc 1866 here, and allowed only
522   letters, digits, periods, and hyphens as names (of tags or
523   attributes).  However, this broke too many pages which used
524   proprietary or strange attributes, e.g. <img src="a.gif"
525   v:shapes="whatever">.
526
527   So now we allow any character except:
528     * whitespace
529     * 8-bit and control chars
530     * characters that clearly cannot be part of name:
531       '=', '<', '>', '/'.
532
533   This only affects attribute and tag names; attribute values allow
534   an even greater variety of characters.  */
535
536#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
537                        && (x) != '=' && (x) != '<' && (x) != '>'       \
538                        && (x) != '/')
539
540#ifdef STANDALONE
541static int comment_backout_count;
542#endif
543
544/* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
545   strict comments mode, this is used for skipping over comments as
546   well.
547
548   To recap: any SGML declaration may have comments associated with
549   it, e.g.
550       <!MY-DECL -- isn't this fun? -- foo bar>
551
552   An HTML comment is merely an empty declaration (<!>) with a comment
553   attached, like this:
554       <!-- some stuff here -->
555
556   Several comments may be embedded in one comment declaration:
557       <!-- have -- -- fun -->
558
559   Whitespace is allowed between and after the comments, but not
560   before the first comment.  Additionally, this function attempts to
561   handle double quotes in SGML declarations correctly.  */
562
563static const char *
564advance_declaration (const char *beg, const char *end)
565{
566  const char *p = beg;
567  char quote_char = '\0';       /* shut up, gcc! */
568  char ch;
569
570  enum {
571    AC_S_DONE,
572    AC_S_BACKOUT,
573    AC_S_BANG,
574    AC_S_DEFAULT,
575    AC_S_DCLNAME,
576    AC_S_DASH1,
577    AC_S_DASH2,
578    AC_S_COMMENT,
579    AC_S_DASH3,
580    AC_S_DASH4,
581    AC_S_QUOTE1,
582    AC_S_IN_QUOTE,
583    AC_S_QUOTE2
584  } state = AC_S_BANG;
585
586  if (beg == end)
587    return beg;
588  ch = *p++;
589
590  /* It looked like a good idea to write this as a state machine, but
591     now I wonder...  */
592
593  while (state != AC_S_DONE && state != AC_S_BACKOUT)
594    {
595      if (p == end)
596        state = AC_S_BACKOUT;
597      switch (state)
598        {
599        case AC_S_DONE:
600        case AC_S_BACKOUT:
601          break;
602        case AC_S_BANG:
603          if (ch == '!')
604            {
605              ch = *p++;
606              state = AC_S_DEFAULT;
607            }
608          else
609            state = AC_S_BACKOUT;
610          break;
611        case AC_S_DEFAULT:
612          switch (ch)
613            {
614            case '-':
615              state = AC_S_DASH1;
616              break;
617            case ' ':
618            case '\t':
619            case '\r':
620            case '\n':
621              ch = *p++;
622              break;
623            case '<':
624            case '>':
625              state = AC_S_DONE;
626              break;
627            case '\'':
628            case '\"':
629              state = AC_S_QUOTE1;
630              break;
631            default:
632              if (NAME_CHAR_P (ch))
633                state = AC_S_DCLNAME;
634              else
635                state = AC_S_BACKOUT;
636              break;
637            }
638          break;
639        case AC_S_DCLNAME:
640          if (ch == '-')
641            state = AC_S_DASH1;
642          else if (NAME_CHAR_P (ch))
643            ch = *p++;
644          else
645            state = AC_S_DEFAULT;
646          break;
647        case AC_S_QUOTE1:
648          /* We must use 0x22 because broken assert macros choke on
649             '"' and '\"'.  */
650          assert (ch == '\'' || ch == 0x22);
651          quote_char = ch;      /* cheating -- I really don't feel like
652                                   introducing more different states for
653                                   different quote characters. */
654          ch = *p++;
655          state = AC_S_IN_QUOTE;
656          break;
657        case AC_S_IN_QUOTE:
658          if (ch == quote_char)
659            state = AC_S_QUOTE2;
660          else
661            ch = *p++;
662          break;
663        case AC_S_QUOTE2:
664          assert (ch == quote_char);
665          ch = *p++;
666          state = AC_S_DEFAULT;
667          break;
668        case AC_S_DASH1:
669          assert (ch == '-');
670          ch = *p++;
671          state = AC_S_DASH2;
672          break;
673        case AC_S_DASH2:
674          switch (ch)
675            {
676            case '-':
677              ch = *p++;
678              state = AC_S_COMMENT;
679              break;
680            default:
681              state = AC_S_BACKOUT;
682            }
683          break;
684        case AC_S_COMMENT:
685          switch (ch)
686            {
687            case '-':
688              state = AC_S_DASH3;
689              break;
690            default:
691              ch = *p++;
692              break;
693            }
694          break;
695        case AC_S_DASH3:
696          assert (ch == '-');
697          ch = *p++;
698          state = AC_S_DASH4;
699          break;
700        case AC_S_DASH4:
701          switch (ch)
702            {
703            case '-':
704              ch = *p++;
705              state = AC_S_DEFAULT;
706              break;
707            default:
708              state = AC_S_COMMENT;
709              break;
710            }
711          break;
712        }
713    }
714
715  if (state == AC_S_BACKOUT)
716    {
717#ifdef STANDALONE
718      ++comment_backout_count;
719#endif
720      return beg + 1;
721    }
722  return p;
723}
724
725/* Find the first occurrence of the substring "-->" in [BEG, END) and
726   return the pointer to the character after the substring.  If the
727   substring is not found, return NULL.  */
728
729static const char *
730find_comment_end (const char *beg, const char *end)
731{
732  /* Open-coded Boyer-Moore search for "-->".  Examine the third char;
733     if it's not '>' or '-', advance by three characters.  Otherwise,
734     look at the preceding characters and try to find a match.  */
735
736  const char *p = beg - 1;
737
738  while ((p += 3) < end)
739    switch (p[0])
740      {
741      case '>':
742        if (p[-1] == '-' && p[-2] == '-')
743          return p + 1;
744        break;
745      case '-':
746      at_dash:
747        if (p[-1] == '-')
748          {
749          at_dash_dash:
750            if (++p == end) return NULL;
751            switch (p[0])
752              {
753              case '>': return p + 1;
754              case '-': goto at_dash_dash;
755              }
756          }
757        else
758          {
759            if ((p += 2) >= end) return NULL;
760            switch (p[0])
761              {
762              case '>':
763                if (p[-1] == '-')
764                  return p + 1;
765                break;
766              case '-':
767                goto at_dash;
768              }
769          }
770      }
771  return NULL;
772}
773
774/* Return true if the string containing of characters inside [b, e) is
775   present in hash table HT.  */
776
777static bool
778name_allowed (const struct hash_table *ht, const char *b, const char *e)
779{
780  char *copy;
781  if (!ht)
782    return true;
783  BOUNDED_TO_ALLOCA (b, e, copy);
784  return hash_table_get (ht, copy) != NULL;
785}
786
787/* Advance P (a char pointer), with the explicit intent of being able
788   to read the next character.  If this is not possible, go to finish.  */
789
790#define ADVANCE(p) do {                         \
791  ++p;                                          \
792  if (p >= end)                                 \
793    goto finish;                                \
794} while (0)
795
796/* Skip whitespace, if any. */
797
798#define SKIP_WS(p) do {                         \
799  while (c_isspace (*p)) {                        \
800    ADVANCE (p);                                \
801  }                                             \
802} while (0)
803
804/* Skip non-whitespace, if any. */
805
806#define SKIP_NON_WS(p) do {                     \
807  while (!c_isspace (*p)) {                       \
808    ADVANCE (p);                                \
809  }                                             \
810} while (0)
811
812#ifdef STANDALONE
813static int tag_backout_count;
814#endif
815
816/* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
817   MAPFUN will be called with two arguments: pointer to an initialized
818   struct taginfo, and MAPARG.
819
820   ALLOWED_TAGS and ALLOWED_ATTRIBUTES are hash tables the keys of
821   which are the tags and attribute names that this function should
822   use.  If ALLOWED_TAGS is NULL, all tags are processed; if
823   ALLOWED_ATTRIBUTES is NULL, all attributes are returned.
824
825   (Obviously, the caller can filter out unwanted tags and attributes
826   just as well, but this is just an optimization designed to avoid
827   unnecessary copying of tags/attributes which the caller doesn't
828   care about.)  */
829
830void
831map_html_tags (const char *text, int size,
832               void (*mapfun) (struct taginfo *, void *), void *maparg,
833               int flags,
834               const struct hash_table *allowed_tags,
835               const struct hash_table *allowed_attributes)
836{
837  /* storage for strings passed to MAPFUN callback; if 256 bytes is
838     too little, POOL_APPEND allocates more with malloc. */
839  char pool_initial_storage[256];
840  struct pool pool;
841
842  const char *p = text;
843  const char *end = text + size;
844
845  struct attr_pair attr_pair_initial_storage[8];
846  int attr_pair_size = countof (attr_pair_initial_storage);
847  bool attr_pair_resized = false;
848  struct attr_pair *pairs = attr_pair_initial_storage;
849
850  struct tagstack_item *head = NULL;
851  struct tagstack_item *tail = NULL;
852
853  if (!size)
854    return;
855
856  POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
857
858  {
859    int nattrs, end_tag;
860    const char *tag_name_begin, *tag_name_end;
861    const char *tag_start_position;
862    bool uninteresting_tag;
863
864  look_for_tag:
865    POOL_REWIND (&pool);
866
867    nattrs = 0;
868    end_tag = 0;
869
870    /* Find beginning of tag.  We use memchr() instead of the usual
871       looping with ADVANCE() for speed. */
872    p = memchr (p, '<', end - p);
873    if (!p)
874      goto finish;
875
876    tag_start_position = p;
877    ADVANCE (p);
878
879    /* Establish the type of the tag (start-tag, end-tag or
880       declaration).  */
881    if (*p == '!')
882      {
883        if (!(flags & MHT_STRICT_COMMENTS)
884            && p < end + 3 && p[1] == '-' && p[2] == '-')
885          {
886            /* If strict comments are not enforced and if we know
887               we're looking at a comment, simply look for the
888               terminating "-->".  Non-strict is the default because
889               it works in other browsers and most HTML writers can't
890               be bothered with getting the comments right.  */
891            const char *comment_end = find_comment_end (p + 3, end);
892            if (comment_end)
893              p = comment_end;
894          }
895        else
896          {
897            /* Either in strict comment mode or looking at a non-empty
898               declaration.  Real declarations are much less likely to
899               be misused the way comments are, so advance over them
900               properly regardless of strictness.  */
901            p = advance_declaration (p, end);
902          }
903        if (p == end)
904          goto finish;
905        goto look_for_tag;
906      }
907    else if (*p == '/')
908      {
909        end_tag = 1;
910        ADVANCE (p);
911      }
912    tag_name_begin = p;
913    while (NAME_CHAR_P (*p))
914      ADVANCE (p);
915    if (p == tag_name_begin)
916      goto look_for_tag;
917    tag_name_end = p;
918    SKIP_WS (p);
919
920    if (!end_tag)
921      {
922        struct tagstack_item *ts = tagstack_push (&head, &tail);
923        if (ts)
924          {
925            ts->tagname_begin  = tag_name_begin;
926            ts->tagname_end    = tag_name_end;
927            ts->contents_begin = NULL;
928          }
929      }
930
931    if (end_tag && *p != '>' && *p != '<')
932      goto backout_tag;
933
934    if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
935      /* We can't just say "goto look_for_tag" here because we need
936         the loop below to properly advance over the tag's attributes.  */
937      uninteresting_tag = true;
938    else
939      {
940        uninteresting_tag = false;
941        convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
942      }
943
944    /* Find the attributes. */
945    while (1)
946      {
947        const char *attr_name_begin, *attr_name_end;
948        const char *attr_value_begin, *attr_value_end;
949        const char *attr_raw_value_begin, *attr_raw_value_end;
950        int operation = AP_DOWNCASE; /* stupid compiler. */
951
952        SKIP_WS (p);
953
954        if (*p == '/')
955          {
956            /* A slash at this point means the tag is about to be
957               closed.  This is legal in XML and has been popularized
958               in HTML via XHTML.  */
959            /* <foo a=b c=d /> */
960            /*              ^  */
961            ADVANCE (p);
962            SKIP_WS (p);
963            if (*p != '<' && *p != '>')
964              goto backout_tag;
965          }
966
967        /* Check for end of tag definition. */
968        if (*p == '<' || *p == '>')
969          break;
970
971        /* Establish bounds of attribute name. */
972        attr_name_begin = p;    /* <foo bar ...> */
973                                /*      ^        */
974        while (NAME_CHAR_P (*p))
975          ADVANCE (p);
976        attr_name_end = p;      /* <foo bar ...> */
977                                /*         ^     */
978        if (attr_name_begin == attr_name_end)
979          goto backout_tag;
980
981        /* Establish bounds of attribute value. */
982        SKIP_WS (p);
983
984        if (NAME_CHAR_P (*p) || *p == '/' || *p == '<' || *p == '>')
985          {
986            /* Minimized attribute syntax allows `=' to be omitted.
987               For example, <UL COMPACT> is a valid shorthand for <UL
988               COMPACT="compact">.  Even if such attributes are not
989               useful to Wget, we need to support them, so that the
990               tags containing them can be parsed correctly. */
991            attr_raw_value_begin = attr_value_begin = attr_name_begin;
992            attr_raw_value_end = attr_value_end = attr_name_end;
993          }
994        else if (*p == '=')
995          {
996            ADVANCE (p);
997            SKIP_WS (p);
998            if (*p == '\"' || *p == '\'')
999              {
1000                bool newline_seen = false;
1001                char quote_char = *p;
1002                attr_raw_value_begin = p;
1003                ADVANCE (p);
1004                attr_value_begin = p; /* <foo bar="baz"> */
1005                                      /*           ^     */
1006                while (*p != quote_char)
1007                  {
1008                    if (!newline_seen && *p == '\n')
1009                      {
1010                        /* If a newline is seen within the quotes, it
1011                           is most likely that someone forgot to close
1012                           the quote.  In that case, we back out to
1013                           the value beginning, and terminate the tag
1014                           at either `>' or the delimiter, whichever
1015                           comes first.  Such a tag terminated at `>'
1016                           is discarded.  */
1017                        p = attr_value_begin;
1018                        newline_seen = true;
1019                        continue;
1020                      }
1021                    else if (newline_seen && (*p == '<' || *p == '>'))
1022                      break;
1023                    ADVANCE (p);
1024                  }
1025                attr_value_end = p; /* <foo bar="baz"> */
1026                                    /*              ^  */
1027                if (*p == quote_char)
1028                  ADVANCE (p);
1029                else
1030                  goto look_for_tag;
1031                attr_raw_value_end = p; /* <foo bar="baz"> */
1032                                        /*               ^ */
1033                operation = AP_DECODE_ENTITIES;
1034                if (flags & MHT_TRIM_VALUES)
1035                  operation |= AP_TRIM_BLANKS;
1036              }
1037            else
1038              {
1039                attr_value_begin = p; /* <foo bar=baz> */
1040                                      /*          ^    */
1041                /* According to SGML, a name token should consist only
1042                   of alphanumerics, . and -.  However, this is often
1043                   violated by, for instance, `%' in `width=75%'.
1044                   We'll be liberal and allow just about anything as
1045                   an attribute value.  */
1046                while (!c_isspace (*p) && *p != '<' && *p != '>')
1047                  ADVANCE (p);
1048                attr_value_end = p; /* <foo bar=baz qux=quix> */
1049                                    /*             ^          */
1050                if (attr_value_begin == attr_value_end)
1051                  /* <foo bar=> */
1052                  /*          ^ */
1053                  goto backout_tag;
1054                attr_raw_value_begin = attr_value_begin;
1055                attr_raw_value_end = attr_value_end;
1056                operation = AP_DECODE_ENTITIES;
1057              }
1058          }
1059        else
1060          {
1061            /* We skipped the whitespace and found something that is
1062               neither `=' nor the beginning of the next attribute's
1063               name.  Back out.  */
1064            goto backout_tag;   /* <foo bar [... */
1065                                /*          ^    */
1066          }
1067
1068        /* If we're not interested in the tag, don't bother with any
1069           of the attributes.  */
1070        if (uninteresting_tag)
1071          continue;
1072
1073        /* If we aren't interested in the attribute, skip it.  We
1074           cannot do this test any sooner, because our text pointer
1075           needs to correctly advance over the attribute.  */
1076        if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
1077          continue;
1078
1079        GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
1080                    struct attr_pair);
1081
1082        pairs[nattrs].name_pool_index = pool.tail;
1083        convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
1084
1085        pairs[nattrs].value_pool_index = pool.tail;
1086        convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
1087        pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
1088        pairs[nattrs].value_raw_size = (attr_raw_value_end
1089                                        - attr_raw_value_begin);
1090        ++nattrs;
1091      }
1092
1093    if (!end_tag && tail && (tail->tagname_begin == tag_name_begin))
1094      {
1095        tail->contents_begin = p+1;
1096      }
1097
1098    if (uninteresting_tag)
1099      {
1100        ADVANCE (p);
1101        goto look_for_tag;
1102      }
1103
1104    /* By now, we have a valid tag with a name and zero or more
1105       attributes.  Fill in the data and call the mapper function.  */
1106    {
1107      int i;
1108      struct taginfo taginfo;
1109      struct tagstack_item *ts = NULL;
1110
1111      taginfo.name      = pool.contents;
1112      taginfo.end_tag_p = end_tag;
1113      taginfo.nattrs    = nattrs;
1114      /* We fill in the char pointers only now, when pool can no
1115         longer get realloc'ed.  If we did that above, we could get
1116         hosed by reallocation.  Obviously, after this point, the pool
1117         may no longer be grown.  */
1118      for (i = 0; i < nattrs; i++)
1119        {
1120          pairs[i].name = pool.contents + pairs[i].name_pool_index;
1121          pairs[i].value = pool.contents + pairs[i].value_pool_index;
1122        }
1123      taginfo.attrs = pairs;
1124      taginfo.start_position = tag_start_position;
1125      taginfo.end_position   = p + 1;
1126      taginfo.contents_begin = NULL;
1127      taginfo.contents_end = NULL;
1128
1129      if (end_tag)
1130        {
1131          ts = tagstack_find (tail, tag_name_begin, tag_name_end);
1132          if (ts)
1133            {
1134              if (ts->contents_begin)
1135                {
1136                  taginfo.contents_begin = ts->contents_begin;
1137                  taginfo.contents_end   = tag_start_position;
1138                }
1139              tagstack_pop (&head, &tail, ts);
1140            }
1141        }
1142
1143      mapfun (&taginfo, maparg);
1144      if (*p != '<')
1145        ADVANCE (p);
1146    }
1147    goto look_for_tag;
1148
1149  backout_tag:
1150#ifdef STANDALONE
1151    ++tag_backout_count;
1152#endif
1153    /* The tag wasn't really a tag.  Treat its contents as ordinary
1154       data characters. */
1155    p = tag_start_position + 1;
1156    goto look_for_tag;
1157  }
1158
1159 finish:
1160  POOL_FREE (&pool);
1161  if (attr_pair_resized)
1162    xfree (pairs);
1163  /* pop any tag stack that's left */
1164  tagstack_pop (&head, &tail, head);
1165}
1166
1167#undef ADVANCE
1168#undef SKIP_WS
1169#undef SKIP_NON_WS
1170
1171#ifdef STANDALONE
1172static void
1173test_mapper (struct taginfo *taginfo, void *arg)
1174{
1175  int i;
1176
1177  printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1178  for (i = 0; i < taginfo->nattrs; i++)
1179    printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1180  putchar ('\n');
1181  ++*(int *)arg;
1182}
1183
1184int main ()
1185{
1186  int size = 256;
1187  char *x = xmalloc (size);
1188  int length = 0;
1189  int read_count;
1190  int tag_counter = 0;
1191
1192#ifdef ENABLE_NLS
1193  /* Set the current locale.  */
1194  setlocale (LC_ALL, "");
1195  /* Set the text message domain.  */
1196  bindtextdomain ("wget", LOCALEDIR);
1197  textdomain ("wget");
1198#endif /* ENABLE_NLS */
1199
1200  while ((read_count = fread (x + length, 1, size - length, stdin)))
1201    {
1202      length += read_count;
1203      size <<= 1;
1204      x = xrealloc (x, size);
1205    }
1206
1207  map_html_tags (x, length, test_mapper, &tag_counter, 0, NULL, NULL);
1208  printf ("TAGS: %d\n", tag_counter);
1209  printf ("Tag backouts:     %d\n", tag_backout_count);
1210  printf ("Comment backouts: %d\n", comment_backout_count);
1211  return 0;
1212}
1213#endif /* STANDALONE */
1214