1/* parser.c -- HTML Parser
2
3  (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
4  See tidy.h for the copyright notice.
5
6  CVS Info :
7
8    $Author$
9    $Date$
10    $Revision$
11
12*/
13
14#include "tidy-int.h"
15#include "lexer.h"
16#include "parser.h"
17#include "message.h"
18#include "clean.h"
19#include "tags.h"
20#include "tmbstr.h"
21
22#ifdef AUTO_INPUT_ENCODING
23#include "charsets.h"
24#endif
25
26Bool TY_(CheckNodeIntegrity)(Node *node)
27{
28#ifndef NO_NODE_INTEGRITY_CHECK
29    Node *child;
30
31    if (node->prev)
32    {
33        if (node->prev->next != node)
34            return no;
35    }
36
37    if (node->next)
38    {
39        if (node->next->prev != node)
40            return no;
41    }
42
43    if (node->parent)
44    {
45        if (node->prev == NULL && node->parent->content != node)
46            return no;
47
48        if (node->next == NULL && node->parent->last != node)
49            return no;
50    }
51
52    for (child = node->content; child; child = child->next)
53        if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) )
54            return no;
55
56#endif
57    return yes;
58}
59
60/*
61 used to determine how attributes
62 without values should be printed
63 this was introduced to deal with
64 user defined tags e.g. Cold Fusion
65*/
66Bool TY_(IsNewNode)(Node *node)
67{
68    if (node && node->tag)
69    {
70        return (node->tag->model & CM_NEW);
71    }
72    return yes;
73}
74
75void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
76{
77    const Dict* tag = TY_(LookupTagDef)(tid);
78    Node* tmp = TY_(InferredTag)(doc, tag->id);
79
80    if (obsolete)
81        TY_(ReportWarning)(doc, node, tmp, OBSOLETE_ELEMENT);
82    else if (unexpected)
83        TY_(ReportError)(doc, node, tmp, REPLACING_UNEX_ELEMENT);
84    else
85        TY_(ReportNotice)(doc, node, tmp, REPLACING_ELEMENT);
86
87    MemFree(tmp->element);
88    MemFree(tmp);
89
90    node->was = node->tag;
91    node->tag = tag;
92    node->type = StartTag;
93    node->implicit = yes;
94    MemFree(node->element);
95    node->element = TY_(tmbstrdup)(tag->name);
96}
97
98/* extract a node and its children from a markup tree */
99Node *TY_(RemoveNode)(Node *node)
100{
101    if (node->prev)
102        node->prev->next = node->next;
103
104    if (node->next)
105        node->next->prev = node->prev;
106
107    if (node->parent)
108    {
109        if (node->parent->content == node)
110            node->parent->content = node->next;
111
112        if (node->parent->last == node)
113            node->parent->last = node->prev;
114    }
115
116    node->parent = node->prev = node->next = NULL;
117    return node;
118}
119
120/* remove node from markup tree and discard it */
121Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element )
122{
123    Node *next = NULL;
124
125    if (element)
126    {
127        next = element->next;
128        TY_(RemoveNode)(element);
129        TY_(FreeNode)( doc, element);
130    }
131
132    return next;
133}
134
135/*
136 insert "node" into markup tree as the firt element
137 of content of "element"
138*/
139void TY_(InsertNodeAtStart)(Node *element, Node *node)
140{
141    node->parent = element;
142
143    if (element->content == NULL)
144        element->last = node;
145    else
146        element->content->prev = node;
147
148    node->next = element->content;
149    node->prev = NULL;
150    element->content = node;
151}
152
153/*
154 insert "node" into markup tree as the last element
155 of content of "element"
156*/
157void TY_(InsertNodeAtEnd)(Node *element, Node *node)
158{
159    node->parent = element;
160    node->prev = element->last;
161
162    if (element->last != NULL)
163        element->last->next = node;
164    else
165        element->content = node;
166
167    element->last = node;
168}
169
170/*
171 insert "node" into markup tree in place of "element"
172 which is moved to become the child of the node
173*/
174static void InsertNodeAsParent(Node *element, Node *node)
175{
176    node->content = element;
177    node->last = element;
178    node->parent = element->parent;
179    element->parent = node;
180
181    if (node->parent->content == element)
182        node->parent->content = node;
183
184    if (node->parent->last == element)
185        node->parent->last = node;
186
187    node->prev = element->prev;
188    element->prev = NULL;
189
190    if (node->prev)
191        node->prev->next = node;
192
193    node->next = element->next;
194    element->next = NULL;
195
196    if (node->next)
197        node->next->prev = node;
198}
199
200/* insert "node" into markup tree before "element" */
201void TY_(InsertNodeBeforeElement)(Node *element, Node *node)
202{
203    Node *parent;
204
205    parent = element->parent;
206    node->parent = parent;
207    node->next = element;
208    node->prev = element->prev;
209    element->prev = node;
210
211    if (node->prev)
212        node->prev->next = node;
213
214    if (parent->content == element)
215        parent->content = node;
216}
217
218/* insert "node" into markup tree after "element" */
219void TY_(InsertNodeAfterElement)(Node *element, Node *node)
220{
221    Node *parent;
222
223    parent = element->parent;
224    node->parent = parent;
225
226    /* AQ - 13 Jan 2000 fix for parent == NULL */
227    if (parent != NULL && parent->last == element)
228        parent->last = node;
229    else
230    {
231        node->next = element->next;
232        /* AQ - 13 Jan 2000 fix for node->next == NULL */
233        if (node->next != NULL)
234            node->next->prev = node;
235    }
236
237    element->next = node;
238    node->prev = element;
239}
240
241static Bool CanPrune( TidyDocImpl* doc, Node *element )
242{
243    if ( TY_(nodeIsText)(element) )
244        return yes;
245
246    if ( element->content )
247        return no;
248
249    if ( element->tag == NULL )
250        return no;
251
252    if ( element->tag->model & CM_BLOCK && element->attributes != NULL )
253        return no;
254
255    if ( nodeIsA(element) && element->attributes != NULL )
256        return no;
257
258    if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
259        return no;
260
261    if ( element->tag->model & CM_ROW )
262        return no;
263
264    if ( element->tag->model & CM_EMPTY )
265        return no;
266
267    if ( nodeIsAPPLET(element) )
268        return no;
269
270    if ( nodeIsOBJECT(element) )
271        return no;
272
273    if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
274        return no;
275
276    if ( nodeIsTITLE(element) )
277        return no;
278
279    /* #433359 - fix by Randy Waki 12 Mar 01 */
280    if ( nodeIsIFRAME(element) )
281        return no;
282
283    /* fix for bug 770297 */
284    if (nodeIsTEXTAREA(element))
285        return no;
286
287    if ( attrGetID(element) || attrGetNAME(element) )
288        return no;
289
290    /* fix for bug 695408; a better fix would look for unknown and    */
291    /* known proprietary attributes that make the element significant */
292    if (attrGetDATAFLD(element))
293        return no;
294
295    /* fix for bug 723772, don't trim new-...-tags */
296    if (element->tag->id == TidyTag_UNKNOWN)
297        return no;
298
299    if (nodeIsBODY(element))
300        return no;
301
302    if (nodeIsCOLGROUP(element))
303        return no;
304
305    return yes;
306}
307
308/* return next element */
309Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element )
310{
311    if ( CanPrune(doc, element) )
312    {
313       if (element->type != TextNode)
314            TY_(ReportNotice)(doc, element, NULL, TRIM_EMPTY_ELEMENT);
315
316        return TY_(DiscardElement)(doc, element);
317    }
318    return element->next;
319}
320
321Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node)
322{
323    Node* next;
324
325    while (node)
326    {
327        next = node->next;
328
329        if (node->content)
330            TY_(DropEmptyElements)(doc, node->content);
331
332        if (!TY_(nodeIsElement)(node) &&
333            !(TY_(nodeIsText)(node) && !(node->start < node->end)))
334        {
335            node = next;
336            continue;
337        }
338
339        next = TY_(TrimEmptyElement)(doc, node);
340        node = next;
341    }
342
343    return node;
344}
345
346/*
347  errors in positioning of form start or end tags
348  generally require human intervention to fix
349*/
350static void BadForm( TidyDocImpl* doc )
351{
352    doc->badForm = yes;
353    /* doc->errors++; */
354}
355
356/*
357  This maps
358       <em>hello </em><strong>world</strong>
359  to
360       <em>hello</em> <strong>world</strong>
361
362  If last child of element is a text node
363  then trim trailing white space character
364  moving it to after element's end tag.
365*/
366static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
367{
368    Lexer* lexer = doc->lexer;
369    byte c;
370
371    if (TY_(nodeIsText)(last))
372    {
373        if (last->end > last->start)
374        {
375            c = (byte) lexer->lexbuf[ last->end - 1 ];
376
377            if (   c == ' '
378#ifdef COMMENT_NBSP_FIX
379                || c == 160
380#endif
381               )
382            {
383#ifdef COMMENT_NBSP_FIX
384                /* take care with <td>&nbsp;</td> */
385                if ( c == 160 &&
386                     ( element->tag == doc->tags.tag_td ||
387                       element->tag == doc->tags.tag_th )
388                   )
389                {
390                    if (last->end > last->start + 1)
391                        last->end -= 1;
392                }
393                else
394#endif
395                {
396                    last->end -= 1;
397                    if ( (element->tag->model & CM_INLINE) &&
398                         !(element->tag->model & CM_FIELD) )
399                        lexer->insertspace = yes;
400                }
401            }
402        }
403    }
404}
405
#if 0
/*
 Disabled code.  Creates a new node whose start/end offsets
 bracket a literal spelling of "element" appended through the
 lexer: '<', an optional '/', the element name (or "!DOCTYPE "
 plus the element's own buffer range for a doctype), an
 optional '/' for a start-end tag, and '>'.
 NOTE(review): assumes AddByte appends at lexer->lexsize - confirm.
*/
static Node *EscapeTag(Lexer *lexer, Node *element)
{
    Node *escaped = NewNode(lexer);
    char *p;
    uint i;

    escaped->start = lexer->lexsize;
    AddByte(lexer, '<');

    if (element->type == EndTag)
        AddByte(lexer, '/');

    if (element->element)
    {
        p = element->element;
        while (*p != '\0')
        {
            AddByte(lexer, *p);
            ++p;
        }
    }
    else if (element->type == DocTypeTag)
    {
        AddStringLiteral( lexer, "!DOCTYPE " );
        i = element->start;
        while (i < element->end)
        {
            AddByte(lexer, lexer->lexbuf[i]);
            ++i;
        }
    }

    if (element->type == StartEndTag)
        AddByte(lexer, '/');

    AddByte(lexer, '>');
    escaped->end = lexer->lexsize;

    return escaped;
}
#endif /* 0 */
440
441/* Only true for text nodes. */
442Bool TY_(IsBlank)(Lexer *lexer, Node *node)
443{
444    Bool isBlank = TY_(nodeIsText)(node);
445    if ( isBlank )
446        isBlank = ( node->end == node->start ||       /* Zero length */
447                    ( node->end == node->start+1      /* or one blank. */
448                      && lexer->lexbuf[node->start] == ' ' ) );
449    return isBlank;
450}
451
452/*
453  This maps
454       <p>hello<em> world</em>
455  to
456       <p>hello <em>world</em>
457
458  Trims initial space, by moving it before the
459  start tag, or if this element is the first in
460  parent's content, then by discarding the space
461*/
462static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
463{
464    Lexer* lexer = doc->lexer;
465    Node *prev, *node;
466
467    if ( TY_(nodeIsText)(text) &&
468         lexer->lexbuf[text->start] == ' ' &&
469         text->start < text->end )
470    {
471        if ( (element->tag->model & CM_INLINE) &&
472             !(element->tag->model & CM_FIELD) )
473        {
474            prev = element->prev;
475
476            if (TY_(nodeIsText)(prev))
477            {
478                if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
479                    lexer->lexbuf[(prev->end)++] = ' ';
480
481                ++(element->start);
482            }
483            else /* create new node */
484            {
485                node = TY_(NewNode)(lexer);
486                node->start = (element->start)++;
487                node->end = element->start;
488                lexer->lexbuf[node->start] = ' ';
489                TY_(InsertNodeBeforeElement)(element ,node);
490            }
491        }
492
493        /* discard the space in current node */
494        ++(text->start);
495    }
496}
497
498static Bool IsPreDescendant(Node* node)
499{
500    Node *parent = node->parent;
501
502    while (parent)
503    {
504        if (parent->tag && parent->tag->parser == TY_(ParsePre))
505            return yes;
506
507        parent = parent->parent;
508    }
509
510    return no;
511}
512
513static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
514{
515    Node* next;
516
517    if (!TY_(nodeIsText)(node))
518        return no;
519
520    if (node->parent->type == DocTypeTag)
521        return no;
522
523    if (IsPreDescendant(node))
524        return no;
525
526    if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
527        return no;
528
529    next = node->next;
530
531    /* <p>... </p> */
532    if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE))
533        return yes;
534
535    /* <div><small>... </small><h3>...</h3></div> */
536    if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE))
537        return yes;
538
539    if (!next)
540        return no;
541
542    if (nodeIsBR(next))
543        return yes;
544
545    if (TY_(nodeHasCM)(next, CM_INLINE))
546        return no;
547
548    /* <a href='/'>...</a> <p>...</p> */
549    if (next->type == StartTag)
550        return yes;
551
552    /* <strong>...</strong> <hr /> */
553    if (next->type == StartEndTag)
554        return yes;
555
556    /* evil adjacent text nodes, Tidy should not generate these :-( */
557    if (TY_(nodeIsText)(next) && next->start < next->end
558        && TY_(IsWhite)(doc->lexer->lexbuf[next->start]))
559        return yes;
560
561    return no;
562}
563
564static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
565{
566    if (!TY_(nodeIsText)(node))
567        return no;
568
569    if (node->parent->type == DocTypeTag)
570        return no;
571
572    if (IsPreDescendant(node))
573        return no;
574
575    if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
576        return no;
577
578    /* <p>...<br> <em>...</em>...</p> */
579    if (nodeIsBR(node->prev))
580        return yes;
581
582    /* <p> ...</p> */
583    if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE))
584        return yes;
585
586    /* <h4>...</h4> <em>...</em> */
587    if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) &&
588        TY_(nodeIsElement)(node->prev))
589        return yes;
590
591    /* <p><span> ...</span></p> */
592    if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE))
593        return yes;
594
595    return no;
596}
597
598static void CleanSpaces(TidyDocImpl* doc, Node* node)
599{
600    Node* next;
601
602    while (node)
603    {
604        next = node->next;
605
606        if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node))
607            while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start]))
608                ++(node->start);
609
610        if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node))
611            while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1]))
612                --(node->end);
613
614        if (TY_(nodeIsText)(node) && !(node->start < node->end))
615        {
616            TY_(RemoveNode)(node);
617            TY_(FreeNode)(doc, node);
618            node = next;
619
620            continue;
621        }
622
623        if (node->content)
624            CleanSpaces(doc, node->content);
625
626        node = next;
627    }
628}
629
630/*
631  Move initial and trailing space out.
632  This routine maps:
633
634       hello<em> world</em>
635  to
636       hello <em>world</em>
637  and
638       <em>hello </em><strong>world</strong>
639  to
640       <em>hello</em> <strong>world</strong>
641*/
642static void TrimSpaces( TidyDocImpl* doc, Node *element)
643{
644    Node* text = element->content;
645
646    if (nodeIsPRE(element) || IsPreDescendant(element))
647        return;
648
649    if (TY_(nodeIsText)(text))
650        TrimInitialSpace(doc, element, text);
651
652    text = element->last;
653
654    if (TY_(nodeIsText)(text))
655        TrimTrailingSpace(doc, element, text);
656}
657
658static Bool DescendantOf( Node *element, TidyTagId tid )
659{
660    Node *parent;
661    for ( parent = element->parent;
662          parent != NULL;
663          parent = parent->parent )
664    {
665        if ( TagIsId(parent, tid) )
666            return yes;
667    }
668    return no;
669}
670
671static Bool InsertMisc(Node *element, Node *node)
672{
673    if (node->type == CommentTag ||
674        node->type == ProcInsTag ||
675        node->type == CDATATag ||
676        node->type == SectionTag ||
677        node->type == AspTag ||
678        node->type == JsteTag ||
679        node->type == PhpTag )
680    {
681        TY_(InsertNodeAtEnd)(element, node);
682        return yes;
683    }
684
685    if ( node->type == XmlDecl )
686    {
687        Node* root = element;
688        while ( root && root->parent )
689            root = root->parent;
690        if ( root )
691        {
692/* Apple Changes:
693   2007-03-05 iccir [5036506] Don't insert an XmlDecl if one already exists.
694*/
695#ifdef TIDY_APPLE_CHANGES
696            if (!(root->content && root->content->type == XmlDecl))
697            {
698                TY_(InsertNodeAtStart)( root, node );
699                return yes;
700            }
701#else
702          TY_(InsertNodeAtStart)( root, node );
703          return yes;
704#endif
705        }
706    }
707
708    /* Declared empty tags seem to be slipping through
709    ** the cracks.  This is an experiment to figure out
710    ** a decent place to pick them up.
711    */
712    if ( node->tag &&
713         TY_(nodeIsElement)(node) &&
714         TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN &&
715         (node->tag->versions & VERS_PROPRIETARY) != 0 )
716    {
717        TY_(InsertNodeAtEnd)(element, node);
718        return yes;
719    }
720
721    return no;
722}
723
724
725static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode )
726{
727    Lexer* lexer = doc->lexer;
728    /*
729       Fix by GLP 2000-12-21.  Need to reset insertspace if this
730       is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
731    */
732    if (node->tag->model & CM_EMPTY)
733    {
734        lexer->waswhite = no;
735        if (node->tag->parser == NULL)
736            return;
737    }
738    else if (!(node->tag->model & CM_INLINE))
739        lexer->insertspace = no;
740
741    if (node->tag->parser == NULL)
742        return;
743
744    if (node->type == StartEndTag)
745        return;
746
747    (*node->tag->parser)( doc, node, mode );
748}
749
750/*
751 the doctype has been found after other tags,
752 and needs moving to before the html element
753*/
754static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
755{
756    Node* existing = TY_(FindDocType)( doc );
757    if ( existing )
758    {
759        TY_(ReportError)(doc, element, doctype, DISCARDING_UNEXPECTED );
760        TY_(FreeNode)( doc, doctype );
761    }
762    else
763    {
764        TY_(ReportError)(doc, element, doctype, DOCTYPE_AFTER_TAGS );
765        while ( !nodeIsHTML(element) )
766            element = element->parent;
767        TY_(InsertNodeBeforeElement)( element, doctype );
768    }
769}
770
771/*
772 move node to the head, where element is used as starting
773 point in hunt for head. normally called during parsing
774*/
775static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
776{
777    Node *head;
778
779    TY_(RemoveNode)( node );  /* make sure that node is isolated */
780
781    if ( TY_(nodeIsElement)(node) )
782    {
783        TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN );
784
785        head = TY_(FindHEAD)(doc);
786        assert(head != NULL);
787
788        TY_(InsertNodeAtEnd)(head, node);
789
790        if ( node->tag->parser )
791            ParseTag( doc, node, IgnoreWhitespace );
792    }
793    else
794    {
795        TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
796        TY_(FreeNode)( doc, node );
797    }
798}
799
800/* moves given node to end of body element */
801static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
802{
803    Node* body = TY_(FindBody)( doc );
804    if ( body )
805    {
806        TY_(RemoveNode)( node );
807        TY_(InsertNodeAtEnd)( body, node );
808    }
809}
810
811static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
812{
813    ctmbstr sprop =
814        "padding-left: 2ex; margin-left: 0ex"
815        "; margin-top: 0ex; margin-bottom: 0ex";
816    if ( !cfgBool(doc, TidyDecorateInferredUL) )
817        return;
818    if ( cfgBool(doc, TidyMakeClean) )
819        TY_(AddStyleAsClass)( doc, node, sprop );
820    else
821        TY_(AddStyleProperty)( doc, node, sprop );
822}
823
824/*
825   element is node created by the lexer
826   upon seeing the start tag, or by the
827   parser when the start tag is inferred
828*/
829void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
830{
831    Lexer* lexer = doc->lexer;
832    Node *node;
833    Bool checkstack = yes;
834    uint istackbase = 0;
835
836    if ( element->tag->model & CM_EMPTY )
837        return;
838
839    if ( nodeIsFORM(element) &&
840         DescendantOf(element, TidyTag_FORM) )
841        TY_(ReportError)(doc, element, NULL, ILLEGAL_NESTING );
842
843    /*
844     InlineDup() asks the lexer to insert inline emphasis tags
845     currently pushed on the istack, but take care to avoid
846     propagating inline emphasis inside OBJECT or APPLET.
847     For these elements a fresh inline stack context is created
848     and disposed of upon reaching the end of the element.
849     They thus behave like table cells in this respect.
850    */
851    if (element->tag->model & CM_OBJECT)
852    {
853        istackbase = lexer->istackbase;
854        lexer->istackbase = lexer->istacksize;
855    }
856
857    if (!(element->tag->model & CM_MIXED))
858        TY_(InlineDup)( doc, NULL );
859
860    mode = IgnoreWhitespace;
861
862    while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL)
863    {
864        /* end tag for this element */
865        if (node->type == EndTag && node->tag &&
866            (node->tag == element->tag || element->was == node->tag))
867        {
868            TY_(FreeNode)( doc, node );
869
870            if (element->tag->model & CM_OBJECT)
871            {
872                /* pop inline stack */
873                while (lexer->istacksize > lexer->istackbase)
874                    TY_(PopInline)( doc, NULL );
875                lexer->istackbase = istackbase;
876            }
877
878            element->closed = yes;
879            TrimSpaces( doc, element );
880            return;
881        }
882
883        if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD ))
884        {
885            /*  If we're in the HEAD, close it before proceeding.
886                This is an extremely rare occurance, but has been observed.
887            */
888            TY_(UngetToken)( doc );
889            break;
890        }
891
892        if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
893        {
894            if ( TY_(nodeIsElement)(node) )
895                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
896            TY_(FreeNode)( doc, node );
897            continue;
898        }
899
900
901        if (node->type == EndTag)
902        {
903            if (node->tag == NULL)
904            {
905                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
906                TY_(FreeNode)( doc, node );
907                continue;
908            }
909            else if ( nodeIsBR(node) )
910                node->type = StartTag;
911            else if ( nodeIsP(node) )
912            {
913                /* Cannot have a block inside a paragraph, so no checking
914                   for an ancestor is necessary -- but we _can_ have
915                   paragraphs inside a block, so change it to an implicit
916                   empty paragraph, to be dealt with according to the user's
917                   options
918                */
919                node->type = StartEndTag;
920                node->implicit = yes;
921#if OBSOLETE
922                TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
923                TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
924                TY_(InsertNodeAtEnd)( element, node );
925                node = InferredTag(doc, TidyTag_BR);
926#endif
927            }
928            else if (DescendantOf( element, node->tag->id ))
929            {
930                /*
931                  if this is the end tag for an ancestor element
932                  then infer end tag for this element
933                */
934                TY_(UngetToken)( doc );
935                break;
936#if OBSOLETE
937                Node *parent;
938                for ( parent = element->parent;
939                      parent != NULL;
940                      parent = parent->parent )
941                {
942                    if (node->tag == parent->tag)
943                    {
944                        if (!(element->tag->model & CM_OPT))
945                            TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
946
947                        TY_(UngetToken)( doc );
948
949                        if (element->tag->model & CM_OBJECT)
950                        {
951                            /* pop inline stack */
952                            while (lexer->istacksize > lexer->istackbase)
953                                TY_(PopInline)( doc, NULL );
954                            lexer->istackbase = istackbase;
955                        }
956
957                        TrimSpaces( doc, element );
958                        return;
959                    }
960                }
961#endif
962            }
963            else
964            {
965                /* special case </tr> etc. for stuff moved in front of table */
966                if ( lexer->exiled
967                     && node->tag->model
968                     && (node->tag->model & CM_TABLE) )
969                {
970                    TY_(UngetToken)( doc );
971                    TrimSpaces( doc, element );
972                    return;
973                }
974            }
975        }
976
977        /* mixed content model permits text */
978        if (TY_(nodeIsText)(node))
979        {
980            if ( checkstack )
981            {
982                checkstack = no;
983                if (!(element->tag->model & CM_MIXED))
984                {
985                    if ( TY_(InlineDup)(doc, node) > 0 )
986                        continue;
987                }
988            }
989
990            TY_(InsertNodeAtEnd)(element, node);
991            mode = MixedContent;
992
993            /*
994              HTML4 strict doesn't allow mixed content for
995              elements with %block; as their content model
996            */
997            /*
998              But only body, map, blockquote, form and
999              noscript have content model %block;
1000            */
1001            if ( nodeIsBODY(element)       ||
1002                 nodeIsMAP(element)        ||
1003                 nodeIsBLOCKQUOTE(element) ||
1004                 nodeIsFORM(element)       ||
1005                 nodeIsNOSCRIPT(element) )
1006                TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
1007            continue;
1008        }
1009
1010        if ( InsertMisc(element, node) )
1011            continue;
1012
1013        /* allow PARAM elements? */
1014        if ( nodeIsPARAM(node) )
1015        {
1016            if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
1017            {
1018                TY_(InsertNodeAtEnd)(element, node);
1019                continue;
1020            }
1021
1022            /* otherwise discard it */
1023            TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1024            TY_(FreeNode)( doc, node );
1025            continue;
1026        }
1027
1028        /* allow AREA elements? */
1029        if ( nodeIsAREA(node) )
1030        {
1031            if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
1032            {
1033                TY_(InsertNodeAtEnd)(element, node);
1034                continue;
1035            }
1036
1037            /* otherwise discard it */
1038            TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1039            TY_(FreeNode)( doc, node );
1040            continue;
1041        }
1042
1043        /* ignore unknown start/end tags */
1044        if ( node->tag == NULL )
1045        {
1046            TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1047            TY_(FreeNode)( doc, node );
1048            continue;
1049        }
1050
1051        /*
1052          Allow CM_INLINE elements here.
1053
1054          Allow CM_BLOCK elements here unless
1055          lexer->excludeBlocks is yes.
1056
1057          LI and DD are special cased.
1058
1059          Otherwise infer end tag for this element.
1060        */
1061
1062        if ( !TY_(nodeHasCM)(node, CM_INLINE) )
1063        {
1064            if ( !TY_(nodeIsElement)(node) )
1065            {
1066                if ( nodeIsFORM(node) )
1067                    BadForm( doc );
1068
1069                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1070                TY_(FreeNode)( doc, node );
1071                continue;
1072            }
1073
1074            /* #427671 - Fix by Randy Waki - 10 Aug 00 */
1075            /*
1076             If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
1077             start tag, discard the start tag and let the subsequent content get
1078             parsed as content of the enclosing LI.  This seems to mimic IE and
1079             Netscape, and avoids an infinite loop: without this check,
1080             ParseBlock (which is parsing the LI's content) and ParseList (which
1081             is parsing the LI's parent's content) repeatedly defer to each
1082             other to parse the illegal start tag, each time inferring a missing
1083             </li> or <li> respectively.
1084
1085             NOTE: This check is a bit fragile.  It specifically checks for the
1086             four tags that happen to weave their way through the current series
1087             of tests performed by ParseBlock and ParseList to trigger the
1088             infinite loop.
1089            */
1090            if ( nodeIsLI(element) )
1091            {
1092                if ( nodeIsFRAME(node)    ||
1093                     nodeIsFRAMESET(node) ||
1094                     nodeIsOPTGROUP(node) ||
1095                     nodeIsOPTION(node) )
1096                {
1097                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1098                    TY_(FreeNode)( doc, node );  /* DSR - 27Apr02 avoid memory leak */
1099                    continue;
1100                }
1101            }
1102
1103            if ( nodeIsTD(element) || nodeIsTH(element) )
1104            {
1105                /* if parent is a table cell, avoid inferring the end of the cell */
1106
1107                if ( TY_(nodeHasCM)(node, CM_HEAD) )
1108                {
1109                    MoveToHead( doc, element, node );
1110                    continue;
1111                }
1112
1113                if ( TY_(nodeHasCM)(node, CM_LIST) )
1114                {
1115                    TY_(UngetToken)( doc );
1116                    node = TY_(InferredTag)(doc, TidyTag_UL);
1117                    AddClassNoIndent(doc, node);
1118                    lexer->excludeBlocks = yes;
1119                }
1120                else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1121                {
1122                    TY_(UngetToken)( doc );
1123                    node = TY_(InferredTag)(doc, TidyTag_DL);
1124                    lexer->excludeBlocks = yes;
1125                }
1126
1127                /* infer end of current table cell */
1128                if ( !TY_(nodeHasCM)(node, CM_BLOCK) )
1129                {
1130                    TY_(UngetToken)( doc );
1131                    TrimSpaces( doc, element );
1132                    return;
1133                }
1134            }
1135            else if ( TY_(nodeHasCM)(node, CM_BLOCK) )
1136            {
1137                if ( lexer->excludeBlocks )
1138                {
1139                    if ( !TY_(nodeHasCM)(element, CM_OPT) )
1140                        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1141
1142                    TY_(UngetToken)( doc );
1143
1144                    if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1145                        lexer->istackbase = istackbase;
1146
1147                    TrimSpaces( doc, element );
1148                    return;
1149                }
1150            }
1151            else /* things like list items */
1152            {
1153                if (node->tag->model & CM_HEAD)
1154                {
1155                    MoveToHead( doc, element, node );
1156                    continue;
1157                }
1158
1159                /*
1160                 special case where a form start tag
1161                 occurs in a tr and is followed by td or th
1162                */
1163
1164                if ( nodeIsFORM(element) &&
1165                     nodeIsTD(element->parent) &&
1166                     element->parent->implicit )
1167                {
1168                    if ( nodeIsTD(node) )
1169                    {
1170                        TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1171                        TY_(FreeNode)( doc, node );
1172                        continue;
1173                    }
1174
1175                    if ( nodeIsTH(node) )
1176                    {
1177                        TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1178                        TY_(FreeNode)( doc, node );
1179                        node = element->parent;
1180                        MemFree(node->element);
1181                        node->element = TY_(tmbstrdup)("th");
1182                        node->tag = TY_(LookupTagDef)( TidyTag_TH );
1183                        continue;
1184                    }
1185                }
1186
1187                if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit )
1188                    TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1189
1190                TY_(UngetToken)( doc );
1191
1192                if ( TY_(nodeHasCM)(node, CM_LIST) )
1193                {
1194                    if ( element->parent && element->parent->tag &&
1195                         element->parent->tag->parser == TY_(ParseList) )
1196                    {
1197                        TrimSpaces( doc, element );
1198                        return;
1199                    }
1200
1201                    node = TY_(InferredTag)(doc, TidyTag_UL);
1202                    AddClassNoIndent(doc, node);
1203                }
1204                else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1205                {
1206                    if ( nodeIsDL(element->parent) )
1207                    {
1208                        TrimSpaces( doc, element );
1209                        return;
1210                    }
1211
1212                    node = TY_(InferredTag)(doc, TidyTag_DL);
1213                }
1214                else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) )
1215                {
1216                    /* http://tidy.sf.net/issue/1316307 */
1217                    /* In exiled mode, return so table processing can
1218                       continue. */
1219                    if (lexer->exiled)
1220                        return;
1221                    node = TY_(InferredTag)(doc, TidyTag_TABLE);
1222                }
1223                else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1224                {
1225                    /* pop inline stack */
1226                    while ( lexer->istacksize > lexer->istackbase )
1227                        TY_(PopInline)( doc, NULL );
1228                    lexer->istackbase = istackbase;
1229                    TrimSpaces( doc, element );
1230                    return;
1231
1232                }
1233                else
1234                {
1235                    TrimSpaces( doc, element );
1236                    return;
1237                }
1238            }
1239        }
1240
1241        /* parse known element */
1242        if (TY_(nodeIsElement)(node))
1243        {
1244            if (node->tag->model & CM_INLINE)
1245            {
1246                if (checkstack && !node->implicit)
1247                {
1248                    checkstack = no;
1249
1250                    if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
1251                    {
1252                        if ( TY_(InlineDup)(doc, node) > 0 )
1253                            continue;
1254                    }
1255                }
1256
1257                mode = MixedContent;
1258            }
1259            else
1260            {
1261                checkstack = yes;
1262                mode = IgnoreWhitespace;
1263            }
1264
1265            /* trim white space before <br> */
1266            if ( nodeIsBR(node) )
1267                TrimSpaces( doc, element );
1268
1269            TY_(InsertNodeAtEnd)(element, node);
1270
1271            if (node->implicit)
1272                TY_(ReportError)(doc, element, node, INSERTING_TAG );
1273
1274            ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
1275            continue;
1276        }
1277
1278        /* discard unexpected tags */
1279        if (node->type == EndTag)
1280            TY_(PopInline)( doc, node );  /* if inline end tag */
1281
1282        TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1283        TY_(FreeNode)( doc, node );
1284        continue;
1285    }
1286
1287    if (!(element->tag->model & CM_OPT))
1288        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
1289
1290    if (element->tag->model & CM_OBJECT)
1291    {
1292        /* pop inline stack */
1293        while ( lexer->istacksize > lexer->istackbase )
1294            TY_(PopInline)( doc, NULL );
1295        lexer->istackbase = istackbase;
1296    }
1297
1298    TrimSpaces( doc, element );
1299}
1300
1301void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
1302{
1303    Lexer* lexer = doc->lexer;
1304    Node *node, *parent;
1305
1306    if (element->tag->model & CM_EMPTY)
1307        return;
1308
1309    /*
1310     ParseInline is used for some block level elements like H1 to H6
1311     For such elements we need to insert inline emphasis tags currently
1312     on the inline stack. For Inline elements, we normally push them
1313     onto the inline stack provided they aren't implicit or OBJECT/APPLET.
1314     This test is carried out in PushInline and PopInline, see istack.c
1315
1316     InlineDup(...) is not called for elements with a CM_MIXED (inline and
1317     block) content model, e.g. <del> or <ins>, otherwise constructs like
1318
1319       <p>111<a name='foo'>222<del>333</del>444</a>555</p>
1320       <p>111<span>222<del>333</del>444</span>555</p>
1321       <p>111<em>222<del>333</del>444</em>555</p>
1322
1323     will get corrupted.
1324    */
1325    if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) &&
1326        !TY_(nodeHasCM)(element, CM_MIXED))
1327        TY_(InlineDup)(doc, NULL);
1328    else if (TY_(nodeHasCM)(element, CM_INLINE))
1329        TY_(PushInline)(doc, element);
1330
1331    if ( nodeIsNOBR(element) )
1332        doc->badLayout |= USING_NOBR;
1333    else if ( nodeIsFONT(element) )
1334        doc->badLayout |= USING_FONT;
1335
1336    /* Inline elements may or may not be within a preformatted element */
1337    if (mode != Preformatted)
1338        mode = MixedContent;
1339
1340    while ((node = TY_(GetToken)(doc, mode)) != NULL)
1341    {
1342        /* end tag for current element */
1343        if (node->tag == element->tag && node->type == EndTag)
1344        {
1345            if (element->tag->model & CM_INLINE)
1346                TY_(PopInline)( doc, node );
1347
1348            TY_(FreeNode)( doc, node );
1349
1350            if (!(mode & Preformatted))
1351                TrimSpaces(doc, element);
1352
1353            /*
1354             if a font element wraps an anchor and nothing else
1355             then move the font element inside the anchor since
1356             otherwise it won't alter the anchor text color
1357            */
1358            if ( nodeIsFONT(element) &&
1359                 element->content && element->content == element->last )
1360            {
1361                Node *child = element->content;
1362
1363                if ( nodeIsA(child) )
1364                {
1365                    child->parent = element->parent;
1366                    child->next = element->next;
1367                    child->prev = element->prev;
1368
1369                    element->next = NULL;
1370                    element->prev = NULL;
1371                    element->parent = child;
1372
1373                    element->content = child->content;
1374                    element->last = child->last;
1375                    child->content = element;
1376
1377                    TY_(FixNodeLinks)(child);
1378                    TY_(FixNodeLinks)(element);
1379                }
1380            }
1381
1382            element->closed = yes;
1383            TrimSpaces( doc, element );
1384            return;
1385        }
1386
1387        /* <u>...<u>  map 2nd <u> to </u> if 1st is explicit */
1388        /* otherwise emphasis nesting is probably unintentional */
1389        /* big, small, sub, sup have cumulative effect to leave them alone */
1390        if ( node->type == StartTag
1391             && node->tag == element->tag
1392             && TY_(IsPushed)( doc, node )
1393             && !node->implicit
1394             && !element->implicit
1395             && node->tag && (node->tag->model & CM_INLINE)
1396             && !nodeIsA(node)
1397             && !nodeIsFONT(node)
1398             && !nodeIsBIG(node)
1399             && !nodeIsSMALL(node)
1400             && !nodeIsSUB(node)
1401             && !nodeIsSUP(node)
1402             && !nodeIsQ(node)
1403             && !nodeIsSPAN(node)
1404           )
1405        {
1406            if (element->content != NULL && node->attributes == NULL)
1407            {
1408                TY_(ReportWarning)(doc, element, node, COERCE_TO_ENDTAG_WARN);
1409                node->type = EndTag;
1410                TY_(UngetToken)(doc);
1411                continue;
1412            }
1413
1414            if (node->attributes == NULL || element->attributes == NULL)
1415                TY_(ReportWarning)(doc, element, node, NESTED_EMPHASIS);
1416        }
1417        else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
1418                  nodeIsQ(node) )
1419        {
1420            TY_(ReportWarning)(doc, element, node, NESTED_QUOTATION);
1421        }
1422
1423        if ( TY_(nodeIsText)(node) )
1424        {
1425            /* only called for 1st child */
1426            if ( element->content == NULL && !(mode & Preformatted) )
1427                TrimSpaces( doc, element );
1428
1429            if ( node->start >= node->end )
1430            {
1431                TY_(FreeNode)( doc, node );
1432                continue;
1433            }
1434
1435            TY_(InsertNodeAtEnd)(element, node);
1436            continue;
1437        }
1438
1439        /* mixed content model so allow text */
1440        if (InsertMisc(element, node))
1441            continue;
1442
1443        /* deal with HTML tags */
1444        if ( nodeIsHTML(node) )
1445        {
1446            if ( TY_(nodeIsElement)(node) )
1447            {
1448                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1449                TY_(FreeNode)( doc, node );
1450                continue;
1451            }
1452
1453            /* otherwise infer end of inline element */
1454            TY_(UngetToken)( doc );
1455
1456            if (!(mode & Preformatted))
1457                TrimSpaces(doc, element);
1458
1459            return;
1460        }
1461
1462        /* within <dt> or <pre> map <p> to <br> */
1463        if ( nodeIsP(node) &&
1464             node->type == StartTag &&
1465             ( (mode & Preformatted) ||
1466               nodeIsDT(element) ||
1467               DescendantOf(element, TidyTag_DT )
1468             )
1469           )
1470        {
1471            node->tag = TY_(LookupTagDef)( TidyTag_BR );
1472            MemFree(node->element);
1473            node->element = TY_(tmbstrdup)("br");
1474            TrimSpaces(doc, element);
1475            TY_(InsertNodeAtEnd)(element, node);
1476            continue;
1477        }
1478
1479        /* <p> allowed within <address> in HTML 4.01 Transitional */
1480        if ( nodeIsP(node) &&
1481             node->type == StartTag &&
1482             nodeIsADDRESS(element) )
1483        {
1484            TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
1485            TY_(InsertNodeAtEnd)(element, node);
1486            (*node->tag->parser)( doc, node, mode );
1487            continue;
1488        }
1489
1490        /* ignore unknown and PARAM tags */
1491        if ( node->tag == NULL || nodeIsPARAM(node) )
1492        {
1493            TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1494            TY_(FreeNode)( doc, node );
1495            continue;
1496        }
1497
1498        if ( nodeIsBR(node) && node->type == EndTag )
1499            node->type = StartTag;
1500
1501        if ( node->type == EndTag )
1502        {
1503           /* coerce </br> to <br> */
1504           if ( nodeIsBR(node) )
1505                node->type = StartTag;
1506           else if ( nodeIsP(node) )
1507           {
1508               /* coerce unmatched </p> to <br><br> */
1509                if ( !DescendantOf(element, TidyTag_P) )
1510                {
1511                    TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
1512                    TrimSpaces( doc, element );
1513                    TY_(InsertNodeAtEnd)( element, node );
1514                    node = TY_(InferredTag)(doc, TidyTag_BR);
1515                    TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
1516                    continue;
1517                }
1518           }
1519           else if ( TY_(nodeHasCM)(node, CM_INLINE)
1520                     && !nodeIsA(node)
1521                     && !TY_(nodeHasCM)(node, CM_OBJECT)
1522                     && TY_(nodeHasCM)(element, CM_INLINE) )
1523            {
1524                /* allow any inline end tag to end current element */
1525                TY_(PopInline)( doc, element );
1526
1527                if ( !nodeIsA(element) )
1528                {
1529                    if ( nodeIsA(node) && node->tag != element->tag )
1530                    {
1531                       TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1532                       TY_(UngetToken)( doc );
1533                    }
1534                    else
1535                    {
1536                        TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
1537                        TY_(FreeNode)( doc, node);
1538                    }
1539
1540                    if (!(mode & Preformatted))
1541                        TrimSpaces(doc, element);
1542
1543                    return;
1544                }
1545
1546                /* if parent is <a> then discard unexpected inline end tag */
1547                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1548                TY_(FreeNode)( doc, node);
1549                continue;
1550            }  /* special case </tr> etc. for stuff moved in front of table */
1551            else if ( lexer->exiled
1552                      && node->tag->model
1553                      && (node->tag->model & CM_TABLE) )
1554            {
1555                TY_(UngetToken)( doc );
1556                TrimSpaces(doc, element);
1557                return;
1558            }
1559        }
1560
1561        /* allow any header tag to end current header */
1562        if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
1563        {
1564
1565            if ( node->tag == element->tag )
1566            {
1567                TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG );
1568                TY_(FreeNode)( doc, node);
1569            }
1570            else
1571            {
1572                TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1573                TY_(UngetToken)( doc );
1574            }
1575
1576            if (!(mode & Preformatted))
1577                TrimSpaces(doc, element);
1578
1579            return;
1580        }
1581
1582        /*
1583           an <A> tag to ends any open <A> element
1584           but <A href=...> is mapped to </A><A href=...>
1585        */
1586        /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
1587        /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */
1588        if ( nodeIsA(node) && !node->implicit &&
1589             (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
1590        {
1591            /* coerce <a> to </a> unless it has some attributes */
1592            /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
1593            /* other fixes by Dave Raggett */
1594            /* if (node->attributes == NULL) */
1595            if (node->type != EndTag && node->attributes == NULL)
1596            {
1597                node->type = EndTag;
1598                TY_(ReportError)(doc, element, node, COERCE_TO_ENDTAG);
1599                /* TY_(PopInline)( doc, node ); */
1600                TY_(UngetToken)( doc );
1601                continue;
1602            }
1603
1604            TY_(UngetToken)( doc );
1605            TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
1606            /* TY_(PopInline)( doc, element ); */
1607
1608            if (!(mode & Preformatted))
1609                TrimSpaces(doc, element);
1610
1611            return;
1612        }
1613
1614        if (element->tag->model & CM_HEADING)
1615        {
1616            if ( nodeIsCENTER(node) || nodeIsDIV(node) )
1617            {
1618                if (!TY_(nodeIsElement)(node))
1619                {
1620                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1621                    TY_(FreeNode)( doc, node);
1622                    continue;
1623                }
1624
1625                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
1626
1627                /* insert center as parent if heading is empty */
1628                if (element->content == NULL)
1629                {
1630                    InsertNodeAsParent(element, node);
1631                    continue;
1632                }
1633
1634                /* split heading and make center parent of 2nd part */
1635                TY_(InsertNodeAfterElement)(element, node);
1636
1637                if (!(mode & Preformatted))
1638                    TrimSpaces(doc, element);
1639
1640                element = TY_(CloneNode)( doc, element );
1641                TY_(InsertNodeAtEnd)(node, element);
1642                continue;
1643            }
1644
1645            if ( nodeIsHR(node) )
1646            {
1647                if ( !TY_(nodeIsElement)(node) )
1648                {
1649                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1650                    TY_(FreeNode)( doc, node);
1651                    continue;
1652                }
1653
1654                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
1655
1656                /* insert hr before heading if heading is empty */
1657                if (element->content == NULL)
1658                {
1659                    TY_(InsertNodeBeforeElement)(element, node);
1660                    continue;
1661                }
1662
1663                /* split heading and insert hr before 2nd part */
1664                TY_(InsertNodeAfterElement)(element, node);
1665
1666                if (!(mode & Preformatted))
1667                    TrimSpaces(doc, element);
1668
1669                element = TY_(CloneNode)( doc, element );
1670                TY_(InsertNodeAfterElement)(node, element);
1671                continue;
1672            }
1673        }
1674
1675        if ( nodeIsDT(element) )
1676        {
1677            if ( nodeIsHR(node) )
1678            {
1679                Node *dd;
1680                if ( !TY_(nodeIsElement)(node) )
1681                {
1682                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1683                    TY_(FreeNode)( doc, node);
1684                    continue;
1685                }
1686
1687                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
1688                dd = TY_(InferredTag)(doc, TidyTag_DD);
1689
1690                /* insert hr within dd before dt if dt is empty */
1691                if (element->content == NULL)
1692                {
1693                    TY_(InsertNodeBeforeElement)(element, dd);
1694                    TY_(InsertNodeAtEnd)(dd, node);
1695                    continue;
1696                }
1697
1698                /* split dt and insert hr within dd before 2nd part */
1699                TY_(InsertNodeAfterElement)(element, dd);
1700                TY_(InsertNodeAtEnd)(dd, node);
1701
1702                if (!(mode & Preformatted))
1703                    TrimSpaces(doc, element);
1704
1705                element = TY_(CloneNode)( doc, element );
1706                TY_(InsertNodeAfterElement)(dd, element);
1707                continue;
1708            }
1709        }
1710
1711
1712        /*
1713          if this is the end tag for an ancestor element
1714          then infer end tag for this element
1715        */
1716        if (node->type == EndTag)
1717        {
1718            for (parent = element->parent;
1719                    parent != NULL; parent = parent->parent)
1720            {
1721                if (node->tag == parent->tag)
1722                {
1723                    if (!(element->tag->model & CM_OPT) && !element->implicit)
1724                        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
1725
1726                    if( TY_(IsPushedLast)( doc, element, node ) )
1727                        TY_(PopInline)( doc, element );
1728                    TY_(UngetToken)( doc );
1729
1730                    if (!(mode & Preformatted))
1731                        TrimSpaces(doc, element);
1732
1733                    return;
1734                }
1735            }
1736        }
1737
1738        /* block level tags end this element */
1739        if (!(node->tag->model & CM_INLINE) &&
1740            !(element->tag->model & CM_MIXED))
1741        {
1742            if ( !TY_(nodeIsElement)(node) )
1743            {
1744                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1745                TY_(FreeNode)( doc, node);
1746                continue;
1747            }
1748
1749            if (!(element->tag->model & CM_OPT))
1750                TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
1751
1752            if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
1753            {
1754                MoveToHead(doc, element, node);
1755                continue;
1756            }
1757
1758            /*
1759               prevent anchors from propagating into block tags
1760               except for headings h1 to h6
1761            */
1762            if ( nodeIsA(element) )
1763            {
1764                if (node->tag && !(node->tag->model & CM_HEADING))
1765                    TY_(PopInline)( doc, element );
1766                else if (!(element->content))
1767                {
1768                    TY_(DiscardElement)( doc, element );
1769                    TY_(UngetToken)( doc );
1770                    return;
1771                }
1772            }
1773
1774            TY_(UngetToken)( doc );
1775
1776            if (!(mode & Preformatted))
1777                TrimSpaces(doc, element);
1778
1779            return;
1780        }
1781
1782        /* parse inline element */
1783        if (TY_(nodeIsElement)(node))
1784        {
1785            if (node->implicit)
1786                TY_(ReportError)(doc, element, node, INSERTING_TAG);
1787
1788            /* trim white space before <br> */
1789            if ( nodeIsBR(node) )
1790                TrimSpaces(doc, element);
1791
1792            TY_(InsertNodeAtEnd)(element, node);
1793            ParseTag(doc, node, mode);
1794            continue;
1795        }
1796
1797        /* discard unexpected tags */
1798        TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1799        TY_(FreeNode)( doc, node );
1800        continue;
1801    }
1802
1803    if (!(element->tag->model & CM_OPT))
1804        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
1805
1806}
1807
1808void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode)
1809{
1810    Lexer* lexer = doc->lexer;
1811    if ( lexer->isvoyager )
1812    {
1813        Node *node = TY_(GetToken)( doc, mode);
1814        if ( node )
1815        {
1816            if ( !(node->type == EndTag && node->tag == element->tag) )
1817            {
1818                TY_(ReportError)(doc, element, node, ELEMENT_NOT_EMPTY);
1819                TY_(UngetToken)( doc );
1820            }
1821            else
1822            {
1823                TY_(FreeNode)( doc, node );
1824            }
1825        }
1826    }
1827}
1828
1829void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode)
1830{
1831    Lexer* lexer = doc->lexer;
1832    Node *node, *parent;
1833
1834    if (list->tag->model & CM_EMPTY)
1835        return;
1836
1837    lexer->insert = NULL;  /* defer implicit inline start tags */
1838
1839    while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
1840    {
1841        if (node->tag == list->tag && node->type == EndTag)
1842        {
1843            TY_(FreeNode)( doc, node);
1844            list->closed = yes;
1845            return;
1846        }
1847
1848        /* deal with comments etc. */
1849        if (InsertMisc(list, node))
1850            continue;
1851
1852        if (TY_(nodeIsText)(node))
1853        {
1854            TY_(UngetToken)( doc );
1855            node = TY_(InferredTag)(doc, TidyTag_DT);
1856            TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
1857        }
1858
1859        if (node->tag == NULL)
1860        {
1861            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1862            TY_(FreeNode)( doc, node);
1863            continue;
1864        }
1865
1866        /*
1867          if this is the end tag for an ancestor element
1868          then infer end tag for this element
1869        */
1870        if (node->type == EndTag)
1871        {
1872            Bool discardIt = no;
1873            if ( nodeIsFORM(node) )
1874            {
1875                BadForm( doc );
1876                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1877                TY_(FreeNode)( doc, node );
1878                continue;
1879            }
1880
1881            for (parent = list->parent;
1882                    parent != NULL; parent = parent->parent)
1883            {
1884               /* Do not match across BODY to avoid infinite loop
1885                  between ParseBody and this parser,
1886                  See http://tidy.sf.net/bug/1098012. */
1887                if (nodeIsBODY(parent))
1888                {
1889                    discardIt = yes;
1890                    break;
1891                }
1892                if (node->tag == parent->tag)
1893                {
1894                    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
1895
1896                    TY_(UngetToken)( doc );
1897                    return;
1898                }
1899            }
1900            if (discardIt)
1901            {
1902                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1903                TY_(FreeNode)( doc, node);
1904                continue;
1905            }
1906        }
1907
1908        /* center in a dt or a dl breaks the dl list in two */
1909        if ( nodeIsCENTER(node) )
1910        {
1911            if (list->content)
1912                TY_(InsertNodeAfterElement)(list, node);
1913            else /* trim empty dl list */
1914            {
1915                TY_(InsertNodeBeforeElement)(list, node);
1916
1917/* #540296 tidy dumps with empty definition list */
1918#if 0
1919                TY_(DiscardElement)(list);
1920#endif
1921            }
1922
1923            /* #426885 - fix by Glenn Carroll 19 Apr 00, and
1924                         Gary Dechaines 11 Aug 00 */
1925            /* ParseTag can destroy node, if it finds that
1926             * this <center> is followed immediately by </center>.
1927             * It's awkward but necessary to determine if this
1928             * has happened.
1929             */
1930            parent = node->parent;
1931
1932            /* and parse contents of center */
1933            lexer->excludeBlocks = no;
1934            ParseTag( doc, node, mode);
1935            lexer->excludeBlocks = yes;
1936
1937            /* now create a new dl element,
1938             * unless node has been blown away because the
1939             * center was empty, as above.
1940             */
1941            if (parent->last == node)
1942            {
1943                list = TY_(InferredTag)(doc, TidyTag_DL);
1944                TY_(InsertNodeAfterElement)(node, list);
1945            }
1946            continue;
1947        }
1948
1949        if ( !(nodeIsDT(node) || nodeIsDD(node)) )
1950        {
1951            TY_(UngetToken)( doc );
1952
1953            if (!(node->tag->model & (CM_BLOCK | CM_INLINE)))
1954            {
1955                TY_(ReportError)(doc, list, node, TAG_NOT_ALLOWED_IN);
1956                return;
1957            }
1958
1959            /* if DD appeared directly in BODY then exclude blocks */
1960            if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks)
1961                return;
1962
1963            node = TY_(InferredTag)(doc, TidyTag_DD);
1964            TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
1965        }
1966
1967        if (node->type == EndTag)
1968        {
1969            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1970            TY_(FreeNode)( doc, node);
1971            continue;
1972        }
1973
1974        /* node should be <DT> or <DD>*/
1975        TY_(InsertNodeAtEnd)(list, node);
1976        ParseTag( doc, node, IgnoreWhitespace);
1977    }
1978
1979    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
1980}
1981
1982void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode))
1983{
1984    Lexer* lexer = doc->lexer;
1985    Node *node, *parent;
1986
1987    if (list->tag->model & CM_EMPTY)
1988        return;
1989
1990    lexer->insert = NULL;  /* defer implicit inline start tags */
1991
1992    while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
1993    {
1994        if (node->tag == list->tag && node->type == EndTag)
1995        {
1996            TY_(FreeNode)( doc, node);
1997            list->closed = yes;
1998            return;
1999        }
2000
2001        /* deal with comments etc. */
2002        if (InsertMisc(list, node))
2003            continue;
2004
2005        if (node->type != TextNode && node->tag == NULL)
2006        {
2007            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2008            TY_(FreeNode)( doc, node);
2009            continue;
2010        }
2011
2012        /*
2013          if this is the end tag for an ancestor element
2014          then infer end tag for this element
2015        */
2016        if (node->type == EndTag)
2017        {
2018            if ( nodeIsFORM(node) )
2019            {
2020                BadForm( doc );
2021                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2022                TY_(FreeNode)( doc, node );
2023                continue;
2024            }
2025
2026            if (node->tag && node->tag->model & CM_INLINE)
2027            {
2028                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2029                TY_(PopInline)( doc, node );
2030                TY_(FreeNode)( doc, node);
2031                continue;
2032            }
2033
2034            for ( parent = list->parent;
2035                  parent != NULL; parent = parent->parent )
2036            {
2037               /* Do not match across BODY to avoid infinite loop
2038                  between ParseBody and this parser,
2039                  See http://tidy.sf.net/bug/1053626. */
2040                if (nodeIsBODY(parent))
2041                    break;
2042                if (node->tag == parent->tag)
2043                {
2044                    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
2045                    TY_(UngetToken)( doc );
2046                    return;
2047                }
2048            }
2049
2050            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2051            TY_(FreeNode)( doc, node);
2052            continue;
2053        }
2054
2055        if ( !nodeIsLI(node) )
2056        {
2057            TY_(UngetToken)( doc );
2058
2059            if (node->tag && (node->tag->model & CM_BLOCK) && lexer->excludeBlocks)
2060            {
2061                TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
2062                return;
2063            }
2064            /* http://tidy.sf.net/issue/1316307 */
2065            /* In exiled mode, return so table processing can continue. */
2066            else if ( lexer->exiled && node->tag
2067                      && TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW) )
2068                return;
2069
2070            node = TY_(InferredTag)(doc, TidyTag_LI);
2071            TY_(AddStyleProperty)( doc, node, "list-style: none" );
2072            TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
2073        }
2074
2075        /* node should be <LI> */
2076        TY_(InsertNodeAtEnd)(list,node);
2077        ParseTag( doc, node, IgnoreWhitespace);
2078    }
2079
2080    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
2081}
2082
2083/*
2084 unexpected content in table row is moved to just before
2085 the table in accordance with Netscape and IE. This code
2086 assumes that node hasn't been inserted into the row.
2087*/
2088static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
2089                             Node *node )
2090{
2091    Node *table;
2092
2093    /* first find the table element */
2094    for (table = row->parent; table; table = table->parent)
2095    {
2096        if ( nodeIsTABLE(table) )
2097        {
2098            TY_(InsertNodeBeforeElement)( table, node );
2099            return;
2100        }
2101    }
2102    /* No table element */
2103    TY_(InsertNodeBeforeElement)( row->parent, node );
2104}
2105
2106/*
2107 if a table row is empty then insert an empty cell
2108 this practice is consistent with browser behavior
2109 and avoids potential problems with row spanning cells
2110*/
2111static void FixEmptyRow(TidyDocImpl* doc, Node *row)
2112{
2113    Node *cell;
2114
2115    if (row->content == NULL)
2116    {
2117        cell = TY_(InferredTag)(doc, TidyTag_TD);
2118        TY_(InsertNodeAtEnd)(row, cell);
2119        TY_(ReportError)(doc, row, cell, MISSING_STARTTAG);
2120    }
2121}
2122
2123void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode))
2124{
2125    Lexer* lexer = doc->lexer;
2126    Node *node;
2127    Bool exclude_state;
2128
2129    if (row->tag->model & CM_EMPTY)
2130        return;
2131
2132    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2133    {
2134        if (node->tag == row->tag)
2135        {
2136            if (node->type == EndTag)
2137            {
2138                TY_(FreeNode)( doc, node);
2139                row->closed = yes;
2140                FixEmptyRow( doc, row);
2141                return;
2142            }
2143
2144            /* New row start implies end of current row */
2145            TY_(UngetToken)( doc );
2146            FixEmptyRow( doc, row);
2147            return;
2148        }
2149
2150        /*
2151          if this is the end tag for an ancestor element
2152          then infer end tag for this element
2153        */
2154        if ( node->type == EndTag )
2155        {
2156            if ( (TY_(nodeHasCM)(node, CM_HTML|CM_TABLE) || nodeIsTABLE(node))
2157                 && DescendantOf(row, TagId(node)) )
2158            {
2159                TY_(UngetToken)( doc );
2160                return;
2161            }
2162
2163            if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2164            {
2165                if ( nodeIsFORM(node) )
2166                    BadForm( doc );
2167
2168                TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2169                TY_(FreeNode)( doc, node);
2170                continue;
2171            }
2172
2173            if ( nodeIsTD(node) || nodeIsTH(node) )
2174            {
2175                TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2176                TY_(FreeNode)( doc, node);
2177                continue;
2178            }
2179        }
2180
2181        /* deal with comments etc. */
2182        if (InsertMisc(row, node))
2183            continue;
2184
2185        /* discard unknown tags */
2186        if (node->tag == NULL && node->type != TextNode)
2187        {
2188            TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2189            TY_(FreeNode)( doc, node);
2190            continue;
2191        }
2192
2193        /* discard unexpected <table> element */
2194        if ( nodeIsTABLE(node) )
2195        {
2196            TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2197            TY_(FreeNode)( doc, node);
2198            continue;
2199        }
2200
2201        /* THEAD, TFOOT or TBODY */
2202        if ( TY_(nodeHasCM)(node, CM_ROWGRP) )
2203        {
2204            TY_(UngetToken)( doc );
2205            return;
2206        }
2207
2208        if (node->type == EndTag)
2209        {
2210            TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2211            TY_(FreeNode)( doc, node);
2212            continue;
2213        }
2214
2215        /*
2216          if text or inline or block move before table
2217          if head content move to head
2218        */
2219
2220        if (node->type != EndTag)
2221        {
2222            if ( nodeIsFORM(node) )
2223            {
2224                TY_(UngetToken)( doc );
2225                node = TY_(InferredTag)(doc, TidyTag_TD);
2226                TY_(ReportError)(doc, row, node, MISSING_STARTTAG);
2227            }
2228            else if ( TY_(nodeIsText)(node)
2229                      || TY_(nodeHasCM)(node, CM_BLOCK | CM_INLINE) )
2230            {
2231                MoveBeforeTable( doc, row, node );
2232                TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2233                lexer->exiled = yes;
2234                exclude_state = lexer->excludeBlocks;
2235                lexer->excludeBlocks = no;
2236
2237                if (node->type != TextNode)
2238                    ParseTag( doc, node, IgnoreWhitespace);
2239
2240                lexer->exiled = no;
2241                lexer->excludeBlocks = exclude_state;
2242                continue;
2243            }
2244            else if (node->tag->model & CM_HEAD)
2245            {
2246                TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2247                MoveToHead( doc, row, node);
2248                continue;
2249            }
2250        }
2251
2252        if ( !(nodeIsTD(node) || nodeIsTH(node)) )
2253        {
2254            TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2255            TY_(FreeNode)( doc, node);
2256            continue;
2257        }
2258
2259        /* node should be <TD> or <TH> */
2260        TY_(InsertNodeAtEnd)(row, node);
2261        exclude_state = lexer->excludeBlocks;
2262        lexer->excludeBlocks = no;
2263        ParseTag( doc, node, IgnoreWhitespace);
2264        lexer->excludeBlocks = exclude_state;
2265
2266        /* pop inline stack */
2267
2268        while ( lexer->istacksize > lexer->istackbase )
2269            TY_(PopInline)( doc, NULL );
2270    }
2271
2272}
2273
2274void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode))
2275{
2276    Lexer* lexer = doc->lexer;
2277    Node *node, *parent;
2278
2279    if (rowgroup->tag->model & CM_EMPTY)
2280        return;
2281
2282    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2283    {
2284        if (node->tag == rowgroup->tag)
2285        {
2286            if (node->type == EndTag)
2287            {
2288                rowgroup->closed = yes;
2289                TY_(FreeNode)( doc, node);
2290                return;
2291            }
2292
2293            TY_(UngetToken)( doc );
2294            return;
2295        }
2296
2297        /* if </table> infer end tag */
2298        if ( nodeIsTABLE(node) && node->type == EndTag )
2299        {
2300            TY_(UngetToken)( doc );
2301            return;
2302        }
2303
2304        /* deal with comments etc. */
2305        if (InsertMisc(rowgroup, node))
2306            continue;
2307
2308        /* discard unknown tags */
2309        if (node->tag == NULL && node->type != TextNode)
2310        {
2311            TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2312            TY_(FreeNode)( doc, node);
2313            continue;
2314        }
2315
2316        /*
2317          if TD or TH then infer <TR>
2318          if text or inline or block move before table
2319          if head content move to head
2320        */
2321
2322        if (node->type != EndTag)
2323        {
2324            if ( nodeIsTD(node) || nodeIsTH(node) )
2325            {
2326                TY_(UngetToken)( doc );
2327                node = TY_(InferredTag)(doc, TidyTag_TR);
2328                TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
2329            }
2330            else if ( TY_(nodeIsText)(node)
2331                      || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2332            {
2333                MoveBeforeTable( doc, rowgroup, node );
2334                TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
2335                lexer->exiled = yes;
2336
2337                if (node->type != TextNode)
2338                    ParseTag(doc, node, IgnoreWhitespace);
2339
2340                lexer->exiled = no;
2341                continue;
2342            }
2343            else if (node->tag->model & CM_HEAD)
2344            {
2345                TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
2346                MoveToHead(doc, rowgroup, node);
2347                continue;
2348            }
2349        }
2350
2351        /*
2352          if this is the end tag for ancestor element
2353          then infer end tag for this element
2354        */
2355        if (node->type == EndTag)
2356        {
2357            if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2358            {
2359                if ( nodeIsFORM(node) )
2360                    BadForm( doc );
2361
2362                TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2363                TY_(FreeNode)( doc, node);
2364                continue;
2365            }
2366
2367            if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) )
2368            {
2369                TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2370                TY_(FreeNode)( doc, node);
2371                continue;
2372            }
2373
2374            for ( parent = rowgroup->parent;
2375                  parent != NULL;
2376                  parent = parent->parent )
2377            {
2378                if (node->tag == parent->tag)
2379                {
2380                    TY_(UngetToken)( doc );
2381                    return;
2382                }
2383            }
2384        }
2385
2386        /*
2387          if THEAD, TFOOT or TBODY then implied end tag
2388
2389        */
2390        if (node->tag->model & CM_ROWGRP)
2391        {
2392            if (node->type != EndTag)
2393            {
2394                TY_(UngetToken)( doc );
2395                return;
2396            }
2397        }
2398
2399        if (node->type == EndTag)
2400        {
2401            TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2402            TY_(FreeNode)( doc, node);
2403            continue;
2404        }
2405
2406        if ( !nodeIsTR(node) )
2407        {
2408            node = TY_(InferredTag)(doc, TidyTag_TR);
2409            TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
2410            TY_(UngetToken)( doc );
2411        }
2412
2413       /* node should be <TR> */
2414        TY_(InsertNodeAtEnd)(rowgroup, node);
2415        ParseTag(doc, node, IgnoreWhitespace);
2416    }
2417
2418}
2419
2420void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode))
2421{
2422    Node *node, *parent;
2423
2424    if (colgroup->tag->model & CM_EMPTY)
2425        return;
2426
2427    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2428    {
2429        if (node->tag == colgroup->tag && node->type == EndTag)
2430        {
2431            TY_(FreeNode)( doc, node);
2432            colgroup->closed = yes;
2433            return;
2434        }
2435
2436        /*
2437          if this is the end tag for an ancestor element
2438          then infer end tag for this element
2439        */
2440        if (node->type == EndTag)
2441        {
2442            if ( nodeIsFORM(node) )
2443            {
2444                BadForm( doc );
2445                TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2446                TY_(FreeNode)( doc, node);
2447                continue;
2448            }
2449
2450            for ( parent = colgroup->parent;
2451                  parent != NULL;
2452                  parent = parent->parent )
2453            {
2454                if (node->tag == parent->tag)
2455                {
2456                    TY_(UngetToken)( doc );
2457                    return;
2458                }
2459            }
2460        }
2461
2462        if (TY_(nodeIsText)(node))
2463        {
2464            TY_(UngetToken)( doc );
2465            return;
2466        }
2467
2468        /* deal with comments etc. */
2469        if (InsertMisc(colgroup, node))
2470            continue;
2471
2472        /* discard unknown tags */
2473        if (node->tag == NULL)
2474        {
2475            TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2476            TY_(FreeNode)( doc, node);
2477            continue;
2478        }
2479
2480        if ( !nodeIsCOL(node) )
2481        {
2482            TY_(UngetToken)( doc );
2483            return;
2484        }
2485
2486        if (node->type == EndTag)
2487        {
2488            TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2489            TY_(FreeNode)( doc, node);
2490            continue;
2491        }
2492
2493        /* node should be <COL> */
2494        TY_(InsertNodeAtEnd)(colgroup, node);
2495        ParseTag(doc, node, IgnoreWhitespace);
2496    }
2497}
2498
2499void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode))
2500{
2501    Lexer* lexer = doc->lexer;
2502    Node *node, *parent;
2503    uint istackbase;
2504
2505    TY_(DeferDup)( doc );
2506    istackbase = lexer->istackbase;
2507    lexer->istackbase = lexer->istacksize;
2508
2509    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2510    {
2511        if (node->tag == table->tag && node->type == EndTag)
2512        {
2513            TY_(FreeNode)( doc, node);
2514            lexer->istackbase = istackbase;
2515            table->closed = yes;
2516            return;
2517        }
2518
2519        /* deal with comments etc. */
2520        if (InsertMisc(table, node))
2521            continue;
2522
2523        /* discard unknown tags */
2524        if (node->tag == NULL && node->type != TextNode)
2525        {
2526            TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2527            TY_(FreeNode)( doc, node);
2528            continue;
2529        }
2530
2531        /* if TD or TH or text or inline or block then infer <TR> */
2532
2533        if (node->type != EndTag)
2534        {
2535            if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) )
2536            {
2537                TY_(UngetToken)( doc );
2538                node = TY_(InferredTag)(doc, TidyTag_TR);
2539                TY_(ReportError)(doc, table, node, MISSING_STARTTAG);
2540            }
2541            else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) )
2542            {
2543                TY_(InsertNodeBeforeElement)(table, node);
2544                TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
2545                lexer->exiled = yes;
2546
2547                if (node->type != TextNode)
2548                    ParseTag(doc, node, IgnoreWhitespace);
2549
2550                lexer->exiled = no;
2551                continue;
2552            }
2553            else if (node->tag->model & CM_HEAD)
2554            {
2555                MoveToHead(doc, table, node);
2556                continue;
2557            }
2558        }
2559
2560        /*
2561          if this is the end tag for an ancestor element
2562          then infer end tag for this element
2563        */
2564        if (node->type == EndTag)
2565        {
2566            if ( nodeIsFORM(node) )
2567            {
2568                BadForm( doc );
2569                TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2570                TY_(FreeNode)( doc, node);
2571                continue;
2572            }
2573
2574            /* best to discard unexpected block/inline end tags */
2575            if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) ||
2576                 TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2577            {
2578                TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2579                TY_(FreeNode)( doc, node);
2580                continue;
2581            }
2582
2583            for ( parent = table->parent;
2584                  parent != NULL;
2585                  parent = parent->parent )
2586            {
2587                if (node->tag == parent->tag)
2588                {
2589                    TY_(ReportError)(doc, table, node, MISSING_ENDTAG_BEFORE );
2590                    TY_(UngetToken)( doc );
2591                    lexer->istackbase = istackbase;
2592                    return;
2593                }
2594            }
2595        }
2596
2597        if (!(node->tag->model & CM_TABLE))
2598        {
2599            TY_(UngetToken)( doc );
2600            TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
2601            lexer->istackbase = istackbase;
2602            return;
2603        }
2604
2605        if (TY_(nodeIsElement)(node))
2606        {
2607            TY_(InsertNodeAtEnd)(table, node);
2608            ParseTag(doc, node, IgnoreWhitespace);
2609            continue;
2610        }
2611
2612        /* discard unexpected text nodes and end tags */
2613        TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2614        TY_(FreeNode)( doc, node);
2615    }
2616
2617    TY_(ReportError)(doc, table, node, MISSING_ENDTAG_FOR);
2618    lexer->istackbase = istackbase;
2619}
2620
2621/* acceptable content for pre elements */
2622static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
2623{
2624    /* p is coerced to br's, Text OK too */
2625    if ( nodeIsP(node) || TY_(nodeIsText)(node) )
2626        return yes;
2627
2628    if ( node->tag == NULL ||
2629         nodeIsPARAM(node) ||
2630         !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) )
2631        return no;
2632
2633    return yes;
2634}
2635
2636void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
2637{
2638    Node *node;
2639
2640    if (pre->tag->model & CM_EMPTY)
2641        return;
2642
2643    TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */
2644
2645    while ((node = TY_(GetToken)(doc, Preformatted)) != NULL)
2646    {
2647        if ( node->type == EndTag &&
2648             (node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
2649        {
2650            if (nodeIsBODY(node) || nodeIsHTML(node))
2651            {
2652                TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2653                TY_(FreeNode)(doc, node);
2654                continue;
2655            }
2656            if (node->tag == pre->tag)
2657            {
2658                TY_(FreeNode)(doc, node);
2659            }
2660            else
2661            {
2662                TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE );
2663                TY_(UngetToken)( doc );
2664            }
2665            pre->closed = yes;
2666            TrimSpaces(doc, pre);
2667            return;
2668        }
2669
2670        if (TY_(nodeIsText)(node))
2671        {
2672            TY_(InsertNodeAtEnd)(pre, node);
2673            continue;
2674        }
2675
2676        /* deal with comments etc. */
2677        if (InsertMisc(pre, node))
2678            continue;
2679
2680        if (node->tag == NULL)
2681        {
2682            TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2683            TY_(FreeNode)(doc, node);
2684            continue;
2685        }
2686
2687        /* strip unexpected tags */
2688        if ( !PreContent(doc, node) )
2689        {
2690            Node *newnode;
2691
2692            /* fix for http://tidy.sf.net/bug/772205 */
2693            if (node->type == EndTag)
2694            {
2695               TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2696               TY_(FreeNode)(doc, node);
2697               continue;
2698            }
2699            /*
2700              This is basically what Tidy 04 August 2000 did and far more accurate
2701              with respect to browser behaivour than the code commented out above.
2702              Tidy could try to propagate the <pre> into each disallowed child where
2703              <pre> is allowed in order to replicate some browsers behaivour, but
2704              there are a lot of exceptions, e.g. Internet Explorer does not propagate
2705              <pre> into table cells while Mozilla does. Opera 6 never propagates
2706              <pre> into blocklevel elements while Opera 7 behaves much like Mozilla.
2707
2708              Tidy behaves thus mostly like Opera 6 except for nested <pre> elements
2709              which are handled like Mozilla takes them (Opera6 closes all <pre> after
2710              the first </pre>).
2711
2712              There are similar issues like replacing <p> in <pre> with <br>, for
2713              example
2714
2715                <pre>...<p>...</pre>                 (Input)
2716                <pre>...<br>...</pre>                (Tidy)
2717                <pre>...<br>...</pre>                (Opera 7 and Internet Explorer)
2718                <pre>...<br><br>...</pre>            (Opera 6 and Mozilla)
2719
2720                <pre>...<p>...</p>...</pre>          (Input)
2721                <pre>...<br>......</pre>             (Tidy, BUG!)
2722                <pre>...<br>...<br>...</pre>         (Internet Explorer)
2723                <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
2724                <pre>...<br>...<br><br>...</pre>     (Opera 7)
2725
2726              or something similar, they could also be closing the <pre> and propagate
2727              the <pre> into the newly opened <p>.
2728
2729              Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
2730              dissallowed in <pre>, Tidy neither detects this nor does it perform any
2731              cleanup operation. Tidy should at least issue a warning if it encounters
2732              such constructs.
2733
2734              Todo: discarding </p> is abviously a bug, it should be replaced by <br>.
2735            */
2736            TY_(InsertNodeAfterElement)(pre, node);
2737            TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE);
2738            ParseTag(doc, node, IgnoreWhitespace);
2739
2740            newnode = TY_(InferredTag)(doc, TidyTag_PRE);
2741            TY_(ReportError)(doc, pre, newnode, INSERTING_TAG);
2742            pre = newnode;
2743            TY_(InsertNodeAfterElement)(node, pre);
2744
2745            continue;
2746        }
2747
2748        if ( nodeIsP(node) )
2749        {
2750            if (node->type == StartTag)
2751            {
2752                TY_(ReportError)(doc, pre, node, USING_BR_INPLACE_OF);
2753
2754                /* trim white space before <p> in <pre>*/
2755                TrimSpaces(doc, pre);
2756
2757                /* coerce both <p> and </p> to <br> */
2758                TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
2759                TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
2760                TY_(InsertNodeAtEnd)( pre, node );
2761            }
2762            else
2763            {
2764                TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2765                TY_(FreeNode)( doc, node);
2766            }
2767            continue;
2768        }
2769
2770        if ( TY_(nodeIsElement)(node) )
2771        {
2772            /* trim white space before <br> */
2773            if ( nodeIsBR(node) )
2774                TrimSpaces(doc, pre);
2775
2776            TY_(InsertNodeAtEnd)(pre, node);
2777            ParseTag(doc, node, Preformatted);
2778            continue;
2779        }
2780
2781        /* discard unexpected tags */
2782        TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2783        TY_(FreeNode)( doc, node);
2784    }
2785
2786    TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_FOR);
2787}
2788
2789void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
2790{
2791    Lexer* lexer = doc->lexer;
2792    Node *node;
2793
2794    lexer->insert = NULL;  /* defer implicit inline start tags */
2795
2796    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2797    {
2798        if (node->tag == field->tag && node->type == EndTag)
2799        {
2800            TY_(FreeNode)( doc, node);
2801            field->closed = yes;
2802            TrimSpaces(doc, field);
2803            return;
2804        }
2805
2806        /* deal with comments etc. */
2807        if (InsertMisc(field, node))
2808            continue;
2809
2810        if ( node->type == StartTag &&
2811             (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) )
2812        {
2813            if ( nodeIsOPTGROUP(node) )
2814                TY_(ReportError)(doc, field, node, CANT_BE_NESTED);
2815
2816            TY_(InsertNodeAtEnd)(field, node);
2817            ParseTag(doc, node, MixedContent);
2818            continue;
2819        }
2820
2821        /* discard unexpected tags */
2822        TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED );
2823        TY_(FreeNode)( doc, node);
2824    }
2825}
2826
2827
2828void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
2829{
2830    Lexer* lexer = doc->lexer;
2831    Node *node;
2832
2833    lexer->insert = NULL;  /* defer implicit inline start tags */
2834
2835    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2836    {
2837        if (node->tag == field->tag && node->type == EndTag)
2838        {
2839            TY_(FreeNode)( doc, node);
2840            field->closed = yes;
2841            TrimSpaces(doc, field);
2842            return;
2843        }
2844
2845        /* deal with comments etc. */
2846        if (InsertMisc(field, node))
2847            continue;
2848
2849        if ( node->type == StartTag &&
2850             ( nodeIsOPTION(node)   ||
2851               nodeIsOPTGROUP(node) ||
2852               nodeIsSCRIPT(node))
2853           )
2854        {
2855            TY_(InsertNodeAtEnd)(field, node);
2856            ParseTag(doc, node, IgnoreWhitespace);
2857            continue;
2858        }
2859
2860        /* discard unexpected tags */
2861        TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED);
2862        TY_(FreeNode)( doc, node);
2863    }
2864
2865    TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
2866}
2867
2868void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode)
2869{
2870    Lexer* lexer = doc->lexer;
2871    Node *node;
2872
2873    lexer->insert = NULL;  /* defer implicit inline start tags */
2874
2875    if ( nodeIsTEXTAREA(field) )
2876        mode = Preformatted;
2877    else
2878        mode = MixedContent;  /* kludge for font tags */
2879
2880    while ((node = TY_(GetToken)(doc, mode)) != NULL)
2881    {
2882        if (node->tag == field->tag && node->type == EndTag)
2883        {
2884            TY_(FreeNode)( doc, node);
2885            field->closed = yes;
2886            TrimSpaces(doc, field);
2887            return;
2888        }
2889
2890        /* deal with comments etc. */
2891        if (InsertMisc(field, node))
2892            continue;
2893
2894        if (TY_(nodeIsText)(node))
2895        {
2896            /* only called for 1st child */
2897            if (field->content == NULL && !(mode & Preformatted))
2898                TrimSpaces(doc, field);
2899
2900            if (node->start >= node->end)
2901            {
2902                TY_(FreeNode)( doc, node);
2903                continue;
2904            }
2905
2906            TY_(InsertNodeAtEnd)(field, node);
2907            continue;
2908        }
2909
2910        /* for textarea should all cases of < and & be escaped? */
2911
2912        /* discard inline tags e.g. font */
2913        if (   node->tag
2914            && node->tag->model & CM_INLINE
2915            && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */
2916        {
2917            TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED);
2918            TY_(FreeNode)( doc, node);
2919            continue;
2920        }
2921
2922        /* terminate element on other tags */
2923        if (!(field->tag->model & CM_OPT))
2924            TY_(ReportError)(doc, field, node, MISSING_ENDTAG_BEFORE);
2925
2926        TY_(UngetToken)( doc );
2927        TrimSpaces(doc, field);
2928        return;
2929    }
2930
2931    if (!(field->tag->model & CM_OPT))
2932        TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
2933}
2934
2935
2936void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode))
2937{
2938    Node *node;
2939    while ((node = TY_(GetToken)(doc, MixedContent)) != NULL)
2940    {
2941        if (node->tag == title->tag && node->type == StartTag)
2942        {
2943            TY_(ReportError)(doc, title, node, COERCE_TO_ENDTAG);
2944            node->type = EndTag;
2945            TY_(UngetToken)( doc );
2946            continue;
2947        }
2948        else if (node->tag == title->tag && node->type == EndTag)
2949        {
2950            TY_(FreeNode)( doc, node);
2951            title->closed = yes;
2952            TrimSpaces(doc, title);
2953            return;
2954        }
2955
2956        if (TY_(nodeIsText)(node))
2957        {
2958            /* only called for 1st child */
2959            if (title->content == NULL)
2960                TrimInitialSpace(doc, title, node);
2961
2962            if (node->start >= node->end)
2963            {
2964                TY_(FreeNode)( doc, node);
2965                continue;
2966            }
2967
2968            TY_(InsertNodeAtEnd)(title, node);
2969            continue;
2970        }
2971
2972        /* deal with comments etc. */
2973        if (InsertMisc(title, node))
2974            continue;
2975
2976        /* discard unknown tags */
2977        if (node->tag == NULL)
2978        {
2979            TY_(ReportError)(doc, title, node, DISCARDING_UNEXPECTED);
2980            TY_(FreeNode)( doc, node);
2981            continue;
2982        }
2983
2984        /* pushback unexpected tokens */
2985        TY_(ReportError)(doc, title, node, MISSING_ENDTAG_BEFORE);
2986        TY_(UngetToken)( doc );
2987        TrimSpaces(doc, title);
2988        return;
2989    }
2990
2991    TY_(ReportError)(doc, title, node, MISSING_ENDTAG_FOR);
2992}
2993
2994/*
2995  This isn't quite right for CDATA content as it recognises
2996  tags within the content and parses them accordingly.
2997  This will unfortunately screw up scripts which include
2998  < + letter,  < + !, < + ?  or  < + / + letter
2999*/
3000
3001void TY_(ParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode))
3002{
3003    Node *node;
3004
3005    doc->lexer->parent = script;
3006    node = TY_(GetToken)(doc, CdataContent);
3007    doc->lexer->parent = NULL;
3008
3009    if (node)
3010    {
3011        TY_(InsertNodeAtEnd)(script, node);
3012    }
3013    else
3014    {
3015        /* handle e.g. a document like "<script>" */
3016        TY_(ReportError)(doc, script, NULL, MISSING_ENDTAG_FOR);
3017        return;
3018    }
3019
3020    node = TY_(GetToken)(doc, IgnoreWhitespace);
3021
3022    if (!(node && node->type == EndTag && node->tag &&
3023        node->tag->id == script->tag->id))
3024    {
3025        TY_(ReportError)(doc, script, node, MISSING_ENDTAG_FOR);
3026
3027        if (node)
3028            TY_(UngetToken)(doc);
3029    }
3030    else
3031    {
3032        TY_(FreeNode)(doc, node);
3033    }
3034}
3035
3036Bool TY_(IsJavaScript)(Node *node)
3037{
3038    Bool result = no;
3039    AttVal *attr;
3040
3041    if (node->attributes == NULL)
3042        return yes;
3043
3044    for (attr = node->attributes; attr; attr = attr->next)
3045    {
3046        if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr))
3047             && AttrContains(attr, "javascript") )
3048        {
3049            result = yes;
3050            break;
3051        }
3052    }
3053
3054    return result;
3055}
3056
3057void TY_(ParseHead)(TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode))
3058{
3059    Lexer* lexer = doc->lexer;
3060    Node *node;
3061    int HasTitle = 0;
3062    int HasBase = 0;
3063
3064    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
3065    {
3066        if (node->tag == head->tag && node->type == EndTag)
3067        {
3068            TY_(FreeNode)( doc, node);
3069            head->closed = yes;
3070            break;
3071        }
3072
3073        /* find and discard multiple <head> elements */
3074        /* find and discard <html> in <head> elements */
3075        if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag)
3076        {
3077            TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3078            TY_(FreeNode)(doc, node);
3079            continue;
3080        }
3081
3082        if (TY_(nodeIsText)(node))
3083        {
3084            TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3085            TY_(UngetToken)( doc );
3086            break;
3087        }
3088
3089        if (node->type == ProcInsTag && node->element &&
3090            TY_(tmbstrcmp)(node->element, "xml-stylesheet") == 0)
3091        {
3092            TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3093            TY_(InsertNodeBeforeElement)(TY_(FindHTML)(doc), node);
3094            continue;
3095        }
3096
3097        /* deal with comments etc. */
3098        if (InsertMisc(head, node))
3099            continue;
3100
3101        if (node->type == DocTypeTag)
3102        {
3103            InsertDocType(doc, head, node);
3104            continue;
3105        }
3106
3107        /* discard unknown tags */
3108        if (node->tag == NULL)
3109        {
3110            TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3111            TY_(FreeNode)( doc, node);
3112            continue;
3113        }
3114
3115        /*
3116         if it doesn't belong in the head then
3117         treat as implicit end of head and deal
3118         with as part of the body
3119        */
3120        if (!(node->tag->model & CM_HEAD))
3121        {
3122            /* #545067 Implicit closing of head broken - warn only for XHTML input */
3123            if ( lexer->isvoyager )
3124                TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN );
3125            TY_(UngetToken)( doc );
3126            break;
3127        }
3128
3129        if (TY_(nodeIsElement)(node))
3130        {
3131            if ( nodeIsTITLE(node) )
3132            {
3133                ++HasTitle;
3134
3135                if (HasTitle > 1)
3136                    if (head)
3137                        TY_(ReportError)(doc, head, node, TOO_MANY_ELEMENTS_IN);
3138                    else
3139                        TY_(ReportError)(doc, head, node, TOO_MANY_ELEMENTS);
3140            }
3141            else if ( nodeIsBASE(node) )
3142            {
3143                ++HasBase;
3144
3145                if (HasBase > 1)
3146                    if (head)
3147                        TY_(ReportError)(doc, head, node, TOO_MANY_ELEMENTS_IN);
3148                    else
3149                        TY_(ReportError)(doc, head, node, TOO_MANY_ELEMENTS);
3150            }
3151            else if ( nodeIsNOSCRIPT(node) )
3152            {
3153                TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3154            }
3155
3156#ifdef AUTO_INPUT_ENCODING
3157            else if (nodeIsMETA(node))
3158            {
3159                AttVal * httpEquiv = AttrGetById(node, TidyAttr_HTTP_EQUIV);
3160                AttVal * content = AttrGetById(node, TidyAttr_CONTENT);
3161                if (httpEquiv && AttrValueIs(httpEquiv, "Content-Type") && AttrHasValue(content))
3162                {
3163                    tmbstr val, charset;
3164                    uint end = 0;
3165                    val = charset = TY_(tmbstrdup)(content->value);
3166                    val = TY_(tmbstrtolower)(val);
3167                    val = strstr(content->value, "charset");
3168
3169                    if (val)
3170                        val += 7;
3171
3172                    while(val && *val && (TY_(IsWhite)((tchar)*val) ||
3173                          *val == '=' || *val == '"' || *val == '\''))
3174                        ++val;
3175
3176                    while(val && val[end] && !(TY_(IsWhite)((tchar)val[end]) ||
3177                          val[end] == '"' || val[end] == '\'' || val[end] == ';'))
3178                        ++end;
3179
3180                    if (val && end)
3181                    {
3182                        tmbstr encoding = TY_(tmbstrndup)(val, end);
3183                        uint id = TY_(GetEncodingIdFromName)(encoding);
3184
3185                        /* todo: detect mismatch with BOM/XMLDecl/declared */
3186                        /* todo: error for unsupported encodings */
3187                        /* todo: try to re-init transcoder */
3188                        /* todo: change input/output encoding settings */
3189                        /* todo: store id in StreamIn */
3190
3191                        MemFree(encoding);
3192                    }
3193
3194                    MemFree(charset);
3195                }
3196            }
3197#endif /* AUTO_INPUT_ENCODING */
3198
3199            TY_(InsertNodeAtEnd)(head, node);
3200            ParseTag(doc, node, IgnoreWhitespace);
3201            continue;
3202        }
3203
3204        /* discard unexpected text nodes and end tags */
3205        TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3206        TY_(FreeNode)( doc, node);
3207    }
3208}
3209
3210void TY_(ParseBody)(TidyDocImpl* doc, Node *body, GetTokenMode mode)
3211{
3212    Lexer* lexer = doc->lexer;
3213    Node *node;
3214    Bool checkstack, iswhitenode;
3215
3216    mode = IgnoreWhitespace;
3217    checkstack = yes;
3218
3219    TY_(BumpObject)( doc, body->parent );
3220
3221    while ((node = TY_(GetToken)(doc, mode)) != NULL)
3222    {
3223        /* find and discard multiple <body> elements */
3224        if (node->tag == body->tag && node->type == StartTag)
3225        {
3226            TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3227            TY_(FreeNode)(doc, node);
3228            continue;
3229        }
3230
3231        /* #538536 Extra endtags not detected */
3232        if ( nodeIsHTML(node) )
3233        {
3234            if (TY_(nodeIsElement)(node) || lexer->seenEndHtml)
3235                TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3236            else
3237                lexer->seenEndHtml = 1;
3238
3239            TY_(FreeNode)( doc, node);
3240            continue;
3241        }
3242
3243        if ( lexer->seenEndBody &&
3244             ( node->type == StartTag ||
3245               node->type == EndTag   ||
3246               node->type == StartEndTag ) )
3247        {
3248            TY_(ReportError)(doc, body, node, CONTENT_AFTER_BODY );
3249        }
3250
3251        if ( node->tag == body->tag && node->type == EndTag )
3252        {
3253            body->closed = yes;
3254            TrimSpaces(doc, body);
3255            TY_(FreeNode)( doc, node);
3256            lexer->seenEndBody = 1;
3257            mode = IgnoreWhitespace;
3258
3259            if ( nodeIsNOFRAMES(body->parent) )
3260                break;
3261
3262            continue;
3263        }
3264
3265        if ( nodeIsNOFRAMES(node) )
3266        {
3267            if (node->type == StartTag)
3268            {
3269                TY_(InsertNodeAtEnd)(body, node);
3270                TY_(ParseBlock)(doc, node, mode);
3271                continue;
3272            }
3273
3274            if (node->type == EndTag && nodeIsNOFRAMES(body->parent) )
3275            {
3276                TrimSpaces(doc, body);
3277                TY_(UngetToken)( doc );
3278                break;
3279            }
3280        }
3281
3282        if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node))
3283             && nodeIsNOFRAMES(body->parent) )
3284        {
3285            TrimSpaces(doc, body);
3286            TY_(UngetToken)( doc );
3287            break;
3288        }
3289
3290        iswhitenode = no;
3291
3292        if ( TY_(nodeIsText)(node) &&
3293             node->end <= node->start + 1 &&
3294             lexer->lexbuf[node->start] == ' ' )
3295            iswhitenode = yes;
3296
3297        /* deal with comments etc. */
3298        if (InsertMisc(body, node))
3299            continue;
3300
3301        /* #538536 Extra endtags not detected */
3302#if 0
3303        if ( lexer->seenEndBody == 1 && !iswhitenode )
3304        {
3305            ++lexer->seenEndBody;
3306            TY_(ReportError)(doc, body, node, CONTENT_AFTER_BODY);
3307        }
3308#endif
3309
3310        /* mixed content model permits text */
3311        if (TY_(nodeIsText)(node))
3312        {
3313            if (iswhitenode && mode == IgnoreWhitespace)
3314            {
3315                TY_(FreeNode)( doc, node);
3316                continue;
3317            }
3318
3319            /* HTML 2 and HTML4 strict don't allow text here */
3320            TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20));
3321
3322            if (checkstack)
3323            {
3324                checkstack = no;
3325
3326                if ( TY_(InlineDup)(doc, node) > 0 )
3327                    continue;
3328            }
3329
3330            TY_(InsertNodeAtEnd)(body, node);
3331            mode = MixedContent;
3332            continue;
3333        }
3334
3335        if (node->type == DocTypeTag)
3336        {
3337            InsertDocType(doc, body, node);
3338            continue;
3339        }
3340        /* discard unknown  and PARAM tags */
3341        if ( node->tag == NULL || nodeIsPARAM(node) )
3342        {
3343            TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3344            TY_(FreeNode)( doc, node);
3345            continue;
3346        }
3347
3348        /*
3349          Netscape allows LI and DD directly in BODY
3350          We infer UL or DL respectively and use this
3351          Bool to exclude block-level elements so as
3352          to match Netscape's observed behaviour.
3353        */
3354        lexer->excludeBlocks = no;
3355
3356        if ( nodeIsINPUT(node) ||
3357             (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE))
3358           )
3359        {
3360            /* avoid this error message being issued twice */
3361            if (!(node->tag->model & CM_HEAD))
3362                TY_(ReportError)(doc, body, node, TAG_NOT_ALLOWED_IN);
3363
3364            if (node->tag->model & CM_HTML)
3365            {
3366                /* copy body attributes if current body was inferred */
3367                if ( nodeIsBODY(node) && body->implicit
3368                     && body->attributes == NULL )
3369                {
3370                    body->attributes = node->attributes;
3371                    node->attributes = NULL;
3372                }
3373
3374                TY_(FreeNode)( doc, node);
3375                continue;
3376            }
3377
3378            if (node->tag->model & CM_HEAD)
3379            {
3380                MoveToHead(doc, body, node);
3381                continue;
3382            }
3383
3384            if (node->tag->model & CM_LIST)
3385            {
3386                TY_(UngetToken)( doc );
3387                node = TY_(InferredTag)(doc, TidyTag_UL);
3388                AddClassNoIndent(doc, node);
3389                lexer->excludeBlocks = yes;
3390            }
3391            else if (node->tag->model & CM_DEFLIST)
3392            {
3393                TY_(UngetToken)( doc );
3394                node = TY_(InferredTag)(doc, TidyTag_DL);
3395                lexer->excludeBlocks = yes;
3396            }
3397            else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW))
3398            {
3399                TY_(UngetToken)( doc );
3400                node = TY_(InferredTag)(doc, TidyTag_TABLE);
3401                lexer->excludeBlocks = yes;
3402            }
3403            else if ( nodeIsINPUT(node) )
3404            {
3405                TY_(UngetToken)( doc );
3406                node = TY_(InferredTag)(doc, TidyTag_FORM);
3407                lexer->excludeBlocks = yes;
3408            }
3409            else
3410            {
3411                if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) )
3412                {
3413                    TY_(UngetToken)( doc );
3414                    return;
3415                }
3416
3417                /* ignore </td> </th> <option> etc. */
3418                TY_(FreeNode)( doc, node );
3419                continue;
3420            }
3421        }
3422
3423        if (node->type == EndTag)
3424        {
3425            if ( nodeIsBR(node) )
3426                node->type = StartTag;
3427            else if ( nodeIsP(node) )
3428            {
3429                node->type = StartEndTag;
3430                node->implicit = yes;
3431#if OBSOLETE
3432                TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
3433                FreeAttrs( doc, node ); /* discard align attribute etc. */
3434                TY_(InsertNodeAtEnd)(body, node);
3435                node = TY_(InferredTag)(doc, TidyTag_BR);
3436#endif
3437            }
3438            else if ( TY_(nodeHasCM)(node, CM_INLINE) )
3439                TY_(PopInline)( doc, node );
3440        }
3441
3442        if (TY_(nodeIsElement)(node))
3443        {
3444            if ( TY_(nodeHasCM)(node, CM_INLINE) && !TY_(nodeHasCM)(node, CM_MIXED) )
3445            {
3446                /* HTML4 strict doesn't allow inline content here */
3447                /* but HTML2 does allow img elements as children of body */
3448                if ( nodeIsIMG(node) )
3449                    TY_(ConstrainVersion)(doc, ~VERS_HTML40_STRICT);
3450                else
3451                    TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT|VERS_HTML20));
3452
3453                if (checkstack && !node->implicit)
3454                {
3455                    checkstack = no;
3456
3457                    if ( TY_(InlineDup)(doc, node) > 0 )
3458                        continue;
3459                }
3460
3461                mode = MixedContent;
3462            }
3463            else
3464            {
3465                checkstack = yes;
3466                mode = IgnoreWhitespace;
3467            }
3468
3469            if (node->implicit)
3470                TY_(ReportError)(doc, body, node, INSERTING_TAG);
3471
3472            TY_(InsertNodeAtEnd)(body, node);
3473            ParseTag(doc, node, mode);
3474            continue;
3475        }
3476
3477        /* discard unexpected tags */
3478        TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3479        TY_(FreeNode)( doc, node);
3480    }
3481}
3482
3483void TY_(ParseNoFrames)(TidyDocImpl* doc, Node *noframes, GetTokenMode mode)
3484{
3485    Lexer* lexer = doc->lexer;
3486    Node *node;
3487
3488    if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3489    {
3490        doc->badAccess |=  USING_NOFRAMES;
3491    }
3492    mode = IgnoreWhitespace;
3493
3494    while ( (node = TY_(GetToken)(doc, mode)) != NULL )
3495    {
3496        if ( node->tag == noframes->tag && node->type == EndTag )
3497        {
3498            TY_(FreeNode)( doc, node);
3499            noframes->closed = yes;
3500            TrimSpaces(doc, noframes);
3501            return;
3502        }
3503
3504        if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) )
3505        {
3506            TrimSpaces(doc, noframes);
3507            if (node->type == EndTag)
3508            {
3509                TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3510                TY_(FreeNode)( doc, node);       /* Throw it away */
3511            }
3512            else
3513            {
3514                TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_BEFORE);
3515                TY_(UngetToken)( doc );
3516            }
3517            return;
3518        }
3519
3520        if ( nodeIsHTML(node) )
3521        {
3522            if (TY_(nodeIsElement)(node))
3523                TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3524
3525            TY_(FreeNode)( doc, node);
3526            continue;
3527        }
3528
3529        /* deal with comments etc. */
3530        if (InsertMisc(noframes, node))
3531            continue;
3532
3533        if ( nodeIsBODY(node) && node->type == StartTag )
3534        {
3535            Bool seen_body = lexer->seenEndBody;
3536            TY_(InsertNodeAtEnd)(noframes, node);
3537            ParseTag(doc, node, IgnoreWhitespace /*MixedContent*/);
3538
3539            /* fix for bug http://tidy.sf.net/bug/887259 */
3540            if (seen_body && TY_(FindBody)(doc) != node)
3541            {
3542                TY_(CoerceNode)(doc, node, TidyTag_DIV, no, no);
3543                MoveNodeToBody(doc, node);
3544            }
3545            continue;
3546        }
3547
3548        /* implicit body element inferred */
3549        if (TY_(nodeIsText)(node) || (node->tag && node->type != EndTag))
3550        {
3551            Node *body = TY_(FindBody)( doc );
3552            if ( body || lexer->seenEndBody )
3553            {
3554                if ( body == NULL )
3555                {
3556                    TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3557                    TY_(FreeNode)( doc, node);
3558                    continue;
3559                }
3560                if ( TY_(nodeIsText)(node) )
3561                {
3562                    TY_(UngetToken)( doc );
3563                    node = TY_(InferredTag)(doc, TidyTag_P);
3564                    TY_(ReportError)(doc, noframes, node, CONTENT_AFTER_BODY );
3565                }
3566                TY_(InsertNodeAtEnd)( body, node );
3567            }
3568            else
3569            {
3570                TY_(UngetToken)( doc );
3571                node = TY_(InferredTag)(doc, TidyTag_BODY);
3572                if ( cfgBool(doc, TidyXmlOut) )
3573                    TY_(ReportError)(doc, noframes, node, INSERTING_TAG);
3574                TY_(InsertNodeAtEnd)( noframes, node );
3575            }
3576
3577            ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
3578            continue;
3579        }
3580
3581        /* discard unexpected end tags */
3582        TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3583        TY_(FreeNode)( doc, node);
3584    }
3585
3586    TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_FOR);
3587}
3588
3589void TY_(ParseFrameSet)(TidyDocImpl* doc, Node *frameset, GetTokenMode ARG_UNUSED(mode))
3590{
3591    Lexer* lexer = doc->lexer;
3592    Node *node;
3593
3594    if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3595    {
3596        doc->badAccess |=  USING_FRAMES;
3597    }
3598
3599    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
3600    {
3601        if (node->tag == frameset->tag && node->type == EndTag)
3602        {
3603            TY_(FreeNode)( doc, node);
3604            frameset->closed = yes;
3605            TrimSpaces(doc, frameset);
3606            return;
3607        }
3608
3609        /* deal with comments etc. */
3610        if (InsertMisc(frameset, node))
3611            continue;
3612
3613        if (node->tag == NULL)
3614        {
3615            TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED);
3616            TY_(FreeNode)( doc, node);
3617            continue;
3618        }
3619
3620        if (TY_(nodeIsElement)(node))
3621        {
3622            if (node->tag && node->tag->model & CM_HEAD)
3623            {
3624                MoveToHead(doc, frameset, node);
3625                continue;
3626            }
3627        }
3628
3629        if ( nodeIsBODY(node) )
3630        {
3631            TY_(UngetToken)( doc );
3632            node = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3633            TY_(ReportError)(doc, frameset, node, INSERTING_TAG);
3634        }
3635
3636        if (node->type == StartTag && (node->tag->model & CM_FRAMES))
3637        {
3638            TY_(InsertNodeAtEnd)(frameset, node);
3639            lexer->excludeBlocks = no;
3640            ParseTag(doc, node, MixedContent);
3641            continue;
3642        }
3643        else if (node->type == StartEndTag && (node->tag->model & CM_FRAMES))
3644        {
3645            TY_(InsertNodeAtEnd)(frameset, node);
3646            continue;
3647        }
3648
3649        /* discard unexpected tags */
3650        TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED);
3651        TY_(FreeNode)( doc, node);
3652    }
3653
3654    TY_(ReportError)(doc, frameset, node, MISSING_ENDTAG_FOR);
3655}
3656
3657void TY_(ParseHTML)(TidyDocImpl* doc, Node *html, GetTokenMode mode)
3658{
3659    Node *node, *head;
3660    Node *frameset = NULL;
3661    Node *noframes = NULL;
3662
3663    TY_(SetOptionBool)( doc, TidyXmlTags, no );
3664
3665    for (;;)
3666    {
3667        node = TY_(GetToken)(doc, IgnoreWhitespace);
3668
3669        if (node == NULL)
3670        {
3671            node = TY_(InferredTag)(doc, TidyTag_HEAD);
3672            break;
3673        }
3674
3675        if ( nodeIsHEAD(node) )
3676            break;
3677
3678        if (node->tag == html->tag && node->type == EndTag)
3679        {
3680            TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3681            TY_(FreeNode)( doc, node);
3682            continue;
3683        }
3684
3685        /* find and discard multiple <html> elements */
3686        if (node->tag == html->tag && node->type == StartTag)
3687        {
3688            TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3689            TY_(FreeNode)(doc, node);
3690            continue;
3691        }
3692
3693        /* deal with comments etc. */
3694        if (InsertMisc(html, node))
3695            continue;
3696
3697        TY_(UngetToken)( doc );
3698        node = TY_(InferredTag)(doc, TidyTag_HEAD);
3699        break;
3700    }
3701
3702    head = node;
3703    TY_(InsertNodeAtEnd)(html, head);
3704    TY_(ParseHead)(doc, head, mode);
3705
3706    for (;;)
3707    {
3708        node = TY_(GetToken)(doc, IgnoreWhitespace);
3709
3710        if (node == NULL)
3711        {
3712            if (frameset == NULL) /* implied body */
3713            {
3714                node = TY_(InferredTag)(doc, TidyTag_BODY);
3715                TY_(InsertNodeAtEnd)(html, node);
3716                TY_(ParseBody)(doc, node, mode);
3717            }
3718
3719            return;
3720        }
3721
3722        /* robustly handle html tags */
3723        if (node->tag == html->tag)
3724        {
3725            if (node->type != StartTag && frameset == NULL)
3726                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3727
3728            TY_(FreeNode)( doc, node);
3729            continue;
3730        }
3731
3732        /* deal with comments etc. */
3733        if (InsertMisc(html, node))
3734            continue;
3735
3736        /* if frameset document coerce <body> to <noframes> */
3737        if ( nodeIsBODY(node) )
3738        {
3739            if (node->type != StartTag)
3740            {
3741                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3742                TY_(FreeNode)( doc, node);
3743                continue;
3744            }
3745
3746            if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3747            {
3748                if (frameset != NULL)
3749                {
3750                    TY_(UngetToken)( doc );
3751
3752                    if (noframes == NULL)
3753                    {
3754                        noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3755                        TY_(InsertNodeAtEnd)(frameset, noframes);
3756                        TY_(ReportError)(doc, html, noframes, INSERTING_TAG);
3757                    }
3758                    else
3759                    {
3760                        if (noframes->type == StartEndTag)
3761                            noframes->type = StartTag;
3762                    }
3763
3764                    ParseTag(doc, noframes, mode);
3765                    continue;
3766                }
3767            }
3768
3769            TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
3770            break;  /* to parse body */
3771        }
3772
3773        /* flag an error if we see more than one frameset */
3774        if ( nodeIsFRAMESET(node) )
3775        {
3776            if (node->type != StartTag)
3777            {
3778                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3779                TY_(FreeNode)( doc, node);
3780                continue;
3781            }
3782
3783            if (frameset != NULL)
3784                TY_(ReportFatal)(doc, html, node, DUPLICATE_FRAMESET);
3785            else
3786                frameset = node;
3787
3788            TY_(InsertNodeAtEnd)(html, node);
3789            ParseTag(doc, node, mode);
3790
3791            /*
3792              see if it includes a noframes element so
3793              that we can merge subsequent noframes elements
3794            */
3795
3796            for (node = frameset->content; node; node = node->next)
3797            {
3798                if ( nodeIsNOFRAMES(node) )
3799                    noframes = node;
3800            }
3801            continue;
3802        }
3803
3804        /* if not a frameset document coerce <noframes> to <body> */
3805        if ( nodeIsNOFRAMES(node) )
3806        {
3807            if (node->type != StartTag)
3808            {
3809                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3810                TY_(FreeNode)( doc, node);
3811                continue;
3812            }
3813
3814            if (frameset == NULL)
3815            {
3816                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3817                TY_(FreeNode)( doc, node);
3818                node = TY_(InferredTag)(doc, TidyTag_BODY);
3819                break;
3820            }
3821
3822            if (noframes == NULL)
3823            {
3824                noframes = node;
3825                TY_(InsertNodeAtEnd)(frameset, noframes);
3826            }
3827            else
3828                TY_(FreeNode)( doc, node);
3829
3830            ParseTag(doc, noframes, mode);
3831            continue;
3832        }
3833
3834        if (TY_(nodeIsElement)(node))
3835        {
3836            if (node->tag && node->tag->model & CM_HEAD)
3837            {
3838                MoveToHead(doc, html, node);
3839                continue;
3840            }
3841
3842            /* discard illegal frame element following a frameset */
3843            if ( frameset != NULL && nodeIsFRAME(node) )
3844            {
3845                TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3846                TY_(FreeNode)(doc, node);
3847                continue;
3848            }
3849        }
3850
3851        TY_(UngetToken)( doc );
3852
3853        /* insert other content into noframes element */
3854
3855        if (frameset)
3856        {
3857            if (noframes == NULL)
3858            {
3859                noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3860                TY_(InsertNodeAtEnd)(frameset, noframes);
3861            }
3862            else
3863            {
3864                TY_(ReportError)(doc, html, node, NOFRAMES_CONTENT);
3865                if (noframes->type == StartEndTag)
3866                    noframes->type = StartTag;
3867            }
3868
3869            TY_(ConstrainVersion)(doc, VERS_FRAMESET);
3870            ParseTag(doc, noframes, mode);
3871            continue;
3872        }
3873
3874        node = TY_(InferredTag)(doc, TidyTag_BODY);
3875        TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
3876        break;
3877    }
3878
3879    /* node must be body */
3880
3881    TY_(InsertNodeAtEnd)(html, node);
3882    ParseTag(doc, node, mode);
3883}
3884
3885static Bool nodeCMIsOnlyInline( Node* node )
3886{
3887    return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK );
3888}
3889
3890static void EncloseBodyText(TidyDocImpl* doc)
3891{
3892    Node* node;
3893    Node* body = TY_(FindBody)(doc);
3894
3895    if (!body)
3896        return;
3897
3898    node = body->content;
3899
3900    while (node)
3901    {
3902        if ((TY_(nodeIsText)(node) && !TY_(IsBlank)(doc->lexer, node)) ||
3903            (TY_(nodeIsElement)(node) && nodeCMIsOnlyInline(node)))
3904        {
3905            Node* p = TY_(InferredTag)(doc, TidyTag_P);
3906            TY_(InsertNodeBeforeElement)(node, p);
3907            while (node && (!TY_(nodeIsElement)(node) || nodeCMIsOnlyInline(node)))
3908            {
3909                Node* next = node->next;
3910                TY_(RemoveNode)(node);
3911                TY_(InsertNodeAtEnd)(p, node);
3912                node = next;
3913            }
3914            TrimSpaces(doc, p);
3915            continue;
3916        }
3917        node = node->next;
3918    }
3919}
3920
3921/* <form>, <blockquote> and <noscript> do not allow #PCDATA in
3922   HTML 4.01 Strict (%block; model instead of %flow;).
3923  When requested, text nodes in these elements are wrapped in <p>. */
3924static void EncloseBlockText(TidyDocImpl* doc, Node* node)
3925{
3926    Node *next;
3927    Node *block;
3928
3929    while (node)
3930    {
3931        next = node->next;
3932
3933        if (node->content)
3934            EncloseBlockText(doc, node->content);
3935
3936        if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) ||
3937              nodeIsBLOCKQUOTE(node))
3938            || !node->content)
3939        {
3940            node = next;
3941            continue;
3942        }
3943
3944        block = node->content;
3945
3946        if ((TY_(nodeIsText)(block) && !TY_(IsBlank)(doc->lexer, block)) ||
3947            (TY_(nodeIsElement)(block) && nodeCMIsOnlyInline(block)))
3948        {
3949            Node* p = TY_(InferredTag)(doc, TidyTag_P);
3950            TY_(InsertNodeBeforeElement)(block, p);
3951            while (block &&
3952                   (!TY_(nodeIsElement)(block) || nodeCMIsOnlyInline(block)))
3953            {
3954                Node* tempNext = block->next;
3955                TY_(RemoveNode)(block);
3956                TY_(InsertNodeAtEnd)(p, block);
3957                block = tempNext;
3958            }
3959            TrimSpaces(doc, p);
3960            continue;
3961        }
3962
3963        node = next;
3964    }
3965}
3966
3967static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node)
3968{
3969    Node *next;
3970
3971    while (node)
3972    {
3973        next = node->next;
3974
3975        if (nodeIsDIR(node) || nodeIsMENU(node))
3976            TY_(CoerceNode)(doc, node, TidyTag_UL, yes, yes);
3977
3978        if (nodeIsXMP(node) || nodeIsLISTING(node) ||
3979            (node->tag && node->tag->id == TidyTag_PLAINTEXT))
3980            TY_(CoerceNode)(doc, node, TidyTag_PRE, yes, yes);
3981
3982        if (node->content)
3983            ReplaceObsoleteElements(doc, node->content);
3984
3985        node = next;
3986    }
3987}
3988
3989static void AttributeChecks(TidyDocImpl* doc, Node* node)
3990{
3991    Node *next;
3992
3993    while (node)
3994    {
3995        next = node->next;
3996
3997        if (TY_(nodeIsElement)(node))
3998        {
3999            if (node->tag->chkattrs)
4000                node->tag->chkattrs(doc, node);
4001            else
4002                TY_(CheckAttributes)(doc, node);
4003        }
4004
4005        if (node->content)
4006            AttributeChecks(doc, node->content);
4007
4008        node = next;
4009    }
4010}
4011
4012/* Apple Changes:
4013   2007-02-02 iccir If TidySanitizeAgainstXSS is set, remove elements which could load external content
4014*/
4015#ifdef TIDY_APPLE_CHANGES
4016static void SanitizeNodesAgainstXSS(TidyDocImpl* doc, Node* node)
4017{
4018    Node *next;
4019
4020    Bool isXml = cfgBool( doc, TidyXmlTags );
4021    Bool shouldRemoveElement;
4022
4023    while (node)
4024    {
4025        next = node->next;
4026
4027        if (!isXml)
4028        {
4029            shouldRemoveElement = TY_(nodeIsFRAMESET) (node) ||
4030                                  TY_(nodeIsSCRIPT)   (node) ||
4031                                  TY_(nodeIsIFRAME)   (node) ||
4032                                  TY_(nodeIsOBJECT)   (node) ||
4033                                  TY_(nodeIsFRAME)    (node) ||
4034                                  TY_(nodeIsEMBED)    (node) ||
4035                                  TY_(nodeIsSTYLE)    (node) ||
4036                                  TY_(nodeIsLINK)     (node) ||
4037                                  TY_(nodeIsMETA)     (node) ||
4038                                  TY_(nodeIsAPPLET)   (node) ;
4039        }
4040        else
4041        {
4042            /* When the content was parsed as XML, the tag identifiers all point at a generic XML tag identifier
4043               with an unknown tag name, so we need to manually compare the tag names with the bad set of tags. */
4044            shouldRemoveElement = node->element && (!TY_(tmbstrcasecmp)(node->element, "frameset") ||
4045                                                    !TY_(tmbstrcasecmp)(node->element, "script")   ||
4046                                                    !TY_(tmbstrcasecmp)(node->element, "iframe")   ||
4047                                                    !TY_(tmbstrcasecmp)(node->element, "object")   ||
4048                                                    !TY_(tmbstrcasecmp)(node->element, "frame")    ||
4049                                                    !TY_(tmbstrcasecmp)(node->element, "embed")    ||
4050                                                    !TY_(tmbstrcasecmp)(node->element, "style")    ||
4051                                                    !TY_(tmbstrcasecmp)(node->element, "link")     ||
4052                                                    !TY_(tmbstrcasecmp)(node->element, "meta")     ||
4053                                                    !TY_(tmbstrcasecmp)(node->element, "applet")   );
4054        }
4055
4056        if (shouldRemoveElement)
4057        {
4058            RemoveNode(node);
4059            FreeNode(doc, node);
4060        }
4061        else if (node->content)
4062        {
4063            SanitizeNodesAgainstXSS(doc, node->content);
4064        }
4065
4066        node = next;
4067    }
4068}
4069#endif
4070
4071/*
4072  HTML is the top level element
4073*/
4074void TY_(ParseDocument)(TidyDocImpl* doc)
4075{
4076    Node *node, *html, *doctype = NULL;
4077
4078    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4079    {
4080        if (node->type == XmlDecl)
4081        {
4082            if (TY_(FindXmlDecl)(doc) && doc->root.content)
4083            {
4084                TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4085                TY_(FreeNode)(doc, node);
4086                continue;
4087            }
4088            if (node->line != 1 || (node->line == 1 && node->column != 1))
4089            {
4090                TY_(ReportError)(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL);
4091            }
4092        }
4093#ifdef AUTO_INPUT_ENCODING
4094        if (node->type == XmlDecl)
4095        {
4096            AttVal* encoding = GetAttrByName(node, "encoding");
4097            if (AttrHasValue(encoding))
4098            {
4099                uint id = TY_(GetEncodingIdFromName)(encoding->value);
4100
4101                /* todo: detect mismatch with BOM/XMLDecl/declared */
4102                /* todo: error for unsupported encodings */
4103                /* todo: try to re-init transcoder */
4104                /* todo: change input/output encoding settings */
4105                /* todo: store id in StreamIn */
4106            }
4107        }
4108#endif /* AUTO_INPUT_ENCODING */
4109
4110        /* deal with comments etc. */
4111        if (InsertMisc( &doc->root, node ))
4112            continue;
4113
4114        if (node->type == DocTypeTag)
4115        {
4116            if (doctype == NULL)
4117            {
4118                TY_(InsertNodeAtEnd)( &doc->root, node);
4119                doctype = node;
4120            }
4121            else
4122            {
4123                TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4124                TY_(FreeNode)( doc, node);
4125            }
4126            continue;
4127        }
4128
4129        if (node->type == EndTag)
4130        {
4131            TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4132            TY_(FreeNode)( doc, node);
4133            continue;
4134        }
4135
4136        if (node->type == StartTag && nodeIsHTML(node))
4137        {
4138            AttVal *xmlns;
4139
4140            xmlns = TY_(AttrGetById)(node, TidyAttr_XMLNS);
4141
4142            if (AttrValueIs(xmlns, XHTML_NAMESPACE))
4143            {
4144                Bool htmlOut = cfgBool( doc, TidyHtmlOut );
4145                doc->lexer->isvoyager = yes;                  /* Unless plain HTML */
4146                TY_(SetOptionBool)( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/
4147                TY_(SetOptionBool)( doc, TidyXmlOut, !htmlOut );   /* will be XHTML. */
4148
4149                /* adjust other config options, just as in config.c */
4150                if ( !htmlOut )
4151                {
4152                    TY_(SetOptionBool)( doc, TidyUpperCaseTags, no );
4153                    TY_(SetOptionBool)( doc, TidyUpperCaseAttrs, no );
4154                }
4155            }
4156        }
4157
4158        if ( node->type != StartTag || !nodeIsHTML(node) )
4159        {
4160            TY_(UngetToken)( doc );
4161            html = TY_(InferredTag)(doc, TidyTag_HTML);
4162        }
4163        else
4164            html = node;
4165
4166        if (!TY_(FindDocType)(doc))
4167            TY_(ReportError)(doc, NULL, NULL, MISSING_DOCTYPE);
4168
4169        TY_(InsertNodeAtEnd)( &doc->root, html);
4170        TY_(ParseHTML)( doc, html, IgnoreWhitespace );
4171        break;
4172    }
4173
4174#if SUPPORT_ACCESSIBILITY_CHECKS
4175    /* do this before any more document fixes */
4176    if ( cfg( doc, TidyAccessibilityCheckLevel ) > 0 )
4177        TY_(AccessibilityChecks)( doc );
4178#endif /* #if SUPPORT_ACCESSIBILITY_CHECKS */
4179
4180    if (!TY_(FindHTML)(doc))
4181    {
4182        /* a later check should complain if <body> is empty */
4183        html = TY_(InferredTag)(doc, TidyTag_HTML);
4184        TY_(InsertNodeAtEnd)( &doc->root, html);
4185        TY_(ParseHTML)(doc, html, IgnoreWhitespace);
4186    }
4187
4188    if (!TY_(FindTITLE)(doc))
4189    {
4190        Node* head = TY_(FindHEAD)(doc);
4191        TY_(ReportError)(doc, head, NULL, MISSING_TITLE_ELEMENT);
4192        TY_(InsertNodeAtEnd)(head, TY_(InferredTag)(doc, TidyTag_TITLE));
4193    }
4194
4195#ifdef TIDY_APPLE_CHANGES
4196    if (cfgBool(doc, TidySanitizeAgainstXSS))
4197        SanitizeNodesAgainstXSS(doc, &doc->root);
4198#endif
4199    AttributeChecks(doc, &doc->root);
4200    ReplaceObsoleteElements(doc, &doc->root);
4201    TY_(DropEmptyElements)(doc, &doc->root);
4202    CleanSpaces(doc, &doc->root);
4203
4204    if (cfgBool(doc, TidyEncloseBodyText))
4205        EncloseBodyText(doc);
4206    if (cfgBool(doc, TidyEncloseBlockText))
4207        EncloseBlockText(doc, &doc->root);
4208}
4209
4210Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element)
4211{
4212    AttVal *attribute;
4213
4214    /* search attributes for xml:space */
4215    for (attribute = element->attributes; attribute; attribute = attribute->next)
4216    {
4217        if (attrIsXML_SPACE(attribute))
4218        {
4219            if (AttrValueIs(attribute, "preserve"))
4220                return yes;
4221
4222            return no;
4223        }
4224    }
4225
4226    if (element->element == NULL)
4227        return no;
4228
4229    /* kludge for html docs without explicit xml:space attribute */
4230    if (nodeIsPRE(element)    ||
4231        nodeIsSCRIPT(element) ||
4232        nodeIsSTYLE(element)  ||
4233        TY_(FindParser)(doc, element) == TY_(ParsePre))
4234        return yes;
4235
4236    /* kludge for XSL docs */
4237    if ( TY_(tmbstrcasecmp)(element->element, "xsl:text") == 0 )
4238        return yes;
4239
4240    return no;
4241}
4242
4243/*
4244  XML documents
4245*/
4246static void ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode)
4247{
4248    Lexer* lexer = doc->lexer;
4249    Node *node;
4250
4251    /* if node is pre or has xml:space="preserve" then do so */
4252
4253    if ( TY_(XMLPreserveWhiteSpace)(doc, element) )
4254        mode = Preformatted;
4255
4256    while ((node = TY_(GetToken)(doc, mode)) != NULL)
4257    {
4258        if (node->type == EndTag &&
4259           node->element && element->element &&
4260           TY_(tmbstrcmp)(node->element, element->element) == 0)
4261        {
4262            TY_(FreeNode)( doc, node);
4263            element->closed = yes;
4264            break;
4265        }
4266
4267        /* discard unexpected end tags */
4268        if (node->type == EndTag)
4269        {
4270            if (element)
4271                TY_(ReportFatal)(doc, element, node, UNEXPECTED_ENDTAG_IN);
4272            else
4273                TY_(ReportFatal)(doc, element, node, UNEXPECTED_ENDTAG);
4274
4275            TY_(FreeNode)( doc, node);
4276            continue;
4277        }
4278
4279        /* parse content on seeing start tag */
4280        if (node->type == StartTag)
4281            ParseXMLElement( doc, node, mode );
4282
4283        TY_(InsertNodeAtEnd)(element, node);
4284    }
4285
4286    /*
4287     if first child is text then trim initial space and
4288     delete text node if it is empty.
4289    */
4290
4291    node = element->content;
4292
4293    if (TY_(nodeIsText)(node) && mode != Preformatted)
4294    {
4295        if ( lexer->lexbuf[node->start] == ' ' )
4296        {
4297            node->start++;
4298
4299            if (node->start >= node->end)
4300                TY_(DiscardElement)( doc, node );
4301        }
4302    }
4303
4304    /*
4305     if last child is text then trim final space and
4306     delete the text node if it is empty
4307    */
4308
4309    node = element->last;
4310
4311    if (TY_(nodeIsText)(node) && mode != Preformatted)
4312    {
4313        if ( lexer->lexbuf[node->end - 1] == ' ' )
4314        {
4315            node->end--;
4316
4317            if (node->start >= node->end)
4318                TY_(DiscardElement)( doc, node );
4319        }
4320    }
4321}
4322
4323void TY_(ParseXMLDocument)(TidyDocImpl* doc)
4324{
4325    Node *node, *doctype = NULL;
4326
4327    TY_(SetOptionBool)( doc, TidyXmlTags, yes );
4328
4329    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4330    {
4331        /* discard unexpected end tags */
4332        if (node->type == EndTag)
4333        {
4334            TY_(ReportError)(doc, NULL, node, UNEXPECTED_ENDTAG);
4335            TY_(FreeNode)( doc, node);
4336            continue;
4337        }
4338
4339         /* deal with comments etc. */
4340        if (InsertMisc( &doc->root, node))
4341            continue;
4342
4343        if (node->type == DocTypeTag)
4344        {
4345            if (doctype == NULL)
4346            {
4347                TY_(InsertNodeAtEnd)( &doc->root, node);
4348                doctype = node;
4349            }
4350            else
4351            {
4352                TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4353                TY_(FreeNode)( doc, node);
4354            }
4355            continue;
4356        }
4357
4358        if (node->type == StartEndTag)
4359        {
4360            TY_(InsertNodeAtEnd)( &doc->root, node);
4361            continue;
4362        }
4363
4364       /* if start tag then parse element's content */
4365        if (node->type == StartTag)
4366        {
4367            TY_(InsertNodeAtEnd)( &doc->root, node );
4368            ParseXMLElement( doc, node, IgnoreWhitespace );
4369            continue;
4370        }
4371
4372        TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4373        TY_(FreeNode)( doc, node);
4374    }
4375
4376    /* ensure presence of initial <?xml version="1.0"?> */
4377    if ( cfgBool(doc, TidyXmlDecl) )
4378        TY_(FixXmlDecl)( doc );
4379
4380#ifdef TIDY_APPLE_CHANGES
4381    if (cfgBool(doc, TidySanitizeAgainstXSS)) {
4382        SanitizeNodesAgainstXSS(doc, &doc->root);
4383        AttributeChecks(doc, &doc->root);
4384    }
4385#endif
4386}
4387
4388/*
4389 * local variables:
4390 * mode: c
4391 * indent-tabs-mode: nil
4392 * c-basic-offset: 4
4393 * eval: (c-set-offset 'substatement-open 0)
4394 * end:
4395 */
4396