1/*
2  clean.c -- clean up misuse of presentation markup
3
4  (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
5  See tidy.h for the copyright notice.
6
7  CVS Info :
8
9    $Author: iccir $
10    $Date: 2007/01/30 23:46:51 $
11    $Revision: 1.3 $
12
  Filters from other formats such as Microsoft Word
  often make excessive use of presentation markup such
  as font tags, B, I, and the align attribute. By applying
  a set of production rules, it is straightforward to
  transform this to use CSS.
18
19  Some rules replace some of the children of an element by
20  style properties on the element, e.g.
21
22  <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
23
  Such rules are applied to the element's content and then
  to the element itself until no more rules apply.
  Having applied all the rules to an element, it will have
  a style attribute with one or more properties.
28
29  Other rules strip the element they apply to, replacing
30  it by style properties on the contents, e.g.
31
  <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
33
34  These rules are applied to an element before processing
35  its content and replace the current element by the first
36  element in the exposed content.
37
38  After applying both sets of rules, you can replace the
39  style attribute by a class value and style rule in the
40  document head. To support this, an association of styles
41  and class names is built.
42
43  A naive approach is to rely on string matching to test
44  when two property lists are the same. A better approach
45  would be to first sort the properties before matching.
46
47*/
48
49#include <stdio.h>
50#include <stdlib.h>
51#include <string.h>
52
53#include "tidy-int.h"
54#include "clean.h"
55#include "lexer.h"
56#include "parser.h"
57#include "attrs.h"
58#include "message.h"
59#include "tmbstr.h"
60#include "utf8.h"
61
62static Node* CleanNode( TidyDocImpl* doc, Node *node );
63
64static void RenameElem( Node* node, TidyTagId tid )
65{
66    const Dict* dict = TY_(LookupTagDef)( tid );
67    MemFree( node->element );
68    node->element = TY_(tmbstrdup)( dict->name );
69    node->tag = dict;
70}
71
72static void FreeStyleProps(StyleProp *props)
73{
74    StyleProp *next;
75
76    while (props)
77    {
78        next = props->next;
79        MemFree(props->name);
80        MemFree(props->value);
81        MemFree(props);
82        props = next;
83    }
84}
85
86static StyleProp *InsertProperty( StyleProp* props, ctmbstr name, ctmbstr value )
87{
88    StyleProp *first, *prev, *prop;
89    int cmp;
90
91    prev = NULL;
92    first = props;
93
94    while (props)
95    {
96        cmp = TY_(tmbstrcmp)(props->name, name);
97
98        if (cmp == 0)
99        {
100            /* this property is already defined, ignore new value */
101            return first;
102        }
103
104        if (cmp > 0)
105        {
106            /* insert before this */
107
108            prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
109            prop->name = TY_(tmbstrdup)(name);
110            prop->value = TY_(tmbstrdup)(value);
111            prop->next = props;
112
113            if (prev)
114                prev->next = prop;
115            else
116                first = prop;
117
118            return first;
119        }
120
121        prev = props;
122        props = props->next;
123    }
124
125    prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
126    prop->name = TY_(tmbstrdup)(name);
127    prop->value = TY_(tmbstrdup)(value);
128    prop->next = NULL;
129
130    if (prev)
131        prev->next = prop;
132    else
133        first = prop;
134
135    return first;
136}
137
138/*
139 Create sorted linked list of properties from style string
140 It temporarily places nulls in place of ':' and ';' to
141 delimit the strings for the property name and value.
142 Some systems don't allow you to NULL literal strings,
143 so to avoid this, a copy is made first.
144*/
145static StyleProp* CreateProps( StyleProp* prop, ctmbstr style )
146{
147    tmbstr name, value = NULL, name_end, value_end, line;
148    Bool more;
149
150    line = TY_(tmbstrdup)(style);
151    name = line;
152
153    while (*name)
154    {
155        while (*name == ' ')
156            ++name;
157
158        name_end = name;
159
160        while (*name_end)
161        {
162            if (*name_end == ':')
163            {
164                value = name_end + 1;
165                break;
166            }
167
168            ++name_end;
169        }
170
171        if (*name_end != ':')
172            break;
173
174        while ( value && *value == ' ')
175            ++value;
176
177        value_end = value;
178        more = no;
179
180        while (*value_end)
181        {
182            if (*value_end == ';')
183            {
184                more = yes;
185                break;
186            }
187
188            ++value_end;
189        }
190
191        *name_end = '\0';
192        *value_end = '\0';
193
194        prop = InsertProperty(prop, name, value);
195        *name_end = ':';
196
197        if (more)
198        {
199            *value_end = ';';
200            name = value_end + 1;
201            continue;
202        }
203
204        break;
205    }
206
207    MemFree(line);  /* free temporary copy */
208    return prop;
209}
210
211static tmbstr CreatePropString(StyleProp *props)
212{
213    tmbstr style, p, s;
214    uint len;
215    StyleProp *prop;
216
217    /* compute length */
218
219    for (len = 0, prop = props; prop; prop = prop->next)
220    {
221        len += TY_(tmbstrlen)(prop->name) + 2;
222        if (prop->value)
223            len += TY_(tmbstrlen)(prop->value) + 2;
224    }
225
226    style = (tmbstr) MemAlloc(len+1);
227    style[0] = '\0';
228
229    for (p = style, prop = props; prop; prop = prop->next)
230    {
231        s = prop->name;
232
233        while((*p++ = *s++))
234            continue;
235
236        if (prop->value)
237        {
238            *--p = ':';
239            *++p = ' ';
240            ++p;
241
242            s = prop->value;
243            while((*p++ = *s++))
244                continue;
245        }
246        if (prop->next == NULL)
247            break;
248
249        *--p = ';';
250        *++p = ' ';
251        ++p;
252    }
253
254    return style;
255}
256
257/*
258  create string with merged properties
259static tmbstr AddProperty( ctmbstr style, ctmbstr property )
260{
261    tmbstr line;
262    StyleProp *prop;
263
264    prop = CreateProps(NULL, style);
265    prop = CreateProps(prop, property);
266    line = CreatePropString(prop);
267    FreeStyleProps(prop);
268    return line;
269}
270*/
271
272void TY_(FreeStyles)( TidyDocImpl* doc )
273{
274    Lexer* lexer = doc->lexer;
275    if ( lexer )
276    {
277        TagStyle *style, *next;
278        for ( style = lexer->styles; style; style = next )
279        {
280            next = style->next;
281            MemFree( style->tag );
282            MemFree( style->tag_class );
283            MemFree( style->properties );
284            MemFree( style );
285        }
286    }
287}
288
289static tmbstr GensymClass( TidyDocImpl* doc )
290{
291    tmbchar buf[512];  /* CSSPrefix is limited to 256 characters */
292    ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
293    if ( pfx == NULL || *pfx == 0 )
294      pfx = "c";
295
296    TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
297    return TY_(tmbstrdup)(buf);
298}
299
300static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
301{
302    Lexer* lexer = doc->lexer;
303    TagStyle* style;
304
305    for (style = lexer->styles; style; style=style->next)
306    {
307        if (TY_(tmbstrcmp)(style->tag, tag) == 0 &&
308            TY_(tmbstrcmp)(style->properties, properties) == 0)
309            return style->tag_class;
310    }
311
312    style = (TagStyle *)MemAlloc( sizeof(TagStyle) );
313    style->tag = TY_(tmbstrdup)(tag);
314    style->tag_class = GensymClass( doc );
315    style->properties = TY_(tmbstrdup)( properties );
316    style->next = lexer->styles;
317    lexer->styles = style;
318    return style->tag_class;
319}
320
321/*
322 Add class="foo" to node
323*/
324static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
325{
326    AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
327
328    /*
329     if there already is a class attribute
330     then append class name after a space.
331    */
332    if (classattr)
333        TY_(AppendToClassAttr)( classattr, classname );
334    else /* create new class attribute */
335        TY_(AddAttribute)( doc, node, "class", classname );
336}
337
338void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
339{
340    ctmbstr classname;
341
342    classname = FindStyle( doc, node->element, stylevalue );
343    AddClass( doc, node, classname);
344}
345
346/*
347 Find style attribute in node, and replace it
348 by corresponding class attribute. Search for
349 class in style dictionary otherwise gensym
350 new class and add to dictionary.
351
352 Assumes that node doesn't have a class attribute
353*/
354static void Style2Rule( TidyDocImpl* doc, Node *node)
355{
356    AttVal *styleattr, *classattr;
357    ctmbstr classname;
358
359    styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
360
361    if (styleattr)
362    {
363        /* fix for http://tidy.sf.net/bug/850215 */
364        if (!styleattr->value)
365        {
366            TY_(RemoveAttribute)(doc, node, styleattr);
367            return;
368        }
369
370        classname = FindStyle( doc, node->element, styleattr->value );
371        classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
372
373        /*
374         if there already is a class attribute
375         then append class name after an underscore
376        */
377        if (classattr)
378        {
379            TY_(AppendToClassAttr)( classattr, classname );
380            TY_(RemoveAttribute)( doc, node, styleattr );
381        }
382        else /* reuse style attribute for class attribute */
383        {
384            MemFree(styleattr->attribute);
385            MemFree(styleattr->value);
386            styleattr->attribute = TY_(tmbstrdup)("class");
387            styleattr->value = TY_(tmbstrdup)(classname);
388        }
389    }
390}
391
392static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
393{
394    if ( selector && color )
395    {
396        TY_(AddStringLiteral)(lexer, selector);
397        TY_(AddStringLiteral)(lexer, " { color: ");
398        TY_(AddStringLiteral)(lexer, color);
399        TY_(AddStringLiteral)(lexer, " }\n");
400    }
401}
402
403/*
404 move presentation attribs from body to style element
405
406 background="foo" ->  body { background-image: url(foo) }
407 bgcolor="foo"    ->  body { background-color: foo }
408 text="foo"       ->  body { color: foo }
409 link="foo"       ->  :link { color: foo }
410 vlink="foo"      ->  :visited { color: foo }
411 alink="foo"      ->  :active { color: foo }
412*/
413static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
414{
415    Lexer* lexer  = doc->lexer;
416    tmbstr bgurl   = NULL;
417    tmbstr bgcolor = NULL;
418    tmbstr color   = NULL;
419    AttVal* attr;
420
421    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
422    {
423        bgurl = attr->value;
424        attr->value = NULL;
425        TY_(RemoveAttribute)( doc, body, attr );
426    }
427
428    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
429    {
430        bgcolor = attr->value;
431        attr->value = NULL;
432        TY_(RemoveAttribute)( doc, body, attr );
433    }
434
435    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
436    {
437        color = attr->value;
438        attr->value = NULL;
439        TY_(RemoveAttribute)( doc, body, attr );
440    }
441
442    if ( bgurl || bgcolor || color )
443    {
444        TY_(AddStringLiteral)(lexer, " body {\n");
445        if (bgurl)
446        {
447            TY_(AddStringLiteral)(lexer, "  background-image: url(");
448            TY_(AddStringLiteral)(lexer, bgurl);
449            TY_(AddStringLiteral)(lexer, ");\n");
450            MemFree(bgurl);
451        }
452        if (bgcolor)
453        {
454            TY_(AddStringLiteral)(lexer, "  background-color: ");
455            TY_(AddStringLiteral)(lexer, bgcolor);
456            TY_(AddStringLiteral)(lexer, ";\n");
457            MemFree(bgcolor);
458        }
459        if (color)
460        {
461            TY_(AddStringLiteral)(lexer, "  color: ");
462            TY_(AddStringLiteral)(lexer, color);
463            TY_(AddStringLiteral)(lexer, ";\n");
464            MemFree(color);
465        }
466
467        TY_(AddStringLiteral)(lexer, " }\n");
468    }
469
470    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
471    {
472        AddColorRule(lexer, " :link", attr->value);
473        TY_(RemoveAttribute)( doc, body, attr );
474    }
475
476    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
477    {
478        AddColorRule(lexer, " :visited", attr->value);
479        TY_(RemoveAttribute)( doc, body, attr );
480    }
481
482    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
483    {
484        AddColorRule(lexer, " :active", attr->value);
485        TY_(RemoveAttribute)( doc, body, attr );
486    }
487}
488
489static Bool NiceBody( TidyDocImpl* doc )
490{
491    Node* node = TY_(FindBody)(doc);
492    if (node)
493    {
494        if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
495            TY_(AttrGetById)(node, TidyAttr_BGCOLOR)    ||
496            TY_(AttrGetById)(node, TidyAttr_TEXT)       ||
497            TY_(AttrGetById)(node, TidyAttr_LINK)       ||
498            TY_(AttrGetById)(node, TidyAttr_VLINK)      ||
499            TY_(AttrGetById)(node, TidyAttr_ALINK))
500        {
501            doc->badLayout |= USING_BODY;
502            return no;
503        }
504    }
505
506    return yes;
507}
508
509/* create style element using rules from dictionary */
510static void CreateStyleElement( TidyDocImpl* doc )
511{
512    Lexer* lexer = doc->lexer;
513    Node *node, *head, *body;
514    TagStyle *style;
515    AttVal *av;
516
517    if ( lexer->styles == NULL && NiceBody(doc) )
518        return;
519
520    node = TY_(NewNode)( lexer );
521    node->type = StartTag;
522    node->implicit = yes;
523    node->element = TY_(tmbstrdup)("style");
524    TY_(FindTag)( doc, node );
525
526    /* insert type attribute */
527    av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
528    TY_(InsertAttributeAtStart)( node, av );
529
530    body = TY_(FindBody)( doc );
531    lexer->txtstart = lexer->lexsize;
532    if ( body )
533        CleanBodyAttrs( doc, body );
534
535    for (style = lexer->styles; style; style = style->next)
536    {
537        TY_(AddCharToLexer)(lexer, ' ');
538        TY_(AddStringLiteral)(lexer, style->tag);
539        TY_(AddCharToLexer)(lexer, '.');
540        TY_(AddStringLiteral)(lexer, style->tag_class);
541        TY_(AddCharToLexer)(lexer, ' ');
542        TY_(AddCharToLexer)(lexer, '{');
543        TY_(AddStringLiteral)(lexer, style->properties);
544        TY_(AddCharToLexer)(lexer, '}');
545        TY_(AddCharToLexer)(lexer, '\n');
546    }
547
548    lexer->txtend = lexer->lexsize;
549
550    TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
551
552    /*
553     now insert style element into document head
554
555     doc is root node. search its children for html node
556     the head node should be first child of html node
557    */
558    if ( NULL != (head = TY_(FindHEAD)( doc )) )
559        TY_(InsertNodeAtEnd)( head, node );
560}
561
562
563/* ensure bidirectional links are consistent */
564void TY_(FixNodeLinks)(Node *node)
565{
566    Node *child;
567
568    if (node->prev)
569        node->prev->next = node;
570    else
571        node->parent->content = node;
572
573    if (node->next)
574        node->next->prev = node;
575    else
576        node->parent->last = node;
577
578    for (child = node->content; child; child = child->next)
579        child->parent = node;
580}
581
582/*
583 used to strip child of node when
584 the node has one and only one child
585*/
586static void StripOnlyChild(TidyDocImpl* doc, Node *node)
587{
588    Node *child;
589
590    child = node->content;
591    node->content = child->content;
592    node->last = child->last;
593    child->content = NULL;
594    TY_(FreeNode)(doc, child);
595
596    for (child = node->content; child; child = child->next)
597        child->parent = node;
598}
599
600/*
601  used to strip font start and end tags.
602  Extricate "element", replace it by its content and delete it.
603*/
604static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
605{
606    if (element->content)
607    {
608        Node *node, *parent = element->parent;
609
610        element->last->next = element->next;
611
612        if (element->next)
613        {
614            element->next->prev = element->last;
615        }
616        else
617            parent->last = element->last;
618
619        if (element->prev)
620        {
621            element->content->prev = element->prev;
622            element->prev->next = element->content;
623        }
624        else
625            parent->content = element->content;
626
627        for (node = element->content; node; node = node->next)
628            node->parent = parent;
629
630        *pnode = element->content;
631
632        element->next = element->content = NULL;
633        TY_(FreeNode)(doc, element);
634    }
635    else
636    {
637        *pnode = TY_(DiscardElement)(doc, element);
638    }
639}
640
641/*
642  Create new string that consists of the
643  combined style properties in s1 and s2
644
645  To merge property lists, we build a linked
646  list of property/values and insert properties
647  into the list in order, merging values for
648  the same property name.
649*/
650static tmbstr MergeProperties( ctmbstr s1, ctmbstr s2 )
651{
652    tmbstr s;
653    StyleProp *prop;
654
655    prop = CreateProps(NULL, s1);
656    prop = CreateProps(prop, s2);
657    s = CreatePropString(prop);
658    FreeStyleProps(prop);
659    return s;
660}
661
662/*
663 Add style property to element, creating style
664 attribute as needed and adding ; delimiter
665*/
666void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
667{
668    AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
669
670    /* if style attribute already exists then insert property */
671
672    if ( av )
673    {
674        if (av->value != NULL)
675        {
676            tmbstr s = MergeProperties( av->value, property );
677            MemFree( av->value );
678            av->value = s;
679        }
680        else
681        {
682            av->value = TY_(tmbstrdup)( property );
683        }
684    }
685    else /* else create new style attribute */
686    {
687        av = TY_(NewAttributeEx)( doc, "style", property, '"' );
688        TY_(InsertAttributeAtStart)( node, av );
689    }
690}
691
692static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
693{
694    AttVal *av;
695    tmbstr s1, s2, names;
696
697    for (s2 = NULL, av = child->attributes; av; av = av->next)
698    {
699        if (attrIsCLASS(av))
700        {
701            s2 = av->value;
702            break;
703        }
704    }
705
706    for (s1 = NULL, av = node->attributes; av; av = av->next)
707    {
708        if (attrIsCLASS(av))
709        {
710            s1 = av->value;
711            break;
712        }
713    }
714
715    if (s1)
716    {
717        if (s2)  /* merge class names from both */
718        {
719            uint l1, l2;
720            l1 = TY_(tmbstrlen)(s1);
721            l2 = TY_(tmbstrlen)(s2);
722            names = (tmbstr) MemAlloc(l1 + l2 + 2);
723            TY_(tmbstrcpy)(names, s1);
724            names[l1] = ' ';
725            TY_(tmbstrcpy)(names+l1+1, s2);
726            MemFree(av->value);
727            av->value = names;
728        }
729    }
730    else if (s2)  /* copy class names from child */
731    {
732        av = TY_(NewAttributeEx)( doc, "class", s2, '"' );
733        TY_(InsertAttributeAtStart)( node, av );
734    }
735}
736
737static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
738{
739    AttVal *av;
740    tmbstr s1, s2, style;
741
742    /*
743       the child may have a class attribute used
744       for attaching styles, if so the class name
745       needs to be copied to node's class
746    */
747    MergeClasses(doc, node, child);
748
749    for (s2 = NULL, av = child->attributes; av; av = av->next)
750    {
751        if (attrIsSTYLE(av))
752        {
753            s2 = av->value;
754            break;
755        }
756    }
757
758    for (s1 = NULL, av = node->attributes; av; av = av->next)
759    {
760        if (attrIsSTYLE(av))
761        {
762            s1 = av->value;
763            break;
764        }
765    }
766
767    if (s1)
768    {
769        if (s2)  /* merge styles from both */
770        {
771            style = MergeProperties(s1, s2);
772            MemFree(av->value);
773            av->value = style;
774        }
775    }
776    else if (s2)  /* copy style of child */
777    {
778        av = TY_(NewAttributeEx)( doc, "style", s2, '"' );
779        TY_(InsertAttributeAtStart)( node, av );
780    }
781}
782
/*
 Map the value of a font size attribute to a CSS font-size value.

   ""            -> NULL
   "0" .. "6"    -> entry of the absolute table (NULL for "3")
   "-d", d 0..6  -> entry of the 0.8-per-step table
   "-" otherwise -> "smaller"
   "xd", d 0..6  -> entry of the 1.2-per-step table, for any
                    other leading character x (typically '+')
   anything else -> "larger"

 Only the first two characters of size are looked at.
*/
static ctmbstr FontSize2Name(ctmbstr size)
{
    static const ctmbstr absolute[7] =
    {
        "60%", "70%", "80%", NULL,
        "120%", "150%", "200%"
    };

    /* increment of 0.8 */
    static const ctmbstr shrunk[] =
    {
        "100%", "80%", "64%", "51%",
        "40%", "32%", "26%"
    };

    /* increment of 1.2 */
    static const ctmbstr grown[] =
    {
        "100%", "120%", "144%", "172%",
        "207%", "248%", "298%"
    };

    int lead, step, hasStep;

    lead = size[0];

    if (lead == '\0')
        return NULL;

    if ('0' <= lead && lead <= '6')
        return absolute[lead - '0'];

    /* relative size: a sign character followed by an optional step digit */
    step = size[1];
    hasStep = ('0' <= step && step <= '6');

    if (lead == '-')
        return hasStep ? shrunk[step - '0'] : "smaller"; /*"70%"; */

    return hasStep ? grown[step - '0'] : "larger"; /* "140%" */
}
832
833static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
834{
835    tmbchar buf[256];
836    TY_(tmbsnprintf)(buf, sizeof(buf), "font-family: %s", face );
837    TY_(AddStyleProperty)( doc, node, buf );
838}
839
840static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
841{
842    ctmbstr value = NULL;
843
844    if (nodeIsP(node))
845    {
846        if (TY_(tmbstrcmp)(size, "6") == 0)
847            value = "h1";
848        else if (TY_(tmbstrcmp)(size, "5") == 0)
849            value = "h2";
850        else if (TY_(tmbstrcmp)(size, "4") == 0)
851            value = "h3";
852
853        if (value)
854        {
855            MemFree(node->element);
856            node->element = TY_(tmbstrdup)(value);
857            TY_(FindTag)(doc, node);
858            return;
859        }
860    }
861
862    value = FontSize2Name(size);
863
864    if (value)
865    {
866        tmbchar buf[64];
867        TY_(tmbsnprintf)(buf, sizeof(buf), "font-size: %s", value);
868        TY_(AddStyleProperty)( doc, node, buf );
869    }
870}
871
872static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
873{
874    tmbchar buf[128];
875    TY_(tmbsnprintf)(buf, sizeof(buf), "color: %s", color);
876    TY_(AddStyleProperty)( doc, node, buf );
877}
878
879/* force alignment value to lower case */
880static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
881{
882    uint i;
883    tmbchar buf[128];
884
885    TY_(tmbstrcpy)( buf, "text-align: " );
886    for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
887    {
888        if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
889            break;
890    }
891    buf[i] = '\0';
892    TY_(AddStyleProperty)( doc, node, buf );
893}
894
895/*
896 add style properties to node corresponding to
897 the font face, size and color attributes
898*/
899static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
900{
901    while (av)
902    {
903        if (AttrHasValue(av))
904        {
905            if (attrIsFACE(av))
906                AddFontFace( doc, node, av->value );
907            else if (attrIsSIZE(av))
908                AddFontSize( doc, node, av->value );
909            else if (attrIsCOLOR(av))
910                AddFontColor( doc, node, av->value );
911        }
912        av = av->next;
913    }
914}
915
916/*
917    Symptom: <p align=center>
918    Action: <p style="text-align: center">
919*/
920static void TextAlign( TidyDocImpl* doc, Node* node )
921{
922    AttVal *av, *prev;
923
924    prev = NULL;
925
926    for (av = node->attributes; av; av = av->next)
927    {
928        if (attrIsALIGN(av))
929        {
930            if (prev)
931                prev->next = av->next;
932            else
933                node->attributes = av->next;
934
935            if (av->value)
936                AddAlign( doc, node, av->value );
937
938            TY_(FreeAttribute)(doc, av);
939            break;
940        }
941
942        prev = av;
943    }
944}
945
946/*
947   The clean up rules use the pnode argument to return the
948   next node when the original node has been deleted
949*/
950
951/*
952    Symptom: <dir> <li> where <li> is only child
953    Action: coerce <dir> <li> to <div> with indent.
954*/
955
956static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
957{
958    Node *child;
959
960    if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
961    {
962        child = node->content;
963
964        if (child == NULL)
965            return no;
966
967        /* check child has no peers */
968
969        if (child->next)
970            return no;
971
972        if ( !nodeIsLI(child) )
973            return no;
974
975        if ( !child->implicit )
976            return no;
977
978        /* coerce dir to div */
979        node->tag = TY_(LookupTagDef)( TidyTag_DIV );
980        MemFree( node->element );
981        node->element = TY_(tmbstrdup)("div");
982        TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
983        StripOnlyChild( doc, node );
984        return yes;
985    }
986
987    return no;
988}
989
990/*
991    Symptom: <center>
992    Action: replace <center> by <div style="text-align: center">
993*/
994
995static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
996{
997    if ( nodeIsCENTER(node) )
998    {
999        if ( cfgBool(doc, TidyDropFontTags) )
1000        {
1001            if (node->content)
1002            {
1003                Node *last = node->last;
1004                DiscardContainer( doc, node, pnode );
1005
1006                node = TY_(InferredTag)(doc, TidyTag_BR);
1007                TY_(InsertNodeAfterElement)(last, node);
1008            }
1009            else
1010            {
1011                Node *prev = node->prev, *next = node->next,
1012                     *parent = node->parent;
1013                DiscardContainer( doc, node, pnode );
1014
1015                node = TY_(InferredTag)(doc, TidyTag_BR);
1016                if (next)
1017                    TY_(InsertNodeBeforeElement)(next, node);
1018                else if (prev)
1019                    TY_(InsertNodeAfterElement)(prev, node);
1020                else
1021                    TY_(InsertNodeAtStart)(parent, node);
1022            }
1023
1024            return yes;
1025        }
1026
1027        RenameElem( node, TidyTag_DIV );
1028        TY_(AddStyleProperty)( doc, node, "text-align: center" );
1029        return yes;
1030    }
1031
1032    return no;
1033}
1034
1035/* Copy child attributes to node. Duplicate attributes are overwritten.
1036   Unique attributes (such as ID) disable the action.
1037   Attributes style and class are not dealt with. A call to MergeStyles
1038   will do that.
1039*/
1040static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1041{
1042    AttVal *av1, *av2;
1043    TidyAttrId id;
1044
1045    /* Detect attributes that cannot be merged or overwritten. */
1046    if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1047        && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1048        return no;
1049
1050    /* Move child attributes to node. Attributes in node
1051     can be overwritten or merged. */
1052    for (av2 = child->attributes; av2; )
1053    {
1054        /* Dealt by MergeStyles. */
1055        if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1056        {
1057            av2 = av2->next;
1058            continue;
1059        }
1060        /* Avoid duplicates in node */
1061        if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1062            && (av1=TY_(AttrGetById)(node, id))!= NULL)
1063            TY_(RemoveAttribute)( doc, node, av1 );
1064
1065        /* Move attribute from child to node */
1066        TY_(DetachAttribute)( child, av2 );
1067        av1 = av2;
1068        av2 = av2->next;
1069        av1->next = NULL;
1070        TY_(InsertAttributeAtEnd)( node, av1 );
1071    }
1072
1073    return yes;
1074}
1075
1076/*
1077    Symptom <XX><XX>...</XX></XX>
1078    Action: merge the two XXs
1079
1080  For instance, this is useful after nested <dir>s used by Word
1081  for indenting have been converted to <div>s
1082
1083  If state is "no", no merging.
1084  If state is "yes", inner element is discarded. Only Style and Class
1085  attributes are merged using MergeStyles().
1086  If state is "auto", atttibutes are merged as described in CopyAttrs().
1087  Style and Class attributes are merged using MergeStyles().
1088*/
1089static Bool MergeNestedElements( TidyDocImpl* doc,
1090                                 TidyTagId Id, TidyTriState state, Node *node,
1091                                 Node **ARG_UNUSED(pnode))
1092{
1093    Node *child;
1094
1095    if ( state == TidyNoState
1096         || !TagIsId(node, Id) )
1097        return no;
1098
1099    child = node->content;
1100
1101    if ( child == NULL
1102         || child->next != NULL
1103         || !TagIsId(child, Id) )
1104        return no;
1105
1106    if ( state == TidyAutoState
1107         && CopyAttrs(doc, node, child) == no )
1108        return no;
1109
1110    MergeStyles( doc, node, child );
1111    StripOnlyChild( doc, node );
1112    return yes;
1113}
1114
1115/*
1116    Symptom: <ul><li><ul>...</ul></li></ul>
1117    Action: discard outer list
1118*/
1119
1120static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1121{
1122    Node *child, *list;
1123
1124    if ( nodeIsUL(node) || nodeIsOL(node) )
1125    {
1126        child = node->content;
1127
1128        if (child == NULL)
1129            return no;
1130
1131        /* check child has no peers */
1132
1133        if (child->next)
1134            return no;
1135
1136        list = child->content;
1137
1138        if (!list)
1139            return no;
1140
1141        if (list->tag != node->tag)
1142            return no;
1143
1144        /* check list has no peers */
1145        if (list->next)
1146            return no;
1147
1148        *pnode = list;  /* Set node to resume iteration */
1149
1150        /* move inner list node into position of outer node */
1151        list->prev = node->prev;
1152        list->next = node->next;
1153        list->parent = node->parent;
1154        TY_(FixNodeLinks)(list);
1155
1156        /* get rid of outer ul and its li */
1157        child->content = NULL;
1158        TY_(FreeNode)( doc, child ); /* See test #427841. */
1159        child = NULL;
1160        node->content = NULL;
1161        node->next = NULL;
1162        TY_(FreeNode)( doc, node );
1163        node = NULL;
1164
1165        /*
1166          If prev node was a list the chances are this node
1167          should be appended to that list. Word has no way of
1168          recognizing nested lists and just uses indents
1169        */
1170
1171        if (list->prev)
1172        {
1173            if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1174                 && list->prev->last )
1175            {
1176                node = list;
1177                list = node->prev;
1178
1179                child = list->last;  /* <li> */
1180
1181                list->next = node->next;
1182                TY_(FixNodeLinks)(list);
1183
1184                node->parent = child;
1185                node->next = NULL;
1186                node->prev = child->last;
1187                TY_(FixNodeLinks)(node);
1188                CleanNode( doc, node );
1189            }
1190        }
1191
1192        return yes;
1193    }
1194
1195    return no;
1196}
1197
1198/*
1199  Some necessary conditions to apply BlockStyle().
1200 */
1201
1202static Bool CanApplyBlockStyle( Node *node )
1203{
1204    if (node->tag->model & (CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1205        && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1206    {
1207        return yes;
1208    }
1209    return no;
1210}
1211
1212/*
1213  Symptom: the only child of a block-level element is a
1214  presentation element such as B, I or FONT
1215
1216  Action: add style "font-weight: bold" to the block and
1217  strip the <b> element, leaving its children.
1218
1219  example:
1220
1221    <p>
1222      <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1223    </p>
1224
1225  becomes:
1226
1227      <p style="font-weight: bold; font-family: Arial; font-size: 6">
1228        Draft Recommended Practice
1229      </p>
1230
1231  This code also replaces the align attribute by a style attribute.
1232  However, to avoid CSS problems with Navigator 4, this isn't done
1233  for the elements: caption, tr and table
1234*/
1235static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1236{
1237    Node *child;
1238
1239    if (CanApplyBlockStyle(node))
1240    {
1241        /* check for align attribute */
1242        if ( !nodeIsCAPTION(node) )
1243            TextAlign( doc, node );
1244
1245        child = node->content;
1246        if (child == NULL)
1247            return no;
1248
1249        /* check child has no peers */
1250        if (child->next)
1251            return no;
1252
1253        if ( nodeIsB(child) )
1254        {
1255            MergeStyles( doc, node, child );
1256            TY_(AddStyleProperty)( doc, node, "font-weight: bold" );
1257            StripOnlyChild( doc, node );
1258            return yes;
1259        }
1260
1261        if ( nodeIsI(child) )
1262        {
1263            MergeStyles( doc, node, child );
1264            TY_(AddStyleProperty)( doc, node, "font-style: italic" );
1265            StripOnlyChild( doc, node );
1266            return yes;
1267        }
1268
1269        if ( nodeIsFONT(child) )
1270        {
1271            MergeStyles( doc, node, child );
1272            AddFontStyles( doc, node, child->attributes );
1273            StripOnlyChild( doc, node );
1274            return yes;
1275        }
1276    }
1277
1278    return no;
1279}
1280
1281/* the only child of table cell or an inline element such as em */
1282static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1283{
1284    Node *child;
1285
1286    if ( !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW) )
1287    {
1288        child = node->content;
1289
1290        if (child == NULL)
1291            return no;
1292
1293        /* check child has no peers */
1294
1295        if (child->next)
1296            return no;
1297
1298        if ( nodeIsB(child) && cfgBool(doc, TidyLogicalEmphasis) )
1299        {
1300            MergeStyles( doc, node, child );
1301            TY_(AddStyleProperty)( doc, node, "font-weight: bold" );
1302            StripOnlyChild( doc, node );
1303            return yes;
1304        }
1305
1306        if ( nodeIsI(child) && cfgBool(doc, TidyLogicalEmphasis) )
1307        {
1308            MergeStyles( doc, node, child );
1309            TY_(AddStyleProperty)( doc, node, "font-style: italic" );
1310            StripOnlyChild( doc, node );
1311            return yes;
1312        }
1313
1314        if ( nodeIsFONT(child) )
1315        {
1316            MergeStyles( doc, node, child );
1317            AddFontStyles( doc, node, child->attributes );
1318            StripOnlyChild( doc, node );
1319            return yes;
1320        }
1321    }
1322
1323    return no;
1324}
1325
1326/*
1327  Replace font elements by span elements, deleting
1328  the font element's attributes and replacing them
1329  by a single style attribute.
1330*/
1331static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1332{
1333    AttVal *av, *style, *next;
1334
1335    if ( nodeIsFONT(node) )
1336    {
1337        if ( cfgBool(doc, TidyDropFontTags) )
1338        {
1339            DiscardContainer( doc, node, pnode );
1340            return yes;
1341        }
1342
1343        /* if FONT is only child of parent element then leave alone
1344          Do so only if BlockStyle may be succesful. */
1345        if ( node->parent->content == node && node->next == NULL &&
1346             CanApplyBlockStyle(node->parent) )
1347            return no;
1348
1349        AddFontStyles( doc, node, node->attributes );
1350
1351        /* extract style attribute and free the rest */
1352        av = node->attributes;
1353        style = NULL;
1354
1355        while (av)
1356        {
1357            next = av->next;
1358
1359            if (attrIsSTYLE(av))
1360            {
1361                av->next = NULL;
1362                style = av;
1363            }
1364            else
1365            {
1366                TY_(FreeAttribute)( doc, av );
1367            }
1368            av = next;
1369        }
1370
1371        node->attributes = style;
1372        RenameElem( node, TidyTag_SPAN );
1373        return yes;
1374    }
1375
1376    return no;
1377}
1378
1379/*
1380  Applies all matching rules to a node.
1381*/
1382Node* CleanNode( TidyDocImpl* doc, Node *node )
1383{
1384    Node *next = NULL;
1385    TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1386
1387    for (next = node; TY_(nodeIsElement)(node); node = next)
1388    {
1389        if ( Dir2Div(doc, node, &next) )
1390            continue;
1391
1392        /* Special case: true result means
1393        ** that arg node and its parent no longer exist.
1394        ** So we must jump back up the CreateStyleProperties()
1395        ** call stack until we have a valid node reference.
1396        */
1397        if ( NestedList(doc, node, &next) )
1398            return next;
1399
1400        if ( Center2Div(doc, node, &next) )
1401            continue;
1402
1403        if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1404            continue;
1405
1406        if ( BlockStyle(doc, node, &next) )
1407            continue;
1408
1409        if ( InlineStyle(doc, node, &next) )
1410            continue;
1411
1412        if ( Font2Span(doc, node, &next) )
1413            continue;
1414
1415        break;
1416    }
1417
1418    return next;
1419}
1420
1421/* Special case: if the current node is destroyed by
1422** CleanNode() lower in the tree, this node and its parent
1423** no longer exist.  So we must jump back up the CleanTree()
1424** call stack until we have a valid node reference.
1425*/
1426
1427static Node* CleanTree( TidyDocImpl* doc, Node *node )
1428{
1429    if (node->content)
1430    {
1431        Node *child;
1432        for (child = node->content; child != NULL; child = child->next)
1433        {
1434            child = CleanTree( doc, child );
1435            if ( !child )
1436                break;
1437        }
1438    }
1439
1440    return CleanNode( doc, node );
1441}
1442
1443static void DefineStyleRules( TidyDocImpl* doc, Node *node )
1444{
1445    Node *child;
1446
1447    if (node->content)
1448    {
1449        for (child = node->content;
1450                child != NULL; child = child->next)
1451        {
1452            DefineStyleRules( doc, child );
1453        }
1454    }
1455
1456    Style2Rule( doc, node );
1457}
1458
1459void TY_(CleanDocument)( TidyDocImpl* doc )
1460{
1461    /* placeholder.  CleanTree()/CleanNode() will not
1462    ** zap root element
1463    */
1464    CleanTree( doc, &doc->root );
1465
1466    if ( cfgBool(doc, TidyMakeClean) )
1467    {
1468        DefineStyleRules( doc, &doc->root );
1469        CreateStyleElement( doc );
1470    }
1471}
1472
1473/* simplifies <b><b> ... </b> ...</b> etc. */
1474void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1475{
1476    Node *next;
1477
1478    while (node)
1479    {
1480        next = node->next;
1481
1482        if ( (nodeIsB(node) || nodeIsI(node))
1483             && node->parent && node->parent->tag == node->tag)
1484        {
1485            /* strip redundant inner element */
1486            DiscardContainer( doc, node, &next );
1487            node = next;
1488            continue;
1489        }
1490
1491        if ( node->content )
1492            TY_(NestedEmphasis)( doc, node->content );
1493
1494        node = next;
1495    }
1496}
1497
1498
1499
1500/* replace i by em and b by strong */
1501void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1502{
1503    while (node)
1504    {
1505        if ( nodeIsI(node) )
1506            RenameElem( node, TidyTag_EM );
1507        else if ( nodeIsB(node) )
1508            RenameElem( node, TidyTag_STRONG );
1509
1510        if ( node->content )
1511            TY_(EmFromI)( doc, node->content );
1512
1513        node = node->next;
1514    }
1515}
1516
1517static Bool HasOneChild(Node *node)
1518{
1519    return (node->content && node->content->next == NULL);
1520}
1521
1522/*
1523 Some people use dir or ul without an li
1524 to indent the content. The pattern to
1525 look for is a list with a single implicit
1526 li. This is recursively replaced by an
1527 implicit blockquote.
1528*/
1529void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1530{
1531    while (node)
1532    {
1533        if (node->content)
1534            TY_(List2BQ)( doc, node->content );
1535
1536        if ( node->tag && node->tag->parser == TY_(ParseList) &&
1537             HasOneChild(node) && node->content->implicit )
1538        {
1539            StripOnlyChild( doc, node );
1540            RenameElem( node, TidyTag_BLOCKQUOTE );
1541            node->implicit = yes;
1542        }
1543
1544        node = node->next;
1545    }
1546}
1547
1548
1549/*
1550 Replace implicit blockquote by div with an indent
1551 taking care to reduce nested blockquotes to a single
1552 div with the indent set to match the nesting depth
1553*/
1554void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1555{
1556    tmbchar indent_buf[ 32 ];
1557    uint indent;
1558
1559    while (node)
1560    {
1561        if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1562        {
1563            indent = 1;
1564
1565            while( HasOneChild(node) &&
1566                   nodeIsBLOCKQUOTE(node->content) &&
1567                   node->implicit)
1568            {
1569                ++indent;
1570                StripOnlyChild( doc, node );
1571            }
1572
1573            if (node->content)
1574                TY_(BQ2Div)( doc, node->content );
1575
1576            TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1577                             2*indent);
1578
1579            RenameElem( node, TidyTag_DIV );
1580            TY_(AddStyleProperty)(doc, node, indent_buf );
1581        }
1582        else if (node->content)
1583            TY_(BQ2Div)( doc, node->content );
1584
1585        node = node->next;
1586    }
1587}
1588
1589
1590static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
1591{
1592    Node *check;
1593
1594    for ( check=node; check; check = check->parent )
1595    {
1596      if ( nodeIsTD(check) )
1597        return check;
1598    }
1599    return NULL;
1600}
1601
1602/* node is <![if ...]> prune up to <![endif]> */
1603static Node* PruneSection( TidyDocImpl* doc, Node *node )
1604{
1605    Lexer* lexer = doc->lexer;
1606
1607    for (;;)
1608    {
1609        ctmbstr lexbuf = lexer->lexbuf + node->start;
1610        if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1611        {
1612          Node* cell = FindEnclosingCell( doc, node );
1613          if ( cell )
1614          {
1615            /* Need to put &nbsp; into cell so it doesn't look weird
1616            */
1617            Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1618            assert( (byte)'\240' == (byte)160 );
1619            TY_(InsertNodeBeforeElement)( node, nbsp );
1620          }
1621        }
1622
1623        /* discard node and returns next */
1624        node = TY_(DiscardElement)( doc, node );
1625
1626        if (node == NULL)
1627            return NULL;
1628
1629        if (node->type == SectionTag)
1630        {
1631            if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1632            {
1633                node = PruneSection( doc, node );
1634                continue;
1635            }
1636
1637            if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1638            {
1639                node = TY_(DiscardElement)( doc, node );
1640                break;
1641            }
1642        }
1643    }
1644
1645    return node;
1646}
1647
1648void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1649{
1650    Lexer* lexer = doc->lexer;
1651    while (node)
1652    {
1653        if (node->type == SectionTag)
1654        {
1655            /* prune up to matching endif */
1656            if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1657                (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1658            {
1659                node = PruneSection( doc, node );
1660                continue;
1661            }
1662
1663            /* discard others as well */
1664            node = TY_(DiscardElement)( doc, node );
1665            continue;
1666        }
1667
1668        if (node->content)
1669            TY_(DropSections)( doc, node->content );
1670
1671        node = node->next;
1672    }
1673}
1674
1675static void PurgeWord2000Attributes( TidyDocImpl* ARG_UNUSED(doc), Node* node )
1676{
1677    AttVal *attr, *next, *prev = NULL;
1678
1679    for ( attr = node->attributes; attr; attr = next )
1680    {
1681        next = attr->next;
1682
1683        /* special check for class="Code" denoting pre text */
1684        /* Pass thru user defined styles as HTML class names */
1685        if (attrIsCLASS(attr))
1686        {
1687            if (AttrValueIs(attr, "Code") ||
1688                 TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 )
1689            {
1690                prev = attr;
1691                continue;
1692            }
1693        }
1694
1695        if (attrIsCLASS(attr) ||
1696            attrIsSTYLE(attr) ||
1697            attrIsLANG(attr)  ||
1698             ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
1699               (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
1700             (attr->attribute && TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0) )
1701        {
1702            if (prev)
1703                prev->next = next;
1704            else
1705                node->attributes = next;
1706
1707            TY_(FreeAttribute)( doc, attr );
1708        }
1709        else
1710            prev = attr;
1711    }
1712}
1713
1714/* Word2000 uses span excessively, so we strip span out */
1715static Node* StripSpan( TidyDocImpl* doc, Node* span )
1716{
1717    Node *node, *prev = NULL, *content;
1718
1719    /*
1720     deal with span elements that have content
1721     by splicing the content in place of the span
1722     after having processed it
1723    */
1724
1725    TY_(CleanWord2000)( doc, span->content );
1726    content = span->content;
1727
1728    if (span->prev)
1729        prev = span->prev;
1730    else if (content)
1731    {
1732        node = content;
1733        content = content->next;
1734        TY_(RemoveNode)(node);
1735        TY_(InsertNodeBeforeElement)(span, node);
1736        prev = node;
1737    }
1738
1739    while (content)
1740    {
1741        node = content;
1742        content = content->next;
1743        TY_(RemoveNode)(node);
1744        TY_(InsertNodeAfterElement)(prev, node);
1745        prev = node;
1746    }
1747
1748    if (span->next == NULL)
1749        span->parent->last = prev;
1750
1751    node = span->next;
1752    span->content = NULL;
1753    TY_(DiscardElement)( doc, span );
1754    return node;
1755}
1756
1757/* map non-breaking spaces to regular spaces */
1758void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1759{
1760    while ( node )
1761    {
1762        if ( node->content )
1763            TY_(NormalizeSpaces)( lexer, node->content );
1764
1765        if (TY_(nodeIsText)(node))
1766        {
1767            uint i, c;
1768            tmbstr p = lexer->lexbuf + node->start;
1769
1770            for (i = node->start; i < node->end; ++i)
1771            {
1772                c = (byte) lexer->lexbuf[i];
1773
1774                /* look for UTF-8 multibyte character */
1775                if ( c > 0x7F )
1776                    i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1777
1778                if ( c == 160 )
1779                    c = ' ';
1780
1781                p = TY_(PutUTF8)(p, c);
1782            }
1783            node->end = p - lexer->lexbuf;
1784        }
1785
1786        node = node->next;
1787    }
1788}
1789
1790/* used to hunt for hidden preformatted sections */
1791static Bool NoMargins(Node *node)
1792{
1793    AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1794
1795    if ( !AttrHasValue(attval) )
1796        return no;
1797
1798    /* search for substring "margin-top: 0" */
1799    if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1800        return no;
1801
1802    /* search for substring "margin-bottom: 0" */
1803    if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
1804        return no;
1805
1806    return yes;
1807}
1808
1809/* does element have a single space as its content? */
1810static Bool SingleSpace( Lexer* lexer, Node* node )
1811{
1812    if ( node->content )
1813    {
1814        node = node->content;
1815
1816        if ( node->next != NULL )
1817            return no;
1818
1819        if ( node->type != TextNode )
1820            return no;
1821
1822        if ( (node->end - node->start) == 1 &&
1823             lexer->lexbuf[node->start] == ' ' )
1824            return yes;
1825
1826        if ( (node->end - node->start) == 2 )
1827        {
1828            uint c = 0;
1829            TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1830            if ( c == 160 )
1831                return yes;
1832        }
1833    }
1834
1835    return no;
1836}
1837
1838/*
1839 This is a major clean up to strip out all the extra stuff you get
1840 when you save as web page from Word 2000. It doesn't yet know what
1841 to do with VML tags, but these will appear as errors unless you
1842 declare them as new tags, such as o:p which needs to be declared
1843 as inline.
1844*/
1845void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
1846{
1847    /* used to a list from a sequence of bulletted p's */
1848    Lexer* lexer = doc->lexer;
1849    Node* list = NULL;
1850
1851    while ( node )
1852    {
1853        /* get rid of Word's xmlns attributes */
1854        if ( nodeIsHTML(node) )
1855        {
1856            /* check that it's a Word 2000 document */
1857            if ( !TY_(GetAttrByName)(node, "xmlns:o") &&
1858                 !cfgBool(doc, TidyMakeBare) )
1859                return;
1860
1861            TY_(FreeAttrs)( doc, node );
1862        }
1863
1864        /* fix up preformatted sections by looking for a
1865        ** sequence of paragraphs with zero top/bottom margin
1866        */
1867        if ( nodeIsP(node) )
1868        {
1869            if (NoMargins(node))
1870            {
1871                Node *pre, *next;
1872                TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);
1873
1874                PurgeWord2000Attributes( doc, node );
1875
1876                if (node->content)
1877                    TY_(CleanWord2000)( doc, node->content );
1878
1879                pre = node;
1880                node = node->next;
1881
1882                /* continue to strip p's */
1883
1884                while ( nodeIsP(node) && NoMargins(node) )
1885                {
1886                    next = node->next;
1887                    TY_(RemoveNode)(node);
1888                    TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
1889                    TY_(InsertNodeAtEnd)(pre, node);
1890                    StripSpan( doc, node );
1891                    node = next;
1892                }
1893
1894                if (node == NULL)
1895                    break;
1896            }
1897        }
1898
1899        if (node->tag && (node->tag->model & CM_BLOCK)
1900            && SingleSpace(lexer, node))
1901        {
1902            node = StripSpan( doc, node );
1903            continue;
1904        }
1905        /* discard Word's style verbiage */
1906        if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
1907             node->type == CommentTag )
1908        {
1909            node = TY_(DiscardElement)( doc, node );
1910            continue;
1911        }
1912
1913        /* strip out all span and font tags Word scatters so liberally! */
1914        if ( nodeIsSPAN(node) || nodeIsFONT(node) )
1915        {
1916            node = StripSpan( doc, node );
1917            continue;
1918        }
1919
1920        if ( nodeIsLINK(node) )
1921        {
1922            AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);
1923
1924            if (AttrValueIs(attr, "File-List"))
1925            {
1926                node = TY_(DiscardElement)( doc, node );
1927                continue;
1928            }
1929        }
1930
1931        /* discards <o:p> which encodes the paragraph mark */
1932        if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
1933        {
1934            Node* next;
1935            DiscardContainer( doc, node, &next );
1936            node = next;
1937            continue;
1938        }
1939
1940        /* discard empty paragraphs */
1941
1942        if ( node->content == NULL && nodeIsP(node) )
1943        {
1944            /*  Use the existing function to ensure consistency */
1945            Node *next = TY_(TrimEmptyElement)( doc, node );
1946            node = next;
1947            continue;
1948        }
1949
1950        if ( nodeIsP(node) )
1951        {
1952            AttVal *attr, *atrStyle;
1953
1954            attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
1955            atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
1956            /*
1957               (JES) Sometimes Word marks a list item with the following hokie syntax
1958               <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
1959                translate these into <li>
1960            */
1961            /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
1962            /* map <p class="MsoListNumber"> to <ol>...</ol> */
1963            if ( AttrValueIs(attr, "MsoListBullet") ||
1964                 AttrValueIs(attr, "MsoListNumber") ||
1965                 AttrContains(atrStyle, "mso-list:") )
1966            {
1967                TidyTagId listType = TidyTag_UL;
1968                if (AttrValueIs(attr, "MsoListNumber"))
1969                    listType = TidyTag_OL;
1970
1971                TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);
1972
1973                if ( !list || TagId(list) != listType )
1974                {
1975                    const Dict* tag = TY_(LookupTagDef)( listType );
1976                    list = TY_(InferredTag)(doc, tag->id);
1977                    TY_(InsertNodeBeforeElement)(node, list);
1978                }
1979
1980                PurgeWord2000Attributes( doc, node );
1981
1982                if ( node->content )
1983                    TY_(CleanWord2000)( doc, node->content );
1984
1985                /* remove node and append to contents of list */
1986                TY_(RemoveNode)(node);
1987                TY_(InsertNodeAtEnd)(list, node);
1988                node = list;
1989            }
1990            /* map sequence of <p class="Code"> to <pre>...</pre> */
1991            else if (AttrValueIs(attr, "Code"))
1992            {
1993                Node *br = TY_(NewLineNode)(lexer);
1994                TY_(NormalizeSpaces)(lexer, node->content);
1995
1996                if ( !list || TagId(list) != TidyTag_PRE )
1997                {
1998                    list = TY_(InferredTag)(doc, TidyTag_PRE);
1999                    TY_(InsertNodeBeforeElement)(node, list);
2000                }
2001
2002                /* remove node and append to contents of list */
2003                TY_(RemoveNode)(node);
2004                TY_(InsertNodeAtEnd)(list, node);
2005                StripSpan( doc, node );
2006                TY_(InsertNodeAtEnd)(list, br);
2007                node = list->next;
2008            }
2009            else
2010                list = NULL;
2011        }
2012        else
2013            list = NULL;
2014
2015        if (!node)
2016            return;
2017
2018        /* strip out style and class attributes */
2019        if (TY_(nodeIsElement)(node))
2020            PurgeWord2000Attributes( doc, node );
2021
2022        if (node->content)
2023            TY_(CleanWord2000)( doc, node->content );
2024
2025        node = node->next;
2026    }
2027}
2028
2029Bool TY_(IsWord2000)( TidyDocImpl* doc )
2030{
2031    AttVal *attval;
2032    Node *node, *head;
2033    Node *html = TY_(FindHTML)( doc );
2034
2035    if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2036        return yes;
2037
2038    /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2039    head = TY_(FindHEAD)( doc );
2040
2041    if (head)
2042    {
2043        for (node = head->content; node; node = node->next)
2044        {
2045            if ( !nodeIsMETA(node) )
2046                continue;
2047
2048            attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2049
2050            if ( !AttrValueIs(attval, "generator") )
2051                continue;
2052
2053            attval =  TY_(AttrGetById)( node, TidyAttr_CONTENT );
2054
2055            if ( AttrContains(attval, "Microsoft") )
2056                return yes;
2057        }
2058    }
2059
2060    return no;
2061}
2062
2063/* where appropriate move object elements from head to body */
2064void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2065{
2066    Node *node, *next, *head = NULL, *body = NULL;
2067
2068    if (!html)
2069        return;
2070
2071    for ( node = html->content; node != NULL; node = node->next )
2072    {
2073        if ( nodeIsHEAD(node) )
2074            head = node;
2075
2076        if ( nodeIsBODY(node) )
2077            body = node;
2078    }
2079
2080    if ( head != NULL && body != NULL )
2081    {
2082        for (node = head->content; node != NULL; node = next)
2083        {
2084            next = node->next;
2085
2086            if ( nodeIsOBJECT(node) )
2087            {
2088                Node *child;
2089                Bool bump = no;
2090
2091                for (child = node->content; child != NULL; child = child->next)
2092                {
2093                    /* bump to body unless content is param */
2094                    if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2095                         || !nodeIsPARAM(child) )
2096                    {
2097                            bump = yes;
2098                            break;
2099                    }
2100                }
2101
2102                if ( bump )
2103                {
2104                    TY_(RemoveNode)( node );
2105                    TY_(InsertNodeAtStart)( body, node );
2106                }
2107            }
2108        }
2109    }
2110}
2111
/* This is disabled due to http://tidy.sf.net/bug/681116 */
#if 0
/* Processes the children of pParent first. Then, if pParent is a
** block element, trailing <br> children are handled one at a time:
** the first one without attributes is discarded, every other one is
** moved to just after pParent. Finally pParent is passed to
** TrimEmptyElement().
*/
void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
{
    Node *pNode, *pNext;
    Bool bBRDeleted = no;

    if ( pParent == NULL )
        return;

    /*  First, check the status of All My Children  */
    for ( pNode = pParent->content; pNode != NULL; pNode = pNext )
    {
        /* The node may get trimmed, so save the next pointer, if any */
        pNext = pNode->next;
        FixBrakes( pDoc, pNode );
    }

    if ( !nodeCMIsBlock( pParent ) )
        return;

    /*  As long as my last child is a <br />, move it to my last peer  */
    for (;;)
    {
        pNode = pParent->last;
        if ( pNode == NULL || !nodeIsBR( pNode ) )
            break;

        if ( pNode->attributes == NULL && bBRDeleted == no )
        {
            TY_(DiscardElement)( pDoc, pNode );
            bBRDeleted = yes;
        }
        else
        {
            TY_(RemoveNode)( pNode );
            TY_(InsertNodeAfterElement)( pParent, pNode );
        }
    }

    TY_(TrimEmptyElement)( pDoc, pParent );
}
#endif
2155
2156void TY_(VerifyHTTPEquiv)(TidyDocImpl* pDoc, Node *head)
2157{
2158    Node *pNode;
2159    StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL;
2160    tmbstr s, pszBegin, pszEnd;
2161    ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(pDoc, TidyOutCharEncoding));
2162
2163    if (!enc)
2164        return;
2165
2166    if (!nodeIsHEAD(head))
2167        head = TY_(FindHEAD)(pDoc);
2168
2169    if (!head)
2170        return;
2171
2172    /* Find any <meta http-equiv='Content-Type' content='...' /> */
2173    for (pNode = head->content; NULL != pNode; pNode = pNode->next)
2174    {
2175        AttVal* httpEquiv = TY_(AttrGetById)(pNode, TidyAttr_HTTP_EQUIV);
2176        AttVal* metaContent = TY_(AttrGetById)(pNode, TidyAttr_CONTENT);
2177
2178        if ( !nodeIsMETA(pNode) || !metaContent ||
2179             !AttrValueIs(httpEquiv, "Content-Type") )
2180            continue;
2181
2182        pszBegin = s = TY_(tmbstrdup)( metaContent->value );
2183        while (pszBegin && *pszBegin)
2184        {
2185            while (isspace( *pszBegin ))
2186                pszBegin++;
2187            pszEnd = pszBegin;
2188            while ('\0' != *pszEnd && ';' != *pszEnd)
2189                pszEnd++;
2190            if (';' == *pszEnd )
2191                *(pszEnd++) = '\0';
2192            if (pszEnd > pszBegin)
2193            {
2194                prop = (StyleProp *)MemAlloc(sizeof(StyleProp));
2195                prop->name = TY_(tmbstrdup)( pszBegin );
2196                prop->value = NULL;
2197                prop->next = NULL;
2198
2199                if (NULL != pLastProp)
2200                    pLastProp->next = prop;
2201                else
2202                    pFirstProp = prop;
2203
2204                pLastProp = prop;
2205                pszBegin = pszEnd;
2206            }
2207        }
2208        MemFree( s );
2209
2210        /*  find the charset property */
2211        for (prop = pFirstProp; NULL != prop; prop = prop->next)
2212        {
2213            if (0 != TY_(tmbstrncasecmp)( prop->name, "charset", 7 ))
2214                continue;
2215
2216            MemFree( prop->name );
2217            prop->name = (tmbstr)MemAlloc( 8 + TY_(tmbstrlen)(enc) + 1 );
2218            TY_(tmbstrcpy)(prop->name, "charset=");
2219            TY_(tmbstrcpy)(prop->name+8, enc);
2220            s = CreatePropString( pFirstProp );
2221            MemFree( metaContent->value );
2222            metaContent->value = s;
2223            break;
2224        }
2225        /* #718127, prevent memory leakage */
2226        FreeStyleProps(pFirstProp);
2227        pFirstProp = NULL;
2228        pLastProp = NULL;
2229    }
2230}
2231
2232void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2233{
2234    Node* next;
2235
2236    while (node)
2237    {
2238        next = node->next;
2239
2240        if (node->type == CommentTag)
2241        {
2242            TY_(RemoveNode)(node);
2243            TY_(FreeNode)(doc, node);
2244            node = next;
2245            continue;
2246        }
2247
2248        if (node->content)
2249            TY_(DropComments)(doc, node->content);
2250
2251        node = next;
2252    }
2253}
2254
2255void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2256{
2257    Node* next;
2258
2259    while (node)
2260    {
2261        next = node->next;
2262
2263        if (nodeIsFONT(node))
2264        {
2265            DiscardContainer(doc, node, &next);
2266            node = next;
2267            continue;
2268        }
2269
2270        if (node->content)
2271            TY_(DropFontElements)(doc, node->content, &next);
2272
2273        node = next;
2274    }
2275}
2276
2277void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2278{
2279    Node* next;
2280
2281    while (node)
2282    {
2283        next = node->next;
2284
2285        if (nodeIsWBR(node))
2286        {
2287            Node* text;
2288            text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2289            TY_(InsertNodeAfterElement)(node, text);
2290            TY_(RemoveNode)(node);
2291            TY_(FreeNode)(doc, node);
2292            node = next;
2293            continue;
2294        }
2295
2296        if (node->content)
2297            TY_(WbrToSpace)(doc, node->content);
2298
2299        node = next;
2300   }
2301}
2302
2303/*
2304  Filters from Word and PowerPoint often use smart
2305  quotes resulting in character codes between 128
2306  and 159. Unfortunately, the corresponding HTML 4.0
2307  entities for these are not widely supported. The
2308  following converts dashes and quotation marks to
2309  the nearest ASCII equivalent. My thanks to
2310  Andrzej Novosiolov for his help with this code.
2311
2312  Note: The old code in the pretty printer applied
2313  this to all node types and attribute values while
2314  this routine applies it only to text nodes. First,
2315  Microsoft Office products rarely put the relevant
2316  characters into these tokens, second support for
2317  them is much better now and last but not least, it
2318  can be harmful to replace these characters since
2319  US-ASCII quote marks are often used as syntax
2320  characters, a simple
2321
2322    <a onmouseover="alert('&#x2018;')">...</a>
2323
2324  would be broken if the U+2018 is replaced by "'".
  The old code also did not check whether the
  quote mark was already in use as a delimiter, so
2327
2328    <p title='&#x2018;'>...</p>
2329
2330  got
2331
2332    <p title='''>...</p>
2333
2334  Since browser support is much better nowadays and
2335  high-quality typography is better than ASCII it'd
2336  be probably a good idea to drop the feature...
2337*/
2338void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2339{
2340    Node* next;
2341    Lexer* lexer = doc->lexer;
2342
2343    while (node)
2344    {
2345        next = node->next;
2346
2347        if (TY_(nodeIsText)(node))
2348        {
2349            uint i, c;
2350            tmbstr p = lexer->lexbuf + node->start;
2351
2352            for (i = node->start; i < node->end; ++i)
2353            {
2354                c = (unsigned char) lexer->lexbuf[i];
2355
2356                if (c > 0x7F)
2357                    i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2358
2359                if (c >= 0x2013 && c <= 0x201E)
2360                {
2361                    switch (c)
2362                    {
2363                    case 0x2013: /* en dash */
2364                    case 0x2014: /* em dash */
2365                        c = '-';
2366                        break;
2367                    case 0x2018: /* left single  quotation mark */
2368                    case 0x2019: /* right single quotation mark */
2369                    case 0x201A: /* single low-9 quotation mark */
2370                        c = '\'';
2371                        break;
2372                    case 0x201C: /* left double  quotation mark */
2373                    case 0x201D: /* right double quotation mark */
2374                    case 0x201E: /* double low-9 quotation mark */
2375                        c = '"';
2376                        break;
2377                    }
2378                }
2379
2380                p = TY_(PutUTF8)(p, c);
2381            }
2382
2383            node->end = p - lexer->lexbuf;
2384        }
2385
2386        if (node->content)
2387            TY_(DowngradeTypography)(doc, node->content);
2388
2389        node = next;
2390    }
2391}
2392
2393void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2394{
2395    Node* next;
2396
2397    while (node)
2398    {
2399        next = node->next;
2400
2401        if (node->tag && node->tag->parser == TY_(ParsePre))
2402        {
2403            TY_(NormalizeSpaces)(doc->lexer, node->content);
2404            node = next;
2405            continue;
2406        }
2407
2408        if (node->content)
2409            TY_(ReplacePreformattedSpaces)(doc, node->content);
2410
2411        node = next;
2412    }
2413}
2414
2415void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2416{
2417    Node* next;
2418
2419    while (node)
2420    {
2421        next = node->next;
2422
2423        if (node->type == CDATATag)
2424            node->type = TextNode;
2425
2426        if (node->content)
2427            TY_(ConvertCDATANodes)(doc, node->content);
2428
2429        node = next;
2430    }
2431}
2432
2433/*
2434  FixLanguageInformation ensures that the document contains (only)
2435  the attributes for language information desired by the output
2436  document type. For example, for XHTML 1.0 documents both
2437  'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2438  is desired and for HTML 4.01 only 'lang' is desired.
2439*/
2440void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2441{
2442    Node* next;
2443
2444    while (node)
2445    {
2446        next = node->next;
2447
2448        /* todo: report modifications made here to the report system */
2449
2450        if (TY_(nodeIsElement)(node))
2451        {
2452            AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2453            AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
2454
2455            if (lang && xmlLang)
2456            {
2457                /*
2458                  todo: check whether both attributes are in sync,
2459                  here or elsewhere, where elsewhere is probably
2460                  preferable.
2461                  AD - March 2005: not mandatory according the standards.
2462                */
2463            }
2464            else if (lang && wantXmlLang)
2465            {
2466                if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2467                    & doc->lexer->versionEmitted)
2468                    TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
2469            }
2470            else if (xmlLang && wantLang)
2471            {
2472                if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2473                    & doc->lexer->versionEmitted)
2474                    TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
2475            }
2476
2477            if (lang && !wantLang)
2478                TY_(RemoveAttribute)(doc, node, lang);
2479
2480            if (xmlLang && !wantXmlLang)
2481                TY_(RemoveAttribute)(doc, node, xmlLang);
2482        }
2483
2484        if (node->content)
2485            TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
2486
2487        node = next;
2488    }
2489}
2490
2491/*
2492  Set/fix/remove <html xmlns='...'>
2493*/
2494void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2495{
2496    Node* html = TY_(FindHTML)(doc);
2497    AttVal* xmlns;
2498
2499    if (!html)
2500        return;
2501
2502    xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
2503
2504    if (wantXmlns)
2505    {
2506        if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2507            TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
2508    }
2509    else if (xmlns)
2510    {
2511        TY_(RemoveAttribute)(doc, html, xmlns);
2512    }
2513}
2514
2515/*
2516  ...
2517*/
2518void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2519{
2520    Node* next;
2521
2522    while (node)
2523    {
2524        next = node->next;
2525
2526        if (TY_(IsAnchorElement)(doc, node))
2527        {
2528            AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2529            AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
2530
2531            /* todo: how are empty name/id attributes handled? */
2532
2533            if (name && id)
2534            {
2535                Bool NameHasValue = AttrHasValue(name);
2536                Bool IdHasValue = AttrHasValue(id);
2537                if ( (NameHasValue != IdHasValue) ||
2538                     (NameHasValue && IdHasValue &&
2539                     TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2540                    TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
2541            }
2542            else if (name && wantId)
2543            {
2544                if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2545                    & doc->lexer->versionEmitted)
2546                {
2547                    if (TY_(IsValidHTMLID)(name->value))
2548                    {
2549                        TY_(RepairAttrValue)(doc, node, "id", name->value);
2550                    }
2551                    else
2552                    {
2553                        TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
2554                    }
2555                 }
2556            }
2557            else if (id && wantName)
2558            {
2559                if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2560                    & doc->lexer->versionEmitted)
2561                    /* todo: do not assume id is valid */
2562                    TY_(RepairAttrValue)(doc, node, "name", id->value);
2563            }
2564
2565            if (id && !wantId)
2566                TY_(RemoveAttribute)(doc, node, id);
2567
2568            if (name && !wantName)
2569                TY_(RemoveAttribute)(doc, node, name);
2570
2571            if (TY_(AttrGetById)(node, TidyAttr_NAME) == NULL &&
2572                TY_(AttrGetById)(node, TidyAttr_ID) == NULL)
2573                TY_(RemoveAnchorByNode)(doc, node);
2574        }
2575
2576        if (node->content)
2577            TY_(FixAnchors)(doc, node->content, wantName, wantId);
2578
2579        node = next;
2580    }
2581}
2582
2583/*
2584 * local variables:
2585 * mode: c
2586 * indent-tabs-mode: nil
2587 * c-basic-offset: 4
2588 * eval: (c-set-offset 'substatement-open 0)
2589 * end:
2590 */
2591