/*
  clean.c -- clean up misuse of presentation markup

  (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  CVS Info :

    $Author: iccir $
    $Date: 2007/01/30 23:46:51 $
    $Revision: 1.3 $

  Filters from other formats such as Microsoft Word
  often make excessive use of presentation markup such
  as font tags, B, I, and the align attribute. By applying
  a set of production rules, it is straightforward to
  transform this to use CSS.

  Some rules replace some of the children of an element by
  style properties on the element, e.g.

  <p><b>...</b></p> -> <p style="font-weight: bold">...</p>

  Such rules are applied to the element's content and then
  to the element itself until no more rules apply.
  Having applied all the rules to an element, it will have
  a style attribute with one or more properties.

  Other rules strip the element they apply to, replacing
  it by style properties on the contents, e.g.

  <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...

  These rules are applied to an element before processing
  its content and replace the current element by the first
  element in the exposed content.

  After applying both sets of rules, you can replace the
  style attribute by a class value and style rule in the
  document head. To support this, an association of styles
  and class names is built.

  A naive approach is to rely on string matching to test
  when two property lists are the same. A better approach
  would be to first sort the properties before matching.
46 47*/ 48 49#include <stdio.h> 50#include <stdlib.h> 51#include <string.h> 52 53#include "tidy-int.h" 54#include "clean.h" 55#include "lexer.h" 56#include "parser.h" 57#include "attrs.h" 58#include "message.h" 59#include "tmbstr.h" 60#include "utf8.h" 61 62static Node* CleanNode( TidyDocImpl* doc, Node *node ); 63 64static void RenameElem( Node* node, TidyTagId tid ) 65{ 66 const Dict* dict = TY_(LookupTagDef)( tid ); 67 MemFree( node->element ); 68 node->element = TY_(tmbstrdup)( dict->name ); 69 node->tag = dict; 70} 71 72static void FreeStyleProps(StyleProp *props) 73{ 74 StyleProp *next; 75 76 while (props) 77 { 78 next = props->next; 79 MemFree(props->name); 80 MemFree(props->value); 81 MemFree(props); 82 props = next; 83 } 84} 85 86static StyleProp *InsertProperty( StyleProp* props, ctmbstr name, ctmbstr value ) 87{ 88 StyleProp *first, *prev, *prop; 89 int cmp; 90 91 prev = NULL; 92 first = props; 93 94 while (props) 95 { 96 cmp = TY_(tmbstrcmp)(props->name, name); 97 98 if (cmp == 0) 99 { 100 /* this property is already defined, ignore new value */ 101 return first; 102 } 103 104 if (cmp > 0) 105 { 106 /* insert before this */ 107 108 prop = (StyleProp *)MemAlloc(sizeof(StyleProp)); 109 prop->name = TY_(tmbstrdup)(name); 110 prop->value = TY_(tmbstrdup)(value); 111 prop->next = props; 112 113 if (prev) 114 prev->next = prop; 115 else 116 first = prop; 117 118 return first; 119 } 120 121 prev = props; 122 props = props->next; 123 } 124 125 prop = (StyleProp *)MemAlloc(sizeof(StyleProp)); 126 prop->name = TY_(tmbstrdup)(name); 127 prop->value = TY_(tmbstrdup)(value); 128 prop->next = NULL; 129 130 if (prev) 131 prev->next = prop; 132 else 133 first = prop; 134 135 return first; 136} 137 138/* 139 Create sorted linked list of properties from style string 140 It temporarily places nulls in place of ':' and ';' to 141 delimit the strings for the property name and value. 
142 Some systems don't allow you to NULL literal strings, 143 so to avoid this, a copy is made first. 144*/ 145static StyleProp* CreateProps( StyleProp* prop, ctmbstr style ) 146{ 147 tmbstr name, value = NULL, name_end, value_end, line; 148 Bool more; 149 150 line = TY_(tmbstrdup)(style); 151 name = line; 152 153 while (*name) 154 { 155 while (*name == ' ') 156 ++name; 157 158 name_end = name; 159 160 while (*name_end) 161 { 162 if (*name_end == ':') 163 { 164 value = name_end + 1; 165 break; 166 } 167 168 ++name_end; 169 } 170 171 if (*name_end != ':') 172 break; 173 174 while ( value && *value == ' ') 175 ++value; 176 177 value_end = value; 178 more = no; 179 180 while (*value_end) 181 { 182 if (*value_end == ';') 183 { 184 more = yes; 185 break; 186 } 187 188 ++value_end; 189 } 190 191 *name_end = '\0'; 192 *value_end = '\0'; 193 194 prop = InsertProperty(prop, name, value); 195 *name_end = ':'; 196 197 if (more) 198 { 199 *value_end = ';'; 200 name = value_end + 1; 201 continue; 202 } 203 204 break; 205 } 206 207 MemFree(line); /* free temporary copy */ 208 return prop; 209} 210 211static tmbstr CreatePropString(StyleProp *props) 212{ 213 tmbstr style, p, s; 214 uint len; 215 StyleProp *prop; 216 217 /* compute length */ 218 219 for (len = 0, prop = props; prop; prop = prop->next) 220 { 221 len += TY_(tmbstrlen)(prop->name) + 2; 222 if (prop->value) 223 len += TY_(tmbstrlen)(prop->value) + 2; 224 } 225 226 style = (tmbstr) MemAlloc(len+1); 227 style[0] = '\0'; 228 229 for (p = style, prop = props; prop; prop = prop->next) 230 { 231 s = prop->name; 232 233 while((*p++ = *s++)) 234 continue; 235 236 if (prop->value) 237 { 238 *--p = ':'; 239 *++p = ' '; 240 ++p; 241 242 s = prop->value; 243 while((*p++ = *s++)) 244 continue; 245 } 246 if (prop->next == NULL) 247 break; 248 249 *--p = ';'; 250 *++p = ' '; 251 ++p; 252 } 253 254 return style; 255} 256 257/* 258 create string with merged properties 259static tmbstr AddProperty( ctmbstr style, ctmbstr property ) 
260{ 261 tmbstr line; 262 StyleProp *prop; 263 264 prop = CreateProps(NULL, style); 265 prop = CreateProps(prop, property); 266 line = CreatePropString(prop); 267 FreeStyleProps(prop); 268 return line; 269} 270*/ 271 272void TY_(FreeStyles)( TidyDocImpl* doc ) 273{ 274 Lexer* lexer = doc->lexer; 275 if ( lexer ) 276 { 277 TagStyle *style, *next; 278 for ( style = lexer->styles; style; style = next ) 279 { 280 next = style->next; 281 MemFree( style->tag ); 282 MemFree( style->tag_class ); 283 MemFree( style->properties ); 284 MemFree( style ); 285 } 286 } 287} 288 289static tmbstr GensymClass( TidyDocImpl* doc ) 290{ 291 tmbchar buf[512]; /* CSSPrefix is limited to 256 characters */ 292 ctmbstr pfx = cfgStr(doc, TidyCSSPrefix); 293 if ( pfx == NULL || *pfx == 0 ) 294 pfx = "c"; 295 296 TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId ); 297 return TY_(tmbstrdup)(buf); 298} 299 300static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties ) 301{ 302 Lexer* lexer = doc->lexer; 303 TagStyle* style; 304 305 for (style = lexer->styles; style; style=style->next) 306 { 307 if (TY_(tmbstrcmp)(style->tag, tag) == 0 && 308 TY_(tmbstrcmp)(style->properties, properties) == 0) 309 return style->tag_class; 310 } 311 312 style = (TagStyle *)MemAlloc( sizeof(TagStyle) ); 313 style->tag = TY_(tmbstrdup)(tag); 314 style->tag_class = GensymClass( doc ); 315 style->properties = TY_(tmbstrdup)( properties ); 316 style->next = lexer->styles; 317 lexer->styles = style; 318 return style->tag_class; 319} 320 321/* 322 Add class="foo" to node 323*/ 324static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname ) 325{ 326 AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);; 327 328 /* 329 if there already is a class attribute 330 then append class name after a space. 
331 */ 332 if (classattr) 333 TY_(AppendToClassAttr)( classattr, classname ); 334 else /* create new class attribute */ 335 TY_(AddAttribute)( doc, node, "class", classname ); 336} 337 338void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue ) 339{ 340 ctmbstr classname; 341 342 classname = FindStyle( doc, node->element, stylevalue ); 343 AddClass( doc, node, classname); 344} 345 346/* 347 Find style attribute in node, and replace it 348 by corresponding class attribute. Search for 349 class in style dictionary otherwise gensym 350 new class and add to dictionary. 351 352 Assumes that node doesn't have a class attribute 353*/ 354static void Style2Rule( TidyDocImpl* doc, Node *node) 355{ 356 AttVal *styleattr, *classattr; 357 ctmbstr classname; 358 359 styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE); 360 361 if (styleattr) 362 { 363 /* fix for http://tidy.sf.net/bug/850215 */ 364 if (!styleattr->value) 365 { 366 TY_(RemoveAttribute)(doc, node, styleattr); 367 return; 368 } 369 370 classname = FindStyle( doc, node->element, styleattr->value ); 371 classattr = TY_(AttrGetById)(node, TidyAttr_CLASS); 372 373 /* 374 if there already is a class attribute 375 then append class name after an underscore 376 */ 377 if (classattr) 378 { 379 TY_(AppendToClassAttr)( classattr, classname ); 380 TY_(RemoveAttribute)( doc, node, styleattr ); 381 } 382 else /* reuse style attribute for class attribute */ 383 { 384 MemFree(styleattr->attribute); 385 MemFree(styleattr->value); 386 styleattr->attribute = TY_(tmbstrdup)("class"); 387 styleattr->value = TY_(tmbstrdup)(classname); 388 } 389 } 390} 391 392static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color ) 393{ 394 if ( selector && color ) 395 { 396 TY_(AddStringLiteral)(lexer, selector); 397 TY_(AddStringLiteral)(lexer, " { color: "); 398 TY_(AddStringLiteral)(lexer, color); 399 TY_(AddStringLiteral)(lexer, " }\n"); 400 } 401} 402 403/* 404 move presentation attribs from body to style 
element 405 406 background="foo" -> body { background-image: url(foo) } 407 bgcolor="foo" -> body { background-color: foo } 408 text="foo" -> body { color: foo } 409 link="foo" -> :link { color: foo } 410 vlink="foo" -> :visited { color: foo } 411 alink="foo" -> :active { color: foo } 412*/ 413static void CleanBodyAttrs( TidyDocImpl* doc, Node* body ) 414{ 415 Lexer* lexer = doc->lexer; 416 tmbstr bgurl = NULL; 417 tmbstr bgcolor = NULL; 418 tmbstr color = NULL; 419 AttVal* attr; 420 421 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND))) 422 { 423 bgurl = attr->value; 424 attr->value = NULL; 425 TY_(RemoveAttribute)( doc, body, attr ); 426 } 427 428 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR))) 429 { 430 bgcolor = attr->value; 431 attr->value = NULL; 432 TY_(RemoveAttribute)( doc, body, attr ); 433 } 434 435 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT))) 436 { 437 color = attr->value; 438 attr->value = NULL; 439 TY_(RemoveAttribute)( doc, body, attr ); 440 } 441 442 if ( bgurl || bgcolor || color ) 443 { 444 TY_(AddStringLiteral)(lexer, " body {\n"); 445 if (bgurl) 446 { 447 TY_(AddStringLiteral)(lexer, " background-image: url("); 448 TY_(AddStringLiteral)(lexer, bgurl); 449 TY_(AddStringLiteral)(lexer, ");\n"); 450 MemFree(bgurl); 451 } 452 if (bgcolor) 453 { 454 TY_(AddStringLiteral)(lexer, " background-color: "); 455 TY_(AddStringLiteral)(lexer, bgcolor); 456 TY_(AddStringLiteral)(lexer, ";\n"); 457 MemFree(bgcolor); 458 } 459 if (color) 460 { 461 TY_(AddStringLiteral)(lexer, " color: "); 462 TY_(AddStringLiteral)(lexer, color); 463 TY_(AddStringLiteral)(lexer, ";\n"); 464 MemFree(color); 465 } 466 467 TY_(AddStringLiteral)(lexer, " }\n"); 468 } 469 470 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK))) 471 { 472 AddColorRule(lexer, " :link", attr->value); 473 TY_(RemoveAttribute)( doc, body, attr ); 474 } 475 476 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK))) 477 { 478 AddColorRule(lexer, " 
:visited", attr->value); 479 TY_(RemoveAttribute)( doc, body, attr ); 480 } 481 482 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK))) 483 { 484 AddColorRule(lexer, " :active", attr->value); 485 TY_(RemoveAttribute)( doc, body, attr ); 486 } 487} 488 489static Bool NiceBody( TidyDocImpl* doc ) 490{ 491 Node* node = TY_(FindBody)(doc); 492 if (node) 493 { 494 if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) || 495 TY_(AttrGetById)(node, TidyAttr_BGCOLOR) || 496 TY_(AttrGetById)(node, TidyAttr_TEXT) || 497 TY_(AttrGetById)(node, TidyAttr_LINK) || 498 TY_(AttrGetById)(node, TidyAttr_VLINK) || 499 TY_(AttrGetById)(node, TidyAttr_ALINK)) 500 { 501 doc->badLayout |= USING_BODY; 502 return no; 503 } 504 } 505 506 return yes; 507} 508 509/* create style element using rules from dictionary */ 510static void CreateStyleElement( TidyDocImpl* doc ) 511{ 512 Lexer* lexer = doc->lexer; 513 Node *node, *head, *body; 514 TagStyle *style; 515 AttVal *av; 516 517 if ( lexer->styles == NULL && NiceBody(doc) ) 518 return; 519 520 node = TY_(NewNode)( lexer ); 521 node->type = StartTag; 522 node->implicit = yes; 523 node->element = TY_(tmbstrdup)("style"); 524 TY_(FindTag)( doc, node ); 525 526 /* insert type attribute */ 527 av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' ); 528 TY_(InsertAttributeAtStart)( node, av ); 529 530 body = TY_(FindBody)( doc ); 531 lexer->txtstart = lexer->lexsize; 532 if ( body ) 533 CleanBodyAttrs( doc, body ); 534 535 for (style = lexer->styles; style; style = style->next) 536 { 537 TY_(AddCharToLexer)(lexer, ' '); 538 TY_(AddStringLiteral)(lexer, style->tag); 539 TY_(AddCharToLexer)(lexer, '.'); 540 TY_(AddStringLiteral)(lexer, style->tag_class); 541 TY_(AddCharToLexer)(lexer, ' '); 542 TY_(AddCharToLexer)(lexer, '{'); 543 TY_(AddStringLiteral)(lexer, style->properties); 544 TY_(AddCharToLexer)(lexer, '}'); 545 TY_(AddCharToLexer)(lexer, '\n'); 546 } 547 548 lexer->txtend = lexer->lexsize; 549 550 TY_(InsertNodeAtEnd)( node, 
TY_(TextToken)(lexer) ); 551 552 /* 553 now insert style element into document head 554 555 doc is root node. search its children for html node 556 the head node should be first child of html node 557 */ 558 if ( NULL != (head = TY_(FindHEAD)( doc )) ) 559 TY_(InsertNodeAtEnd)( head, node ); 560} 561 562 563/* ensure bidirectional links are consistent */ 564void TY_(FixNodeLinks)(Node *node) 565{ 566 Node *child; 567 568 if (node->prev) 569 node->prev->next = node; 570 else 571 node->parent->content = node; 572 573 if (node->next) 574 node->next->prev = node; 575 else 576 node->parent->last = node; 577 578 for (child = node->content; child; child = child->next) 579 child->parent = node; 580} 581 582/* 583 used to strip child of node when 584 the node has one and only one child 585*/ 586static void StripOnlyChild(TidyDocImpl* doc, Node *node) 587{ 588 Node *child; 589 590 child = node->content; 591 node->content = child->content; 592 node->last = child->last; 593 child->content = NULL; 594 TY_(FreeNode)(doc, child); 595 596 for (child = node->content; child; child = child->next) 597 child->parent = node; 598} 599 600/* 601 used to strip font start and end tags. 602 Extricate "element", replace it by its content and delete it. 
603*/ 604static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode) 605{ 606 if (element->content) 607 { 608 Node *node, *parent = element->parent; 609 610 element->last->next = element->next; 611 612 if (element->next) 613 { 614 element->next->prev = element->last; 615 } 616 else 617 parent->last = element->last; 618 619 if (element->prev) 620 { 621 element->content->prev = element->prev; 622 element->prev->next = element->content; 623 } 624 else 625 parent->content = element->content; 626 627 for (node = element->content; node; node = node->next) 628 node->parent = parent; 629 630 *pnode = element->content; 631 632 element->next = element->content = NULL; 633 TY_(FreeNode)(doc, element); 634 } 635 else 636 { 637 *pnode = TY_(DiscardElement)(doc, element); 638 } 639} 640 641/* 642 Create new string that consists of the 643 combined style properties in s1 and s2 644 645 To merge property lists, we build a linked 646 list of property/values and insert properties 647 into the list in order, merging values for 648 the same property name. 
649*/ 650static tmbstr MergeProperties( ctmbstr s1, ctmbstr s2 ) 651{ 652 tmbstr s; 653 StyleProp *prop; 654 655 prop = CreateProps(NULL, s1); 656 prop = CreateProps(prop, s2); 657 s = CreatePropString(prop); 658 FreeStyleProps(prop); 659 return s; 660} 661 662/* 663 Add style property to element, creating style 664 attribute as needed and adding ; delimiter 665*/ 666void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property ) 667{ 668 AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE); 669 670 /* if style attribute already exists then insert property */ 671 672 if ( av ) 673 { 674 if (av->value != NULL) 675 { 676 tmbstr s = MergeProperties( av->value, property ); 677 MemFree( av->value ); 678 av->value = s; 679 } 680 else 681 { 682 av->value = TY_(tmbstrdup)( property ); 683 } 684 } 685 else /* else create new style attribute */ 686 { 687 av = TY_(NewAttributeEx)( doc, "style", property, '"' ); 688 TY_(InsertAttributeAtStart)( node, av ); 689 } 690} 691 692static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child) 693{ 694 AttVal *av; 695 tmbstr s1, s2, names; 696 697 for (s2 = NULL, av = child->attributes; av; av = av->next) 698 { 699 if (attrIsCLASS(av)) 700 { 701 s2 = av->value; 702 break; 703 } 704 } 705 706 for (s1 = NULL, av = node->attributes; av; av = av->next) 707 { 708 if (attrIsCLASS(av)) 709 { 710 s1 = av->value; 711 break; 712 } 713 } 714 715 if (s1) 716 { 717 if (s2) /* merge class names from both */ 718 { 719 uint l1, l2; 720 l1 = TY_(tmbstrlen)(s1); 721 l2 = TY_(tmbstrlen)(s2); 722 names = (tmbstr) MemAlloc(l1 + l2 + 2); 723 TY_(tmbstrcpy)(names, s1); 724 names[l1] = ' '; 725 TY_(tmbstrcpy)(names+l1+1, s2); 726 MemFree(av->value); 727 av->value = names; 728 } 729 } 730 else if (s2) /* copy class names from child */ 731 { 732 av = TY_(NewAttributeEx)( doc, "class", s2, '"' ); 733 TY_(InsertAttributeAtStart)( node, av ); 734 } 735} 736 737static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child) 738{ 739 AttVal *av; 
740 tmbstr s1, s2, style; 741 742 /* 743 the child may have a class attribute used 744 for attaching styles, if so the class name 745 needs to be copied to node's class 746 */ 747 MergeClasses(doc, node, child); 748 749 for (s2 = NULL, av = child->attributes; av; av = av->next) 750 { 751 if (attrIsSTYLE(av)) 752 { 753 s2 = av->value; 754 break; 755 } 756 } 757 758 for (s1 = NULL, av = node->attributes; av; av = av->next) 759 { 760 if (attrIsSTYLE(av)) 761 { 762 s1 = av->value; 763 break; 764 } 765 } 766 767 if (s1) 768 { 769 if (s2) /* merge styles from both */ 770 { 771 style = MergeProperties(s1, s2); 772 MemFree(av->value); 773 av->value = style; 774 } 775 } 776 else if (s2) /* copy style of child */ 777 { 778 av = TY_(NewAttributeEx)( doc, "style", s2, '"' ); 779 TY_(InsertAttributeAtStart)( node, av ); 780 } 781} 782 783static ctmbstr FontSize2Name(ctmbstr size) 784{ 785 static const ctmbstr sizes[7] = 786 { 787 "60%", "70%", "80%", NULL, 788 "120%", "150%", "200%" 789 }; 790 791 /* increment of 0.8 */ 792 static const ctmbstr minussizes[] = 793 { 794 "100%", "80%", "64%", "51%", 795 "40%", "32%", "26%" 796 }; 797 798 /* increment of 1.2 */ 799 static const ctmbstr plussizes[] = 800 { 801 "100%", "120%", "144%", "172%", 802 "207%", "248%", "298%" 803 }; 804 805 if (size[0] == '\0') 806 return NULL; 807 808 if ('0' <= size[0] && size[0] <= '6') 809 { 810 int n = size[0] - '0'; 811 return sizes[n]; 812 } 813 814 if (size[0] == '-') 815 { 816 if ('0' <= size[1] && size[1] <= '6') 817 { 818 int n = size[1] - '0'; 819 return minussizes[n]; 820 } 821 return "smaller"; /*"70%"; */ 822 } 823 824 if ('0' <= size[1] && size[1] <= '6') 825 { 826 int n = size[1] - '0'; 827 return plussizes[n]; 828 } 829 830 return "larger"; /* "140%" */ 831} 832 833static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face ) 834{ 835 tmbchar buf[256]; 836 TY_(tmbsnprintf)(buf, sizeof(buf), "font-family: %s", face ); 837 TY_(AddStyleProperty)( doc, node, buf ); 838} 839 840static 
void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size ) 841{ 842 ctmbstr value = NULL; 843 844 if (nodeIsP(node)) 845 { 846 if (TY_(tmbstrcmp)(size, "6") == 0) 847 value = "h1"; 848 else if (TY_(tmbstrcmp)(size, "5") == 0) 849 value = "h2"; 850 else if (TY_(tmbstrcmp)(size, "4") == 0) 851 value = "h3"; 852 853 if (value) 854 { 855 MemFree(node->element); 856 node->element = TY_(tmbstrdup)(value); 857 TY_(FindTag)(doc, node); 858 return; 859 } 860 } 861 862 value = FontSize2Name(size); 863 864 if (value) 865 { 866 tmbchar buf[64]; 867 TY_(tmbsnprintf)(buf, sizeof(buf), "font-size: %s", value); 868 TY_(AddStyleProperty)( doc, node, buf ); 869 } 870} 871 872static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color) 873{ 874 tmbchar buf[128]; 875 TY_(tmbsnprintf)(buf, sizeof(buf), "color: %s", color); 876 TY_(AddStyleProperty)( doc, node, buf ); 877} 878 879/* force alignment value to lower case */ 880static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align ) 881{ 882 uint i; 883 tmbchar buf[128]; 884 885 TY_(tmbstrcpy)( buf, "text-align: " ); 886 for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i ) 887 { 888 if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' ) 889 break; 890 } 891 buf[i] = '\0'; 892 TY_(AddStyleProperty)( doc, node, buf ); 893} 894 895/* 896 add style properties to node corresponding to 897 the font face, size and color attributes 898*/ 899static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av) 900{ 901 while (av) 902 { 903 if (AttrHasValue(av)) 904 { 905 if (attrIsFACE(av)) 906 AddFontFace( doc, node, av->value ); 907 else if (attrIsSIZE(av)) 908 AddFontSize( doc, node, av->value ); 909 else if (attrIsCOLOR(av)) 910 AddFontColor( doc, node, av->value ); 911 } 912 av = av->next; 913 } 914} 915 916/* 917 Symptom: <p align=center> 918 Action: <p style="text-align: center"> 919*/ 920static void TextAlign( TidyDocImpl* doc, Node* node ) 921{ 922 AttVal *av, *prev; 923 924 prev = NULL; 925 926 for (av = 
node->attributes; av; av = av->next) 927 { 928 if (attrIsALIGN(av)) 929 { 930 if (prev) 931 prev->next = av->next; 932 else 933 node->attributes = av->next; 934 935 if (av->value) 936 AddAlign( doc, node, av->value ); 937 938 TY_(FreeAttribute)(doc, av); 939 break; 940 } 941 942 prev = av; 943 } 944} 945 946/* 947 The clean up rules use the pnode argument to return the 948 next node when the original node has been deleted 949*/ 950 951/* 952 Symptom: <dir> <li> where <li> is only child 953 Action: coerce <dir> <li> to <div> with indent. 954*/ 955 956static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode)) 957{ 958 Node *child; 959 960 if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) ) 961 { 962 child = node->content; 963 964 if (child == NULL) 965 return no; 966 967 /* check child has no peers */ 968 969 if (child->next) 970 return no; 971 972 if ( !nodeIsLI(child) ) 973 return no; 974 975 if ( !child->implicit ) 976 return no; 977 978 /* coerce dir to div */ 979 node->tag = TY_(LookupTagDef)( TidyTag_DIV ); 980 MemFree( node->element ); 981 node->element = TY_(tmbstrdup)("div"); 982 TY_(AddStyleProperty)( doc, node, "margin-left: 2em" ); 983 StripOnlyChild( doc, node ); 984 return yes; 985 } 986 987 return no; 988} 989 990/* 991 Symptom: <center> 992 Action: replace <center> by <div style="text-align: center"> 993*/ 994 995static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode) 996{ 997 if ( nodeIsCENTER(node) ) 998 { 999 if ( cfgBool(doc, TidyDropFontTags) ) 1000 { 1001 if (node->content) 1002 { 1003 Node *last = node->last; 1004 DiscardContainer( doc, node, pnode ); 1005 1006 node = TY_(InferredTag)(doc, TidyTag_BR); 1007 TY_(InsertNodeAfterElement)(last, node); 1008 } 1009 else 1010 { 1011 Node *prev = node->prev, *next = node->next, 1012 *parent = node->parent; 1013 DiscardContainer( doc, node, pnode ); 1014 1015 node = TY_(InferredTag)(doc, TidyTag_BR); 1016 if (next) 1017 TY_(InsertNodeBeforeElement)(next, node); 1018 
else if (prev) 1019 TY_(InsertNodeAfterElement)(prev, node); 1020 else 1021 TY_(InsertNodeAtStart)(parent, node); 1022 } 1023 1024 return yes; 1025 } 1026 1027 RenameElem( node, TidyTag_DIV ); 1028 TY_(AddStyleProperty)( doc, node, "text-align: center" ); 1029 return yes; 1030 } 1031 1032 return no; 1033} 1034 1035/* Copy child attributes to node. Duplicate attributes are overwritten. 1036 Unique attributes (such as ID) disable the action. 1037 Attributes style and class are not dealt with. A call to MergeStyles 1038 will do that. 1039*/ 1040static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child) 1041{ 1042 AttVal *av1, *av2; 1043 TidyAttrId id; 1044 1045 /* Detect attributes that cannot be merged or overwritten. */ 1046 if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL 1047 && TY_(AttrGetById)(node, TidyAttr_ID) != NULL) 1048 return no; 1049 1050 /* Move child attributes to node. Attributes in node 1051 can be overwritten or merged. */ 1052 for (av2 = child->attributes; av2; ) 1053 { 1054 /* Dealt by MergeStyles. */ 1055 if (attrIsSTYLE(av2) || attrIsCLASS(av2)) 1056 { 1057 av2 = av2->next; 1058 continue; 1059 } 1060 /* Avoid duplicates in node */ 1061 if ((id=AttrId(av2)) != TidyAttr_UNKNOWN 1062 && (av1=TY_(AttrGetById)(node, id))!= NULL) 1063 TY_(RemoveAttribute)( doc, node, av1 ); 1064 1065 /* Move attribute from child to node */ 1066 TY_(DetachAttribute)( child, av2 ); 1067 av1 = av2; 1068 av2 = av2->next; 1069 av1->next = NULL; 1070 TY_(InsertAttributeAtEnd)( node, av1 ); 1071 } 1072 1073 return yes; 1074} 1075 1076/* 1077 Symptom <XX><XX>...</XX></XX> 1078 Action: merge the two XXs 1079 1080 For instance, this is useful after nested <dir>s used by Word 1081 for indenting have been converted to <div>s 1082 1083 If state is "no", no merging. 1084 If state is "yes", inner element is discarded. Only Style and Class 1085 attributes are merged using MergeStyles(). 1086 If state is "auto", atttibutes are merged as described in CopyAttrs(). 
1087 Style and Class attributes are merged using MergeStyles(). 1088*/ 1089static Bool MergeNestedElements( TidyDocImpl* doc, 1090 TidyTagId Id, TidyTriState state, Node *node, 1091 Node **ARG_UNUSED(pnode)) 1092{ 1093 Node *child; 1094 1095 if ( state == TidyNoState 1096 || !TagIsId(node, Id) ) 1097 return no; 1098 1099 child = node->content; 1100 1101 if ( child == NULL 1102 || child->next != NULL 1103 || !TagIsId(child, Id) ) 1104 return no; 1105 1106 if ( state == TidyAutoState 1107 && CopyAttrs(doc, node, child) == no ) 1108 return no; 1109 1110 MergeStyles( doc, node, child ); 1111 StripOnlyChild( doc, node ); 1112 return yes; 1113} 1114 1115/* 1116 Symptom: <ul><li><ul>...</ul></li></ul> 1117 Action: discard outer list 1118*/ 1119 1120static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode ) 1121{ 1122 Node *child, *list; 1123 1124 if ( nodeIsUL(node) || nodeIsOL(node) ) 1125 { 1126 child = node->content; 1127 1128 if (child == NULL) 1129 return no; 1130 1131 /* check child has no peers */ 1132 1133 if (child->next) 1134 return no; 1135 1136 list = child->content; 1137 1138 if (!list) 1139 return no; 1140 1141 if (list->tag != node->tag) 1142 return no; 1143 1144 /* check list has no peers */ 1145 if (list->next) 1146 return no; 1147 1148 *pnode = list; /* Set node to resume iteration */ 1149 1150 /* move inner list node into position of outer node */ 1151 list->prev = node->prev; 1152 list->next = node->next; 1153 list->parent = node->parent; 1154 TY_(FixNodeLinks)(list); 1155 1156 /* get rid of outer ul and its li */ 1157 child->content = NULL; 1158 TY_(FreeNode)( doc, child ); /* See test #427841. */ 1159 child = NULL; 1160 node->content = NULL; 1161 node->next = NULL; 1162 TY_(FreeNode)( doc, node ); 1163 node = NULL; 1164 1165 /* 1166 If prev node was a list the chances are this node 1167 should be appended to that list. 
Word has no way of 1168 recognizing nested lists and just uses indents 1169 */ 1170 1171 if (list->prev) 1172 { 1173 if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev)) 1174 && list->prev->last ) 1175 { 1176 node = list; 1177 list = node->prev; 1178 1179 child = list->last; /* <li> */ 1180 1181 list->next = node->next; 1182 TY_(FixNodeLinks)(list); 1183 1184 node->parent = child; 1185 node->next = NULL; 1186 node->prev = child->last; 1187 TY_(FixNodeLinks)(node); 1188 CleanNode( doc, node ); 1189 } 1190 } 1191 1192 return yes; 1193 } 1194 1195 return no; 1196} 1197 1198/* 1199 Some necessary conditions to apply BlockStyle(). 1200 */ 1201 1202static Bool CanApplyBlockStyle( Node *node ) 1203{ 1204 if (node->tag->model & (CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE) 1205 && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) ) 1206 { 1207 return yes; 1208 } 1209 return no; 1210} 1211 1212/* 1213 Symptom: the only child of a block-level element is a 1214 presentation element such as B, I or FONT 1215 1216 Action: add style "font-weight: bold" to the block and 1217 strip the <b> element, leaving its children. 1218 1219 example: 1220 1221 <p> 1222 <b><font face="Arial" size="6">Draft Recommended Practice</font></b> 1223 </p> 1224 1225 becomes: 1226 1227 <p style="font-weight: bold; font-family: Arial; font-size: 6"> 1228 Draft Recommended Practice 1229 </p> 1230 1231 This code also replaces the align attribute by a style attribute. 
1232 However, to avoid CSS problems with Navigator 4, this isn't done 1233 for the elements: caption, tr and table 1234*/ 1235static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) ) 1236{ 1237 Node *child; 1238 1239 if (CanApplyBlockStyle(node)) 1240 { 1241 /* check for align attribute */ 1242 if ( !nodeIsCAPTION(node) ) 1243 TextAlign( doc, node ); 1244 1245 child = node->content; 1246 if (child == NULL) 1247 return no; 1248 1249 /* check child has no peers */ 1250 if (child->next) 1251 return no; 1252 1253 if ( nodeIsB(child) ) 1254 { 1255 MergeStyles( doc, node, child ); 1256 TY_(AddStyleProperty)( doc, node, "font-weight: bold" ); 1257 StripOnlyChild( doc, node ); 1258 return yes; 1259 } 1260 1261 if ( nodeIsI(child) ) 1262 { 1263 MergeStyles( doc, node, child ); 1264 TY_(AddStyleProperty)( doc, node, "font-style: italic" ); 1265 StripOnlyChild( doc, node ); 1266 return yes; 1267 } 1268 1269 if ( nodeIsFONT(child) ) 1270 { 1271 MergeStyles( doc, node, child ); 1272 AddFontStyles( doc, node, child->attributes ); 1273 StripOnlyChild( doc, node ); 1274 return yes; 1275 } 1276 } 1277 1278 return no; 1279} 1280 1281/* the only child of table cell or an inline element such as em */ 1282static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) ) 1283{ 1284 Node *child; 1285 1286 if ( !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW) ) 1287 { 1288 child = node->content; 1289 1290 if (child == NULL) 1291 return no; 1292 1293 /* check child has no peers */ 1294 1295 if (child->next) 1296 return no; 1297 1298 if ( nodeIsB(child) && cfgBool(doc, TidyLogicalEmphasis) ) 1299 { 1300 MergeStyles( doc, node, child ); 1301 TY_(AddStyleProperty)( doc, node, "font-weight: bold" ); 1302 StripOnlyChild( doc, node ); 1303 return yes; 1304 } 1305 1306 if ( nodeIsI(child) && cfgBool(doc, TidyLogicalEmphasis) ) 1307 { 1308 MergeStyles( doc, node, child ); 1309 TY_(AddStyleProperty)( doc, node, "font-style: italic" ); 1310 
StripOnlyChild( doc, node ); 1311 return yes; 1312 } 1313 1314 if ( nodeIsFONT(child) ) 1315 { 1316 MergeStyles( doc, node, child ); 1317 AddFontStyles( doc, node, child->attributes ); 1318 StripOnlyChild( doc, node ); 1319 return yes; 1320 } 1321 } 1322 1323 return no; 1324} 1325 1326/* 1327 Replace font elements by span elements, deleting 1328 the font element's attributes and replacing them 1329 by a single style attribute. 1330*/ 1331static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode ) 1332{ 1333 AttVal *av, *style, *next; 1334 1335 if ( nodeIsFONT(node) ) 1336 { 1337 if ( cfgBool(doc, TidyDropFontTags) ) 1338 { 1339 DiscardContainer( doc, node, pnode ); 1340 return yes; 1341 } 1342 1343 /* if FONT is only child of parent element then leave alone 1344 Do so only if BlockStyle may be succesful. */ 1345 if ( node->parent->content == node && node->next == NULL && 1346 CanApplyBlockStyle(node->parent) ) 1347 return no; 1348 1349 AddFontStyles( doc, node, node->attributes ); 1350 1351 /* extract style attribute and free the rest */ 1352 av = node->attributes; 1353 style = NULL; 1354 1355 while (av) 1356 { 1357 next = av->next; 1358 1359 if (attrIsSTYLE(av)) 1360 { 1361 av->next = NULL; 1362 style = av; 1363 } 1364 else 1365 { 1366 TY_(FreeAttribute)( doc, av ); 1367 } 1368 av = next; 1369 } 1370 1371 node->attributes = style; 1372 RenameElem( node, TidyTag_SPAN ); 1373 return yes; 1374 } 1375 1376 return no; 1377} 1378 1379/* 1380 Applies all matching rules to a node. 1381*/ 1382Node* CleanNode( TidyDocImpl* doc, Node *node ) 1383{ 1384 Node *next = NULL; 1385 TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs); 1386 1387 for (next = node; TY_(nodeIsElement)(node); node = next) 1388 { 1389 if ( Dir2Div(doc, node, &next) ) 1390 continue; 1391 1392 /* Special case: true result means 1393 ** that arg node and its parent no longer exist. 
1394 ** So we must jump back up the CreateStyleProperties() 1395 ** call stack until we have a valid node reference. 1396 */ 1397 if ( NestedList(doc, node, &next) ) 1398 return next; 1399 1400 if ( Center2Div(doc, node, &next) ) 1401 continue; 1402 1403 if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) ) 1404 continue; 1405 1406 if ( BlockStyle(doc, node, &next) ) 1407 continue; 1408 1409 if ( InlineStyle(doc, node, &next) ) 1410 continue; 1411 1412 if ( Font2Span(doc, node, &next) ) 1413 continue; 1414 1415 break; 1416 } 1417 1418 return next; 1419} 1420 1421/* Special case: if the current node is destroyed by 1422** CleanNode() lower in the tree, this node and its parent 1423** no longer exist. So we must jump back up the CleanTree() 1424** call stack until we have a valid node reference. 1425*/ 1426 1427static Node* CleanTree( TidyDocImpl* doc, Node *node ) 1428{ 1429 if (node->content) 1430 { 1431 Node *child; 1432 for (child = node->content; child != NULL; child = child->next) 1433 { 1434 child = CleanTree( doc, child ); 1435 if ( !child ) 1436 break; 1437 } 1438 } 1439 1440 return CleanNode( doc, node ); 1441} 1442 1443static void DefineStyleRules( TidyDocImpl* doc, Node *node ) 1444{ 1445 Node *child; 1446 1447 if (node->content) 1448 { 1449 for (child = node->content; 1450 child != NULL; child = child->next) 1451 { 1452 DefineStyleRules( doc, child ); 1453 } 1454 } 1455 1456 Style2Rule( doc, node ); 1457} 1458 1459void TY_(CleanDocument)( TidyDocImpl* doc ) 1460{ 1461 /* placeholder. CleanTree()/CleanNode() will not 1462 ** zap root element 1463 */ 1464 CleanTree( doc, &doc->root ); 1465 1466 if ( cfgBool(doc, TidyMakeClean) ) 1467 { 1468 DefineStyleRules( doc, &doc->root ); 1469 CreateStyleElement( doc ); 1470 } 1471} 1472 1473/* simplifies <b><b> ... </b> ...</b> etc. 
*/ 1474void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node ) 1475{ 1476 Node *next; 1477 1478 while (node) 1479 { 1480 next = node->next; 1481 1482 if ( (nodeIsB(node) || nodeIsI(node)) 1483 && node->parent && node->parent->tag == node->tag) 1484 { 1485 /* strip redundant inner element */ 1486 DiscardContainer( doc, node, &next ); 1487 node = next; 1488 continue; 1489 } 1490 1491 if ( node->content ) 1492 TY_(NestedEmphasis)( doc, node->content ); 1493 1494 node = next; 1495 } 1496} 1497 1498 1499 1500/* replace i by em and b by strong */ 1501void TY_(EmFromI)( TidyDocImpl* doc, Node* node ) 1502{ 1503 while (node) 1504 { 1505 if ( nodeIsI(node) ) 1506 RenameElem( node, TidyTag_EM ); 1507 else if ( nodeIsB(node) ) 1508 RenameElem( node, TidyTag_STRONG ); 1509 1510 if ( node->content ) 1511 TY_(EmFromI)( doc, node->content ); 1512 1513 node = node->next; 1514 } 1515} 1516 1517static Bool HasOneChild(Node *node) 1518{ 1519 return (node->content && node->content->next == NULL); 1520} 1521 1522/* 1523 Some people use dir or ul without an li 1524 to indent the content. The pattern to 1525 look for is a list with a single implicit 1526 li. This is recursively replaced by an 1527 implicit blockquote. 
1528*/ 1529void TY_(List2BQ)( TidyDocImpl* doc, Node* node ) 1530{ 1531 while (node) 1532 { 1533 if (node->content) 1534 TY_(List2BQ)( doc, node->content ); 1535 1536 if ( node->tag && node->tag->parser == TY_(ParseList) && 1537 HasOneChild(node) && node->content->implicit ) 1538 { 1539 StripOnlyChild( doc, node ); 1540 RenameElem( node, TidyTag_BLOCKQUOTE ); 1541 node->implicit = yes; 1542 } 1543 1544 node = node->next; 1545 } 1546} 1547 1548 1549/* 1550 Replace implicit blockquote by div with an indent 1551 taking care to reduce nested blockquotes to a single 1552 div with the indent set to match the nesting depth 1553*/ 1554void TY_(BQ2Div)( TidyDocImpl* doc, Node *node ) 1555{ 1556 tmbchar indent_buf[ 32 ]; 1557 uint indent; 1558 1559 while (node) 1560 { 1561 if ( nodeIsBLOCKQUOTE(node) && node->implicit ) 1562 { 1563 indent = 1; 1564 1565 while( HasOneChild(node) && 1566 nodeIsBLOCKQUOTE(node->content) && 1567 node->implicit) 1568 { 1569 ++indent; 1570 StripOnlyChild( doc, node ); 1571 } 1572 1573 if (node->content) 1574 TY_(BQ2Div)( doc, node->content ); 1575 1576 TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem", 1577 2*indent); 1578 1579 RenameElem( node, TidyTag_DIV ); 1580 TY_(AddStyleProperty)(doc, node, indent_buf ); 1581 } 1582 else if (node->content) 1583 TY_(BQ2Div)( doc, node->content ); 1584 1585 node = node->next; 1586 } 1587} 1588 1589 1590static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node) 1591{ 1592 Node *check; 1593 1594 for ( check=node; check; check = check->parent ) 1595 { 1596 if ( nodeIsTD(check) ) 1597 return check; 1598 } 1599 return NULL; 1600} 1601 1602/* node is <![if ...]> prune up to <![endif]> */ 1603static Node* PruneSection( TidyDocImpl* doc, Node *node ) 1604{ 1605 Lexer* lexer = doc->lexer; 1606 1607 for (;;) 1608 { 1609 ctmbstr lexbuf = lexer->lexbuf + node->start; 1610 if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 ) 1611 { 1612 Node* cell = FindEnclosingCell( doc, 
node ); 1613 if ( cell ) 1614 { 1615 /* Need to put into cell so it doesn't look weird 1616 */ 1617 Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" ); 1618 assert( (byte)'\240' == (byte)160 ); 1619 TY_(InsertNodeBeforeElement)( node, nbsp ); 1620 } 1621 } 1622 1623 /* discard node and returns next */ 1624 node = TY_(DiscardElement)( doc, node ); 1625 1626 if (node == NULL) 1627 return NULL; 1628 1629 if (node->type == SectionTag) 1630 { 1631 if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) 1632 { 1633 node = PruneSection( doc, node ); 1634 continue; 1635 } 1636 1637 if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0) 1638 { 1639 node = TY_(DiscardElement)( doc, node ); 1640 break; 1641 } 1642 } 1643 } 1644 1645 return node; 1646} 1647 1648void TY_(DropSections)( TidyDocImpl* doc, Node* node ) 1649{ 1650 Lexer* lexer = doc->lexer; 1651 while (node) 1652 { 1653 if (node->type == SectionTag) 1654 { 1655 /* prune up to matching endif */ 1656 if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) && 1657 (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */ 1658 { 1659 node = PruneSection( doc, node ); 1660 continue; 1661 } 1662 1663 /* discard others as well */ 1664 node = TY_(DiscardElement)( doc, node ); 1665 continue; 1666 } 1667 1668 if (node->content) 1669 TY_(DropSections)( doc, node->content ); 1670 1671 node = node->next; 1672 } 1673} 1674 1675static void PurgeWord2000Attributes( TidyDocImpl* ARG_UNUSED(doc), Node* node ) 1676{ 1677 AttVal *attr, *next, *prev = NULL; 1678 1679 for ( attr = node->attributes; attr; attr = next ) 1680 { 1681 next = attr->next; 1682 1683 /* special check for class="Code" denoting pre text */ 1684 /* Pass thru user defined styles as HTML class names */ 1685 if (attrIsCLASS(attr)) 1686 { 1687 if (AttrValueIs(attr, "Code") || 1688 TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 ) 1689 { 1690 prev = attr; 1691 continue; 1692 } 1693 } 1694 1695 if 
(attrIsCLASS(attr) || 1696 attrIsSTYLE(attr) || 1697 attrIsLANG(attr) || 1698 ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) && 1699 (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) || 1700 (attr->attribute && TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0) ) 1701 { 1702 if (prev) 1703 prev->next = next; 1704 else 1705 node->attributes = next; 1706 1707 TY_(FreeAttribute)( doc, attr ); 1708 } 1709 else 1710 prev = attr; 1711 } 1712} 1713 1714/* Word2000 uses span excessively, so we strip span out */ 1715static Node* StripSpan( TidyDocImpl* doc, Node* span ) 1716{ 1717 Node *node, *prev = NULL, *content; 1718 1719 /* 1720 deal with span elements that have content 1721 by splicing the content in place of the span 1722 after having processed it 1723 */ 1724 1725 TY_(CleanWord2000)( doc, span->content ); 1726 content = span->content; 1727 1728 if (span->prev) 1729 prev = span->prev; 1730 else if (content) 1731 { 1732 node = content; 1733 content = content->next; 1734 TY_(RemoveNode)(node); 1735 TY_(InsertNodeBeforeElement)(span, node); 1736 prev = node; 1737 } 1738 1739 while (content) 1740 { 1741 node = content; 1742 content = content->next; 1743 TY_(RemoveNode)(node); 1744 TY_(InsertNodeAfterElement)(prev, node); 1745 prev = node; 1746 } 1747 1748 if (span->next == NULL) 1749 span->parent->last = prev; 1750 1751 node = span->next; 1752 span->content = NULL; 1753 TY_(DiscardElement)( doc, span ); 1754 return node; 1755} 1756 1757/* map non-breaking spaces to regular spaces */ 1758void TY_(NormalizeSpaces)(Lexer *lexer, Node *node) 1759{ 1760 while ( node ) 1761 { 1762 if ( node->content ) 1763 TY_(NormalizeSpaces)( lexer, node->content ); 1764 1765 if (TY_(nodeIsText)(node)) 1766 { 1767 uint i, c; 1768 tmbstr p = lexer->lexbuf + node->start; 1769 1770 for (i = node->start; i < node->end; ++i) 1771 { 1772 c = (byte) lexer->lexbuf[i]; 1773 1774 /* look for UTF-8 multibyte character */ 1775 if ( c > 0x7F ) 1776 i += TY_(GetUTF8)( lexer->lexbuf + i, &c ); 1777 1778 if 
( c == 160 ) 1779 c = ' '; 1780 1781 p = TY_(PutUTF8)(p, c); 1782 } 1783 node->end = p - lexer->lexbuf; 1784 } 1785 1786 node = node->next; 1787 } 1788} 1789 1790/* used to hunt for hidden preformatted sections */ 1791static Bool NoMargins(Node *node) 1792{ 1793 AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE); 1794 1795 if ( !AttrHasValue(attval) ) 1796 return no; 1797 1798 /* search for substring "margin-top: 0" */ 1799 if (!TY_(tmbsubstr)(attval->value, "margin-top: 0")) 1800 return no; 1801 1802 /* search for substring "margin-bottom: 0" */ 1803 if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0")) 1804 return no; 1805 1806 return yes; 1807} 1808 1809/* does element have a single space as its content? */ 1810static Bool SingleSpace( Lexer* lexer, Node* node ) 1811{ 1812 if ( node->content ) 1813 { 1814 node = node->content; 1815 1816 if ( node->next != NULL ) 1817 return no; 1818 1819 if ( node->type != TextNode ) 1820 return no; 1821 1822 if ( (node->end - node->start) == 1 && 1823 lexer->lexbuf[node->start] == ' ' ) 1824 return yes; 1825 1826 if ( (node->end - node->start) == 2 ) 1827 { 1828 uint c = 0; 1829 TY_(GetUTF8)( lexer->lexbuf + node->start, &c ); 1830 if ( c == 160 ) 1831 return yes; 1832 } 1833 } 1834 1835 return no; 1836} 1837 1838/* 1839 This is a major clean up to strip out all the extra stuff you get 1840 when you save as web page from Word 2000. It doesn't yet know what 1841 to do with VML tags, but these will appear as errors unless you 1842 declare them as new tags, such as o:p which needs to be declared 1843 as inline. 
1844*/ 1845void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node) 1846{ 1847 /* used to a list from a sequence of bulletted p's */ 1848 Lexer* lexer = doc->lexer; 1849 Node* list = NULL; 1850 1851 while ( node ) 1852 { 1853 /* get rid of Word's xmlns attributes */ 1854 if ( nodeIsHTML(node) ) 1855 { 1856 /* check that it's a Word 2000 document */ 1857 if ( !TY_(GetAttrByName)(node, "xmlns:o") && 1858 !cfgBool(doc, TidyMakeBare) ) 1859 return; 1860 1861 TY_(FreeAttrs)( doc, node ); 1862 } 1863 1864 /* fix up preformatted sections by looking for a 1865 ** sequence of paragraphs with zero top/bottom margin 1866 */ 1867 if ( nodeIsP(node) ) 1868 { 1869 if (NoMargins(node)) 1870 { 1871 Node *pre, *next; 1872 TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes); 1873 1874 PurgeWord2000Attributes( doc, node ); 1875 1876 if (node->content) 1877 TY_(CleanWord2000)( doc, node->content ); 1878 1879 pre = node; 1880 node = node->next; 1881 1882 /* continue to strip p's */ 1883 1884 while ( nodeIsP(node) && NoMargins(node) ) 1885 { 1886 next = node->next; 1887 TY_(RemoveNode)(node); 1888 TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer)); 1889 TY_(InsertNodeAtEnd)(pre, node); 1890 StripSpan( doc, node ); 1891 node = next; 1892 } 1893 1894 if (node == NULL) 1895 break; 1896 } 1897 } 1898 1899 if (node->tag && (node->tag->model & CM_BLOCK) 1900 && SingleSpace(lexer, node)) 1901 { 1902 node = StripSpan( doc, node ); 1903 continue; 1904 } 1905 /* discard Word's style verbiage */ 1906 if ( nodeIsSTYLE(node) || nodeIsMETA(node) || 1907 node->type == CommentTag ) 1908 { 1909 node = TY_(DiscardElement)( doc, node ); 1910 continue; 1911 } 1912 1913 /* strip out all span and font tags Word scatters so liberally! 
*/ 1914 if ( nodeIsSPAN(node) || nodeIsFONT(node) ) 1915 { 1916 node = StripSpan( doc, node ); 1917 continue; 1918 } 1919 1920 if ( nodeIsLINK(node) ) 1921 { 1922 AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL); 1923 1924 if (AttrValueIs(attr, "File-List")) 1925 { 1926 node = TY_(DiscardElement)( doc, node ); 1927 continue; 1928 } 1929 } 1930 1931 /* discards <o:p> which encodes the paragraph mark */ 1932 if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0) 1933 { 1934 Node* next; 1935 DiscardContainer( doc, node, &next ); 1936 node = next; 1937 continue; 1938 } 1939 1940 /* discard empty paragraphs */ 1941 1942 if ( node->content == NULL && nodeIsP(node) ) 1943 { 1944 /* Use the existing function to ensure consistency */ 1945 Node *next = TY_(TrimEmptyElement)( doc, node ); 1946 node = next; 1947 continue; 1948 } 1949 1950 if ( nodeIsP(node) ) 1951 { 1952 AttVal *attr, *atrStyle; 1953 1954 attr = TY_(AttrGetById)(node, TidyAttr_CLASS); 1955 atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE); 1956 /* 1957 (JES) Sometimes Word marks a list item with the following hokie syntax 1958 <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1; 1959 translate these into <li> 1960 */ 1961 /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */ 1962 /* map <p class="MsoListNumber"> to <ol>...</ol> */ 1963 if ( AttrValueIs(attr, "MsoListBullet") || 1964 AttrValueIs(attr, "MsoListNumber") || 1965 AttrContains(atrStyle, "mso-list:") ) 1966 { 1967 TidyTagId listType = TidyTag_UL; 1968 if (AttrValueIs(attr, "MsoListNumber")) 1969 listType = TidyTag_OL; 1970 1971 TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes); 1972 1973 if ( !list || TagId(list) != listType ) 1974 { 1975 const Dict* tag = TY_(LookupTagDef)( listType ); 1976 list = TY_(InferredTag)(doc, tag->id); 1977 TY_(InsertNodeBeforeElement)(node, list); 1978 } 1979 1980 PurgeWord2000Attributes( doc, node ); 1981 1982 if ( node->content ) 1983 TY_(CleanWord2000)( doc, node->content ); 1984 1985 /* remove node 
and append to contents of list */ 1986 TY_(RemoveNode)(node); 1987 TY_(InsertNodeAtEnd)(list, node); 1988 node = list; 1989 } 1990 /* map sequence of <p class="Code"> to <pre>...</pre> */ 1991 else if (AttrValueIs(attr, "Code")) 1992 { 1993 Node *br = TY_(NewLineNode)(lexer); 1994 TY_(NormalizeSpaces)(lexer, node->content); 1995 1996 if ( !list || TagId(list) != TidyTag_PRE ) 1997 { 1998 list = TY_(InferredTag)(doc, TidyTag_PRE); 1999 TY_(InsertNodeBeforeElement)(node, list); 2000 } 2001 2002 /* remove node and append to contents of list */ 2003 TY_(RemoveNode)(node); 2004 TY_(InsertNodeAtEnd)(list, node); 2005 StripSpan( doc, node ); 2006 TY_(InsertNodeAtEnd)(list, br); 2007 node = list->next; 2008 } 2009 else 2010 list = NULL; 2011 } 2012 else 2013 list = NULL; 2014 2015 if (!node) 2016 return; 2017 2018 /* strip out style and class attributes */ 2019 if (TY_(nodeIsElement)(node)) 2020 PurgeWord2000Attributes( doc, node ); 2021 2022 if (node->content) 2023 TY_(CleanWord2000)( doc, node->content ); 2024 2025 node = node->next; 2026 } 2027} 2028 2029Bool TY_(IsWord2000)( TidyDocImpl* doc ) 2030{ 2031 AttVal *attval; 2032 Node *node, *head; 2033 Node *html = TY_(FindHTML)( doc ); 2034 2035 if (html && TY_(GetAttrByName)(html, "xmlns:o")) 2036 return yes; 2037 2038 /* search for <meta name="GENERATOR" content="Microsoft ..."> */ 2039 head = TY_(FindHEAD)( doc ); 2040 2041 if (head) 2042 { 2043 for (node = head->content; node; node = node->next) 2044 { 2045 if ( !nodeIsMETA(node) ) 2046 continue; 2047 2048 attval = TY_(AttrGetById)( node, TidyAttr_NAME ); 2049 2050 if ( !AttrValueIs(attval, "generator") ) 2051 continue; 2052 2053 attval = TY_(AttrGetById)( node, TidyAttr_CONTENT ); 2054 2055 if ( AttrContains(attval, "Microsoft") ) 2056 return yes; 2057 } 2058 } 2059 2060 return no; 2061} 2062 2063/* where appropriate move object elements from head to body */ 2064void TY_(BumpObject)( TidyDocImpl* doc, Node *html ) 2065{ 2066 Node *node, *next, *head = NULL, *body = 
NULL; 2067 2068 if (!html) 2069 return; 2070 2071 for ( node = html->content; node != NULL; node = node->next ) 2072 { 2073 if ( nodeIsHEAD(node) ) 2074 head = node; 2075 2076 if ( nodeIsBODY(node) ) 2077 body = node; 2078 } 2079 2080 if ( head != NULL && body != NULL ) 2081 { 2082 for (node = head->content; node != NULL; node = next) 2083 { 2084 next = node->next; 2085 2086 if ( nodeIsOBJECT(node) ) 2087 { 2088 Node *child; 2089 Bool bump = no; 2090 2091 for (child = node->content; child != NULL; child = child->next) 2092 { 2093 /* bump to body unless content is param */ 2094 if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node)) 2095 || !nodeIsPARAM(child) ) 2096 { 2097 bump = yes; 2098 break; 2099 } 2100 } 2101 2102 if ( bump ) 2103 { 2104 TY_(RemoveNode)( node ); 2105 TY_(InsertNodeAtStart)( body, node ); 2106 } 2107 } 2108 } 2109 } 2110} 2111 2112/* This is disabled due to http://tidy.sf.net/bug/681116 */ 2113#if 0 2114void FixBrakes( TidyDocImpl* pDoc, Node *pParent ) 2115{ 2116 Node *pNode; 2117 Bool bBRDeleted = no; 2118 2119 if (NULL == pParent) 2120 return; 2121 2122 /* First, check the status of All My Children */ 2123 pNode = pParent->content; 2124 while (NULL != pNode ) 2125 { 2126 /* The node may get trimmed, so save the next pointer, if any */ 2127 Node *pNext = pNode->next; 2128 FixBrakes( pDoc, pNode ); 2129 pNode = pNext; 2130 } 2131 2132 2133 /* As long as my last child is a <br />, move it to my last peer */ 2134 if ( nodeCMIsBlock( pParent )) 2135 { 2136 for ( pNode = pParent->last; 2137 NULL != pNode && nodeIsBR( pNode ); 2138 pNode = pParent->last ) 2139 { 2140 if ( NULL == pNode->attributes && no == bBRDeleted ) 2141 { 2142 TY_(DiscardElement)( pDoc, pNode ); 2143 bBRDeleted = yes; 2144 } 2145 else 2146 { 2147 TY_(RemoveNode)( pNode ); 2148 TY_(InsertNodeAfterElement)( pParent, pNode ); 2149 } 2150 } 2151 TY_(TrimEmptyElement)( pDoc, pParent ); 2152 } 2153} 2154#endif 2155 2156void TY_(VerifyHTTPEquiv)(TidyDocImpl* pDoc, Node 
*head) 2157{ 2158 Node *pNode; 2159 StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL; 2160 tmbstr s, pszBegin, pszEnd; 2161 ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(pDoc, TidyOutCharEncoding)); 2162 2163 if (!enc) 2164 return; 2165 2166 if (!nodeIsHEAD(head)) 2167 head = TY_(FindHEAD)(pDoc); 2168 2169 if (!head) 2170 return; 2171 2172 /* Find any <meta http-equiv='Content-Type' content='...' /> */ 2173 for (pNode = head->content; NULL != pNode; pNode = pNode->next) 2174 { 2175 AttVal* httpEquiv = TY_(AttrGetById)(pNode, TidyAttr_HTTP_EQUIV); 2176 AttVal* metaContent = TY_(AttrGetById)(pNode, TidyAttr_CONTENT); 2177 2178 if ( !nodeIsMETA(pNode) || !metaContent || 2179 !AttrValueIs(httpEquiv, "Content-Type") ) 2180 continue; 2181 2182 pszBegin = s = TY_(tmbstrdup)( metaContent->value ); 2183 while (pszBegin && *pszBegin) 2184 { 2185 while (isspace( *pszBegin )) 2186 pszBegin++; 2187 pszEnd = pszBegin; 2188 while ('\0' != *pszEnd && ';' != *pszEnd) 2189 pszEnd++; 2190 if (';' == *pszEnd ) 2191 *(pszEnd++) = '\0'; 2192 if (pszEnd > pszBegin) 2193 { 2194 prop = (StyleProp *)MemAlloc(sizeof(StyleProp)); 2195 prop->name = TY_(tmbstrdup)( pszBegin ); 2196 prop->value = NULL; 2197 prop->next = NULL; 2198 2199 if (NULL != pLastProp) 2200 pLastProp->next = prop; 2201 else 2202 pFirstProp = prop; 2203 2204 pLastProp = prop; 2205 pszBegin = pszEnd; 2206 } 2207 } 2208 MemFree( s ); 2209 2210 /* find the charset property */ 2211 for (prop = pFirstProp; NULL != prop; prop = prop->next) 2212 { 2213 if (0 != TY_(tmbstrncasecmp)( prop->name, "charset", 7 )) 2214 continue; 2215 2216 MemFree( prop->name ); 2217 prop->name = (tmbstr)MemAlloc( 8 + TY_(tmbstrlen)(enc) + 1 ); 2218 TY_(tmbstrcpy)(prop->name, "charset="); 2219 TY_(tmbstrcpy)(prop->name+8, enc); 2220 s = CreatePropString( pFirstProp ); 2221 MemFree( metaContent->value ); 2222 metaContent->value = s; 2223 break; 2224 } 2225 /* #718127, prevent memory leakage */ 2226 FreeStyleProps(pFirstProp); 2227 
pFirstProp = NULL; 2228 pLastProp = NULL; 2229 } 2230} 2231 2232void TY_(DropComments)(TidyDocImpl* doc, Node* node) 2233{ 2234 Node* next; 2235 2236 while (node) 2237 { 2238 next = node->next; 2239 2240 if (node->type == CommentTag) 2241 { 2242 TY_(RemoveNode)(node); 2243 TY_(FreeNode)(doc, node); 2244 node = next; 2245 continue; 2246 } 2247 2248 if (node->content) 2249 TY_(DropComments)(doc, node->content); 2250 2251 node = next; 2252 } 2253} 2254 2255void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode)) 2256{ 2257 Node* next; 2258 2259 while (node) 2260 { 2261 next = node->next; 2262 2263 if (nodeIsFONT(node)) 2264 { 2265 DiscardContainer(doc, node, &next); 2266 node = next; 2267 continue; 2268 } 2269 2270 if (node->content) 2271 TY_(DropFontElements)(doc, node->content, &next); 2272 2273 node = next; 2274 } 2275} 2276 2277void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node) 2278{ 2279 Node* next; 2280 2281 while (node) 2282 { 2283 next = node->next; 2284 2285 if (nodeIsWBR(node)) 2286 { 2287 Node* text; 2288 text = TY_(NewLiteralTextNode)(doc->lexer, " "); 2289 TY_(InsertNodeAfterElement)(node, text); 2290 TY_(RemoveNode)(node); 2291 TY_(FreeNode)(doc, node); 2292 node = next; 2293 continue; 2294 } 2295 2296 if (node->content) 2297 TY_(WbrToSpace)(doc, node->content); 2298 2299 node = next; 2300 } 2301} 2302 2303/* 2304 Filters from Word and PowerPoint often use smart 2305 quotes resulting in character codes between 128 2306 and 159. Unfortunately, the corresponding HTML 4.0 2307 entities for these are not widely supported. The 2308 following converts dashes and quotation marks to 2309 the nearest ASCII equivalent. My thanks to 2310 Andrzej Novosiolov for his help with this code. 2311 2312 Note: The old code in the pretty printer applied 2313 this to all node types and attribute values while 2314 this routine applies it only to text nodes. 
First, 2315 Microsoft Office products rarely put the relevant 2316 characters into these tokens, second support for 2317 them is much better now and last but not least, it 2318 can be harmful to replace these characters since 2319 US-ASCII quote marks are often used as syntax 2320 characters, a simple 2321 2322 <a onmouseover="alert('‘')">...</a> 2323 2324 would be broken if the U+2018 is replaced by "'". 2325 The old code would neither take care whether the 2326 quote mark is already used as delimiter, 2327 2328 <p title='‘'>...</p> 2329 2330 got 2331 2332 <p title='''>...</p> 2333 2334 Since browser support is much better nowadays and 2335 high-quality typography is better than ASCII it'd 2336 be probably a good idea to drop the feature... 2337*/ 2338void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node) 2339{ 2340 Node* next; 2341 Lexer* lexer = doc->lexer; 2342 2343 while (node) 2344 { 2345 next = node->next; 2346 2347 if (TY_(nodeIsText)(node)) 2348 { 2349 uint i, c; 2350 tmbstr p = lexer->lexbuf + node->start; 2351 2352 for (i = node->start; i < node->end; ++i) 2353 { 2354 c = (unsigned char) lexer->lexbuf[i]; 2355 2356 if (c > 0x7F) 2357 i += TY_(GetUTF8)(lexer->lexbuf + i, &c); 2358 2359 if (c >= 0x2013 && c <= 0x201E) 2360 { 2361 switch (c) 2362 { 2363 case 0x2013: /* en dash */ 2364 case 0x2014: /* em dash */ 2365 c = '-'; 2366 break; 2367 case 0x2018: /* left single quotation mark */ 2368 case 0x2019: /* right single quotation mark */ 2369 case 0x201A: /* single low-9 quotation mark */ 2370 c = '\''; 2371 break; 2372 case 0x201C: /* left double quotation mark */ 2373 case 0x201D: /* right double quotation mark */ 2374 case 0x201E: /* double low-9 quotation mark */ 2375 c = '"'; 2376 break; 2377 } 2378 } 2379 2380 p = TY_(PutUTF8)(p, c); 2381 } 2382 2383 node->end = p - lexer->lexbuf; 2384 } 2385 2386 if (node->content) 2387 TY_(DowngradeTypography)(doc, node->content); 2388 2389 node = next; 2390 } 2391} 2392 2393void 
TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node) 2394{ 2395 Node* next; 2396 2397 while (node) 2398 { 2399 next = node->next; 2400 2401 if (node->tag && node->tag->parser == TY_(ParsePre)) 2402 { 2403 TY_(NormalizeSpaces)(doc->lexer, node->content); 2404 node = next; 2405 continue; 2406 } 2407 2408 if (node->content) 2409 TY_(ReplacePreformattedSpaces)(doc, node->content); 2410 2411 node = next; 2412 } 2413} 2414 2415void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node) 2416{ 2417 Node* next; 2418 2419 while (node) 2420 { 2421 next = node->next; 2422 2423 if (node->type == CDATATag) 2424 node->type = TextNode; 2425 2426 if (node->content) 2427 TY_(ConvertCDATANodes)(doc, node->content); 2428 2429 node = next; 2430 } 2431} 2432 2433/* 2434 FixLanguageInformation ensures that the document contains (only) 2435 the attributes for language information desired by the output 2436 document type. For example, for XHTML 1.0 documents both 2437 'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang' 2438 is desired and for HTML 4.01 only 'lang' is desired. 2439*/ 2440void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang) 2441{ 2442 Node* next; 2443 2444 while (node) 2445 { 2446 next = node->next; 2447 2448 /* todo: report modifications made here to the report system */ 2449 2450 if (TY_(nodeIsElement)(node)) 2451 { 2452 AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG); 2453 AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG); 2454 2455 if (lang && xmlLang) 2456 { 2457 /* 2458 todo: check whether both attributes are in sync, 2459 here or elsewhere, where elsewhere is probably 2460 preferable. 2461 AD - March 2005: not mandatory according the standards. 
2462 */ 2463 } 2464 else if (lang && wantXmlLang) 2465 { 2466 if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG ) 2467 & doc->lexer->versionEmitted) 2468 TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value); 2469 } 2470 else if (xmlLang && wantLang) 2471 { 2472 if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG ) 2473 & doc->lexer->versionEmitted) 2474 TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value); 2475 } 2476 2477 if (lang && !wantLang) 2478 TY_(RemoveAttribute)(doc, node, lang); 2479 2480 if (xmlLang && !wantXmlLang) 2481 TY_(RemoveAttribute)(doc, node, xmlLang); 2482 } 2483 2484 if (node->content) 2485 TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang); 2486 2487 node = next; 2488 } 2489} 2490 2491/* 2492 Set/fix/remove <html xmlns='...'> 2493*/ 2494void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns) 2495{ 2496 Node* html = TY_(FindHTML)(doc); 2497 AttVal* xmlns; 2498 2499 if (!html) 2500 return; 2501 2502 xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS); 2503 2504 if (wantXmlns) 2505 { 2506 if (!AttrValueIs(xmlns, XHTML_NAMESPACE)) 2507 TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE); 2508 } 2509 else if (xmlns) 2510 { 2511 TY_(RemoveAttribute)(doc, html, xmlns); 2512 } 2513} 2514 2515/* 2516 ... 2517*/ 2518void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId) 2519{ 2520 Node* next; 2521 2522 while (node) 2523 { 2524 next = node->next; 2525 2526 if (TY_(IsAnchorElement)(doc, node)) 2527 { 2528 AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME); 2529 AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID); 2530 2531 /* todo: how are empty name/id attributes handled? 
*/ 2532 2533 if (name && id) 2534 { 2535 Bool NameHasValue = AttrHasValue(name); 2536 Bool IdHasValue = AttrHasValue(id); 2537 if ( (NameHasValue != IdHasValue) || 2538 (NameHasValue && IdHasValue && 2539 TY_(tmbstrcmp)(name->value, id->value) != 0 ) ) 2540 TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH); 2541 } 2542 else if (name && wantId) 2543 { 2544 if (TY_(NodeAttributeVersions)( node, TidyAttr_ID ) 2545 & doc->lexer->versionEmitted) 2546 { 2547 if (TY_(IsValidHTMLID)(name->value)) 2548 { 2549 TY_(RepairAttrValue)(doc, node, "id", name->value); 2550 } 2551 else 2552 { 2553 TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID); 2554 } 2555 } 2556 } 2557 else if (id && wantName) 2558 { 2559 if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME ) 2560 & doc->lexer->versionEmitted) 2561 /* todo: do not assume id is valid */ 2562 TY_(RepairAttrValue)(doc, node, "name", id->value); 2563 } 2564 2565 if (id && !wantId) 2566 TY_(RemoveAttribute)(doc, node, id); 2567 2568 if (name && !wantName) 2569 TY_(RemoveAttribute)(doc, node, name); 2570 2571 if (TY_(AttrGetById)(node, TidyAttr_NAME) == NULL && 2572 TY_(AttrGetById)(node, TidyAttr_ID) == NULL) 2573 TY_(RemoveAnchorByNode)(doc, node); 2574 } 2575 2576 if (node->content) 2577 TY_(FixAnchors)(doc, node->content, wantName, wantId); 2578 2579 node = next; 2580 } 2581} 2582 2583/* 2584 * local variables: 2585 * mode: c 2586 * indent-tabs-mode: nil 2587 * c-basic-offset: 4 2588 * eval: (c-set-offset 'substatement-open 0) 2589 * end: 2590 */ 2591