1/* parser.c -- HTML Parser 2 3 (c) 1998-2006 (W3C) MIT, ERCIM, Keio University 4 See tidy.h for the copyright notice. 5 6 CVS Info : 7 8 $Author$ 9 $Date$ 10 $Revision$ 11 12*/ 13 14#include "tidy-int.h" 15#include "lexer.h" 16#include "parser.h" 17#include "message.h" 18#include "clean.h" 19#include "tags.h" 20#include "tmbstr.h" 21 22#ifdef AUTO_INPUT_ENCODING 23#include "charsets.h" 24#endif 25 26Bool TY_(CheckNodeIntegrity)(Node *node) 27{ 28#ifndef NO_NODE_INTEGRITY_CHECK 29 Node *child; 30 31 if (node->prev) 32 { 33 if (node->prev->next != node) 34 return no; 35 } 36 37 if (node->next) 38 { 39 if (node->next->prev != node) 40 return no; 41 } 42 43 if (node->parent) 44 { 45 if (node->prev == NULL && node->parent->content != node) 46 return no; 47 48 if (node->next == NULL && node->parent->last != node) 49 return no; 50 } 51 52 for (child = node->content; child; child = child->next) 53 if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) ) 54 return no; 55 56#endif 57 return yes; 58} 59 60/* 61 used to determine how attributes 62 without values should be printed 63 this was introduced to deal with 64 user defined tags e.g. 
Cold Fusion 65*/ 66Bool TY_(IsNewNode)(Node *node) 67{ 68 if (node && node->tag) 69 { 70 return (node->tag->model & CM_NEW); 71 } 72 return yes; 73} 74 75void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected) 76{ 77 const Dict* tag = TY_(LookupTagDef)(tid); 78 Node* tmp = TY_(InferredTag)(doc, tag->id); 79 80 if (obsolete) 81 TY_(ReportWarning)(doc, node, tmp, OBSOLETE_ELEMENT); 82 else if (unexpected) 83 TY_(ReportError)(doc, node, tmp, REPLACING_UNEX_ELEMENT); 84 else 85 TY_(ReportNotice)(doc, node, tmp, REPLACING_ELEMENT); 86 87 MemFree(tmp->element); 88 MemFree(tmp); 89 90 node->was = node->tag; 91 node->tag = tag; 92 node->type = StartTag; 93 node->implicit = yes; 94 MemFree(node->element); 95 node->element = TY_(tmbstrdup)(tag->name); 96} 97 98/* extract a node and its children from a markup tree */ 99Node *TY_(RemoveNode)(Node *node) 100{ 101 if (node->prev) 102 node->prev->next = node->next; 103 104 if (node->next) 105 node->next->prev = node->prev; 106 107 if (node->parent) 108 { 109 if (node->parent->content == node) 110 node->parent->content = node->next; 111 112 if (node->parent->last == node) 113 node->parent->last = node->prev; 114 } 115 116 node->parent = node->prev = node->next = NULL; 117 return node; 118} 119 120/* remove node from markup tree and discard it */ 121Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element ) 122{ 123 Node *next = NULL; 124 125 if (element) 126 { 127 next = element->next; 128 TY_(RemoveNode)(element); 129 TY_(FreeNode)( doc, element); 130 } 131 132 return next; 133} 134 135/* 136 insert "node" into markup tree as the firt element 137 of content of "element" 138*/ 139void TY_(InsertNodeAtStart)(Node *element, Node *node) 140{ 141 node->parent = element; 142 143 if (element->content == NULL) 144 element->last = node; 145 else 146 element->content->prev = node; 147 148 node->next = element->content; 149 node->prev = NULL; 150 element->content = node; 151} 152 153/* 154 insert 
"node" into markup tree as the last element 155 of content of "element" 156*/ 157void TY_(InsertNodeAtEnd)(Node *element, Node *node) 158{ 159 node->parent = element; 160 node->prev = element->last; 161 162 if (element->last != NULL) 163 element->last->next = node; 164 else 165 element->content = node; 166 167 element->last = node; 168} 169 170/* 171 insert "node" into markup tree in place of "element" 172 which is moved to become the child of the node 173*/ 174static void InsertNodeAsParent(Node *element, Node *node) 175{ 176 node->content = element; 177 node->last = element; 178 node->parent = element->parent; 179 element->parent = node; 180 181 if (node->parent->content == element) 182 node->parent->content = node; 183 184 if (node->parent->last == element) 185 node->parent->last = node; 186 187 node->prev = element->prev; 188 element->prev = NULL; 189 190 if (node->prev) 191 node->prev->next = node; 192 193 node->next = element->next; 194 element->next = NULL; 195 196 if (node->next) 197 node->next->prev = node; 198} 199 200/* insert "node" into markup tree before "element" */ 201void TY_(InsertNodeBeforeElement)(Node *element, Node *node) 202{ 203 Node *parent; 204 205 parent = element->parent; 206 node->parent = parent; 207 node->next = element; 208 node->prev = element->prev; 209 element->prev = node; 210 211 if (node->prev) 212 node->prev->next = node; 213 214 if (parent->content == element) 215 parent->content = node; 216} 217 218/* insert "node" into markup tree after "element" */ 219void TY_(InsertNodeAfterElement)(Node *element, Node *node) 220{ 221 Node *parent; 222 223 parent = element->parent; 224 node->parent = parent; 225 226 /* AQ - 13 Jan 2000 fix for parent == NULL */ 227 if (parent != NULL && parent->last == element) 228 parent->last = node; 229 else 230 { 231 node->next = element->next; 232 /* AQ - 13 Jan 2000 fix for node->next == NULL */ 233 if (node->next != NULL) 234 node->next->prev = node; 235 } 236 237 element->next = node; 238 
node->prev = element; 239} 240 241static Bool CanPrune( TidyDocImpl* doc, Node *element ) 242{ 243 if ( TY_(nodeIsText)(element) ) 244 return yes; 245 246 if ( element->content ) 247 return no; 248 249 if ( element->tag == NULL ) 250 return no; 251 252 if ( element->tag->model & CM_BLOCK && element->attributes != NULL ) 253 return no; 254 255 if ( nodeIsA(element) && element->attributes != NULL ) 256 return no; 257 258 if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) ) 259 return no; 260 261 if ( element->tag->model & CM_ROW ) 262 return no; 263 264 if ( element->tag->model & CM_EMPTY ) 265 return no; 266 267 if ( nodeIsAPPLET(element) ) 268 return no; 269 270 if ( nodeIsOBJECT(element) ) 271 return no; 272 273 if ( nodeIsSCRIPT(element) && attrGetSRC(element) ) 274 return no; 275 276 if ( nodeIsTITLE(element) ) 277 return no; 278 279 /* #433359 - fix by Randy Waki 12 Mar 01 */ 280 if ( nodeIsIFRAME(element) ) 281 return no; 282 283 /* fix for bug 770297 */ 284 if (nodeIsTEXTAREA(element)) 285 return no; 286 287 if ( attrGetID(element) || attrGetNAME(element) ) 288 return no; 289 290 /* fix for bug 695408; a better fix would look for unknown and */ 291 /* known proprietary attributes that make the element significant */ 292 if (attrGetDATAFLD(element)) 293 return no; 294 295 /* fix for bug 723772, don't trim new-...-tags */ 296 if (element->tag->id == TidyTag_UNKNOWN) 297 return no; 298 299 if (nodeIsBODY(element)) 300 return no; 301 302 if (nodeIsCOLGROUP(element)) 303 return no; 304 305 return yes; 306} 307 308/* return next element */ 309Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element ) 310{ 311 if ( CanPrune(doc, element) ) 312 { 313 if (element->type != TextNode) 314 TY_(ReportNotice)(doc, element, NULL, TRIM_EMPTY_ELEMENT); 315 316 return TY_(DiscardElement)(doc, element); 317 } 318 return element->next; 319} 320 321Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node) 322{ 323 Node* next; 324 325 while (node) 326 { 327 next = 
node->next; 328 329 if (node->content) 330 TY_(DropEmptyElements)(doc, node->content); 331 332 if (!TY_(nodeIsElement)(node) && 333 !(TY_(nodeIsText)(node) && !(node->start < node->end))) 334 { 335 node = next; 336 continue; 337 } 338 339 next = TY_(TrimEmptyElement)(doc, node); 340 node = next; 341 } 342 343 return node; 344} 345 346/* 347 errors in positioning of form start or end tags 348 generally require human intervention to fix 349*/ 350static void BadForm( TidyDocImpl* doc ) 351{ 352 doc->badForm = yes; 353 /* doc->errors++; */ 354} 355 356/* 357 This maps 358 <em>hello </em><strong>world</strong> 359 to 360 <em>hello</em> <strong>world</strong> 361 362 If last child of element is a text node 363 then trim trailing white space character 364 moving it to after element's end tag. 365*/ 366static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last ) 367{ 368 Lexer* lexer = doc->lexer; 369 byte c; 370 371 if (TY_(nodeIsText)(last)) 372 { 373 if (last->end > last->start) 374 { 375 c = (byte) lexer->lexbuf[ last->end - 1 ]; 376 377 if ( c == ' ' 378#ifdef COMMENT_NBSP_FIX 379 || c == 160 380#endif 381 ) 382 { 383#ifdef COMMENT_NBSP_FIX 384 /* take care with <td> </td> */ 385 if ( c == 160 && 386 ( element->tag == doc->tags.tag_td || 387 element->tag == doc->tags.tag_th ) 388 ) 389 { 390 if (last->end > last->start + 1) 391 last->end -= 1; 392 } 393 else 394#endif 395 { 396 last->end -= 1; 397 if ( (element->tag->model & CM_INLINE) && 398 !(element->tag->model & CM_FIELD) ) 399 lexer->insertspace = yes; 400 } 401 } 402 } 403 } 404} 405 406#if 0 407static Node *EscapeTag(Lexer *lexer, Node *element) 408{ 409 Node *node = NewNode(lexer); 410 411 node->start = lexer->lexsize; 412 AddByte(lexer, '<'); 413 414 if (element->type == EndTag) 415 AddByte(lexer, '/'); 416 417 if (element->element) 418 { 419 char *p; 420 for (p = element->element; *p != '\0'; ++p) 421 AddByte(lexer, *p); 422 } 423 else if (element->type == DocTypeTag) 424 { 425 uint i; 426 
AddStringLiteral( lexer, "!DOCTYPE " ); 427 for (i = element->start; i < element->end; ++i) 428 AddByte(lexer, lexer->lexbuf[i]); 429 } 430 431 if (element->type == StartEndTag) 432 AddByte(lexer, '/'); 433 434 AddByte(lexer, '>'); 435 node->end = lexer->lexsize; 436 437 return node; 438} 439#endif /* 0 */ 440 441/* Only true for text nodes. */ 442Bool TY_(IsBlank)(Lexer *lexer, Node *node) 443{ 444 Bool isBlank = TY_(nodeIsText)(node); 445 if ( isBlank ) 446 isBlank = ( node->end == node->start || /* Zero length */ 447 ( node->end == node->start+1 /* or one blank. */ 448 && lexer->lexbuf[node->start] == ' ' ) ); 449 return isBlank; 450} 451 452/* 453 This maps 454 <p>hello<em> world</em> 455 to 456 <p>hello <em>world</em> 457 458 Trims initial space, by moving it before the 459 start tag, or if this element is the first in 460 parent's content, then by discarding the space 461*/ 462static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text ) 463{ 464 Lexer* lexer = doc->lexer; 465 Node *prev, *node; 466 467 if ( TY_(nodeIsText)(text) && 468 lexer->lexbuf[text->start] == ' ' && 469 text->start < text->end ) 470 { 471 if ( (element->tag->model & CM_INLINE) && 472 !(element->tag->model & CM_FIELD) ) 473 { 474 prev = element->prev; 475 476 if (TY_(nodeIsText)(prev)) 477 { 478 if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ') 479 lexer->lexbuf[(prev->end)++] = ' '; 480 481 ++(element->start); 482 } 483 else /* create new node */ 484 { 485 node = TY_(NewNode)(lexer); 486 node->start = (element->start)++; 487 node->end = element->start; 488 lexer->lexbuf[node->start] = ' '; 489 TY_(InsertNodeBeforeElement)(element ,node); 490 } 491 } 492 493 /* discard the space in current node */ 494 ++(text->start); 495 } 496} 497 498static Bool IsPreDescendant(Node* node) 499{ 500 Node *parent = node->parent; 501 502 while (parent) 503 { 504 if (parent->tag && parent->tag->parser == TY_(ParsePre)) 505 return yes; 506 507 parent = parent->parent; 508 } 509 510 
return no; 511} 512 513static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node) 514{ 515 Node* next; 516 517 if (!TY_(nodeIsText)(node)) 518 return no; 519 520 if (node->parent->type == DocTypeTag) 521 return no; 522 523 if (IsPreDescendant(node)) 524 return no; 525 526 if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) 527 return no; 528 529 next = node->next; 530 531 /* <p>... </p> */ 532 if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE)) 533 return yes; 534 535 /* <div><small>... </small><h3>...</h3></div> */ 536 if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE)) 537 return yes; 538 539 if (!next) 540 return no; 541 542 if (nodeIsBR(next)) 543 return yes; 544 545 if (TY_(nodeHasCM)(next, CM_INLINE)) 546 return no; 547 548 /* <a href='/'>...</a> <p>...</p> */ 549 if (next->type == StartTag) 550 return yes; 551 552 /* <strong>...</strong> <hr /> */ 553 if (next->type == StartEndTag) 554 return yes; 555 556 /* evil adjacent text nodes, Tidy should not generate these :-( */ 557 if (TY_(nodeIsText)(next) && next->start < next->end 558 && TY_(IsWhite)(doc->lexer->lexbuf[next->start])) 559 return yes; 560 561 return no; 562} 563 564static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node) 565{ 566 if (!TY_(nodeIsText)(node)) 567 return no; 568 569 if (node->parent->type == DocTypeTag) 570 return no; 571 572 if (IsPreDescendant(node)) 573 return no; 574 575 if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) 576 return no; 577 578 /* <p>...<br> <em>...</em>...</p> */ 579 if (nodeIsBR(node->prev)) 580 return yes; 581 582 /* <p> ...</p> */ 583 if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE)) 584 return yes; 585 586 /* <h4>...</h4> <em>...</em> */ 587 if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) && 588 TY_(nodeIsElement)(node->prev)) 589 return yes; 590 591 /* <p><span> ...</span></p> */ 592 if (!node->prev && !node->parent->prev && 
!TY_(nodeHasCM)(node->parent->parent, CM_INLINE)) 593 return yes; 594 595 return no; 596} 597 598static void CleanSpaces(TidyDocImpl* doc, Node* node) 599{ 600 Node* next; 601 602 while (node) 603 { 604 next = node->next; 605 606 if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node)) 607 while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start])) 608 ++(node->start); 609 610 if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node)) 611 while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1])) 612 --(node->end); 613 614 if (TY_(nodeIsText)(node) && !(node->start < node->end)) 615 { 616 TY_(RemoveNode)(node); 617 TY_(FreeNode)(doc, node); 618 node = next; 619 620 continue; 621 } 622 623 if (node->content) 624 CleanSpaces(doc, node->content); 625 626 node = next; 627 } 628} 629 630/* 631 Move initial and trailing space out. 632 This routine maps: 633 634 hello<em> world</em> 635 to 636 hello <em>world</em> 637 and 638 <em>hello </em><strong>world</strong> 639 to 640 <em>hello</em> <strong>world</strong> 641*/ 642static void TrimSpaces( TidyDocImpl* doc, Node *element) 643{ 644 Node* text = element->content; 645 646 if (nodeIsPRE(element) || IsPreDescendant(element)) 647 return; 648 649 if (TY_(nodeIsText)(text)) 650 TrimInitialSpace(doc, element, text); 651 652 text = element->last; 653 654 if (TY_(nodeIsText)(text)) 655 TrimTrailingSpace(doc, element, text); 656} 657 658static Bool DescendantOf( Node *element, TidyTagId tid ) 659{ 660 Node *parent; 661 for ( parent = element->parent; 662 parent != NULL; 663 parent = parent->parent ) 664 { 665 if ( TagIsId(parent, tid) ) 666 return yes; 667 } 668 return no; 669} 670 671static Bool InsertMisc(Node *element, Node *node) 672{ 673 if (node->type == CommentTag || 674 node->type == ProcInsTag || 675 node->type == CDATATag || 676 node->type == SectionTag || 677 node->type == AspTag || 678 node->type == JsteTag || 679 node->type == PhpTag ) 680 { 681 
TY_(InsertNodeAtEnd)(element, node); 682 return yes; 683 } 684 685 if ( node->type == XmlDecl ) 686 { 687 Node* root = element; 688 while ( root && root->parent ) 689 root = root->parent; 690 if ( root ) 691 { 692/* Apple Changes: 693 2007-03-05 iccir [5036506] Don't insert an XmlDecl if one already exists. 694*/ 695#ifdef TIDY_APPLE_CHANGES 696 if (!(root->content && root->content->type == XmlDecl)) 697 { 698 TY_(InsertNodeAtStart)( root, node ); 699 return yes; 700 } 701#else 702 TY_(InsertNodeAtStart)( root, node ); 703 return yes; 704#endif 705 } 706 } 707 708 /* Declared empty tags seem to be slipping through 709 ** the cracks. This is an experiment to figure out 710 ** a decent place to pick them up. 711 */ 712 if ( node->tag && 713 TY_(nodeIsElement)(node) && 714 TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN && 715 (node->tag->versions & VERS_PROPRIETARY) != 0 ) 716 { 717 TY_(InsertNodeAtEnd)(element, node); 718 return yes; 719 } 720 721 return no; 722} 723 724 725static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode ) 726{ 727 Lexer* lexer = doc->lexer; 728 /* 729 Fix by GLP 2000-12-21. Need to reset insertspace if this 730 is both a non-inline and empty tag (base, link, meta, isindex, hr, area). 
731 */ 732 if (node->tag->model & CM_EMPTY) 733 { 734 lexer->waswhite = no; 735 if (node->tag->parser == NULL) 736 return; 737 } 738 else if (!(node->tag->model & CM_INLINE)) 739 lexer->insertspace = no; 740 741 if (node->tag->parser == NULL) 742 return; 743 744 if (node->type == StartEndTag) 745 return; 746 747 (*node->tag->parser)( doc, node, mode ); 748} 749 750/* 751 the doctype has been found after other tags, 752 and needs moving to before the html element 753*/ 754static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype ) 755{ 756 Node* existing = TY_(FindDocType)( doc ); 757 if ( existing ) 758 { 759 TY_(ReportError)(doc, element, doctype, DISCARDING_UNEXPECTED ); 760 TY_(FreeNode)( doc, doctype ); 761 } 762 else 763 { 764 TY_(ReportError)(doc, element, doctype, DOCTYPE_AFTER_TAGS ); 765 while ( !nodeIsHTML(element) ) 766 element = element->parent; 767 TY_(InsertNodeBeforeElement)( element, doctype ); 768 } 769} 770 771/* 772 move node to the head, where element is used as starting 773 point in hunt for head. 
normally called during parsing 774*/ 775static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node ) 776{ 777 Node *head; 778 779 TY_(RemoveNode)( node ); /* make sure that node is isolated */ 780 781 if ( TY_(nodeIsElement)(node) ) 782 { 783 TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN ); 784 785 head = TY_(FindHEAD)(doc); 786 assert(head != NULL); 787 788 TY_(InsertNodeAtEnd)(head, node); 789 790 if ( node->tag->parser ) 791 ParseTag( doc, node, IgnoreWhitespace ); 792 } 793 else 794 { 795 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED); 796 TY_(FreeNode)( doc, node ); 797 } 798} 799 800/* moves given node to end of body element */ 801static void MoveNodeToBody( TidyDocImpl* doc, Node* node ) 802{ 803 Node* body = TY_(FindBody)( doc ); 804 if ( body ) 805 { 806 TY_(RemoveNode)( node ); 807 TY_(InsertNodeAtEnd)( body, node ); 808 } 809} 810 811static void AddClassNoIndent( TidyDocImpl* doc, Node *node ) 812{ 813 ctmbstr sprop = 814 "padding-left: 2ex; margin-left: 0ex" 815 "; margin-top: 0ex; margin-bottom: 0ex"; 816 if ( !cfgBool(doc, TidyDecorateInferredUL) ) 817 return; 818 if ( cfgBool(doc, TidyMakeClean) ) 819 TY_(AddStyleAsClass)( doc, node, sprop ); 820 else 821 TY_(AddStyleProperty)( doc, node, sprop ); 822} 823 824/* 825 element is node created by the lexer 826 upon seeing the start tag, or by the 827 parser when the start tag is inferred 828*/ 829void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode) 830{ 831 Lexer* lexer = doc->lexer; 832 Node *node; 833 Bool checkstack = yes; 834 uint istackbase = 0; 835 836 if ( element->tag->model & CM_EMPTY ) 837 return; 838 839 if ( nodeIsFORM(element) && 840 DescendantOf(element, TidyTag_FORM) ) 841 TY_(ReportError)(doc, element, NULL, ILLEGAL_NESTING ); 842 843 /* 844 InlineDup() asks the lexer to insert inline emphasis tags 845 currently pushed on the istack, but take care to avoid 846 propagating inline emphasis inside OBJECT or APPLET. 
847 For these elements a fresh inline stack context is created 848 and disposed of upon reaching the end of the element. 849 They thus behave like table cells in this respect. 850 */ 851 if (element->tag->model & CM_OBJECT) 852 { 853 istackbase = lexer->istackbase; 854 lexer->istackbase = lexer->istacksize; 855 } 856 857 if (!(element->tag->model & CM_MIXED)) 858 TY_(InlineDup)( doc, NULL ); 859 860 mode = IgnoreWhitespace; 861 862 while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL) 863 { 864 /* end tag for this element */ 865 if (node->type == EndTag && node->tag && 866 (node->tag == element->tag || element->was == node->tag)) 867 { 868 TY_(FreeNode)( doc, node ); 869 870 if (element->tag->model & CM_OBJECT) 871 { 872 /* pop inline stack */ 873 while (lexer->istacksize > lexer->istackbase) 874 TY_(PopInline)( doc, NULL ); 875 lexer->istackbase = istackbase; 876 } 877 878 element->closed = yes; 879 TrimSpaces( doc, element ); 880 return; 881 } 882 883 if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD )) 884 { 885 /* If we're in the HEAD, close it before proceeding. 886 This is an extremely rare occurance, but has been observed. 
887 */ 888 TY_(UngetToken)( doc ); 889 break; 890 } 891 892 if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) ) 893 { 894 if ( TY_(nodeIsElement)(node) ) 895 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); 896 TY_(FreeNode)( doc, node ); 897 continue; 898 } 899 900 901 if (node->type == EndTag) 902 { 903 if (node->tag == NULL) 904 { 905 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); 906 TY_(FreeNode)( doc, node ); 907 continue; 908 } 909 else if ( nodeIsBR(node) ) 910 node->type = StartTag; 911 else if ( nodeIsP(node) ) 912 { 913 /* Cannot have a block inside a paragraph, so no checking 914 for an ancestor is necessary -- but we _can_ have 915 paragraphs inside a block, so change it to an implicit 916 empty paragraph, to be dealt with according to the user's 917 options 918 */ 919 node->type = StartEndTag; 920 node->implicit = yes; 921#if OBSOLETE 922 TY_(CoerceNode)(doc, node, TidyTag_BR, no, no); 923 TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */ 924 TY_(InsertNodeAtEnd)( element, node ); 925 node = InferredTag(doc, TidyTag_BR); 926#endif 927 } 928 else if (DescendantOf( element, node->tag->id )) 929 { 930 /* 931 if this is the end tag for an ancestor element 932 then infer end tag for this element 933 */ 934 TY_(UngetToken)( doc ); 935 break; 936#if OBSOLETE 937 Node *parent; 938 for ( parent = element->parent; 939 parent != NULL; 940 parent = parent->parent ) 941 { 942 if (node->tag == parent->tag) 943 { 944 if (!(element->tag->model & CM_OPT)) 945 TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE ); 946 947 TY_(UngetToken)( doc ); 948 949 if (element->tag->model & CM_OBJECT) 950 { 951 /* pop inline stack */ 952 while (lexer->istacksize > lexer->istackbase) 953 TY_(PopInline)( doc, NULL ); 954 lexer->istackbase = istackbase; 955 } 956 957 TrimSpaces( doc, element ); 958 return; 959 } 960 } 961#endif 962 } 963 else 964 { 965 /* special case </tr> etc. 
for stuff moved in front of table */ 966 if ( lexer->exiled 967 && node->tag->model 968 && (node->tag->model & CM_TABLE) ) 969 { 970 TY_(UngetToken)( doc ); 971 TrimSpaces( doc, element ); 972 return; 973 } 974 } 975 } 976 977 /* mixed content model permits text */ 978 if (TY_(nodeIsText)(node)) 979 { 980 if ( checkstack ) 981 { 982 checkstack = no; 983 if (!(element->tag->model & CM_MIXED)) 984 { 985 if ( TY_(InlineDup)(doc, node) > 0 ) 986 continue; 987 } 988 } 989 990 TY_(InsertNodeAtEnd)(element, node); 991 mode = MixedContent; 992 993 /* 994 HTML4 strict doesn't allow mixed content for 995 elements with %block; as their content model 996 */ 997 /* 998 But only body, map, blockquote, form and 999 noscript have content model %block; 1000 */ 1001 if ( nodeIsBODY(element) || 1002 nodeIsMAP(element) || 1003 nodeIsBLOCKQUOTE(element) || 1004 nodeIsFORM(element) || 1005 nodeIsNOSCRIPT(element) ) 1006 TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT ); 1007 continue; 1008 } 1009 1010 if ( InsertMisc(element, node) ) 1011 continue; 1012 1013 /* allow PARAM elements? */ 1014 if ( nodeIsPARAM(node) ) 1015 { 1016 if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) ) 1017 { 1018 TY_(InsertNodeAtEnd)(element, node); 1019 continue; 1020 } 1021 1022 /* otherwise discard it */ 1023 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); 1024 TY_(FreeNode)( doc, node ); 1025 continue; 1026 } 1027 1028 /* allow AREA elements? 
*/ 1029 if ( nodeIsAREA(node) ) 1030 { 1031 if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) ) 1032 { 1033 TY_(InsertNodeAtEnd)(element, node); 1034 continue; 1035 } 1036 1037 /* otherwise discard it */ 1038 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); 1039 TY_(FreeNode)( doc, node ); 1040 continue; 1041 } 1042 1043 /* ignore unknown start/end tags */ 1044 if ( node->tag == NULL ) 1045 { 1046 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); 1047 TY_(FreeNode)( doc, node ); 1048 continue; 1049 } 1050 1051 /* 1052 Allow CM_INLINE elements here. 1053 1054 Allow CM_BLOCK elements here unless 1055 lexer->excludeBlocks is yes. 1056 1057 LI and DD are special cased. 1058 1059 Otherwise infer end tag for this element. 1060 */ 1061 1062 if ( !TY_(nodeHasCM)(node, CM_INLINE) ) 1063 { 1064 if ( !TY_(nodeIsElement)(node) ) 1065 { 1066 if ( nodeIsFORM(node) ) 1067 BadForm( doc ); 1068 1069 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); 1070 TY_(FreeNode)( doc, node ); 1071 continue; 1072 } 1073 1074 /* #427671 - Fix by Randy Waki - 10 Aug 00 */ 1075 /* 1076 If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION 1077 start tag, discard the start tag and let the subsequent content get 1078 parsed as content of the enclosing LI. This seems to mimic IE and 1079 Netscape, and avoids an infinite loop: without this check, 1080 ParseBlock (which is parsing the LI's content) and ParseList (which 1081 is parsing the LI's parent's content) repeatedly defer to each 1082 other to parse the illegal start tag, each time inferring a missing 1083 </li> or <li> respectively. 1084 1085 NOTE: This check is a bit fragile. It specifically checks for the 1086 four tags that happen to weave their way through the current series 1087 of tests performed by ParseBlock and ParseList to trigger the 1088 infinite loop. 
1089 */ 1090 if ( nodeIsLI(element) ) 1091 { 1092 if ( nodeIsFRAME(node) || 1093 nodeIsFRAMESET(node) || 1094 nodeIsOPTGROUP(node) || 1095 nodeIsOPTION(node) ) 1096 { 1097 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); 1098 TY_(FreeNode)( doc, node ); /* DSR - 27Apr02 avoid memory leak */ 1099 continue; 1100 } 1101 } 1102 1103 if ( nodeIsTD(element) || nodeIsTH(element) ) 1104 { 1105 /* if parent is a table cell, avoid inferring the end of the cell */ 1106 1107 if ( TY_(nodeHasCM)(node, CM_HEAD) ) 1108 { 1109 MoveToHead( doc, element, node ); 1110 continue; 1111 } 1112 1113 if ( TY_(nodeHasCM)(node, CM_LIST) ) 1114 { 1115 TY_(UngetToken)( doc ); 1116 node = TY_(InferredTag)(doc, TidyTag_UL); 1117 AddClassNoIndent(doc, node); 1118 lexer->excludeBlocks = yes; 1119 } 1120 else if ( TY_(nodeHasCM)(node, CM_DEFLIST) ) 1121 { 1122 TY_(UngetToken)( doc ); 1123 node = TY_(InferredTag)(doc, TidyTag_DL); 1124 lexer->excludeBlocks = yes; 1125 } 1126 1127 /* infer end of current table cell */ 1128 if ( !TY_(nodeHasCM)(node, CM_BLOCK) ) 1129 { 1130 TY_(UngetToken)( doc ); 1131 TrimSpaces( doc, element ); 1132 return; 1133 } 1134 } 1135 else if ( TY_(nodeHasCM)(node, CM_BLOCK) ) 1136 { 1137 if ( lexer->excludeBlocks ) 1138 { 1139 if ( !TY_(nodeHasCM)(element, CM_OPT) ) 1140 TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE ); 1141 1142 TY_(UngetToken)( doc ); 1143 1144 if ( TY_(nodeHasCM)(element, CM_OBJECT) ) 1145 lexer->istackbase = istackbase; 1146 1147 TrimSpaces( doc, element ); 1148 return; 1149 } 1150 } 1151 else /* things like list items */ 1152 { 1153 if (node->tag->model & CM_HEAD) 1154 { 1155 MoveToHead( doc, element, node ); 1156 continue; 1157 } 1158 1159 /* 1160 special case where a form start tag 1161 occurs in a tr and is followed by td or th 1162 */ 1163 1164 if ( nodeIsFORM(element) && 1165 nodeIsTD(element->parent) && 1166 element->parent->implicit ) 1167 { 1168 if ( nodeIsTD(node) ) 1169 { 1170 TY_(ReportError)(doc, element, node, 
DISCARDING_UNEXPECTED ); 1171 TY_(FreeNode)( doc, node ); 1172 continue; 1173 } 1174 1175 if ( nodeIsTH(node) ) 1176 { 1177 TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED ); 1178 TY_(FreeNode)( doc, node ); 1179 node = element->parent; 1180 MemFree(node->element); 1181 node->element = TY_(tmbstrdup)("th"); 1182 node->tag = TY_(LookupTagDef)( TidyTag_TH ); 1183 continue; 1184 } 1185 } 1186 1187 if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit ) 1188 TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE ); 1189 1190 TY_(UngetToken)( doc ); 1191 1192 if ( TY_(nodeHasCM)(node, CM_LIST) ) 1193 { 1194 if ( element->parent && element->parent->tag && 1195 element->parent->tag->parser == TY_(ParseList) ) 1196 { 1197 TrimSpaces( doc, element ); 1198 return; 1199 } 1200 1201 node = TY_(InferredTag)(doc, TidyTag_UL); 1202 AddClassNoIndent(doc, node); 1203 } 1204 else if ( TY_(nodeHasCM)(node, CM_DEFLIST) ) 1205 { 1206 if ( nodeIsDL(element->parent) ) 1207 { 1208 TrimSpaces( doc, element ); 1209 return; 1210 } 1211 1212 node = TY_(InferredTag)(doc, TidyTag_DL); 1213 } 1214 else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) ) 1215 { 1216 /* http://tidy.sf.net/issue/1316307 */ 1217 /* In exiled mode, return so table processing can 1218 continue. 
*/
                    if (lexer->exiled)
                        return;
                    node = TY_(InferredTag)(doc, TidyTag_TABLE);
                }
                else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
                {
                    /* pop inline stack */
                    while ( lexer->istacksize > lexer->istackbase )
                        TY_(PopInline)( doc, NULL );
                    lexer->istackbase = istackbase;
                    TrimSpaces( doc, element );
                    return;

                }
                else
                {
                    TrimSpaces( doc, element );
                    return;
                }
            }
        }

        /* parse known element */
        if (TY_(nodeIsElement)(node))
        {
            if (node->tag->model & CM_INLINE)
            {
                if (checkstack && !node->implicit)
                {
                    checkstack = no;

                    if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
                    {
                        if ( TY_(InlineDup)(doc, node) > 0 )
                            continue;
                    }
                }

                mode = MixedContent;
            }
            else
            {
                checkstack = yes;
                mode = IgnoreWhitespace;
            }

            /* trim white space before <br> */
            if ( nodeIsBR(node) )
                TrimSpaces( doc, element );

            TY_(InsertNodeAtEnd)(element, node);

            if (node->implicit)
                TY_(ReportError)(doc, element, node, INSERTING_TAG );

            ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
            continue;
        }

        /* discard unexpected tags */
        if (node->type == EndTag)
            TY_(PopInline)( doc, node );  /* if inline end tag */

        TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
        TY_(FreeNode)( doc, node );
        continue;
    }

    if (!(element->tag->model & CM_OPT))
        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);

    if (element->tag->model & CM_OBJECT)
    {
        /* pop inline stack */
        while ( lexer->istacksize > lexer->istackbase )
            TY_(PopInline)( doc, NULL );
        lexer->istackbase = istackbase;
    }

    TrimSpaces( doc, element );
}

/*
 Parse the children of "element" using the inline content model.

 Returns immediately if the element's model has CM_EMPTY.  Otherwise
 tokens are fetched with GetToken until one of these happens:

  - the element's own end tag arrives: element->closed is set and the
    function returns;
  - a token arrives that implies the end of this element (an end tag
    matching an ancestor, a block level tag, a heading inside a heading,
    an <a> inside an <a>, ...): the token is either pushed back with
    UngetToken or discarded, and the function returns without setting
    element->closed;
  - the input is exhausted: MISSING_ENDTAG_FOR is reported unless the
    element's model has CM_OPT.

 "mode" is kept only when it equals Preformatted; any other value is
 replaced by MixedContent before the first token is fetched.
*/
void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
{
    Lexer* lexer = doc->lexer;
    Node *node, *parent;

    if (element->tag->model & CM_EMPTY)
        return;

    /*
     ParseInline is used for some block level elements like H1 to H6
     For such elements we need to insert inline emphasis tags currently
     on the inline stack. For Inline elements, we normally push them
     onto the inline stack provided they aren't implicit or OBJECT/APPLET.
     This test is carried out in PushInline and PopInline, see istack.c

     InlineDup(...) is not called for elements with a CM_MIXED (inline and
     block) content model, e.g. <del> or <ins>, otherwise constructs like

       <p>111<a name='foo'>222<del>333</del>444</a>555</p>
       <p>111<span>222<del>333</del>444</span>555</p>
       <p>111<em>222<del>333</del>444</em>555</p>

     will get corrupted.
    */
    if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) &&
        !TY_(nodeHasCM)(element, CM_MIXED))
        TY_(InlineDup)(doc, NULL);
    else if (TY_(nodeHasCM)(element, CM_INLINE))
        TY_(PushInline)(doc, element);

    if ( nodeIsNOBR(element) )
        doc->badLayout |= USING_NOBR;
    else if ( nodeIsFONT(element) )
        doc->badLayout |= USING_FONT;

    /* Inline elements may or may not be within a preformatted element */
    if (mode != Preformatted)
        mode = MixedContent;

    while ((node = TY_(GetToken)(doc, mode)) != NULL)
    {
        /* end tag for current element */
        if (node->tag == element->tag && node->type == EndTag)
        {
            if (element->tag->model & CM_INLINE)
                TY_(PopInline)( doc, node );

            TY_(FreeNode)( doc, node );

            if (!(mode & Preformatted))
                TrimSpaces(doc, element);

            /*
             if a font element wraps an anchor and nothing else
             then move the font element inside the anchor since
             otherwise it won't alter the anchor text color
            */
            if ( nodeIsFONT(element) &&
                 element->content && element->content == element->last )
            {
                Node *child = element->content;

                if ( nodeIsA(child) )
                {
                    /* the anchor takes over the font element's place ... */
                    child->parent = element->parent;
                    child->next = element->next;
                    child->prev = element->prev;

                    /* ... and the font element becomes its only child */
                    element->next = NULL;
                    element->prev = NULL;
                    element->parent = child;

                    element->content = child->content;
                    element->last = child->last;
                    child->content = element;

                    TY_(FixNodeLinks)(child);
                    TY_(FixNodeLinks)(element);
                }
            }

            element->closed = yes;
            TrimSpaces( doc, element );
            return;
        }

        /* <u>...<u>  map 2nd <u> to </u> if 1st is explicit */
        /* otherwise emphasis nesting is probably unintentional */
        /* big, small, sub, sup have cumulative effect so leave them alone */
        if ( node->type == StartTag
             && node->tag == element->tag
             && TY_(IsPushed)( doc, node )
             && !node->implicit
             && !element->implicit
             && node->tag && (node->tag->model & CM_INLINE)
             && !nodeIsA(node)
             && !nodeIsFONT(node)
             && !nodeIsBIG(node)
             && !nodeIsSMALL(node)
             && !nodeIsSUB(node)
             && !nodeIsSUP(node)
             && !nodeIsQ(node)
             && !nodeIsSPAN(node)
           )
        {
            if (element->content != NULL && node->attributes == NULL)
            {
                /* re-read the token as an end tag; the top of the loop
                   will then close the current element */
                TY_(ReportWarning)(doc, element, node, COERCE_TO_ENDTAG_WARN);
                node->type = EndTag;
                TY_(UngetToken)(doc);
                continue;
            }

            if (node->attributes == NULL || element->attributes == NULL)
                TY_(ReportWarning)(doc, element, node, NESTED_EMPHASIS);
        }
        else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
                  nodeIsQ(node) )
        {
            TY_(ReportWarning)(doc, element, node, NESTED_QUOTATION);
        }

        if ( TY_(nodeIsText)(node) )
        {
            /* only called for 1st child */
            if ( element->content == NULL && !(mode & Preformatted) )
                TrimSpaces( doc, element );

            /* drop text nodes that carry no characters */
            if ( node->start >= node->end )
            {
                TY_(FreeNode)( doc, node );
                continue;
            }

            TY_(InsertNodeAtEnd)(element, node);
            continue;
        }

        /* mixed content model so allow text */
        if (InsertMisc(element, node))
            continue;

        /* deal with HTML tags */
        if ( nodeIsHTML(node) )
        {
            if ( TY_(nodeIsElement)(node) )
            {
                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }

            /* otherwise infer end of inline element */
            TY_(UngetToken)( doc );

            if (!(mode & Preformatted))
                TrimSpaces(doc, element);

            return;
        }

        /* within <dt> or <pre> map <p> to <br> */
        if ( nodeIsP(node) &&
             node->type == StartTag &&
             ( (mode & Preformatted) ||
               nodeIsDT(element) ||
               DescendantOf(element, TidyTag_DT )
             )
           )
        {
            node->tag = TY_(LookupTagDef)( TidyTag_BR );
            MemFree(node->element);
            node->element = TY_(tmbstrdup)("br");
            TrimSpaces(doc, element);
            TY_(InsertNodeAtEnd)(element, node);
            continue;
        }

        /* <p> allowed within <address> in HTML 4.01 Transitional */
        if ( nodeIsP(node) &&
             node->type == StartTag &&
             nodeIsADDRESS(element) )
        {
            TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
            TY_(InsertNodeAtEnd)(element, node);
            (*node->tag->parser)( doc, node, mode );
            continue;
        }

        /* ignore unknown and PARAM tags */
        if ( node->tag == NULL || nodeIsPARAM(node) )
        {
            TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* from here on node->tag is known to be non-NULL */

        if ( nodeIsBR(node) && node->type == EndTag )
            node->type = StartTag;

        if ( node->type == EndTag )
        {
            /* coerce </br> to <br> */
            /* NOTE(review): cannot be true here - the statement just
               above already turned every </br> into a start tag */
            if ( nodeIsBR(node) )
                node->type = StartTag;
            else if ( nodeIsP(node) )
            {
                /* coerce unmatched </p> to <br><br> */
                if ( !DescendantOf(element, TidyTag_P) )
                {
                    TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
                    TrimSpaces( doc, element );
                    TY_(InsertNodeAtEnd)( element, node );
                    node = TY_(InferredTag)(doc, TidyTag_BR);
                    TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
                    continue;
                }
            }
            else if ( TY_(nodeHasCM)(node, CM_INLINE)
                      && !nodeIsA(node)
                      && !TY_(nodeHasCM)(node, CM_OBJECT)
                      && TY_(nodeHasCM)(element, CM_INLINE) )
            {
                /* allow any inline end tag to end current element */
                TY_(PopInline)( doc, element );

                if ( !nodeIsA(element) )
                {
                    /* NOTE(review): nodeIsA(node) is always false in this
                       branch (see the enclosing condition), so only the
                       else part below can run */
                    if ( nodeIsA(node) && node->tag != element->tag )
                    {
                        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
                        TY_(UngetToken)( doc );
                    }
                    else
                    {
                        TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
                        TY_(FreeNode)( doc, node);
                    }

                    if (!(mode & Preformatted))
                        TrimSpaces(doc, element);

                    return;
                }

                /* if parent is <a> then discard unexpected inline end tag */
                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }  /* special case </tr> etc. for stuff moved in front of table */
            else if ( lexer->exiled
                      && node->tag->model
                      && (node->tag->model & CM_TABLE) )
            {
                TY_(UngetToken)( doc );
                TrimSpaces(doc, element);
                return;
            }
        }

        /* allow any header tag to end current header */
        if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
        {

            if ( node->tag == element->tag )
            {
                TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG );
                TY_(FreeNode)( doc, node);
            }
            else
            {
                TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
                TY_(UngetToken)( doc );
            }

            if (!(mode & Preformatted))
                TrimSpaces(doc, element);

            return;
        }

        /*
           an <A> tag ends any open <A> element
           but <A href=...> is mapped to </A><A href=...>
        */
        /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
        /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */
        if ( nodeIsA(node) && !node->implicit &&
             (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
        {
            /* coerce <a> to </a> unless it has some attributes */
            /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
            /* other fixes by Dave Raggett */
            /* if (node->attributes == NULL) */
            if (node->type != EndTag && node->attributes == NULL)
            {
                node->type = EndTag;
                TY_(ReportError)(doc, element, node, COERCE_TO_ENDTAG);
                /* TY_(PopInline)( doc, node ); */
                TY_(UngetToken)( doc );
                continue;
            }

            TY_(UngetToken)( doc );
            TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
            /* TY_(PopInline)( doc, element ); */

            if (!(mode & Preformatted))
                TrimSpaces(doc, element);

            return;
        }

        if (element->tag->model & CM_HEADING)
        {
            if ( nodeIsCENTER(node) || nodeIsDIV(node) )
            {
                if (!TY_(nodeIsElement)(node))
                {
                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                    TY_(FreeNode)( doc, node);
                    continue;
                }

                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);

                /* insert center as parent if heading is empty */
                if (element->content == NULL)
                {
                    InsertNodeAsParent(element, node);
                    continue;
                }

                /* split heading and make center parent of 2nd part */
                TY_(InsertNodeAfterElement)(element, node);

                if (!(mode & Preformatted))
                    TrimSpaces(doc, element);

                /* parsing continues into the clone from here on */
                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAtEnd)(node, element);
                continue;
            }

            if ( nodeIsHR(node) )
            {
                if ( !TY_(nodeIsElement)(node) )
                {
                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                    TY_(FreeNode)( doc, node);
                    continue;
                }

                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);

                /* insert hr before heading if heading is empty */
                if (element->content == NULL)
                {
                    TY_(InsertNodeBeforeElement)(element, node);
                    continue;
                }

                /* split heading and insert hr before 2nd part */
                TY_(InsertNodeAfterElement)(element, node);

                if (!(mode & Preformatted))
                    TrimSpaces(doc, element);

                /* parsing continues into the clone from here on */
                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAfterElement)(node, element);
                continue;
            }
        }

        if ( nodeIsDT(element) )
        {
            if ( nodeIsHR(node) )
            {
                Node *dd;
                if ( !TY_(nodeIsElement)(node) )
                {
                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                    TY_(FreeNode)( doc, node);
                    continue;
                }

                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
                dd = TY_(InferredTag)(doc, TidyTag_DD);

                /* insert hr within dd before dt if dt is empty */
                if (element->content == NULL)
                {
                    TY_(InsertNodeBeforeElement)(element, dd);
                    TY_(InsertNodeAtEnd)(dd, node);
                    continue;
                }

                /* split dt and insert hr within dd before 2nd part */
                TY_(InsertNodeAfterElement)(element, dd);
                TY_(InsertNodeAtEnd)(dd, node);

                if (!(mode & Preformatted))
                    TrimSpaces(doc, element);

                /* parsing continues into the clone from here on */
                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAfterElement)(dd, element);
                continue;
            }
        }


        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            for (parent = element->parent;
                    parent != NULL; parent = parent->parent)
            {
                if (node->tag == parent->tag)
                {
                    if (!(element->tag->model & CM_OPT) && !element->implicit)
                        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);

                    if( TY_(IsPushedLast)( doc, element, node ) )
                        TY_(PopInline)( doc, element );
                    TY_(UngetToken)( doc );

                    if (!(mode & Preformatted))
                        TrimSpaces(doc, element);

                    return;
                }
            }
        }

        /* block level tags end this element */
        if (!(node->tag->model & CM_INLINE) &&
            !(element->tag->model & CM_MIXED))
        {
            if ( !TY_(nodeIsElement)(node) )
            {
                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            if (!(element->tag->model & CM_OPT))
                TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);

            if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
            {
                MoveToHead(doc, element, node);
                continue;
            }

            /*
               prevent anchors from propagating into block tags
               except for headings h1 to h6
            */
            if ( nodeIsA(element) )
            {
                if (node->tag && !(node->tag->model & CM_HEADING))
                    TY_(PopInline)( doc, element );
                else if (!(element->content))
                {
                    TY_(DiscardElement)( doc, element );
                    TY_(UngetToken)( doc );
                    return;
                }
            }

            TY_(UngetToken)( doc );

            if (!(mode & Preformatted))
                TrimSpaces(doc, element);

            return;
        }

        /* parse inline element */
        if (TY_(nodeIsElement)(node))
        {
            if (node->implicit)
                TY_(ReportError)(doc, element, node, INSERTING_TAG);

            /* trim white space before <br> */
            if ( nodeIsBR(node) )
                TrimSpaces(doc, element);

            TY_(InsertNodeAtEnd)(element, node);
            ParseTag(doc, node, mode);
            continue;
        }

        /* discard unexpected tags */
        TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node );
        continue;
    }

    /* the token loop only ends when GetToken returned NULL, so node is
       NULL in the report below */
    if (!(element->tag->model & CM_OPT))
        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);

}

/*
 Parser for elements that take no content.

 Nothing happens unless lexer->isvoyager is set.  In that case the next
 token is fetched: if it is the end tag of "element" it is consumed and
 freed, otherwise ELEMENT_NOT_EMPTY is reported and the token is pushed
 back for the caller.
 NOTE(review): isvoyager presumably marks XHTML input - confirm in lexer.h
*/
void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    if ( lexer->isvoyager )
    {
        Node *node = TY_(GetToken)( doc, mode);
        if ( node )
        {
            if ( !(node->type == EndTag && node->tag == element->tag) )
            {
                TY_(ReportError)(doc, element, node, ELEMENT_NOT_EMPTY);
                TY_(UngetToken)( doc );
            }
            else
            {
                TY_(FreeNode)( doc, node );
            }
        }
    }
}

/*
 Parse the children of a definition list.

 Only <dt> and <dd> end up as children of "list": text is pushed back
 and wrapped in an inferred <dt>, other block or inline content is
 pushed back and wrapped in an inferred <dd>, and a <center> splits the
 list in two.  Content that is neither block nor inline, or block
 content while lexer->excludeBlocks is set, is pushed back and ends the
 list.  The list is also ended by its own end tag (list->closed is set)
 or by an end tag matching an ancestor; ancestors are not searched
 beyond BODY.

 Tokens are always fetched with IgnoreWhitespace; "mode" is only passed
 on when parsing a <center> element.
*/
void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    Node *node, *parent;

    if (list->tag->model & CM_EMPTY)
        return;

    lexer->insert = NULL;  /* defer implicit inline start tags */

    while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
    {
        if (node->tag == list->tag && node->type == EndTag)
        {
            TY_(FreeNode)( doc, node);
            list->closed = yes;
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(list, node))
            continue;

        if (TY_(nodeIsText)(node))
        {
            /* the text is re-read later as content of the inferred <dt> */
            TY_(UngetToken)( doc );
            node = TY_(InferredTag)(doc, TidyTag_DT);
            TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
        }

        if (node->tag == NULL)
        {
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            Bool discardIt = no;
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node );
                continue;
            }

            for (parent = list->parent;
                    parent != NULL; parent = parent->parent)
            {
                /* Do not match across BODY to avoid infinite loop
                   between ParseBody and this parser,
                   See http://tidy.sf.net/bug/1098012. */
                if (nodeIsBODY(parent))
                {
                    discardIt = yes;
                    break;
                }
                if (node->tag == parent->tag)
                {
                    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);

                    TY_(UngetToken)( doc );
                    return;
                }
            }
            if (discardIt)
            {
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }
        }

        /* center in a dt or a dl breaks the dl list in two */
        if ( nodeIsCENTER(node) )
        {
            if (list->content)
                TY_(InsertNodeAfterElement)(list, node);
            else    /* trim empty dl list */
            {
                TY_(InsertNodeBeforeElement)(list, node);

/* #540296 tidy dumps with empty definition list */
#if 0
                TY_(DiscardElement)(list);
#endif
            }

            /* #426885 - fix by Glenn Carroll 19 Apr 00, and
                         Gary Dechaines 11 Aug 00 */
            /* ParseTag can destroy node, if it finds that
             * this <center> is followed immediately by </center>.
             * It's awkward but necessary to determine if this
             * has happened.
             */
            parent = node->parent;

            /* and parse contents of center */
            lexer->excludeBlocks = no;
            ParseTag( doc, node, mode);
            lexer->excludeBlocks = yes;

            /* now create a new dl element,
             * unless node has been blown away because the
             * center was empty, as above.
             */
            if (parent->last == node)
            {
                list = TY_(InferredTag)(doc, TidyTag_DL);
                TY_(InsertNodeAfterElement)(node, list);
            }
            continue;
        }

        if ( !(nodeIsDT(node) || nodeIsDD(node)) )
        {
            TY_(UngetToken)( doc );

            if (!(node->tag->model & (CM_BLOCK | CM_INLINE)))
            {
                TY_(ReportError)(doc, list, node, TAG_NOT_ALLOWED_IN);
                return;
            }

            /* if DD appeared directly in BODY then exclude blocks */
            if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks)
                return;

            node = TY_(InferredTag)(doc, TidyTag_DD);
            TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
        }

        if (node->type == EndTag)
        {
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* node should be <DT> or <DD>*/
        TY_(InsertNodeAtEnd)(list, node);
        ParseTag( doc, node, IgnoreWhitespace);
    }

    /* only reached at end of input, so node is NULL here */
    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
}

/*
 Parse the children of a list element.

 Only <li> ends up as a child of "list".  Any other start tag or text
 is pushed back and wrapped in an inferred <li> that is given the style
 property "list-style: none" - except that the function returns
 instead when a block level tag arrives while lexer->excludeBlocks is
 set, or when a tag with CM_TABLE, CM_ROWGRP or CM_ROW arrives while
 lexer->exiled is set.  End tags either close the list (own tag),
 end it implicitly (tag of an ancestor below BODY) or are discarded.
*/
void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode))
{
    Lexer* lexer = doc->lexer;
    Node *node, *parent;

    if (list->tag->model & CM_EMPTY)
        return;

    lexer->insert = NULL;  /* defer implicit inline start tags */

    while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
    {
        if (node->tag == list->tag && node->type == EndTag)
        {
            TY_(FreeNode)( doc, node);
            list->closed = yes;
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(list, node))
            continue;

        if (node->type != TextNode && node->tag == NULL)
        {
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node );
                continue;
            }

            if (node->tag && node->tag->model & CM_INLINE)
            {
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(PopInline)( doc, node );
                TY_(FreeNode)( doc, node);
                continue;
            }

            for ( parent = list->parent;
                  parent != NULL; parent = parent->parent )
            {
                /* Do not match across BODY to avoid infinite loop
                   between ParseBody and this parser,
                   See http://tidy.sf.net/bug/1053626. */
                if (nodeIsBODY(parent))
                    break;
                if (node->tag == parent->tag)
                {
                    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
                    TY_(UngetToken)( doc );
                    return;
                }
            }

            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        if ( !nodeIsLI(node) )
        {
            TY_(UngetToken)( doc );

            if (node->tag && (node->tag->model & CM_BLOCK) && lexer->excludeBlocks)
            {
                TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
                return;
            }
            /* http://tidy.sf.net/issue/1316307 */
            /* In exiled mode, return so table processing can continue. */
            else if ( lexer->exiled && node->tag
                      && TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW) )
                return;

            node = TY_(InferredTag)(doc, TidyTag_LI);
            TY_(AddStyleProperty)( doc, node, "list-style: none" );
            TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
        }

        /* node should be <LI> */
        TY_(InsertNodeAtEnd)(list,node);
        ParseTag( doc, node, IgnoreWhitespace);
    }

    /* only reached at end of input, so node is NULL here */
    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
}

/*
 unexpected content in table row is moved to just before
 the table in accordance with Netscape and IE. This code
 assumes that node hasn't been inserted into the row.

 The ancestors of "row" are searched for a TABLE element; if none
 is found, node is inserted before row->parent instead.
*/
static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
                             Node *node )
{
    Node *table;

    /* first find the table element */
    for (table = row->parent; table; table = table->parent)
    {
        if ( nodeIsTABLE(table) )
        {
            TY_(InsertNodeBeforeElement)( table, node );
            return;
        }
    }
    /* No table element */
    TY_(InsertNodeBeforeElement)( row->parent, node );
}

/*
 if a table row is empty then insert an empty cell
 this practice is consistent with browser behavior
 and avoids potential problems with row spanning cells

 The inserted cell is an inferred <td>; MISSING_STARTTAG is reported.
*/
static void FixEmptyRow(TidyDocImpl* doc, Node *row)
{
    Node *cell;

    if (row->content == NULL)
    {
        cell = TY_(InferredTag)(doc, TidyTag_TD);
        TY_(InsertNodeAtEnd)(row, cell);
        TY_(ReportError)(doc, row, cell, MISSING_STARTTAG);
    }
}

/*
 Parse the cells of a table row.

 <td> and <th> start tags are inserted into "row" and parsed with
 lexer->excludeBlocks cleared; afterwards the inline stack is popped
 back down to lexer->istackbase.  A <form> start tag is pushed back and
 wrapped in an inferred <td>.  Text and block/inline start tags are
 moved in front of the table and parsed there with lexer->exiled set.
 Head content is moved to the head.

 The row ends (token pushed back unless it is the row's own end tag)
 on: a tag equal to the row's own tag, a row group tag, or an end tag
 with CM_HTML/CM_TABLE (or </table>) for an element enclosing the row.
 FixEmptyRow is applied only when the row is ended by its own tag.
*/
void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode))
{
    Lexer* lexer = doc->lexer;
    Node *node;
    Bool exclude_state;

    if (row->tag->model & CM_EMPTY)
        return;

    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
    {
        if (node->tag == row->tag)
        {
            if (node->type == EndTag)
            {
                TY_(FreeNode)( doc, node);
                row->closed = yes;
                FixEmptyRow( doc, row);
                return;
            }

            /* New row start implies end of current row */
            TY_(UngetToken)( doc );
            FixEmptyRow( doc, row);
            return;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if ( node->type == EndTag )
        {
            if ( (TY_(nodeHasCM)(node, CM_HTML|CM_TABLE) || nodeIsTABLE(node))
                 && DescendantOf(row, TagId(node)) )
            {
                TY_(UngetToken)( doc );
                return;
            }

            if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
            {
                if ( nodeIsFORM(node) )
                    BadForm( doc );

                TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            if ( nodeIsTD(node) || nodeIsTH(node) )
            {
                TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }
        }

        /* deal with comments etc. */
        if (InsertMisc(row, node))
            continue;

        /* discard unknown tags */
        if (node->tag == NULL && node->type != TextNode)
        {
            TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* discard unexpected <table> element */
        if ( nodeIsTABLE(node) )
        {
            TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* THEAD, TFOOT or TBODY */
        if ( TY_(nodeHasCM)(node, CM_ROWGRP) )
        {
            TY_(UngetToken)( doc );
            return;
        }

        if (node->type == EndTag)
        {
            TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          if text or inline or block move before table
          if head content move to head
        */

        /* NOTE(review): always true - every end tag was discarded by the
           check just above */
        if (node->type != EndTag)
        {
            if ( nodeIsFORM(node) )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_TD);
                TY_(ReportError)(doc, row, node, MISSING_STARTTAG);
            }
            else if ( TY_(nodeIsText)(node)
                      || TY_(nodeHasCM)(node, CM_BLOCK | CM_INLINE) )
            {
                MoveBeforeTable( doc, row, node );
                TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
                lexer->exiled = yes;
                exclude_state = lexer->excludeBlocks;
                lexer->excludeBlocks = no;

                if (node->type != TextNode)
                    ParseTag( doc, node, IgnoreWhitespace);

                lexer->exiled = no;
                lexer->excludeBlocks = exclude_state;
                continue;
            }
            else if (node->tag->model & CM_HEAD)
            {
                TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
                MoveToHead( doc, row, node);
                continue;
            }
        }

        if ( !(nodeIsTD(node) || nodeIsTH(node)) )
        {
            TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* node should be <TD> or <TH> */
        TY_(InsertNodeAtEnd)(row, node);
        exclude_state = lexer->excludeBlocks;
        lexer->excludeBlocks = no;
        ParseTag( doc, node, IgnoreWhitespace);
        lexer->excludeBlocks = exclude_state;

        /* pop inline stack */

        while ( lexer->istacksize > lexer->istackbase )
            TY_(PopInline)( doc, NULL );
    }

}

/*
 Parse the rows of a row group (the tags carrying CM_ROWGRP).

 <tr> start tags are inserted into "rowgroup" and parsed.  A <td> or
 <th> is pushed back and wrapped in an inferred <tr>.  Text and
 block/inline start tags are moved in front of the table and parsed
 there with lexer->exiled set; head content is moved to the head.

 The group ends on: its own tag (an end tag is consumed and sets
 rowgroup->closed, a start tag is pushed back), </table>, an end tag
 matching any ancestor, or the start tag of another row group; in the
 last three cases the token is pushed back.
*/
void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode))
{
    Lexer* lexer = doc->lexer;
    Node *node, *parent;

    if (rowgroup->tag->model & CM_EMPTY)
        return;

    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
    {
        if (node->tag == rowgroup->tag)
        {
            if (node->type == EndTag)
            {
                rowgroup->closed = yes;
                TY_(FreeNode)( doc, node);
                return;
            }

            TY_(UngetToken)( doc );
            return;
        }

        /* if </table> infer end tag */
        if ( nodeIsTABLE(node) && node->type == EndTag )
        {
            TY_(UngetToken)( doc );
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(rowgroup, node))
            continue;

        /* discard unknown tags */
        if (node->tag == NULL && node->type != TextNode)
        {
            TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          if TD or TH then infer <TR>
          if text or inline or block move before table
          if head content move to head
        */

        if (node->type != EndTag)
        {
            if ( nodeIsTD(node) || nodeIsTH(node) )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_TR);
                TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
            }
            else if ( TY_(nodeIsText)(node)
                      || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
            {
                MoveBeforeTable( doc, rowgroup, node );
                TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
                lexer->exiled = yes;

                if (node->type != TextNode)
                    ParseTag(doc, node, IgnoreWhitespace);

                lexer->exiled = no;
                continue;
            }
            else if (node->tag->model & CM_HEAD)
            {
                TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
                MoveToHead(doc, rowgroup, node);
                continue;
            }
        }

        /*
          if this is the end tag for ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
            {
                if ( nodeIsFORM(node) )
                    BadForm( doc );

                TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) )
            {
                TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            for ( parent = rowgroup->parent;
                  parent != NULL;
                  parent = parent->parent )
            {
                if (node->tag == parent->tag)
                {
                    TY_(UngetToken)( doc );
                    return;
                }
            }
        }

        /*
          if THEAD, TFOOT or TBODY then implied end tag

        */
        if (node->tag->model & CM_ROWGRP)
        {
            if (node->type != EndTag)
            {
                TY_(UngetToken)( doc );
                return;
            }
        }

        if (node->type == EndTag)
        {
            TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        if ( !nodeIsTR(node) )
        {
            node = TY_(InferredTag)(doc, TidyTag_TR);
            TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
            TY_(UngetToken)( doc );
        }

        /* node should be <TR> */
        TY_(InsertNodeAtEnd)(rowgroup, node);
        ParseTag(doc, node, IgnoreWhitespace);
    }

}

/*
 Parse the children of a column group.

 Only <col> start tags are inserted into "colgroup".  The group ends
 on its own end tag (colgroup->closed is set), on an end tag matching
 any ancestor, on text, or on any known tag other than <col>; in the
 last three cases the token is pushed back for the caller.  Unknown
 tags, </form> and </col> are discarded.
*/
void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode))
{
    Node *node, *parent;

    if (colgroup->tag->model & CM_EMPTY)
        return;

    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
    {
        if (node->tag == colgroup->tag && node->type == EndTag)
        {
            TY_(FreeNode)( doc, node);
            colgroup->closed = yes;
            return;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            for ( parent = colgroup->parent;
                  parent != NULL;
                  parent = parent->parent )
            {
                if (node->tag == parent->tag)
                {
                    TY_(UngetToken)( doc );
                    return;
                }
            }
        }

        if (TY_(nodeIsText)(node))
        {
            TY_(UngetToken)( doc );
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(colgroup, node))
            continue;

        /* discard unknown tags */
        if (node->tag == NULL)
        {
            TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        if ( !nodeIsCOL(node) )
        {
            TY_(UngetToken)( doc );
            return;
        }

        if (node->type == EndTag)
        {
            TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* node should be <COL> */
        TY_(InsertNodeAtEnd)(colgroup, node);
        ParseTag(doc, node, IgnoreWhitespace);
    }
}

/*
 Parse the children of a table element.

 On entry DeferDup is called and lexer->istackbase is raised to the
 current lexer->istacksize; the previous base is restored on every
 return path.

 Start tags with CM_TABLE are inserted into "table" and parsed.  A
 <td>, <th> or nested <table> is pushed back and wrapped in an
 inferred <tr>.  Text and block/inline start tags are inserted before
 the table and parsed there with lexer->exiled set; head content is
 moved to the head.  Any other tag without CM_TABLE is pushed back,
 reported as TAG_NOT_ALLOWED_IN and ends the table, as does an end tag
 matching an ancestor.
*/
void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode))
{
    Lexer* lexer = doc->lexer;
    Node *node, *parent;
    uint istackbase;

    TY_(DeferDup)( doc );
    istackbase = lexer->istackbase;
    lexer->istackbase = lexer->istacksize;

    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
    {
        if (node->tag == table->tag && node->type == EndTag)
        {
            TY_(FreeNode)( doc, node);
            lexer->istackbase = istackbase;
            table->closed = yes;
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(table, node))
            continue;

        /* discard unknown tags */
        if (node->tag == NULL && node->type != TextNode)
        {
            TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* if TD or TH or text or inline or block then infer <TR> */

        if (node->type != EndTag)
        {
            if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_TR);
                TY_(ReportError)(doc, table, node, MISSING_STARTTAG);
            }
            else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) )
            {
                TY_(InsertNodeBeforeElement)(table, node);
                TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
                lexer->exiled = yes;

                if (node->type != TextNode)
                    ParseTag(doc, node, IgnoreWhitespace);

                lexer->exiled = no;
                continue;
            }
            else if (node->tag->model & CM_HEAD)
            {
                MoveToHead(doc, table, node);
                continue;
            }
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* best to discard unexpected block/inline end tags */
            if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) ||
                 TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
            {
                TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            for ( parent = table->parent;
                  parent != NULL;
                  parent = parent->parent )
            {
                if (node->tag == parent->tag)
                {
                    TY_(ReportError)(doc, table, node, MISSING_ENDTAG_BEFORE );
                    TY_(UngetToken)( doc );
                    lexer->istackbase = istackbase;
                    return;
                }
            }
        }

        if (!(node->tag->model & CM_TABLE))
        {
            TY_(UngetToken)( doc );
            TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
            lexer->istackbase = istackbase;
            return;
        }

        if (TY_(nodeIsElement)(node))
        {
            TY_(InsertNodeAtEnd)(table, node);
            ParseTag(doc, node, IgnoreWhitespace);
            continue;
        }

        /* discard unexpected text nodes and end tags */
        TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node);
    }

    /* only reached at end of input, so node is NULL here */
    TY_(ReportError)(doc, table, node, MISSING_ENDTAG_FOR);
    lexer->istackbase = istackbase;
}

/*
 acceptable content for pre elements

 Returns yes for <p> and text, and for any other known tag that has
 CM_INLINE or CM_NEW and is not PARAM; returns no otherwise.
*/
static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
{
    /* p is coerced to br's, Text OK too */
    if ( nodeIsP(node) || TY_(nodeIsText)(node) )
        return yes;

    if ( node->tag == NULL ||
         nodeIsPARAM(node) ||
         !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) )
        return no;

    return yes;
}

void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
{
    Node *node;

    if (pre->tag->model & CM_EMPTY)
        return;

    TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */

    while ((node = TY_(GetToken)(doc, Preformatted)) != NULL)
    {
        if ( node->type == EndTag &&
             (node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
        {
            if (nodeIsBODY(node) || nodeIsHTML(node))
            {
                TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)(doc, node);
                continue;
            }
            if (node->tag == pre->tag)
            {
                TY_(FreeNode)(doc, node);
            }
            else
            {
                TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE );
                TY_(UngetToken)( doc );
            }
            pre->closed = yes;
            TrimSpaces(doc, pre);
            return;
        }

        if (TY_(nodeIsText)(node))
        {
            TY_(InsertNodeAtEnd)(pre, node);
            continue;
        }

        /* deal with comments etc. */
        if (InsertMisc(pre, node))
            continue;

        if (node->tag == NULL)
        {
            TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)(doc, node);
            continue;
        }

        /* strip unexpected tags */
        if ( !PreContent(doc, node) )
        {
            Node *newnode;

            /* fix for http://tidy.sf.net/bug/772205 */
            if (node->type == EndTag)
            {
                TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)(doc, node);
                continue;
            }
            /*
              This is basically what Tidy 04 August 2000 did and far more accurate
              with respect to browser behaviour than the code commented out above.
              Tidy could try to propagate the <pre> into each disallowed child where
              <pre> is allowed in order to replicate some browsers behaviour, but
              there are a lot of exceptions, e.g. Internet Explorer does not propagate
              <pre> into table cells while Mozilla does. Opera 6 never propagates
              <pre> into blocklevel elements while Opera 7 behaves much like Mozilla.

              Tidy behaves thus mostly like Opera 6 except for nested <pre> elements
              which are handled like Mozilla takes them (Opera6 closes all <pre> after
              the first </pre>).

              There are similar issues like replacing <p> in <pre> with <br>, for
              example

                <pre>...<p>...</pre>                 (Input)
                <pre>...<br>...</pre>                (Tidy)
                <pre>...<br>...</pre>                (Opera 7 and Internet Explorer)
                <pre>...<br><br>...</pre>            (Opera 6 and Mozilla)

                <pre>...<p>...</p>...</pre>          (Input)
                <pre>...<br>......</pre>             (Tidy, BUG!)
                <pre>...<br>...<br>...</pre>         (Internet Explorer)
                <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
                <pre>...<br>...<br><br>...</pre>     (Opera 7)

              or something similar, they could also be closing the <pre> and propagate
              the <pre> into the newly opened <p>.

              Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
              disallowed in <pre>, Tidy neither detects this nor does it perform any
              cleanup operation. Tidy should at least issue a warning if it encounters
              such constructs.

              Todo: discarding </p> is obviously a bug, it should be replaced by <br>.
            */
            TY_(InsertNodeAfterElement)(pre, node);
            TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE);
            ParseTag(doc, node, IgnoreWhitespace);

            newnode = TY_(InferredTag)(doc, TidyTag_PRE);
            TY_(ReportError)(doc, pre, newnode, INSERTING_TAG);
            pre = newnode;
            TY_(InsertNodeAfterElement)(node, pre);

            continue;
        }

        if ( nodeIsP(node) )
        {
            if (node->type == StartTag)
            {
                TY_(ReportError)(doc, pre, node, USING_BR_INPLACE_OF);

                /* trim white space before <p> in <pre>*/
                TrimSpaces(doc, pre);

                /* coerce both <p> and </p> to <br> */
                TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
                TY_(FreeAttrs)( doc, node ); /* discard align attribute etc.
*/ 2760 TY_(InsertNodeAtEnd)( pre, node ); 2761 } 2762 else 2763 { 2764 TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED); 2765 TY_(FreeNode)( doc, node); 2766 } 2767 continue; 2768 } 2769 2770 if ( TY_(nodeIsElement)(node) ) 2771 { 2772 /* trim white space before <br> */ 2773 if ( nodeIsBR(node) ) 2774 TrimSpaces(doc, pre); 2775 2776 TY_(InsertNodeAtEnd)(pre, node); 2777 ParseTag(doc, node, Preformatted); 2778 continue; 2779 } 2780 2781 /* discard unexpected tags */ 2782 TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED); 2783 TY_(FreeNode)( doc, node); 2784 } 2785 2786 TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_FOR); 2787} 2788 2789void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) 2790{ 2791 Lexer* lexer = doc->lexer; 2792 Node *node; 2793 2794 lexer->insert = NULL; /* defer implicit inline start tags */ 2795 2796 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) 2797 { 2798 if (node->tag == field->tag && node->type == EndTag) 2799 { 2800 TY_(FreeNode)( doc, node); 2801 field->closed = yes; 2802 TrimSpaces(doc, field); 2803 return; 2804 } 2805 2806 /* deal with comments etc. 
*/ 2807 if (InsertMisc(field, node)) 2808 continue; 2809 2810 if ( node->type == StartTag && 2811 (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) ) 2812 { 2813 if ( nodeIsOPTGROUP(node) ) 2814 TY_(ReportError)(doc, field, node, CANT_BE_NESTED); 2815 2816 TY_(InsertNodeAtEnd)(field, node); 2817 ParseTag(doc, node, MixedContent); 2818 continue; 2819 } 2820 2821 /* discard unexpected tags */ 2822 TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED ); 2823 TY_(FreeNode)( doc, node); 2824 } 2825} 2826 2827 2828void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode)) 2829{ 2830 Lexer* lexer = doc->lexer; 2831 Node *node; 2832 2833 lexer->insert = NULL; /* defer implicit inline start tags */ 2834 2835 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) 2836 { 2837 if (node->tag == field->tag && node->type == EndTag) 2838 { 2839 TY_(FreeNode)( doc, node); 2840 field->closed = yes; 2841 TrimSpaces(doc, field); 2842 return; 2843 } 2844 2845 /* deal with comments etc. 
*/ 2846 if (InsertMisc(field, node)) 2847 continue; 2848 2849 if ( node->type == StartTag && 2850 ( nodeIsOPTION(node) || 2851 nodeIsOPTGROUP(node) || 2852 nodeIsSCRIPT(node)) 2853 ) 2854 { 2855 TY_(InsertNodeAtEnd)(field, node); 2856 ParseTag(doc, node, IgnoreWhitespace); 2857 continue; 2858 } 2859 2860 /* discard unexpected tags */ 2861 TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED); 2862 TY_(FreeNode)( doc, node); 2863 } 2864 2865 TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR); 2866} 2867 2868void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode) 2869{ 2870 Lexer* lexer = doc->lexer; 2871 Node *node; 2872 2873 lexer->insert = NULL; /* defer implicit inline start tags */ 2874 2875 if ( nodeIsTEXTAREA(field) ) 2876 mode = Preformatted; 2877 else 2878 mode = MixedContent; /* kludge for font tags */ 2879 2880 while ((node = TY_(GetToken)(doc, mode)) != NULL) 2881 { 2882 if (node->tag == field->tag && node->type == EndTag) 2883 { 2884 TY_(FreeNode)( doc, node); 2885 field->closed = yes; 2886 TrimSpaces(doc, field); 2887 return; 2888 } 2889 2890 /* deal with comments etc. */ 2891 if (InsertMisc(field, node)) 2892 continue; 2893 2894 if (TY_(nodeIsText)(node)) 2895 { 2896 /* only called for 1st child */ 2897 if (field->content == NULL && !(mode & Preformatted)) 2898 TrimSpaces(doc, field); 2899 2900 if (node->start >= node->end) 2901 { 2902 TY_(FreeNode)( doc, node); 2903 continue; 2904 } 2905 2906 TY_(InsertNodeAtEnd)(field, node); 2907 continue; 2908 } 2909 2910 /* for textarea should all cases of < and & be escaped? */ 2911 2912 /* discard inline tags e.g. 
font */ 2913 if ( node->tag 2914 && node->tag->model & CM_INLINE 2915 && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */ 2916 { 2917 TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED); 2918 TY_(FreeNode)( doc, node); 2919 continue; 2920 } 2921 2922 /* terminate element on other tags */ 2923 if (!(field->tag->model & CM_OPT)) 2924 TY_(ReportError)(doc, field, node, MISSING_ENDTAG_BEFORE); 2925 2926 TY_(UngetToken)( doc ); 2927 TrimSpaces(doc, field); 2928 return; 2929 } 2930 2931 if (!(field->tag->model & CM_OPT)) 2932 TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR); 2933} 2934 2935 2936void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode)) 2937{ 2938 Node *node; 2939 while ((node = TY_(GetToken)(doc, MixedContent)) != NULL) 2940 { 2941 if (node->tag == title->tag && node->type == StartTag) 2942 { 2943 TY_(ReportError)(doc, title, node, COERCE_TO_ENDTAG); 2944 node->type = EndTag; 2945 TY_(UngetToken)( doc ); 2946 continue; 2947 } 2948 else if (node->tag == title->tag && node->type == EndTag) 2949 { 2950 TY_(FreeNode)( doc, node); 2951 title->closed = yes; 2952 TrimSpaces(doc, title); 2953 return; 2954 } 2955 2956 if (TY_(nodeIsText)(node)) 2957 { 2958 /* only called for 1st child */ 2959 if (title->content == NULL) 2960 TrimInitialSpace(doc, title, node); 2961 2962 if (node->start >= node->end) 2963 { 2964 TY_(FreeNode)( doc, node); 2965 continue; 2966 } 2967 2968 TY_(InsertNodeAtEnd)(title, node); 2969 continue; 2970 } 2971 2972 /* deal with comments etc. 
*/ 2973 if (InsertMisc(title, node)) 2974 continue; 2975 2976 /* discard unknown tags */ 2977 if (node->tag == NULL) 2978 { 2979 TY_(ReportError)(doc, title, node, DISCARDING_UNEXPECTED); 2980 TY_(FreeNode)( doc, node); 2981 continue; 2982 } 2983 2984 /* pushback unexpected tokens */ 2985 TY_(ReportError)(doc, title, node, MISSING_ENDTAG_BEFORE); 2986 TY_(UngetToken)( doc ); 2987 TrimSpaces(doc, title); 2988 return; 2989 } 2990 2991 TY_(ReportError)(doc, title, node, MISSING_ENDTAG_FOR); 2992} 2993 2994/* 2995 This isn't quite right for CDATA content as it recognises 2996 tags within the content and parses them accordingly. 2997 This will unfortunately screw up scripts which include 2998 < + letter, < + !, < + ? or < + / + letter 2999*/ 3000 3001void TY_(ParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode)) 3002{ 3003 Node *node; 3004 3005 doc->lexer->parent = script; 3006 node = TY_(GetToken)(doc, CdataContent); 3007 doc->lexer->parent = NULL; 3008 3009 if (node) 3010 { 3011 TY_(InsertNodeAtEnd)(script, node); 3012 } 3013 else 3014 { 3015 /* handle e.g. 
a document like "<script>" */ 3016 TY_(ReportError)(doc, script, NULL, MISSING_ENDTAG_FOR); 3017 return; 3018 } 3019 3020 node = TY_(GetToken)(doc, IgnoreWhitespace); 3021 3022 if (!(node && node->type == EndTag && node->tag && 3023 node->tag->id == script->tag->id)) 3024 { 3025 TY_(ReportError)(doc, script, node, MISSING_ENDTAG_FOR); 3026 3027 if (node) 3028 TY_(UngetToken)(doc); 3029 } 3030 else 3031 { 3032 TY_(FreeNode)(doc, node); 3033 } 3034} 3035 3036Bool TY_(IsJavaScript)(Node *node) 3037{ 3038 Bool result = no; 3039 AttVal *attr; 3040 3041 if (node->attributes == NULL) 3042 return yes; 3043 3044 for (attr = node->attributes; attr; attr = attr->next) 3045 { 3046 if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr)) 3047 && AttrContains(attr, "javascript") ) 3048 { 3049 result = yes; 3050 break; 3051 } 3052 } 3053 3054 return result; 3055} 3056 3057void TY_(ParseHead)(TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode)) 3058{ 3059 Lexer* lexer = doc->lexer; 3060 Node *node; 3061 int HasTitle = 0; 3062 int HasBase = 0; 3063 3064 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) 3065 { 3066 if (node->tag == head->tag && node->type == EndTag) 3067 { 3068 TY_(FreeNode)( doc, node); 3069 head->closed = yes; 3070 break; 3071 } 3072 3073 /* find and discard multiple <head> elements */ 3074 /* find and discard <html> in <head> elements */ 3075 if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag) 3076 { 3077 TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED); 3078 TY_(FreeNode)(doc, node); 3079 continue; 3080 } 3081 3082 if (TY_(nodeIsText)(node)) 3083 { 3084 TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN); 3085 TY_(UngetToken)( doc ); 3086 break; 3087 } 3088 3089 if (node->type == ProcInsTag && node->element && 3090 TY_(tmbstrcmp)(node->element, "xml-stylesheet") == 0) 3091 { 3092 TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN); 3093 TY_(InsertNodeBeforeElement)(TY_(FindHTML)(doc), node); 3094 continue; 3095 } 
3096 3097 /* deal with comments etc. */ 3098 if (InsertMisc(head, node)) 3099 continue; 3100 3101 if (node->type == DocTypeTag) 3102 { 3103 InsertDocType(doc, head, node); 3104 continue; 3105 } 3106 3107 /* discard unknown tags */ 3108 if (node->tag == NULL) 3109 { 3110 TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED); 3111 TY_(FreeNode)( doc, node); 3112 continue; 3113 } 3114 3115 /* 3116 if it doesn't belong in the head then 3117 treat as implicit end of head and deal 3118 with as part of the body 3119 */ 3120 if (!(node->tag->model & CM_HEAD)) 3121 { 3122 /* #545067 Implicit closing of head broken - warn only for XHTML input */ 3123 if ( lexer->isvoyager ) 3124 TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN ); 3125 TY_(UngetToken)( doc ); 3126 break; 3127 } 3128 3129 if (TY_(nodeIsElement)(node)) 3130 { 3131 if ( nodeIsTITLE(node) ) 3132 { 3133 ++HasTitle; 3134 3135 if (HasTitle > 1) 3136 if (head) 3137 TY_(ReportError)(doc, head, node, TOO_MANY_ELEMENTS_IN); 3138 else 3139 TY_(ReportError)(doc, head, node, TOO_MANY_ELEMENTS); 3140 } 3141 else if ( nodeIsBASE(node) ) 3142 { 3143 ++HasBase; 3144 3145 if (HasBase > 1) 3146 if (head) 3147 TY_(ReportError)(doc, head, node, TOO_MANY_ELEMENTS_IN); 3148 else 3149 TY_(ReportError)(doc, head, node, TOO_MANY_ELEMENTS); 3150 } 3151 else if ( nodeIsNOSCRIPT(node) ) 3152 { 3153 TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN); 3154 } 3155 3156#ifdef AUTO_INPUT_ENCODING 3157 else if (nodeIsMETA(node)) 3158 { 3159 AttVal * httpEquiv = AttrGetById(node, TidyAttr_HTTP_EQUIV); 3160 AttVal * content = AttrGetById(node, TidyAttr_CONTENT); 3161 if (httpEquiv && AttrValueIs(httpEquiv, "Content-Type") && AttrHasValue(content)) 3162 { 3163 tmbstr val, charset; 3164 uint end = 0; 3165 val = charset = TY_(tmbstrdup)(content->value); 3166 val = TY_(tmbstrtolower)(val); 3167 val = strstr(content->value, "charset"); 3168 3169 if (val) 3170 val += 7; 3171 3172 while(val && *val && (TY_(IsWhite)((tchar)*val) || 3173 *val == 
'=' || *val == '"' || *val == '\'')) 3174 ++val; 3175 3176 while(val && val[end] && !(TY_(IsWhite)((tchar)val[end]) || 3177 val[end] == '"' || val[end] == '\'' || val[end] == ';')) 3178 ++end; 3179 3180 if (val && end) 3181 { 3182 tmbstr encoding = TY_(tmbstrndup)(val, end); 3183 uint id = TY_(GetEncodingIdFromName)(encoding); 3184 3185 /* todo: detect mismatch with BOM/XMLDecl/declared */ 3186 /* todo: error for unsupported encodings */ 3187 /* todo: try to re-init transcoder */ 3188 /* todo: change input/output encoding settings */ 3189 /* todo: store id in StreamIn */ 3190 3191 MemFree(encoding); 3192 } 3193 3194 MemFree(charset); 3195 } 3196 } 3197#endif /* AUTO_INPUT_ENCODING */ 3198 3199 TY_(InsertNodeAtEnd)(head, node); 3200 ParseTag(doc, node, IgnoreWhitespace); 3201 continue; 3202 } 3203 3204 /* discard unexpected text nodes and end tags */ 3205 TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED); 3206 TY_(FreeNode)( doc, node); 3207 } 3208} 3209 3210void TY_(ParseBody)(TidyDocImpl* doc, Node *body, GetTokenMode mode) 3211{ 3212 Lexer* lexer = doc->lexer; 3213 Node *node; 3214 Bool checkstack, iswhitenode; 3215 3216 mode = IgnoreWhitespace; 3217 checkstack = yes; 3218 3219 TY_(BumpObject)( doc, body->parent ); 3220 3221 while ((node = TY_(GetToken)(doc, mode)) != NULL) 3222 { 3223 /* find and discard multiple <body> elements */ 3224 if (node->tag == body->tag && node->type == StartTag) 3225 { 3226 TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED); 3227 TY_(FreeNode)(doc, node); 3228 continue; 3229 } 3230 3231 /* #538536 Extra endtags not detected */ 3232 if ( nodeIsHTML(node) ) 3233 { 3234 if (TY_(nodeIsElement)(node) || lexer->seenEndHtml) 3235 TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED); 3236 else 3237 lexer->seenEndHtml = 1; 3238 3239 TY_(FreeNode)( doc, node); 3240 continue; 3241 } 3242 3243 if ( lexer->seenEndBody && 3244 ( node->type == StartTag || 3245 node->type == EndTag || 3246 node->type == StartEndTag ) ) 3247 { 3248 
TY_(ReportError)(doc, body, node, CONTENT_AFTER_BODY ); 3249 } 3250 3251 if ( node->tag == body->tag && node->type == EndTag ) 3252 { 3253 body->closed = yes; 3254 TrimSpaces(doc, body); 3255 TY_(FreeNode)( doc, node); 3256 lexer->seenEndBody = 1; 3257 mode = IgnoreWhitespace; 3258 3259 if ( nodeIsNOFRAMES(body->parent) ) 3260 break; 3261 3262 continue; 3263 } 3264 3265 if ( nodeIsNOFRAMES(node) ) 3266 { 3267 if (node->type == StartTag) 3268 { 3269 TY_(InsertNodeAtEnd)(body, node); 3270 TY_(ParseBlock)(doc, node, mode); 3271 continue; 3272 } 3273 3274 if (node->type == EndTag && nodeIsNOFRAMES(body->parent) ) 3275 { 3276 TrimSpaces(doc, body); 3277 TY_(UngetToken)( doc ); 3278 break; 3279 } 3280 } 3281 3282 if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node)) 3283 && nodeIsNOFRAMES(body->parent) ) 3284 { 3285 TrimSpaces(doc, body); 3286 TY_(UngetToken)( doc ); 3287 break; 3288 } 3289 3290 iswhitenode = no; 3291 3292 if ( TY_(nodeIsText)(node) && 3293 node->end <= node->start + 1 && 3294 lexer->lexbuf[node->start] == ' ' ) 3295 iswhitenode = yes; 3296 3297 /* deal with comments etc. 
*/ 3298 if (InsertMisc(body, node)) 3299 continue; 3300 3301 /* #538536 Extra endtags not detected */ 3302#if 0 3303 if ( lexer->seenEndBody == 1 && !iswhitenode ) 3304 { 3305 ++lexer->seenEndBody; 3306 TY_(ReportError)(doc, body, node, CONTENT_AFTER_BODY); 3307 } 3308#endif 3309 3310 /* mixed content model permits text */ 3311 if (TY_(nodeIsText)(node)) 3312 { 3313 if (iswhitenode && mode == IgnoreWhitespace) 3314 { 3315 TY_(FreeNode)( doc, node); 3316 continue; 3317 } 3318 3319 /* HTML 2 and HTML4 strict don't allow text here */ 3320 TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20)); 3321 3322 if (checkstack) 3323 { 3324 checkstack = no; 3325 3326 if ( TY_(InlineDup)(doc, node) > 0 ) 3327 continue; 3328 } 3329 3330 TY_(InsertNodeAtEnd)(body, node); 3331 mode = MixedContent; 3332 continue; 3333 } 3334 3335 if (node->type == DocTypeTag) 3336 { 3337 InsertDocType(doc, body, node); 3338 continue; 3339 } 3340 /* discard unknown and PARAM tags */ 3341 if ( node->tag == NULL || nodeIsPARAM(node) ) 3342 { 3343 TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED); 3344 TY_(FreeNode)( doc, node); 3345 continue; 3346 } 3347 3348 /* 3349 Netscape allows LI and DD directly in BODY 3350 We infer UL or DL respectively and use this 3351 Bool to exclude block-level elements so as 3352 to match Netscape's observed behaviour. 
3353 */ 3354 lexer->excludeBlocks = no; 3355 3356 if ( nodeIsINPUT(node) || 3357 (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE)) 3358 ) 3359 { 3360 /* avoid this error message being issued twice */ 3361 if (!(node->tag->model & CM_HEAD)) 3362 TY_(ReportError)(doc, body, node, TAG_NOT_ALLOWED_IN); 3363 3364 if (node->tag->model & CM_HTML) 3365 { 3366 /* copy body attributes if current body was inferred */ 3367 if ( nodeIsBODY(node) && body->implicit 3368 && body->attributes == NULL ) 3369 { 3370 body->attributes = node->attributes; 3371 node->attributes = NULL; 3372 } 3373 3374 TY_(FreeNode)( doc, node); 3375 continue; 3376 } 3377 3378 if (node->tag->model & CM_HEAD) 3379 { 3380 MoveToHead(doc, body, node); 3381 continue; 3382 } 3383 3384 if (node->tag->model & CM_LIST) 3385 { 3386 TY_(UngetToken)( doc ); 3387 node = TY_(InferredTag)(doc, TidyTag_UL); 3388 AddClassNoIndent(doc, node); 3389 lexer->excludeBlocks = yes; 3390 } 3391 else if (node->tag->model & CM_DEFLIST) 3392 { 3393 TY_(UngetToken)( doc ); 3394 node = TY_(InferredTag)(doc, TidyTag_DL); 3395 lexer->excludeBlocks = yes; 3396 } 3397 else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW)) 3398 { 3399 TY_(UngetToken)( doc ); 3400 node = TY_(InferredTag)(doc, TidyTag_TABLE); 3401 lexer->excludeBlocks = yes; 3402 } 3403 else if ( nodeIsINPUT(node) ) 3404 { 3405 TY_(UngetToken)( doc ); 3406 node = TY_(InferredTag)(doc, TidyTag_FORM); 3407 lexer->excludeBlocks = yes; 3408 } 3409 else 3410 { 3411 if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) ) 3412 { 3413 TY_(UngetToken)( doc ); 3414 return; 3415 } 3416 3417 /* ignore </td> </th> <option> etc. 
*/ 3418 TY_(FreeNode)( doc, node ); 3419 continue; 3420 } 3421 } 3422 3423 if (node->type == EndTag) 3424 { 3425 if ( nodeIsBR(node) ) 3426 node->type = StartTag; 3427 else if ( nodeIsP(node) ) 3428 { 3429 node->type = StartEndTag; 3430 node->implicit = yes; 3431#if OBSOLETE 3432 TY_(CoerceNode)(doc, node, TidyTag_BR, no, no); 3433 FreeAttrs( doc, node ); /* discard align attribute etc. */ 3434 TY_(InsertNodeAtEnd)(body, node); 3435 node = TY_(InferredTag)(doc, TidyTag_BR); 3436#endif 3437 } 3438 else if ( TY_(nodeHasCM)(node, CM_INLINE) ) 3439 TY_(PopInline)( doc, node ); 3440 } 3441 3442 if (TY_(nodeIsElement)(node)) 3443 { 3444 if ( TY_(nodeHasCM)(node, CM_INLINE) && !TY_(nodeHasCM)(node, CM_MIXED) ) 3445 { 3446 /* HTML4 strict doesn't allow inline content here */ 3447 /* but HTML2 does allow img elements as children of body */ 3448 if ( nodeIsIMG(node) ) 3449 TY_(ConstrainVersion)(doc, ~VERS_HTML40_STRICT); 3450 else 3451 TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT|VERS_HTML20)); 3452 3453 if (checkstack && !node->implicit) 3454 { 3455 checkstack = no; 3456 3457 if ( TY_(InlineDup)(doc, node) > 0 ) 3458 continue; 3459 } 3460 3461 mode = MixedContent; 3462 } 3463 else 3464 { 3465 checkstack = yes; 3466 mode = IgnoreWhitespace; 3467 } 3468 3469 if (node->implicit) 3470 TY_(ReportError)(doc, body, node, INSERTING_TAG); 3471 3472 TY_(InsertNodeAtEnd)(body, node); 3473 ParseTag(doc, node, mode); 3474 continue; 3475 } 3476 3477 /* discard unexpected tags */ 3478 TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED); 3479 TY_(FreeNode)( doc, node); 3480 } 3481} 3482 3483void TY_(ParseNoFrames)(TidyDocImpl* doc, Node *noframes, GetTokenMode mode) 3484{ 3485 Lexer* lexer = doc->lexer; 3486 Node *node; 3487 3488 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) 3489 { 3490 doc->badAccess |= USING_NOFRAMES; 3491 } 3492 mode = IgnoreWhitespace; 3493 3494 while ( (node = TY_(GetToken)(doc, mode)) != NULL ) 3495 { 3496 if ( node->tag == noframes->tag && node->type == 
EndTag ) 3497 { 3498 TY_(FreeNode)( doc, node); 3499 noframes->closed = yes; 3500 TrimSpaces(doc, noframes); 3501 return; 3502 } 3503 3504 if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) ) 3505 { 3506 TrimSpaces(doc, noframes); 3507 if (node->type == EndTag) 3508 { 3509 TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED); 3510 TY_(FreeNode)( doc, node); /* Throw it away */ 3511 } 3512 else 3513 { 3514 TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_BEFORE); 3515 TY_(UngetToken)( doc ); 3516 } 3517 return; 3518 } 3519 3520 if ( nodeIsHTML(node) ) 3521 { 3522 if (TY_(nodeIsElement)(node)) 3523 TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED); 3524 3525 TY_(FreeNode)( doc, node); 3526 continue; 3527 } 3528 3529 /* deal with comments etc. */ 3530 if (InsertMisc(noframes, node)) 3531 continue; 3532 3533 if ( nodeIsBODY(node) && node->type == StartTag ) 3534 { 3535 Bool seen_body = lexer->seenEndBody; 3536 TY_(InsertNodeAtEnd)(noframes, node); 3537 ParseTag(doc, node, IgnoreWhitespace /*MixedContent*/); 3538 3539 /* fix for bug http://tidy.sf.net/bug/887259 */ 3540 if (seen_body && TY_(FindBody)(doc) != node) 3541 { 3542 TY_(CoerceNode)(doc, node, TidyTag_DIV, no, no); 3543 MoveNodeToBody(doc, node); 3544 } 3545 continue; 3546 } 3547 3548 /* implicit body element inferred */ 3549 if (TY_(nodeIsText)(node) || (node->tag && node->type != EndTag)) 3550 { 3551 Node *body = TY_(FindBody)( doc ); 3552 if ( body || lexer->seenEndBody ) 3553 { 3554 if ( body == NULL ) 3555 { 3556 TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED); 3557 TY_(FreeNode)( doc, node); 3558 continue; 3559 } 3560 if ( TY_(nodeIsText)(node) ) 3561 { 3562 TY_(UngetToken)( doc ); 3563 node = TY_(InferredTag)(doc, TidyTag_P); 3564 TY_(ReportError)(doc, noframes, node, CONTENT_AFTER_BODY ); 3565 } 3566 TY_(InsertNodeAtEnd)( body, node ); 3567 } 3568 else 3569 { 3570 TY_(UngetToken)( doc ); 3571 node = TY_(InferredTag)(doc, TidyTag_BODY); 3572 if ( cfgBool(doc, TidyXmlOut) ) 
3573 TY_(ReportError)(doc, noframes, node, INSERTING_TAG); 3574 TY_(InsertNodeAtEnd)( noframes, node ); 3575 } 3576 3577 ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ ); 3578 continue; 3579 } 3580 3581 /* discard unexpected end tags */ 3582 TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED); 3583 TY_(FreeNode)( doc, node); 3584 } 3585 3586 TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_FOR); 3587} 3588 3589void TY_(ParseFrameSet)(TidyDocImpl* doc, Node *frameset, GetTokenMode ARG_UNUSED(mode)) 3590{ 3591 Lexer* lexer = doc->lexer; 3592 Node *node; 3593 3594 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) 3595 { 3596 doc->badAccess |= USING_FRAMES; 3597 } 3598 3599 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) 3600 { 3601 if (node->tag == frameset->tag && node->type == EndTag) 3602 { 3603 TY_(FreeNode)( doc, node); 3604 frameset->closed = yes; 3605 TrimSpaces(doc, frameset); 3606 return; 3607 } 3608 3609 /* deal with comments etc. */ 3610 if (InsertMisc(frameset, node)) 3611 continue; 3612 3613 if (node->tag == NULL) 3614 { 3615 TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED); 3616 TY_(FreeNode)( doc, node); 3617 continue; 3618 } 3619 3620 if (TY_(nodeIsElement)(node)) 3621 { 3622 if (node->tag && node->tag->model & CM_HEAD) 3623 { 3624 MoveToHead(doc, frameset, node); 3625 continue; 3626 } 3627 } 3628 3629 if ( nodeIsBODY(node) ) 3630 { 3631 TY_(UngetToken)( doc ); 3632 node = TY_(InferredTag)(doc, TidyTag_NOFRAMES); 3633 TY_(ReportError)(doc, frameset, node, INSERTING_TAG); 3634 } 3635 3636 if (node->type == StartTag && (node->tag->model & CM_FRAMES)) 3637 { 3638 TY_(InsertNodeAtEnd)(frameset, node); 3639 lexer->excludeBlocks = no; 3640 ParseTag(doc, node, MixedContent); 3641 continue; 3642 } 3643 else if (node->type == StartEndTag && (node->tag->model & CM_FRAMES)) 3644 { 3645 TY_(InsertNodeAtEnd)(frameset, node); 3646 continue; 3647 } 3648 3649 /* discard unexpected tags */ 3650 TY_(ReportError)(doc, 
frameset, node, DISCARDING_UNEXPECTED); 3651 TY_(FreeNode)( doc, node); 3652 } 3653 3654 TY_(ReportError)(doc, frameset, node, MISSING_ENDTAG_FOR); 3655} 3656 3657void TY_(ParseHTML)(TidyDocImpl* doc, Node *html, GetTokenMode mode) 3658{ 3659 Node *node, *head; 3660 Node *frameset = NULL; 3661 Node *noframes = NULL; 3662 3663 TY_(SetOptionBool)( doc, TidyXmlTags, no ); 3664 3665 for (;;) 3666 { 3667 node = TY_(GetToken)(doc, IgnoreWhitespace); 3668 3669 if (node == NULL) 3670 { 3671 node = TY_(InferredTag)(doc, TidyTag_HEAD); 3672 break; 3673 } 3674 3675 if ( nodeIsHEAD(node) ) 3676 break; 3677 3678 if (node->tag == html->tag && node->type == EndTag) 3679 { 3680 TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED); 3681 TY_(FreeNode)( doc, node); 3682 continue; 3683 } 3684 3685 /* find and discard multiple <html> elements */ 3686 if (node->tag == html->tag && node->type == StartTag) 3687 { 3688 TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED); 3689 TY_(FreeNode)(doc, node); 3690 continue; 3691 } 3692 3693 /* deal with comments etc. */ 3694 if (InsertMisc(html, node)) 3695 continue; 3696 3697 TY_(UngetToken)( doc ); 3698 node = TY_(InferredTag)(doc, TidyTag_HEAD); 3699 break; 3700 } 3701 3702 head = node; 3703 TY_(InsertNodeAtEnd)(html, head); 3704 TY_(ParseHead)(doc, head, mode); 3705 3706 for (;;) 3707 { 3708 node = TY_(GetToken)(doc, IgnoreWhitespace); 3709 3710 if (node == NULL) 3711 { 3712 if (frameset == NULL) /* implied body */ 3713 { 3714 node = TY_(InferredTag)(doc, TidyTag_BODY); 3715 TY_(InsertNodeAtEnd)(html, node); 3716 TY_(ParseBody)(doc, node, mode); 3717 } 3718 3719 return; 3720 } 3721 3722 /* robustly handle html tags */ 3723 if (node->tag == html->tag) 3724 { 3725 if (node->type != StartTag && frameset == NULL) 3726 TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED); 3727 3728 TY_(FreeNode)( doc, node); 3729 continue; 3730 } 3731 3732 /* deal with comments etc. 
*/ 3733 if (InsertMisc(html, node)) 3734 continue; 3735 3736 /* if frameset document coerce <body> to <noframes> */ 3737 if ( nodeIsBODY(node) ) 3738 { 3739 if (node->type != StartTag) 3740 { 3741 TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED); 3742 TY_(FreeNode)( doc, node); 3743 continue; 3744 } 3745 3746 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) 3747 { 3748 if (frameset != NULL) 3749 { 3750 TY_(UngetToken)( doc ); 3751 3752 if (noframes == NULL) 3753 { 3754 noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES); 3755 TY_(InsertNodeAtEnd)(frameset, noframes); 3756 TY_(ReportError)(doc, html, noframes, INSERTING_TAG); 3757 } 3758 else 3759 { 3760 if (noframes->type == StartEndTag) 3761 noframes->type = StartTag; 3762 } 3763 3764 ParseTag(doc, noframes, mode); 3765 continue; 3766 } 3767 } 3768 3769 TY_(ConstrainVersion)(doc, ~VERS_FRAMESET); 3770 break; /* to parse body */ 3771 } 3772 3773 /* flag an error if we see more than one frameset */ 3774 if ( nodeIsFRAMESET(node) ) 3775 { 3776 if (node->type != StartTag) 3777 { 3778 TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED); 3779 TY_(FreeNode)( doc, node); 3780 continue; 3781 } 3782 3783 if (frameset != NULL) 3784 TY_(ReportFatal)(doc, html, node, DUPLICATE_FRAMESET); 3785 else 3786 frameset = node; 3787 3788 TY_(InsertNodeAtEnd)(html, node); 3789 ParseTag(doc, node, mode); 3790 3791 /* 3792 see if it includes a noframes element so 3793 that we can merge subsequent noframes elements 3794 */ 3795 3796 for (node = frameset->content; node; node = node->next) 3797 { 3798 if ( nodeIsNOFRAMES(node) ) 3799 noframes = node; 3800 } 3801 continue; 3802 } 3803 3804 /* if not a frameset document coerce <noframes> to <body> */ 3805 if ( nodeIsNOFRAMES(node) ) 3806 { 3807 if (node->type != StartTag) 3808 { 3809 TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED); 3810 TY_(FreeNode)( doc, node); 3811 continue; 3812 } 3813 3814 if (frameset == NULL) 3815 { 3816 TY_(ReportError)(doc, html, node, 
DISCARDING_UNEXPECTED); 3817 TY_(FreeNode)( doc, node); 3818 node = TY_(InferredTag)(doc, TidyTag_BODY); 3819 break; 3820 } 3821 3822 if (noframes == NULL) 3823 { 3824 noframes = node; 3825 TY_(InsertNodeAtEnd)(frameset, noframes); 3826 } 3827 else 3828 TY_(FreeNode)( doc, node); 3829 3830 ParseTag(doc, noframes, mode); 3831 continue; 3832 } 3833 3834 if (TY_(nodeIsElement)(node)) 3835 { 3836 if (node->tag && node->tag->model & CM_HEAD) 3837 { 3838 MoveToHead(doc, html, node); 3839 continue; 3840 } 3841 3842 /* discard illegal frame element following a frameset */ 3843 if ( frameset != NULL && nodeIsFRAME(node) ) 3844 { 3845 TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED); 3846 TY_(FreeNode)(doc, node); 3847 continue; 3848 } 3849 } 3850 3851 TY_(UngetToken)( doc ); 3852 3853 /* insert other content into noframes element */ 3854 3855 if (frameset) 3856 { 3857 if (noframes == NULL) 3858 { 3859 noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES); 3860 TY_(InsertNodeAtEnd)(frameset, noframes); 3861 } 3862 else 3863 { 3864 TY_(ReportError)(doc, html, node, NOFRAMES_CONTENT); 3865 if (noframes->type == StartEndTag) 3866 noframes->type = StartTag; 3867 } 3868 3869 TY_(ConstrainVersion)(doc, VERS_FRAMESET); 3870 ParseTag(doc, noframes, mode); 3871 continue; 3872 } 3873 3874 node = TY_(InferredTag)(doc, TidyTag_BODY); 3875 TY_(ConstrainVersion)(doc, ~VERS_FRAMESET); 3876 break; 3877 } 3878 3879 /* node must be body */ 3880 3881 TY_(InsertNodeAtEnd)(html, node); 3882 ParseTag(doc, node, mode); 3883} 3884 3885static Bool nodeCMIsOnlyInline( Node* node ) 3886{ 3887 return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK ); 3888} 3889 3890static void EncloseBodyText(TidyDocImpl* doc) 3891{ 3892 Node* node; 3893 Node* body = TY_(FindBody)(doc); 3894 3895 if (!body) 3896 return; 3897 3898 node = body->content; 3899 3900 while (node) 3901 { 3902 if ((TY_(nodeIsText)(node) && !TY_(IsBlank)(doc->lexer, node)) || 3903 (TY_(nodeIsElement)(node) && 
nodeCMIsOnlyInline(node))) 3904 { 3905 Node* p = TY_(InferredTag)(doc, TidyTag_P); 3906 TY_(InsertNodeBeforeElement)(node, p); 3907 while (node && (!TY_(nodeIsElement)(node) || nodeCMIsOnlyInline(node))) 3908 { 3909 Node* next = node->next; 3910 TY_(RemoveNode)(node); 3911 TY_(InsertNodeAtEnd)(p, node); 3912 node = next; 3913 } 3914 TrimSpaces(doc, p); 3915 continue; 3916 } 3917 node = node->next; 3918 } 3919} 3920 3921/* <form>, <blockquote> and <noscript> do not allow #PCDATA in 3922 HTML 4.01 Strict (%block; model instead of %flow;). 3923 When requested, text nodes in these elements are wrapped in <p>. */ 3924static void EncloseBlockText(TidyDocImpl* doc, Node* node) 3925{ 3926 Node *next; 3927 Node *block; 3928 3929 while (node) 3930 { 3931 next = node->next; 3932 3933 if (node->content) 3934 EncloseBlockText(doc, node->content); 3935 3936 if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) || 3937 nodeIsBLOCKQUOTE(node)) 3938 || !node->content) 3939 { 3940 node = next; 3941 continue; 3942 } 3943 3944 block = node->content; 3945 3946 if ((TY_(nodeIsText)(block) && !TY_(IsBlank)(doc->lexer, block)) || 3947 (TY_(nodeIsElement)(block) && nodeCMIsOnlyInline(block))) 3948 { 3949 Node* p = TY_(InferredTag)(doc, TidyTag_P); 3950 TY_(InsertNodeBeforeElement)(block, p); 3951 while (block && 3952 (!TY_(nodeIsElement)(block) || nodeCMIsOnlyInline(block))) 3953 { 3954 Node* tempNext = block->next; 3955 TY_(RemoveNode)(block); 3956 TY_(InsertNodeAtEnd)(p, block); 3957 block = tempNext; 3958 } 3959 TrimSpaces(doc, p); 3960 continue; 3961 } 3962 3963 node = next; 3964 } 3965} 3966 3967static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node) 3968{ 3969 Node *next; 3970 3971 while (node) 3972 { 3973 next = node->next; 3974 3975 if (nodeIsDIR(node) || nodeIsMENU(node)) 3976 TY_(CoerceNode)(doc, node, TidyTag_UL, yes, yes); 3977 3978 if (nodeIsXMP(node) || nodeIsLISTING(node) || 3979 (node->tag && node->tag->id == TidyTag_PLAINTEXT)) 3980 TY_(CoerceNode)(doc, node, 
TidyTag_PRE, yes, yes); 3981 3982 if (node->content) 3983 ReplaceObsoleteElements(doc, node->content); 3984 3985 node = next; 3986 } 3987} 3988 3989static void AttributeChecks(TidyDocImpl* doc, Node* node) 3990{ 3991 Node *next; 3992 3993 while (node) 3994 { 3995 next = node->next; 3996 3997 if (TY_(nodeIsElement)(node)) 3998 { 3999 if (node->tag->chkattrs) 4000 node->tag->chkattrs(doc, node); 4001 else 4002 TY_(CheckAttributes)(doc, node); 4003 } 4004 4005 if (node->content) 4006 AttributeChecks(doc, node->content); 4007 4008 node = next; 4009 } 4010} 4011 4012/* Apple Changes: 4013 2007-02-02 iccir If TidySanitizeAgainstXSS is set, remove elements which could load external content 4014*/ 4015#ifdef TIDY_APPLE_CHANGES 4016static void SanitizeNodesAgainstXSS(TidyDocImpl* doc, Node* node) 4017{ 4018 Node *next; 4019 4020 Bool isXml = cfgBool( doc, TidyXmlTags ); 4021 Bool shouldRemoveElement; 4022 4023 while (node) 4024 { 4025 next = node->next; 4026 4027 if (!isXml) 4028 { 4029 shouldRemoveElement = TY_(nodeIsFRAMESET) (node) || 4030 TY_(nodeIsSCRIPT) (node) || 4031 TY_(nodeIsIFRAME) (node) || 4032 TY_(nodeIsOBJECT) (node) || 4033 TY_(nodeIsFRAME) (node) || 4034 TY_(nodeIsEMBED) (node) || 4035 TY_(nodeIsSTYLE) (node) || 4036 TY_(nodeIsLINK) (node) || 4037 TY_(nodeIsMETA) (node) || 4038 TY_(nodeIsAPPLET) (node) ; 4039 } 4040 else 4041 { 4042 /* When the content was parsed as XML, the tag identifiers all point at a generic XML tag identifier 4043 with an unknown tag name, so we need to manually compare the tag names with the bad set of tags. 
*/ 4044 shouldRemoveElement = node->element && (!TY_(tmbstrcasecmp)(node->element, "frameset") || 4045 !TY_(tmbstrcasecmp)(node->element, "script") || 4046 !TY_(tmbstrcasecmp)(node->element, "iframe") || 4047 !TY_(tmbstrcasecmp)(node->element, "object") || 4048 !TY_(tmbstrcasecmp)(node->element, "frame") || 4049 !TY_(tmbstrcasecmp)(node->element, "embed") || 4050 !TY_(tmbstrcasecmp)(node->element, "style") || 4051 !TY_(tmbstrcasecmp)(node->element, "link") || 4052 !TY_(tmbstrcasecmp)(node->element, "meta") || 4053 !TY_(tmbstrcasecmp)(node->element, "applet") ); 4054 } 4055 4056 if (shouldRemoveElement) 4057 { 4058 RemoveNode(node); 4059 FreeNode(doc, node); 4060 } 4061 else if (node->content) 4062 { 4063 SanitizeNodesAgainstXSS(doc, node->content); 4064 } 4065 4066 node = next; 4067 } 4068} 4069#endif 4070 4071/* 4072 HTML is the top level element 4073*/ 4074void TY_(ParseDocument)(TidyDocImpl* doc) 4075{ 4076 Node *node, *html, *doctype = NULL; 4077 4078 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) 4079 { 4080 if (node->type == XmlDecl) 4081 { 4082 if (TY_(FindXmlDecl)(doc) && doc->root.content) 4083 { 4084 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED); 4085 TY_(FreeNode)(doc, node); 4086 continue; 4087 } 4088 if (node->line != 1 || (node->line == 1 && node->column != 1)) 4089 { 4090 TY_(ReportError)(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL); 4091 } 4092 } 4093#ifdef AUTO_INPUT_ENCODING 4094 if (node->type == XmlDecl) 4095 { 4096 AttVal* encoding = GetAttrByName(node, "encoding"); 4097 if (AttrHasValue(encoding)) 4098 { 4099 uint id = TY_(GetEncodingIdFromName)(encoding->value); 4100 4101 /* todo: detect mismatch with BOM/XMLDecl/declared */ 4102 /* todo: error for unsupported encodings */ 4103 /* todo: try to re-init transcoder */ 4104 /* todo: change input/output encoding settings */ 4105 /* todo: store id in StreamIn */ 4106 } 4107 } 4108#endif /* AUTO_INPUT_ENCODING */ 4109 4110 /* deal with comments etc. 
*/ 4111 if (InsertMisc( &doc->root, node )) 4112 continue; 4113 4114 if (node->type == DocTypeTag) 4115 { 4116 if (doctype == NULL) 4117 { 4118 TY_(InsertNodeAtEnd)( &doc->root, node); 4119 doctype = node; 4120 } 4121 else 4122 { 4123 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED); 4124 TY_(FreeNode)( doc, node); 4125 } 4126 continue; 4127 } 4128 4129 if (node->type == EndTag) 4130 { 4131 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED); 4132 TY_(FreeNode)( doc, node); 4133 continue; 4134 } 4135 4136 if (node->type == StartTag && nodeIsHTML(node)) 4137 { 4138 AttVal *xmlns; 4139 4140 xmlns = TY_(AttrGetById)(node, TidyAttr_XMLNS); 4141 4142 if (AttrValueIs(xmlns, XHTML_NAMESPACE)) 4143 { 4144 Bool htmlOut = cfgBool( doc, TidyHtmlOut ); 4145 doc->lexer->isvoyager = yes; /* Unless plain HTML */ 4146 TY_(SetOptionBool)( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/ 4147 TY_(SetOptionBool)( doc, TidyXmlOut, !htmlOut ); /* will be XHTML. */ 4148 4149 /* adjust other config options, just as in config.c */ 4150 if ( !htmlOut ) 4151 { 4152 TY_(SetOptionBool)( doc, TidyUpperCaseTags, no ); 4153 TY_(SetOptionBool)( doc, TidyUpperCaseAttrs, no ); 4154 } 4155 } 4156 } 4157 4158 if ( node->type != StartTag || !nodeIsHTML(node) ) 4159 { 4160 TY_(UngetToken)( doc ); 4161 html = TY_(InferredTag)(doc, TidyTag_HTML); 4162 } 4163 else 4164 html = node; 4165 4166 if (!TY_(FindDocType)(doc)) 4167 TY_(ReportError)(doc, NULL, NULL, MISSING_DOCTYPE); 4168 4169 TY_(InsertNodeAtEnd)( &doc->root, html); 4170 TY_(ParseHTML)( doc, html, IgnoreWhitespace ); 4171 break; 4172 } 4173 4174#if SUPPORT_ACCESSIBILITY_CHECKS 4175 /* do this before any more document fixes */ 4176 if ( cfg( doc, TidyAccessibilityCheckLevel ) > 0 ) 4177 TY_(AccessibilityChecks)( doc ); 4178#endif /* #if SUPPORT_ACCESSIBILITY_CHECKS */ 4179 4180 if (!TY_(FindHTML)(doc)) 4181 { 4182 /* a later check should complain if <body> is empty */ 4183 html = TY_(InferredTag)(doc, 
TidyTag_HTML); 4184 TY_(InsertNodeAtEnd)( &doc->root, html); 4185 TY_(ParseHTML)(doc, html, IgnoreWhitespace); 4186 } 4187 4188 if (!TY_(FindTITLE)(doc)) 4189 { 4190 Node* head = TY_(FindHEAD)(doc); 4191 TY_(ReportError)(doc, head, NULL, MISSING_TITLE_ELEMENT); 4192 TY_(InsertNodeAtEnd)(head, TY_(InferredTag)(doc, TidyTag_TITLE)); 4193 } 4194 4195#ifdef TIDY_APPLE_CHANGES 4196 if (cfgBool(doc, TidySanitizeAgainstXSS)) 4197 SanitizeNodesAgainstXSS(doc, &doc->root); 4198#endif 4199 AttributeChecks(doc, &doc->root); 4200 ReplaceObsoleteElements(doc, &doc->root); 4201 TY_(DropEmptyElements)(doc, &doc->root); 4202 CleanSpaces(doc, &doc->root); 4203 4204 if (cfgBool(doc, TidyEncloseBodyText)) 4205 EncloseBodyText(doc); 4206 if (cfgBool(doc, TidyEncloseBlockText)) 4207 EncloseBlockText(doc, &doc->root); 4208} 4209 4210Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element) 4211{ 4212 AttVal *attribute; 4213 4214 /* search attributes for xml:space */ 4215 for (attribute = element->attributes; attribute; attribute = attribute->next) 4216 { 4217 if (attrIsXML_SPACE(attribute)) 4218 { 4219 if (AttrValueIs(attribute, "preserve")) 4220 return yes; 4221 4222 return no; 4223 } 4224 } 4225 4226 if (element->element == NULL) 4227 return no; 4228 4229 /* kludge for html docs without explicit xml:space attribute */ 4230 if (nodeIsPRE(element) || 4231 nodeIsSCRIPT(element) || 4232 nodeIsSTYLE(element) || 4233 TY_(FindParser)(doc, element) == TY_(ParsePre)) 4234 return yes; 4235 4236 /* kludge for XSL docs */ 4237 if ( TY_(tmbstrcasecmp)(element->element, "xsl:text") == 0 ) 4238 return yes; 4239 4240 return no; 4241} 4242 4243/* 4244 XML documents 4245*/ 4246static void ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode) 4247{ 4248 Lexer* lexer = doc->lexer; 4249 Node *node; 4250 4251 /* if node is pre or has xml:space="preserve" then do so */ 4252 4253 if ( TY_(XMLPreserveWhiteSpace)(doc, element) ) 4254 mode = Preformatted; 4255 4256 while ((node = 
TY_(GetToken)(doc, mode)) != NULL) 4257 { 4258 if (node->type == EndTag && 4259 node->element && element->element && 4260 TY_(tmbstrcmp)(node->element, element->element) == 0) 4261 { 4262 TY_(FreeNode)( doc, node); 4263 element->closed = yes; 4264 break; 4265 } 4266 4267 /* discard unexpected end tags */ 4268 if (node->type == EndTag) 4269 { 4270 if (element) 4271 TY_(ReportFatal)(doc, element, node, UNEXPECTED_ENDTAG_IN); 4272 else 4273 TY_(ReportFatal)(doc, element, node, UNEXPECTED_ENDTAG); 4274 4275 TY_(FreeNode)( doc, node); 4276 continue; 4277 } 4278 4279 /* parse content on seeing start tag */ 4280 if (node->type == StartTag) 4281 ParseXMLElement( doc, node, mode ); 4282 4283 TY_(InsertNodeAtEnd)(element, node); 4284 } 4285 4286 /* 4287 if first child is text then trim initial space and 4288 delete text node if it is empty. 4289 */ 4290 4291 node = element->content; 4292 4293 if (TY_(nodeIsText)(node) && mode != Preformatted) 4294 { 4295 if ( lexer->lexbuf[node->start] == ' ' ) 4296 { 4297 node->start++; 4298 4299 if (node->start >= node->end) 4300 TY_(DiscardElement)( doc, node ); 4301 } 4302 } 4303 4304 /* 4305 if last child is text then trim final space and 4306 delete the text node if it is empty 4307 */ 4308 4309 node = element->last; 4310 4311 if (TY_(nodeIsText)(node) && mode != Preformatted) 4312 { 4313 if ( lexer->lexbuf[node->end - 1] == ' ' ) 4314 { 4315 node->end--; 4316 4317 if (node->start >= node->end) 4318 TY_(DiscardElement)( doc, node ); 4319 } 4320 } 4321} 4322 4323void TY_(ParseXMLDocument)(TidyDocImpl* doc) 4324{ 4325 Node *node, *doctype = NULL; 4326 4327 TY_(SetOptionBool)( doc, TidyXmlTags, yes ); 4328 4329 while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) 4330 { 4331 /* discard unexpected end tags */ 4332 if (node->type == EndTag) 4333 { 4334 TY_(ReportError)(doc, NULL, node, UNEXPECTED_ENDTAG); 4335 TY_(FreeNode)( doc, node); 4336 continue; 4337 } 4338 4339 /* deal with comments etc. 
*/ 4340 if (InsertMisc( &doc->root, node)) 4341 continue; 4342 4343 if (node->type == DocTypeTag) 4344 { 4345 if (doctype == NULL) 4346 { 4347 TY_(InsertNodeAtEnd)( &doc->root, node); 4348 doctype = node; 4349 } 4350 else 4351 { 4352 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED); 4353 TY_(FreeNode)( doc, node); 4354 } 4355 continue; 4356 } 4357 4358 if (node->type == StartEndTag) 4359 { 4360 TY_(InsertNodeAtEnd)( &doc->root, node); 4361 continue; 4362 } 4363 4364 /* if start tag then parse element's content */ 4365 if (node->type == StartTag) 4366 { 4367 TY_(InsertNodeAtEnd)( &doc->root, node ); 4368 ParseXMLElement( doc, node, IgnoreWhitespace ); 4369 continue; 4370 } 4371 4372 TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED); 4373 TY_(FreeNode)( doc, node); 4374 } 4375 4376 /* ensure presence of initial <?xml version="1.0"?> */ 4377 if ( cfgBool(doc, TidyXmlDecl) ) 4378 TY_(FixXmlDecl)( doc ); 4379 4380#ifdef TIDY_APPLE_CHANGES 4381 if (cfgBool(doc, TidySanitizeAgainstXSS)) { 4382 SanitizeNodesAgainstXSS(doc, &doc->root); 4383 AttributeChecks(doc, &doc->root); 4384 } 4385#endif 4386} 4387 4388/* 4389 * local variables: 4390 * mode: c 4391 * indent-tabs-mode: nil 4392 * c-basic-offset: 4 4393 * eval: (c-set-offset 'substatement-open 0) 4394 * end: 4395 */ 4396