1#ifndef __LEXER_H__ 2#define __LEXER_H__ 3 4/* lexer.h -- Lexer for html parser 5 6 (c) 1998-2006 (W3C) MIT, ERCIM, Keio University 7 See tidy.h for the copyright notice. 8 9 CVS Info: 10 $Author: iccir $ 11 $Date: 2007/03/02 09:35:13 $ 12 $Revision: 1.4 $ 13 14*/ 15 16/* 17 Given an input source, it returns a sequence of tokens. 18 19 GetToken(source) gets the next token 20 UngetToken(source) provides one level undo 21 22 The tags include an attribute list: 23 24 - linked list of attribute/value nodes 25 - each node has 2 NULL-terminated strings. 26 - entities are replaced in attribute values 27 28 white space is compacted if not in preformatted mode 29 If not in preformatted mode then leading white space 30 is discarded and subsequent white space sequences 31 compacted to single space characters. 32 33 If XmlTags is no then Tag names are folded to upper 34 case and attribute names to lower case. 35 36 Not yet done: 37 - Doctype subset and marked sections 38*/ 39 40#ifdef __cplusplus 41extern "C" { 42#endif 43 44#include "forward.h" 45 46/* lexer character types 47*/ 48#define digit 1u 49#define letter 2u 50#define namechar 4u 51#define white 8u 52#define newline 16u 53#define lowercase 32u 54#define uppercase 64u 55 56 57/* node->type is one of these values 58*/ 59typedef enum 60{ 61 RootNode, 62 DocTypeTag, 63 CommentTag, 64 ProcInsTag, 65 TextNode, 66 StartTag, 67 EndTag, 68 StartEndTag, 69 CDATATag, 70 SectionTag, 71 AspTag, 72 JsteTag, 73 PhpTag, 74 XmlDecl 75} NodeType; 76 77 78 79/* lexer GetToken states 80*/ 81typedef enum 82{ 83 LEX_CONTENT, 84 LEX_GT, 85 LEX_ENDTAG, 86 LEX_STARTTAG, 87 LEX_COMMENT, 88 LEX_DOCTYPE, 89 LEX_PROCINSTR, 90 LEX_ENDCOMMENT, 91 LEX_CDATA, 92 LEX_SECTION, 93 LEX_ASP, 94 LEX_JSTE, 95 LEX_PHP, 96 LEX_XMLDECL 97} LexerState; 98 99/* ParseDocTypeDecl state constants */ 100typedef enum 101{ 102 DT_INTERMEDIATE, 103 DT_DOCTYPENAME, 104 DT_PUBLICSYSTEM, 105 DT_QUOTEDSTRING, 106 DT_INTSUBSET 107} ParseDocTypeDeclState; 108 109/* content model shortcut encoding 110 111 Descriptions are tentative. 112*/ 113#define CM_UNKNOWN 0 114/* Elements with no content. Map to HTML specification. */ 115#define CM_EMPTY (1 << 0) 116/* Elements that appear outside of "BODY". */ 117#define CM_HTML (1 << 1) 118/* Elements that can appear within HEAD. */ 119#define CM_HEAD (1 << 2) 120/* HTML "block" elements. */ 121#define CM_BLOCK (1 << 3) 122/* HTML "inline" elements. */ 123#define CM_INLINE (1 << 4) 124/* Elements that mark list item ("LI"). */ 125#define CM_LIST (1 << 5) 126/* Elements that mark definition list item ("DL", "DT"). */ 127#define CM_DEFLIST (1 << 6) 128/* Elements that can appear inside TABLE. */ 129#define CM_TABLE (1 << 7) 130/* Used for "THEAD", "TFOOT" or "TBODY". */ 131#define CM_ROWGRP (1 << 8) 132/* Used for "TD", "TH" */ 133#define CM_ROW (1 << 9) 134/* Elements whose content must be protected against white space movement. 135 Includes some elements that can found in forms. */ 136#define CM_FIELD (1 << 10) 137/* Used to avoid propagating inline emphasis inside some elements 138 such as OBJECT or APPLET. */ 139#define CM_OBJECT (1 << 11) 140/* Elements that allows "PARAM". */ 141#define CM_PARAM (1 << 12) 142/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ 143#define CM_FRAMES (1 << 13) 144/* Heading elements (h1, h2, ...). */ 145#define CM_HEADING (1 << 14) 146/* Elements with an optional end tag. */ 147#define CM_OPT (1 << 15) 148/* Elements that use "align" attribute for vertical position. */ 149#define CM_IMG (1 << 16) 150/* Elements with inline and block model. Used to avoid calling InlineDup. */ 151#define CM_MIXED (1 << 17) 152/* Elements whose content needs to be indented only if containing one 153 CM_BLOCK element. */ 154#define CM_NO_INDENT (1 << 18) 155/* Elements that are obsolete (such as "dir", "menu"). */ 156#define CM_OBSOLETE (1 << 19) 157/* User defined elements. Used to determine how attributes wihout value 158 should be printed. */ 159#define CM_NEW (1 << 20) 160/* Elements that cannot be omitted. */ 161#define CM_OMITST (1 << 21) 162 163/* If the document uses just HTML 2.0 tags and attributes described 164** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. 165** If there are proprietary tags and attributes then describe it as 166** HTML Proprietary. If it includes the xml-lang or xmlns attributes 167** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the 168** flavors of Voyager (strict, loose or frameset). 169*/ 170 171/* unknown */ 172#define xxxx 0u 173 174/* W3C defined HTML/XHTML family document types */ 175#define HT20 1u 176#define HT32 2u 177#define H40S 4u 178#define H40T 8u 179#define H40F 16u 180#define H41S 32u 181#define H41T 64u 182#define H41F 128u 183#define X10S 256u 184#define X10T 512u 185#define X10F 1024u 186#define XH11 2048u 187#define XB10 4096u 188 189/* proprietary stuff */ 190#define VERS_SUN 8192u 191#define VERS_NETSCAPE 16384u 192#define VERS_MICROSOFT 32768u 193 194/* special flag */ 195#define VERS_XML 65536u 196 197/* compatibility symbols */ 198#define VERS_UNKNOWN (xxxx) 199#define VERS_HTML20 (HT20) 200#define VERS_HTML32 (HT32) 201#define VERS_HTML40_STRICT (H40S|H41S|X10S) 202#define VERS_HTML40_LOOSE (H40T|H41T|X10T) 203#define VERS_FRAMESET (H40F|H41F|X10F) 204#define VERS_XHTML11 (XH11) 205#define VERS_BASIC (XB10) 206 207/* meta symbols */ 208#define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET) 209#define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET) 210#define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME) 211#define VERS_EVENTS (VERS_HTML40|VERS_XHTML11) 212#define VERS_FROM32 (VERS_HTML32|VERS_HTML40) 213#define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC) 214#define VERS_XHTML (X10S|X10T|X10F|XH11|XB10) 215 216/* all W3C defined document types */ 217#define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40) 218 219/* all proprietary types */ 220#define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) 221 222/* Linked list of class names and styles 223*/ 224struct _Style; 225typedef struct _Style TagStyle; 226 227struct _Style 228{ 229 tmbstr tag; 230 tmbstr tag_class; 231 tmbstr properties; 232 TagStyle *next; 233}; 234 235 236/* Linked list of style properties 237*/ 238struct _StyleProp; 239typedef struct _StyleProp StyleProp; 240 241struct _StyleProp 242{ 243 tmbstr name; 244 tmbstr value; 245 StyleProp *next; 246}; 247 248 249 250 251/* Attribute/Value linked list node 252*/ 253 254struct _AttVal 255{ 256 AttVal* next; 257 const Attribute* dict; 258 Node* asp; 259 Node* php; 260 int delim; 261 tmbstr attribute; 262 tmbstr value; 263}; 264 265 266 267/* 268 Mosaic handles inlines via a separate stack from other elements 269 We duplicate this to recover from inline markup errors such as: 270 271 <i>italic text 272 <p>more italic text</b> normal text 273 274 which for compatibility with Mosaic is mapped to: 275 276 <i>italic text</i> 277 <p><i>more italic text</i> normal text 278 279 Note that any inline end tag pop's the effect of the current 280 inline start tag, so that </b> pop's <i> in the above example. 281*/ 282struct _IStack 283{ 284 IStack* next; 285 const Dict* tag; /* tag's dictionary definition */ 286 tmbstr element; /* name (NULL for text nodes) */ 287 AttVal* attributes; 288}; 289 290 291/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, 292** etc. etc. 293*/ 294 295struct _Node 296{ 297 Node* parent; /* tree structure */ 298 Node* prev; 299 Node* next; 300 Node* content; 301 Node* last; 302 303 AttVal* attributes; 304 const Dict* was; /* old tag when it was changed */ 305 const Dict* tag; /* tag's dictionary definition */ 306 307 tmbstr element; /* name (NULL for text nodes) */ 308 309 uint start; /* start of span onto text array */ 310 uint end; /* end of span onto text array */ 311 NodeType type; /* TextNode, StartTag, EndTag etc. */ 312 313 uint line; /* current line of document */ 314 uint column; /* current column of document */ 315 316 Bool closed; /* true if closed by explicit end tag */ 317 Bool implicit; /* true if inferred */ 318 Bool linebreak; /* true if followed by a line break */ 319 320#ifdef TIDY_STORE_ORIGINAL_TEXT 321 tmbstr otext; 322#endif 323}; 324 325 326/* 327 The following are private to the lexer 328 Use NewLexer() to create a lexer, and 329 FreeLexer() to free it. 330*/ 331 332struct _Lexer 333{ 334#if 0 /* Move to TidyDocImpl */ 335 StreamIn* in; /* document content input */ 336 StreamOut* errout; /* error output stream */ 337 338 uint badAccess; /* for accessibility errors */ 339 uint badLayout; /* for bad style errors */ 340 uint badChars; /* for bad character encodings */ 341 uint badForm; /* for mismatched/mispositioned form tags */ 342 uint warnings; /* count of warnings in this document */ 343 uint errors; /* count of errors */ 344#endif 345 346 uint lines; /* lines seen */ 347 uint columns; /* at start of current token */ 348 Bool waswhite; /* used to collapse contiguous white space */ 349 Bool pushed; /* true after token has been pushed back */ 350 Bool insertspace; /* when space is moved after end tag */ 351 Bool excludeBlocks; /* Netscape compatibility */ 352 Bool exiled; /* true if moved out of table */ 353 Bool isvoyager; /* true if xmlns attribute on html element */ 354 uint versions; /* bit vector of HTML versions */ 355 uint doctype; /* version as given by doctype (if any) */ 356 uint versionEmitted; /* version of doctype emitted */ 357 Bool bad_doctype; /* e.g. if html or PUBLIC is missing */ 358 uint txtstart; /* start of current node */ 359 uint txtend; /* end of current node */ 360 LexerState state; /* state of lexer's finite state machine */ 361 362 Node* token; /* last token returned by GetToken() */ 363 Node* itoken; /* last duplicate inline returned by GetToken() */ 364 Node* root; /* remember root node of the document */ 365 Node* parent; /* remember parent node for CDATA elements */ 366 367 Bool seenEndBody; /* true if a </body> tag has been encountered */ 368 Bool seenEndHtml; /* true if a </html> tag has been encountered */ 369 370 /* 371 Lexer character buffer 372 373 Parse tree nodes span onto this buffer 374 which contains the concatenated text 375 contents of all of the elements. 376 377 lexsize must be reset for each file. 378 */ 379 tmbstr lexbuf; /* MB character buffer */ 380 uint lexlength; /* allocated */ 381 uint lexsize; /* used */ 382 383 /* Inline stack for compatibility with Mosaic */ 384 Node* inode; /* for deferring text node */ 385 IStack* insert; /* for inferring inline tags */ 386 IStack* istack; 387 uint istacklength; /* allocated */ 388 uint istacksize; /* used */ 389 uint istackbase; /* start of frame */ 390 391 TagStyle *styles; /* used for cleaning up presentation markup */ 392 393#if 0 394 TidyDocImpl* doc; /* Pointer back to doc for error reporting */ 395#endif 396}; 397 398 399/* Lexer Functions 400*/ 401 402/* choose what version to use for new doctype */ 403int TY_(HTMLVersion)( TidyDocImpl* doc ); 404 405/* everything is allowed in proprietary version of HTML */ 406/* this is handled here rather than in the tag/attr dicts */ 407 408void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers ); 409 410Bool TY_(IsWhite)(uint c); 411Bool TY_(IsDigit)(uint c); 412Bool TY_(IsLetter)(uint c); 413Bool TY_(IsNewline)(uint c); 414Bool TY_(IsNamechar)(uint c); 415Bool TY_(IsXMLLetter)(uint c); 416Bool TY_(IsXMLNamechar)(uint c); 417 418/* Bool IsLower(uint c); */ 419Bool TY_(IsUpper)(uint c); 420uint TY_(ToLower)(uint c); 421uint TY_(ToUpper)(uint c); 422 423Lexer* TY_(NewLexer)( TidyDocImpl* doc ); 424void TY_(FreeLexer)( TidyDocImpl* doc ); 425 426/* store character c as UTF-8 encoded byte stream */ 427void TY_(AddCharToLexer)( Lexer *lexer, uint c ); 428 429/* 430 Used for elements and text nodes 431 element name is NULL for text nodes 432 start and end are offsets into lexbuf 433 which contains the textual content of 434 all elements in the parse tree. 435 436 parent and content allow traversal 437 of the parse tree in any direction. 438 attributes are represented as a linked 439 list of AttVal nodes which hold the 440 strings for attribute/value pairs. 441*/ 442Node* TY_(NewNode)( Lexer* lexer ); 443 444 445/* used to clone heading nodes when split by an <HR> */ 446Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element ); 447 448/* free node's attributes */ 449void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node ); 450 451/* doesn't repair attribute list linkage */ 452void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av ); 453 454/* detach attribute from node */ 455void TY_(DetachAttribute)( Node *node, AttVal *attr ); 456 457/* detach attribute from node then free it 458*/ 459void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr ); 460 461/* 462 Free document nodes by iterating through peers and recursing 463 through children. Set next to NULL before calling FreeNode() 464 to avoid freeing peer nodes. Doesn't patch up prev/next links. 465 */ 466void TY_(FreeNode)( TidyDocImpl* doc, Node *node ); 467 468Node* TY_(TextToken)( Lexer *lexer ); 469 470/* used for creating preformatted text from Word2000 */ 471Node* TY_(NewLineNode)( Lexer *lexer ); 472 473/* used for adding a for Word2000 */ 474Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt ); 475 476void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str ); 477/* void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */ 478 479/* find element */ 480Node* TY_(FindDocType)( TidyDocImpl* doc ); 481Node* TY_(FindHTML)( TidyDocImpl* doc ); 482Node* TY_(FindHEAD)( TidyDocImpl* doc ); 483Node* TY_(FindTITLE)(TidyDocImpl* doc); 484Node* TY_(FindBody)( TidyDocImpl* doc ); 485Node* TY_(FindXmlDecl)(TidyDocImpl* doc); 486 487/* Returns containing block element, if any */ 488Node* TY_(FindContainer)( Node* node ); 489 490/* add meta element for Tidy */ 491Bool TY_(AddGenerator)( TidyDocImpl* doc ); 492 493uint TY_(ApparentVersion)( TidyDocImpl* doc ); 494 495ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml ); 496 497Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc ); 498 499Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc ); 500 501 502/* fixup doctype if missing */ 503Bool TY_(FixDocType)( TidyDocImpl* doc ); 504 505/* ensure XML document starts with <?xml version="1.0"?> */ 506/* add encoding attribute if not using ASCII or UTF-8 output */ 507Bool TY_(FixXmlDecl)( TidyDocImpl* doc ); 508 509Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id); 510 511void TY_(UngetToken)( TidyDocImpl* doc ); 512 513 514/* 515 modes for GetToken() 516 517 MixedContent -- for elements which don't accept PCDATA 518 Preformatted -- white space preserved as is 519 IgnoreMarkup -- for CDATA elements such as script, style 520*/ 521typedef enum 522{ 523 IgnoreWhitespace, 524 MixedContent, 525 Preformatted, 526 IgnoreMarkup, 527 CdataContent 528} GetTokenMode; 529 530Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode ); 531 532void TY_(InitMap)(void); 533 534 535/* create a new attribute */ 536AttVal* TY_(NewAttribute)(void); 537 538/* create a new attribute with given name and value */ 539AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value, 540 int delim ); 541 542/* insert attribute at the end of attribute list of a node */ 543void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av ); 544 545/* insert attribute at the start of attribute list of a node */ 546void TY_(InsertAttributeAtStart)( Node *node, AttVal *av ); 547 548/************************************* 549 In-line Stack functions 550*************************************/ 551 552 553/* duplicate attributes */ 554AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs ); 555 556/* 557 push a copy of an inline node onto stack 558 but don't push if implicit or OBJECT or APPLET 559 (implicit tags are ones generated from the istack) 560 561 One issue arises with pushing inlines when 562 the tag is already pushed. For instance: 563 564 <p><em>text 565 <p><em>more text 566 567 Shouldn't be mapped to 568 569 <p><em>text</em></p> 570 <p><em><em>more text</em></em> 571*/ 572void TY_(PushInline)( TidyDocImpl* doc, Node* node ); 573 574/* pop inline stack */ 575void TY_(PopInline)( TidyDocImpl* doc, Node* node ); 576 577Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node ); 578Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node ); 579 580/* 581 This has the effect of inserting "missing" inline 582 elements around the contents of blocklevel elements 583 such as P, TD, TH, DIV, PRE etc. This procedure is 584 called at the start of ParseBlock. when the inline 585 stack is not empty, as will be the case in: 586 587 <i><h1>italic heading</h1></i> 588 589 which is then treated as equivalent to 590 591 <h1><i>italic heading</i></h1> 592 593 This is implemented by setting the lexer into a mode 594 where it gets tokens from the inline stack rather than 595 from the input stream. 596*/ 597int TY_(InlineDup)( TidyDocImpl* doc, Node *node ); 598 599/* 600 defer duplicates when entering a table or other 601 element where the inlines shouldn't be duplicated 602*/ 603void TY_(DeferDup)( TidyDocImpl* doc ); 604Node* TY_(InsertedToken)( TidyDocImpl* doc ); 605 606#ifdef __cplusplus 607} 608#endif 609 610 611#endif /* __LEXER_H__ */ 612