1#ifndef __LEXER_H__
2#define __LEXER_H__
3
4/* lexer.h -- Lexer for html parser
5
6   (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
7   See tidy.h for the copyright notice.
8
9   CVS Info:
10    $Author: iccir $
11    $Date: 2007/03/02 09:35:13 $
12    $Revision: 1.4 $
13
14*/
15
16/*
17  Given an input source, it returns a sequence of tokens.
18
19     GetToken(source) gets the next token
20     UngetToken(source) provides one level undo
21
22  The tags include an attribute list:
23
24    - linked list of attribute/value nodes
25    - each node has 2 NULL-terminated strings.
26    - entities are replaced in attribute values
27
28  white space is compacted if not in preformatted mode
29  If not in preformatted mode then leading white space
30  is discarded and subsequent white space sequences
31  compacted to single space characters.
32
33  If XmlTags is no then Tag names are folded to upper
34  case and attribute names to lower case.
35
36 Not yet done:
37    -   Doctype subset and marked sections
38*/
39
40#ifdef __cplusplus
41extern "C" {
42#endif
43
44#include "forward.h"
45
/* Lexer character-type flags.

   Every flag is a distinct single bit, so the classes a character
   belongs to can be OR-ed together into one unsigned value.

   NOTE(review): presumably these are the bits tested by IsDigit(),
   IsLetter(), IsWhite() etc. and set up by InitMap() -- confirm in
   the implementation file.
   NOTE(review): these are plain lowercase macro names; any identifier
   spelled "digit", "letter", "white", "newline", ... in a file that
   includes this header is rewritten by the preprocessor.
*/
#define digit       1u
#define letter      2u
#define namechar    4u
#define white       8u
#define newline     16u
#define lowercase   32u
#define uppercase   64u
55
56
/* node->type is one of these values.

   The enumerators carry no explicit initialisers, so they are numbered
   consecutively from 0 in the order listed; inserting a value in the
   middle renumbers every value after it.

   The per-value notes below are inferred from the names -- confirm
   against the lexer implementation.
*/
typedef enum
{
  RootNode,       /*  0: root of the document tree */
  DocTypeTag,     /*  1: DOCTYPE declaration */
  CommentTag,     /*  2: comment */
  ProcInsTag,     /*  3: processing instruction */
  TextNode,       /*  4: text */
  StartTag,       /*  5: element start tag */
  EndTag,         /*  6: element end tag */
  StartEndTag,    /*  7: combined start/end tag */
  CDATATag,       /*  8: CDATA section */
  SectionTag,     /*  9: marked section */
  AspTag,         /* 10: ASP block */
  JsteTag,        /* 11: JSTE block */
  PhpTag,         /* 12: PHP block */
  XmlDecl         /* 13: XML declaration */
} NodeType;
76
77
78
/* lexer GetToken states.

   Stored in Lexer.state ("state of lexer's finite state machine").
   Values are numbered consecutively from 0 in the order listed.
   Most states share their suffix with a NodeType value (COMMENT,
   DOCTYPE, CDATA, SECTION, ASP, JSTE, PHP, XMLDECL).
   NOTE(review): presumably each such state is active while the
   matching construct is being scanned -- confirm in GetToken().
*/
typedef enum
{
  LEX_CONTENT,      /*  0 */
  LEX_GT,           /*  1 */
  LEX_ENDTAG,       /*  2 */
  LEX_STARTTAG,     /*  3 */
  LEX_COMMENT,      /*  4 */
  LEX_DOCTYPE,      /*  5 */
  LEX_PROCINSTR,    /*  6 */
  LEX_ENDCOMMENT,   /*  7 */
  LEX_CDATA,        /*  8 */
  LEX_SECTION,      /*  9 */
  LEX_ASP,          /* 10 */
  LEX_JSTE,         /* 11 */
  LEX_PHP,          /* 12 */
  LEX_XMLDECL       /* 13 */
} LexerState;
98
/* ParseDocTypeDecl state constants.

   Values are numbered consecutively from 0 in the order listed.
   NOTE(review): the names suggest the parts of a DOCTYPE declaration
   being scanned (name, PUBLIC/SYSTEM keyword, quoted string, internal
   subset) -- confirm in ParseDocTypeDecl.
*/
typedef enum
{
  DT_INTERMEDIATE,    /* 0 */
  DT_DOCTYPENAME,     /* 1 */
  DT_PUBLICSYSTEM,    /* 2 */
  DT_QUOTEDSTRING,    /* 3 */
  DT_INTSUBSET        /* 4 */
} ParseDocTypeDeclState;
108
/* content model shortcut encoding

   Each CM_xxx value is a single distinct bit (bits 0..21), so an
   element's content model is the bitwise OR of the flags that apply.
   CM_UNKNOWN is the empty set.  The values have type int.

   Descriptions are tentative.
*/
#define CM_UNKNOWN      0
/* Elements with no content. Map to HTML specification. */
#define CM_EMPTY        (1 << 0)
/* Elements that appear outside of "BODY". */
#define CM_HTML         (1 << 1)
/* Elements that can appear within HEAD. */
#define CM_HEAD         (1 << 2)
/* HTML "block" elements. */
#define CM_BLOCK        (1 << 3)
/* HTML "inline" elements. */
#define CM_INLINE       (1 << 4)
/* Elements that mark list item ("LI"). */
#define CM_LIST         (1 << 5)
/* Elements that mark definition list item ("DT", "DD"). */
#define CM_DEFLIST      (1 << 6)
/* Elements that can appear inside TABLE. */
#define CM_TABLE        (1 << 7)
/* Used for "THEAD", "TFOOT" or "TBODY". */
#define CM_ROWGRP       (1 << 8)
/* Used for "TD", "TH" */
#define CM_ROW          (1 << 9)
/* Elements whose content must be protected against white space movement.
   Includes some elements that can be found in forms. */
#define CM_FIELD        (1 << 10)
/* Used to avoid propagating inline emphasis inside some elements
   such as OBJECT or APPLET. */
#define CM_OBJECT       (1 << 11)
/* Elements that allow "PARAM". */
#define CM_PARAM        (1 << 12)
/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_FRAMES       (1 << 13)
/* Heading elements (h1, h2, ...). */
#define CM_HEADING      (1 << 14)
/* Elements with an optional end tag. */
#define CM_OPT          (1 << 15)
/* Elements that use "align" attribute for vertical position. */
#define CM_IMG          (1 << 16)
/* Elements with inline and block model. Used to avoid calling InlineDup. */
#define CM_MIXED        (1 << 17)
/* Elements whose content needs to be indented only if containing one
   CM_BLOCK element. */
#define CM_NO_INDENT    (1 << 18)
/* Elements that are obsolete (such as "dir", "menu"). */
#define CM_OBSOLETE     (1 << 19)
/* User defined elements. Used to determine how attributes without value
   should be printed. */
#define CM_NEW          (1 << 20)
/* Elements that cannot be omitted.
   NOTE(review): the name reads like "omissible start tag", which would
   be the opposite of this description -- confirm against the tag
   dictionary before relying on either reading. */
#define CM_OMITST       (1 << 21)
162
/* If the document uses just HTML 2.0 tags and attributes, describe
** it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
** If there are proprietary tags and attributes then describe it as
** HTML Proprietary. If it includes the xml-lang or xmlns attributes
** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
** flavors of Voyager (strict, loose or frameset).
**
** Every basic symbol below is a single distinct bit of an unsigned
** bit vector; the "compatibility" and "meta" symbols are bitwise ORs
** of the basic ones.
*/

/* unknown */
#define xxxx                   0u

/* W3C defined HTML/XHTML family document types */
#define HT20                   1u
#define HT32                   2u
#define H40S                   4u
#define H40T                   8u
#define H40F                  16u
#define H41S                  32u
#define H41T                  64u
#define H41F                 128u
#define X10S                 256u
#define X10T                 512u
#define X10F                1024u
#define XH11                2048u
#define XB10                4096u

/* proprietary stuff */
#define VERS_SUN            8192u
#define VERS_NETSCAPE      16384u
#define VERS_MICROSOFT     32768u

/* special flag */
#define VERS_XML           65536u

/* compatibility symbols */
#define VERS_UNKNOWN       (xxxx)
#define VERS_HTML20        (HT20)
#define VERS_HTML32        (HT32)
#define VERS_HTML40_STRICT (H40S|H41S|X10S)
#define VERS_HTML40_LOOSE  (H40T|H41T|X10T)
#define VERS_FRAMESET      (H40F|H41F|X10F)
#define VERS_XHTML11       (XH11)
#define VERS_BASIC         (XB10)

/* meta symbols */
#define VERS_HTML40        (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
#define VERS_IFRAME        (VERS_HTML40_LOOSE|VERS_FRAMESET)
#define VERS_LOOSE         (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
#define VERS_EVENTS        (VERS_HTML40|VERS_XHTML11)
#define VERS_FROM32        (VERS_HTML32|VERS_HTML40)
#define VERS_FROM40        (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
#define VERS_XHTML         (X10S|X10T|X10F|XH11|XB10)

/* all W3C defined document types (bits 0..12) */
#define VERS_ALL           (VERS_HTML20|VERS_HTML32|VERS_FROM40)

/* all proprietary types (bits 13..15) */
#define VERS_PROPRIETARY   (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
221
222/* Linked list of class names and styles
223*/
224struct _Style;
225typedef struct _Style TagStyle;
226
227struct _Style
228{
229    tmbstr tag;
230    tmbstr tag_class;
231    tmbstr properties;
232    TagStyle *next;
233};
234
235
236/* Linked list of style properties
237*/
238struct _StyleProp;
239typedef struct _StyleProp StyleProp;
240
241struct _StyleProp
242{
243    tmbstr name;
244    tmbstr value;
245    StyleProp *next;
246};
247
248
249
250
251/* Attribute/Value linked list node
252*/
253
254struct _AttVal
255{
256    AttVal*           next;
257    const Attribute*  dict;
258    Node*             asp;
259    Node*             php;
260    int               delim;
261    tmbstr            attribute;
262    tmbstr            value;
263};
264
265
266
267/*
268  Mosaic handles inlines via a separate stack from other elements
269  We duplicate this to recover from inline markup errors such as:
270
271     <i>italic text
272     <p>more italic text</b> normal text
273
274  which for compatibility with Mosaic is mapped to:
275
276     <i>italic text</i>
277     <p><i>more italic text</i> normal text
278
279  Note that any inline end tag pop's the effect of the current
280  inline start tag, so that </b> pop's <i> in the above example.
281*/
282struct _IStack
283{
284    IStack*     next;
285    const Dict* tag;        /* tag's dictionary definition */
286    tmbstr      element;    /* name (NULL for text nodes) */
287    AttVal*     attributes;
288};
289
290
291/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
292** etc. etc.
293*/
294
295struct _Node
296{
297    Node*       parent;         /* tree structure */
298    Node*       prev;
299    Node*       next;
300    Node*       content;
301    Node*       last;
302
303    AttVal*     attributes;
304    const Dict* was;            /* old tag when it was changed */
305    const Dict* tag;            /* tag's dictionary definition */
306
307    tmbstr      element;        /* name (NULL for text nodes) */
308
309    uint        start;          /* start of span onto text array */
310    uint        end;            /* end of span onto text array */
311    NodeType    type;           /* TextNode, StartTag, EndTag etc. */
312
313    uint        line;           /* current line of document */
314    uint        column;         /* current column of document */
315
316    Bool        closed;         /* true if closed by explicit end tag */
317    Bool        implicit;       /* true if inferred */
318    Bool        linebreak;      /* true if followed by a line break */
319
320#ifdef TIDY_STORE_ORIGINAL_TEXT
321    tmbstr      otext;
322#endif
323};
324
325
326/*
327  The following are private to the lexer
328  Use NewLexer() to create a lexer, and
329  FreeLexer() to free it.
330*/
331
332struct _Lexer
333{
334#if 0  /* Move to TidyDocImpl */
335    StreamIn* in;           /* document content input */
336    StreamOut* errout;      /* error output stream */
337
338    uint badAccess;         /* for accessibility errors */
339    uint badLayout;         /* for bad style errors */
340    uint badChars;          /* for bad character encodings */
341    uint badForm;           /* for mismatched/mispositioned form tags */
342    uint warnings;          /* count of warnings in this document */
343    uint errors;            /* count of errors */
344#endif
345
346    uint lines;             /* lines seen */
347    uint columns;           /* at start of current token */
348    Bool waswhite;          /* used to collapse contiguous white space */
349    Bool pushed;            /* true after token has been pushed back */
350    Bool insertspace;       /* when space is moved after end tag */
351    Bool excludeBlocks;     /* Netscape compatibility */
352    Bool exiled;            /* true if moved out of table */
353    Bool isvoyager;         /* true if xmlns attribute on html element */
354    uint versions;          /* bit vector of HTML versions */
355    uint doctype;           /* version as given by doctype (if any) */
356    uint versionEmitted;    /* version of doctype emitted */
357    Bool bad_doctype;       /* e.g. if html or PUBLIC is missing */
358    uint txtstart;          /* start of current node */
359    uint txtend;            /* end of current node */
360    LexerState state;       /* state of lexer's finite state machine */
361
362    Node* token;            /* last token returned by GetToken() */
363    Node* itoken;           /* last duplicate inline returned by GetToken() */
364    Node* root;             /* remember root node of the document */
365    Node* parent;           /* remember parent node for CDATA elements */
366
367    Bool seenEndBody;       /* true if a </body> tag has been encountered */
368    Bool seenEndHtml;       /* true if a </html> tag has been encountered */
369
370    /*
371      Lexer character buffer
372
373      Parse tree nodes span onto this buffer
374      which contains the concatenated text
375      contents of all of the elements.
376
377      lexsize must be reset for each file.
378    */
379    tmbstr lexbuf;          /* MB character buffer */
380    uint lexlength;         /* allocated */
381    uint lexsize;           /* used */
382
383    /* Inline stack for compatibility with Mosaic */
384    Node* inode;            /* for deferring text node */
385    IStack* insert;         /* for inferring inline tags */
386    IStack* istack;
387    uint istacklength;      /* allocated */
388    uint istacksize;        /* used */
389    uint istackbase;        /* start of frame */
390
391    TagStyle *styles;          /* used for cleaning up presentation markup */
392
393#if 0
394    TidyDocImpl* doc;       /* Pointer back to doc for error reporting */
395#endif
396};
397
398
399/* Lexer Functions
400*/
401
402/* choose what version to use for new doctype */
403int TY_(HTMLVersion)( TidyDocImpl* doc );
404
405/* everything is allowed in proprietary version of HTML */
406/* this is handled here rather than in the tag/attr dicts */
407
408void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
409
410Bool TY_(IsWhite)(uint c);
411Bool TY_(IsDigit)(uint c);
412Bool TY_(IsLetter)(uint c);
413Bool TY_(IsNewline)(uint c);
414Bool TY_(IsNamechar)(uint c);
415Bool TY_(IsXMLLetter)(uint c);
416Bool TY_(IsXMLNamechar)(uint c);
417
418/* Bool IsLower(uint c); */
419Bool TY_(IsUpper)(uint c);
420uint TY_(ToLower)(uint c);
421uint TY_(ToUpper)(uint c);
422
423Lexer* TY_(NewLexer)( TidyDocImpl* doc );
424void TY_(FreeLexer)( TidyDocImpl* doc );
425
426/* store character c as UTF-8 encoded byte stream */
427void TY_(AddCharToLexer)( Lexer *lexer, uint c );
428
429/*
430  Used for elements and text nodes
431  element name is NULL for text nodes
432  start and end are offsets into lexbuf
433  which contains the textual content of
434  all elements in the parse tree.
435
436  parent and content allow traversal
437  of the parse tree in any direction.
438  attributes are represented as a linked
439  list of AttVal nodes which hold the
440  strings for attribute/value pairs.
441*/
442Node* TY_(NewNode)( Lexer* lexer );
443
444
445/* used to clone heading nodes when split by an <HR> */
446Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
447
448/* free node's attributes */
449void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
450
451/* doesn't repair attribute list linkage */
452void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
453
454/* detach attribute from node */
455void TY_(DetachAttribute)( Node *node, AttVal *attr );
456
457/* detach attribute from node then free it
458*/
459void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
460
461/*
462  Free document nodes by iterating through peers and recursing
463  through children. Set next to NULL before calling FreeNode()
464  to avoid freeing peer nodes. Doesn't patch up prev/next links.
465 */
466void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
467
468Node* TY_(TextToken)( Lexer *lexer );
469
470/* used for creating preformatted text from Word2000 */
471Node* TY_(NewLineNode)( Lexer *lexer );
472
473/* used for adding a &nbsp; for Word2000 */
474Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
475
476void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
477/* void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */
478
479/* find element */
480Node* TY_(FindDocType)( TidyDocImpl* doc );
481Node* TY_(FindHTML)( TidyDocImpl* doc );
482Node* TY_(FindHEAD)( TidyDocImpl* doc );
483Node* TY_(FindTITLE)(TidyDocImpl* doc);
484Node* TY_(FindBody)( TidyDocImpl* doc );
485Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
486
487/* Returns containing block element, if any */
488Node* TY_(FindContainer)( Node* node );
489
490/* add meta element for Tidy */
491Bool TY_(AddGenerator)( TidyDocImpl* doc );
492
493uint TY_(ApparentVersion)( TidyDocImpl* doc );
494
495ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml );
496
497Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
498
499Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
500
501
502/* fixup doctype if missing */
503Bool TY_(FixDocType)( TidyDocImpl* doc );
504
505/* ensure XML document starts with <?xml version="1.0"?> */
506/* add encoding attribute if not using ASCII or UTF-8 output */
507Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
508
509Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
510
511void TY_(UngetToken)( TidyDocImpl* doc );
512
513
/*
  modes for GetToken()

  MixedContent   -- for elements which don't accept PCDATA
  Preformatted   -- white space preserved as is
  IgnoreMarkup   -- for CDATA elements such as script, style

  IgnoreWhitespace and CdataContent are not described by the original
  comment.
  NOTE(review): "don't accept PCDATA" reads more naturally as the
  description of IgnoreWhitespace than of MixedContent -- confirm in
  GetToken() before relying on it.

  Values are numbered consecutively from 0 in the order listed.
*/
typedef enum
{
  IgnoreWhitespace,   /* 0 */
  MixedContent,       /* 1 */
  Preformatted,       /* 2 */
  IgnoreMarkup,       /* 3 */
  CdataContent        /* 4 */
} GetTokenMode;
529
530Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
531
532void TY_(InitMap)(void);
533
534
535/* create a new attribute */
536AttVal* TY_(NewAttribute)(void);
537
538/* create a new attribute with given name and value */
539AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
540                             int delim );
541
542/* insert attribute at the end of attribute list of a node */
543void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
544
545/* insert attribute at the start of attribute list of a node */
546void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
547
548/*************************************
549  In-line Stack functions
550*************************************/
551
552
553/* duplicate attributes */
554AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
555
556/*
557  push a copy of an inline node onto stack
558  but don't push if implicit or OBJECT or APPLET
559  (implicit tags are ones generated from the istack)
560
561  One issue arises with pushing inlines when
562  the tag is already pushed. For instance:
563
564      <p><em>text
565      <p><em>more text
566
567  Shouldn't be mapped to
568
569      <p><em>text</em></p>
570      <p><em><em>more text</em></em>
571*/
572void TY_(PushInline)( TidyDocImpl* doc, Node* node );
573
574/* pop inline stack */
575void TY_(PopInline)( TidyDocImpl* doc, Node* node );
576
577Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
578Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
579
580/*
581  This has the effect of inserting "missing" inline
582  elements around the contents of blocklevel elements
583  such as P, TD, TH, DIV, PRE etc. This procedure is
584  called at the start of ParseBlock. when the inline
585  stack is not empty, as will be the case in:
586
587    <i><h1>italic heading</h1></i>
588
589  which is then treated as equivalent to
590
591    <h1><i>italic heading</i></h1>
592
593  This is implemented by setting the lexer into a mode
594  where it gets tokens from the inline stack rather than
595  from the input stream.
596*/
597int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
598
599/*
600 defer duplicates when entering a table or other
601 element where the inlines shouldn't be duplicated
602*/
603void TY_(DeferDup)( TidyDocImpl* doc );
604Node* TY_(InsertedToken)( TidyDocImpl* doc );
605
606#ifdef __cplusplus
607}
608#endif
609
610
611#endif /* __LEXER_H__ */
612