1/* lexer.c -- Lexer for html parser
2
3  (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
4  See tidy.h for the copyright notice.
5
6  CVS Info :
7
8    $Author$
9    $Date$
10    $Revision$
11
12*/
13
14/*
15  Given a file stream fp it returns a sequence of tokens.
16
17     GetToken(fp) gets the next token
18     UngetToken(fp) provides one level undo
19
20  The tags include an attribute list:
21
22    - linked list of attribute/value nodes
23    - each node has 2 NULL-terminated strings.
24    - entities are replaced in attribute values
25
26  white space is compacted if not in preformatted mode
27  If not in preformatted mode then leading white space
28  is discarded and subsequent white space sequences
29  compacted to single space characters.
30
  If XmlTags is no then tag names are folded to lower
  case, as are attribute names.
33
34 Not yet done:
35    -   Doctype subset and marked sections
36*/
37
38#include "tidy-int.h"
39#include "lexer.h"
40#include "parser.h"
41#include "entities.h"
42#include "streamio.h"
43#include "message.h"
44#include "tmbstr.h"
45#include "clean.h"
46#include "utf8.h"
47#include "streamio.h"
48
49/* Forward references
50*/
51/* swallows closing '>' */
52static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
53
54static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty,
55                             Node **asp, Node **php );
56
57static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
58                         Bool *isempty, int *pdelim );
59
60static Node *ParseDocTypeDecl(TidyDocImpl* doc);
61
62static void AddAttrToList( AttVal** list, AttVal* av );
63
64/* used to classify characters for lexical purposes */
65#define MAP(c) ((unsigned)c < 128 ? lexmap[(unsigned)c] : 0)
66static uint lexmap[128];
67
68#define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name)
69#define IsValidXMLElemName(name) TY_(IsValidXMLID)(name)
70
/* Table of the W3C document types known to the lexer.  The list is
** terminated by an entry whose name is NULL; every lookup below
** stops at that sentinel.
*/
static struct _doctypes
{
    uint score;    /* preference rank: HTMLVersion() picks the lowest non-zero score */
    uint vers;     /* version bit for this doctype */
    ctmbstr name;  /* human readable name */
    ctmbstr fpi;   /* formal public identifier, matched by GetVersFromFPI() */
    ctmbstr si;    /* system identifier (DTD URL), NULL when none is listed */
} const W3C_Doctypes[] =
{
  {  2, HT20, "HTML 2.0",               "-//IETF//DTD HTML 2.0//EN",              NULL,                                                       },
  {  2, HT20, "HTML 2.0",               "-//IETF//DTD HTML//EN",                  NULL,                                                       },
  {  2, HT20, "HTML 2.0",               "-//W3C//DTD HTML 2.0//EN",               NULL,                                                       },
  {  1, HT32, "HTML 3.2",               "-//W3C//DTD HTML 3.2//EN",               NULL,                                                       },
  {  1, HT32, "HTML 3.2",               "-//W3C//DTD HTML 3.2 Final//EN",         NULL,                                                       },
  {  1, HT32, "HTML 3.2",               "-//W3C//DTD HTML 3.2 Draft//EN",         NULL,                                                       },
  {  6, H40S, "HTML 4.0 Strict",        "-//W3C//DTD HTML 4.0//EN",               "http://www.w3.org/TR/REC-html40/strict.dtd"                },
  {  8, H40T, "HTML 4.0 Transitional",  "-//W3C//DTD HTML 4.0 Transitional//EN",  "http://www.w3.org/TR/REC-html40/loose.dtd"                 },
  {  7, H40F, "HTML 4.0 Frameset",      "-//W3C//DTD HTML 4.0 Frameset//EN",      "http://www.w3.org/TR/REC-html40/frameset.dtd"              },
  {  3, H41S, "HTML 4.01 Strict",       "-//W3C//DTD HTML 4.01//EN",              "http://www.w3.org/TR/html4/strict.dtd"                     },
  {  5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd"                      },
  {  4, H41F, "HTML 4.01 Frameset",     "-//W3C//DTD HTML 4.01 Frameset//EN",     "http://www.w3.org/TR/html4/frameset.dtd"                   },
  {  9, X10S, "XHTML 1.0 Strict",       "-//W3C//DTD XHTML 1.0 Strict//EN",       "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"         },
  { 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"   },
  { 10, X10F, "XHTML 1.0 Frameset",     "-//W3C//DTD XHTML 1.0 Frameset//EN",     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"       },
  { 12, XH11, "XHTML 1.1",              "-//W3C//DTD XHTML 1.1//EN",              "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"              },
  { 13, XB10, "XHTML Basic 1.0",        "-//W3C//DTD XHTML Basic 1.0//EN",        "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd"        },

  /* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */
#if 0
  { 14, XP10, "XHTML Print 1.0",        "-//W3C//DTD XHTML-Print 1.0//EN",         "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd"           },
  { 14, XP10, "XHTML Print 1.0",        "-//PWG//DTD XHTML-Print 1.0//EN",         "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" },
#endif
  /* final entry */
  {  0,    0, NULL,                     NULL,                                     NULL                                                        }
};
106
107int TY_(HTMLVersion)(TidyDocImpl* doc)
108{
109    uint i;
110    uint j = 0;
111    uint score = 0;
112    uint vers = doc->lexer->versions;
113    uint dtver = doc->lexer->doctype;
114    TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
115    Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
116                 !cfgBool(doc, TidyHtmlOut);
117    Bool html4 = dtmode == TidyDoctypeStrict || dtmode == TidyDoctypeLoose || VERS_FROM40 & dtver;
118
119    for (i = 0; W3C_Doctypes[i].name; ++i)
120    {
121        if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
122            (html4 && !(VERS_FROM40 & W3C_Doctypes[i].vers)))
123            continue;
124
125        if (vers & W3C_Doctypes[i].vers &&
126            (W3C_Doctypes[i].score < score || !score))
127        {
128            score = W3C_Doctypes[i].score;
129            j = i;
130        }
131    }
132
133    if (score)
134        return W3C_Doctypes[j].vers;
135
136    return VERS_UNKNOWN;
137}
138
139static ctmbstr GetFPIFromVers(uint vers)
140{
141    uint i;
142
143    for (i = 0; W3C_Doctypes[i].name; ++i)
144        if (W3C_Doctypes[i].vers == vers)
145            return W3C_Doctypes[i].fpi;
146
147    return NULL;
148}
149
150static ctmbstr GetSIFromVers(uint vers)
151{
152    uint i;
153
154    for (i = 0; W3C_Doctypes[i].name; ++i)
155        if (W3C_Doctypes[i].vers == vers)
156            return W3C_Doctypes[i].si;
157
158    return NULL;
159}
160
161static ctmbstr GetNameFromVers(uint vers)
162{
163    uint i;
164
165    for (i = 0; W3C_Doctypes[i].name; ++i)
166        if (W3C_Doctypes[i].vers == vers)
167            return W3C_Doctypes[i].name;
168
169    return NULL;
170}
171
172static uint GetVersFromFPI(ctmbstr fpi)
173{
174    uint i;
175
176    for (i = 0; W3C_Doctypes[i].name; ++i)
177        if (TY_(tmbstrcasecmp)(W3C_Doctypes[i].fpi, fpi) == 0)
178            return W3C_Doctypes[i].vers;
179
180    return 0;
181}
182
183/* everything is allowed in proprietary version of HTML */
184/* this is handled here rather than in the tag/attr dicts */
185void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
186{
187    doc->lexer->versions &= (vers | VERS_PROPRIETARY);
188}
189
190Bool TY_(IsWhite)(uint c)
191{
192    uint map = MAP(c);
193
194    return (map & white)!=0;
195}
196
197Bool TY_(IsNewline)(uint c)
198{
199    uint map = MAP(c);
200    return (map & newline)!=0;
201}
202
203Bool TY_(IsDigit)(uint c)
204{
205    uint map;
206
207    map = MAP(c);
208
209    return (map & digit)!=0;
210}
211
212Bool TY_(IsLetter)(uint c)
213{
214    uint map;
215
216    map = MAP(c);
217
218    return (map & letter)!=0;
219}
220
221Bool TY_(IsNamechar)(uint c)
222{
223    uint map = MAP(c);
224    return (map & namechar)!=0;
225}
226
227Bool TY_(IsXMLLetter)(uint c)
228{
229    return ((c >= 0x41 && c <= 0x5a) ||
230        (c >= 0x61 && c <= 0x7a) ||
231        (c >= 0xc0 && c <= 0xd6) ||
232        (c >= 0xd8 && c <= 0xf6) ||
233        (c >= 0xf8 && c <= 0xff) ||
234        (c >= 0x100 && c <= 0x131) ||
235        (c >= 0x134 && c <= 0x13e) ||
236        (c >= 0x141 && c <= 0x148) ||
237        (c >= 0x14a && c <= 0x17e) ||
238        (c >= 0x180 && c <= 0x1c3) ||
239        (c >= 0x1cd && c <= 0x1f0) ||
240        (c >= 0x1f4 && c <= 0x1f5) ||
241        (c >= 0x1fa && c <= 0x217) ||
242        (c >= 0x250 && c <= 0x2a8) ||
243        (c >= 0x2bb && c <= 0x2c1) ||
244        c == 0x386 ||
245        (c >= 0x388 && c <= 0x38a) ||
246        c == 0x38c ||
247        (c >= 0x38e && c <= 0x3a1) ||
248        (c >= 0x3a3 && c <= 0x3ce) ||
249        (c >= 0x3d0 && c <= 0x3d6) ||
250        c == 0x3da ||
251        c == 0x3dc ||
252        c == 0x3de ||
253        c == 0x3e0 ||
254        (c >= 0x3e2 && c <= 0x3f3) ||
255        (c >= 0x401 && c <= 0x40c) ||
256        (c >= 0x40e && c <= 0x44f) ||
257        (c >= 0x451 && c <= 0x45c) ||
258        (c >= 0x45e && c <= 0x481) ||
259        (c >= 0x490 && c <= 0x4c4) ||
260        (c >= 0x4c7 && c <= 0x4c8) ||
261        (c >= 0x4cb && c <= 0x4cc) ||
262        (c >= 0x4d0 && c <= 0x4eb) ||
263        (c >= 0x4ee && c <= 0x4f5) ||
264        (c >= 0x4f8 && c <= 0x4f9) ||
265        (c >= 0x531 && c <= 0x556) ||
266        c == 0x559 ||
267        (c >= 0x561 && c <= 0x586) ||
268        (c >= 0x5d0 && c <= 0x5ea) ||
269        (c >= 0x5f0 && c <= 0x5f2) ||
270        (c >= 0x621 && c <= 0x63a) ||
271        (c >= 0x641 && c <= 0x64a) ||
272        (c >= 0x671 && c <= 0x6b7) ||
273        (c >= 0x6ba && c <= 0x6be) ||
274        (c >= 0x6c0 && c <= 0x6ce) ||
275        (c >= 0x6d0 && c <= 0x6d3) ||
276        c == 0x6d5 ||
277        (c >= 0x6e5 && c <= 0x6e6) ||
278        (c >= 0x905 && c <= 0x939) ||
279        c == 0x93d ||
280        (c >= 0x958 && c <= 0x961) ||
281        (c >= 0x985 && c <= 0x98c) ||
282        (c >= 0x98f && c <= 0x990) ||
283        (c >= 0x993 && c <= 0x9a8) ||
284        (c >= 0x9aa && c <= 0x9b0) ||
285        c == 0x9b2 ||
286        (c >= 0x9b6 && c <= 0x9b9) ||
287        (c >= 0x9dc && c <= 0x9dd) ||
288        (c >= 0x9df && c <= 0x9e1) ||
289        (c >= 0x9f0 && c <= 0x9f1) ||
290        (c >= 0xa05 && c <= 0xa0a) ||
291        (c >= 0xa0f && c <= 0xa10) ||
292        (c >= 0xa13 && c <= 0xa28) ||
293        (c >= 0xa2a && c <= 0xa30) ||
294        (c >= 0xa32 && c <= 0xa33) ||
295        (c >= 0xa35 && c <= 0xa36) ||
296        (c >= 0xa38 && c <= 0xa39) ||
297        (c >= 0xa59 && c <= 0xa5c) ||
298        c == 0xa5e ||
299        (c >= 0xa72 && c <= 0xa74) ||
300        (c >= 0xa85 && c <= 0xa8b) ||
301        c == 0xa8d ||
302        (c >= 0xa8f && c <= 0xa91) ||
303        (c >= 0xa93 && c <= 0xaa8) ||
304        (c >= 0xaaa && c <= 0xab0) ||
305        (c >= 0xab2 && c <= 0xab3) ||
306        (c >= 0xab5 && c <= 0xab9) ||
307        c == 0xabd ||
308        c == 0xae0 ||
309        (c >= 0xb05 && c <= 0xb0c) ||
310        (c >= 0xb0f && c <= 0xb10) ||
311        (c >= 0xb13 && c <= 0xb28) ||
312        (c >= 0xb2a && c <= 0xb30) ||
313        (c >= 0xb32 && c <= 0xb33) ||
314        (c >= 0xb36 && c <= 0xb39) ||
315        c == 0xb3d ||
316        (c >= 0xb5c && c <= 0xb5d) ||
317        (c >= 0xb5f && c <= 0xb61) ||
318        (c >= 0xb85 && c <= 0xb8a) ||
319        (c >= 0xb8e && c <= 0xb90) ||
320        (c >= 0xb92 && c <= 0xb95) ||
321        (c >= 0xb99 && c <= 0xb9a) ||
322        c == 0xb9c ||
323        (c >= 0xb9e && c <= 0xb9f) ||
324        (c >= 0xba3 && c <= 0xba4) ||
325        (c >= 0xba8 && c <= 0xbaa) ||
326        (c >= 0xbae && c <= 0xbb5) ||
327        (c >= 0xbb7 && c <= 0xbb9) ||
328        (c >= 0xc05 && c <= 0xc0c) ||
329        (c >= 0xc0e && c <= 0xc10) ||
330        (c >= 0xc12 && c <= 0xc28) ||
331        (c >= 0xc2a && c <= 0xc33) ||
332        (c >= 0xc35 && c <= 0xc39) ||
333        (c >= 0xc60 && c <= 0xc61) ||
334        (c >= 0xc85 && c <= 0xc8c) ||
335        (c >= 0xc8e && c <= 0xc90) ||
336        (c >= 0xc92 && c <= 0xca8) ||
337        (c >= 0xcaa && c <= 0xcb3) ||
338        (c >= 0xcb5 && c <= 0xcb9) ||
339        c == 0xcde ||
340        (c >= 0xce0 && c <= 0xce1) ||
341        (c >= 0xd05 && c <= 0xd0c) ||
342        (c >= 0xd0e && c <= 0xd10) ||
343        (c >= 0xd12 && c <= 0xd28) ||
344        (c >= 0xd2a && c <= 0xd39) ||
345        (c >= 0xd60 && c <= 0xd61) ||
346        (c >= 0xe01 && c <= 0xe2e) ||
347        c == 0xe30 ||
348        (c >= 0xe32 && c <= 0xe33) ||
349        (c >= 0xe40 && c <= 0xe45) ||
350        (c >= 0xe81 && c <= 0xe82) ||
351        c == 0xe84 ||
352        (c >= 0xe87 && c <= 0xe88) ||
353        c == 0xe8a ||
354        c == 0xe8d ||
355        (c >= 0xe94 && c <= 0xe97) ||
356        (c >= 0xe99 && c <= 0xe9f) ||
357        (c >= 0xea1 && c <= 0xea3) ||
358        c == 0xea5 ||
359        c == 0xea7 ||
360        (c >= 0xeaa && c <= 0xeab) ||
361        (c >= 0xead && c <= 0xeae) ||
362        c == 0xeb0 ||
363        (c >= 0xeb2 && c <= 0xeb3) ||
364        c == 0xebd ||
365        (c >= 0xec0 && c <= 0xec4) ||
366        (c >= 0xf40 && c <= 0xf47) ||
367        (c >= 0xf49 && c <= 0xf69) ||
368        (c >= 0x10a0 && c <= 0x10c5) ||
369        (c >= 0x10d0 && c <= 0x10f6) ||
370        c == 0x1100 ||
371        (c >= 0x1102 && c <= 0x1103) ||
372        (c >= 0x1105 && c <= 0x1107) ||
373        c == 0x1109 ||
374        (c >= 0x110b && c <= 0x110c) ||
375        (c >= 0x110e && c <= 0x1112) ||
376        c == 0x113c ||
377        c == 0x113e ||
378        c == 0x1140 ||
379        c == 0x114c ||
380        c == 0x114e ||
381        c == 0x1150 ||
382        (c >= 0x1154 && c <= 0x1155) ||
383        c == 0x1159 ||
384        (c >= 0x115f && c <= 0x1161) ||
385        c == 0x1163 ||
386        c == 0x1165 ||
387        c == 0x1167 ||
388        c == 0x1169 ||
389        (c >= 0x116d && c <= 0x116e) ||
390        (c >= 0x1172 && c <= 0x1173) ||
391        c == 0x1175 ||
392        c == 0x119e ||
393        c == 0x11a8 ||
394        c == 0x11ab ||
395        (c >= 0x11ae && c <= 0x11af) ||
396        (c >= 0x11b7 && c <= 0x11b8) ||
397        c == 0x11ba ||
398        (c >= 0x11bc && c <= 0x11c2) ||
399        c == 0x11eb ||
400        c == 0x11f0 ||
401        c == 0x11f9 ||
402        (c >= 0x1e00 && c <= 0x1e9b) ||
403        (c >= 0x1ea0 && c <= 0x1ef9) ||
404        (c >= 0x1f00 && c <= 0x1f15) ||
405        (c >= 0x1f18 && c <= 0x1f1d) ||
406        (c >= 0x1f20 && c <= 0x1f45) ||
407        (c >= 0x1f48 && c <= 0x1f4d) ||
408        (c >= 0x1f50 && c <= 0x1f57) ||
409        c == 0x1f59 ||
410        c == 0x1f5b ||
411        c == 0x1f5d ||
412        (c >= 0x1f5f && c <= 0x1f7d) ||
413        (c >= 0x1f80 && c <= 0x1fb4) ||
414        (c >= 0x1fb6 && c <= 0x1fbc) ||
415        c == 0x1fbe ||
416        (c >= 0x1fc2 && c <= 0x1fc4) ||
417        (c >= 0x1fc6 && c <= 0x1fcc) ||
418        (c >= 0x1fd0 && c <= 0x1fd3) ||
419        (c >= 0x1fd6 && c <= 0x1fdb) ||
420        (c >= 0x1fe0 && c <= 0x1fec) ||
421        (c >= 0x1ff2 && c <= 0x1ff4) ||
422        (c >= 0x1ff6 && c <= 0x1ffc) ||
423        c == 0x2126 ||
424        (c >= 0x212a && c <= 0x212b) ||
425        c == 0x212e ||
426        (c >= 0x2180 && c <= 0x2182) ||
427        (c >= 0x3041 && c <= 0x3094) ||
428        (c >= 0x30a1 && c <= 0x30fa) ||
429        (c >= 0x3105 && c <= 0x312c) ||
430        (c >= 0xac00 && c <= 0xd7a3) ||
431        (c >= 0x4e00 && c <= 0x9fa5) ||
432        c == 0x3007 ||
433        (c >= 0x3021 && c <= 0x3029) ||
434        (c >= 0x4e00 && c <= 0x9fa5) ||
435        c == 0x3007 ||
436        (c >= 0x3021 && c <= 0x3029));
437}
438
439Bool TY_(IsXMLNamechar)(uint c)
440{
441    return (TY_(IsXMLLetter)(c) ||
442        c == '.' || c == '_' ||
443        c == ':' || c == '-' ||
444        (c >= 0x300 && c <= 0x345) ||
445        (c >= 0x360 && c <= 0x361) ||
446        (c >= 0x483 && c <= 0x486) ||
447        (c >= 0x591 && c <= 0x5a1) ||
448        (c >= 0x5a3 && c <= 0x5b9) ||
449        (c >= 0x5bb && c <= 0x5bd) ||
450        c == 0x5bf ||
451        (c >= 0x5c1 && c <= 0x5c2) ||
452        c == 0x5c4 ||
453        (c >= 0x64b && c <= 0x652) ||
454        c == 0x670 ||
455        (c >= 0x6d6 && c <= 0x6dc) ||
456        (c >= 0x6dd && c <= 0x6df) ||
457        (c >= 0x6e0 && c <= 0x6e4) ||
458        (c >= 0x6e7 && c <= 0x6e8) ||
459        (c >= 0x6ea && c <= 0x6ed) ||
460        (c >= 0x901 && c <= 0x903) ||
461        c == 0x93c ||
462        (c >= 0x93e && c <= 0x94c) ||
463        c == 0x94d ||
464        (c >= 0x951 && c <= 0x954) ||
465        (c >= 0x962 && c <= 0x963) ||
466        (c >= 0x981 && c <= 0x983) ||
467        c == 0x9bc ||
468        c == 0x9be ||
469        c == 0x9bf ||
470        (c >= 0x9c0 && c <= 0x9c4) ||
471        (c >= 0x9c7 && c <= 0x9c8) ||
472        (c >= 0x9cb && c <= 0x9cd) ||
473        c == 0x9d7 ||
474        (c >= 0x9e2 && c <= 0x9e3) ||
475        c == 0xa02 ||
476        c == 0xa3c ||
477        c == 0xa3e ||
478        c == 0xa3f ||
479        (c >= 0xa40 && c <= 0xa42) ||
480        (c >= 0xa47 && c <= 0xa48) ||
481        (c >= 0xa4b && c <= 0xa4d) ||
482        (c >= 0xa70 && c <= 0xa71) ||
483        (c >= 0xa81 && c <= 0xa83) ||
484        c == 0xabc ||
485        (c >= 0xabe && c <= 0xac5) ||
486        (c >= 0xac7 && c <= 0xac9) ||
487        (c >= 0xacb && c <= 0xacd) ||
488        (c >= 0xb01 && c <= 0xb03) ||
489        c == 0xb3c ||
490        (c >= 0xb3e && c <= 0xb43) ||
491        (c >= 0xb47 && c <= 0xb48) ||
492        (c >= 0xb4b && c <= 0xb4d) ||
493        (c >= 0xb56 && c <= 0xb57) ||
494        (c >= 0xb82 && c <= 0xb83) ||
495        (c >= 0xbbe && c <= 0xbc2) ||
496        (c >= 0xbc6 && c <= 0xbc8) ||
497        (c >= 0xbca && c <= 0xbcd) ||
498        c == 0xbd7 ||
499        (c >= 0xc01 && c <= 0xc03) ||
500        (c >= 0xc3e && c <= 0xc44) ||
501        (c >= 0xc46 && c <= 0xc48) ||
502        (c >= 0xc4a && c <= 0xc4d) ||
503        (c >= 0xc55 && c <= 0xc56) ||
504        (c >= 0xc82 && c <= 0xc83) ||
505        (c >= 0xcbe && c <= 0xcc4) ||
506        (c >= 0xcc6 && c <= 0xcc8) ||
507        (c >= 0xcca && c <= 0xccd) ||
508        (c >= 0xcd5 && c <= 0xcd6) ||
509        (c >= 0xd02 && c <= 0xd03) ||
510        (c >= 0xd3e && c <= 0xd43) ||
511        (c >= 0xd46 && c <= 0xd48) ||
512        (c >= 0xd4a && c <= 0xd4d) ||
513        c == 0xd57 ||
514        c == 0xe31 ||
515        (c >= 0xe34 && c <= 0xe3a) ||
516        (c >= 0xe47 && c <= 0xe4e) ||
517        c == 0xeb1 ||
518        (c >= 0xeb4 && c <= 0xeb9) ||
519        (c >= 0xebb && c <= 0xebc) ||
520        (c >= 0xec8 && c <= 0xecd) ||
521        (c >= 0xf18 && c <= 0xf19) ||
522        c == 0xf35 ||
523        c == 0xf37 ||
524        c == 0xf39 ||
525        c == 0xf3e ||
526        c == 0xf3f ||
527        (c >= 0xf71 && c <= 0xf84) ||
528        (c >= 0xf86 && c <= 0xf8b) ||
529        (c >= 0xf90 && c <= 0xf95) ||
530        c == 0xf97 ||
531        (c >= 0xf99 && c <= 0xfad) ||
532        (c >= 0xfb1 && c <= 0xfb7) ||
533        c == 0xfb9 ||
534        (c >= 0x20d0 && c <= 0x20dc) ||
535        c == 0x20e1 ||
536        (c >= 0x302a && c <= 0x302f) ||
537        c == 0x3099 ||
538        c == 0x309a ||
539        (c >= 0x30 && c <= 0x39) ||
540        (c >= 0x660 && c <= 0x669) ||
541        (c >= 0x6f0 && c <= 0x6f9) ||
542        (c >= 0x966 && c <= 0x96f) ||
543        (c >= 0x9e6 && c <= 0x9ef) ||
544        (c >= 0xa66 && c <= 0xa6f) ||
545        (c >= 0xae6 && c <= 0xaef) ||
546        (c >= 0xb66 && c <= 0xb6f) ||
547        (c >= 0xbe7 && c <= 0xbef) ||
548        (c >= 0xc66 && c <= 0xc6f) ||
549        (c >= 0xce6 && c <= 0xcef) ||
550        (c >= 0xd66 && c <= 0xd6f) ||
551        (c >= 0xe50 && c <= 0xe59) ||
552        (c >= 0xed0 && c <= 0xed9) ||
553        (c >= 0xf20 && c <= 0xf29) ||
554        c == 0xb7 ||
555        c == 0x2d0 ||
556        c == 0x2d1 ||
557        c == 0x387 ||
558        c == 0x640 ||
559        c == 0xe46 ||
560        c == 0xec6 ||
561        c == 0x3005 ||
562        (c >= 0x3031 && c <= 0x3035) ||
563        (c >= 0x309d && c <= 0x309e) ||
564        (c >= 0x30fc && c <= 0x30fe));
565}
566
#if 0
/* Compiled out.  Would test the `lowercase` flag in lexmap[]
** in the same way TY_(IsUpper) tests `uppercase`.
*/
Bool IsLower(uint c)
{
    uint map = MAP(c);

    return (map & lowercase)!=0;
}
#endif
575
576Bool TY_(IsUpper)(uint c)
577{
578    uint map = MAP(c);
579
580    return (map & uppercase)!=0;
581}
582
583uint TY_(ToLower)(uint c)
584{
585    uint map = MAP(c);
586
587    if (map & uppercase)
588        c += 'a' - 'A';
589
590    return c;
591}
592
593uint TY_(ToUpper)(uint c)
594{
595    uint map = MAP(c);
596
597    if (map & lowercase)
598        c += (uint) ('A' - 'a' );
599
600    return c;
601}
602
#if 0
/* Compiled out.  When TidyXmlTags is off, folds c to upper case if
** tocaps is set and to lower case otherwise; with TidyXmlTags on the
** character is returned unchanged.
*/
char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps )
{
    if ( !cfgBool(doc, TidyXmlTags) )
    {
        if ( tocaps )
        {
            c = (tmbchar) ToUpper(c);
        }
        else /* force to lower case */
        {
            c = (tmbchar) ToLower(c);
        }
    }
    return c;
}
#endif
620
621/*
622 return last character in string
623 this is useful when trailing quotemark
624 is missing on an attribute
625*/
626static tmbchar LastChar( tmbstr str )
627{
628    if ( str && *str )
629    {
630        int n = TY_(tmbstrlen)(str);
631        return str[n-1];
632    }
633    return 0;
634}
635
636/*
637   node->type is one of these:
638
639    #define TextNode    1
640    #define StartTag    2
641    #define EndTag      3
642    #define StartEndTag 4
643*/
644
645Lexer* TY_(NewLexer)( TidyDocImpl* doc )
646{
647    Lexer* lexer = (Lexer*) MemAlloc( sizeof(Lexer) );
648
649    if ( lexer != NULL )
650    {
651        ClearMemory( lexer, sizeof(Lexer) );
652
653        lexer->lines = 1;
654        lexer->columns = 1;
655        lexer->state = LEX_CONTENT;
656
657        lexer->versions = (VERS_ALL|VERS_PROPRIETARY);
658        lexer->doctype = VERS_UNKNOWN;
659        lexer->root = &doc->root;
660    }
661    return lexer;
662}
663
664static Bool EndOfInput( TidyDocImpl* doc )
665{
666    assert( doc->docIn != NULL );
667    return ( !doc->docIn->pushed && TY_(IsEOF)(doc->docIn) );
668}
669
670void TY_(FreeLexer)( TidyDocImpl* doc )
671{
672    Lexer *lexer = doc->lexer;
673    if ( lexer )
674    {
675        TY_(FreeStyles)( doc );
676
677        /* See GetToken() */
678        if ( lexer->pushed || lexer->itoken )
679        {
680            if (lexer->pushed)
681                TY_(FreeNode)( doc, lexer->itoken );
682            TY_(FreeNode)( doc, lexer->token );
683        }
684
685        while ( lexer->istacksize > 0 )
686            TY_(PopInline)( doc, NULL );
687
688        MemFree( lexer->istack );
689        MemFree( lexer->lexbuf );
690        MemFree( lexer );
691        doc->lexer = NULL;
692    }
693}
694
695/* Lexer uses bigger memory chunks than pprint as
696** it must hold the entire input document. not just
697** the last line or three.
698*/
699static void AddByte( Lexer *lexer, tmbchar ch )
700{
701    if ( lexer->lexsize + 2 >= lexer->lexlength )
702    {
703        tmbstr buf = NULL;
704        uint allocAmt = lexer->lexlength;
705        while ( lexer->lexsize + 2 >= allocAmt )
706        {
707            if ( allocAmt == 0 )
708                allocAmt = 8192;
709            else
710                allocAmt *= 2;
711        }
712        buf = (tmbstr) MemRealloc( lexer->lexbuf, allocAmt );
713        if ( buf )
714        {
715          ClearMemory( buf + lexer->lexlength,
716                       allocAmt - lexer->lexlength );
717          lexer->lexbuf = buf;
718          lexer->lexlength = allocAmt;
719        }
720    }
721
722    lexer->lexbuf[ lexer->lexsize++ ] = ch;
723    lexer->lexbuf[ lexer->lexsize ]   = '\0';  /* debug */
724}
725
726static void ChangeChar( Lexer *lexer, tmbchar c )
727{
728    if ( lexer->lexsize > 0 )
729    {
730        lexer->lexbuf[ lexer->lexsize-1 ] = c;
731    }
732}
733
734/* store character c as UTF-8 encoded byte stream */
735void TY_(AddCharToLexer)( Lexer *lexer, uint c )
736{
737    int i, err, count = 0;
738    tmbchar buf[10] = {0};
739
740    err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
741    if (err)
742    {
743#if 0 && defined(_DEBUG)
744        fprintf( stderr, "lexer UTF-8 encoding error for U+%x : ", c );
745#endif
746        /* replacement character 0xFFFD encoded as UTF-8 */
747        buf[0] = (byte) 0xEF;
748        buf[1] = (byte) 0xBF;
749        buf[2] = (byte) 0xBD;
750        count = 3;
751    }
752
753    for ( i = 0; i < count; ++i )
754        AddByte( lexer, buf[i] );
755}
756
757static void AddStringToLexer( Lexer *lexer, ctmbstr str )
758{
759    uint c;
760
761    /*  Many (all?) compilers will sign-extend signed chars (the default) when
762    **  converting them to unsigned integer values.  We must cast our char to
763    **  unsigned char before assigning it to prevent this from happening.
764    */
765    while( 0 != (c = (unsigned char) *str++ ))
766        TY_(AddCharToLexer)( lexer, c );
767}
768
769
770static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
771{
772    lexer->lines = doc->docIn->curline;
773    lexer->columns = doc->docIn->curcol;
774}
775
776/*
  No longer attempts to insert missing ';' for unknown
  entities unless one was present already, since this
  gives unexpected results.
780
781  For example:   <a href="something.htm?foo&bar&fred">
782  was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
783  rather than:   <a href="something.htm?foo&amp;bar&amp;fred">
784
785  My thanks for Maurice Buxton for spotting this.
786
787  Also Randy Waki pointed out the following case for the
788  04 Aug 00 version (bug #433012):
789
790  For example:   <a href="something.htm?id=1&lang=en">
791  was tidied to: <a href="something.htm?id=1&lang;=en">
792  rather than:   <a href="something.htm?id=1&amp;lang=en">
793
794  where "lang" is a known entity (#9001), but browsers would
795  misinterpret "&lang;" because it had a value > 256.
796
797  So the case of an apparently known entity with a value > 256 and
798  missing a semicolon is handled specially.
799
800  "ParseEntity" is also a bit of a misnomer - it handles entities and
801  numeric character references. Invalid NCR's are now reported.
802*/
803static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
804{
805    uint start;
806    Bool first = yes, semicolon = no, found = no;
807    Bool isXml = cfgBool( doc, TidyXmlTags );
808    uint c, ch, startcol, entver = 0;
809    Lexer* lexer = doc->lexer;
810
811    start = lexer->lexsize - 1;  /* to start at "&" */
812    startcol = doc->docIn->curcol - 1;
813
814    while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
815    {
816        if ( c == ';' )
817        {
818            semicolon = yes;
819            break;
820        }
821
822        if (first && c == '#')
823        {
824#if SUPPORT_ASIAN_ENCODINGS
825            if ( !cfgBool(doc, TidyNCR) ||
826                 cfg(doc, TidyInCharEncoding) == BIG5 ||
827                 cfg(doc, TidyInCharEncoding) == SHIFTJIS )
828            {
829                TY_(UngetChar)('#', doc->docIn);
830                return;
831            }
832#endif
833            TY_(AddCharToLexer)( lexer, c );
834            first = no;
835            continue;
836        }
837
838        first = no;
839
840        if ( TY_(IsNamechar)(c) )
841        {
842            TY_(AddCharToLexer)( lexer, c );
843            continue;
844        }
845
846        /* otherwise put it back */
847
848        TY_(UngetChar)( c, doc->docIn );
849        break;
850    }
851
852    /* make sure entity is NULL terminated */
853    lexer->lexbuf[lexer->lexsize] = '\0';
854
855    /* Should contrain version to XML/XHTML if &apos;
856    ** is encountered.  But this is not possible with
857    ** Tidy's content model bit mask.
858    */
859    if ( TY_(tmbstrcmp)(lexer->lexbuf+start, "&apos") == 0
860         && !cfgBool(doc, TidyXmlOut)
861         && !lexer->isvoyager
862         && !cfgBool(doc, TidyXhtmlOut) )
863        TY_(ReportEntityError)( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 );
864
865    /* Lookup entity code and version
866    */
867    found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
868
869    /* deal with unrecognized or invalid entities */
870    /* #433012 - fix by Randy Waki 17 Feb 01 */
871    /* report invalid NCR's - Terry Teague 01 Sep 01 */
872    if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') )
873    {
874        /* set error position just before offending character */
875        SetLexerLocus( doc, lexer );
876        lexer->columns = startcol;
877
878        if (lexer->lexsize > start + 1)
879        {
880            if (ch >= 128 && ch <= 159)
881            {
882                /* invalid numeric character reference */
883
884                uint c1 = 0;
885                int replaceMode = DISCARDED_CHAR;
886
887                if ( TY_(ReplacementCharEncoding) == WIN1252 )
888                    c1 = TY_(DecodeWin1252)( ch );
889                else if ( TY_(ReplacementCharEncoding) == MACROMAN )
890                    c1 = TY_(DecodeMacRoman)( ch );
891
892                if ( c1 )
893                    replaceMode = REPLACED_CHAR;
894
895                if ( c != ';' )  /* issue warning if not terminated by ';' */
896                    TY_(ReportEntityError)( doc, MISSING_SEMICOLON_NCR,
897                                       lexer->lexbuf+start, c );
898
899                TY_(ReportEncodingError)(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR);
900
901
902/* Apple Changes:
903   2007-02-07 iccir [4642206] Don't insert invalid characters in raw mode
904   2007-06-27 iccir [5222259] The previous fix for 4642206 disabled TidyQuoteAmpersand when in RAW encoding mode.
905                              Since PPrintChar() has no character look-ahead, I am resorting to quoting the
906                              ampersand in the lexer.
907*/
908#ifdef TIDY_APPLE_CHANGES
909                if ( cfg(doc, TidyOutCharEncoding) != RAW )
910                {
911#endif
912                if ( c1 )
913                {
914                    /* make the replacement */
915                    lexer->lexsize = start;
916                    TY_(AddCharToLexer)( lexer, c1 );
917                    semicolon = no;
918                }
919                else
920                {
921                    /* discard */
922                    lexer->lexsize = start;
923                    semicolon = no;
924                }
925#ifdef TIDY_APPLE_CHANGES
926                }
927#endif
928            }
929            else
930                TY_(ReportEntityError)( doc, UNKNOWN_ENTITY,
931                                   lexer->lexbuf+start, ch );
932
933            if (semicolon)
934                TY_(AddCharToLexer)( lexer, ';' );
935        }
936        else /* naked & */
937#ifdef TIDY_APPLE_CHANGES
938        {
939            if ( (cfg(doc, TidyOutCharEncoding) == RAW && cfgBool(doc, TidyQuoteAmpersand)) )
940                AddStringToLexer( lexer, "amp;" );
941#endif
942            TY_(ReportEntityError)( doc, UNESCAPED_AMPERSAND,
943                               lexer->lexbuf+start, ch );
944#ifdef TIDY_APPLE_CHANGES
945        }
946#endif
947    }
948    else
949    {
950        if ( c != ';' )    /* issue warning if not terminated by ';' */
951        {
952            /* set error position just before offending chararcter */
953            SetLexerLocus( doc, lexer );
954            lexer->columns = startcol;
955            TY_(ReportEntityError)( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c );
956        }
957
958#ifdef TIDY_APPLE_CHANGES
959        if ( cfg(doc, TidyOutCharEncoding) == RAW )
960        {
961            AddCharToLexer( lexer, ';' );
962        }
963        else
964        {
965#endif
966        lexer->lexsize = start;
967        if ( ch == 160 && (mode == Preformatted) )
968            ch = ' ';
969        TY_(AddCharToLexer)( lexer, ch );
970
971        if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) )
972            AddStringToLexer( lexer, "amp;" );
973
974        /* Detect extended vs. basic entities */
975        TY_(ConstrainVersion)( doc, entver );
976#ifdef TIDY_APPLE_CHANGES
977        }
978#endif
979    }
980}
981
982static tmbchar ParseTagName( TidyDocImpl* doc )
983{
984    Lexer *lexer = doc->lexer;
985    uint c = lexer->lexbuf[ lexer->txtstart ];
986    Bool xml = cfgBool(doc, TidyXmlTags);
987
988    /* fold case of first character in buffer */
989    if (!xml && TY_(IsUpper)(c))
990        lexer->lexbuf[lexer->txtstart] = (tmbchar) TY_(ToLower)(c);
991
992    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
993    {
994        if ((!xml && !TY_(IsNamechar)(c)) ||
995            (xml && !TY_(IsXMLNamechar)(c)))
996            break;
997
998        /* fold case of subsequent characters */
999        if (!xml && TY_(IsUpper)(c))
1000             c = TY_(ToLower)(c);
1001
1002        TY_(AddCharToLexer)(lexer, c);
1003    }
1004
1005    lexer->txtend = lexer->lexsize;
1006    return (tmbchar) c;
1007}
1008
1009/*
1010  Used for elements and text nodes
1011  element name is NULL for text nodes
1012  start and end are offsets into lexbuf
1013  which contains the textual content of
1014  all elements in the parse tree.
1015
1016  parent and content allow traversal
1017  of the parse tree in any direction.
1018  attributes are represented as a linked
1019  list of AttVal nodes which hold the
1020  strings for attribute/value pairs.
1021*/
1022
1023
1024Node *TY_(NewNode)(Lexer *lexer)
1025{
1026    Node* node = (Node*) MemAlloc( sizeof(Node) );
1027    ClearMemory( node, sizeof(Node) );
1028    if ( lexer )
1029    {
1030        node->line = lexer->lines;
1031        node->column = lexer->columns;
1032    }
1033    node->type = TextNode;
1034    return node;
1035}
1036
1037/* used to clone heading nodes when split by an <HR> */
1038Node *TY_(CloneNode)( TidyDocImpl* doc, Node *element )
1039{
1040    Lexer* lexer = doc->lexer;
1041    Node *node = TY_(NewNode)( lexer );
1042
1043    node->start = lexer->lexsize;
1044    node->end   = lexer->lexsize;
1045
1046    if ( element )
1047    {
1048        node->parent     = element->parent;
1049        node->type       = element->type;
1050        node->closed     = element->closed;
1051        node->implicit   = element->implicit;
1052        node->tag        = element->tag;
1053        node->element    = TY_(tmbstrdup)( element->element );
1054        node->attributes = TY_(DupAttrs)( doc, element->attributes );
1055    }
1056    return node;
1057}
1058
1059/* free node's attributes */
1060void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node )
1061{
1062
1063    while ( node->attributes )
1064    {
1065        AttVal *av = node->attributes;
1066
1067        if ( av->attribute )
1068        {
1069            if ( (attrIsID(av) || attrIsNAME(av)) &&
1070                 TY_(IsAnchorElement)(doc, node) )
1071            {
1072                TY_(RemoveAnchorByNode)( doc, node );
1073            }
1074        }
1075
1076        node->attributes = av->next;
1077        TY_(FreeAttribute)( doc, av );
1078    }
1079}
1080
1081/* doesn't repair attribute list linkage */
1082void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av )
1083{
1084    TY_(FreeNode)( doc, av->asp );
1085    TY_(FreeNode)( doc, av->php );
1086    MemFree( av->attribute );
1087    MemFree( av->value );
1088    MemFree( av );
1089}
1090
1091/* detach attribute from node
1092*/
1093void TY_(DetachAttribute)( Node *node, AttVal *attr )
1094{
1095    AttVal *av, *prev = NULL;
1096
1097    for ( av = node->attributes; av; av = av->next )
1098    {
1099        if ( av == attr )
1100        {
1101            if ( prev )
1102                prev->next = attr->next;
1103            else
1104                node->attributes = attr->next;
1105            break;
1106        }
1107        prev = av;
1108    }
1109}
1110
1111/* detach attribute from node then free it
1112*/
1113void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr )
1114{
1115    TY_(DetachAttribute)( node, attr );
1116    TY_(FreeAttribute)( doc, attr );
1117}
1118
1119/*
1120  Free document nodes by iterating through peers and recursing
1121  through children. Set next to NULL before calling TY_(FreeNode)()
1122  to avoid freeing peer nodes. Doesn't patch up prev/next links.
1123 */
1124void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
1125{
1126    while ( node )
1127    {
1128        Node* next = node->next;
1129
1130        TY_(FreeAttrs)( doc, node );
1131        TY_(FreeNode)( doc, node->content );
1132        MemFree( node->element );
1133#ifdef TIDY_STORE_ORIGINAL_TEXT
1134        if (node->otext)
1135            MemFree(node->otext);
1136#endif
1137        if (RootNode != node->type)
1138            MemFree( node );
1139        else
1140            node->content = NULL;
1141
1142        node = next;
1143    }
1144}
1145
#ifdef TIDY_STORE_ORIGINAL_TEXT
/*
  Move captured original text from the input stream onto node.

  Nothing happens when doc->storeText is off, when count is not
  smaller than the number of captured characters, or when the stream
  has no capture buffer.

  count == 0: node takes over the stream's whole buffer and the
  stream's capture state is reset.

  count > 0: the capture is split. node->otext receives a copy of all
  but the last count characters; the last count characters become the
  stream's new capture buffer. Both copies are NUL-terminated and the
  old buffer is freed.
*/
void StoreOriginalTextInToken(TidyDocImpl* doc, Node* node, uint count)
{
    uint total, taken, ix;
    tmbstr forNode, forStream;

    if ( !doc->storeText
         || count >= doc->docIn->otextlen
         || !doc->docIn->otextsize )
        return;

    if ( count == 0 )
    {
        /* hand the buffer over without copying */
        node->otext = doc->docIn->otextbuf;
        doc->docIn->otextbuf = NULL;
        doc->docIn->otextlen = 0;
        doc->docIn->otextsize = 0;
        return;
    }

    total = doc->docIn->otextlen;
    taken = total - count;

    forNode   = (tmbstr) MemAlloc( taken + 1 );
    forStream = (tmbstr) MemAlloc( count + 1 );

    for ( ix = 0; ix < taken; ++ix )
        forNode[ix] = doc->docIn->otextbuf[ix];
    forNode[taken] = 0;

    for ( ix = 0; ix < count; ++ix )
        forStream[ix] = doc->docIn->otextbuf[taken + ix];
    forStream[count] = 0;

    MemFree( doc->docIn->otextbuf );
    node->otext = forNode;
    doc->docIn->otextbuf = forStream;
    doc->docIn->otextlen = count;
    doc->docIn->otextsize = count + 1;
}
#endif
1192
1193Node* TY_(TextToken)( Lexer *lexer )
1194{
1195    Node *node = TY_(NewNode)( lexer );
1196    node->start = lexer->txtstart;
1197    node->end = lexer->txtend;
1198    return node;
1199}
1200
1201/* used for creating preformatted text from Word2000 */
1202Node *TY_(NewLineNode)( Lexer *lexer )
1203{
1204    Node *node = TY_(NewNode)( lexer );
1205    node->start = lexer->lexsize;
1206    TY_(AddCharToLexer)( lexer, (uint)'\n' );
1207    node->end = lexer->lexsize;
1208    return node;
1209}
1210
1211/* used for adding a &nbsp; for Word2000 */
1212Node* TY_(NewLiteralTextNode)( Lexer *lexer, ctmbstr txt )
1213{
1214    Node *node = TY_(NewNode)( lexer );
1215    node->start = lexer->lexsize;
1216    AddStringToLexer( lexer, txt );
1217    node->end = lexer->lexsize;
1218    return node;
1219}
1220
1221static Node* TagToken( TidyDocImpl* doc, NodeType type )
1222{
1223    Lexer* lexer = doc->lexer;
1224    Node* node = TY_(NewNode)( lexer );
1225    node->type = type;
1226    node->element = TY_(tmbstrndup)( lexer->lexbuf + lexer->txtstart,
1227                                     lexer->txtend - lexer->txtstart );
1228    node->start = lexer->txtstart;
1229    node->end = lexer->txtstart;
1230
1231    if ( type == StartTag || type == StartEndTag || type == EndTag )
1232        TY_(FindTag)(doc, node);
1233
1234    return node;
1235}
1236
1237static Node* NewToken(TidyDocImpl* doc, NodeType type)
1238{
1239    Lexer* lexer = doc->lexer;
1240    Node* node = TY_(NewNode)(lexer);
1241    node->type = type;
1242    node->start = lexer->txtstart;
1243    node->end = lexer->txtend;
1244#ifdef TIDY_STORE_ORIGINAL_TEXT
1245    StoreOriginalTextInToken(doc, node, 0);
1246#endif
1247    return node;
1248}
1249
1250#define CommentToken(doc) NewToken(doc, CommentTag)
1251#define DocTypeToken(doc) NewToken(doc, DocTypeTag)
1252#define PIToken(doc)      NewToken(doc, ProcInsTag)
1253#define AspToken(doc)     NewToken(doc, AspTag)
1254#define JsteToken(doc)    NewToken(doc, JsteTag)
1255#define PhpToken(doc)     NewToken(doc, PhpTag)
1256#define XmlDeclToken(doc) NewToken(doc, XmlDecl)
1257#define SectionToken(doc) NewToken(doc, SectionTag)
1258#define CDATAToken(doc)   NewToken(doc, CDATATag)
1259
1260void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str )
1261{
1262    byte c;
1263    while(0 != (c = *str++) )
1264        TY_(AddCharToLexer)( lexer, c );
1265}
1266
1267/*
1268void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len )
1269{
1270    byte c;
1271    int ix;
1272
1273    for ( ix=0; ix < len && (c = *str++); ++ix )
1274        TY_(AddCharToLexer)(lexer, c);
1275}
1276*/
1277
1278/* find doctype element */
1279Node *TY_(FindDocType)( TidyDocImpl* doc )
1280{
1281    Node* node;
1282    for ( node = (doc ? doc->root.content : NULL);
1283          node && node->type != DocTypeTag;
1284          node = node->next )
1285        /**/;
1286    return node;
1287}
1288
1289/* find parent container element */
1290Node* TY_(FindContainer)( Node* node )
1291{
1292    for ( node = (node ? node->parent : NULL);
1293          node && TY_(nodeHasCM)(node, CM_INLINE);
1294          node = node->parent )
1295        /**/;
1296
1297    return node;
1298}
1299
1300
1301/* find html element */
1302Node *TY_(FindHTML)( TidyDocImpl* doc )
1303{
1304    Node *node;
1305    for ( node = (doc ? doc->root.content : NULL);
1306          node && !nodeIsHTML(node);
1307          node = node->next )
1308        /**/;
1309
1310    return node;
1311}
1312
1313/* find XML Declaration */
1314Node *TY_(FindXmlDecl)(TidyDocImpl* doc)
1315{
1316    Node *node;
1317    for ( node = (doc ? doc->root.content : NULL);
1318          node && !(node->type == XmlDecl);
1319          node = node->next )
1320        /**/;
1321
1322    return node;
1323}
1324
1325
1326Node *TY_(FindHEAD)( TidyDocImpl* doc )
1327{
1328    Node *node = TY_(FindHTML)( doc );
1329
1330    if ( node )
1331    {
1332        for ( node = node->content;
1333              node && !nodeIsHEAD(node);
1334              node = node->next )
1335            /**/;
1336    }
1337
1338    return node;
1339}
1340
1341Node *TY_(FindTITLE)(TidyDocImpl* doc)
1342{
1343    Node *node = TY_(FindHEAD)(doc);
1344
1345    if (node)
1346        for (node = node->content;
1347             node && !nodeIsTITLE(node);
1348             node = node->next) {}
1349
1350    return node;
1351}
1352
1353Node *TY_(FindBody)( TidyDocImpl* doc )
1354{
1355    Node *node = ( doc ? doc->root.content : NULL );
1356
1357    while ( node && !nodeIsHTML(node) )
1358        node = node->next;
1359
1360    if (node == NULL)
1361        return NULL;
1362
1363    node = node->content;
1364    while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) )
1365        node = node->next;
1366
1367    if ( node && nodeIsFRAMESET(node) )
1368    {
1369        node = node->content;
1370        while ( node && !nodeIsNOFRAMES(node) )
1371            node = node->next;
1372
1373        if ( node )
1374        {
1375            node = node->content;
1376            while ( node && !nodeIsBODY(node) )
1377                node = node->next;
1378        }
1379    }
1380
1381    return node;
1382}
1383
1384/* add meta element for Tidy */
1385Bool TY_(AddGenerator)( TidyDocImpl* doc )
1386{
1387    AttVal *attval;
1388    Node *node;
1389    Node *head = TY_(FindHEAD)( doc );
1390    tmbchar buf[256];
1391
1392    if (head)
1393    {
1394#ifdef PLATFORM_NAME
1395        TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for "PLATFORM_NAME" (vers %s), see www.w3.org",
1396                         tidyReleaseDate());
1397#else
1398        TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy (vers %s), see www.w3.org", tidyReleaseDate());
1399#endif
1400
1401        for ( node = head->content; node; node = node->next )
1402        {
1403            if ( nodeIsMETA(node) )
1404            {
1405                attval = TY_(AttrGetById)(node, TidyAttr_NAME);
1406
1407                if (AttrValueIs(attval, "generator"))
1408                {
1409                    attval = TY_(AttrGetById)(node, TidyAttr_CONTENT);
1410
1411                    if (AttrHasValue(attval) &&
1412                        TY_(tmbstrncasecmp)(attval->value, "HTML Tidy", 9) == 0)
1413                    {
1414                        /* update the existing content to reflect the */
1415                        /* actual version of Tidy currently being used */
1416
1417                        MemFree(attval->value);
1418                        attval->value = TY_(tmbstrdup)(buf);
1419                        return no;
1420                    }
1421                }
1422            }
1423        }
1424
1425        if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
1426        {
1427            node = TY_(InferredTag)(doc, TidyTag_META);
1428            TY_(AddAttribute)( doc, node, "name", "generator" );
1429            TY_(AddAttribute)( doc, node, "content", buf );
1430            TY_(InsertNodeAtStart)( head, node );
1431            return yes;
1432        }
1433    }
1434
1435    return no;
1436}
1437
1438/* examine <!DOCTYPE> to identify version */
1439static uint FindGivenVersion( TidyDocImpl* doc, Node* doctype )
1440{
1441    AttVal * fpi = TY_(GetAttrByName)(doctype, "PUBLIC");
1442    uint vers;
1443
1444    if (!fpi || !fpi->value)
1445        return VERS_UNKNOWN;
1446
1447    vers = GetVersFromFPI(fpi->value);
1448
1449    if (VERS_XHTML & vers)
1450    {
1451        TY_(SetOptionBool)(doc, TidyXmlOut, yes);
1452        TY_(SetOptionBool)(doc, TidyXhtmlOut, yes);
1453        doc->lexer->isvoyager = yes;
1454    }
1455
1456    /* todo: add a warning if case does not match? */
1457    MemFree(fpi->value);
1458    fpi->value = TY_(tmbstrdup)(GetFPIFromVers(vers));
1459
1460    return vers;
1461}
1462
1463/* return guessed version */
1464uint TY_(ApparentVersion)( TidyDocImpl* doc )
1465{
1466    if ((doc->lexer->doctype == XH11 ||
1467         doc->lexer->doctype == XB10) &&
1468        (doc->lexer->versions & doc->lexer->doctype))
1469        return doc->lexer->doctype;
1470    else
1471        return TY_(HTMLVersion)(doc);
1472}
1473
1474ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool ARG_UNUSED(isXhtml) )
1475{
1476    ctmbstr name = GetNameFromVers(vers);
1477
1478    /* this test has moved to ReportMarkupVersion() in localize.c, for localization reasons */
1479    /*
1480    if (!name)
1481        name = "HTML Proprietary";
1482     */
1483
1484    return name;
1485}
1486
1487Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc )
1488{
1489    Bool isXhtml = doc->lexer->isvoyager;
1490    Node* doctype;
1491
1492    /* Do not warn in XHTML mode */
1493    if ( isXhtml )
1494        return no;
1495
1496    /* Do not warn if emitted doctype is proprietary */
1497    if ( TY_(HTMLVersionNameFromCode)(doc->lexer->versionEmitted, isXhtml ) == NULL )
1498        return no;
1499
1500    /* Do not warn if no SI is possible */
1501    if ( GetSIFromVers(doc->lexer->versionEmitted) == NULL )
1502        return no;
1503
1504    if ( (doctype = TY_(FindDocType)( doc )) != NULL
1505         && TY_(GetAttrByName)(doctype, "SYSTEM") == NULL )
1506        return yes;
1507
1508    return no;
1509}
1510
1511
1512/* Put DOCTYPE declaration between the
1513** <?xml version "1.0" ... ?> declaration, if any,
1514** and the <html> tag.  Should also work for any comments,
1515** etc. that may precede the <html> tag.
1516*/
1517
1518static Node* NewDocTypeNode( TidyDocImpl* doc )
1519{
1520    Node* doctype = NULL;
1521    Node* html = TY_(FindHTML)( doc );
1522
1523    if ( !html )
1524        return NULL;
1525
1526    doctype = TY_(NewNode)( NULL );
1527    doctype->type = DocTypeTag;
1528    TY_(InsertNodeBeforeElement)(html, doctype);
1529    return doctype;
1530}
1531
1532Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc )
1533{
1534    Lexer *lexer = doc->lexer;
1535    Node *doctype = TY_(FindDocType)( doc );
1536    TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
1537    ctmbstr pub = "PUBLIC";
1538    ctmbstr sys = "SYSTEM";
1539
1540    lexer->versionEmitted = TY_(ApparentVersion)( doc );
1541
1542    if (dtmode == TidyDoctypeOmit)
1543    {
1544        if (doctype)
1545            TY_(DiscardElement)(doc, doctype);
1546        return yes;
1547    }
1548
1549    if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype))
1550        return no;
1551
1552    if (!doctype)
1553    {
1554        doctype = NewDocTypeNode(doc);
1555        doctype->element = TY_(tmbstrdup)("html");
1556    }
1557    else
1558    {
1559        doctype->element = TY_(tmbstrtolower)(doctype->element);
1560    }
1561
1562    switch(dtmode)
1563    {
1564    case TidyDoctypeStrict:
1565        /* XHTML 1.0 Strict */
1566        TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
1567        TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
1568        lexer->versionEmitted = X10S;
1569        break;
1570    case TidyDoctypeLoose:
1571        /* XHTML 1.0 Transitional */
1572        TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
1573        TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
1574        lexer->versionEmitted = X10T;
1575        break;
1576    case TidyDoctypeUser:
1577        /* user defined document type declaration */
1578        TY_(RepairAttrValue)(doc, doctype, pub, cfgStr(doc, TidyDoctype));
1579        TY_(RepairAttrValue)(doc, doctype, sys, "");
1580        break;
1581    case TidyDoctypeAuto:
1582        if (lexer->versions & XH11 && lexer->doctype == XH11)
1583        {
1584            if (!TY_(GetAttrByName)(doctype, sys))
1585                TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
1586            lexer->versionEmitted = XH11;
1587            return yes;
1588        }
1589        else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40))
1590        {
1591            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(XH11));
1592            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
1593            lexer->versionEmitted = XH11;
1594        }
1595        else if (lexer->versions & XB10 && lexer->doctype == XB10)
1596        {
1597            if (!TY_(GetAttrByName)(doctype, sys))
1598                TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XB10));
1599            lexer->versionEmitted = XB10;
1600            return yes;
1601        }
1602        else if (lexer->versions & VERS_HTML40_STRICT)
1603        {
1604            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
1605            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
1606            lexer->versionEmitted = X10S;
1607        }
1608        else if (lexer->versions & VERS_FRAMESET)
1609        {
1610            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10F));
1611            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10F));
1612            lexer->versionEmitted = X10F;
1613        }
1614        else if (lexer->versions & VERS_LOOSE)
1615        {
1616            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
1617            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
1618            lexer->versionEmitted = X10T;
1619        }
1620        else
1621        {
1622            if (doctype)
1623                TY_(DiscardElement)(doc, doctype);
1624            return no;
1625        }
1626        break;
1627    }
1628
1629    return no;
1630}
1631
1632/* fixup doctype if missing */
1633Bool TY_(FixDocType)( TidyDocImpl* doc )
1634{
1635    Lexer* lexer = doc->lexer;
1636    Node* doctype = TY_(FindDocType)( doc );
1637    uint dtmode = cfg( doc, TidyDoctypeMode );
1638    uint guessed = VERS_UNKNOWN;
1639    Bool hadSI = no;
1640
1641    if (dtmode == TidyDoctypeAuto &&
1642        lexer->versions & lexer->doctype &&
1643        !(VERS_XHTML & lexer->doctype && !lexer->isvoyager)
1644        && TY_(FindDocType)(doc))
1645    {
1646        lexer->versionEmitted = lexer->doctype;
1647        return yes;
1648    }
1649
1650    if (dtmode == TidyDoctypeOmit)
1651    {
1652        if (doctype)
1653            TY_(DiscardElement)( doc, doctype );
1654        lexer->versionEmitted = TY_(ApparentVersion)( doc );
1655        return yes;
1656    }
1657
1658    if (cfgBool(doc, TidyXmlOut))
1659        return yes;
1660
1661    if (doctype)
1662        hadSI = TY_(GetAttrByName)(doctype, "SYSTEM") != NULL;
1663
1664    if ((dtmode == TidyDoctypeStrict ||
1665         dtmode == TidyDoctypeLoose) && doctype)
1666    {
1667        TY_(DiscardElement)(doc, doctype);
1668        doctype = NULL;
1669    }
1670
1671    switch (dtmode)
1672    {
1673    case TidyDoctypeStrict:
1674        guessed = H41S;
1675        break;
1676    case TidyDoctypeLoose:
1677        guessed = H41T;
1678        break;
1679    case TidyDoctypeAuto:
1680        guessed = TY_(HTMLVersion)(doc);
1681        break;
1682    }
1683
1684    lexer->versionEmitted = guessed;
1685    if (guessed == VERS_UNKNOWN)
1686        return no;
1687
1688    if (doctype)
1689    {
1690        doctype->element = TY_(tmbstrtolower)(doctype->element);
1691    }
1692    else
1693    {
1694        doctype = NewDocTypeNode(doc);
1695        doctype->element = TY_(tmbstrdup)("html");
1696    }
1697
1698    TY_(RepairAttrValue)(doc, doctype, "PUBLIC", GetFPIFromVers(guessed));
1699
1700    if (hadSI)
1701        TY_(RepairAttrValue)(doc, doctype, "SYSTEM", GetSIFromVers(guessed));
1702
1703    return yes;
1704}
1705
1706/* ensure XML document starts with <?xml version="1.0"?> */
1707/* add encoding attribute if not using ASCII or UTF-8 output */
1708Bool TY_(FixXmlDecl)( TidyDocImpl* doc )
1709{
1710    Node* xml;
1711    AttVal *version, *encoding;
1712    Lexer*lexer = doc->lexer;
1713    Node* root = &doc->root;
1714
1715    if ( root->content && root->content->type == XmlDecl )
1716    {
1717        xml = root->content;
1718    }
1719    else
1720    {
1721        xml = TY_(NewNode)(lexer);
1722        xml->type = XmlDecl;
1723        if ( root->content )
1724            TY_(InsertNodeBeforeElement)(root->content, xml);
1725        else
1726            root->content = xml;
1727    }
1728
1729    version = TY_(GetAttrByName)(xml, "version");
1730    encoding = TY_(GetAttrByName)(xml, "encoding");
1731
1732    /*
1733      We need to insert a check if declared encoding
1734      and output encoding mismatch and fix the XML
1735      declaration accordingly!!!
1736    */
1737
1738    if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 )
1739    {
1740        ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
1741        if ( enc )
1742            TY_(AddAttribute)( doc, xml, "encoding", enc );
1743    }
1744
1745    if ( version == NULL )
1746        TY_(AddAttribute)( doc, xml, "version", "1.0" );
1747    return yes;
1748}
1749
1750Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id)
1751{
1752    Lexer *lexer = doc->lexer;
1753    Node *node = TY_(NewNode)( lexer );
1754    const Dict* dict = TY_(LookupTagDef)(id);
1755
1756    assert( dict != NULL );
1757
1758    node->type = StartTag;
1759    node->implicit = yes;
1760    node->element = TY_(tmbstrdup)(dict->name);
1761    node->tag = dict;
1762    node->start = lexer->txtstart;
1763    node->end = lexer->txtend;
1764
1765    return node;
1766}
1767
1768static Bool ExpectsContent(Node *node)
1769{
1770    if (node->type != StartTag)
1771        return no;
1772
1773    /* unknown element? */
1774    if (node->tag == NULL)
1775        return yes;
1776
1777    if (node->tag->model & CM_EMPTY)
1778        return no;
1779
1780    return yes;
1781}
1782
1783/*
1784  create a text node for the contents of
1785  a CDATA element like style or script
1786  which ends with </foo> for some foo.
1787*/
1788
1789typedef enum
1790{
1791    CDATA_INTERMEDIATE,
1792    CDATA_STARTTAG,
1793    CDATA_ENDTAG
1794} CDATAState;
1795
1796static Node *GetCDATA( TidyDocImpl* doc, Node *container )
1797{
1798    Lexer* lexer = doc->lexer;
1799    uint start = 0;
1800    int nested = 0;
1801    CDATAState state = CDATA_INTERMEDIATE;
1802    uint i;
1803    Bool isEmpty = yes;
1804    Bool matches = no;
1805    uint c;
1806    Bool hasSrc = TY_(AttrGetById)(container, TidyAttr_SRC) != NULL;
1807
1808    SetLexerLocus( doc, lexer );
1809    lexer->waswhite = no;
1810    lexer->txtstart = lexer->txtend = lexer->lexsize;
1811
1812    /* seen start tag, look for matching end tag */
1813    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
1814    {
1815        TY_(AddCharToLexer)(lexer, c);
1816        lexer->txtend = lexer->lexsize;
1817
1818        if (state == CDATA_INTERMEDIATE)
1819        {
1820            if (c != '<')
1821            {
1822                if (isEmpty && !TY_(IsWhite)(c))
1823                    isEmpty = no;
1824                continue;
1825            }
1826
1827            c = TY_(ReadChar)(doc->docIn);
1828
1829            if (TY_(IsLetter)(c))
1830            {
1831                /* <head><script src=foo><meta name=foo content=bar>*/
1832                if (hasSrc && isEmpty && nodeIsSCRIPT(container))
1833                {
1834                    /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
1835                    lexer->lexsize = lexer->txtstart;
1836                    TY_(UngetChar)(c, doc->docIn);
1837                    TY_(UngetChar)('<', doc->docIn);
1838                    return NULL;
1839                }
1840                TY_(AddCharToLexer)(lexer, c);
1841                start = lexer->lexsize - 1;
1842                state = CDATA_STARTTAG;
1843            }
1844            else if (c == '/')
1845            {
1846                TY_(AddCharToLexer)(lexer, c);
1847
1848                c = TY_(ReadChar)(doc->docIn);
1849
1850                if (!TY_(IsLetter)(c))
1851                {
1852                    TY_(UngetChar)(c, doc->docIn);
1853                    continue;
1854                }
1855                TY_(UngetChar)(c, doc->docIn);
1856
1857                start = lexer->lexsize;
1858                state = CDATA_ENDTAG;
1859            }
1860            else if (c == '\\')
1861            {
1862                /* recognize document.write("<script><\/script>") */
1863                TY_(AddCharToLexer)(lexer, c);
1864
1865                c = TY_(ReadChar)(doc->docIn);
1866
1867                if (c != '/')
1868                {
1869                    TY_(UngetChar)(c, doc->docIn);
1870                    continue;
1871                }
1872
1873                TY_(AddCharToLexer)(lexer, c);
1874                c = TY_(ReadChar)(doc->docIn);
1875
1876                if (!TY_(IsLetter)(c))
1877                {
1878                    TY_(UngetChar)(c, doc->docIn);
1879                    continue;
1880                }
1881                TY_(UngetChar)(c, doc->docIn);
1882
1883                start = lexer->lexsize;
1884                state = CDATA_ENDTAG;
1885            }
1886            else
1887            {
1888                TY_(UngetChar)(c, doc->docIn);
1889            }
1890        }
1891        /* '<' + Letter found */
1892        else if (state == CDATA_STARTTAG)
1893        {
1894            if (TY_(IsLetter)(c))
1895                continue;
1896
1897            matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
1898                                     TY_(tmbstrlen)(container->element)) == 0;
1899            if (matches)
1900                nested++;
1901
1902            state = CDATA_INTERMEDIATE;
1903        }
1904        /* '<' + '/' + Letter found */
1905        else if (state == CDATA_ENDTAG)
1906        {
1907            if (TY_(IsLetter)(c))
1908                continue;
1909
1910            matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
1911                                     TY_(tmbstrlen)(container->element)) == 0;
1912
1913            if (isEmpty && !matches)
1914            {
1915                /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
1916
1917                for (i = lexer->lexsize - 1; i >= start; --i)
1918                    TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
1919                TY_(UngetChar)('/', doc->docIn);
1920                TY_(UngetChar)('<', doc->docIn);
1921                break;
1922            }
1923
1924            if (matches && nested-- <= 0)
1925            {
1926                for (i = lexer->lexsize - 1; i >= start; --i)
1927                    TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
1928                TY_(UngetChar)('/', doc->docIn);
1929                TY_(UngetChar)('<', doc->docIn);
1930                lexer->lexsize -= (lexer->lexsize - start) + 2;
1931                break;
1932            }
1933            else if (lexer->lexbuf[start - 2] != '\\')
1934            {
1935                /* if the end tag is not already escaped using backslash */
1936                SetLexerLocus( doc, lexer );
1937                lexer->columns -= 3;
1938                TY_(ReportError)(doc, NULL, NULL, BAD_CDATA_CONTENT);
1939
1940                /* if javascript insert backslash before / */
1941                if (TY_(IsJavaScript)(container))
1942                {
1943                    for (i = lexer->lexsize; i > start-1; --i)
1944                        lexer->lexbuf[i] = lexer->lexbuf[i-1];
1945
1946                    lexer->lexbuf[start-1] = '\\';
1947                    lexer->lexsize++;
1948                }
1949            }
1950            state = CDATA_INTERMEDIATE;
1951        }
1952    }
1953    if (isEmpty)
1954        lexer->lexsize = lexer->txtstart = lexer->txtend;
1955    else
1956        lexer->txtend = lexer->lexsize;
1957
1958    if (c == EndOfStream)
1959        TY_(ReportError)(doc, container, NULL, MISSING_ENDTAG_FOR );
1960
1961/* this was disabled for some reason... */
1962#if 0
1963    if (lexer->txtend > lexer->txtstart)
1964        return TextToken(lexer);
1965    else
1966        return NULL;
1967#else
1968    return TY_(TextToken)(lexer);
1969#endif
1970}
1971
1972void TY_(UngetToken)( TidyDocImpl* doc )
1973{
1974    doc->lexer->pushed = yes;
1975}
1976
/*
  CondReturnTextNode(doc, skip)

  If the lexer's current text range is non-empty, builds a text token
  for it, stores it in lexer->token and returns it from the enclosing
  function. Expands to a bare if-statement and relies on a variable
  named lexer being in scope at the point of use.

  With TIDY_STORE_ORIGINAL_TEXT the captured original text is attached
  to the token as well, with skip passed as the count argument of
  StoreOriginalTextInToken(); without it both macro arguments are
  unused.
*/
#ifdef TIDY_STORE_ORIGINAL_TEXT
#define CondReturnTextNode(doc, skip)                                  \
            if (lexer->txtend > lexer->txtstart)                       \
            {                                                          \
                lexer->token = TY_(TextToken)(lexer);                  \
                StoreOriginalTextInToken(doc, lexer->token, skip);     \
                return lexer->token;                                   \
            }
#else
#define CondReturnTextNode(doc, skip)                                  \
            if (lexer->txtend > lexer->txtstart)                       \
            {                                                          \
                lexer->token = TY_(TextToken)(lexer);                  \
                return lexer->token;                                   \
            }
#endif
1993
1994/*
1995  modes for GetToken()
1996
1997  MixedContent   -- for elements which don't accept PCDATA
1998  Preformatted   -- white space preserved as is
1999  IgnoreMarkup   -- for CDATA elements such as script, style
2000*/
2001static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode );
2002
2003Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode )
2004{
2005    Lexer* lexer = doc->lexer;
2006
2007    if (lexer->pushed || lexer->itoken)
2008    {
2009        /* Deal with previously returned duplicate inline token */
2010        if (lexer->itoken)
2011        {
2012            /* itoken rejected */
2013            if (lexer->pushed)
2014            {
2015                lexer->pushed = no;
2016                return lexer->itoken;
2017            }
2018            /* itoken has been accepted */
2019            lexer->itoken = NULL;
2020        }
2021
2022        /* duplicate inlines in preference to pushed text nodes when appropriate */
2023        lexer->pushed = no;
2024        if (lexer->token->type != TextNode
2025            || !(lexer->insert || lexer->inode))
2026            return lexer->token;
2027        return lexer->itoken = TY_(InsertedToken)( doc );
2028    }
2029
2030    assert( !(lexer->pushed || lexer->itoken) );
2031
2032    /* at start of block elements, unclosed inline
2033       elements are inserted into the token stream */
2034    if (lexer->insert || lexer->inode)
2035        return lexer->token = TY_(InsertedToken)( doc );
2036
2037    if (mode == CdataContent)
2038    {
2039        assert( lexer->parent != NULL );
2040        return GetCDATA(doc, lexer->parent);
2041    }
2042
2043    return GetTokenFromStream( doc, mode );
2044}
2045
2046static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
2047{
2048    Lexer* lexer = doc->lexer;
2049    uint c, badcomment = 0;
2050    Bool isempty = no;
2051    AttVal *attributes = NULL;
2052
2053    /* Lexer->token must be set on return. Nullify it for safety. */
2054    lexer->token = NULL;
2055
2056    SetLexerLocus( doc, lexer );
2057    lexer->waswhite = no;
2058
2059    lexer->txtstart = lexer->txtend = lexer->lexsize;
2060
2061    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
2062    {
2063        if (lexer->insertspace)
2064        {
2065            TY_(AddCharToLexer)(lexer, ' ');
2066            lexer->waswhite = yes;
2067            lexer->insertspace = no;
2068        }
2069
2070        if (c == 160 && (mode == Preformatted))
2071            c = ' ';
2072
2073        TY_(AddCharToLexer)(lexer, c);
2074
2075        switch (lexer->state)
2076        {
2077            case LEX_CONTENT:  /* element content */
2078
2079                /*
2080                 Discard white space if appropriate. Its cheaper
2081                 to do this here rather than in parser methods
2082                 for elements that don't have mixed content.
2083                */
2084                if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace)
2085                      && lexer->lexsize == lexer->txtstart + 1)
2086                {
2087                    --(lexer->lexsize);
2088                    lexer->waswhite = no;
2089                    SetLexerLocus( doc, lexer );
2090                    continue;
2091                }
2092
2093                if (c == '<')
2094                {
2095                    lexer->state = LEX_GT;
2096                    continue;
2097                }
2098
2099                if (TY_(IsWhite)(c))
2100                {
2101                    /* was previous character white? */
2102                    if (lexer->waswhite)
2103                    {
2104                        if (mode != Preformatted && mode != IgnoreMarkup)
2105                        {
2106                            --(lexer->lexsize);
2107                            SetLexerLocus( doc, lexer );
2108                        }
2109                    }
2110                    else /* prev character wasn't white */
2111                    {
2112                        lexer->waswhite = yes;
2113
2114                        if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
2115                            ChangeChar(lexer, ' ');
2116                    }
2117
2118                    continue;
2119                }
2120                else if (c == '&' && mode != IgnoreMarkup)
2121                    ParseEntity( doc, mode );
2122
2123                /* this is needed to avoid trimming trailing whitespace */
2124                if (mode == IgnoreWhitespace)
2125                    mode = MixedContent;
2126
2127                lexer->waswhite = no;
2128                continue;
2129
2130            case LEX_GT:  /* < */
2131
2132                /* check for endtag */
2133                if (c == '/')
2134                {
2135                    if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2136                    {
2137                        TY_(UngetChar)(c, doc->docIn);
2138                        continue;
2139                    }
2140
2141                    TY_(AddCharToLexer)(lexer, c);
2142
2143                    if (TY_(IsLetter)(c))
2144                    {
2145                        lexer->lexsize -= 3;
2146                        lexer->txtend = lexer->lexsize;
2147                        TY_(UngetChar)(c, doc->docIn);
2148                        lexer->state = LEX_ENDTAG;
2149                        lexer->lexbuf[lexer->lexsize] = '\0';  /* debug */
2150                        doc->docIn->curcol -= 2;
2151
2152                        /* if some text before the </ return it now */
2153                        if (lexer->txtend > lexer->txtstart)
2154                        {
2155                            /* trim space character before end tag */
2156                            if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ')
2157                            {
2158                                lexer->lexsize -= 1;
2159                                lexer->txtend = lexer->lexsize;
2160                            }
2161                            lexer->token = TY_(TextToken)(lexer);
2162#ifdef TIDY_STORE_ORIGINAL_TEXT
2163                            StoreOriginalTextInToken(doc, lexer->token, 3);
2164#endif
2165                            return lexer->token;
2166                        }
2167
2168                        continue;       /* no text so keep going */
2169                    }
2170
2171                    /* otherwise treat as CDATA */
2172                    lexer->waswhite = no;
2173                    lexer->state = LEX_CONTENT;
2174                    continue;
2175                }
2176
2177                if (mode == IgnoreMarkup)
2178                {
2179                    /* otherwise treat as CDATA */
2180                    lexer->waswhite = no;
2181                    lexer->state = LEX_CONTENT;
2182                    continue;
2183                }
2184
2185                /*
2186                   look out for comments, doctype or marked sections
2187                   this isn't quite right, but its getting there ...
2188                */
2189                if (c == '!')
2190                {
2191                    c = TY_(ReadChar)(doc->docIn);
2192
2193                    if (c == '-')
2194                    {
2195                        c = TY_(ReadChar)(doc->docIn);
2196
2197                        if (c == '-')
2198                        {
2199                            lexer->state = LEX_COMMENT;  /* comment */
2200                            lexer->lexsize -= 2;
2201                            lexer->txtend = lexer->lexsize;
2202
2203                            CondReturnTextNode(doc, 4)
2204
2205                            lexer->txtstart = lexer->lexsize;
2206                            continue;
2207                        }
2208
2209                        TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT );
2210                    }
2211                    else if (c == 'd' || c == 'D')
2212                    {
2213                        /* todo: check for complete "<!DOCTYPE" not just <!D */
2214
2215                        uint skip = 0;
2216
2217                        lexer->state = LEX_DOCTYPE; /* doctype */
2218                        lexer->lexsize -= 2;
2219                        lexer->txtend = lexer->lexsize;
2220                        mode = IgnoreWhitespace;
2221
2222                        /* skip until white space or '>' */
2223
2224                        for (;;)
2225                        {
2226                            c = TY_(ReadChar)(doc->docIn);
2227                            ++skip;
2228
2229                            if (c == EndOfStream || c == '>')
2230                            {
2231                                TY_(UngetChar)(c, doc->docIn);
2232                                break;
2233                            }
2234
2235
2236                            if (!TY_(IsWhite)(c))
2237                                continue;
2238
2239                            /* and skip to end of whitespace */
2240
2241                            for (;;)
2242                            {
2243                                c = TY_(ReadChar)(doc->docIn);
2244                                ++skip;
2245
2246                                if (c == EndOfStream || c == '>')
2247                                {
2248                                    TY_(UngetChar)(c, doc->docIn);
2249                                    break;
2250                                }
2251
2252
2253                                if (TY_(IsWhite)(c))
2254                                    continue;
2255
2256                                TY_(UngetChar)(c, doc->docIn);
2257                                break;
2258                            }
2259
2260                            break;
2261                        }
2262
2263                        CondReturnTextNode(doc, (skip + 3))
2264
2265                        lexer->txtstart = lexer->lexsize;
2266                        continue;
2267                    }
2268                    else if (c == '[')
2269                    {
2270                        /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
2271                        lexer->lexsize -= 2;
2272                        lexer->state = LEX_SECTION;
2273                        lexer->txtend = lexer->lexsize;
2274
2275                        CondReturnTextNode(doc, 2)
2276
2277                        lexer->txtstart = lexer->lexsize;
2278                        continue;
2279                    }
2280
2281
2282
2283                    /* else swallow characters up to and including next '>' */
2284                    while ((c = TY_(ReadChar)(doc->docIn)) != '>')
2285                    {
2286                        if (c == EndOfStream)
2287                        {
2288                            TY_(UngetChar)(c, doc->docIn);
2289                            break;
2290                        }
2291                    }
2292
2293                    lexer->lexsize -= 2;
2294                    lexer->lexbuf[lexer->lexsize] = '\0';
2295                    lexer->state = LEX_CONTENT;
2296                    continue;
2297                }
2298
2299                /*
2300                   processing instructions
2301                */
2302
2303                if (c == '?')
2304                {
2305                    lexer->lexsize -= 2;
2306                    lexer->state = LEX_PROCINSTR;
2307                    lexer->txtend = lexer->lexsize;
2308
2309                    CondReturnTextNode(doc, 2)
2310
2311                    lexer->txtstart = lexer->lexsize;
2312                    continue;
2313                }
2314
2315                /* Microsoft ASP's e.g. <% ... server-code ... %> */
2316                if (c == '%')
2317                {
2318                    lexer->lexsize -= 2;
2319                    lexer->state = LEX_ASP;
2320                    lexer->txtend = lexer->lexsize;
2321
2322                    CondReturnTextNode(doc, 2)
2323
2324                    lexer->txtstart = lexer->lexsize;
2325                    continue;
2326                }
2327
2328                /* Netscapes JSTE e.g. <# ... server-code ... #> */
2329                if (c == '#')
2330                {
2331                    lexer->lexsize -= 2;
2332                    lexer->state = LEX_JSTE;
2333                    lexer->txtend = lexer->lexsize;
2334
2335                    CondReturnTextNode(doc, 2)
2336
2337                    lexer->txtstart = lexer->lexsize;
2338                    continue;
2339                }
2340
2341                /* check for start tag */
2342                if (TY_(IsLetter)(c))
2343                {
2344                    TY_(UngetChar)(c, doc->docIn);     /* push back letter */
2345                    TY_(UngetChar)('<', doc->docIn);
2346                    --(doc->docIn->curcol);
2347                    lexer->lexsize -= 2;      /* discard "<" + letter */
2348                    lexer->txtend = lexer->lexsize;
2349                    lexer->state = LEX_STARTTAG;         /* ready to read tag name */
2350
2351                    CondReturnTextNode(doc, 2)
2352
2353                    /* lexer->txtstart = lexer->lexsize; missing here? */
2354                    continue;       /* no text so keep going */
2355                }
2356
2357                /* fix for bug 762102 */
2358                if (c == '&')
2359                {
2360                    TY_(UngetChar)(c, doc->docIn);
2361                    --(lexer->lexsize);
2362                }
2363
2364                /* otherwise treat as CDATA */
2365                lexer->state = LEX_CONTENT;
2366                lexer->waswhite = no;
2367                continue;
2368
2369            case LEX_ENDTAG:  /* </letter */
2370                lexer->txtstart = lexer->lexsize - 1;
2371                doc->docIn->curcol += 2;
2372                c = ParseTagName( doc );
2373                lexer->token = TagToken( doc, EndTag );  /* create endtag token */
2374                lexer->lexsize = lexer->txtend = lexer->txtstart;
2375
2376                /* skip to '>' */
2377                while ( c != '>' && c != EndOfStream )
2378                {
2379                    c = TY_(ReadChar)(doc->docIn);
2380                }
2381
2382                if (c == EndOfStream)
2383                {
2384                    TY_(FreeNode)( doc, lexer->token );
2385                    continue;
2386                }
2387
2388                lexer->state = LEX_CONTENT;
2389                lexer->waswhite = no;
2390#ifdef TIDY_STORE_ORIGINAL_TEXT
2391                StoreOriginalTextInToken(doc, lexer->token, 0); /* hmm... */
2392#endif
2393                return lexer->token;  /* the endtag token */
2394
2395            case LEX_STARTTAG: /* first letter of tagname */
2396                c = TY_(ReadChar)(doc->docIn);
2397                ChangeChar(lexer, (tmbchar)c);
2398                lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */
2399                c = ParseTagName( doc );
2400                isempty = no;
2401                attributes = NULL;
2402                lexer->token = TagToken( doc, (isempty ? StartEndTag : StartTag) );
2403
2404                /* parse attributes, consuming closing ">" */
2405                if (c != '>')
2406                {
2407                    if (c == '/')
2408                        TY_(UngetChar)(c, doc->docIn);
2409
2410                    attributes = ParseAttrs( doc, &isempty );
2411                }
2412
2413                if (isempty)
2414                    lexer->token->type = StartEndTag;
2415
2416                lexer->token->attributes = attributes;
2417                lexer->lexsize = lexer->txtend = lexer->txtstart;
2418
2419                /* swallow newline following start tag */
2420                /* special check needed for CRLF sequence */
2421                /* this doesn't apply to empty elements */
2422                /* nor to preformatted content that needs escaping */
2423
2424                if ((mode != Preformatted && ExpectsContent(lexer->token))
2425                    || nodeIsBR(lexer->token) || nodeIsHR(lexer->token))
2426                {
2427                    c = TY_(ReadChar)(doc->docIn);
2428
2429                    if (c != '\n' && c != '\f')
2430                        TY_(UngetChar)(c, doc->docIn);
2431
2432                    lexer->waswhite = yes;  /* to swallow leading whitespace */
2433                }
2434                else
2435                    lexer->waswhite = no;
2436
2437                lexer->state = LEX_CONTENT;
2438                if (lexer->token->tag == NULL)
2439                    TY_(ReportFatal)( doc, NULL, lexer->token, UNKNOWN_ELEMENT );
2440                else if ( !cfgBool(doc, TidyXmlTags) )
2441                {
2442                    Node* curr = lexer->token;
2443                    TY_(ConstrainVersion)( doc, curr->tag->versions );
2444
2445                    if ( curr->tag->versions & VERS_PROPRIETARY )
2446                    {
2447                        if ( !cfgBool(doc, TidyMakeClean) ||
2448                             ( !nodeIsNOBR(curr) && !nodeIsWBR(curr) ) )
2449                        {
2450                            TY_(ReportError)(doc, NULL, curr, PROPRIETARY_ELEMENT );
2451
2452                            if ( nodeIsLAYER(curr) )
2453                                doc->badLayout |= USING_LAYER;
2454                            else if ( nodeIsSPACER(curr) )
2455                                doc->badLayout |= USING_SPACER;
2456                            else if ( nodeIsNOBR(curr) )
2457                                doc->badLayout |= USING_NOBR;
2458                        }
2459                    }
2460
2461                    TY_(RepairDuplicateAttributes)( doc, curr );
2462                }
2463#ifdef TIDY_STORE_ORIGINAL_TEXT
2464                StoreOriginalTextInToken(doc, lexer->token, 0);
2465#endif
2466                return lexer->token;  /* return start tag */
2467
2468            case LEX_COMMENT:  /* seen <!-- so look for --> */
2469
2470                if (c != '-')
2471                    continue;
2472
2473                c = TY_(ReadChar)(doc->docIn);
2474                TY_(AddCharToLexer)(lexer, c);
2475
2476                if (c != '-')
2477                    continue;
2478
2479            end_comment:
2480                c = TY_(ReadChar)(doc->docIn);
2481
2482                if (c == '>')
2483                {
2484                    if (badcomment)
2485                        TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT );
2486
2487                    /* do not store closing -- in lexbuf */
2488                    lexer->lexsize -= 2;
2489                    lexer->txtend = lexer->lexsize;
2490                    lexer->lexbuf[lexer->lexsize] = '\0';
2491                    lexer->state = LEX_CONTENT;
2492                    lexer->waswhite = no;
2493                    lexer->token = CommentToken(doc);
2494
2495                    /* now look for a line break */
2496
2497                    c = TY_(ReadChar)(doc->docIn);
2498
2499                    if (c == '\n')
2500                        lexer->token->linebreak = yes;
2501                    else
2502                        TY_(UngetChar)(c, doc->docIn);
2503
2504                    return lexer->token;
2505                }
2506
2507                /* note position of first such error in the comment */
2508                if (!badcomment)
2509                {
2510                    SetLexerLocus( doc, lexer );
2511                    lexer->columns -= 3;
2512                }
2513
2514                badcomment++;
2515
2516                if ( cfgBool(doc, TidyFixComments) )
2517                    lexer->lexbuf[lexer->lexsize - 2] = '=';
2518
2519                /* if '-' then look for '>' to end the comment */
2520                if (c == '-')
2521                {
2522                    TY_(AddCharToLexer)(lexer, c);
2523                    goto end_comment;
2524                }
2525
2526                /* otherwise continue to look for --> */
2527                lexer->lexbuf[lexer->lexsize - 1] = '=';
2528
2529                /* http://tidy.sf.net/bug/1266647 */
2530                TY_(AddCharToLexer)(lexer, c);
2531
2532                continue;
2533
2534            case LEX_DOCTYPE:  /* seen <!d so look for '>' munging whitespace */
2535
2536                /* use ParseDocTypeDecl() to tokenize doctype declaration */
2537                TY_(UngetChar)(c, doc->docIn);
2538                lexer->lexsize -= 1;
2539                lexer->token = ParseDocTypeDecl(doc);
2540
2541                lexer->txtend = lexer->lexsize;
2542                lexer->lexbuf[lexer->lexsize] = '\0';
2543                lexer->state = LEX_CONTENT;
2544                lexer->waswhite = no;
2545
2546                /* make a note of the version named by the 1st doctype */
2547                if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags))
2548                    lexer->doctype = FindGivenVersion(doc, lexer->token);
2549                return lexer->token;
2550
2551            case LEX_PROCINSTR:  /* seen <? so look for '>' */
2552                /* check for PHP preprocessor instructions <?php ... ?> */
2553
2554                if  (lexer->lexsize - lexer->txtstart == 3)
2555                {
2556                    if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "php", 3) == 0)
2557                    {
2558                        lexer->state = LEX_PHP;
2559                        continue;
2560                    }
2561                }
2562
2563                if  (lexer->lexsize - lexer->txtstart == 4)
2564                {
2565                    if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 &&
2566                        TY_(IsWhite)(lexer->lexbuf[lexer->txtstart + 3]))
2567                    {
2568                        lexer->state = LEX_XMLDECL;
2569                        attributes = NULL;
2570                        continue;
2571                    }
2572                }
2573
2574                if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */
2575                {
2576                    if (c != '?')
2577                        continue;
2578
2579                    /* now look for '>' */
2580                    c = TY_(ReadChar)(doc->docIn);
2581
2582                    if (c == EndOfStream)
2583                    {
2584                        TY_(ReportError)(doc, NULL, NULL, UNEXPECTED_END_OF_FILE );
2585                        TY_(UngetChar)(c, doc->docIn);
2586                        continue;
2587                    }
2588
2589                    TY_(AddCharToLexer)(lexer, c);
2590                }
2591
2592
2593                if (c != '>')
2594                    continue;
2595
2596                lexer->lexsize -= 1;
2597
2598                if (lexer->lexsize)
2599                {
2600                    uint i;
2601                    Bool closed;
2602
2603                    for (i = 0; i < lexer->lexsize - lexer->txtstart &&
2604                        !TY_(IsWhite)(lexer->lexbuf[i + lexer->txtstart]); ++i)
2605                        /**/;
2606
2607                    closed = lexer->lexbuf[lexer->lexsize - 1] == '?';
2608
2609                    if (closed)
2610                        lexer->lexsize -= 1;
2611
2612                    lexer->txtstart += i;
2613                    lexer->txtend = lexer->lexsize;
2614                    lexer->lexbuf[lexer->lexsize] = '\0';
2615
2616                    lexer->token = PIToken(doc);
2617                    lexer->token->closed = closed;
2618                    lexer->token->element = TY_(tmbstrndup)(lexer->lexbuf +
2619                                                            lexer->txtstart - i, i);
2620                }
2621                else
2622                {
2623                    lexer->txtend = lexer->lexsize;
2624                    lexer->lexbuf[lexer->lexsize] = '\0';
2625                    lexer->token = PIToken(doc);
2626                }
2627
2628                lexer->state = LEX_CONTENT;
2629                lexer->waswhite = no;
2630                return lexer->token;
2631
2632            case LEX_ASP:  /* seen <% so look for "%>" */
2633                if (c != '%')
2634                    continue;
2635
2636                /* now look for '>' */
2637                c = TY_(ReadChar)(doc->docIn);
2638
2639
2640                if (c != '>')
2641                {
2642                    TY_(UngetChar)(c, doc->docIn);
2643                    continue;
2644                }
2645
2646                lexer->lexsize -= 1;
2647                lexer->txtend = lexer->lexsize;
2648                lexer->lexbuf[lexer->lexsize] = '\0';
2649                lexer->state = LEX_CONTENT;
2650                lexer->waswhite = no;
2651                return lexer->token = AspToken(doc);
2652
2653            case LEX_JSTE:  /* seen <# so look for "#>" */
2654                if (c != '#')
2655                    continue;
2656
2657                /* now look for '>' */
2658                c = TY_(ReadChar)(doc->docIn);
2659
2660
2661                if (c != '>')
2662                {
2663                    TY_(UngetChar)(c, doc->docIn);
2664                    continue;
2665                }
2666
2667                lexer->lexsize -= 1;
2668                lexer->txtend = lexer->lexsize;
2669                lexer->lexbuf[lexer->lexsize] = '\0';
2670                lexer->state = LEX_CONTENT;
2671                lexer->waswhite = no;
2672                return lexer->token = JsteToken(doc);
2673
2674            case LEX_PHP: /* seen "<?php" so look for "?>" */
2675                if (c != '?')
2676                    continue;
2677
2678                /* now look for '>' */
2679                c = TY_(ReadChar)(doc->docIn);
2680
2681                if (c != '>')
2682                {
2683                    TY_(UngetChar)(c, doc->docIn);
2684                    continue;
2685                }
2686
2687                lexer->lexsize -= 1;
2688                lexer->txtend = lexer->lexsize;
2689                lexer->lexbuf[lexer->lexsize] = '\0';
2690                lexer->state = LEX_CONTENT;
2691                lexer->waswhite = no;
2692                return lexer->token = PhpToken(doc);
2693
2694            case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */
2695
2696                if (TY_(IsWhite)(c) && c != '?')
2697                    continue;
2698
2699                /* get pseudo-attribute */
2700                if (c != '?')
2701                {
2702                    tmbstr name;
2703                    Node *asp, *php;
2704                    AttVal *av = NULL;
2705                    int pdelim = 0;
2706                    isempty = no;
2707
2708                    TY_(UngetChar)(c, doc->docIn);
2709
2710                    name = ParseAttribute( doc, &isempty, &asp, &php );
2711
2712                    if (!name)
2713                    {
2714                        /* fix for http://tidy.sf.net/bug/788031 */
2715                        lexer->lexsize -= 1;
2716                        lexer->txtend = lexer->txtstart;
2717                        lexer->lexbuf[lexer->txtend] = '\0';
2718                        lexer->state = LEX_CONTENT;
2719                        lexer->waswhite = no;
2720                        lexer->token = XmlDeclToken(doc);
2721                        lexer->token->attributes = attributes;
2722                        return lexer->token;
2723                    }
2724
2725                    av = TY_(NewAttribute)();
2726                    av->attribute = name;
2727                    av->value = ParseValue( doc, name, yes, &isempty, &pdelim );
2728                    av->delim = pdelim;
2729                    av->dict = TY_(FindAttribute)( doc, av );
2730
2731                    AddAttrToList( &attributes, av );
2732                    /* continue; */
2733                }
2734
2735                /* now look for '>' */
2736                c = TY_(ReadChar)(doc->docIn);
2737
2738                if (c != '>')
2739                {
2740                    TY_(UngetChar)(c, doc->docIn);
2741                    continue;
2742                }
2743                lexer->lexsize -= 1;
2744                lexer->txtend = lexer->txtstart;
2745                lexer->lexbuf[lexer->txtend] = '\0';
2746                lexer->state = LEX_CONTENT;
2747                lexer->waswhite = no;
2748                lexer->token = XmlDeclToken(doc);
2749                lexer->token->attributes = attributes;
2750                return lexer->token;
2751
2752            case LEX_SECTION: /* seen "<![" so look for "]>" */
2753                if (c == '[')
2754                {
2755                    if (lexer->lexsize == (lexer->txtstart + 6) &&
2756                        TY_(tmbstrncmp)(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0)
2757                    {
2758                        lexer->state = LEX_CDATA;
2759                        lexer->lexsize -= 6;
2760                        continue;
2761                    }
2762                }
2763
2764                if (c != ']')
2765                    continue;
2766
2767                /* now look for '>' */
2768                c = TY_(ReadChar)(doc->docIn);
2769
2770                if (c != '>')
2771                {
2772                    TY_(UngetChar)(c, doc->docIn);
2773                    continue;
2774                }
2775
2776                lexer->lexsize -= 1;
2777                lexer->txtend = lexer->lexsize;
2778                lexer->lexbuf[lexer->lexsize] = '\0';
2779                lexer->state = LEX_CONTENT;
2780                lexer->waswhite = no;
2781                return lexer->token = SectionToken(doc);
2782
2783            case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
2784/* Apple Changes:
2785   2007-08-08 iccir [5393761] The previous fix for 4642206 resulted in CDATA & not escaping properly when
2786                              escape-cdata=yes and char-encoding=raw.  Similar to our fix in ParseEntity(), append
2787							  an additional "amp;" under these conditions.
2788*/
2789#ifdef TIDY_APPLE_CHANGES
2790				if (c == '&' && cfg(doc, TidyOutCharEncoding) == RAW && cfgBool(doc, TidyEscapeCdata))
2791				{
2792					TY_(AddStringToLexer)(lexer, "amp;");
2793				}
2794#endif
2795                if (c != ']')
2796                    continue;
2797
2798                /* now look for ']' */
2799                c = TY_(ReadChar)(doc->docIn);
2800
2801                if (c != ']')
2802                {
2803                    TY_(UngetChar)(c, doc->docIn);
2804                    continue;
2805                }
2806
2807                /* now look for '>' */
2808                c = TY_(ReadChar)(doc->docIn);
2809
2810                if (c != '>')
2811                {
2812                    TY_(UngetChar)(c, doc->docIn);
2813                    TY_(UngetChar)(']', doc->docIn);
2814                    continue;
2815                }
2816
2817                lexer->lexsize -= 1;
2818                lexer->txtend = lexer->lexsize;
2819                lexer->lexbuf[lexer->lexsize] = '\0';
2820                lexer->state = LEX_CONTENT;
2821                lexer->waswhite = no;
2822                return lexer->token = CDATAToken(doc);
2823        }
2824    }
2825
2826    if (lexer->state == LEX_CONTENT)  /* text string */
2827    {
2828        lexer->txtend = lexer->lexsize;
2829
2830        if (lexer->txtend > lexer->txtstart)
2831        {
2832            TY_(UngetChar)(c, doc->docIn);
2833
2834            if (lexer->lexbuf[lexer->lexsize - 1] == ' ')
2835            {
2836                lexer->lexsize -= 1;
2837                lexer->txtend = lexer->lexsize;
2838            }
2839            lexer->token = TY_(TextToken)(lexer);
2840#ifdef TIDY_STORE_ORIGINAL_TEXT
2841            StoreOriginalTextInToken(doc, lexer->token, 0); /* ? */
2842#endif
2843            return lexer->token;
2844        }
2845    }
2846    else if (lexer->state == LEX_COMMENT) /* comment */
2847    {
2848        if (c == EndOfStream)
2849            TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT );
2850
2851        lexer->txtend = lexer->lexsize;
2852        lexer->lexbuf[lexer->lexsize] = '\0';
2853        lexer->state = LEX_CONTENT;
2854        lexer->waswhite = no;
2855        return lexer->token = CommentToken(doc);
2856    }
2857
2858    return NULL;
2859}
2860
2861static void MapStr( ctmbstr str, uint code )
2862{
2863    while ( *str )
2864    {
2865        uint i = (byte) *str++;
2866        lexmap[i] |= code;
2867    }
2868}
2869
2870void TY_(InitMap)(void)
2871{
2872    MapStr("\r\n\f", newline|white);
2873    MapStr(" \t", white);
2874    MapStr("-.:_", namechar);
2875    MapStr("0123456789", digit|namechar);
2876    MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar);
2877    MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar);
2878}
2879
2880/*
2881 parser for ASP within start tags
2882
2883 Some people use ASP for to customize attributes
2884 Tidy isn't really well suited to dealing with ASP
2885 This is a workaround for attributes, but won't
2886 deal with the case where the ASP is used to tailor
2887 the attribute value. Here is an example of a work
2888 around for using ASP in attribute values:
2889
2890  href='<%=rsSchool.Fields("ID").Value%>'
2891
2892 where the ASP that generates the attribute value
2893 is masked from Tidy by the quotemarks.
2894
2895*/
2896
2897static Node *ParseAsp( TidyDocImpl* doc )
2898{
2899    Lexer* lexer = doc->lexer;
2900    uint c;
2901    Node *asp = NULL;
2902
2903    lexer->txtstart = lexer->lexsize;
2904
2905    for (;;)
2906    {
2907        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2908            break;
2909
2910        TY_(AddCharToLexer)(lexer, c);
2911
2912
2913        if (c != '%')
2914            continue;
2915
2916        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2917            break;
2918
2919        TY_(AddCharToLexer)(lexer, c);
2920
2921        if (c == '>')
2922        {
2923            lexer->lexsize -= 2;
2924            break;
2925        }
2926    }
2927
2928    lexer->txtend = lexer->lexsize;
2929    if (lexer->txtend > lexer->txtstart)
2930        asp = AspToken(doc);
2931
2932    lexer->txtstart = lexer->txtend;
2933    return asp;
2934}
2935
2936
2937/*
2938 PHP is like ASP but is based upon XML
2939 processing instructions, e.g. <?php ... ?>
2940*/
2941static Node *ParsePhp( TidyDocImpl* doc )
2942{
2943    Lexer* lexer = doc->lexer;
2944    uint c;
2945    Node *php = NULL;
2946
2947    lexer->txtstart = lexer->lexsize;
2948
2949    for (;;)
2950    {
2951        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2952            break;
2953
2954        TY_(AddCharToLexer)(lexer, c);
2955
2956
2957        if (c != '?')
2958            continue;
2959
2960        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2961            break;
2962
2963        TY_(AddCharToLexer)(lexer, c);
2964
2965        if (c == '>')
2966        {
2967            lexer->lexsize -= 2;
2968            break;
2969        }
2970    }
2971
2972    lexer->txtend = lexer->lexsize;
2973    if (lexer->txtend > lexer->txtstart)
2974        php = PhpToken(doc);
2975
2976    lexer->txtstart = lexer->txtend;
2977    return php;
2978}
2979
2980/* consumes the '>' terminating start tags */
2981static tmbstr  ParseAttribute( TidyDocImpl* doc, Bool *isempty,
2982                              Node **asp, Node **php)
2983{
2984    Lexer* lexer = doc->lexer;
2985    int start, len = 0;
2986    tmbstr attr = NULL;
2987    uint c, lastc;
2988
2989    *asp = NULL;  /* clear asp pointer */
2990    *php = NULL;  /* clear php pointer */
2991
2992 /* skip white space before the attribute */
2993
2994    for (;;)
2995    {
2996        c = TY_(ReadChar)( doc->docIn );
2997
2998
2999        if (c == '/')
3000        {
3001            c = TY_(ReadChar)( doc->docIn );
3002
3003            if (c == '>')
3004            {
3005                *isempty = yes;
3006                return NULL;
3007            }
3008
3009            TY_(UngetChar)(c, doc->docIn);
3010            c = '/';
3011            break;
3012        }
3013
3014        if (c == '>')
3015            return NULL;
3016
3017        if (c =='<')
3018        {
3019            c = TY_(ReadChar)(doc->docIn);
3020
3021            if (c == '%')
3022            {
3023                *asp = ParseAsp( doc );
3024                return NULL;
3025            }
3026            else if (c == '?')
3027            {
3028                *php = ParsePhp( doc );
3029                return NULL;
3030            }
3031
3032            TY_(UngetChar)(c, doc->docIn);
3033            TY_(UngetChar)('<', doc->docIn);
3034            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3035            return NULL;
3036        }
3037
3038        if (c == '=')
3039        {
3040            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN );
3041            continue;
3042        }
3043
3044        if (c == '"' || c == '\'')
3045        {
3046            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
3047            continue;
3048        }
3049
3050        if (c == EndOfStream)
3051        {
3052            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3053            TY_(UngetChar)(c, doc->docIn);
3054            return NULL;
3055        }
3056
3057
3058        if (!TY_(IsWhite)(c))
3059           break;
3060    }
3061
3062    start = lexer->lexsize;
3063    lastc = c;
3064
3065    for (;;)
3066    {
3067     /* but push back '=' for parseValue() */
3068        if (c == '=' || c == '>')
3069        {
3070            TY_(UngetChar)(c, doc->docIn);
3071            break;
3072        }
3073
3074        if (c == '<' || c == EndOfStream)
3075        {
3076            TY_(UngetChar)(c, doc->docIn);
3077            break;
3078        }
3079
3080        if (lastc == '-' && (c == '"' || c == '\''))
3081        {
3082            lexer->lexsize--;
3083            --len;
3084            TY_(UngetChar)(c, doc->docIn);
3085            break;
3086        }
3087
3088        if (TY_(IsWhite)(c))
3089            break;
3090
3091        /* what should be done about non-namechar characters? */
3092        /* currently these are incorporated into the attr name */
3093
3094        if ( !cfgBool(doc, TidyXmlTags) && TY_(IsUpper)(c) )
3095            c = TY_(ToLower)(c);
3096
3097        TY_(AddCharToLexer)( lexer, c );
3098        lastc = c;
3099        c = TY_(ReadChar)(doc->docIn);
3100    }
3101
3102    /* handle attribute names with multibyte chars */
3103    len = lexer->lexsize - start;
3104    attr = (len > 0 ? TY_(tmbstrndup)(lexer->lexbuf+start, len) : NULL);
3105    lexer->lexsize = start;
3106    return attr;
3107}
3108
3109/*
3110 invoked when < is seen in place of attribute value
3111 but terminates on whitespace if not ASP, PHP or Tango
3112 this routine recognizes ' and " quoted strings
3113*/
3114static int ParseServerInstruction( TidyDocImpl* doc )
3115{
3116    Lexer* lexer = doc->lexer;
3117    uint c;
3118    int delim = '"';
3119    Bool isrule = no;
3120
3121    c = TY_(ReadChar)(doc->docIn);
3122    TY_(AddCharToLexer)(lexer, c);
3123
3124    /* check for ASP, PHP or Tango */
3125    if (c == '%' || c == '?' || c == '@')
3126        isrule = yes;
3127
3128    for (;;)
3129    {
3130        c = TY_(ReadChar)(doc->docIn);
3131
3132        if (c == EndOfStream)
3133            break;
3134
3135        if (c == '>')
3136        {
3137            if (isrule)
3138                TY_(AddCharToLexer)(lexer, c);
3139            else
3140                TY_(UngetChar)(c, doc->docIn);
3141
3142            break;
3143        }
3144
3145        /* if not recognized as ASP, PHP or Tango */
3146        /* then also finish value on whitespace */
3147        if (!isrule)
3148        {
3149            if (TY_(IsWhite)(c))
3150                break;
3151        }
3152
3153        TY_(AddCharToLexer)(lexer, c);
3154
3155        if (c == '"')
3156        {
3157            do
3158            {
3159                c = TY_(ReadChar)(doc->docIn);
3160                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3161                {
3162                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3163                    TY_(UngetChar)(c, doc->docIn);
3164                    return 0;
3165                }
3166                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3167                {
3168                    TY_(UngetChar)(c, doc->docIn);
3169                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3170                    return 0;
3171                }
3172                TY_(AddCharToLexer)(lexer, c);
3173            }
3174            while (c != '"');
3175            delim = '\'';
3176            continue;
3177        }
3178
3179        if (c == '\'')
3180        {
3181            do
3182            {
3183                c = TY_(ReadChar)(doc->docIn);
3184                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3185                {
3186                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3187                    TY_(UngetChar)(c, doc->docIn);
3188                    return 0;
3189                }
3190                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3191                {
3192                    TY_(UngetChar)(c, doc->docIn);
3193                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3194                    return 0;
3195                }
3196                TY_(AddCharToLexer)(lexer, c);
3197            }
3198            while (c != '\'');
3199        }
3200    }
3201
3202    return delim;
3203}
3204
3205/* values start with "=" or " = " etc. */
3206/* doesn't consume the ">" at end of start tag */
3207
3208static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name,
3209                    Bool foldCase, Bool *isempty, int *pdelim)
3210{
3211    Lexer* lexer = doc->lexer;
3212    int len = 0, start;
3213    Bool seen_gt = no;
3214    Bool munge = yes;
3215    uint c, lastc, delim, quotewarning;
3216    tmbstr value;
3217
3218    delim = (tmbchar) 0;
3219    *pdelim = '"';
3220
3221    /*
3222     Henry Zrepa reports that some folk are using the
3223     embed element with script attributes where newlines
3224     are significant and must be preserved
3225    */
3226    if ( cfgBool(doc, TidyLiteralAttribs) )
3227        munge = no;
3228
3229 /* skip white space before the '=' */
3230
3231    for (;;)
3232    {
3233        c = TY_(ReadChar)(doc->docIn);
3234
3235        if (c == EndOfStream)
3236        {
3237            TY_(UngetChar)(c, doc->docIn);
3238            break;
3239        }
3240
3241        if (!TY_(IsWhite)(c))
3242           break;
3243    }
3244
3245/*
3246  c should be '=' if there is a value
3247  other legal possibilities are white
3248  space, '/' and '>'
3249*/
3250
3251    if (c != '=' && c != '"' && c != '\'')
3252    {
3253        TY_(UngetChar)(c, doc->docIn);
3254        return NULL;
3255    }
3256
3257 /* skip white space after '=' */
3258
3259    for (;;)
3260    {
3261        c = TY_(ReadChar)(doc->docIn);
3262
3263        if (c == EndOfStream)
3264        {
3265            TY_(UngetChar)(c, doc->docIn);
3266            break;
3267        }
3268
3269        if (!TY_(IsWhite)(c))
3270           break;
3271    }
3272
3273 /* check for quote marks */
3274
3275    if (c == '"' || c == '\'')
3276        delim = c;
3277    else if (c == '<')
3278    {
3279        start = lexer->lexsize;
3280        TY_(AddCharToLexer)(lexer, c);
3281        *pdelim = ParseServerInstruction( doc );
3282        len = lexer->lexsize - start;
3283        lexer->lexsize = start;
3284        return (len > 0 ? TY_(tmbstrndup)(lexer->lexbuf+start, len) : NULL);
3285    }
3286    else
3287        TY_(UngetChar)(c, doc->docIn);
3288
3289 /*
3290   and read the value string
3291   check for quote mark if needed
3292 */
3293
3294    quotewarning = 0;
3295    start = lexer->lexsize;
3296    c = '\0';
3297
3298    for (;;)
3299    {
3300        lastc = c;  /* track last character */
3301        c = TY_(ReadChar)(doc->docIn);
3302
3303        if (c == EndOfStream)
3304        {
3305            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3306            TY_(UngetChar)(c, doc->docIn);
3307            break;
3308        }
3309
3310        if (delim == (tmbchar)0)
3311        {
3312            if (c == '>')
3313            {
3314                TY_(UngetChar)(c, doc->docIn);
3315                break;
3316            }
3317
3318            if (c == '"' || c == '\'')
3319            {
3320                uint q = c;
3321
3322                TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
3323
3324                /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */
3325                /* this doesn't handle <a title=foo"/> which browsers treat as  */
3326                /* 'foo"/' nor  <a title=foo" /> which browser treat as 'foo"'  */
3327
3328                c = TY_(ReadChar)(doc->docIn);
3329                if (c == '>')
3330                {
3331                    TY_(AddCharToLexer)(lexer, q);
3332                    TY_(UngetChar)(c, doc->docIn);
3333                    break;
3334                }
3335                else
3336                {
3337                    TY_(UngetChar)(c, doc->docIn);
3338                    c = q;
3339                }
3340            }
3341
3342            if (c == '<')
3343            {
3344                TY_(UngetChar)(c, doc->docIn);
3345                c = '>';
3346                TY_(UngetChar)(c, doc->docIn);
3347                TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3348                break;
3349            }
3350
3351            /*
3352             For cases like <br clear=all/> need to avoid treating /> as
3353             part of the attribute value, however care is needed to avoid
3354             so treating <a href=http://www.acme.com/> in this way, which
3355             would map the <a> tag to <a href="http://www.acme.com"/>
3356            */
3357            if (c == '/')
3358            {
3359                /* peek ahead in case of /> */
3360                c = TY_(ReadChar)(doc->docIn);
3361
3362                if ( c == '>' && !TY_(IsUrl)(doc, name) )
3363                {
3364                    *isempty = yes;
3365                    TY_(UngetChar)(c, doc->docIn);
3366                    break;
3367                }
3368
3369                /* unget peeked character */
3370                TY_(UngetChar)(c, doc->docIn);
3371                c = '/';
3372            }
3373        }
3374        else  /* delim is '\'' or '"' */
3375        {
3376            if (c == delim)
3377                break;
3378
3379            if (c == '\n' || c == '<' || c == '>')
3380                ++quotewarning;
3381
3382            if (c == '>')
3383                seen_gt = yes;
3384        }
3385
3386        if (c == '&')
3387        {
3388            TY_(AddCharToLexer)(lexer, c);
3389            ParseEntity( doc, IgnoreWhitespace );
3390            if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge)
3391                ChangeChar(lexer, ' ');
3392            continue;
3393        }
3394
3395        /*
3396         kludge for JavaScript attribute values
3397         with line continuations in string literals
3398        */
3399        if (c == '\\')
3400        {
3401            c = TY_(ReadChar)(doc->docIn);
3402
3403            if (c != '\n')
3404            {
3405                TY_(UngetChar)(c, doc->docIn);
3406                c = '\\';
3407            }
3408        }
3409
3410        if (TY_(IsWhite)(c))
3411        {
3412            if ( delim == 0 )
3413                break;
3414
3415            if (munge)
3416            {
3417                /* discard line breaks in quoted URLs */
3418                /* #438650 - fix by Randy Waki */
3419                if ( c == '\n' && TY_(IsUrl)(doc, name) )
3420                {
3421                    /* warn that we discard this newline */
3422                    TY_(ReportAttrError)( doc, lexer->token, NULL, NEWLINE_IN_URI);
3423                    continue;
3424                }
3425
3426                c = ' ';
3427
3428                if (lastc == ' ')
3429                {
3430                    if (TY_(IsUrl)(doc, name) )
3431                        TY_(ReportAttrError)( doc, lexer->token, NULL, WHITE_IN_URI);
3432                    continue;
3433                }
3434            }
3435        }
3436        else if (foldCase && TY_(IsUpper)(c))
3437            c = TY_(ToLower)(c);
3438
3439        TY_(AddCharToLexer)(lexer, c);
3440    }
3441
3442    if (quotewarning > 10 && seen_gt && munge)
3443    {
3444        /*
3445           there is almost certainly a missing trailing quote mark
3446           as we have see too many newlines, < or > characters.
3447
3448           an exception is made for Javascript attributes and the
3449           javascript URL scheme which may legitimately include < and >,
3450           and for attributes starting with "<xml " as generated by
3451           Microsoft Office.
3452        */
3453        if ( !TY_(IsScript)(doc, name) &&
3454             !(TY_(IsUrl)(doc, name) && TY_(tmbstrncmp)(lexer->lexbuf+start, "javascript:", 11) == 0) &&
3455             !(TY_(tmbstrncmp)(lexer->lexbuf+start, "<xml ", 5) == 0)
3456           )
3457            TY_(ReportFatal)( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE );
3458    }
3459
3460    len = lexer->lexsize - start;
3461    lexer->lexsize = start;
3462
3463
3464    if (len > 0 || delim)
3465    {
3466        /* ignore leading and trailing white space for all but title, alt, value */
3467        /* and prompts attributes unless --literal-attributes is set to yes      */
3468        /* #994841 - Whitespace is removed from value attributes                 */
3469
3470        if (munge &&
3471            TY_(tmbstrcasecmp)(name, "alt") &&
3472            TY_(tmbstrcasecmp)(name, "title") &&
3473            TY_(tmbstrcasecmp)(name, "value") &&
3474            TY_(tmbstrcasecmp)(name, "prompt"))
3475        {
3476            while (TY_(IsWhite)(lexer->lexbuf[start+len-1]))
3477                --len;
3478
3479            while (TY_(IsWhite)(lexer->lexbuf[start]) && start < len)
3480            {
3481                ++start;
3482                --len;
3483            }
3484        }
3485
3486        value = TY_(tmbstrndup)(lexer->lexbuf + start, len);
3487    }
3488    else
3489        value = NULL;
3490
3491    /* note delimiter if given */
3492    *pdelim = (delim ? delim : '"');
3493
3494    return value;
3495}
3496
3497/* attr must be non-NULL */
3498static Bool IsValidAttrName( ctmbstr attr )
3499{
3500    uint i, c = attr[0];
3501
3502    /* first character should be a letter */
3503    if (!TY_(IsLetter)(c))
3504        return no;
3505
3506    /* remaining characters should be namechars */
3507    for( i = 1; i < TY_(tmbstrlen)(attr); i++)
3508    {
3509        c = attr[i];
3510
3511        if (TY_(IsNamechar)(c))
3512            continue;
3513
3514        return no;
3515    }
3516
3517    return yes;
3518}
3519
3520/* create a new attribute */
3521AttVal *TY_(NewAttribute)(void)
3522{
3523    AttVal *av = (AttVal*) MemAlloc( sizeof(AttVal) );
3524    ClearMemory( av, sizeof(AttVal) );
3525    return av;
3526}
3527
3528/* create a new attribute with given name and value */
3529AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
3530                             int delim )
3531{
3532    AttVal *av = TY_(NewAttribute)();
3533    av->attribute = TY_(tmbstrdup)(name);
3534    av->value = TY_(tmbstrdup)(value);
3535    av->delim = delim;
3536    av->dict = TY_(FindAttribute)( doc, av );
3537    return av;
3538}
3539
3540static void AddAttrToList( AttVal** list, AttVal* av )
3541{
3542  if ( *list == NULL )
3543    *list = av;
3544  else
3545  {
3546    AttVal* here = *list;
3547    while ( here->next )
3548      here = here->next;
3549    here->next = av;
3550  }
3551}
3552
3553void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av )
3554{
3555    AddAttrToList(&node->attributes, av);
3556}
3557
3558void TY_(InsertAttributeAtStart)( Node *node, AttVal *av )
3559{
3560    av->next = node->attributes;
3561    node->attributes = av;
3562}
3563
3564/* swallows closing '>' */
3565
3566static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty )
3567{
3568    Lexer* lexer = doc->lexer;
3569    AttVal *av, *list;
3570    tmbstr value;
3571    int delim;
3572    Node *asp, *php;
3573
3574    list = NULL;
3575
3576    while ( !EndOfInput(doc) )
3577    {
3578        tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php );
3579
3580        if (attribute == NULL)
3581        {
3582            /* check if attributes are created by ASP markup */
3583            if (asp)
3584            {
3585                av = TY_(NewAttribute)();
3586                av->asp = asp;
3587                AddAttrToList( &list, av );
3588                continue;
3589            }
3590
3591            /* check if attributes are created by PHP markup */
3592            if (php)
3593            {
3594                av = TY_(NewAttribute)();
3595                av->php = php;
3596                AddAttrToList( &list, av );
3597                continue;
3598            }
3599
3600            break;
3601        }
3602
3603        value = ParseValue( doc, attribute, no, isempty, &delim );
3604
3605        if (attribute && (IsValidAttrName(attribute) ||
3606            (cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute))))
3607        {
3608            av = TY_(NewAttribute)();
3609            av->delim = delim;
3610            av->attribute = attribute;
3611            av->value = value;
3612            av->dict = TY_(FindAttribute)( doc, av );
3613            AddAttrToList( &list, av );
3614        }
3615        else
3616        {
3617            av = TY_(NewAttribute)();
3618            av->attribute = attribute;
3619            av->value = value;
3620
3621            if (LastChar(attribute) == '"')
3622                TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK);
3623            else if (value == NULL)
3624                TY_(ReportAttrError)(doc, lexer->token, av, MISSING_ATTR_VALUE);
3625            else
3626                TY_(ReportAttrError)(doc, lexer->token, av, INVALID_ATTRIBUTE);
3627
3628            TY_(FreeAttribute)( doc, av );
3629        }
3630    }
3631
3632    return list;
3633}
3634
/*
  Returns document type declarations like

  <!DOCTYPE foo PUBLIC "fpi" "sysid">
  <!DOCTYPE bar SYSTEM "sysid">
  <!DOCTYPE baz [ <!ENTITY ouml "&#246"> ]>

  as

  <foo PUBLIC="fpi" SYSTEM="sysid" />
  <bar SYSTEM="sysid" />
  <baz> &lt;!ENTITY ouml &quot;&amp;#246&quot;&gt; </baz>

  The declaration is read one character at a time by a small state
  machine that starts in DT_DOCTYPENAME:

    DT_DOCTYPENAME   collects the document type name
    DT_INTERMEDIATE  between items; decides what the next item is
    DT_PUBLICSYSTEM  collects a keyword starting with P or S
    DT_QUOTEDSTRING  collects a quoted identifier
    DT_INTSUBSET     collects the internal subset up to ']'

  Returns the new DocTypeTag node, or NULL (after reporting
  MALFORMED_DOCTYPE and freeing the node) when the name is missing or
  not a valid XML name, or when the input ends before the closing '>'.
*/
static Node *ParseDocTypeDecl(TidyDocImpl* doc)
{
    Lexer *lexer = doc->lexer;
    int start = lexer->lexsize;     /* buffer offset where the current item begins */
    ParseDocTypeDeclState state = DT_DOCTYPENAME;
    uint c;
    uint delim = 0;                 /* quote mark of the string being read */
    Bool hasfpi = yes;              /* yes: the next quoted string is stored as PUBLIC */

    Node* node = TY_(NewNode)(lexer);
    node->type = DocTypeTag;
    node->start = lexer->txtstart;
    node->end = lexer->txtend;

    lexer->waswhite = no;

    /* todo: reset lexer->lexsize when appropriate to avoid wasting memory */

    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
    {
        /* convert newlines to spaces */
        if (state != DT_INTSUBSET)
            c = c == '\n' ? ' ' : c;

        /* convert white-space sequences to single space character */
        if (TY_(IsWhite)(c) && state != DT_INTSUBSET)
        {
            if (!lexer->waswhite)
            {
                TY_(AddCharToLexer)(lexer, c);
                lexer->waswhite = yes;
            }
            else
            {
                /* discard space */
                continue;
            }
        }
        else
        {
            TY_(AddCharToLexer)(lexer, c);
            lexer->waswhite = no;
        }

        /* c has already been stored in the lexer buffer at this point, */
        /* which is why the lengths below are taken as lexsize-start-1  */
        switch(state)
        {
        case DT_INTERMEDIATE:
            /* determine what's next */
            if (TY_(ToUpper)(c) == 'P' || TY_(ToUpper)(c) == 'S')
            {
                /* the keyword starts at the character just stored */
                start = lexer->lexsize - 1;
                state = DT_PUBLICSYSTEM;
                continue;
            }
            else if (c == '[')
            {
                /* the subset text starts after the '[' */
                start = lexer->lexsize;
                state = DT_INTSUBSET;
                continue;
            }
            else if (c == '\'' || c == '"')
            {
                /* the string starts after the opening quote */
                start = lexer->lexsize;
                delim = c;
                state = DT_QUOTEDSTRING;
                continue;
            }
            else if (c == '>')
            {
                AttVal* si;

                /* end of declaration: drop the '>' from the buffer */
                node->end = --(lexer->lexsize);

                si = TY_(GetAttrByName)(node, "SYSTEM");
                if (si)
                    TY_(CheckUrl)(doc, node, si);

                if (!node->element || !IsValidXMLElemName(node->element))
                {
                    TY_(ReportError)(doc, NULL, NULL, MALFORMED_DOCTYPE);
                    TY_(FreeNode)(doc, node);
                    return NULL;
                }
#ifdef TIDY_STORE_ORIGINAL_TEXT
                StoreOriginalTextInToken(doc, node, 0);
#endif
                return node;
            }
            else
            {
                /* error */
                /* any other character is currently ignored without a report */
            }
            break;
        case DT_DOCTYPENAME:
            /* read document type name */
            if (TY_(IsWhite)(c) || c == '>' || c == '[')
            {
                node->element = TY_(tmbstrndup)(lexer->lexbuf + start,
                                                lexer->lexsize - start - 1);
                if (c == '>' || c == '[')
                {
                    /* let DT_INTERMEDIATE handle the terminator */
                    --(lexer->lexsize);
                    TY_(UngetChar)(c, doc->docIn);
                }

                state = DT_INTERMEDIATE;
                continue;
            }
            break;
        case DT_PUBLICSYSTEM:
            /* read PUBLIC/SYSTEM */
            if (TY_(IsWhite)(c) || c == '>')
            {
                char *attname = TY_(tmbstrndup)(lexer->lexbuf + start,
                                                lexer->lexsize - start - 1);
                /* any keyword other than SYSTEM (in any case) is taken as PUBLIC */
                hasfpi = !(TY_(tmbstrcasecmp)(attname, "SYSTEM") == 0);

                MemFree(attname);

                /* todo: report an error if SYSTEM/PUBLIC not uppercase */

                if (c == '>')
                {
                    /* let DT_INTERMEDIATE handle the terminator */
                    --(lexer->lexsize);
                    TY_(UngetChar)(c, doc->docIn);
                }

                state = DT_INTERMEDIATE;
                continue;
            }
            break;
        case DT_QUOTEDSTRING:
            /* read quoted string */
            if (c == delim)
            {
                char *value = TY_(tmbstrndup)(lexer->lexbuf + start,
                                              lexer->lexsize - start - 1);
                AttVal* att = TY_(AddAttribute)(doc, node, hasfpi ? "PUBLIC" : "SYSTEM", value);
                MemFree(value);
                att->delim = delim;
                /* a further quoted string is stored as SYSTEM */
                hasfpi = no;
                state = DT_INTERMEDIATE;
                delim = 0;
                continue;
            }
            break;
        case DT_INTSUBSET:
            /* read internal subset */
            if (c == ']')
            {
                /* the subset, without the ']', becomes a text child of the node */
                Node* subset;
                lexer->txtstart = start;
                lexer->txtend = lexer->lexsize - 1;
                subset = TY_(TextToken)(lexer);
                TY_(InsertNodeAtEnd)(node, subset);
                state = DT_INTERMEDIATE;
            }
            break;
        }
    }

    /* document type declaration not finished */
    TY_(ReportError)(doc, NULL, NULL, MALFORMED_DOCTYPE);
    TY_(FreeNode)(doc, node);
    return NULL;
}
3814
3815/*
3816 * local variables:
3817 * mode: c
3818 * indent-tabs-mode: nil
3819 * c-basic-offset: 4
3820 * eval: (c-set-offset 'substatement-open 0)
3821 * end:
3822 */
3823