1/* lexer.c -- Lexer for html parser 2 3 (c) 1998-2006 (W3C) MIT, ERCIM, Keio University 4 See tidy.h for the copyright notice. 5 6 CVS Info : 7 8 $Author: iccir $ 9 $Date: 2007/08/09 02:34:44 $ 10 $Revision: 1.7 $ 11 12*/ 13 14/* 15 Given a file stream fp it returns a sequence of tokens. 16 17 GetToken(fp) gets the next token 18 UngetToken(fp) provides one level undo 19 20 The tags include an attribute list: 21 22 - linked list of attribute/value nodes 23 - each node has 2 NULL-terminated strings. 24 - entities are replaced in attribute values 25 26 white space is compacted if not in preformatted mode 27 If not in preformatted mode then leading white space 28 is discarded and subsequent white space sequences 29 compacted to single space characters. 30 31 If XmlTags is no then Tag names are folded to upper 32 case and attribute names to lower case. 33 34 Not yet done: 35 - Doctype subset and marked sections 36*/ 37 38#include "tidy-int.h" 39#include "lexer.h" 40#include "parser.h" 41#include "entities.h" 42#include "streamio.h" 43#include "message.h" 44#include "tmbstr.h" 45#include "clean.h" 46#include "utf8.h" 47#include "streamio.h" 48 49/* Forward references 50*/ 51/* swallows closing '>' */ 52static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty ); 53 54static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty, 55 Node **asp, Node **php ); 56 57static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase, 58 Bool *isempty, int *pdelim ); 59 60static Node *ParseDocTypeDecl(TidyDocImpl* doc); 61 62static void AddAttrToList( AttVal** list, AttVal* av ); 63 64/* used to classify characters for lexical purposes */ 65#define MAP(c) ((unsigned)c < 128 ? 
lexmap[(unsigned)c] : 0) 66static uint lexmap[128]; 67 68#define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name) 69#define IsValidXMLElemName(name) TY_(IsValidXMLID)(name) 70 71static struct _doctypes 72{ 73 uint score; 74 uint vers; 75 ctmbstr name; 76 ctmbstr fpi; 77 ctmbstr si; 78} const W3C_Doctypes[] = 79{ 80 { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL, }, 81 { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL, }, 82 { 2, HT20, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL, }, 83 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL, }, 84 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL, }, 85 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL, }, 86 { 6, H40S, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" }, 87 { 8, H40T, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" }, 88 { 7, H40F, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" }, 89 { 3, H41S, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" }, 90 { 5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" }, 91 { 4, H41F, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" }, 92 { 9, X10S, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" }, 93 { 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" }, 94 { 10, X10F, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" }, 95 { 12, XH11, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" }, 96 { 13, XB10, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", 
"http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" }, 97 98 /* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */ 99#if 0 100 { 14, XP10, "XHTML Print 1.0", "-//W3C//DTD XHTML-Print 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd" }, 101 { 14, XP10, "XHTML Print 1.0", "-//PWG//DTD XHTML-Print 1.0//EN", "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" }, 102#endif 103 /* final entry */ 104 { 0, 0, NULL, NULL, NULL } 105}; 106 107int TY_(HTMLVersion)(TidyDocImpl* doc) 108{ 109 uint i; 110 uint j = 0; 111 uint score = 0; 112 uint vers = doc->lexer->versions; 113 uint dtver = doc->lexer->doctype; 114 TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode); 115 Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) && 116 !cfgBool(doc, TidyHtmlOut); 117 Bool html4 = dtmode == TidyDoctypeStrict || dtmode == TidyDoctypeLoose || VERS_FROM40 & dtver; 118 119 for (i = 0; W3C_Doctypes[i].name; ++i) 120 { 121 if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) || 122 (html4 && !(VERS_FROM40 & W3C_Doctypes[i].vers))) 123 continue; 124 125 if (vers & W3C_Doctypes[i].vers && 126 (W3C_Doctypes[i].score < score || !score)) 127 { 128 score = W3C_Doctypes[i].score; 129 j = i; 130 } 131 } 132 133 if (score) 134 return W3C_Doctypes[j].vers; 135 136 return VERS_UNKNOWN; 137} 138 139static ctmbstr GetFPIFromVers(uint vers) 140{ 141 uint i; 142 143 for (i = 0; W3C_Doctypes[i].name; ++i) 144 if (W3C_Doctypes[i].vers == vers) 145 return W3C_Doctypes[i].fpi; 146 147 return NULL; 148} 149 150static ctmbstr GetSIFromVers(uint vers) 151{ 152 uint i; 153 154 for (i = 0; W3C_Doctypes[i].name; ++i) 155 if (W3C_Doctypes[i].vers == vers) 156 return W3C_Doctypes[i].si; 157 158 return NULL; 159} 160 161static ctmbstr GetNameFromVers(uint vers) 162{ 163 uint i; 164 165 for (i = 0; W3C_Doctypes[i].name; ++i) 166 if (W3C_Doctypes[i].vers == vers) 167 return W3C_Doctypes[i].name; 168 169 return NULL; 170} 171 172static uint 
GetVersFromFPI(ctmbstr fpi) 173{ 174 uint i; 175 176 for (i = 0; W3C_Doctypes[i].name; ++i) 177 if (TY_(tmbstrcasecmp)(W3C_Doctypes[i].fpi, fpi) == 0) 178 return W3C_Doctypes[i].vers; 179 180 return 0; 181} 182 183/* everything is allowed in proprietary version of HTML */ 184/* this is handled here rather than in the tag/attr dicts */ 185void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers) 186{ 187 doc->lexer->versions &= (vers | VERS_PROPRIETARY); 188} 189 190Bool TY_(IsWhite)(uint c) 191{ 192 uint map = MAP(c); 193 194 return (map & white)!=0; 195} 196 197Bool TY_(IsNewline)(uint c) 198{ 199 uint map = MAP(c); 200 return (map & newline)!=0; 201} 202 203Bool TY_(IsDigit)(uint c) 204{ 205 uint map; 206 207 map = MAP(c); 208 209 return (map & digit)!=0; 210} 211 212Bool TY_(IsLetter)(uint c) 213{ 214 uint map; 215 216 map = MAP(c); 217 218 return (map & letter)!=0; 219} 220 221Bool TY_(IsNamechar)(uint c) 222{ 223 uint map = MAP(c); 224 return (map & namechar)!=0; 225} 226 227Bool TY_(IsXMLLetter)(uint c) 228{ 229 return ((c >= 0x41 && c <= 0x5a) || 230 (c >= 0x61 && c <= 0x7a) || 231 (c >= 0xc0 && c <= 0xd6) || 232 (c >= 0xd8 && c <= 0xf6) || 233 (c >= 0xf8 && c <= 0xff) || 234 (c >= 0x100 && c <= 0x131) || 235 (c >= 0x134 && c <= 0x13e) || 236 (c >= 0x141 && c <= 0x148) || 237 (c >= 0x14a && c <= 0x17e) || 238 (c >= 0x180 && c <= 0x1c3) || 239 (c >= 0x1cd && c <= 0x1f0) || 240 (c >= 0x1f4 && c <= 0x1f5) || 241 (c >= 0x1fa && c <= 0x217) || 242 (c >= 0x250 && c <= 0x2a8) || 243 (c >= 0x2bb && c <= 0x2c1) || 244 c == 0x386 || 245 (c >= 0x388 && c <= 0x38a) || 246 c == 0x38c || 247 (c >= 0x38e && c <= 0x3a1) || 248 (c >= 0x3a3 && c <= 0x3ce) || 249 (c >= 0x3d0 && c <= 0x3d6) || 250 c == 0x3da || 251 c == 0x3dc || 252 c == 0x3de || 253 c == 0x3e0 || 254 (c >= 0x3e2 && c <= 0x3f3) || 255 (c >= 0x401 && c <= 0x40c) || 256 (c >= 0x40e && c <= 0x44f) || 257 (c >= 0x451 && c <= 0x45c) || 258 (c >= 0x45e && c <= 0x481) || 259 (c >= 0x490 && c <= 0x4c4) || 260 (c >= 0x4c7 
&& c <= 0x4c8) || 261 (c >= 0x4cb && c <= 0x4cc) || 262 (c >= 0x4d0 && c <= 0x4eb) || 263 (c >= 0x4ee && c <= 0x4f5) || 264 (c >= 0x4f8 && c <= 0x4f9) || 265 (c >= 0x531 && c <= 0x556) || 266 c == 0x559 || 267 (c >= 0x561 && c <= 0x586) || 268 (c >= 0x5d0 && c <= 0x5ea) || 269 (c >= 0x5f0 && c <= 0x5f2) || 270 (c >= 0x621 && c <= 0x63a) || 271 (c >= 0x641 && c <= 0x64a) || 272 (c >= 0x671 && c <= 0x6b7) || 273 (c >= 0x6ba && c <= 0x6be) || 274 (c >= 0x6c0 && c <= 0x6ce) || 275 (c >= 0x6d0 && c <= 0x6d3) || 276 c == 0x6d5 || 277 (c >= 0x6e5 && c <= 0x6e6) || 278 (c >= 0x905 && c <= 0x939) || 279 c == 0x93d || 280 (c >= 0x958 && c <= 0x961) || 281 (c >= 0x985 && c <= 0x98c) || 282 (c >= 0x98f && c <= 0x990) || 283 (c >= 0x993 && c <= 0x9a8) || 284 (c >= 0x9aa && c <= 0x9b0) || 285 c == 0x9b2 || 286 (c >= 0x9b6 && c <= 0x9b9) || 287 (c >= 0x9dc && c <= 0x9dd) || 288 (c >= 0x9df && c <= 0x9e1) || 289 (c >= 0x9f0 && c <= 0x9f1) || 290 (c >= 0xa05 && c <= 0xa0a) || 291 (c >= 0xa0f && c <= 0xa10) || 292 (c >= 0xa13 && c <= 0xa28) || 293 (c >= 0xa2a && c <= 0xa30) || 294 (c >= 0xa32 && c <= 0xa33) || 295 (c >= 0xa35 && c <= 0xa36) || 296 (c >= 0xa38 && c <= 0xa39) || 297 (c >= 0xa59 && c <= 0xa5c) || 298 c == 0xa5e || 299 (c >= 0xa72 && c <= 0xa74) || 300 (c >= 0xa85 && c <= 0xa8b) || 301 c == 0xa8d || 302 (c >= 0xa8f && c <= 0xa91) || 303 (c >= 0xa93 && c <= 0xaa8) || 304 (c >= 0xaaa && c <= 0xab0) || 305 (c >= 0xab2 && c <= 0xab3) || 306 (c >= 0xab5 && c <= 0xab9) || 307 c == 0xabd || 308 c == 0xae0 || 309 (c >= 0xb05 && c <= 0xb0c) || 310 (c >= 0xb0f && c <= 0xb10) || 311 (c >= 0xb13 && c <= 0xb28) || 312 (c >= 0xb2a && c <= 0xb30) || 313 (c >= 0xb32 && c <= 0xb33) || 314 (c >= 0xb36 && c <= 0xb39) || 315 c == 0xb3d || 316 (c >= 0xb5c && c <= 0xb5d) || 317 (c >= 0xb5f && c <= 0xb61) || 318 (c >= 0xb85 && c <= 0xb8a) || 319 (c >= 0xb8e && c <= 0xb90) || 320 (c >= 0xb92 && c <= 0xb95) || 321 (c >= 0xb99 && c <= 0xb9a) || 322 c == 0xb9c || 323 (c >= 0xb9e && c <= 0xb9f) || 
324 (c >= 0xba3 && c <= 0xba4) || 325 (c >= 0xba8 && c <= 0xbaa) || 326 (c >= 0xbae && c <= 0xbb5) || 327 (c >= 0xbb7 && c <= 0xbb9) || 328 (c >= 0xc05 && c <= 0xc0c) || 329 (c >= 0xc0e && c <= 0xc10) || 330 (c >= 0xc12 && c <= 0xc28) || 331 (c >= 0xc2a && c <= 0xc33) || 332 (c >= 0xc35 && c <= 0xc39) || 333 (c >= 0xc60 && c <= 0xc61) || 334 (c >= 0xc85 && c <= 0xc8c) || 335 (c >= 0xc8e && c <= 0xc90) || 336 (c >= 0xc92 && c <= 0xca8) || 337 (c >= 0xcaa && c <= 0xcb3) || 338 (c >= 0xcb5 && c <= 0xcb9) || 339 c == 0xcde || 340 (c >= 0xce0 && c <= 0xce1) || 341 (c >= 0xd05 && c <= 0xd0c) || 342 (c >= 0xd0e && c <= 0xd10) || 343 (c >= 0xd12 && c <= 0xd28) || 344 (c >= 0xd2a && c <= 0xd39) || 345 (c >= 0xd60 && c <= 0xd61) || 346 (c >= 0xe01 && c <= 0xe2e) || 347 c == 0xe30 || 348 (c >= 0xe32 && c <= 0xe33) || 349 (c >= 0xe40 && c <= 0xe45) || 350 (c >= 0xe81 && c <= 0xe82) || 351 c == 0xe84 || 352 (c >= 0xe87 && c <= 0xe88) || 353 c == 0xe8a || 354 c == 0xe8d || 355 (c >= 0xe94 && c <= 0xe97) || 356 (c >= 0xe99 && c <= 0xe9f) || 357 (c >= 0xea1 && c <= 0xea3) || 358 c == 0xea5 || 359 c == 0xea7 || 360 (c >= 0xeaa && c <= 0xeab) || 361 (c >= 0xead && c <= 0xeae) || 362 c == 0xeb0 || 363 (c >= 0xeb2 && c <= 0xeb3) || 364 c == 0xebd || 365 (c >= 0xec0 && c <= 0xec4) || 366 (c >= 0xf40 && c <= 0xf47) || 367 (c >= 0xf49 && c <= 0xf69) || 368 (c >= 0x10a0 && c <= 0x10c5) || 369 (c >= 0x10d0 && c <= 0x10f6) || 370 c == 0x1100 || 371 (c >= 0x1102 && c <= 0x1103) || 372 (c >= 0x1105 && c <= 0x1107) || 373 c == 0x1109 || 374 (c >= 0x110b && c <= 0x110c) || 375 (c >= 0x110e && c <= 0x1112) || 376 c == 0x113c || 377 c == 0x113e || 378 c == 0x1140 || 379 c == 0x114c || 380 c == 0x114e || 381 c == 0x1150 || 382 (c >= 0x1154 && c <= 0x1155) || 383 c == 0x1159 || 384 (c >= 0x115f && c <= 0x1161) || 385 c == 0x1163 || 386 c == 0x1165 || 387 c == 0x1167 || 388 c == 0x1169 || 389 (c >= 0x116d && c <= 0x116e) || 390 (c >= 0x1172 && c <= 0x1173) || 391 c == 0x1175 || 392 c == 0x119e || 
393 c == 0x11a8 || 394 c == 0x11ab || 395 (c >= 0x11ae && c <= 0x11af) || 396 (c >= 0x11b7 && c <= 0x11b8) || 397 c == 0x11ba || 398 (c >= 0x11bc && c <= 0x11c2) || 399 c == 0x11eb || 400 c == 0x11f0 || 401 c == 0x11f9 || 402 (c >= 0x1e00 && c <= 0x1e9b) || 403 (c >= 0x1ea0 && c <= 0x1ef9) || 404 (c >= 0x1f00 && c <= 0x1f15) || 405 (c >= 0x1f18 && c <= 0x1f1d) || 406 (c >= 0x1f20 && c <= 0x1f45) || 407 (c >= 0x1f48 && c <= 0x1f4d) || 408 (c >= 0x1f50 && c <= 0x1f57) || 409 c == 0x1f59 || 410 c == 0x1f5b || 411 c == 0x1f5d || 412 (c >= 0x1f5f && c <= 0x1f7d) || 413 (c >= 0x1f80 && c <= 0x1fb4) || 414 (c >= 0x1fb6 && c <= 0x1fbc) || 415 c == 0x1fbe || 416 (c >= 0x1fc2 && c <= 0x1fc4) || 417 (c >= 0x1fc6 && c <= 0x1fcc) || 418 (c >= 0x1fd0 && c <= 0x1fd3) || 419 (c >= 0x1fd6 && c <= 0x1fdb) || 420 (c >= 0x1fe0 && c <= 0x1fec) || 421 (c >= 0x1ff2 && c <= 0x1ff4) || 422 (c >= 0x1ff6 && c <= 0x1ffc) || 423 c == 0x2126 || 424 (c >= 0x212a && c <= 0x212b) || 425 c == 0x212e || 426 (c >= 0x2180 && c <= 0x2182) || 427 (c >= 0x3041 && c <= 0x3094) || 428 (c >= 0x30a1 && c <= 0x30fa) || 429 (c >= 0x3105 && c <= 0x312c) || 430 (c >= 0xac00 && c <= 0xd7a3) || 431 (c >= 0x4e00 && c <= 0x9fa5) || 432 c == 0x3007 || 433 (c >= 0x3021 && c <= 0x3029) || 434 (c >= 0x4e00 && c <= 0x9fa5) || 435 c == 0x3007 || 436 (c >= 0x3021 && c <= 0x3029)); 437} 438 439Bool TY_(IsXMLNamechar)(uint c) 440{ 441 return (TY_(IsXMLLetter)(c) || 442 c == '.' 
|| c == '_' || 443 c == ':' || c == '-' || 444 (c >= 0x300 && c <= 0x345) || 445 (c >= 0x360 && c <= 0x361) || 446 (c >= 0x483 && c <= 0x486) || 447 (c >= 0x591 && c <= 0x5a1) || 448 (c >= 0x5a3 && c <= 0x5b9) || 449 (c >= 0x5bb && c <= 0x5bd) || 450 c == 0x5bf || 451 (c >= 0x5c1 && c <= 0x5c2) || 452 c == 0x5c4 || 453 (c >= 0x64b && c <= 0x652) || 454 c == 0x670 || 455 (c >= 0x6d6 && c <= 0x6dc) || 456 (c >= 0x6dd && c <= 0x6df) || 457 (c >= 0x6e0 && c <= 0x6e4) || 458 (c >= 0x6e7 && c <= 0x6e8) || 459 (c >= 0x6ea && c <= 0x6ed) || 460 (c >= 0x901 && c <= 0x903) || 461 c == 0x93c || 462 (c >= 0x93e && c <= 0x94c) || 463 c == 0x94d || 464 (c >= 0x951 && c <= 0x954) || 465 (c >= 0x962 && c <= 0x963) || 466 (c >= 0x981 && c <= 0x983) || 467 c == 0x9bc || 468 c == 0x9be || 469 c == 0x9bf || 470 (c >= 0x9c0 && c <= 0x9c4) || 471 (c >= 0x9c7 && c <= 0x9c8) || 472 (c >= 0x9cb && c <= 0x9cd) || 473 c == 0x9d7 || 474 (c >= 0x9e2 && c <= 0x9e3) || 475 c == 0xa02 || 476 c == 0xa3c || 477 c == 0xa3e || 478 c == 0xa3f || 479 (c >= 0xa40 && c <= 0xa42) || 480 (c >= 0xa47 && c <= 0xa48) || 481 (c >= 0xa4b && c <= 0xa4d) || 482 (c >= 0xa70 && c <= 0xa71) || 483 (c >= 0xa81 && c <= 0xa83) || 484 c == 0xabc || 485 (c >= 0xabe && c <= 0xac5) || 486 (c >= 0xac7 && c <= 0xac9) || 487 (c >= 0xacb && c <= 0xacd) || 488 (c >= 0xb01 && c <= 0xb03) || 489 c == 0xb3c || 490 (c >= 0xb3e && c <= 0xb43) || 491 (c >= 0xb47 && c <= 0xb48) || 492 (c >= 0xb4b && c <= 0xb4d) || 493 (c >= 0xb56 && c <= 0xb57) || 494 (c >= 0xb82 && c <= 0xb83) || 495 (c >= 0xbbe && c <= 0xbc2) || 496 (c >= 0xbc6 && c <= 0xbc8) || 497 (c >= 0xbca && c <= 0xbcd) || 498 c == 0xbd7 || 499 (c >= 0xc01 && c <= 0xc03) || 500 (c >= 0xc3e && c <= 0xc44) || 501 (c >= 0xc46 && c <= 0xc48) || 502 (c >= 0xc4a && c <= 0xc4d) || 503 (c >= 0xc55 && c <= 0xc56) || 504 (c >= 0xc82 && c <= 0xc83) || 505 (c >= 0xcbe && c <= 0xcc4) || 506 (c >= 0xcc6 && c <= 0xcc8) || 507 (c >= 0xcca && c <= 0xccd) || 508 (c >= 0xcd5 && c <= 0xcd6) || 
509 (c >= 0xd02 && c <= 0xd03) || 510 (c >= 0xd3e && c <= 0xd43) || 511 (c >= 0xd46 && c <= 0xd48) || 512 (c >= 0xd4a && c <= 0xd4d) || 513 c == 0xd57 || 514 c == 0xe31 || 515 (c >= 0xe34 && c <= 0xe3a) || 516 (c >= 0xe47 && c <= 0xe4e) || 517 c == 0xeb1 || 518 (c >= 0xeb4 && c <= 0xeb9) || 519 (c >= 0xebb && c <= 0xebc) || 520 (c >= 0xec8 && c <= 0xecd) || 521 (c >= 0xf18 && c <= 0xf19) || 522 c == 0xf35 || 523 c == 0xf37 || 524 c == 0xf39 || 525 c == 0xf3e || 526 c == 0xf3f || 527 (c >= 0xf71 && c <= 0xf84) || 528 (c >= 0xf86 && c <= 0xf8b) || 529 (c >= 0xf90 && c <= 0xf95) || 530 c == 0xf97 || 531 (c >= 0xf99 && c <= 0xfad) || 532 (c >= 0xfb1 && c <= 0xfb7) || 533 c == 0xfb9 || 534 (c >= 0x20d0 && c <= 0x20dc) || 535 c == 0x20e1 || 536 (c >= 0x302a && c <= 0x302f) || 537 c == 0x3099 || 538 c == 0x309a || 539 (c >= 0x30 && c <= 0x39) || 540 (c >= 0x660 && c <= 0x669) || 541 (c >= 0x6f0 && c <= 0x6f9) || 542 (c >= 0x966 && c <= 0x96f) || 543 (c >= 0x9e6 && c <= 0x9ef) || 544 (c >= 0xa66 && c <= 0xa6f) || 545 (c >= 0xae6 && c <= 0xaef) || 546 (c >= 0xb66 && c <= 0xb6f) || 547 (c >= 0xbe7 && c <= 0xbef) || 548 (c >= 0xc66 && c <= 0xc6f) || 549 (c >= 0xce6 && c <= 0xcef) || 550 (c >= 0xd66 && c <= 0xd6f) || 551 (c >= 0xe50 && c <= 0xe59) || 552 (c >= 0xed0 && c <= 0xed9) || 553 (c >= 0xf20 && c <= 0xf29) || 554 c == 0xb7 || 555 c == 0x2d0 || 556 c == 0x2d1 || 557 c == 0x387 || 558 c == 0x640 || 559 c == 0xe46 || 560 c == 0xec6 || 561 c == 0x3005 || 562 (c >= 0x3031 && c <= 0x3035) || 563 (c >= 0x309d && c <= 0x309e) || 564 (c >= 0x30fc && c <= 0x30fe)); 565} 566 567#if 0 568Bool IsLower(uint c) 569{ 570 uint map = MAP(c); 571 572 return (map & lowercase)!=0; 573} 574#endif 575 576Bool TY_(IsUpper)(uint c) 577{ 578 uint map = MAP(c); 579 580 return (map & uppercase)!=0; 581} 582 583uint TY_(ToLower)(uint c) 584{ 585 uint map = MAP(c); 586 587 if (map & uppercase) 588 c += 'a' - 'A'; 589 590 return c; 591} 592 593uint TY_(ToUpper)(uint c) 594{ 595 uint map = MAP(c); 
596 597 if (map & lowercase) 598 c += (uint) ('A' - 'a' ); 599 600 return c; 601} 602 603#if 0 604char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps ) 605{ 606 if ( !cfgBool(doc, TidyXmlTags) ) 607 { 608 if ( tocaps ) 609 { 610 c = (tmbchar) ToUpper(c); 611 } 612 else /* force to lower case */ 613 { 614 c = (tmbchar) ToLower(c); 615 } 616 } 617 return c; 618} 619#endif 620 621/* 622 return last character in string 623 this is useful when trailing quotemark 624 is missing on an attribute 625*/ 626static tmbchar LastChar( tmbstr str ) 627{ 628 if ( str && *str ) 629 { 630 int n = TY_(tmbstrlen)(str); 631 return str[n-1]; 632 } 633 return 0; 634} 635 636/* 637 node->type is one of these: 638 639 #define TextNode 1 640 #define StartTag 2 641 #define EndTag 3 642 #define StartEndTag 4 643*/ 644 645Lexer* TY_(NewLexer)( TidyDocImpl* doc ) 646{ 647 Lexer* lexer = (Lexer*) MemAlloc( sizeof(Lexer) ); 648 649 if ( lexer != NULL ) 650 { 651 ClearMemory( lexer, sizeof(Lexer) ); 652 653 lexer->lines = 1; 654 lexer->columns = 1; 655 lexer->state = LEX_CONTENT; 656 657 lexer->versions = (VERS_ALL|VERS_PROPRIETARY); 658 lexer->doctype = VERS_UNKNOWN; 659 lexer->root = &doc->root; 660 } 661 return lexer; 662} 663 664static Bool EndOfInput( TidyDocImpl* doc ) 665{ 666 assert( doc->docIn != NULL ); 667 return ( !doc->docIn->pushed && TY_(IsEOF)(doc->docIn) ); 668} 669 670void TY_(FreeLexer)( TidyDocImpl* doc ) 671{ 672 Lexer *lexer = doc->lexer; 673 if ( lexer ) 674 { 675 TY_(FreeStyles)( doc ); 676 677 /* See GetToken() */ 678 if ( lexer->pushed || lexer->itoken ) 679 { 680 if (lexer->pushed) 681 TY_(FreeNode)( doc, lexer->itoken ); 682 TY_(FreeNode)( doc, lexer->token ); 683 } 684 685 while ( lexer->istacksize > 0 ) 686 TY_(PopInline)( doc, NULL ); 687 688 MemFree( lexer->istack ); 689 MemFree( lexer->lexbuf ); 690 MemFree( lexer ); 691 doc->lexer = NULL; 692 } 693} 694 695/* Lexer uses bigger memory chunks than pprint as 696** it must hold the entire input document. 
not just 697** the last line or three. 698*/ 699static void AddByte( Lexer *lexer, tmbchar ch ) 700{ 701 if ( lexer->lexsize + 2 >= lexer->lexlength ) 702 { 703 tmbstr buf = NULL; 704 uint allocAmt = lexer->lexlength; 705 while ( lexer->lexsize + 2 >= allocAmt ) 706 { 707 if ( allocAmt == 0 ) 708 allocAmt = 8192; 709 else 710 allocAmt *= 2; 711 } 712 buf = (tmbstr) MemRealloc( lexer->lexbuf, allocAmt ); 713 if ( buf ) 714 { 715 ClearMemory( buf + lexer->lexlength, 716 allocAmt - lexer->lexlength ); 717 lexer->lexbuf = buf; 718 lexer->lexlength = allocAmt; 719 } 720 } 721 722 lexer->lexbuf[ lexer->lexsize++ ] = ch; 723 lexer->lexbuf[ lexer->lexsize ] = '\0'; /* debug */ 724} 725 726static void ChangeChar( Lexer *lexer, tmbchar c ) 727{ 728 if ( lexer->lexsize > 0 ) 729 { 730 lexer->lexbuf[ lexer->lexsize-1 ] = c; 731 } 732} 733 734/* store character c as UTF-8 encoded byte stream */ 735void TY_(AddCharToLexer)( Lexer *lexer, uint c ) 736{ 737 int i, err, count = 0; 738 tmbchar buf[10] = {0}; 739 740 err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count ); 741 if (err) 742 { 743#if 0 && defined(_DEBUG) 744 fprintf( stderr, "lexer UTF-8 encoding error for U+%x : ", c ); 745#endif 746 /* replacement character 0xFFFD encoded as UTF-8 */ 747 buf[0] = (byte) 0xEF; 748 buf[1] = (byte) 0xBF; 749 buf[2] = (byte) 0xBD; 750 count = 3; 751 } 752 753 for ( i = 0; i < count; ++i ) 754 AddByte( lexer, buf[i] ); 755} 756 757static void AddStringToLexer( Lexer *lexer, ctmbstr str ) 758{ 759 uint c; 760 761 /* Many (all?) compilers will sign-extend signed chars (the default) when 762 ** converting them to unsigned integer values. We must cast our char to 763 ** unsigned char before assigning it to prevent this from happening. 
764 */ 765 while( 0 != (c = (unsigned char) *str++ )) 766 TY_(AddCharToLexer)( lexer, c ); 767} 768 769 770static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer ) 771{ 772 lexer->lines = doc->docIn->curline; 773 lexer->columns = doc->docIn->curcol; 774} 775 776/* 777 No longer attempts to insert missing ';' for unknown 778 enitities unless one was present already, since this 779 gives unexpected results. 780 781 For example: <a href="something.htm?foo&bar&fred"> 782 was tidied to: <a href="something.htm?foo&bar;&fred;"> 783 rather than: <a href="something.htm?foo&bar&fred"> 784 785 My thanks for Maurice Buxton for spotting this. 786 787 Also Randy Waki pointed out the following case for the 788 04 Aug 00 version (bug #433012): 789 790 For example: <a href="something.htm?id=1&lang=en"> 791 was tidied to: <a href="something.htm?id=1⟨=en"> 792 rather than: <a href="something.htm?id=1&lang=en"> 793 794 where "lang" is a known entity (#9001), but browsers would 795 misinterpret "⟨" because it had a value > 256. 796 797 So the case of an apparently known entity with a value > 256 and 798 missing a semicolon is handled specially. 799 800 "ParseEntity" is also a bit of a misnomer - it handles entities and 801 numeric character references. Invalid NCR's are now reported. 
802*/ 803static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode ) 804{ 805 uint start; 806 Bool first = yes, semicolon = no, found = no; 807 Bool isXml = cfgBool( doc, TidyXmlTags ); 808 uint c, ch, startcol, entver = 0; 809 Lexer* lexer = doc->lexer; 810 811 start = lexer->lexsize - 1; /* to start at "&" */ 812 startcol = doc->docIn->curcol - 1; 813 814 while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream ) 815 { 816 if ( c == ';' ) 817 { 818 semicolon = yes; 819 break; 820 } 821 822 if (first && c == '#') 823 { 824#if SUPPORT_ASIAN_ENCODINGS 825 if ( !cfgBool(doc, TidyNCR) || 826 cfg(doc, TidyInCharEncoding) == BIG5 || 827 cfg(doc, TidyInCharEncoding) == SHIFTJIS ) 828 { 829 TY_(UngetChar)('#', doc->docIn); 830 return; 831 } 832#endif 833 TY_(AddCharToLexer)( lexer, c ); 834 first = no; 835 continue; 836 } 837 838 first = no; 839 840 if ( TY_(IsNamechar)(c) ) 841 { 842 TY_(AddCharToLexer)( lexer, c ); 843 continue; 844 } 845 846 /* otherwise put it back */ 847 848 TY_(UngetChar)( c, doc->docIn ); 849 break; 850 } 851 852 /* make sure entity is NULL terminated */ 853 lexer->lexbuf[lexer->lexsize] = '\0'; 854 855 /* Should contrain version to XML/XHTML if ' 856 ** is encountered. But this is not possible with 857 ** Tidy's content model bit mask. 
858 */ 859 if ( TY_(tmbstrcmp)(lexer->lexbuf+start, "&apos") == 0 860 && !cfgBool(doc, TidyXmlOut) 861 && !lexer->isvoyager 862 && !cfgBool(doc, TidyXhtmlOut) ) 863 TY_(ReportEntityError)( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 ); 864 865 /* Lookup entity code and version 866 */ 867 found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver ); 868 869 /* deal with unrecognized or invalid entities */ 870 /* #433012 - fix by Randy Waki 17 Feb 01 */ 871 /* report invalid NCR's - Terry Teague 01 Sep 01 */ 872 if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') ) 873 { 874 /* set error position just before offending character */ 875 SetLexerLocus( doc, lexer ); 876 lexer->columns = startcol; 877 878 if (lexer->lexsize > start + 1) 879 { 880 if (ch >= 128 && ch <= 159) 881 { 882 /* invalid numeric character reference */ 883 884 uint c1 = 0; 885 int replaceMode = DISCARDED_CHAR; 886 887 if ( TY_(ReplacementCharEncoding) == WIN1252 ) 888 c1 = TY_(DecodeWin1252)( ch ); 889 else if ( TY_(ReplacementCharEncoding) == MACROMAN ) 890 c1 = TY_(DecodeMacRoman)( ch ); 891 892 if ( c1 ) 893 replaceMode = REPLACED_CHAR; 894 895 if ( c != ';' ) /* issue warning if not terminated by ';' */ 896 TY_(ReportEntityError)( doc, MISSING_SEMICOLON_NCR, 897 lexer->lexbuf+start, c ); 898 899 TY_(ReportEncodingError)(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR); 900 901 902/* Apple Changes: 903 2007-02-07 iccir [4642206] Don't insert invalid characters in raw mode 904 2007-06-27 iccir [5222259] The previous fix for 4642206 disabled TidyQuoteAmpersand when in RAW encoding mode. 905 Since PPrintChar() has no character look-ahead, I am resorting to quoting the 906 ampersand in the lexer. 
907*/ 908#ifdef TIDY_APPLE_CHANGES 909 if ( cfg(doc, TidyOutCharEncoding) != RAW ) 910 { 911#endif 912 if ( c1 ) 913 { 914 /* make the replacement */ 915 lexer->lexsize = start; 916 TY_(AddCharToLexer)( lexer, c1 ); 917 semicolon = no; 918 } 919 else 920 { 921 /* discard */ 922 lexer->lexsize = start; 923 semicolon = no; 924 } 925#ifdef TIDY_APPLE_CHANGES 926 } 927#endif 928 } 929 else 930 TY_(ReportEntityError)( doc, UNKNOWN_ENTITY, 931 lexer->lexbuf+start, ch ); 932 933 if (semicolon) 934 TY_(AddCharToLexer)( lexer, ';' ); 935 } 936 else /* naked & */ 937#ifdef TIDY_APPLE_CHANGES 938 { 939 if ( (cfg(doc, TidyOutCharEncoding) == RAW && cfgBool(doc, TidyQuoteAmpersand)) ) 940 AddStringToLexer( lexer, "amp;" ); 941#endif 942 TY_(ReportEntityError)( doc, UNESCAPED_AMPERSAND, 943 lexer->lexbuf+start, ch ); 944#ifdef TIDY_APPLE_CHANGES 945 } 946#endif 947 } 948 else 949 { 950 if ( c != ';' ) /* issue warning if not terminated by ';' */ 951 { 952 /* set error position just before offending chararcter */ 953 SetLexerLocus( doc, lexer ); 954 lexer->columns = startcol; 955 TY_(ReportEntityError)( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c ); 956 } 957 958#ifdef TIDY_APPLE_CHANGES 959 if ( cfg(doc, TidyOutCharEncoding) == RAW ) 960 { 961 AddCharToLexer( lexer, ';' ); 962 } 963 else 964 { 965#endif 966 lexer->lexsize = start; 967 if ( ch == 160 && (mode == Preformatted) ) 968 ch = ' '; 969 TY_(AddCharToLexer)( lexer, ch ); 970 971 if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) ) 972 AddStringToLexer( lexer, "amp;" ); 973 974 /* Detect extended vs. 
basic entities */ 975 TY_(ConstrainVersion)( doc, entver ); 976#ifdef TIDY_APPLE_CHANGES 977 } 978#endif 979 } 980} 981 982static tmbchar ParseTagName( TidyDocImpl* doc ) 983{ 984 Lexer *lexer = doc->lexer; 985 uint c = lexer->lexbuf[ lexer->txtstart ]; 986 Bool xml = cfgBool(doc, TidyXmlTags); 987 988 /* fold case of first character in buffer */ 989 if (!xml && TY_(IsUpper)(c)) 990 lexer->lexbuf[lexer->txtstart] = (tmbchar) TY_(ToLower)(c); 991 992 while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream) 993 { 994 if ((!xml && !TY_(IsNamechar)(c)) || 995 (xml && !TY_(IsXMLNamechar)(c))) 996 break; 997 998 /* fold case of subsequent characters */ 999 if (!xml && TY_(IsUpper)(c)) 1000 c = TY_(ToLower)(c); 1001 1002 TY_(AddCharToLexer)(lexer, c); 1003 } 1004 1005 lexer->txtend = lexer->lexsize; 1006 return (tmbchar) c; 1007} 1008 1009/* 1010 Used for elements and text nodes 1011 element name is NULL for text nodes 1012 start and end are offsets into lexbuf 1013 which contains the textual content of 1014 all elements in the parse tree. 1015 1016 parent and content allow traversal 1017 of the parse tree in any direction. 1018 attributes are represented as a linked 1019 list of AttVal nodes which hold the 1020 strings for attribute/value pairs. 
1021*/ 1022 1023 1024Node *TY_(NewNode)(Lexer *lexer) 1025{ 1026 Node* node = (Node*) MemAlloc( sizeof(Node) ); 1027 ClearMemory( node, sizeof(Node) ); 1028 if ( lexer ) 1029 { 1030 node->line = lexer->lines; 1031 node->column = lexer->columns; 1032 } 1033 node->type = TextNode; 1034 return node; 1035} 1036 1037/* used to clone heading nodes when split by an <HR> */ 1038Node *TY_(CloneNode)( TidyDocImpl* doc, Node *element ) 1039{ 1040 Lexer* lexer = doc->lexer; 1041 Node *node = TY_(NewNode)( lexer ); 1042 1043 node->start = lexer->lexsize; 1044 node->end = lexer->lexsize; 1045 1046 if ( element ) 1047 { 1048 node->parent = element->parent; 1049 node->type = element->type; 1050 node->closed = element->closed; 1051 node->implicit = element->implicit; 1052 node->tag = element->tag; 1053 node->element = TY_(tmbstrdup)( element->element ); 1054 node->attributes = TY_(DupAttrs)( doc, element->attributes ); 1055 } 1056 return node; 1057} 1058 1059/* free node's attributes */ 1060void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node ) 1061{ 1062 1063 while ( node->attributes ) 1064 { 1065 AttVal *av = node->attributes; 1066 1067 if ( av->attribute ) 1068 { 1069 if ( (attrIsID(av) || attrIsNAME(av)) && 1070 TY_(IsAnchorElement)(doc, node) ) 1071 { 1072 TY_(RemoveAnchorByNode)( doc, node ); 1073 } 1074 } 1075 1076 node->attributes = av->next; 1077 TY_(FreeAttribute)( doc, av ); 1078 } 1079} 1080 1081/* doesn't repair attribute list linkage */ 1082void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av ) 1083{ 1084 TY_(FreeNode)( doc, av->asp ); 1085 TY_(FreeNode)( doc, av->php ); 1086 MemFree( av->attribute ); 1087 MemFree( av->value ); 1088 MemFree( av ); 1089} 1090 1091/* detach attribute from node 1092*/ 1093void TY_(DetachAttribute)( Node *node, AttVal *attr ) 1094{ 1095 AttVal *av, *prev = NULL; 1096 1097 for ( av = node->attributes; av; av = av->next ) 1098 { 1099 if ( av == attr ) 1100 { 1101 if ( prev ) 1102 prev->next = attr->next; 1103 else 1104 node->attributes = 
attr->next; 1105 break; 1106 } 1107 prev = av; 1108 } 1109} 1110 1111/* detach attribute from node then free it 1112*/ 1113void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr ) 1114{ 1115 TY_(DetachAttribute)( node, attr ); 1116 TY_(FreeAttribute)( doc, attr ); 1117} 1118 1119/* 1120 Free document nodes by iterating through peers and recursing 1121 through children. Set next to NULL before calling TY_(FreeNode)() 1122 to avoid freeing peer nodes. Doesn't patch up prev/next links. 1123 */ 1124void TY_(FreeNode)( TidyDocImpl* doc, Node *node ) 1125{ 1126 while ( node ) 1127 { 1128 Node* next = node->next; 1129 1130 TY_(FreeAttrs)( doc, node ); 1131 TY_(FreeNode)( doc, node->content ); 1132 MemFree( node->element ); 1133#ifdef TIDY_STORE_ORIGINAL_TEXT 1134 if (node->otext) 1135 MemFree(node->otext); 1136#endif 1137 if (RootNode != node->type) 1138 MemFree( node ); 1139 else 1140 node->content = NULL; 1141 1142 node = next; 1143 } 1144} 1145 1146#ifdef TIDY_STORE_ORIGINAL_TEXT 1147void StoreOriginalTextInToken(TidyDocImpl* doc, Node* node, uint count) 1148{ 1149 if (!doc->storeText) 1150 return; 1151 1152 if (count >= doc->docIn->otextlen) 1153 return; 1154 1155 if (!doc->docIn->otextsize) 1156 return; 1157 1158 if (count == 0) 1159 { 1160 node->otext = doc->docIn->otextbuf; 1161 doc->docIn->otextbuf = NULL; 1162 doc->docIn->otextlen = 0; 1163 doc->docIn->otextsize = 0; 1164 } 1165 else 1166 { 1167 uint len = doc->docIn->otextlen; 1168 tmbstr buf1 = (tmbstr)MemAlloc(len - count + 1); 1169 tmbstr buf2 = (tmbstr)MemAlloc(count + 1); 1170 uint i, j; 1171 1172 /* strncpy? 
*/ 1173 1174 for (i = 0; i < len - count; ++i) 1175 buf1[i] = doc->docIn->otextbuf[i]; 1176 1177 buf1[i] = 0; 1178 1179 for (j = 0; j + i < len; ++j) 1180 buf2[j] = doc->docIn->otextbuf[j + i]; 1181 1182 buf2[j] = 0; 1183 1184 MemFree(doc->docIn->otextbuf); 1185 node->otext = buf1; 1186 doc->docIn->otextbuf = buf2; 1187 doc->docIn->otextlen = count; 1188 doc->docIn->otextsize = count + 1; 1189 } 1190} 1191#endif 1192 1193Node* TY_(TextToken)( Lexer *lexer ) 1194{ 1195 Node *node = TY_(NewNode)( lexer ); 1196 node->start = lexer->txtstart; 1197 node->end = lexer->txtend; 1198 return node; 1199} 1200 1201/* used for creating preformatted text from Word2000 */ 1202Node *TY_(NewLineNode)( Lexer *lexer ) 1203{ 1204 Node *node = TY_(NewNode)( lexer ); 1205 node->start = lexer->lexsize; 1206 TY_(AddCharToLexer)( lexer, (uint)'\n' ); 1207 node->end = lexer->lexsize; 1208 return node; 1209} 1210 1211/* used for adding a for Word2000 */ 1212Node* TY_(NewLiteralTextNode)( Lexer *lexer, ctmbstr txt ) 1213{ 1214 Node *node = TY_(NewNode)( lexer ); 1215 node->start = lexer->lexsize; 1216 AddStringToLexer( lexer, txt ); 1217 node->end = lexer->lexsize; 1218 return node; 1219} 1220 1221static Node* TagToken( TidyDocImpl* doc, NodeType type ) 1222{ 1223 Lexer* lexer = doc->lexer; 1224 Node* node = TY_(NewNode)( lexer ); 1225 node->type = type; 1226 node->element = TY_(tmbstrndup)( lexer->lexbuf + lexer->txtstart, 1227 lexer->txtend - lexer->txtstart ); 1228 node->start = lexer->txtstart; 1229 node->end = lexer->txtstart; 1230 1231 if ( type == StartTag || type == StartEndTag || type == EndTag ) 1232 TY_(FindTag)(doc, node); 1233 1234 return node; 1235} 1236 1237static Node* NewToken(TidyDocImpl* doc, NodeType type) 1238{ 1239 Lexer* lexer = doc->lexer; 1240 Node* node = TY_(NewNode)(lexer); 1241 node->type = type; 1242 node->start = lexer->txtstart; 1243 node->end = lexer->txtend; 1244#ifdef TIDY_STORE_ORIGINAL_TEXT 1245 StoreOriginalTextInToken(doc, node, 0); 1246#endif 1247 
return node; 1248} 1249 1250#define CommentToken(doc) NewToken(doc, CommentTag) 1251#define DocTypeToken(doc) NewToken(doc, DocTypeTag) 1252#define PIToken(doc) NewToken(doc, ProcInsTag) 1253#define AspToken(doc) NewToken(doc, AspTag) 1254#define JsteToken(doc) NewToken(doc, JsteTag) 1255#define PhpToken(doc) NewToken(doc, PhpTag) 1256#define XmlDeclToken(doc) NewToken(doc, XmlDecl) 1257#define SectionToken(doc) NewToken(doc, SectionTag) 1258#define CDATAToken(doc) NewToken(doc, CDATATag) 1259 1260void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str ) 1261{ 1262 byte c; 1263 while(0 != (c = *str++) ) 1264 TY_(AddCharToLexer)( lexer, c ); 1265} 1266 1267/* 1268void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ) 1269{ 1270 byte c; 1271 int ix; 1272 1273 for ( ix=0; ix < len && (c = *str++); ++ix ) 1274 TY_(AddCharToLexer)(lexer, c); 1275} 1276*/ 1277 1278/* find doctype element */ 1279Node *TY_(FindDocType)( TidyDocImpl* doc ) 1280{ 1281 Node* node; 1282 for ( node = (doc ? doc->root.content : NULL); 1283 node && node->type != DocTypeTag; 1284 node = node->next ) 1285 /**/; 1286 return node; 1287} 1288 1289/* find parent container element */ 1290Node* TY_(FindContainer)( Node* node ) 1291{ 1292 for ( node = (node ? node->parent : NULL); 1293 node && TY_(nodeHasCM)(node, CM_INLINE); 1294 node = node->parent ) 1295 /**/; 1296 1297 return node; 1298} 1299 1300 1301/* find html element */ 1302Node *TY_(FindHTML)( TidyDocImpl* doc ) 1303{ 1304 Node *node; 1305 for ( node = (doc ? doc->root.content : NULL); 1306 node && !nodeIsHTML(node); 1307 node = node->next ) 1308 /**/; 1309 1310 return node; 1311} 1312 1313/* find XML Declaration */ 1314Node *TY_(FindXmlDecl)(TidyDocImpl* doc) 1315{ 1316 Node *node; 1317 for ( node = (doc ? 
doc->root.content : NULL); 1318 node && !(node->type == XmlDecl); 1319 node = node->next ) 1320 /**/; 1321 1322 return node; 1323} 1324 1325 1326Node *TY_(FindHEAD)( TidyDocImpl* doc ) 1327{ 1328 Node *node = TY_(FindHTML)( doc ); 1329 1330 if ( node ) 1331 { 1332 for ( node = node->content; 1333 node && !nodeIsHEAD(node); 1334 node = node->next ) 1335 /**/; 1336 } 1337 1338 return node; 1339} 1340 1341Node *TY_(FindTITLE)(TidyDocImpl* doc) 1342{ 1343 Node *node = TY_(FindHEAD)(doc); 1344 1345 if (node) 1346 for (node = node->content; 1347 node && !nodeIsTITLE(node); 1348 node = node->next) {} 1349 1350 return node; 1351} 1352 1353Node *TY_(FindBody)( TidyDocImpl* doc ) 1354{ 1355 Node *node = ( doc ? doc->root.content : NULL ); 1356 1357 while ( node && !nodeIsHTML(node) ) 1358 node = node->next; 1359 1360 if (node == NULL) 1361 return NULL; 1362 1363 node = node->content; 1364 while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) ) 1365 node = node->next; 1366 1367 if ( node && nodeIsFRAMESET(node) ) 1368 { 1369 node = node->content; 1370 while ( node && !nodeIsNOFRAMES(node) ) 1371 node = node->next; 1372 1373 if ( node ) 1374 { 1375 node = node->content; 1376 while ( node && !nodeIsBODY(node) ) 1377 node = node->next; 1378 } 1379 } 1380 1381 return node; 1382} 1383 1384/* add meta element for Tidy */ 1385Bool TY_(AddGenerator)( TidyDocImpl* doc ) 1386{ 1387 AttVal *attval; 1388 Node *node; 1389 Node *head = TY_(FindHEAD)( doc ); 1390 tmbchar buf[256]; 1391 1392 if (head) 1393 { 1394#ifdef PLATFORM_NAME 1395 TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for "PLATFORM_NAME" (vers %s), see www.w3.org", 1396 tidyReleaseDate()); 1397#else 1398 TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy (vers %s), see www.w3.org", tidyReleaseDate()); 1399#endif 1400 1401 for ( node = head->content; node; node = node->next ) 1402 { 1403 if ( nodeIsMETA(node) ) 1404 { 1405 attval = TY_(AttrGetById)(node, TidyAttr_NAME); 1406 1407 if (AttrValueIs(attval, "generator")) 1408 { 
1409 attval = TY_(AttrGetById)(node, TidyAttr_CONTENT); 1410 1411 if (AttrHasValue(attval) && 1412 TY_(tmbstrncasecmp)(attval->value, "HTML Tidy", 9) == 0) 1413 { 1414 /* update the existing content to reflect the */ 1415 /* actual version of Tidy currently being used */ 1416 1417 MemFree(attval->value); 1418 attval->value = TY_(tmbstrdup)(buf); 1419 return no; 1420 } 1421 } 1422 } 1423 } 1424 1425 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) 1426 { 1427 node = TY_(InferredTag)(doc, TidyTag_META); 1428 TY_(AddAttribute)( doc, node, "name", "generator" ); 1429 TY_(AddAttribute)( doc, node, "content", buf ); 1430 TY_(InsertNodeAtStart)( head, node ); 1431 return yes; 1432 } 1433 } 1434 1435 return no; 1436} 1437 1438/* examine <!DOCTYPE> to identify version */ 1439static uint FindGivenVersion( TidyDocImpl* doc, Node* doctype ) 1440{ 1441 AttVal * fpi = TY_(GetAttrByName)(doctype, "PUBLIC"); 1442 uint vers; 1443 1444 if (!fpi || !fpi->value) 1445 return VERS_UNKNOWN; 1446 1447 vers = GetVersFromFPI(fpi->value); 1448 1449 if (VERS_XHTML & vers) 1450 { 1451 TY_(SetOptionBool)(doc, TidyXmlOut, yes); 1452 TY_(SetOptionBool)(doc, TidyXhtmlOut, yes); 1453 doc->lexer->isvoyager = yes; 1454 } 1455 1456 /* todo: add a warning if case does not match? 
*/ 1457 MemFree(fpi->value); 1458 fpi->value = TY_(tmbstrdup)(GetFPIFromVers(vers)); 1459 1460 return vers; 1461} 1462 1463/* return guessed version */ 1464uint TY_(ApparentVersion)( TidyDocImpl* doc ) 1465{ 1466 if ((doc->lexer->doctype == XH11 || 1467 doc->lexer->doctype == XB10) && 1468 (doc->lexer->versions & doc->lexer->doctype)) 1469 return doc->lexer->doctype; 1470 else 1471 return TY_(HTMLVersion)(doc); 1472} 1473 1474ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool ARG_UNUSED(isXhtml) ) 1475{ 1476 ctmbstr name = GetNameFromVers(vers); 1477 1478 /* this test has moved to ReportMarkupVersion() in localize.c, for localization reasons */ 1479 /* 1480 if (!name) 1481 name = "HTML Proprietary"; 1482 */ 1483 1484 return name; 1485} 1486 1487Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc ) 1488{ 1489 Bool isXhtml = doc->lexer->isvoyager; 1490 Node* doctype; 1491 1492 /* Do not warn in XHTML mode */ 1493 if ( isXhtml ) 1494 return no; 1495 1496 /* Do not warn if emitted doctype is proprietary */ 1497 if ( TY_(HTMLVersionNameFromCode)(doc->lexer->versionEmitted, isXhtml ) == NULL ) 1498 return no; 1499 1500 /* Do not warn if no SI is possible */ 1501 if ( GetSIFromVers(doc->lexer->versionEmitted) == NULL ) 1502 return no; 1503 1504 if ( (doctype = TY_(FindDocType)( doc )) != NULL 1505 && TY_(GetAttrByName)(doctype, "SYSTEM") == NULL ) 1506 return yes; 1507 1508 return no; 1509} 1510 1511 1512/* Put DOCTYPE declaration between the 1513** <?xml version "1.0" ... ?> declaration, if any, 1514** and the <html> tag. Should also work for any comments, 1515** etc. that may precede the <html> tag. 
1516*/ 1517 1518static Node* NewDocTypeNode( TidyDocImpl* doc ) 1519{ 1520 Node* doctype = NULL; 1521 Node* html = TY_(FindHTML)( doc ); 1522 1523 if ( !html ) 1524 return NULL; 1525 1526 doctype = TY_(NewNode)( NULL ); 1527 doctype->type = DocTypeTag; 1528 TY_(InsertNodeBeforeElement)(html, doctype); 1529 return doctype; 1530} 1531 1532Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc ) 1533{ 1534 Lexer *lexer = doc->lexer; 1535 Node *doctype = TY_(FindDocType)( doc ); 1536 TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode); 1537 ctmbstr pub = "PUBLIC"; 1538 ctmbstr sys = "SYSTEM"; 1539 1540 lexer->versionEmitted = TY_(ApparentVersion)( doc ); 1541 1542 if (dtmode == TidyDoctypeOmit) 1543 { 1544 if (doctype) 1545 TY_(DiscardElement)(doc, doctype); 1546 return yes; 1547 } 1548 1549 if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype)) 1550 return no; 1551 1552 if (!doctype) 1553 { 1554 doctype = NewDocTypeNode(doc); 1555 doctype->element = TY_(tmbstrdup)("html"); 1556 } 1557 else 1558 { 1559 doctype->element = TY_(tmbstrtolower)(doctype->element); 1560 } 1561 1562 switch(dtmode) 1563 { 1564 case TidyDoctypeStrict: 1565 /* XHTML 1.0 Strict */ 1566 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S)); 1567 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S)); 1568 lexer->versionEmitted = X10S; 1569 break; 1570 case TidyDoctypeLoose: 1571 /* XHTML 1.0 Transitional */ 1572 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T)); 1573 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T)); 1574 lexer->versionEmitted = X10T; 1575 break; 1576 case TidyDoctypeUser: 1577 /* user defined document type declaration */ 1578 TY_(RepairAttrValue)(doc, doctype, pub, cfgStr(doc, TidyDoctype)); 1579 TY_(RepairAttrValue)(doc, doctype, sys, ""); 1580 break; 1581 case TidyDoctypeAuto: 1582 if (lexer->versions & XH11 && lexer->doctype == XH11) 1583 { 1584 if (!TY_(GetAttrByName)(doctype, sys)) 1585 TY_(RepairAttrValue)(doc, doctype, sys, 
GetSIFromVers(XH11)); 1586 lexer->versionEmitted = XH11; 1587 return yes; 1588 } 1589 else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40)) 1590 { 1591 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(XH11)); 1592 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11)); 1593 lexer->versionEmitted = XH11; 1594 } 1595 else if (lexer->versions & XB10 && lexer->doctype == XB10) 1596 { 1597 if (!TY_(GetAttrByName)(doctype, sys)) 1598 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XB10)); 1599 lexer->versionEmitted = XB10; 1600 return yes; 1601 } 1602 else if (lexer->versions & VERS_HTML40_STRICT) 1603 { 1604 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S)); 1605 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S)); 1606 lexer->versionEmitted = X10S; 1607 } 1608 else if (lexer->versions & VERS_FRAMESET) 1609 { 1610 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10F)); 1611 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10F)); 1612 lexer->versionEmitted = X10F; 1613 } 1614 else if (lexer->versions & VERS_LOOSE) 1615 { 1616 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T)); 1617 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T)); 1618 lexer->versionEmitted = X10T; 1619 } 1620 else 1621 { 1622 if (doctype) 1623 TY_(DiscardElement)(doc, doctype); 1624 return no; 1625 } 1626 break; 1627 } 1628 1629 return no; 1630} 1631 1632/* fixup doctype if missing */ 1633Bool TY_(FixDocType)( TidyDocImpl* doc ) 1634{ 1635 Lexer* lexer = doc->lexer; 1636 Node* doctype = TY_(FindDocType)( doc ); 1637 uint dtmode = cfg( doc, TidyDoctypeMode ); 1638 uint guessed = VERS_UNKNOWN; 1639 Bool hadSI = no; 1640 1641 if (dtmode == TidyDoctypeAuto && 1642 lexer->versions & lexer->doctype && 1643 !(VERS_XHTML & lexer->doctype && !lexer->isvoyager) 1644 && TY_(FindDocType)(doc)) 1645 { 1646 lexer->versionEmitted = lexer->doctype; 1647 return yes; 1648 } 1649 1650 if (dtmode == TidyDoctypeOmit) 1651 { 1652 if 
(doctype) 1653 TY_(DiscardElement)( doc, doctype ); 1654 lexer->versionEmitted = TY_(ApparentVersion)( doc ); 1655 return yes; 1656 } 1657 1658 if (cfgBool(doc, TidyXmlOut)) 1659 return yes; 1660 1661 if (doctype) 1662 hadSI = TY_(GetAttrByName)(doctype, "SYSTEM") != NULL; 1663 1664 if ((dtmode == TidyDoctypeStrict || 1665 dtmode == TidyDoctypeLoose) && doctype) 1666 { 1667 TY_(DiscardElement)(doc, doctype); 1668 doctype = NULL; 1669 } 1670 1671 switch (dtmode) 1672 { 1673 case TidyDoctypeStrict: 1674 guessed = H41S; 1675 break; 1676 case TidyDoctypeLoose: 1677 guessed = H41T; 1678 break; 1679 case TidyDoctypeAuto: 1680 guessed = TY_(HTMLVersion)(doc); 1681 break; 1682 } 1683 1684 lexer->versionEmitted = guessed; 1685 if (guessed == VERS_UNKNOWN) 1686 return no; 1687 1688 if (doctype) 1689 { 1690 doctype->element = TY_(tmbstrtolower)(doctype->element); 1691 } 1692 else 1693 { 1694 doctype = NewDocTypeNode(doc); 1695 doctype->element = TY_(tmbstrdup)("html"); 1696 } 1697 1698 TY_(RepairAttrValue)(doc, doctype, "PUBLIC", GetFPIFromVers(guessed)); 1699 1700 if (hadSI) 1701 TY_(RepairAttrValue)(doc, doctype, "SYSTEM", GetSIFromVers(guessed)); 1702 1703 return yes; 1704} 1705 1706/* ensure XML document starts with <?xml version="1.0"?> */ 1707/* add encoding attribute if not using ASCII or UTF-8 output */ 1708Bool TY_(FixXmlDecl)( TidyDocImpl* doc ) 1709{ 1710 Node* xml; 1711 AttVal *version, *encoding; 1712 Lexer*lexer = doc->lexer; 1713 Node* root = &doc->root; 1714 1715 if ( root->content && root->content->type == XmlDecl ) 1716 { 1717 xml = root->content; 1718 } 1719 else 1720 { 1721 xml = TY_(NewNode)(lexer); 1722 xml->type = XmlDecl; 1723 if ( root->content ) 1724 TY_(InsertNodeBeforeElement)(root->content, xml); 1725 else 1726 root->content = xml; 1727 } 1728 1729 version = TY_(GetAttrByName)(xml, "version"); 1730 encoding = TY_(GetAttrByName)(xml, "encoding"); 1731 1732 /* 1733 We need to insert a check if declared encoding 1734 and output encoding mismatch and 
fix the XML 1735 declaration accordingly!!! 1736 */ 1737 1738 if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 ) 1739 { 1740 ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); 1741 if ( enc ) 1742 TY_(AddAttribute)( doc, xml, "encoding", enc ); 1743 } 1744 1745 if ( version == NULL ) 1746 TY_(AddAttribute)( doc, xml, "version", "1.0" ); 1747 return yes; 1748} 1749 1750Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id) 1751{ 1752 Lexer *lexer = doc->lexer; 1753 Node *node = TY_(NewNode)( lexer ); 1754 const Dict* dict = TY_(LookupTagDef)(id); 1755 1756 assert( dict != NULL ); 1757 1758 node->type = StartTag; 1759 node->implicit = yes; 1760 node->element = TY_(tmbstrdup)(dict->name); 1761 node->tag = dict; 1762 node->start = lexer->txtstart; 1763 node->end = lexer->txtend; 1764 1765 return node; 1766} 1767 1768static Bool ExpectsContent(Node *node) 1769{ 1770 if (node->type != StartTag) 1771 return no; 1772 1773 /* unknown element? */ 1774 if (node->tag == NULL) 1775 return yes; 1776 1777 if (node->tag->model & CM_EMPTY) 1778 return no; 1779 1780 return yes; 1781} 1782 1783/* 1784 create a text node for the contents of 1785 a CDATA element like style or script 1786 which ends with </foo> for some foo. 
1787*/ 1788 1789typedef enum 1790{ 1791 CDATA_INTERMEDIATE, 1792 CDATA_STARTTAG, 1793 CDATA_ENDTAG 1794} CDATAState; 1795 1796static Node *GetCDATA( TidyDocImpl* doc, Node *container ) 1797{ 1798 Lexer* lexer = doc->lexer; 1799 uint start = 0; 1800 int nested = 0; 1801 CDATAState state = CDATA_INTERMEDIATE; 1802 uint i; 1803 Bool isEmpty = yes; 1804 Bool matches = no; 1805 uint c; 1806 Bool hasSrc = TY_(AttrGetById)(container, TidyAttr_SRC) != NULL; 1807 1808 SetLexerLocus( doc, lexer ); 1809 lexer->waswhite = no; 1810 lexer->txtstart = lexer->txtend = lexer->lexsize; 1811 1812 /* seen start tag, look for matching end tag */ 1813 while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream) 1814 { 1815 TY_(AddCharToLexer)(lexer, c); 1816 lexer->txtend = lexer->lexsize; 1817 1818 if (state == CDATA_INTERMEDIATE) 1819 { 1820 if (c != '<') 1821 { 1822 if (isEmpty && !TY_(IsWhite)(c)) 1823 isEmpty = no; 1824 continue; 1825 } 1826 1827 c = TY_(ReadChar)(doc->docIn); 1828 1829 if (TY_(IsLetter)(c)) 1830 { 1831 /* <head><script src=foo><meta name=foo content=bar>*/ 1832 if (hasSrc && isEmpty && nodeIsSCRIPT(container)) 1833 { 1834 /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */ 1835 lexer->lexsize = lexer->txtstart; 1836 TY_(UngetChar)(c, doc->docIn); 1837 TY_(UngetChar)('<', doc->docIn); 1838 return NULL; 1839 } 1840 TY_(AddCharToLexer)(lexer, c); 1841 start = lexer->lexsize - 1; 1842 state = CDATA_STARTTAG; 1843 } 1844 else if (c == '/') 1845 { 1846 TY_(AddCharToLexer)(lexer, c); 1847 1848 c = TY_(ReadChar)(doc->docIn); 1849 1850 if (!TY_(IsLetter)(c)) 1851 { 1852 TY_(UngetChar)(c, doc->docIn); 1853 continue; 1854 } 1855 TY_(UngetChar)(c, doc->docIn); 1856 1857 start = lexer->lexsize; 1858 state = CDATA_ENDTAG; 1859 } 1860 else if (c == '\\') 1861 { 1862 /* recognize document.write("<script><\/script>") */ 1863 TY_(AddCharToLexer)(lexer, c); 1864 1865 c = TY_(ReadChar)(doc->docIn); 1866 1867 if (c != '/') 1868 { 1869 TY_(UngetChar)(c, doc->docIn); 1870 continue; 
1871 } 1872 1873 TY_(AddCharToLexer)(lexer, c); 1874 c = TY_(ReadChar)(doc->docIn); 1875 1876 if (!TY_(IsLetter)(c)) 1877 { 1878 TY_(UngetChar)(c, doc->docIn); 1879 continue; 1880 } 1881 TY_(UngetChar)(c, doc->docIn); 1882 1883 start = lexer->lexsize; 1884 state = CDATA_ENDTAG; 1885 } 1886 else 1887 { 1888 TY_(UngetChar)(c, doc->docIn); 1889 } 1890 } 1891 /* '<' + Letter found */ 1892 else if (state == CDATA_STARTTAG) 1893 { 1894 if (TY_(IsLetter)(c)) 1895 continue; 1896 1897 matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start, 1898 TY_(tmbstrlen)(container->element)) == 0; 1899 if (matches) 1900 nested++; 1901 1902 state = CDATA_INTERMEDIATE; 1903 } 1904 /* '<' + '/' + Letter found */ 1905 else if (state == CDATA_ENDTAG) 1906 { 1907 if (TY_(IsLetter)(c)) 1908 continue; 1909 1910 matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start, 1911 TY_(tmbstrlen)(container->element)) == 0; 1912 1913 if (isEmpty && !matches) 1914 { 1915 /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */ 1916 1917 for (i = lexer->lexsize - 1; i >= start; --i) 1918 TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn); 1919 TY_(UngetChar)('/', doc->docIn); 1920 TY_(UngetChar)('<', doc->docIn); 1921 break; 1922 } 1923 1924 if (matches && nested-- <= 0) 1925 { 1926 for (i = lexer->lexsize - 1; i >= start; --i) 1927 TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn); 1928 TY_(UngetChar)('/', doc->docIn); 1929 TY_(UngetChar)('<', doc->docIn); 1930 lexer->lexsize -= (lexer->lexsize - start) + 2; 1931 break; 1932 } 1933 else if (lexer->lexbuf[start - 2] != '\\') 1934 { 1935 /* if the end tag is not already escaped using backslash */ 1936 SetLexerLocus( doc, lexer ); 1937 lexer->columns -= 3; 1938 TY_(ReportError)(doc, NULL, NULL, BAD_CDATA_CONTENT); 1939 1940 /* if javascript insert backslash before / */ 1941 if (TY_(IsJavaScript)(container)) 1942 { 1943 for (i = lexer->lexsize; i > start-1; --i) 1944 lexer->lexbuf[i] = lexer->lexbuf[i-1]; 1945 1946 
lexer->lexbuf[start-1] = '\\'; 1947 lexer->lexsize++; 1948 } 1949 } 1950 state = CDATA_INTERMEDIATE; 1951 } 1952 } 1953 if (isEmpty) 1954 lexer->lexsize = lexer->txtstart = lexer->txtend; 1955 else 1956 lexer->txtend = lexer->lexsize; 1957 1958 if (c == EndOfStream) 1959 TY_(ReportError)(doc, container, NULL, MISSING_ENDTAG_FOR ); 1960 1961/* this was disabled for some reason... */ 1962#if 0 1963 if (lexer->txtend > lexer->txtstart) 1964 return TextToken(lexer); 1965 else 1966 return NULL; 1967#else 1968 return TY_(TextToken)(lexer); 1969#endif 1970} 1971 1972void TY_(UngetToken)( TidyDocImpl* doc ) 1973{ 1974 doc->lexer->pushed = yes; 1975} 1976 1977#ifdef TIDY_STORE_ORIGINAL_TEXT 1978#define CondReturnTextNode(doc, skip) \ 1979 if (lexer->txtend > lexer->txtstart) \ 1980 { \ 1981 lexer->token = TY_(TextToken)(lexer); \ 1982 StoreOriginalTextInToken(doc, lexer->token, skip); \ 1983 return lexer->token; \ 1984 } 1985#else 1986#define CondReturnTextNode(doc, skip) \ 1987 if (lexer->txtend > lexer->txtstart) \ 1988 { \ 1989 lexer->token = TY_(TextToken)(lexer); \ 1990 return lexer->token; \ 1991 } 1992#endif 1993 1994/* 1995 modes for GetToken() 1996 1997 MixedContent -- for elements which don't accept PCDATA 1998 Preformatted -- white space preserved as is 1999 IgnoreMarkup -- for CDATA elements such as script, style 2000*/ 2001static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode ); 2002 2003Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode ) 2004{ 2005 Lexer* lexer = doc->lexer; 2006 2007 if (lexer->pushed || lexer->itoken) 2008 { 2009 /* Deal with previously returned duplicate inline token */ 2010 if (lexer->itoken) 2011 { 2012 /* itoken rejected */ 2013 if (lexer->pushed) 2014 { 2015 lexer->pushed = no; 2016 return lexer->itoken; 2017 } 2018 /* itoken has been accepted */ 2019 lexer->itoken = NULL; 2020 } 2021 2022 /* duplicate inlines in preference to pushed text nodes when appropriate */ 2023 lexer->pushed = no; 2024 if 
(lexer->token->type != TextNode 2025 || !(lexer->insert || lexer->inode)) 2026 return lexer->token; 2027 return lexer->itoken = TY_(InsertedToken)( doc ); 2028 } 2029 2030 assert( !(lexer->pushed || lexer->itoken) ); 2031 2032 /* at start of block elements, unclosed inline 2033 elements are inserted into the token stream */ 2034 if (lexer->insert || lexer->inode) 2035 return lexer->token = TY_(InsertedToken)( doc ); 2036 2037 if (mode == CdataContent) 2038 { 2039 assert( lexer->parent != NULL ); 2040 return GetCDATA(doc, lexer->parent); 2041 } 2042 2043 return GetTokenFromStream( doc, mode ); 2044} 2045 2046static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode ) 2047{ 2048 Lexer* lexer = doc->lexer; 2049 uint c, badcomment = 0; 2050 Bool isempty = no; 2051 AttVal *attributes = NULL; 2052 2053 /* Lexer->token must be set on return. Nullify it for safety. */ 2054 lexer->token = NULL; 2055 2056 SetLexerLocus( doc, lexer ); 2057 lexer->waswhite = no; 2058 2059 lexer->txtstart = lexer->txtend = lexer->lexsize; 2060 2061 while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream) 2062 { 2063 if (lexer->insertspace) 2064 { 2065 TY_(AddCharToLexer)(lexer, ' '); 2066 lexer->waswhite = yes; 2067 lexer->insertspace = no; 2068 } 2069 2070 if (c == 160 && (mode == Preformatted)) 2071 c = ' '; 2072 2073 TY_(AddCharToLexer)(lexer, c); 2074 2075 switch (lexer->state) 2076 { 2077 case LEX_CONTENT: /* element content */ 2078 2079 /* 2080 Discard white space if appropriate. Its cheaper 2081 to do this here rather than in parser methods 2082 for elements that don't have mixed content. 2083 */ 2084 if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace) 2085 && lexer->lexsize == lexer->txtstart + 1) 2086 { 2087 --(lexer->lexsize); 2088 lexer->waswhite = no; 2089 SetLexerLocus( doc, lexer ); 2090 continue; 2091 } 2092 2093 if (c == '<') 2094 { 2095 lexer->state = LEX_GT; 2096 continue; 2097 } 2098 2099 if (TY_(IsWhite)(c)) 2100 { 2101 /* was previous character white? 
*/ 2102 if (lexer->waswhite) 2103 { 2104 if (mode != Preformatted && mode != IgnoreMarkup) 2105 { 2106 --(lexer->lexsize); 2107 SetLexerLocus( doc, lexer ); 2108 } 2109 } 2110 else /* prev character wasn't white */ 2111 { 2112 lexer->waswhite = yes; 2113 2114 if (mode != Preformatted && mode != IgnoreMarkup && c != ' ') 2115 ChangeChar(lexer, ' '); 2116 } 2117 2118 continue; 2119 } 2120 else if (c == '&' && mode != IgnoreMarkup) 2121 ParseEntity( doc, mode ); 2122 2123 /* this is needed to avoid trimming trailing whitespace */ 2124 if (mode == IgnoreWhitespace) 2125 mode = MixedContent; 2126 2127 lexer->waswhite = no; 2128 continue; 2129 2130 case LEX_GT: /* < */ 2131 2132 /* check for endtag */ 2133 if (c == '/') 2134 { 2135 if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream) 2136 { 2137 TY_(UngetChar)(c, doc->docIn); 2138 continue; 2139 } 2140 2141 TY_(AddCharToLexer)(lexer, c); 2142 2143 if (TY_(IsLetter)(c)) 2144 { 2145 lexer->lexsize -= 3; 2146 lexer->txtend = lexer->lexsize; 2147 TY_(UngetChar)(c, doc->docIn); 2148 lexer->state = LEX_ENDTAG; 2149 lexer->lexbuf[lexer->lexsize] = '\0'; /* debug */ 2150 doc->docIn->curcol -= 2; 2151 2152 /* if some text before the </ return it now */ 2153 if (lexer->txtend > lexer->txtstart) 2154 { 2155 /* trim space character before end tag */ 2156 if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ') 2157 { 2158 lexer->lexsize -= 1; 2159 lexer->txtend = lexer->lexsize; 2160 } 2161 lexer->token = TY_(TextToken)(lexer); 2162#ifdef TIDY_STORE_ORIGINAL_TEXT 2163 StoreOriginalTextInToken(doc, lexer->token, 3); 2164#endif 2165 return lexer->token; 2166 } 2167 2168 continue; /* no text so keep going */ 2169 } 2170 2171 /* otherwise treat as CDATA */ 2172 lexer->waswhite = no; 2173 lexer->state = LEX_CONTENT; 2174 continue; 2175 } 2176 2177 if (mode == IgnoreMarkup) 2178 { 2179 /* otherwise treat as CDATA */ 2180 lexer->waswhite = no; 2181 lexer->state = LEX_CONTENT; 2182 continue; 2183 } 2184 2185 /* 2186 look out 
for comments, doctype or marked sections 2187 this isn't quite right, but its getting there ... 2188 */ 2189 if (c == '!') 2190 { 2191 c = TY_(ReadChar)(doc->docIn); 2192 2193 if (c == '-') 2194 { 2195 c = TY_(ReadChar)(doc->docIn); 2196 2197 if (c == '-') 2198 { 2199 lexer->state = LEX_COMMENT; /* comment */ 2200 lexer->lexsize -= 2; 2201 lexer->txtend = lexer->lexsize; 2202 2203 CondReturnTextNode(doc, 4) 2204 2205 lexer->txtstart = lexer->lexsize; 2206 continue; 2207 } 2208 2209 TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT ); 2210 } 2211 else if (c == 'd' || c == 'D') 2212 { 2213 /* todo: check for complete "<!DOCTYPE" not just <!D */ 2214 2215 uint skip = 0; 2216 2217 lexer->state = LEX_DOCTYPE; /* doctype */ 2218 lexer->lexsize -= 2; 2219 lexer->txtend = lexer->lexsize; 2220 mode = IgnoreWhitespace; 2221 2222 /* skip until white space or '>' */ 2223 2224 for (;;) 2225 { 2226 c = TY_(ReadChar)(doc->docIn); 2227 ++skip; 2228 2229 if (c == EndOfStream || c == '>') 2230 { 2231 TY_(UngetChar)(c, doc->docIn); 2232 break; 2233 } 2234 2235 2236 if (!TY_(IsWhite)(c)) 2237 continue; 2238 2239 /* and skip to end of whitespace */ 2240 2241 for (;;) 2242 { 2243 c = TY_(ReadChar)(doc->docIn); 2244 ++skip; 2245 2246 if (c == EndOfStream || c == '>') 2247 { 2248 TY_(UngetChar)(c, doc->docIn); 2249 break; 2250 } 2251 2252 2253 if (TY_(IsWhite)(c)) 2254 continue; 2255 2256 TY_(UngetChar)(c, doc->docIn); 2257 break; 2258 } 2259 2260 break; 2261 } 2262 2263 CondReturnTextNode(doc, (skip + 3)) 2264 2265 lexer->txtstart = lexer->lexsize; 2266 continue; 2267 } 2268 else if (c == '[') 2269 { 2270 /* Word 2000 embeds <![if ...]> ... 
<![endif]> sequences */ 2271 lexer->lexsize -= 2; 2272 lexer->state = LEX_SECTION; 2273 lexer->txtend = lexer->lexsize; 2274 2275 CondReturnTextNode(doc, 2) 2276 2277 lexer->txtstart = lexer->lexsize; 2278 continue; 2279 } 2280 2281 2282 2283 /* else swallow characters up to and including next '>' */ 2284 while ((c = TY_(ReadChar)(doc->docIn)) != '>') 2285 { 2286 if (c == EndOfStream) 2287 { 2288 TY_(UngetChar)(c, doc->docIn); 2289 break; 2290 } 2291 } 2292 2293 lexer->lexsize -= 2; 2294 lexer->lexbuf[lexer->lexsize] = '\0'; 2295 lexer->state = LEX_CONTENT; 2296 continue; 2297 } 2298 2299 /* 2300 processing instructions 2301 */ 2302 2303 if (c == '?') 2304 { 2305 lexer->lexsize -= 2; 2306 lexer->state = LEX_PROCINSTR; 2307 lexer->txtend = lexer->lexsize; 2308 2309 CondReturnTextNode(doc, 2) 2310 2311 lexer->txtstart = lexer->lexsize; 2312 continue; 2313 } 2314 2315 /* Microsoft ASP's e.g. <% ... server-code ... %> */ 2316 if (c == '%') 2317 { 2318 lexer->lexsize -= 2; 2319 lexer->state = LEX_ASP; 2320 lexer->txtend = lexer->lexsize; 2321 2322 CondReturnTextNode(doc, 2) 2323 2324 lexer->txtstart = lexer->lexsize; 2325 continue; 2326 } 2327 2328 /* Netscapes JSTE e.g. <# ... server-code ... #> */ 2329 if (c == '#') 2330 { 2331 lexer->lexsize -= 2; 2332 lexer->state = LEX_JSTE; 2333 lexer->txtend = lexer->lexsize; 2334 2335 CondReturnTextNode(doc, 2) 2336 2337 lexer->txtstart = lexer->lexsize; 2338 continue; 2339 } 2340 2341 /* check for start tag */ 2342 if (TY_(IsLetter)(c)) 2343 { 2344 TY_(UngetChar)(c, doc->docIn); /* push back letter */ 2345 TY_(UngetChar)('<', doc->docIn); 2346 --(doc->docIn->curcol); 2347 lexer->lexsize -= 2; /* discard "<" + letter */ 2348 lexer->txtend = lexer->lexsize; 2349 lexer->state = LEX_STARTTAG; /* ready to read tag name */ 2350 2351 CondReturnTextNode(doc, 2) 2352 2353 /* lexer->txtstart = lexer->lexsize; missing here? 
*/ 2354 continue; /* no text so keep going */ 2355 } 2356 2357 /* fix for bug 762102 */ 2358 if (c == '&') 2359 { 2360 TY_(UngetChar)(c, doc->docIn); 2361 --(lexer->lexsize); 2362 } 2363 2364 /* otherwise treat as CDATA */ 2365 lexer->state = LEX_CONTENT; 2366 lexer->waswhite = no; 2367 continue; 2368 2369 case LEX_ENDTAG: /* </letter */ 2370 lexer->txtstart = lexer->lexsize - 1; 2371 doc->docIn->curcol += 2; 2372 c = ParseTagName( doc ); 2373 lexer->token = TagToken( doc, EndTag ); /* create endtag token */ 2374 lexer->lexsize = lexer->txtend = lexer->txtstart; 2375 2376 /* skip to '>' */ 2377 while ( c != '>' && c != EndOfStream ) 2378 { 2379 c = TY_(ReadChar)(doc->docIn); 2380 } 2381 2382 if (c == EndOfStream) 2383 { 2384 TY_(FreeNode)( doc, lexer->token ); 2385 continue; 2386 } 2387 2388 lexer->state = LEX_CONTENT; 2389 lexer->waswhite = no; 2390#ifdef TIDY_STORE_ORIGINAL_TEXT 2391 StoreOriginalTextInToken(doc, lexer->token, 0); /* hmm... */ 2392#endif 2393 return lexer->token; /* the endtag token */ 2394 2395 case LEX_STARTTAG: /* first letter of tagname */ 2396 c = TY_(ReadChar)(doc->docIn); 2397 ChangeChar(lexer, (tmbchar)c); 2398 lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */ 2399 c = ParseTagName( doc ); 2400 isempty = no; 2401 attributes = NULL; 2402 lexer->token = TagToken( doc, (isempty ? 
StartEndTag : StartTag) ); 2403 2404 /* parse attributes, consuming closing ">" */ 2405 if (c != '>') 2406 { 2407 if (c == '/') 2408 TY_(UngetChar)(c, doc->docIn); 2409 2410 attributes = ParseAttrs( doc, &isempty ); 2411 } 2412 2413 if (isempty) 2414 lexer->token->type = StartEndTag; 2415 2416 lexer->token->attributes = attributes; 2417 lexer->lexsize = lexer->txtend = lexer->txtstart; 2418 2419 /* swallow newline following start tag */ 2420 /* special check needed for CRLF sequence */ 2421 /* this doesn't apply to empty elements */ 2422 /* nor to preformatted content that needs escaping */ 2423 2424 if ((mode != Preformatted && ExpectsContent(lexer->token)) 2425 || nodeIsBR(lexer->token) || nodeIsHR(lexer->token)) 2426 { 2427 c = TY_(ReadChar)(doc->docIn); 2428 2429 if (c != '\n' && c != '\f') 2430 TY_(UngetChar)(c, doc->docIn); 2431 2432 lexer->waswhite = yes; /* to swallow leading whitespace */ 2433 } 2434 else 2435 lexer->waswhite = no; 2436 2437 lexer->state = LEX_CONTENT; 2438 if (lexer->token->tag == NULL) 2439 TY_(ReportFatal)( doc, NULL, lexer->token, UNKNOWN_ELEMENT ); 2440 else if ( !cfgBool(doc, TidyXmlTags) ) 2441 { 2442 Node* curr = lexer->token; 2443 TY_(ConstrainVersion)( doc, curr->tag->versions ); 2444 2445 if ( curr->tag->versions & VERS_PROPRIETARY ) 2446 { 2447 if ( !cfgBool(doc, TidyMakeClean) || 2448 ( !nodeIsNOBR(curr) && !nodeIsWBR(curr) ) ) 2449 { 2450 TY_(ReportError)(doc, NULL, curr, PROPRIETARY_ELEMENT ); 2451 2452 if ( nodeIsLAYER(curr) ) 2453 doc->badLayout |= USING_LAYER; 2454 else if ( nodeIsSPACER(curr) ) 2455 doc->badLayout |= USING_SPACER; 2456 else if ( nodeIsNOBR(curr) ) 2457 doc->badLayout |= USING_NOBR; 2458 } 2459 } 2460 2461 TY_(RepairDuplicateAttributes)( doc, curr ); 2462 } 2463#ifdef TIDY_STORE_ORIGINAL_TEXT 2464 StoreOriginalTextInToken(doc, lexer->token, 0); 2465#endif 2466 return lexer->token; /* return start tag */ 2467 2468 case LEX_COMMENT: /* seen <!-- so look for --> */ 2469 2470 if (c != '-') 2471 continue; 2472 
2473 c = TY_(ReadChar)(doc->docIn); 2474 TY_(AddCharToLexer)(lexer, c); 2475 2476 if (c != '-') 2477 continue; 2478 2479 end_comment: 2480 c = TY_(ReadChar)(doc->docIn); 2481 2482 if (c == '>') 2483 { 2484 if (badcomment) 2485 TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT ); 2486 2487 /* do not store closing -- in lexbuf */ 2488 lexer->lexsize -= 2; 2489 lexer->txtend = lexer->lexsize; 2490 lexer->lexbuf[lexer->lexsize] = '\0'; 2491 lexer->state = LEX_CONTENT; 2492 lexer->waswhite = no; 2493 lexer->token = CommentToken(doc); 2494 2495 /* now look for a line break */ 2496 2497 c = TY_(ReadChar)(doc->docIn); 2498 2499 if (c == '\n') 2500 lexer->token->linebreak = yes; 2501 else 2502 TY_(UngetChar)(c, doc->docIn); 2503 2504 return lexer->token; 2505 } 2506 2507 /* note position of first such error in the comment */ 2508 if (!badcomment) 2509 { 2510 SetLexerLocus( doc, lexer ); 2511 lexer->columns -= 3; 2512 } 2513 2514 badcomment++; 2515 2516 if ( cfgBool(doc, TidyFixComments) ) 2517 lexer->lexbuf[lexer->lexsize - 2] = '='; 2518 2519 /* if '-' then look for '>' to end the comment */ 2520 if (c == '-') 2521 { 2522 TY_(AddCharToLexer)(lexer, c); 2523 goto end_comment; 2524 } 2525 2526 /* otherwise continue to look for --> */ 2527 lexer->lexbuf[lexer->lexsize - 1] = '='; 2528 2529 /* http://tidy.sf.net/bug/1266647 */ 2530 TY_(AddCharToLexer)(lexer, c); 2531 2532 continue; 2533 2534 case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */ 2535 2536 /* use ParseDocTypeDecl() to tokenize doctype declaration */ 2537 TY_(UngetChar)(c, doc->docIn); 2538 lexer->lexsize -= 1; 2539 lexer->token = ParseDocTypeDecl(doc); 2540 2541 lexer->txtend = lexer->lexsize; 2542 lexer->lexbuf[lexer->lexsize] = '\0'; 2543 lexer->state = LEX_CONTENT; 2544 lexer->waswhite = no; 2545 2546 /* make a note of the version named by the 1st doctype */ 2547 if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags)) 2548 lexer->doctype = FindGivenVersion(doc, 
lexer->token); 2549 return lexer->token; 2550 2551 case LEX_PROCINSTR: /* seen <? so look for '>' */ 2552 /* check for PHP preprocessor instructions <?php ... ?> */ 2553 2554 if (lexer->lexsize - lexer->txtstart == 3) 2555 { 2556 if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "php", 3) == 0) 2557 { 2558 lexer->state = LEX_PHP; 2559 continue; 2560 } 2561 } 2562 2563 if (lexer->lexsize - lexer->txtstart == 4) 2564 { 2565 if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 && 2566 TY_(IsWhite)(lexer->lexbuf[lexer->txtstart + 3])) 2567 { 2568 lexer->state = LEX_XMLDECL; 2569 attributes = NULL; 2570 continue; 2571 } 2572 } 2573 2574 if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */ 2575 { 2576 if (c != '?') 2577 continue; 2578 2579 /* now look for '>' */ 2580 c = TY_(ReadChar)(doc->docIn); 2581 2582 if (c == EndOfStream) 2583 { 2584 TY_(ReportError)(doc, NULL, NULL, UNEXPECTED_END_OF_FILE ); 2585 TY_(UngetChar)(c, doc->docIn); 2586 continue; 2587 } 2588 2589 TY_(AddCharToLexer)(lexer, c); 2590 } 2591 2592 2593 if (c != '>') 2594 continue; 2595 2596 lexer->lexsize -= 1; 2597 2598 if (lexer->lexsize) 2599 { 2600 uint i; 2601 Bool closed; 2602 2603 for (i = 0; i < lexer->lexsize - lexer->txtstart && 2604 !TY_(IsWhite)(lexer->lexbuf[i + lexer->txtstart]); ++i) 2605 /**/; 2606 2607 closed = lexer->lexbuf[lexer->lexsize - 1] == '?'; 2608 2609 if (closed) 2610 lexer->lexsize -= 1; 2611 2612 lexer->txtstart += i; 2613 lexer->txtend = lexer->lexsize; 2614 lexer->lexbuf[lexer->lexsize] = '\0'; 2615 2616 lexer->token = PIToken(doc); 2617 lexer->token->closed = closed; 2618 lexer->token->element = TY_(tmbstrndup)(lexer->lexbuf + 2619 lexer->txtstart - i, i); 2620 } 2621 else 2622 { 2623 lexer->txtend = lexer->lexsize; 2624 lexer->lexbuf[lexer->lexsize] = '\0'; 2625 lexer->token = PIToken(doc); 2626 } 2627 2628 lexer->state = LEX_CONTENT; 2629 lexer->waswhite = no; 2630 return lexer->token; 2631 2632 case LEX_ASP: /* seen <% so 
look for "%>" */ 2633 if (c != '%') 2634 continue; 2635 2636 /* now look for '>' */ 2637 c = TY_(ReadChar)(doc->docIn); 2638 2639 2640 if (c != '>') 2641 { 2642 TY_(UngetChar)(c, doc->docIn); 2643 continue; 2644 } 2645 2646 lexer->lexsize -= 1; 2647 lexer->txtend = lexer->lexsize; 2648 lexer->lexbuf[lexer->lexsize] = '\0'; 2649 lexer->state = LEX_CONTENT; 2650 lexer->waswhite = no; 2651 return lexer->token = AspToken(doc); 2652 2653 case LEX_JSTE: /* seen <# so look for "#>" */ 2654 if (c != '#') 2655 continue; 2656 2657 /* now look for '>' */ 2658 c = TY_(ReadChar)(doc->docIn); 2659 2660 2661 if (c != '>') 2662 { 2663 TY_(UngetChar)(c, doc->docIn); 2664 continue; 2665 } 2666 2667 lexer->lexsize -= 1; 2668 lexer->txtend = lexer->lexsize; 2669 lexer->lexbuf[lexer->lexsize] = '\0'; 2670 lexer->state = LEX_CONTENT; 2671 lexer->waswhite = no; 2672 return lexer->token = JsteToken(doc); 2673 2674 case LEX_PHP: /* seen "<?php" so look for "?>" */ 2675 if (c != '?') 2676 continue; 2677 2678 /* now look for '>' */ 2679 c = TY_(ReadChar)(doc->docIn); 2680 2681 if (c != '>') 2682 { 2683 TY_(UngetChar)(c, doc->docIn); 2684 continue; 2685 } 2686 2687 lexer->lexsize -= 1; 2688 lexer->txtend = lexer->lexsize; 2689 lexer->lexbuf[lexer->lexsize] = '\0'; 2690 lexer->state = LEX_CONTENT; 2691 lexer->waswhite = no; 2692 return lexer->token = PhpToken(doc); 2693 2694 case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */ 2695 2696 if (TY_(IsWhite)(c) && c != '?') 2697 continue; 2698 2699 /* get pseudo-attribute */ 2700 if (c != '?') 2701 { 2702 tmbstr name; 2703 Node *asp, *php; 2704 AttVal *av = NULL; 2705 int pdelim = 0; 2706 isempty = no; 2707 2708 TY_(UngetChar)(c, doc->docIn); 2709 2710 name = ParseAttribute( doc, &isempty, &asp, &php ); 2711 2712 if (!name) 2713 { 2714 /* fix for http://tidy.sf.net/bug/788031 */ 2715 lexer->lexsize -= 1; 2716 lexer->txtend = lexer->txtstart; 2717 lexer->lexbuf[lexer->txtend] = '\0'; 2718 lexer->state = LEX_CONTENT; 2719 lexer->waswhite = no; 2720 
lexer->token = XmlDeclToken(doc); 2721 lexer->token->attributes = attributes; 2722 return lexer->token; 2723 } 2724 2725 av = TY_(NewAttribute)(); 2726 av->attribute = name; 2727 av->value = ParseValue( doc, name, yes, &isempty, &pdelim ); 2728 av->delim = pdelim; 2729 av->dict = TY_(FindAttribute)( doc, av ); 2730 2731 AddAttrToList( &attributes, av ); 2732 /* continue; */ 2733 } 2734 2735 /* now look for '>' */ 2736 c = TY_(ReadChar)(doc->docIn); 2737 2738 if (c != '>') 2739 { 2740 TY_(UngetChar)(c, doc->docIn); 2741 continue; 2742 } 2743 lexer->lexsize -= 1; 2744 lexer->txtend = lexer->txtstart; 2745 lexer->lexbuf[lexer->txtend] = '\0'; 2746 lexer->state = LEX_CONTENT; 2747 lexer->waswhite = no; 2748 lexer->token = XmlDeclToken(doc); 2749 lexer->token->attributes = attributes; 2750 return lexer->token; 2751 2752 case LEX_SECTION: /* seen "<![" so look for "]>" */ 2753 if (c == '[') 2754 { 2755 if (lexer->lexsize == (lexer->txtstart + 6) && 2756 TY_(tmbstrncmp)(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0) 2757 { 2758 lexer->state = LEX_CDATA; 2759 lexer->lexsize -= 6; 2760 continue; 2761 } 2762 } 2763 2764 if (c != ']') 2765 continue; 2766 2767 /* now look for '>' */ 2768 c = TY_(ReadChar)(doc->docIn); 2769 2770 if (c != '>') 2771 { 2772 TY_(UngetChar)(c, doc->docIn); 2773 continue; 2774 } 2775 2776 lexer->lexsize -= 1; 2777 lexer->txtend = lexer->lexsize; 2778 lexer->lexbuf[lexer->lexsize] = '\0'; 2779 lexer->state = LEX_CONTENT; 2780 lexer->waswhite = no; 2781 return lexer->token = SectionToken(doc); 2782 2783 case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */ 2784/* Apple Changes: 2785 2007-08-08 iccir [5393761] The previous fix for 4642206 resulted in CDATA & not escaping properly when 2786 escape-cdata=yes and char-encoding=raw. Similar to our fix in ParseEntity(), append 2787 an additional "amp;" under these conditions. 
2788*/ 2789#ifdef TIDY_APPLE_CHANGES 2790 if (c == '&' && cfg(doc, TidyOutCharEncoding) == RAW && cfgBool(doc, TidyEscapeCdata)) 2791 { 2792 TY_(AddStringToLexer)(lexer, "amp;"); 2793 } 2794#endif 2795 if (c != ']') 2796 continue; 2797 2798 /* now look for ']' */ 2799 c = TY_(ReadChar)(doc->docIn); 2800 2801 if (c != ']') 2802 { 2803 TY_(UngetChar)(c, doc->docIn); 2804 continue; 2805 } 2806 2807 /* now look for '>' */ 2808 c = TY_(ReadChar)(doc->docIn); 2809 2810 if (c != '>') 2811 { 2812 TY_(UngetChar)(c, doc->docIn); 2813 TY_(UngetChar)(']', doc->docIn); 2814 continue; 2815 } 2816 2817 lexer->lexsize -= 1; 2818 lexer->txtend = lexer->lexsize; 2819 lexer->lexbuf[lexer->lexsize] = '\0'; 2820 lexer->state = LEX_CONTENT; 2821 lexer->waswhite = no; 2822 return lexer->token = CDATAToken(doc); 2823 } 2824 } 2825 2826 if (lexer->state == LEX_CONTENT) /* text string */ 2827 { 2828 lexer->txtend = lexer->lexsize; 2829 2830 if (lexer->txtend > lexer->txtstart) 2831 { 2832 TY_(UngetChar)(c, doc->docIn); 2833 2834 if (lexer->lexbuf[lexer->lexsize - 1] == ' ') 2835 { 2836 lexer->lexsize -= 1; 2837 lexer->txtend = lexer->lexsize; 2838 } 2839 lexer->token = TY_(TextToken)(lexer); 2840#ifdef TIDY_STORE_ORIGINAL_TEXT 2841 StoreOriginalTextInToken(doc, lexer->token, 0); /* ? 
*/ 2842#endif 2843 return lexer->token; 2844 } 2845 } 2846 else if (lexer->state == LEX_COMMENT) /* comment */ 2847 { 2848 if (c == EndOfStream) 2849 TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT ); 2850 2851 lexer->txtend = lexer->lexsize; 2852 lexer->lexbuf[lexer->lexsize] = '\0'; 2853 lexer->state = LEX_CONTENT; 2854 lexer->waswhite = no; 2855 return lexer->token = CommentToken(doc); 2856 } 2857 2858 return NULL; 2859} 2860 2861static void MapStr( ctmbstr str, uint code ) 2862{ 2863 while ( *str ) 2864 { 2865 uint i = (byte) *str++; 2866 lexmap[i] |= code; 2867 } 2868} 2869 2870void TY_(InitMap)(void) 2871{ 2872 MapStr("\r\n\f", newline|white); 2873 MapStr(" \t", white); 2874 MapStr("-.:_", namechar); 2875 MapStr("0123456789", digit|namechar); 2876 MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar); 2877 MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar); 2878} 2879 2880/* 2881 parser for ASP within start tags 2882 2883 Some people use ASP for to customize attributes 2884 Tidy isn't really well suited to dealing with ASP 2885 This is a workaround for attributes, but won't 2886 deal with the case where the ASP is used to tailor 2887 the attribute value. Here is an example of a work 2888 around for using ASP in attribute values: 2889 2890 href='<%=rsSchool.Fields("ID").Value%>' 2891 2892 where the ASP that generates the attribute value 2893 is masked from Tidy by the quotemarks. 
2894 2895*/ 2896 2897static Node *ParseAsp( TidyDocImpl* doc ) 2898{ 2899 Lexer* lexer = doc->lexer; 2900 uint c; 2901 Node *asp = NULL; 2902 2903 lexer->txtstart = lexer->lexsize; 2904 2905 for (;;) 2906 { 2907 if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream) 2908 break; 2909 2910 TY_(AddCharToLexer)(lexer, c); 2911 2912 2913 if (c != '%') 2914 continue; 2915 2916 if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream) 2917 break; 2918 2919 TY_(AddCharToLexer)(lexer, c); 2920 2921 if (c == '>') 2922 { 2923 lexer->lexsize -= 2; 2924 break; 2925 } 2926 } 2927 2928 lexer->txtend = lexer->lexsize; 2929 if (lexer->txtend > lexer->txtstart) 2930 asp = AspToken(doc); 2931 2932 lexer->txtstart = lexer->txtend; 2933 return asp; 2934} 2935 2936 2937/* 2938 PHP is like ASP but is based upon XML 2939 processing instructions, e.g. <?php ... ?> 2940*/ 2941static Node *ParsePhp( TidyDocImpl* doc ) 2942{ 2943 Lexer* lexer = doc->lexer; 2944 uint c; 2945 Node *php = NULL; 2946 2947 lexer->txtstart = lexer->lexsize; 2948 2949 for (;;) 2950 { 2951 if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream) 2952 break; 2953 2954 TY_(AddCharToLexer)(lexer, c); 2955 2956 2957 if (c != '?') 2958 continue; 2959 2960 if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream) 2961 break; 2962 2963 TY_(AddCharToLexer)(lexer, c); 2964 2965 if (c == '>') 2966 { 2967 lexer->lexsize -= 2; 2968 break; 2969 } 2970 } 2971 2972 lexer->txtend = lexer->lexsize; 2973 if (lexer->txtend > lexer->txtstart) 2974 php = PhpToken(doc); 2975 2976 lexer->txtstart = lexer->txtend; 2977 return php; 2978} 2979 2980/* consumes the '>' terminating start tags */ 2981static tmbstr ParseAttribute( TidyDocImpl* doc, Bool *isempty, 2982 Node **asp, Node **php) 2983{ 2984 Lexer* lexer = doc->lexer; 2985 int start, len = 0; 2986 tmbstr attr = NULL; 2987 uint c, lastc; 2988 2989 *asp = NULL; /* clear asp pointer */ 2990 *php = NULL; /* clear php pointer */ 2991 2992 /* skip white space before the attribute */ 2993 2994 for (;;) 2995 { 
2996 c = TY_(ReadChar)( doc->docIn ); 2997 2998 2999 if (c == '/') 3000 { 3001 c = TY_(ReadChar)( doc->docIn ); 3002 3003 if (c == '>') 3004 { 3005 *isempty = yes; 3006 return NULL; 3007 } 3008 3009 TY_(UngetChar)(c, doc->docIn); 3010 c = '/'; 3011 break; 3012 } 3013 3014 if (c == '>') 3015 return NULL; 3016 3017 if (c =='<') 3018 { 3019 c = TY_(ReadChar)(doc->docIn); 3020 3021 if (c == '%') 3022 { 3023 *asp = ParseAsp( doc ); 3024 return NULL; 3025 } 3026 else if (c == '?') 3027 { 3028 *php = ParsePhp( doc ); 3029 return NULL; 3030 } 3031 3032 TY_(UngetChar)(c, doc->docIn); 3033 TY_(UngetChar)('<', doc->docIn); 3034 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT ); 3035 return NULL; 3036 } 3037 3038 if (c == '=') 3039 { 3040 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN ); 3041 continue; 3042 } 3043 3044 if (c == '"' || c == '\'') 3045 { 3046 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK ); 3047 continue; 3048 } 3049 3050 if (c == EndOfStream) 3051 { 3052 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR ); 3053 TY_(UngetChar)(c, doc->docIn); 3054 return NULL; 3055 } 3056 3057 3058 if (!TY_(IsWhite)(c)) 3059 break; 3060 } 3061 3062 start = lexer->lexsize; 3063 lastc = c; 3064 3065 for (;;) 3066 { 3067 /* but push back '=' for parseValue() */ 3068 if (c == '=' || c == '>') 3069 { 3070 TY_(UngetChar)(c, doc->docIn); 3071 break; 3072 } 3073 3074 if (c == '<' || c == EndOfStream) 3075 { 3076 TY_(UngetChar)(c, doc->docIn); 3077 break; 3078 } 3079 3080 if (lastc == '-' && (c == '"' || c == '\'')) 3081 { 3082 lexer->lexsize--; 3083 --len; 3084 TY_(UngetChar)(c, doc->docIn); 3085 break; 3086 } 3087 3088 if (TY_(IsWhite)(c)) 3089 break; 3090 3091 /* what should be done about non-namechar characters? 
*/ 3092 /* currently these are incorporated into the attr name */ 3093 3094 if ( !cfgBool(doc, TidyXmlTags) && TY_(IsUpper)(c) ) 3095 c = TY_(ToLower)(c); 3096 3097 TY_(AddCharToLexer)( lexer, c ); 3098 lastc = c; 3099 c = TY_(ReadChar)(doc->docIn); 3100 } 3101 3102 /* handle attribute names with multibyte chars */ 3103 len = lexer->lexsize - start; 3104 attr = (len > 0 ? TY_(tmbstrndup)(lexer->lexbuf+start, len) : NULL); 3105 lexer->lexsize = start; 3106 return attr; 3107} 3108 3109/* 3110 invoked when < is seen in place of attribute value 3111 but terminates on whitespace if not ASP, PHP or Tango 3112 this routine recognizes ' and " quoted strings 3113*/ 3114static int ParseServerInstruction( TidyDocImpl* doc ) 3115{ 3116 Lexer* lexer = doc->lexer; 3117 uint c; 3118 int delim = '"'; 3119 Bool isrule = no; 3120 3121 c = TY_(ReadChar)(doc->docIn); 3122 TY_(AddCharToLexer)(lexer, c); 3123 3124 /* check for ASP, PHP or Tango */ 3125 if (c == '%' || c == '?' || c == '@') 3126 isrule = yes; 3127 3128 for (;;) 3129 { 3130 c = TY_(ReadChar)(doc->docIn); 3131 3132 if (c == EndOfStream) 3133 break; 3134 3135 if (c == '>') 3136 { 3137 if (isrule) 3138 TY_(AddCharToLexer)(lexer, c); 3139 else 3140 TY_(UngetChar)(c, doc->docIn); 3141 3142 break; 3143 } 3144 3145 /* if not recognized as ASP, PHP or Tango */ 3146 /* then also finish value on whitespace */ 3147 if (!isrule) 3148 { 3149 if (TY_(IsWhite)(c)) 3150 break; 3151 } 3152 3153 TY_(AddCharToLexer)(lexer, c); 3154 3155 if (c == '"') 3156 { 3157 do 3158 { 3159 c = TY_(ReadChar)(doc->docIn); 3160 if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */ 3161 { 3162 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR ); 3163 TY_(UngetChar)(c, doc->docIn); 3164 return 0; 3165 } 3166 if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */ 3167 { 3168 TY_(UngetChar)(c, doc->docIn); 3169 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT ); 3170 return 0; 3171 } 3172 
TY_(AddCharToLexer)(lexer, c); 3173 } 3174 while (c != '"'); 3175 delim = '\''; 3176 continue; 3177 } 3178 3179 if (c == '\'') 3180 { 3181 do 3182 { 3183 c = TY_(ReadChar)(doc->docIn); 3184 if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */ 3185 { 3186 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR ); 3187 TY_(UngetChar)(c, doc->docIn); 3188 return 0; 3189 } 3190 if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */ 3191 { 3192 TY_(UngetChar)(c, doc->docIn); 3193 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT ); 3194 return 0; 3195 } 3196 TY_(AddCharToLexer)(lexer, c); 3197 } 3198 while (c != '\''); 3199 } 3200 } 3201 3202 return delim; 3203} 3204 3205/* values start with "=" or " = " etc. */ 3206/* doesn't consume the ">" at end of start tag */ 3207 3208static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, 3209 Bool foldCase, Bool *isempty, int *pdelim) 3210{ 3211 Lexer* lexer = doc->lexer; 3212 int len = 0, start; 3213 Bool seen_gt = no; 3214 Bool munge = yes; 3215 uint c, lastc, delim, quotewarning; 3216 tmbstr value; 3217 3218 delim = (tmbchar) 0; 3219 *pdelim = '"'; 3220 3221 /* 3222 Henry Zrepa reports that some folk are using the 3223 embed element with script attributes where newlines 3224 are significant and must be preserved 3225 */ 3226 if ( cfgBool(doc, TidyLiteralAttribs) ) 3227 munge = no; 3228 3229 /* skip white space before the '=' */ 3230 3231 for (;;) 3232 { 3233 c = TY_(ReadChar)(doc->docIn); 3234 3235 if (c == EndOfStream) 3236 { 3237 TY_(UngetChar)(c, doc->docIn); 3238 break; 3239 } 3240 3241 if (!TY_(IsWhite)(c)) 3242 break; 3243 } 3244 3245/* 3246 c should be '=' if there is a value 3247 other legal possibilities are white 3248 space, '/' and '>' 3249*/ 3250 3251 if (c != '=' && c != '"' && c != '\'') 3252 { 3253 TY_(UngetChar)(c, doc->docIn); 3254 return NULL; 3255 } 3256 3257 /* skip white space after '=' */ 3258 3259 for (;;) 3260 { 3261 c = TY_(ReadChar)(doc->docIn); 
3262 3263 if (c == EndOfStream) 3264 { 3265 TY_(UngetChar)(c, doc->docIn); 3266 break; 3267 } 3268 3269 if (!TY_(IsWhite)(c)) 3270 break; 3271 } 3272 3273 /* check for quote marks */ 3274 3275 if (c == '"' || c == '\'') 3276 delim = c; 3277 else if (c == '<') 3278 { 3279 start = lexer->lexsize; 3280 TY_(AddCharToLexer)(lexer, c); 3281 *pdelim = ParseServerInstruction( doc ); 3282 len = lexer->lexsize - start; 3283 lexer->lexsize = start; 3284 return (len > 0 ? TY_(tmbstrndup)(lexer->lexbuf+start, len) : NULL); 3285 } 3286 else 3287 TY_(UngetChar)(c, doc->docIn); 3288 3289 /* 3290 and read the value string 3291 check for quote mark if needed 3292 */ 3293 3294 quotewarning = 0; 3295 start = lexer->lexsize; 3296 c = '\0'; 3297 3298 for (;;) 3299 { 3300 lastc = c; /* track last character */ 3301 c = TY_(ReadChar)(doc->docIn); 3302 3303 if (c == EndOfStream) 3304 { 3305 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR ); 3306 TY_(UngetChar)(c, doc->docIn); 3307 break; 3308 } 3309 3310 if (delim == (tmbchar)0) 3311 { 3312 if (c == '>') 3313 { 3314 TY_(UngetChar)(c, doc->docIn); 3315 break; 3316 } 3317 3318 if (c == '"' || c == '\'') 3319 { 3320 uint q = c; 3321 3322 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK ); 3323 3324 /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */ 3325 /* this doesn't handle <a title=foo"/> which browsers treat as */ 3326 /* 'foo"/' nor <a title=foo" /> which browser treat as 'foo"' */ 3327 3328 c = TY_(ReadChar)(doc->docIn); 3329 if (c == '>') 3330 { 3331 TY_(AddCharToLexer)(lexer, q); 3332 TY_(UngetChar)(c, doc->docIn); 3333 break; 3334 } 3335 else 3336 { 3337 TY_(UngetChar)(c, doc->docIn); 3338 c = q; 3339 } 3340 } 3341 3342 if (c == '<') 3343 { 3344 TY_(UngetChar)(c, doc->docIn); 3345 c = '>'; 3346 TY_(UngetChar)(c, doc->docIn); 3347 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT ); 3348 break; 3349 } 3350 3351 /* 3352 For cases like <br clear=all/> need to avoid 
treating /> as 3353 part of the attribute value, however care is needed to avoid 3354 so treating <a href=http://www.acme.com/> in this way, which 3355 would map the <a> tag to <a href="http://www.acme.com"/> 3356 */ 3357 if (c == '/') 3358 { 3359 /* peek ahead in case of /> */ 3360 c = TY_(ReadChar)(doc->docIn); 3361 3362 if ( c == '>' && !TY_(IsUrl)(doc, name) ) 3363 { 3364 *isempty = yes; 3365 TY_(UngetChar)(c, doc->docIn); 3366 break; 3367 } 3368 3369 /* unget peeked character */ 3370 TY_(UngetChar)(c, doc->docIn); 3371 c = '/'; 3372 } 3373 } 3374 else /* delim is '\'' or '"' */ 3375 { 3376 if (c == delim) 3377 break; 3378 3379 if (c == '\n' || c == '<' || c == '>') 3380 ++quotewarning; 3381 3382 if (c == '>') 3383 seen_gt = yes; 3384 } 3385 3386 if (c == '&') 3387 { 3388 TY_(AddCharToLexer)(lexer, c); 3389 ParseEntity( doc, IgnoreWhitespace ); 3390 if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge) 3391 ChangeChar(lexer, ' '); 3392 continue; 3393 } 3394 3395 /* 3396 kludge for JavaScript attribute values 3397 with line continuations in string literals 3398 */ 3399 if (c == '\\') 3400 { 3401 c = TY_(ReadChar)(doc->docIn); 3402 3403 if (c != '\n') 3404 { 3405 TY_(UngetChar)(c, doc->docIn); 3406 c = '\\'; 3407 } 3408 } 3409 3410 if (TY_(IsWhite)(c)) 3411 { 3412 if ( delim == 0 ) 3413 break; 3414 3415 if (munge) 3416 { 3417 /* discard line breaks in quoted URLs */ 3418 /* #438650 - fix by Randy Waki */ 3419 if ( c == '\n' && TY_(IsUrl)(doc, name) ) 3420 { 3421 /* warn that we discard this newline */ 3422 TY_(ReportAttrError)( doc, lexer->token, NULL, NEWLINE_IN_URI); 3423 continue; 3424 } 3425 3426 c = ' '; 3427 3428 if (lastc == ' ') 3429 { 3430 if (TY_(IsUrl)(doc, name) ) 3431 TY_(ReportAttrError)( doc, lexer->token, NULL, WHITE_IN_URI); 3432 continue; 3433 } 3434 } 3435 } 3436 else if (foldCase && TY_(IsUpper)(c)) 3437 c = TY_(ToLower)(c); 3438 3439 TY_(AddCharToLexer)(lexer, c); 3440 } 3441 3442 if (quotewarning > 10 && seen_gt && munge) 3443 { 3444 /* 
3445 there is almost certainly a missing trailing quote mark 3446 as we have see too many newlines, < or > characters. 3447 3448 an exception is made for Javascript attributes and the 3449 javascript URL scheme which may legitimately include < and >, 3450 and for attributes starting with "<xml " as generated by 3451 Microsoft Office. 3452 */ 3453 if ( !TY_(IsScript)(doc, name) && 3454 !(TY_(IsUrl)(doc, name) && TY_(tmbstrncmp)(lexer->lexbuf+start, "javascript:", 11) == 0) && 3455 !(TY_(tmbstrncmp)(lexer->lexbuf+start, "<xml ", 5) == 0) 3456 ) 3457 TY_(ReportFatal)( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE ); 3458 } 3459 3460 len = lexer->lexsize - start; 3461 lexer->lexsize = start; 3462 3463 3464 if (len > 0 || delim) 3465 { 3466 /* ignore leading and trailing white space for all but title, alt, value */ 3467 /* and prompts attributes unless --literal-attributes is set to yes */ 3468 /* #994841 - Whitespace is removed from value attributes */ 3469 3470 if (munge && 3471 TY_(tmbstrcasecmp)(name, "alt") && 3472 TY_(tmbstrcasecmp)(name, "title") && 3473 TY_(tmbstrcasecmp)(name, "value") && 3474 TY_(tmbstrcasecmp)(name, "prompt")) 3475 { 3476 while (TY_(IsWhite)(lexer->lexbuf[start+len-1])) 3477 --len; 3478 3479 while (TY_(IsWhite)(lexer->lexbuf[start]) && start < len) 3480 { 3481 ++start; 3482 --len; 3483 } 3484 } 3485 3486 value = TY_(tmbstrndup)(lexer->lexbuf + start, len); 3487 } 3488 else 3489 value = NULL; 3490 3491 /* note delimiter if given */ 3492 *pdelim = (delim ? 
delim : '"'); 3493 3494 return value; 3495} 3496 3497/* attr must be non-NULL */ 3498static Bool IsValidAttrName( ctmbstr attr ) 3499{ 3500 uint i, c = attr[0]; 3501 3502 /* first character should be a letter */ 3503 if (!TY_(IsLetter)(c)) 3504 return no; 3505 3506 /* remaining characters should be namechars */ 3507 for( i = 1; i < TY_(tmbstrlen)(attr); i++) 3508 { 3509 c = attr[i]; 3510 3511 if (TY_(IsNamechar)(c)) 3512 continue; 3513 3514 return no; 3515 } 3516 3517 return yes; 3518} 3519 3520/* create a new attribute */ 3521AttVal *TY_(NewAttribute)(void) 3522{ 3523 AttVal *av = (AttVal*) MemAlloc( sizeof(AttVal) ); 3524 ClearMemory( av, sizeof(AttVal) ); 3525 return av; 3526} 3527 3528/* create a new attribute with given name and value */ 3529AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value, 3530 int delim ) 3531{ 3532 AttVal *av = TY_(NewAttribute)(); 3533 av->attribute = TY_(tmbstrdup)(name); 3534 av->value = TY_(tmbstrdup)(value); 3535 av->delim = delim; 3536 av->dict = TY_(FindAttribute)( doc, av ); 3537 return av; 3538} 3539 3540static void AddAttrToList( AttVal** list, AttVal* av ) 3541{ 3542 if ( *list == NULL ) 3543 *list = av; 3544 else 3545 { 3546 AttVal* here = *list; 3547 while ( here->next ) 3548 here = here->next; 3549 here->next = av; 3550 } 3551} 3552 3553void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av ) 3554{ 3555 AddAttrToList(&node->attributes, av); 3556} 3557 3558void TY_(InsertAttributeAtStart)( Node *node, AttVal *av ) 3559{ 3560 av->next = node->attributes; 3561 node->attributes = av; 3562} 3563 3564/* swallows closing '>' */ 3565 3566static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty ) 3567{ 3568 Lexer* lexer = doc->lexer; 3569 AttVal *av, *list; 3570 tmbstr value; 3571 int delim; 3572 Node *asp, *php; 3573 3574 list = NULL; 3575 3576 while ( !EndOfInput(doc) ) 3577 { 3578 tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php ); 3579 3580 if (attribute == NULL) 3581 { 3582 /* check if 
attributes are created by ASP markup */ 3583 if (asp) 3584 { 3585 av = TY_(NewAttribute)(); 3586 av->asp = asp; 3587 AddAttrToList( &list, av ); 3588 continue; 3589 } 3590 3591 /* check if attributes are created by PHP markup */ 3592 if (php) 3593 { 3594 av = TY_(NewAttribute)(); 3595 av->php = php; 3596 AddAttrToList( &list, av ); 3597 continue; 3598 } 3599 3600 break; 3601 } 3602 3603 value = ParseValue( doc, attribute, no, isempty, &delim ); 3604 3605 if (attribute && (IsValidAttrName(attribute) || 3606 (cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute)))) 3607 { 3608 av = TY_(NewAttribute)(); 3609 av->delim = delim; 3610 av->attribute = attribute; 3611 av->value = value; 3612 av->dict = TY_(FindAttribute)( doc, av ); 3613 AddAttrToList( &list, av ); 3614 } 3615 else 3616 { 3617 av = TY_(NewAttribute)(); 3618 av->attribute = attribute; 3619 av->value = value; 3620 3621 if (LastChar(attribute) == '"') 3622 TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK); 3623 else if (value == NULL) 3624 TY_(ReportAttrError)(doc, lexer->token, av, MISSING_ATTR_VALUE); 3625 else 3626 TY_(ReportAttrError)(doc, lexer->token, av, INVALID_ATTRIBUTE); 3627 3628 TY_(FreeAttribute)( doc, av ); 3629 } 3630 } 3631 3632 return list; 3633} 3634 3635/* 3636 Returns document type declarations like 3637 3638 <!DOCTYPE foo PUBLIC "fpi" "sysid"> 3639 <!DOCTYPE bar SYSTEM "sysid"> 3640 <!DOCTYPE baz [ <!ENTITY ouml "ö"> ]> 3641 3642 as 3643 3644 <foo PUBLIC="fpi" SYSTEM="sysid" /> 3645 <bar SYSTEM="sysid" /> 3646 <baz> <!ENTITY ouml "&#246"> </baz> 3647*/ 3648static Node *ParseDocTypeDecl(TidyDocImpl* doc) 3649{ 3650 Lexer *lexer = doc->lexer; 3651 int start = lexer->lexsize; 3652 ParseDocTypeDeclState state = DT_DOCTYPENAME; 3653 uint c; 3654 uint delim = 0; 3655 Bool hasfpi = yes; 3656 3657 Node* node = TY_(NewNode)(lexer); 3658 node->type = DocTypeTag; 3659 node->start = lexer->txtstart; 3660 node->end = lexer->txtend; 3661 3662 lexer->waswhite = no; 3663 3664 /* todo: 
/*
  Returns document type declarations like

  <!DOCTYPE foo PUBLIC "fpi" "sysid">
  <!DOCTYPE bar SYSTEM "sysid">
  <!DOCTYPE baz [ <!ENTITY ouml "&#246;"> ]>

  as

  <foo PUBLIC="fpi" SYSTEM="sysid" />
  <bar SYSTEM="sysid" />
  <baz> <!ENTITY ouml "&#246;"> </baz>

  The declaration is read with a small state machine; `start`
  always holds the lexer buffer offset at which the item currently
  being read (name, keyword, quoted string or internal subset)
  begins.

  Returns the doctype node, or NULL after reporting
  MALFORMED_DOCTYPE when the input ends before the closing '>' or
  when the name is missing or is not a valid XML element name.
*/
static Node *ParseDocTypeDecl(TidyDocImpl* doc)
{
    Lexer *lexer = doc->lexer;
    int start = lexer->lexsize;
    ParseDocTypeDeclState state = DT_DOCTYPENAME;
    uint c;
    uint delim = 0;
    /* yes while the next quoted string is to be stored as PUBLIC */
    Bool hasfpi = yes;

    Node* node = TY_(NewNode)(lexer);
    node->type = DocTypeTag;
    node->start = lexer->txtstart;
    node->end = lexer->txtend;

    lexer->waswhite = no;

    /* todo: reset lexer->lexsize when appropriate to avoid wasting memory */

    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
    {
        /* convert newlines to spaces */
        if (state != DT_INTSUBSET)
            c = c == '\n' ? ' ' : c;

        /* convert white-space sequences to single space character */
        if (TY_(IsWhite)(c) && state != DT_INTSUBSET)
        {
            if (!lexer->waswhite)
            {
                TY_(AddCharToLexer)(lexer, c);
                lexer->waswhite = yes;
            }
            else
            {
                /* discard space */
                continue;
            }
        }
        else
        {
            TY_(AddCharToLexer)(lexer, c);
            lexer->waswhite = no;
        }

        /* c has been stored, so it sits at lexer->lexsize - 1 */
        switch(state)
        {
        case DT_INTERMEDIATE:
            /* determine what's next */
            if (TY_(ToUpper)(c) == 'P' || TY_(ToUpper)(c) == 'S')
            {
                /* keyword starts at the character just stored */
                start = lexer->lexsize - 1;
                state = DT_PUBLICSYSTEM;
                continue;
            }
            else if (c == '[')
            {
                /* subset text starts after the bracket */
                start = lexer->lexsize;
                state = DT_INTSUBSET;
                continue;
            }
            else if (c == '\'' || c == '"')
            {
                /* string text starts after the opening quote */
                start = lexer->lexsize;
                delim = c;
                state = DT_QUOTEDSTRING;
                continue;
            }
            else if (c == '>')
            {
                AttVal* si;

                /* end of declaration: drop the '>' from the buffer */
                node->end = --(lexer->lexsize);

                si = TY_(GetAttrByName)(node, "SYSTEM");
                if (si)
                    TY_(CheckUrl)(doc, node, si);

                if (!node->element || !IsValidXMLElemName(node->element))
                {
                    TY_(ReportError)(doc, NULL, NULL, MALFORMED_DOCTYPE);
                    TY_(FreeNode)(doc, node);
                    return NULL;
                }
#ifdef TIDY_STORE_ORIGINAL_TEXT
                StoreOriginalTextInToken(doc, node, 0);
#endif
                return node;
            }
            else
            {
                /* error */
            }
            break;
        case DT_DOCTYPENAME:
            /* read document type name */
            if (TY_(IsWhite)(c) || c == '>' || c == '[')
            {
                /* the terminating character is not part of the name */
                node->element = TY_(tmbstrndup)(lexer->lexbuf + start,
                                                lexer->lexsize - start - 1);
                if (c == '>' || c == '[')
                {
                    /* let DT_INTERMEDIATE see the character again */
                    --(lexer->lexsize);
                    TY_(UngetChar)(c, doc->docIn);
                }

                state = DT_INTERMEDIATE;
                continue;
            }
            break;
        case DT_PUBLICSYSTEM:
            /* read PUBLIC/SYSTEM */
            if (TY_(IsWhite)(c) || c == '>')
            {
                char *attname = TY_(tmbstrndup)(lexer->lexbuf + start,
                                                lexer->lexsize - start - 1);
                /* any keyword other than SYSTEM is taken as PUBLIC */
                hasfpi = !(TY_(tmbstrcasecmp)(attname, "SYSTEM") == 0);

                MemFree(attname);

                /* todo: report an error if SYSTEM/PUBLIC not uppercase */

                if (c == '>')
                {
                    /* let DT_INTERMEDIATE see the '>' again */
                    --(lexer->lexsize);
                    TY_(UngetChar)(c, doc->docIn);
                }

                state = DT_INTERMEDIATE;
                continue;
            }
            break;
        case DT_QUOTEDSTRING:
            /* read quoted string */
            if (c == delim)
            {
                /* the closing quote is not part of the value */
                char *value = TY_(tmbstrndup)(lexer->lexbuf + start,
                                              lexer->lexsize - start - 1);
                AttVal* att = TY_(AddAttribute)(doc, node, hasfpi ? "PUBLIC" : "SYSTEM", value);
                MemFree(value);
                att->delim = delim;
                /* a further quoted string is stored as SYSTEM */
                hasfpi = no;
                state = DT_INTERMEDIATE;
                delim = 0;
                continue;
            }
            break;
        case DT_INTSUBSET:
            /* read internal subset */
            if (c == ']')
            {
                /* keep the subset, without the ']', as a text child node */
                Node* subset;
                lexer->txtstart = start;
                lexer->txtend = lexer->lexsize - 1;
                subset = TY_(TextToken)(lexer);
                TY_(InsertNodeAtEnd)(node, subset);
                state = DT_INTERMEDIATE;
            }
            break;
        }
    }

    /* document type declaration not finished */
    TY_(ReportError)(doc, NULL, NULL, MALFORMED_DOCTYPE);
    TY_(FreeNode)(doc, node);
    return NULL;
}

/*
 * local variables:
 * mode: c
 * indent-tabs-mode: nil
 * c-basic-offset: 4
 * eval: (c-set-offset 'substatement-open 0)
 * end:
 */