1/* 2 * HTMLparser.c : an HTML 4.0 non-verifying parser 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9#define IN_LIBXML 10#include "libxml.h" 11#ifdef LIBXML_HTML_ENABLED 12 13#include <string.h> 14#ifdef HAVE_CTYPE_H 15#include <ctype.h> 16#endif 17#ifdef HAVE_STDLIB_H 18#include <stdlib.h> 19#endif 20#ifdef HAVE_SYS_STAT_H 21#include <sys/stat.h> 22#endif 23#ifdef HAVE_FCNTL_H 24#include <fcntl.h> 25#endif 26#ifdef HAVE_UNISTD_H 27#include <unistd.h> 28#endif 29#ifdef HAVE_ZLIB_H 30#include <zlib.h> 31#endif 32 33#include <libxml/xmlmemory.h> 34#include <libxml/tree.h> 35#include <libxml/parser.h> 36#include <libxml/parserInternals.h> 37#include <libxml/xmlerror.h> 38#include <libxml/HTMLparser.h> 39#include <libxml/HTMLtree.h> 40#include <libxml/entities.h> 41#include <libxml/encoding.h> 42#include <libxml/valid.h> 43#include <libxml/xmlIO.h> 44#include <libxml/globals.h> 45#include <libxml/uri.h> 46 47#define HTML_MAX_NAMELEN 1000 48#define HTML_PARSER_BIG_BUFFER_SIZE 1000 49#define HTML_PARSER_BUFFER_SIZE 100 50 51/* #define DEBUG */ 52/* #define DEBUG_PUSH */ 53 54static int htmlOmittedDefaultValue = 1; 55 56xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 57 xmlChar end, xmlChar end2, xmlChar end3); 58static void htmlParseComment(htmlParserCtxtPtr ctxt); 59 60/************************************************************************ 61 * * 62 * Some factorized error routines * 63 * * 64 ************************************************************************/ 65 66/** 67 * htmlErrMemory: 68 * @ctxt: an HTML parser context 69 * @extra: extra informations 70 * 71 * Handle a redefinition of attribute error 72 */ 73static void 74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 75{ 76 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 77 (ctxt->instate == XML_PARSER_EOF)) 78 return; 79 if (ctxt != NULL) { 80 ctxt->errNo = XML_ERR_NO_MEMORY; 81 ctxt->instate = XML_PARSER_EOF; 82 ctxt->disableSAX = 1; 83 
} 84 if (extra) 85 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 86 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 87 NULL, NULL, 0, 0, 88 "Memory allocation failed : %s\n", extra); 89 else 90 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 91 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 92 NULL, NULL, 0, 0, "Memory allocation failed\n"); 93} 94 95/** 96 * htmlParseErr: 97 * @ctxt: an HTML parser context 98 * @error: the error number 99 * @msg: the error message 100 * @str1: string infor 101 * @str2: string infor 102 * 103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints 104 */ 105static void 106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 107 const char *msg, const xmlChar *str1, const xmlChar *str2) 108{ 109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 110 (ctxt->instate == XML_PARSER_EOF)) 111 return; 112 if (ctxt != NULL) 113 ctxt->errNo = error; 114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 115 XML_ERR_ERROR, NULL, 0, 116 (const char *) str1, (const char *) str2, 117 NULL, 0, 0, 118 msg, str1, str2); 119 if (ctxt != NULL) 120 ctxt->wellFormed = 0; 121} 122 123/** 124 * htmlParseErrInt: 125 * @ctxt: an HTML parser context 126 * @error: the error number 127 * @msg: the error message 128 * @val: integer info 129 * 130 * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints 131 */ 132static void 133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 134 const char *msg, int val) 135{ 136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 137 (ctxt->instate == XML_PARSER_EOF)) 138 return; 139 if (ctxt != NULL) 140 ctxt->errNo = error; 141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 142 XML_ERR_ERROR, NULL, 0, NULL, NULL, 143 NULL, val, 0, msg, val); 144 if (ctxt != NULL) 145 ctxt->wellFormed = 0; 146} 147 148/************************************************************************ 149 * * 150 * Parser stacks related functions and macros * 151 * * 152 ************************************************************************/ 153 154/** 155 * htmlnamePush: 156 * @ctxt: an HTML parser context 157 * @value: the element name 158 * 159 * Pushes a new element name on top of the name stack 160 * 161 * Returns 0 in case of error, the index in the stack otherwise 162 */ 163static int 164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 165{ 166 if (ctxt->nameNr >= ctxt->nameMax) { 167 ctxt->nameMax *= 2; 168 ctxt->nameTab = (const xmlChar * *) 169 xmlRealloc((xmlChar * *)ctxt->nameTab, 170 ctxt->nameMax * 171 sizeof(ctxt->nameTab[0])); 172 if (ctxt->nameTab == NULL) { 173 htmlErrMemory(ctxt, NULL); 174 return (0); 175 } 176 } 177 ctxt->nameTab[ctxt->nameNr] = value; 178 ctxt->name = value; 179 return (ctxt->nameNr++); 180} 181/** 182 * htmlnamePop: 183 * @ctxt: an HTML parser context 184 * 185 * Pops the top element name from the name stack 186 * 187 * Returns the name just removed 188 */ 189static const xmlChar * 190htmlnamePop(htmlParserCtxtPtr ctxt) 191{ 192 const xmlChar *ret; 193 194 if (ctxt->nameNr <= 0) 195 return (NULL); 196 ctxt->nameNr--; 197 if (ctxt->nameNr < 0) 198 return (NULL); 199 if (ctxt->nameNr > 0) 200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 201 else 202 ctxt->name = NULL; 203 ret = ctxt->nameTab[ctxt->nameNr]; 204 
ctxt->nameTab[ctxt->nameNr] = NULL; 205 return (ret); 206} 207 208/* 209 * Macros for accessing the content. Those should be used only by the parser, 210 * and not exported. 211 * 212 * Dirty macros, i.e. one need to make assumption on the context to use them 213 * 214 * CUR_PTR return the current pointer to the xmlChar to be parsed. 215 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled 216 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled 217 * in UNICODE mode. This should be used internally by the parser 218 * only to compare to ASCII values otherwise it would break when 219 * running with UTF-8 encoding. 220 * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only 221 * to compare on ASCII based substring. 222 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR 223 * it should be used only to compare on ASCII based substring. 224 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined 225 * strings without newlines within the parser. 226 * 227 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding 228 * 229 * CURRENT Returns the current char value, with the full decoding of 230 * UTF-8 if we are using this mode. It returns an int. 231 * NEXT Skip to the next character, this does the proper decoding 232 * in UTF-8 mode. It also pop-up unfinished entities on the fly. 233 * NEXTL(l) Skip the current unicode character of l xmlChars long. 
234 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 235 */ 236 237#define UPPER (toupper(*ctxt->input->cur)) 238 239#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 240 241#define NXT(val) ctxt->input->cur[(val)] 242 243#define UPP(val) (toupper(ctxt->input->cur[(val)])) 244 245#define CUR_PTR ctxt->input->cur 246 247#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 248 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 249 xmlParserInputShrink(ctxt->input) 250 251#define GROW if ((ctxt->progressive == 0) && \ 252 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 253 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 254 255#define CURRENT ((int) (*ctxt->input->cur)) 256 257#define SKIP_BLANKS htmlSkipBlankChars(ctxt) 258 259/* Inported from XML */ 260 261/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 262#define CUR ((int) (*ctxt->input->cur)) 263#define NEXT xmlNextChar(ctxt) 264 265#define RAW (ctxt->token ? 
-1 : (*ctxt->input->cur)) 266#define NXT(val) ctxt->input->cur[(val)] 267#define CUR_PTR ctxt->input->cur 268 269 270#define NEXTL(l) do { \ 271 if (*(ctxt->input->cur) == '\n') { \ 272 ctxt->input->line++; ctxt->input->col = 1; \ 273 } else ctxt->input->col++; \ 274 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 275 } while (0) 276 277/************ 278 \ 279 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 280 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 281 ************/ 282 283#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 284#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 285 286#define COPY_BUF(l,b,i,v) \ 287 if (l == 1) b[i++] = (xmlChar) v; \ 288 else i += xmlCopyChar(l,&b[i],v) 289 290/** 291 * htmlCurrentChar: 292 * @ctxt: the HTML parser context 293 * @len: pointer to the length of the char read 294 * 295 * The current char value, if using UTF-8 this may actually span multiple 296 * bytes in the input buffer. Implement the end of line normalization: 297 * 2.11 End-of-Line Handling 298 * If the encoding is unspecified, in the case we find an ISO-Latin-1 299 * char, then the encoding converter is plugged in automatically. 300 * 301 * Returns the current char value and its length 302 */ 303 304static int 305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 306 if (ctxt->instate == XML_PARSER_EOF) 307 return(0); 308 309 if (ctxt->token != 0) { 310 *len = 0; 311 return(ctxt->token); 312 } 313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 314 /* 315 * We are supposed to handle UTF8, check it's valid 316 * From rfc2044: encoding of the Unicode values on UTF-8: 317 * 318 * UCS-4 range (hex.) 
UTF-8 octet sequence (binary) 319 * 0000 0000-0000 007F 0xxxxxxx 320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 322 * 323 * Check for the 0x110000 limit too 324 */ 325 const unsigned char *cur = ctxt->input->cur; 326 unsigned char c; 327 unsigned int val; 328 329 c = *cur; 330 if (c & 0x80) { 331 if (cur[1] == 0) 332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 333 if ((cur[1] & 0xc0) != 0x80) 334 goto encoding_error; 335 if ((c & 0xe0) == 0xe0) { 336 337 if (cur[2] == 0) 338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 339 if ((cur[2] & 0xc0) != 0x80) 340 goto encoding_error; 341 if ((c & 0xf0) == 0xf0) { 342 if (cur[3] == 0) 343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 344 if (((c & 0xf8) != 0xf0) || 345 ((cur[3] & 0xc0) != 0x80)) 346 goto encoding_error; 347 /* 4-byte code */ 348 *len = 4; 349 val = (cur[0] & 0x7) << 18; 350 val |= (cur[1] & 0x3f) << 12; 351 val |= (cur[2] & 0x3f) << 6; 352 val |= cur[3] & 0x3f; 353 } else { 354 /* 3-byte code */ 355 *len = 3; 356 val = (cur[0] & 0xf) << 12; 357 val |= (cur[1] & 0x3f) << 6; 358 val |= cur[2] & 0x3f; 359 } 360 } else { 361 /* 2-byte code */ 362 *len = 2; 363 val = (cur[0] & 0x1f) << 6; 364 val |= cur[1] & 0x3f; 365 } 366 if (!IS_CHAR(val)) { 367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 368 "Char 0x%X out of allowed range\n", val); 369 } 370 return(val); 371 } else { 372 /* 1-byte code */ 373 *len = 1; 374 return((int) *ctxt->input->cur); 375 } 376 } 377 /* 378 * Assume it's a fixed length encoding (1) with 379 * a compatible encoding for the ASCII set, since 380 * XML constructs only use < 128 chars 381 */ 382 *len = 1; 383 if ((int) *ctxt->input->cur < 0x80) 384 return((int) *ctxt->input->cur); 385 386 /* 387 * Humm this is bad, do an automatic flow conversion 388 */ 389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 390 ctxt->charset = XML_CHAR_ENCODING_UTF8; 391 return(xmlCurrentChar(ctxt, len)); 392 393encoding_error: 394 /* 395 * If we detect an 
UTF8 error that probably mean that the 396 * input encoding didn't get properly advertized in the 397 * declaration header. Report the error and switch the encoding 398 * to ISO-Latin-1 (if you don't like this policy, just declare the 399 * encoding !) 400 */ 401 { 402 char buffer[150]; 403 404 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 405 ctxt->input->cur[0], ctxt->input->cur[1], 406 ctxt->input->cur[2], ctxt->input->cur[3]); 407 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 408 "Input is not proper UTF-8, indicate encoding !\n", 409 BAD_CAST buffer, NULL); 410 } 411 412 ctxt->charset = XML_CHAR_ENCODING_8859_1; 413 *len = 1; 414 return((int) *ctxt->input->cur); 415} 416 417/** 418 * htmlSkipBlankChars: 419 * @ctxt: the HTML parser context 420 * 421 * skip all blanks character found at that point in the input streams. 422 * 423 * Returns the number of space chars skipped 424 */ 425 426static int 427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 428 int res = 0; 429 430 while (IS_BLANK_CH(*(ctxt->input->cur))) { 431 if ((*ctxt->input->cur == 0) && 432 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 433 xmlPopInput(ctxt); 434 } else { 435 if (*(ctxt->input->cur) == '\n') { 436 ctxt->input->line++; ctxt->input->col = 1; 437 } else ctxt->input->col++; 438 ctxt->input->cur++; 439 ctxt->nbChars++; 440 if (*ctxt->input->cur == 0) 441 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 442 } 443 res++; 444 } 445 return(res); 446} 447 448 449 450/************************************************************************ 451 * * 452 * The list of HTML elements and their properties * 453 * * 454 ************************************************************************/ 455 456/* 457 * Start Tag: 1 means the start tag can be ommited 458 * End Tag: 1 means the end tag can be ommited 459 * 2 means it's forbidden (empty elements) 460 * 3 means the tag is stylistic and should be closed easily 461 * Depr: this element is deprecated 462 * DTD: 1 means that this 
element is valid only in the Loose DTD 463 * 2 means that this element is valid only in the Frameset DTD 464 * 465 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 466 , subElements , impliedsubelt , Attributes, userdata 467 */ 468 469/* Definitions and a couple of vars for HTML Elements */ 470 471#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 472#define NB_FONTSTYLE 8 473#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" 474#define NB_PHRASE 10 475#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 476#define NB_SPECIAL 15 477#define INLINE PCDATA FONTSTYLE PHRASE SPECIAL FORMCTRL 478#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL 479#define BLOCK HEADING, LIST "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 480#define NB_BLOCK NB_HEADING + NB_LIST + 14 481#define FORMCTRL "input", "select", "textarea", "label", "button" 482#define NB_FORMCTRL 5 483#define PCDATA 484#define NB_PCDATA 0 485#define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 486#define NB_HEADING 6 487#define LIST "ul", "ol", "dir", "menu" 488#define NB_LIST 4 489#define MODIFIER 490#define NB_MODIFIER 0 491#define FLOW BLOCK,INLINE 492#define NB_FLOW NB_BLOCK + NB_INLINE 493#define EMPTY NULL 494 495 496static const char* const html_flow[] = { FLOW, NULL } ; 497static const char* const html_inline[] = { INLINE, NULL } ; 498 499/* placeholders: elts with content but no subelements */ 500static const char* const html_pcdata[] = { NULL } ; 501#define html_cdata html_pcdata 502 503 504/* ... 
and for HTML Attributes */ 505 506#define COREATTRS "id", "class", "style", "title" 507#define NB_COREATTRS 4 508#define I18N "lang", "dir" 509#define NB_I18N 2 510#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" 511#define NB_EVENTS 9 512#define ATTRS COREATTRS,I18N,EVENTS 513#define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS 514#define CELLHALIGN "align", "char", "charoff" 515#define NB_CELLHALIGN 3 516#define CELLVALIGN "valign" 517#define NB_CELLVALIGN 1 518 519static const char* const html_attrs[] = { ATTRS, NULL } ; 520static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; 521static const char* const core_attrs[] = { COREATTRS, NULL } ; 522static const char* const i18n_attrs[] = { I18N, NULL } ; 523 524 525/* Other declarations that should go inline ... */ 526static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", 527 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", 528 "tabindex", "onfocus", "onblur", NULL } ; 529static const char* const target_attr[] = { "target", NULL } ; 530static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; 531static const char* const alt_attr[] = { "alt", NULL } ; 532static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; 533static const char* const href_attrs[] = { "href", NULL } ; 534static const char* const clear_attrs[] = { "clear", NULL } ; 535static const char* const inline_p[] = { INLINE, "p", NULL } ; 536 537static const char* const flow_param[] = { FLOW, "param", NULL } ; 538static const char* const applet_attrs[] = { COREATTRS , "codebase", 539 "archive", "alt", "name", "height", "width", "align", 540 "hspace", "vspace", NULL } ; 541static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", 542 "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 543static const char* const basefont_attrs[] = 544 { "id", "size", "color", "face", NULL } 
; 545static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; 546static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; 547static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; 548static const char* const body_depr[] = { "background", "bgcolor", "text", 549 "link", "vlink", "alink", NULL } ; 550static const char* const button_attrs[] = { ATTRS, "name", "value", "type", 551 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 552 553 554static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; 555static const char* const col_elt[] = { "col", NULL } ; 556static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; 557static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; 558static const char* const dl_contents[] = { "dt", "dd", NULL } ; 559static const char* const compact_attr[] = { "compact", NULL } ; 560static const char* const label_attr[] = { "label", NULL } ; 561static const char* const fieldset_contents[] = { FLOW, "legend" } ; 562static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; 563static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; 564static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; 565static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; 566static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; 567static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ; 568static const char* const head_attrs[] = { I18N, "profile", NULL } ; 569static const char* const 
head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; 570static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; 571static const char* const version_attr[] = { "version", NULL } ; 572static const char* const html_content[] = { "head", "body", "frameset", NULL } ; 573static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; 574static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; 575static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; 576static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; 577static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; 578static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; 579static const char* const align_attr[] = { "align", NULL } ; 580static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; 581static const char* const map_contents[] = { BLOCK, "area", NULL } ; 582static const char* const name_attr[] = { "name", NULL } ; 583static const char* const action_attr[] = { "action", NULL } ; 584static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; 585static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ; 586static const char* const content_attr[] = { "content", NULL } ; 587static const char* const type_attr[] = { "type", NULL } ; 588static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; 589static const char* const object_contents[] = { FLOW, "param", NULL } ; 590static 
const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; 591static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; 592static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; 593static const char* const option_elt[] = { "option", NULL } ; 594static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 595static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; 596static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 597static const char* const width_attr[] = { "width", NULL } ; 598static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 599static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 600static const char* const language_attr[] = { "language", NULL } ; 601static const char* const select_content[] = { "optgroup", "option", NULL } ; 602static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 603static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 604static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 605static const char* const table_depr[] = { "align", "bgcolor", NULL } ; 606static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 607static const char* const tr_elt[] = { "tr", NULL } ; 608static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 609static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 610static const char* 
const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 611static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 612static const char* const tr_contents[] = { "th", "td", NULL } ; 613static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 614static const char* const li_elt[] = { "li", NULL } ; 615static const char* const ul_depr[] = { "type", "compact", NULL} ; 616static const char* const dir_attr[] = { "dir", NULL} ; 617 618#define DECL (const char**) 619 620static const htmlElemDesc 621html40ElementTable[] = { 622{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 623 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 624}, 625{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 626 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 627}, 628{ "acronym", 0, 0, 0, 0, 0, 0, 1, "", 629 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 630}, 631{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 632 DECL inline_p , NULL , DECL html_attrs, NULL, NULL 633}, 634{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 635 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 636}, 637{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 638 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 639}, 640{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 641 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 642}, 643{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 644 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs 645}, 646{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 647 EMPTY , NULL , NULL, DECL basefont_attrs, NULL 648}, 649{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 650 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 651}, 652{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 653 DECL html_inline , NULL , DECL 
html_attrs, NULL, NULL 654}, 655{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 656 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 657}, 658{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 659 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 660}, 661{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 662 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 663}, 664{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 665 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL 666}, 667{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 668 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 669}, 670{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 671 DECL html_flow , NULL , NULL, DECL html_attrs, NULL 672}, 673{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 674 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 675}, 676{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 677 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 678}, 679{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 680 EMPTY , NULL , DECL col_attrs , NULL, NULL 681}, 682{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 683 DECL col_elt , "col" , DECL col_attrs , NULL, NULL 684}, 685{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 686 DECL html_flow , NULL , DECL html_attrs, NULL, NULL 687}, 688{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 689 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL 690}, 691{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 692 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 693}, 694{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", 695 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 696}, 697{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 698 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 699}, 700{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 701 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL 702}, 703{ "dt", 0, 1, 0, 0, 0, 
0, 0, "definition term ", 704 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 705}, 706{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 707 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 708}, 709{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 710 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 711}, 712{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", 713 DECL html_inline, NULL, NULL, DECL font_attrs, NULL 714}, 715{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", 716 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 717}, 718{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 719 EMPTY, NULL, NULL, DECL frame_attrs, NULL 720}, 721{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 722 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 723}, 724{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 725 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 726}, 727{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", 728 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 729}, 730{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 731 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 732}, 733{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", 734 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 735}, 736{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 737 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 738}, 739{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ", 740 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 741}, 742{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 743 DECL head_contents, NULL, DECL head_attrs, NULL, NULL 744}, 745{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 746 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 747}, 748{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 749 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 750}, 751{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 752 DECL html_inline, NULL, DECL 
html_attrs, NULL, NULL 753}, 754{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 755 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 756}, 757{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 758 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs 759}, 760{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 761 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 762}, 763{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 764 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 765}, 766{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 767 EMPTY, NULL, NULL, DECL prompt_attrs, NULL 768}, 769{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 770 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 771}, 772{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 773 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 774}, 775{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 776 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 777}, 778{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 779 DECL html_flow, NULL, DECL html_attrs, NULL, NULL 780}, 781{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", 782 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 783}, 784{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 785 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr 786}, 787{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ", 788 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL 789}, 790{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 791 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 792}, 793{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 794 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 795}, 796{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 797 DECL html_flow, "div", DECL html_attrs, NULL, NULL 798}, 799{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object 
", 800 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 801}, 802{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 803 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 804}, 805{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 806 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr 807}, 808{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 809 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 810}, 811{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", 812 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 813}, 814{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 815 EMPTY, NULL, DECL param_attrs, NULL, name_attr 816}, 817{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", 818 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 819}, 820{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 821 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 822}, 823{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 824 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 825}, 826{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 827 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 828}, 829{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 830 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 831}, 832{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ", 833 DECL select_content, NULL, DECL select_attrs, NULL, NULL 834}, 835{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 836 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 837}, 838{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 839 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 840}, 841{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 842 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 843}, 844{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 845 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 846}, 847{ "style", 0, 0, 0, 0, 0, 0, 0, "style 
info ", 848 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 849}, 850{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 851 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 852}, 853{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", 854 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 855}, 856{ "table", 0, 0, 0, 0, 0, 0, 0, "", 857 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 858}, 859{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 860 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 861}, 862{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 863 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 864}, 865{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 866 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 867}, 868{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 869 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 870}, 871{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", 872 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 873}, 874{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 875 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 876}, 877{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 878 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 879}, 880{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ", 881 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL 882}, 883{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", 884 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 885}, 886{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", 887 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 888}, 889{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", 890 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL 891}, 892{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", 893 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 894} 895}; 896 897/* 898 * start tags that imply the end of current element 899 */ 900static const char * 
const htmlStartClose[] = { 901"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", 902 "dl", "ul", "ol", "menu", "dir", "address", "pre", 903 "listing", "xmp", "head", NULL, 904"head", "p", NULL, 905"title", "p", NULL, 906"body", "head", "style", "link", "title", "p", NULL, 907"frameset", "head", "style", "link", "title", "p", NULL, 908"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", 909 "pre", "listing", "xmp", "head", "li", NULL, 910"hr", "p", "head", NULL, 911"h1", "p", "head", NULL, 912"h2", "p", "head", NULL, 913"h3", "p", "head", NULL, 914"h4", "p", "head", NULL, 915"h5", "p", "head", NULL, 916"h6", "p", "head", NULL, 917"dir", "p", "head", NULL, 918"address", "p", "head", "ul", NULL, 919"pre", "p", "head", "ul", NULL, 920"listing", "p", "head", NULL, 921"xmp", "p", "head", NULL, 922"blockquote", "p", "head", NULL, 923"dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 924 "xmp", "head", NULL, 925"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 926 "head", "dd", NULL, 927"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 928 "head", "dt", NULL, 929"ul", "p", "head", "ol", "menu", "dir", "address", "pre", 930 "listing", "xmp", NULL, 931"ol", "p", "head", "ul", NULL, 932"menu", "p", "head", "ul", NULL, 933"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL, 934"div", "p", "head", NULL, 935"noscript", "p", "head", NULL, 936"center", "font", "b", "i", "p", "head", NULL, 937"a", "a", NULL, 938"caption", "p", NULL, 939"colgroup", "caption", "colgroup", "col", "p", NULL, 940"col", "caption", "col", "p", NULL, 941"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 942 "listing", "xmp", "a", NULL, 943"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 944"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 945"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 946"thead", "caption", "col", "colgroup", NULL, 947"tfoot", "th", "td", "tr", "caption", "col", 
"colgroup", "thead", 948 "tbody", "p", NULL, 949"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 950 "tfoot", "tbody", "p", NULL, 951"optgroup", "option", NULL, 952"option", "option", NULL, 953"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 954 "pre", "listing", "xmp", "a", NULL, 955NULL 956}; 957 958/* 959 * The list of HTML elements which are supposed not to have 960 * CDATA content and where a p element will be implied 961 * 962 * TODO: extend that list by reading the HTML SGML DTD on 963 * implied paragraph 964 */ 965static const char *const htmlNoContentElements[] = { 966 "html", 967 "head", 968 NULL 969}; 970 971/* 972 * The list of HTML attributes which are of content %Script; 973 * NOTE: when adding ones, check htmlIsScriptAttribute() since 974 * it assumes the name starts with 'on' 975 */ 976static const char *const htmlScriptAttributes[] = { 977 "onclick", 978 "ondblclick", 979 "onmousedown", 980 "onmouseup", 981 "onmouseover", 982 "onmousemove", 983 "onmouseout", 984 "onkeypress", 985 "onkeydown", 986 "onkeyup", 987 "onload", 988 "onunload", 989 "onfocus", 990 "onblur", 991 "onsubmit", 992 "onrest", 993 "onchange", 994 "onselect" 995}; 996 997/* 998 * This table is used by the htmlparser to know what to do with 999 * broken html pages. By assigning different priorities to different 1000 * elements the parser can decide how to handle extra endtags. 1001 * Endtags are only allowed to close elements with lower or equal 1002 * priority. 
1003 */ 1004 1005typedef struct { 1006 const char *name; 1007 int priority; 1008} elementPriority; 1009 1010static const elementPriority htmlEndPriority[] = { 1011 {"div", 150}, 1012 {"td", 160}, 1013 {"th", 160}, 1014 {"tr", 170}, 1015 {"thead", 180}, 1016 {"tbody", 180}, 1017 {"tfoot", 180}, 1018 {"table", 190}, 1019 {"head", 200}, 1020 {"body", 200}, 1021 {"html", 220}, 1022 {NULL, 100} /* Default priority */ 1023}; 1024 1025static const char** htmlStartCloseIndex[100]; 1026static int htmlStartCloseIndexinitialized = 0; 1027 1028/************************************************************************ 1029 * * 1030 * functions to handle HTML specific data * 1031 * * 1032 ************************************************************************/ 1033 1034/** 1035 * htmlInitAutoClose: 1036 * 1037 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1038 * This is not reentrant. Call xmlInitParser() once before processing in 1039 * case of use in multithreaded programs. 1040 */ 1041void 1042htmlInitAutoClose(void) { 1043 int indx, i = 0; 1044 1045 if (htmlStartCloseIndexinitialized) return; 1046 1047 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 1048 indx = 0; 1049 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 1050 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; 1051 while (htmlStartClose[i] != NULL) i++; 1052 i++; 1053 } 1054 htmlStartCloseIndexinitialized = 1; 1055} 1056 1057/** 1058 * htmlTagLookup: 1059 * @tag: The tag name in lowercase 1060 * 1061 * Lookup the HTML tag in the ElementTable 1062 * 1063 * Returns the related htmlElemDescPtr or NULL if not found. 
1064 */ 1065const htmlElemDesc * 1066htmlTagLookup(const xmlChar *tag) { 1067 unsigned int i; 1068 1069 for (i = 0; i < (sizeof(html40ElementTable) / 1070 sizeof(html40ElementTable[0]));i++) { 1071 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1072 return((htmlElemDescPtr) &html40ElementTable[i]); 1073 } 1074 return(NULL); 1075} 1076 1077/** 1078 * htmlGetEndPriority: 1079 * @name: The name of the element to look up the priority for. 1080 * 1081 * Return value: The "endtag" priority. 1082 **/ 1083static int 1084htmlGetEndPriority (const xmlChar *name) { 1085 int i = 0; 1086 1087 while ((htmlEndPriority[i].name != NULL) && 1088 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1089 i++; 1090 1091 return(htmlEndPriority[i].priority); 1092} 1093 1094 1095/** 1096 * htmlCheckAutoClose: 1097 * @newtag: The new tag name 1098 * @oldtag: The old tag name 1099 * 1100 * Checks whether the new tag is one of the registered valid tags for 1101 * closing old. 1102 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1103 * 1104 * Returns 0 if no, 1 if yes. 
1105 */ 1106static int 1107htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1108{ 1109 int i, indx; 1110 const char **closed = NULL; 1111 1112 if (htmlStartCloseIndexinitialized == 0) 1113 htmlInitAutoClose(); 1114 1115 /* inefficient, but not a big deal */ 1116 for (indx = 0; indx < 100; indx++) { 1117 closed = htmlStartCloseIndex[indx]; 1118 if (closed == NULL) 1119 return (0); 1120 if (xmlStrEqual(BAD_CAST * closed, newtag)) 1121 break; 1122 } 1123 1124 i = closed - htmlStartClose; 1125 i++; 1126 while (htmlStartClose[i] != NULL) { 1127 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 1128 return (1); 1129 } 1130 i++; 1131 } 1132 return (0); 1133} 1134 1135/** 1136 * htmlAutoCloseOnClose: 1137 * @ctxt: an HTML parser context 1138 * @newtag: The new tag name 1139 * @force: force the tag closure 1140 * 1141 * The HTML DTD allows an ending tag to implicitly close other tags. 1142 */ 1143static void 1144htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1145{ 1146 const htmlElemDesc *info; 1147 int i, priority; 1148 1149 priority = htmlGetEndPriority(newtag); 1150 1151 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1152 1153 if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1154 break; 1155 /* 1156 * A missplaced endtag can only close elements with lower 1157 * or equal priority, so if we find an element with higher 1158 * priority before we find an element with 1159 * matching name, we just ignore this endtag 1160 */ 1161 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1162 return; 1163 } 1164 if (i < 0) 1165 return; 1166 1167 while (!xmlStrEqual(newtag, ctxt->name)) { 1168 info = htmlTagLookup(ctxt->name); 1169 if ((info != NULL) && (info->endTag == 3)) { 1170 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1171 "Opening and ending tag mismatch: %s and %s\n", 1172 newtag, ctxt->name); 1173 } 1174 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1175 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1176 
htmlnamePop(ctxt); 1177 } 1178} 1179 1180/** 1181 * htmlAutoCloseOnEnd: 1182 * @ctxt: an HTML parser context 1183 * 1184 * Close all remaining tags at the end of the stream 1185 */ 1186static void 1187htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1188{ 1189 int i; 1190 1191 if (ctxt->nameNr == 0) 1192 return; 1193 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1194 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1195 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1196 htmlnamePop(ctxt); 1197 } 1198} 1199 1200/** 1201 * htmlAutoClose: 1202 * @ctxt: an HTML parser context 1203 * @newtag: The new tag name or NULL 1204 * 1205 * The HTML DTD allows a tag to implicitly close other tags. 1206 * The list is kept in htmlStartClose array. This function is 1207 * called when a new tag has been detected and generates the 1208 * appropriates closes if possible/needed. 1209 * If newtag is NULL this mean we are at the end of the resource 1210 * and we should check 1211 */ 1212static void 1213htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1214{ 1215 while ((newtag != NULL) && (ctxt->name != NULL) && 1216 (htmlCheckAutoClose(newtag, ctxt->name))) { 1217 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1218 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1219 htmlnamePop(ctxt); 1220 } 1221 if (newtag == NULL) { 1222 htmlAutoCloseOnEnd(ctxt); 1223 return; 1224 } 1225 while ((newtag == NULL) && (ctxt->name != NULL) && 1226 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1227 (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1228 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1229 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1230 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1231 htmlnamePop(ctxt); 1232 } 1233} 1234 1235/** 1236 * htmlAutoCloseTag: 1237 * @doc: the HTML document 1238 * @name: The tag name 1239 * @elem: the HTML element 1240 * 1241 * The HTML DTD allows a tag to implicitly close other tags. 
1242 * The list is kept in htmlStartClose array. This function checks 1243 * if the element or one of it's children would autoclose the 1244 * given tag. 1245 * 1246 * Returns 1 if autoclose, 0 otherwise 1247 */ 1248int 1249htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1250 htmlNodePtr child; 1251 1252 if (elem == NULL) return(1); 1253 if (xmlStrEqual(name, elem->name)) return(0); 1254 if (htmlCheckAutoClose(elem->name, name)) return(1); 1255 child = elem->children; 1256 while (child != NULL) { 1257 if (htmlAutoCloseTag(doc, name, child)) return(1); 1258 child = child->next; 1259 } 1260 return(0); 1261} 1262 1263/** 1264 * htmlIsAutoClosed: 1265 * @doc: the HTML document 1266 * @elem: the HTML element 1267 * 1268 * The HTML DTD allows a tag to implicitly close other tags. 1269 * The list is kept in htmlStartClose array. This function checks 1270 * if a tag is autoclosed by one of it's child 1271 * 1272 * Returns 1 if autoclosed, 0 otherwise 1273 */ 1274int 1275htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1276 htmlNodePtr child; 1277 1278 if (elem == NULL) return(1); 1279 child = elem->children; 1280 while (child != NULL) { 1281 if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1282 child = child->next; 1283 } 1284 return(0); 1285} 1286 1287/** 1288 * htmlCheckImplied: 1289 * @ctxt: an HTML parser context 1290 * @newtag: The new tag name 1291 * 1292 * The HTML DTD allows a tag to exists only implicitly 1293 * called when a new tag has been detected and generates the 1294 * appropriates implicit tags if missing 1295 */ 1296static void 1297htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1298 if (!htmlOmittedDefaultValue) 1299 return; 1300 if (xmlStrEqual(newtag, BAD_CAST"html")) 1301 return; 1302 if (ctxt->nameNr <= 0) { 1303 htmlnamePush(ctxt, BAD_CAST"html"); 1304 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1305 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1306 } 1307 if 
((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1308 return; 1309 if ((ctxt->nameNr <= 1) && 1310 ((xmlStrEqual(newtag, BAD_CAST"script")) || 1311 (xmlStrEqual(newtag, BAD_CAST"style")) || 1312 (xmlStrEqual(newtag, BAD_CAST"meta")) || 1313 (xmlStrEqual(newtag, BAD_CAST"link")) || 1314 (xmlStrEqual(newtag, BAD_CAST"title")) || 1315 (xmlStrEqual(newtag, BAD_CAST"base")))) { 1316 /* 1317 * dropped OBJECT ... i you put it first BODY will be 1318 * assumed ! 1319 */ 1320 htmlnamePush(ctxt, BAD_CAST"head"); 1321 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1322 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1323 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1324 (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1325 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1326 int i; 1327 for (i = 0;i < ctxt->nameNr;i++) { 1328 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1329 return; 1330 } 1331 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1332 return; 1333 } 1334 } 1335 1336 htmlnamePush(ctxt, BAD_CAST"body"); 1337 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1338 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1339 } 1340} 1341 1342/** 1343 * htmlCheckParagraph 1344 * @ctxt: an HTML parser context 1345 * 1346 * Check whether a p element need to be implied before inserting 1347 * characters in the current element. 1348 * 1349 * Returns 1 if a paragraph has been inserted, 0 if not and -1 1350 * in case of error. 
1351 */ 1352 1353static int 1354htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1355 const xmlChar *tag; 1356 int i; 1357 1358 if (ctxt == NULL) 1359 return(-1); 1360 tag = ctxt->name; 1361 if (tag == NULL) { 1362 htmlAutoClose(ctxt, BAD_CAST"p"); 1363 htmlCheckImplied(ctxt, BAD_CAST"p"); 1364 htmlnamePush(ctxt, BAD_CAST"p"); 1365 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1366 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1367 return(1); 1368 } 1369 if (!htmlOmittedDefaultValue) 1370 return(0); 1371 for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1372 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1373 htmlAutoClose(ctxt, BAD_CAST"p"); 1374 htmlCheckImplied(ctxt, BAD_CAST"p"); 1375 htmlnamePush(ctxt, BAD_CAST"p"); 1376 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1377 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1378 return(1); 1379 } 1380 } 1381 return(0); 1382} 1383 1384/** 1385 * htmlIsScriptAttribute: 1386 * @name: an attribute name 1387 * 1388 * Check if an attribute is of content type Script 1389 * 1390 * Returns 1 is the attribute is a script 0 otherwise 1391 */ 1392int 1393htmlIsScriptAttribute(const xmlChar *name) { 1394 unsigned int i; 1395 1396 if (name == NULL) 1397 return(0); 1398 /* 1399 * all script attributes start with 'on' 1400 */ 1401 if ((name[0] != 'o') || (name[1] != 'n')) 1402 return(0); 1403 for (i = 0; 1404 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1405 i++) { 1406 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1407 return(1); 1408 } 1409 return(0); 1410} 1411 1412/************************************************************************ 1413 * * 1414 * The list of HTML predefined entities * 1415 * * 1416 ************************************************************************/ 1417 1418 1419static const htmlEntityDesc html40EntitiesTable[] = { 1420/* 1421 * the 4 absolute ones, plus apostrophe. 
1422 */ 1423{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1424{ 38, "amp", "ampersand, U+0026 ISOnum" }, 1425{ 39, "apos", "single quote" }, 1426{ 60, "lt", "less-than sign, U+003C ISOnum" }, 1427{ 62, "gt", "greater-than sign, U+003E ISOnum" }, 1428 1429/* 1430 * A bunch still in the 128-255 range 1431 * Replacing them depend really on the charset used. 1432 */ 1433{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1434{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1435{ 162, "cent", "cent sign, U+00A2 ISOnum" }, 1436{ 163, "pound","pound sign, U+00A3 ISOnum" }, 1437{ 164, "curren","currency sign, U+00A4 ISOnum" }, 1438{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1439{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1440{ 167, "sect", "section sign, U+00A7 ISOnum" }, 1441{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1442{ 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1443{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1444{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1445{ 172, "not", "not sign, U+00AC ISOnum" }, 1446{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1447{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1448{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1449{ 176, "deg", "degree sign, U+00B0 ISOnum" }, 1450{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1451{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1452{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1453{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1454{ 181, "micro","micro sign, U+00B5 ISOnum" }, 1455{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1456{ 183, "middot","middle dot = Georgian comma Greek middle 
dot, U+00B7 ISOnum" }, 1457{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1458{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1459{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1460{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1461{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1462{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1463{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1464{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1465{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1466{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1467{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1468{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1469{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1470{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1471{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1472{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1473{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1474{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1475{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1476{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1477{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1478{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" }, 1479{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1480{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF 
ISOlat1" }, 1481{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1482{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1483{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1484{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1485{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1486{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1487{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1488{ 215, "times","multiplication sign, U+00D7 ISOnum" }, 1489{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1490{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1491{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1492{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1493{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1494{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1495{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1496{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1497{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1498{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1499{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1500{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1501{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1502{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1503{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" }, 1504{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1505{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" }, 1506{ 233, "eacute","latin small 
letter e with acute, U+00E9 ISOlat1" }, 1507{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1508{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1509{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1510{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1511{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1512{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1513{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1514{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1515{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1516{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1517{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1518{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1519{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1520{ 247, "divide","division sign, U+00F7 ISOnum" }, 1521{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1522{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1523{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1524{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1525{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1526{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1527{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1528{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1529 1530{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, 1531{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1532{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1533{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1534{ 376, "Yuml", "latin capital letter 
Y with diaeresis, U+0178 ISOlat2" }, 1535 1536/* 1537 * Anything below should really be kept as entities references 1538 */ 1539{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1540 1541{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1542{ 732, "tilde","small tilde, U+02DC ISOdia" }, 1543 1544{ 913, "Alpha","greek capital letter alpha, U+0391" }, 1545{ 914, "Beta", "greek capital letter beta, U+0392" }, 1546{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1547{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1548{ 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1549{ 918, "Zeta", "greek capital letter zeta, U+0396" }, 1550{ 919, "Eta", "greek capital letter eta, U+0397" }, 1551{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1552{ 921, "Iota", "greek capital letter iota, U+0399" }, 1553{ 922, "Kappa","greek capital letter kappa, U+039A" }, 1554{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1555{ 924, "Mu", "greek capital letter mu, U+039C" }, 1556{ 925, "Nu", "greek capital letter nu, U+039D" }, 1557{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1558{ 927, "Omicron","greek capital letter omicron, U+039F" }, 1559{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1560{ 929, "Rho", "greek capital letter rho, U+03A1" }, 1561{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1562{ 932, "Tau", "greek capital letter tau, U+03A4" }, 1563{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1564{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1565{ 935, "Chi", "greek capital letter chi, U+03A7" }, 1566{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1567{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1568 1569{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" }, 1570{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1571{ 947, "gamma","greek small letter gamma, 
U+03B3 ISOgrk3" }, 1572{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1573{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1574{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1575{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1576{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1577{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1578{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1579{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1580{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1581{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1582{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1583{ 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1584{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1585{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1586{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1587{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1588{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1589{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1590{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1591{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1592{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1593{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1594{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1595{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1596{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, 1597 1598{ 8194, "ensp", "en space, U+2002 ISOpub" }, 1599{ 8195, "emsp", "em space, U+2003 ISOpub" }, 1600{ 8201, "thinsp","thin space, U+2009 ISOpub" }, 1601{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, 1602{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1603{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1604{ 8207, "rlm", 
"right-to-left mark, U+200F NEW RFC 2070" }, 1605{ 8211, "ndash","en dash, U+2013 ISOpub" }, 1606{ 8212, "mdash","em dash, U+2014 ISOpub" }, 1607{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1608{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1609{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1610{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1611{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1612{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1613{ 8224, "dagger","dagger, U+2020 ISOpub" }, 1614{ 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1615 1616{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1617{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1618 1619{ 8240, "permil","per mille sign, U+2030 ISOtech" }, 1620 1621{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1622{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1623 1624{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 1625{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 1626 1627{ 8254, "oline","overline = spacing overscore, U+203E NEW" }, 1628{ 8260, "frasl","fraction slash, U+2044 NEW" }, 1629 1630{ 8364, "euro", "euro sign, U+20AC NEW" }, 1631 1632{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 1633{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 1634{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 1635{ 8482, "trade","trade mark sign, U+2122 ISOnum" }, 1636{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 1637{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 1638{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 1639{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 1640{ 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 1641{ 8596, "harr", "left 
right arrow, U+2194 ISOamsa" }, 1642{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 1643{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 1644{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 1645{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 1646{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 1647{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 1648 1649{ 8704, "forall","for all, U+2200 ISOtech" }, 1650{ 8706, "part", "partial differential, U+2202 ISOtech" }, 1651{ 8707, "exist","there exists, U+2203 ISOtech" }, 1652{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 1653{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 1654{ 8712, "isin", "element of, U+2208 ISOtech" }, 1655{ 8713, "notin","not an element of, U+2209 ISOtech" }, 1656{ 8715, "ni", "contains as member, U+220B ISOtech" }, 1657{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 1658{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 1659{ 8722, "minus","minus sign, U+2212 ISOtech" }, 1660{ 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 1661{ 8730, "radic","square root = radical sign, U+221A ISOtech" }, 1662{ 8733, "prop", "proportional to, U+221D ISOtech" }, 1663{ 8734, "infin","infinity, U+221E ISOtech" }, 1664{ 8736, "ang", "angle, U+2220 ISOamso" }, 1665{ 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 1666{ 8744, "or", "logical or = vee, U+2228 ISOtech" }, 1667{ 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 1668{ 8746, "cup", "union = cup, U+222A ISOtech" }, 1669{ 8747, "int", "integral, U+222B ISOtech" }, 1670{ 8756, "there4","therefore, U+2234 ISOtech" }, 1671{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" }, 1672{ 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 1673{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 1674{ 8800, "ne", "not equal to, U+2260 ISOtech" }, 1675{ 8801, 
"equiv","identical to, U+2261 ISOtech" }, 1676{ 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 1677{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 1678{ 8834, "sub", "subset of, U+2282 ISOtech" }, 1679{ 8835, "sup", "superset of, U+2283 ISOtech" }, 1680{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 1681{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 1682{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 1683{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 1684{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 1685{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 1686{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 1687{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 1688{ 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 1689{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 1690{ 8971, "rfloor","right floor, U+230B ISOamsc" }, 1691{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 1692{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 1693{ 9674, "loz", "lozenge, U+25CA ISOpub" }, 1694 1695{ 9824, "spades","black spade suit, U+2660 ISOpub" }, 1696{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 1697{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 1698{ 9830, "diams","black diamond suit, U+2666 ISOpub" }, 1699 1700}; 1701 1702/************************************************************************ 1703 * * 1704 * Commodity functions to handle entities * 1705 * * 1706 ************************************************************************/ 1707 1708/* 1709 * Macro used to grow the current buffer. 
1710 */ 1711#define growBuffer(buffer) { \ 1712 xmlChar *tmp; \ 1713 buffer##_size *= 2; \ 1714 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 1715 if (tmp == NULL) { \ 1716 htmlErrMemory(ctxt, "growing buffer\n"); \ 1717 xmlFree(buffer); \ 1718 return(NULL); \ 1719 } \ 1720 buffer = tmp; \ 1721} 1722 1723/** 1724 * htmlEntityLookup: 1725 * @name: the entity name 1726 * 1727 * Lookup the given entity in EntitiesTable 1728 * 1729 * TODO: the linear scan is really ugly, an hash table is really needed. 1730 * 1731 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 1732 */ 1733const htmlEntityDesc * 1734htmlEntityLookup(const xmlChar *name) { 1735 unsigned int i; 1736 1737 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1738 sizeof(html40EntitiesTable[0]));i++) { 1739 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 1740 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1741 } 1742 } 1743 return(NULL); 1744} 1745 1746/** 1747 * htmlEntityValueLookup: 1748 * @value: the entity's unicode value 1749 * 1750 * Lookup the given entity in EntitiesTable 1751 * 1752 * TODO: the linear scan is really ugly, an hash table is really needed. 1753 * 1754 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
1755 */ 1756const htmlEntityDesc * 1757htmlEntityValueLookup(unsigned int value) { 1758 unsigned int i; 1759 1760 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1761 sizeof(html40EntitiesTable[0]));i++) { 1762 if (html40EntitiesTable[i].value >= value) { 1763 if (html40EntitiesTable[i].value > value) 1764 break; 1765 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1766 } 1767 } 1768 return(NULL); 1769} 1770 1771/** 1772 * UTF8ToHtml: 1773 * @out: a pointer to an array of bytes to store the result 1774 * @outlen: the length of @out 1775 * @in: a pointer to an array of UTF-8 chars 1776 * @inlen: the length of @in 1777 * 1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII 1779 * plus HTML entities block of chars out. 1780 * 1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1782 * The value of @inlen after return is the number of octets consumed 1783 * as the return value is positive, else unpredictable. 1784 * The value of @outlen after return is the number of octets consumed. 
1785 */ 1786int 1787UTF8ToHtml(unsigned char* out, int *outlen, 1788 const unsigned char* in, int *inlen) { 1789 const unsigned char* processed = in; 1790 const unsigned char* outend; 1791 const unsigned char* outstart = out; 1792 const unsigned char* instart = in; 1793 const unsigned char* inend; 1794 unsigned int c, d; 1795 int trailing; 1796 1797 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 1798 if (in == NULL) { 1799 /* 1800 * initialization nothing to do 1801 */ 1802 *outlen = 0; 1803 *inlen = 0; 1804 return(0); 1805 } 1806 inend = in + (*inlen); 1807 outend = out + (*outlen); 1808 while (in < inend) { 1809 d = *in++; 1810 if (d < 0x80) { c= d; trailing= 0; } 1811 else if (d < 0xC0) { 1812 /* trailing byte in leading position */ 1813 *outlen = out - outstart; 1814 *inlen = processed - instart; 1815 return(-2); 1816 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 1817 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 1818 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1819 else { 1820 /* no chance for this in Ascii */ 1821 *outlen = out - outstart; 1822 *inlen = processed - instart; 1823 return(-2); 1824 } 1825 1826 if (inend - in < trailing) { 1827 break; 1828 } 1829 1830 for ( ; trailing; trailing--) { 1831 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 1832 break; 1833 c <<= 6; 1834 c |= d & 0x3F; 1835 } 1836 1837 /* assertion: c is a single UTF-4 value */ 1838 if (c < 0x80) { 1839 if (out + 1 >= outend) 1840 break; 1841 *out++ = c; 1842 } else { 1843 int len; 1844 const htmlEntityDesc * ent; 1845 1846 /* 1847 * Try to lookup a predefined HTML entity for it 1848 */ 1849 1850 ent = htmlEntityValueLookup(c); 1851 if (ent == NULL) { 1852 /* no chance for this in Ascii */ 1853 *outlen = out - outstart; 1854 *inlen = processed - instart; 1855 return(-2); 1856 } 1857 len = strlen(ent->name); 1858 if (out + 2 + len >= outend) 1859 break; 1860 *out++ = '&'; 1861 memcpy(out, ent->name, len); 1862 out += len; 1863 *out++ = ';'; 1864 } 
1865 processed = in; 1866 } 1867 *outlen = out - outstart; 1868 *inlen = processed - instart; 1869 return(0); 1870} 1871 1872/** 1873 * htmlEncodeEntities: 1874 * @out: a pointer to an array of bytes to store the result 1875 * @outlen: the length of @out 1876 * @in: a pointer to an array of UTF-8 chars 1877 * @inlen: the length of @in 1878 * @quoteChar: the quote character to escape (' or ") or zero. 1879 * 1880 * Take a block of UTF-8 chars in and try to convert it to an ASCII 1881 * plus HTML entities block of chars out. 1882 * 1883 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1884 * The value of @inlen after return is the number of octets consumed 1885 * as the return value is positive, else unpredictable. 1886 * The value of @outlen after return is the number of octets consumed. 1887 */ 1888int 1889htmlEncodeEntities(unsigned char* out, int *outlen, 1890 const unsigned char* in, int *inlen, int quoteChar) { 1891 const unsigned char* processed = in; 1892 const unsigned char* outend; 1893 const unsigned char* outstart = out; 1894 const unsigned char* instart = in; 1895 const unsigned char* inend; 1896 unsigned int c, d; 1897 int trailing; 1898 1899 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 1900 return(-1); 1901 outend = out + (*outlen); 1902 inend = in + (*inlen); 1903 while (in < inend) { 1904 d = *in++; 1905 if (d < 0x80) { c= d; trailing= 0; } 1906 else if (d < 0xC0) { 1907 /* trailing byte in leading position */ 1908 *outlen = out - outstart; 1909 *inlen = processed - instart; 1910 return(-2); 1911 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 1912 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 1913 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1914 else { 1915 /* no chance for this in Ascii */ 1916 *outlen = out - outstart; 1917 *inlen = processed - instart; 1918 return(-2); 1919 } 1920 1921 if (inend - in < trailing) 1922 break; 1923 1924 while (trailing--) { 1925 if (((d= *in++) & 0xC0) != 
0x80) { 1926 *outlen = out - outstart; 1927 *inlen = processed - instart; 1928 return(-2); 1929 } 1930 c <<= 6; 1931 c |= d & 0x3F; 1932 } 1933 1934 /* assertion: c is a single UTF-4 value */ 1935 if ((c < 0x80) && (c != (unsigned int) quoteChar) && 1936 (c != '&') && (c != '<') && (c != '>')) { 1937 if (out >= outend) 1938 break; 1939 *out++ = c; 1940 } else { 1941 const htmlEntityDesc * ent; 1942 const char *cp; 1943 char nbuf[16]; 1944 int len; 1945 1946 /* 1947 * Try to lookup a predefined HTML entity for it 1948 */ 1949 ent = htmlEntityValueLookup(c); 1950 if (ent == NULL) { 1951 snprintf(nbuf, sizeof(nbuf), "#%u", c); 1952 cp = nbuf; 1953 } 1954 else 1955 cp = ent->name; 1956 len = strlen(cp); 1957 if (out + 2 + len > outend) 1958 break; 1959 *out++ = '&'; 1960 memcpy(out, cp, len); 1961 out += len; 1962 *out++ = ';'; 1963 } 1964 processed = in; 1965 } 1966 *outlen = out - outstart; 1967 *inlen = processed - instart; 1968 return(0); 1969} 1970 1971/************************************************************************ 1972 * * 1973 * Commodity functions to handle streams * 1974 * * 1975 ************************************************************************/ 1976 1977/** 1978 * htmlNewInputStream: 1979 * @ctxt: an HTML parser context 1980 * 1981 * Create a new input stream structure 1982 * Returns the new input stream or NULL 1983 */ 1984static htmlParserInputPtr 1985htmlNewInputStream(htmlParserCtxtPtr ctxt) { 1986 htmlParserInputPtr input; 1987 1988 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 1989 if (input == NULL) { 1990 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 1991 return(NULL); 1992 } 1993 memset(input, 0, sizeof(htmlParserInput)); 1994 input->filename = NULL; 1995 input->directory = NULL; 1996 input->base = NULL; 1997 input->cur = NULL; 1998 input->buf = NULL; 1999 input->line = 1; 2000 input->col = 1; 2001 input->buf = NULL; 2002 input->free = NULL; 2003 input->version = NULL; 2004 input->consumed = 0; 2005 
input->length = 0; 2006 return(input); 2007} 2008 2009 2010/************************************************************************ 2011 * * 2012 * Commodity functions, cleanup needed ? * 2013 * * 2014 ************************************************************************/ 2015/* 2016 * all tags allowing pc data from the html 4.01 loose dtd 2017 * NOTE: it might be more apropriate to integrate this information 2018 * into the html40ElementTable array but I don't want to risk any 2019 * binary incomptibility 2020 */ 2021static const char *allowPCData[] = { 2022 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2023 "blockquote", "body", "button", "caption", "center", "cite", "code", 2024 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2025 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2026 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2027 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2028}; 2029 2030/** 2031 * areBlanks: 2032 * @ctxt: an HTML parser context 2033 * @str: a xmlChar * 2034 * @len: the size of @str 2035 * 2036 * Is this a sequence of blank chars that one can ignore ? 2037 * 2038 * Returns 1 if ignorable 0 otherwise. 
2039 */ 2040 2041static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2042 unsigned int i; 2043 int j; 2044 xmlNodePtr lastChild; 2045 xmlDtdPtr dtd; 2046 2047 for (j = 0;j < len;j++) 2048 if (!(IS_BLANK_CH(str[j]))) return(0); 2049 2050 if (CUR == 0) return(1); 2051 if (CUR != '<') return(0); 2052 if (ctxt->name == NULL) 2053 return(1); 2054 if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2055 return(1); 2056 if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2057 return(1); 2058 2059 /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2060 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2061 dtd = xmlGetIntSubset(ctxt->myDoc); 2062 if (dtd != NULL && dtd->ExternalID != NULL) { 2063 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2064 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2065 return(1); 2066 } 2067 } 2068 2069 if (ctxt->node == NULL) return(0); 2070 lastChild = xmlGetLastChild(ctxt->node); 2071 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2072 lastChild = lastChild->prev; 2073 if (lastChild == NULL) { 2074 if ((ctxt->node->type != XML_ELEMENT_NODE) && 2075 (ctxt->node->content != NULL)) return(0); 2076 /* keep ws in constructs like ...<b> </b>... 
2077 for all tags "b" allowing PCDATA */ 2078 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2079 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2080 return(0); 2081 } 2082 } 2083 } else if (xmlNodeIsText(lastChild)) { 2084 return(0); 2085 } else { 2086 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 2087 for all tags "p" allowing PCDATA */ 2088 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2089 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2090 return(0); 2091 } 2092 } 2093 } 2094 return(1); 2095} 2096 2097/** 2098 * htmlNewDocNoDtD: 2099 * @URI: URI for the dtd, or NULL 2100 * @ExternalID: the external ID of the DTD, or NULL 2101 * 2102 * Creates a new HTML document without a DTD node if @URI and @ExternalID 2103 * are NULL 2104 * 2105 * Returns a new document, do not initialize the DTD if not provided 2106 */ 2107htmlDocPtr 2108htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2109 xmlDocPtr cur; 2110 2111 /* 2112 * Allocate a new document and fill the fields. 
2113 */ 2114 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2115 if (cur == NULL) { 2116 htmlErrMemory(NULL, "HTML document creation failed\n"); 2117 return(NULL); 2118 } 2119 memset(cur, 0, sizeof(xmlDoc)); 2120 2121 cur->type = XML_HTML_DOCUMENT_NODE; 2122 cur->version = NULL; 2123 cur->intSubset = NULL; 2124 cur->doc = cur; 2125 cur->name = NULL; 2126 cur->children = NULL; 2127 cur->extSubset = NULL; 2128 cur->oldNs = NULL; 2129 cur->encoding = NULL; 2130 cur->standalone = 1; 2131 cur->compression = 0; 2132 cur->ids = NULL; 2133 cur->refs = NULL; 2134 cur->_private = NULL; 2135 cur->charset = XML_CHAR_ENCODING_UTF8; 2136 if ((ExternalID != NULL) || 2137 (URI != NULL)) 2138 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2139 return(cur); 2140} 2141 2142/** 2143 * htmlNewDoc: 2144 * @URI: URI for the dtd, or NULL 2145 * @ExternalID: the external ID of the DTD, or NULL 2146 * 2147 * Creates a new HTML document 2148 * 2149 * Returns a new document 2150 */ 2151htmlDocPtr 2152htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2153 if ((URI == NULL) && (ExternalID == NULL)) 2154 return(htmlNewDocNoDtD( 2155 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2156 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2157 2158 return(htmlNewDocNoDtD(URI, ExternalID)); 2159} 2160 2161 2162/************************************************************************ 2163 * * 2164 * The parser itself * 2165 * Relates to http://www.w3.org/TR/html40 * 2166 * * 2167 ************************************************************************/ 2168 2169/************************************************************************ 2170 * * 2171 * The parser itself * 2172 * * 2173 ************************************************************************/ 2174 2175static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2176 2177/** 2178 * htmlParseHTMLName: 2179 * @ctxt: an HTML parser context 2180 * 2181 * parse an HTML tag or attribute name, note that we 
convert it to lowercase 2182 * since HTML names are not case-sensitive. 2183 * 2184 * Returns the Tag Name parsed or NULL 2185 */ 2186 2187static const xmlChar * 2188htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2189 int i = 0; 2190 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2191 2192 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2193 (CUR != ':')) return(NULL); 2194 2195 while ((i < HTML_PARSER_BUFFER_SIZE) && 2196 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2197 (CUR == ':') || (CUR == '-') || (CUR == '_'))) { 2198 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2199 else loc[i] = CUR; 2200 i++; 2201 2202 NEXT; 2203 } 2204 2205 return(xmlDictLookup(ctxt->dict, loc, i)); 2206} 2207 2208/** 2209 * htmlParseName: 2210 * @ctxt: an HTML parser context 2211 * 2212 * parse an HTML name, this routine is case sensitive. 2213 * 2214 * Returns the Name parsed or NULL 2215 */ 2216 2217static const xmlChar * 2218htmlParseName(htmlParserCtxtPtr ctxt) { 2219 const xmlChar *in; 2220 const xmlChar *ret; 2221 int count = 0; 2222 2223 GROW; 2224 2225 /* 2226 * Accelerator for simple ASCII names 2227 */ 2228 in = ctxt->input->cur; 2229 if (((*in >= 0x61) && (*in <= 0x7A)) || 2230 ((*in >= 0x41) && (*in <= 0x5A)) || 2231 (*in == '_') || (*in == ':')) { 2232 in++; 2233 while (((*in >= 0x61) && (*in <= 0x7A)) || 2234 ((*in >= 0x41) && (*in <= 0x5A)) || 2235 ((*in >= 0x30) && (*in <= 0x39)) || 2236 (*in == '_') || (*in == '-') || 2237 (*in == ':') || (*in == '.')) 2238 in++; 2239 if ((*in > 0) && (*in < 0x80)) { 2240 count = in - ctxt->input->cur; 2241 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 2242 ctxt->input->cur = in; 2243 ctxt->nbChars += count; 2244 ctxt->input->col += count; 2245 return(ret); 2246 } 2247 } 2248 return(htmlParseNameComplex(ctxt)); 2249} 2250 2251static const xmlChar * 2252htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 2253 int len = 0, l; 2254 int c; 2255 int count = 0; 2256 2257 /* 2258 * Handler for more complex cases 2259 */ 2260 GROW; 2261 
c = CUR_CHAR(l); 2262 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2263 (!IS_LETTER(c) && (c != '_') && 2264 (c != ':'))) { 2265 return(NULL); 2266 } 2267 2268 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2269 ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2270 (c == '.') || (c == '-') || 2271 (c == '_') || (c == ':') || 2272 (IS_COMBINING(c)) || 2273 (IS_EXTENDER(c)))) { 2274 if (count++ > 100) { 2275 count = 0; 2276 GROW; 2277 } 2278 len += l; 2279 NEXTL(l); 2280 c = CUR_CHAR(l); 2281 } 2282 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2283} 2284 2285 2286/** 2287 * htmlParseHTMLAttribute: 2288 * @ctxt: an HTML parser context 2289 * @stop: a char stop value 2290 * 2291 * parse an HTML attribute value till the stop (quote), if 2292 * stop is 0 then it stops at the first space 2293 * 2294 * Returns the attribute parsed or NULL 2295 */ 2296 2297static xmlChar * 2298htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2299 xmlChar *buffer = NULL; 2300 int buffer_size = 0; 2301 xmlChar *out = NULL; 2302 const xmlChar *name = NULL; 2303 const xmlChar *cur = NULL; 2304 const htmlEntityDesc * ent; 2305 2306 /* 2307 * allocate a translation buffer. 
2308 */ 2309 buffer_size = HTML_PARSER_BUFFER_SIZE; 2310 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 2311 if (buffer == NULL) { 2312 htmlErrMemory(ctxt, "buffer allocation failed\n"); 2313 return(NULL); 2314 } 2315 out = buffer; 2316 2317 /* 2318 * Ok loop until we reach one of the ending chars 2319 */ 2320 while ((CUR != 0) && (CUR != stop)) { 2321 if ((stop == 0) && (CUR == '>')) break; 2322 if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 2323 if (CUR == '&') { 2324 if (NXT(1) == '#') { 2325 unsigned int c; 2326 int bits; 2327 2328 c = htmlParseCharRef(ctxt); 2329 if (c < 0x80) 2330 { *out++ = c; bits= -6; } 2331 else if (c < 0x800) 2332 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2333 else if (c < 0x10000) 2334 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2335 else 2336 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2337 2338 for ( ; bits >= 0; bits-= 6) { 2339 *out++ = ((c >> bits) & 0x3F) | 0x80; 2340 } 2341 2342 if (out - buffer > buffer_size - 100) { 2343 int indx = out - buffer; 2344 2345 growBuffer(buffer); 2346 out = &buffer[indx]; 2347 } 2348 } else { 2349 ent = htmlParseEntityRef(ctxt, &name); 2350 if (name == NULL) { 2351 *out++ = '&'; 2352 if (out - buffer > buffer_size - 100) { 2353 int indx = out - buffer; 2354 2355 growBuffer(buffer); 2356 out = &buffer[indx]; 2357 } 2358 } else if (ent == NULL) { 2359 *out++ = '&'; 2360 cur = name; 2361 while (*cur != 0) { 2362 if (out - buffer > buffer_size - 100) { 2363 int indx = out - buffer; 2364 2365 growBuffer(buffer); 2366 out = &buffer[indx]; 2367 } 2368 *out++ = *cur++; 2369 } 2370 } else { 2371 unsigned int c; 2372 int bits; 2373 2374 if (out - buffer > buffer_size - 100) { 2375 int indx = out - buffer; 2376 2377 growBuffer(buffer); 2378 out = &buffer[indx]; 2379 } 2380 c = ent->value; 2381 if (c < 0x80) 2382 { *out++ = c; bits= -6; } 2383 else if (c < 0x800) 2384 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2385 else if (c < 0x10000) 2386 { *out++ =((c >> 12) & 0x0F) | 
0xE0; bits= 6; } 2387 else 2388 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2389 2390 for ( ; bits >= 0; bits-= 6) { 2391 *out++ = ((c >> bits) & 0x3F) | 0x80; 2392 } 2393 } 2394 } 2395 } else { 2396 unsigned int c; 2397 int bits, l; 2398 2399 if (out - buffer > buffer_size - 100) { 2400 int indx = out - buffer; 2401 2402 growBuffer(buffer); 2403 out = &buffer[indx]; 2404 } 2405 c = CUR_CHAR(l); 2406 if (c < 0x80) 2407 { *out++ = c; bits= -6; } 2408 else if (c < 0x800) 2409 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2410 else if (c < 0x10000) 2411 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2412 else 2413 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2414 2415 for ( ; bits >= 0; bits-= 6) { 2416 *out++ = ((c >> bits) & 0x3F) | 0x80; 2417 } 2418 NEXT; 2419 } 2420 } 2421 *out++ = 0; 2422 return(buffer); 2423} 2424 2425/** 2426 * htmlParseEntityRef: 2427 * @ctxt: an HTML parser context 2428 * @str: location to store the entity name 2429 * 2430 * parse an HTML ENTITY references 2431 * 2432 * [68] EntityRef ::= '&' Name ';' 2433 * 2434 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, 2435 * if non-NULL *str will have to be freed by the caller. 2436 */ 2437const htmlEntityDesc * 2438htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2439 const xmlChar *name; 2440 const htmlEntityDesc * ent = NULL; 2441 2442 if (str != NULL) *str = NULL; 2443 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2444 2445 if (CUR == '&') { 2446 NEXT; 2447 name = htmlParseName(ctxt); 2448 if (name == NULL) { 2449 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2450 "htmlParseEntityRef: no name\n", NULL, NULL); 2451 } else { 2452 GROW; 2453 if (CUR == ';') { 2454 if (str != NULL) 2455 *str = name; 2456 2457 /* 2458 * Lookup the entity in the table. 2459 */ 2460 ent = htmlEntityLookup(name); 2461 if (ent != NULL) /* OK that's ugly !!! 
*/ 2462 NEXT; 2463 } else { 2464 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2465 "htmlParseEntityRef: expecting ';'\n", 2466 NULL, NULL); 2467 if (str != NULL) 2468 *str = name; 2469 } 2470 } 2471 } 2472 return(ent); 2473} 2474 2475/** 2476 * htmlParseAttValue: 2477 * @ctxt: an HTML parser context 2478 * 2479 * parse a value for an attribute 2480 * Note: the parser won't do substitution of entities here, this 2481 * will be handled later in xmlStringGetNodeList, unless it was 2482 * asked for ctxt->replaceEntities != 0 2483 * 2484 * Returns the AttValue parsed or NULL. 2485 */ 2486 2487static xmlChar * 2488htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2489 xmlChar *ret = NULL; 2490 2491 if (CUR == '"') { 2492 NEXT; 2493 ret = htmlParseHTMLAttribute(ctxt, '"'); 2494 if (CUR != '"') { 2495 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2496 "AttValue: \" expected\n", NULL, NULL); 2497 } else 2498 NEXT; 2499 } else if (CUR == '\'') { 2500 NEXT; 2501 ret = htmlParseHTMLAttribute(ctxt, '\''); 2502 if (CUR != '\'') { 2503 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2504 "AttValue: ' expected\n", NULL, NULL); 2505 } else 2506 NEXT; 2507 } else { 2508 /* 2509 * That's an HTMLism, the attribute value may not be quoted 2510 */ 2511 ret = htmlParseHTMLAttribute(ctxt, 0); 2512 if (ret == NULL) { 2513 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2514 "AttValue: no value found\n", NULL, NULL); 2515 } 2516 } 2517 return(ret); 2518} 2519 2520/** 2521 * htmlParseSystemLiteral: 2522 * @ctxt: an HTML parser context 2523 * 2524 * parse an HTML Literal 2525 * 2526 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2527 * 2528 * Returns the SystemLiteral parsed or NULL 2529 */ 2530 2531static xmlChar * 2532htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2533 const xmlChar *q; 2534 xmlChar *ret = NULL; 2535 2536 if (CUR == '"') { 2537 NEXT; 2538 q = CUR_PTR; 2539 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) 2540 NEXT; 2541 if (!IS_CHAR_CH(CUR)) { 2542 
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2543 "Unfinished SystemLiteral\n", NULL, NULL); 2544 } else { 2545 ret = xmlStrndup(q, CUR_PTR - q); 2546 NEXT; 2547 } 2548 } else if (CUR == '\'') { 2549 NEXT; 2550 q = CUR_PTR; 2551 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) 2552 NEXT; 2553 if (!IS_CHAR_CH(CUR)) { 2554 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2555 "Unfinished SystemLiteral\n", NULL, NULL); 2556 } else { 2557 ret = xmlStrndup(q, CUR_PTR - q); 2558 NEXT; 2559 } 2560 } else { 2561 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2562 " or ' expected\n", NULL, NULL); 2563 } 2564 2565 return(ret); 2566} 2567 2568/** 2569 * htmlParsePubidLiteral: 2570 * @ctxt: an HTML parser context 2571 * 2572 * parse an HTML public literal 2573 * 2574 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2575 * 2576 * Returns the PubidLiteral parsed or NULL. 2577 */ 2578 2579static xmlChar * 2580htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 2581 const xmlChar *q; 2582 xmlChar *ret = NULL; 2583 /* 2584 * Name ::= (Letter | '_') (NameChar)* 2585 */ 2586 if (CUR == '"') { 2587 NEXT; 2588 q = CUR_PTR; 2589 while (IS_PUBIDCHAR_CH(CUR)) NEXT; 2590 if (CUR != '"') { 2591 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2592 "Unfinished PubidLiteral\n", NULL, NULL); 2593 } else { 2594 ret = xmlStrndup(q, CUR_PTR - q); 2595 NEXT; 2596 } 2597 } else if (CUR == '\'') { 2598 NEXT; 2599 q = CUR_PTR; 2600 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) 2601 NEXT; 2602 if (CUR != '\'') { 2603 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2604 "Unfinished PubidLiteral\n", NULL, NULL); 2605 } else { 2606 ret = xmlStrndup(q, CUR_PTR - q); 2607 NEXT; 2608 } 2609 } else { 2610 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2611 "PubidLiteral \" or ' expected\n", NULL, NULL); 2612 } 2613 2614 return(ret); 2615} 2616 2617/** 2618 * htmlParseScript: 2619 * @ctxt: an HTML parser context 2620 * 2621 * parse the content of an HTML SCRIPT or STYLE element 2622 * 
http://www.w3.org/TR/html4/sgml/dtd.html#Script 2623 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2624 * http://www.w3.org/TR/html4/types.html#type-script 2625 * http://www.w3.org/TR/html4/types.html#h-6.15 2626 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 2627 * 2628 * Script data ( %Script; in the DTD) can be the content of the SCRIPT 2629 * element and the value of intrinsic event attributes. User agents must 2630 * not evaluate script data as HTML markup but instead must pass it on as 2631 * data to a script engine. 2632 * NOTES: 2633 * - The content is passed like CDATA 2634 * - the attributes for style and scripting "onXXX" are also described 2635 * as CDATA but SGML allows entities references in attributes so their 2636 * processing is identical as other attributes 2637 */ 2638static void 2639htmlParseScript(htmlParserCtxtPtr ctxt) { 2640 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2641 int nbchar = 0; 2642 int cur,l; 2643 2644 SHRINK; 2645 cur = CUR_CHAR(l); 2646 while (IS_CHAR_CH(cur)) { 2647 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') && 2648 (NXT(3) == '-')) { 2649 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2650 if (ctxt->sax->cdataBlock!= NULL) { 2651 /* 2652 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2653 */ 2654 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2655 } else if (ctxt->sax->characters != NULL) { 2656 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2657 } 2658 } 2659 nbchar = 0; 2660 htmlParseComment(ctxt); 2661 cur = CUR_CHAR(l); 2662 continue; 2663 } else if ((cur == '<') && (NXT(1) == '/')) { 2664 /* 2665 * One should break here, the specification is clear: 2666 * Authors should therefore escape "</" within the content. 2667 * Escape mechanisms are specific to each scripting or 2668 * style sheet language. 
2669 * 2670 * In recovery mode, only break if end tag match the 2671 * current tag, effectively ignoring all tags inside the 2672 * script/style block and treating the entire block as 2673 * CDATA. 2674 */ 2675 if (ctxt->recovery) { 2676 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2677 xmlStrlen(ctxt->name)) == 0) 2678 { 2679 break; /* while */ 2680 } else { 2681 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2682 "Element %s embeds close tag\n", 2683 ctxt->name, NULL); 2684 } 2685 } else { 2686 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2687 ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2688 { 2689 break; /* while */ 2690 } 2691 } 2692 } 2693 COPY_BUF(l,buf,nbchar,cur); 2694 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2695 if (ctxt->sax->cdataBlock!= NULL) { 2696 /* 2697 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2698 */ 2699 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2700 } else if (ctxt->sax->characters != NULL) { 2701 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2702 } 2703 nbchar = 0; 2704 } 2705 GROW; 2706 NEXTL(l); 2707 cur = CUR_CHAR(l); 2708 } 2709 2710 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { 2711 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2712 "Invalid char in CDATA 0x%X\n", cur); 2713 NEXT; 2714 } 2715 2716 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2717 if (ctxt->sax->cdataBlock!= NULL) { 2718 /* 2719 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2720 */ 2721 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2722 } else if (ctxt->sax->characters != NULL) { 2723 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2724 } 2725 } 2726} 2727 2728 2729/** 2730 * htmlParseCharData: 2731 * @ctxt: an HTML parser context 2732 * 2733 * parse a CharData section. 2734 * if we are within a CDATA section ']]>' marks an end of section. 
2735 * 2736 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 2737 */ 2738 2739static void 2740htmlParseCharData(htmlParserCtxtPtr ctxt) { 2741 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2742 int nbchar = 0; 2743 int cur, l; 2744 2745 SHRINK; 2746 cur = CUR_CHAR(l); 2747 while (((cur != '<') || (ctxt->token == '<')) && 2748 ((cur != '&') || (ctxt->token == '&')) && 2749 (IS_CHAR(cur))) { 2750 COPY_BUF(l,buf,nbchar,cur); 2751 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2752 /* 2753 * Ok the segment is to be consumed as chars. 2754 */ 2755 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2756 if (areBlanks(ctxt, buf, nbchar)) { 2757 if (ctxt->sax->ignorableWhitespace != NULL) 2758 ctxt->sax->ignorableWhitespace(ctxt->userData, 2759 buf, nbchar); 2760 } else { 2761 htmlCheckParagraph(ctxt); 2762 if (ctxt->sax->characters != NULL) 2763 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2764 } 2765 } 2766 nbchar = 0; 2767 } 2768 NEXTL(l); 2769 cur = CUR_CHAR(l); 2770 if (cur == 0) { 2771 SHRINK; 2772 GROW; 2773 cur = CUR_CHAR(l); 2774 } 2775 } 2776 if (nbchar != 0) { 2777 buf[nbchar] = 0; 2778 2779 /* 2780 * Ok the segment is to be consumed as chars. 
2781 */ 2782 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2783 if (areBlanks(ctxt, buf, nbchar)) { 2784 if (ctxt->sax->ignorableWhitespace != NULL) 2785 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); 2786 } else { 2787 htmlCheckParagraph(ctxt); 2788 if (ctxt->sax->characters != NULL) 2789 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2790 } 2791 } 2792 } else { 2793 /* 2794 * Loop detection 2795 */ 2796 if (cur == 0) 2797 ctxt->instate = XML_PARSER_EOF; 2798 } 2799} 2800 2801/** 2802 * htmlParseExternalID: 2803 * @ctxt: an HTML parser context 2804 * @publicID: a xmlChar** receiving PubidLiteral 2805 * 2806 * Parse an External ID or a Public ID 2807 * 2808 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 2809 * | 'PUBLIC' S PubidLiteral S SystemLiteral 2810 * 2811 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 2812 * 2813 * Returns the function returns SystemLiteral and in the second 2814 * case publicID receives PubidLiteral, is strict is off 2815 * it is possible to return NULL and have publicID set. 
*/

static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
    xmlChar *URI = NULL;

    /* the keywords are matched through UPPER/UPP() */
    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
         (UPP(2) == 'S') && (UPP(3) == 'T') &&
         (UPP(4) == 'E') && (UPP(5) == 'M')) {
        SKIP(6);
        /* a missing blank is reported but parsing goes on */
        if (!IS_BLANK_CH(CUR)) {
            htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
                         "Space required after 'SYSTEM'\n", NULL, NULL);
        }
        SKIP_BLANKS;
        URI = htmlParseSystemLiteral(ctxt);
        if (URI == NULL) {
            htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
                         "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
        }
    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
               (UPP(2) == 'B') && (UPP(3) == 'L') &&
               (UPP(4) == 'I') && (UPP(5) == 'C')) {
        SKIP(6);
        if (!IS_BLANK_CH(CUR)) {
            htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
                         "Space required after 'PUBLIC'\n", NULL, NULL);
        }
        SKIP_BLANKS;
        *publicID = htmlParsePubidLiteral(ctxt);
        if (*publicID == NULL) {
            htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
                         "htmlParseExternalID: PUBLIC, no Public Identifier\n",
                         NULL, NULL);
        }
        SKIP_BLANKS;
        /* the system literal is only parsed when a quote follows */
        if ((CUR == '"') || (CUR == '\'')) {
            URI = htmlParseSystemLiteral(ctxt);
        }
    }
    return(URI);
}

/**
 * htmlParsePI:
 * @ctxt:  an HTML parser context
 *
 * parse a Processing Instruction. In this parser the instruction
 * ends at the first '>'.
 *
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
 */
static void
htmlParsePI(htmlParserCtxtPtr ctxt) {
    xmlChar *buf = NULL;
    int len = 0;
    int size = HTML_PARSER_BUFFER_SIZE;
    int cur, l;
    const xmlChar *target;
    xmlParserInputState state;
    int count = 0;

    if ((RAW == '<') && (NXT(1) == '?')) {
        /* the previous state is restored on every exit path below */
        state = ctxt->instate;
        ctxt->instate = XML_PARSER_PI;
        /*
         * this is a Processing Instruction.
         */
        SKIP(2);
        SHRINK;

        /*
         * Parse the target name and check for special support like
         * namespace.
         */
        target = htmlParseName(ctxt);
        if (target != NULL) {
            /* target immediately followed by '>' : PI without data */
            if (RAW == '>') {
                SKIP(1);

                /*
                 * SAX: PI detected.
                 */
                if ((ctxt->sax) && (!ctxt->disableSAX) &&
                    (ctxt->sax->processingInstruction != NULL))
                    ctxt->sax->processingInstruction(ctxt->userData,
                                                     target, NULL);
                ctxt->instate = state;
                return;
            }
            buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
            if (buf == NULL) {
                htmlErrMemory(ctxt, NULL);
                ctxt->instate = state;
                return;
            }
            cur = CUR;
            if (!IS_BLANK(cur)) {
                htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
                          "ParsePI: PI %s space expected\n", target, NULL);
            }
            SKIP_BLANKS;
            cur = CUR_CHAR(l);
            /* collect the PI data up to the closing '>' */
            while (IS_CHAR(cur) && (cur != '>')) {
                /* the buffer is doubled when fewer than 5 bytes remain */
                if (len + 5 >= size) {
                    xmlChar *tmp;

                    size *= 2;
                    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
                    if (tmp == NULL) {
                        htmlErrMemory(ctxt, NULL);
                        xmlFree(buf);
                        ctxt->instate = state;
                        return;
                    }
                    buf = tmp;
                }
                /* GROW is invoked once every 50 characters */
                count++;
                if (count > 50) {
                    GROW;
                    count = 0;
                }
                COPY_BUF(l,buf,len,cur);
                NEXTL(l);
                cur = CUR_CHAR(l);
                if (cur == 0) {
                    SHRINK;
                    GROW;
                    cur = CUR_CHAR(l);
                }
            }
            buf[len] = 0;
            if (cur != '>') {
                htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
                      "ParsePI: PI %s never end ...\n", target, NULL);
            } else {
                SKIP(1);

                /*
                 * SAX: PI detected.
                 */
                if ((ctxt->sax) && (!ctxt->disableSAX) &&
                    (ctxt->sax->processingInstruction != NULL))
                    ctxt->sax->processingInstruction(ctxt->userData,
                                                     target, buf);
            }
            xmlFree(buf);
        } else {
            htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
                         "PI is not started correctly", NULL, NULL);
        }
        ctxt->instate = state;
    }
}

/**
 * htmlParseComment:
 * @ctxt:  an HTML parser context
 *
 * Parse an XML (SGML) comment <!-- .... -->
 *
 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
 */
static void
htmlParseComment(htmlParserCtxtPtr ctxt) {
    xmlChar *buf = NULL;
    int len;
    int size = HTML_PARSER_BUFFER_SIZE;
    int q, ql;
    int r, rl;
    int cur, l;
    xmlParserInputState state;

    /*
     * Check that there is a comment right here.
     */
    if ((RAW != '<') || (NXT(1) != '!') ||
        (NXT(2) != '-') || (NXT(3) != '-')) return;

    state = ctxt->instate;
    ctxt->instate = XML_PARSER_COMMENT;
    SHRINK;
    SKIP(4);
    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
    if (buf == NULL) {
        htmlErrMemory(ctxt, "buffer allocation failed\n");
        ctxt->instate = state;
        return;
    }
    q = CUR_CHAR(ql);
    NEXTL(ql);
    r = CUR_CHAR(rl);
    NEXTL(rl);
    cur = CUR_CHAR(l);
    len = 0;
    while (IS_CHAR(cur) &&
           ((cur != '>') ||
            (r != '-') || (q != '-'))) {
        if (len + 5 >= size) {
            xmlChar *tmp;

            size *= 2;
            tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
            if (tmp == NULL) {
                xmlFree(buf);
                htmlErrMemory(ctxt, "growing buffer failed\n");
                ctxt->instate = state;
                return;
            }
            buf = tmp;
        }
        COPY_BUF(ql,buf,len,q);
        q = r;
        ql = rl;
        r = cur;
        rl = l;
        NEXTL(l);
        cur = CUR_CHAR(l);
        if (cur == 0) {
            SHRINK;
            GROW;
            cur = CUR_CHAR(l);
        }
    }
    buf[len] = 0;
    if
(!IS_CHAR(cur)) { 3041 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3042 "Comment not terminated \n<!--%.50s\n", buf, NULL); 3043 xmlFree(buf); 3044 } else { 3045 NEXT; 3046 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && 3047 (!ctxt->disableSAX)) 3048 ctxt->sax->comment(ctxt->userData, buf); 3049 xmlFree(buf); 3050 } 3051 ctxt->instate = state; 3052} 3053 3054/** 3055 * htmlParseCharRef: 3056 * @ctxt: an HTML parser context 3057 * 3058 * parse Reference declarations 3059 * 3060 * [66] CharRef ::= '&#' [0-9]+ ';' | 3061 * '&#x' [0-9a-fA-F]+ ';' 3062 * 3063 * Returns the value parsed (as an int) 3064 */ 3065int 3066htmlParseCharRef(htmlParserCtxtPtr ctxt) { 3067 int val = 0; 3068 3069 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3070 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3071 "htmlParseCharRef: context error\n", 3072 NULL, NULL); 3073 return(0); 3074 } 3075 if ((CUR == '&') && (NXT(1) == '#') && 3076 ((NXT(2) == 'x') || NXT(2) == 'X')) { 3077 SKIP(3); 3078 while (CUR != ';') { 3079 if ((CUR >= '0') && (CUR <= '9')) 3080 val = val * 16 + (CUR - '0'); 3081 else if ((CUR >= 'a') && (CUR <= 'f')) 3082 val = val * 16 + (CUR - 'a') + 10; 3083 else if ((CUR >= 'A') && (CUR <= 'F')) 3084 val = val * 16 + (CUR - 'A') + 10; 3085 else { 3086 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, 3087 "htmlParseCharRef: invalid hexadecimal value\n", 3088 NULL, NULL); 3089 return(0); 3090 } 3091 NEXT; 3092 } 3093 if (CUR == ';') 3094 NEXT; 3095 } else if ((CUR == '&') && (NXT(1) == '#')) { 3096 SKIP(2); 3097 while (CUR != ';') { 3098 if ((CUR >= '0') && (CUR <= '9')) 3099 val = val * 10 + (CUR - '0'); 3100 else { 3101 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, 3102 "htmlParseCharRef: invalid decimal value\n", 3103 NULL, NULL); 3104 return(0); 3105 } 3106 NEXT; 3107 } 3108 if (CUR == ';') 3109 NEXT; 3110 } else { 3111 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF, 3112 "htmlParseCharRef: invalid value\n", NULL, NULL); 3113 } 3114 /* 3115 * Check the value IS_CHAR 
... 3116 */ 3117 if (IS_CHAR(val)) { 3118 return(val); 3119 } else { 3120 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3121 "htmlParseCharRef: invalid xmlChar value %d\n", 3122 val); 3123 } 3124 return(0); 3125} 3126 3127 3128/** 3129 * htmlParseDocTypeDecl: 3130 * @ctxt: an HTML parser context 3131 * 3132 * parse a DOCTYPE declaration 3133 * 3134 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 3135 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 3136 */ 3137 3138static void 3139htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { 3140 const xmlChar *name; 3141 xmlChar *ExternalID = NULL; 3142 xmlChar *URI = NULL; 3143 3144 /* 3145 * We know that '<!DOCTYPE' has been detected. 3146 */ 3147 SKIP(9); 3148 3149 SKIP_BLANKS; 3150 3151 /* 3152 * Parse the DOCTYPE name. 3153 */ 3154 name = htmlParseName(ctxt); 3155 if (name == NULL) { 3156 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3157 "htmlParseDocTypeDecl : no DOCTYPE name !\n", 3158 NULL, NULL); 3159 } 3160 /* 3161 * Check that upper(name) == "HTML" !!!!!!!!!!!!! 3162 */ 3163 3164 SKIP_BLANKS; 3165 3166 /* 3167 * Check for SystemID and ExternalID 3168 */ 3169 URI = htmlParseExternalID(ctxt, &ExternalID); 3170 SKIP_BLANKS; 3171 3172 /* 3173 * We should be at the end of the DOCTYPE declaration. 3174 */ 3175 if (CUR != '>') { 3176 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, 3177 "DOCTYPE improperly terminated\n", NULL, NULL); 3178 /* We shouldn't try to resynchronize ... 
*/ 3179 } 3180 NEXT; 3181 3182 /* 3183 * Create or update the document accordingly to the DOCTYPE 3184 */ 3185 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && 3186 (!ctxt->disableSAX)) 3187 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); 3188 3189 /* 3190 * Cleanup, since we don't use all those identifiers 3191 */ 3192 if (URI != NULL) xmlFree(URI); 3193 if (ExternalID != NULL) xmlFree(ExternalID); 3194} 3195 3196/** 3197 * htmlParseAttribute: 3198 * @ctxt: an HTML parser context 3199 * @value: a xmlChar ** used to store the value of the attribute 3200 * 3201 * parse an attribute 3202 * 3203 * [41] Attribute ::= Name Eq AttValue 3204 * 3205 * [25] Eq ::= S? '=' S? 3206 * 3207 * With namespace: 3208 * 3209 * [NS 11] Attribute ::= QName Eq AttValue 3210 * 3211 * Also the case QName == xmlns:??? is handled independently as a namespace 3212 * definition. 3213 * 3214 * Returns the attribute name, and the value in *value. 3215 */ 3216 3217static const xmlChar * 3218htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { 3219 const xmlChar *name; 3220 xmlChar *val = NULL; 3221 3222 *value = NULL; 3223 name = htmlParseHTMLName(ctxt); 3224 if (name == NULL) { 3225 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3226 "error parsing attribute name\n", NULL, NULL); 3227 return(NULL); 3228 } 3229 3230 /* 3231 * read the value 3232 */ 3233 SKIP_BLANKS; 3234 if (CUR == '=') { 3235 NEXT; 3236 SKIP_BLANKS; 3237 val = htmlParseAttValue(ctxt); 3238 } else if (htmlIsBooleanAttr(name)) { 3239 /* 3240 * assume a minimized attribute 3241 */ 3242 val = xmlStrdup(name); 3243 } 3244 3245 *value = val; 3246 return(name); 3247} 3248 3249/** 3250 * htmlCheckEncoding: 3251 * @ctxt: an HTML parser context 3252 * @attvalue: the attribute value 3253 * 3254 * Checks an http-equiv attribute from a Meta tag to detect 3255 * the encoding 3256 * If a new encoding is detected the parser is switched to decode 3257 * it and pass UTF8 3258 */ 3259static void 
3260htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { 3261 const xmlChar *encoding; 3262 3263 if ((ctxt == NULL) || (attvalue == NULL)) 3264 return; 3265 3266 /* do not change encoding */ 3267 if (ctxt->input->encoding != NULL) 3268 return; 3269 3270 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset="); 3271 if (encoding != NULL) { 3272 encoding += 8; 3273 } else { 3274 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset ="); 3275 if (encoding != NULL) 3276 encoding += 9; 3277 } 3278 if (encoding != NULL) { 3279 xmlCharEncoding enc; 3280 xmlCharEncodingHandlerPtr handler; 3281 3282 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 3283 3284 if (ctxt->input->encoding != NULL) 3285 xmlFree((xmlChar *) ctxt->input->encoding); 3286 ctxt->input->encoding = xmlStrdup(encoding); 3287 3288 enc = xmlParseCharEncoding((const char *) encoding); 3289 /* 3290 * registered set of known encodings 3291 */ 3292 if (enc != XML_CHAR_ENCODING_ERROR) { 3293 if (((enc == XML_CHAR_ENCODING_UTF16LE) || 3294 (enc == XML_CHAR_ENCODING_UTF16BE) || 3295 (enc == XML_CHAR_ENCODING_UCS4LE) || 3296 (enc == XML_CHAR_ENCODING_UCS4BE)) && 3297 (ctxt->input->buf != NULL) && 3298 (ctxt->input->buf->encoder == NULL)) { 3299 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3300 "htmlCheckEncoding: wrong encoding meta\n", 3301 NULL, NULL); 3302 } else { 3303 xmlSwitchEncoding(ctxt, enc); 3304 } 3305 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3306 } else { 3307 /* 3308 * fallback for unknown encodings 3309 */ 3310 handler = xmlFindCharEncodingHandler((const char *) encoding); 3311 if (handler != NULL) { 3312 xmlSwitchToEncoding(ctxt, handler); 3313 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3314 } else { 3315 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; 3316 } 3317 } 3318 3319 if ((ctxt->input->buf != NULL) && 3320 (ctxt->input->buf->encoder != NULL) && 3321 (ctxt->input->buf->raw != NULL) && 3322 (ctxt->input->buf->buffer != NULL)) { 3323 int nbchars; 3324 int processed; 3325 3326 
/* 3327 * convert as much as possible to the parser reading buffer. 3328 */ 3329 processed = ctxt->input->cur - ctxt->input->base; 3330 xmlBufferShrink(ctxt->input->buf->buffer, processed); 3331 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, 3332 ctxt->input->buf->buffer, 3333 ctxt->input->buf->raw); 3334 if (nbchars < 0) { 3335 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3336 "htmlCheckEncoding: encoder error\n", 3337 NULL, NULL); 3338 } 3339 ctxt->input->base = 3340 ctxt->input->cur = ctxt->input->buf->buffer->content; 3341 } 3342 } 3343} 3344 3345/** 3346 * htmlCheckMeta: 3347 * @ctxt: an HTML parser context 3348 * @atts: the attributes values 3349 * 3350 * Checks an attributes from a Meta tag 3351 */ 3352static void 3353htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { 3354 int i; 3355 const xmlChar *att, *value; 3356 int http = 0; 3357 const xmlChar *content = NULL; 3358 3359 if ((ctxt == NULL) || (atts == NULL)) 3360 return; 3361 3362 i = 0; 3363 att = atts[i++]; 3364 while (att != NULL) { 3365 value = atts[i++]; 3366 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) 3367 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 3368 http = 1; 3369 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) 3370 content = value; 3371 att = atts[i++]; 3372 } 3373 if ((http) && (content != NULL)) 3374 htmlCheckEncoding(ctxt, content); 3375 3376} 3377 3378/** 3379 * htmlParseStartTag: 3380 * @ctxt: an HTML parser context 3381 * 3382 * parse a start of tag either for rule element or 3383 * EmptyElement. In both case we don't parse the tag closing chars. 3384 * 3385 * [40] STag ::= '<' Name (S Attribute)* S? '>' 3386 * 3387 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 3388 * 3389 * With namespace: 3390 * 3391 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' 3392 * 3393 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>' 3394 * 3395 * Returns 0 in case of success and -1 in case of error. 
3396 */ 3397 3398static int 3399htmlParseStartTag(htmlParserCtxtPtr ctxt) { 3400 const xmlChar *name; 3401 const xmlChar *attname; 3402 xmlChar *attvalue; 3403 const xmlChar **atts; 3404 int nbatts = 0; 3405 int maxatts; 3406 int meta = 0; 3407 int i; 3408 3409 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3410 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3411 "htmlParseStartTag: context error\n", NULL, NULL); 3412 return -1; 3413 } 3414 if (CUR != '<') return -1; 3415 NEXT; 3416 3417 atts = ctxt->atts; 3418 maxatts = ctxt->maxatts; 3419 3420 GROW; 3421 name = htmlParseHTMLName(ctxt); 3422 if (name == NULL) { 3423 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3424 "htmlParseStartTag: invalid element name\n", 3425 NULL, NULL); 3426 /* Dump the bogus tag like browsers do */ 3427 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 3428 NEXT; 3429 return -1; 3430 } 3431 if (xmlStrEqual(name, BAD_CAST"meta")) 3432 meta = 1; 3433 3434 /* 3435 * Check for auto-closure of HTML elements. 3436 */ 3437 htmlAutoClose(ctxt, name); 3438 3439 /* 3440 * Check for implied HTML elements. 
3441 */ 3442 htmlCheckImplied(ctxt, name); 3443 3444 /* 3445 * Avoid html at any level > 0, head at any level != 1 3446 * or any attempt to recurse body 3447 */ 3448 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { 3449 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3450 "htmlParseStartTag: misplaced <html> tag\n", 3451 name, NULL); 3452 return 0; 3453 } 3454 if ((ctxt->nameNr != 1) && 3455 (xmlStrEqual(name, BAD_CAST"head"))) { 3456 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3457 "htmlParseStartTag: misplaced <head> tag\n", 3458 name, NULL); 3459 return 0; 3460 } 3461 if (xmlStrEqual(name, BAD_CAST"body")) { 3462 int indx; 3463 for (indx = 0;indx < ctxt->nameNr;indx++) { 3464 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { 3465 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3466 "htmlParseStartTag: misplaced <body> tag\n", 3467 name, NULL); 3468 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 3469 NEXT; 3470 return 0; 3471 } 3472 } 3473 } 3474 3475 /* 3476 * Now parse the attributes, it ends up with the ending 3477 * 3478 * (S Attribute)* S? 
3479 */ 3480 SKIP_BLANKS; 3481 while ((IS_CHAR_CH(CUR)) && 3482 (CUR != '>') && 3483 ((CUR != '/') || (NXT(1) != '>'))) { 3484 long cons = ctxt->nbChars; 3485 3486 GROW; 3487 attname = htmlParseAttribute(ctxt, &attvalue); 3488 if (attname != NULL) { 3489 3490 /* 3491 * Well formedness requires at most one declaration of an attribute 3492 */ 3493 for (i = 0; i < nbatts;i += 2) { 3494 if (xmlStrEqual(atts[i], attname)) { 3495 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, 3496 "Attribute %s redefined\n", attname, NULL); 3497 if (attvalue != NULL) 3498 xmlFree(attvalue); 3499 goto failed; 3500 } 3501 } 3502 3503 /* 3504 * Add the pair to atts 3505 */ 3506 if (atts == NULL) { 3507 maxatts = 22; /* allow for 10 attrs by default */ 3508 atts = (const xmlChar **) 3509 xmlMalloc(maxatts * sizeof(xmlChar *)); 3510 if (atts == NULL) { 3511 htmlErrMemory(ctxt, NULL); 3512 if (attvalue != NULL) 3513 xmlFree(attvalue); 3514 goto failed; 3515 } 3516 ctxt->atts = atts; 3517 ctxt->maxatts = maxatts; 3518 } else if (nbatts + 4 > maxatts) { 3519 const xmlChar **n; 3520 3521 maxatts *= 2; 3522 n = (const xmlChar **) xmlRealloc((void *) atts, 3523 maxatts * sizeof(const xmlChar *)); 3524 if (n == NULL) { 3525 htmlErrMemory(ctxt, NULL); 3526 if (attvalue != NULL) 3527 xmlFree(attvalue); 3528 goto failed; 3529 } 3530 atts = n; 3531 ctxt->atts = atts; 3532 ctxt->maxatts = maxatts; 3533 } 3534 atts[nbatts++] = attname; 3535 atts[nbatts++] = attvalue; 3536 atts[nbatts] = NULL; 3537 atts[nbatts + 1] = NULL; 3538 } 3539 else { 3540 if (attvalue != NULL) 3541 xmlFree(attvalue); 3542 /* Dump the bogus attribute string up to the next blank or 3543 * the end of the tag. 
*/ 3544 while ((IS_CHAR_CH(CUR)) && 3545 !(IS_BLANK_CH(CUR)) && (CUR != '>') && 3546 ((CUR != '/') || (NXT(1) != '>'))) 3547 NEXT; 3548 } 3549 3550failed: 3551 SKIP_BLANKS; 3552 if (cons == ctxt->nbChars) { 3553 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3554 "htmlParseStartTag: problem parsing attributes\n", 3555 NULL, NULL); 3556 break; 3557 } 3558 } 3559 3560 /* 3561 * Handle specific association to the META tag 3562 */ 3563 if (meta) 3564 htmlCheckMeta(ctxt, atts); 3565 3566 /* 3567 * SAX: Start of Element ! 3568 */ 3569 htmlnamePush(ctxt, name); 3570 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { 3571 if (nbatts != 0) 3572 ctxt->sax->startElement(ctxt->userData, name, atts); 3573 else 3574 ctxt->sax->startElement(ctxt->userData, name, NULL); 3575 } 3576 3577 if (atts != NULL) { 3578 for (i = 1;i < nbatts;i += 2) { 3579 if (atts[i] != NULL) 3580 xmlFree((xmlChar *) atts[i]); 3581 } 3582 } 3583 3584 return 0; 3585} 3586 3587/** 3588 * htmlParseEndTag: 3589 * @ctxt: an HTML parser context 3590 * 3591 * parse an end of tag 3592 * 3593 * [42] ETag ::= '</' Name S? '>' 3594 * 3595 * With namespace 3596 * 3597 * [NS 9] ETag ::= '</' QName S? '>' 3598 * 3599 * Returns 1 if the current level should be closed. 3600 */ 3601 3602static int 3603htmlParseEndTag(htmlParserCtxtPtr ctxt) 3604{ 3605 const xmlChar *name; 3606 const xmlChar *oldname; 3607 int i, ret; 3608 3609 if ((CUR != '<') || (NXT(1) != '/')) { 3610 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, 3611 "htmlParseEndTag: '</' not found\n", NULL, NULL); 3612 return (0); 3613 } 3614 SKIP(2); 3615 3616 name = htmlParseHTMLName(ctxt); 3617 if (name == NULL) 3618 return (0); 3619 3620 /* 3621 * We should definitely be at the ending "S? '>'" part 3622 */ 3623 SKIP_BLANKS; 3624 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { 3625 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 3626 "End tag : expected '>'\n", NULL, NULL); 3627 if (ctxt->recovery) { 3628 /* 3629 * We're not at the ending > !! 
3630 * Error, unless in recover mode where we search forwards 3631 * until we find a > 3632 */ 3633 while (CUR != '\0' && CUR != '>') NEXT; 3634 NEXT; 3635 } 3636 } else 3637 NEXT; 3638 3639 /* 3640 * If the name read is not one of the element in the parsing stack 3641 * then return, it's just an error. 3642 */ 3643 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 3644 if (xmlStrEqual(name, ctxt->nameTab[i])) 3645 break; 3646 } 3647 if (i < 0) { 3648 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3649 "Unexpected end tag : %s\n", name, NULL); 3650 return (0); 3651 } 3652 3653 3654 /* 3655 * Check for auto-closure of HTML elements. 3656 */ 3657 3658 htmlAutoCloseOnClose(ctxt, name); 3659 3660 /* 3661 * Well formedness constraints, opening and closing must match. 3662 * With the exception that the autoclose may have popped stuff out 3663 * of the stack. 3664 */ 3665 if (!xmlStrEqual(name, ctxt->name)) { 3666 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { 3667 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3668 "Opening and ending tag mismatch: %s and %s\n", 3669 name, ctxt->name); 3670 } 3671 } 3672 3673 /* 3674 * SAX: End of Tag 3675 */ 3676 oldname = ctxt->name; 3677 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { 3678 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 3679 ctxt->sax->endElement(ctxt->userData, name); 3680 htmlnamePop(ctxt); 3681 ret = 1; 3682 } else { 3683 ret = 0; 3684 } 3685 3686 return (ret); 3687} 3688 3689 3690/** 3691 * htmlParseReference: 3692 * @ctxt: an HTML parser context 3693 * 3694 * parse and handle entity references in content, 3695 * this will end-up in a call to character() since this is either a 3696 * CharRef, or a predefined entity. 
3697 */ 3698static void 3699htmlParseReference(htmlParserCtxtPtr ctxt) { 3700 const htmlEntityDesc * ent; 3701 xmlChar out[6]; 3702 const xmlChar *name; 3703 if (CUR != '&') return; 3704 3705 if (NXT(1) == '#') { 3706 unsigned int c; 3707 int bits, i = 0; 3708 3709 c = htmlParseCharRef(ctxt); 3710 if (c == 0) 3711 return; 3712 3713 if (c < 0x80) { out[i++]= c; bits= -6; } 3714 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3715 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3716 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3717 3718 for ( ; bits >= 0; bits-= 6) { 3719 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3720 } 3721 out[i] = 0; 3722 3723 htmlCheckParagraph(ctxt); 3724 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3725 ctxt->sax->characters(ctxt->userData, out, i); 3726 } else { 3727 ent = htmlParseEntityRef(ctxt, &name); 3728 if (name == NULL) { 3729 htmlCheckParagraph(ctxt); 3730 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3731 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 3732 return; 3733 } 3734 if ((ent == NULL) || !(ent->value > 0)) { 3735 htmlCheckParagraph(ctxt); 3736 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { 3737 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 3738 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); 3739 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */ 3740 } 3741 } else { 3742 unsigned int c; 3743 int bits, i = 0; 3744 3745 c = ent->value; 3746 if (c < 0x80) 3747 { out[i++]= c; bits= -6; } 3748 else if (c < 0x800) 3749 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3750 else if (c < 0x10000) 3751 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3752 else 3753 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3754 3755 for ( ; bits >= 0; bits-= 6) { 3756 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3757 } 3758 out[i] = 0; 3759 3760 htmlCheckParagraph(ctxt); 3761 if ((ctxt->sax != NULL) && 
(ctxt->sax->characters != NULL)) 3762 ctxt->sax->characters(ctxt->userData, out, i); 3763 } 3764 } 3765} 3766 3767/** 3768 * htmlParseContent: 3769 * @ctxt: an HTML parser context 3770 * 3771 * Parse a content: comment, sub-element, reference or text. 3772 */ 3773 3774static void 3775htmlParseContent(htmlParserCtxtPtr ctxt) { 3776 xmlChar *currentNode; 3777 int depth; 3778 3779 currentNode = xmlStrdup(ctxt->name); 3780 depth = ctxt->nameNr; 3781 while (1) { 3782 long cons = ctxt->nbChars; 3783 3784 GROW; 3785 /* 3786 * Our tag or one of it's parent or children is ending. 3787 */ 3788 if ((CUR == '<') && (NXT(1) == '/')) { 3789 if (htmlParseEndTag(ctxt) && 3790 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 3791 if (currentNode != NULL) 3792 xmlFree(currentNode); 3793 return; 3794 } 3795 continue; /* while */ 3796 } 3797 3798 /* 3799 * Has this node been popped out during parsing of 3800 * the next element 3801 */ 3802 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 3803 (!xmlStrEqual(currentNode, ctxt->name))) 3804 { 3805 if (currentNode != NULL) xmlFree(currentNode); 3806 return; 3807 } 3808 3809 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 3810 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 3811 /* 3812 * Handle SCRIPT/STYLE separately 3813 */ 3814 htmlParseScript(ctxt); 3815 } else { 3816 /* 3817 * Sometimes DOCTYPE arrives in the middle of the document 3818 */ 3819 if ((CUR == '<') && (NXT(1) == '!') && 3820 (UPP(2) == 'D') && (UPP(3) == 'O') && 3821 (UPP(4) == 'C') && (UPP(5) == 'T') && 3822 (UPP(6) == 'Y') && (UPP(7) == 'P') && 3823 (UPP(8) == 'E')) { 3824 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3825 "Misplaced DOCTYPE declaration\n", 3826 BAD_CAST "DOCTYPE" , NULL); 3827 htmlParseDocTypeDecl(ctxt); 3828 } 3829 3830 /* 3831 * First case : a comment 3832 */ 3833 if ((CUR == '<') && (NXT(1) == '!') && 3834 (NXT(2) == '-') && (NXT(3) == '-')) { 3835 htmlParseComment(ctxt); 3836 } 3837 3838 /* 3839 * Second case : a Processing 
Instruction. 3840 */ 3841 else if ((CUR == '<') && (NXT(1) == '?')) { 3842 htmlParsePI(ctxt); 3843 } 3844 3845 /* 3846 * Third case : a sub-element. 3847 */ 3848 else if (CUR == '<') { 3849 htmlParseElement(ctxt); 3850 } 3851 3852 /* 3853 * Fourth case : a reference. If if has not been resolved, 3854 * parsing returns it's Name, create the node 3855 */ 3856 else if (CUR == '&') { 3857 htmlParseReference(ctxt); 3858 } 3859 3860 /* 3861 * Fifth case : end of the resource 3862 */ 3863 else if (CUR == 0) { 3864 htmlAutoCloseOnEnd(ctxt); 3865 break; 3866 } 3867 3868 /* 3869 * Last case, text. Note that References are handled directly. 3870 */ 3871 else { 3872 htmlParseCharData(ctxt); 3873 } 3874 3875 if (cons == ctxt->nbChars) { 3876 if (ctxt->node != NULL) { 3877 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3878 "detected an error in element content\n", 3879 NULL, NULL); 3880 } 3881 break; 3882 } 3883 } 3884 GROW; 3885 } 3886 if (currentNode != NULL) xmlFree(currentNode); 3887} 3888 3889/** 3890 * htmlParseContent: 3891 * @ctxt: an HTML parser context 3892 * 3893 * Parse a content: comment, sub-element, reference or text. 
3894 */ 3895 3896void 3897__htmlParseContent(void *ctxt) { 3898 if (ctxt != NULL) 3899 htmlParseContent((htmlParserCtxtPtr) ctxt); 3900} 3901 3902/** 3903 * htmlParseElement: 3904 * @ctxt: an HTML parser context 3905 * 3906 * parse an HTML element, this is highly recursive 3907 * 3908 * [39] element ::= EmptyElemTag | STag content ETag 3909 * 3910 * [41] Attribute ::= Name Eq AttValue 3911 */ 3912 3913void 3914htmlParseElement(htmlParserCtxtPtr ctxt) { 3915 const xmlChar *name; 3916 xmlChar *currentNode = NULL; 3917 const htmlElemDesc * info; 3918 htmlParserNodeInfo node_info; 3919 int failed; 3920 int depth; 3921 const xmlChar *oldptr; 3922 3923 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3924 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3925 "htmlParseElement: context error\n", NULL, NULL); 3926 return; 3927 } 3928 /* Capture start position */ 3929 if (ctxt->record_info) { 3930 node_info.begin_pos = ctxt->input->consumed + 3931 (CUR_PTR - ctxt->input->base); 3932 node_info.begin_line = ctxt->input->line; 3933 } 3934 3935 failed = htmlParseStartTag(ctxt); 3936 name = ctxt->name; 3937 if (failed || (name == NULL)) { 3938 if (CUR == '>') 3939 NEXT; 3940 return; 3941 } 3942 3943 /* 3944 * Lookup the info for that element. 3945 */ 3946 info = htmlTagLookup(name); 3947 if (info == NULL) { 3948 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 3949 "Tag %s invalid\n", name, NULL); 3950 } 3951 3952 /* 3953 * Check for an Empty Element labeled the XML/SGML way 3954 */ 3955 if ((CUR == '/') && (NXT(1) == '>')) { 3956 SKIP(2); 3957 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 3958 ctxt->sax->endElement(ctxt->userData, name); 3959 htmlnamePop(ctxt); 3960 return; 3961 } 3962 3963 if (CUR == '>') { 3964 NEXT; 3965 } else { 3966 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 3967 "Couldn't find end of Start Tag %s\n", name, NULL); 3968 3969 /* 3970 * end of parsing of this node. 
3971 */ 3972 if (xmlStrEqual(name, ctxt->name)) { 3973 nodePop(ctxt); 3974 htmlnamePop(ctxt); 3975 } 3976 3977 /* 3978 * Capture end position and add node 3979 */ 3980 if (ctxt->record_info) { 3981 node_info.end_pos = ctxt->input->consumed + 3982 (CUR_PTR - ctxt->input->base); 3983 node_info.end_line = ctxt->input->line; 3984 node_info.node = ctxt->node; 3985 xmlParserAddNodeInfo(ctxt, &node_info); 3986 } 3987 return; 3988 } 3989 3990 /* 3991 * Check for an Empty Element from DTD definition 3992 */ 3993 if ((info != NULL) && (info->empty)) { 3994 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 3995 ctxt->sax->endElement(ctxt->userData, name); 3996 htmlnamePop(ctxt); 3997 return; 3998 } 3999 4000 /* 4001 * Parse the content of the element: 4002 */ 4003 currentNode = xmlStrdup(ctxt->name); 4004 depth = ctxt->nameNr; 4005 while (IS_CHAR_CH(CUR)) { 4006 oldptr = ctxt->input->cur; 4007 htmlParseContent(ctxt); 4008 if (oldptr==ctxt->input->cur) break; 4009 if (ctxt->nameNr < depth) break; 4010 } 4011 4012 /* 4013 * Capture end position and add node 4014 */ 4015 if ( currentNode != NULL && ctxt->record_info ) { 4016 node_info.end_pos = ctxt->input->consumed + 4017 (CUR_PTR - ctxt->input->base); 4018 node_info.end_line = ctxt->input->line; 4019 node_info.node = ctxt->node; 4020 xmlParserAddNodeInfo(ctxt, &node_info); 4021 } 4022 if (!IS_CHAR_CH(CUR)) { 4023 htmlAutoCloseOnEnd(ctxt); 4024 } 4025 4026 if (currentNode != NULL) 4027 xmlFree(currentNode); 4028} 4029 4030/** 4031 * htmlParseDocument: 4032 * @ctxt: an HTML parser context 4033 * 4034 * parse an HTML document (and build a tree if using the standard SAX 4035 * interface). 4036 * 4037 * Returns 0, -1 in case of error. the parser context is augmented 4038 * as a result of the parsing. 
4039 */ 4040 4041int 4042htmlParseDocument(htmlParserCtxtPtr ctxt) { 4043 xmlDtdPtr dtd; 4044 4045 xmlInitParser(); 4046 4047 htmlDefaultSAXHandlerInit(); 4048 4049 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4050 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4051 "htmlParseDocument: context error\n", NULL, NULL); 4052 return(XML_ERR_INTERNAL_ERROR); 4053 } 4054 ctxt->html = 1; 4055 GROW; 4056 /* 4057 * SAX: beginning of the document processing. 4058 */ 4059 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 4060 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); 4061 4062 /* 4063 * Wipe out everything which is before the first '<' 4064 */ 4065 SKIP_BLANKS; 4066 if (CUR == 0) { 4067 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 4068 "Document is empty\n", NULL, NULL); 4069 } 4070 4071 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) 4072 ctxt->sax->startDocument(ctxt->userData); 4073 4074 4075 /* 4076 * Parse possible comments and PIs before any content 4077 */ 4078 while (((CUR == '<') && (NXT(1) == '!') && 4079 (NXT(2) == '-') && (NXT(3) == '-')) || 4080 ((CUR == '<') && (NXT(1) == '?'))) { 4081 htmlParseComment(ctxt); 4082 htmlParsePI(ctxt); 4083 SKIP_BLANKS; 4084 } 4085 4086 4087 /* 4088 * Then possibly doc type declaration(s) and more Misc 4089 * (doctypedecl Misc*)? 
4090 */ 4091 if ((CUR == '<') && (NXT(1) == '!') && 4092 (UPP(2) == 'D') && (UPP(3) == 'O') && 4093 (UPP(4) == 'C') && (UPP(5) == 'T') && 4094 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4095 (UPP(8) == 'E')) { 4096 htmlParseDocTypeDecl(ctxt); 4097 } 4098 SKIP_BLANKS; 4099 4100 /* 4101 * Parse possible comments and PIs before any content 4102 */ 4103 while (((CUR == '<') && (NXT(1) == '!') && 4104 (NXT(2) == '-') && (NXT(3) == '-')) || 4105 ((CUR == '<') && (NXT(1) == '?'))) { 4106 htmlParseComment(ctxt); 4107 htmlParsePI(ctxt); 4108 SKIP_BLANKS; 4109 } 4110 4111 /* 4112 * Time to start parsing the tree itself 4113 */ 4114 htmlParseContent(ctxt); 4115 4116 /* 4117 * autoclose 4118 */ 4119 if (CUR == 0) 4120 htmlAutoCloseOnEnd(ctxt); 4121 4122 4123 /* 4124 * SAX: end of the document processing. 4125 */ 4126 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 4127 ctxt->sax->endDocument(ctxt->userData); 4128 4129 if (ctxt->myDoc != NULL) { 4130 dtd = xmlGetIntSubset(ctxt->myDoc); 4131 if (dtd == NULL) 4132 ctxt->myDoc->intSubset = 4133 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 4134 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 4135 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 4136 } 4137 if (! 
ctxt->wellFormed) return(-1); 4138 return(0); 4139} 4140 4141 4142/************************************************************************ 4143 * * 4144 * Parser contexts handling * 4145 * * 4146 ************************************************************************/ 4147 4148/** 4149 * htmlInitParserCtxt: 4150 * @ctxt: an HTML parser context 4151 * 4152 * Initialize a parser context 4153 * 4154 * Returns 0 in case of success and -1 in case of error 4155 */ 4156 4157static int 4158htmlInitParserCtxt(htmlParserCtxtPtr ctxt) 4159{ 4160 htmlSAXHandler *sax; 4161 4162 if (ctxt == NULL) return(-1); 4163 memset(ctxt, 0, sizeof(htmlParserCtxt)); 4164 4165 ctxt->dict = xmlDictCreate(); 4166 if (ctxt->dict == NULL) { 4167 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4168 return(-1); 4169 } 4170 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); 4171 if (sax == NULL) { 4172 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4173 return(-1); 4174 } 4175 else 4176 memset(sax, 0, sizeof(htmlSAXHandler)); 4177 4178 /* Allocate the Input stack */ 4179 ctxt->inputTab = (htmlParserInputPtr *) 4180 xmlMalloc(5 * sizeof(htmlParserInputPtr)); 4181 if (ctxt->inputTab == NULL) { 4182 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4183 ctxt->inputNr = 0; 4184 ctxt->inputMax = 0; 4185 ctxt->input = NULL; 4186 return(-1); 4187 } 4188 ctxt->inputNr = 0; 4189 ctxt->inputMax = 5; 4190 ctxt->input = NULL; 4191 ctxt->version = NULL; 4192 ctxt->encoding = NULL; 4193 ctxt->standalone = -1; 4194 ctxt->instate = XML_PARSER_START; 4195 4196 /* Allocate the Node stack */ 4197 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr)); 4198 if (ctxt->nodeTab == NULL) { 4199 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4200 ctxt->nodeNr = 0; 4201 ctxt->nodeMax = 0; 4202 ctxt->node = NULL; 4203 ctxt->inputNr = 0; 4204 ctxt->inputMax = 0; 4205 ctxt->input = NULL; 4206 return(-1); 4207 } 4208 ctxt->nodeNr = 0; 4209 ctxt->nodeMax = 
10; 4210 ctxt->node = NULL; 4211 4212 /* Allocate the Name stack */ 4213 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); 4214 if (ctxt->nameTab == NULL) { 4215 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4216 ctxt->nameNr = 0; 4217 ctxt->nameMax = 10; 4218 ctxt->name = NULL; 4219 ctxt->nodeNr = 0; 4220 ctxt->nodeMax = 0; 4221 ctxt->node = NULL; 4222 ctxt->inputNr = 0; 4223 ctxt->inputMax = 0; 4224 ctxt->input = NULL; 4225 return(-1); 4226 } 4227 ctxt->nameNr = 0; 4228 ctxt->nameMax = 10; 4229 ctxt->name = NULL; 4230 4231 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; 4232 else { 4233 ctxt->sax = sax; 4234 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 4235 } 4236 ctxt->userData = ctxt; 4237 ctxt->myDoc = NULL; 4238 ctxt->wellFormed = 1; 4239 ctxt->replaceEntities = 0; 4240 ctxt->linenumbers = xmlLineNumbersDefaultValue; 4241 ctxt->html = 1; 4242 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0; 4243 ctxt->vctxt.userData = ctxt; 4244 ctxt->vctxt.error = xmlParserValidityError; 4245 ctxt->vctxt.warning = xmlParserValidityWarning; 4246 ctxt->record_info = 0; 4247 ctxt->validate = 0; 4248 ctxt->nbChars = 0; 4249 ctxt->checkIndex = 0; 4250 ctxt->catalogs = NULL; 4251 xmlInitNodeInfoSeq(&ctxt->node_seq); 4252 return(0); 4253} 4254 4255/** 4256 * htmlFreeParserCtxt: 4257 * @ctxt: an HTML parser context 4258 * 4259 * Free all the memory used by a parser context. However the parsed 4260 * document in ctxt->myDoc is not freed. 4261 */ 4262 4263void 4264htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) 4265{ 4266 xmlFreeParserCtxt(ctxt); 4267} 4268 4269/** 4270 * htmlNewParserCtxt: 4271 * 4272 * Allocate and initialize a new parser context. 
4273 * 4274 * Returns the htmlParserCtxtPtr or NULL in case of allocation error 4275 */ 4276 4277htmlParserCtxtPtr 4278htmlNewParserCtxt(void) 4279{ 4280 xmlParserCtxtPtr ctxt; 4281 4282 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); 4283 if (ctxt == NULL) { 4284 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n"); 4285 return(NULL); 4286 } 4287 memset(ctxt, 0, sizeof(xmlParserCtxt)); 4288 if (htmlInitParserCtxt(ctxt) < 0) { 4289 htmlFreeParserCtxt(ctxt); 4290 return(NULL); 4291 } 4292 return(ctxt); 4293} 4294 4295/** 4296 * htmlCreateMemoryParserCtxt: 4297 * @buffer: a pointer to a char array 4298 * @size: the size of the array 4299 * 4300 * Create a parser context for an HTML in-memory document. 4301 * 4302 * Returns the new parser context or NULL 4303 */ 4304htmlParserCtxtPtr 4305htmlCreateMemoryParserCtxt(const char *buffer, int size) { 4306 xmlParserCtxtPtr ctxt; 4307 xmlParserInputPtr input; 4308 xmlParserInputBufferPtr buf; 4309 4310 if (buffer == NULL) 4311 return(NULL); 4312 if (size <= 0) 4313 return(NULL); 4314 4315 ctxt = htmlNewParserCtxt(); 4316 if (ctxt == NULL) 4317 return(NULL); 4318 4319 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 4320 if (buf == NULL) return(NULL); 4321 4322 input = xmlNewInputStream(ctxt); 4323 if (input == NULL) { 4324 xmlFreeParserCtxt(ctxt); 4325 return(NULL); 4326 } 4327 4328 input->filename = NULL; 4329 input->buf = buf; 4330 input->base = input->buf->buffer->content; 4331 input->cur = input->buf->buffer->content; 4332 input->end = &input->buf->buffer->content[input->buf->buffer->use]; 4333 4334 inputPush(ctxt, input); 4335 return(ctxt); 4336} 4337 4338/** 4339 * htmlCreateDocParserCtxt: 4340 * @cur: a pointer to an array of xmlChar 4341 * @encoding: a free form C string describing the HTML document encoding, or NULL 4342 * 4343 * Create a parser context for an HTML document. 
4344 * 4345 * TODO: check the need to add encoding handling there 4346 * 4347 * Returns the new parser context or NULL 4348 */ 4349static htmlParserCtxtPtr 4350htmlCreateDocParserCtxt(const xmlChar *cur, 4351 const char *encoding ATTRIBUTE_UNUSED) { 4352 int len; 4353 htmlParserCtxtPtr ctxt; 4354 4355 if (cur == NULL) 4356 return(NULL); 4357 len = xmlStrlen(cur); 4358 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len); 4359 4360 if (encoding != NULL) { 4361 xmlCharEncoding enc; 4362 xmlCharEncodingHandlerPtr handler; 4363 4364 if (ctxt->input->encoding != NULL) 4365 xmlFree((xmlChar *) ctxt->input->encoding); 4366 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); 4367 4368 enc = xmlParseCharEncoding(encoding); 4369 /* 4370 * registered set of known encodings 4371 */ 4372 if (enc != XML_CHAR_ENCODING_ERROR) { 4373 xmlSwitchEncoding(ctxt, enc); 4374 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { 4375 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 4376 "Unsupported encoding %s\n", 4377 (const xmlChar *) encoding, NULL); 4378 } 4379 } else { 4380 /* 4381 * fallback for unknown encodings 4382 */ 4383 handler = xmlFindCharEncodingHandler((const char *) encoding); 4384 if (handler != NULL) { 4385 xmlSwitchToEncoding(ctxt, handler); 4386 } else { 4387 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 4388 "Unsupported encoding %s\n", 4389 (const xmlChar *) encoding, NULL); 4390 } 4391 } 4392 } 4393 return(ctxt); 4394} 4395 4396#ifdef LIBXML_PUSH_ENABLED 4397/************************************************************************ 4398 * * 4399 * Progressive parsing interfaces * 4400 * * 4401 ************************************************************************/ 4402 4403/** 4404 * htmlParseLookupSequence: 4405 * @ctxt: an HTML parser context 4406 * @first: the first char to lookup 4407 * @next: the next char to lookup or zero 4408 * @third: the next char to lookup or zero 4409 * @comment: flag to force checking inside comments 4410 * 4411 * Try to 
find if a sequence (first, next, third) or just (first next) or
 * (first) is available in the input stream.
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
 * to avoid rescanning sequences of bytes, it DOES change the state of the
 * parser, do not use liberally.
 * This is basically similar to xmlParseLookupSequence()
 *
 * Unless @iscomment is set, bytes between "<!--" and "-->" are skipped
 * and never matched. On a match ctxt->checkIndex is reset to 0; on a miss
 * it is set to the position where the scan stopped.
 *
 * Returns the index to the current parsing point if the full sequence
 * is available, -1 otherwise.
 */
static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                        xmlChar next, xmlChar third, int iscomment) {
    int base, len;
    htmlParserInputPtr in;
    const xmlChar *buf;
    int incomment = 0;

    in = ctxt->input;
    if (in == NULL) return(-1);
    base = in->cur - in->base;
    if (base < 0) return(-1);
    /* resume where a previous unsuccessful scan stopped */
    if (ctxt->checkIndex > base)
        base = ctxt->checkIndex;
    if (in->buf == NULL) {
        buf = in->base;
        len = in->length;
    } else {
        buf = in->buf->buffer->content;
        len = in->buf->buffer->use;
    }
    /* take into account the sequence length */
    if (third) len -= 2;
    else if (next) len --;
    for (;base < len;base++) {
        if (!incomment && (base + 4 < len) && !iscomment) {
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                incomment = 1;
                /* do not increment past <! - some people use <!--> */
                base += 2;
            }
        }
        if (incomment) {
            /* not enough bytes left to hold the closing "-->" */
            if (base + 3 > len)
                return(-1);
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                (buf[base + 2] == '>')) {
                incomment = 0;
                base += 2;
            }
            continue;
        }
        if (buf[base] == first) {
            if (third != 0) {
                if ((buf[base + 1] != next) ||
                    (buf[base + 2] != third)) continue;
            } else if (next != 0) {
                if (buf[base + 1] != next) continue;
            }
            ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
            if (next == 0)
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c' found at %d\n",
                        first, base);
            else if (third == 0)
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c' found at %d\n",
                        first, next, base);
            else
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c%c' found at %d\n",
                        first, next, third, base);
#endif
            /* offset is relative to the current read position */
            return(base - (in->cur - in->base));
        }
    }
    /* remember how far we got so the next call does not rescan */
    ctxt->checkIndex = base;
#ifdef DEBUG_PUSH
    if (next == 0)
        xmlGenericError(xmlGenericErrorContext,
                "HPP: lookup '%c' failed\n", first);
    else if (third == 0)
        xmlGenericError(xmlGenericErrorContext,
                "HPP: lookup '%c%c' failed\n", first, next);
    else
        xmlGenericError(xmlGenericErrorContext,
                "HPP: lookup '%c%c%c' failed\n", first, next, third);
#endif
    return(-1);
}

/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing
 *
 * Runs the push-parser state machine on ctxt->instate, looping until the
 * buffered input is exhausted, a construct is not yet completely
 * available (checked with htmlParseLookupSequence() unless @terminate is
 * set), or the EOF state is reached. On exit a default HTML 4.0
 * Transitional internal subset is attached to ctxt->myDoc when the
 * document has none and parsing is terminating or in EPILOG/EOF state.
 *
 * Returns zero if no parsing was possible
 * (as written, the returned variable is initialised to 0 and never
 * modified, so the function always returns 0).
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;
    xmlChar cur, next;

#ifdef DEBUG_PUSH
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_DECL\n");break;
        case XML_PARSER_ENTITY_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_VALUE\n");break;
        case XML_PARSER_ATTRIBUTE_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ATTRIBUTE_VALUE\n");break;
        case XML_PARSER_DTD:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try DTD\n");break;
        case XML_PARSER_EPILOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EPILOG\n");break;
        case XML_PARSER_PI:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PI\n");break;
        case XML_PARSER_SYSTEM_LITERAL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try SYSTEM_LITERAL\n");break;
    }
#endif

    while (1) {

        in = ctxt->input;
        if (in == NULL) break;
        /* number of bytes not yet consumed from the current input */
        if (in->buf == NULL)
            avail = in->length - (in->cur - in->base);
        else
            avail = in->buf->buffer->use - (in->cur - in->base);
        if ((avail == 0) && (terminate)) {
            htmlAutoCloseOnEnd(ctxt);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        cur = in->cur[0];
        /* a zero byte in the input is skipped, whatever the state */
        if (cur == 0) {
            SKIP(1);
            continue;
        }

        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    SKIP_BLANKS;
                    if (in->buf == NULL)
                        avail = in->length - (in->cur - in->base);
                    else
                        avail = in->buf->buffer->use - (in->cur - in->base);
                }
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                                                  &xmlDefaultSAXLocator);
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    /* wait until the whole declaration is buffered */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else {
                    ctxt->instate = XML_PARSER_MISC;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering MISC\n");
#endif
                }
                break;
            case XML_PARSER_MISC:
                /* before any DOCTYPE: comments, PIs or the DOCTYPE itself */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /* "<!" seen but too few bytes to tell what follows */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_PROLOG:
                /* after the DOCTYPE: comments and PIs before content */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_EPILOG:
                /* after the last element was closed */
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    htmlParseCharData(ctxt);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    /* anything else after the document end is an error */
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering EOF\n");
#endif
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                const xmlChar *name;
                int failed;
                const htmlElemDesc * info;

                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                if (cur != '<') {
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }
                if (in->cur[1] == '/') {
                    ctxt->instate = XML_PARSER_END_TAG;
                    ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering END_TAG\n");
#endif
                    break;
                }
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                    goto done;

                failed = htmlParseStartTag(ctxt);
                name = ctxt->name;
                if (failed ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    break;
                }

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
                                 "Tag %s invalid\n", name, NULL);
                }

                /*
                 * Check for an Empty Element labeled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                                 "Couldn't find end of Start Tag %s\n",
                                 name, NULL);

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        htmlnamePop(ctxt);
                    }

                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                }
                ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            }
            case XML_PARSER_CONTENT: {
                long cons;
                /*
                 * Handle preparsed entities and charRef
                 */
                if (ctxt->token != 0) {
                    xmlChar chr[2] = { 0 , 0 } ;

                    chr[0] = (xmlChar) ctxt->token;
                    htmlCheckParagraph(ctxt);
                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData, chr, 1);
                    ctxt->token = 0;
                    ctxt->checkIndex = 0;
                }
                /* a single trailing character on the last chunk */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            if (IS_BLANK_CH(cur)) {
                                if (ctxt->sax->ignorableWhitespace != NULL)
                                    ctxt->sax->ignorableWhitespace(
                                            ctxt->userData, &cur, 1);
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, &cur, 1);
                            }
                        }
                        ctxt->token = 0;
                        ctxt->checkIndex = 0;
                        in->cur++;
                        break;
                    }
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                /* snapshot used below to detect that nothing was consumed */
                cons = ctxt->nbChars;
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if (!terminate) {
                        int idx;
                        xmlChar val;

                        idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
                        if (idx < 0)
                            goto done;
                        val = in->cur[idx + 2];
                        if (val == 0) /* bad cut of input */
                            goto done;
                    }
                    htmlParseScript(ctxt);
                    /* cur/next were sampled before htmlParseScript() ran */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    }
                } else {
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((cur == '<') && (next == '!') &&
                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                            goto done;
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                                     "Misplaced DOCTYPE declaration\n",
                                     BAD_CAST "DOCTYPE" , NULL);
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(
                                ctxt, '-', '-', '>', 1) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Comment\n");
#endif
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '?')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing PI\n");
#endif
                        htmlParsePI(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                        goto done;
                    } else if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    } else if (cur == '<') {
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering START_TAG\n");
#endif
                        break;
                    } else if (cur == '&') {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Reference\n");
#endif
                        /* TODO: check generation of subtrees if noent !!! */
                        htmlParseReference(ctxt);
                    } else {
                        /*
                         * check that the text sequence is complete
                         * before handing out the data to the parser
                         * to avoid problems with erroneous end of
                         * data detection.
                         */
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
                            goto done;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing char data\n");
#endif
                        htmlParseCharData(ctxt);
                    }
                }
                /* no progress was made: force one character forward */
                if (cons == ctxt->nbChars) {
                    if (ctxt->node != NULL) {
                        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                                     "detected an error in element content\n",
                                     NULL, NULL);
                    }
                    NEXT;
                    break;
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                /* an empty name stack means the root element was closed */
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            /*
             * The remaining states are reported as internal errors and
             * the parser is put back into a known state.
             */
            case XML_PARSER_CDATA_SECTION:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == CDATA\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_DTD:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == DTD\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_COMMENT:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == COMMENT\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PI:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == PI\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_DECL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ENTITY_DECL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_VALUE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ENTITY_VALUE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering DTD\n");
#endif
                break;
            case XML_PARSER_ATTRIBUTE_VALUE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ATTRIBUTE_VALUE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_START_TAG;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering START_TAG\n");
#endif
                break;
            case XML_PARSER_SYSTEM_LITERAL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_IGNORE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == XML_PARSER_IGNORE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PUBLIC_LITERAL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == XML_PARSER_LITERAL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;

        }
    }
done:
    if ((avail == 0) && (terminate)) {
        htmlAutoCloseOnEnd(ctxt);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /* make sure a finished document carries an internal subset */
    if ((ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL)
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
    }
#ifdef DEBUG_PUSH
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
#endif
    return(ret);
}

/**
 * htmlParseChunk:
 * @ctxt:  an HTML parser context
 * @chunk:  an char array
 * @size:  the size in byte of the chunk
 * @terminate:  last chunk indicator
 *
 * Parse a Chunk of memory
 *
 * Returns zero if no error, the xmlParserErrors otherwise.
5241 */ 5242int 5243htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 5244 int terminate) { 5245 if ((ctxt == NULL) || (ctxt->input == NULL)) { 5246 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5247 "htmlParseChunk: context error\n", NULL, NULL); 5248 return(XML_ERR_INTERNAL_ERROR); 5249 } 5250 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5251 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 5252 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5253 int cur = ctxt->input->cur - ctxt->input->base; 5254 int res; 5255 5256 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 5257 if (res < 0) { 5258 ctxt->errNo = XML_PARSER_EOF; 5259 ctxt->disableSAX = 1; 5260 return (XML_PARSER_EOF); 5261 } 5262 ctxt->input->base = ctxt->input->buf->buffer->content + base; 5263 ctxt->input->cur = ctxt->input->base + cur; 5264 ctxt->input->end = 5265 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5266#ifdef DEBUG_PUSH 5267 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5268#endif 5269 5270#if 0 5271 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 5272 htmlParseTryOrFinish(ctxt, terminate); 5273#endif 5274 } else if (ctxt->instate != XML_PARSER_EOF) { 5275 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 5276 xmlParserInputBufferPtr in = ctxt->input->buf; 5277 if ((in->encoder != NULL) && (in->buffer != NULL) && 5278 (in->raw != NULL)) { 5279 int nbchars; 5280 5281 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); 5282 if (nbchars < 0) { 5283 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 5284 "encoder error\n", NULL, NULL); 5285 return(XML_ERR_INVALID_ENCODING); 5286 } 5287 } 5288 } 5289 } 5290 htmlParseTryOrFinish(ctxt, terminate); 5291 if (terminate) { 5292 if ((ctxt->instate != XML_PARSER_EOF) && 5293 (ctxt->instate != XML_PARSER_EPILOG) && 5294 (ctxt->instate != XML_PARSER_MISC)) { 5295 ctxt->errNo = XML_ERR_DOCUMENT_END; 5296 
ctxt->wellFormed = 0; 5297 } 5298 if (ctxt->instate != XML_PARSER_EOF) { 5299 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5300 ctxt->sax->endDocument(ctxt->userData); 5301 } 5302 ctxt->instate = XML_PARSER_EOF; 5303 } 5304 return((xmlParserErrors) ctxt->errNo); 5305} 5306 5307/************************************************************************ 5308 * * 5309 * User entry points * 5310 * * 5311 ************************************************************************/ 5312 5313/** 5314 * htmlCreatePushParserCtxt: 5315 * @sax: a SAX handler 5316 * @user_data: The user data returned on SAX callbacks 5317 * @chunk: a pointer to an array of chars 5318 * @size: number of chars in the array 5319 * @filename: an optional file name or URI 5320 * @enc: an optional encoding 5321 * 5322 * Create a parser context for using the HTML parser in push mode 5323 * The value of @filename is used for fetching external entities 5324 * and error/warning reports. 5325 * 5326 * Returns the new parser context or NULL 5327 */ 5328htmlParserCtxtPtr 5329htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 5330 const char *chunk, int size, const char *filename, 5331 xmlCharEncoding enc) { 5332 htmlParserCtxtPtr ctxt; 5333 htmlParserInputPtr inputStream; 5334 xmlParserInputBufferPtr buf; 5335 5336 xmlInitParser(); 5337 5338 buf = xmlAllocParserInputBuffer(enc); 5339 if (buf == NULL) return(NULL); 5340 5341 ctxt = htmlNewParserCtxt(); 5342 if (ctxt == NULL) { 5343 xmlFreeParserInputBuffer(buf); 5344 return(NULL); 5345 } 5346 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 5347 ctxt->charset=XML_CHAR_ENCODING_UTF8; 5348 if (sax != NULL) { 5349 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 5350 xmlFree(ctxt->sax); 5351 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 5352 if (ctxt->sax == NULL) { 5353 xmlFree(buf); 5354 xmlFree(ctxt); 5355 return(NULL); 5356 } 5357 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 5358 if (user_data != NULL) 
5359 ctxt->userData = user_data; 5360 } 5361 if (filename == NULL) { 5362 ctxt->directory = NULL; 5363 } else { 5364 ctxt->directory = xmlParserGetDirectory(filename); 5365 } 5366 5367 inputStream = htmlNewInputStream(ctxt); 5368 if (inputStream == NULL) { 5369 xmlFreeParserCtxt(ctxt); 5370 xmlFree(buf); 5371 return(NULL); 5372 } 5373 5374 if (filename == NULL) 5375 inputStream->filename = NULL; 5376 else 5377 inputStream->filename = (char *) 5378 xmlCanonicPath((const xmlChar *) filename); 5379 inputStream->buf = buf; 5380 inputStream->base = inputStream->buf->buffer->content; 5381 inputStream->cur = inputStream->buf->buffer->content; 5382 inputStream->end = 5383 &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; 5384 5385 inputPush(ctxt, inputStream); 5386 5387 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5388 (ctxt->input->buf != NULL)) { 5389 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5390 int cur = ctxt->input->cur - ctxt->input->base; 5391 5392 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 5393 5394 ctxt->input->base = ctxt->input->buf->buffer->content + base; 5395 ctxt->input->cur = ctxt->input->base + cur; 5396 ctxt->input->end = 5397 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5398#ifdef DEBUG_PUSH 5399 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5400#endif 5401 } 5402 ctxt->progressive = 1; 5403 5404 return(ctxt); 5405} 5406#endif /* LIBXML_PUSH_ENABLED */ 5407 5408/** 5409 * htmlSAXParseDoc: 5410 * @cur: a pointer to an array of xmlChar 5411 * @encoding: a free form C string describing the HTML document encoding, or NULL 5412 * @sax: the SAX handler block 5413 * @userData: if using SAX, this pointer will be provided on callbacks. 5414 * 5415 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 5416 * to handle parse events. If sax is NULL, fallback to the default DOM 5417 * behavior and return a tree. 
5418 * 5419 * Returns the resulting document tree unless SAX is NULL or the document is 5420 * not well formed. 5421 */ 5422 5423htmlDocPtr 5424htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { 5425 htmlDocPtr ret; 5426 htmlParserCtxtPtr ctxt; 5427 5428 xmlInitParser(); 5429 5430 if (cur == NULL) return(NULL); 5431 5432 5433 ctxt = htmlCreateDocParserCtxt(cur, encoding); 5434 if (ctxt == NULL) return(NULL); 5435 if (sax != NULL) { 5436 if (ctxt->sax != NULL) xmlFree (ctxt->sax); 5437 ctxt->sax = sax; 5438 ctxt->userData = userData; 5439 } 5440 5441 htmlParseDocument(ctxt); 5442 ret = ctxt->myDoc; 5443 if (sax != NULL) { 5444 ctxt->sax = NULL; 5445 ctxt->userData = NULL; 5446 } 5447 htmlFreeParserCtxt(ctxt); 5448 5449 return(ret); 5450} 5451 5452/** 5453 * htmlParseDoc: 5454 * @cur: a pointer to an array of xmlChar 5455 * @encoding: a free form C string describing the HTML document encoding, or NULL 5456 * 5457 * parse an HTML in-memory document and build a tree. 5458 * 5459 * Returns the resulting document tree 5460 */ 5461 5462htmlDocPtr 5463htmlParseDoc(xmlChar *cur, const char *encoding) { 5464 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 5465} 5466 5467 5468/** 5469 * htmlCreateFileParserCtxt: 5470 * @filename: the filename 5471 * @encoding: a free form C string describing the HTML document encoding, or NULL 5472 * 5473 * Create a parser context for a file content. 5474 * Automatic support for ZLIB/Compress compressed document is provided 5475 * by default if found at compile-time. 
5476 * 5477 * Returns the new parser context or NULL 5478 */ 5479htmlParserCtxtPtr 5480htmlCreateFileParserCtxt(const char *filename, const char *encoding) 5481{ 5482 htmlParserCtxtPtr ctxt; 5483 htmlParserInputPtr inputStream; 5484 char *canonicFilename; 5485 /* htmlCharEncoding enc; */ 5486 xmlChar *content, *content_line = (xmlChar *) "charset="; 5487 5488 if (filename == NULL) 5489 return(NULL); 5490 5491 ctxt = htmlNewParserCtxt(); 5492 if (ctxt == NULL) { 5493 return(NULL); 5494 } 5495 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 5496 if (canonicFilename == NULL) { 5497#ifdef LIBXML_SAX1_ENABLED 5498 if (xmlDefaultSAXHandler.error != NULL) { 5499 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 5500 } 5501#endif 5502 xmlFreeParserCtxt(ctxt); 5503 return(NULL); 5504 } 5505 5506 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 5507 xmlFree(canonicFilename); 5508 if (inputStream == NULL) { 5509 xmlFreeParserCtxt(ctxt); 5510 return(NULL); 5511 } 5512 5513 inputPush(ctxt, inputStream); 5514 5515 /* set encoding */ 5516 if (encoding) { 5517 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); 5518 if (content) { 5519 strcpy ((char *)content, (char *)content_line); 5520 strcat ((char *)content, (char *)encoding); 5521 htmlCheckEncoding (ctxt, content); 5522 xmlFree (content); 5523 } 5524 } 5525 5526 return(ctxt); 5527} 5528 5529/** 5530 * htmlSAXParseFile: 5531 * @filename: the filename 5532 * @encoding: a free form C string describing the HTML document encoding, or NULL 5533 * @sax: the SAX handler block 5534 * @userData: if using SAX, this pointer will be provided on callbacks. 5535 * 5536 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 5537 * compressed document is provided by default if found at compile-time. 5538 * It use the given SAX function block to handle the parsing callback. 5539 * If sax is NULL, fallback to the default DOM tree building routines. 
5540 * 5541 * Returns the resulting document tree unless SAX is NULL or the document is 5542 * not well formed. 5543 */ 5544 5545htmlDocPtr 5546htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 5547 void *userData) { 5548 htmlDocPtr ret; 5549 htmlParserCtxtPtr ctxt; 5550 htmlSAXHandlerPtr oldsax = NULL; 5551 5552 xmlInitParser(); 5553 5554 ctxt = htmlCreateFileParserCtxt(filename, encoding); 5555 if (ctxt == NULL) return(NULL); 5556 if (sax != NULL) { 5557 oldsax = ctxt->sax; 5558 ctxt->sax = sax; 5559 ctxt->userData = userData; 5560 } 5561 5562 htmlParseDocument(ctxt); 5563 5564 ret = ctxt->myDoc; 5565 if (sax != NULL) { 5566 ctxt->sax = oldsax; 5567 ctxt->userData = NULL; 5568 } 5569 htmlFreeParserCtxt(ctxt); 5570 5571 return(ret); 5572} 5573 5574/** 5575 * htmlParseFile: 5576 * @filename: the filename 5577 * @encoding: a free form C string describing the HTML document encoding, or NULL 5578 * 5579 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 5580 * compressed document is provided by default if found at compile-time. 5581 * 5582 * Returns the resulting document tree 5583 */ 5584 5585htmlDocPtr 5586htmlParseFile(const char *filename, const char *encoding) { 5587 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 5588} 5589 5590/** 5591 * htmlHandleOmittedElem: 5592 * @val: int 0 or 1 5593 * 5594 * Set and return the previous value for handling HTML omitted tags. 5595 * 5596 * Returns the last value for 0 for no handling, 1 for auto insertion. 5597 */ 5598 5599int 5600htmlHandleOmittedElem(int val) { 5601 int old = htmlOmittedDefaultValue; 5602 5603 htmlOmittedDefaultValue = val; 5604 return(old); 5605} 5606 5607/** 5608 * htmlElementAllowedHere: 5609 * @parent: HTML parent element 5610 * @elt: HTML element 5611 * 5612 * Checks whether an HTML element may be a direct child of a parent element. 5613 * Note - doesn't check for deprecated elements 5614 * 5615 * Returns 1 if allowed; 0 otherwise. 
5616 */ 5617int 5618htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 5619 const char** p ; 5620 5621 if ( ! elt || ! parent || ! parent->subelts ) 5622 return 0 ; 5623 5624 for ( p = parent->subelts; *p; ++p ) 5625 if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 5626 return 1 ; 5627 5628 return 0 ; 5629} 5630/** 5631 * htmlElementStatusHere: 5632 * @parent: HTML parent element 5633 * @elt: HTML element 5634 * 5635 * Checks whether an HTML element may be a direct child of a parent element. 5636 * and if so whether it is valid or deprecated. 5637 * 5638 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 5639 */ 5640htmlStatus 5641htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 5642 if ( ! parent || ! elt ) 5643 return HTML_INVALID ; 5644 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 5645 return HTML_INVALID ; 5646 5647 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ; 5648} 5649/** 5650 * htmlAttrAllowed: 5651 * @elt: HTML element 5652 * @attr: HTML attribute 5653 * @legacy: whether to allow deprecated attributes 5654 * 5655 * Checks whether an attribute is valid for an element 5656 * Has full knowledge of Required and Deprecated attributes 5657 * 5658 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 5659 */ 5660htmlStatus 5661htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 5662 const char** p ; 5663 5664 if ( !elt || ! 
attr ) 5665 return HTML_INVALID ; 5666 5667 if ( elt->attrs_req ) 5668 for ( p = elt->attrs_req; *p; ++p) 5669 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 5670 return HTML_REQUIRED ; 5671 5672 if ( elt->attrs_opt ) 5673 for ( p = elt->attrs_opt; *p; ++p) 5674 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 5675 return HTML_VALID ; 5676 5677 if ( legacy && elt->attrs_depr ) 5678 for ( p = elt->attrs_depr; *p; ++p) 5679 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 5680 return HTML_DEPRECATED ; 5681 5682 return HTML_INVALID ; 5683} 5684/** 5685 * htmlNodeStatus: 5686 * @node: an htmlNodePtr in a tree 5687 * @legacy: whether to allow deprecated elements (YES is faster here 5688 * for Element nodes) 5689 * 5690 * Checks whether the tree node is valid. Experimental (the author 5691 * only uses the HTML enhancements in a SAX parser) 5692 * 5693 * Return: for Element nodes, a return from htmlElementAllowedHere (if 5694 * legacy allowed) or htmlElementStatusHere (otherwise). 5695 * for Attribute nodes, a return from htmlAttrAllowed 5696 * for other nodes, HTML_NA (no checks performed) 5697 */ 5698htmlStatus 5699htmlNodeStatus(const htmlNodePtr node, int legacy) { 5700 if ( ! node ) 5701 return HTML_INVALID ; 5702 5703 switch ( node->type ) { 5704 case XML_ELEMENT_NODE: 5705 return legacy 5706 ? ( htmlElementAllowedHere ( 5707 htmlTagLookup(node->parent->name) , node->name 5708 ) ? 
HTML_VALID : HTML_INVALID ) 5709 : htmlElementStatusHere( 5710 htmlTagLookup(node->parent->name) , 5711 htmlTagLookup(node->name) ) 5712 ; 5713 case XML_ATTRIBUTE_NODE: 5714 return htmlAttrAllowed( 5715 htmlTagLookup(node->parent->name) , node->name, legacy) ; 5716 default: return HTML_NA ; 5717 } 5718} 5719/************************************************************************ 5720 * * 5721 * New set (2.6.0) of simpler and more flexible APIs * 5722 * * 5723 ************************************************************************/ 5724/** 5725 * DICT_FREE: 5726 * @str: a string 5727 * 5728 * Free a string if it is not owned by the "dict" dictionnary in the 5729 * current scope 5730 */ 5731#define DICT_FREE(str) \ 5732 if ((str) && ((!dict) || \ 5733 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 5734 xmlFree((char *)(str)); 5735 5736/** 5737 * htmlCtxtReset: 5738 * @ctxt: an HTML parser context 5739 * 5740 * Reset a parser context 5741 */ 5742void 5743htmlCtxtReset(htmlParserCtxtPtr ctxt) 5744{ 5745 xmlParserInputPtr input; 5746 xmlDictPtr dict; 5747 5748 if (ctxt == NULL) 5749 return; 5750 5751 xmlInitParser(); 5752 dict = ctxt->dict; 5753 5754 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 5755 xmlFreeInputStream(input); 5756 } 5757 ctxt->inputNr = 0; 5758 ctxt->input = NULL; 5759 5760 ctxt->spaceNr = 0; 5761 if (ctxt->spaceTab != NULL) { 5762 ctxt->spaceTab[0] = -1; 5763 ctxt->space = &ctxt->spaceTab[0]; 5764 } else { 5765 ctxt->space = NULL; 5766 } 5767 5768 5769 ctxt->nodeNr = 0; 5770 ctxt->node = NULL; 5771 5772 ctxt->nameNr = 0; 5773 ctxt->name = NULL; 5774 5775 DICT_FREE(ctxt->version); 5776 ctxt->version = NULL; 5777 DICT_FREE(ctxt->encoding); 5778 ctxt->encoding = NULL; 5779 DICT_FREE(ctxt->directory); 5780 ctxt->directory = NULL; 5781 DICT_FREE(ctxt->extSubURI); 5782 ctxt->extSubURI = NULL; 5783 DICT_FREE(ctxt->extSubSystem); 5784 ctxt->extSubSystem = NULL; 5785 if (ctxt->myDoc != NULL) 5786 xmlFreeDoc(ctxt->myDoc); 5787 
ctxt->myDoc = NULL; 5788 5789 ctxt->standalone = -1; 5790 ctxt->hasExternalSubset = 0; 5791 ctxt->hasPErefs = 0; 5792 ctxt->html = 1; 5793 ctxt->external = 0; 5794 ctxt->instate = XML_PARSER_START; 5795 ctxt->token = 0; 5796 5797 ctxt->wellFormed = 1; 5798 ctxt->nsWellFormed = 1; 5799 ctxt->valid = 1; 5800 ctxt->vctxt.userData = ctxt; 5801 ctxt->vctxt.error = xmlParserValidityError; 5802 ctxt->vctxt.warning = xmlParserValidityWarning; 5803 ctxt->record_info = 0; 5804 ctxt->nbChars = 0; 5805 ctxt->checkIndex = 0; 5806 ctxt->inSubset = 0; 5807 ctxt->errNo = XML_ERR_OK; 5808 ctxt->depth = 0; 5809 ctxt->charset = XML_CHAR_ENCODING_UTF8; 5810 ctxt->catalogs = NULL; 5811 xmlInitNodeInfoSeq(&ctxt->node_seq); 5812 5813 if (ctxt->attsDefault != NULL) { 5814 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); 5815 ctxt->attsDefault = NULL; 5816 } 5817 if (ctxt->attsSpecial != NULL) { 5818 xmlHashFree(ctxt->attsSpecial, NULL); 5819 ctxt->attsSpecial = NULL; 5820 } 5821} 5822 5823/** 5824 * htmlCtxtUseOptions: 5825 * @ctxt: an HTML parser context 5826 * @options: a combination of htmlParserOption(s) 5827 * 5828 * Applies the options to the parser context 5829 * 5830 * Returns 0 in case of success, the set of unknown or unimplemented options 5831 * in case of error. 
5832 */ 5833int 5834htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 5835{ 5836 if (ctxt == NULL) 5837 return(-1); 5838 5839 if (options & HTML_PARSE_NOWARNING) { 5840 ctxt->sax->warning = NULL; 5841 ctxt->vctxt.warning = NULL; 5842 options -= XML_PARSE_NOWARNING; 5843 ctxt->options |= XML_PARSE_NOWARNING; 5844 } 5845 if (options & HTML_PARSE_NOERROR) { 5846 ctxt->sax->error = NULL; 5847 ctxt->vctxt.error = NULL; 5848 ctxt->sax->fatalError = NULL; 5849 options -= XML_PARSE_NOERROR; 5850 ctxt->options |= XML_PARSE_NOERROR; 5851 } 5852 if (options & HTML_PARSE_PEDANTIC) { 5853 ctxt->pedantic = 1; 5854 options -= XML_PARSE_PEDANTIC; 5855 ctxt->options |= XML_PARSE_PEDANTIC; 5856 } else 5857 ctxt->pedantic = 0; 5858 if (options & XML_PARSE_NOBLANKS) { 5859 ctxt->keepBlanks = 0; 5860 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 5861 options -= XML_PARSE_NOBLANKS; 5862 ctxt->options |= XML_PARSE_NOBLANKS; 5863 } else 5864 ctxt->keepBlanks = 1; 5865 if (options & HTML_PARSE_RECOVER) { 5866 ctxt->recovery = 1; 5867 options -= HTML_PARSE_RECOVER; 5868 } else 5869 ctxt->recovery = 0; 5870 if (options & HTML_PARSE_COMPACT) { 5871 ctxt->options |= HTML_PARSE_COMPACT; 5872 options -= HTML_PARSE_COMPACT; 5873 } 5874 ctxt->dictNames = 0; 5875 return (options); 5876} 5877 5878/** 5879 * htmlDoRead: 5880 * @ctxt: an HTML parser context 5881 * @URL: the base URL to use for the document 5882 * @encoding: the document encoding, or NULL 5883 * @options: a combination of htmlParserOption(s) 5884 * @reuse: keep the context for reuse 5885 * 5886 * Common front-end for the htmlRead functions 5887 * 5888 * Returns the resulting document tree or NULL 5889 */ 5890static htmlDocPtr 5891htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, 5892 int options, int reuse) 5893{ 5894 htmlDocPtr ret; 5895 5896 htmlCtxtUseOptions(ctxt, options); 5897 ctxt->html = 1; 5898 if (encoding != NULL) { 5899 xmlCharEncodingHandlerPtr hdlr; 5900 5901 hdlr = 
xmlFindCharEncodingHandler(encoding); 5902 if (hdlr != NULL) 5903 xmlSwitchToEncoding(ctxt, hdlr); 5904 } 5905 if ((URL != NULL) && (ctxt->input != NULL) && 5906 (ctxt->input->filename == NULL)) 5907 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); 5908 htmlParseDocument(ctxt); 5909 ret = ctxt->myDoc; 5910 ctxt->myDoc = NULL; 5911 if (!reuse) { 5912 if ((ctxt->dictNames) && 5913 (ret != NULL) && 5914 (ret->dict == ctxt->dict)) 5915 ctxt->dict = NULL; 5916 xmlFreeParserCtxt(ctxt); 5917 } 5918 return (ret); 5919} 5920 5921/** 5922 * htmlReadDoc: 5923 * @cur: a pointer to a zero terminated string 5924 * @URL: the base URL to use for the document 5925 * @encoding: the document encoding, or NULL 5926 * @options: a combination of htmlParserOption(s) 5927 * 5928 * parse an XML in-memory document and build a tree. 5929 * 5930 * Returns the resulting document tree 5931 */ 5932htmlDocPtr 5933htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 5934{ 5935 htmlParserCtxtPtr ctxt; 5936 5937 if (cur == NULL) 5938 return (NULL); 5939 5940 xmlInitParser(); 5941 ctxt = htmlCreateDocParserCtxt(cur, NULL); 5942 if (ctxt == NULL) 5943 return (NULL); 5944 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 5945} 5946 5947/** 5948 * htmlReadFile: 5949 * @filename: a file or URL 5950 * @encoding: the document encoding, or NULL 5951 * @options: a combination of htmlParserOption(s) 5952 * 5953 * parse an XML file from the filesystem or the network. 
5954 * 5955 * Returns the resulting document tree 5956 */ 5957htmlDocPtr 5958htmlReadFile(const char *filename, const char *encoding, int options) 5959{ 5960 htmlParserCtxtPtr ctxt; 5961 5962 xmlInitParser(); 5963 ctxt = htmlCreateFileParserCtxt(filename, encoding); 5964 if (ctxt == NULL) 5965 return (NULL); 5966 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 5967} 5968 5969/** 5970 * htmlReadMemory: 5971 * @buffer: a pointer to a char array 5972 * @size: the size of the array 5973 * @URL: the base URL to use for the document 5974 * @encoding: the document encoding, or NULL 5975 * @options: a combination of htmlParserOption(s) 5976 * 5977 * parse an XML in-memory document and build a tree. 5978 * 5979 * Returns the resulting document tree 5980 */ 5981htmlDocPtr 5982htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 5983{ 5984 htmlParserCtxtPtr ctxt; 5985 5986 xmlInitParser(); 5987 ctxt = xmlCreateMemoryParserCtxt(buffer, size); 5988 if (ctxt == NULL) 5989 return (NULL); 5990 htmlDefaultSAXHandlerInit(); 5991 if (ctxt->sax != NULL) 5992 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 5993 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 5994} 5995 5996/** 5997 * htmlReadFd: 5998 * @fd: an open file descriptor 5999 * @URL: the base URL to use for the document 6000 * @encoding: the document encoding, or NULL 6001 * @options: a combination of htmlParserOption(s) 6002 * 6003 * parse an XML from a file descriptor and build a tree. 
6004 * 6005 * Returns the resulting document tree 6006 */ 6007htmlDocPtr 6008htmlReadFd(int fd, const char *URL, const char *encoding, int options) 6009{ 6010 htmlParserCtxtPtr ctxt; 6011 xmlParserInputBufferPtr input; 6012 xmlParserInputPtr stream; 6013 6014 if (fd < 0) 6015 return (NULL); 6016 6017 xmlInitParser(); 6018 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6019 if (input == NULL) 6020 return (NULL); 6021 ctxt = xmlNewParserCtxt(); 6022 if (ctxt == NULL) { 6023 xmlFreeParserInputBuffer(input); 6024 return (NULL); 6025 } 6026 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6027 if (stream == NULL) { 6028 xmlFreeParserInputBuffer(input); 6029 xmlFreeParserCtxt(ctxt); 6030 return (NULL); 6031 } 6032 inputPush(ctxt, stream); 6033 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6034} 6035 6036/** 6037 * htmlReadIO: 6038 * @ioread: an I/O read function 6039 * @ioclose: an I/O close function 6040 * @ioctx: an I/O handler 6041 * @URL: the base URL to use for the document 6042 * @encoding: the document encoding, or NULL 6043 * @options: a combination of htmlParserOption(s) 6044 * 6045 * parse an HTML document from I/O functions and source and build a tree. 
6046 * 6047 * Returns the resulting document tree 6048 */ 6049htmlDocPtr 6050htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 6051 void *ioctx, const char *URL, const char *encoding, int options) 6052{ 6053 htmlParserCtxtPtr ctxt; 6054 xmlParserInputBufferPtr input; 6055 xmlParserInputPtr stream; 6056 6057 if (ioread == NULL) 6058 return (NULL); 6059 xmlInitParser(); 6060 6061 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6062 XML_CHAR_ENCODING_NONE); 6063 if (input == NULL) 6064 return (NULL); 6065 ctxt = htmlNewParserCtxt(); 6066 if (ctxt == NULL) { 6067 xmlFreeParserInputBuffer(input); 6068 return (NULL); 6069 } 6070 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6071 if (stream == NULL) { 6072 xmlFreeParserInputBuffer(input); 6073 xmlFreeParserCtxt(ctxt); 6074 return (NULL); 6075 } 6076 inputPush(ctxt, stream); 6077 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6078} 6079 6080/** 6081 * htmlCtxtReadDoc: 6082 * @ctxt: an HTML parser context 6083 * @cur: a pointer to a zero terminated string 6084 * @URL: the base URL to use for the document 6085 * @encoding: the document encoding, or NULL 6086 * @options: a combination of htmlParserOption(s) 6087 * 6088 * parse an XML in-memory document and build a tree. 
6089 * This reuses the existing @ctxt parser context 6090 * 6091 * Returns the resulting document tree 6092 */ 6093htmlDocPtr 6094htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6095 const char *URL, const char *encoding, int options) 6096{ 6097 xmlParserInputPtr stream; 6098 6099 if (cur == NULL) 6100 return (NULL); 6101 if (ctxt == NULL) 6102 return (NULL); 6103 6104 htmlCtxtReset(ctxt); 6105 6106 stream = xmlNewStringInputStream(ctxt, cur); 6107 if (stream == NULL) { 6108 return (NULL); 6109 } 6110 inputPush(ctxt, stream); 6111 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6112} 6113 6114/** 6115 * htmlCtxtReadFile: 6116 * @ctxt: an HTML parser context 6117 * @filename: a file or URL 6118 * @encoding: the document encoding, or NULL 6119 * @options: a combination of htmlParserOption(s) 6120 * 6121 * parse an XML file from the filesystem or the network. 6122 * This reuses the existing @ctxt parser context 6123 * 6124 * Returns the resulting document tree 6125 */ 6126htmlDocPtr 6127htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 6128 const char *encoding, int options) 6129{ 6130 xmlParserInputPtr stream; 6131 6132 if (filename == NULL) 6133 return (NULL); 6134 if (ctxt == NULL) 6135 return (NULL); 6136 6137 htmlCtxtReset(ctxt); 6138 6139 stream = xmlLoadExternalEntity(filename, NULL, ctxt); 6140 if (stream == NULL) { 6141 return (NULL); 6142 } 6143 inputPush(ctxt, stream); 6144 return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 6145} 6146 6147/** 6148 * htmlCtxtReadMemory: 6149 * @ctxt: an HTML parser context 6150 * @buffer: a pointer to a char array 6151 * @size: the size of the array 6152 * @URL: the base URL to use for the document 6153 * @encoding: the document encoding, or NULL 6154 * @options: a combination of htmlParserOption(s) 6155 * 6156 * parse an XML in-memory document and build a tree. 
6157 * This reuses the existing @ctxt parser context 6158 * 6159 * Returns the resulting document tree 6160 */ 6161htmlDocPtr 6162htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 6163 const char *URL, const char *encoding, int options) 6164{ 6165 xmlParserInputBufferPtr input; 6166 xmlParserInputPtr stream; 6167 6168 if (ctxt == NULL) 6169 return (NULL); 6170 if (buffer == NULL) 6171 return (NULL); 6172 6173 htmlCtxtReset(ctxt); 6174 6175 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 6176 if (input == NULL) { 6177 return(NULL); 6178 } 6179 6180 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6181 if (stream == NULL) { 6182 xmlFreeParserInputBuffer(input); 6183 return(NULL); 6184 } 6185 6186 inputPush(ctxt, stream); 6187 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6188} 6189 6190/** 6191 * htmlCtxtReadFd: 6192 * @ctxt: an HTML parser context 6193 * @fd: an open file descriptor 6194 * @URL: the base URL to use for the document 6195 * @encoding: the document encoding, or NULL 6196 * @options: a combination of htmlParserOption(s) 6197 * 6198 * parse an XML from a file descriptor and build a tree. 
6199 * This reuses the existing @ctxt parser context 6200 * 6201 * Returns the resulting document tree 6202 */ 6203htmlDocPtr 6204htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 6205 const char *URL, const char *encoding, int options) 6206{ 6207 xmlParserInputBufferPtr input; 6208 xmlParserInputPtr stream; 6209 6210 if (fd < 0) 6211 return (NULL); 6212 if (ctxt == NULL) 6213 return (NULL); 6214 6215 htmlCtxtReset(ctxt); 6216 6217 6218 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6219 if (input == NULL) 6220 return (NULL); 6221 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6222 if (stream == NULL) { 6223 xmlFreeParserInputBuffer(input); 6224 return (NULL); 6225 } 6226 inputPush(ctxt, stream); 6227 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6228} 6229 6230/** 6231 * htmlCtxtReadIO: 6232 * @ctxt: an HTML parser context 6233 * @ioread: an I/O read function 6234 * @ioclose: an I/O close function 6235 * @ioctx: an I/O handler 6236 * @URL: the base URL to use for the document 6237 * @encoding: the document encoding, or NULL 6238 * @options: a combination of htmlParserOption(s) 6239 * 6240 * parse an HTML document from I/O functions and source and build a tree. 
6241 * This reuses the existing @ctxt parser context 6242 * 6243 * Returns the resulting document tree 6244 */ 6245htmlDocPtr 6246htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 6247 xmlInputCloseCallback ioclose, void *ioctx, 6248 const char *URL, 6249 const char *encoding, int options) 6250{ 6251 xmlParserInputBufferPtr input; 6252 xmlParserInputPtr stream; 6253 6254 if (ioread == NULL) 6255 return (NULL); 6256 if (ctxt == NULL) 6257 return (NULL); 6258 6259 htmlCtxtReset(ctxt); 6260 6261 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6262 XML_CHAR_ENCODING_NONE); 6263 if (input == NULL) 6264 return (NULL); 6265 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6266 if (stream == NULL) { 6267 xmlFreeParserInputBuffer(input); 6268 return (NULL); 6269 } 6270 inputPush(ctxt, stream); 6271 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6272} 6273 6274#define bottom_HTMLparser 6275#include "elfgcchack.h" 6276#endif /* LIBXML_HTML_ENABLED */ 6277