1/* 2 * HTMLparser.c : an HTML 4.0 non-verifying parser 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9#define IN_LIBXML 10#include "libxml.h" 11#ifdef LIBXML_HTML_ENABLED 12 13#include <string.h> 14#ifdef HAVE_CTYPE_H 15#include <ctype.h> 16#endif 17#ifdef HAVE_STDLIB_H 18#include <stdlib.h> 19#endif 20#ifdef HAVE_SYS_STAT_H 21#include <sys/stat.h> 22#endif 23#ifdef HAVE_FCNTL_H 24#include <fcntl.h> 25#endif 26#ifdef HAVE_UNISTD_H 27#include <unistd.h> 28#endif 29#ifdef HAVE_ZLIB_H 30#include <zlib.h> 31#endif 32 33#include <libxml/xmlmemory.h> 34#include <libxml/tree.h> 35#include <libxml/parser.h> 36#include <libxml/parserInternals.h> 37#include <libxml/xmlerror.h> 38#include <libxml/HTMLparser.h> 39#include <libxml/HTMLtree.h> 40#include <libxml/entities.h> 41#include <libxml/encoding.h> 42#include <libxml/valid.h> 43#include <libxml/xmlIO.h> 44#include <libxml/globals.h> 45#include <libxml/uri.h> 46 47#define HTML_MAX_NAMELEN 1000 48#define HTML_PARSER_BIG_BUFFER_SIZE 1000 49#define HTML_PARSER_BUFFER_SIZE 100 50 51/* #define DEBUG */ 52/* #define DEBUG_PUSH */ 53 54static int htmlOmittedDefaultValue = 1; 55 56xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 57 xmlChar end, xmlChar end2, xmlChar end3); 58static void htmlParseComment(htmlParserCtxtPtr ctxt); 59 60/************************************************************************ 61 * * 62 * Some factorized error routines * 63 * * 64 ************************************************************************/ 65 66/** 67 * htmlErrMemory: 68 * @ctxt: an HTML parser context 69 * @extra: extra informations 70 * 71 * Handle a redefinition of attribute error 72 */ 73static void 74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 75{ 76 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 77 (ctxt->instate == XML_PARSER_EOF)) 78 return; 79 if (ctxt != NULL) { 80 ctxt->errNo = XML_ERR_NO_MEMORY; 81 ctxt->instate = XML_PARSER_EOF; 82 ctxt->disableSAX = 1; 83 
} 84 if (extra) 85 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 86 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 87 NULL, NULL, 0, 0, 88 "Memory allocation failed : %s\n", extra); 89 else 90 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 91 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 92 NULL, NULL, 0, 0, "Memory allocation failed\n"); 93} 94 95/** 96 * htmlParseErr: 97 * @ctxt: an HTML parser context 98 * @error: the error number 99 * @msg: the error message 100 * @str1: string infor 101 * @str2: string infor 102 * 103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints 104 */ 105static void 106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 107 const char *msg, const xmlChar *str1, const xmlChar *str2) 108{ 109 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 110 (ctxt->instate == XML_PARSER_EOF)) 111 return; 112 if (ctxt != NULL) 113 ctxt->errNo = error; 114 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 115 XML_ERR_ERROR, NULL, 0, 116 (const char *) str1, (const char *) str2, 117 NULL, 0, 0, 118 msg, str1, str2); 119 if (ctxt != NULL) 120 ctxt->wellFormed = 0; 121} 122 123/** 124 * htmlParseErrInt: 125 * @ctxt: an HTML parser context 126 * @error: the error number 127 * @msg: the error message 128 * @val: integer info 129 * 130 * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints 131 */ 132static void 133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 134 const char *msg, int val) 135{ 136 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 137 (ctxt->instate == XML_PARSER_EOF)) 138 return; 139 if (ctxt != NULL) 140 ctxt->errNo = error; 141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 142 XML_ERR_ERROR, NULL, 0, NULL, NULL, 143 NULL, val, 0, msg, val); 144 if (ctxt != NULL) 145 ctxt->wellFormed = 0; 146} 147 148/************************************************************************ 149 * * 150 * Parser stacks related functions and macros * 151 * * 152 ************************************************************************/ 153 154/** 155 * htmlnamePush: 156 * @ctxt: an HTML parser context 157 * @value: the element name 158 * 159 * Pushes a new element name on top of the name stack 160 * 161 * Returns 0 in case of error, the index in the stack otherwise 162 */ 163static int 164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 165{ 166 if (ctxt->nameNr >= ctxt->nameMax) { 167 ctxt->nameMax *= 2; 168 ctxt->nameTab = (const xmlChar * *) 169 xmlRealloc((xmlChar * *)ctxt->nameTab, 170 ctxt->nameMax * 171 sizeof(ctxt->nameTab[0])); 172 if (ctxt->nameTab == NULL) { 173 htmlErrMemory(ctxt, NULL); 174 return (0); 175 } 176 } 177 ctxt->nameTab[ctxt->nameNr] = value; 178 ctxt->name = value; 179 return (ctxt->nameNr++); 180} 181/** 182 * htmlnamePop: 183 * @ctxt: an HTML parser context 184 * 185 * Pops the top element name from the name stack 186 * 187 * Returns the name just removed 188 */ 189static const xmlChar * 190htmlnamePop(htmlParserCtxtPtr ctxt) 191{ 192 const xmlChar *ret; 193 194 if (ctxt->nameNr <= 0) 195 return (NULL); 196 ctxt->nameNr--; 197 if (ctxt->nameNr < 0) 198 return (NULL); 199 if (ctxt->nameNr > 0) 200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 201 else 202 ctxt->name = NULL; 203 ret = ctxt->nameTab[ctxt->nameNr]; 204 
ctxt->nameTab[ctxt->nameNr] = NULL; 205 return (ret); 206} 207 208/* 209 * Macros for accessing the content. Those should be used only by the parser, 210 * and not exported. 211 * 212 * Dirty macros, i.e. one need to make assumption on the context to use them 213 * 214 * CUR_PTR return the current pointer to the xmlChar to be parsed. 215 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled 216 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled 217 * in UNICODE mode. This should be used internally by the parser 218 * only to compare to ASCII values otherwise it would break when 219 * running with UTF-8 encoding. 220 * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only 221 * to compare on ASCII based substring. 222 * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR 223 * it should be used only to compare on ASCII based substring. 224 * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined 225 * strings without newlines within the parser. 226 * 227 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding 228 * 229 * CURRENT Returns the current char value, with the full decoding of 230 * UTF-8 if we are using this mode. It returns an int. 231 * NEXT Skip to the next character, this does the proper decoding 232 * in UTF-8 mode. It also pop-up unfinished entities on the fly. 233 * NEXTL(l) Skip the current unicode character of l xmlChars long. 
234 * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 235 */ 236 237#define UPPER (toupper(*ctxt->input->cur)) 238 239#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 240 241#define NXT(val) ctxt->input->cur[(val)] 242 243#define UPP(val) (toupper(ctxt->input->cur[(val)])) 244 245#define CUR_PTR ctxt->input->cur 246 247#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 248 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 249 xmlParserInputShrink(ctxt->input) 250 251#define GROW if ((ctxt->progressive == 0) && \ 252 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 253 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 254 255#define CURRENT ((int) (*ctxt->input->cur)) 256 257#define SKIP_BLANKS htmlSkipBlankChars(ctxt) 258 259/* Inported from XML */ 260 261/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 262#define CUR ((int) (*ctxt->input->cur)) 263#define NEXT xmlNextChar(ctxt) 264 265#define RAW (ctxt->token ? 
-1 : (*ctxt->input->cur)) 266#define NXT(val) ctxt->input->cur[(val)] 267#define CUR_PTR ctxt->input->cur 268 269 270#define NEXTL(l) do { \ 271 if (*(ctxt->input->cur) == '\n') { \ 272 ctxt->input->line++; ctxt->input->col = 1; \ 273 } else ctxt->input->col++; \ 274 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 275 } while (0) 276 277/************ 278 \ 279 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 280 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 281 ************/ 282 283#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 284#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 285 286#define COPY_BUF(l,b,i,v) \ 287 if (l == 1) b[i++] = (xmlChar) v; \ 288 else i += xmlCopyChar(l,&b[i],v) 289 290/** 291 * htmlCurrentChar: 292 * @ctxt: the HTML parser context 293 * @len: pointer to the length of the char read 294 * 295 * The current char value, if using UTF-8 this may actually span multiple 296 * bytes in the input buffer. Implement the end of line normalization: 297 * 2.11 End-of-Line Handling 298 * If the encoding is unspecified, in the case we find an ISO-Latin-1 299 * char, then the encoding converter is plugged in automatically. 300 * 301 * Returns the current char value and its length 302 */ 303 304static int 305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 306 if (ctxt->instate == XML_PARSER_EOF) 307 return(0); 308 309 if (ctxt->token != 0) { 310 *len = 0; 311 return(ctxt->token); 312 } 313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 314 /* 315 * We are supposed to handle UTF8, check it's valid 316 * From rfc2044: encoding of the Unicode values on UTF-8: 317 * 318 * UCS-4 range (hex.) 
UTF-8 octet sequence (binary) 319 * 0000 0000-0000 007F 0xxxxxxx 320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 321 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 322 * 323 * Check for the 0x110000 limit too 324 */ 325 const unsigned char *cur = ctxt->input->cur; 326 unsigned char c; 327 unsigned int val; 328 329 c = *cur; 330 if (c & 0x80) { 331 if (cur[1] == 0) 332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 333 if ((cur[1] & 0xc0) != 0x80) 334 goto encoding_error; 335 if ((c & 0xe0) == 0xe0) { 336 337 if (cur[2] == 0) 338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 339 if ((cur[2] & 0xc0) != 0x80) 340 goto encoding_error; 341 if ((c & 0xf0) == 0xf0) { 342 if (cur[3] == 0) 343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 344 if (((c & 0xf8) != 0xf0) || 345 ((cur[3] & 0xc0) != 0x80)) 346 goto encoding_error; 347 /* 4-byte code */ 348 *len = 4; 349 val = (cur[0] & 0x7) << 18; 350 val |= (cur[1] & 0x3f) << 12; 351 val |= (cur[2] & 0x3f) << 6; 352 val |= cur[3] & 0x3f; 353 } else { 354 /* 3-byte code */ 355 *len = 3; 356 val = (cur[0] & 0xf) << 12; 357 val |= (cur[1] & 0x3f) << 6; 358 val |= cur[2] & 0x3f; 359 } 360 } else { 361 /* 2-byte code */ 362 *len = 2; 363 val = (cur[0] & 0x1f) << 6; 364 val |= cur[1] & 0x3f; 365 } 366 if (!IS_CHAR(val)) { 367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 368 "Char 0x%X out of allowed range\n", val); 369 } 370 return(val); 371 } else { 372 /* 1-byte code */ 373 *len = 1; 374 return((int) *ctxt->input->cur); 375 } 376 } 377 /* 378 * Assume it's a fixed length encoding (1) with 379 * a compatible encoding for the ASCII set, since 380 * XML constructs only use < 128 chars 381 */ 382 *len = 1; 383 if ((int) *ctxt->input->cur < 0x80) 384 return((int) *ctxt->input->cur); 385 386 /* 387 * Humm this is bad, do an automatic flow conversion 388 */ 389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 390 ctxt->charset = XML_CHAR_ENCODING_UTF8; 391 return(xmlCurrentChar(ctxt, len)); 392 393encoding_error: 394 /* 395 * If we detect an 
UTF8 error that probably mean that the 396 * input encoding didn't get properly advertized in the 397 * declaration header. Report the error and switch the encoding 398 * to ISO-Latin-1 (if you don't like this policy, just declare the 399 * encoding !) 400 */ 401 { 402 char buffer[150]; 403 404 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 405 ctxt->input->cur[0], ctxt->input->cur[1], 406 ctxt->input->cur[2], ctxt->input->cur[3]); 407 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 408 "Input is not proper UTF-8, indicate encoding !\n", 409 BAD_CAST buffer, NULL); 410 } 411 412 ctxt->charset = XML_CHAR_ENCODING_8859_1; 413 *len = 1; 414 return((int) *ctxt->input->cur); 415} 416 417/** 418 * htmlSkipBlankChars: 419 * @ctxt: the HTML parser context 420 * 421 * skip all blanks character found at that point in the input streams. 422 * 423 * Returns the number of space chars skipped 424 */ 425 426static int 427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 428 int res = 0; 429 430 while (IS_BLANK_CH(*(ctxt->input->cur))) { 431 if ((*ctxt->input->cur == 0) && 432 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 433 xmlPopInput(ctxt); 434 } else { 435 if (*(ctxt->input->cur) == '\n') { 436 ctxt->input->line++; ctxt->input->col = 1; 437 } else ctxt->input->col++; 438 ctxt->input->cur++; 439 ctxt->nbChars++; 440 if (*ctxt->input->cur == 0) 441 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 442 } 443 res++; 444 } 445 return(res); 446} 447 448 449 450/************************************************************************ 451 * * 452 * The list of HTML elements and their properties * 453 * * 454 ************************************************************************/ 455 456/* 457 * Start Tag: 1 means the start tag can be ommited 458 * End Tag: 1 means the end tag can be ommited 459 * 2 means it's forbidden (empty elements) 460 * 3 means the tag is stylistic and should be closed easily 461 * Depr: this element is deprecated 462 * DTD: 1 means that this 
/* element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *      , subElements , impliedsubelt , Attributes, userdata
 */

/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma-separated list of string
 * literals and is meant to be dropped into an array initializer.
 * Groups MUST be joined with commas: two adjacent string literals are
 * concatenated by the compiler into a single bogus name (for instance
 * "small" "em" would become "smallem").
 *
 * PCDATA and MODIFIER expand to nothing, so they are written without a
 * separating comma.
 *
 * The NB_* macros give the number of names in the matching group.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 15
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* Every table below is a NULL-terminated list of names. */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
        "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
        "tabindex", "onfocus", "onblur", NULL } ;
static const char* target_attr[] = { "target", NULL } ;
static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* alt_attr[] = { "alt", NULL } ;
static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* href_attrs[] = { "href", NULL } ;
static const char* clear_attrs[] = { "clear", NULL } ;
static const char* inline_p[] = { INLINE, "p", NULL } ;
static const char* flow_param[] = { FLOW, "param", NULL } ;
static const char* applet_attrs[] = { COREATTRS , "codebase",
        "archive", "alt", "name", "height", "width", "align",
        "hspace", "vspace", NULL } ;
static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
        "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* basefont_attrs[] =
        { "id", "size", "color", "face", NULL } ;
static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* body_depr[] = { "background", "bgcolor", "text",
        "link", "vlink", "alink", NULL } ;
static const char* button_attrs[] = { ATTRS, "name", "value", "type",
        "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* col_elt[] = { "col", NULL } ;
static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* dl_contents[] = { "dt", "dd", NULL } ;
static const char* compact_attr[] = { "compact", NULL } ;
static const char* label_attr[] = { "label", NULL } ;
/* The NULL terminator is required: readers walk the list until NULL. */
static const char* fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* head_attrs[] = { I18N, "profile", NULL } ;
static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* version_attr[] = { "version", NULL } ;
static const char* html_content[] = { "head", "body", "frameset", NULL } ;
static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* align_attr[] = { "align", NULL } ;
static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* map_contents[] = { BLOCK, "area", NULL } ;
static const char* name_attr[] = { "name", NULL } ;
static const char* action_attr[] = { "action", NULL } ;
static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* content_attr[] = { "content", NULL } ;
static const char* type_attr[] = { "type", NULL } ;
static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* object_contents[] = { FLOW, "param", NULL } ;
static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
592static const char* option_elt[] = { "option", NULL } ; 593static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 594static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; 595static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 596static const char* width_attr[] = { "width", NULL } ; 597static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 598static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 599static const char* language_attr[] = { "language", NULL } ; 600static const char* select_content[] = { "optgroup", "option", NULL } ; 601static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 602static const char* style_attrs[] = { I18N, "media", "title", NULL } ; 603static const char* table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 604static const char* table_depr[] = { "align", "bgcolor", NULL } ; 605static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 606static const char* tr_elt[] = { "tr", NULL } ; 607static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 608static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 609static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 610static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 611static const char* tr_contents[] = { "th", "td", NULL } ; 612static const char* bgcolor_attr[] = { "bgcolor", NULL } ; 613static const char* li_elt[] = { "li", NULL } ; 614static 
const char* ul_depr[] = { "type", "compact", NULL} ; 615static const char* dir_attr[] = { "dir", NULL} ; 616 617#define DECL (const char**) 618 619static const htmlElemDesc 620html40ElementTable[] = { 621{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 622 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 623}, 624{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 625 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 626}, 627{ "acronym", 0, 0, 0, 0, 0, 0, 1, "", 628 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 629}, 630{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 631 DECL inline_p , NULL , DECL html_attrs, NULL, NULL 632}, 633{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 634 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 635}, 636{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 637 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 638}, 639{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 640 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 641}, 642{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 643 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs 644}, 645{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 646 EMPTY , NULL , NULL, DECL basefont_attrs, NULL 647}, 648{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 649 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 650}, 651{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 652 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 653}, 654{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 655 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 656}, 657{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 658 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 659}, 660{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 661 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 662}, 663{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 664 DECL html_flow MODIFIER , NULL , DECL 
button_attrs, NULL, NULL 665}, 666{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 667 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 668}, 669{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 670 DECL html_flow , NULL , NULL, DECL html_attrs, NULL 671}, 672{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 673 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 674}, 675{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 676 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 677}, 678{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 679 EMPTY , NULL , DECL col_attrs , NULL, NULL 680}, 681{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 682 DECL col_elt , "col" , DECL col_attrs , NULL, NULL 683}, 684{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 685 DECL html_flow , NULL , DECL html_attrs, NULL, NULL 686}, 687{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 688 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL 689}, 690{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 691 DECL html_inline , NULL , DECL html_attrs, NULL, NULL 692}, 693{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", 694 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 695}, 696{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 697 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 698}, 699{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 700 DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL 701}, 702{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", 703 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 704}, 705{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 706 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 707}, 708{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 709 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 710}, 711{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", 712 DECL html_inline, NULL, NULL, DECL font_attrs, NULL 713}, 714{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form 
", 715 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 716}, 717{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 718 EMPTY, NULL, NULL, DECL frame_attrs, NULL 719}, 720{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 721 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 722}, 723{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 724 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 725}, 726{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", 727 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 728}, 729{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 730 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 731}, 732{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", 733 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 734}, 735{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 736 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 737}, 738{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ", 739 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 740}, 741{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 742 DECL head_contents, NULL, DECL head_attrs, NULL, NULL 743}, 744{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 745 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 746}, 747{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 748 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 749}, 750{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 751 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 752}, 753{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 754 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 755}, 756{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 757 EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs 758}, 759{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 760 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 761}, 762{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 763 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 764}, 765{ 
"isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 766 EMPTY, NULL, NULL, DECL prompt_attrs, NULL 767}, 768{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 769 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 770}, 771{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 772 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 773}, 774{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 775 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 776}, 777{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 778 DECL html_flow, NULL, DECL html_attrs, NULL, NULL 779}, 780{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", 781 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 782}, 783{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 784 DECL map_contents , NULL, DECL html_attrs , NULL, name_attr 785}, 786{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ", 787 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL 788}, 789{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 790 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 791}, 792{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 793 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 794}, 795{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 796 DECL html_flow, "div", DECL html_attrs, NULL, NULL 797}, 798{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", 799 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 800}, 801{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 802 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 803}, 804{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 805 option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr 806}, 807{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 808 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 809}, 810{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph 
", 811 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 812}, 813{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 814 EMPTY, NULL, DECL param_attrs, NULL, name_attr 815}, 816{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", 817 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 818}, 819{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 820 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 821}, 822{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 823 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 824}, 825{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 826 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 827}, 828{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 829 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 830}, 831{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ", 832 DECL select_content, NULL, DECL select_attrs, NULL, NULL 833}, 834{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 835 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 836}, 837{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 838 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 839}, 840{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 841 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 842}, 843{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 844 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 845}, 846{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ", 847 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 848}, 849{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 850 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 851}, 852{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", 853 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 854}, 855{ "table", 0, 0, 0, 0, 0, 0, 0, "", 856 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 857}, 858{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 859 DECL tr_elt , "tr" , DECL 
talign_attrs, NULL, NULL 860}, 861{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 862 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 863}, 864{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 865 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 866}, 867{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 868 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 869}, 870{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", 871 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 872}, 873{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 874 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 875}, 876{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 877 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 878}, 879{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ", 880 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL 881}, 882{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", 883 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 884}, 885{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", 886 DECL html_inline, NULL, NULL, DECL html_attrs, NULL 887}, 888{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", 889 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL 890}, 891{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", 892 DECL html_inline, NULL, DECL html_attrs, NULL, NULL 893} 894}; 895 896/* 897 * start tags that imply the end of current element 898 */ 899static const char *htmlStartClose[] = { 900"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", 901 "dl", "ul", "ol", "menu", "dir", "address", "pre", 902 "listing", "xmp", "head", NULL, 903"head", "p", NULL, 904"title", "p", NULL, 905"body", "head", "style", "link", "title", "p", NULL, 906"frameset", "head", "style", "link", "title", "p", NULL, 907"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", 908 "pre", "listing", "xmp", "head", "li", NULL, 909"hr", "p", "head", NULL, 910"h1", "p", 
"head", NULL, 911"h2", "p", "head", NULL, 912"h3", "p", "head", NULL, 913"h4", "p", "head", NULL, 914"h5", "p", "head", NULL, 915"h6", "p", "head", NULL, 916"dir", "p", "head", NULL, 917"address", "p", "head", "ul", NULL, 918"pre", "p", "head", "ul", NULL, 919"listing", "p", "head", NULL, 920"xmp", "p", "head", NULL, 921"blockquote", "p", "head", NULL, 922"dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 923 "xmp", "head", NULL, 924"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 925 "head", "dd", NULL, 926"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 927 "head", "dt", NULL, 928"ul", "p", "head", "ol", "menu", "dir", "address", "pre", 929 "listing", "xmp", NULL, 930"ol", "p", "head", "ul", NULL, 931"menu", "p", "head", "ul", NULL, 932"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL, 933"div", "p", "head", NULL, 934"noscript", "p", "head", NULL, 935"center", "font", "b", "i", "p", "head", NULL, 936"a", "a", NULL, 937"caption", "p", NULL, 938"colgroup", "caption", "colgroup", "col", "p", NULL, 939"col", "caption", "col", "p", NULL, 940"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 941 "listing", "xmp", "a", NULL, 942"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 943"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 944"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 945"thead", "caption", "col", "colgroup", NULL, 946"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", 947 "tbody", "p", NULL, 948"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 949 "tfoot", "tbody", "p", NULL, 950"optgroup", "option", NULL, 951"option", "option", NULL, 952"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 953 "pre", "listing", "xmp", "a", NULL, 954NULL 955}; 956 957/* 958 * The list of HTML elements which are supposed not to have 959 * CDATA content and where a p element will be implied 960 * 961 * TODO: extend that list 
by reading the HTML SGML DTD on 962 * implied paragraph 963 */ 964static const char *htmlNoContentElements[] = { 965 "html", 966 "head", 967 NULL 968}; 969 970/* 971 * The list of HTML attributes which are of content %Script; 972 * NOTE: when adding ones, check htmlIsScriptAttribute() since 973 * it assumes the name starts with 'on' 974 */ 975static const char *htmlScriptAttributes[] = { 976 "onclick", 977 "ondblclick", 978 "onmousedown", 979 "onmouseup", 980 "onmouseover", 981 "onmousemove", 982 "onmouseout", 983 "onkeypress", 984 "onkeydown", 985 "onkeyup", 986 "onload", 987 "onunload", 988 "onfocus", 989 "onblur", 990 "onsubmit", 991 "onrest", 992 "onchange", 993 "onselect" 994}; 995 996/* 997 * This table is used by the htmlparser to know what to do with 998 * broken html pages. By assigning different priorities to different 999 * elements the parser can decide how to handle extra endtags. 1000 * Endtags are only allowed to close elements with lower or equal 1001 * priority. 1002 */ 1003 1004typedef struct { 1005 const char *name; 1006 int priority; 1007} elementPriority; 1008 1009static const elementPriority htmlEndPriority[] = { 1010 {"div", 150}, 1011 {"td", 160}, 1012 {"th", 160}, 1013 {"tr", 170}, 1014 {"thead", 180}, 1015 {"tbody", 180}, 1016 {"tfoot", 180}, 1017 {"table", 190}, 1018 {"head", 200}, 1019 {"body", 200}, 1020 {"html", 220}, 1021 {NULL, 100} /* Default priority */ 1022}; 1023 1024static const char** htmlStartCloseIndex[100]; 1025static int htmlStartCloseIndexinitialized = 0; 1026 1027/************************************************************************ 1028 * * 1029 * functions to handle HTML specific data * 1030 * * 1031 ************************************************************************/ 1032 1033/** 1034 * htmlInitAutoClose: 1035 * 1036 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1037 * This is not reentrant. 
Call xmlInitParser() once before processing in 1038 * case of use in multithreaded programs. 1039 */ 1040void 1041htmlInitAutoClose(void) { 1042 int indx, i = 0; 1043 1044 if (htmlStartCloseIndexinitialized) return; 1045 1046 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 1047 indx = 0; 1048 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 1049 htmlStartCloseIndex[indx++] = &htmlStartClose[i]; 1050 while (htmlStartClose[i] != NULL) i++; 1051 i++; 1052 } 1053 htmlStartCloseIndexinitialized = 1; 1054} 1055 1056/** 1057 * htmlTagLookup: 1058 * @tag: The tag name in lowercase 1059 * 1060 * Lookup the HTML tag in the ElementTable 1061 * 1062 * Returns the related htmlElemDescPtr or NULL if not found. 1063 */ 1064const htmlElemDesc * 1065htmlTagLookup(const xmlChar *tag) { 1066 unsigned int i; 1067 1068 for (i = 0; i < (sizeof(html40ElementTable) / 1069 sizeof(html40ElementTable[0]));i++) { 1070 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1071 return((htmlElemDescPtr) &html40ElementTable[i]); 1072 } 1073 return(NULL); 1074} 1075 1076/** 1077 * htmlGetEndPriority: 1078 * @name: The name of the element to look up the priority for. 1079 * 1080 * Return value: The "endtag" priority. 1081 **/ 1082static int 1083htmlGetEndPriority (const xmlChar *name) { 1084 int i = 0; 1085 1086 while ((htmlEndPriority[i].name != NULL) && 1087 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1088 i++; 1089 1090 return(htmlEndPriority[i].priority); 1091} 1092 1093 1094/** 1095 * htmlCheckAutoClose: 1096 * @newtag: The new tag name 1097 * @oldtag: The old tag name 1098 * 1099 * Checks whether the new tag is one of the registered valid tags for 1100 * closing old. 1101 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1102 * 1103 * Returns 0 if no, 1 if yes. 
1104 */ 1105static int 1106htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1107{ 1108 int i, indx; 1109 const char **closed = NULL; 1110 1111 if (htmlStartCloseIndexinitialized == 0) 1112 htmlInitAutoClose(); 1113 1114 /* inefficient, but not a big deal */ 1115 for (indx = 0; indx < 100; indx++) { 1116 closed = htmlStartCloseIndex[indx]; 1117 if (closed == NULL) 1118 return (0); 1119 if (xmlStrEqual(BAD_CAST * closed, newtag)) 1120 break; 1121 } 1122 1123 i = closed - htmlStartClose; 1124 i++; 1125 while (htmlStartClose[i] != NULL) { 1126 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 1127 return (1); 1128 } 1129 i++; 1130 } 1131 return (0); 1132} 1133 1134/** 1135 * htmlAutoCloseOnClose: 1136 * @ctxt: an HTML parser context 1137 * @newtag: The new tag name 1138 * @force: force the tag closure 1139 * 1140 * The HTML DTD allows an ending tag to implicitly close other tags. 1141 */ 1142static void 1143htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1144{ 1145 const htmlElemDesc *info; 1146 int i, priority; 1147 1148 priority = htmlGetEndPriority(newtag); 1149 1150 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1151 1152 if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1153 break; 1154 /* 1155 * A missplaced endtag can only close elements with lower 1156 * or equal priority, so if we find an element with higher 1157 * priority before we find an element with 1158 * matching name, we just ignore this endtag 1159 */ 1160 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1161 return; 1162 } 1163 if (i < 0) 1164 return; 1165 1166 while (!xmlStrEqual(newtag, ctxt->name)) { 1167 info = htmlTagLookup(ctxt->name); 1168 if ((info != NULL) && (info->endTag == 3)) { 1169 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1170 "Opening and ending tag mismatch: %s and %s\n", 1171 newtag, ctxt->name); 1172 } 1173 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1174 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1175 
htmlnamePop(ctxt); 1176 } 1177} 1178 1179/** 1180 * htmlAutoCloseOnEnd: 1181 * @ctxt: an HTML parser context 1182 * 1183 * Close all remaining tags at the end of the stream 1184 */ 1185static void 1186htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1187{ 1188 int i; 1189 1190 if (ctxt->nameNr == 0) 1191 return; 1192 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1193 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1194 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1195 htmlnamePop(ctxt); 1196 } 1197} 1198 1199/** 1200 * htmlAutoClose: 1201 * @ctxt: an HTML parser context 1202 * @newtag: The new tag name or NULL 1203 * 1204 * The HTML DTD allows a tag to implicitly close other tags. 1205 * The list is kept in htmlStartClose array. This function is 1206 * called when a new tag has been detected and generates the 1207 * appropriates closes if possible/needed. 1208 * If newtag is NULL this mean we are at the end of the resource 1209 * and we should check 1210 */ 1211static void 1212htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1213{ 1214 while ((newtag != NULL) && (ctxt->name != NULL) && 1215 (htmlCheckAutoClose(newtag, ctxt->name))) { 1216 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1217 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1218 htmlnamePop(ctxt); 1219 } 1220 if (newtag == NULL) { 1221 htmlAutoCloseOnEnd(ctxt); 1222 return; 1223 } 1224 while ((newtag == NULL) && (ctxt->name != NULL) && 1225 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1226 (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1227 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1228 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1229 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1230 htmlnamePop(ctxt); 1231 } 1232} 1233 1234/** 1235 * htmlAutoCloseTag: 1236 * @doc: the HTML document 1237 * @name: The tag name 1238 * @elem: the HTML element 1239 * 1240 * The HTML DTD allows a tag to implicitly close other tags. 
1241 * The list is kept in htmlStartClose array. This function checks 1242 * if the element or one of it's children would autoclose the 1243 * given tag. 1244 * 1245 * Returns 1 if autoclose, 0 otherwise 1246 */ 1247int 1248htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1249 htmlNodePtr child; 1250 1251 if (elem == NULL) return(1); 1252 if (xmlStrEqual(name, elem->name)) return(0); 1253 if (htmlCheckAutoClose(elem->name, name)) return(1); 1254 child = elem->children; 1255 while (child != NULL) { 1256 if (htmlAutoCloseTag(doc, name, child)) return(1); 1257 child = child->next; 1258 } 1259 return(0); 1260} 1261 1262/** 1263 * htmlIsAutoClosed: 1264 * @doc: the HTML document 1265 * @elem: the HTML element 1266 * 1267 * The HTML DTD allows a tag to implicitly close other tags. 1268 * The list is kept in htmlStartClose array. This function checks 1269 * if a tag is autoclosed by one of it's child 1270 * 1271 * Returns 1 if autoclosed, 0 otherwise 1272 */ 1273int 1274htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1275 htmlNodePtr child; 1276 1277 if (elem == NULL) return(1); 1278 child = elem->children; 1279 while (child != NULL) { 1280 if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1281 child = child->next; 1282 } 1283 return(0); 1284} 1285 1286/** 1287 * htmlCheckImplied: 1288 * @ctxt: an HTML parser context 1289 * @newtag: The new tag name 1290 * 1291 * The HTML DTD allows a tag to exists only implicitly 1292 * called when a new tag has been detected and generates the 1293 * appropriates implicit tags if missing 1294 */ 1295static void 1296htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1297 if (!htmlOmittedDefaultValue) 1298 return; 1299 if (xmlStrEqual(newtag, BAD_CAST"html")) 1300 return; 1301 if (ctxt->nameNr <= 0) { 1302 htmlnamePush(ctxt, BAD_CAST"html"); 1303 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1304 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1305 } 1306 if 
((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1307 return; 1308 if ((ctxt->nameNr <= 1) && 1309 ((xmlStrEqual(newtag, BAD_CAST"script")) || 1310 (xmlStrEqual(newtag, BAD_CAST"style")) || 1311 (xmlStrEqual(newtag, BAD_CAST"meta")) || 1312 (xmlStrEqual(newtag, BAD_CAST"link")) || 1313 (xmlStrEqual(newtag, BAD_CAST"title")) || 1314 (xmlStrEqual(newtag, BAD_CAST"base")))) { 1315 /* 1316 * dropped OBJECT ... i you put it first BODY will be 1317 * assumed ! 1318 */ 1319 htmlnamePush(ctxt, BAD_CAST"head"); 1320 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1321 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1322 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1323 (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1324 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1325 int i; 1326 for (i = 0;i < ctxt->nameNr;i++) { 1327 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1328 return; 1329 } 1330 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1331 return; 1332 } 1333 } 1334 1335 htmlnamePush(ctxt, BAD_CAST"body"); 1336 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1337 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1338 } 1339} 1340 1341/** 1342 * htmlCheckParagraph 1343 * @ctxt: an HTML parser context 1344 * 1345 * Check whether a p element need to be implied before inserting 1346 * characters in the current element. 1347 * 1348 * Returns 1 if a paragraph has been inserted, 0 if not and -1 1349 * in case of error. 
1350 */ 1351 1352static int 1353htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1354 const xmlChar *tag; 1355 int i; 1356 1357 if (ctxt == NULL) 1358 return(-1); 1359 tag = ctxt->name; 1360 if (tag == NULL) { 1361 htmlAutoClose(ctxt, BAD_CAST"p"); 1362 htmlCheckImplied(ctxt, BAD_CAST"p"); 1363 htmlnamePush(ctxt, BAD_CAST"p"); 1364 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1365 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1366 return(1); 1367 } 1368 if (!htmlOmittedDefaultValue) 1369 return(0); 1370 for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1371 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1372 htmlAutoClose(ctxt, BAD_CAST"p"); 1373 htmlCheckImplied(ctxt, BAD_CAST"p"); 1374 htmlnamePush(ctxt, BAD_CAST"p"); 1375 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1376 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1377 return(1); 1378 } 1379 } 1380 return(0); 1381} 1382 1383/** 1384 * htmlIsScriptAttribute: 1385 * @name: an attribute name 1386 * 1387 * Check if an attribute is of content type Script 1388 * 1389 * Returns 1 is the attribute is a script 0 otherwise 1390 */ 1391int 1392htmlIsScriptAttribute(const xmlChar *name) { 1393 unsigned int i; 1394 1395 if (name == NULL) 1396 return(0); 1397 /* 1398 * all script attributes start with 'on' 1399 */ 1400 if ((name[0] != 'o') || (name[1] != 'n')) 1401 return(0); 1402 for (i = 0; 1403 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1404 i++) { 1405 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1406 return(1); 1407 } 1408 return(0); 1409} 1410 1411/************************************************************************ 1412 * * 1413 * The list of HTML predefined entities * 1414 * * 1415 ************************************************************************/ 1416 1417 1418static const htmlEntityDesc html40EntitiesTable[] = { 1419/* 1420 * the 4 absolute ones, plus apostrophe. 
1421 */ 1422{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1423{ 38, "amp", "ampersand, U+0026 ISOnum" }, 1424{ 39, "apos", "single quote" }, 1425{ 60, "lt", "less-than sign, U+003C ISOnum" }, 1426{ 62, "gt", "greater-than sign, U+003E ISOnum" }, 1427 1428/* 1429 * A bunch still in the 128-255 range 1430 * Replacing them depend really on the charset used. 1431 */ 1432{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1433{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1434{ 162, "cent", "cent sign, U+00A2 ISOnum" }, 1435{ 163, "pound","pound sign, U+00A3 ISOnum" }, 1436{ 164, "curren","currency sign, U+00A4 ISOnum" }, 1437{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1438{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1439{ 167, "sect", "section sign, U+00A7 ISOnum" }, 1440{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1441{ 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1442{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1443{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1444{ 172, "not", "not sign, U+00AC ISOnum" }, 1445{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1446{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1447{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1448{ 176, "deg", "degree sign, U+00B0 ISOnum" }, 1449{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1450{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1451{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1452{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1453{ 181, "micro","micro sign, U+00B5 ISOnum" }, 1454{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1455{ 183, "middot","middle dot = Georgian comma Greek middle 
dot, U+00B7 ISOnum" }, 1456{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1457{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1458{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1459{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1460{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1461{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1462{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1463{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1464{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1465{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1466{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1467{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1468{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1469{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1470{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1471{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1472{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1473{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1474{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1475{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1476{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1477{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" }, 1478{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1479{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF 
ISOlat1" }, 1480{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1481{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1482{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1483{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1484{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1485{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1486{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1487{ 215, "times","multiplication sign, U+00D7 ISOnum" }, 1488{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1489{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1490{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1491{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1492{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1493{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1494{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1495{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1496{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1497{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1498{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1499{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1500{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1501{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1502{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" }, 1503{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1504{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" }, 1505{ 233, "eacute","latin small 
letter e with acute, U+00E9 ISOlat1" }, 1506{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1507{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1508{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1509{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1510{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1511{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1512{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1513{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1514{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1515{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1516{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1517{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1518{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1519{ 247, "divide","division sign, U+00F7 ISOnum" }, 1520{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1521{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1522{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1523{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1524{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1525{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1526{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1527{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1528 1529{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, 1530{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1531{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1532{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1533{ 376, "Yuml", "latin capital letter 
Y with diaeresis, U+0178 ISOlat2" }, 1534 1535/* 1536 * Anything below should really be kept as entities references 1537 */ 1538{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1539 1540{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1541{ 732, "tilde","small tilde, U+02DC ISOdia" }, 1542 1543{ 913, "Alpha","greek capital letter alpha, U+0391" }, 1544{ 914, "Beta", "greek capital letter beta, U+0392" }, 1545{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1546{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1547{ 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1548{ 918, "Zeta", "greek capital letter zeta, U+0396" }, 1549{ 919, "Eta", "greek capital letter eta, U+0397" }, 1550{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1551{ 921, "Iota", "greek capital letter iota, U+0399" }, 1552{ 922, "Kappa","greek capital letter kappa, U+039A" }, 1553{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1554{ 924, "Mu", "greek capital letter mu, U+039C" }, 1555{ 925, "Nu", "greek capital letter nu, U+039D" }, 1556{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1557{ 927, "Omicron","greek capital letter omicron, U+039F" }, 1558{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1559{ 929, "Rho", "greek capital letter rho, U+03A1" }, 1560{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1561{ 932, "Tau", "greek capital letter tau, U+03A4" }, 1562{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1563{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1564{ 935, "Chi", "greek capital letter chi, U+03A7" }, 1565{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1566{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1567 1568{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" }, 1569{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1570{ 947, "gamma","greek small letter gamma, 
U+03B3 ISOgrk3" }, 1571{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1572{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1573{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1574{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1575{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1576{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1577{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1578{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1579{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1580{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1581{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1582{ 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1583{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1584{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1585{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1586{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1587{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1588{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1589{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1590{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1591{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1592{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1593{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1594{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1595{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, 1596 1597{ 8194, "ensp", "en space, U+2002 ISOpub" }, 1598{ 8195, "emsp", "em space, U+2003 ISOpub" }, 1599{ 8201, "thinsp","thin space, U+2009 ISOpub" }, 1600{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, 1601{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1602{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1603{ 8207, "rlm", 
"right-to-left mark, U+200F NEW RFC 2070" }, 1604{ 8211, "ndash","en dash, U+2013 ISOpub" }, 1605{ 8212, "mdash","em dash, U+2014 ISOpub" }, 1606{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1607{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1608{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1609{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1610{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1611{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1612{ 8224, "dagger","dagger, U+2020 ISOpub" }, 1613{ 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1614 1615{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1616{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1617 1618{ 8240, "permil","per mille sign, U+2030 ISOtech" }, 1619 1620{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1621{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1622 1623{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 1624{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 1625 1626{ 8254, "oline","overline = spacing overscore, U+203E NEW" }, 1627{ 8260, "frasl","fraction slash, U+2044 NEW" }, 1628 1629{ 8364, "euro", "euro sign, U+20AC NEW" }, 1630 1631{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 1632{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 1633{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 1634{ 8482, "trade","trade mark sign, U+2122 ISOnum" }, 1635{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 1636{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 1637{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 1638{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 1639{ 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 1640{ 8596, "harr", "left 
right arrow, U+2194 ISOamsa" }, 1641{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 1642{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 1643{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 1644{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 1645{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 1646{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 1647 1648{ 8704, "forall","for all, U+2200 ISOtech" }, 1649{ 8706, "part", "partial differential, U+2202 ISOtech" }, 1650{ 8707, "exist","there exists, U+2203 ISOtech" }, 1651{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 1652{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 1653{ 8712, "isin", "element of, U+2208 ISOtech" }, 1654{ 8713, "notin","not an element of, U+2209 ISOtech" }, 1655{ 8715, "ni", "contains as member, U+220B ISOtech" }, 1656{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 1657{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 1658{ 8722, "minus","minus sign, U+2212 ISOtech" }, 1659{ 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 1660{ 8730, "radic","square root = radical sign, U+221A ISOtech" }, 1661{ 8733, "prop", "proportional to, U+221D ISOtech" }, 1662{ 8734, "infin","infinity, U+221E ISOtech" }, 1663{ 8736, "ang", "angle, U+2220 ISOamso" }, 1664{ 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 1665{ 8744, "or", "logical or = vee, U+2228 ISOtech" }, 1666{ 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 1667{ 8746, "cup", "union = cup, U+222A ISOtech" }, 1668{ 8747, "int", "integral, U+222B ISOtech" }, 1669{ 8756, "there4","therefore, U+2234 ISOtech" }, 1670{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" }, 1671{ 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 1672{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 1673{ 8800, "ne", "not equal to, U+2260 ISOtech" }, 1674{ 8801, 
"equiv","identical to, U+2261 ISOtech" }, 1675{ 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 1676{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 1677{ 8834, "sub", "subset of, U+2282 ISOtech" }, 1678{ 8835, "sup", "superset of, U+2283 ISOtech" }, 1679{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 1680{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 1681{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 1682{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 1683{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 1684{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 1685{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 1686{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 1687{ 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 1688{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 1689{ 8971, "rfloor","right floor, U+230B ISOamsc" }, 1690{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 1691{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 1692{ 9674, "loz", "lozenge, U+25CA ISOpub" }, 1693 1694{ 9824, "spades","black spade suit, U+2660 ISOpub" }, 1695{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 1696{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 1697{ 9830, "diams","black diamond suit, U+2666 ISOpub" }, 1698 1699}; 1700 1701/************************************************************************ 1702 * * 1703 * Commodity functions to handle entities * 1704 * * 1705 ************************************************************************/ 1706 1707/* 1708 * Macro used to grow the current buffer. 
1709 */ 1710#define growBuffer(buffer) { \ 1711 xmlChar *tmp; \ 1712 buffer##_size *= 2; \ 1713 tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 1714 if (tmp == NULL) { \ 1715 htmlErrMemory(ctxt, "growing buffer\n"); \ 1716 xmlFree(buffer); \ 1717 return(NULL); \ 1718 } \ 1719 buffer = tmp; \ 1720} 1721 1722/** 1723 * htmlEntityLookup: 1724 * @name: the entity name 1725 * 1726 * Lookup the given entity in EntitiesTable 1727 * 1728 * TODO: the linear scan is really ugly, an hash table is really needed. 1729 * 1730 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 1731 */ 1732const htmlEntityDesc * 1733htmlEntityLookup(const xmlChar *name) { 1734 unsigned int i; 1735 1736 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1737 sizeof(html40EntitiesTable[0]));i++) { 1738 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 1739 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1740 } 1741 } 1742 return(NULL); 1743} 1744 1745/** 1746 * htmlEntityValueLookup: 1747 * @value: the entity's unicode value 1748 * 1749 * Lookup the given entity in EntitiesTable 1750 * 1751 * TODO: the linear scan is really ugly, an hash table is really needed. 1752 * 1753 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
1754 */ 1755const htmlEntityDesc * 1756htmlEntityValueLookup(unsigned int value) { 1757 unsigned int i; 1758 1759 for (i = 0;i < (sizeof(html40EntitiesTable)/ 1760 sizeof(html40EntitiesTable[0]));i++) { 1761 if (html40EntitiesTable[i].value >= value) { 1762 if (html40EntitiesTable[i].value > value) 1763 break; 1764 return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1765 } 1766 } 1767 return(NULL); 1768} 1769 1770/** 1771 * UTF8ToHtml: 1772 * @out: a pointer to an array of bytes to store the result 1773 * @outlen: the length of @out 1774 * @in: a pointer to an array of UTF-8 chars 1775 * @inlen: the length of @in 1776 * 1777 * Take a block of UTF-8 chars in and try to convert it to an ASCII 1778 * plus HTML entities block of chars out. 1779 * 1780 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1781 * The value of @inlen after return is the number of octets consumed 1782 * as the return value is positive, else unpredictable. 1783 * The value of @outlen after return is the number of octets consumed. 
1784 */ 1785int 1786UTF8ToHtml(unsigned char* out, int *outlen, 1787 const unsigned char* in, int *inlen) { 1788 const unsigned char* processed = in; 1789 const unsigned char* outend; 1790 const unsigned char* outstart = out; 1791 const unsigned char* instart = in; 1792 const unsigned char* inend; 1793 unsigned int c, d; 1794 int trailing; 1795 1796 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 1797 if (in == NULL) { 1798 /* 1799 * initialization nothing to do 1800 */ 1801 *outlen = 0; 1802 *inlen = 0; 1803 return(0); 1804 } 1805 inend = in + (*inlen); 1806 outend = out + (*outlen); 1807 while (in < inend) { 1808 d = *in++; 1809 if (d < 0x80) { c= d; trailing= 0; } 1810 else if (d < 0xC0) { 1811 /* trailing byte in leading position */ 1812 *outlen = out - outstart; 1813 *inlen = processed - instart; 1814 return(-2); 1815 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 1816 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 1817 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1818 else { 1819 /* no chance for this in Ascii */ 1820 *outlen = out - outstart; 1821 *inlen = processed - instart; 1822 return(-2); 1823 } 1824 1825 if (inend - in < trailing) { 1826 break; 1827 } 1828 1829 for ( ; trailing; trailing--) { 1830 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 1831 break; 1832 c <<= 6; 1833 c |= d & 0x3F; 1834 } 1835 1836 /* assertion: c is a single UTF-4 value */ 1837 if (c < 0x80) { 1838 if (out + 1 >= outend) 1839 break; 1840 *out++ = c; 1841 } else { 1842 int len; 1843 const htmlEntityDesc * ent; 1844 1845 /* 1846 * Try to lookup a predefined HTML entity for it 1847 */ 1848 1849 ent = htmlEntityValueLookup(c); 1850 if (ent == NULL) { 1851 /* no chance for this in Ascii */ 1852 *outlen = out - outstart; 1853 *inlen = processed - instart; 1854 return(-2); 1855 } 1856 len = strlen(ent->name); 1857 if (out + 2 + len >= outend) 1858 break; 1859 *out++ = '&'; 1860 memcpy(out, ent->name, len); 1861 out += len; 1862 *out++ = ';'; 1863 } 
1864 processed = in; 1865 } 1866 *outlen = out - outstart; 1867 *inlen = processed - instart; 1868 return(0); 1869} 1870 1871/** 1872 * htmlEncodeEntities: 1873 * @out: a pointer to an array of bytes to store the result 1874 * @outlen: the length of @out 1875 * @in: a pointer to an array of UTF-8 chars 1876 * @inlen: the length of @in 1877 * @quoteChar: the quote character to escape (' or ") or zero. 1878 * 1879 * Take a block of UTF-8 chars in and try to convert it to an ASCII 1880 * plus HTML entities block of chars out. 1881 * 1882 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1883 * The value of @inlen after return is the number of octets consumed 1884 * as the return value is positive, else unpredictable. 1885 * The value of @outlen after return is the number of octets consumed. 1886 */ 1887int 1888htmlEncodeEntities(unsigned char* out, int *outlen, 1889 const unsigned char* in, int *inlen, int quoteChar) { 1890 const unsigned char* processed = in; 1891 const unsigned char* outend; 1892 const unsigned char* outstart = out; 1893 const unsigned char* instart = in; 1894 const unsigned char* inend; 1895 unsigned int c, d; 1896 int trailing; 1897 1898 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 1899 return(-1); 1900 outend = out + (*outlen); 1901 inend = in + (*inlen); 1902 while (in < inend) { 1903 d = *in++; 1904 if (d < 0x80) { c= d; trailing= 0; } 1905 else if (d < 0xC0) { 1906 /* trailing byte in leading position */ 1907 *outlen = out - outstart; 1908 *inlen = processed - instart; 1909 return(-2); 1910 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 1911 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 1912 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1913 else { 1914 /* no chance for this in Ascii */ 1915 *outlen = out - outstart; 1916 *inlen = processed - instart; 1917 return(-2); 1918 } 1919 1920 if (inend - in < trailing) 1921 break; 1922 1923 while (trailing--) { 1924 if (((d= *in++) & 0xC0) != 
0x80) { 1925 *outlen = out - outstart; 1926 *inlen = processed - instart; 1927 return(-2); 1928 } 1929 c <<= 6; 1930 c |= d & 0x3F; 1931 } 1932 1933 /* assertion: c is a single UTF-4 value */ 1934 if ((c < 0x80) && (c != (unsigned int) quoteChar) && 1935 (c != '&') && (c != '<') && (c != '>')) { 1936 if (out >= outend) 1937 break; 1938 *out++ = c; 1939 } else { 1940 const htmlEntityDesc * ent; 1941 const char *cp; 1942 char nbuf[16]; 1943 int len; 1944 1945 /* 1946 * Try to lookup a predefined HTML entity for it 1947 */ 1948 ent = htmlEntityValueLookup(c); 1949 if (ent == NULL) { 1950 snprintf(nbuf, sizeof(nbuf), "#%u", c); 1951 cp = nbuf; 1952 } 1953 else 1954 cp = ent->name; 1955 len = strlen(cp); 1956 if (out + 2 + len > outend) 1957 break; 1958 *out++ = '&'; 1959 memcpy(out, cp, len); 1960 out += len; 1961 *out++ = ';'; 1962 } 1963 processed = in; 1964 } 1965 *outlen = out - outstart; 1966 *inlen = processed - instart; 1967 return(0); 1968} 1969 1970/************************************************************************ 1971 * * 1972 * Commodity functions to handle streams * 1973 * * 1974 ************************************************************************/ 1975 1976/** 1977 * htmlNewInputStream: 1978 * @ctxt: an HTML parser context 1979 * 1980 * Create a new input stream structure 1981 * Returns the new input stream or NULL 1982 */ 1983static htmlParserInputPtr 1984htmlNewInputStream(htmlParserCtxtPtr ctxt) { 1985 htmlParserInputPtr input; 1986 1987 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 1988 if (input == NULL) { 1989 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 1990 return(NULL); 1991 } 1992 memset(input, 0, sizeof(htmlParserInput)); 1993 input->filename = NULL; 1994 input->directory = NULL; 1995 input->base = NULL; 1996 input->cur = NULL; 1997 input->buf = NULL; 1998 input->line = 1; 1999 input->col = 1; 2000 input->buf = NULL; 2001 input->free = NULL; 2002 input->version = NULL; 2003 input->consumed = 0; 2004 
input->length = 0; 2005 return(input); 2006} 2007 2008 2009/************************************************************************ 2010 * * 2011 * Commodity functions, cleanup needed ? * 2012 * * 2013 ************************************************************************/ 2014/* 2015 * all tags allowing pc data from the html 4.01 loose dtd 2016 * NOTE: it might be more apropriate to integrate this information 2017 * into the html40ElementTable array but I don't want to risk any 2018 * binary incomptibility 2019 */ 2020static const char *allowPCData[] = { 2021 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2022 "blockquote", "body", "button", "caption", "center", "cite", "code", 2023 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2024 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2025 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2026 "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2027}; 2028 2029/** 2030 * areBlanks: 2031 * @ctxt: an HTML parser context 2032 * @str: a xmlChar * 2033 * @len: the size of @str 2034 * 2035 * Is this a sequence of blank chars that one can ignore ? 2036 * 2037 * Returns 1 if ignorable 0 otherwise. 
2038 */ 2039 2040static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2041 unsigned int i; 2042 int j; 2043 xmlNodePtr lastChild; 2044 xmlDtdPtr dtd; 2045 2046 for (j = 0;j < len;j++) 2047 if (!(IS_BLANK_CH(str[j]))) return(0); 2048 2049 if (CUR == 0) return(1); 2050 if (CUR != '<') return(0); 2051 if (ctxt->name == NULL) 2052 return(1); 2053 if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2054 return(1); 2055 if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2056 return(1); 2057 2058 /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2059 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2060 dtd = xmlGetIntSubset(ctxt->myDoc); 2061 if (dtd != NULL && dtd->ExternalID != NULL) { 2062 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2063 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2064 return(1); 2065 } 2066 } 2067 2068 if (ctxt->node == NULL) return(0); 2069 lastChild = xmlGetLastChild(ctxt->node); 2070 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2071 lastChild = lastChild->prev; 2072 if (lastChild == NULL) { 2073 if ((ctxt->node->type != XML_ELEMENT_NODE) && 2074 (ctxt->node->content != NULL)) return(0); 2075 /* keep ws in constructs like ...<b> </b>... 
2076 for all tags "b" allowing PCDATA */ 2077 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2078 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2079 return(0); 2080 } 2081 } 2082 } else if (xmlNodeIsText(lastChild)) { 2083 return(0); 2084 } else { 2085 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 2086 for all tags "p" allowing PCDATA */ 2087 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2088 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2089 return(0); 2090 } 2091 } 2092 } 2093 return(1); 2094} 2095 2096/** 2097 * htmlNewDocNoDtD: 2098 * @URI: URI for the dtd, or NULL 2099 * @ExternalID: the external ID of the DTD, or NULL 2100 * 2101 * Creates a new HTML document without a DTD node if @URI and @ExternalID 2102 * are NULL 2103 * 2104 * Returns a new document, do not initialize the DTD if not provided 2105 */ 2106htmlDocPtr 2107htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2108 xmlDocPtr cur; 2109 2110 /* 2111 * Allocate a new document and fill the fields. 
2112 */ 2113 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2114 if (cur == NULL) { 2115 htmlErrMemory(NULL, "HTML document creation failed\n"); 2116 return(NULL); 2117 } 2118 memset(cur, 0, sizeof(xmlDoc)); 2119 2120 cur->type = XML_HTML_DOCUMENT_NODE; 2121 cur->version = NULL; 2122 cur->intSubset = NULL; 2123 cur->doc = cur; 2124 cur->name = NULL; 2125 cur->children = NULL; 2126 cur->extSubset = NULL; 2127 cur->oldNs = NULL; 2128 cur->encoding = NULL; 2129 cur->standalone = 1; 2130 cur->compression = 0; 2131 cur->ids = NULL; 2132 cur->refs = NULL; 2133 cur->_private = NULL; 2134 cur->charset = XML_CHAR_ENCODING_UTF8; 2135 if ((ExternalID != NULL) || 2136 (URI != NULL)) 2137 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2138 return(cur); 2139} 2140 2141/** 2142 * htmlNewDoc: 2143 * @URI: URI for the dtd, or NULL 2144 * @ExternalID: the external ID of the DTD, or NULL 2145 * 2146 * Creates a new HTML document 2147 * 2148 * Returns a new document 2149 */ 2150htmlDocPtr 2151htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2152 if ((URI == NULL) && (ExternalID == NULL)) 2153 return(htmlNewDocNoDtD( 2154 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2155 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2156 2157 return(htmlNewDocNoDtD(URI, ExternalID)); 2158} 2159 2160 2161/************************************************************************ 2162 * * 2163 * The parser itself * 2164 * Relates to http://www.w3.org/TR/html40 * 2165 * * 2166 ************************************************************************/ 2167 2168/************************************************************************ 2169 * * 2170 * The parser itself * 2171 * * 2172 ************************************************************************/ 2173 2174static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2175 2176/** 2177 * htmlParseHTMLName: 2178 * @ctxt: an HTML parser context 2179 * 2180 * parse an HTML tag or attribute name, note that we 
convert it to lowercase 2181 * since HTML names are not case-sensitive. 2182 * 2183 * Returns the Tag Name parsed or NULL 2184 */ 2185 2186static const xmlChar * 2187htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2188 int i = 0; 2189 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2190 2191 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2192 (CUR != ':')) return(NULL); 2193 2194 while ((i < HTML_PARSER_BUFFER_SIZE) && 2195 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2196 (CUR == ':') || (CUR == '-') || (CUR == '_'))) { 2197 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2198 else loc[i] = CUR; 2199 i++; 2200 2201 NEXT; 2202 } 2203 2204 return(xmlDictLookup(ctxt->dict, loc, i)); 2205} 2206 2207/** 2208 * htmlParseName: 2209 * @ctxt: an HTML parser context 2210 * 2211 * parse an HTML name, this routine is case sensitive. 2212 * 2213 * Returns the Name parsed or NULL 2214 */ 2215 2216static const xmlChar * 2217htmlParseName(htmlParserCtxtPtr ctxt) { 2218 const xmlChar *in; 2219 const xmlChar *ret; 2220 int count = 0; 2221 2222 GROW; 2223 2224 /* 2225 * Accelerator for simple ASCII names 2226 */ 2227 in = ctxt->input->cur; 2228 if (((*in >= 0x61) && (*in <= 0x7A)) || 2229 ((*in >= 0x41) && (*in <= 0x5A)) || 2230 (*in == '_') || (*in == ':')) { 2231 in++; 2232 while (((*in >= 0x61) && (*in <= 0x7A)) || 2233 ((*in >= 0x41) && (*in <= 0x5A)) || 2234 ((*in >= 0x30) && (*in <= 0x39)) || 2235 (*in == '_') || (*in == '-') || 2236 (*in == ':') || (*in == '.')) 2237 in++; 2238 if ((*in > 0) && (*in < 0x80)) { 2239 count = in - ctxt->input->cur; 2240 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 2241 ctxt->input->cur = in; 2242 ctxt->nbChars += count; 2243 ctxt->input->col += count; 2244 return(ret); 2245 } 2246 } 2247 return(htmlParseNameComplex(ctxt)); 2248} 2249 2250static const xmlChar * 2251htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 2252 int len = 0, l; 2253 int c; 2254 int count = 0; 2255 2256 /* 2257 * Handler for more complex cases 2258 */ 2259 GROW; 2260 
c = CUR_CHAR(l); 2261 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2262 (!IS_LETTER(c) && (c != '_') && 2263 (c != ':'))) { 2264 return(NULL); 2265 } 2266 2267 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2268 ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2269 (c == '.') || (c == '-') || 2270 (c == '_') || (c == ':') || 2271 (IS_COMBINING(c)) || 2272 (IS_EXTENDER(c)))) { 2273 if (count++ > 100) { 2274 count = 0; 2275 GROW; 2276 } 2277 len += l; 2278 NEXTL(l); 2279 c = CUR_CHAR(l); 2280 } 2281 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2282} 2283 2284 2285/** 2286 * htmlParseHTMLAttribute: 2287 * @ctxt: an HTML parser context 2288 * @stop: a char stop value 2289 * 2290 * parse an HTML attribute value till the stop (quote), if 2291 * stop is 0 then it stops at the first space 2292 * 2293 * Returns the attribute parsed or NULL 2294 */ 2295 2296static xmlChar * 2297htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2298 xmlChar *buffer = NULL; 2299 int buffer_size = 0; 2300 xmlChar *out = NULL; 2301 const xmlChar *name = NULL; 2302 const xmlChar *cur = NULL; 2303 const htmlEntityDesc * ent; 2304 2305 /* 2306 * allocate a translation buffer. 
2307 */ 2308 buffer_size = HTML_PARSER_BUFFER_SIZE; 2309 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 2310 if (buffer == NULL) { 2311 htmlErrMemory(ctxt, "buffer allocation failed\n"); 2312 return(NULL); 2313 } 2314 out = buffer; 2315 2316 /* 2317 * Ok loop until we reach one of the ending chars 2318 */ 2319 while ((CUR != 0) && (CUR != stop)) { 2320 if ((stop == 0) && (CUR == '>')) break; 2321 if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 2322 if (CUR == '&') { 2323 if (NXT(1) == '#') { 2324 unsigned int c; 2325 int bits; 2326 2327 c = htmlParseCharRef(ctxt); 2328 if (c < 0x80) 2329 { *out++ = c; bits= -6; } 2330 else if (c < 0x800) 2331 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2332 else if (c < 0x10000) 2333 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2334 else 2335 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2336 2337 for ( ; bits >= 0; bits-= 6) { 2338 *out++ = ((c >> bits) & 0x3F) | 0x80; 2339 } 2340 2341 if (out - buffer > buffer_size - 100) { 2342 int indx = out - buffer; 2343 2344 growBuffer(buffer); 2345 out = &buffer[indx]; 2346 } 2347 } else { 2348 ent = htmlParseEntityRef(ctxt, &name); 2349 if (name == NULL) { 2350 *out++ = '&'; 2351 if (out - buffer > buffer_size - 100) { 2352 int indx = out - buffer; 2353 2354 growBuffer(buffer); 2355 out = &buffer[indx]; 2356 } 2357 } else if (ent == NULL) { 2358 *out++ = '&'; 2359 cur = name; 2360 while (*cur != 0) { 2361 if (out - buffer > buffer_size - 100) { 2362 int indx = out - buffer; 2363 2364 growBuffer(buffer); 2365 out = &buffer[indx]; 2366 } 2367 *out++ = *cur++; 2368 } 2369 } else { 2370 unsigned int c; 2371 int bits; 2372 2373 if (out - buffer > buffer_size - 100) { 2374 int indx = out - buffer; 2375 2376 growBuffer(buffer); 2377 out = &buffer[indx]; 2378 } 2379 c = (xmlChar)ent->value; 2380 if (c < 0x80) 2381 { *out++ = c; bits= -6; } 2382 else if (c < 0x800) 2383 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2384 else if (c < 0x10000) 2385 { *out++ =((c >> 12) & 
0x0F) | 0xE0; bits= 6; } 2386 else 2387 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2388 2389 for ( ; bits >= 0; bits-= 6) { 2390 *out++ = ((c >> bits) & 0x3F) | 0x80; 2391 } 2392 } 2393 } 2394 } else { 2395 unsigned int c; 2396 int bits, l; 2397 2398 if (out - buffer > buffer_size - 100) { 2399 int indx = out - buffer; 2400 2401 growBuffer(buffer); 2402 out = &buffer[indx]; 2403 } 2404 c = CUR_CHAR(l); 2405 if (c < 0x80) 2406 { *out++ = c; bits= -6; } 2407 else if (c < 0x800) 2408 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2409 else if (c < 0x10000) 2410 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2411 else 2412 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2413 2414 for ( ; bits >= 0; bits-= 6) { 2415 *out++ = ((c >> bits) & 0x3F) | 0x80; 2416 } 2417 NEXT; 2418 } 2419 } 2420 *out++ = 0; 2421 return(buffer); 2422} 2423 2424/** 2425 * htmlParseEntityRef: 2426 * @ctxt: an HTML parser context 2427 * @str: location to store the entity name 2428 * 2429 * parse an HTML ENTITY references 2430 * 2431 * [68] EntityRef ::= '&' Name ';' 2432 * 2433 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, 2434 * if non-NULL *str will have to be freed by the caller. 2435 */ 2436const htmlEntityDesc * 2437htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2438 const xmlChar *name; 2439 const htmlEntityDesc * ent = NULL; 2440 2441 if (str != NULL) *str = NULL; 2442 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2443 2444 if (CUR == '&') { 2445 NEXT; 2446 name = htmlParseName(ctxt); 2447 if (name == NULL) { 2448 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2449 "htmlParseEntityRef: no name\n", NULL, NULL); 2450 } else { 2451 GROW; 2452 if (CUR == ';') { 2453 if (str != NULL) 2454 *str = name; 2455 2456 /* 2457 * Lookup the entity in the table. 2458 */ 2459 ent = htmlEntityLookup(name); 2460 if (ent != NULL) /* OK that's ugly !!! 
*/ 2461 NEXT; 2462 } else { 2463 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2464 "htmlParseEntityRef: expecting ';'\n", 2465 NULL, NULL); 2466 if (str != NULL) 2467 *str = name; 2468 } 2469 } 2470 } 2471 return(ent); 2472} 2473 2474/** 2475 * htmlParseAttValue: 2476 * @ctxt: an HTML parser context 2477 * 2478 * parse a value for an attribute 2479 * Note: the parser won't do substitution of entities here, this 2480 * will be handled later in xmlStringGetNodeList, unless it was 2481 * asked for ctxt->replaceEntities != 0 2482 * 2483 * Returns the AttValue parsed or NULL. 2484 */ 2485 2486static xmlChar * 2487htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2488 xmlChar *ret = NULL; 2489 2490 if (CUR == '"') { 2491 NEXT; 2492 ret = htmlParseHTMLAttribute(ctxt, '"'); 2493 if (CUR != '"') { 2494 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2495 "AttValue: \" expected\n", NULL, NULL); 2496 } else 2497 NEXT; 2498 } else if (CUR == '\'') { 2499 NEXT; 2500 ret = htmlParseHTMLAttribute(ctxt, '\''); 2501 if (CUR != '\'') { 2502 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2503 "AttValue: ' expected\n", NULL, NULL); 2504 } else 2505 NEXT; 2506 } else { 2507 /* 2508 * That's an HTMLism, the attribute value may not be quoted 2509 */ 2510 ret = htmlParseHTMLAttribute(ctxt, 0); 2511 if (ret == NULL) { 2512 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2513 "AttValue: no value found\n", NULL, NULL); 2514 } 2515 } 2516 return(ret); 2517} 2518 2519/** 2520 * htmlParseSystemLiteral: 2521 * @ctxt: an HTML parser context 2522 * 2523 * parse an HTML Literal 2524 * 2525 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2526 * 2527 * Returns the SystemLiteral parsed or NULL 2528 */ 2529 2530static xmlChar * 2531htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2532 const xmlChar *q; 2533 xmlChar *ret = NULL; 2534 2535 if (CUR == '"') { 2536 NEXT; 2537 q = CUR_PTR; 2538 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) 2539 NEXT; 2540 if (!IS_CHAR_CH(CUR)) { 2541 
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2542 "Unfinished SystemLiteral\n", NULL, NULL); 2543 } else { 2544 ret = xmlStrndup(q, CUR_PTR - q); 2545 NEXT; 2546 } 2547 } else if (CUR == '\'') { 2548 NEXT; 2549 q = CUR_PTR; 2550 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) 2551 NEXT; 2552 if (!IS_CHAR_CH(CUR)) { 2553 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2554 "Unfinished SystemLiteral\n", NULL, NULL); 2555 } else { 2556 ret = xmlStrndup(q, CUR_PTR - q); 2557 NEXT; 2558 } 2559 } else { 2560 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2561 " or ' expected\n", NULL, NULL); 2562 } 2563 2564 return(ret); 2565} 2566 2567/** 2568 * htmlParsePubidLiteral: 2569 * @ctxt: an HTML parser context 2570 * 2571 * parse an HTML public literal 2572 * 2573 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2574 * 2575 * Returns the PubidLiteral parsed or NULL. 2576 */ 2577 2578static xmlChar * 2579htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 2580 const xmlChar *q; 2581 xmlChar *ret = NULL; 2582 /* 2583 * Name ::= (Letter | '_') (NameChar)* 2584 */ 2585 if (CUR == '"') { 2586 NEXT; 2587 q = CUR_PTR; 2588 while (IS_PUBIDCHAR_CH(CUR)) NEXT; 2589 if (CUR != '"') { 2590 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2591 "Unfinished PubidLiteral\n", NULL, NULL); 2592 } else { 2593 ret = xmlStrndup(q, CUR_PTR - q); 2594 NEXT; 2595 } 2596 } else if (CUR == '\'') { 2597 NEXT; 2598 q = CUR_PTR; 2599 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) 2600 NEXT; 2601 if (CUR != '\'') { 2602 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2603 "Unfinished PubidLiteral\n", NULL, NULL); 2604 } else { 2605 ret = xmlStrndup(q, CUR_PTR - q); 2606 NEXT; 2607 } 2608 } else { 2609 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2610 "PubidLiteral \" or ' expected\n", NULL, NULL); 2611 } 2612 2613 return(ret); 2614} 2615 2616/** 2617 * htmlParseScript: 2618 * @ctxt: an HTML parser context 2619 * 2620 * parse the content of an HTML SCRIPT or STYLE element 2621 * 
http://www.w3.org/TR/html4/sgml/dtd.html#Script 2622 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2623 * http://www.w3.org/TR/html4/types.html#type-script 2624 * http://www.w3.org/TR/html4/types.html#h-6.15 2625 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 2626 * 2627 * Script data ( %Script; in the DTD) can be the content of the SCRIPT 2628 * element and the value of intrinsic event attributes. User agents must 2629 * not evaluate script data as HTML markup but instead must pass it on as 2630 * data to a script engine. 2631 * NOTES: 2632 * - The content is passed like CDATA 2633 * - the attributes for style and scripting "onXXX" are also described 2634 * as CDATA but SGML allows entities references in attributes so their 2635 * processing is identical as other attributes 2636 */ 2637static void 2638htmlParseScript(htmlParserCtxtPtr ctxt) { 2639 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2640 int nbchar = 0; 2641 int cur,l; 2642 2643 SHRINK; 2644 cur = CUR_CHAR(l); 2645 while (IS_CHAR_CH(cur)) { 2646 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') && 2647 (NXT(3) == '-')) { 2648 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2649 if (ctxt->sax->cdataBlock!= NULL) { 2650 /* 2651 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2652 */ 2653 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2654 } else if (ctxt->sax->characters != NULL) { 2655 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2656 } 2657 } 2658 nbchar = 0; 2659 htmlParseComment(ctxt); 2660 cur = CUR_CHAR(l); 2661 continue; 2662 } else if ((cur == '<') && (NXT(1) == '/')) { 2663 /* 2664 * One should break here, the specification is clear: 2665 * Authors should therefore escape "</" within the content. 2666 * Escape mechanisms are specific to each scripting or 2667 * style sheet language. 
2668 * 2669 * In recovery mode, only break if end tag match the 2670 * current tag, effectively ignoring all tags inside the 2671 * script/style block and treating the entire block as 2672 * CDATA. 2673 */ 2674 if (ctxt->recovery) { 2675 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2676 xmlStrlen(ctxt->name)) == 0) 2677 { 2678 break; /* while */ 2679 } else { 2680 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2681 "Element %s embeds close tag\n", 2682 ctxt->name, NULL); 2683 } 2684 } else { 2685 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2686 ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2687 { 2688 break; /* while */ 2689 } 2690 } 2691 } 2692 COPY_BUF(l,buf,nbchar,cur); 2693 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2694 if (ctxt->sax->cdataBlock!= NULL) { 2695 /* 2696 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2697 */ 2698 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2699 } else if (ctxt->sax->characters != NULL) { 2700 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2701 } 2702 nbchar = 0; 2703 } 2704 GROW; 2705 NEXTL(l); 2706 cur = CUR_CHAR(l); 2707 } 2708 2709 if (!(IS_CHAR_CH(cur))) { 2710 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2711 "Invalid char in CDATA 0x%X\n", cur); 2712 NEXT; 2713 } 2714 2715 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2716 if (ctxt->sax->cdataBlock!= NULL) { 2717 /* 2718 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2719 */ 2720 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2721 } else if (ctxt->sax->characters != NULL) { 2722 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2723 } 2724 } 2725} 2726 2727 2728/** 2729 * htmlParseCharData: 2730 * @ctxt: an HTML parser context 2731 * 2732 * parse a CharData section. 2733 * if we are within a CDATA section ']]>' marks an end of section. 
2734 * 2735 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 2736 */ 2737 2738static void 2739htmlParseCharData(htmlParserCtxtPtr ctxt) { 2740 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2741 int nbchar = 0; 2742 int cur, l; 2743 2744 SHRINK; 2745 cur = CUR_CHAR(l); 2746 while (((cur != '<') || (ctxt->token == '<')) && 2747 ((cur != '&') || (ctxt->token == '&')) && 2748 (IS_CHAR(cur))) { 2749 COPY_BUF(l,buf,nbchar,cur); 2750 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2751 /* 2752 * Ok the segment is to be consumed as chars. 2753 */ 2754 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2755 if (areBlanks(ctxt, buf, nbchar)) { 2756 if (ctxt->sax->ignorableWhitespace != NULL) 2757 ctxt->sax->ignorableWhitespace(ctxt->userData, 2758 buf, nbchar); 2759 } else { 2760 htmlCheckParagraph(ctxt); 2761 if (ctxt->sax->characters != NULL) 2762 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2763 } 2764 } 2765 nbchar = 0; 2766 } 2767 NEXTL(l); 2768 cur = CUR_CHAR(l); 2769 if (cur == 0) { 2770 SHRINK; 2771 GROW; 2772 cur = CUR_CHAR(l); 2773 } 2774 } 2775 if (nbchar != 0) { 2776 buf[nbchar] = 0; 2777 2778 /* 2779 * Ok the segment is to be consumed as chars. 
2780 */ 2781 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2782 if (areBlanks(ctxt, buf, nbchar)) { 2783 if (ctxt->sax->ignorableWhitespace != NULL) 2784 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); 2785 } else { 2786 htmlCheckParagraph(ctxt); 2787 if (ctxt->sax->characters != NULL) 2788 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2789 } 2790 } 2791 } else { 2792 /* 2793 * Loop detection 2794 */ 2795 if (cur == 0) 2796 ctxt->instate = XML_PARSER_EOF; 2797 } 2798} 2799 2800/** 2801 * htmlParseExternalID: 2802 * @ctxt: an HTML parser context 2803 * @publicID: a xmlChar** receiving PubidLiteral 2804 * 2805 * Parse an External ID or a Public ID 2806 * 2807 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 2808 * | 'PUBLIC' S PubidLiteral S SystemLiteral 2809 * 2810 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 2811 * 2812 * Returns the function returns SystemLiteral and in the second 2813 * case publicID receives PubidLiteral, is strict is off 2814 * it is possible to return NULL and have publicID set. 
2815 */ 2816 2817static xmlChar * 2818htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 2819 xmlChar *URI = NULL; 2820 2821 if ((UPPER == 'S') && (UPP(1) == 'Y') && 2822 (UPP(2) == 'S') && (UPP(3) == 'T') && 2823 (UPP(4) == 'E') && (UPP(5) == 'M')) { 2824 SKIP(6); 2825 if (!IS_BLANK_CH(CUR)) { 2826 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 2827 "Space required after 'SYSTEM'\n", NULL, NULL); 2828 } 2829 SKIP_BLANKS; 2830 URI = htmlParseSystemLiteral(ctxt); 2831 if (URI == NULL) { 2832 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 2833 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 2834 } 2835 } else if ((UPPER == 'P') && (UPP(1) == 'U') && 2836 (UPP(2) == 'B') && (UPP(3) == 'L') && 2837 (UPP(4) == 'I') && (UPP(5) == 'C')) { 2838 SKIP(6); 2839 if (!IS_BLANK_CH(CUR)) { 2840 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 2841 "Space required after 'PUBLIC'\n", NULL, NULL); 2842 } 2843 SKIP_BLANKS; 2844 *publicID = htmlParsePubidLiteral(ctxt); 2845 if (*publicID == NULL) { 2846 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 2847 "htmlParseExternalID: PUBLIC, no Public Identifier\n", 2848 NULL, NULL); 2849 } 2850 SKIP_BLANKS; 2851 if ((CUR == '"') || (CUR == '\'')) { 2852 URI = htmlParseSystemLiteral(ctxt); 2853 } 2854 } 2855 return(URI); 2856} 2857 2858/** 2859 * xmlParsePI: 2860 * @ctxt: an XML parser context 2861 * 2862 * parse an XML Processing Instruction. 2863 * 2864 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 2865 */ 2866static void 2867htmlParsePI(htmlParserCtxtPtr ctxt) { 2868 xmlChar *buf = NULL; 2869 int len = 0; 2870 int size = HTML_PARSER_BUFFER_SIZE; 2871 int cur, l; 2872 const xmlChar *target; 2873 xmlParserInputState state; 2874 int count = 0; 2875 2876 if ((RAW == '<') && (NXT(1) == '?')) { 2877 state = ctxt->instate; 2878 ctxt->instate = XML_PARSER_PI; 2879 /* 2880 * this is a Processing Instruction. 
2881 */ 2882 SKIP(2); 2883 SHRINK; 2884 2885 /* 2886 * Parse the target name and check for special support like 2887 * namespace. 2888 */ 2889 target = htmlParseName(ctxt); 2890 if (target != NULL) { 2891 if (RAW == '>') { 2892 SKIP(1); 2893 2894 /* 2895 * SAX: PI detected. 2896 */ 2897 if ((ctxt->sax) && (!ctxt->disableSAX) && 2898 (ctxt->sax->processingInstruction != NULL)) 2899 ctxt->sax->processingInstruction(ctxt->userData, 2900 target, NULL); 2901 ctxt->instate = state; 2902 return; 2903 } 2904 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 2905 if (buf == NULL) { 2906 htmlErrMemory(ctxt, NULL); 2907 ctxt->instate = state; 2908 return; 2909 } 2910 cur = CUR; 2911 if (!IS_BLANK(cur)) { 2912 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 2913 "ParsePI: PI %s space expected\n", target, NULL); 2914 } 2915 SKIP_BLANKS; 2916 cur = CUR_CHAR(l); 2917 while (IS_CHAR(cur) && (cur != '>')) { 2918 if (len + 5 >= size) { 2919 xmlChar *tmp; 2920 2921 size *= 2; 2922 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 2923 if (tmp == NULL) { 2924 htmlErrMemory(ctxt, NULL); 2925 xmlFree(buf); 2926 ctxt->instate = state; 2927 return; 2928 } 2929 buf = tmp; 2930 } 2931 count++; 2932 if (count > 50) { 2933 GROW; 2934 count = 0; 2935 } 2936 COPY_BUF(l,buf,len,cur); 2937 NEXTL(l); 2938 cur = CUR_CHAR(l); 2939 if (cur == 0) { 2940 SHRINK; 2941 GROW; 2942 cur = CUR_CHAR(l); 2943 } 2944 } 2945 buf[len] = 0; 2946 if (cur != '>') { 2947 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 2948 "ParsePI: PI %s never end ...\n", target, NULL); 2949 } else { 2950 SKIP(1); 2951 2952 /* 2953 * SAX: PI detected. 
2954 */ 2955 if ((ctxt->sax) && (!ctxt->disableSAX) && 2956 (ctxt->sax->processingInstruction != NULL)) 2957 ctxt->sax->processingInstruction(ctxt->userData, 2958 target, buf); 2959 } 2960 xmlFree(buf); 2961 } else { 2962 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 2963 "PI is not started correctly", NULL, NULL); 2964 } 2965 ctxt->instate = state; 2966 } 2967} 2968 2969/** 2970 * htmlParseComment: 2971 * @ctxt: an HTML parser context 2972 * 2973 * Parse an XML (SGML) comment <!-- .... --> 2974 * 2975 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' 2976 */ 2977static void 2978htmlParseComment(htmlParserCtxtPtr ctxt) { 2979 xmlChar *buf = NULL; 2980 int len; 2981 int size = HTML_PARSER_BUFFER_SIZE; 2982 int q, ql; 2983 int r, rl; 2984 int cur, l; 2985 xmlParserInputState state; 2986 2987 /* 2988 * Check that there is a comment right here. 2989 */ 2990 if ((RAW != '<') || (NXT(1) != '!') || 2991 (NXT(2) != '-') || (NXT(3) != '-')) return; 2992 2993 state = ctxt->instate; 2994 ctxt->instate = XML_PARSER_COMMENT; 2995 SHRINK; 2996 SKIP(4); 2997 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 2998 if (buf == NULL) { 2999 htmlErrMemory(ctxt, "buffer allocation failed\n"); 3000 ctxt->instate = state; 3001 return; 3002 } 3003 q = CUR_CHAR(ql); 3004 NEXTL(ql); 3005 r = CUR_CHAR(rl); 3006 NEXTL(rl); 3007 cur = CUR_CHAR(l); 3008 len = 0; 3009 while (IS_CHAR(cur) && 3010 ((cur != '>') || 3011 (r != '-') || (q != '-'))) { 3012 if (len + 5 >= size) { 3013 xmlChar *tmp; 3014 3015 size *= 2; 3016 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3017 if (tmp == NULL) { 3018 xmlFree(buf); 3019 htmlErrMemory(ctxt, "growing buffer failed\n"); 3020 ctxt->instate = state; 3021 return; 3022 } 3023 buf = tmp; 3024 } 3025 COPY_BUF(ql,buf,len,q); 3026 q = r; 3027 ql = rl; 3028 r = cur; 3029 rl = l; 3030 NEXTL(l); 3031 cur = CUR_CHAR(l); 3032 if (cur == 0) { 3033 SHRINK; 3034 GROW; 3035 cur = CUR_CHAR(l); 3036 } 3037 } 3038 buf[len] = 0; 3039 if 
(!IS_CHAR(cur)) { 3040 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3041 "Comment not terminated \n<!--%.50s\n", buf, NULL); 3042 xmlFree(buf); 3043 } else { 3044 NEXT; 3045 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && 3046 (!ctxt->disableSAX)) 3047 ctxt->sax->comment(ctxt->userData, buf); 3048 xmlFree(buf); 3049 } 3050 ctxt->instate = state; 3051} 3052 3053/** 3054 * htmlParseCharRef: 3055 * @ctxt: an HTML parser context 3056 * 3057 * parse Reference declarations 3058 * 3059 * [66] CharRef ::= '&#' [0-9]+ ';' | 3060 * '&#x' [0-9a-fA-F]+ ';' 3061 * 3062 * Returns the value parsed (as an int) 3063 */ 3064int 3065htmlParseCharRef(htmlParserCtxtPtr ctxt) { 3066 int val = 0; 3067 3068 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3069 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3070 "htmlParseCharRef: context error\n", 3071 NULL, NULL); 3072 return(0); 3073 } 3074 if ((CUR == '&') && (NXT(1) == '#') && 3075 ((NXT(2) == 'x') || NXT(2) == 'X')) { 3076 SKIP(3); 3077 while (CUR != ';') { 3078 if ((CUR >= '0') && (CUR <= '9')) 3079 val = val * 16 + (CUR - '0'); 3080 else if ((CUR >= 'a') && (CUR <= 'f')) 3081 val = val * 16 + (CUR - 'a') + 10; 3082 else if ((CUR >= 'A') && (CUR <= 'F')) 3083 val = val * 16 + (CUR - 'A') + 10; 3084 else { 3085 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, 3086 "htmlParseCharRef: invalid hexadecimal value\n", 3087 NULL, NULL); 3088 return(0); 3089 } 3090 NEXT; 3091 } 3092 if (CUR == ';') 3093 NEXT; 3094 } else if ((CUR == '&') && (NXT(1) == '#')) { 3095 SKIP(2); 3096 while (CUR != ';') { 3097 if ((CUR >= '0') && (CUR <= '9')) 3098 val = val * 10 + (CUR - '0'); 3099 else { 3100 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, 3101 "htmlParseCharRef: invalid decimal value\n", 3102 NULL, NULL); 3103 return(0); 3104 } 3105 NEXT; 3106 } 3107 if (CUR == ';') 3108 NEXT; 3109 } else { 3110 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF, 3111 "htmlParseCharRef: invalid value\n", NULL, NULL); 3112 } 3113 /* 3114 * Check the value IS_CHAR 
... 3115 */ 3116 if (IS_CHAR(val)) { 3117 return(val); 3118 } else { 3119 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 3120 "htmlParseCharRef: invalid xmlChar value %d\n", 3121 val); 3122 } 3123 return(0); 3124} 3125 3126 3127/** 3128 * htmlParseDocTypeDecl: 3129 * @ctxt: an HTML parser context 3130 * 3131 * parse a DOCTYPE declaration 3132 * 3133 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 3134 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 3135 */ 3136 3137static void 3138htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { 3139 const xmlChar *name; 3140 xmlChar *ExternalID = NULL; 3141 xmlChar *URI = NULL; 3142 3143 /* 3144 * We know that '<!DOCTYPE' has been detected. 3145 */ 3146 SKIP(9); 3147 3148 SKIP_BLANKS; 3149 3150 /* 3151 * Parse the DOCTYPE name. 3152 */ 3153 name = htmlParseName(ctxt); 3154 if (name == NULL) { 3155 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3156 "htmlParseDocTypeDecl : no DOCTYPE name !\n", 3157 NULL, NULL); 3158 } 3159 /* 3160 * Check that upper(name) == "HTML" !!!!!!!!!!!!! 3161 */ 3162 3163 SKIP_BLANKS; 3164 3165 /* 3166 * Check for SystemID and ExternalID 3167 */ 3168 URI = htmlParseExternalID(ctxt, &ExternalID); 3169 SKIP_BLANKS; 3170 3171 /* 3172 * We should be at the end of the DOCTYPE declaration. 3173 */ 3174 if (CUR != '>') { 3175 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, 3176 "DOCTYPE improperly terminated\n", NULL, NULL); 3177 /* We shouldn't try to resynchronize ... 
*/ 3178 } 3179 NEXT; 3180 3181 /* 3182 * Create or update the document accordingly to the DOCTYPE 3183 */ 3184 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && 3185 (!ctxt->disableSAX)) 3186 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); 3187 3188 /* 3189 * Cleanup, since we don't use all those identifiers 3190 */ 3191 if (URI != NULL) xmlFree(URI); 3192 if (ExternalID != NULL) xmlFree(ExternalID); 3193} 3194 3195/** 3196 * htmlParseAttribute: 3197 * @ctxt: an HTML parser context 3198 * @value: a xmlChar ** used to store the value of the attribute 3199 * 3200 * parse an attribute 3201 * 3202 * [41] Attribute ::= Name Eq AttValue 3203 * 3204 * [25] Eq ::= S? '=' S? 3205 * 3206 * With namespace: 3207 * 3208 * [NS 11] Attribute ::= QName Eq AttValue 3209 * 3210 * Also the case QName == xmlns:??? is handled independently as a namespace 3211 * definition. 3212 * 3213 * Returns the attribute name, and the value in *value. 3214 */ 3215 3216static const xmlChar * 3217htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { 3218 const xmlChar *name; 3219 xmlChar *val = NULL; 3220 3221 *value = NULL; 3222 name = htmlParseHTMLName(ctxt); 3223 if (name == NULL) { 3224 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3225 "error parsing attribute name\n", NULL, NULL); 3226 return(NULL); 3227 } 3228 3229 /* 3230 * read the value 3231 */ 3232 SKIP_BLANKS; 3233 if (CUR == '=') { 3234 NEXT; 3235 SKIP_BLANKS; 3236 val = htmlParseAttValue(ctxt); 3237 /****** 3238 } else { 3239 * TODO : some attribute must have values, some may not 3240 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) 3241 ctxt->sax->warning(ctxt->userData, 3242 "No value for attribute %s\n", name); */ 3243 } 3244 3245 *value = val; 3246 return(name); 3247} 3248 3249/** 3250 * htmlCheckEncoding: 3251 * @ctxt: an HTML parser context 3252 * @attvalue: the attribute value 3253 * 3254 * Checks an http-equiv attribute from a Meta tag to detect 3255 * the encoding 3256 * If a new 
encoding is detected the parser is switched to decode 3257 * it and pass UTF8 3258 */ 3259static void 3260htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { 3261 const xmlChar *encoding; 3262 3263 if ((ctxt == NULL) || (attvalue == NULL)) 3264 return; 3265 3266 /* do not change encoding */ 3267 if (ctxt->input->encoding != NULL) 3268 return; 3269 3270 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset="); 3271 if (encoding != NULL) { 3272 encoding += 8; 3273 } else { 3274 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset ="); 3275 if (encoding != NULL) 3276 encoding += 9; 3277 } 3278 if (encoding != NULL) { 3279 xmlCharEncoding enc; 3280 xmlCharEncodingHandlerPtr handler; 3281 3282 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 3283 3284 if (ctxt->input->encoding != NULL) 3285 xmlFree((xmlChar *) ctxt->input->encoding); 3286 ctxt->input->encoding = xmlStrdup(encoding); 3287 3288 enc = xmlParseCharEncoding((const char *) encoding); 3289 /* 3290 * registered set of known encodings 3291 */ 3292 if (enc != XML_CHAR_ENCODING_ERROR) { 3293 xmlSwitchEncoding(ctxt, enc); 3294 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3295 } else { 3296 /* 3297 * fallback for unknown encodings 3298 */ 3299 handler = xmlFindCharEncodingHandler((const char *) encoding); 3300 if (handler != NULL) { 3301 xmlSwitchToEncoding(ctxt, handler); 3302 ctxt->charset = XML_CHAR_ENCODING_UTF8; 3303 } else { 3304 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; 3305 } 3306 } 3307 3308 if ((ctxt->input->buf != NULL) && 3309 (ctxt->input->buf->encoder != NULL) && 3310 (ctxt->input->buf->raw != NULL) && 3311 (ctxt->input->buf->buffer != NULL)) { 3312 int nbchars; 3313 int processed; 3314 3315 /* 3316 * convert as much as possible to the parser reading buffer. 
3317 */ 3318 processed = ctxt->input->cur - ctxt->input->base; 3319 xmlBufferShrink(ctxt->input->buf->buffer, processed); 3320 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, 3321 ctxt->input->buf->buffer, 3322 ctxt->input->buf->raw); 3323 if (nbchars < 0) { 3324 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3325 "htmlCheckEncoding: encoder error\n", 3326 NULL, NULL); 3327 } 3328 ctxt->input->base = 3329 ctxt->input->cur = ctxt->input->buf->buffer->content; 3330 } 3331 } 3332} 3333 3334/** 3335 * htmlCheckMeta: 3336 * @ctxt: an HTML parser context 3337 * @atts: the attributes values 3338 * 3339 * Checks an attributes from a Meta tag 3340 */ 3341static void 3342htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { 3343 int i; 3344 const xmlChar *att, *value; 3345 int http = 0; 3346 const xmlChar *content = NULL; 3347 3348 if ((ctxt == NULL) || (atts == NULL)) 3349 return; 3350 3351 i = 0; 3352 att = atts[i++]; 3353 while (att != NULL) { 3354 value = atts[i++]; 3355 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) 3356 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 3357 http = 1; 3358 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) 3359 content = value; 3360 att = atts[i++]; 3361 } 3362 if ((http) && (content != NULL)) 3363 htmlCheckEncoding(ctxt, content); 3364 3365} 3366 3367/** 3368 * htmlParseStartTag: 3369 * @ctxt: an HTML parser context 3370 * 3371 * parse a start of tag either for rule element or 3372 * EmptyElement. In both case we don't parse the tag closing chars. 3373 * 3374 * [40] STag ::= '<' Name (S Attribute)* S? '>' 3375 * 3376 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 3377 * 3378 * With namespace: 3379 * 3380 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' 3381 * 3382 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>' 3383 * 3384 * Returns 0 in case of success and -1 in case of error. 
3385 */ 3386 3387static int 3388htmlParseStartTag(htmlParserCtxtPtr ctxt) { 3389 const xmlChar *name; 3390 const xmlChar *attname; 3391 xmlChar *attvalue; 3392 const xmlChar **atts; 3393 int nbatts = 0; 3394 int maxatts; 3395 int meta = 0; 3396 int i; 3397 3398 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3399 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3400 "htmlParseStartTag: context error\n", NULL, NULL); 3401 return -1; 3402 } 3403 if (CUR != '<') return -1; 3404 NEXT; 3405 3406 atts = ctxt->atts; 3407 maxatts = ctxt->maxatts; 3408 3409 GROW; 3410 name = htmlParseHTMLName(ctxt); 3411 if (name == NULL) { 3412 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3413 "htmlParseStartTag: invalid element name\n", 3414 NULL, NULL); 3415 /* Dump the bogus tag like browsers do */ 3416 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 3417 NEXT; 3418 return -1; 3419 } 3420 if (xmlStrEqual(name, BAD_CAST"meta")) 3421 meta = 1; 3422 3423 /* 3424 * Check for auto-closure of HTML elements. 3425 */ 3426 htmlAutoClose(ctxt, name); 3427 3428 /* 3429 * Check for implied HTML elements. 
3430 */ 3431 htmlCheckImplied(ctxt, name); 3432 3433 /* 3434 * Avoid html at any level > 0, head at any level != 1 3435 * or any attempt to recurse body 3436 */ 3437 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { 3438 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3439 "htmlParseStartTag: misplaced <html> tag\n", 3440 name, NULL); 3441 return 0; 3442 } 3443 if ((ctxt->nameNr != 1) && 3444 (xmlStrEqual(name, BAD_CAST"head"))) { 3445 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3446 "htmlParseStartTag: misplaced <head> tag\n", 3447 name, NULL); 3448 return 0; 3449 } 3450 if (xmlStrEqual(name, BAD_CAST"body")) { 3451 int indx; 3452 for (indx = 0;indx < ctxt->nameNr;indx++) { 3453 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { 3454 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3455 "htmlParseStartTag: misplaced <body> tag\n", 3456 name, NULL); 3457 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 3458 NEXT; 3459 return 0; 3460 } 3461 } 3462 } 3463 3464 /* 3465 * Now parse the attributes, it ends up with the ending 3466 * 3467 * (S Attribute)* S? 
3468 */ 3469 SKIP_BLANKS; 3470 while ((IS_CHAR_CH(CUR)) && 3471 (CUR != '>') && 3472 ((CUR != '/') || (NXT(1) != '>'))) { 3473 long cons = ctxt->nbChars; 3474 3475 GROW; 3476 attname = htmlParseAttribute(ctxt, &attvalue); 3477 if (attname != NULL) { 3478 3479 /* 3480 * Well formedness requires at most one declaration of an attribute 3481 */ 3482 for (i = 0; i < nbatts;i += 2) { 3483 if (xmlStrEqual(atts[i], attname)) { 3484 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, 3485 "Attribute %s redefined\n", attname, NULL); 3486 if (attvalue != NULL) 3487 xmlFree(attvalue); 3488 goto failed; 3489 } 3490 } 3491 3492 /* 3493 * Add the pair to atts 3494 */ 3495 if (atts == NULL) { 3496 maxatts = 22; /* allow for 10 attrs by default */ 3497 atts = (const xmlChar **) 3498 xmlMalloc(maxatts * sizeof(xmlChar *)); 3499 if (atts == NULL) { 3500 htmlErrMemory(ctxt, NULL); 3501 if (attvalue != NULL) 3502 xmlFree(attvalue); 3503 goto failed; 3504 } 3505 ctxt->atts = atts; 3506 ctxt->maxatts = maxatts; 3507 } else if (nbatts + 4 > maxatts) { 3508 const xmlChar **n; 3509 3510 maxatts *= 2; 3511 n = (const xmlChar **) xmlRealloc((void *) atts, 3512 maxatts * sizeof(const xmlChar *)); 3513 if (n == NULL) { 3514 htmlErrMemory(ctxt, NULL); 3515 if (attvalue != NULL) 3516 xmlFree(attvalue); 3517 goto failed; 3518 } 3519 atts = n; 3520 ctxt->atts = atts; 3521 ctxt->maxatts = maxatts; 3522 } 3523 atts[nbatts++] = attname; 3524 atts[nbatts++] = attvalue; 3525 atts[nbatts] = NULL; 3526 atts[nbatts + 1] = NULL; 3527 } 3528 else { 3529 if (attvalue != NULL) 3530 xmlFree(attvalue); 3531 /* Dump the bogus attribute string up to the next blank or 3532 * the end of the tag. 
*/ 3533 while ((IS_CHAR_CH(CUR)) && 3534 !(IS_BLANK_CH(CUR)) && (CUR != '>') && 3535 ((CUR != '/') || (NXT(1) != '>'))) 3536 NEXT; 3537 } 3538 3539failed: 3540 SKIP_BLANKS; 3541 if (cons == ctxt->nbChars) { 3542 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3543 "htmlParseStartTag: problem parsing attributes\n", 3544 NULL, NULL); 3545 break; 3546 } 3547 } 3548 3549 /* 3550 * Handle specific association to the META tag 3551 */ 3552 if (meta) 3553 htmlCheckMeta(ctxt, atts); 3554 3555 /* 3556 * SAX: Start of Element ! 3557 */ 3558 htmlnamePush(ctxt, name); 3559 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { 3560 if (nbatts != 0) 3561 ctxt->sax->startElement(ctxt->userData, name, atts); 3562 else 3563 ctxt->sax->startElement(ctxt->userData, name, NULL); 3564 } 3565 3566 if (atts != NULL) { 3567 for (i = 1;i < nbatts;i += 2) { 3568 if (atts[i] != NULL) 3569 xmlFree((xmlChar *) atts[i]); 3570 } 3571 } 3572 3573 return 0; 3574} 3575 3576/** 3577 * htmlParseEndTag: 3578 * @ctxt: an HTML parser context 3579 * 3580 * parse an end of tag 3581 * 3582 * [42] ETag ::= '</' Name S? '>' 3583 * 3584 * With namespace 3585 * 3586 * [NS 9] ETag ::= '</' QName S? '>' 3587 * 3588 * Returns 1 if the current level should be closed. 3589 */ 3590 3591static int 3592htmlParseEndTag(htmlParserCtxtPtr ctxt) 3593{ 3594 const xmlChar *name; 3595 const xmlChar *oldname; 3596 int i, ret; 3597 3598 if ((CUR != '<') || (NXT(1) != '/')) { 3599 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, 3600 "htmlParseEndTag: '</' not found\n", NULL, NULL); 3601 return (0); 3602 } 3603 SKIP(2); 3604 3605 name = htmlParseHTMLName(ctxt); 3606 if (name == NULL) 3607 return (0); 3608 3609 /* 3610 * We should definitely be at the ending "S? '>'" part 3611 */ 3612 SKIP_BLANKS; 3613 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { 3614 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 3615 "End tag : expected '>'\n", NULL, NULL); 3616 if (ctxt->recovery) { 3617 /* 3618 * We're not at the ending > !! 
3619 * Error, unless in recover mode where we search forwards 3620 * until we find a > 3621 */ 3622 while (CUR != '\0' && CUR != '>') NEXT; 3623 NEXT; 3624 } 3625 } else 3626 NEXT; 3627 3628 /* 3629 * If the name read is not one of the element in the parsing stack 3630 * then return, it's just an error. 3631 */ 3632 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 3633 if (xmlStrEqual(name, ctxt->nameTab[i])) 3634 break; 3635 } 3636 if (i < 0) { 3637 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3638 "Unexpected end tag : %s\n", name, NULL); 3639 return (0); 3640 } 3641 3642 3643 /* 3644 * Check for auto-closure of HTML elements. 3645 */ 3646 3647 htmlAutoCloseOnClose(ctxt, name); 3648 3649 /* 3650 * Well formedness constraints, opening and closing must match. 3651 * With the exception that the autoclose may have popped stuff out 3652 * of the stack. 3653 */ 3654 if (!xmlStrEqual(name, ctxt->name)) { 3655 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { 3656 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3657 "Opening and ending tag mismatch: %s and %s\n", 3658 name, ctxt->name); 3659 } 3660 } 3661 3662 /* 3663 * SAX: End of Tag 3664 */ 3665 oldname = ctxt->name; 3666 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { 3667 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 3668 ctxt->sax->endElement(ctxt->userData, name); 3669 htmlnamePop(ctxt); 3670 ret = 1; 3671 } else { 3672 ret = 0; 3673 } 3674 3675 return (ret); 3676} 3677 3678 3679/** 3680 * htmlParseReference: 3681 * @ctxt: an HTML parser context 3682 * 3683 * parse and handle entity references in content, 3684 * this will end-up in a call to character() since this is either a 3685 * CharRef, or a predefined entity. 
3686 */ 3687static void 3688htmlParseReference(htmlParserCtxtPtr ctxt) { 3689 const htmlEntityDesc * ent; 3690 xmlChar out[6]; 3691 const xmlChar *name; 3692 if (CUR != '&') return; 3693 3694 if (NXT(1) == '#') { 3695 unsigned int c; 3696 int bits, i = 0; 3697 3698 c = htmlParseCharRef(ctxt); 3699 if (c == 0) 3700 return; 3701 3702 if (c < 0x80) { out[i++]= c; bits= -6; } 3703 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3704 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3705 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3706 3707 for ( ; bits >= 0; bits-= 6) { 3708 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3709 } 3710 out[i] = 0; 3711 3712 htmlCheckParagraph(ctxt); 3713 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3714 ctxt->sax->characters(ctxt->userData, out, i); 3715 } else { 3716 ent = htmlParseEntityRef(ctxt, &name); 3717 if (name == NULL) { 3718 htmlCheckParagraph(ctxt); 3719 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3720 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 3721 return; 3722 } 3723 if ((ent == NULL) || !(ent->value > 0)) { 3724 htmlCheckParagraph(ctxt); 3725 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { 3726 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); 3727 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); 3728 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */ 3729 } 3730 } else { 3731 unsigned int c; 3732 int bits, i = 0; 3733 3734 c = ent->value; 3735 if (c < 0x80) 3736 { out[i++]= c; bits= -6; } 3737 else if (c < 0x800) 3738 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3739 else if (c < 0x10000) 3740 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3741 else 3742 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3743 3744 for ( ; bits >= 0; bits-= 6) { 3745 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3746 } 3747 out[i] = 0; 3748 3749 htmlCheckParagraph(ctxt); 3750 if ((ctxt->sax != NULL) && 
(ctxt->sax->characters != NULL)) 3751 ctxt->sax->characters(ctxt->userData, out, i); 3752 } 3753 } 3754} 3755 3756/** 3757 * htmlParseContent: 3758 * @ctxt: an HTML parser context 3759 * 3760 * Parse a content: comment, sub-element, reference or text. 3761 */ 3762 3763static void 3764htmlParseContent(htmlParserCtxtPtr ctxt) { 3765 xmlChar *currentNode; 3766 int depth; 3767 3768 currentNode = xmlStrdup(ctxt->name); 3769 depth = ctxt->nameNr; 3770 while (1) { 3771 long cons = ctxt->nbChars; 3772 3773 GROW; 3774 /* 3775 * Our tag or one of it's parent or children is ending. 3776 */ 3777 if ((CUR == '<') && (NXT(1) == '/')) { 3778 if (htmlParseEndTag(ctxt) && 3779 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 3780 if (currentNode != NULL) 3781 xmlFree(currentNode); 3782 return; 3783 } 3784 continue; /* while */ 3785 } 3786 3787 /* 3788 * Has this node been popped out during parsing of 3789 * the next element 3790 */ 3791 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 3792 (!xmlStrEqual(currentNode, ctxt->name))) 3793 { 3794 if (currentNode != NULL) xmlFree(currentNode); 3795 return; 3796 } 3797 3798 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || 3799 (xmlStrEqual(currentNode, BAD_CAST"style")))) { 3800 /* 3801 * Handle SCRIPT/STYLE separately 3802 */ 3803 htmlParseScript(ctxt); 3804 } else { 3805 /* 3806 * Sometimes DOCTYPE arrives in the middle of the document 3807 */ 3808 if ((CUR == '<') && (NXT(1) == '!') && 3809 (UPP(2) == 'D') && (UPP(3) == 'O') && 3810 (UPP(4) == 'C') && (UPP(5) == 'T') && 3811 (UPP(6) == 'Y') && (UPP(7) == 'P') && 3812 (UPP(8) == 'E')) { 3813 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3814 "Misplaced DOCTYPE declaration\n", 3815 BAD_CAST "DOCTYPE" , NULL); 3816 htmlParseDocTypeDecl(ctxt); 3817 } 3818 3819 /* 3820 * First case : a comment 3821 */ 3822 if ((CUR == '<') && (NXT(1) == '!') && 3823 (NXT(2) == '-') && (NXT(3) == '-')) { 3824 htmlParseComment(ctxt); 3825 } 3826 3827 /* 3828 * Second case : a Processing 
Instruction. 3829 */ 3830 else if ((CUR == '<') && (NXT(1) == '?')) { 3831 htmlParsePI(ctxt); 3832 } 3833 3834 /* 3835 * Third case : a sub-element. 3836 */ 3837 else if (CUR == '<') { 3838 htmlParseElement(ctxt); 3839 } 3840 3841 /* 3842 * Fourth case : a reference. If if has not been resolved, 3843 * parsing returns it's Name, create the node 3844 */ 3845 else if (CUR == '&') { 3846 htmlParseReference(ctxt); 3847 } 3848 3849 /* 3850 * Fifth case : end of the resource 3851 */ 3852 else if (CUR == 0) { 3853 htmlAutoCloseOnEnd(ctxt); 3854 break; 3855 } 3856 3857 /* 3858 * Last case, text. Note that References are handled directly. 3859 */ 3860 else { 3861 htmlParseCharData(ctxt); 3862 } 3863 3864 if (cons == ctxt->nbChars) { 3865 if (ctxt->node != NULL) { 3866 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3867 "detected an error in element content\n", 3868 NULL, NULL); 3869 } 3870 break; 3871 } 3872 } 3873 GROW; 3874 } 3875 if (currentNode != NULL) xmlFree(currentNode); 3876} 3877 3878/** 3879 * htmlParseContent: 3880 * @ctxt: an HTML parser context 3881 * 3882 * Parse a content: comment, sub-element, reference or text. 
3883 */ 3884 3885void 3886__htmlParseContent(void *ctxt) { 3887 if (ctxt != NULL) 3888 htmlParseContent((htmlParserCtxtPtr) ctxt); 3889} 3890 3891/** 3892 * htmlParseElement: 3893 * @ctxt: an HTML parser context 3894 * 3895 * parse an HTML element, this is highly recursive 3896 * 3897 * [39] element ::= EmptyElemTag | STag content ETag 3898 * 3899 * [41] Attribute ::= Name Eq AttValue 3900 */ 3901 3902void 3903htmlParseElement(htmlParserCtxtPtr ctxt) { 3904 const xmlChar *name; 3905 xmlChar *currentNode = NULL; 3906 const htmlElemDesc * info; 3907 htmlParserNodeInfo node_info; 3908 int failed; 3909 int depth; 3910 const xmlChar *oldptr; 3911 3912 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3913 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3914 "htmlParseElement: context error\n", NULL, NULL); 3915 return; 3916 } 3917 /* Capture start position */ 3918 if (ctxt->record_info) { 3919 node_info.begin_pos = ctxt->input->consumed + 3920 (CUR_PTR - ctxt->input->base); 3921 node_info.begin_line = ctxt->input->line; 3922 } 3923 3924 failed = htmlParseStartTag(ctxt); 3925 name = ctxt->name; 3926 if (failed || (name == NULL)) { 3927 if (CUR == '>') 3928 NEXT; 3929 return; 3930 } 3931 3932 /* 3933 * Lookup the info for that element. 3934 */ 3935 info = htmlTagLookup(name); 3936 if (info == NULL) { 3937 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 3938 "Tag %s invalid\n", name, NULL); 3939 } 3940 3941 /* 3942 * Check for an Empty Element labeled the XML/SGML way 3943 */ 3944 if ((CUR == '/') && (NXT(1) == '>')) { 3945 SKIP(2); 3946 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 3947 ctxt->sax->endElement(ctxt->userData, name); 3948 htmlnamePop(ctxt); 3949 return; 3950 } 3951 3952 if (CUR == '>') { 3953 NEXT; 3954 } else { 3955 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 3956 "Couldn't find end of Start Tag %s\n", name, NULL); 3957 3958 /* 3959 * end of parsing of this node. 
3960 */ 3961 if (xmlStrEqual(name, ctxt->name)) { 3962 nodePop(ctxt); 3963 htmlnamePop(ctxt); 3964 } 3965 3966 /* 3967 * Capture end position and add node 3968 */ 3969 if (ctxt->record_info) { 3970 node_info.end_pos = ctxt->input->consumed + 3971 (CUR_PTR - ctxt->input->base); 3972 node_info.end_line = ctxt->input->line; 3973 node_info.node = ctxt->node; 3974 xmlParserAddNodeInfo(ctxt, &node_info); 3975 } 3976 return; 3977 } 3978 3979 /* 3980 * Check for an Empty Element from DTD definition 3981 */ 3982 if ((info != NULL) && (info->empty)) { 3983 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 3984 ctxt->sax->endElement(ctxt->userData, name); 3985 htmlnamePop(ctxt); 3986 return; 3987 } 3988 3989 /* 3990 * Parse the content of the element: 3991 */ 3992 currentNode = xmlStrdup(ctxt->name); 3993 depth = ctxt->nameNr; 3994 while (IS_CHAR_CH(CUR)) { 3995 oldptr = ctxt->input->cur; 3996 htmlParseContent(ctxt); 3997 if (oldptr==ctxt->input->cur) break; 3998 if (ctxt->nameNr < depth) break; 3999 } 4000 4001 /* 4002 * Capture end position and add node 4003 */ 4004 if ( currentNode != NULL && ctxt->record_info ) { 4005 node_info.end_pos = ctxt->input->consumed + 4006 (CUR_PTR - ctxt->input->base); 4007 node_info.end_line = ctxt->input->line; 4008 node_info.node = ctxt->node; 4009 xmlParserAddNodeInfo(ctxt, &node_info); 4010 } 4011 if (!IS_CHAR_CH(CUR)) { 4012 htmlAutoCloseOnEnd(ctxt); 4013 } 4014 4015 if (currentNode != NULL) 4016 xmlFree(currentNode); 4017} 4018 4019/** 4020 * htmlParseDocument: 4021 * @ctxt: an HTML parser context 4022 * 4023 * parse an HTML document (and build a tree if using the standard SAX 4024 * interface). 4025 * 4026 * Returns 0, -1 in case of error. the parser context is augmented 4027 * as a result of the parsing. 
4028 */ 4029 4030int 4031htmlParseDocument(htmlParserCtxtPtr ctxt) { 4032 xmlDtdPtr dtd; 4033 4034 xmlInitParser(); 4035 4036 htmlDefaultSAXHandlerInit(); 4037 4038 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4039 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4040 "htmlParseDocument: context error\n", NULL, NULL); 4041 return(XML_ERR_INTERNAL_ERROR); 4042 } 4043 ctxt->html = 1; 4044 GROW; 4045 /* 4046 * SAX: beginning of the document processing. 4047 */ 4048 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 4049 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); 4050 4051 /* 4052 * Wipe out everything which is before the first '<' 4053 */ 4054 SKIP_BLANKS; 4055 if (CUR == 0) { 4056 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 4057 "Document is empty\n", NULL, NULL); 4058 } 4059 4060 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) 4061 ctxt->sax->startDocument(ctxt->userData); 4062 4063 4064 /* 4065 * Parse possible comments and PIs before any content 4066 */ 4067 while (((CUR == '<') && (NXT(1) == '!') && 4068 (NXT(2) == '-') && (NXT(3) == '-')) || 4069 ((CUR == '<') && (NXT(1) == '?'))) { 4070 htmlParseComment(ctxt); 4071 htmlParsePI(ctxt); 4072 SKIP_BLANKS; 4073 } 4074 4075 4076 /* 4077 * Then possibly doc type declaration(s) and more Misc 4078 * (doctypedecl Misc*)? 
4079 */ 4080 if ((CUR == '<') && (NXT(1) == '!') && 4081 (UPP(2) == 'D') && (UPP(3) == 'O') && 4082 (UPP(4) == 'C') && (UPP(5) == 'T') && 4083 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4084 (UPP(8) == 'E')) { 4085 htmlParseDocTypeDecl(ctxt); 4086 } 4087 SKIP_BLANKS; 4088 4089 /* 4090 * Parse possible comments and PIs before any content 4091 */ 4092 while (((CUR == '<') && (NXT(1) == '!') && 4093 (NXT(2) == '-') && (NXT(3) == '-')) || 4094 ((CUR == '<') && (NXT(1) == '?'))) { 4095 htmlParseComment(ctxt); 4096 htmlParsePI(ctxt); 4097 SKIP_BLANKS; 4098 } 4099 4100 /* 4101 * Time to start parsing the tree itself 4102 */ 4103 htmlParseContent(ctxt); 4104 4105 /* 4106 * autoclose 4107 */ 4108 if (CUR == 0) 4109 htmlAutoCloseOnEnd(ctxt); 4110 4111 4112 /* 4113 * SAX: end of the document processing. 4114 */ 4115 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 4116 ctxt->sax->endDocument(ctxt->userData); 4117 4118 if (ctxt->myDoc != NULL) { 4119 dtd = xmlGetIntSubset(ctxt->myDoc); 4120 if (dtd == NULL) 4121 ctxt->myDoc->intSubset = 4122 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 4123 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 4124 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 4125 } 4126 if (! 
ctxt->wellFormed) return(-1); 4127 return(0); 4128} 4129 4130 4131/************************************************************************ 4132 * * 4133 * Parser contexts handling * 4134 * * 4135 ************************************************************************/ 4136 4137/** 4138 * htmlInitParserCtxt: 4139 * @ctxt: an HTML parser context 4140 * 4141 * Initialize a parser context 4142 * 4143 * Returns 0 in case of success and -1 in case of error 4144 */ 4145 4146static int 4147htmlInitParserCtxt(htmlParserCtxtPtr ctxt) 4148{ 4149 htmlSAXHandler *sax; 4150 4151 if (ctxt == NULL) return(-1); 4152 memset(ctxt, 0, sizeof(htmlParserCtxt)); 4153 4154 ctxt->dict = xmlDictCreate(); 4155 if (ctxt->dict == NULL) { 4156 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4157 return(-1); 4158 } 4159 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); 4160 if (sax == NULL) { 4161 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4162 return(-1); 4163 } 4164 else 4165 memset(sax, 0, sizeof(htmlSAXHandler)); 4166 4167 /* Allocate the Input stack */ 4168 ctxt->inputTab = (htmlParserInputPtr *) 4169 xmlMalloc(5 * sizeof(htmlParserInputPtr)); 4170 if (ctxt->inputTab == NULL) { 4171 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4172 ctxt->inputNr = 0; 4173 ctxt->inputMax = 0; 4174 ctxt->input = NULL; 4175 return(-1); 4176 } 4177 ctxt->inputNr = 0; 4178 ctxt->inputMax = 5; 4179 ctxt->input = NULL; 4180 ctxt->version = NULL; 4181 ctxt->encoding = NULL; 4182 ctxt->standalone = -1; 4183 ctxt->instate = XML_PARSER_START; 4184 4185 /* Allocate the Node stack */ 4186 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr)); 4187 if (ctxt->nodeTab == NULL) { 4188 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4189 ctxt->nodeNr = 0; 4190 ctxt->nodeMax = 0; 4191 ctxt->node = NULL; 4192 ctxt->inputNr = 0; 4193 ctxt->inputMax = 0; 4194 ctxt->input = NULL; 4195 return(-1); 4196 } 4197 ctxt->nodeNr = 0; 4198 ctxt->nodeMax = 
10; 4199 ctxt->node = NULL; 4200 4201 /* Allocate the Name stack */ 4202 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); 4203 if (ctxt->nameTab == NULL) { 4204 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4205 ctxt->nameNr = 0; 4206 ctxt->nameMax = 10; 4207 ctxt->name = NULL; 4208 ctxt->nodeNr = 0; 4209 ctxt->nodeMax = 0; 4210 ctxt->node = NULL; 4211 ctxt->inputNr = 0; 4212 ctxt->inputMax = 0; 4213 ctxt->input = NULL; 4214 return(-1); 4215 } 4216 ctxt->nameNr = 0; 4217 ctxt->nameMax = 10; 4218 ctxt->name = NULL; 4219 4220 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; 4221 else { 4222 ctxt->sax = sax; 4223 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 4224 } 4225 ctxt->userData = ctxt; 4226 ctxt->myDoc = NULL; 4227 ctxt->wellFormed = 1; 4228 ctxt->replaceEntities = 0; 4229 ctxt->linenumbers = xmlLineNumbersDefaultValue; 4230 ctxt->html = 1; 4231 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0; 4232 ctxt->vctxt.userData = ctxt; 4233 ctxt->vctxt.error = xmlParserValidityError; 4234 ctxt->vctxt.warning = xmlParserValidityWarning; 4235 ctxt->record_info = 0; 4236 ctxt->validate = 0; 4237 ctxt->nbChars = 0; 4238 ctxt->checkIndex = 0; 4239 ctxt->catalogs = NULL; 4240 xmlInitNodeInfoSeq(&ctxt->node_seq); 4241 return(0); 4242} 4243 4244/** 4245 * htmlFreeParserCtxt: 4246 * @ctxt: an HTML parser context 4247 * 4248 * Free all the memory used by a parser context. However the parsed 4249 * document in ctxt->myDoc is not freed. 4250 */ 4251 4252void 4253htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) 4254{ 4255 xmlFreeParserCtxt(ctxt); 4256} 4257 4258/** 4259 * htmlNewParserCtxt: 4260 * 4261 * Allocate and initialize a new parser context. 
4262 * 4263 * Returns the xmlParserCtxtPtr or NULL 4264 */ 4265 4266static htmlParserCtxtPtr 4267htmlNewParserCtxt(void) 4268{ 4269 xmlParserCtxtPtr ctxt; 4270 4271 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); 4272 if (ctxt == NULL) { 4273 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n"); 4274 return(NULL); 4275 } 4276 memset(ctxt, 0, sizeof(xmlParserCtxt)); 4277 if (htmlInitParserCtxt(ctxt) < 0) { 4278 htmlFreeParserCtxt(ctxt); 4279 return(NULL); 4280 } 4281 return(ctxt); 4282} 4283 4284/** 4285 * htmlCreateMemoryParserCtxt: 4286 * @buffer: a pointer to a char array 4287 * @size: the size of the array 4288 * 4289 * Create a parser context for an HTML in-memory document. 4290 * 4291 * Returns the new parser context or NULL 4292 */ 4293htmlParserCtxtPtr 4294htmlCreateMemoryParserCtxt(const char *buffer, int size) { 4295 xmlParserCtxtPtr ctxt; 4296 xmlParserInputPtr input; 4297 xmlParserInputBufferPtr buf; 4298 4299 if (buffer == NULL) 4300 return(NULL); 4301 if (size <= 0) 4302 return(NULL); 4303 4304 ctxt = htmlNewParserCtxt(); 4305 if (ctxt == NULL) 4306 return(NULL); 4307 4308 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 4309 if (buf == NULL) return(NULL); 4310 4311 input = xmlNewInputStream(ctxt); 4312 if (input == NULL) { 4313 xmlFreeParserCtxt(ctxt); 4314 return(NULL); 4315 } 4316 4317 input->filename = NULL; 4318 input->buf = buf; 4319 input->base = input->buf->buffer->content; 4320 input->cur = input->buf->buffer->content; 4321 input->end = &input->buf->buffer->content[input->buf->buffer->use]; 4322 4323 inputPush(ctxt, input); 4324 return(ctxt); 4325} 4326 4327/** 4328 * htmlCreateDocParserCtxt: 4329 * @cur: a pointer to an array of xmlChar 4330 * @encoding: a free form C string describing the HTML document encoding, or NULL 4331 * 4332 * Create a parser context for an HTML document. 
4333 * 4334 * TODO: check the need to add encoding handling there 4335 * 4336 * Returns the new parser context or NULL 4337 */ 4338static htmlParserCtxtPtr 4339htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) { 4340 int len; 4341 htmlParserCtxtPtr ctxt; 4342 4343 if (cur == NULL) 4344 return(NULL); 4345 len = xmlStrlen(cur); 4346 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len); 4347 4348 if (encoding != NULL) { 4349 xmlCharEncoding enc; 4350 xmlCharEncodingHandlerPtr handler; 4351 4352 if (ctxt->input->encoding != NULL) 4353 xmlFree((xmlChar *) ctxt->input->encoding); 4354 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); 4355 4356 enc = xmlParseCharEncoding(encoding); 4357 /* 4358 * registered set of known encodings 4359 */ 4360 if (enc != XML_CHAR_ENCODING_ERROR) { 4361 xmlSwitchEncoding(ctxt, enc); 4362 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { 4363 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 4364 "Unsupported encoding %s\n", 4365 (const xmlChar *) encoding, NULL); 4366 } 4367 } else { 4368 /* 4369 * fallback for unknown encodings 4370 */ 4371 handler = xmlFindCharEncodingHandler((const char *) encoding); 4372 if (handler != NULL) { 4373 xmlSwitchToEncoding(ctxt, handler); 4374 } else { 4375 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 4376 "Unsupported encoding %s\n", 4377 (const xmlChar *) encoding, NULL); 4378 } 4379 } 4380 } 4381 return(ctxt); 4382} 4383 4384#ifdef LIBXML_PUSH_ENABLED 4385/************************************************************************ 4386 * * 4387 * Progressive parsing interfaces * 4388 * * 4389 ************************************************************************/ 4390 4391/** 4392 * htmlParseLookupSequence: 4393 * @ctxt: an HTML parser context 4394 * @first: the first char to lookup 4395 * @next: the next char to lookup or zero 4396 * @third: the next char to lookup or zero 4397 * @comment: flag to force checking inside comments 4398 * 4399 * Try to find if a 
 * sequence (first, next, third) or just (first next) or
 * (first) is available in the input stream.
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
 * to avoid rescanning sequences of bytes, it DOES change the state of the
 * parser, do not use liberally.
 * This is basically similar to xmlParseLookupSequence()
 *
 * Unless the iscomment argument is non-zero, text between "<!--" and
 * "-->" is skipped and never matched against the sequence.
 * On a match ctxt->checkIndex is reset to 0; when the scan runs off the
 * end of the data it is set to the position reached.
 *
 * Returns the index to the current parsing point if the full sequence
 *      is available, -1 otherwise.
 */
static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                        xmlChar next, xmlChar third, int iscomment) {
    int base, len;
    htmlParserInputPtr in;
    const xmlChar *buf;
    int incomment = 0;

    in = ctxt->input;
    if (in == NULL) return(-1);
    base = in->cur - in->base;
    if (base < 0) return(-1);
    /* resume where the previous unsuccessful lookup stopped */
    if (ctxt->checkIndex > base)
        base = ctxt->checkIndex;
    if (in->buf == NULL) {
        buf = in->base;
        len = in->length;
    } else {
        buf = in->buf->buffer->content;
        len = in->buf->buffer->use;
    }
    /* take into account the sequence length */
    if (third) len -= 2;
    else if (next) len --;
    for (;base < len;base++) {
        if (!incomment && (base + 4 < len) && !iscomment) {
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
                incomment = 1;
                /* do not increment past <! - some people use <!--> */
                base += 2;
            }
        }
        if (incomment) {
            /* not enough data left to hold a comment terminator */
            if (base + 3 > len)
                return(-1);
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
                (buf[base + 2] == '>')) {
                incomment = 0;
                base += 2;
            }
            continue;
        }
        if (buf[base] == first) {
            if (third != 0) {
                if ((buf[base + 1] != next) ||
                    (buf[base + 2] != third)) continue;
            } else if (next != 0) {
                if (buf[base + 1] != next) continue;
            }
            ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
            if (next == 0)
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c' found at %d\n",
                        first, base);
            else if (third == 0)
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c' found at %d\n",
                        first, next, base);
            else
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: lookup '%c%c%c' found at %d\n",
                        first, next, third, base);
#endif
            /* offset relative to the current parsing position */
            return(base - (in->cur - in->base));
        }
    }
    ctxt->checkIndex = base;
#ifdef DEBUG_PUSH
    if (next == 0)
        xmlGenericError(xmlGenericErrorContext,
                "HPP: lookup '%c' failed\n", first);
    else if (third == 0)
        xmlGenericError(xmlGenericErrorContext,
                "HPP: lookup '%c%c' failed\n", first, next);
    else
        xmlGenericError(xmlGenericErrorContext,
                "HPP: lookup '%c%c%c' failed\n", first, next, third);
#endif
    return(-1);
}

/**
 * htmlParseTryOrFinish:
 * @ctxt:  an HTML parser context
 * @terminate:  last chunk indicator
 *
 * Try to progress on parsing: run the push-parser state machine on the
 * data currently buffered, dispatching on ctxt->instate. Unless
 * @terminate is set, a construct is only parsed once
 * htmlParseLookupSequence() has found its terminator in the buffer;
 * otherwise the function stops and waits for more data.
 *
 * Returns zero if no parsing was possible.
 * NOTE(review): the local ret is never assigned after its
 * initialization, so as written this always returns 0.
 */
static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    int ret = 0;
    htmlParserInputPtr in;
    int avail = 0;
    xmlChar cur, next;

#ifdef DEBUG_PUSH
    switch (ctxt->instate) {
        case XML_PARSER_EOF:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EOF\n"); break;
        case XML_PARSER_START:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START\n"); break;
        case XML_PARSER_MISC:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try MISC\n");break;
        case XML_PARSER_COMMENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try COMMENT\n");break;
        case XML_PARSER_PROLOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PROLOG\n");break;
        case XML_PARSER_START_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try START_TAG\n");break;
        case XML_PARSER_CONTENT:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CONTENT\n");break;
        case XML_PARSER_CDATA_SECTION:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try CDATA_SECTION\n");break;
        case XML_PARSER_END_TAG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try END_TAG\n");break;
        case XML_PARSER_ENTITY_DECL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_DECL\n");break;
        case XML_PARSER_ENTITY_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ENTITY_VALUE\n");break;
        case XML_PARSER_ATTRIBUTE_VALUE:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try ATTRIBUTE_VALUE\n");break;
        case XML_PARSER_DTD:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try DTD\n");break;
        case XML_PARSER_EPILOG:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try EPILOG\n");break;
        case XML_PARSER_PI:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try PI\n");break;
        case XML_PARSER_SYSTEM_LITERAL:
            xmlGenericError(xmlGenericErrorContext,
                    "HPP: try SYSTEM_LITERAL\n");break;
    }
#endif

    while (1) {

        in = ctxt->input;
        if (in == NULL) break;
        /* number of bytes not yet consumed in the current input */
        if (in->buf == NULL)
            avail = in->length - (in->cur - in->base);
        else
            avail = in->buf->buffer->use - (in->cur - in->base);
        if ((avail == 0) && (terminate)) {
            htmlAutoCloseOnEnd(ctxt);
            if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
                /*
                 * SAX: end of the document processing.
                 */
                ctxt->instate = XML_PARSER_EOF;
                if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                    ctxt->sax->endDocument(ctxt->userData);
            }
        }
        if (avail < 1)
            goto done;
        cur = in->cur[0];
        /* a zero byte in the data is skipped, whatever the state */
        if (cur == 0) {
            SKIP(1);
            continue;
        }

        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                goto done;
            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 */
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    SKIP_BLANKS;
                    if (in->buf == NULL)
                        avail = in->length - (in->cur - in->base);
                    else
                        avail = in->buf->buffer->use - (in->cur - in->base);
                }
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                                                  &xmlDefaultSAXLocator);
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else {
                    ctxt->instate = XML_PARSER_MISC;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering MISC\n");
#endif
                }
                break;
            case XML_PARSER_MISC:
                /* before the DOCTYPE: comments, PIs, then DOCTYPE or content */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_MISC;
                } else if ((cur == '<') && (next == '!') &&
                    (UPP(2) == 'D') && (UPP(3) == 'O') &&
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                    (UPP(8) == 'E')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing internal subset\n");
#endif
                    htmlParseDocTypeDecl(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering PROLOG\n");
#endif
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 9)) {
                    /* "<!" seen but too little data to tell what follows */
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_PROLOG:
                /* after the DOCTYPE: comments and PIs, then content */
                SKIP_BLANKS;
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_PROLOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering START_TAG\n");
#endif
                }
                break;
            case XML_PARSER_EPILOG:
                /* entered from END_TAG once the name stack is empty */
                if (in->buf == NULL)
                    avail = in->length - (in->cur - in->base);
                else
                    avail = in->buf->buffer->use - (in->cur - in->base);
                if (avail < 1)
                    goto done;
                cur = in->cur[0];
                if (IS_BLANK_CH(cur)) {
                    htmlParseCharData(ctxt);
                    goto done;
                }
                if (avail < 2)
                    goto done;
                next = in->cur[1];
                if ((cur == '<') && (next == '!') &&
                    (in->cur[2] == '-') && (in->cur[3] == '-')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing Comment\n");
#endif
                    htmlParseComment(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '?')) {
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                        goto done;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: Parsing PI\n");
#endif
                    htmlParsePI(ctxt);
                    ctxt->instate = XML_PARSER_EPILOG;
                } else if ((cur == '<') && (next == '!') &&
                           (avail < 4)) {
                    goto done;
                } else {
                    /* anything else after the end of the document is an error */
                    ctxt->errNo = XML_ERR_DOCUMENT_END;
                    ctxt->wellFormed = 0;
                    ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering EOF\n");
#endif
                    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                        ctxt->sax->endDocument(ctxt->userData);
                    goto done;
                }
                break;
            case XML_PARSER_START_TAG: {
                const xmlChar *name;
                int failed;
                const htmlElemDesc * info;

                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                if (cur != '<') {
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }
                if (in->cur[1] == '/') {
                    ctxt->instate = XML_PARSER_END_TAG;
                    ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering END_TAG\n");
#endif
                    break;
                }
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                    goto done;

                failed = htmlParseStartTag(ctxt);
                name = ctxt->name;
                if (failed ||
                    (name == NULL)) {
                    if (CUR == '>')
                        NEXT;
                    break;
                }

                /*
                 * Lookup the info for that element.
                 */
                info = htmlTagLookup(name);
                if (info == NULL) {
                    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
                                 "Tag %s invalid\n", name, NULL);
                }

                /*
                 * Check for an Empty Element labeled the XML/SGML way
                 */
                if ((CUR == '/') && (NXT(1) == '>')) {
                    SKIP(2);
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                if (CUR == '>') {
                    NEXT;
                } else {
                    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                                 "Couldn't find end of Start Tag %s\n",
                                 name, NULL);

                    /*
                     * end of parsing of this node.
                     */
                    if (xmlStrEqual(name, ctxt->name)) {
                        nodePop(ctxt);
                        htmlnamePop(ctxt);
                    }

                    ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                    xmlGenericError(xmlGenericErrorContext,
                            "HPP: entering CONTENT\n");
#endif
                    break;
                }

                /*
                 * Check for an Empty Element from DTD definition
                 */
                if ((info != NULL) && (info->empty)) {
                    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
                        ctxt->sax->endElement(ctxt->userData, name);
                    htmlnamePop(ctxt);
                }
                ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            }
            case XML_PARSER_CONTENT: {
                long cons;
                /*
                 * Handle preparsed entities and charRef
                 */
                if (ctxt->token != 0) {
                    xmlChar chr[2] = { 0 , 0 } ;

                    chr[0] = (xmlChar) ctxt->token;
                    htmlCheckParagraph(ctxt);
                    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
                        ctxt->sax->characters(ctxt->userData, chr, 1);
                    ctxt->token = 0;
                    ctxt->checkIndex = 0;
                }
                /*
                 * A single trailing character on the last chunk is
                 * delivered directly, since the code below needs two.
                 */
                if ((avail == 1) && (terminate)) {
                    cur = in->cur[0];
                    if ((cur != '<') && (cur != '&')) {
                        if (ctxt->sax != NULL) {
                            if (IS_BLANK_CH(cur)) {
                                if (ctxt->sax->ignorableWhitespace != NULL)
                                    ctxt->sax->ignorableWhitespace(
                                            ctxt->userData, &cur, 1);
                            } else {
                                htmlCheckParagraph(ctxt);
                                if (ctxt->sax->characters != NULL)
                                    ctxt->sax->characters(
                                            ctxt->userData, &cur, 1);
                            }
                        }
                        ctxt->token = 0;
                        ctxt->checkIndex = 0;
                        in->cur++;
                        break;
                    }
                }
                if (avail < 2)
                    goto done;
                cur = in->cur[0];
                next = in->cur[1];
                /* remembered to detect, further down, that nothing advanced */
                cons = ctxt->nbChars;
                if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
                    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
                    /*
                     * Handle SCRIPT/STYLE separately
                     */
                    if ((!terminate) &&
                        (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
                        goto done;
                    htmlParseScript(ctxt);
                    /* cur/next still hold the bytes read before the call */
                    if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    }
                } else {
                    /*
                     * Sometimes DOCTYPE arrives in the middle of the document
                     */
                    if ((cur == '<') && (next == '!') &&
                        (UPP(2) == 'D') && (UPP(3) == 'O') &&
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                        (UPP(8) == 'E')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                            goto done;
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
                                     "Misplaced DOCTYPE declaration\n",
                                     BAD_CAST "DOCTYPE" , NULL);
                        htmlParseDocTypeDecl(ctxt);
                    } else if ((cur == '<') && (next == '!') &&
                        (in->cur[2] == '-') && (in->cur[3] == '-')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(
                                    ctxt, '-', '-', '>', 1) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Comment\n");
#endif
                        htmlParseComment(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '?')) {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing PI\n");
#endif
                        htmlParsePI(ctxt);
                        ctxt->instate = XML_PARSER_CONTENT;
                    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
                        goto done;
                    } else if ((cur == '<') && (next == '/')) {
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering END_TAG\n");
#endif
                        break;
                    } else if (cur == '<') {
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: entering START_TAG\n");
#endif
                        break;
                    } else if (cur == '&') {
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
                            goto done;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing Reference\n");
#endif
                        /* TODO: check generation of subtrees if noent !!! */
                        htmlParseReference(ctxt);
                    } else {
                        /*
                         * check that the text sequence is complete
                         * before handing out the data to the parser
                         * to avoid problems with erroneous end of
                         * data detection.
                         */
                        if ((!terminate) &&
                            (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
                            goto done;
                        ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                        xmlGenericError(xmlGenericErrorContext,
                                "HPP: Parsing char data\n");
#endif
                        htmlParseCharData(ctxt);
                    }
                }
                /*
                 * ctxt->nbChars did not change: report an error (when a
                 * node is open) and step over one character so the loop
                 * cannot spin on the same input.
                 */
                if (cons == ctxt->nbChars) {
                    if (ctxt->node != NULL) {
                        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                                     "detected an error in element content\n",
                                     NULL, NULL);
                    }
                    NEXT;
                    break;
                }

                break;
            }
            case XML_PARSER_END_TAG:
                if (avail < 2)
                    goto done;
                if ((!terminate) &&
                    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
                    goto done;
                htmlParseEndTag(ctxt);
                if (ctxt->nameNr == 0) {
                    ctxt->instate = XML_PARSER_EPILOG;
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                }
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            /*
             * The remaining states are never set by the code above; if
             * one is encountered an internal error is reported and
             * parsing resumes from CONTENT (START_TAG for
             * ATTRIBUTE_VALUE).
             */
            case XML_PARSER_CDATA_SECTION:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == CDATA\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_DTD:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == DTD\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_COMMENT:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == COMMENT\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PI:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == PI\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_DECL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ENTITY_DECL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_ENTITY_VALUE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ENTITY_VALUE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                /* NOTE(review): message says DTD but the state set is CONTENT */
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering DTD\n");
#endif
                break;
            case XML_PARSER_ATTRIBUTE_VALUE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == ATTRIBUTE_VALUE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_START_TAG;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering START_TAG\n");
#endif
                break;
            case XML_PARSER_SYSTEM_LITERAL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_IGNORE:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == XML_PARSER_IGNORE\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;
            case XML_PARSER_PUBLIC_LITERAL:
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                        "HPP: internal error, state == XML_PARSER_LITERAL\n",
                             NULL, NULL);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
                xmlGenericError(xmlGenericErrorContext,
                        "HPP: entering CONTENT\n");
#endif
                break;

        }
    }
done:
    /* all data consumed on the last chunk: close what is still open */
    if ((avail == 0) && (terminate)) {
        htmlAutoCloseOnEnd(ctxt);
        if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
            /*
             * SAX: end of the document processing.
             */
            ctxt->instate = XML_PARSER_EOF;
            if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
                ctxt->sax->endDocument(ctxt->userData);
        }
    }
    /*
     * At the end of the document, give a tree that has no internal
     * subset the HTML 4.0 Transitional one.
     */
    if ((ctxt->myDoc != NULL) &&
        ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
         (ctxt->instate == XML_PARSER_EPILOG))) {
        xmlDtdPtr dtd;
        dtd = xmlGetIntSubset(ctxt->myDoc);
        if (dtd == NULL)
            ctxt->myDoc->intSubset =
                xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
                    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
                    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
    }
#ifdef DEBUG_PUSH
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
#endif
    return(ret);
}

/**
 * htmlParseChunk:
 * @ctxt:  an HTML parser context
 * @chunk:  an char array
 * @size:  the size in byte of the chunk
 * @terminate:  last chunk indicator
 *
 * Parse a Chunk of memory
 *
 * Returns zero if no error, the xmlParserErrors otherwise.
5221 */ 5222int 5223htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 5224 int terminate) { 5225 if ((ctxt == NULL) || (ctxt->input == NULL)) { 5226 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5227 "htmlParseChunk: context error\n", NULL, NULL); 5228 return(XML_ERR_INTERNAL_ERROR); 5229 } 5230 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5231 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 5232 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5233 int cur = ctxt->input->cur - ctxt->input->base; 5234 int res; 5235 5236 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 5237 if (res < 0) { 5238 ctxt->errNo = XML_PARSER_EOF; 5239 ctxt->disableSAX = 1; 5240 return (XML_PARSER_EOF); 5241 } 5242 ctxt->input->base = ctxt->input->buf->buffer->content + base; 5243 ctxt->input->cur = ctxt->input->base + cur; 5244 ctxt->input->end = 5245 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5246#ifdef DEBUG_PUSH 5247 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5248#endif 5249 5250#if 0 5251 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 5252 htmlParseTryOrFinish(ctxt, terminate); 5253#endif 5254 } else if (ctxt->instate != XML_PARSER_EOF) { 5255 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 5256 xmlParserInputBufferPtr in = ctxt->input->buf; 5257 if ((in->encoder != NULL) && (in->buffer != NULL) && 5258 (in->raw != NULL)) { 5259 int nbchars; 5260 5261 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); 5262 if (nbchars < 0) { 5263 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 5264 "encoder error\n", NULL, NULL); 5265 return(XML_ERR_INVALID_ENCODING); 5266 } 5267 } 5268 } 5269 } 5270 htmlParseTryOrFinish(ctxt, terminate); 5271 if (terminate) { 5272 if ((ctxt->instate != XML_PARSER_EOF) && 5273 (ctxt->instate != XML_PARSER_EPILOG) && 5274 (ctxt->instate != XML_PARSER_MISC)) { 5275 ctxt->errNo = XML_ERR_DOCUMENT_END; 5276 
ctxt->wellFormed = 0; 5277 } 5278 if (ctxt->instate != XML_PARSER_EOF) { 5279 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5280 ctxt->sax->endDocument(ctxt->userData); 5281 } 5282 ctxt->instate = XML_PARSER_EOF; 5283 } 5284 return((xmlParserErrors) ctxt->errNo); 5285} 5286 5287/************************************************************************ 5288 * * 5289 * User entry points * 5290 * * 5291 ************************************************************************/ 5292 5293/** 5294 * htmlCreatePushParserCtxt: 5295 * @sax: a SAX handler 5296 * @user_data: The user data returned on SAX callbacks 5297 * @chunk: a pointer to an array of chars 5298 * @size: number of chars in the array 5299 * @filename: an optional file name or URI 5300 * @enc: an optional encoding 5301 * 5302 * Create a parser context for using the HTML parser in push mode 5303 * The value of @filename is used for fetching external entities 5304 * and error/warning reports. 5305 * 5306 * Returns the new parser context or NULL 5307 */ 5308htmlParserCtxtPtr 5309htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 5310 const char *chunk, int size, const char *filename, 5311 xmlCharEncoding enc) { 5312 htmlParserCtxtPtr ctxt; 5313 htmlParserInputPtr inputStream; 5314 xmlParserInputBufferPtr buf; 5315 5316 xmlInitParser(); 5317 5318 buf = xmlAllocParserInputBuffer(enc); 5319 if (buf == NULL) return(NULL); 5320 5321 ctxt = htmlNewParserCtxt(); 5322 if (ctxt == NULL) { 5323 xmlFreeParserInputBuffer(buf); 5324 return(NULL); 5325 } 5326 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 5327 ctxt->charset=XML_CHAR_ENCODING_UTF8; 5328 if (sax != NULL) { 5329 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 5330 xmlFree(ctxt->sax); 5331 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 5332 if (ctxt->sax == NULL) { 5333 xmlFree(buf); 5334 xmlFree(ctxt); 5335 return(NULL); 5336 } 5337 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 5338 if (user_data != NULL) 
5339 ctxt->userData = user_data; 5340 } 5341 if (filename == NULL) { 5342 ctxt->directory = NULL; 5343 } else { 5344 ctxt->directory = xmlParserGetDirectory(filename); 5345 } 5346 5347 inputStream = htmlNewInputStream(ctxt); 5348 if (inputStream == NULL) { 5349 xmlFreeParserCtxt(ctxt); 5350 xmlFree(buf); 5351 return(NULL); 5352 } 5353 5354 if (filename == NULL) 5355 inputStream->filename = NULL; 5356 else 5357 inputStream->filename = (char *) 5358 xmlCanonicPath((const xmlChar *) filename); 5359 inputStream->buf = buf; 5360 inputStream->base = inputStream->buf->buffer->content; 5361 inputStream->cur = inputStream->buf->buffer->content; 5362 inputStream->end = 5363 &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; 5364 5365 inputPush(ctxt, inputStream); 5366 5367 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5368 (ctxt->input->buf != NULL)) { 5369 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5370 int cur = ctxt->input->cur - ctxt->input->base; 5371 5372 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 5373 5374 ctxt->input->base = ctxt->input->buf->buffer->content + base; 5375 ctxt->input->cur = ctxt->input->base + cur; 5376 ctxt->input->end = 5377 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5378#ifdef DEBUG_PUSH 5379 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5380#endif 5381 } 5382 5383 return(ctxt); 5384} 5385#endif /* LIBXML_PUSH_ENABLED */ 5386 5387/** 5388 * htmlSAXParseDoc: 5389 * @cur: a pointer to an array of xmlChar 5390 * @encoding: a free form C string describing the HTML document encoding, or NULL 5391 * @sax: the SAX handler block 5392 * @userData: if using SAX, this pointer will be provided on callbacks. 5393 * 5394 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 5395 * to handle parse events. If sax is NULL, fallback to the default DOM 5396 * behavior and return a tree. 
5397 * 5398 * Returns the resulting document tree unless SAX is NULL or the document is 5399 * not well formed. 5400 */ 5401 5402htmlDocPtr 5403htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { 5404 htmlDocPtr ret; 5405 htmlParserCtxtPtr ctxt; 5406 5407 xmlInitParser(); 5408 5409 if (cur == NULL) return(NULL); 5410 5411 5412 ctxt = htmlCreateDocParserCtxt(cur, encoding); 5413 if (ctxt == NULL) return(NULL); 5414 if (sax != NULL) { 5415 if (ctxt->sax != NULL) xmlFree (ctxt->sax); 5416 ctxt->sax = sax; 5417 ctxt->userData = userData; 5418 } 5419 5420 htmlParseDocument(ctxt); 5421 ret = ctxt->myDoc; 5422 if (sax != NULL) { 5423 ctxt->sax = NULL; 5424 ctxt->userData = NULL; 5425 } 5426 htmlFreeParserCtxt(ctxt); 5427 5428 return(ret); 5429} 5430 5431/** 5432 * htmlParseDoc: 5433 * @cur: a pointer to an array of xmlChar 5434 * @encoding: a free form C string describing the HTML document encoding, or NULL 5435 * 5436 * parse an HTML in-memory document and build a tree. 5437 * 5438 * Returns the resulting document tree 5439 */ 5440 5441htmlDocPtr 5442htmlParseDoc(xmlChar *cur, const char *encoding) { 5443 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 5444} 5445 5446 5447/** 5448 * htmlCreateFileParserCtxt: 5449 * @filename: the filename 5450 * @encoding: a free form C string describing the HTML document encoding, or NULL 5451 * 5452 * Create a parser context for a file content. 5453 * Automatic support for ZLIB/Compress compressed document is provided 5454 * by default if found at compile-time. 
5455 * 5456 * Returns the new parser context or NULL 5457 */ 5458htmlParserCtxtPtr 5459htmlCreateFileParserCtxt(const char *filename, const char *encoding) 5460{ 5461 htmlParserCtxtPtr ctxt; 5462 htmlParserInputPtr inputStream; 5463 char *canonicFilename; 5464 /* htmlCharEncoding enc; */ 5465 xmlChar *content, *content_line = (xmlChar *) "charset="; 5466 5467 if (filename == NULL) 5468 return(NULL); 5469 5470 ctxt = htmlNewParserCtxt(); 5471 if (ctxt == NULL) { 5472 return(NULL); 5473 } 5474 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 5475 if (canonicFilename == NULL) { 5476#ifdef LIBXML_SAX1_ENABLED 5477 if (xmlDefaultSAXHandler.error != NULL) { 5478 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 5479 } 5480#endif 5481 xmlFreeParserCtxt(ctxt); 5482 return(NULL); 5483 } 5484 5485 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 5486 xmlFree(canonicFilename); 5487 if (inputStream == NULL) { 5488 xmlFreeParserCtxt(ctxt); 5489 return(NULL); 5490 } 5491 5492 inputPush(ctxt, inputStream); 5493 5494 /* set encoding */ 5495 if (encoding) { 5496 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); 5497 if (content) { 5498 strcpy ((char *)content, (char *)content_line); 5499 strcat ((char *)content, (char *)encoding); 5500 htmlCheckEncoding (ctxt, content); 5501 xmlFree (content); 5502 } 5503 } 5504 5505 return(ctxt); 5506} 5507 5508/** 5509 * htmlSAXParseFile: 5510 * @filename: the filename 5511 * @encoding: a free form C string describing the HTML document encoding, or NULL 5512 * @sax: the SAX handler block 5513 * @userData: if using SAX, this pointer will be provided on callbacks. 5514 * 5515 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 5516 * compressed document is provided by default if found at compile-time. 5517 * It use the given SAX function block to handle the parsing callback. 5518 * If sax is NULL, fallback to the default DOM tree building routines. 
5519 * 5520 * Returns the resulting document tree unless SAX is NULL or the document is 5521 * not well formed. 5522 */ 5523 5524htmlDocPtr 5525htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 5526 void *userData) { 5527 htmlDocPtr ret; 5528 htmlParserCtxtPtr ctxt; 5529 htmlSAXHandlerPtr oldsax = NULL; 5530 5531 xmlInitParser(); 5532 5533 ctxt = htmlCreateFileParserCtxt(filename, encoding); 5534 if (ctxt == NULL) return(NULL); 5535 if (sax != NULL) { 5536 oldsax = ctxt->sax; 5537 ctxt->sax = sax; 5538 ctxt->userData = userData; 5539 } 5540 5541 htmlParseDocument(ctxt); 5542 5543 ret = ctxt->myDoc; 5544 if (sax != NULL) { 5545 ctxt->sax = oldsax; 5546 ctxt->userData = NULL; 5547 } 5548 htmlFreeParserCtxt(ctxt); 5549 5550 return(ret); 5551} 5552 5553/** 5554 * htmlParseFile: 5555 * @filename: the filename 5556 * @encoding: a free form C string describing the HTML document encoding, or NULL 5557 * 5558 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 5559 * compressed document is provided by default if found at compile-time. 5560 * 5561 * Returns the resulting document tree 5562 */ 5563 5564htmlDocPtr 5565htmlParseFile(const char *filename, const char *encoding) { 5566 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 5567} 5568 5569/** 5570 * htmlHandleOmittedElem: 5571 * @val: int 0 or 1 5572 * 5573 * Set and return the previous value for handling HTML omitted tags. 5574 * 5575 * Returns the last value for 0 for no handling, 1 for auto insertion. 5576 */ 5577 5578int 5579htmlHandleOmittedElem(int val) { 5580 int old = htmlOmittedDefaultValue; 5581 5582 htmlOmittedDefaultValue = val; 5583 return(old); 5584} 5585 5586/** 5587 * htmlElementAllowedHere: 5588 * @parent: HTML parent element 5589 * @elt: HTML element 5590 * 5591 * Checks whether an HTML element may be a direct child of a parent element. 5592 * Note - doesn't check for deprecated elements 5593 * 5594 * Returns 1 if allowed; 0 otherwise. 
5595 */ 5596int 5597htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 5598 const char** p ; 5599 5600 if ( ! elt || ! parent || ! parent->subelts ) 5601 return 0 ; 5602 5603 for ( p = parent->subelts; *p; ++p ) 5604 if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 5605 return 1 ; 5606 5607 return 0 ; 5608} 5609/** 5610 * htmlElementStatusHere: 5611 * @parent: HTML parent element 5612 * @elt: HTML element 5613 * 5614 * Checks whether an HTML element may be a direct child of a parent element. 5615 * and if so whether it is valid or deprecated. 5616 * 5617 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 5618 */ 5619htmlStatus 5620htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 5621 if ( ! parent || ! elt ) 5622 return HTML_INVALID ; 5623 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 5624 return HTML_INVALID ; 5625 5626 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ; 5627} 5628/** 5629 * htmlAttrAllowed: 5630 * @elt: HTML element 5631 * @attr: HTML attribute 5632 * @legacy: whether to allow deprecated attributes 5633 * 5634 * Checks whether an attribute is valid for an element 5635 * Has full knowledge of Required and Deprecated attributes 5636 * 5637 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 5638 */ 5639htmlStatus 5640htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 5641 const char** p ; 5642 5643 if ( !elt || ! 
attr ) 5644 return HTML_INVALID ; 5645 5646 if ( elt->attrs_req ) 5647 for ( p = elt->attrs_req; *p; ++p) 5648 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 5649 return HTML_REQUIRED ; 5650 5651 if ( elt->attrs_opt ) 5652 for ( p = elt->attrs_opt; *p; ++p) 5653 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 5654 return HTML_VALID ; 5655 5656 if ( legacy && elt->attrs_depr ) 5657 for ( p = elt->attrs_depr; *p; ++p) 5658 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 5659 return HTML_DEPRECATED ; 5660 5661 return HTML_INVALID ; 5662} 5663/** 5664 * htmlNodeStatus: 5665 * @node: an htmlNodePtr in a tree 5666 * @legacy: whether to allow deprecated elements (YES is faster here 5667 * for Element nodes) 5668 * 5669 * Checks whether the tree node is valid. Experimental (the author 5670 * only uses the HTML enhancements in a SAX parser) 5671 * 5672 * Return: for Element nodes, a return from htmlElementAllowedHere (if 5673 * legacy allowed) or htmlElementStatusHere (otherwise). 5674 * for Attribute nodes, a return from htmlAttrAllowed 5675 * for other nodes, HTML_NA (no checks performed) 5676 */ 5677htmlStatus 5678htmlNodeStatus(const htmlNodePtr node, int legacy) { 5679 if ( ! node ) 5680 return HTML_INVALID ; 5681 5682 switch ( node->type ) { 5683 case XML_ELEMENT_NODE: 5684 return legacy 5685 ? ( htmlElementAllowedHere ( 5686 htmlTagLookup(node->parent->name) , node->name 5687 ) ? 
HTML_VALID : HTML_INVALID ) 5688 : htmlElementStatusHere( 5689 htmlTagLookup(node->parent->name) , 5690 htmlTagLookup(node->name) ) 5691 ; 5692 case XML_ATTRIBUTE_NODE: 5693 return htmlAttrAllowed( 5694 htmlTagLookup(node->parent->name) , node->name, legacy) ; 5695 default: return HTML_NA ; 5696 } 5697} 5698/************************************************************************ 5699 * * 5700 * New set (2.6.0) of simpler and more flexible APIs * 5701 * * 5702 ************************************************************************/ 5703/** 5704 * DICT_FREE: 5705 * @str: a string 5706 * 5707 * Free a string if it is not owned by the "dict" dictionnary in the 5708 * current scope 5709 */ 5710#define DICT_FREE(str) \ 5711 if ((str) && ((!dict) || \ 5712 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 5713 xmlFree((char *)(str)); 5714 5715/** 5716 * htmlCtxtReset: 5717 * @ctxt: an HTML parser context 5718 * 5719 * Reset a parser context 5720 */ 5721void 5722htmlCtxtReset(htmlParserCtxtPtr ctxt) 5723{ 5724 xmlParserInputPtr input; 5725 xmlDictPtr dict; 5726 5727 if (ctxt == NULL) 5728 return; 5729 5730 dict = ctxt->dict; 5731 5732 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 5733 xmlFreeInputStream(input); 5734 } 5735 ctxt->inputNr = 0; 5736 ctxt->input = NULL; 5737 5738 ctxt->spaceNr = 0; 5739 if (ctxt->spaceTab != NULL) { 5740 ctxt->spaceTab[0] = -1; 5741 ctxt->space = &ctxt->spaceTab[0]; 5742 } else { 5743 ctxt->space = NULL; 5744 } 5745 5746 5747 ctxt->nodeNr = 0; 5748 ctxt->node = NULL; 5749 5750 ctxt->nameNr = 0; 5751 ctxt->name = NULL; 5752 5753 DICT_FREE(ctxt->version); 5754 ctxt->version = NULL; 5755 DICT_FREE(ctxt->encoding); 5756 ctxt->encoding = NULL; 5757 DICT_FREE(ctxt->directory); 5758 ctxt->directory = NULL; 5759 DICT_FREE(ctxt->extSubURI); 5760 ctxt->extSubURI = NULL; 5761 DICT_FREE(ctxt->extSubSystem); 5762 ctxt->extSubSystem = NULL; 5763 if (ctxt->myDoc != NULL) 5764 xmlFreeDoc(ctxt->myDoc); 5765 ctxt->myDoc = NULL; 5766 5767 
ctxt->standalone = -1; 5768 ctxt->hasExternalSubset = 0; 5769 ctxt->hasPErefs = 0; 5770 ctxt->html = 1; 5771 ctxt->external = 0; 5772 ctxt->instate = XML_PARSER_START; 5773 ctxt->token = 0; 5774 5775 ctxt->wellFormed = 1; 5776 ctxt->nsWellFormed = 1; 5777 ctxt->valid = 1; 5778 ctxt->vctxt.userData = ctxt; 5779 ctxt->vctxt.error = xmlParserValidityError; 5780 ctxt->vctxt.warning = xmlParserValidityWarning; 5781 ctxt->record_info = 0; 5782 ctxt->nbChars = 0; 5783 ctxt->checkIndex = 0; 5784 ctxt->inSubset = 0; 5785 ctxt->errNo = XML_ERR_OK; 5786 ctxt->depth = 0; 5787 ctxt->charset = XML_CHAR_ENCODING_UTF8; 5788 ctxt->catalogs = NULL; 5789 xmlInitNodeInfoSeq(&ctxt->node_seq); 5790 5791 if (ctxt->attsDefault != NULL) { 5792 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); 5793 ctxt->attsDefault = NULL; 5794 } 5795 if (ctxt->attsSpecial != NULL) { 5796 xmlHashFree(ctxt->attsSpecial, NULL); 5797 ctxt->attsSpecial = NULL; 5798 } 5799} 5800 5801/** 5802 * htmlCtxtUseOptions: 5803 * @ctxt: an HTML parser context 5804 * @options: a combination of htmlParserOption(s) 5805 * 5806 * Applies the options to the parser context 5807 * 5808 * Returns 0 in case of success, the set of unknown or unimplemented options 5809 * in case of error. 
5810 */ 5811int 5812htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 5813{ 5814 if (ctxt == NULL) 5815 return(-1); 5816 5817 if (options & HTML_PARSE_NOWARNING) { 5818 ctxt->sax->warning = NULL; 5819 ctxt->vctxt.warning = NULL; 5820 options -= XML_PARSE_NOWARNING; 5821 ctxt->options |= XML_PARSE_NOWARNING; 5822 } 5823 if (options & HTML_PARSE_NOERROR) { 5824 ctxt->sax->error = NULL; 5825 ctxt->vctxt.error = NULL; 5826 ctxt->sax->fatalError = NULL; 5827 options -= XML_PARSE_NOERROR; 5828 ctxt->options |= XML_PARSE_NOERROR; 5829 } 5830 if (options & HTML_PARSE_PEDANTIC) { 5831 ctxt->pedantic = 1; 5832 options -= XML_PARSE_PEDANTIC; 5833 ctxt->options |= XML_PARSE_PEDANTIC; 5834 } else 5835 ctxt->pedantic = 0; 5836 if (options & XML_PARSE_NOBLANKS) { 5837 ctxt->keepBlanks = 0; 5838 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 5839 options -= XML_PARSE_NOBLANKS; 5840 ctxt->options |= XML_PARSE_NOBLANKS; 5841 } else 5842 ctxt->keepBlanks = 1; 5843 if (options & HTML_PARSE_RECOVER) { 5844 ctxt->recovery = 1; 5845 } else 5846 ctxt->recovery = 0; 5847 if (options & HTML_PARSE_COMPACT) { 5848 ctxt->options |= HTML_PARSE_COMPACT; 5849 options -= HTML_PARSE_COMPACT; 5850 } 5851 ctxt->dictNames = 0; 5852 return (options); 5853} 5854 5855/** 5856 * htmlDoRead: 5857 * @ctxt: an HTML parser context 5858 * @URL: the base URL to use for the document 5859 * @encoding: the document encoding, or NULL 5860 * @options: a combination of htmlParserOption(s) 5861 * @reuse: keep the context for reuse 5862 * 5863 * Common front-end for the htmlRead functions 5864 * 5865 * Returns the resulting document tree or NULL 5866 */ 5867static htmlDocPtr 5868htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, 5869 int options, int reuse) 5870{ 5871 htmlDocPtr ret; 5872 5873 htmlCtxtUseOptions(ctxt, options); 5874 ctxt->html = 1; 5875 if (encoding != NULL) { 5876 xmlCharEncodingHandlerPtr hdlr; 5877 5878 hdlr = xmlFindCharEncodingHandler(encoding); 5879 if 
(hdlr != NULL) 5880 xmlSwitchToEncoding(ctxt, hdlr); 5881 } 5882 if ((URL != NULL) && (ctxt->input != NULL) && 5883 (ctxt->input->filename == NULL)) 5884 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); 5885 htmlParseDocument(ctxt); 5886 ret = ctxt->myDoc; 5887 ctxt->myDoc = NULL; 5888 if (!reuse) { 5889 if ((ctxt->dictNames) && 5890 (ret != NULL) && 5891 (ret->dict == ctxt->dict)) 5892 ctxt->dict = NULL; 5893 xmlFreeParserCtxt(ctxt); 5894 } 5895 return (ret); 5896} 5897 5898/** 5899 * htmlReadDoc: 5900 * @cur: a pointer to a zero terminated string 5901 * @URL: the base URL to use for the document 5902 * @encoding: the document encoding, or NULL 5903 * @options: a combination of htmlParserOption(s) 5904 * 5905 * parse an XML in-memory document and build a tree. 5906 * 5907 * Returns the resulting document tree 5908 */ 5909htmlDocPtr 5910htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 5911{ 5912 htmlParserCtxtPtr ctxt; 5913 5914 if (cur == NULL) 5915 return (NULL); 5916 5917 ctxt = xmlCreateDocParserCtxt(cur); 5918 if (ctxt == NULL) 5919 return (NULL); 5920 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 5921} 5922 5923/** 5924 * htmlReadFile: 5925 * @filename: a file or URL 5926 * @encoding: the document encoding, or NULL 5927 * @options: a combination of htmlParserOption(s) 5928 * 5929 * parse an XML file from the filesystem or the network. 
5930 * 5931 * Returns the resulting document tree 5932 */ 5933htmlDocPtr 5934htmlReadFile(const char *filename, const char *encoding, int options) 5935{ 5936 htmlParserCtxtPtr ctxt; 5937 5938 ctxt = htmlCreateFileParserCtxt(filename, encoding); 5939 if (ctxt == NULL) 5940 return (NULL); 5941 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 5942} 5943 5944/** 5945 * htmlReadMemory: 5946 * @buffer: a pointer to a char array 5947 * @size: the size of the array 5948 * @URL: the base URL to use for the document 5949 * @encoding: the document encoding, or NULL 5950 * @options: a combination of htmlParserOption(s) 5951 * 5952 * parse an XML in-memory document and build a tree. 5953 * 5954 * Returns the resulting document tree 5955 */ 5956htmlDocPtr 5957htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 5958{ 5959 htmlParserCtxtPtr ctxt; 5960 5961 ctxt = xmlCreateMemoryParserCtxt(buffer, size); 5962 if (ctxt == NULL) 5963 return (NULL); 5964 if (ctxt->sax != NULL) 5965 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 5966 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 5967} 5968 5969/** 5970 * htmlReadFd: 5971 * @fd: an open file descriptor 5972 * @URL: the base URL to use for the document 5973 * @encoding: the document encoding, or NULL 5974 * @options: a combination of htmlParserOption(s) 5975 * 5976 * parse an XML from a file descriptor and build a tree. 
5977 * 5978 * Returns the resulting document tree 5979 */ 5980htmlDocPtr 5981htmlReadFd(int fd, const char *URL, const char *encoding, int options) 5982{ 5983 htmlParserCtxtPtr ctxt; 5984 xmlParserInputBufferPtr input; 5985 xmlParserInputPtr stream; 5986 5987 if (fd < 0) 5988 return (NULL); 5989 5990 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 5991 if (input == NULL) 5992 return (NULL); 5993 ctxt = xmlNewParserCtxt(); 5994 if (ctxt == NULL) { 5995 xmlFreeParserInputBuffer(input); 5996 return (NULL); 5997 } 5998 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 5999 if (stream == NULL) { 6000 xmlFreeParserInputBuffer(input); 6001 xmlFreeParserCtxt(ctxt); 6002 return (NULL); 6003 } 6004 inputPush(ctxt, stream); 6005 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6006} 6007 6008/** 6009 * htmlReadIO: 6010 * @ioread: an I/O read function 6011 * @ioclose: an I/O close function 6012 * @ioctx: an I/O handler 6013 * @URL: the base URL to use for the document 6014 * @encoding: the document encoding, or NULL 6015 * @options: a combination of htmlParserOption(s) 6016 * 6017 * parse an HTML document from I/O functions and source and build a tree. 
6018 * 6019 * Returns the resulting document tree 6020 */ 6021htmlDocPtr 6022htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 6023 void *ioctx, const char *URL, const char *encoding, int options) 6024{ 6025 htmlParserCtxtPtr ctxt; 6026 xmlParserInputBufferPtr input; 6027 xmlParserInputPtr stream; 6028 6029 if (ioread == NULL) 6030 return (NULL); 6031 6032 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6033 XML_CHAR_ENCODING_NONE); 6034 if (input == NULL) 6035 return (NULL); 6036 ctxt = xmlNewParserCtxt(); 6037 if (ctxt == NULL) { 6038 xmlFreeParserInputBuffer(input); 6039 return (NULL); 6040 } 6041 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6042 if (stream == NULL) { 6043 xmlFreeParserInputBuffer(input); 6044 xmlFreeParserCtxt(ctxt); 6045 return (NULL); 6046 } 6047 inputPush(ctxt, stream); 6048 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6049} 6050 6051/** 6052 * htmlCtxtReadDoc: 6053 * @ctxt: an HTML parser context 6054 * @cur: a pointer to a zero terminated string 6055 * @URL: the base URL to use for the document 6056 * @encoding: the document encoding, or NULL 6057 * @options: a combination of htmlParserOption(s) 6058 * 6059 * parse an XML in-memory document and build a tree. 
6060 * This reuses the existing @ctxt parser context 6061 * 6062 * Returns the resulting document tree 6063 */ 6064htmlDocPtr 6065htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6066 const char *URL, const char *encoding, int options) 6067{ 6068 xmlParserInputPtr stream; 6069 6070 if (cur == NULL) 6071 return (NULL); 6072 if (ctxt == NULL) 6073 return (NULL); 6074 6075 htmlCtxtReset(ctxt); 6076 6077 stream = xmlNewStringInputStream(ctxt, cur); 6078 if (stream == NULL) { 6079 return (NULL); 6080 } 6081 inputPush(ctxt, stream); 6082 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6083} 6084 6085/** 6086 * htmlCtxtReadFile: 6087 * @ctxt: an HTML parser context 6088 * @filename: a file or URL 6089 * @encoding: the document encoding, or NULL 6090 * @options: a combination of htmlParserOption(s) 6091 * 6092 * parse an XML file from the filesystem or the network. 6093 * This reuses the existing @ctxt parser context 6094 * 6095 * Returns the resulting document tree 6096 */ 6097htmlDocPtr 6098htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 6099 const char *encoding, int options) 6100{ 6101 xmlParserInputPtr stream; 6102 6103 if (filename == NULL) 6104 return (NULL); 6105 if (ctxt == NULL) 6106 return (NULL); 6107 6108 htmlCtxtReset(ctxt); 6109 6110 stream = xmlLoadExternalEntity(filename, NULL, ctxt); 6111 if (stream == NULL) { 6112 return (NULL); 6113 } 6114 inputPush(ctxt, stream); 6115 return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 6116} 6117 6118/** 6119 * htmlCtxtReadMemory: 6120 * @ctxt: an HTML parser context 6121 * @buffer: a pointer to a char array 6122 * @size: the size of the array 6123 * @URL: the base URL to use for the document 6124 * @encoding: the document encoding, or NULL 6125 * @options: a combination of htmlParserOption(s) 6126 * 6127 * parse an XML in-memory document and build a tree. 
6128 * This reuses the existing @ctxt parser context 6129 * 6130 * Returns the resulting document tree 6131 */ 6132htmlDocPtr 6133htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 6134 const char *URL, const char *encoding, int options) 6135{ 6136 xmlParserInputBufferPtr input; 6137 xmlParserInputPtr stream; 6138 6139 if (ctxt == NULL) 6140 return (NULL); 6141 if (buffer == NULL) 6142 return (NULL); 6143 6144 htmlCtxtReset(ctxt); 6145 6146 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 6147 if (input == NULL) { 6148 return(NULL); 6149 } 6150 6151 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6152 if (stream == NULL) { 6153 xmlFreeParserInputBuffer(input); 6154 return(NULL); 6155 } 6156 6157 inputPush(ctxt, stream); 6158 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6159} 6160 6161/** 6162 * htmlCtxtReadFd: 6163 * @ctxt: an HTML parser context 6164 * @fd: an open file descriptor 6165 * @URL: the base URL to use for the document 6166 * @encoding: the document encoding, or NULL 6167 * @options: a combination of htmlParserOption(s) 6168 * 6169 * parse an XML from a file descriptor and build a tree. 
6170 * This reuses the existing @ctxt parser context 6171 * 6172 * Returns the resulting document tree 6173 */ 6174htmlDocPtr 6175htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 6176 const char *URL, const char *encoding, int options) 6177{ 6178 xmlParserInputBufferPtr input; 6179 xmlParserInputPtr stream; 6180 6181 if (fd < 0) 6182 return (NULL); 6183 if (ctxt == NULL) 6184 return (NULL); 6185 6186 htmlCtxtReset(ctxt); 6187 6188 6189 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6190 if (input == NULL) 6191 return (NULL); 6192 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6193 if (stream == NULL) { 6194 xmlFreeParserInputBuffer(input); 6195 return (NULL); 6196 } 6197 inputPush(ctxt, stream); 6198 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6199} 6200 6201/** 6202 * htmlCtxtReadIO: 6203 * @ctxt: an HTML parser context 6204 * @ioread: an I/O read function 6205 * @ioclose: an I/O close function 6206 * @ioctx: an I/O handler 6207 * @URL: the base URL to use for the document 6208 * @encoding: the document encoding, or NULL 6209 * @options: a combination of htmlParserOption(s) 6210 * 6211 * parse an HTML document from I/O functions and source and build a tree. 
6212 * This reuses the existing @ctxt parser context 6213 * 6214 * Returns the resulting document tree 6215 */ 6216htmlDocPtr 6217htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 6218 xmlInputCloseCallback ioclose, void *ioctx, 6219 const char *URL, 6220 const char *encoding, int options) 6221{ 6222 xmlParserInputBufferPtr input; 6223 xmlParserInputPtr stream; 6224 6225 if (ioread == NULL) 6226 return (NULL); 6227 if (ctxt == NULL) 6228 return (NULL); 6229 6230 htmlCtxtReset(ctxt); 6231 6232 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6233 XML_CHAR_ENCODING_NONE); 6234 if (input == NULL) 6235 return (NULL); 6236 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6237 if (stream == NULL) { 6238 xmlFreeParserInputBuffer(input); 6239 return (NULL); 6240 } 6241 inputPush(ctxt, stream); 6242 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6243} 6244 6245#define bottom_HTMLparser 6246#include "elfgcchack.h" 6247#endif /* LIBXML_HTML_ENABLED */ 6248