1/* 2 * HTMLtree.c : implementation of access function for an HTML tree. 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 10#define IN_LIBXML 11#include "libxml.h" 12#ifdef LIBXML_HTML_ENABLED 13 14#include <string.h> /* for memset() only ! */ 15 16#ifdef HAVE_CTYPE_H 17#include <ctype.h> 18#endif 19#ifdef HAVE_STDLIB_H 20#include <stdlib.h> 21#endif 22 23#include <libxml/xmlmemory.h> 24#include <libxml/HTMLparser.h> 25#include <libxml/HTMLtree.h> 26#include <libxml/entities.h> 27#include <libxml/valid.h> 28#include <libxml/xmlerror.h> 29#include <libxml/parserInternals.h> 30#include <libxml/globals.h> 31#include <libxml/uri.h> 32 33/************************************************************************ 34 * * 35 * Getting/Setting encoding meta tags * 36 * * 37 ************************************************************************/ 38 39/** 40 * htmlGetMetaEncoding: 41 * @doc: the document 42 * 43 * Encoding definition lookup in the Meta tags 44 * 45 * Returns the current encoding as flagged in the HTML source 46 */ 47const xmlChar * 48htmlGetMetaEncoding(htmlDocPtr doc) { 49 htmlNodePtr cur; 50 const xmlChar *content; 51 const xmlChar *encoding; 52 53 if (doc == NULL) 54 return(NULL); 55 cur = doc->children; 56 57 /* 58 * Search the html 59 */ 60 while (cur != NULL) { 61 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 62 if (xmlStrEqual(cur->name, BAD_CAST"html")) 63 break; 64 if (xmlStrEqual(cur->name, BAD_CAST"head")) 65 goto found_head; 66 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 67 goto found_meta; 68 } 69 cur = cur->next; 70 } 71 if (cur == NULL) 72 return(NULL); 73 cur = cur->children; 74 75 /* 76 * Search the head 77 */ 78 while (cur != NULL) { 79 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 80 if (xmlStrEqual(cur->name, BAD_CAST"head")) 81 break; 82 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 83 goto found_meta; 84 } 85 cur = cur->next; 86 } 87 if (cur == NULL) 88 
return(NULL); 89found_head: 90 cur = cur->children; 91 92 /* 93 * Search the meta elements 94 */ 95found_meta: 96 while (cur != NULL) { 97 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 98 if (xmlStrEqual(cur->name, BAD_CAST"meta")) { 99 xmlAttrPtr attr = cur->properties; 100 int http; 101 const xmlChar *value; 102 103 content = NULL; 104 http = 0; 105 while (attr != NULL) { 106 if ((attr->children != NULL) && 107 (attr->children->type == XML_TEXT_NODE) && 108 (attr->children->next == NULL)) { 109 value = attr->children->content; 110 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 111 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 112 http = 1; 113 else if ((value != NULL) 114 && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 115 content = value; 116 if ((http != 0) && (content != NULL)) 117 goto found_content; 118 } 119 attr = attr->next; 120 } 121 } 122 } 123 cur = cur->next; 124 } 125 return(NULL); 126 127found_content: 128 encoding = xmlStrstr(content, BAD_CAST"charset="); 129 if (encoding == NULL) 130 encoding = xmlStrstr(content, BAD_CAST"Charset="); 131 if (encoding == NULL) 132 encoding = xmlStrstr(content, BAD_CAST"CHARSET="); 133 if (encoding != NULL) { 134 encoding += 8; 135 } else { 136 encoding = xmlStrstr(content, BAD_CAST"charset ="); 137 if (encoding == NULL) 138 encoding = xmlStrstr(content, BAD_CAST"Charset ="); 139 if (encoding == NULL) 140 encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); 141 if (encoding != NULL) 142 encoding += 9; 143 } 144 if (encoding != NULL) { 145 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 146 } 147 return(encoding); 148} 149 150/** 151 * htmlSetMetaEncoding: 152 * @doc: the document 153 * @encoding: the encoding string 154 * 155 * Sets the current encoding in the Meta tags 156 * NOTE: this will not change the document content encoding, just 157 * the META flag associated. 
158 * 159 * Returns 0 in case of success and -1 in case of error 160 */ 161int 162htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { 163 htmlNodePtr cur, meta; 164 const xmlChar *content; 165 char newcontent[100]; 166 167 168 if (doc == NULL) 169 return(-1); 170 171 if (encoding != NULL) { 172 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", 173 (char *)encoding); 174 newcontent[sizeof(newcontent) - 1] = 0; 175 } 176 177 cur = doc->children; 178 179 /* 180 * Search the html 181 */ 182 while (cur != NULL) { 183 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 184 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) 185 break; 186 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 187 goto found_head; 188 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 189 goto found_meta; 190 } 191 cur = cur->next; 192 } 193 if (cur == NULL) 194 return(-1); 195 cur = cur->children; 196 197 /* 198 * Search the head 199 */ 200 while (cur != NULL) { 201 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 202 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 203 break; 204 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 205 goto found_meta; 206 } 207 cur = cur->next; 208 } 209 if (cur == NULL) 210 return(-1); 211found_head: 212 if (cur->children == NULL) { 213 if (encoding == NULL) 214 return(0); 215 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 216 xmlAddChild(cur, meta); 217 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 218 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 219 return(0); 220 } 221 cur = cur->children; 222 223found_meta: 224 if (encoding != NULL) { 225 /* 226 * Create a new Meta element with the right attributes 227 */ 228 229 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 230 xmlAddPrevSibling(cur, meta); 231 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 232 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 233 } 234 235 /* 236 * Search and 
destroy all the remaining the meta elements carrying 237 * encoding informations 238 */ 239 while (cur != NULL) { 240 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 241 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 242 xmlAttrPtr attr = cur->properties; 243 int http; 244 const xmlChar *value; 245 246 content = NULL; 247 http = 0; 248 while (attr != NULL) { 249 if ((attr->children != NULL) && 250 (attr->children->type == XML_TEXT_NODE) && 251 (attr->children->next == NULL)) { 252 value = attr->children->content; 253 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 254 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 255 http = 1; 256 else 257 { 258 if ((value != NULL) && 259 (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 260 content = value; 261 } 262 if ((http != 0) && (content != NULL)) 263 break; 264 } 265 attr = attr->next; 266 } 267 if ((http != 0) && (content != NULL)) { 268 meta = cur; 269 cur = cur->next; 270 xmlUnlinkNode(meta); 271 xmlFreeNode(meta); 272 continue; 273 } 274 275 } 276 } 277 cur = cur->next; 278 } 279 return(0); 280} 281 282/** 283 * booleanHTMLAttrs: 284 * 285 * These are the HTML attributes which will be output 286 * in minimized form, i.e. <option selected="selected"> will be 287 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" 288 * 289 */ 290static const char* htmlBooleanAttrs[] = { 291 "checked", "compact", "declare", "defer", "disabled", "ismap", 292 "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", 293 "selected", NULL 294}; 295 296 297/** 298 * htmlIsBooleanAttr: 299 * @name: the name of the attribute to check 300 * 301 * Determine if a given attribute is a boolean attribute. 302 * 303 * returns: false if the attribute is not boolean, true otherwise. 
304 */ 305int 306htmlIsBooleanAttr(const xmlChar *name) 307{ 308 int i = 0; 309 310 while (htmlBooleanAttrs[i] != NULL) { 311 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) 312 return 1; 313 i++; 314 } 315 return 0; 316} 317 318#ifdef LIBXML_OUTPUT_ENABLED 319/************************************************************************ 320 * * 321 * Output error handlers * 322 * * 323 ************************************************************************/ 324/** 325 * htmlSaveErrMemory: 326 * @extra: extra informations 327 * 328 * Handle an out of memory condition 329 */ 330static void 331htmlSaveErrMemory(const char *extra) 332{ 333 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); 334} 335 336/** 337 * htmlSaveErr: 338 * @code: the error number 339 * @node: the location of the error. 340 * @extra: extra informations 341 * 342 * Handle an out of memory condition 343 */ 344static void 345htmlSaveErr(int code, xmlNodePtr node, const char *extra) 346{ 347 const char *msg = NULL; 348 349 switch(code) { 350 case XML_SAVE_NOT_UTF8: 351 msg = "string is not in UTF-8"; 352 break; 353 case XML_SAVE_CHAR_INVALID: 354 msg = "invalid character value"; 355 break; 356 case XML_SAVE_UNKNOWN_ENCODING: 357 msg = "unknown encoding %s"; 358 break; 359 case XML_SAVE_NO_DOCTYPE: 360 msg = "HTML has no DOCTYPE"; 361 break; 362 default: 363 msg = "unexpected error number"; 364 } 365 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); 366} 367 368/************************************************************************ 369 * * 370 * Dumping HTML tree content to a simple buffer * 371 * * 372 ************************************************************************/ 373 374static int 375htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, 376 int format); 377 378/** 379 * htmlNodeDumpFormat: 380 * @buf: the HTML buffer output 381 * @doc: the document 382 * @cur: the current node 383 * @format: should formatting spaces been 
added 384 * 385 * Dump an HTML node, recursive behaviour,children are printed too. 386 * 387 * Returns the number of byte written or -1 in case of error 388 */ 389static int 390htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, 391 int format) { 392 unsigned int use; 393 int ret; 394 xmlOutputBufferPtr outbuf; 395 396 if (cur == NULL) { 397 return (-1); 398 } 399 if (buf == NULL) { 400 return (-1); 401 } 402 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); 403 if (outbuf == NULL) { 404 htmlSaveErrMemory("allocating HTML output buffer"); 405 return (-1); 406 } 407 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); 408 outbuf->buffer = buf; 409 outbuf->encoder = NULL; 410 outbuf->writecallback = NULL; 411 outbuf->closecallback = NULL; 412 outbuf->context = NULL; 413 outbuf->written = 0; 414 415 use = buf->use; 416 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); 417 xmlFree(outbuf); 418 ret = buf->use - use; 419 return (ret); 420} 421 422/** 423 * htmlNodeDump: 424 * @buf: the HTML buffer output 425 * @doc: the document 426 * @cur: the current node 427 * 428 * Dump an HTML node, recursive behaviour,children are printed too, 429 * and formatting returns are added. 430 * 431 * Returns the number of byte written or -1 in case of error 432 */ 433int 434htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { 435 xmlInitParser(); 436 437 return(htmlNodeDumpFormat(buf, doc, cur, 1)); 438} 439 440/** 441 * htmlNodeDumpFileFormat: 442 * @out: the FILE pointer 443 * @doc: the document 444 * @cur: the current node 445 * @encoding: the document encoding 446 * @format: should formatting spaces been added 447 * 448 * Dump an HTML node, recursive behaviour,children are printed too. 449 * 450 * TODO: if encoding == NULL try to save in the doc encoding 451 * 452 * returns: the number of byte written or -1 in case of failure. 
453 */ 454int 455htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, 456 xmlNodePtr cur, const char *encoding, int format) { 457 xmlOutputBufferPtr buf; 458 xmlCharEncodingHandlerPtr handler = NULL; 459 int ret; 460 461 xmlInitParser(); 462 463 if (encoding != NULL) { 464 xmlCharEncoding enc; 465 466 enc = xmlParseCharEncoding(encoding); 467 if (enc != XML_CHAR_ENCODING_UTF8) { 468 handler = xmlFindCharEncodingHandler(encoding); 469 if (handler == NULL) 470 return(-1); 471 } 472 } 473 474 /* 475 * Fallback to HTML or ASCII when the encoding is unspecified 476 */ 477 if (handler == NULL) 478 handler = xmlFindCharEncodingHandler("HTML"); 479 if (handler == NULL) 480 handler = xmlFindCharEncodingHandler("ascii"); 481 482 /* 483 * save the content to a temp buffer. 484 */ 485 buf = xmlOutputBufferCreateFile(out, handler); 486 if (buf == NULL) return(0); 487 488 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 489 490 ret = xmlOutputBufferClose(buf); 491 return(ret); 492} 493 494/** 495 * htmlNodeDumpFile: 496 * @out: the FILE pointer 497 * @doc: the document 498 * @cur: the current node 499 * 500 * Dump an HTML node, recursive behaviour,children are printed too, 501 * and formatting returns are added. 502 */ 503void 504htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { 505 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); 506} 507 508/** 509 * htmlDocDumpMemoryFormat: 510 * @cur: the document 511 * @mem: OUT: the memory pointer 512 * @size: OUT: the memory length 513 * @format: should formatting spaces been added 514 * 515 * Dump an HTML document in memory and return the xmlChar * and it's size. 516 * It's up to the caller to free the memory. 
517 */ 518void 519htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { 520 xmlOutputBufferPtr buf; 521 xmlCharEncodingHandlerPtr handler = NULL; 522 const char *encoding; 523 524 xmlInitParser(); 525 526 if ((mem == NULL) || (size == NULL)) 527 return; 528 if (cur == NULL) { 529 *mem = NULL; 530 *size = 0; 531 return; 532 } 533 534 encoding = (const char *) htmlGetMetaEncoding(cur); 535 536 if (encoding != NULL) { 537 xmlCharEncoding enc; 538 539 enc = xmlParseCharEncoding(encoding); 540 if (enc != cur->charset) { 541 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 542 /* 543 * Not supported yet 544 */ 545 *mem = NULL; 546 *size = 0; 547 return; 548 } 549 550 handler = xmlFindCharEncodingHandler(encoding); 551 if (handler == NULL) { 552 *mem = NULL; 553 *size = 0; 554 return; 555 } 556 } else { 557 handler = xmlFindCharEncodingHandler(encoding); 558 } 559 } 560 561 /* 562 * Fallback to HTML or ASCII when the encoding is unspecified 563 */ 564 if (handler == NULL) 565 handler = xmlFindCharEncodingHandler("HTML"); 566 if (handler == NULL) 567 handler = xmlFindCharEncodingHandler("ascii"); 568 569 buf = xmlAllocOutputBuffer(handler); 570 if (buf == NULL) { 571 *mem = NULL; 572 *size = 0; 573 return; 574 } 575 576 htmlDocContentDumpFormatOutput(buf, cur, NULL, format); 577 578 xmlOutputBufferFlush(buf); 579 if (buf->conv != NULL) { 580 *size = buf->conv->use; 581 *mem = xmlStrndup(buf->conv->content, *size); 582 } else { 583 *size = buf->buffer->use; 584 *mem = xmlStrndup(buf->buffer->content, *size); 585 } 586 (void)xmlOutputBufferClose(buf); 587} 588 589/** 590 * htmlDocDumpMemory: 591 * @cur: the document 592 * @mem: OUT: the memory pointer 593 * @size: OUT: the memory length 594 * 595 * Dump an HTML document in memory and return the xmlChar * and it's size. 596 * It's up to the caller to free the memory. 
597 */ 598void 599htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { 600 htmlDocDumpMemoryFormat(cur, mem, size, 1); 601} 602 603 604/************************************************************************ 605 * * 606 * Dumping HTML tree content to an I/O output buffer * 607 * * 608 ************************************************************************/ 609 610void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); 611 612/** 613 * htmlDtdDumpOutput: 614 * @buf: the HTML buffer output 615 * @doc: the document 616 * @encoding: the encoding string 617 * 618 * TODO: check whether encoding is needed 619 * 620 * Dump the HTML document DTD, if any. 621 */ 622static void 623htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 624 const char *encoding ATTRIBUTE_UNUSED) { 625 xmlDtdPtr cur = doc->intSubset; 626 627 if (cur == NULL) { 628 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); 629 return; 630 } 631 xmlOutputBufferWriteString(buf, "<!DOCTYPE "); 632 xmlOutputBufferWriteString(buf, (const char *)cur->name); 633 if (cur->ExternalID != NULL) { 634 xmlOutputBufferWriteString(buf, " PUBLIC "); 635 xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID); 636 if (cur->SystemID != NULL) { 637 xmlOutputBufferWriteString(buf, " "); 638 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); 639 } 640 } else if (cur->SystemID != NULL) { 641 xmlOutputBufferWriteString(buf, " SYSTEM "); 642 xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); 643 } 644 xmlOutputBufferWriteString(buf, ">\n"); 645} 646 647/** 648 * htmlAttrDumpOutput: 649 * @buf: the HTML buffer output 650 * @doc: the document 651 * @cur: the attribute pointer 652 * @encoding: the encoding string 653 * 654 * Dump an HTML attribute 655 */ 656static void 657htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, 658 const char *encoding ATTRIBUTE_UNUSED) { 659 xmlChar *value; 660 661 /* 662 * TODO: The html output method should not escape a & character 663 * 
occurring in an attribute value immediately followed by 664 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). 665 */ 666 667 if (cur == NULL) { 668 return; 669 } 670 xmlOutputBufferWriteString(buf, " "); 671 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 672 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 673 xmlOutputBufferWriteString(buf, ":"); 674 } 675 xmlOutputBufferWriteString(buf, (const char *)cur->name); 676 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { 677 value = xmlNodeListGetString(doc, cur->children, 0); 678 if (value) { 679 xmlOutputBufferWriteString(buf, "="); 680 if ((cur->ns == NULL) && (cur->parent != NULL) && 681 (cur->parent->ns == NULL) && 682 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || 683 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || 684 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || 685 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && 686 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { 687 xmlChar *escaped; 688 xmlChar *tmp = value; 689 690 while (IS_BLANK_CH(*tmp)) tmp++; 691 692 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+"); 693 if (escaped != NULL) { 694 xmlBufferWriteQuotedString(buf->buffer, escaped); 695 xmlFree(escaped); 696 } else { 697 xmlBufferWriteQuotedString(buf->buffer, value); 698 } 699 } else { 700 xmlBufferWriteQuotedString(buf->buffer, value); 701 } 702 xmlFree(value); 703 } else { 704 xmlOutputBufferWriteString(buf, "=\"\""); 705 } 706 } 707} 708 709/** 710 * htmlAttrListDumpOutput: 711 * @buf: the HTML buffer output 712 * @doc: the document 713 * @cur: the first attribute pointer 714 * @encoding: the encoding string 715 * 716 * Dump a list of HTML attributes 717 */ 718static void 719htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) { 720 if (cur == NULL) { 721 return; 722 } 723 while (cur != NULL) { 724 htmlAttrDumpOutput(buf, doc, cur, encoding); 725 cur = cur->next; 726 } 727} 728 729 

/**
 * htmlNodeListDumpOutput:
 * @buf:  the HTML buffer output
 * @doc:  the document
 * @cur:  the first node
 * @encoding:  the encoding string
 * @format:  should formatting spaces been added
 *
 * Dump an HTML node list, recursive behaviour,children are printed too.
 * Each node reached through the next links from @cur is handed to
 * htmlNodeDumpFormatOutput(); a NULL @cur is a no-op.
 */
static void
htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
                       xmlNodePtr cur, const char *encoding, int format) {
    if (cur == NULL) {
        return;
    }
    while (cur != NULL) {
        htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
        cur = cur->next;
    }
}

/**
 * htmlNodeDumpFormatOutput:
 * @buf:  the HTML buffer output
 * @doc:  the document
 * @cur:  the current node
 * @encoding:  the encoding string
 * @format:  should formatting spaces been added
 *
 * Dump an HTML node, recursive behaviour,children are printed too.
 *
 * Nothing is written when @cur or @buf is NULL.  Non-element nodes
 * (DTD, document, text, comment, PI, entity reference, preserved
 * content) are handled first; elements are then serialized as start
 * tag, attributes, content and end tag.  When @format is set, newlines
 * are added around elements the HTML element table does not flag as
 * inline.
 */
void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
                         xmlNodePtr cur, const char *encoding, int format) {
    const htmlElemDesc * info;

    xmlInitParser();

    if ((cur == NULL) || (buf == NULL)) {
        return;
    }
    /*
     * Special cases.
     */
    /* DTD nodes produce no output here. */
    if (cur->type == XML_DTD_NODE)
        return;
    /*
     * A document node is dumped as a whole document; note that
     * @format is not forwarded on this path.
     */
    if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
        (cur->type == XML_DOCUMENT_NODE)){
        htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
        return;
    }
    if (cur->type == HTML_TEXT_NODE) {
        if (cur->content != NULL) {
            /*
             * Text is entity-escaped unless it is flagged "no encoding"
             * or it lives directly inside a script or style element,
             * in which cases it is written verbatim.
             */
            if (((cur->name == (const xmlChar *)xmlStringText) ||
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
                ((cur->parent == NULL) ||
                 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
                  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
                xmlChar *buffer;

                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
                if (buffer != NULL) {
                    xmlOutputBufferWriteString(buf, (const char *)buffer);
                    xmlFree(buffer);
                }
            } else {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
        }
        return;
    }
    if (cur->type == HTML_COMMENT_NODE) {
        /* A comment without content is dropped entirely. */
        if (cur->content != NULL) {
            xmlOutputBufferWriteString(buf, "<!--");
            xmlOutputBufferWriteString(buf, (const char *)cur->content);
            xmlOutputBufferWriteString(buf, "-->");
        }
        return;
    }
    if (cur->type == HTML_PI_NODE) {
        if (cur->name == NULL)
            return;
        /* The PI is closed with a plain ">", not with "?>". */
        xmlOutputBufferWriteString(buf, "<?");
        xmlOutputBufferWriteString(buf, (const char *)cur->name);
        if (cur->content != NULL) {
            xmlOutputBufferWriteString(buf, " ");
            xmlOutputBufferWriteString(buf, (const char *)cur->content);
        }
        xmlOutputBufferWriteString(buf, ">");
        return;
    }
    if (cur->type == HTML_ENTITY_REF_NODE) {
        xmlOutputBufferWriteString(buf, "&");
        xmlOutputBufferWriteString(buf, (const char *)cur->name);
        xmlOutputBufferWriteString(buf, ";");
        return;
    }
    if (cur->type == HTML_PRESERVE_NODE) {
        /* Preserved content is copied without any escaping. */
        if (cur->content != NULL) {
            xmlOutputBufferWriteString(buf, (const char *)cur->content);
        }
        return;
    }

    /*
     * Get specific HTML info for that node.
     * Namespaced elements are never looked up in the HTML table.
     */
    if (cur->ns == NULL)
        info = htmlTagLookup(cur->name);
    else
        info = NULL;

    /* Start tag: optional prefix, name, namespace decls, attributes. */
    xmlOutputBufferWriteString(buf, "<");
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
        xmlOutputBufferWriteString(buf, ":");
    }
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
    if (cur->nsDef)
        xmlNsListDumpOutput(buf, cur->nsDef);
    if (cur->properties != NULL)
        htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);

    /*
     * Elements flagged empty in the HTML table: no content and no end
     * tag are written.
     */
    if ((info != NULL) && (info->empty)) {
        xmlOutputBufferWriteString(buf, ">");
        if ((format) && (!info->isinline) && (cur->next != NULL)) {
            if ((cur->next->type != HTML_TEXT_NODE) &&
                (cur->next->type != HTML_ENTITY_REF_NODE) &&
                (cur->parent != NULL) &&
                (cur->parent->name != NULL) &&
                (cur->parent->name[0] != 'p')) /* p, pre, param */
                xmlOutputBufferWriteString(buf, "\n");
        }
        return;
    }
    /*
     * Element without any child: the end tag is left out when the
     * table entry has saveEndTag set (html and body excepted),
     * otherwise it is written right after the start tag.
     */
    if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
        (cur->children == NULL)) {
        if ((info != NULL) && (info->saveEndTag != 0) &&
            (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
            (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
            xmlOutputBufferWriteString(buf, ">");
        } else {
            xmlOutputBufferWriteString(buf, "></");
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                xmlOutputBufferWriteString(buf, ":");
            }
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            xmlOutputBufferWriteString(buf, ">");
        }
        if ((format) && (cur->next != NULL) &&
            (info != NULL) && (!info->isinline)) {
            if ((cur->next->type != HTML_TEXT_NODE) &&
                (cur->next->type != HTML_ENTITY_REF_NODE) &&
                (cur->parent != NULL) &&
                (cur->parent->name != NULL) &&
                (cur->parent->name[0] != 'p')) /* p, pre, param */
                xmlOutputBufferWriteString(buf, "\n");
        }
        return;
    }
    /* General case: start tag, content and/or children, end tag. */
    xmlOutputBufferWriteString(buf, ">");
    if ((cur->type != XML_ELEMENT_NODE) &&
        (cur->content != NULL)) {
        /*
         * Uses the OutputBuffer property to automatically convert
         * invalids to charrefs
         */

        xmlOutputBufferWriteString(buf, (const char *) cur->content);
    }
    if (cur->children != NULL) {
        /*
         * A newline is added after the start tag, and again before
         * the end tag, only for a non-inline element with more than
         * one child whose name does not start with 'p' and whose
         * first (resp. last) child is neither text nor an entity
         * reference.
         */
        if ((format) && (info != NULL) && (!info->isinline) &&
            (cur->children->type != HTML_TEXT_NODE) &&
            (cur->children->type != HTML_ENTITY_REF_NODE) &&
            (cur->children != cur->last) &&
            (cur->name != NULL) &&
            (cur->name[0] != 'p')) /* p, pre, param */
            xmlOutputBufferWriteString(buf, "\n");
        htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
        if ((format) && (info != NULL) && (!info->isinline) &&
            (cur->last->type != HTML_TEXT_NODE) &&
            (cur->last->type != HTML_ENTITY_REF_NODE) &&
            (cur->children != cur->last) &&
            (cur->name != NULL) &&
            (cur->name[0] != 'p')) /* p, pre, param */
            xmlOutputBufferWriteString(buf, "\n");
    }
    xmlOutputBufferWriteString(buf, "</");
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
        xmlOutputBufferWriteString(buf, ":");
    }
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
    xmlOutputBufferWriteString(buf, ">");
    /* Newline after the end tag, decided from the next sibling and parent. */
    if ((format) && (info != NULL) && (!info->isinline) &&
        (cur->next != NULL)) {
        if ((cur->next->type != HTML_TEXT_NODE) &&
            (cur->next->type != HTML_ENTITY_REF_NODE) &&
            (cur->parent != NULL) &&
            (cur->parent->name != NULL) &&
            (cur->parent->name[0] != 'p')) /* p, pre, param */
            xmlOutputBufferWriteString(buf, "\n");
    }
}
947 */ 948void 949htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 950 xmlNodePtr cur, const char *encoding) { 951 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1); 952} 953 954/** 955 * htmlDocContentDumpFormatOutput: 956 * @buf: the HTML buffer output 957 * @cur: the document 958 * @encoding: the encoding string 959 * @format: should formatting spaces been added 960 * 961 * Dump an HTML document. 962 */ 963void 964htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 965 const char *encoding, int format) { 966 int type; 967 968 xmlInitParser(); 969 970 if ((buf == NULL) || (cur == NULL)) 971 return; 972 973 /* 974 * force to output the stuff as HTML, especially for entities 975 */ 976 type = cur->type; 977 cur->type = XML_HTML_DOCUMENT_NODE; 978 if (cur->intSubset != NULL) { 979 htmlDtdDumpOutput(buf, cur, NULL); 980 } 981 if (cur->children != NULL) { 982 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format); 983 } 984 xmlOutputBufferWriteString(buf, "\n"); 985 cur->type = (xmlElementType) type; 986} 987 988/** 989 * htmlDocContentDumpOutput: 990 * @buf: the HTML buffer output 991 * @cur: the document 992 * @encoding: the encoding string 993 * 994 * Dump an HTML document. Formating return/spaces are added. 995 */ 996void 997htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 998 const char *encoding) { 999 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1); 1000} 1001 1002/************************************************************************ 1003 * * 1004 * Saving functions front-ends * 1005 * * 1006 ************************************************************************/ 1007 1008/** 1009 * htmlDocDump: 1010 * @f: the FILE* 1011 * @cur: the document 1012 * 1013 * Dump an HTML document to an open FILE. 1014 * 1015 * returns: the number of byte written or -1 in case of failure. 
1016 */ 1017int 1018htmlDocDump(FILE *f, xmlDocPtr cur) { 1019 xmlOutputBufferPtr buf; 1020 xmlCharEncodingHandlerPtr handler = NULL; 1021 const char *encoding; 1022 int ret; 1023 1024 xmlInitParser(); 1025 1026 if ((cur == NULL) || (f == NULL)) { 1027 return(-1); 1028 } 1029 1030 encoding = (const char *) htmlGetMetaEncoding(cur); 1031 1032 if (encoding != NULL) { 1033 xmlCharEncoding enc; 1034 1035 enc = xmlParseCharEncoding(encoding); 1036 if (enc != cur->charset) { 1037 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1038 /* 1039 * Not supported yet 1040 */ 1041 return(-1); 1042 } 1043 1044 handler = xmlFindCharEncodingHandler(encoding); 1045 if (handler == NULL) 1046 return(-1); 1047 } else { 1048 handler = xmlFindCharEncodingHandler(encoding); 1049 } 1050 } 1051 1052 /* 1053 * Fallback to HTML or ASCII when the encoding is unspecified 1054 */ 1055 if (handler == NULL) 1056 handler = xmlFindCharEncodingHandler("HTML"); 1057 if (handler == NULL) 1058 handler = xmlFindCharEncodingHandler("ascii"); 1059 1060 buf = xmlOutputBufferCreateFile(f, handler); 1061 if (buf == NULL) return(-1); 1062 htmlDocContentDumpOutput(buf, cur, NULL); 1063 1064 ret = xmlOutputBufferClose(buf); 1065 return(ret); 1066} 1067 1068/** 1069 * htmlSaveFile: 1070 * @filename: the filename (or URL) 1071 * @cur: the document 1072 * 1073 * Dump an HTML document to a file. If @filename is "-" the stdout file is 1074 * used. 1075 * returns: the number of byte written or -1 in case of failure. 
1076 */ 1077int 1078htmlSaveFile(const char *filename, xmlDocPtr cur) { 1079 xmlOutputBufferPtr buf; 1080 xmlCharEncodingHandlerPtr handler = NULL; 1081 const char *encoding; 1082 int ret; 1083 1084 if ((cur == NULL) || (filename == NULL)) 1085 return(-1); 1086 1087 xmlInitParser(); 1088 1089 encoding = (const char *) htmlGetMetaEncoding(cur); 1090 1091 if (encoding != NULL) { 1092 xmlCharEncoding enc; 1093 1094 enc = xmlParseCharEncoding(encoding); 1095 if (enc != cur->charset) { 1096 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1097 /* 1098 * Not supported yet 1099 */ 1100 return(-1); 1101 } 1102 1103 handler = xmlFindCharEncodingHandler(encoding); 1104 if (handler == NULL) 1105 return(-1); 1106 } 1107 } 1108 1109 /* 1110 * Fallback to HTML or ASCII when the encoding is unspecified 1111 */ 1112 if (handler == NULL) 1113 handler = xmlFindCharEncodingHandler("HTML"); 1114 if (handler == NULL) 1115 handler = xmlFindCharEncodingHandler("ascii"); 1116 1117 /* 1118 * save the content to a temp buffer. 1119 */ 1120 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); 1121 if (buf == NULL) return(0); 1122 1123 htmlDocContentDumpOutput(buf, cur, NULL); 1124 1125 ret = xmlOutputBufferClose(buf); 1126 return(ret); 1127} 1128 1129/** 1130 * htmlSaveFileFormat: 1131 * @filename: the filename 1132 * @cur: the document 1133 * @format: should formatting spaces been added 1134 * @encoding: the document encoding 1135 * 1136 * Dump an HTML document to a file using a given encoding. 1137 * 1138 * returns: the number of byte written or -1 in case of failure. 
1139 */ 1140int 1141htmlSaveFileFormat(const char *filename, xmlDocPtr cur, 1142 const char *encoding, int format) { 1143 xmlOutputBufferPtr buf; 1144 xmlCharEncodingHandlerPtr handler = NULL; 1145 int ret; 1146 1147 if ((cur == NULL) || (filename == NULL)) 1148 return(-1); 1149 1150 xmlInitParser(); 1151 1152 if (encoding != NULL) { 1153 xmlCharEncoding enc; 1154 1155 enc = xmlParseCharEncoding(encoding); 1156 if (enc != cur->charset) { 1157 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1158 /* 1159 * Not supported yet 1160 */ 1161 return(-1); 1162 } 1163 1164 handler = xmlFindCharEncodingHandler(encoding); 1165 if (handler == NULL) 1166 return(-1); 1167 htmlSetMetaEncoding(cur, (const xmlChar *) encoding); 1168 } 1169 } else { 1170 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); 1171 } 1172 1173 /* 1174 * Fallback to HTML or ASCII when the encoding is unspecified 1175 */ 1176 if (handler == NULL) 1177 handler = xmlFindCharEncodingHandler("HTML"); 1178 if (handler == NULL) 1179 handler = xmlFindCharEncodingHandler("ascii"); 1180 1181 /* 1182 * save the content to a temp buffer. 1183 */ 1184 buf = xmlOutputBufferCreateFilename(filename, handler, 0); 1185 if (buf == NULL) return(0); 1186 1187 htmlDocContentDumpFormatOutput(buf, cur, encoding, format); 1188 1189 ret = xmlOutputBufferClose(buf); 1190 return(ret); 1191} 1192 1193/** 1194 * htmlSaveFileEnc: 1195 * @filename: the filename 1196 * @cur: the document 1197 * @encoding: the document encoding 1198 * 1199 * Dump an HTML document to a file using a given encoding 1200 * and formatting returns/spaces are added. 1201 * 1202 * returns: the number of byte written or -1 in case of failure. 1203 */ 1204int 1205htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { 1206 return(htmlSaveFileFormat(filename, cur, encoding, 1)); 1207} 1208 1209#endif /* LIBXML_OUTPUT_ENABLED */ 1210 1211#define bottom_HTMLtree 1212#include "elfgcchack.h" 1213#endif /* LIBXML_HTML_ENABLED */ 1214