1/*
2 * HTMLtree.c : implementation of access function for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9
10#define IN_LIBXML
11#include "libxml.h"
12#ifdef LIBXML_HTML_ENABLED
13
14#include <string.h> /* for memset() only ! */
15
16#ifdef HAVE_CTYPE_H
17#include <ctype.h>
18#endif
19#ifdef HAVE_STDLIB_H
20#include <stdlib.h>
21#endif
22
23#include <libxml/xmlmemory.h>
24#include <libxml/HTMLparser.h>
25#include <libxml/HTMLtree.h>
26#include <libxml/entities.h>
27#include <libxml/valid.h>
28#include <libxml/xmlerror.h>
29#include <libxml/parserInternals.h>
30#include <libxml/globals.h>
31#include <libxml/uri.h>
32
33/************************************************************************
34 *									*
35 *   		Getting/Setting encoding meta tags			*
36 *									*
37 ************************************************************************/
38
39/**
40 * htmlGetMetaEncoding:
41 * @doc:  the document
42 *
43 * Encoding definition lookup in the Meta tags
44 *
45 * Returns the current encoding as flagged in the HTML source
46 */
47const xmlChar *
48htmlGetMetaEncoding(htmlDocPtr doc) {
49    htmlNodePtr cur;
50    const xmlChar *content;
51    const xmlChar *encoding;
52
53    if (doc == NULL)
54	return(NULL);
55    cur = doc->children;
56
57    /*
58     * Search the html
59     */
60    while (cur != NULL) {
61	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
63		break;
64	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
65		goto found_head;
66	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67		goto found_meta;
68	}
69	cur = cur->next;
70    }
71    if (cur == NULL)
72	return(NULL);
73    cur = cur->children;
74
75    /*
76     * Search the head
77     */
78    while (cur != NULL) {
79	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
81		break;
82	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83		goto found_meta;
84	}
85	cur = cur->next;
86    }
87    if (cur == NULL)
88	return(NULL);
89found_head:
90    cur = cur->children;
91
92    /*
93     * Search the meta elements
94     */
95found_meta:
96    while (cur != NULL) {
97	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99		xmlAttrPtr attr = cur->properties;
100		int http;
101		const xmlChar *value;
102
103		content = NULL;
104		http = 0;
105		while (attr != NULL) {
106		    if ((attr->children != NULL) &&
107		        (attr->children->type == XML_TEXT_NODE) &&
108		        (attr->children->next == NULL)) {
109			value = attr->children->content;
110			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112			    http = 1;
113			else if ((value != NULL)
114			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115			    content = value;
116			if ((http != 0) && (content != NULL))
117			    goto found_content;
118		    }
119		    attr = attr->next;
120		}
121	    }
122	}
123	cur = cur->next;
124    }
125    return(NULL);
126
127found_content:
128    encoding = xmlStrstr(content, BAD_CAST"charset=");
129    if (encoding == NULL)
130	encoding = xmlStrstr(content, BAD_CAST"Charset=");
131    if (encoding == NULL)
132	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133    if (encoding != NULL) {
134	encoding += 8;
135    } else {
136	encoding = xmlStrstr(content, BAD_CAST"charset =");
137	if (encoding == NULL)
138	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
139	if (encoding == NULL)
140	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141	if (encoding != NULL)
142	    encoding += 9;
143    }
144    if (encoding != NULL) {
145	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146    }
147    return(encoding);
148}
149
150/**
151 * htmlSetMetaEncoding:
152 * @doc:  the document
153 * @encoding:  the encoding string
154 *
155 * Sets the current encoding in the Meta tags
156 * NOTE: this will not change the document content encoding, just
157 * the META flag associated.
158 *
159 * Returns 0 in case of success and -1 in case of error
160 */
161int
162htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163    htmlNodePtr cur, meta;
164    const xmlChar *content;
165    char newcontent[100];
166
167
168    if (doc == NULL)
169	return(-1);
170
171    if (encoding != NULL) {
172	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
173                (char *)encoding);
174	newcontent[sizeof(newcontent) - 1] = 0;
175    }
176
177    cur = doc->children;
178
179    /*
180     * Search the html
181     */
182    while (cur != NULL) {
183	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
184	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
185		break;
186	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
187		goto found_head;
188	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
189		goto found_meta;
190	}
191	cur = cur->next;
192    }
193    if (cur == NULL)
194	return(-1);
195    cur = cur->children;
196
197    /*
198     * Search the head
199     */
200    while (cur != NULL) {
201	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
202	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
203		break;
204	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
205		goto found_meta;
206	}
207	cur = cur->next;
208    }
209    if (cur == NULL)
210	return(-1);
211found_head:
212    if (cur->children == NULL) {
213	if (encoding == NULL)
214	    return(0);
215	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
216	xmlAddChild(cur, meta);
217	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
218	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
219	return(0);
220    }
221    cur = cur->children;
222
223found_meta:
224    if (encoding != NULL) {
225	/*
226	 * Create a new Meta element with the right attributes
227	 */
228
229	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
230	xmlAddPrevSibling(cur, meta);
231	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
232	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
233    }
234
235    /*
236     * Search and destroy all the remaining the meta elements carrying
237     * encoding informations
238     */
239    while (cur != NULL) {
240	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
241	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
242		xmlAttrPtr attr = cur->properties;
243		int http;
244		const xmlChar *value;
245
246		content = NULL;
247		http = 0;
248		while (attr != NULL) {
249		    if ((attr->children != NULL) &&
250		        (attr->children->type == XML_TEXT_NODE) &&
251		        (attr->children->next == NULL)) {
252			value = attr->children->content;
253			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
254			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
255			    http = 1;
256			else
257                        {
258                           if ((value != NULL) &&
259				(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
260			      content = value;
261                        }
262		        if ((http != 0) && (content != NULL))
263			    break;
264		    }
265		    attr = attr->next;
266		}
267		if ((http != 0) && (content != NULL)) {
268		    meta = cur;
269		    cur = cur->next;
270		    xmlUnlinkNode(meta);
271                    xmlFreeNode(meta);
272		    continue;
273		}
274
275	    }
276	}
277	cur = cur->next;
278    }
279    return(0);
280}
281
/**
 * htmlBooleanAttrs:
 *
 * These are the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 * The table is terminated by a NULL entry and is searched by
 * htmlIsBooleanAttr().
 */
static const char* htmlBooleanAttrs[] = {
  "checked", "compact", "declare", "defer", "disabled", "ismap",
  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
  "selected", NULL
};
295
296
297/**
298 * htmlIsBooleanAttr:
299 * @name:  the name of the attribute to check
300 *
301 * Determine if a given attribute is a boolean attribute.
302 *
303 * returns: false if the attribute is not boolean, true otherwise.
304 */
305int
306htmlIsBooleanAttr(const xmlChar *name)
307{
308    int i = 0;
309
310    while (htmlBooleanAttrs[i] != NULL) {
311        if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
312            return 1;
313        i++;
314    }
315    return 0;
316}
317
318#ifdef LIBXML_OUTPUT_ENABLED
319/************************************************************************
320 *									*
321 * 			Output error handlers				*
322 *									*
323 ************************************************************************/
/**
 * htmlSaveErrMemory:
 * @extra:  extra information, forwarded unchanged to the error layer
 *
 * Handle an out of memory condition: reports XML_ERR_NO_MEMORY for the
 * XML_FROM_OUTPUT domain through __xmlSimpleError(), with no node and
 * no message string attached.
 */
static void
htmlSaveErrMemory(const char *extra)
{
    __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
}
335
336/**
337 * htmlSaveErr:
338 * @code:  the error number
339 * @node:  the location of the error.
340 * @extra:  extra informations
341 *
342 * Handle an out of memory condition
343 */
344static void
345htmlSaveErr(int code, xmlNodePtr node, const char *extra)
346{
347    const char *msg = NULL;
348
349    switch(code) {
350        case XML_SAVE_NOT_UTF8:
351	    msg = "string is not in UTF-8";
352	    break;
353	case XML_SAVE_CHAR_INVALID:
354	    msg = "invalid character value";
355	    break;
356	case XML_SAVE_UNKNOWN_ENCODING:
357	    msg = "unknown encoding %s";
358	    break;
359	case XML_SAVE_NO_DOCTYPE:
360	    msg = "HTML has no DOCTYPE";
361	    break;
362	default:
363	    msg = "unexpected error number";
364    }
365    __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
366}
367
368/************************************************************************
369 *									*
370 *   		Dumping HTML tree content to a simple buffer		*
371 *									*
372 ************************************************************************/
373
374static int
375htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
376	           int format);
377
378/**
379 * htmlNodeDumpFormat:
380 * @buf:  the HTML buffer output
381 * @doc:  the document
382 * @cur:  the current node
383 * @format:  should formatting spaces been added
384 *
385 * Dump an HTML node, recursive behaviour,children are printed too.
386 *
387 * Returns the number of byte written or -1 in case of error
388 */
389static int
390htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
391	           int format) {
392    unsigned int use;
393    int ret;
394    xmlOutputBufferPtr outbuf;
395
396    if (cur == NULL) {
397	return (-1);
398    }
399    if (buf == NULL) {
400	return (-1);
401    }
402    outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
403    if (outbuf == NULL) {
404        htmlSaveErrMemory("allocating HTML output buffer");
405	return (-1);
406    }
407    memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
408    outbuf->buffer = buf;
409    outbuf->encoder = NULL;
410    outbuf->writecallback = NULL;
411    outbuf->closecallback = NULL;
412    outbuf->context = NULL;
413    outbuf->written = 0;
414
415    use = buf->use;
416    htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
417    xmlFree(outbuf);
418    ret = buf->use - use;
419    return (ret);
420}
421
/**
 * htmlNodeDump:
 * @buf:  the HTML buffer output
 * @doc:  the document
 * @cur:  the current node
 *
 * Dump an HTML node, recursive behaviour, children are printed too,
 * and formatting returns are added.
 *
 * This is htmlNodeDumpFormat() called with a format value of 1, after
 * making sure the library is initialized with xmlInitParser().
 *
 * Returns the number of bytes written or -1 in case of error
 */
int
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
    xmlInitParser();

    return(htmlNodeDumpFormat(buf, doc, cur, 1));
}
439
440/**
441 * htmlNodeDumpFileFormat:
442 * @out:  the FILE pointer
443 * @doc:  the document
444 * @cur:  the current node
445 * @encoding: the document encoding
446 * @format:  should formatting spaces been added
447 *
448 * Dump an HTML node, recursive behaviour,children are printed too.
449 *
450 * TODO: if encoding == NULL try to save in the doc encoding
451 *
452 * returns: the number of byte written or -1 in case of failure.
453 */
454int
455htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
456	               xmlNodePtr cur, const char *encoding, int format) {
457    xmlOutputBufferPtr buf;
458    xmlCharEncodingHandlerPtr handler = NULL;
459    int ret;
460
461    xmlInitParser();
462
463    if (encoding != NULL) {
464	xmlCharEncoding enc;
465
466	enc = xmlParseCharEncoding(encoding);
467	if (enc != XML_CHAR_ENCODING_UTF8) {
468	    handler = xmlFindCharEncodingHandler(encoding);
469	    if (handler == NULL)
470		return(-1);
471	}
472    }
473
474    /*
475     * Fallback to HTML or ASCII when the encoding is unspecified
476     */
477    if (handler == NULL)
478	handler = xmlFindCharEncodingHandler("HTML");
479    if (handler == NULL)
480	handler = xmlFindCharEncodingHandler("ascii");
481
482    /*
483     * save the content to a temp buffer.
484     */
485    buf = xmlOutputBufferCreateFile(out, handler);
486    if (buf == NULL) return(0);
487
488    htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
489
490    ret = xmlOutputBufferClose(buf);
491    return(ret);
492}
493
/**
 * htmlNodeDumpFile:
 * @out:  the FILE pointer
 * @doc:  the document
 * @cur:  the current node
 *
 * Dump an HTML node, recursive behaviour, children are printed too,
 * and formatting returns are added.
 *
 * Calls htmlNodeDumpFileFormat() with no explicit encoding and a format
 * value of 1; the result of that call is not reported to the caller.
 */
void
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
    htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
}
507
508/**
509 * htmlDocDumpMemoryFormat:
510 * @cur:  the document
511 * @mem:  OUT: the memory pointer
512 * @size:  OUT: the memory length
513 * @format:  should formatting spaces been added
514 *
515 * Dump an HTML document in memory and return the xmlChar * and it's size.
516 * It's up to the caller to free the memory.
517 */
518void
519htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
520    xmlOutputBufferPtr buf;
521    xmlCharEncodingHandlerPtr handler = NULL;
522    const char *encoding;
523
524    xmlInitParser();
525
526    if ((mem == NULL) || (size == NULL))
527        return;
528    if (cur == NULL) {
529	*mem = NULL;
530	*size = 0;
531	return;
532    }
533
534    encoding = (const char *) htmlGetMetaEncoding(cur);
535
536    if (encoding != NULL) {
537	xmlCharEncoding enc;
538
539	enc = xmlParseCharEncoding(encoding);
540	if (enc != cur->charset) {
541	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
542		/*
543		 * Not supported yet
544		 */
545		*mem = NULL;
546		*size = 0;
547		return;
548	    }
549
550	    handler = xmlFindCharEncodingHandler(encoding);
551	    if (handler == NULL) {
552		*mem = NULL;
553		*size = 0;
554		return;
555	    }
556	} else {
557	    handler = xmlFindCharEncodingHandler(encoding);
558	}
559    }
560
561    /*
562     * Fallback to HTML or ASCII when the encoding is unspecified
563     */
564    if (handler == NULL)
565	handler = xmlFindCharEncodingHandler("HTML");
566    if (handler == NULL)
567	handler = xmlFindCharEncodingHandler("ascii");
568
569    buf = xmlAllocOutputBuffer(handler);
570    if (buf == NULL) {
571	*mem = NULL;
572	*size = 0;
573	return;
574    }
575
576	htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
577
578    xmlOutputBufferFlush(buf);
579    if (buf->conv != NULL) {
580	*size = buf->conv->use;
581	*mem = xmlStrndup(buf->conv->content, *size);
582    } else {
583	*size = buf->buffer->use;
584	*mem = xmlStrndup(buf->buffer->content, *size);
585    }
586    (void)xmlOutputBufferClose(buf);
587}
588
/**
 * htmlDocDumpMemory:
 * @cur:  the document
 * @mem:  OUT: the memory pointer
 * @size:  OUT: the memory length
 *
 * Dump an HTML document in memory and return the xmlChar * and its size.
 * It's up to the caller to free the memory.
 *
 * Equivalent to htmlDocDumpMemoryFormat() with a format value of 1.
 */
void
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
	htmlDocDumpMemoryFormat(cur, mem, size, 1);
}
602
603
604/************************************************************************
605 *									*
606 *   		Dumping HTML tree content to an I/O output buffer	*
607 *									*
608 ************************************************************************/
609
/*
 * Prototype only: xmlNsListDumpOutput() is not defined in this file. It is
 * called by htmlNodeDumpFormatOutput() to write an element's namespace
 * definitions.
 */
void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
611
612/**
613 * htmlDtdDumpOutput:
614 * @buf:  the HTML buffer output
615 * @doc:  the document
616 * @encoding:  the encoding string
617 *
618 * TODO: check whether encoding is needed
619 *
620 * Dump the HTML document DTD, if any.
621 */
622static void
623htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
624	          const char *encoding ATTRIBUTE_UNUSED) {
625    xmlDtdPtr cur = doc->intSubset;
626
627    if (cur == NULL) {
628	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
629	return;
630    }
631    xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
632    xmlOutputBufferWriteString(buf, (const char *)cur->name);
633    if (cur->ExternalID != NULL) {
634	xmlOutputBufferWriteString(buf, " PUBLIC ");
635	xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
636	if (cur->SystemID != NULL) {
637	    xmlOutputBufferWriteString(buf, " ");
638	    xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
639	}
640    }  else if (cur->SystemID != NULL) {
641	xmlOutputBufferWriteString(buf, " SYSTEM ");
642	xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
643    }
644    xmlOutputBufferWriteString(buf, ">\n");
645}
646
647/**
648 * htmlAttrDumpOutput:
649 * @buf:  the HTML buffer output
650 * @doc:  the document
651 * @cur:  the attribute pointer
652 * @encoding:  the encoding string
653 *
654 * Dump an HTML attribute
655 */
656static void
657htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
658	           const char *encoding ATTRIBUTE_UNUSED) {
659    xmlChar *value;
660
661    /*
662     * TODO: The html output method should not escape a & character
663     *       occurring in an attribute value immediately followed by
664     *       a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
665     */
666
667    if (cur == NULL) {
668	return;
669    }
670    xmlOutputBufferWriteString(buf, " ");
671    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
672        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
673	xmlOutputBufferWriteString(buf, ":");
674    }
675    xmlOutputBufferWriteString(buf, (const char *)cur->name);
676    if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
677	value = xmlNodeListGetString(doc, cur->children, 0);
678	if (value) {
679	    xmlOutputBufferWriteString(buf, "=");
680	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
681		(cur->parent->ns == NULL) &&
682		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
683	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
684		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
685		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
686		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
687		xmlChar *escaped;
688		xmlChar *tmp = value;
689
690		while (IS_BLANK_CH(*tmp)) tmp++;
691
692		escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
693		if (escaped != NULL) {
694		    xmlBufferWriteQuotedString(buf->buffer, escaped);
695		    xmlFree(escaped);
696		} else {
697		    xmlBufferWriteQuotedString(buf->buffer, value);
698		}
699	    } else {
700		xmlBufferWriteQuotedString(buf->buffer, value);
701	    }
702	    xmlFree(value);
703	} else  {
704	    xmlOutputBufferWriteString(buf, "=\"\"");
705	}
706    }
707}
708
709/**
710 * htmlAttrListDumpOutput:
711 * @buf:  the HTML buffer output
712 * @doc:  the document
713 * @cur:  the first attribute pointer
714 * @encoding:  the encoding string
715 *
716 * Dump a list of HTML attributes
717 */
718static void
719htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
720    if (cur == NULL) {
721	return;
722    }
723    while (cur != NULL) {
724        htmlAttrDumpOutput(buf, doc, cur, encoding);
725	cur = cur->next;
726    }
727}
728
729
730
731/**
732 * htmlNodeListDumpOutput:
733 * @buf:  the HTML buffer output
734 * @doc:  the document
735 * @cur:  the first node
736 * @encoding:  the encoding string
737 * @format:  should formatting spaces been added
738 *
739 * Dump an HTML node list, recursive behaviour,children are printed too.
740 */
741static void
742htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
743	               xmlNodePtr cur, const char *encoding, int format) {
744    if (cur == NULL) {
745	return;
746    }
747    while (cur != NULL) {
748        htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
749	cur = cur->next;
750    }
751}
752
753/**
754 * htmlNodeDumpFormatOutput:
755 * @buf:  the HTML buffer output
756 * @doc:  the document
757 * @cur:  the current node
758 * @encoding:  the encoding string
759 * @format:  should formatting spaces been added
760 *
761 * Dump an HTML node, recursive behaviour,children are printed too.
762 */
763void
764htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
765	                 xmlNodePtr cur, const char *encoding, int format) {
766    const htmlElemDesc * info;
767
768    xmlInitParser();
769
770    if ((cur == NULL) || (buf == NULL)) {
771	return;
772    }
773    /*
774     * Special cases.
775     */
776    if (cur->type == XML_DTD_NODE)
777	return;
778    if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
779        (cur->type == XML_DOCUMENT_NODE)){
780	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
781	return;
782    }
783    if (cur->type == HTML_TEXT_NODE) {
784	if (cur->content != NULL) {
785	    if (((cur->name == (const xmlChar *)xmlStringText) ||
786		 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
787		((cur->parent == NULL) ||
788		 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
789		  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
790		xmlChar *buffer;
791
792		buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
793		if (buffer != NULL) {
794		    xmlOutputBufferWriteString(buf, (const char *)buffer);
795		    xmlFree(buffer);
796		}
797	    } else {
798		xmlOutputBufferWriteString(buf, (const char *)cur->content);
799	    }
800	}
801	return;
802    }
803    if (cur->type == HTML_COMMENT_NODE) {
804	if (cur->content != NULL) {
805	    xmlOutputBufferWriteString(buf, "<!--");
806	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
807	    xmlOutputBufferWriteString(buf, "-->");
808	}
809	return;
810    }
811    if (cur->type == HTML_PI_NODE) {
812	if (cur->name == NULL)
813	    return;
814	xmlOutputBufferWriteString(buf, "<?");
815	xmlOutputBufferWriteString(buf, (const char *)cur->name);
816	if (cur->content != NULL) {
817	    xmlOutputBufferWriteString(buf, " ");
818	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
819	}
820	xmlOutputBufferWriteString(buf, ">");
821	return;
822    }
823    if (cur->type == HTML_ENTITY_REF_NODE) {
824        xmlOutputBufferWriteString(buf, "&");
825	xmlOutputBufferWriteString(buf, (const char *)cur->name);
826        xmlOutputBufferWriteString(buf, ";");
827	return;
828    }
829    if (cur->type == HTML_PRESERVE_NODE) {
830	if (cur->content != NULL) {
831	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
832	}
833	return;
834    }
835
836    /*
837     * Get specific HTML info for that node.
838     */
839    if (cur->ns == NULL)
840	info = htmlTagLookup(cur->name);
841    else
842	info = NULL;
843
844    xmlOutputBufferWriteString(buf, "<");
845    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
846        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
847	xmlOutputBufferWriteString(buf, ":");
848    }
849    xmlOutputBufferWriteString(buf, (const char *)cur->name);
850    if (cur->nsDef)
851	xmlNsListDumpOutput(buf, cur->nsDef);
852    if (cur->properties != NULL)
853        htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
854
855    if ((info != NULL) && (info->empty)) {
856        xmlOutputBufferWriteString(buf, ">");
857	if ((format) && (!info->isinline) && (cur->next != NULL)) {
858	    if ((cur->next->type != HTML_TEXT_NODE) &&
859		(cur->next->type != HTML_ENTITY_REF_NODE) &&
860		(cur->parent != NULL) &&
861		(cur->parent->name != NULL) &&
862		(cur->parent->name[0] != 'p')) /* p, pre, param */
863		xmlOutputBufferWriteString(buf, "\n");
864	}
865	return;
866    }
867    if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
868	(cur->children == NULL)) {
869        if ((info != NULL) && (info->saveEndTag != 0) &&
870	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
871	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
872	    xmlOutputBufferWriteString(buf, ">");
873	} else {
874	    xmlOutputBufferWriteString(buf, "></");
875            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
876                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
877                xmlOutputBufferWriteString(buf, ":");
878            }
879	    xmlOutputBufferWriteString(buf, (const char *)cur->name);
880	    xmlOutputBufferWriteString(buf, ">");
881	}
882	if ((format) && (cur->next != NULL) &&
883            (info != NULL) && (!info->isinline)) {
884	    if ((cur->next->type != HTML_TEXT_NODE) &&
885		(cur->next->type != HTML_ENTITY_REF_NODE) &&
886		(cur->parent != NULL) &&
887		(cur->parent->name != NULL) &&
888		(cur->parent->name[0] != 'p')) /* p, pre, param */
889		xmlOutputBufferWriteString(buf, "\n");
890	}
891	return;
892    }
893    xmlOutputBufferWriteString(buf, ">");
894    if ((cur->type != XML_ELEMENT_NODE) &&
895	(cur->content != NULL)) {
896	    /*
897	     * Uses the OutputBuffer property to automatically convert
898	     * invalids to charrefs
899	     */
900
901            xmlOutputBufferWriteString(buf, (const char *) cur->content);
902    }
903    if (cur->children != NULL) {
904        if ((format) && (info != NULL) && (!info->isinline) &&
905	    (cur->children->type != HTML_TEXT_NODE) &&
906	    (cur->children->type != HTML_ENTITY_REF_NODE) &&
907	    (cur->children != cur->last) &&
908	    (cur->name != NULL) &&
909	    (cur->name[0] != 'p')) /* p, pre, param */
910	    xmlOutputBufferWriteString(buf, "\n");
911	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
912        if ((format) && (info != NULL) && (!info->isinline) &&
913	    (cur->last->type != HTML_TEXT_NODE) &&
914	    (cur->last->type != HTML_ENTITY_REF_NODE) &&
915	    (cur->children != cur->last) &&
916	    (cur->name != NULL) &&
917	    (cur->name[0] != 'p')) /* p, pre, param */
918	    xmlOutputBufferWriteString(buf, "\n");
919    }
920    xmlOutputBufferWriteString(buf, "</");
921    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
922        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
923	xmlOutputBufferWriteString(buf, ":");
924    }
925    xmlOutputBufferWriteString(buf, (const char *)cur->name);
926    xmlOutputBufferWriteString(buf, ">");
927    if ((format) && (info != NULL) && (!info->isinline) &&
928	(cur->next != NULL)) {
929        if ((cur->next->type != HTML_TEXT_NODE) &&
930	    (cur->next->type != HTML_ENTITY_REF_NODE) &&
931	    (cur->parent != NULL) &&
932	    (cur->parent->name != NULL) &&
933	    (cur->parent->name[0] != 'p')) /* p, pre, param */
934	    xmlOutputBufferWriteString(buf, "\n");
935    }
936}
937
/**
 * htmlNodeDumpOutput:
 * @buf:  the HTML buffer output
 * @doc:  the document
 * @cur:  the current node
 * @encoding:  the encoding string
 *
 * Dump an HTML node, recursive behaviour, children are printed too,
 * and formatting returns/spaces are added.
 *
 * Equivalent to htmlNodeDumpFormatOutput() with a format value of 1.
 */
void
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
	           xmlNodePtr cur, const char *encoding) {
    htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
}
953
954/**
955 * htmlDocContentDumpFormatOutput:
956 * @buf:  the HTML buffer output
957 * @cur:  the document
958 * @encoding:  the encoding string
959 * @format:  should formatting spaces been added
960 *
961 * Dump an HTML document.
962 */
963void
964htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
965	                       const char *encoding, int format) {
966    int type;
967
968    xmlInitParser();
969
970    if ((buf == NULL) || (cur == NULL))
971        return;
972
973    /*
974     * force to output the stuff as HTML, especially for entities
975     */
976    type = cur->type;
977    cur->type = XML_HTML_DOCUMENT_NODE;
978    if (cur->intSubset != NULL) {
979        htmlDtdDumpOutput(buf, cur, NULL);
980    }
981    if (cur->children != NULL) {
982        htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
983    }
984    xmlOutputBufferWriteString(buf, "\n");
985    cur->type = (xmlElementType) type;
986}
987
/**
 * htmlDocContentDumpOutput:
 * @buf:  the HTML buffer output
 * @cur:  the document
 * @encoding:  the encoding string
 *
 * Dump an HTML document. Formatting returns/spaces are added.
 *
 * Equivalent to htmlDocContentDumpFormatOutput() with a format value
 * of 1.
 */
void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
	                 const char *encoding) {
    htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
}
1001
1002/************************************************************************
1003 *									*
1004 *		Saving functions front-ends				*
1005 *									*
1006 ************************************************************************/
1007
1008/**
1009 * htmlDocDump:
1010 * @f:  the FILE*
1011 * @cur:  the document
1012 *
1013 * Dump an HTML document to an open FILE.
1014 *
1015 * returns: the number of byte written or -1 in case of failure.
1016 */
1017int
1018htmlDocDump(FILE *f, xmlDocPtr cur) {
1019    xmlOutputBufferPtr buf;
1020    xmlCharEncodingHandlerPtr handler = NULL;
1021    const char *encoding;
1022    int ret;
1023
1024    xmlInitParser();
1025
1026    if ((cur == NULL) || (f == NULL)) {
1027	return(-1);
1028    }
1029
1030    encoding = (const char *) htmlGetMetaEncoding(cur);
1031
1032    if (encoding != NULL) {
1033	xmlCharEncoding enc;
1034
1035	enc = xmlParseCharEncoding(encoding);
1036	if (enc != cur->charset) {
1037	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1038		/*
1039		 * Not supported yet
1040		 */
1041		return(-1);
1042	    }
1043
1044	    handler = xmlFindCharEncodingHandler(encoding);
1045	    if (handler == NULL)
1046		return(-1);
1047	} else {
1048	    handler = xmlFindCharEncodingHandler(encoding);
1049	}
1050    }
1051
1052    /*
1053     * Fallback to HTML or ASCII when the encoding is unspecified
1054     */
1055    if (handler == NULL)
1056	handler = xmlFindCharEncodingHandler("HTML");
1057    if (handler == NULL)
1058	handler = xmlFindCharEncodingHandler("ascii");
1059
1060    buf = xmlOutputBufferCreateFile(f, handler);
1061    if (buf == NULL) return(-1);
1062    htmlDocContentDumpOutput(buf, cur, NULL);
1063
1064    ret = xmlOutputBufferClose(buf);
1065    return(ret);
1066}
1067
1068/**
1069 * htmlSaveFile:
1070 * @filename:  the filename (or URL)
1071 * @cur:  the document
1072 *
1073 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1074 * used.
1075 * returns: the number of byte written or -1 in case of failure.
1076 */
1077int
1078htmlSaveFile(const char *filename, xmlDocPtr cur) {
1079    xmlOutputBufferPtr buf;
1080    xmlCharEncodingHandlerPtr handler = NULL;
1081    const char *encoding;
1082    int ret;
1083
1084    if ((cur == NULL) || (filename == NULL))
1085        return(-1);
1086
1087    xmlInitParser();
1088
1089    encoding = (const char *) htmlGetMetaEncoding(cur);
1090
1091    if (encoding != NULL) {
1092	xmlCharEncoding enc;
1093
1094	enc = xmlParseCharEncoding(encoding);
1095	if (enc != cur->charset) {
1096	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1097		/*
1098		 * Not supported yet
1099		 */
1100		return(-1);
1101	    }
1102
1103	    handler = xmlFindCharEncodingHandler(encoding);
1104	    if (handler == NULL)
1105		return(-1);
1106	}
1107    }
1108
1109    /*
1110     * Fallback to HTML or ASCII when the encoding is unspecified
1111     */
1112    if (handler == NULL)
1113	handler = xmlFindCharEncodingHandler("HTML");
1114    if (handler == NULL)
1115	handler = xmlFindCharEncodingHandler("ascii");
1116
1117    /*
1118     * save the content to a temp buffer.
1119     */
1120    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1121    if (buf == NULL) return(0);
1122
1123    htmlDocContentDumpOutput(buf, cur, NULL);
1124
1125    ret = xmlOutputBufferClose(buf);
1126    return(ret);
1127}
1128
1129/**
1130 * htmlSaveFileFormat:
1131 * @filename:  the filename
1132 * @cur:  the document
1133 * @format:  should formatting spaces been added
1134 * @encoding: the document encoding
1135 *
1136 * Dump an HTML document to a file using a given encoding.
1137 *
1138 * returns: the number of byte written or -1 in case of failure.
1139 */
1140int
1141htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1142	           const char *encoding, int format) {
1143    xmlOutputBufferPtr buf;
1144    xmlCharEncodingHandlerPtr handler = NULL;
1145    int ret;
1146
1147    if ((cur == NULL) || (filename == NULL))
1148        return(-1);
1149
1150    xmlInitParser();
1151
1152    if (encoding != NULL) {
1153	xmlCharEncoding enc;
1154
1155	enc = xmlParseCharEncoding(encoding);
1156	if (enc != cur->charset) {
1157	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1158		/*
1159		 * Not supported yet
1160		 */
1161		return(-1);
1162	    }
1163
1164	    handler = xmlFindCharEncodingHandler(encoding);
1165	    if (handler == NULL)
1166		return(-1);
1167            htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1168	}
1169    } else {
1170	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1171    }
1172
1173    /*
1174     * Fallback to HTML or ASCII when the encoding is unspecified
1175     */
1176    if (handler == NULL)
1177	handler = xmlFindCharEncodingHandler("HTML");
1178    if (handler == NULL)
1179	handler = xmlFindCharEncodingHandler("ascii");
1180
1181    /*
1182     * save the content to a temp buffer.
1183     */
1184    buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1185    if (buf == NULL) return(0);
1186
1187    htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1188
1189    ret = xmlOutputBufferClose(buf);
1190    return(ret);
1191}
1192
/**
 * htmlSaveFileEnc:
 * @filename:  the filename
 * @cur:  the document
 * @encoding: the document encoding
 *
 * Dump an HTML document to a file using a given encoding
 * and formatting returns/spaces are added.
 *
 * Equivalent to htmlSaveFileFormat() with a format value of 1; the
 * result of that call is returned unchanged.
 *
 * returns: the number of bytes written or -1 in case of failure.
 */
int
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
    return(htmlSaveFileFormat(filename, cur, encoding, 1));
}
1208
1209#endif /* LIBXML_OUTPUT_ENABLED */
1210
1211#define bottom_HTMLtree
1212#include "elfgcchack.h"
1213#endif /* LIBXML_HTML_ENABLED */
1214