1/*
2 * HTMLtree.c : implementation of access function for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9
10#define IN_LIBXML
11#include "libxml.h"
12#ifdef LIBXML_HTML_ENABLED
13
14#include <string.h> /* for memset() only ! */
15
16#ifdef HAVE_CTYPE_H
17#include <ctype.h>
18#endif
19#ifdef HAVE_STDLIB_H
20#include <stdlib.h>
21#endif
22
23#include <libxml/xmlmemory.h>
24#include <libxml/HTMLparser.h>
25#include <libxml/HTMLtree.h>
26#include <libxml/entities.h>
27#include <libxml/valid.h>
28#include <libxml/xmlerror.h>
29#include <libxml/parserInternals.h>
30#include <libxml/globals.h>
31#include <libxml/uri.h>
32
33/************************************************************************
34 *									*
35 *   		Getting/Setting encoding meta tags			*
36 *									*
37 ************************************************************************/
38
39/**
40 * htmlGetMetaEncoding:
41 * @doc:  the document
42 *
43 * Encoding definition lookup in the Meta tags
44 *
45 * Returns the current encoding as flagged in the HTML source
46 */
47const xmlChar *
48htmlGetMetaEncoding(htmlDocPtr doc) {
49    htmlNodePtr cur;
50    const xmlChar *content;
51    const xmlChar *encoding;
52
53    if (doc == NULL)
54	return(NULL);
55    cur = doc->children;
56
57    /*
58     * Search the html
59     */
60    while (cur != NULL) {
61	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
63		break;
64	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
65		goto found_head;
66	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67		goto found_meta;
68	}
69	cur = cur->next;
70    }
71    if (cur == NULL)
72	return(NULL);
73    cur = cur->children;
74
75    /*
76     * Search the head
77     */
78    while (cur != NULL) {
79	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
81		break;
82	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83		goto found_meta;
84	}
85	cur = cur->next;
86    }
87    if (cur == NULL)
88	return(NULL);
89found_head:
90    cur = cur->children;
91
92    /*
93     * Search the meta elements
94     */
95found_meta:
96    while (cur != NULL) {
97	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99		xmlAttrPtr attr = cur->properties;
100		int http;
101		const xmlChar *value;
102
103		content = NULL;
104		http = 0;
105		while (attr != NULL) {
106		    if ((attr->children != NULL) &&
107		        (attr->children->type == XML_TEXT_NODE) &&
108		        (attr->children->next == NULL)) {
109			value = attr->children->content;
110			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112			    http = 1;
113			else if ((value != NULL)
114			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115			    content = value;
116			if ((http != 0) && (content != NULL))
117			    goto found_content;
118		    }
119		    attr = attr->next;
120		}
121	    }
122	}
123	cur = cur->next;
124    }
125    return(NULL);
126
127found_content:
128    encoding = xmlStrstr(content, BAD_CAST"charset=");
129    if (encoding == NULL)
130	encoding = xmlStrstr(content, BAD_CAST"Charset=");
131    if (encoding == NULL)
132	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133    if (encoding != NULL) {
134	encoding += 8;
135    } else {
136	encoding = xmlStrstr(content, BAD_CAST"charset =");
137	if (encoding == NULL)
138	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
139	if (encoding == NULL)
140	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141	if (encoding != NULL)
142	    encoding += 9;
143    }
144    if (encoding != NULL) {
145	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146    }
147    return(encoding);
148}
149
150/**
151 * htmlSetMetaEncoding:
152 * @doc:  the document
153 * @encoding:  the encoding string
154 *
155 * Sets the current encoding in the Meta tags
156 * NOTE: this will not change the document content encoding, just
157 * the META flag associated.
158 *
159 * Returns 0 in case of success and -1 in case of error
160 */
161int
162htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163    htmlNodePtr cur, meta;
164    const xmlChar *content;
165    char newcontent[100];
166
167
168    if (doc == NULL)
169	return(-1);
170
171    if (encoding != NULL) {
172	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
173                (char *)encoding);
174	newcontent[sizeof(newcontent) - 1] = 0;
175    }
176
177    cur = doc->children;
178
179    /*
180     * Search the html
181     */
182    while (cur != NULL) {
183	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
184	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
185		break;
186	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
187		goto found_head;
188	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
189		goto found_meta;
190	}
191	cur = cur->next;
192    }
193    if (cur == NULL)
194	return(-1);
195    cur = cur->children;
196
197    /*
198     * Search the head
199     */
200    while (cur != NULL) {
201	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
202	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
203		break;
204	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
205		goto found_meta;
206	}
207	cur = cur->next;
208    }
209    if (cur == NULL)
210	return(-1);
211found_head:
212    if (cur->children == NULL) {
213	if (encoding == NULL)
214	    return(0);
215	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
216	xmlAddChild(cur, meta);
217	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
218	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
219	return(0);
220    }
221    cur = cur->children;
222
223found_meta:
224    if (encoding != NULL) {
225	/*
226	 * Create a new Meta element with the right attributes
227	 */
228
229	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
230	xmlAddPrevSibling(cur, meta);
231	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
232	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
233    }
234
235    /*
236     * Search and destroy all the remaining the meta elements carrying
237     * encoding informations
238     */
239    while (cur != NULL) {
240	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
241	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
242		xmlAttrPtr attr = cur->properties;
243		int http;
244		const xmlChar *value;
245
246		content = NULL;
247		http = 0;
248		while (attr != NULL) {
249		    if ((attr->children != NULL) &&
250		        (attr->children->type == XML_TEXT_NODE) &&
251		        (attr->children->next == NULL)) {
252			value = attr->children->content;
253			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
254			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
255			    http = 1;
256			else
257                        {
258                           if ((value != NULL) &&
259				(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
260			      content = value;
261                        }
262		        if ((http != 0) && (content != NULL))
263			    break;
264		    }
265		    attr = attr->next;
266		}
267		if ((http != 0) && (content != NULL)) {
268		    meta = cur;
269		    cur = cur->next;
270		    xmlUnlinkNode(meta);
271                    xmlFreeNode(meta);
272		    continue;
273		}
274
275	    }
276	}
277	cur = cur->next;
278    }
279    return(0);
280}
281
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char *htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
295
296
297/**
298 * htmlIsBooleanAttr:
299 * @name:  the name of the attribute to check
300 *
301 * Determine if a given attribute is a boolean attribute.
302 *
303 * returns: false if the attribute is not boolean, true otherwise.
304 */
305int
306htmlIsBooleanAttr(const xmlChar *name)
307{
308    int i = 0;
309
310    while (htmlBooleanAttrs[i] != NULL) {
311        if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
312            return 1;
313        i++;
314    }
315    return 0;
316}
317
318#ifdef LIBXML_OUTPUT_ENABLED
/*
 * private routine exported from xmlIO.c
 *
 * The prototype is repeated here; htmlDocDumpMemoryFormat() below calls it
 * with the selected encoding handler to obtain the output buffer that the
 * document is serialized into.
 */
xmlOutputBufferPtr
xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
324/************************************************************************
325 *									*
326 * 			Output error handlers				*
327 *									*
328 ************************************************************************/
329/**
330 * htmlSaveErrMemory:
331 * @extra:  extra informations
332 *
333 * Handle an out of memory condition
334 */
335static void
336htmlSaveErrMemory(const char *extra)
337{
338    __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
339}
340
341/**
342 * htmlSaveErr:
343 * @code:  the error number
344 * @node:  the location of the error.
345 * @extra:  extra informations
346 *
347 * Handle an out of memory condition
348 */
349static void
350htmlSaveErr(int code, xmlNodePtr node, const char *extra)
351{
352    const char *msg = NULL;
353
354    switch(code) {
355        case XML_SAVE_NOT_UTF8:
356	    msg = "string is not in UTF-8\n";
357	    break;
358	case XML_SAVE_CHAR_INVALID:
359	    msg = "invalid character value\n";
360	    break;
361	case XML_SAVE_UNKNOWN_ENCODING:
362	    msg = "unknown encoding %s\n";
363	    break;
364	case XML_SAVE_NO_DOCTYPE:
365	    msg = "HTML has no DOCTYPE\n";
366	    break;
367	default:
368	    msg = "unexpected error number\n";
369    }
370    __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
371}
372
373/************************************************************************
374 *									*
375 *   		Dumping HTML tree content to a simple buffer		*
376 *									*
377 ************************************************************************/
378
379static int
380htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
381	           int format);
382
383/**
384 * htmlNodeDumpFormat:
385 * @buf:  the HTML buffer output
386 * @doc:  the document
387 * @cur:  the current node
388 * @format:  should formatting spaces been added
389 *
390 * Dump an HTML node, recursive behaviour,children are printed too.
391 *
392 * Returns the number of byte written or -1 in case of error
393 */
394static int
395htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
396	           int format) {
397    unsigned int use;
398    int ret;
399    xmlOutputBufferPtr outbuf;
400
401    if (cur == NULL) {
402	return (-1);
403    }
404    if (buf == NULL) {
405	return (-1);
406    }
407    outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
408    if (outbuf == NULL) {
409        htmlSaveErrMemory("allocating HTML output buffer");
410	return (-1);
411    }
412    memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
413    outbuf->buffer = buf;
414    outbuf->encoder = NULL;
415    outbuf->writecallback = NULL;
416    outbuf->closecallback = NULL;
417    outbuf->context = NULL;
418    outbuf->written = 0;
419
420    use = buf->use;
421    htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
422    xmlFree(outbuf);
423    ret = buf->use - use;
424    return (ret);
425}
426
427/**
428 * htmlNodeDump:
429 * @buf:  the HTML buffer output
430 * @doc:  the document
431 * @cur:  the current node
432 *
433 * Dump an HTML node, recursive behaviour,children are printed too,
434 * and formatting returns are added.
435 *
436 * Returns the number of byte written or -1 in case of error
437 */
438int
439htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
440    xmlInitParser();
441
442    return(htmlNodeDumpFormat(buf, doc, cur, 1));
443}
444
445/**
446 * htmlNodeDumpFileFormat:
447 * @out:  the FILE pointer
448 * @doc:  the document
449 * @cur:  the current node
450 * @encoding: the document encoding
451 * @format:  should formatting spaces been added
452 *
453 * Dump an HTML node, recursive behaviour,children are printed too.
454 *
455 * TODO: if encoding == NULL try to save in the doc encoding
456 *
457 * returns: the number of byte written or -1 in case of failure.
458 */
459int
460htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
461	               xmlNodePtr cur, const char *encoding, int format) {
462    xmlOutputBufferPtr buf;
463    xmlCharEncodingHandlerPtr handler = NULL;
464    int ret;
465
466    xmlInitParser();
467
468    if (encoding != NULL) {
469	xmlCharEncoding enc;
470
471	enc = xmlParseCharEncoding(encoding);
472	if (enc != XML_CHAR_ENCODING_UTF8) {
473	    handler = xmlFindCharEncodingHandler(encoding);
474	    if (handler == NULL)
475		return(-1);
476	}
477    }
478
479    /*
480     * Fallback to HTML or ASCII when the encoding is unspecified
481     */
482    if (handler == NULL)
483	handler = xmlFindCharEncodingHandler("HTML");
484    if (handler == NULL)
485	handler = xmlFindCharEncodingHandler("ascii");
486
487    /*
488     * save the content to a temp buffer.
489     */
490    buf = xmlOutputBufferCreateFile(out, handler);
491    if (buf == NULL) return(0);
492
493    htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
494
495    ret = xmlOutputBufferClose(buf);
496    return(ret);
497}
498
499/**
500 * htmlNodeDumpFile:
501 * @out:  the FILE pointer
502 * @doc:  the document
503 * @cur:  the current node
504 *
505 * Dump an HTML node, recursive behaviour,children are printed too,
506 * and formatting returns are added.
507 */
508void
509htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
510    htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
511}
512
513/**
514 * htmlDocDumpMemoryFormat:
515 * @cur:  the document
516 * @mem:  OUT: the memory pointer
517 * @size:  OUT: the memory length
518 * @format:  should formatting spaces been added
519 *
520 * Dump an HTML document in memory and return the xmlChar * and it's size.
521 * It's up to the caller to free the memory.
522 */
523void
524htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
525    xmlOutputBufferPtr buf;
526    xmlCharEncodingHandlerPtr handler = NULL;
527    const char *encoding;
528
529    xmlInitParser();
530
531    if ((mem == NULL) || (size == NULL))
532        return;
533    if (cur == NULL) {
534	*mem = NULL;
535	*size = 0;
536	return;
537    }
538
539    encoding = (const char *) htmlGetMetaEncoding(cur);
540
541    if (encoding != NULL) {
542	xmlCharEncoding enc;
543
544	enc = xmlParseCharEncoding(encoding);
545	if (enc != cur->charset) {
546	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
547		/*
548		 * Not supported yet
549		 */
550		*mem = NULL;
551		*size = 0;
552		return;
553	    }
554
555	    handler = xmlFindCharEncodingHandler(encoding);
556	    if (handler == NULL) {
557		*mem = NULL;
558		*size = 0;
559		return;
560	    }
561	} else {
562	    handler = xmlFindCharEncodingHandler(encoding);
563	}
564    }
565
566    /*
567     * Fallback to HTML or ASCII when the encoding is unspecified
568     */
569    if (handler == NULL)
570	handler = xmlFindCharEncodingHandler("HTML");
571    if (handler == NULL)
572	handler = xmlFindCharEncodingHandler("ascii");
573
574    buf = xmlAllocOutputBufferInternal(handler);
575    if (buf == NULL) {
576	*mem = NULL;
577	*size = 0;
578	return;
579    }
580
581	htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
582
583    xmlOutputBufferFlush(buf);
584    if (buf->conv != NULL) {
585	*size = buf->conv->use;
586	*mem = xmlStrndup(buf->conv->content, *size);
587    } else {
588	*size = buf->buffer->use;
589	*mem = xmlStrndup(buf->buffer->content, *size);
590    }
591    (void)xmlOutputBufferClose(buf);
592}
593
594/**
595 * htmlDocDumpMemory:
596 * @cur:  the document
597 * @mem:  OUT: the memory pointer
598 * @size:  OUT: the memory length
599 *
600 * Dump an HTML document in memory and return the xmlChar * and it's size.
601 * It's up to the caller to free the memory.
602 */
603void
604htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
605	htmlDocDumpMemoryFormat(cur, mem, size, 1);
606}
607
608
609/************************************************************************
610 *									*
611 *   		Dumping HTML tree content to an I/O output buffer	*
612 *									*
613 ************************************************************************/
614
/*
 * Local prototype: the function is not defined in the visible part of
 * this file (presumably it lives in another libxml2 source file; confirm).
 * htmlNodeDumpFormatOutput() calls it to write an element's nsDef list.
 */
void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
616
617/**
618 * htmlDtdDumpOutput:
619 * @buf:  the HTML buffer output
620 * @doc:  the document
621 * @encoding:  the encoding string
622 *
623 * TODO: check whether encoding is needed
624 *
625 * Dump the HTML document DTD, if any.
626 */
627static void
628htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
629	          const char *encoding ATTRIBUTE_UNUSED) {
630    xmlDtdPtr cur = doc->intSubset;
631
632    if (cur == NULL) {
633	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
634	return;
635    }
636    xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
637    xmlOutputBufferWriteString(buf, (const char *)cur->name);
638    if (cur->ExternalID != NULL) {
639	xmlOutputBufferWriteString(buf, " PUBLIC ");
640	xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
641	if (cur->SystemID != NULL) {
642	    xmlOutputBufferWriteString(buf, " ");
643	    xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
644	}
645    }  else if (cur->SystemID != NULL) {
646	xmlOutputBufferWriteString(buf, " SYSTEM ");
647	xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
648    }
649    xmlOutputBufferWriteString(buf, ">\n");
650}
651
652/**
653 * htmlAttrDumpOutput:
654 * @buf:  the HTML buffer output
655 * @doc:  the document
656 * @cur:  the attribute pointer
657 * @encoding:  the encoding string
658 *
659 * Dump an HTML attribute
660 */
661static void
662htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
663	           const char *encoding ATTRIBUTE_UNUSED) {
664    xmlChar *value;
665
666    /*
667     * TODO: The html output method should not escape a & character
668     *       occurring in an attribute value immediately followed by
669     *       a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
670     */
671
672    if (cur == NULL) {
673	return;
674    }
675    xmlOutputBufferWriteString(buf, " ");
676    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
677        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
678	xmlOutputBufferWriteString(buf, ":");
679    }
680    xmlOutputBufferWriteString(buf, (const char *)cur->name);
681    if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
682	value = xmlNodeListGetString(doc, cur->children, 0);
683	if (value) {
684	    xmlOutputBufferWriteString(buf, "=");
685	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
686		(cur->parent->ns == NULL) &&
687		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
688	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
689		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
690		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
691		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
692		xmlChar *escaped;
693		xmlChar *tmp = value;
694
695		while (IS_BLANK_CH(*tmp)) tmp++;
696
697		escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
698		if (escaped != NULL) {
699		    xmlBufferWriteQuotedString(buf->buffer, escaped);
700		    xmlFree(escaped);
701		} else {
702		    xmlBufferWriteQuotedString(buf->buffer, value);
703		}
704	    } else {
705		xmlBufferWriteQuotedString(buf->buffer, value);
706	    }
707	    xmlFree(value);
708	} else  {
709	    xmlOutputBufferWriteString(buf, "=\"\"");
710	}
711    }
712}
713
714/**
715 * htmlAttrListDumpOutput:
716 * @buf:  the HTML buffer output
717 * @doc:  the document
718 * @cur:  the first attribute pointer
719 * @encoding:  the encoding string
720 *
721 * Dump a list of HTML attributes
722 */
723static void
724htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
725    if (cur == NULL) {
726	return;
727    }
728    while (cur != NULL) {
729        htmlAttrDumpOutput(buf, doc, cur, encoding);
730	cur = cur->next;
731    }
732}
733
734
735
736/**
737 * htmlNodeListDumpOutput:
738 * @buf:  the HTML buffer output
739 * @doc:  the document
740 * @cur:  the first node
741 * @encoding:  the encoding string
742 * @format:  should formatting spaces been added
743 *
744 * Dump an HTML node list, recursive behaviour,children are printed too.
745 */
746static void
747htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
748	               xmlNodePtr cur, const char *encoding, int format) {
749    if (cur == NULL) {
750	return;
751    }
752    while (cur != NULL) {
753        htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
754	cur = cur->next;
755    }
756}
757
758/**
759 * htmlNodeDumpFormatOutput:
760 * @buf:  the HTML buffer output
761 * @doc:  the document
762 * @cur:  the current node
763 * @encoding:  the encoding string
764 * @format:  should formatting spaces been added
765 *
766 * Dump an HTML node, recursive behaviour,children are printed too.
767 */
768void
769htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
770	                 xmlNodePtr cur, const char *encoding, int format) {
771    const htmlElemDesc * info;
772
773    xmlInitParser();
774
775    if ((cur == NULL) || (buf == NULL)) {
776	return;
777    }
778    /*
779     * Special cases.
780     */
781    if (cur->type == XML_DTD_NODE)
782	return;
783    if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
784        (cur->type == XML_DOCUMENT_NODE)){
785	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
786	return;
787    }
788    if (cur->type == XML_ATTRIBUTE_NODE) {
789        htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
790	return;
791    }
792    if (cur->type == HTML_TEXT_NODE) {
793	if (cur->content != NULL) {
794	    if (((cur->name == (const xmlChar *)xmlStringText) ||
795		 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
796		((cur->parent == NULL) ||
797		 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
798		  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
799		xmlChar *buffer;
800
801		buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
802		if (buffer != NULL) {
803		    xmlOutputBufferWriteString(buf, (const char *)buffer);
804		    xmlFree(buffer);
805		}
806	    } else {
807		xmlOutputBufferWriteString(buf, (const char *)cur->content);
808	    }
809	}
810	return;
811    }
812    if (cur->type == HTML_COMMENT_NODE) {
813	if (cur->content != NULL) {
814	    xmlOutputBufferWriteString(buf, "<!--");
815	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
816	    xmlOutputBufferWriteString(buf, "-->");
817	}
818	return;
819    }
820    if (cur->type == HTML_PI_NODE) {
821	if (cur->name == NULL)
822	    return;
823	xmlOutputBufferWriteString(buf, "<?");
824	xmlOutputBufferWriteString(buf, (const char *)cur->name);
825	if (cur->content != NULL) {
826	    xmlOutputBufferWriteString(buf, " ");
827	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
828	}
829	xmlOutputBufferWriteString(buf, ">");
830	return;
831    }
832    if (cur->type == HTML_ENTITY_REF_NODE) {
833        xmlOutputBufferWriteString(buf, "&");
834	xmlOutputBufferWriteString(buf, (const char *)cur->name);
835        xmlOutputBufferWriteString(buf, ";");
836	return;
837    }
838    if (cur->type == HTML_PRESERVE_NODE) {
839	if (cur->content != NULL) {
840	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
841	}
842	return;
843    }
844
845    /*
846     * Get specific HTML info for that node.
847     */
848    if (cur->ns == NULL)
849	info = htmlTagLookup(cur->name);
850    else
851	info = NULL;
852
853    xmlOutputBufferWriteString(buf, "<");
854    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
855        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
856	xmlOutputBufferWriteString(buf, ":");
857    }
858    xmlOutputBufferWriteString(buf, (const char *)cur->name);
859    if (cur->nsDef)
860	xmlNsListDumpOutput(buf, cur->nsDef);
861    if (cur->properties != NULL)
862        htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
863
864    if ((info != NULL) && (info->empty)) {
865        xmlOutputBufferWriteString(buf, ">");
866	if ((format) && (!info->isinline) && (cur->next != NULL)) {
867	    if ((cur->next->type != HTML_TEXT_NODE) &&
868		(cur->next->type != HTML_ENTITY_REF_NODE) &&
869		(cur->parent != NULL) &&
870		(cur->parent->name != NULL) &&
871		(cur->parent->name[0] != 'p')) /* p, pre, param */
872		xmlOutputBufferWriteString(buf, "\n");
873	}
874	return;
875    }
876    if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
877	(cur->children == NULL)) {
878        if ((info != NULL) && (info->saveEndTag != 0) &&
879	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
880	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
881	    xmlOutputBufferWriteString(buf, ">");
882	} else {
883	    xmlOutputBufferWriteString(buf, "></");
884            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
885                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
886                xmlOutputBufferWriteString(buf, ":");
887            }
888	    xmlOutputBufferWriteString(buf, (const char *)cur->name);
889	    xmlOutputBufferWriteString(buf, ">");
890	}
891	if ((format) && (cur->next != NULL) &&
892            (info != NULL) && (!info->isinline)) {
893	    if ((cur->next->type != HTML_TEXT_NODE) &&
894		(cur->next->type != HTML_ENTITY_REF_NODE) &&
895		(cur->parent != NULL) &&
896		(cur->parent->name != NULL) &&
897		(cur->parent->name[0] != 'p')) /* p, pre, param */
898		xmlOutputBufferWriteString(buf, "\n");
899	}
900	return;
901    }
902    xmlOutputBufferWriteString(buf, ">");
903    if ((cur->type != XML_ELEMENT_NODE) &&
904	(cur->content != NULL)) {
905	    /*
906	     * Uses the OutputBuffer property to automatically convert
907	     * invalids to charrefs
908	     */
909
910            xmlOutputBufferWriteString(buf, (const char *) cur->content);
911    }
912    if (cur->children != NULL) {
913        if ((format) && (info != NULL) && (!info->isinline) &&
914	    (cur->children->type != HTML_TEXT_NODE) &&
915	    (cur->children->type != HTML_ENTITY_REF_NODE) &&
916	    (cur->children != cur->last) &&
917	    (cur->name != NULL) &&
918	    (cur->name[0] != 'p')) /* p, pre, param */
919	    xmlOutputBufferWriteString(buf, "\n");
920	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
921        if ((format) && (info != NULL) && (!info->isinline) &&
922	    (cur->last->type != HTML_TEXT_NODE) &&
923	    (cur->last->type != HTML_ENTITY_REF_NODE) &&
924	    (cur->children != cur->last) &&
925	    (cur->name != NULL) &&
926	    (cur->name[0] != 'p')) /* p, pre, param */
927	    xmlOutputBufferWriteString(buf, "\n");
928    }
929    xmlOutputBufferWriteString(buf, "</");
930    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
931        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
932	xmlOutputBufferWriteString(buf, ":");
933    }
934    xmlOutputBufferWriteString(buf, (const char *)cur->name);
935    xmlOutputBufferWriteString(buf, ">");
936    if ((format) && (info != NULL) && (!info->isinline) &&
937	(cur->next != NULL)) {
938        if ((cur->next->type != HTML_TEXT_NODE) &&
939	    (cur->next->type != HTML_ENTITY_REF_NODE) &&
940	    (cur->parent != NULL) &&
941	    (cur->parent->name != NULL) &&
942	    (cur->parent->name[0] != 'p')) /* p, pre, param */
943	    xmlOutputBufferWriteString(buf, "\n");
944    }
945}
946
947/**
948 * htmlNodeDumpOutput:
949 * @buf:  the HTML buffer output
950 * @doc:  the document
951 * @cur:  the current node
952 * @encoding:  the encoding string
953 *
954 * Dump an HTML node, recursive behaviour,children are printed too,
955 * and formatting returns/spaces are added.
956 */
957void
958htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
959	           xmlNodePtr cur, const char *encoding) {
960    htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
961}
962
963/**
964 * htmlDocContentDumpFormatOutput:
965 * @buf:  the HTML buffer output
966 * @cur:  the document
967 * @encoding:  the encoding string
968 * @format:  should formatting spaces been added
969 *
970 * Dump an HTML document.
971 */
972void
973htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
974	                       const char *encoding, int format) {
975    int type;
976
977    xmlInitParser();
978
979    if ((buf == NULL) || (cur == NULL))
980        return;
981
982    /*
983     * force to output the stuff as HTML, especially for entities
984     */
985    type = cur->type;
986    cur->type = XML_HTML_DOCUMENT_NODE;
987    if (cur->intSubset != NULL) {
988        htmlDtdDumpOutput(buf, cur, NULL);
989    }
990    if (cur->children != NULL) {
991        htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
992    }
993    xmlOutputBufferWriteString(buf, "\n");
994    cur->type = (xmlElementType) type;
995}
996
997/**
998 * htmlDocContentDumpOutput:
999 * @buf:  the HTML buffer output
1000 * @cur:  the document
1001 * @encoding:  the encoding string
1002 *
1003 * Dump an HTML document. Formating return/spaces are added.
1004 */
1005void
1006htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1007	                 const char *encoding) {
1008    htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1009}
1010
1011/************************************************************************
1012 *									*
1013 *		Saving functions front-ends				*
1014 *									*
1015 ************************************************************************/
1016
1017/**
1018 * htmlDocDump:
1019 * @f:  the FILE*
1020 * @cur:  the document
1021 *
1022 * Dump an HTML document to an open FILE.
1023 *
1024 * returns: the number of byte written or -1 in case of failure.
1025 */
1026int
1027htmlDocDump(FILE *f, xmlDocPtr cur) {
1028    xmlOutputBufferPtr buf;
1029    xmlCharEncodingHandlerPtr handler = NULL;
1030    const char *encoding;
1031    int ret;
1032
1033    xmlInitParser();
1034
1035    if ((cur == NULL) || (f == NULL)) {
1036	return(-1);
1037    }
1038
1039    encoding = (const char *) htmlGetMetaEncoding(cur);
1040
1041    if (encoding != NULL) {
1042	xmlCharEncoding enc;
1043
1044	enc = xmlParseCharEncoding(encoding);
1045	if (enc != cur->charset) {
1046	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1047		/*
1048		 * Not supported yet
1049		 */
1050		return(-1);
1051	    }
1052
1053	    handler = xmlFindCharEncodingHandler(encoding);
1054	    if (handler == NULL)
1055		return(-1);
1056	} else {
1057	    handler = xmlFindCharEncodingHandler(encoding);
1058	}
1059    }
1060
1061    /*
1062     * Fallback to HTML or ASCII when the encoding is unspecified
1063     */
1064    if (handler == NULL)
1065	handler = xmlFindCharEncodingHandler("HTML");
1066    if (handler == NULL)
1067	handler = xmlFindCharEncodingHandler("ascii");
1068
1069    buf = xmlOutputBufferCreateFile(f, handler);
1070    if (buf == NULL) return(-1);
1071    htmlDocContentDumpOutput(buf, cur, NULL);
1072
1073    ret = xmlOutputBufferClose(buf);
1074    return(ret);
1075}
1076
1077/**
1078 * htmlSaveFile:
1079 * @filename:  the filename (or URL)
1080 * @cur:  the document
1081 *
1082 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1083 * used.
1084 * returns: the number of byte written or -1 in case of failure.
1085 */
1086int
1087htmlSaveFile(const char *filename, xmlDocPtr cur) {
1088    xmlOutputBufferPtr buf;
1089    xmlCharEncodingHandlerPtr handler = NULL;
1090    const char *encoding;
1091    int ret;
1092
1093    if ((cur == NULL) || (filename == NULL))
1094        return(-1);
1095
1096    xmlInitParser();
1097
1098    encoding = (const char *) htmlGetMetaEncoding(cur);
1099
1100    if (encoding != NULL) {
1101	xmlCharEncoding enc;
1102
1103	enc = xmlParseCharEncoding(encoding);
1104	if (enc != cur->charset) {
1105	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1106		/*
1107		 * Not supported yet
1108		 */
1109		return(-1);
1110	    }
1111
1112	    handler = xmlFindCharEncodingHandler(encoding);
1113	    if (handler == NULL)
1114		return(-1);
1115	}
1116    }
1117
1118    /*
1119     * Fallback to HTML or ASCII when the encoding is unspecified
1120     */
1121    if (handler == NULL)
1122	handler = xmlFindCharEncodingHandler("HTML");
1123    if (handler == NULL)
1124	handler = xmlFindCharEncodingHandler("ascii");
1125
1126    /*
1127     * save the content to a temp buffer.
1128     */
1129    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1130    if (buf == NULL) return(0);
1131
1132    htmlDocContentDumpOutput(buf, cur, NULL);
1133
1134    ret = xmlOutputBufferClose(buf);
1135    return(ret);
1136}
1137
1138/**
1139 * htmlSaveFileFormat:
1140 * @filename:  the filename
1141 * @cur:  the document
1142 * @format:  should formatting spaces been added
1143 * @encoding: the document encoding
1144 *
1145 * Dump an HTML document to a file using a given encoding.
1146 *
1147 * returns: the number of byte written or -1 in case of failure.
1148 */
1149int
1150htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1151	           const char *encoding, int format) {
1152    xmlOutputBufferPtr buf;
1153    xmlCharEncodingHandlerPtr handler = NULL;
1154    int ret;
1155
1156    if ((cur == NULL) || (filename == NULL))
1157        return(-1);
1158
1159    xmlInitParser();
1160
1161    if (encoding != NULL) {
1162	xmlCharEncoding enc;
1163
1164	enc = xmlParseCharEncoding(encoding);
1165	if (enc != cur->charset) {
1166	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1167		/*
1168		 * Not supported yet
1169		 */
1170		return(-1);
1171	    }
1172
1173	    handler = xmlFindCharEncodingHandler(encoding);
1174	    if (handler == NULL)
1175		return(-1);
1176            htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1177	}
1178    } else {
1179	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1180    }
1181
1182    /*
1183     * Fallback to HTML or ASCII when the encoding is unspecified
1184     */
1185    if (handler == NULL)
1186	handler = xmlFindCharEncodingHandler("HTML");
1187    if (handler == NULL)
1188	handler = xmlFindCharEncodingHandler("ascii");
1189
1190    /*
1191     * save the content to a temp buffer.
1192     */
1193    buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1194    if (buf == NULL) return(0);
1195
1196    htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1197
1198    ret = xmlOutputBufferClose(buf);
1199    return(ret);
1200}
1201
1202/**
1203 * htmlSaveFileEnc:
1204 * @filename:  the filename
1205 * @cur:  the document
1206 * @encoding: the document encoding
1207 *
1208 * Dump an HTML document to a file using a given encoding
1209 * and formatting returns/spaces are added.
1210 *
1211 * returns: the number of byte written or -1 in case of failure.
1212 */
1213int
1214htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1215    return(htmlSaveFileFormat(filename, cur, encoding, 1));
1216}
1217
1218#endif /* LIBXML_OUTPUT_ENABLED */
1219
1220#define bottom_HTMLtree
1221#include "elfgcchack.h"
1222#endif /* LIBXML_HTML_ENABLED */
1223