1/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
54static int htmlOmittedDefaultValue = 1;
55
56xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57			     xmlChar end, xmlChar  end2, xmlChar end3);
58static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60/************************************************************************
61 *									*
62 * 		Some factorized error routines				*
63 *									*
64 ************************************************************************/
65
66/**
67 * htmlErrMemory:
68 * @ctxt:  an HTML parser context
69 * @extra:  extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
76    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77        (ctxt->instate == XML_PARSER_EOF))
78	return;
79    if (ctxt != NULL) {
80        ctxt->errNo = XML_ERR_NO_MEMORY;
81        ctxt->instate = XML_PARSER_EOF;
82        ctxt->disableSAX = 1;
83    }
84    if (extra)
85        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87                        NULL, NULL, 0, 0,
88                        "Memory allocation failed : %s\n", extra);
89    else
90        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92                        NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt:  an HTML parser context
98 * @error:  the error number
99 * @msg:  the error message
100 * @str1:  string infor
101 * @str2:  string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107             const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
109    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110        (ctxt->instate == XML_PARSER_EOF))
111	return;
112    if (ctxt != NULL)
113	ctxt->errNo = error;
114    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115                    XML_ERR_ERROR, NULL, 0,
116		    (const char *) str1, (const char *) str2,
117		    NULL, 0, 0,
118		    msg, str1, str2);
119    if (ctxt != NULL)
120	ctxt->wellFormed = 0;
121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt:  an HTML parser context
126 * @error:  the error number
127 * @msg:  the error message
128 * @val:  integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134             const char *msg, int val)
135{
136    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137        (ctxt->instate == XML_PARSER_EOF))
138	return;
139    if (ctxt != NULL)
140	ctxt->errNo = error;
141    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
143		    NULL, val, 0, msg, val);
144    if (ctxt != NULL)
145	ctxt->wellFormed = 0;
146}
147
148/************************************************************************
149 *									*
150 * 		Parser stacks related functions and macros		*
151 *									*
152 ************************************************************************/
153
154/**
155 * htmlnamePush:
156 * @ctxt:  an HTML parser context
157 * @value:  the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
162 */
163static int
164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165{
166    if (ctxt->nameNr >= ctxt->nameMax) {
167        ctxt->nameMax *= 2;
168        ctxt->nameTab = (const xmlChar * *)
169                         xmlRealloc((xmlChar * *)ctxt->nameTab,
170                                    ctxt->nameMax *
171                                    sizeof(ctxt->nameTab[0]));
172        if (ctxt->nameTab == NULL) {
173            htmlErrMemory(ctxt, NULL);
174            return (0);
175        }
176    }
177    ctxt->nameTab[ctxt->nameNr] = value;
178    ctxt->name = value;
179    return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
189static const xmlChar *
190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
192    const xmlChar *ret;
193
194    if (ctxt->nameNr <= 0)
195        return (NULL);
196    ctxt->nameNr--;
197    if (ctxt->nameNr < 0)
198        return (NULL);
199    if (ctxt->nameNr > 0)
200        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201    else
202        ctxt->name = NULL;
203    ret = ctxt->nameTab[ctxt->nameNr];
204    ctxt->nameTab[ctxt->nameNr] = NULL;
205    return (ret);
206}
207
208/*
209 * Macros for accessing the content. Those should be used only by the parser,
210 * and not exported.
211 *
212 * Dirty macros, i.e. one need to make assumption on the context to use them
213 *
214 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
215 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
216 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
217 *           in UNICODE mode. This should be used internally by the parser
218 *           only to compare to ASCII values otherwise it would break when
219 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
222 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
223 *           it should be used only to compare on ASCII based substring.
224 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
225 *           strings without newlines within the parser.
226 *
227 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
228 *
229 *   CURRENT Returns the current char value, with the full decoding of
230 *           UTF-8 if we are using this mode. It returns an int.
231 *   NEXT    Skip to the next character, this does the proper decoding
232 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
233 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
234 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
235 */
236
/*
 * All the macros below expect a variable named ctxt, pointing to the
 * parser context, to be in scope at the point of use.
 */

/* Current input byte converted to uppercase. */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the input by val bytes, updating the char counter and the
 * column accordingly. Parenthesized so it stays a single expression.
 */
#define SKIP(val) (ctxt->nbChars += (val),				\
		   ctxt->input->cur += (val),				\
		   ctxt->input->col += (val))

/* The val'th byte after the current position. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT, converted to uppercase. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer; usable as an lvalue. */
#define CUR_PTR (ctxt->input->cur)

/*
 * Shrink the input when more than 2 * INPUT_CHUNK bytes precede the
 * current position and fewer than 2 * INPUT_CHUNK bytes remain.
 * Wrapped in do/while so that it behaves as one statement, including
 * in front of an else.
 */
#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
	(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
	xmlParserInputShrink(ctxt->input);				\
  } while (0)

/*
 * Grow the input when not in progressive mode and fewer than
 * INPUT_CHUNK bytes remain. Wrapped in do/while for the same reason
 * as SHRINK.
 */
#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
	(ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

/* Current input byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current input byte, or -1 while ctxt->token is non-zero. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/* NXT and CUR_PTR are defined once, above. */
268
269
/*
 * Step over the current character, which is l bytes long: account for
 * the line/column change, clear the pending token and count one char.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) != '\n') {					\
	ctxt->input->col++;						\
    } else {								\
	ctxt->input->line++;						\
	ctxt->input->col = 1;						\
    }									\
    ctxt->token = 0;							\
    ctxt->nbChars++;							\
    ctxt->input->cur += l;						\
  } while (0)
276
277/************
278    \
279    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
280    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
281 ************/
282
/* Decode the current char of the input; l receives its byte length. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
/* Same for a char read from the string s. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &(l))

/*
 * Append the char v, encoded on l bytes, to buffer b at index i and
 * advance i. Wrapped in do/while so it is a single statement, and the
 * arguments are parenthesized so the cast applies to the whole of v.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
289
290/**
291 * htmlCurrentChar:
292 * @ctxt:  the HTML parser context
293 * @len:  pointer to the length of the char read
294 *
295 * The current char value, if using UTF-8 this may actually span multiple
296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
301 * Returns the current char value and its length
302 */
303
304static int
305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306    if (ctxt->instate == XML_PARSER_EOF)
307	return(0);
308
309    if (ctxt->token != 0) {
310	*len = 0;
311	return(ctxt->token);
312    }
313    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314	/*
315	 * We are supposed to handle UTF8, check it's valid
316	 * From rfc2044: encoding of the Unicode values on UTF-8:
317	 *
318	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
319	 * 0000 0000-0000 007F   0xxxxxxx
320	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
321	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
322	 *
323	 * Check for the 0x110000 limit too
324	 */
325	const unsigned char *cur = ctxt->input->cur;
326	unsigned char c;
327	unsigned int val;
328
329	c = *cur;
330	if (c & 0x80) {
331	    if (cur[1] == 0)
332		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333	    if ((cur[1] & 0xc0) != 0x80)
334		goto encoding_error;
335	    if ((c & 0xe0) == 0xe0) {
336
337		if (cur[2] == 0)
338		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339		if ((cur[2] & 0xc0) != 0x80)
340		    goto encoding_error;
341		if ((c & 0xf0) == 0xf0) {
342		    if (cur[3] == 0)
343			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344		    if (((c & 0xf8) != 0xf0) ||
345			((cur[3] & 0xc0) != 0x80))
346			goto encoding_error;
347		    /* 4-byte code */
348		    *len = 4;
349		    val = (cur[0] & 0x7) << 18;
350		    val |= (cur[1] & 0x3f) << 12;
351		    val |= (cur[2] & 0x3f) << 6;
352		    val |= cur[3] & 0x3f;
353		} else {
354		  /* 3-byte code */
355		    *len = 3;
356		    val = (cur[0] & 0xf) << 12;
357		    val |= (cur[1] & 0x3f) << 6;
358		    val |= cur[2] & 0x3f;
359		}
360	    } else {
361	      /* 2-byte code */
362		*len = 2;
363		val = (cur[0] & 0x1f) << 6;
364		val |= cur[1] & 0x3f;
365	    }
366	    if (!IS_CHAR(val)) {
367	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368				"Char 0x%X out of allowed range\n", val);
369	    }
370	    return(val);
371	} else {
372	    /* 1-byte code */
373	    *len = 1;
374	    return((int) *ctxt->input->cur);
375	}
376    }
377    /*
378     * Assume it's a fixed length encoding (1) with
379     * a compatible encoding for the ASCII set, since
380     * XML constructs only use < 128 chars
381     */
382    *len = 1;
383    if ((int) *ctxt->input->cur < 0x80)
384	return((int) *ctxt->input->cur);
385
386    /*
387     * Humm this is bad, do an automatic flow conversion
388     */
389    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390    ctxt->charset = XML_CHAR_ENCODING_UTF8;
391    return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394    /*
395     * If we detect an UTF8 error that probably mean that the
396     * input encoding didn't get properly advertized in the
397     * declaration header. Report the error and switch the encoding
398     * to ISO-Latin-1 (if you don't like this policy, just declare the
399     * encoding !)
400     */
401    {
402        char buffer[150];
403
404	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
405			ctxt->input->cur[0], ctxt->input->cur[1],
406			ctxt->input->cur[2], ctxt->input->cur[3]);
407	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
408		     "Input is not proper UTF-8, indicate encoding !\n",
409		     BAD_CAST buffer, NULL);
410    }
411
412    ctxt->charset = XML_CHAR_ENCODING_8859_1;
413    *len = 1;
414    return((int) *ctxt->input->cur);
415}
416
417/**
418 * htmlSkipBlankChars:
419 * @ctxt:  the HTML parser context
420 *
421 * skip all blanks character found at that point in the input streams.
422 *
423 * Returns the number of space chars skipped
424 */
425
426static int
427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
428    int res = 0;
429
430    while (IS_BLANK_CH(*(ctxt->input->cur))) {
431	if ((*ctxt->input->cur == 0) &&
432	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
433		xmlPopInput(ctxt);
434	} else {
435	    if (*(ctxt->input->cur) == '\n') {
436		ctxt->input->line++; ctxt->input->col = 1;
437	    } else ctxt->input->col++;
438	    ctxt->input->cur++;
439	    ctxt->nbChars++;
440	    if (*ctxt->input->cur == 0)
441		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
442	}
443	res++;
444    }
445    return(res);
446}
447
448
449
450/************************************************************************
451 *									*
452 * 		The list of HTML elements and their properties		*
453 *									*
454 ************************************************************************/
455
456/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
459 *             2 means it's forbidden (empty elements)
460 *             3 means the tag is stylistic and should be closed easily
461 *  Depr:      this element is deprecated
462 *  DTD:       1 means that this element is valid only in the Loose DTD
463 *             2 means that this element is valid only in the Frameset DTD
464 *
465 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
466	, subElements , impliedsubelt , Attributes, userdata
467 */
468
469/* Definitions and a couple of vars for HTML Elements */
470
/*
 * Groups of element names, written as comma separated lists of string
 * literals so they can be spliced into array initializers. Each NB_xxx
 * macro is the number of names in the matching group.
 *
 * Every group spliced into another one must be followed by a comma:
 * two adjacent string literals are otherwise concatenated by the
 * compiler into a single, bogus, element name.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 15
/* PCDATA expands to nothing, hence no comma after it. */
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the block+inline and inline element names */
static const char* html_flow[] = { FLOW, NULL } ;
static const char* html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
502
503
504/* ... and for HTML Attributes */
505
/*
 * Groups of attribute names, written as comma separated lists of
 * string literals so they can be spliced into array initializers.
 * Each NB_xxx macro is the number of names in the matching group.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists built from the groups above */
static const char* html_attrs[] = { ATTRS, NULL } ;
static const char* core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* core_attrs[] = { COREATTRS, NULL } ;
static const char* i18n_attrs[] = { I18N, NULL } ;
523
524
525/* Other declarations that should go inline ... */
526static const char* a_attrs[] = { ATTRS, "charset", "type", "name",
527	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
528	"tabindex", "onfocus", "onblur", NULL } ;
529static const char* target_attr[] = { "target", NULL } ;
530static const char* rows_cols_attr[] = { "rows", "cols", NULL } ;
531static const char* alt_attr[] = { "alt", NULL } ;
532static const char* src_alt_attrs[] = { "src", "alt", NULL } ;
533static const char* href_attrs[] = { "href", NULL } ;
534static const char* clear_attrs[] = { "clear", NULL } ;
535static const char* inline_p[] = { INLINE, "p", NULL } ;
536static const char* flow_param[] = { FLOW, "param", NULL } ;
537static const char* applet_attrs[] = { COREATTRS , "codebase",
538		"archive", "alt", "name", "height", "width", "align",
539		"hspace", "vspace", NULL } ;
540static const char* area_attrs[] = { "shape", "coords", "href", "nohref",
541	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
542static const char* basefont_attrs[] =
543	{ "id", "size", "color", "face", NULL } ;
544static const char* quote_attrs[] = { ATTRS, "cite", NULL } ;
545static const char* body_contents[] = { FLOW, "ins", "del", NULL } ;
546static const char* body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
547static const char* body_depr[] = { "background", "bgcolor", "text",
548	"link", "vlink", "alink", NULL } ;
549static const char* button_attrs[] = { ATTRS, "name", "value", "type",
550	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
551
552
553static const char* col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
554static const char* col_elt[] = { "col", NULL } ;
555static const char* edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
556static const char* compact_attrs[] = { ATTRS, "compact", NULL } ;
557static const char* dl_contents[] = { "dt", "dd", NULL } ;
558static const char* compact_attr[] = { "compact", NULL } ;
559static const char* label_attr[] = { "label", NULL } ;
560static const char* fieldset_contents[] = { FLOW, "legend" } ;
561static const char* font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
562static const char* form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
563static const char* form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
564static const char* frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
565static const char* frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
566static const char* frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
567static const char* head_attrs[] = { I18N, "profile", NULL } ;
568static const char* head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
569static const char* hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
570static const char* version_attr[] = { "version", NULL } ;
571static const char* html_content[] = { "head", "body", "frameset", NULL } ;
572static const char* iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
573static const char* img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
574static const char* input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
575static const char* prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
576static const char* label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
577static const char* legend_attrs[] = { ATTRS, "accesskey", NULL } ;
578static const char* align_attr[] = { "align", NULL } ;
579static const char* link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
580static const char* map_contents[] = { BLOCK, "area", NULL } ;
581static const char* name_attr[] = { "name", NULL } ;
582static const char* action_attr[] = { "action", NULL } ;
583static const char* blockli_elt[] = { BLOCK, "li", NULL } ;
584static const char* meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
585static const char* content_attr[] = { "content", NULL } ;
586static const char* type_attr[] = { "type", NULL } ;
587static const char* noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
588static const char* object_contents[] = { FLOW, "param", NULL } ;
589static const char* object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
590static const char* object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
591static const char* ol_attrs[] = { "type", "compact", "start", NULL} ;
592static const char* option_elt[] = { "option", NULL } ;
593static const char* optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
594static const char* option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
595static const char* param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
596static const char* width_attr[] = { "width", NULL } ;
597static const char* pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
598static const char* script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
599static const char* language_attr[] = { "language", NULL } ;
600static const char* select_content[] = { "optgroup", "option", NULL } ;
601static const char* select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
602static const char* style_attrs[] = { I18N, "media", "title", NULL } ;
603static const char* table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
604static const char* table_depr[] = { "align", "bgcolor", NULL } ;
605static const char* table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
606static const char* tr_elt[] = { "tr", NULL } ;
607static const char* talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
608static const char* th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
609static const char* th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
610static const char* textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
611static const char* tr_contents[] = { "th", "td", NULL } ;
612static const char* bgcolor_attr[] = { "bgcolor", NULL } ;
613static const char* li_elt[] = { "li", NULL } ;
614static const char* ul_depr[] = { "type", "compact", NULL} ;
615static const char* dir_attr[] = { "dir", NULL} ;
616
617#define DECL (const char**)
618
619static const htmlElemDesc
620html40ElementTable[] = {
621{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
622	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
623},
624{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
625	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
626},
627{ "acronym",	0, 0, 0, 0, 0, 0, 1, "",
628	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
629},
630{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
631	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
632},
633{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
634	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
635},
636{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
637	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
638},
639{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
640	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
641},
642{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
643	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
644},
645{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
646	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
647},
648{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
649	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
650},
651{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
652	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
653},
654{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
655	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
656},
657{ "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
658	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
659},
660{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
661	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
662},
663{ "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
664	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
665},
666{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
667	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
668},
669{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
670	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
671},
672{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
673	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674},
675{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
676	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
677},
678{ "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
679	EMPTY , NULL , DECL col_attrs , NULL, NULL
680},
681{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
682	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
683},
684{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
685	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
686},
687{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
688	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
689},
690{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
691	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
692},
693{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
694	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
695},
696{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
697	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
698},
699{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
700	DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
701},
702{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
703	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
704},
705{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
706	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
707},
708{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
709	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
710},
711{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
712	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
713},
714{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
715	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
716},
717{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
718	EMPTY, NULL, NULL, DECL frame_attrs, NULL
719},
720{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
721	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
722},
723{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
724	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
725},
726{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
727	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
728},
729{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
730	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
731},
732{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
733	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734},
735{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
736	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737},
738{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
739	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740},
741{ "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
742	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
743},
744{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
745	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
746},
747{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
748	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
749},
750{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
751	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
752},
753{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
754	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
755},
756{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
757	EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
758},
759{ "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
760	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
761},
762{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
763	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
764},
765{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
766	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
767},
768{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
769	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
770},
771{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
772	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
773},
774{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
775	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
776},
777{ "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
778	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
779},
780{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
781	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
782},
783{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
784	DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
785},
786{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
787	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
788},
789{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
790	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
791},
792{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
793	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
794},
795{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
796	DECL html_flow, "div", DECL html_attrs, NULL, NULL
797},
798{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
799	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
800},
801{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
802	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
803},
804{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
805	option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
806},
807{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
808	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
809},
810{ "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
811	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
812},
813{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
814	EMPTY, NULL, DECL param_attrs, NULL, name_attr
815},
816{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
817	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
818},
819{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
820	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
821},
822{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
823	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
824},
825{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
826	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
827},
828{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
829	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
830},
831{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
832	DECL select_content, NULL, DECL select_attrs, NULL, NULL
833},
834{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
835	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836},
837{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
838	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
839},
840{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
841	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
842},
843{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
844	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845},
846{ "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
847	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
848},
849{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
850	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
851},
852{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
853	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "table",	0, 0, 0, 0, 0, 0, 0, "",
856	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
857},
858{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
859	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
860},
861{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
862	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
863},
864{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
865	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
866},
867{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
868	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869},
870{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
871	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872},
873{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
874	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
875},
876{ "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
877	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
878},
879{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
880	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
881},
882{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
883	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
884},
885{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
886	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
887},
888{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
889	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
890},
891{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
892	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893}
894};
895
/*
 * Start tags that imply the end of the currently open element.
 *
 * The table is a flat sequence of NULL-terminated groups. The first string
 * of a group is the start tag being opened; the strings that follow it are
 * the open elements which that start tag closes implicitly. A NULL found
 * where a group head is expected marks the end of the table.
 */
static const char *htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "frameset",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    NULL
};
956
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * carry CDATA content directly and inside which a p element gets implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *htmlNoContentElements[] = { "html", "head", NULL };
969
/*
 * The list of HTML attributes which are of content %Script; (the HTML 4
 * intrinsic event attributes).
 * The array has no terminator: it is walked using its sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
995
/*
 * Priority table used by the HTML parser to cope with broken pages.
 * Each element gets a priority; when an unexpected end tag shows up the
 * parser compares priorities to decide what to do with it: an end tag may
 * only close open elements whose priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;	/* element name, NULL in the terminating entry */
    int priority;	/* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },

    /* table cells, rows, row groups and the table itself */
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },

    /* document structure */
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },

    /* terminator, its priority is the default for unlisted elements */
    { NULL,    100 }
};
1023
/* One slot per group of htmlStartClose, filled in by htmlInitAutoClose() */
static const char **htmlStartCloseIndex[100];
/* Set to 1 once htmlStartCloseIndex has been built */
static int htmlStartCloseIndexinitialized = 0;
1026
1027/************************************************************************
1028 *									*
1029 * 		functions to handle HTML specific data			*
1030 *									*
1031 ************************************************************************/
1032
1033/**
1034 * htmlInitAutoClose:
1035 *
1036 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1037 * This is not reentrant. Call xmlInitParser() once before processing in
1038 * case of use in multithreaded programs.
1039 */
1040void
1041htmlInitAutoClose(void) {
1042    int indx, i = 0;
1043
1044    if (htmlStartCloseIndexinitialized) return;
1045
1046    for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1047    indx = 0;
1048    while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1049        htmlStartCloseIndex[indx++] = &htmlStartClose[i];
1050	while (htmlStartClose[i] != NULL) i++;
1051	i++;
1052    }
1053    htmlStartCloseIndexinitialized = 1;
1054}
1055
1056/**
1057 * htmlTagLookup:
1058 * @tag:  The tag name in lowercase
1059 *
1060 * Lookup the HTML tag in the ElementTable
1061 *
1062 * Returns the related htmlElemDescPtr or NULL if not found.
1063 */
1064const htmlElemDesc *
1065htmlTagLookup(const xmlChar *tag) {
1066    unsigned int i;
1067
1068    for (i = 0; i < (sizeof(html40ElementTable) /
1069                     sizeof(html40ElementTable[0]));i++) {
1070        if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1071	    return((htmlElemDescPtr) &html40ElementTable[i]);
1072    }
1073    return(NULL);
1074}
1075
1076/**
1077 * htmlGetEndPriority:
1078 * @name: The name of the element to look up the priority for.
1079 *
1080 * Return value: The "endtag" priority.
1081 **/
1082static int
1083htmlGetEndPriority (const xmlChar *name) {
1084    int i = 0;
1085
1086    while ((htmlEndPriority[i].name != NULL) &&
1087	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1088	i++;
1089
1090    return(htmlEndPriority[i].priority);
1091}
1092
1093
1094/**
1095 * htmlCheckAutoClose:
1096 * @newtag:  The new tag name
1097 * @oldtag:  The old tag name
1098 *
1099 * Checks whether the new tag is one of the registered valid tags for
1100 * closing old.
1101 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1102 *
1103 * Returns 0 if no, 1 if yes.
1104 */
1105static int
1106htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1107{
1108    int i, indx;
1109    const char **closed = NULL;
1110
1111    if (htmlStartCloseIndexinitialized == 0)
1112        htmlInitAutoClose();
1113
1114    /* inefficient, but not a big deal */
1115    for (indx = 0; indx < 100; indx++) {
1116        closed = htmlStartCloseIndex[indx];
1117        if (closed == NULL)
1118            return (0);
1119        if (xmlStrEqual(BAD_CAST * closed, newtag))
1120            break;
1121    }
1122
1123    i = closed - htmlStartClose;
1124    i++;
1125    while (htmlStartClose[i] != NULL) {
1126        if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1127            return (1);
1128        }
1129        i++;
1130    }
1131    return (0);
1132}
1133
1134/**
1135 * htmlAutoCloseOnClose:
1136 * @ctxt:  an HTML parser context
1137 * @newtag:  The new tag name
1138 * @force:  force the tag closure
1139 *
1140 * The HTML DTD allows an ending tag to implicitly close other tags.
1141 */
1142static void
1143htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1144{
1145    const htmlElemDesc *info;
1146    int i, priority;
1147
1148    priority = htmlGetEndPriority(newtag);
1149
1150    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1151
1152        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1153            break;
1154        /*
1155         * A missplaced endtag can only close elements with lower
1156         * or equal priority, so if we find an element with higher
1157         * priority before we find an element with
1158         * matching name, we just ignore this endtag
1159         */
1160        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1161            return;
1162    }
1163    if (i < 0)
1164        return;
1165
1166    while (!xmlStrEqual(newtag, ctxt->name)) {
1167        info = htmlTagLookup(ctxt->name);
1168        if ((info != NULL) && (info->endTag == 3)) {
1169            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1170	                 "Opening and ending tag mismatch: %s and %s\n",
1171			 newtag, ctxt->name);
1172        }
1173        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1174            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1175	htmlnamePop(ctxt);
1176    }
1177}
1178
1179/**
1180 * htmlAutoCloseOnEnd:
1181 * @ctxt:  an HTML parser context
1182 *
1183 * Close all remaining tags at the end of the stream
1184 */
1185static void
1186htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1187{
1188    int i;
1189
1190    if (ctxt->nameNr == 0)
1191        return;
1192    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1193        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1194            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1195	htmlnamePop(ctxt);
1196    }
1197}
1198
1199/**
1200 * htmlAutoClose:
1201 * @ctxt:  an HTML parser context
1202 * @newtag:  The new tag name or NULL
1203 *
1204 * The HTML DTD allows a tag to implicitly close other tags.
1205 * The list is kept in htmlStartClose array. This function is
1206 * called when a new tag has been detected and generates the
1207 * appropriates closes if possible/needed.
1208 * If newtag is NULL this mean we are at the end of the resource
1209 * and we should check
1210 */
1211static void
1212htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1213{
1214    while ((newtag != NULL) && (ctxt->name != NULL) &&
1215           (htmlCheckAutoClose(newtag, ctxt->name))) {
1216        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1217            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1218	htmlnamePop(ctxt);
1219    }
1220    if (newtag == NULL) {
1221        htmlAutoCloseOnEnd(ctxt);
1222        return;
1223    }
1224    while ((newtag == NULL) && (ctxt->name != NULL) &&
1225           ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1226            (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1227            (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1228        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1229            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1230	htmlnamePop(ctxt);
1231    }
1232}
1233
1234/**
1235 * htmlAutoCloseTag:
1236 * @doc:  the HTML document
1237 * @name:  The tag name
1238 * @elem:  the HTML element
1239 *
1240 * The HTML DTD allows a tag to implicitly close other tags.
1241 * The list is kept in htmlStartClose array. This function checks
1242 * if the element or one of it's children would autoclose the
1243 * given tag.
1244 *
1245 * Returns 1 if autoclose, 0 otherwise
1246 */
1247int
1248htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1249    htmlNodePtr child;
1250
1251    if (elem == NULL) return(1);
1252    if (xmlStrEqual(name, elem->name)) return(0);
1253    if (htmlCheckAutoClose(elem->name, name)) return(1);
1254    child = elem->children;
1255    while (child != NULL) {
1256        if (htmlAutoCloseTag(doc, name, child)) return(1);
1257	child = child->next;
1258    }
1259    return(0);
1260}
1261
1262/**
1263 * htmlIsAutoClosed:
1264 * @doc:  the HTML document
1265 * @elem:  the HTML element
1266 *
1267 * The HTML DTD allows a tag to implicitly close other tags.
1268 * The list is kept in htmlStartClose array. This function checks
1269 * if a tag is autoclosed by one of it's child
1270 *
1271 * Returns 1 if autoclosed, 0 otherwise
1272 */
1273int
1274htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1275    htmlNodePtr child;
1276
1277    if (elem == NULL) return(1);
1278    child = elem->children;
1279    while (child != NULL) {
1280	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1281	child = child->next;
1282    }
1283    return(0);
1284}
1285
1286/**
1287 * htmlCheckImplied:
1288 * @ctxt:  an HTML parser context
1289 * @newtag:  The new tag name
1290 *
1291 * The HTML DTD allows a tag to exists only implicitly
1292 * called when a new tag has been detected and generates the
1293 * appropriates implicit tags if missing
1294 */
1295static void
1296htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1297    if (!htmlOmittedDefaultValue)
1298	return;
1299    if (xmlStrEqual(newtag, BAD_CAST"html"))
1300	return;
1301    if (ctxt->nameNr <= 0) {
1302	htmlnamePush(ctxt, BAD_CAST"html");
1303	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1304	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1305    }
1306    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1307        return;
1308    if ((ctxt->nameNr <= 1) &&
1309        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1310	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1311	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1312	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1313	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1314	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1315	    /*
1316	     * dropped OBJECT ... i you put it first BODY will be
1317	     * assumed !
1318	     */
1319	    htmlnamePush(ctxt, BAD_CAST"head");
1320	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1321		ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1322    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1323	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1324	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1325	int i;
1326	for (i = 0;i < ctxt->nameNr;i++) {
1327	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1328		return;
1329	    }
1330	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1331		return;
1332	    }
1333	}
1334
1335	htmlnamePush(ctxt, BAD_CAST"body");
1336	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1337	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1338    }
1339}
1340
1341/**
1342 * htmlCheckParagraph
1343 * @ctxt:  an HTML parser context
1344 *
1345 * Check whether a p element need to be implied before inserting
1346 * characters in the current element.
1347 *
1348 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1349 *         in case of error.
1350 */
1351
1352static int
1353htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1354    const xmlChar *tag;
1355    int i;
1356
1357    if (ctxt == NULL)
1358	return(-1);
1359    tag = ctxt->name;
1360    if (tag == NULL) {
1361	htmlAutoClose(ctxt, BAD_CAST"p");
1362	htmlCheckImplied(ctxt, BAD_CAST"p");
1363	htmlnamePush(ctxt, BAD_CAST"p");
1364	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1365	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1366	return(1);
1367    }
1368    if (!htmlOmittedDefaultValue)
1369	return(0);
1370    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1371	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1372	    htmlAutoClose(ctxt, BAD_CAST"p");
1373	    htmlCheckImplied(ctxt, BAD_CAST"p");
1374	    htmlnamePush(ctxt, BAD_CAST"p");
1375	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1376		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1377	    return(1);
1378	}
1379    }
1380    return(0);
1381}
1382
1383/**
1384 * htmlIsScriptAttribute:
1385 * @name:  an attribute name
1386 *
1387 * Check if an attribute is of content type Script
1388 *
1389 * Returns 1 is the attribute is a script 0 otherwise
1390 */
1391int
1392htmlIsScriptAttribute(const xmlChar *name) {
1393    unsigned int i;
1394
1395    if (name == NULL)
1396       	return(0);
1397    /*
1398     * all script attributes start with 'on'
1399     */
1400    if ((name[0] != 'o') || (name[1] != 'n'))
1401       	return(0);
1402    for (i = 0;
1403	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1404	 i++) {
1405	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1406	    return(1);
1407    }
1408    return(0);
1409}
1410
1411/************************************************************************
1412 *									*
1413 * 		The list of HTML predefined entities			*
1414 *									*
1415 ************************************************************************/
1416
1417
1418static const htmlEntityDesc  html40EntitiesTable[] = {
1419/*
1420 * the 4 absolute ones, plus apostrophe.
1421 */
1422{ 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1423{ 38,	"amp",	"ampersand, U+0026 ISOnum" },
1424{ 39,	"apos",	"single quote" },
1425{ 60,	"lt",	"less-than sign, U+003C ISOnum" },
1426{ 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1427
1428/*
1429 * A bunch still in the 128-255 range
1430 * Replacing them depend really on the charset used.
1431 */
1432{ 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1433{ 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1434{ 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1435{ 163,	"pound","pound sign, U+00A3 ISOnum" },
1436{ 164,	"curren","currency sign, U+00A4 ISOnum" },
1437{ 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1438{ 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1439{ 167,	"sect",	"section sign, U+00A7 ISOnum" },
1440{ 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1441{ 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1442{ 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1443{ 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1444{ 172,	"not",	"not sign, U+00AC ISOnum" },
1445{ 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1446{ 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1447{ 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1448{ 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1449{ 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1450{ 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1451{ 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1452{ 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1453{ 181,	"micro","micro sign, U+00B5 ISOnum" },
1454{ 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1455{ 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1456{ 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1457{ 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1458{ 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1459{ 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1460{ 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1461{ 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1462{ 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1463{ 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1464{ 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1465{ 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1466{ 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1467{ 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1468{ 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1469{ 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1470{ 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1471{ 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1472{ 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1473{ 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1474{ 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1475{ 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1476{ 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1477{ 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1478{ 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1479{ 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1480{ 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1481{ 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1482{ 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1483{ 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1484{ 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1485{ 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1486{ 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1487{ 215,	"times","multiplication sign, U+00D7 ISOnum" },
1488{ 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1489{ 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1490{ 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1491{ 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1492{ 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1493{ 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1494{ 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1495{ 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1496{ 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1497{ 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1498{ 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1499{ 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1500{ 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1501{ 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1502{ 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1503{ 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1504{ 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1505{ 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1506{ 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1507{ 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1508{ 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1509{ 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1510{ 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1511{ 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1512{ 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1513{ 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1514{ 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1515{ 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1516{ 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1517{ 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1518{ 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1519{ 247,	"divide","division sign, U+00F7 ISOnum" },
1520{ 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1521{ 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1522{ 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1523{ 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1524{ 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1525{ 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1526{ 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1527{ 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1528
1529{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1530{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1531{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1532{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1533{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1534
1535/*
1536 * Anything below should really be kept as entities references
1537 */
1538{ 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1539
1540{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1541{ 732,	"tilde","small tilde, U+02DC ISOdia" },
1542
1543{ 913,	"Alpha","greek capital letter alpha, U+0391" },
1544{ 914,	"Beta",	"greek capital letter beta, U+0392" },
1545{ 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1546{ 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1547{ 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1548{ 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1549{ 919,	"Eta",	"greek capital letter eta, U+0397" },
1550{ 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1551{ 921,	"Iota",	"greek capital letter iota, U+0399" },
1552{ 922,	"Kappa","greek capital letter kappa, U+039A" },
1553{ 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1554{ 924,	"Mu",	"greek capital letter mu, U+039C" },
1555{ 925,	"Nu",	"greek capital letter nu, U+039D" },
1556{ 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1557{ 927,	"Omicron","greek capital letter omicron, U+039F" },
1558{ 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1559{ 929,	"Rho",	"greek capital letter rho, U+03A1" },
1560{ 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1561{ 932,	"Tau",	"greek capital letter tau, U+03A4" },
1562{ 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1563{ 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1564{ 935,	"Chi",	"greek capital letter chi, U+03A7" },
1565{ 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1566{ 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1567
1568{ 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1569{ 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1570{ 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1571{ 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1572{ 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1573{ 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1574{ 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1575{ 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1576{ 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1577{ 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1578{ 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1579{ 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1580{ 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1581{ 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1582{ 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1583{ 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1584{ 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1585{ 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1586{ 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1587{ 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1588{ 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1589{ 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1590{ 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1591{ 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1592{ 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1593{ 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1594{ 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1595{ 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1596
1597{ 8194,	"ensp",	"en space, U+2002 ISOpub" },
1598{ 8195,	"emsp",	"em space, U+2003 ISOpub" },
1599{ 8201,	"thinsp","thin space, U+2009 ISOpub" },
1600{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1601{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1602{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1603{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1604{ 8211,	"ndash","en dash, U+2013 ISOpub" },
1605{ 8212,	"mdash","em dash, U+2014 ISOpub" },
1606{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1607{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1608{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1609{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1610{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1611{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1612{ 8224,	"dagger","dagger, U+2020 ISOpub" },
1613{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1614
1615{ 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1616{ 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1617
1618{ 8240,	"permil","per mille sign, U+2030 ISOtech" },
1619
1620{ 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1621{ 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1622
1623{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1624{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1625
1626{ 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1627{ 8260,	"frasl","fraction slash, U+2044 NEW" },
1628
1629{ 8364,	"euro",	"euro sign, U+20AC NEW" },
1630
1631{ 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1632{ 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1633{ 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1634{ 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1635{ 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1636{ 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1637{ 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1638{ 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1639{ 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1640{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1641{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1642{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1643{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1644{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1645{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1646{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1647
1648{ 8704,	"forall","for all, U+2200 ISOtech" },
1649{ 8706,	"part",	"partial differential, U+2202 ISOtech" },
1650{ 8707,	"exist","there exists, U+2203 ISOtech" },
1651{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1652{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1653{ 8712,	"isin",	"element of, U+2208 ISOtech" },
1654{ 8713,	"notin","not an element of, U+2209 ISOtech" },
1655{ 8715,	"ni",	"contains as member, U+220B ISOtech" },
1656{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1657{ 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1658{ 8722,	"minus","minus sign, U+2212 ISOtech" },
1659{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1660{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1661{ 8733,	"prop",	"proportional to, U+221D ISOtech" },
1662{ 8734,	"infin","infinity, U+221E ISOtech" },
1663{ 8736,	"ang",	"angle, U+2220 ISOamso" },
1664{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1665{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1666{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1667{ 8746,	"cup",	"union = cup, U+222A ISOtech" },
1668{ 8747,	"int",	"integral, U+222B ISOtech" },
1669{ 8756,	"there4","therefore, U+2234 ISOtech" },
1670{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1671{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1672{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1673{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1674{ 8801,	"equiv","identical to, U+2261 ISOtech" },
1675{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1676{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
1677{ 8834,	"sub",	"subset of, U+2282 ISOtech" },
1678{ 8835,	"sup",	"superset of, U+2283 ISOtech" },
1679{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
1680{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
1681{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
1682{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
1683{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
1684{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1685{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
1686{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1687{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
1688{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
1689{ 8971,	"rfloor","right floor, U+230B ISOamsc" },
1690{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
1691{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
1692{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },
1693
1694{ 9824,	"spades","black spade suit, U+2660 ISOpub" },
1695{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
1696{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
1697{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },
1698
1699};
1700
1701/************************************************************************
1702 *									*
1703 *		Commodity functions to handle entities			*
1704 *									*
1705 ************************************************************************/
1706
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size with
 * xmlRealloc(). When the reallocation fails, the error is reported through
 * htmlErrMemory(), the old buffer is released with xmlFree() and the
 * ENCLOSING function returns NULL (the macro contains a return statement).
 *
 * The expansion expects `ctxt` and `buffer##_size` to be visible where the
 * macro is used. Since @buffer is reassigned, any pointer into the old
 * storage has to be recomputed by the caller afterwards.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {						\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1721
1722/**
1723 * htmlEntityLookup:
1724 * @name: the entity name
1725 *
1726 * Lookup the given entity in EntitiesTable
1727 *
1728 * TODO: the linear scan is really ugly, an hash table is really needed.
1729 *
1730 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1731 */
1732const htmlEntityDesc *
1733htmlEntityLookup(const xmlChar *name) {
1734    unsigned int i;
1735
1736    for (i = 0;i < (sizeof(html40EntitiesTable)/
1737                    sizeof(html40EntitiesTable[0]));i++) {
1738        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1739            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1740	}
1741    }
1742    return(NULL);
1743}
1744
1745/**
1746 * htmlEntityValueLookup:
1747 * @value: the entity's unicode value
1748 *
1749 * Lookup the given entity in EntitiesTable
1750 *
1751 * TODO: the linear scan is really ugly, an hash table is really needed.
1752 *
1753 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1754 */
1755const htmlEntityDesc *
1756htmlEntityValueLookup(unsigned int value) {
1757    unsigned int i;
1758
1759    for (i = 0;i < (sizeof(html40EntitiesTable)/
1760                    sizeof(html40EntitiesTable[0]));i++) {
1761        if (html40EntitiesTable[i].value >= value) {
1762	    if (html40EntitiesTable[i].value > value)
1763		break;
1764            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1765	}
1766    }
1767    return(NULL);
1768}
1769
1770/**
1771 * UTF8ToHtml:
1772 * @out:  a pointer to an array of bytes to store the result
1773 * @outlen:  the length of @out
1774 * @in:  a pointer to an array of UTF-8 chars
1775 * @inlen:  the length of @in
1776 *
1777 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1778 * plus HTML entities block of chars out.
1779 *
1780 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1781 * The value of @inlen after return is the number of octets consumed
1782 *     as the return value is positive, else unpredictable.
1783 * The value of @outlen after return is the number of octets consumed.
1784 */
1785int
1786UTF8ToHtml(unsigned char* out, int *outlen,
1787              const unsigned char* in, int *inlen) {
1788    const unsigned char* processed = in;
1789    const unsigned char* outend;
1790    const unsigned char* outstart = out;
1791    const unsigned char* instart = in;
1792    const unsigned char* inend;
1793    unsigned int c, d;
1794    int trailing;
1795
1796    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1797    if (in == NULL) {
1798        /*
1799	 * initialization nothing to do
1800	 */
1801	*outlen = 0;
1802	*inlen = 0;
1803	return(0);
1804    }
1805    inend = in + (*inlen);
1806    outend = out + (*outlen);
1807    while (in < inend) {
1808	d = *in++;
1809	if      (d < 0x80)  { c= d; trailing= 0; }
1810	else if (d < 0xC0) {
1811	    /* trailing byte in leading position */
1812	    *outlen = out - outstart;
1813	    *inlen = processed - instart;
1814	    return(-2);
1815        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1816        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1817        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1818	else {
1819	    /* no chance for this in Ascii */
1820	    *outlen = out - outstart;
1821	    *inlen = processed - instart;
1822	    return(-2);
1823	}
1824
1825	if (inend - in < trailing) {
1826	    break;
1827	}
1828
1829	for ( ; trailing; trailing--) {
1830	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1831		break;
1832	    c <<= 6;
1833	    c |= d & 0x3F;
1834	}
1835
1836	/* assertion: c is a single UTF-4 value */
1837	if (c < 0x80) {
1838	    if (out + 1 >= outend)
1839		break;
1840	    *out++ = c;
1841	} else {
1842	    int len;
1843	    const htmlEntityDesc * ent;
1844
1845	    /*
1846	     * Try to lookup a predefined HTML entity for it
1847	     */
1848
1849	    ent = htmlEntityValueLookup(c);
1850	    if (ent == NULL) {
1851		/* no chance for this in Ascii */
1852		*outlen = out - outstart;
1853		*inlen = processed - instart;
1854		return(-2);
1855	    }
1856	    len = strlen(ent->name);
1857	    if (out + 2 + len >= outend)
1858		break;
1859	    *out++ = '&';
1860	    memcpy(out, ent->name, len);
1861	    out += len;
1862	    *out++ = ';';
1863	}
1864	processed = in;
1865    }
1866    *outlen = out - outstart;
1867    *inlen = processed - instart;
1868    return(0);
1869}
1870
1871/**
1872 * htmlEncodeEntities:
1873 * @out:  a pointer to an array of bytes to store the result
1874 * @outlen:  the length of @out
1875 * @in:  a pointer to an array of UTF-8 chars
1876 * @inlen:  the length of @in
1877 * @quoteChar: the quote character to escape (' or ") or zero.
1878 *
1879 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1880 * plus HTML entities block of chars out.
1881 *
1882 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1883 * The value of @inlen after return is the number of octets consumed
1884 *     as the return value is positive, else unpredictable.
1885 * The value of @outlen after return is the number of octets consumed.
1886 */
1887int
1888htmlEncodeEntities(unsigned char* out, int *outlen,
1889		   const unsigned char* in, int *inlen, int quoteChar) {
1890    const unsigned char* processed = in;
1891    const unsigned char* outend;
1892    const unsigned char* outstart = out;
1893    const unsigned char* instart = in;
1894    const unsigned char* inend;
1895    unsigned int c, d;
1896    int trailing;
1897
1898    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1899        return(-1);
1900    outend = out + (*outlen);
1901    inend = in + (*inlen);
1902    while (in < inend) {
1903	d = *in++;
1904	if      (d < 0x80)  { c= d; trailing= 0; }
1905	else if (d < 0xC0) {
1906	    /* trailing byte in leading position */
1907	    *outlen = out - outstart;
1908	    *inlen = processed - instart;
1909	    return(-2);
1910        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1911        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1912        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1913	else {
1914	    /* no chance for this in Ascii */
1915	    *outlen = out - outstart;
1916	    *inlen = processed - instart;
1917	    return(-2);
1918	}
1919
1920	if (inend - in < trailing)
1921	    break;
1922
1923	while (trailing--) {
1924	    if (((d= *in++) & 0xC0) != 0x80) {
1925		*outlen = out - outstart;
1926		*inlen = processed - instart;
1927		return(-2);
1928	    }
1929	    c <<= 6;
1930	    c |= d & 0x3F;
1931	}
1932
1933	/* assertion: c is a single UTF-4 value */
1934	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1935	    (c != '&') && (c != '<') && (c != '>')) {
1936	    if (out >= outend)
1937		break;
1938	    *out++ = c;
1939	} else {
1940	    const htmlEntityDesc * ent;
1941	    const char *cp;
1942	    char nbuf[16];
1943	    int len;
1944
1945	    /*
1946	     * Try to lookup a predefined HTML entity for it
1947	     */
1948	    ent = htmlEntityValueLookup(c);
1949	    if (ent == NULL) {
1950		snprintf(nbuf, sizeof(nbuf), "#%u", c);
1951		cp = nbuf;
1952	    }
1953	    else
1954		cp = ent->name;
1955	    len = strlen(cp);
1956	    if (out + 2 + len > outend)
1957		break;
1958	    *out++ = '&';
1959	    memcpy(out, cp, len);
1960	    out += len;
1961	    *out++ = ';';
1962	}
1963	processed = in;
1964    }
1965    *outlen = out - outstart;
1966    *inlen = processed - instart;
1967    return(0);
1968}
1969
1970/************************************************************************
1971 *									*
1972 *		Commodity functions to handle streams			*
1973 *									*
1974 ************************************************************************/
1975
1976/**
1977 * htmlNewInputStream:
1978 * @ctxt:  an HTML parser context
1979 *
1980 * Create a new input stream structure
1981 * Returns the new input stream or NULL
1982 */
1983static htmlParserInputPtr
1984htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1985    htmlParserInputPtr input;
1986
1987    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1988    if (input == NULL) {
1989        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
1990	return(NULL);
1991    }
1992    memset(input, 0, sizeof(htmlParserInput));
1993    input->filename = NULL;
1994    input->directory = NULL;
1995    input->base = NULL;
1996    input->cur = NULL;
1997    input->buf = NULL;
1998    input->line = 1;
1999    input->col = 1;
2000    input->buf = NULL;
2001    input->free = NULL;
2002    input->version = NULL;
2003    input->consumed = 0;
2004    input->length = 0;
2005    return(input);
2006}
2007
2008
2009/************************************************************************
2010 *									*
2011 *		Commodity functions, cleanup needed ?			*
2012 *									*
2013 ************************************************************************/
/*
 * All tags allowing PCDATA from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility.
 *
 * areBlanks() compares element names against this list to decide
 * whether a run of whitespace has to be kept.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2028
2029/**
2030 * areBlanks:
2031 * @ctxt:  an HTML parser context
2032 * @str:  a xmlChar *
2033 * @len:  the size of @str
2034 *
2035 * Is this a sequence of blank chars that one can ignore ?
2036 *
2037 * Returns 1 if ignorable 0 otherwise.
2038 */
2039
2040static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2041    unsigned int i;
2042    int j;
2043    xmlNodePtr lastChild;
2044    xmlDtdPtr dtd;
2045
2046    for (j = 0;j < len;j++)
2047        if (!(IS_BLANK_CH(str[j]))) return(0);
2048
2049    if (CUR == 0) return(1);
2050    if (CUR != '<') return(0);
2051    if (ctxt->name == NULL)
2052	return(1);
2053    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2054	return(1);
2055    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2056	return(1);
2057
2058    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2059    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2060        dtd = xmlGetIntSubset(ctxt->myDoc);
2061        if (dtd != NULL && dtd->ExternalID != NULL) {
2062            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2063                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2064                return(1);
2065        }
2066    }
2067
2068    if (ctxt->node == NULL) return(0);
2069    lastChild = xmlGetLastChild(ctxt->node);
2070    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2071	lastChild = lastChild->prev;
2072    if (lastChild == NULL) {
2073        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2074            (ctxt->node->content != NULL)) return(0);
2075	/* keep ws in constructs like ...<b> </b>...
2076	   for all tags "b" allowing PCDATA */
2077	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2078	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2079		return(0);
2080	    }
2081	}
2082    } else if (xmlNodeIsText(lastChild)) {
2083        return(0);
2084    } else {
2085	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2086	   for all tags "p" allowing PCDATA */
2087	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2088	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2089		return(0);
2090	    }
2091	}
2092    }
2093    return(1);
2094}
2095
2096/**
2097 * htmlNewDocNoDtD:
2098 * @URI:  URI for the dtd, or NULL
2099 * @ExternalID:  the external ID of the DTD, or NULL
2100 *
2101 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2102 * are NULL
2103 *
2104 * Returns a new document, do not initialize the DTD if not provided
2105 */
2106htmlDocPtr
2107htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2108    xmlDocPtr cur;
2109
2110    /*
2111     * Allocate a new document and fill the fields.
2112     */
2113    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2114    if (cur == NULL) {
2115	htmlErrMemory(NULL, "HTML document creation failed\n");
2116	return(NULL);
2117    }
2118    memset(cur, 0, sizeof(xmlDoc));
2119
2120    cur->type = XML_HTML_DOCUMENT_NODE;
2121    cur->version = NULL;
2122    cur->intSubset = NULL;
2123    cur->doc = cur;
2124    cur->name = NULL;
2125    cur->children = NULL;
2126    cur->extSubset = NULL;
2127    cur->oldNs = NULL;
2128    cur->encoding = NULL;
2129    cur->standalone = 1;
2130    cur->compression = 0;
2131    cur->ids = NULL;
2132    cur->refs = NULL;
2133    cur->_private = NULL;
2134    cur->charset = XML_CHAR_ENCODING_UTF8;
2135    if ((ExternalID != NULL) ||
2136	(URI != NULL))
2137	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2138    return(cur);
2139}
2140
2141/**
2142 * htmlNewDoc:
2143 * @URI:  URI for the dtd, or NULL
2144 * @ExternalID:  the external ID of the DTD, or NULL
2145 *
2146 * Creates a new HTML document
2147 *
2148 * Returns a new document
2149 */
2150htmlDocPtr
2151htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2152    if ((URI == NULL) && (ExternalID == NULL))
2153	return(htmlNewDocNoDtD(
2154		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2155		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2156
2157    return(htmlNewDocNoDtD(URI, ExternalID));
2158}
2159
2160
2161/************************************************************************
2162 *									*
2163 *			The parser itself				*
2164 *	Relates to http://www.w3.org/TR/html40				*
2165 *									*
2166 ************************************************************************/
2167
2168/************************************************************************
2169 *									*
2170 *			The parser itself				*
2171 *									*
2172 ************************************************************************/
2173
/*
 * Forward declaration: htmlParseName() falls back to this routine, which
 * is only defined after it in the file.
 */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2175
2176/**
2177 * htmlParseHTMLName:
2178 * @ctxt:  an HTML parser context
2179 *
2180 * parse an HTML tag or attribute name, note that we convert it to lowercase
2181 * since HTML names are not case-sensitive.
2182 *
2183 * Returns the Tag Name parsed or NULL
2184 */
2185
2186static const xmlChar *
2187htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2188    int i = 0;
2189    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2190
2191    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2192        (CUR != ':')) return(NULL);
2193
2194    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2195           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2196	   (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2197	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2198        else loc[i] = CUR;
2199	i++;
2200
2201	NEXT;
2202    }
2203
2204    return(xmlDictLookup(ctxt->dict, loc, i));
2205}
2206
2207/**
2208 * htmlParseName:
2209 * @ctxt:  an HTML parser context
2210 *
2211 * parse an HTML name, this routine is case sensitive.
2212 *
2213 * Returns the Name parsed or NULL
2214 */
2215
2216static const xmlChar *
2217htmlParseName(htmlParserCtxtPtr ctxt) {
2218    const xmlChar *in;
2219    const xmlChar *ret;
2220    int count = 0;
2221
2222    GROW;
2223
2224    /*
2225     * Accelerator for simple ASCII names
2226     */
2227    in = ctxt->input->cur;
2228    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2229	((*in >= 0x41) && (*in <= 0x5A)) ||
2230	(*in == '_') || (*in == ':')) {
2231	in++;
2232	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2233	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2234	       ((*in >= 0x30) && (*in <= 0x39)) ||
2235	       (*in == '_') || (*in == '-') ||
2236	       (*in == ':') || (*in == '.'))
2237	    in++;
2238	if ((*in > 0) && (*in < 0x80)) {
2239	    count = in - ctxt->input->cur;
2240	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2241	    ctxt->input->cur = in;
2242	    ctxt->nbChars += count;
2243	    ctxt->input->col += count;
2244	    return(ret);
2245	}
2246    }
2247    return(htmlParseNameComplex(ctxt));
2248}
2249
2250static const xmlChar *
2251htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2252    int len = 0, l;
2253    int c;
2254    int count = 0;
2255
2256    /*
2257     * Handler for more complex cases
2258     */
2259    GROW;
2260    c = CUR_CHAR(l);
2261    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2262	(!IS_LETTER(c) && (c != '_') &&
2263         (c != ':'))) {
2264	return(NULL);
2265    }
2266
2267    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2268	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2269            (c == '.') || (c == '-') ||
2270	    (c == '_') || (c == ':') ||
2271	    (IS_COMBINING(c)) ||
2272	    (IS_EXTENDER(c)))) {
2273	if (count++ > 100) {
2274	    count = 0;
2275	    GROW;
2276	}
2277	len += l;
2278	NEXTL(l);
2279	c = CUR_CHAR(l);
2280    }
2281    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2282}
2283
2284
2285/**
2286 * htmlParseHTMLAttribute:
2287 * @ctxt:  an HTML parser context
2288 * @stop:  a char stop value
2289 *
2290 * parse an HTML attribute value till the stop (quote), if
2291 * stop is 0 then it stops at the first space
2292 *
2293 * Returns the attribute parsed or NULL
2294 */
2295
2296static xmlChar *
2297htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2298    xmlChar *buffer = NULL;
2299    int buffer_size = 0;
2300    xmlChar *out = NULL;
2301    const xmlChar *name = NULL;
2302    const xmlChar *cur = NULL;
2303    const htmlEntityDesc * ent;
2304
2305    /*
2306     * allocate a translation buffer.
2307     */
2308    buffer_size = HTML_PARSER_BUFFER_SIZE;
2309    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2310    if (buffer == NULL) {
2311	htmlErrMemory(ctxt, "buffer allocation failed\n");
2312	return(NULL);
2313    }
2314    out = buffer;
2315
2316    /*
2317     * Ok loop until we reach one of the ending chars
2318     */
2319    while ((CUR != 0) && (CUR != stop)) {
2320	if ((stop == 0) && (CUR == '>')) break;
2321	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2322        if (CUR == '&') {
2323	    if (NXT(1) == '#') {
2324		unsigned int c;
2325		int bits;
2326
2327		c = htmlParseCharRef(ctxt);
2328		if      (c <    0x80)
2329		        { *out++  = c;                bits= -6; }
2330		else if (c <   0x800)
2331		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2332		else if (c < 0x10000)
2333		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2334		else
2335		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2336
2337		for ( ; bits >= 0; bits-= 6) {
2338		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2339		}
2340
2341		if (out - buffer > buffer_size - 100) {
2342			int indx = out - buffer;
2343
2344			growBuffer(buffer);
2345			out = &buffer[indx];
2346		}
2347	    } else {
2348		ent = htmlParseEntityRef(ctxt, &name);
2349		if (name == NULL) {
2350		    *out++ = '&';
2351		    if (out - buffer > buffer_size - 100) {
2352			int indx = out - buffer;
2353
2354			growBuffer(buffer);
2355			out = &buffer[indx];
2356		    }
2357		} else if (ent == NULL) {
2358		    *out++ = '&';
2359		    cur = name;
2360		    while (*cur != 0) {
2361			if (out - buffer > buffer_size - 100) {
2362			    int indx = out - buffer;
2363
2364			    growBuffer(buffer);
2365			    out = &buffer[indx];
2366			}
2367			*out++ = *cur++;
2368		    }
2369		} else {
2370		    unsigned int c;
2371		    int bits;
2372
2373		    if (out - buffer > buffer_size - 100) {
2374			int indx = out - buffer;
2375
2376			growBuffer(buffer);
2377			out = &buffer[indx];
2378		    }
2379		    c = (xmlChar)ent->value;
2380		    if      (c <    0x80)
2381			{ *out++  = c;                bits= -6; }
2382		    else if (c <   0x800)
2383			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2384		    else if (c < 0x10000)
2385			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2386		    else
2387			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2388
2389		    for ( ; bits >= 0; bits-= 6) {
2390			*out++  = ((c >> bits) & 0x3F) | 0x80;
2391		    }
2392		}
2393	    }
2394	} else {
2395	    unsigned int c;
2396	    int bits, l;
2397
2398	    if (out - buffer > buffer_size - 100) {
2399		int indx = out - buffer;
2400
2401		growBuffer(buffer);
2402		out = &buffer[indx];
2403	    }
2404	    c = CUR_CHAR(l);
2405	    if      (c <    0x80)
2406		    { *out++  = c;                bits= -6; }
2407	    else if (c <   0x800)
2408		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2409	    else if (c < 0x10000)
2410		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2411	    else
2412		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2413
2414	    for ( ; bits >= 0; bits-= 6) {
2415		*out++  = ((c >> bits) & 0x3F) | 0x80;
2416	    }
2417	    NEXT;
2418	}
2419    }
2420    *out++ = 0;
2421    return(buffer);
2422}
2423
2424/**
2425 * htmlParseEntityRef:
2426 * @ctxt:  an HTML parser context
2427 * @str:  location to store the entity name
2428 *
2429 * parse an HTML ENTITY references
2430 *
2431 * [68] EntityRef ::= '&' Name ';'
2432 *
2433 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2434 *         if non-NULL *str will have to be freed by the caller.
2435 */
2436const htmlEntityDesc *
2437htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2438    const xmlChar *name;
2439    const htmlEntityDesc * ent = NULL;
2440
2441    if (str != NULL) *str = NULL;
2442    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2443
2444    if (CUR == '&') {
2445        NEXT;
2446        name = htmlParseName(ctxt);
2447	if (name == NULL) {
2448	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2449	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2450	} else {
2451	    GROW;
2452	    if (CUR == ';') {
2453	        if (str != NULL)
2454		    *str = name;
2455
2456		/*
2457		 * Lookup the entity in the table.
2458		 */
2459		ent = htmlEntityLookup(name);
2460		if (ent != NULL) /* OK that's ugly !!! */
2461		    NEXT;
2462	    } else {
2463		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2464		             "htmlParseEntityRef: expecting ';'\n",
2465			     NULL, NULL);
2466	        if (str != NULL)
2467		    *str = name;
2468	    }
2469	}
2470    }
2471    return(ent);
2472}
2473
2474/**
2475 * htmlParseAttValue:
2476 * @ctxt:  an HTML parser context
2477 *
2478 * parse a value for an attribute
2479 * Note: the parser won't do substitution of entities here, this
2480 * will be handled later in xmlStringGetNodeList, unless it was
2481 * asked for ctxt->replaceEntities != 0
2482 *
2483 * Returns the AttValue parsed or NULL.
2484 */
2485
2486static xmlChar *
2487htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2488    xmlChar *ret = NULL;
2489
2490    if (CUR == '"') {
2491        NEXT;
2492	ret = htmlParseHTMLAttribute(ctxt, '"');
2493        if (CUR != '"') {
2494	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2495	                 "AttValue: \" expected\n", NULL, NULL);
2496	} else
2497	    NEXT;
2498    } else if (CUR == '\'') {
2499        NEXT;
2500	ret = htmlParseHTMLAttribute(ctxt, '\'');
2501        if (CUR != '\'') {
2502	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2503	                 "AttValue: ' expected\n", NULL, NULL);
2504	} else
2505	    NEXT;
2506    } else {
2507        /*
2508	 * That's an HTMLism, the attribute value may not be quoted
2509	 */
2510	ret = htmlParseHTMLAttribute(ctxt, 0);
2511	if (ret == NULL) {
2512	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2513	                 "AttValue: no value found\n", NULL, NULL);
2514	}
2515    }
2516    return(ret);
2517}
2518
2519/**
2520 * htmlParseSystemLiteral:
2521 * @ctxt:  an HTML parser context
2522 *
2523 * parse an HTML Literal
2524 *
2525 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2526 *
2527 * Returns the SystemLiteral parsed or NULL
2528 */
2529
2530static xmlChar *
2531htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2532    const xmlChar *q;
2533    xmlChar *ret = NULL;
2534
2535    if (CUR == '"') {
2536        NEXT;
2537	q = CUR_PTR;
2538	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2539	    NEXT;
2540	if (!IS_CHAR_CH(CUR)) {
2541	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2542			 "Unfinished SystemLiteral\n", NULL, NULL);
2543	} else {
2544	    ret = xmlStrndup(q, CUR_PTR - q);
2545	    NEXT;
2546        }
2547    } else if (CUR == '\'') {
2548        NEXT;
2549	q = CUR_PTR;
2550	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2551	    NEXT;
2552	if (!IS_CHAR_CH(CUR)) {
2553	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2554			 "Unfinished SystemLiteral\n", NULL, NULL);
2555	} else {
2556	    ret = xmlStrndup(q, CUR_PTR - q);
2557	    NEXT;
2558        }
2559    } else {
2560	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2561	             " or ' expected\n", NULL, NULL);
2562    }
2563
2564    return(ret);
2565}
2566
2567/**
2568 * htmlParsePubidLiteral:
2569 * @ctxt:  an HTML parser context
2570 *
2571 * parse an HTML public literal
2572 *
2573 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2574 *
2575 * Returns the PubidLiteral parsed or NULL.
2576 */
2577
2578static xmlChar *
2579htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2580    const xmlChar *q;
2581    xmlChar *ret = NULL;
2582    /*
2583     * Name ::= (Letter | '_') (NameChar)*
2584     */
2585    if (CUR == '"') {
2586        NEXT;
2587	q = CUR_PTR;
2588	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2589	if (CUR != '"') {
2590	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2591	                 "Unfinished PubidLiteral\n", NULL, NULL);
2592	} else {
2593	    ret = xmlStrndup(q, CUR_PTR - q);
2594	    NEXT;
2595	}
2596    } else if (CUR == '\'') {
2597        NEXT;
2598	q = CUR_PTR;
2599	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2600	    NEXT;
2601	if (CUR != '\'') {
2602	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2603	                 "Unfinished PubidLiteral\n", NULL, NULL);
2604	} else {
2605	    ret = xmlStrndup(q, CUR_PTR - q);
2606	    NEXT;
2607	}
2608    } else {
2609	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2610	             "PubidLiteral \" or ' expected\n", NULL, NULL);
2611    }
2612
2613    return(ret);
2614}
2615
2616/**
2617 * htmlParseScript:
2618 * @ctxt:  an HTML parser context
2619 *
2620 * parse the content of an HTML SCRIPT or STYLE element
2621 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2622 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2623 * http://www.w3.org/TR/html4/types.html#type-script
2624 * http://www.w3.org/TR/html4/types.html#h-6.15
2625 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2626 *
2627 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2628 * element and the value of intrinsic event attributes. User agents must
2629 * not evaluate script data as HTML markup but instead must pass it on as
2630 * data to a script engine.
2631 * NOTES:
2632 * - The content is passed like CDATA
2633 * - the attributes for style and scripting "onXXX" are also described
2634 *   as CDATA but SGML allows entities references in attributes so their
2635 *   processing is identical as other attributes
2636 */
2637static void
2638htmlParseScript(htmlParserCtxtPtr ctxt) {
2639    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2640    int nbchar = 0;
2641    int cur,l;
2642
2643    SHRINK;
2644    cur = CUR_CHAR(l);
2645    while (IS_CHAR_CH(cur)) {
2646	if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2647	    (NXT(3) == '-')) {
2648	    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2649		if (ctxt->sax->cdataBlock!= NULL) {
2650		    /*
2651		     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2652		     */
2653		    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2654		} else if (ctxt->sax->characters != NULL) {
2655		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2656		}
2657	    }
2658	    nbchar = 0;
2659	    htmlParseComment(ctxt);
2660	    cur = CUR_CHAR(l);
2661	    continue;
2662	} else if ((cur == '<') && (NXT(1) == '/')) {
2663            /*
2664             * One should break here, the specification is clear:
2665             * Authors should therefore escape "</" within the content.
2666             * Escape mechanisms are specific to each scripting or
2667             * style sheet language.
2668             *
2669             * In recovery mode, only break if end tag match the
2670             * current tag, effectively ignoring all tags inside the
2671             * script/style block and treating the entire block as
2672             * CDATA.
2673             */
2674            if (ctxt->recovery) {
2675                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2676				   xmlStrlen(ctxt->name)) == 0)
2677                {
2678                    break; /* while */
2679                } else {
2680		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2681				 "Element %s embeds close tag\n",
2682		                 ctxt->name, NULL);
2683		}
2684            } else {
2685                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2686                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2687                {
2688                    break; /* while */
2689                }
2690            }
2691	}
2692	COPY_BUF(l,buf,nbchar,cur);
2693	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2694	    if (ctxt->sax->cdataBlock!= NULL) {
2695		/*
2696		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2697		 */
2698		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2699	    } else if (ctxt->sax->characters != NULL) {
2700		ctxt->sax->characters(ctxt->userData, buf, nbchar);
2701	    }
2702	    nbchar = 0;
2703	}
2704	GROW;
2705	NEXTL(l);
2706	cur = CUR_CHAR(l);
2707    }
2708
2709    if (!(IS_CHAR_CH(cur))) {
2710	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2711	                "Invalid char in CDATA 0x%X\n", cur);
2712	NEXT;
2713    }
2714
2715    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2716	if (ctxt->sax->cdataBlock!= NULL) {
2717	    /*
2718	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2719	     */
2720	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2721	} else if (ctxt->sax->characters != NULL) {
2722	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2723	}
2724    }
2725}
2726
2727
2728/**
2729 * htmlParseCharData:
2730 * @ctxt:  an HTML parser context
2731 *
2732 * parse a CharData section.
2733 * if we are within a CDATA section ']]>' marks an end of section.
2734 *
2735 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2736 */
2737
2738static void
2739htmlParseCharData(htmlParserCtxtPtr ctxt) {
2740    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2741    int nbchar = 0;
2742    int cur, l;
2743
2744    SHRINK;
2745    cur = CUR_CHAR(l);
2746    while (((cur != '<') || (ctxt->token == '<')) &&
2747           ((cur != '&') || (ctxt->token == '&')) &&
2748	   (IS_CHAR(cur))) {
2749	COPY_BUF(l,buf,nbchar,cur);
2750	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2751	    /*
2752	     * Ok the segment is to be consumed as chars.
2753	     */
2754	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2755		if (areBlanks(ctxt, buf, nbchar)) {
2756		    if (ctxt->sax->ignorableWhitespace != NULL)
2757			ctxt->sax->ignorableWhitespace(ctxt->userData,
2758			                               buf, nbchar);
2759		} else {
2760		    htmlCheckParagraph(ctxt);
2761		    if (ctxt->sax->characters != NULL)
2762			ctxt->sax->characters(ctxt->userData, buf, nbchar);
2763		}
2764	    }
2765	    nbchar = 0;
2766	}
2767	NEXTL(l);
2768	cur = CUR_CHAR(l);
2769	if (cur == 0) {
2770	    SHRINK;
2771	    GROW;
2772	    cur = CUR_CHAR(l);
2773	}
2774    }
2775    if (nbchar != 0) {
2776        buf[nbchar] = 0;
2777
2778	/*
2779	 * Ok the segment is to be consumed as chars.
2780	 */
2781	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2782	    if (areBlanks(ctxt, buf, nbchar)) {
2783		if (ctxt->sax->ignorableWhitespace != NULL)
2784		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2785	    } else {
2786		htmlCheckParagraph(ctxt);
2787		if (ctxt->sax->characters != NULL)
2788		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2789	    }
2790	}
2791    } else {
2792	/*
2793	 * Loop detection
2794	 */
2795	if (cur == 0)
2796	    ctxt->instate = XML_PARSER_EOF;
2797    }
2798}
2799
2800/**
2801 * htmlParseExternalID:
2802 * @ctxt:  an HTML parser context
2803 * @publicID:  a xmlChar** receiving PubidLiteral
2804 *
2805 * Parse an External ID or a Public ID
2806 *
2807 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2808 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
2809 *
2810 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2811 *
2812 * Returns the function returns SystemLiteral and in the second
2813 *                case publicID receives PubidLiteral, is strict is off
2814 *                it is possible to return NULL and have publicID set.
2815 */
2816
2817static xmlChar *
2818htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2819    xmlChar *URI = NULL;
2820
2821    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2822         (UPP(2) == 'S') && (UPP(3) == 'T') &&
2823	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2824        SKIP(6);
2825	if (!IS_BLANK_CH(CUR)) {
2826	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2827	                 "Space required after 'SYSTEM'\n", NULL, NULL);
2828	}
2829        SKIP_BLANKS;
2830	URI = htmlParseSystemLiteral(ctxt);
2831	if (URI == NULL) {
2832	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2833	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2834        }
2835    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2836	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
2837	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
2838        SKIP(6);
2839	if (!IS_BLANK_CH(CUR)) {
2840	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2841	                 "Space required after 'PUBLIC'\n", NULL, NULL);
2842	}
2843        SKIP_BLANKS;
2844	*publicID = htmlParsePubidLiteral(ctxt);
2845	if (*publicID == NULL) {
2846	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2847	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2848			 NULL, NULL);
2849	}
2850        SKIP_BLANKS;
2851        if ((CUR == '"') || (CUR == '\'')) {
2852	    URI = htmlParseSystemLiteral(ctxt);
2853	}
2854    }
2855    return(URI);
2856}
2857
2858/**
2859 * xmlParsePI:
2860 * @ctxt:  an XML parser context
2861 *
2862 * parse an XML Processing Instruction.
2863 *
2864 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2865 */
2866static void
2867htmlParsePI(htmlParserCtxtPtr ctxt) {
2868    xmlChar *buf = NULL;
2869    int len = 0;
2870    int size = HTML_PARSER_BUFFER_SIZE;
2871    int cur, l;
2872    const xmlChar *target;
2873    xmlParserInputState state;
2874    int count = 0;
2875
2876    if ((RAW == '<') && (NXT(1) == '?')) {
2877	state = ctxt->instate;
2878        ctxt->instate = XML_PARSER_PI;
2879	/*
2880	 * this is a Processing Instruction.
2881	 */
2882	SKIP(2);
2883	SHRINK;
2884
2885	/*
2886	 * Parse the target name and check for special support like
2887	 * namespace.
2888	 */
2889        target = htmlParseName(ctxt);
2890	if (target != NULL) {
2891	    if (RAW == '>') {
2892		SKIP(1);
2893
2894		/*
2895		 * SAX: PI detected.
2896		 */
2897		if ((ctxt->sax) && (!ctxt->disableSAX) &&
2898		    (ctxt->sax->processingInstruction != NULL))
2899		    ctxt->sax->processingInstruction(ctxt->userData,
2900		                                     target, NULL);
2901		ctxt->instate = state;
2902		return;
2903	    }
2904	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2905	    if (buf == NULL) {
2906		htmlErrMemory(ctxt, NULL);
2907		ctxt->instate = state;
2908		return;
2909	    }
2910	    cur = CUR;
2911	    if (!IS_BLANK(cur)) {
2912		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2913			  "ParsePI: PI %s space expected\n", target, NULL);
2914	    }
2915            SKIP_BLANKS;
2916	    cur = CUR_CHAR(l);
2917	    while (IS_CHAR(cur) && (cur != '>')) {
2918		if (len + 5 >= size) {
2919		    xmlChar *tmp;
2920
2921		    size *= 2;
2922		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2923		    if (tmp == NULL) {
2924			htmlErrMemory(ctxt, NULL);
2925			xmlFree(buf);
2926			ctxt->instate = state;
2927			return;
2928		    }
2929		    buf = tmp;
2930		}
2931		count++;
2932		if (count > 50) {
2933		    GROW;
2934		    count = 0;
2935		}
2936		COPY_BUF(l,buf,len,cur);
2937		NEXTL(l);
2938		cur = CUR_CHAR(l);
2939		if (cur == 0) {
2940		    SHRINK;
2941		    GROW;
2942		    cur = CUR_CHAR(l);
2943		}
2944	    }
2945	    buf[len] = 0;
2946	    if (cur != '>') {
2947		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2948		      "ParsePI: PI %s never end ...\n", target, NULL);
2949	    } else {
2950		SKIP(1);
2951
2952		/*
2953		 * SAX: PI detected.
2954		 */
2955		if ((ctxt->sax) && (!ctxt->disableSAX) &&
2956		    (ctxt->sax->processingInstruction != NULL))
2957		    ctxt->sax->processingInstruction(ctxt->userData,
2958		                                     target, buf);
2959	    }
2960	    xmlFree(buf);
2961	} else {
2962	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2963                         "PI is not started correctly", NULL, NULL);
2964	}
2965	ctxt->instate = state;
2966    }
2967}
2968
2969/**
2970 * htmlParseComment:
2971 * @ctxt:  an HTML parser context
2972 *
2973 * Parse an XML (SGML) comment <!-- .... -->
2974 *
2975 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2976 */
2977static void
2978htmlParseComment(htmlParserCtxtPtr ctxt) {
2979    xmlChar *buf = NULL;
2980    int len;
2981    int size = HTML_PARSER_BUFFER_SIZE;
2982    int q, ql;
2983    int r, rl;
2984    int cur, l;
2985    xmlParserInputState state;
2986
2987    /*
2988     * Check that there is a comment right here.
2989     */
2990    if ((RAW != '<') || (NXT(1) != '!') ||
2991        (NXT(2) != '-') || (NXT(3) != '-')) return;
2992
2993    state = ctxt->instate;
2994    ctxt->instate = XML_PARSER_COMMENT;
2995    SHRINK;
2996    SKIP(4);
2997    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2998    if (buf == NULL) {
2999        htmlErrMemory(ctxt, "buffer allocation failed\n");
3000	ctxt->instate = state;
3001	return;
3002    }
3003    q = CUR_CHAR(ql);
3004    NEXTL(ql);
3005    r = CUR_CHAR(rl);
3006    NEXTL(rl);
3007    cur = CUR_CHAR(l);
3008    len = 0;
3009    while (IS_CHAR(cur) &&
3010           ((cur != '>') ||
3011	    (r != '-') || (q != '-'))) {
3012	if (len + 5 >= size) {
3013	    xmlChar *tmp;
3014
3015	    size *= 2;
3016	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3017	    if (tmp == NULL) {
3018	        xmlFree(buf);
3019	        htmlErrMemory(ctxt, "growing buffer failed\n");
3020		ctxt->instate = state;
3021		return;
3022	    }
3023	    buf = tmp;
3024	}
3025	COPY_BUF(ql,buf,len,q);
3026	q = r;
3027	ql = rl;
3028	r = cur;
3029	rl = l;
3030	NEXTL(l);
3031	cur = CUR_CHAR(l);
3032	if (cur == 0) {
3033	    SHRINK;
3034	    GROW;
3035	    cur = CUR_CHAR(l);
3036	}
3037    }
3038    buf[len] = 0;
3039    if (!IS_CHAR(cur)) {
3040	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3041	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
3042	xmlFree(buf);
3043    } else {
3044        NEXT;
3045	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3046	    (!ctxt->disableSAX))
3047	    ctxt->sax->comment(ctxt->userData, buf);
3048	xmlFree(buf);
3049    }
3050    ctxt->instate = state;
3051}
3052
3053/**
3054 * htmlParseCharRef:
3055 * @ctxt:  an HTML parser context
3056 *
3057 * parse Reference declarations
3058 *
3059 * [66] CharRef ::= '&#' [0-9]+ ';' |
3060 *                  '&#x' [0-9a-fA-F]+ ';'
3061 *
3062 * Returns the value parsed (as an int)
3063 */
3064int
3065htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3066    int val = 0;
3067
3068    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3069	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3070		     "htmlParseCharRef: context error\n",
3071		     NULL, NULL);
3072        return(0);
3073    }
3074    if ((CUR == '&') && (NXT(1) == '#') &&
3075        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3076	SKIP(3);
3077	while (CUR != ';') {
3078	    if ((CUR >= '0') && (CUR <= '9'))
3079	        val = val * 16 + (CUR - '0');
3080	    else if ((CUR >= 'a') && (CUR <= 'f'))
3081	        val = val * 16 + (CUR - 'a') + 10;
3082	    else if ((CUR >= 'A') && (CUR <= 'F'))
3083	        val = val * 16 + (CUR - 'A') + 10;
3084	    else {
3085	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3086		             "htmlParseCharRef: invalid hexadecimal value\n",
3087			     NULL, NULL);
3088		return(0);
3089	    }
3090	    NEXT;
3091	}
3092	if (CUR == ';')
3093	    NEXT;
3094    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3095	SKIP(2);
3096	while (CUR != ';') {
3097	    if ((CUR >= '0') && (CUR <= '9'))
3098	        val = val * 10 + (CUR - '0');
3099	    else {
3100	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3101		             "htmlParseCharRef: invalid decimal value\n",
3102			     NULL, NULL);
3103		return(0);
3104	    }
3105	    NEXT;
3106	}
3107	if (CUR == ';')
3108	    NEXT;
3109    } else {
3110	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3111	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3112    }
3113    /*
3114     * Check the value IS_CHAR ...
3115     */
3116    if (IS_CHAR(val)) {
3117        return(val);
3118    } else {
3119	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3120			"htmlParseCharRef: invalid xmlChar value %d\n",
3121			val);
3122    }
3123    return(0);
3124}
3125
3126
3127/**
3128 * htmlParseDocTypeDecl:
3129 * @ctxt:  an HTML parser context
3130 *
3131 * parse a DOCTYPE declaration
3132 *
3133 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3134 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3135 */
3136
3137static void
3138htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3139    const xmlChar *name;
3140    xmlChar *ExternalID = NULL;
3141    xmlChar *URI = NULL;
3142
3143    /*
3144     * We know that '<!DOCTYPE' has been detected.
3145     */
3146    SKIP(9);
3147
3148    SKIP_BLANKS;
3149
3150    /*
3151     * Parse the DOCTYPE name.
3152     */
3153    name = htmlParseName(ctxt);
3154    if (name == NULL) {
3155	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3156	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3157		     NULL, NULL);
3158    }
3159    /*
3160     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3161     */
3162
3163    SKIP_BLANKS;
3164
3165    /*
3166     * Check for SystemID and ExternalID
3167     */
3168    URI = htmlParseExternalID(ctxt, &ExternalID);
3169    SKIP_BLANKS;
3170
3171    /*
3172     * We should be at the end of the DOCTYPE declaration.
3173     */
3174    if (CUR != '>') {
3175	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3176	             "DOCTYPE improperly terminated\n", NULL, NULL);
3177        /* We shouldn't try to resynchronize ... */
3178    }
3179    NEXT;
3180
3181    /*
3182     * Create or update the document accordingly to the DOCTYPE
3183     */
3184    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3185	(!ctxt->disableSAX))
3186	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3187
3188    /*
3189     * Cleanup, since we don't use all those identifiers
3190     */
3191    if (URI != NULL) xmlFree(URI);
3192    if (ExternalID != NULL) xmlFree(ExternalID);
3193}
3194
3195/**
3196 * htmlParseAttribute:
3197 * @ctxt:  an HTML parser context
3198 * @value:  a xmlChar ** used to store the value of the attribute
3199 *
3200 * parse an attribute
3201 *
3202 * [41] Attribute ::= Name Eq AttValue
3203 *
3204 * [25] Eq ::= S? '=' S?
3205 *
3206 * With namespace:
3207 *
3208 * [NS 11] Attribute ::= QName Eq AttValue
3209 *
3210 * Also the case QName == xmlns:??? is handled independently as a namespace
3211 * definition.
3212 *
3213 * Returns the attribute name, and the value in *value.
3214 */
3215
3216static const xmlChar *
3217htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3218    const xmlChar *name;
3219    xmlChar *val = NULL;
3220
3221    *value = NULL;
3222    name = htmlParseHTMLName(ctxt);
3223    if (name == NULL) {
3224	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3225	             "error parsing attribute name\n", NULL, NULL);
3226        return(NULL);
3227    }
3228
3229    /*
3230     * read the value
3231     */
3232    SKIP_BLANKS;
3233    if (CUR == '=') {
3234        NEXT;
3235	SKIP_BLANKS;
3236	val = htmlParseAttValue(ctxt);
3237	/******
3238    } else {
3239        * TODO : some attribute must have values, some may not
3240	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3241	    ctxt->sax->warning(ctxt->userData,
3242	       "No value for attribute %s\n", name); */
3243    }
3244
3245    *value = val;
3246    return(name);
3247}
3248
3249/**
3250 * htmlCheckEncoding:
3251 * @ctxt:  an HTML parser context
3252 * @attvalue: the attribute value
3253 *
3254 * Checks an http-equiv attribute from a Meta tag to detect
3255 * the encoding
3256 * If a new encoding is detected the parser is switched to decode
3257 * it and pass UTF8
3258 */
3259static void
3260htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3261    const xmlChar *encoding;
3262
3263    if ((ctxt == NULL) || (attvalue == NULL))
3264	return;
3265
3266    /* do not change encoding */
3267    if (ctxt->input->encoding != NULL)
3268        return;
3269
3270    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3271    if (encoding != NULL) {
3272	encoding += 8;
3273    } else {
3274	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3275	if (encoding != NULL)
3276	    encoding += 9;
3277    }
3278    if (encoding != NULL) {
3279	xmlCharEncoding enc;
3280	xmlCharEncodingHandlerPtr handler;
3281
3282	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3283
3284	if (ctxt->input->encoding != NULL)
3285	    xmlFree((xmlChar *) ctxt->input->encoding);
3286	ctxt->input->encoding = xmlStrdup(encoding);
3287
3288	enc = xmlParseCharEncoding((const char *) encoding);
3289	/*
3290	 * registered set of known encodings
3291	 */
3292	if (enc != XML_CHAR_ENCODING_ERROR) {
3293	    xmlSwitchEncoding(ctxt, enc);
3294	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3295	} else {
3296	    /*
3297	     * fallback for unknown encodings
3298	     */
3299	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3300	    if (handler != NULL) {
3301		xmlSwitchToEncoding(ctxt, handler);
3302		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3303	    } else {
3304		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3305	    }
3306	}
3307
3308	if ((ctxt->input->buf != NULL) &&
3309	    (ctxt->input->buf->encoder != NULL) &&
3310	    (ctxt->input->buf->raw != NULL) &&
3311	    (ctxt->input->buf->buffer != NULL)) {
3312	    int nbchars;
3313	    int processed;
3314
3315	    /*
3316	     * convert as much as possible to the parser reading buffer.
3317	     */
3318	    processed = ctxt->input->cur - ctxt->input->base;
3319	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
3320	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3321		                       ctxt->input->buf->buffer,
3322				       ctxt->input->buf->raw);
3323	    if (nbchars < 0) {
3324		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3325		             "htmlCheckEncoding: encoder error\n",
3326			     NULL, NULL);
3327	    }
3328	    ctxt->input->base =
3329	    ctxt->input->cur = ctxt->input->buf->buffer->content;
3330	}
3331    }
3332}
3333
3334/**
3335 * htmlCheckMeta:
3336 * @ctxt:  an HTML parser context
3337 * @atts:  the attributes values
3338 *
3339 * Checks an attributes from a Meta tag
3340 */
3341static void
3342htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3343    int i;
3344    const xmlChar *att, *value;
3345    int http = 0;
3346    const xmlChar *content = NULL;
3347
3348    if ((ctxt == NULL) || (atts == NULL))
3349	return;
3350
3351    i = 0;
3352    att = atts[i++];
3353    while (att != NULL) {
3354	value = atts[i++];
3355	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3356	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3357	    http = 1;
3358	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3359	    content = value;
3360	att = atts[i++];
3361    }
3362    if ((http) && (content != NULL))
3363	htmlCheckEncoding(ctxt, content);
3364
3365}
3366
3367/**
3368 * htmlParseStartTag:
3369 * @ctxt:  an HTML parser context
3370 *
3371 * parse a start of tag either for rule element or
3372 * EmptyElement. In both case we don't parse the tag closing chars.
3373 *
3374 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3375 *
3376 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3377 *
3378 * With namespace:
3379 *
3380 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3381 *
3382 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3383 *
3384 * Returns 0 in case of success and -1 in case of error.
3385 */
3386
3387static int
3388htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3389    const xmlChar *name;
3390    const xmlChar *attname;
3391    xmlChar *attvalue;
3392    const xmlChar **atts;
3393    int nbatts = 0;
3394    int maxatts;
3395    int meta = 0;
3396    int i;
3397
3398    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3399	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3400		     "htmlParseStartTag: context error\n", NULL, NULL);
3401	return -1;
3402    }
3403    if (CUR != '<') return -1;
3404    NEXT;
3405
3406    atts = ctxt->atts;
3407    maxatts = ctxt->maxatts;
3408
3409    GROW;
3410    name = htmlParseHTMLName(ctxt);
3411    if (name == NULL) {
3412	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3413	             "htmlParseStartTag: invalid element name\n",
3414		     NULL, NULL);
3415	/* Dump the bogus tag like browsers do */
3416	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3417	    NEXT;
3418        return -1;
3419    }
3420    if (xmlStrEqual(name, BAD_CAST"meta"))
3421	meta = 1;
3422
3423    /*
3424     * Check for auto-closure of HTML elements.
3425     */
3426    htmlAutoClose(ctxt, name);
3427
3428    /*
3429     * Check for implied HTML elements.
3430     */
3431    htmlCheckImplied(ctxt, name);
3432
3433    /*
3434     * Avoid html at any level > 0, head at any level != 1
3435     * or any attempt to recurse body
3436     */
3437    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3438	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3439	             "htmlParseStartTag: misplaced <html> tag\n",
3440		     name, NULL);
3441	return 0;
3442    }
3443    if ((ctxt->nameNr != 1) &&
3444	(xmlStrEqual(name, BAD_CAST"head"))) {
3445	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3446	             "htmlParseStartTag: misplaced <head> tag\n",
3447		     name, NULL);
3448	return 0;
3449    }
3450    if (xmlStrEqual(name, BAD_CAST"body")) {
3451	int indx;
3452	for (indx = 0;indx < ctxt->nameNr;indx++) {
3453	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3454		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3455		             "htmlParseStartTag: misplaced <body> tag\n",
3456			     name, NULL);
3457		while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3458		    NEXT;
3459		return 0;
3460	    }
3461	}
3462    }
3463
3464    /*
3465     * Now parse the attributes, it ends up with the ending
3466     *
3467     * (S Attribute)* S?
3468     */
3469    SKIP_BLANKS;
3470    while ((IS_CHAR_CH(CUR)) &&
3471           (CUR != '>') &&
3472	   ((CUR != '/') || (NXT(1) != '>'))) {
3473	long cons = ctxt->nbChars;
3474
3475	GROW;
3476	attname = htmlParseAttribute(ctxt, &attvalue);
3477        if (attname != NULL) {
3478
3479	    /*
3480	     * Well formedness requires at most one declaration of an attribute
3481	     */
3482	    for (i = 0; i < nbatts;i += 2) {
3483	        if (xmlStrEqual(atts[i], attname)) {
3484		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3485		                 "Attribute %s redefined\n", attname, NULL);
3486		    if (attvalue != NULL)
3487			xmlFree(attvalue);
3488		    goto failed;
3489		}
3490	    }
3491
3492	    /*
3493	     * Add the pair to atts
3494	     */
3495	    if (atts == NULL) {
3496	        maxatts = 22; /* allow for 10 attrs by default */
3497	        atts = (const xmlChar **)
3498		       xmlMalloc(maxatts * sizeof(xmlChar *));
3499		if (atts == NULL) {
3500		    htmlErrMemory(ctxt, NULL);
3501		    if (attvalue != NULL)
3502			xmlFree(attvalue);
3503		    goto failed;
3504		}
3505		ctxt->atts = atts;
3506		ctxt->maxatts = maxatts;
3507	    } else if (nbatts + 4 > maxatts) {
3508	        const xmlChar **n;
3509
3510	        maxatts *= 2;
3511	        n = (const xmlChar **) xmlRealloc((void *) atts,
3512					     maxatts * sizeof(const xmlChar *));
3513		if (n == NULL) {
3514		    htmlErrMemory(ctxt, NULL);
3515		    if (attvalue != NULL)
3516			xmlFree(attvalue);
3517		    goto failed;
3518		}
3519		atts = n;
3520		ctxt->atts = atts;
3521		ctxt->maxatts = maxatts;
3522	    }
3523	    atts[nbatts++] = attname;
3524	    atts[nbatts++] = attvalue;
3525	    atts[nbatts] = NULL;
3526	    atts[nbatts + 1] = NULL;
3527	}
3528	else {
3529	    if (attvalue != NULL)
3530	        xmlFree(attvalue);
3531	    /* Dump the bogus attribute string up to the next blank or
3532	     * the end of the tag. */
3533	    while ((IS_CHAR_CH(CUR)) &&
3534	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3535		   ((CUR != '/') || (NXT(1) != '>')))
3536		NEXT;
3537	}
3538
3539failed:
3540	SKIP_BLANKS;
3541        if (cons == ctxt->nbChars) {
3542	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3543	                 "htmlParseStartTag: problem parsing attributes\n",
3544			 NULL, NULL);
3545	    break;
3546	}
3547    }
3548
3549    /*
3550     * Handle specific association to the META tag
3551     */
3552    if (meta)
3553	htmlCheckMeta(ctxt, atts);
3554
3555    /*
3556     * SAX: Start of Element !
3557     */
3558    htmlnamePush(ctxt, name);
3559    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3560	if (nbatts != 0)
3561            ctxt->sax->startElement(ctxt->userData, name, atts);
3562	else
3563            ctxt->sax->startElement(ctxt->userData, name, NULL);
3564    }
3565
3566    if (atts != NULL) {
3567        for (i = 1;i < nbatts;i += 2) {
3568	    if (atts[i] != NULL)
3569		xmlFree((xmlChar *) atts[i]);
3570	}
3571    }
3572
3573    return 0;
3574}
3575
3576/**
3577 * htmlParseEndTag:
3578 * @ctxt:  an HTML parser context
3579 *
3580 * parse an end of tag
3581 *
3582 * [42] ETag ::= '</' Name S? '>'
3583 *
3584 * With namespace
3585 *
3586 * [NS 9] ETag ::= '</' QName S? '>'
3587 *
3588 * Returns 1 if the current level should be closed.
3589 */
3590
3591static int
3592htmlParseEndTag(htmlParserCtxtPtr ctxt)
3593{
3594    const xmlChar *name;
3595    const xmlChar *oldname;
3596    int i, ret;
3597
3598    if ((CUR != '<') || (NXT(1) != '/')) {
3599        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3600	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
3601        return (0);
3602    }
3603    SKIP(2);
3604
3605    name = htmlParseHTMLName(ctxt);
3606    if (name == NULL)
3607        return (0);
3608
3609    /*
3610     * We should definitely be at the ending "S? '>'" part
3611     */
3612    SKIP_BLANKS;
3613    if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3614        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3615	             "End tag : expected '>'\n", NULL, NULL);
3616	if (ctxt->recovery) {
3617	    /*
3618	     * We're not at the ending > !!
3619	     * Error, unless in recover mode where we search forwards
3620	     * until we find a >
3621	     */
3622	    while (CUR != '\0' && CUR != '>') NEXT;
3623	    NEXT;
3624	}
3625    } else
3626        NEXT;
3627
3628    /*
3629     * If the name read is not one of the element in the parsing stack
3630     * then return, it's just an error.
3631     */
3632    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3633        if (xmlStrEqual(name, ctxt->nameTab[i]))
3634            break;
3635    }
3636    if (i < 0) {
3637        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3638	             "Unexpected end tag : %s\n", name, NULL);
3639        return (0);
3640    }
3641
3642
3643    /*
3644     * Check for auto-closure of HTML elements.
3645     */
3646
3647    htmlAutoCloseOnClose(ctxt, name);
3648
3649    /*
3650     * Well formedness constraints, opening and closing must match.
3651     * With the exception that the autoclose may have popped stuff out
3652     * of the stack.
3653     */
3654    if (!xmlStrEqual(name, ctxt->name)) {
3655        if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3656            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3657	                 "Opening and ending tag mismatch: %s and %s\n",
3658			 name, ctxt->name);
3659        }
3660    }
3661
3662    /*
3663     * SAX: End of Tag
3664     */
3665    oldname = ctxt->name;
3666    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3667        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3668            ctxt->sax->endElement(ctxt->userData, name);
3669        htmlnamePop(ctxt);
3670        ret = 1;
3671    } else {
3672        ret = 0;
3673    }
3674
3675    return (ret);
3676}
3677
3678
3679/**
3680 * htmlParseReference:
3681 * @ctxt:  an HTML parser context
3682 *
3683 * parse and handle entity references in content,
3684 * this will end-up in a call to character() since this is either a
3685 * CharRef, or a predefined entity.
3686 */
3687static void
3688htmlParseReference(htmlParserCtxtPtr ctxt) {
3689    const htmlEntityDesc * ent;
3690    xmlChar out[6];
3691    const xmlChar *name;
3692    if (CUR != '&') return;
3693
3694    if (NXT(1) == '#') {
3695	unsigned int c;
3696	int bits, i = 0;
3697
3698	c = htmlParseCharRef(ctxt);
3699	if (c == 0)
3700	    return;
3701
3702        if      (c <    0x80) { out[i++]= c;                bits= -6; }
3703        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3704        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3705        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3706
3707        for ( ; bits >= 0; bits-= 6) {
3708            out[i++]= ((c >> bits) & 0x3F) | 0x80;
3709        }
3710	out[i] = 0;
3711
3712	htmlCheckParagraph(ctxt);
3713	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3714	    ctxt->sax->characters(ctxt->userData, out, i);
3715    } else {
3716	ent = htmlParseEntityRef(ctxt, &name);
3717	if (name == NULL) {
3718	    htmlCheckParagraph(ctxt);
3719	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3720	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3721	    return;
3722	}
3723	if ((ent == NULL) || !(ent->value > 0)) {
3724	    htmlCheckParagraph(ctxt);
3725	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3726		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3727		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3728		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3729	    }
3730	} else {
3731	    unsigned int c;
3732	    int bits, i = 0;
3733
3734	    c = ent->value;
3735	    if      (c <    0x80)
3736	            { out[i++]= c;                bits= -6; }
3737	    else if (c <   0x800)
3738	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3739	    else if (c < 0x10000)
3740	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3741	    else
3742	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3743
3744	    for ( ; bits >= 0; bits-= 6) {
3745		out[i++]= ((c >> bits) & 0x3F) | 0x80;
3746	    }
3747	    out[i] = 0;
3748
3749	    htmlCheckParagraph(ctxt);
3750	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3751		ctxt->sax->characters(ctxt->userData, out, i);
3752	}
3753    }
3754}
3755
3756/**
3757 * htmlParseContent:
3758 * @ctxt:  an HTML parser context
3759 *
3760 * Parse a content: comment, sub-element, reference or text.
3761 */
3762
3763static void
3764htmlParseContent(htmlParserCtxtPtr ctxt) {
3765    xmlChar *currentNode;
3766    int depth;
3767
3768    currentNode = xmlStrdup(ctxt->name);
3769    depth = ctxt->nameNr;
3770    while (1) {
3771	long cons = ctxt->nbChars;
3772
3773        GROW;
3774	/*
3775	 * Our tag or one of it's parent or children is ending.
3776	 */
3777        if ((CUR == '<') && (NXT(1) == '/')) {
3778	    if (htmlParseEndTag(ctxt) &&
3779		((currentNode != NULL) || (ctxt->nameNr == 0))) {
3780		if (currentNode != NULL)
3781		    xmlFree(currentNode);
3782		return;
3783	    }
3784	    continue; /* while */
3785        }
3786
3787	/*
3788	 * Has this node been popped out during parsing of
3789	 * the next element
3790	 */
3791        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3792	    (!xmlStrEqual(currentNode, ctxt->name)))
3793	     {
3794	    if (currentNode != NULL) xmlFree(currentNode);
3795	    return;
3796	}
3797
3798	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3799	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3800	    /*
3801	     * Handle SCRIPT/STYLE separately
3802	     */
3803	    htmlParseScript(ctxt);
3804	} else {
3805	    /*
3806	     * Sometimes DOCTYPE arrives in the middle of the document
3807	     */
3808	    if ((CUR == '<') && (NXT(1) == '!') &&
3809		(UPP(2) == 'D') && (UPP(3) == 'O') &&
3810		(UPP(4) == 'C') && (UPP(5) == 'T') &&
3811		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
3812		(UPP(8) == 'E')) {
3813		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3814		             "Misplaced DOCTYPE declaration\n",
3815			     BAD_CAST "DOCTYPE" , NULL);
3816		htmlParseDocTypeDecl(ctxt);
3817	    }
3818
3819	    /*
3820	     * First case :  a comment
3821	     */
3822	    if ((CUR == '<') && (NXT(1) == '!') &&
3823		(NXT(2) == '-') && (NXT(3) == '-')) {
3824		htmlParseComment(ctxt);
3825	    }
3826
3827	    /*
3828	     * Second case : a Processing Instruction.
3829	     */
3830	    else if ((CUR == '<') && (NXT(1) == '?')) {
3831		htmlParsePI(ctxt);
3832	    }
3833
3834	    /*
3835	     * Third case :  a sub-element.
3836	     */
3837	    else if (CUR == '<') {
3838		htmlParseElement(ctxt);
3839	    }
3840
3841	    /*
3842	     * Fourth case : a reference. If if has not been resolved,
3843	     *    parsing returns it's Name, create the node
3844	     */
3845	    else if (CUR == '&') {
3846		htmlParseReference(ctxt);
3847	    }
3848
3849	    /*
3850	     * Fifth case : end of the resource
3851	     */
3852	    else if (CUR == 0) {
3853		htmlAutoCloseOnEnd(ctxt);
3854		break;
3855	    }
3856
3857	    /*
3858	     * Last case, text. Note that References are handled directly.
3859	     */
3860	    else {
3861		htmlParseCharData(ctxt);
3862	    }
3863
3864	    if (cons == ctxt->nbChars) {
3865		if (ctxt->node != NULL) {
3866		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3867		                 "detected an error in element content\n",
3868				 NULL, NULL);
3869		}
3870		break;
3871	    }
3872	}
3873        GROW;
3874    }
3875    if (currentNode != NULL) xmlFree(currentNode);
3876}
3877
3878/**
3879 * htmlParseContent:
3880 * @ctxt:  an HTML parser context
3881 *
3882 * Parse a content: comment, sub-element, reference or text.
3883 */
3884
3885void
3886__htmlParseContent(void *ctxt) {
3887    if (ctxt != NULL)
3888	htmlParseContent((htmlParserCtxtPtr) ctxt);
3889}
3890
3891/**
3892 * htmlParseElement:
3893 * @ctxt:  an HTML parser context
3894 *
3895 * parse an HTML element, this is highly recursive
3896 *
3897 * [39] element ::= EmptyElemTag | STag content ETag
3898 *
3899 * [41] Attribute ::= Name Eq AttValue
3900 */
3901
3902void
3903htmlParseElement(htmlParserCtxtPtr ctxt) {
3904    const xmlChar *name;
3905    xmlChar *currentNode = NULL;
3906    const htmlElemDesc * info;
3907    htmlParserNodeInfo node_info;
3908    int failed;
3909    int depth;
3910    const xmlChar *oldptr;
3911
3912    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3913	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3914		     "htmlParseElement: context error\n", NULL, NULL);
3915	return;
3916    }
3917    /* Capture start position */
3918    if (ctxt->record_info) {
3919        node_info.begin_pos = ctxt->input->consumed +
3920                          (CUR_PTR - ctxt->input->base);
3921	node_info.begin_line = ctxt->input->line;
3922    }
3923
3924    failed = htmlParseStartTag(ctxt);
3925    name = ctxt->name;
3926    if (failed || (name == NULL)) {
3927	if (CUR == '>')
3928	    NEXT;
3929        return;
3930    }
3931
3932    /*
3933     * Lookup the info for that element.
3934     */
3935    info = htmlTagLookup(name);
3936    if (info == NULL) {
3937	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3938	             "Tag %s invalid\n", name, NULL);
3939    }
3940
3941    /*
3942     * Check for an Empty Element labeled the XML/SGML way
3943     */
3944    if ((CUR == '/') && (NXT(1) == '>')) {
3945        SKIP(2);
3946	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3947	    ctxt->sax->endElement(ctxt->userData, name);
3948	htmlnamePop(ctxt);
3949	return;
3950    }
3951
3952    if (CUR == '>') {
3953        NEXT;
3954    } else {
3955	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3956	             "Couldn't find end of Start Tag %s\n", name, NULL);
3957
3958	/*
3959	 * end of parsing of this node.
3960	 */
3961	if (xmlStrEqual(name, ctxt->name)) {
3962	    nodePop(ctxt);
3963	    htmlnamePop(ctxt);
3964	}
3965
3966	/*
3967	 * Capture end position and add node
3968	 */
3969	if (ctxt->record_info) {
3970	   node_info.end_pos = ctxt->input->consumed +
3971			      (CUR_PTR - ctxt->input->base);
3972	   node_info.end_line = ctxt->input->line;
3973	   node_info.node = ctxt->node;
3974	   xmlParserAddNodeInfo(ctxt, &node_info);
3975	}
3976	return;
3977    }
3978
3979    /*
3980     * Check for an Empty Element from DTD definition
3981     */
3982    if ((info != NULL) && (info->empty)) {
3983	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3984	    ctxt->sax->endElement(ctxt->userData, name);
3985	htmlnamePop(ctxt);
3986	return;
3987    }
3988
3989    /*
3990     * Parse the content of the element:
3991     */
3992    currentNode = xmlStrdup(ctxt->name);
3993    depth = ctxt->nameNr;
3994    while (IS_CHAR_CH(CUR)) {
3995	oldptr = ctxt->input->cur;
3996	htmlParseContent(ctxt);
3997	if (oldptr==ctxt->input->cur) break;
3998	if (ctxt->nameNr < depth) break;
3999    }
4000
4001    /*
4002     * Capture end position and add node
4003     */
4004    if ( currentNode != NULL && ctxt->record_info ) {
4005       node_info.end_pos = ctxt->input->consumed +
4006                          (CUR_PTR - ctxt->input->base);
4007       node_info.end_line = ctxt->input->line;
4008       node_info.node = ctxt->node;
4009       xmlParserAddNodeInfo(ctxt, &node_info);
4010    }
4011    if (!IS_CHAR_CH(CUR)) {
4012	htmlAutoCloseOnEnd(ctxt);
4013    }
4014
4015    if (currentNode != NULL)
4016	xmlFree(currentNode);
4017}
4018
4019/**
4020 * htmlParseDocument:
4021 * @ctxt:  an HTML parser context
4022 *
4023 * parse an HTML document (and build a tree if using the standard SAX
4024 * interface).
4025 *
4026 * Returns 0, -1 in case of error. the parser context is augmented
4027 *                as a result of the parsing.
4028 */
4029
4030int
4031htmlParseDocument(htmlParserCtxtPtr ctxt) {
4032    xmlDtdPtr dtd;
4033
4034    xmlInitParser();
4035
4036    htmlDefaultSAXHandlerInit();
4037
4038    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4039	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4040		     "htmlParseDocument: context error\n", NULL, NULL);
4041	return(XML_ERR_INTERNAL_ERROR);
4042    }
4043    ctxt->html = 1;
4044    GROW;
4045    /*
4046     * SAX: beginning of the document processing.
4047     */
4048    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4049        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4050
4051    /*
4052     * Wipe out everything which is before the first '<'
4053     */
4054    SKIP_BLANKS;
4055    if (CUR == 0) {
4056	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4057	             "Document is empty\n", NULL, NULL);
4058    }
4059
4060    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4061	ctxt->sax->startDocument(ctxt->userData);
4062
4063
4064    /*
4065     * Parse possible comments and PIs before any content
4066     */
4067    while (((CUR == '<') && (NXT(1) == '!') &&
4068            (NXT(2) == '-') && (NXT(3) == '-')) ||
4069	   ((CUR == '<') && (NXT(1) == '?'))) {
4070        htmlParseComment(ctxt);
4071        htmlParsePI(ctxt);
4072	SKIP_BLANKS;
4073    }
4074
4075
4076    /*
4077     * Then possibly doc type declaration(s) and more Misc
4078     * (doctypedecl Misc*)?
4079     */
4080    if ((CUR == '<') && (NXT(1) == '!') &&
4081	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4082	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4083	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4084	(UPP(8) == 'E')) {
4085	htmlParseDocTypeDecl(ctxt);
4086    }
4087    SKIP_BLANKS;
4088
4089    /*
4090     * Parse possible comments and PIs before any content
4091     */
4092    while (((CUR == '<') && (NXT(1) == '!') &&
4093            (NXT(2) == '-') && (NXT(3) == '-')) ||
4094	   ((CUR == '<') && (NXT(1) == '?'))) {
4095        htmlParseComment(ctxt);
4096        htmlParsePI(ctxt);
4097	SKIP_BLANKS;
4098    }
4099
4100    /*
4101     * Time to start parsing the tree itself
4102     */
4103    htmlParseContent(ctxt);
4104
4105    /*
4106     * autoclose
4107     */
4108    if (CUR == 0)
4109	htmlAutoCloseOnEnd(ctxt);
4110
4111
4112    /*
4113     * SAX: end of the document processing.
4114     */
4115    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4116        ctxt->sax->endDocument(ctxt->userData);
4117
4118    if (ctxt->myDoc != NULL) {
4119	dtd = xmlGetIntSubset(ctxt->myDoc);
4120	if (dtd == NULL)
4121	    ctxt->myDoc->intSubset =
4122		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4123		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4124		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4125    }
4126    if (! ctxt->wellFormed) return(-1);
4127    return(0);
4128}
4129
4130
4131/************************************************************************
4132 *									*
4133 *			Parser contexts handling			*
4134 *									*
4135 ************************************************************************/
4136
4137/**
4138 * htmlInitParserCtxt:
4139 * @ctxt:  an HTML parser context
4140 *
4141 * Initialize a parser context
4142 *
4143 * Returns 0 in case of success and -1 in case of error
4144 */
4145
4146static int
4147htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4148{
4149    htmlSAXHandler *sax;
4150
4151    if (ctxt == NULL) return(-1);
4152    memset(ctxt, 0, sizeof(htmlParserCtxt));
4153
4154    ctxt->dict = xmlDictCreate();
4155    if (ctxt->dict == NULL) {
4156        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4157	return(-1);
4158    }
4159    sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4160    if (sax == NULL) {
4161        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4162	return(-1);
4163    }
4164    else
4165        memset(sax, 0, sizeof(htmlSAXHandler));
4166
4167    /* Allocate the Input stack */
4168    ctxt->inputTab = (htmlParserInputPtr *)
4169                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
4170    if (ctxt->inputTab == NULL) {
4171        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4172	ctxt->inputNr = 0;
4173	ctxt->inputMax = 0;
4174	ctxt->input = NULL;
4175	return(-1);
4176    }
4177    ctxt->inputNr = 0;
4178    ctxt->inputMax = 5;
4179    ctxt->input = NULL;
4180    ctxt->version = NULL;
4181    ctxt->encoding = NULL;
4182    ctxt->standalone = -1;
4183    ctxt->instate = XML_PARSER_START;
4184
4185    /* Allocate the Node stack */
4186    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4187    if (ctxt->nodeTab == NULL) {
4188        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4189	ctxt->nodeNr = 0;
4190	ctxt->nodeMax = 0;
4191	ctxt->node = NULL;
4192	ctxt->inputNr = 0;
4193	ctxt->inputMax = 0;
4194	ctxt->input = NULL;
4195	return(-1);
4196    }
4197    ctxt->nodeNr = 0;
4198    ctxt->nodeMax = 10;
4199    ctxt->node = NULL;
4200
4201    /* Allocate the Name stack */
4202    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4203    if (ctxt->nameTab == NULL) {
4204        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4205	ctxt->nameNr = 0;
4206	ctxt->nameMax = 10;
4207	ctxt->name = NULL;
4208	ctxt->nodeNr = 0;
4209	ctxt->nodeMax = 0;
4210	ctxt->node = NULL;
4211	ctxt->inputNr = 0;
4212	ctxt->inputMax = 0;
4213	ctxt->input = NULL;
4214	return(-1);
4215    }
4216    ctxt->nameNr = 0;
4217    ctxt->nameMax = 10;
4218    ctxt->name = NULL;
4219
4220    if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4221    else {
4222        ctxt->sax = sax;
4223	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4224    }
4225    ctxt->userData = ctxt;
4226    ctxt->myDoc = NULL;
4227    ctxt->wellFormed = 1;
4228    ctxt->replaceEntities = 0;
4229    ctxt->linenumbers = xmlLineNumbersDefaultValue;
4230    ctxt->html = 1;
4231    ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4232    ctxt->vctxt.userData = ctxt;
4233    ctxt->vctxt.error = xmlParserValidityError;
4234    ctxt->vctxt.warning = xmlParserValidityWarning;
4235    ctxt->record_info = 0;
4236    ctxt->validate = 0;
4237    ctxt->nbChars = 0;
4238    ctxt->checkIndex = 0;
4239    ctxt->catalogs = NULL;
4240    xmlInitNodeInfoSeq(&ctxt->node_seq);
4241    return(0);
4242}
4243
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This is a thin wrapper: @ctxt is handed unchanged to
 * xmlFreeParserCtxt(), which does the actual work.
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
4257
4258/**
4259 * htmlNewParserCtxt:
4260 *
4261 * Allocate and initialize a new parser context.
4262 *
4263 * Returns the xmlParserCtxtPtr or NULL
4264 */
4265
4266static htmlParserCtxtPtr
4267htmlNewParserCtxt(void)
4268{
4269    xmlParserCtxtPtr ctxt;
4270
4271    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4272    if (ctxt == NULL) {
4273        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4274	return(NULL);
4275    }
4276    memset(ctxt, 0, sizeof(xmlParserCtxt));
4277    if (htmlInitParserCtxt(ctxt) < 0) {
4278        htmlFreeParserCtxt(ctxt);
4279	return(NULL);
4280    }
4281    return(ctxt);
4282}
4283
4284/**
4285 * htmlCreateMemoryParserCtxt:
4286 * @buffer:  a pointer to a char array
4287 * @size:  the size of the array
4288 *
4289 * Create a parser context for an HTML in-memory document.
4290 *
4291 * Returns the new parser context or NULL
4292 */
4293htmlParserCtxtPtr
4294htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4295    xmlParserCtxtPtr ctxt;
4296    xmlParserInputPtr input;
4297    xmlParserInputBufferPtr buf;
4298
4299    if (buffer == NULL)
4300	return(NULL);
4301    if (size <= 0)
4302	return(NULL);
4303
4304    ctxt = htmlNewParserCtxt();
4305    if (ctxt == NULL)
4306	return(NULL);
4307
4308    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4309    if (buf == NULL) return(NULL);
4310
4311    input = xmlNewInputStream(ctxt);
4312    if (input == NULL) {
4313	xmlFreeParserCtxt(ctxt);
4314	return(NULL);
4315    }
4316
4317    input->filename = NULL;
4318    input->buf = buf;
4319    input->base = input->buf->buffer->content;
4320    input->cur = input->buf->buffer->content;
4321    input->end = &input->buf->buffer->content[input->buf->buffer->use];
4322
4323    inputPush(ctxt, input);
4324    return(ctxt);
4325}
4326
4327/**
4328 * htmlCreateDocParserCtxt:
4329 * @cur:  a pointer to an array of xmlChar
4330 * @encoding:  a free form C string describing the HTML document encoding, or NULL
4331 *
4332 * Create a parser context for an HTML document.
4333 *
4334 * TODO: check the need to add encoding handling there
4335 *
4336 * Returns the new parser context or NULL
4337 */
4338static htmlParserCtxtPtr
4339htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
4340    int len;
4341    htmlParserCtxtPtr ctxt;
4342
4343    if (cur == NULL)
4344	return(NULL);
4345    len = xmlStrlen(cur);
4346    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4347
4348    if (encoding != NULL) {
4349	xmlCharEncoding enc;
4350	xmlCharEncodingHandlerPtr handler;
4351
4352	if (ctxt->input->encoding != NULL)
4353	    xmlFree((xmlChar *) ctxt->input->encoding);
4354	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4355
4356	enc = xmlParseCharEncoding(encoding);
4357	/*
4358	 * registered set of known encodings
4359	 */
4360	if (enc != XML_CHAR_ENCODING_ERROR) {
4361	    xmlSwitchEncoding(ctxt, enc);
4362	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4363		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4364		             "Unsupported encoding %s\n",
4365			     (const xmlChar *) encoding, NULL);
4366	    }
4367	} else {
4368	    /*
4369	     * fallback for unknown encodings
4370	     */
4371	    handler = xmlFindCharEncodingHandler((const char *) encoding);
4372	    if (handler != NULL) {
4373		xmlSwitchToEncoding(ctxt, handler);
4374	    } else {
4375		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4376		             "Unsupported encoding %s\n",
4377			     (const xmlChar *) encoding, NULL);
4378	    }
4379	}
4380    }
4381    return(ctxt);
4382}
4383
4384#ifdef LIBXML_PUSH_ENABLED
4385/************************************************************************
4386 *									*
4387 * 		Progressive parsing interfaces				*
4388 *									*
4389 ************************************************************************/
4390
/**
 * htmlParseLookupSequence:
 * @ctxt:  an HTML parser context
 * @first:  the first char to lookup
 * @next:  the next char to lookup or zero
 * @third:  the next char to lookup or zero
 * @iscomment: flag to force checking inside comments; when zero, text
 *             between "<!--" and "-->" is skipped by the search
 *
 * Try to find if a sequence (first, next, third) or  just (first next) or
 * (first) is available in the input stream.
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
 * to avoid rescanning sequences of bytes, it DOES change the state of the
 * parser, do not use liberally.
 * This is basically similar to xmlParseLookupSequence()
 *
 * The scan starts at the current input position, or at ctxt->checkIndex
 * when that is further ahead. On success ctxt->checkIndex is reset to 0;
 * when the loop runs out of data it is set to the position reached.
 *
 * Returns the index to the current parsing point if the full sequence
 *      is available, -1 otherwise.
 */
static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                        xmlChar next, xmlChar third, int iscomment) {
    int base, len;
    htmlParserInputPtr in;
    const xmlChar *buf;
    /*
     * NOTE(review): always starts at 0, so a scan resumed from checkIndex
     * does not remember whether it stopped inside a comment - confirm
     * whether callers can run into this.
     */
    int incomment = 0;

    in = ctxt->input;
    if (in == NULL) return(-1);
    /* offset of the current position, possibly moved up to checkIndex */
    base = in->cur - in->base;
    if (base < 0) return(-1);
    if (ctxt->checkIndex > base)
        base = ctxt->checkIndex;
    /* pick the byte array and its length depending on the input kind */
    if (in->buf == NULL) {
	buf = in->base;
	len = in->length;
    } else {
	buf = in->buf->buffer->content;
	len = in->buf->buffer->use;
    }
    /* take into account the sequence length */
    if (third) len -= 2;
    else if (next) len --;
    for (;base < len;base++) {
	/* comment detection is only active when iscomment is 0 */
	if (!incomment && (base + 4 < len) && !iscomment) {
	    if ((buf[base] == '<') && (buf[base + 1] == '!') &&
		(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
		incomment = 1;
		/* do not increment past <! - some people use <!--> */
		base += 2;
	    }
	}
	if (incomment) {
	    /*
	     * Not enough bytes left to hold "-->": give up. Note that
	     * this exit leaves ctxt->checkIndex untouched.
	     */
	    if (base + 3 > len)
		return(-1);
	    if ((buf[base] == '-') && (buf[base + 1] == '-') &&
		(buf[base + 2] == '>')) {
		incomment = 0;
		base += 2;
	    }
	    /* bytes inside a comment never match the sequence */
	    continue;
	}
        if (buf[base] == first) {
	    if (third != 0) {
		if ((buf[base + 1] != next) ||
		    (buf[base + 2] != third)) continue;
	    } else if (next != 0) {
		if (buf[base + 1] != next) continue;
	    }
	    /* found: the next lookup starts from scratch */
	    ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
	    if (next == 0)
		xmlGenericError(xmlGenericErrorContext,
			"HPP: lookup '%c' found at %d\n",
			first, base);
	    else if (third == 0)
		xmlGenericError(xmlGenericErrorContext,
			"HPP: lookup '%c%c' found at %d\n",
			first, next, base);
	    else
		xmlGenericError(xmlGenericErrorContext,
			"HPP: lookup '%c%c%c' found at %d\n",
			first, next, third, base);
#endif
	    /* convert the buffer offset to an offset from in->cur */
	    return(base - (in->cur - in->base));
	}
    }
    /* not found: remember how far the scan went */
    ctxt->checkIndex = base;
#ifdef DEBUG_PUSH
    if (next == 0)
	xmlGenericError(xmlGenericErrorContext,
		"HPP: lookup '%c' failed\n", first);
    else if (third == 0)
	xmlGenericError(xmlGenericErrorContext,
		"HPP: lookup '%c%c' failed\n", first, next);
    else
	xmlGenericError(xmlGenericErrorContext,
		"HPP: lookup '%c%c%c' failed\n", first, next, third);
#endif
    return(-1);
}
4491
4492/**
4493 * htmlParseTryOrFinish:
4494 * @ctxt:  an HTML parser context
4495 * @terminate:  last chunk indicator
4496 *
4497 * Try to progress on parsing
4498 *
4499 * Returns zero if no parsing was possible
4500 */
4501static int
4502htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4503    int ret = 0;
4504    htmlParserInputPtr in;
4505    int avail = 0;
4506    xmlChar cur, next;
4507
4508#ifdef DEBUG_PUSH
4509    switch (ctxt->instate) {
4510	case XML_PARSER_EOF:
4511	    xmlGenericError(xmlGenericErrorContext,
4512		    "HPP: try EOF\n"); break;
4513	case XML_PARSER_START:
4514	    xmlGenericError(xmlGenericErrorContext,
4515		    "HPP: try START\n"); break;
4516	case XML_PARSER_MISC:
4517	    xmlGenericError(xmlGenericErrorContext,
4518		    "HPP: try MISC\n");break;
4519	case XML_PARSER_COMMENT:
4520	    xmlGenericError(xmlGenericErrorContext,
4521		    "HPP: try COMMENT\n");break;
4522	case XML_PARSER_PROLOG:
4523	    xmlGenericError(xmlGenericErrorContext,
4524		    "HPP: try PROLOG\n");break;
4525	case XML_PARSER_START_TAG:
4526	    xmlGenericError(xmlGenericErrorContext,
4527		    "HPP: try START_TAG\n");break;
4528	case XML_PARSER_CONTENT:
4529	    xmlGenericError(xmlGenericErrorContext,
4530		    "HPP: try CONTENT\n");break;
4531	case XML_PARSER_CDATA_SECTION:
4532	    xmlGenericError(xmlGenericErrorContext,
4533		    "HPP: try CDATA_SECTION\n");break;
4534	case XML_PARSER_END_TAG:
4535	    xmlGenericError(xmlGenericErrorContext,
4536		    "HPP: try END_TAG\n");break;
4537	case XML_PARSER_ENTITY_DECL:
4538	    xmlGenericError(xmlGenericErrorContext,
4539		    "HPP: try ENTITY_DECL\n");break;
4540	case XML_PARSER_ENTITY_VALUE:
4541	    xmlGenericError(xmlGenericErrorContext,
4542		    "HPP: try ENTITY_VALUE\n");break;
4543	case XML_PARSER_ATTRIBUTE_VALUE:
4544	    xmlGenericError(xmlGenericErrorContext,
4545		    "HPP: try ATTRIBUTE_VALUE\n");break;
4546	case XML_PARSER_DTD:
4547	    xmlGenericError(xmlGenericErrorContext,
4548		    "HPP: try DTD\n");break;
4549	case XML_PARSER_EPILOG:
4550	    xmlGenericError(xmlGenericErrorContext,
4551		    "HPP: try EPILOG\n");break;
4552	case XML_PARSER_PI:
4553	    xmlGenericError(xmlGenericErrorContext,
4554		    "HPP: try PI\n");break;
4555	case XML_PARSER_SYSTEM_LITERAL:
4556	    xmlGenericError(xmlGenericErrorContext,
4557		    "HPP: try SYSTEM_LITERAL\n");break;
4558    }
4559#endif
4560
4561    while (1) {
4562
4563	in = ctxt->input;
4564	if (in == NULL) break;
4565	if (in->buf == NULL)
4566	    avail = in->length - (in->cur - in->base);
4567	else
4568	    avail = in->buf->buffer->use - (in->cur - in->base);
4569	if ((avail == 0) && (terminate)) {
4570	    htmlAutoCloseOnEnd(ctxt);
4571	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4572		/*
4573		 * SAX: end of the document processing.
4574		 */
4575		ctxt->instate = XML_PARSER_EOF;
4576		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4577		    ctxt->sax->endDocument(ctxt->userData);
4578	    }
4579	}
4580        if (avail < 1)
4581	    goto done;
4582	cur = in->cur[0];
4583	if (cur == 0) {
4584	    SKIP(1);
4585	    continue;
4586	}
4587
4588        switch (ctxt->instate) {
4589            case XML_PARSER_EOF:
4590	        /*
4591		 * Document parsing is done !
4592		 */
4593	        goto done;
4594            case XML_PARSER_START:
4595	        /*
4596		 * Very first chars read from the document flow.
4597		 */
4598		cur = in->cur[0];
4599		if (IS_BLANK_CH(cur)) {
4600		    SKIP_BLANKS;
4601		    if (in->buf == NULL)
4602			avail = in->length - (in->cur - in->base);
4603		    else
4604			avail = in->buf->buffer->use - (in->cur - in->base);
4605		}
4606		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4607		    ctxt->sax->setDocumentLocator(ctxt->userData,
4608						  &xmlDefaultSAXLocator);
4609		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4610	            (!ctxt->disableSAX))
4611		    ctxt->sax->startDocument(ctxt->userData);
4612
4613		cur = in->cur[0];
4614		next = in->cur[1];
4615		if ((cur == '<') && (next == '!') &&
4616		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4617		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4618		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4619		    (UPP(8) == 'E')) {
4620		    if ((!terminate) &&
4621		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4622			goto done;
4623#ifdef DEBUG_PUSH
4624		    xmlGenericError(xmlGenericErrorContext,
4625			    "HPP: Parsing internal subset\n");
4626#endif
4627		    htmlParseDocTypeDecl(ctxt);
4628		    ctxt->instate = XML_PARSER_PROLOG;
4629#ifdef DEBUG_PUSH
4630		    xmlGenericError(xmlGenericErrorContext,
4631			    "HPP: entering PROLOG\n");
4632#endif
4633                } else {
4634		    ctxt->instate = XML_PARSER_MISC;
4635#ifdef DEBUG_PUSH
4636		    xmlGenericError(xmlGenericErrorContext,
4637			    "HPP: entering MISC\n");
4638#endif
4639		}
4640		break;
4641            case XML_PARSER_MISC:
4642		SKIP_BLANKS;
4643		if (in->buf == NULL)
4644		    avail = in->length - (in->cur - in->base);
4645		else
4646		    avail = in->buf->buffer->use - (in->cur - in->base);
4647		if (avail < 2)
4648		    goto done;
4649		cur = in->cur[0];
4650		next = in->cur[1];
4651	        if ((cur == '<') && (next == '!') &&
4652		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4653		    if ((!terminate) &&
4654		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4655			goto done;
4656#ifdef DEBUG_PUSH
4657		    xmlGenericError(xmlGenericErrorContext,
4658			    "HPP: Parsing Comment\n");
4659#endif
4660		    htmlParseComment(ctxt);
4661		    ctxt->instate = XML_PARSER_MISC;
4662	        } else if ((cur == '<') && (next == '?')) {
4663		    if ((!terminate) &&
4664		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4665			goto done;
4666#ifdef DEBUG_PUSH
4667		    xmlGenericError(xmlGenericErrorContext,
4668			    "HPP: Parsing PI\n");
4669#endif
4670		    htmlParsePI(ctxt);
4671		    ctxt->instate = XML_PARSER_MISC;
4672		} else if ((cur == '<') && (next == '!') &&
4673		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4674		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4675		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4676		    (UPP(8) == 'E')) {
4677		    if ((!terminate) &&
4678		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4679			goto done;
4680#ifdef DEBUG_PUSH
4681		    xmlGenericError(xmlGenericErrorContext,
4682			    "HPP: Parsing internal subset\n");
4683#endif
4684		    htmlParseDocTypeDecl(ctxt);
4685		    ctxt->instate = XML_PARSER_PROLOG;
4686#ifdef DEBUG_PUSH
4687		    xmlGenericError(xmlGenericErrorContext,
4688			    "HPP: entering PROLOG\n");
4689#endif
4690		} else if ((cur == '<') && (next == '!') &&
4691		           (avail < 9)) {
4692		    goto done;
4693		} else {
4694		    ctxt->instate = XML_PARSER_START_TAG;
4695#ifdef DEBUG_PUSH
4696		    xmlGenericError(xmlGenericErrorContext,
4697			    "HPP: entering START_TAG\n");
4698#endif
4699		}
4700		break;
4701            case XML_PARSER_PROLOG:
4702		SKIP_BLANKS;
4703		if (in->buf == NULL)
4704		    avail = in->length - (in->cur - in->base);
4705		else
4706		    avail = in->buf->buffer->use - (in->cur - in->base);
4707		if (avail < 2)
4708		    goto done;
4709		cur = in->cur[0];
4710		next = in->cur[1];
4711		if ((cur == '<') && (next == '!') &&
4712		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4713		    if ((!terminate) &&
4714		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4715			goto done;
4716#ifdef DEBUG_PUSH
4717		    xmlGenericError(xmlGenericErrorContext,
4718			    "HPP: Parsing Comment\n");
4719#endif
4720		    htmlParseComment(ctxt);
4721		    ctxt->instate = XML_PARSER_PROLOG;
4722	        } else if ((cur == '<') && (next == '?')) {
4723		    if ((!terminate) &&
4724		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4725			goto done;
4726#ifdef DEBUG_PUSH
4727		    xmlGenericError(xmlGenericErrorContext,
4728			    "HPP: Parsing PI\n");
4729#endif
4730		    htmlParsePI(ctxt);
4731		    ctxt->instate = XML_PARSER_PROLOG;
4732		} else if ((cur == '<') && (next == '!') &&
4733		           (avail < 4)) {
4734		    goto done;
4735		} else {
4736		    ctxt->instate = XML_PARSER_START_TAG;
4737#ifdef DEBUG_PUSH
4738		    xmlGenericError(xmlGenericErrorContext,
4739			    "HPP: entering START_TAG\n");
4740#endif
4741		}
4742		break;
4743            case XML_PARSER_EPILOG:
4744		if (in->buf == NULL)
4745		    avail = in->length - (in->cur - in->base);
4746		else
4747		    avail = in->buf->buffer->use - (in->cur - in->base);
4748		if (avail < 1)
4749		    goto done;
4750		cur = in->cur[0];
4751		if (IS_BLANK_CH(cur)) {
4752		    htmlParseCharData(ctxt);
4753		    goto done;
4754		}
4755		if (avail < 2)
4756		    goto done;
4757		next = in->cur[1];
4758	        if ((cur == '<') && (next == '!') &&
4759		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4760		    if ((!terminate) &&
4761		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4762			goto done;
4763#ifdef DEBUG_PUSH
4764		    xmlGenericError(xmlGenericErrorContext,
4765			    "HPP: Parsing Comment\n");
4766#endif
4767		    htmlParseComment(ctxt);
4768		    ctxt->instate = XML_PARSER_EPILOG;
4769	        } else if ((cur == '<') && (next == '?')) {
4770		    if ((!terminate) &&
4771		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4772			goto done;
4773#ifdef DEBUG_PUSH
4774		    xmlGenericError(xmlGenericErrorContext,
4775			    "HPP: Parsing PI\n");
4776#endif
4777		    htmlParsePI(ctxt);
4778		    ctxt->instate = XML_PARSER_EPILOG;
4779		} else if ((cur == '<') && (next == '!') &&
4780		           (avail < 4)) {
4781		    goto done;
4782		} else {
4783		    ctxt->errNo = XML_ERR_DOCUMENT_END;
4784		    ctxt->wellFormed = 0;
4785		    ctxt->instate = XML_PARSER_EOF;
4786#ifdef DEBUG_PUSH
4787		    xmlGenericError(xmlGenericErrorContext,
4788			    "HPP: entering EOF\n");
4789#endif
4790		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4791			ctxt->sax->endDocument(ctxt->userData);
4792		    goto done;
4793		}
4794		break;
4795            case XML_PARSER_START_TAG: {
4796	        const xmlChar *name;
4797		int failed;
4798		const htmlElemDesc * info;
4799
4800		if (avail < 2)
4801		    goto done;
4802		cur = in->cur[0];
4803	        if (cur != '<') {
4804		    ctxt->instate = XML_PARSER_CONTENT;
4805#ifdef DEBUG_PUSH
4806		    xmlGenericError(xmlGenericErrorContext,
4807			    "HPP: entering CONTENT\n");
4808#endif
4809		    break;
4810		}
4811		if (in->cur[1] == '/') {
4812		    ctxt->instate = XML_PARSER_END_TAG;
4813		    ctxt->checkIndex = 0;
4814#ifdef DEBUG_PUSH
4815		    xmlGenericError(xmlGenericErrorContext,
4816			    "HPP: entering END_TAG\n");
4817#endif
4818		    break;
4819		}
4820		if ((!terminate) &&
4821		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4822		    goto done;
4823
4824		failed = htmlParseStartTag(ctxt);
4825		name = ctxt->name;
4826		if (failed ||
4827		    (name == NULL)) {
4828		    if (CUR == '>')
4829			NEXT;
4830		    break;
4831		}
4832
4833		/*
4834		 * Lookup the info for that element.
4835		 */
4836		info = htmlTagLookup(name);
4837		if (info == NULL) {
4838		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4839		                 "Tag %s invalid\n", name, NULL);
4840		}
4841
4842		/*
4843		 * Check for an Empty Element labeled the XML/SGML way
4844		 */
4845		if ((CUR == '/') && (NXT(1) == '>')) {
4846		    SKIP(2);
4847		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4848			ctxt->sax->endElement(ctxt->userData, name);
4849		    htmlnamePop(ctxt);
4850		    ctxt->instate = XML_PARSER_CONTENT;
4851#ifdef DEBUG_PUSH
4852		    xmlGenericError(xmlGenericErrorContext,
4853			    "HPP: entering CONTENT\n");
4854#endif
4855		    break;
4856		}
4857
4858		if (CUR == '>') {
4859		    NEXT;
4860		} else {
4861		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4862		                 "Couldn't find end of Start Tag %s\n",
4863				 name, NULL);
4864
4865		    /*
4866		     * end of parsing of this node.
4867		     */
4868		    if (xmlStrEqual(name, ctxt->name)) {
4869			nodePop(ctxt);
4870			htmlnamePop(ctxt);
4871		    }
4872
4873		    ctxt->instate = XML_PARSER_CONTENT;
4874#ifdef DEBUG_PUSH
4875		    xmlGenericError(xmlGenericErrorContext,
4876			    "HPP: entering CONTENT\n");
4877#endif
4878		    break;
4879		}
4880
4881		/*
4882		 * Check for an Empty Element from DTD definition
4883		 */
4884		if ((info != NULL) && (info->empty)) {
4885		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4886			ctxt->sax->endElement(ctxt->userData, name);
4887		    htmlnamePop(ctxt);
4888		}
4889		ctxt->instate = XML_PARSER_CONTENT;
4890#ifdef DEBUG_PUSH
4891		xmlGenericError(xmlGenericErrorContext,
4892			"HPP: entering CONTENT\n");
4893#endif
4894                break;
4895	    }
4896            case XML_PARSER_CONTENT: {
4897		long cons;
4898                /*
4899		 * Handle preparsed entities and charRef
4900		 */
4901		if (ctxt->token != 0) {
4902		    xmlChar chr[2] = { 0 , 0 } ;
4903
4904		    chr[0] = (xmlChar) ctxt->token;
4905		    htmlCheckParagraph(ctxt);
4906		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4907			ctxt->sax->characters(ctxt->userData, chr, 1);
4908		    ctxt->token = 0;
4909		    ctxt->checkIndex = 0;
4910		}
4911		if ((avail == 1) && (terminate)) {
4912		    cur = in->cur[0];
4913		    if ((cur != '<') && (cur != '&')) {
4914			if (ctxt->sax != NULL) {
4915			    if (IS_BLANK_CH(cur)) {
4916				if (ctxt->sax->ignorableWhitespace != NULL)
4917				    ctxt->sax->ignorableWhitespace(
4918					    ctxt->userData, &cur, 1);
4919			    } else {
4920				htmlCheckParagraph(ctxt);
4921				if (ctxt->sax->characters != NULL)
4922				    ctxt->sax->characters(
4923					    ctxt->userData, &cur, 1);
4924			    }
4925			}
4926			ctxt->token = 0;
4927			ctxt->checkIndex = 0;
4928			in->cur++;
4929			break;
4930		    }
4931		}
4932		if (avail < 2)
4933		    goto done;
4934		cur = in->cur[0];
4935		next = in->cur[1];
4936		cons = ctxt->nbChars;
4937		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4938		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4939		    /*
4940		     * Handle SCRIPT/STYLE separately
4941		     */
4942		    if ((!terminate) &&
4943		        (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
4944			goto done;
4945		    htmlParseScript(ctxt);
4946		    if ((cur == '<') && (next == '/')) {
4947			ctxt->instate = XML_PARSER_END_TAG;
4948			ctxt->checkIndex = 0;
4949#ifdef DEBUG_PUSH
4950			xmlGenericError(xmlGenericErrorContext,
4951				"HPP: entering END_TAG\n");
4952#endif
4953			break;
4954		    }
4955		} else {
4956		    /*
4957		     * Sometimes DOCTYPE arrives in the middle of the document
4958		     */
4959		    if ((cur == '<') && (next == '!') &&
4960			(UPP(2) == 'D') && (UPP(3) == 'O') &&
4961			(UPP(4) == 'C') && (UPP(5) == 'T') &&
4962			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4963			(UPP(8) == 'E')) {
4964			if ((!terminate) &&
4965			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4966			    goto done;
4967			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4968			             "Misplaced DOCTYPE declaration\n",
4969				     BAD_CAST "DOCTYPE" , NULL);
4970			htmlParseDocTypeDecl(ctxt);
4971		    } else if ((cur == '<') && (next == '!') &&
4972			(in->cur[2] == '-') && (in->cur[3] == '-')) {
4973			if ((!terminate) &&
4974			    (htmlParseLookupSequence(
4975			    		ctxt, '-', '-', '>', 1) < 0))
4976			    goto done;
4977#ifdef DEBUG_PUSH
4978			xmlGenericError(xmlGenericErrorContext,
4979				"HPP: Parsing Comment\n");
4980#endif
4981			htmlParseComment(ctxt);
4982			ctxt->instate = XML_PARSER_CONTENT;
4983		    } else if ((cur == '<') && (next == '?')) {
4984			if ((!terminate) &&
4985			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4986			    goto done;
4987#ifdef DEBUG_PUSH
4988			xmlGenericError(xmlGenericErrorContext,
4989				"HPP: Parsing PI\n");
4990#endif
4991			htmlParsePI(ctxt);
4992			ctxt->instate = XML_PARSER_CONTENT;
4993		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4994			goto done;
4995		    } else if ((cur == '<') && (next == '/')) {
4996			ctxt->instate = XML_PARSER_END_TAG;
4997			ctxt->checkIndex = 0;
4998#ifdef DEBUG_PUSH
4999			xmlGenericError(xmlGenericErrorContext,
5000				"HPP: entering END_TAG\n");
5001#endif
5002			break;
5003		    } else if (cur == '<') {
5004			ctxt->instate = XML_PARSER_START_TAG;
5005			ctxt->checkIndex = 0;
5006#ifdef DEBUG_PUSH
5007			xmlGenericError(xmlGenericErrorContext,
5008				"HPP: entering START_TAG\n");
5009#endif
5010			break;
5011		    } else if (cur == '&') {
5012			if ((!terminate) &&
5013			    (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
5014			    goto done;
5015#ifdef DEBUG_PUSH
5016			xmlGenericError(xmlGenericErrorContext,
5017				"HPP: Parsing Reference\n");
5018#endif
5019			/* TODO: check generation of subtrees if noent !!! */
5020			htmlParseReference(ctxt);
5021		    } else {
5022		        /*
5023			 * check that the text sequence is complete
5024			 * before handing out the data to the parser
5025			 * to avoid problems with erroneous end of
5026			 * data detection.
5027			 */
5028			if ((!terminate) &&
5029			    (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5030			    goto done;
5031			ctxt->checkIndex = 0;
5032#ifdef DEBUG_PUSH
5033			xmlGenericError(xmlGenericErrorContext,
5034				"HPP: Parsing char data\n");
5035#endif
5036			htmlParseCharData(ctxt);
5037		    }
5038		}
5039		if (cons == ctxt->nbChars) {
5040		    if (ctxt->node != NULL) {
5041			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5042			             "detected an error in element content\n",
5043				     NULL, NULL);
5044		    }
5045		    NEXT;
5046		    break;
5047		}
5048
5049		break;
5050	    }
5051            case XML_PARSER_END_TAG:
5052		if (avail < 2)
5053		    goto done;
5054		if ((!terminate) &&
5055		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5056		    goto done;
5057		htmlParseEndTag(ctxt);
5058		if (ctxt->nameNr == 0) {
5059		    ctxt->instate = XML_PARSER_EPILOG;
5060		} else {
5061		    ctxt->instate = XML_PARSER_CONTENT;
5062		}
5063		ctxt->checkIndex = 0;
5064#ifdef DEBUG_PUSH
5065		xmlGenericError(xmlGenericErrorContext,
5066			"HPP: entering CONTENT\n");
5067#endif
5068	        break;
5069            case XML_PARSER_CDATA_SECTION:
5070		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5071			"HPP: internal error, state == CDATA\n",
5072			     NULL, NULL);
5073		ctxt->instate = XML_PARSER_CONTENT;
5074		ctxt->checkIndex = 0;
5075#ifdef DEBUG_PUSH
5076		xmlGenericError(xmlGenericErrorContext,
5077			"HPP: entering CONTENT\n");
5078#endif
5079		break;
5080            case XML_PARSER_DTD:
5081		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5082			"HPP: internal error, state == DTD\n",
5083			     NULL, NULL);
5084		ctxt->instate = XML_PARSER_CONTENT;
5085		ctxt->checkIndex = 0;
5086#ifdef DEBUG_PUSH
5087		xmlGenericError(xmlGenericErrorContext,
5088			"HPP: entering CONTENT\n");
5089#endif
5090		break;
5091            case XML_PARSER_COMMENT:
5092		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5093			"HPP: internal error, state == COMMENT\n",
5094			     NULL, NULL);
5095		ctxt->instate = XML_PARSER_CONTENT;
5096		ctxt->checkIndex = 0;
5097#ifdef DEBUG_PUSH
5098		xmlGenericError(xmlGenericErrorContext,
5099			"HPP: entering CONTENT\n");
5100#endif
5101		break;
5102            case XML_PARSER_PI:
5103		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5104			"HPP: internal error, state == PI\n",
5105			     NULL, NULL);
5106		ctxt->instate = XML_PARSER_CONTENT;
5107		ctxt->checkIndex = 0;
5108#ifdef DEBUG_PUSH
5109		xmlGenericError(xmlGenericErrorContext,
5110			"HPP: entering CONTENT\n");
5111#endif
5112		break;
5113            case XML_PARSER_ENTITY_DECL:
5114		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5115			"HPP: internal error, state == ENTITY_DECL\n",
5116			     NULL, NULL);
5117		ctxt->instate = XML_PARSER_CONTENT;
5118		ctxt->checkIndex = 0;
5119#ifdef DEBUG_PUSH
5120		xmlGenericError(xmlGenericErrorContext,
5121			"HPP: entering CONTENT\n");
5122#endif
5123		break;
5124            case XML_PARSER_ENTITY_VALUE:
5125		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5126			"HPP: internal error, state == ENTITY_VALUE\n",
5127			     NULL, NULL);
5128		ctxt->instate = XML_PARSER_CONTENT;
5129		ctxt->checkIndex = 0;
5130#ifdef DEBUG_PUSH
5131		xmlGenericError(xmlGenericErrorContext,
5132			"HPP: entering DTD\n");
5133#endif
5134		break;
5135            case XML_PARSER_ATTRIBUTE_VALUE:
5136		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5137			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5138			     NULL, NULL);
5139		ctxt->instate = XML_PARSER_START_TAG;
5140		ctxt->checkIndex = 0;
5141#ifdef DEBUG_PUSH
5142		xmlGenericError(xmlGenericErrorContext,
5143			"HPP: entering START_TAG\n");
5144#endif
5145		break;
5146	    case XML_PARSER_SYSTEM_LITERAL:
5147		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5148		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5149			     NULL, NULL);
5150		ctxt->instate = XML_PARSER_CONTENT;
5151		ctxt->checkIndex = 0;
5152#ifdef DEBUG_PUSH
5153		xmlGenericError(xmlGenericErrorContext,
5154			"HPP: entering CONTENT\n");
5155#endif
5156		break;
5157	    case XML_PARSER_IGNORE:
5158		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5159			"HPP: internal error, state == XML_PARSER_IGNORE\n",
5160			     NULL, NULL);
5161		ctxt->instate = XML_PARSER_CONTENT;
5162		ctxt->checkIndex = 0;
5163#ifdef DEBUG_PUSH
5164		xmlGenericError(xmlGenericErrorContext,
5165			"HPP: entering CONTENT\n");
5166#endif
5167		break;
5168	    case XML_PARSER_PUBLIC_LITERAL:
5169		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5170			"HPP: internal error, state == XML_PARSER_LITERAL\n",
5171			     NULL, NULL);
5172		ctxt->instate = XML_PARSER_CONTENT;
5173		ctxt->checkIndex = 0;
5174#ifdef DEBUG_PUSH
5175		xmlGenericError(xmlGenericErrorContext,
5176			"HPP: entering CONTENT\n");
5177#endif
5178		break;
5179
5180	}
5181    }
5182done:
5183    if ((avail == 0) && (terminate)) {
5184	htmlAutoCloseOnEnd(ctxt);
5185	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5186	    /*
5187	     * SAX: end of the document processing.
5188	     */
5189	    ctxt->instate = XML_PARSER_EOF;
5190	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5191		ctxt->sax->endDocument(ctxt->userData);
5192	}
5193    }
5194    if ((ctxt->myDoc != NULL) &&
5195	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5196	 (ctxt->instate == XML_PARSER_EPILOG))) {
5197	xmlDtdPtr dtd;
5198	dtd = xmlGetIntSubset(ctxt->myDoc);
5199	if (dtd == NULL)
5200	    ctxt->myDoc->intSubset =
5201		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5202		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5203		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5204    }
5205#ifdef DEBUG_PUSH
5206    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5207#endif
5208    return(ret);
5209}
5210
5211/**
5212 * htmlParseChunk:
5213 * @ctxt:  an HTML parser context
5214 * @chunk:  an char array
5215 * @size:  the size in byte of the chunk
5216 * @terminate:  last chunk indicator
5217 *
5218 * Parse a Chunk of memory
5219 *
5220 * Returns zero if no error, the xmlParserErrors otherwise.
5221 */
5222int
5223htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5224              int terminate) {
5225    if ((ctxt == NULL) || (ctxt->input == NULL)) {
5226	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5227		     "htmlParseChunk: context error\n", NULL, NULL);
5228	return(XML_ERR_INTERNAL_ERROR);
5229    }
5230    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5231        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5232	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5233	int cur = ctxt->input->cur - ctxt->input->base;
5234	int res;
5235
5236	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5237	if (res < 0) {
5238	    ctxt->errNo = XML_PARSER_EOF;
5239	    ctxt->disableSAX = 1;
5240	    return (XML_PARSER_EOF);
5241	}
5242	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5243	ctxt->input->cur = ctxt->input->base + cur;
5244	ctxt->input->end =
5245	  &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5246#ifdef DEBUG_PUSH
5247	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5248#endif
5249
5250#if 0
5251	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5252	    htmlParseTryOrFinish(ctxt, terminate);
5253#endif
5254    } else if (ctxt->instate != XML_PARSER_EOF) {
5255	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5256	    xmlParserInputBufferPtr in = ctxt->input->buf;
5257	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
5258		    (in->raw != NULL)) {
5259		int nbchars;
5260
5261		nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5262		if (nbchars < 0) {
5263		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5264			         "encoder error\n", NULL, NULL);
5265		    return(XML_ERR_INVALID_ENCODING);
5266		}
5267	    }
5268	}
5269    }
5270    htmlParseTryOrFinish(ctxt, terminate);
5271    if (terminate) {
5272	if ((ctxt->instate != XML_PARSER_EOF) &&
5273	    (ctxt->instate != XML_PARSER_EPILOG) &&
5274	    (ctxt->instate != XML_PARSER_MISC)) {
5275	    ctxt->errNo = XML_ERR_DOCUMENT_END;
5276	    ctxt->wellFormed = 0;
5277	}
5278	if (ctxt->instate != XML_PARSER_EOF) {
5279	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5280		ctxt->sax->endDocument(ctxt->userData);
5281	}
5282	ctxt->instate = XML_PARSER_EOF;
5283    }
5284    return((xmlParserErrors) ctxt->errNo);
5285}
5286
5287/************************************************************************
5288 *									*
5289 *			User entry points				*
5290 *									*
5291 ************************************************************************/
5292
5293/**
5294 * htmlCreatePushParserCtxt:
5295 * @sax:  a SAX handler
5296 * @user_data:  The user data returned on SAX callbacks
5297 * @chunk:  a pointer to an array of chars
5298 * @size:  number of chars in the array
5299 * @filename:  an optional file name or URI
5300 * @enc:  an optional encoding
5301 *
5302 * Create a parser context for using the HTML parser in push mode
5303 * The value of @filename is used for fetching external entities
5304 * and error/warning reports.
5305 *
5306 * Returns the new parser context or NULL
5307 */
5308htmlParserCtxtPtr
5309htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5310                         const char *chunk, int size, const char *filename,
5311			 xmlCharEncoding enc) {
5312    htmlParserCtxtPtr ctxt;
5313    htmlParserInputPtr inputStream;
5314    xmlParserInputBufferPtr buf;
5315
5316    xmlInitParser();
5317
5318    buf = xmlAllocParserInputBuffer(enc);
5319    if (buf == NULL) return(NULL);
5320
5321    ctxt = htmlNewParserCtxt();
5322    if (ctxt == NULL) {
5323	xmlFreeParserInputBuffer(buf);
5324	return(NULL);
5325    }
5326    if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5327	ctxt->charset=XML_CHAR_ENCODING_UTF8;
5328    if (sax != NULL) {
5329	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5330	    xmlFree(ctxt->sax);
5331	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5332	if (ctxt->sax == NULL) {
5333	    xmlFree(buf);
5334	    xmlFree(ctxt);
5335	    return(NULL);
5336	}
5337	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5338	if (user_data != NULL)
5339	    ctxt->userData = user_data;
5340    }
5341    if (filename == NULL) {
5342	ctxt->directory = NULL;
5343    } else {
5344        ctxt->directory = xmlParserGetDirectory(filename);
5345    }
5346
5347    inputStream = htmlNewInputStream(ctxt);
5348    if (inputStream == NULL) {
5349	xmlFreeParserCtxt(ctxt);
5350	xmlFree(buf);
5351	return(NULL);
5352    }
5353
5354    if (filename == NULL)
5355	inputStream->filename = NULL;
5356    else
5357	inputStream->filename = (char *)
5358	    xmlCanonicPath((const xmlChar *) filename);
5359    inputStream->buf = buf;
5360    inputStream->base = inputStream->buf->buffer->content;
5361    inputStream->cur = inputStream->buf->buffer->content;
5362    inputStream->end =
5363	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5364
5365    inputPush(ctxt, inputStream);
5366
5367    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5368        (ctxt->input->buf != NULL))  {
5369	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5370	int cur = ctxt->input->cur - ctxt->input->base;
5371
5372	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5373
5374	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5375	ctxt->input->cur = ctxt->input->base + cur;
5376	ctxt->input->end =
5377	    &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5378#ifdef DEBUG_PUSH
5379	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5380#endif
5381    }
5382
5383    return(ctxt);
5384}
5385#endif /* LIBXML_PUSH_ENABLED */
5386
5387/**
5388 * htmlSAXParseDoc:
5389 * @cur:  a pointer to an array of xmlChar
5390 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5391 * @sax:  the SAX handler block
5392 * @userData: if using SAX, this pointer will be provided on callbacks.
5393 *
5394 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5395 * to handle parse events. If sax is NULL, fallback to the default DOM
5396 * behavior and return a tree.
5397 *
5398 * Returns the resulting document tree unless SAX is NULL or the document is
5399 *     not well formed.
5400 */
5401
5402htmlDocPtr
5403htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5404    htmlDocPtr ret;
5405    htmlParserCtxtPtr ctxt;
5406
5407    xmlInitParser();
5408
5409    if (cur == NULL) return(NULL);
5410
5411
5412    ctxt = htmlCreateDocParserCtxt(cur, encoding);
5413    if (ctxt == NULL) return(NULL);
5414    if (sax != NULL) {
5415        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5416        ctxt->sax = sax;
5417        ctxt->userData = userData;
5418    }
5419
5420    htmlParseDocument(ctxt);
5421    ret = ctxt->myDoc;
5422    if (sax != NULL) {
5423	ctxt->sax = NULL;
5424	ctxt->userData = NULL;
5425    }
5426    htmlFreeParserCtxt(ctxt);
5427
5428    return(ret);
5429}
5430
5431/**
5432 * htmlParseDoc:
5433 * @cur:  a pointer to an array of xmlChar
5434 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5435 *
5436 * parse an HTML in-memory document and build a tree.
5437 *
5438 * Returns the resulting document tree
5439 */
5440
5441htmlDocPtr
5442htmlParseDoc(xmlChar *cur, const char *encoding) {
5443    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5444}
5445
5446
5447/**
5448 * htmlCreateFileParserCtxt:
5449 * @filename:  the filename
5450 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5451 *
5452 * Create a parser context for a file content.
5453 * Automatic support for ZLIB/Compress compressed document is provided
5454 * by default if found at compile-time.
5455 *
5456 * Returns the new parser context or NULL
5457 */
5458htmlParserCtxtPtr
5459htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5460{
5461    htmlParserCtxtPtr ctxt;
5462    htmlParserInputPtr inputStream;
5463    char *canonicFilename;
5464    /* htmlCharEncoding enc; */
5465    xmlChar *content, *content_line = (xmlChar *) "charset=";
5466
5467    if (filename == NULL)
5468        return(NULL);
5469
5470    ctxt = htmlNewParserCtxt();
5471    if (ctxt == NULL) {
5472	return(NULL);
5473    }
5474    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5475    if (canonicFilename == NULL) {
5476#ifdef LIBXML_SAX1_ENABLED
5477	if (xmlDefaultSAXHandler.error != NULL) {
5478	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5479	}
5480#endif
5481	xmlFreeParserCtxt(ctxt);
5482	return(NULL);
5483    }
5484
5485    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5486    xmlFree(canonicFilename);
5487    if (inputStream == NULL) {
5488	xmlFreeParserCtxt(ctxt);
5489	return(NULL);
5490    }
5491
5492    inputPush(ctxt, inputStream);
5493
5494    /* set encoding */
5495    if (encoding) {
5496        content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5497	if (content) {
5498	    strcpy ((char *)content, (char *)content_line);
5499            strcat ((char *)content, (char *)encoding);
5500            htmlCheckEncoding (ctxt, content);
5501	    xmlFree (content);
5502	}
5503    }
5504
5505    return(ctxt);
5506}
5507
5508/**
5509 * htmlSAXParseFile:
5510 * @filename:  the filename
5511 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5512 * @sax:  the SAX handler block
5513 * @userData: if using SAX, this pointer will be provided on callbacks.
5514 *
5515 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5516 * compressed document is provided by default if found at compile-time.
5517 * It use the given SAX function block to handle the parsing callback.
5518 * If sax is NULL, fallback to the default DOM tree building routines.
5519 *
5520 * Returns the resulting document tree unless SAX is NULL or the document is
5521 *     not well formed.
5522 */
5523
5524htmlDocPtr
5525htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5526                 void *userData) {
5527    htmlDocPtr ret;
5528    htmlParserCtxtPtr ctxt;
5529    htmlSAXHandlerPtr oldsax = NULL;
5530
5531    xmlInitParser();
5532
5533    ctxt = htmlCreateFileParserCtxt(filename, encoding);
5534    if (ctxt == NULL) return(NULL);
5535    if (sax != NULL) {
5536	oldsax = ctxt->sax;
5537        ctxt->sax = sax;
5538        ctxt->userData = userData;
5539    }
5540
5541    htmlParseDocument(ctxt);
5542
5543    ret = ctxt->myDoc;
5544    if (sax != NULL) {
5545        ctxt->sax = oldsax;
5546        ctxt->userData = NULL;
5547    }
5548    htmlFreeParserCtxt(ctxt);
5549
5550    return(ret);
5551}
5552
5553/**
5554 * htmlParseFile:
5555 * @filename:  the filename
5556 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5557 *
5558 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5559 * compressed document is provided by default if found at compile-time.
5560 *
5561 * Returns the resulting document tree
5562 */
5563
5564htmlDocPtr
5565htmlParseFile(const char *filename, const char *encoding) {
5566    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5567}
5568
5569/**
5570 * htmlHandleOmittedElem:
5571 * @val:  int 0 or 1
5572 *
5573 * Set and return the previous value for handling HTML omitted tags.
5574 *
5575 * Returns the last value for 0 for no handling, 1 for auto insertion.
5576 */
5577
5578int
5579htmlHandleOmittedElem(int val) {
5580    int old = htmlOmittedDefaultValue;
5581
5582    htmlOmittedDefaultValue = val;
5583    return(old);
5584}
5585
5586/**
5587 * htmlElementAllowedHere:
5588 * @parent: HTML parent element
5589 * @elt: HTML element
5590 *
5591 * Checks whether an HTML element may be a direct child of a parent element.
5592 * Note - doesn't check for deprecated elements
5593 *
5594 * Returns 1 if allowed; 0 otherwise.
5595 */
5596int
5597htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5598  const char** p ;
5599
5600  if ( ! elt || ! parent || ! parent->subelts )
5601	return 0 ;
5602
5603  for ( p = parent->subelts; *p; ++p )
5604    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5605      return 1 ;
5606
5607  return 0 ;
5608}
5609/**
5610 * htmlElementStatusHere:
5611 * @parent: HTML parent element
5612 * @elt: HTML element
5613 *
5614 * Checks whether an HTML element may be a direct child of a parent element.
5615 * and if so whether it is valid or deprecated.
5616 *
5617 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5618 */
5619htmlStatus
5620htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5621  if ( ! parent || ! elt )
5622    return HTML_INVALID ;
5623  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5624    return HTML_INVALID ;
5625
5626  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5627}
5628/**
5629 * htmlAttrAllowed:
5630 * @elt: HTML element
5631 * @attr: HTML attribute
5632 * @legacy: whether to allow deprecated attributes
5633 *
5634 * Checks whether an attribute is valid for an element
5635 * Has full knowledge of Required and Deprecated attributes
5636 *
5637 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5638 */
5639htmlStatus
5640htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5641  const char** p ;
5642
5643  if ( !elt || ! attr )
5644	return HTML_INVALID ;
5645
5646  if ( elt->attrs_req )
5647    for ( p = elt->attrs_req; *p; ++p)
5648      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5649        return HTML_REQUIRED ;
5650
5651  if ( elt->attrs_opt )
5652    for ( p = elt->attrs_opt; *p; ++p)
5653      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5654        return HTML_VALID ;
5655
5656  if ( legacy && elt->attrs_depr )
5657    for ( p = elt->attrs_depr; *p; ++p)
5658      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5659        return HTML_DEPRECATED ;
5660
5661  return HTML_INVALID ;
5662}
5663/**
5664 * htmlNodeStatus:
5665 * @node: an htmlNodePtr in a tree
5666 * @legacy: whether to allow deprecated elements (YES is faster here
5667 *	for Element nodes)
5668 *
5669 * Checks whether the tree node is valid.  Experimental (the author
5670 *     only uses the HTML enhancements in a SAX parser)
5671 *
5672 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5673 *	legacy allowed) or htmlElementStatusHere (otherwise).
5674 *	for Attribute nodes, a return from htmlAttrAllowed
5675 *	for other nodes, HTML_NA (no checks performed)
5676 */
5677htmlStatus
5678htmlNodeStatus(const htmlNodePtr node, int legacy) {
5679  if ( ! node )
5680    return HTML_INVALID ;
5681
5682  switch ( node->type ) {
5683    case XML_ELEMENT_NODE:
5684      return legacy
5685	? ( htmlElementAllowedHere (
5686		htmlTagLookup(node->parent->name) , node->name
5687		) ? HTML_VALID : HTML_INVALID )
5688	: htmlElementStatusHere(
5689		htmlTagLookup(node->parent->name) ,
5690		htmlTagLookup(node->name) )
5691	;
5692    case XML_ATTRIBUTE_NODE:
5693      return htmlAttrAllowed(
5694	htmlTagLookup(node->parent->name) , node->name, legacy) ;
5695    default: return HTML_NA ;
5696  }
5697}
5698/************************************************************************
5699 *									*
5700 *	New set (2.6.0) of simpler and more flexible APIs		*
5701 *									*
5702 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string with xmlFree() unless it is NULL or xmlDictOwns() reports
 * it as belonging to the "dict" dictionary of the current scope. A
 * variable named dict must be visible where the macro is expanded; when
 * that variable is NULL every non-NULL string is freed.
 *
 * The macro expands to a bare if statement that already ends with a
 * semicolon, and @str is evaluated more than once.
 */
#define DICT_FREE(str)						\
	if (((str) != NULL) &&					\
	    (((dict) == NULL) ||				\
	     (!xmlDictOwns(dict, (const xmlChar *)(str)))))	\
	    xmlFree((char *)(str));
5714
5715/**
5716 * htmlCtxtReset:
5717 * @ctxt: an HTML parser context
5718 *
5719 * Reset a parser context
5720 */
5721void
5722htmlCtxtReset(htmlParserCtxtPtr ctxt)
5723{
5724    xmlParserInputPtr input;
5725    xmlDictPtr dict;
5726
5727    if (ctxt == NULL)
5728        return;
5729
5730    dict = ctxt->dict;
5731
5732    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5733        xmlFreeInputStream(input);
5734    }
5735    ctxt->inputNr = 0;
5736    ctxt->input = NULL;
5737
5738    ctxt->spaceNr = 0;
5739    if (ctxt->spaceTab != NULL) {
5740	ctxt->spaceTab[0] = -1;
5741	ctxt->space = &ctxt->spaceTab[0];
5742    } else {
5743	ctxt->space = NULL;
5744    }
5745
5746
5747    ctxt->nodeNr = 0;
5748    ctxt->node = NULL;
5749
5750    ctxt->nameNr = 0;
5751    ctxt->name = NULL;
5752
5753    DICT_FREE(ctxt->version);
5754    ctxt->version = NULL;
5755    DICT_FREE(ctxt->encoding);
5756    ctxt->encoding = NULL;
5757    DICT_FREE(ctxt->directory);
5758    ctxt->directory = NULL;
5759    DICT_FREE(ctxt->extSubURI);
5760    ctxt->extSubURI = NULL;
5761    DICT_FREE(ctxt->extSubSystem);
5762    ctxt->extSubSystem = NULL;
5763    if (ctxt->myDoc != NULL)
5764        xmlFreeDoc(ctxt->myDoc);
5765    ctxt->myDoc = NULL;
5766
5767    ctxt->standalone = -1;
5768    ctxt->hasExternalSubset = 0;
5769    ctxt->hasPErefs = 0;
5770    ctxt->html = 1;
5771    ctxt->external = 0;
5772    ctxt->instate = XML_PARSER_START;
5773    ctxt->token = 0;
5774
5775    ctxt->wellFormed = 1;
5776    ctxt->nsWellFormed = 1;
5777    ctxt->valid = 1;
5778    ctxt->vctxt.userData = ctxt;
5779    ctxt->vctxt.error = xmlParserValidityError;
5780    ctxt->vctxt.warning = xmlParserValidityWarning;
5781    ctxt->record_info = 0;
5782    ctxt->nbChars = 0;
5783    ctxt->checkIndex = 0;
5784    ctxt->inSubset = 0;
5785    ctxt->errNo = XML_ERR_OK;
5786    ctxt->depth = 0;
5787    ctxt->charset = XML_CHAR_ENCODING_UTF8;
5788    ctxt->catalogs = NULL;
5789    xmlInitNodeInfoSeq(&ctxt->node_seq);
5790
5791    if (ctxt->attsDefault != NULL) {
5792        xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5793        ctxt->attsDefault = NULL;
5794    }
5795    if (ctxt->attsSpecial != NULL) {
5796        xmlHashFree(ctxt->attsSpecial, NULL);
5797        ctxt->attsSpecial = NULL;
5798    }
5799}
5800
5801/**
5802 * htmlCtxtUseOptions:
5803 * @ctxt: an HTML parser context
5804 * @options:  a combination of htmlParserOption(s)
5805 *
5806 * Applies the options to the parser context
5807 *
5808 * Returns 0 in case of success, the set of unknown or unimplemented options
5809 *         in case of error.
5810 */
5811int
5812htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5813{
5814    if (ctxt == NULL)
5815        return(-1);
5816
5817    if (options & HTML_PARSE_NOWARNING) {
5818        ctxt->sax->warning = NULL;
5819        ctxt->vctxt.warning = NULL;
5820        options -= XML_PARSE_NOWARNING;
5821	ctxt->options |= XML_PARSE_NOWARNING;
5822    }
5823    if (options & HTML_PARSE_NOERROR) {
5824        ctxt->sax->error = NULL;
5825        ctxt->vctxt.error = NULL;
5826        ctxt->sax->fatalError = NULL;
5827        options -= XML_PARSE_NOERROR;
5828	ctxt->options |= XML_PARSE_NOERROR;
5829    }
5830    if (options & HTML_PARSE_PEDANTIC) {
5831        ctxt->pedantic = 1;
5832        options -= XML_PARSE_PEDANTIC;
5833	ctxt->options |= XML_PARSE_PEDANTIC;
5834    } else
5835        ctxt->pedantic = 0;
5836    if (options & XML_PARSE_NOBLANKS) {
5837        ctxt->keepBlanks = 0;
5838        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5839        options -= XML_PARSE_NOBLANKS;
5840	ctxt->options |= XML_PARSE_NOBLANKS;
5841    } else
5842        ctxt->keepBlanks = 1;
5843    if (options & HTML_PARSE_RECOVER) {
5844        ctxt->recovery = 1;
5845    } else
5846        ctxt->recovery = 0;
5847    if (options & HTML_PARSE_COMPACT) {
5848	ctxt->options |= HTML_PARSE_COMPACT;
5849        options -= HTML_PARSE_COMPACT;
5850    }
5851    ctxt->dictNames = 0;
5852    return (options);
5853}
5854
5855/**
5856 * htmlDoRead:
5857 * @ctxt:  an HTML parser context
5858 * @URL:  the base URL to use for the document
5859 * @encoding:  the document encoding, or NULL
5860 * @options:  a combination of htmlParserOption(s)
5861 * @reuse:  keep the context for reuse
5862 *
5863 * Common front-end for the htmlRead functions
5864 *
5865 * Returns the resulting document tree or NULL
5866 */
5867static htmlDocPtr
5868htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5869          int options, int reuse)
5870{
5871    htmlDocPtr ret;
5872
5873    htmlCtxtUseOptions(ctxt, options);
5874    ctxt->html = 1;
5875    if (encoding != NULL) {
5876        xmlCharEncodingHandlerPtr hdlr;
5877
5878	hdlr = xmlFindCharEncodingHandler(encoding);
5879	if (hdlr != NULL)
5880	    xmlSwitchToEncoding(ctxt, hdlr);
5881    }
5882    if ((URL != NULL) && (ctxt->input != NULL) &&
5883        (ctxt->input->filename == NULL))
5884        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5885    htmlParseDocument(ctxt);
5886    ret = ctxt->myDoc;
5887    ctxt->myDoc = NULL;
5888    if (!reuse) {
5889        if ((ctxt->dictNames) &&
5890	    (ret != NULL) &&
5891	    (ret->dict == ctxt->dict))
5892	    ctxt->dict = NULL;
5893	xmlFreeParserCtxt(ctxt);
5894    }
5895    return (ret);
5896}
5897
5898/**
5899 * htmlReadDoc:
5900 * @cur:  a pointer to a zero terminated string
5901 * @URL:  the base URL to use for the document
5902 * @encoding:  the document encoding, or NULL
5903 * @options:  a combination of htmlParserOption(s)
5904 *
5905 * parse an XML in-memory document and build a tree.
5906 *
5907 * Returns the resulting document tree
5908 */
5909htmlDocPtr
5910htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5911{
5912    htmlParserCtxtPtr ctxt;
5913
5914    if (cur == NULL)
5915        return (NULL);
5916
5917    ctxt = xmlCreateDocParserCtxt(cur);
5918    if (ctxt == NULL)
5919        return (NULL);
5920    return (htmlDoRead(ctxt, URL, encoding, options, 0));
5921}
5922
5923/**
5924 * htmlReadFile:
5925 * @filename:  a file or URL
5926 * @encoding:  the document encoding, or NULL
5927 * @options:  a combination of htmlParserOption(s)
5928 *
5929 * parse an XML file from the filesystem or the network.
5930 *
5931 * Returns the resulting document tree
5932 */
5933htmlDocPtr
5934htmlReadFile(const char *filename, const char *encoding, int options)
5935{
5936    htmlParserCtxtPtr ctxt;
5937
5938    ctxt = htmlCreateFileParserCtxt(filename, encoding);
5939    if (ctxt == NULL)
5940        return (NULL);
5941    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5942}
5943
5944/**
5945 * htmlReadMemory:
5946 * @buffer:  a pointer to a char array
5947 * @size:  the size of the array
5948 * @URL:  the base URL to use for the document
5949 * @encoding:  the document encoding, or NULL
5950 * @options:  a combination of htmlParserOption(s)
5951 *
5952 * parse an XML in-memory document and build a tree.
5953 *
5954 * Returns the resulting document tree
5955 */
5956htmlDocPtr
5957htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5958{
5959    htmlParserCtxtPtr ctxt;
5960
5961    ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5962    if (ctxt == NULL)
5963        return (NULL);
5964    if (ctxt->sax != NULL)
5965        memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5966    return (htmlDoRead(ctxt, URL, encoding, options, 0));
5967}
5968
5969/**
5970 * htmlReadFd:
5971 * @fd:  an open file descriptor
5972 * @URL:  the base URL to use for the document
5973 * @encoding:  the document encoding, or NULL
5974 * @options:  a combination of htmlParserOption(s)
5975 *
5976 * parse an XML from a file descriptor and build a tree.
5977 *
5978 * Returns the resulting document tree
5979 */
5980htmlDocPtr
5981htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5982{
5983    htmlParserCtxtPtr ctxt;
5984    xmlParserInputBufferPtr input;
5985    xmlParserInputPtr stream;
5986
5987    if (fd < 0)
5988        return (NULL);
5989
5990    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5991    if (input == NULL)
5992        return (NULL);
5993    ctxt = xmlNewParserCtxt();
5994    if (ctxt == NULL) {
5995        xmlFreeParserInputBuffer(input);
5996        return (NULL);
5997    }
5998    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5999    if (stream == NULL) {
6000        xmlFreeParserInputBuffer(input);
6001	xmlFreeParserCtxt(ctxt);
6002        return (NULL);
6003    }
6004    inputPush(ctxt, stream);
6005    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6006}
6007
6008/**
6009 * htmlReadIO:
6010 * @ioread:  an I/O read function
6011 * @ioclose:  an I/O close function
6012 * @ioctx:  an I/O handler
6013 * @URL:  the base URL to use for the document
6014 * @encoding:  the document encoding, or NULL
6015 * @options:  a combination of htmlParserOption(s)
6016 *
6017 * parse an HTML document from I/O functions and source and build a tree.
6018 *
6019 * Returns the resulting document tree
6020 */
6021htmlDocPtr
6022htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6023          void *ioctx, const char *URL, const char *encoding, int options)
6024{
6025    htmlParserCtxtPtr ctxt;
6026    xmlParserInputBufferPtr input;
6027    xmlParserInputPtr stream;
6028
6029    if (ioread == NULL)
6030        return (NULL);
6031
6032    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6033                                         XML_CHAR_ENCODING_NONE);
6034    if (input == NULL)
6035        return (NULL);
6036    ctxt = xmlNewParserCtxt();
6037    if (ctxt == NULL) {
6038        xmlFreeParserInputBuffer(input);
6039        return (NULL);
6040    }
6041    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6042    if (stream == NULL) {
6043        xmlFreeParserInputBuffer(input);
6044	xmlFreeParserCtxt(ctxt);
6045        return (NULL);
6046    }
6047    inputPush(ctxt, stream);
6048    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6049}
6050
6051/**
6052 * htmlCtxtReadDoc:
6053 * @ctxt:  an HTML parser context
6054 * @cur:  a pointer to a zero terminated string
6055 * @URL:  the base URL to use for the document
6056 * @encoding:  the document encoding, or NULL
6057 * @options:  a combination of htmlParserOption(s)
6058 *
6059 * parse an XML in-memory document and build a tree.
6060 * This reuses the existing @ctxt parser context
6061 *
6062 * Returns the resulting document tree
6063 */
6064htmlDocPtr
6065htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6066               const char *URL, const char *encoding, int options)
6067{
6068    xmlParserInputPtr stream;
6069
6070    if (cur == NULL)
6071        return (NULL);
6072    if (ctxt == NULL)
6073        return (NULL);
6074
6075    htmlCtxtReset(ctxt);
6076
6077    stream = xmlNewStringInputStream(ctxt, cur);
6078    if (stream == NULL) {
6079        return (NULL);
6080    }
6081    inputPush(ctxt, stream);
6082    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6083}
6084
6085/**
6086 * htmlCtxtReadFile:
6087 * @ctxt:  an HTML parser context
6088 * @filename:  a file or URL
6089 * @encoding:  the document encoding, or NULL
6090 * @options:  a combination of htmlParserOption(s)
6091 *
6092 * parse an XML file from the filesystem or the network.
6093 * This reuses the existing @ctxt parser context
6094 *
6095 * Returns the resulting document tree
6096 */
6097htmlDocPtr
6098htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6099                const char *encoding, int options)
6100{
6101    xmlParserInputPtr stream;
6102
6103    if (filename == NULL)
6104        return (NULL);
6105    if (ctxt == NULL)
6106        return (NULL);
6107
6108    htmlCtxtReset(ctxt);
6109
6110    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6111    if (stream == NULL) {
6112        return (NULL);
6113    }
6114    inputPush(ctxt, stream);
6115    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6116}
6117
6118/**
6119 * htmlCtxtReadMemory:
6120 * @ctxt:  an HTML parser context
6121 * @buffer:  a pointer to a char array
6122 * @size:  the size of the array
6123 * @URL:  the base URL to use for the document
6124 * @encoding:  the document encoding, or NULL
6125 * @options:  a combination of htmlParserOption(s)
6126 *
6127 * parse an XML in-memory document and build a tree.
6128 * This reuses the existing @ctxt parser context
6129 *
6130 * Returns the resulting document tree
6131 */
6132htmlDocPtr
6133htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6134                  const char *URL, const char *encoding, int options)
6135{
6136    xmlParserInputBufferPtr input;
6137    xmlParserInputPtr stream;
6138
6139    if (ctxt == NULL)
6140        return (NULL);
6141    if (buffer == NULL)
6142        return (NULL);
6143
6144    htmlCtxtReset(ctxt);
6145
6146    input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6147    if (input == NULL) {
6148	return(NULL);
6149    }
6150
6151    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6152    if (stream == NULL) {
6153	xmlFreeParserInputBuffer(input);
6154	return(NULL);
6155    }
6156
6157    inputPush(ctxt, stream);
6158    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6159}
6160
6161/**
6162 * htmlCtxtReadFd:
6163 * @ctxt:  an HTML parser context
6164 * @fd:  an open file descriptor
6165 * @URL:  the base URL to use for the document
6166 * @encoding:  the document encoding, or NULL
6167 * @options:  a combination of htmlParserOption(s)
6168 *
6169 * parse an XML from a file descriptor and build a tree.
6170 * This reuses the existing @ctxt parser context
6171 *
6172 * Returns the resulting document tree
6173 */
6174htmlDocPtr
6175htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6176              const char *URL, const char *encoding, int options)
6177{
6178    xmlParserInputBufferPtr input;
6179    xmlParserInputPtr stream;
6180
6181    if (fd < 0)
6182        return (NULL);
6183    if (ctxt == NULL)
6184        return (NULL);
6185
6186    htmlCtxtReset(ctxt);
6187
6188
6189    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6190    if (input == NULL)
6191        return (NULL);
6192    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6193    if (stream == NULL) {
6194        xmlFreeParserInputBuffer(input);
6195        return (NULL);
6196    }
6197    inputPush(ctxt, stream);
6198    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6199}
6200
6201/**
6202 * htmlCtxtReadIO:
6203 * @ctxt:  an HTML parser context
6204 * @ioread:  an I/O read function
6205 * @ioclose:  an I/O close function
6206 * @ioctx:  an I/O handler
6207 * @URL:  the base URL to use for the document
6208 * @encoding:  the document encoding, or NULL
6209 * @options:  a combination of htmlParserOption(s)
6210 *
6211 * parse an HTML document from I/O functions and source and build a tree.
6212 * This reuses the existing @ctxt parser context
6213 *
6214 * Returns the resulting document tree
6215 */
6216htmlDocPtr
6217htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6218              xmlInputCloseCallback ioclose, void *ioctx,
6219	      const char *URL,
6220              const char *encoding, int options)
6221{
6222    xmlParserInputBufferPtr input;
6223    xmlParserInputPtr stream;
6224
6225    if (ioread == NULL)
6226        return (NULL);
6227    if (ctxt == NULL)
6228        return (NULL);
6229
6230    htmlCtxtReset(ctxt);
6231
6232    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6233                                         XML_CHAR_ENCODING_NONE);
6234    if (input == NULL)
6235        return (NULL);
6236    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6237    if (stream == NULL) {
6238        xmlFreeParserInputBuffer(input);
6239        return (NULL);
6240    }
6241    inputPush(ctxt, stream);
6242    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6243}
6244
6245#define bottom_HTMLparser
6246#include "elfgcchack.h"
6247#endif /* LIBXML_HTML_ENABLED */
6248