• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/gettext-0.17/gettext-tools/gnulib-lib/libxml/
1/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
54static int htmlOmittedDefaultValue = 1;
55
56xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57			     xmlChar end, xmlChar  end2, xmlChar end3);
58static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60/************************************************************************
61 *									*
62 * 		Some factorized error routines				*
63 *									*
64 ************************************************************************/
65
66/**
67 * htmlErrMemory:
68 * @ctxt:  an HTML parser context
69 * @extra:  extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
76    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77        (ctxt->instate == XML_PARSER_EOF))
78	return;
79    if (ctxt != NULL) {
80        ctxt->errNo = XML_ERR_NO_MEMORY;
81        ctxt->instate = XML_PARSER_EOF;
82        ctxt->disableSAX = 1;
83    }
84    if (extra)
85        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87                        NULL, NULL, 0, 0,
88                        "Memory allocation failed : %s\n", extra);
89    else
90        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92                        NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt:  an HTML parser context
98 * @error:  the error number
99 * @msg:  the error message
100 * @str1:  string infor
101 * @str2:  string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107             const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
109    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110        (ctxt->instate == XML_PARSER_EOF))
111	return;
112    if (ctxt != NULL)
113	ctxt->errNo = error;
114    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115                    XML_ERR_ERROR, NULL, 0,
116		    (const char *) str1, (const char *) str2,
117		    NULL, 0, 0,
118		    msg, str1, str2);
119    if (ctxt != NULL)
120	ctxt->wellFormed = 0;
121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt:  an HTML parser context
126 * @error:  the error number
127 * @msg:  the error message
128 * @val:  integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134             const char *msg, int val)
135{
136    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137        (ctxt->instate == XML_PARSER_EOF))
138	return;
139    if (ctxt != NULL)
140	ctxt->errNo = error;
141    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
143		    NULL, val, 0, msg, val);
144    if (ctxt != NULL)
145	ctxt->wellFormed = 0;
146}
147
148/************************************************************************
149 *									*
150 * 		Parser stacks related functions and macros		*
151 *									*
152 ************************************************************************/
153
154/**
155 * htmlnamePush:
156 * @ctxt:  an HTML parser context
157 * @value:  the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
162 */
163static int
164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165{
166    if (ctxt->nameNr >= ctxt->nameMax) {
167        ctxt->nameMax *= 2;
168        ctxt->nameTab = (const xmlChar * *)
169                         xmlRealloc((xmlChar * *)ctxt->nameTab,
170                                    ctxt->nameMax *
171                                    sizeof(ctxt->nameTab[0]));
172        if (ctxt->nameTab == NULL) {
173            htmlErrMemory(ctxt, NULL);
174            return (0);
175        }
176    }
177    ctxt->nameTab[ctxt->nameNr] = value;
178    ctxt->name = value;
179    return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
189static const xmlChar *
190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
192    const xmlChar *ret;
193
194    if (ctxt->nameNr <= 0)
195        return (NULL);
196    ctxt->nameNr--;
197    if (ctxt->nameNr < 0)
198        return (NULL);
199    if (ctxt->nameNr > 0)
200        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201    else
202        ctxt->name = NULL;
203    ret = ctxt->nameTab[ctxt->nameNr];
204    ctxt->nameTab[ctxt->nameNr] = NULL;
205    return (ret);
206}
207
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named ctxt to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value as an int. It should be used
 *           internally by the parser only to compare to ASCII values,
 *           otherwise it would break when running with UTF-8 encoding.
 *   RAW     same as CUR, but yields -1 while ctxt->token is set.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPPER   returns the current xmlChar converted to uppercase.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *   SHRINK / GROW  conditionally shrink or refill the input buffer.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current xmlChar value as an int.
 *   NEXT    Skip to the next character (delegates to xmlNextChar).
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY_BUF(l,b,i,v) append char v of encoded length l to b at index i
 *           and advance i.
 *
 * Statement-like macros are wrapped in do { } while (0) so that they behave
 * as a single statement, in particular in front of an "else".
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->nbChars += (val), ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
	(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
	xmlParserInputShrink(ctxt->input);				\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
	(ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)

290/**
291 * htmlCurrentChar:
292 * @ctxt:  the HTML parser context
293 * @len:  pointer to the length of the char read
294 *
295 * The current char value, if using UTF-8 this may actually span multiple
296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
301 * Returns the current char value and its length
302 */
303
304static int
305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306    if (ctxt->instate == XML_PARSER_EOF)
307	return(0);
308
309    if (ctxt->token != 0) {
310	*len = 0;
311	return(ctxt->token);
312    }
313    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314	/*
315	 * We are supposed to handle UTF8, check it's valid
316	 * From rfc2044: encoding of the Unicode values on UTF-8:
317	 *
318	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
319	 * 0000 0000-0000 007F   0xxxxxxx
320	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
321	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
322	 *
323	 * Check for the 0x110000 limit too
324	 */
325	const unsigned char *cur = ctxt->input->cur;
326	unsigned char c;
327	unsigned int val;
328
329	c = *cur;
330	if (c & 0x80) {
331	    if (cur[1] == 0)
332		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333	    if ((cur[1] & 0xc0) != 0x80)
334		goto encoding_error;
335	    if ((c & 0xe0) == 0xe0) {
336
337		if (cur[2] == 0)
338		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339		if ((cur[2] & 0xc0) != 0x80)
340		    goto encoding_error;
341		if ((c & 0xf0) == 0xf0) {
342		    if (cur[3] == 0)
343			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344		    if (((c & 0xf8) != 0xf0) ||
345			((cur[3] & 0xc0) != 0x80))
346			goto encoding_error;
347		    /* 4-byte code */
348		    *len = 4;
349		    val = (cur[0] & 0x7) << 18;
350		    val |= (cur[1] & 0x3f) << 12;
351		    val |= (cur[2] & 0x3f) << 6;
352		    val |= cur[3] & 0x3f;
353		} else {
354		  /* 3-byte code */
355		    *len = 3;
356		    val = (cur[0] & 0xf) << 12;
357		    val |= (cur[1] & 0x3f) << 6;
358		    val |= cur[2] & 0x3f;
359		}
360	    } else {
361	      /* 2-byte code */
362		*len = 2;
363		val = (cur[0] & 0x1f) << 6;
364		val |= cur[1] & 0x3f;
365	    }
366	    if (!IS_CHAR(val)) {
367	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368				"Char 0x%X out of allowed range\n", val);
369	    }
370	    return(val);
371	} else {
372	    /* 1-byte code */
373	    *len = 1;
374	    return((int) *ctxt->input->cur);
375	}
376    }
377    /*
378     * Assume it's a fixed length encoding (1) with
379     * a compatible encoding for the ASCII set, since
380     * XML constructs only use < 128 chars
381     */
382    *len = 1;
383    if ((int) *ctxt->input->cur < 0x80)
384	return((int) *ctxt->input->cur);
385
386    /*
387     * Humm this is bad, do an automatic flow conversion
388     */
389    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390    ctxt->charset = XML_CHAR_ENCODING_UTF8;
391    return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394    /*
395     * If we detect an UTF8 error that probably mean that the
396     * input encoding didn't get properly advertized in the
397     * declaration header. Report the error and switch the encoding
398     * to ISO-Latin-1 (if you don't like this policy, just declare the
399     * encoding !)
400     */
401    {
402        char buffer[150];
403
404	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
405			ctxt->input->cur[0], ctxt->input->cur[1],
406			ctxt->input->cur[2], ctxt->input->cur[3]);
407	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
408		     "Input is not proper UTF-8, indicate encoding !\n",
409		     BAD_CAST buffer, NULL);
410    }
411
412    ctxt->charset = XML_CHAR_ENCODING_8859_1;
413    *len = 1;
414    return((int) *ctxt->input->cur);
415}
416
417/**
418 * htmlSkipBlankChars:
419 * @ctxt:  the HTML parser context
420 *
421 * skip all blanks character found at that point in the input streams.
422 *
423 * Returns the number of space chars skipped
424 */
425
426static int
427htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
428    int res = 0;
429
430    while (IS_BLANK_CH(*(ctxt->input->cur))) {
431	if ((*ctxt->input->cur == 0) &&
432	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
433		xmlPopInput(ctxt);
434	} else {
435	    if (*(ctxt->input->cur) == '\n') {
436		ctxt->input->line++; ctxt->input->col = 1;
437	    } else ctxt->input->col++;
438	    ctxt->input->cur++;
439	    ctxt->nbChars++;
440	    if (*ctxt->input->cur == 0)
441		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
442	}
443	res++;
444    }
445    return(res);
446}
447
448
449
450/************************************************************************
451 *									*
452 * 		The list of HTML elements and their properties		*
453 *									*
454 ************************************************************************/
455
456/*
457 *  Start Tag: 1 means the start tag can be omitted
458 *  End Tag:   1 means the end tag can be omitted
459 *             2 means it's forbidden (empty elements)
460 *             3 means the tag is stylistic and should be closed easily
461 *  Depr:      this element is deprecated
462 *  DTD:       1 means that this element is valid only in the Loose DTD
463 *             2 means that this element is valid only in the Frameset DTD
464 *
465 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
466	, subElements , impliedsubelt , Attributes, userdata
467 */
468
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each group macro expands to a comma separated list of element names and
 * each NB_xxx macro is the number of names in the matching list. Every
 * name must be separated from the next by a comma: two adjacent string
 * literals are concatenated by the compiler into one bogus name. PCDATA
 * and MODIFIER expand to nothing, so they must not be followed by a comma
 * of their own.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 15
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow and inline element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata

502
503
/*
 * ... and for HTML Attributes
 *
 * Each group macro expands to a comma separated list of attribute names
 * and each NB_xxx macro is the number of names in the matching list.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;

523
524
525/* Other declarations that should go inline ... */
526static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
527	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
528	"tabindex", "onfocus", "onblur", NULL } ;
529static const char* const target_attr[] = { "target", NULL } ;
530static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
531static const char* const alt_attr[] = { "alt", NULL } ;
532static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
533static const char* const href_attrs[] = { "href", NULL } ;
534static const char* const clear_attrs[] = { "clear", NULL } ;
535static const char* const inline_p[] = { INLINE, "p", NULL } ;
536
537static const char* const flow_param[] = { FLOW, "param", NULL } ;
538static const char* const applet_attrs[] = { COREATTRS , "codebase",
539		"archive", "alt", "name", "height", "width", "align",
540		"hspace", "vspace", NULL } ;
541static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
542	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
543static const char* const basefont_attrs[] =
544	{ "id", "size", "color", "face", NULL } ;
545static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
546static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
547static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
548static const char* const body_depr[] = { "background", "bgcolor", "text",
549	"link", "vlink", "alink", NULL } ;
550static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
551	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
552
553
554static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
555static const char* const col_elt[] = { "col", NULL } ;
556static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
557static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
558static const char* const dl_contents[] = { "dt", "dd", NULL } ;
559static const char* const compact_attr[] = { "compact", NULL } ;
560static const char* const label_attr[] = { "label", NULL } ;
561static const char* const fieldset_contents[] = { FLOW, "legend" } ;
562static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
563static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
564static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
565static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
566static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
567static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
568static const char* const head_attrs[] = { I18N, "profile", NULL } ;
569static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
570static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
571static const char* const version_attr[] = { "version", NULL } ;
572static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
573static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
574static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
575static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
576static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
577static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
578static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
579static const char* const align_attr[] = { "align", NULL } ;
580static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
581static const char* const map_contents[] = { BLOCK, "area", NULL } ;
582static const char* const name_attr[] = { "name", NULL } ;
583static const char* const action_attr[] = { "action", NULL } ;
584static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
585static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
586static const char* const content_attr[] = { "content", NULL } ;
587static const char* const type_attr[] = { "type", NULL } ;
588static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
589static const char* const object_contents[] = { FLOW, "param", NULL } ;
590static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
591static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
592static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
593static const char* const option_elt[] = { "option", NULL } ;
594static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
595static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
596static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
597static const char* const width_attr[] = { "width", NULL } ;
598static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
599static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
600static const char* const language_attr[] = { "language", NULL } ;
601static const char* const select_content[] = { "optgroup", "option", NULL } ;
602static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
603static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
604static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
605static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
606static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
607static const char* const tr_elt[] = { "tr", NULL } ;
608static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
609static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
610static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
611static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
612static const char* const tr_contents[] = { "th", "td", NULL } ;
613static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
614static const char* const li_elt[] = { "li", NULL } ;
615static const char* const ul_depr[] = { "type", "compact", NULL} ;
616static const char* const dir_attr[] = { "dir", NULL} ;
617
618#define DECL (const char**)
619
620static const htmlElemDesc
621html40ElementTable[] = {
622{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
623	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
624},
625{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
626	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
627},
628{ "acronym",	0, 0, 0, 0, 0, 0, 1, "",
629	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
630},
631{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
632	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
633},
634{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
635	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
636},
637{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
638	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
639},
640{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
641	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
642},
643{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
644	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
645},
646{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
647	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
648},
649{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
650	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
651},
652{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
653	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
654},
655{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
656	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
657},
658{ "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
659	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
660},
661{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
662	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
663},
664{ "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
665	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
666},
667{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
668	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
669},
670{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
671	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
672},
673{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
674	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
675},
676{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
677	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
678},
679{ "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
680	EMPTY , NULL , DECL col_attrs , NULL, NULL
681},
682{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
683	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
684},
685{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
686	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
687},
688{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
689	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
690},
691{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
692	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
693},
694{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
695	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
696},
697{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
698	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
699},
700{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
701	DECL dl_contents , "dd" , html_attrs, DECL compact_attr, NULL
702},
703{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
704	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
705},
706{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
707	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
708},
709{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
710	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
711},
712{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
713	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
714},
715{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
716	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
717},
718{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
719	EMPTY, NULL, NULL, DECL frame_attrs, NULL
720},
721{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
722	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
723},
724{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
725	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
726},
727{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
728	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
729},
730{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
731	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
732},
733{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
734	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
735},
736{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
737	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
738},
739{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
740	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
741},
742{ "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
743	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
744},
745{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
746	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
747},
748{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
749	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
750},
751{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
752	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
753},
754{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
755	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
756},
757{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
758	EMPTY, NULL, DECL img_attrs, DECL align_attr, src_alt_attrs
759},
760{ "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
761	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
762},
763{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
764	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
765},
766{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
767	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
768},
769{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
770	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
771},
772{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
773	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
774},
775{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
776	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
777},
778{ "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
779	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
780},
781{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
782	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
783},
784{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
785	DECL map_contents , NULL, DECL html_attrs , NULL, name_attr
786},
787{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
788	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
789},
790{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
791	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
792},
793{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
794	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
795},
796{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
797	DECL html_flow, "div", DECL html_attrs, NULL, NULL
798},
799{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
800	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
801},
802{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
803	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
804},
805{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
806	option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
807},
808{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
809	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
810},
811{ "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
812	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813},
814{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
815	EMPTY, NULL, DECL param_attrs, NULL, name_attr
816},
817{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
818	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
819},
820{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
821	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
822},
823{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
824	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
825},
826{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
827	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
828},
829{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
830	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
831},
832{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
833	DECL select_content, NULL, DECL select_attrs, NULL, NULL
834},
835{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
836	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837},
838{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
839	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
840},
841{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
842	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
843},
844{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
845	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
846},
847{ "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
848	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
849},
850{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
851	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852},
853{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
854	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
855},
856{ "table",	0, 0, 0, 0, 0, 0, 0, "",
857	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
858},
859{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
860	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
861},
862{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
863	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
864},
865{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
866	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
867},
868{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
869	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
870},
871{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
872	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
873},
874{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
875	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
876},
877{ "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
878	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
879},
880{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
881	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
882},
883{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
884	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
885},
886{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
887	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
888},
889{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
890	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
891},
892{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
893	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
894}
895};
896
/*
 * Start tags that imply the end of the element(s) currently open.
 *
 * The table is a flat run of NULL-terminated groups: the first string of a
 * group is the start tag being opened, the strings after it are the open
 * elements which that start tag implicitly closes.  One additional NULL
 * following the last group terminates the whole table.
 */
static const char * const htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "frameset",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    NULL	/* end of table */
};
957
/*
 * NULL-terminated list of HTML elements which are not supposed to hold
 * CDATA content directly; when character data shows up inside one of
 * them a p element gets implied (see htmlCheckParagraph()).
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
970
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * The array carries no NULL terminator: users are expected to derive the
 * number of entries with sizeof.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
996
/*
 * Priorities used by the HTML parser when it has to cope with broken
 * pages: giving elements different priorities lets the parser decide
 * what to do with an extra end tag.  An end tag may only close elements
 * whose priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;	/* element name, NULL in the terminating row */
    int priority;	/* end tag priority of that element */
} elementPriority;

/*
 * Lookup table for htmlGetEndPriority().  The row with a NULL name ends
 * the table and carries the priority of every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
1024
/*
 * Index over htmlStartClose: each used slot points at the first string of
 * one group of that table, unused slots are NULL.  It is filled in by
 * htmlInitAutoClose(), which also raises the flag below once done.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1027
1028/************************************************************************
1029 *									*
1030 * 		functions to handle HTML specific data			*
1031 *									*
1032 ************************************************************************/
1033
1034/**
1035 * htmlInitAutoClose:
1036 *
1037 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1038 * This is not reentrant. Call xmlInitParser() once before processing in
1039 * case of use in multithreaded programs.
1040 */
1041void
1042htmlInitAutoClose(void) {
1043    int indx, i = 0;
1044
1045    if (htmlStartCloseIndexinitialized) return;
1046
1047    for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1048    indx = 0;
1049    while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1050        htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1051	while (htmlStartClose[i] != NULL) i++;
1052	i++;
1053    }
1054    htmlStartCloseIndexinitialized = 1;
1055}
1056
1057/**
1058 * htmlTagLookup:
1059 * @tag:  The tag name in lowercase
1060 *
1061 * Lookup the HTML tag in the ElementTable
1062 *
1063 * Returns the related htmlElemDescPtr or NULL if not found.
1064 */
1065const htmlElemDesc *
1066htmlTagLookup(const xmlChar *tag) {
1067    unsigned int i;
1068
1069    for (i = 0; i < (sizeof(html40ElementTable) /
1070                     sizeof(html40ElementTable[0]));i++) {
1071        if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1072	    return((htmlElemDescPtr) &html40ElementTable[i]);
1073    }
1074    return(NULL);
1075}
1076
1077/**
1078 * htmlGetEndPriority:
1079 * @name: The name of the element to look up the priority for.
1080 *
1081 * Return value: The "endtag" priority.
1082 **/
1083static int
1084htmlGetEndPriority (const xmlChar *name) {
1085    int i = 0;
1086
1087    while ((htmlEndPriority[i].name != NULL) &&
1088	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1089	i++;
1090
1091    return(htmlEndPriority[i].priority);
1092}
1093
1094
1095/**
1096 * htmlCheckAutoClose:
1097 * @newtag:  The new tag name
1098 * @oldtag:  The old tag name
1099 *
1100 * Checks whether the new tag is one of the registered valid tags for
1101 * closing old.
1102 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1103 *
1104 * Returns 0 if no, 1 if yes.
1105 */
1106static int
1107htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1108{
1109    int i, indx;
1110    const char **closed = NULL;
1111
1112    if (htmlStartCloseIndexinitialized == 0)
1113        htmlInitAutoClose();
1114
1115    /* inefficient, but not a big deal */
1116    for (indx = 0; indx < 100; indx++) {
1117        closed = htmlStartCloseIndex[indx];
1118        if (closed == NULL)
1119            return (0);
1120        if (xmlStrEqual(BAD_CAST * closed, newtag))
1121            break;
1122    }
1123
1124    i = closed - htmlStartClose;
1125    i++;
1126    while (htmlStartClose[i] != NULL) {
1127        if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1128            return (1);
1129        }
1130        i++;
1131    }
1132    return (0);
1133}
1134
1135/**
1136 * htmlAutoCloseOnClose:
1137 * @ctxt:  an HTML parser context
1138 * @newtag:  The new tag name
1139 * @force:  force the tag closure
1140 *
1141 * The HTML DTD allows an ending tag to implicitly close other tags.
1142 */
1143static void
1144htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1145{
1146    const htmlElemDesc *info;
1147    int i, priority;
1148
1149    priority = htmlGetEndPriority(newtag);
1150
1151    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1152
1153        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1154            break;
1155        /*
1156         * A missplaced endtag can only close elements with lower
1157         * or equal priority, so if we find an element with higher
1158         * priority before we find an element with
1159         * matching name, we just ignore this endtag
1160         */
1161        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1162            return;
1163    }
1164    if (i < 0)
1165        return;
1166
1167    while (!xmlStrEqual(newtag, ctxt->name)) {
1168        info = htmlTagLookup(ctxt->name);
1169        if ((info != NULL) && (info->endTag == 3)) {
1170            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1171	                 "Opening and ending tag mismatch: %s and %s\n",
1172			 newtag, ctxt->name);
1173        }
1174        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1176	htmlnamePop(ctxt);
1177    }
1178}
1179
1180/**
1181 * htmlAutoCloseOnEnd:
1182 * @ctxt:  an HTML parser context
1183 *
1184 * Close all remaining tags at the end of the stream
1185 */
1186static void
1187htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1188{
1189    int i;
1190
1191    if (ctxt->nameNr == 0)
1192        return;
1193    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1194        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1195            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1196	htmlnamePop(ctxt);
1197    }
1198}
1199
1200/**
1201 * htmlAutoClose:
1202 * @ctxt:  an HTML parser context
1203 * @newtag:  The new tag name or NULL
1204 *
1205 * The HTML DTD allows a tag to implicitly close other tags.
1206 * The list is kept in htmlStartClose array. This function is
1207 * called when a new tag has been detected and generates the
1208 * appropriates closes if possible/needed.
1209 * If newtag is NULL this mean we are at the end of the resource
1210 * and we should check
1211 */
1212static void
1213htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1214{
1215    while ((newtag != NULL) && (ctxt->name != NULL) &&
1216           (htmlCheckAutoClose(newtag, ctxt->name))) {
1217        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1218            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1219	htmlnamePop(ctxt);
1220    }
1221    if (newtag == NULL) {
1222        htmlAutoCloseOnEnd(ctxt);
1223        return;
1224    }
1225    while ((newtag == NULL) && (ctxt->name != NULL) &&
1226           ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1227            (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1228            (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1229        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1230            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1231	htmlnamePop(ctxt);
1232    }
1233}
1234
1235/**
1236 * htmlAutoCloseTag:
1237 * @doc:  the HTML document
1238 * @name:  The tag name
1239 * @elem:  the HTML element
1240 *
1241 * The HTML DTD allows a tag to implicitly close other tags.
1242 * The list is kept in htmlStartClose array. This function checks
1243 * if the element or one of it's children would autoclose the
1244 * given tag.
1245 *
1246 * Returns 1 if autoclose, 0 otherwise
1247 */
1248int
1249htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1250    htmlNodePtr child;
1251
1252    if (elem == NULL) return(1);
1253    if (xmlStrEqual(name, elem->name)) return(0);
1254    if (htmlCheckAutoClose(elem->name, name)) return(1);
1255    child = elem->children;
1256    while (child != NULL) {
1257        if (htmlAutoCloseTag(doc, name, child)) return(1);
1258	child = child->next;
1259    }
1260    return(0);
1261}
1262
1263/**
1264 * htmlIsAutoClosed:
1265 * @doc:  the HTML document
1266 * @elem:  the HTML element
1267 *
1268 * The HTML DTD allows a tag to implicitly close other tags.
1269 * The list is kept in htmlStartClose array. This function checks
1270 * if a tag is autoclosed by one of it's child
1271 *
1272 * Returns 1 if autoclosed, 0 otherwise
1273 */
1274int
1275htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1276    htmlNodePtr child;
1277
1278    if (elem == NULL) return(1);
1279    child = elem->children;
1280    while (child != NULL) {
1281	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1282	child = child->next;
1283    }
1284    return(0);
1285}
1286
1287/**
1288 * htmlCheckImplied:
1289 * @ctxt:  an HTML parser context
1290 * @newtag:  The new tag name
1291 *
1292 * The HTML DTD allows a tag to exists only implicitly
1293 * called when a new tag has been detected and generates the
1294 * appropriates implicit tags if missing
1295 */
1296static void
1297htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1298    if (!htmlOmittedDefaultValue)
1299	return;
1300    if (xmlStrEqual(newtag, BAD_CAST"html"))
1301	return;
1302    if (ctxt->nameNr <= 0) {
1303	htmlnamePush(ctxt, BAD_CAST"html");
1304	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1305	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1306    }
1307    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1308        return;
1309    if ((ctxt->nameNr <= 1) &&
1310        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1311	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1312	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1313	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1314	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1315	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1316	    /*
1317	     * dropped OBJECT ... i you put it first BODY will be
1318	     * assumed !
1319	     */
1320	    htmlnamePush(ctxt, BAD_CAST"head");
1321	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322		ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1323    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1324	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1325	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1326	int i;
1327	for (i = 0;i < ctxt->nameNr;i++) {
1328	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1329		return;
1330	    }
1331	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1332		return;
1333	    }
1334	}
1335
1336	htmlnamePush(ctxt, BAD_CAST"body");
1337	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1338	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1339    }
1340}
1341
1342/**
1343 * htmlCheckParagraph
1344 * @ctxt:  an HTML parser context
1345 *
1346 * Check whether a p element need to be implied before inserting
1347 * characters in the current element.
1348 *
1349 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1350 *         in case of error.
1351 */
1352
1353static int
1354htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1355    const xmlChar *tag;
1356    int i;
1357
1358    if (ctxt == NULL)
1359	return(-1);
1360    tag = ctxt->name;
1361    if (tag == NULL) {
1362	htmlAutoClose(ctxt, BAD_CAST"p");
1363	htmlCheckImplied(ctxt, BAD_CAST"p");
1364	htmlnamePush(ctxt, BAD_CAST"p");
1365	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1366	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1367	return(1);
1368    }
1369    if (!htmlOmittedDefaultValue)
1370	return(0);
1371    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1372	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1373	    htmlAutoClose(ctxt, BAD_CAST"p");
1374	    htmlCheckImplied(ctxt, BAD_CAST"p");
1375	    htmlnamePush(ctxt, BAD_CAST"p");
1376	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1377		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1378	    return(1);
1379	}
1380    }
1381    return(0);
1382}
1383
1384/**
1385 * htmlIsScriptAttribute:
1386 * @name:  an attribute name
1387 *
1388 * Check if an attribute is of content type Script
1389 *
1390 * Returns 1 is the attribute is a script 0 otherwise
1391 */
1392int
1393htmlIsScriptAttribute(const xmlChar *name) {
1394    unsigned int i;
1395
1396    if (name == NULL)
1397       	return(0);
1398    /*
1399     * all script attributes start with 'on'
1400     */
1401    if ((name[0] != 'o') || (name[1] != 'n'))
1402       	return(0);
1403    for (i = 0;
1404	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1405	 i++) {
1406	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1407	    return(1);
1408    }
1409    return(0);
1410}
1411
1412/************************************************************************
1413 *									*
1414 * 		The list of HTML predefined entities			*
1415 *									*
1416 ************************************************************************/
1417
1418
1419static const htmlEntityDesc  html40EntitiesTable[] = {
1420/*
1421 * the 4 absolute ones, plus apostrophe.
1422 */
1423{ 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1424{ 38,	"amp",	"ampersand, U+0026 ISOnum" },
1425{ 39,	"apos",	"single quote" },
1426{ 60,	"lt",	"less-than sign, U+003C ISOnum" },
1427{ 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1428
1429/*
1430 * A bunch still in the 128-255 range
1431 * Replacing them depend really on the charset used.
1432 */
1433{ 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1434{ 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1435{ 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1436{ 163,	"pound","pound sign, U+00A3 ISOnum" },
1437{ 164,	"curren","currency sign, U+00A4 ISOnum" },
1438{ 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1439{ 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1440{ 167,	"sect",	"section sign, U+00A7 ISOnum" },
1441{ 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1442{ 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1443{ 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1444{ 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1445{ 172,	"not",	"not sign, U+00AC ISOnum" },
1446{ 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1447{ 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1448{ 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1449{ 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1450{ 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1451{ 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1452{ 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1453{ 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1454{ 181,	"micro","micro sign, U+00B5 ISOnum" },
1455{ 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1456{ 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1457{ 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1458{ 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1459{ 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1460{ 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1461{ 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1462{ 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1463{ 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1464{ 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1465{ 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1466{ 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1467{ 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1468{ 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1469{ 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1470{ 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1471{ 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1472{ 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1473{ 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1474{ 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1475{ 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1476{ 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1477{ 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1478{ 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1479{ 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1480{ 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1481{ 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1482{ 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1483{ 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1484{ 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1485{ 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1486{ 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1487{ 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1488{ 215,	"times","multiplication sign, U+00D7 ISOnum" },
1489{ 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1490{ 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1491{ 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1492{ 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1493{ 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1494{ 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1495{ 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1496{ 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1497{ 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1498{ 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1499{ 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1500{ 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1501{ 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1502{ 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1503{ 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1504{ 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1505{ 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1506{ 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1507{ 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1508{ 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1509{ 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1510{ 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1511{ 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1512{ 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1513{ 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1514{ 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1515{ 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1516{ 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1517{ 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1518{ 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1519{ 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1520{ 247,	"divide","division sign, U+00F7 ISOnum" },
1521{ 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1522{ 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1523{ 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1524{ 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1525{ 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1526{ 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1527{ 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1528{ 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1529
1530{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1531{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1532{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1533{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1534{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1535
1536/*
1537 * Anything below should really be kept as entities references
1538 */
1539{ 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1540
1541{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1542{ 732,	"tilde","small tilde, U+02DC ISOdia" },
1543
1544{ 913,	"Alpha","greek capital letter alpha, U+0391" },
1545{ 914,	"Beta",	"greek capital letter beta, U+0392" },
1546{ 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1547{ 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1548{ 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1549{ 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1550{ 919,	"Eta",	"greek capital letter eta, U+0397" },
1551{ 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1552{ 921,	"Iota",	"greek capital letter iota, U+0399" },
1553{ 922,	"Kappa","greek capital letter kappa, U+039A" },
1554{ 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1555{ 924,	"Mu",	"greek capital letter mu, U+039C" },
1556{ 925,	"Nu",	"greek capital letter nu, U+039D" },
1557{ 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1558{ 927,	"Omicron","greek capital letter omicron, U+039F" },
1559{ 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1560{ 929,	"Rho",	"greek capital letter rho, U+03A1" },
1561{ 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1562{ 932,	"Tau",	"greek capital letter tau, U+03A4" },
1563{ 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1564{ 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1565{ 935,	"Chi",	"greek capital letter chi, U+03A7" },
1566{ 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1567{ 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1568
1569{ 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1570{ 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1571{ 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1572{ 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1573{ 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1574{ 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1575{ 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1576{ 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1577{ 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1578{ 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1579{ 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1580{ 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1581{ 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1582{ 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1583{ 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1584{ 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1585{ 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1586{ 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1587{ 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1588{ 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1589{ 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1590{ 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1591{ 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1592{ 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1593{ 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1594{ 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1595{ 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1596{ 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1597
1598{ 8194,	"ensp",	"en space, U+2002 ISOpub" },
1599{ 8195,	"emsp",	"em space, U+2003 ISOpub" },
1600{ 8201,	"thinsp","thin space, U+2009 ISOpub" },
1601{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1602{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1603{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1604{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1605{ 8211,	"ndash","en dash, U+2013 ISOpub" },
1606{ 8212,	"mdash","em dash, U+2014 ISOpub" },
1607{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1608{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1609{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1610{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1611{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1612{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1613{ 8224,	"dagger","dagger, U+2020 ISOpub" },
1614{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1615
1616{ 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1617{ 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1618
1619{ 8240,	"permil","per mille sign, U+2030 ISOtech" },
1620
1621{ 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1622{ 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1623
1624{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1625{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1626
1627{ 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1628{ 8260,	"frasl","fraction slash, U+2044 NEW" },
1629
1630{ 8364,	"euro",	"euro sign, U+20AC NEW" },
1631
1632{ 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1633{ 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1634{ 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1635{ 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1636{ 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1637{ 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1638{ 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1639{ 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1640{ 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1641{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1642{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1643{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1644{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1645{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1646{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1647{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1648
1649{ 8704,	"forall","for all, U+2200 ISOtech" },
1650{ 8706,	"part",	"partial differential, U+2202 ISOtech" },
1651{ 8707,	"exist","there exists, U+2203 ISOtech" },
1652{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1653{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1654{ 8712,	"isin",	"element of, U+2208 ISOtech" },
1655{ 8713,	"notin","not an element of, U+2209 ISOtech" },
1656{ 8715,	"ni",	"contains as member, U+220B ISOtech" },
1657{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1658{ 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1659{ 8722,	"minus","minus sign, U+2212 ISOtech" },
1660{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1661{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1662{ 8733,	"prop",	"proportional to, U+221D ISOtech" },
1663{ 8734,	"infin","infinity, U+221E ISOtech" },
1664{ 8736,	"ang",	"angle, U+2220 ISOamso" },
1665{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1666{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1667{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1668{ 8746,	"cup",	"union = cup, U+222A ISOtech" },
1669{ 8747,	"int",	"integral, U+222B ISOtech" },
1670{ 8756,	"there4","therefore, U+2234 ISOtech" },
1671{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1672{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1673{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1674{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1675{ 8801,	"equiv","identical to, U+2261 ISOtech" },
1676{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1677{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
1678{ 8834,	"sub",	"subset of, U+2282 ISOtech" },
1679{ 8835,	"sup",	"superset of, U+2283 ISOtech" },
1680{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
1681{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
1682{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
1683{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
1684{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
1685{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1686{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
1687{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1688{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
1689{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
1690{ 8971,	"rfloor","right floor, U+230B ISOamsc" },
1691{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
1692{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
1693{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },
1694
1695{ 9824,	"spades","black spade suit, U+2660 ISOpub" },
1696{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
1697{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
1698{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },
1699
1700};
1701
1702/************************************************************************
1703 *									*
1704 *		Commodity functions to handle entities			*
1705 *									*
1706 ************************************************************************/
1707
/*
 * growBuffer:
 *
 * Double the capacity of the heap buffer @buffer. The capacity is kept in
 * the companion variable named <buffer>_size, and a variable named ctxt
 * must be visible where the macro is expanded.
 *
 * If the reallocation fails the error is reported with htmlErrMemory(),
 * the old buffer is released and the *enclosing function* returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {						\
	buffer = grown;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1722
1723/**
1724 * htmlEntityLookup:
1725 * @name: the entity name
1726 *
1727 * Lookup the given entity in EntitiesTable
1728 *
1729 * TODO: the linear scan is really ugly, an hash table is really needed.
1730 *
1731 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1732 */
1733const htmlEntityDesc *
1734htmlEntityLookup(const xmlChar *name) {
1735    unsigned int i;
1736
1737    for (i = 0;i < (sizeof(html40EntitiesTable)/
1738                    sizeof(html40EntitiesTable[0]));i++) {
1739        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1740            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1741	}
1742    }
1743    return(NULL);
1744}
1745
1746/**
1747 * htmlEntityValueLookup:
1748 * @value: the entity's unicode value
1749 *
1750 * Lookup the given entity in EntitiesTable
1751 *
1752 * TODO: the linear scan is really ugly, an hash table is really needed.
1753 *
1754 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1755 */
1756const htmlEntityDesc *
1757htmlEntityValueLookup(unsigned int value) {
1758    unsigned int i;
1759
1760    for (i = 0;i < (sizeof(html40EntitiesTable)/
1761                    sizeof(html40EntitiesTable[0]));i++) {
1762        if (html40EntitiesTable[i].value >= value) {
1763	    if (html40EntitiesTable[i].value > value)
1764		break;
1765            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1766	}
1767    }
1768    return(NULL);
1769}
1770
1771/**
1772 * UTF8ToHtml:
1773 * @out:  a pointer to an array of bytes to store the result
1774 * @outlen:  the length of @out
1775 * @in:  a pointer to an array of UTF-8 chars
1776 * @inlen:  the length of @in
1777 *
1778 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1779 * plus HTML entities block of chars out.
1780 *
1781 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1782 * The value of @inlen after return is the number of octets consumed
1783 *     as the return value is positive, else unpredictable.
1784 * The value of @outlen after return is the number of octets consumed.
1785 */
1786int
1787UTF8ToHtml(unsigned char* out, int *outlen,
1788              const unsigned char* in, int *inlen) {
1789    const unsigned char* processed = in;
1790    const unsigned char* outend;
1791    const unsigned char* outstart = out;
1792    const unsigned char* instart = in;
1793    const unsigned char* inend;
1794    unsigned int c, d;
1795    int trailing;
1796
1797    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1798    if (in == NULL) {
1799        /*
1800	 * initialization nothing to do
1801	 */
1802	*outlen = 0;
1803	*inlen = 0;
1804	return(0);
1805    }
1806    inend = in + (*inlen);
1807    outend = out + (*outlen);
1808    while (in < inend) {
1809	d = *in++;
1810	if      (d < 0x80)  { c= d; trailing= 0; }
1811	else if (d < 0xC0) {
1812	    /* trailing byte in leading position */
1813	    *outlen = out - outstart;
1814	    *inlen = processed - instart;
1815	    return(-2);
1816        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1817        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1818        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1819	else {
1820	    /* no chance for this in Ascii */
1821	    *outlen = out - outstart;
1822	    *inlen = processed - instart;
1823	    return(-2);
1824	}
1825
1826	if (inend - in < trailing) {
1827	    break;
1828	}
1829
1830	for ( ; trailing; trailing--) {
1831	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1832		break;
1833	    c <<= 6;
1834	    c |= d & 0x3F;
1835	}
1836
1837	/* assertion: c is a single UTF-4 value */
1838	if (c < 0x80) {
1839	    if (out + 1 >= outend)
1840		break;
1841	    *out++ = c;
1842	} else {
1843	    int len;
1844	    const htmlEntityDesc * ent;
1845
1846	    /*
1847	     * Try to lookup a predefined HTML entity for it
1848	     */
1849
1850	    ent = htmlEntityValueLookup(c);
1851	    if (ent == NULL) {
1852		/* no chance for this in Ascii */
1853		*outlen = out - outstart;
1854		*inlen = processed - instart;
1855		return(-2);
1856	    }
1857	    len = strlen(ent->name);
1858	    if (out + 2 + len >= outend)
1859		break;
1860	    *out++ = '&';
1861	    memcpy(out, ent->name, len);
1862	    out += len;
1863	    *out++ = ';';
1864	}
1865	processed = in;
1866    }
1867    *outlen = out - outstart;
1868    *inlen = processed - instart;
1869    return(0);
1870}
1871
1872/**
1873 * htmlEncodeEntities:
1874 * @out:  a pointer to an array of bytes to store the result
1875 * @outlen:  the length of @out
1876 * @in:  a pointer to an array of UTF-8 chars
1877 * @inlen:  the length of @in
1878 * @quoteChar: the quote character to escape (' or ") or zero.
1879 *
1880 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1881 * plus HTML entities block of chars out.
1882 *
1883 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1884 * The value of @inlen after return is the number of octets consumed
1885 *     as the return value is positive, else unpredictable.
1886 * The value of @outlen after return is the number of octets consumed.
1887 */
1888int
1889htmlEncodeEntities(unsigned char* out, int *outlen,
1890		   const unsigned char* in, int *inlen, int quoteChar) {
1891    const unsigned char* processed = in;
1892    const unsigned char* outend;
1893    const unsigned char* outstart = out;
1894    const unsigned char* instart = in;
1895    const unsigned char* inend;
1896    unsigned int c, d;
1897    int trailing;
1898
1899    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1900        return(-1);
1901    outend = out + (*outlen);
1902    inend = in + (*inlen);
1903    while (in < inend) {
1904	d = *in++;
1905	if      (d < 0x80)  { c= d; trailing= 0; }
1906	else if (d < 0xC0) {
1907	    /* trailing byte in leading position */
1908	    *outlen = out - outstart;
1909	    *inlen = processed - instart;
1910	    return(-2);
1911        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1912        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1913        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1914	else {
1915	    /* no chance for this in Ascii */
1916	    *outlen = out - outstart;
1917	    *inlen = processed - instart;
1918	    return(-2);
1919	}
1920
1921	if (inend - in < trailing)
1922	    break;
1923
1924	while (trailing--) {
1925	    if (((d= *in++) & 0xC0) != 0x80) {
1926		*outlen = out - outstart;
1927		*inlen = processed - instart;
1928		return(-2);
1929	    }
1930	    c <<= 6;
1931	    c |= d & 0x3F;
1932	}
1933
1934	/* assertion: c is a single UTF-4 value */
1935	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1936	    (c != '&') && (c != '<') && (c != '>')) {
1937	    if (out >= outend)
1938		break;
1939	    *out++ = c;
1940	} else {
1941	    const htmlEntityDesc * ent;
1942	    const char *cp;
1943	    char nbuf[16];
1944	    int len;
1945
1946	    /*
1947	     * Try to lookup a predefined HTML entity for it
1948	     */
1949	    ent = htmlEntityValueLookup(c);
1950	    if (ent == NULL) {
1951		snprintf(nbuf, sizeof(nbuf), "#%u", c);
1952		cp = nbuf;
1953	    }
1954	    else
1955		cp = ent->name;
1956	    len = strlen(cp);
1957	    if (out + 2 + len > outend)
1958		break;
1959	    *out++ = '&';
1960	    memcpy(out, cp, len);
1961	    out += len;
1962	    *out++ = ';';
1963	}
1964	processed = in;
1965    }
1966    *outlen = out - outstart;
1967    *inlen = processed - instart;
1968    return(0);
1969}
1970
1971/************************************************************************
1972 *									*
1973 *		Commodity functions to handle streams			*
1974 *									*
1975 ************************************************************************/
1976
1977/**
1978 * htmlNewInputStream:
1979 * @ctxt:  an HTML parser context
1980 *
1981 * Create a new input stream structure
1982 * Returns the new input stream or NULL
1983 */
1984static htmlParserInputPtr
1985htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1986    htmlParserInputPtr input;
1987
1988    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1989    if (input == NULL) {
1990        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
1991	return(NULL);
1992    }
1993    memset(input, 0, sizeof(htmlParserInput));
1994    input->filename = NULL;
1995    input->directory = NULL;
1996    input->base = NULL;
1997    input->cur = NULL;
1998    input->buf = NULL;
1999    input->line = 1;
2000    input->col = 1;
2001    input->buf = NULL;
2002    input->free = NULL;
2003    input->version = NULL;
2004    input->consumed = 0;
2005    input->length = 0;
2006    return(input);
2007}
2008
2009
2010/************************************************************************
2011 *									*
2012 *		Commodity functions, cleanup needed ?			*
2013 *									*
2014 ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2029
2030/**
2031 * areBlanks:
2032 * @ctxt:  an HTML parser context
2033 * @str:  a xmlChar *
2034 * @len:  the size of @str
2035 *
2036 * Is this a sequence of blank chars that one can ignore ?
2037 *
2038 * Returns 1 if ignorable 0 otherwise.
2039 */
2040
2041static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2042    unsigned int i;
2043    int j;
2044    xmlNodePtr lastChild;
2045    xmlDtdPtr dtd;
2046
2047    for (j = 0;j < len;j++)
2048        if (!(IS_BLANK_CH(str[j]))) return(0);
2049
2050    if (CUR == 0) return(1);
2051    if (CUR != '<') return(0);
2052    if (ctxt->name == NULL)
2053	return(1);
2054    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2055	return(1);
2056    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2057	return(1);
2058
2059    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2060    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2061        dtd = xmlGetIntSubset(ctxt->myDoc);
2062        if (dtd != NULL && dtd->ExternalID != NULL) {
2063            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2064                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2065                return(1);
2066        }
2067    }
2068
2069    if (ctxt->node == NULL) return(0);
2070    lastChild = xmlGetLastChild(ctxt->node);
2071    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2072	lastChild = lastChild->prev;
2073    if (lastChild == NULL) {
2074        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2075            (ctxt->node->content != NULL)) return(0);
2076	/* keep ws in constructs like ...<b> </b>...
2077	   for all tags "b" allowing PCDATA */
2078	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2079	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2080		return(0);
2081	    }
2082	}
2083    } else if (xmlNodeIsText(lastChild)) {
2084        return(0);
2085    } else {
2086	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2087	   for all tags "p" allowing PCDATA */
2088	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2090		return(0);
2091	    }
2092	}
2093    }
2094    return(1);
2095}
2096
2097/**
2098 * htmlNewDocNoDtD:
2099 * @URI:  URI for the dtd, or NULL
2100 * @ExternalID:  the external ID of the DTD, or NULL
2101 *
2102 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2103 * are NULL
2104 *
2105 * Returns a new document, do not initialize the DTD if not provided
2106 */
2107htmlDocPtr
2108htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2109    xmlDocPtr cur;
2110
2111    /*
2112     * Allocate a new document and fill the fields.
2113     */
2114    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2115    if (cur == NULL) {
2116	htmlErrMemory(NULL, "HTML document creation failed\n");
2117	return(NULL);
2118    }
2119    memset(cur, 0, sizeof(xmlDoc));
2120
2121    cur->type = XML_HTML_DOCUMENT_NODE;
2122    cur->version = NULL;
2123    cur->intSubset = NULL;
2124    cur->doc = cur;
2125    cur->name = NULL;
2126    cur->children = NULL;
2127    cur->extSubset = NULL;
2128    cur->oldNs = NULL;
2129    cur->encoding = NULL;
2130    cur->standalone = 1;
2131    cur->compression = 0;
2132    cur->ids = NULL;
2133    cur->refs = NULL;
2134    cur->_private = NULL;
2135    cur->charset = XML_CHAR_ENCODING_UTF8;
2136    if ((ExternalID != NULL) ||
2137	(URI != NULL))
2138	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2139    return(cur);
2140}
2141
2142/**
2143 * htmlNewDoc:
2144 * @URI:  URI for the dtd, or NULL
2145 * @ExternalID:  the external ID of the DTD, or NULL
2146 *
2147 * Creates a new HTML document
2148 *
2149 * Returns a new document
2150 */
2151htmlDocPtr
2152htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2153    if ((URI == NULL) && (ExternalID == NULL))
2154	return(htmlNewDocNoDtD(
2155		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2156		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2157
2158    return(htmlNewDocNoDtD(URI, ExternalID));
2159}
2160
2161
2162/************************************************************************
2163 *									*
2164 *			The parser itself				*
2165 *	Relates to http://www.w3.org/TR/html40				*
2166 *									*
2167 ************************************************************************/
2168
2169/************************************************************************
2170 *									*
2171 *			The parser itself				*
2172 *									*
2173 ************************************************************************/
2174
2175static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2176
2177/**
2178 * htmlParseHTMLName:
2179 * @ctxt:  an HTML parser context
2180 *
2181 * parse an HTML tag or attribute name, note that we convert it to lowercase
2182 * since HTML names are not case-sensitive.
2183 *
2184 * Returns the Tag Name parsed or NULL
2185 */
2186
2187static const xmlChar *
2188htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2189    int i = 0;
2190    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2191
2192    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2193        (CUR != ':')) return(NULL);
2194
2195    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2196           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2197	   (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2198	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2199        else loc[i] = CUR;
2200	i++;
2201
2202	NEXT;
2203    }
2204
2205    return(xmlDictLookup(ctxt->dict, loc, i));
2206}
2207
2208/**
2209 * htmlParseName:
2210 * @ctxt:  an HTML parser context
2211 *
2212 * parse an HTML name, this routine is case sensitive.
2213 *
2214 * Returns the Name parsed or NULL
2215 */
2216
2217static const xmlChar *
2218htmlParseName(htmlParserCtxtPtr ctxt) {
2219    const xmlChar *in;
2220    const xmlChar *ret;
2221    int count = 0;
2222
2223    GROW;
2224
2225    /*
2226     * Accelerator for simple ASCII names
2227     */
2228    in = ctxt->input->cur;
2229    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2230	((*in >= 0x41) && (*in <= 0x5A)) ||
2231	(*in == '_') || (*in == ':')) {
2232	in++;
2233	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2234	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2235	       ((*in >= 0x30) && (*in <= 0x39)) ||
2236	       (*in == '_') || (*in == '-') ||
2237	       (*in == ':') || (*in == '.'))
2238	    in++;
2239	if ((*in > 0) && (*in < 0x80)) {
2240	    count = in - ctxt->input->cur;
2241	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2242	    ctxt->input->cur = in;
2243	    ctxt->nbChars += count;
2244	    ctxt->input->col += count;
2245	    return(ret);
2246	}
2247    }
2248    return(htmlParseNameComplex(ctxt));
2249}
2250
2251static const xmlChar *
2252htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2253    int len = 0, l;
2254    int c;
2255    int count = 0;
2256
2257    /*
2258     * Handler for more complex cases
2259     */
2260    GROW;
2261    c = CUR_CHAR(l);
2262    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2263	(!IS_LETTER(c) && (c != '_') &&
2264         (c != ':'))) {
2265	return(NULL);
2266    }
2267
2268    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2269	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2270            (c == '.') || (c == '-') ||
2271	    (c == '_') || (c == ':') ||
2272	    (IS_COMBINING(c)) ||
2273	    (IS_EXTENDER(c)))) {
2274	if (count++ > 100) {
2275	    count = 0;
2276	    GROW;
2277	}
2278	len += l;
2279	NEXTL(l);
2280	c = CUR_CHAR(l);
2281    }
2282    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2283}
2284
2285
2286/**
2287 * htmlParseHTMLAttribute:
2288 * @ctxt:  an HTML parser context
2289 * @stop:  a char stop value
2290 *
2291 * parse an HTML attribute value till the stop (quote), if
2292 * stop is 0 then it stops at the first space
2293 *
2294 * Returns the attribute parsed or NULL
2295 */
2296
2297static xmlChar *
2298htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2299    xmlChar *buffer = NULL;
2300    int buffer_size = 0;
2301    xmlChar *out = NULL;
2302    const xmlChar *name = NULL;
2303    const xmlChar *cur = NULL;
2304    const htmlEntityDesc * ent;
2305
2306    /*
2307     * allocate a translation buffer.
2308     */
2309    buffer_size = HTML_PARSER_BUFFER_SIZE;
2310    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2311    if (buffer == NULL) {
2312	htmlErrMemory(ctxt, "buffer allocation failed\n");
2313	return(NULL);
2314    }
2315    out = buffer;
2316
2317    /*
2318     * Ok loop until we reach one of the ending chars
2319     */
2320    while ((CUR != 0) && (CUR != stop)) {
2321	if ((stop == 0) && (CUR == '>')) break;
2322	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2323        if (CUR == '&') {
2324	    if (NXT(1) == '#') {
2325		unsigned int c;
2326		int bits;
2327
2328		c = htmlParseCharRef(ctxt);
2329		if      (c <    0x80)
2330		        { *out++  = c;                bits= -6; }
2331		else if (c <   0x800)
2332		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2333		else if (c < 0x10000)
2334		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2335		else
2336		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2337
2338		for ( ; bits >= 0; bits-= 6) {
2339		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2340		}
2341
2342		if (out - buffer > buffer_size - 100) {
2343			int indx = out - buffer;
2344
2345			growBuffer(buffer);
2346			out = &buffer[indx];
2347		}
2348	    } else {
2349		ent = htmlParseEntityRef(ctxt, &name);
2350		if (name == NULL) {
2351		    *out++ = '&';
2352		    if (out - buffer > buffer_size - 100) {
2353			int indx = out - buffer;
2354
2355			growBuffer(buffer);
2356			out = &buffer[indx];
2357		    }
2358		} else if (ent == NULL) {
2359		    *out++ = '&';
2360		    cur = name;
2361		    while (*cur != 0) {
2362			if (out - buffer > buffer_size - 100) {
2363			    int indx = out - buffer;
2364
2365			    growBuffer(buffer);
2366			    out = &buffer[indx];
2367			}
2368			*out++ = *cur++;
2369		    }
2370		} else {
2371		    unsigned int c;
2372		    int bits;
2373
2374		    if (out - buffer > buffer_size - 100) {
2375			int indx = out - buffer;
2376
2377			growBuffer(buffer);
2378			out = &buffer[indx];
2379		    }
2380		    c = ent->value;
2381		    if      (c <    0x80)
2382			{ *out++  = c;                bits= -6; }
2383		    else if (c <   0x800)
2384			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2385		    else if (c < 0x10000)
2386			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2387		    else
2388			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2389
2390		    for ( ; bits >= 0; bits-= 6) {
2391			*out++  = ((c >> bits) & 0x3F) | 0x80;
2392		    }
2393		}
2394	    }
2395	} else {
2396	    unsigned int c;
2397	    int bits, l;
2398
2399	    if (out - buffer > buffer_size - 100) {
2400		int indx = out - buffer;
2401
2402		growBuffer(buffer);
2403		out = &buffer[indx];
2404	    }
2405	    c = CUR_CHAR(l);
2406	    if      (c <    0x80)
2407		    { *out++  = c;                bits= -6; }
2408	    else if (c <   0x800)
2409		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2410	    else if (c < 0x10000)
2411		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2412	    else
2413		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2414
2415	    for ( ; bits >= 0; bits-= 6) {
2416		*out++  = ((c >> bits) & 0x3F) | 0x80;
2417	    }
2418	    NEXT;
2419	}
2420    }
2421    *out++ = 0;
2422    return(buffer);
2423}
2424
2425/**
2426 * htmlParseEntityRef:
2427 * @ctxt:  an HTML parser context
2428 * @str:  location to store the entity name
2429 *
2430 * parse an HTML ENTITY references
2431 *
2432 * [68] EntityRef ::= '&' Name ';'
2433 *
2434 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2435 *         if non-NULL *str will have to be freed by the caller.
2436 */
2437const htmlEntityDesc *
2438htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2439    const xmlChar *name;
2440    const htmlEntityDesc * ent = NULL;
2441
2442    if (str != NULL) *str = NULL;
2443    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2444
2445    if (CUR == '&') {
2446        NEXT;
2447        name = htmlParseName(ctxt);
2448	if (name == NULL) {
2449	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2450	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2451	} else {
2452	    GROW;
2453	    if (CUR == ';') {
2454	        if (str != NULL)
2455		    *str = name;
2456
2457		/*
2458		 * Lookup the entity in the table.
2459		 */
2460		ent = htmlEntityLookup(name);
2461		if (ent != NULL) /* OK that's ugly !!! */
2462		    NEXT;
2463	    } else {
2464		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2465		             "htmlParseEntityRef: expecting ';'\n",
2466			     NULL, NULL);
2467	        if (str != NULL)
2468		    *str = name;
2469	    }
2470	}
2471    }
2472    return(ent);
2473}
2474
2475/**
2476 * htmlParseAttValue:
2477 * @ctxt:  an HTML parser context
2478 *
2479 * parse a value for an attribute
2480 * Note: the parser won't do substitution of entities here, this
2481 * will be handled later in xmlStringGetNodeList, unless it was
2482 * asked for ctxt->replaceEntities != 0
2483 *
2484 * Returns the AttValue parsed or NULL.
2485 */
2486
2487static xmlChar *
2488htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2489    xmlChar *ret = NULL;
2490
2491    if (CUR == '"') {
2492        NEXT;
2493	ret = htmlParseHTMLAttribute(ctxt, '"');
2494        if (CUR != '"') {
2495	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2496	                 "AttValue: \" expected\n", NULL, NULL);
2497	} else
2498	    NEXT;
2499    } else if (CUR == '\'') {
2500        NEXT;
2501	ret = htmlParseHTMLAttribute(ctxt, '\'');
2502        if (CUR != '\'') {
2503	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2504	                 "AttValue: ' expected\n", NULL, NULL);
2505	} else
2506	    NEXT;
2507    } else {
2508        /*
2509	 * That's an HTMLism, the attribute value may not be quoted
2510	 */
2511	ret = htmlParseHTMLAttribute(ctxt, 0);
2512	if (ret == NULL) {
2513	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2514	                 "AttValue: no value found\n", NULL, NULL);
2515	}
2516    }
2517    return(ret);
2518}
2519
2520/**
2521 * htmlParseSystemLiteral:
2522 * @ctxt:  an HTML parser context
2523 *
2524 * parse an HTML Literal
2525 *
2526 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2527 *
2528 * Returns the SystemLiteral parsed or NULL
2529 */
2530
2531static xmlChar *
2532htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2533    const xmlChar *q;
2534    xmlChar *ret = NULL;
2535
2536    if (CUR == '"') {
2537        NEXT;
2538	q = CUR_PTR;
2539	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2540	    NEXT;
2541	if (!IS_CHAR_CH(CUR)) {
2542	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2543			 "Unfinished SystemLiteral\n", NULL, NULL);
2544	} else {
2545	    ret = xmlStrndup(q, CUR_PTR - q);
2546	    NEXT;
2547        }
2548    } else if (CUR == '\'') {
2549        NEXT;
2550	q = CUR_PTR;
2551	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2552	    NEXT;
2553	if (!IS_CHAR_CH(CUR)) {
2554	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2555			 "Unfinished SystemLiteral\n", NULL, NULL);
2556	} else {
2557	    ret = xmlStrndup(q, CUR_PTR - q);
2558	    NEXT;
2559        }
2560    } else {
2561	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2562	             " or ' expected\n", NULL, NULL);
2563    }
2564
2565    return(ret);
2566}
2567
2568/**
2569 * htmlParsePubidLiteral:
2570 * @ctxt:  an HTML parser context
2571 *
2572 * parse an HTML public literal
2573 *
2574 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2575 *
2576 * Returns the PubidLiteral parsed or NULL.
2577 */
2578
2579static xmlChar *
2580htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2581    const xmlChar *q;
2582    xmlChar *ret = NULL;
2583    /*
2584     * Name ::= (Letter | '_') (NameChar)*
2585     */
2586    if (CUR == '"') {
2587        NEXT;
2588	q = CUR_PTR;
2589	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2590	if (CUR != '"') {
2591	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2592	                 "Unfinished PubidLiteral\n", NULL, NULL);
2593	} else {
2594	    ret = xmlStrndup(q, CUR_PTR - q);
2595	    NEXT;
2596	}
2597    } else if (CUR == '\'') {
2598        NEXT;
2599	q = CUR_PTR;
2600	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2601	    NEXT;
2602	if (CUR != '\'') {
2603	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2604	                 "Unfinished PubidLiteral\n", NULL, NULL);
2605	} else {
2606	    ret = xmlStrndup(q, CUR_PTR - q);
2607	    NEXT;
2608	}
2609    } else {
2610	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2611	             "PubidLiteral \" or ' expected\n", NULL, NULL);
2612    }
2613
2614    return(ret);
2615}
2616
2617/**
2618 * htmlParseScript:
2619 * @ctxt:  an HTML parser context
2620 *
2621 * parse the content of an HTML SCRIPT or STYLE element
2622 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2623 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2624 * http://www.w3.org/TR/html4/types.html#type-script
2625 * http://www.w3.org/TR/html4/types.html#h-6.15
2626 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2627 *
2628 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2629 * element and the value of intrinsic event attributes. User agents must
2630 * not evaluate script data as HTML markup but instead must pass it on as
2631 * data to a script engine.
2632 * NOTES:
2633 * - The content is passed like CDATA
2634 * - the attributes for style and scripting "onXXX" are also described
2635 *   as CDATA but SGML allows entities references in attributes so their
2636 *   processing is identical as other attributes
2637 */
2638static void
2639htmlParseScript(htmlParserCtxtPtr ctxt) {
2640    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2641    int nbchar = 0;
2642    int cur,l;
2643
2644    SHRINK;
2645    cur = CUR_CHAR(l);
2646    while (IS_CHAR_CH(cur)) {
2647	if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2648	    (NXT(3) == '-')) {
2649	    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2650		if (ctxt->sax->cdataBlock!= NULL) {
2651		    /*
2652		     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2653		     */
2654		    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2655		} else if (ctxt->sax->characters != NULL) {
2656		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2657		}
2658	    }
2659	    nbchar = 0;
2660	    htmlParseComment(ctxt);
2661	    cur = CUR_CHAR(l);
2662	    continue;
2663	} else if ((cur == '<') && (NXT(1) == '/')) {
2664            /*
2665             * One should break here, the specification is clear:
2666             * Authors should therefore escape "</" within the content.
2667             * Escape mechanisms are specific to each scripting or
2668             * style sheet language.
2669             *
2670             * In recovery mode, only break if end tag match the
2671             * current tag, effectively ignoring all tags inside the
2672             * script/style block and treating the entire block as
2673             * CDATA.
2674             */
2675            if (ctxt->recovery) {
2676                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2677				   xmlStrlen(ctxt->name)) == 0)
2678                {
2679                    break; /* while */
2680                } else {
2681		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2682				 "Element %s embeds close tag\n",
2683		                 ctxt->name, NULL);
2684		}
2685            } else {
2686                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2687                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2688                {
2689                    break; /* while */
2690                }
2691            }
2692	}
2693	COPY_BUF(l,buf,nbchar,cur);
2694	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2695	    if (ctxt->sax->cdataBlock!= NULL) {
2696		/*
2697		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2698		 */
2699		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2700	    } else if (ctxt->sax->characters != NULL) {
2701		ctxt->sax->characters(ctxt->userData, buf, nbchar);
2702	    }
2703	    nbchar = 0;
2704	}
2705	GROW;
2706	NEXTL(l);
2707	cur = CUR_CHAR(l);
2708    }
2709
2710    if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2711	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2712	                "Invalid char in CDATA 0x%X\n", cur);
2713	NEXT;
2714    }
2715
2716    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2717	if (ctxt->sax->cdataBlock!= NULL) {
2718	    /*
2719	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2720	     */
2721	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2722	} else if (ctxt->sax->characters != NULL) {
2723	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2724	}
2725    }
2726}
2727
2728
2729/**
2730 * htmlParseCharData:
2731 * @ctxt:  an HTML parser context
2732 *
2733 * parse a CharData section.
2734 * if we are within a CDATA section ']]>' marks an end of section.
2735 *
2736 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2737 */
2738
2739static void
2740htmlParseCharData(htmlParserCtxtPtr ctxt) {
2741    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2742    int nbchar = 0;
2743    int cur, l;
2744
2745    SHRINK;
2746    cur = CUR_CHAR(l);
2747    while (((cur != '<') || (ctxt->token == '<')) &&
2748           ((cur != '&') || (ctxt->token == '&')) &&
2749	   (IS_CHAR(cur))) {
2750	COPY_BUF(l,buf,nbchar,cur);
2751	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2752	    /*
2753	     * Ok the segment is to be consumed as chars.
2754	     */
2755	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2756		if (areBlanks(ctxt, buf, nbchar)) {
2757		    if (ctxt->sax->ignorableWhitespace != NULL)
2758			ctxt->sax->ignorableWhitespace(ctxt->userData,
2759			                               buf, nbchar);
2760		} else {
2761		    htmlCheckParagraph(ctxt);
2762		    if (ctxt->sax->characters != NULL)
2763			ctxt->sax->characters(ctxt->userData, buf, nbchar);
2764		}
2765	    }
2766	    nbchar = 0;
2767	}
2768	NEXTL(l);
2769	cur = CUR_CHAR(l);
2770	if (cur == 0) {
2771	    SHRINK;
2772	    GROW;
2773	    cur = CUR_CHAR(l);
2774	}
2775    }
2776    if (nbchar != 0) {
2777        buf[nbchar] = 0;
2778
2779	/*
2780	 * Ok the segment is to be consumed as chars.
2781	 */
2782	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2783	    if (areBlanks(ctxt, buf, nbchar)) {
2784		if (ctxt->sax->ignorableWhitespace != NULL)
2785		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2786	    } else {
2787		htmlCheckParagraph(ctxt);
2788		if (ctxt->sax->characters != NULL)
2789		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2790	    }
2791	}
2792    } else {
2793	/*
2794	 * Loop detection
2795	 */
2796	if (cur == 0)
2797	    ctxt->instate = XML_PARSER_EOF;
2798    }
2799}
2800
2801/**
2802 * htmlParseExternalID:
2803 * @ctxt:  an HTML parser context
2804 * @publicID:  a xmlChar** receiving PubidLiteral
2805 *
2806 * Parse an External ID or a Public ID
2807 *
2808 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2809 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
2810 *
2811 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2812 *
2813 * Returns the function returns SystemLiteral and in the second
2814 *                case publicID receives PubidLiteral, is strict is off
2815 *                it is possible to return NULL and have publicID set.
2816 */
2817
2818static xmlChar *
2819htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2820    xmlChar *URI = NULL;
2821
2822    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2823         (UPP(2) == 'S') && (UPP(3) == 'T') &&
2824	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2825        SKIP(6);
2826	if (!IS_BLANK_CH(CUR)) {
2827	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2828	                 "Space required after 'SYSTEM'\n", NULL, NULL);
2829	}
2830        SKIP_BLANKS;
2831	URI = htmlParseSystemLiteral(ctxt);
2832	if (URI == NULL) {
2833	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2834	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2835        }
2836    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2837	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
2838	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
2839        SKIP(6);
2840	if (!IS_BLANK_CH(CUR)) {
2841	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2842	                 "Space required after 'PUBLIC'\n", NULL, NULL);
2843	}
2844        SKIP_BLANKS;
2845	*publicID = htmlParsePubidLiteral(ctxt);
2846	if (*publicID == NULL) {
2847	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2848	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2849			 NULL, NULL);
2850	}
2851        SKIP_BLANKS;
2852        if ((CUR == '"') || (CUR == '\'')) {
2853	    URI = htmlParseSystemLiteral(ctxt);
2854	}
2855    }
2856    return(URI);
2857}
2858
2859/**
2860 * xmlParsePI:
2861 * @ctxt:  an XML parser context
2862 *
2863 * parse an XML Processing Instruction.
2864 *
2865 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2866 */
2867static void
2868htmlParsePI(htmlParserCtxtPtr ctxt) {
2869    xmlChar *buf = NULL;
2870    int len = 0;
2871    int size = HTML_PARSER_BUFFER_SIZE;
2872    int cur, l;
2873    const xmlChar *target;
2874    xmlParserInputState state;
2875    int count = 0;
2876
2877    if ((RAW == '<') && (NXT(1) == '?')) {
2878	state = ctxt->instate;
2879        ctxt->instate = XML_PARSER_PI;
2880	/*
2881	 * this is a Processing Instruction.
2882	 */
2883	SKIP(2);
2884	SHRINK;
2885
2886	/*
2887	 * Parse the target name and check for special support like
2888	 * namespace.
2889	 */
2890        target = htmlParseName(ctxt);
2891	if (target != NULL) {
2892	    if (RAW == '>') {
2893		SKIP(1);
2894
2895		/*
2896		 * SAX: PI detected.
2897		 */
2898		if ((ctxt->sax) && (!ctxt->disableSAX) &&
2899		    (ctxt->sax->processingInstruction != NULL))
2900		    ctxt->sax->processingInstruction(ctxt->userData,
2901		                                     target, NULL);
2902		ctxt->instate = state;
2903		return;
2904	    }
2905	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2906	    if (buf == NULL) {
2907		htmlErrMemory(ctxt, NULL);
2908		ctxt->instate = state;
2909		return;
2910	    }
2911	    cur = CUR;
2912	    if (!IS_BLANK(cur)) {
2913		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2914			  "ParsePI: PI %s space expected\n", target, NULL);
2915	    }
2916            SKIP_BLANKS;
2917	    cur = CUR_CHAR(l);
2918	    while (IS_CHAR(cur) && (cur != '>')) {
2919		if (len + 5 >= size) {
2920		    xmlChar *tmp;
2921
2922		    size *= 2;
2923		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2924		    if (tmp == NULL) {
2925			htmlErrMemory(ctxt, NULL);
2926			xmlFree(buf);
2927			ctxt->instate = state;
2928			return;
2929		    }
2930		    buf = tmp;
2931		}
2932		count++;
2933		if (count > 50) {
2934		    GROW;
2935		    count = 0;
2936		}
2937		COPY_BUF(l,buf,len,cur);
2938		NEXTL(l);
2939		cur = CUR_CHAR(l);
2940		if (cur == 0) {
2941		    SHRINK;
2942		    GROW;
2943		    cur = CUR_CHAR(l);
2944		}
2945	    }
2946	    buf[len] = 0;
2947	    if (cur != '>') {
2948		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2949		      "ParsePI: PI %s never end ...\n", target, NULL);
2950	    } else {
2951		SKIP(1);
2952
2953		/*
2954		 * SAX: PI detected.
2955		 */
2956		if ((ctxt->sax) && (!ctxt->disableSAX) &&
2957		    (ctxt->sax->processingInstruction != NULL))
2958		    ctxt->sax->processingInstruction(ctxt->userData,
2959		                                     target, buf);
2960	    }
2961	    xmlFree(buf);
2962	} else {
2963	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2964                         "PI is not started correctly", NULL, NULL);
2965	}
2966	ctxt->instate = state;
2967    }
2968}
2969
2970/**
2971 * htmlParseComment:
2972 * @ctxt:  an HTML parser context
2973 *
2974 * Parse an XML (SGML) comment <!-- .... -->
2975 *
2976 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2977 */
2978static void
2979htmlParseComment(htmlParserCtxtPtr ctxt) {
2980    xmlChar *buf = NULL;
2981    int len;
2982    int size = HTML_PARSER_BUFFER_SIZE;
2983    int q, ql;
2984    int r, rl;
2985    int cur, l;
2986    xmlParserInputState state;
2987
2988    /*
2989     * Check that there is a comment right here.
2990     */
2991    if ((RAW != '<') || (NXT(1) != '!') ||
2992        (NXT(2) != '-') || (NXT(3) != '-')) return;
2993
2994    state = ctxt->instate;
2995    ctxt->instate = XML_PARSER_COMMENT;
2996    SHRINK;
2997    SKIP(4);
2998    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2999    if (buf == NULL) {
3000        htmlErrMemory(ctxt, "buffer allocation failed\n");
3001	ctxt->instate = state;
3002	return;
3003    }
3004    q = CUR_CHAR(ql);
3005    NEXTL(ql);
3006    r = CUR_CHAR(rl);
3007    NEXTL(rl);
3008    cur = CUR_CHAR(l);
3009    len = 0;
3010    while (IS_CHAR(cur) &&
3011           ((cur != '>') ||
3012	    (r != '-') || (q != '-'))) {
3013	if (len + 5 >= size) {
3014	    xmlChar *tmp;
3015
3016	    size *= 2;
3017	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3018	    if (tmp == NULL) {
3019	        xmlFree(buf);
3020	        htmlErrMemory(ctxt, "growing buffer failed\n");
3021		ctxt->instate = state;
3022		return;
3023	    }
3024	    buf = tmp;
3025	}
3026	COPY_BUF(ql,buf,len,q);
3027	q = r;
3028	ql = rl;
3029	r = cur;
3030	rl = l;
3031	NEXTL(l);
3032	cur = CUR_CHAR(l);
3033	if (cur == 0) {
3034	    SHRINK;
3035	    GROW;
3036	    cur = CUR_CHAR(l);
3037	}
3038    }
3039    buf[len] = 0;
3040    if (!IS_CHAR(cur)) {
3041	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3042	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
3043	xmlFree(buf);
3044    } else {
3045        NEXT;
3046	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3047	    (!ctxt->disableSAX))
3048	    ctxt->sax->comment(ctxt->userData, buf);
3049	xmlFree(buf);
3050    }
3051    ctxt->instate = state;
3052}
3053
3054/**
3055 * htmlParseCharRef:
3056 * @ctxt:  an HTML parser context
3057 *
3058 * parse Reference declarations
3059 *
3060 * [66] CharRef ::= '&#' [0-9]+ ';' |
3061 *                  '&#x' [0-9a-fA-F]+ ';'
3062 *
3063 * Returns the value parsed (as an int)
3064 */
3065int
3066htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3067    int val = 0;
3068
3069    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3070	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3071		     "htmlParseCharRef: context error\n",
3072		     NULL, NULL);
3073        return(0);
3074    }
3075    if ((CUR == '&') && (NXT(1) == '#') &&
3076        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3077	SKIP(3);
3078	while (CUR != ';') {
3079	    if ((CUR >= '0') && (CUR <= '9'))
3080	        val = val * 16 + (CUR - '0');
3081	    else if ((CUR >= 'a') && (CUR <= 'f'))
3082	        val = val * 16 + (CUR - 'a') + 10;
3083	    else if ((CUR >= 'A') && (CUR <= 'F'))
3084	        val = val * 16 + (CUR - 'A') + 10;
3085	    else {
3086	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3087		             "htmlParseCharRef: invalid hexadecimal value\n",
3088			     NULL, NULL);
3089		return(0);
3090	    }
3091	    NEXT;
3092	}
3093	if (CUR == ';')
3094	    NEXT;
3095    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3096	SKIP(2);
3097	while (CUR != ';') {
3098	    if ((CUR >= '0') && (CUR <= '9'))
3099	        val = val * 10 + (CUR - '0');
3100	    else {
3101	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3102		             "htmlParseCharRef: invalid decimal value\n",
3103			     NULL, NULL);
3104		return(0);
3105	    }
3106	    NEXT;
3107	}
3108	if (CUR == ';')
3109	    NEXT;
3110    } else {
3111	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3112	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3113    }
3114    /*
3115     * Check the value IS_CHAR ...
3116     */
3117    if (IS_CHAR(val)) {
3118        return(val);
3119    } else {
3120	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3121			"htmlParseCharRef: invalid xmlChar value %d\n",
3122			val);
3123    }
3124    return(0);
3125}
3126
3127
3128/**
3129 * htmlParseDocTypeDecl:
3130 * @ctxt:  an HTML parser context
3131 *
3132 * parse a DOCTYPE declaration
3133 *
3134 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3135 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3136 */
3137
3138static void
3139htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3140    const xmlChar *name;
3141    xmlChar *ExternalID = NULL;
3142    xmlChar *URI = NULL;
3143
3144    /*
3145     * We know that '<!DOCTYPE' has been detected.
3146     */
3147    SKIP(9);
3148
3149    SKIP_BLANKS;
3150
3151    /*
3152     * Parse the DOCTYPE name.
3153     */
3154    name = htmlParseName(ctxt);
3155    if (name == NULL) {
3156	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3157	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3158		     NULL, NULL);
3159    }
3160    /*
3161     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3162     */
3163
3164    SKIP_BLANKS;
3165
3166    /*
3167     * Check for SystemID and ExternalID
3168     */
3169    URI = htmlParseExternalID(ctxt, &ExternalID);
3170    SKIP_BLANKS;
3171
3172    /*
3173     * We should be at the end of the DOCTYPE declaration.
3174     */
3175    if (CUR != '>') {
3176	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3177	             "DOCTYPE improperly terminated\n", NULL, NULL);
3178        /* We shouldn't try to resynchronize ... */
3179    }
3180    NEXT;
3181
3182    /*
3183     * Create or update the document accordingly to the DOCTYPE
3184     */
3185    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3186	(!ctxt->disableSAX))
3187	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3188
3189    /*
3190     * Cleanup, since we don't use all those identifiers
3191     */
3192    if (URI != NULL) xmlFree(URI);
3193    if (ExternalID != NULL) xmlFree(ExternalID);
3194}
3195
3196/**
3197 * htmlParseAttribute:
3198 * @ctxt:  an HTML parser context
3199 * @value:  a xmlChar ** used to store the value of the attribute
3200 *
3201 * parse an attribute
3202 *
3203 * [41] Attribute ::= Name Eq AttValue
3204 *
3205 * [25] Eq ::= S? '=' S?
3206 *
3207 * With namespace:
3208 *
3209 * [NS 11] Attribute ::= QName Eq AttValue
3210 *
3211 * Also the case QName == xmlns:??? is handled independently as a namespace
3212 * definition.
3213 *
3214 * Returns the attribute name, and the value in *value.
3215 */
3216
3217static const xmlChar *
3218htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3219    const xmlChar *name;
3220    xmlChar *val = NULL;
3221
3222    *value = NULL;
3223    name = htmlParseHTMLName(ctxt);
3224    if (name == NULL) {
3225	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3226	             "error parsing attribute name\n", NULL, NULL);
3227        return(NULL);
3228    }
3229
3230    /*
3231     * read the value
3232     */
3233    SKIP_BLANKS;
3234    if (CUR == '=') {
3235        NEXT;
3236	SKIP_BLANKS;
3237	val = htmlParseAttValue(ctxt);
3238    } else if (htmlIsBooleanAttr(name)) {
3239        /*
3240	 * assume a minimized attribute
3241	 */
3242	val = xmlStrdup(name);
3243    }
3244
3245    *value = val;
3246    return(name);
3247}
3248
3249/**
3250 * htmlCheckEncoding:
3251 * @ctxt:  an HTML parser context
3252 * @attvalue: the attribute value
3253 *
3254 * Checks an http-equiv attribute from a Meta tag to detect
3255 * the encoding
3256 * If a new encoding is detected the parser is switched to decode
3257 * it and pass UTF8
3258 */
3259static void
3260htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3261    const xmlChar *encoding;
3262
3263    if ((ctxt == NULL) || (attvalue == NULL))
3264	return;
3265
3266    /* do not change encoding */
3267    if (ctxt->input->encoding != NULL)
3268        return;
3269
3270    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3271    if (encoding != NULL) {
3272	encoding += 8;
3273    } else {
3274	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3275	if (encoding != NULL)
3276	    encoding += 9;
3277    }
3278    if (encoding != NULL) {
3279	xmlCharEncoding enc;
3280	xmlCharEncodingHandlerPtr handler;
3281
3282	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3283
3284	if (ctxt->input->encoding != NULL)
3285	    xmlFree((xmlChar *) ctxt->input->encoding);
3286	ctxt->input->encoding = xmlStrdup(encoding);
3287
3288	enc = xmlParseCharEncoding((const char *) encoding);
3289	/*
3290	 * registered set of known encodings
3291	 */
3292	if (enc != XML_CHAR_ENCODING_ERROR) {
3293	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3294	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3295		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3296		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3297		(ctxt->input->buf != NULL) &&
3298		(ctxt->input->buf->encoder == NULL)) {
3299		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3300		             "htmlCheckEncoding: wrong encoding meta\n",
3301			     NULL, NULL);
3302	    } else {
3303		xmlSwitchEncoding(ctxt, enc);
3304	    }
3305	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3306	} else {
3307	    /*
3308	     * fallback for unknown encodings
3309	     */
3310	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3311	    if (handler != NULL) {
3312		xmlSwitchToEncoding(ctxt, handler);
3313		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3314	    } else {
3315		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3316	    }
3317	}
3318
3319	if ((ctxt->input->buf != NULL) &&
3320	    (ctxt->input->buf->encoder != NULL) &&
3321	    (ctxt->input->buf->raw != NULL) &&
3322	    (ctxt->input->buf->buffer != NULL)) {
3323	    int nbchars;
3324	    int processed;
3325
3326	    /*
3327	     * convert as much as possible to the parser reading buffer.
3328	     */
3329	    processed = ctxt->input->cur - ctxt->input->base;
3330	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
3331	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3332		                       ctxt->input->buf->buffer,
3333				       ctxt->input->buf->raw);
3334	    if (nbchars < 0) {
3335		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3336		             "htmlCheckEncoding: encoder error\n",
3337			     NULL, NULL);
3338	    }
3339	    ctxt->input->base =
3340	    ctxt->input->cur = ctxt->input->buf->buffer->content;
3341	}
3342    }
3343}
3344
3345/**
3346 * htmlCheckMeta:
3347 * @ctxt:  an HTML parser context
3348 * @atts:  the attributes values
3349 *
3350 * Checks an attributes from a Meta tag
3351 */
3352static void
3353htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3354    int i;
3355    const xmlChar *att, *value;
3356    int http = 0;
3357    const xmlChar *content = NULL;
3358
3359    if ((ctxt == NULL) || (atts == NULL))
3360	return;
3361
3362    i = 0;
3363    att = atts[i++];
3364    while (att != NULL) {
3365	value = atts[i++];
3366	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3367	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3368	    http = 1;
3369	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3370	    content = value;
3371	att = atts[i++];
3372    }
3373    if ((http) && (content != NULL))
3374	htmlCheckEncoding(ctxt, content);
3375
3376}
3377
3378/**
3379 * htmlParseStartTag:
3380 * @ctxt:  an HTML parser context
3381 *
3382 * parse a start of tag either for rule element or
3383 * EmptyElement. In both case we don't parse the tag closing chars.
3384 *
3385 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3386 *
3387 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3388 *
3389 * With namespace:
3390 *
3391 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3392 *
3393 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3394 *
3395 * Returns 0 in case of success and -1 in case of error.
3396 */
3397
3398static int
3399htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3400    const xmlChar *name;
3401    const xmlChar *attname;
3402    xmlChar *attvalue;
3403    const xmlChar **atts;
3404    int nbatts = 0;
3405    int maxatts;
3406    int meta = 0;
3407    int i;
3408
3409    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3410	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3411		     "htmlParseStartTag: context error\n", NULL, NULL);
3412	return -1;
3413    }
3414    if (CUR != '<') return -1;
3415    NEXT;
3416
3417    atts = ctxt->atts;
3418    maxatts = ctxt->maxatts;
3419
3420    GROW;
3421    name = htmlParseHTMLName(ctxt);
3422    if (name == NULL) {
3423	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3424	             "htmlParseStartTag: invalid element name\n",
3425		     NULL, NULL);
3426	/* Dump the bogus tag like browsers do */
3427	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3428	    NEXT;
3429        return -1;
3430    }
3431    if (xmlStrEqual(name, BAD_CAST"meta"))
3432	meta = 1;
3433
3434    /*
3435     * Check for auto-closure of HTML elements.
3436     */
3437    htmlAutoClose(ctxt, name);
3438
3439    /*
3440     * Check for implied HTML elements.
3441     */
3442    htmlCheckImplied(ctxt, name);
3443
3444    /*
3445     * Avoid html at any level > 0, head at any level != 1
3446     * or any attempt to recurse body
3447     */
3448    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3449	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3450	             "htmlParseStartTag: misplaced <html> tag\n",
3451		     name, NULL);
3452	return 0;
3453    }
3454    if ((ctxt->nameNr != 1) &&
3455	(xmlStrEqual(name, BAD_CAST"head"))) {
3456	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3457	             "htmlParseStartTag: misplaced <head> tag\n",
3458		     name, NULL);
3459	return 0;
3460    }
3461    if (xmlStrEqual(name, BAD_CAST"body")) {
3462	int indx;
3463	for (indx = 0;indx < ctxt->nameNr;indx++) {
3464	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3465		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3466		             "htmlParseStartTag: misplaced <body> tag\n",
3467			     name, NULL);
3468		while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3469		    NEXT;
3470		return 0;
3471	    }
3472	}
3473    }
3474
3475    /*
3476     * Now parse the attributes, it ends up with the ending
3477     *
3478     * (S Attribute)* S?
3479     */
3480    SKIP_BLANKS;
3481    while ((IS_CHAR_CH(CUR)) &&
3482           (CUR != '>') &&
3483	   ((CUR != '/') || (NXT(1) != '>'))) {
3484	long cons = ctxt->nbChars;
3485
3486	GROW;
3487	attname = htmlParseAttribute(ctxt, &attvalue);
3488        if (attname != NULL) {
3489
3490	    /*
3491	     * Well formedness requires at most one declaration of an attribute
3492	     */
3493	    for (i = 0; i < nbatts;i += 2) {
3494	        if (xmlStrEqual(atts[i], attname)) {
3495		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3496		                 "Attribute %s redefined\n", attname, NULL);
3497		    if (attvalue != NULL)
3498			xmlFree(attvalue);
3499		    goto failed;
3500		}
3501	    }
3502
3503	    /*
3504	     * Add the pair to atts
3505	     */
3506	    if (atts == NULL) {
3507	        maxatts = 22; /* allow for 10 attrs by default */
3508	        atts = (const xmlChar **)
3509		       xmlMalloc(maxatts * sizeof(xmlChar *));
3510		if (atts == NULL) {
3511		    htmlErrMemory(ctxt, NULL);
3512		    if (attvalue != NULL)
3513			xmlFree(attvalue);
3514		    goto failed;
3515		}
3516		ctxt->atts = atts;
3517		ctxt->maxatts = maxatts;
3518	    } else if (nbatts + 4 > maxatts) {
3519	        const xmlChar **n;
3520
3521	        maxatts *= 2;
3522	        n = (const xmlChar **) xmlRealloc((void *) atts,
3523					     maxatts * sizeof(const xmlChar *));
3524		if (n == NULL) {
3525		    htmlErrMemory(ctxt, NULL);
3526		    if (attvalue != NULL)
3527			xmlFree(attvalue);
3528		    goto failed;
3529		}
3530		atts = n;
3531		ctxt->atts = atts;
3532		ctxt->maxatts = maxatts;
3533	    }
3534	    atts[nbatts++] = attname;
3535	    atts[nbatts++] = attvalue;
3536	    atts[nbatts] = NULL;
3537	    atts[nbatts + 1] = NULL;
3538	}
3539	else {
3540	    if (attvalue != NULL)
3541	        xmlFree(attvalue);
3542	    /* Dump the bogus attribute string up to the next blank or
3543	     * the end of the tag. */
3544	    while ((IS_CHAR_CH(CUR)) &&
3545	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3546		   ((CUR != '/') || (NXT(1) != '>')))
3547		NEXT;
3548	}
3549
3550failed:
3551	SKIP_BLANKS;
3552        if (cons == ctxt->nbChars) {
3553	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3554	                 "htmlParseStartTag: problem parsing attributes\n",
3555			 NULL, NULL);
3556	    break;
3557	}
3558    }
3559
3560    /*
3561     * Handle specific association to the META tag
3562     */
3563    if (meta)
3564	htmlCheckMeta(ctxt, atts);
3565
3566    /*
3567     * SAX: Start of Element !
3568     */
3569    htmlnamePush(ctxt, name);
3570    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3571	if (nbatts != 0)
3572            ctxt->sax->startElement(ctxt->userData, name, atts);
3573	else
3574            ctxt->sax->startElement(ctxt->userData, name, NULL);
3575    }
3576
3577    if (atts != NULL) {
3578        for (i = 1;i < nbatts;i += 2) {
3579	    if (atts[i] != NULL)
3580		xmlFree((xmlChar *) atts[i]);
3581	}
3582    }
3583
3584    return 0;
3585}
3586
3587/**
3588 * htmlParseEndTag:
3589 * @ctxt:  an HTML parser context
3590 *
3591 * parse an end of tag
3592 *
3593 * [42] ETag ::= '</' Name S? '>'
3594 *
3595 * With namespace
3596 *
3597 * [NS 9] ETag ::= '</' QName S? '>'
3598 *
3599 * Returns 1 if the current level should be closed.
3600 */
3601
3602static int
3603htmlParseEndTag(htmlParserCtxtPtr ctxt)
3604{
3605    const xmlChar *name;
3606    const xmlChar *oldname;
3607    int i, ret;
3608
3609    if ((CUR != '<') || (NXT(1) != '/')) {
3610        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3611	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
3612        return (0);
3613    }
3614    SKIP(2);
3615
3616    name = htmlParseHTMLName(ctxt);
3617    if (name == NULL)
3618        return (0);
3619
3620    /*
3621     * We should definitely be at the ending "S? '>'" part
3622     */
3623    SKIP_BLANKS;
3624    if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3625        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3626	             "End tag : expected '>'\n", NULL, NULL);
3627	if (ctxt->recovery) {
3628	    /*
3629	     * We're not at the ending > !!
3630	     * Error, unless in recover mode where we search forwards
3631	     * until we find a >
3632	     */
3633	    while (CUR != '\0' && CUR != '>') NEXT;
3634	    NEXT;
3635	}
3636    } else
3637        NEXT;
3638
3639    /*
3640     * If the name read is not one of the element in the parsing stack
3641     * then return, it's just an error.
3642     */
3643    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3644        if (xmlStrEqual(name, ctxt->nameTab[i]))
3645            break;
3646    }
3647    if (i < 0) {
3648        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3649	             "Unexpected end tag : %s\n", name, NULL);
3650        return (0);
3651    }
3652
3653
3654    /*
3655     * Check for auto-closure of HTML elements.
3656     */
3657
3658    htmlAutoCloseOnClose(ctxt, name);
3659
3660    /*
3661     * Well formedness constraints, opening and closing must match.
3662     * With the exception that the autoclose may have popped stuff out
3663     * of the stack.
3664     */
3665    if (!xmlStrEqual(name, ctxt->name)) {
3666        if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3667            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3668	                 "Opening and ending tag mismatch: %s and %s\n",
3669			 name, ctxt->name);
3670        }
3671    }
3672
3673    /*
3674     * SAX: End of Tag
3675     */
3676    oldname = ctxt->name;
3677    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3678        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3679            ctxt->sax->endElement(ctxt->userData, name);
3680        htmlnamePop(ctxt);
3681        ret = 1;
3682    } else {
3683        ret = 0;
3684    }
3685
3686    return (ret);
3687}
3688
3689
3690/**
3691 * htmlParseReference:
3692 * @ctxt:  an HTML parser context
3693 *
3694 * parse and handle entity references in content,
3695 * this will end-up in a call to character() since this is either a
3696 * CharRef, or a predefined entity.
3697 */
3698static void
3699htmlParseReference(htmlParserCtxtPtr ctxt) {
3700    const htmlEntityDesc * ent;
3701    xmlChar out[6];
3702    const xmlChar *name;
3703    if (CUR != '&') return;
3704
3705    if (NXT(1) == '#') {
3706	unsigned int c;
3707	int bits, i = 0;
3708
3709	c = htmlParseCharRef(ctxt);
3710	if (c == 0)
3711	    return;
3712
3713        if      (c <    0x80) { out[i++]= c;                bits= -6; }
3714        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3715        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3716        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3717
3718        for ( ; bits >= 0; bits-= 6) {
3719            out[i++]= ((c >> bits) & 0x3F) | 0x80;
3720        }
3721	out[i] = 0;
3722
3723	htmlCheckParagraph(ctxt);
3724	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3725	    ctxt->sax->characters(ctxt->userData, out, i);
3726    } else {
3727	ent = htmlParseEntityRef(ctxt, &name);
3728	if (name == NULL) {
3729	    htmlCheckParagraph(ctxt);
3730	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3731	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3732	    return;
3733	}
3734	if ((ent == NULL) || !(ent->value > 0)) {
3735	    htmlCheckParagraph(ctxt);
3736	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3737		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3738		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3739		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3740	    }
3741	} else {
3742	    unsigned int c;
3743	    int bits, i = 0;
3744
3745	    c = ent->value;
3746	    if      (c <    0x80)
3747	            { out[i++]= c;                bits= -6; }
3748	    else if (c <   0x800)
3749	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3750	    else if (c < 0x10000)
3751	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3752	    else
3753	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3754
3755	    for ( ; bits >= 0; bits-= 6) {
3756		out[i++]= ((c >> bits) & 0x3F) | 0x80;
3757	    }
3758	    out[i] = 0;
3759
3760	    htmlCheckParagraph(ctxt);
3761	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3762		ctxt->sax->characters(ctxt->userData, out, i);
3763	}
3764    }
3765}
3766
3767/**
3768 * htmlParseContent:
3769 * @ctxt:  an HTML parser context
3770 *
3771 * Parse a content: comment, sub-element, reference or text.
3772 */
3773
3774static void
3775htmlParseContent(htmlParserCtxtPtr ctxt) {
3776    xmlChar *currentNode;
3777    int depth;
3778
3779    currentNode = xmlStrdup(ctxt->name);
3780    depth = ctxt->nameNr;
3781    while (1) {
3782	long cons = ctxt->nbChars;
3783
3784        GROW;
3785	/*
3786	 * Our tag or one of it's parent or children is ending.
3787	 */
3788        if ((CUR == '<') && (NXT(1) == '/')) {
3789	    if (htmlParseEndTag(ctxt) &&
3790		((currentNode != NULL) || (ctxt->nameNr == 0))) {
3791		if (currentNode != NULL)
3792		    xmlFree(currentNode);
3793		return;
3794	    }
3795	    continue; /* while */
3796        }
3797
3798	/*
3799	 * Has this node been popped out during parsing of
3800	 * the next element
3801	 */
3802        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3803	    (!xmlStrEqual(currentNode, ctxt->name)))
3804	     {
3805	    if (currentNode != NULL) xmlFree(currentNode);
3806	    return;
3807	}
3808
3809	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3810	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3811	    /*
3812	     * Handle SCRIPT/STYLE separately
3813	     */
3814	    htmlParseScript(ctxt);
3815	} else {
3816	    /*
3817	     * Sometimes DOCTYPE arrives in the middle of the document
3818	     */
3819	    if ((CUR == '<') && (NXT(1) == '!') &&
3820		(UPP(2) == 'D') && (UPP(3) == 'O') &&
3821		(UPP(4) == 'C') && (UPP(5) == 'T') &&
3822		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
3823		(UPP(8) == 'E')) {
3824		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3825		             "Misplaced DOCTYPE declaration\n",
3826			     BAD_CAST "DOCTYPE" , NULL);
3827		htmlParseDocTypeDecl(ctxt);
3828	    }
3829
3830	    /*
3831	     * First case :  a comment
3832	     */
3833	    if ((CUR == '<') && (NXT(1) == '!') &&
3834		(NXT(2) == '-') && (NXT(3) == '-')) {
3835		htmlParseComment(ctxt);
3836	    }
3837
3838	    /*
3839	     * Second case : a Processing Instruction.
3840	     */
3841	    else if ((CUR == '<') && (NXT(1) == '?')) {
3842		htmlParsePI(ctxt);
3843	    }
3844
3845	    /*
3846	     * Third case :  a sub-element.
3847	     */
3848	    else if (CUR == '<') {
3849		htmlParseElement(ctxt);
3850	    }
3851
3852	    /*
3853	     * Fourth case : a reference. If if has not been resolved,
3854	     *    parsing returns it's Name, create the node
3855	     */
3856	    else if (CUR == '&') {
3857		htmlParseReference(ctxt);
3858	    }
3859
3860	    /*
3861	     * Fifth case : end of the resource
3862	     */
3863	    else if (CUR == 0) {
3864		htmlAutoCloseOnEnd(ctxt);
3865		break;
3866	    }
3867
3868	    /*
3869	     * Last case, text. Note that References are handled directly.
3870	     */
3871	    else {
3872		htmlParseCharData(ctxt);
3873	    }
3874
3875	    if (cons == ctxt->nbChars) {
3876		if (ctxt->node != NULL) {
3877		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3878		                 "detected an error in element content\n",
3879				 NULL, NULL);
3880		}
3881		break;
3882	    }
3883	}
3884        GROW;
3885    }
3886    if (currentNode != NULL) xmlFree(currentNode);
3887}
3888
3889/**
3890 * htmlParseContent:
3891 * @ctxt:  an HTML parser context
3892 *
3893 * Parse a content: comment, sub-element, reference or text.
3894 */
3895
3896void
3897__htmlParseContent(void *ctxt) {
3898    if (ctxt != NULL)
3899	htmlParseContent((htmlParserCtxtPtr) ctxt);
3900}
3901
3902/**
3903 * htmlParseElement:
3904 * @ctxt:  an HTML parser context
3905 *
3906 * parse an HTML element, this is highly recursive
3907 *
3908 * [39] element ::= EmptyElemTag | STag content ETag
3909 *
3910 * [41] Attribute ::= Name Eq AttValue
3911 */
3912
3913void
3914htmlParseElement(htmlParserCtxtPtr ctxt) {
3915    const xmlChar *name;
3916    xmlChar *currentNode = NULL;
3917    const htmlElemDesc * info;
3918    htmlParserNodeInfo node_info;
3919    int failed;
3920    int depth;
3921    const xmlChar *oldptr;
3922
3923    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3924	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3925		     "htmlParseElement: context error\n", NULL, NULL);
3926	return;
3927    }
3928    /* Capture start position */
3929    if (ctxt->record_info) {
3930        node_info.begin_pos = ctxt->input->consumed +
3931                          (CUR_PTR - ctxt->input->base);
3932	node_info.begin_line = ctxt->input->line;
3933    }
3934
3935    failed = htmlParseStartTag(ctxt);
3936    name = ctxt->name;
3937    if (failed || (name == NULL)) {
3938	if (CUR == '>')
3939	    NEXT;
3940        return;
3941    }
3942
3943    /*
3944     * Lookup the info for that element.
3945     */
3946    info = htmlTagLookup(name);
3947    if (info == NULL) {
3948	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3949	             "Tag %s invalid\n", name, NULL);
3950    }
3951
3952    /*
3953     * Check for an Empty Element labeled the XML/SGML way
3954     */
3955    if ((CUR == '/') && (NXT(1) == '>')) {
3956        SKIP(2);
3957	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3958	    ctxt->sax->endElement(ctxt->userData, name);
3959	htmlnamePop(ctxt);
3960	return;
3961    }
3962
3963    if (CUR == '>') {
3964        NEXT;
3965    } else {
3966	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3967	             "Couldn't find end of Start Tag %s\n", name, NULL);
3968
3969	/*
3970	 * end of parsing of this node.
3971	 */
3972	if (xmlStrEqual(name, ctxt->name)) {
3973	    nodePop(ctxt);
3974	    htmlnamePop(ctxt);
3975	}
3976
3977	/*
3978	 * Capture end position and add node
3979	 */
3980	if (ctxt->record_info) {
3981	   node_info.end_pos = ctxt->input->consumed +
3982			      (CUR_PTR - ctxt->input->base);
3983	   node_info.end_line = ctxt->input->line;
3984	   node_info.node = ctxt->node;
3985	   xmlParserAddNodeInfo(ctxt, &node_info);
3986	}
3987	return;
3988    }
3989
3990    /*
3991     * Check for an Empty Element from DTD definition
3992     */
3993    if ((info != NULL) && (info->empty)) {
3994	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3995	    ctxt->sax->endElement(ctxt->userData, name);
3996	htmlnamePop(ctxt);
3997	return;
3998    }
3999
4000    /*
4001     * Parse the content of the element:
4002     */
4003    currentNode = xmlStrdup(ctxt->name);
4004    depth = ctxt->nameNr;
4005    while (IS_CHAR_CH(CUR)) {
4006	oldptr = ctxt->input->cur;
4007	htmlParseContent(ctxt);
4008	if (oldptr==ctxt->input->cur) break;
4009	if (ctxt->nameNr < depth) break;
4010    }
4011
4012    /*
4013     * Capture end position and add node
4014     */
4015    if ( currentNode != NULL && ctxt->record_info ) {
4016       node_info.end_pos = ctxt->input->consumed +
4017                          (CUR_PTR - ctxt->input->base);
4018       node_info.end_line = ctxt->input->line;
4019       node_info.node = ctxt->node;
4020       xmlParserAddNodeInfo(ctxt, &node_info);
4021    }
4022    if (!IS_CHAR_CH(CUR)) {
4023	htmlAutoCloseOnEnd(ctxt);
4024    }
4025
4026    if (currentNode != NULL)
4027	xmlFree(currentNode);
4028}
4029
4030/**
4031 * htmlParseDocument:
4032 * @ctxt:  an HTML parser context
4033 *
4034 * parse an HTML document (and build a tree if using the standard SAX
4035 * interface).
4036 *
4037 * Returns 0, -1 in case of error. the parser context is augmented
4038 *                as a result of the parsing.
4039 */
4040
4041int
4042htmlParseDocument(htmlParserCtxtPtr ctxt) {
4043    xmlDtdPtr dtd;
4044
4045    xmlInitParser();
4046
4047    htmlDefaultSAXHandlerInit();
4048
4049    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4050	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4051		     "htmlParseDocument: context error\n", NULL, NULL);
4052	return(XML_ERR_INTERNAL_ERROR);
4053    }
4054    ctxt->html = 1;
4055    GROW;
4056    /*
4057     * SAX: beginning of the document processing.
4058     */
4059    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4060        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4061
4062    /*
4063     * Wipe out everything which is before the first '<'
4064     */
4065    SKIP_BLANKS;
4066    if (CUR == 0) {
4067	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4068	             "Document is empty\n", NULL, NULL);
4069    }
4070
4071    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4072	ctxt->sax->startDocument(ctxt->userData);
4073
4074
4075    /*
4076     * Parse possible comments and PIs before any content
4077     */
4078    while (((CUR == '<') && (NXT(1) == '!') &&
4079            (NXT(2) == '-') && (NXT(3) == '-')) ||
4080	   ((CUR == '<') && (NXT(1) == '?'))) {
4081        htmlParseComment(ctxt);
4082        htmlParsePI(ctxt);
4083	SKIP_BLANKS;
4084    }
4085
4086
4087    /*
4088     * Then possibly doc type declaration(s) and more Misc
4089     * (doctypedecl Misc*)?
4090     */
4091    if ((CUR == '<') && (NXT(1) == '!') &&
4092	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4093	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4094	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4095	(UPP(8) == 'E')) {
4096	htmlParseDocTypeDecl(ctxt);
4097    }
4098    SKIP_BLANKS;
4099
4100    /*
4101     * Parse possible comments and PIs before any content
4102     */
4103    while (((CUR == '<') && (NXT(1) == '!') &&
4104            (NXT(2) == '-') && (NXT(3) == '-')) ||
4105	   ((CUR == '<') && (NXT(1) == '?'))) {
4106        htmlParseComment(ctxt);
4107        htmlParsePI(ctxt);
4108	SKIP_BLANKS;
4109    }
4110
4111    /*
4112     * Time to start parsing the tree itself
4113     */
4114    htmlParseContent(ctxt);
4115
4116    /*
4117     * autoclose
4118     */
4119    if (CUR == 0)
4120	htmlAutoCloseOnEnd(ctxt);
4121
4122
4123    /*
4124     * SAX: end of the document processing.
4125     */
4126    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4127        ctxt->sax->endDocument(ctxt->userData);
4128
4129    if (ctxt->myDoc != NULL) {
4130	dtd = xmlGetIntSubset(ctxt->myDoc);
4131	if (dtd == NULL)
4132	    ctxt->myDoc->intSubset =
4133		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4134		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4135		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4136    }
4137    if (! ctxt->wellFormed) return(-1);
4138    return(0);
4139}
4140
4141
4142/************************************************************************
4143 *									*
4144 *			Parser contexts handling			*
4145 *									*
4146 ************************************************************************/
4147
4148/**
4149 * htmlInitParserCtxt:
4150 * @ctxt:  an HTML parser context
4151 *
4152 * Initialize a parser context
4153 *
4154 * Returns 0 in case of success and -1 in case of error
4155 */
4156
4157static int
4158htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4159{
4160    htmlSAXHandler *sax;
4161
4162    if (ctxt == NULL) return(-1);
4163    memset(ctxt, 0, sizeof(htmlParserCtxt));
4164
4165    ctxt->dict = xmlDictCreate();
4166    if (ctxt->dict == NULL) {
4167        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4168	return(-1);
4169    }
4170    sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4171    if (sax == NULL) {
4172        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4173	return(-1);
4174    }
4175    else
4176        memset(sax, 0, sizeof(htmlSAXHandler));
4177
4178    /* Allocate the Input stack */
4179    ctxt->inputTab = (htmlParserInputPtr *)
4180                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
4181    if (ctxt->inputTab == NULL) {
4182        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4183	ctxt->inputNr = 0;
4184	ctxt->inputMax = 0;
4185	ctxt->input = NULL;
4186	return(-1);
4187    }
4188    ctxt->inputNr = 0;
4189    ctxt->inputMax = 5;
4190    ctxt->input = NULL;
4191    ctxt->version = NULL;
4192    ctxt->encoding = NULL;
4193    ctxt->standalone = -1;
4194    ctxt->instate = XML_PARSER_START;
4195
4196    /* Allocate the Node stack */
4197    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4198    if (ctxt->nodeTab == NULL) {
4199        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4200	ctxt->nodeNr = 0;
4201	ctxt->nodeMax = 0;
4202	ctxt->node = NULL;
4203	ctxt->inputNr = 0;
4204	ctxt->inputMax = 0;
4205	ctxt->input = NULL;
4206	return(-1);
4207    }
4208    ctxt->nodeNr = 0;
4209    ctxt->nodeMax = 10;
4210    ctxt->node = NULL;
4211
4212    /* Allocate the Name stack */
4213    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4214    if (ctxt->nameTab == NULL) {
4215        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4216	ctxt->nameNr = 0;
4217	ctxt->nameMax = 10;
4218	ctxt->name = NULL;
4219	ctxt->nodeNr = 0;
4220	ctxt->nodeMax = 0;
4221	ctxt->node = NULL;
4222	ctxt->inputNr = 0;
4223	ctxt->inputMax = 0;
4224	ctxt->input = NULL;
4225	return(-1);
4226    }
4227    ctxt->nameNr = 0;
4228    ctxt->nameMax = 10;
4229    ctxt->name = NULL;
4230
4231    if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4232    else {
4233        ctxt->sax = sax;
4234	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4235    }
4236    ctxt->userData = ctxt;
4237    ctxt->myDoc = NULL;
4238    ctxt->wellFormed = 1;
4239    ctxt->replaceEntities = 0;
4240    ctxt->linenumbers = xmlLineNumbersDefaultValue;
4241    ctxt->html = 1;
4242    ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4243    ctxt->vctxt.userData = ctxt;
4244    ctxt->vctxt.error = xmlParserValidityError;
4245    ctxt->vctxt.warning = xmlParserValidityWarning;
4246    ctxt->record_info = 0;
4247    ctxt->validate = 0;
4248    ctxt->nbChars = 0;
4249    ctxt->checkIndex = 0;
4250    ctxt->catalogs = NULL;
4251    xmlInitNodeInfoSeq(&ctxt->node_seq);
4252    return(0);
4253}
4254
4255/**
4256 * htmlFreeParserCtxt:
4257 * @ctxt:  an HTML parser context
4258 *
4259 * Free all the memory used by a parser context. However the parsed
4260 * document in ctxt->myDoc is not freed.
4261 */
4262
4263void
4264htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4265{
4266    xmlFreeParserCtxt(ctxt);
4267}
4268
4269/**
4270 * htmlNewParserCtxt:
4271 *
4272 * Allocate and initialize a new parser context.
4273 *
4274 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4275 */
4276
4277htmlParserCtxtPtr
4278htmlNewParserCtxt(void)
4279{
4280    xmlParserCtxtPtr ctxt;
4281
4282    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4283    if (ctxt == NULL) {
4284        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4285	return(NULL);
4286    }
4287    memset(ctxt, 0, sizeof(xmlParserCtxt));
4288    if (htmlInitParserCtxt(ctxt) < 0) {
4289        htmlFreeParserCtxt(ctxt);
4290	return(NULL);
4291    }
4292    return(ctxt);
4293}
4294
4295/**
4296 * htmlCreateMemoryParserCtxt:
4297 * @buffer:  a pointer to a char array
4298 * @size:  the size of the array
4299 *
4300 * Create a parser context for an HTML in-memory document.
4301 *
4302 * Returns the new parser context or NULL
4303 */
4304htmlParserCtxtPtr
4305htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4306    xmlParserCtxtPtr ctxt;
4307    xmlParserInputPtr input;
4308    xmlParserInputBufferPtr buf;
4309
4310    if (buffer == NULL)
4311	return(NULL);
4312    if (size <= 0)
4313	return(NULL);
4314
4315    ctxt = htmlNewParserCtxt();
4316    if (ctxt == NULL)
4317	return(NULL);
4318
4319    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4320    if (buf == NULL) return(NULL);
4321
4322    input = xmlNewInputStream(ctxt);
4323    if (input == NULL) {
4324	xmlFreeParserCtxt(ctxt);
4325	return(NULL);
4326    }
4327
4328    input->filename = NULL;
4329    input->buf = buf;
4330    input->base = input->buf->buffer->content;
4331    input->cur = input->buf->buffer->content;
4332    input->end = &input->buf->buffer->content[input->buf->buffer->use];
4333
4334    inputPush(ctxt, input);
4335    return(ctxt);
4336}
4337
4338/**
4339 * htmlCreateDocParserCtxt:
4340 * @cur:  a pointer to an array of xmlChar
4341 * @encoding:  a free form C string describing the HTML document encoding, or NULL
4342 *
4343 * Create a parser context for an HTML document.
4344 *
4345 * TODO: check the need to add encoding handling there
4346 *
4347 * Returns the new parser context or NULL
4348 */
4349static htmlParserCtxtPtr
4350htmlCreateDocParserCtxt(const xmlChar *cur,
4351                        const char *encoding ATTRIBUTE_UNUSED) {
4352    int len;
4353    htmlParserCtxtPtr ctxt;
4354
4355    if (cur == NULL)
4356	return(NULL);
4357    len = xmlStrlen(cur);
4358    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4359
4360    if (encoding != NULL) {
4361	xmlCharEncoding enc;
4362	xmlCharEncodingHandlerPtr handler;
4363
4364	if (ctxt->input->encoding != NULL)
4365	    xmlFree((xmlChar *) ctxt->input->encoding);
4366	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4367
4368	enc = xmlParseCharEncoding(encoding);
4369	/*
4370	 * registered set of known encodings
4371	 */
4372	if (enc != XML_CHAR_ENCODING_ERROR) {
4373	    xmlSwitchEncoding(ctxt, enc);
4374	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4375		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4376		             "Unsupported encoding %s\n",
4377			     (const xmlChar *) encoding, NULL);
4378	    }
4379	} else {
4380	    /*
4381	     * fallback for unknown encodings
4382	     */
4383	    handler = xmlFindCharEncodingHandler((const char *) encoding);
4384	    if (handler != NULL) {
4385		xmlSwitchToEncoding(ctxt, handler);
4386	    } else {
4387		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4388		             "Unsupported encoding %s\n",
4389			     (const xmlChar *) encoding, NULL);
4390	    }
4391	}
4392    }
4393    return(ctxt);
4394}
4395
4396#ifdef LIBXML_PUSH_ENABLED
4397/************************************************************************
4398 *									*
4399 * 		Progressive parsing interfaces				*
4400 *									*
4401 ************************************************************************/
4402
4403/**
4404 * htmlParseLookupSequence:
4405 * @ctxt:  an HTML parser context
4406 * @first:  the first char to lookup
4407 * @next:  the next char to lookup or zero
4408 * @third:  the next char to lookup or zero
4409 * @comment: flag to force checking inside comments
4410 *
4411 * Try to find if a sequence (first, next, third) or  just (first next) or
4412 * (first) is available in the input stream.
4413 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4414 * to avoid rescanning sequences of bytes, it DOES change the state of the
4415 * parser, do not use liberally.
4416 * This is basically similar to xmlParseLookupSequence()
4417 *
4418 * Returns the index to the current parsing point if the full sequence
4419 *      is available, -1 otherwise.
4420 */
4421static int
4422htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4423                        xmlChar next, xmlChar third, int iscomment) {
4424    int base, len;
4425    htmlParserInputPtr in;
4426    const xmlChar *buf;
4427    int incomment = 0;
4428
4429    in = ctxt->input;
4430    if (in == NULL) return(-1);
4431    base = in->cur - in->base;
4432    if (base < 0) return(-1);
4433    if (ctxt->checkIndex > base)
4434        base = ctxt->checkIndex;
4435    if (in->buf == NULL) {
4436	buf = in->base;
4437	len = in->length;
4438    } else {
4439	buf = in->buf->buffer->content;
4440	len = in->buf->buffer->use;
4441    }
4442    /* take into account the sequence length */
4443    if (third) len -= 2;
4444    else if (next) len --;
4445    for (;base < len;base++) {
4446	if (!incomment && (base + 4 < len) && !iscomment) {
4447	    if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4448		(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4449		incomment = 1;
4450		/* do not increment past <! - some people use <!--> */
4451		base += 2;
4452	    }
4453	}
4454	if (incomment) {
4455	    if (base + 3 > len)
4456		return(-1);
4457	    if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4458		(buf[base + 2] == '>')) {
4459		incomment = 0;
4460		base += 2;
4461	    }
4462	    continue;
4463	}
4464        if (buf[base] == first) {
4465	    if (third != 0) {
4466		if ((buf[base + 1] != next) ||
4467		    (buf[base + 2] != third)) continue;
4468	    } else if (next != 0) {
4469		if (buf[base + 1] != next) continue;
4470	    }
4471	    ctxt->checkIndex = 0;
4472#ifdef DEBUG_PUSH
4473	    if (next == 0)
4474		xmlGenericError(xmlGenericErrorContext,
4475			"HPP: lookup '%c' found at %d\n",
4476			first, base);
4477	    else if (third == 0)
4478		xmlGenericError(xmlGenericErrorContext,
4479			"HPP: lookup '%c%c' found at %d\n",
4480			first, next, base);
4481	    else
4482		xmlGenericError(xmlGenericErrorContext,
4483			"HPP: lookup '%c%c%c' found at %d\n",
4484			first, next, third, base);
4485#endif
4486	    return(base - (in->cur - in->base));
4487	}
4488    }
4489    ctxt->checkIndex = base;
4490#ifdef DEBUG_PUSH
4491    if (next == 0)
4492	xmlGenericError(xmlGenericErrorContext,
4493		"HPP: lookup '%c' failed\n", first);
4494    else if (third == 0)
4495	xmlGenericError(xmlGenericErrorContext,
4496		"HPP: lookup '%c%c' failed\n", first, next);
4497    else
4498	xmlGenericError(xmlGenericErrorContext,
4499		"HPP: lookup '%c%c%c' failed\n", first, next, third);
4500#endif
4501    return(-1);
4502}
4503
4504/**
4505 * htmlParseTryOrFinish:
4506 * @ctxt:  an HTML parser context
4507 * @terminate:  last chunk indicator
4508 *
4509 * Try to progress on parsing
4510 *
4511 * Returns zero if no parsing was possible
4512 */
4513static int
4514htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4515    int ret = 0;
4516    htmlParserInputPtr in;
4517    int avail = 0;
4518    xmlChar cur, next;
4519
4520#ifdef DEBUG_PUSH
4521    switch (ctxt->instate) {
4522	case XML_PARSER_EOF:
4523	    xmlGenericError(xmlGenericErrorContext,
4524		    "HPP: try EOF\n"); break;
4525	case XML_PARSER_START:
4526	    xmlGenericError(xmlGenericErrorContext,
4527		    "HPP: try START\n"); break;
4528	case XML_PARSER_MISC:
4529	    xmlGenericError(xmlGenericErrorContext,
4530		    "HPP: try MISC\n");break;
4531	case XML_PARSER_COMMENT:
4532	    xmlGenericError(xmlGenericErrorContext,
4533		    "HPP: try COMMENT\n");break;
4534	case XML_PARSER_PROLOG:
4535	    xmlGenericError(xmlGenericErrorContext,
4536		    "HPP: try PROLOG\n");break;
4537	case XML_PARSER_START_TAG:
4538	    xmlGenericError(xmlGenericErrorContext,
4539		    "HPP: try START_TAG\n");break;
4540	case XML_PARSER_CONTENT:
4541	    xmlGenericError(xmlGenericErrorContext,
4542		    "HPP: try CONTENT\n");break;
4543	case XML_PARSER_CDATA_SECTION:
4544	    xmlGenericError(xmlGenericErrorContext,
4545		    "HPP: try CDATA_SECTION\n");break;
4546	case XML_PARSER_END_TAG:
4547	    xmlGenericError(xmlGenericErrorContext,
4548		    "HPP: try END_TAG\n");break;
4549	case XML_PARSER_ENTITY_DECL:
4550	    xmlGenericError(xmlGenericErrorContext,
4551		    "HPP: try ENTITY_DECL\n");break;
4552	case XML_PARSER_ENTITY_VALUE:
4553	    xmlGenericError(xmlGenericErrorContext,
4554		    "HPP: try ENTITY_VALUE\n");break;
4555	case XML_PARSER_ATTRIBUTE_VALUE:
4556	    xmlGenericError(xmlGenericErrorContext,
4557		    "HPP: try ATTRIBUTE_VALUE\n");break;
4558	case XML_PARSER_DTD:
4559	    xmlGenericError(xmlGenericErrorContext,
4560		    "HPP: try DTD\n");break;
4561	case XML_PARSER_EPILOG:
4562	    xmlGenericError(xmlGenericErrorContext,
4563		    "HPP: try EPILOG\n");break;
4564	case XML_PARSER_PI:
4565	    xmlGenericError(xmlGenericErrorContext,
4566		    "HPP: try PI\n");break;
4567	case XML_PARSER_SYSTEM_LITERAL:
4568	    xmlGenericError(xmlGenericErrorContext,
4569		    "HPP: try SYSTEM_LITERAL\n");break;
4570    }
4571#endif
4572
4573    while (1) {
4574
4575	in = ctxt->input;
4576	if (in == NULL) break;
4577	if (in->buf == NULL)
4578	    avail = in->length - (in->cur - in->base);
4579	else
4580	    avail = in->buf->buffer->use - (in->cur - in->base);
4581	if ((avail == 0) && (terminate)) {
4582	    htmlAutoCloseOnEnd(ctxt);
4583	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4584		/*
4585		 * SAX: end of the document processing.
4586		 */
4587		ctxt->instate = XML_PARSER_EOF;
4588		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4589		    ctxt->sax->endDocument(ctxt->userData);
4590	    }
4591	}
4592        if (avail < 1)
4593	    goto done;
4594	cur = in->cur[0];
4595	if (cur == 0) {
4596	    SKIP(1);
4597	    continue;
4598	}
4599
4600        switch (ctxt->instate) {
4601            case XML_PARSER_EOF:
4602	        /*
4603		 * Document parsing is done !
4604		 */
4605	        goto done;
4606            case XML_PARSER_START:
4607	        /*
4608		 * Very first chars read from the document flow.
4609		 */
4610		cur = in->cur[0];
4611		if (IS_BLANK_CH(cur)) {
4612		    SKIP_BLANKS;
4613		    if (in->buf == NULL)
4614			avail = in->length - (in->cur - in->base);
4615		    else
4616			avail = in->buf->buffer->use - (in->cur - in->base);
4617		}
4618		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4619		    ctxt->sax->setDocumentLocator(ctxt->userData,
4620						  &xmlDefaultSAXLocator);
4621		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4622	            (!ctxt->disableSAX))
4623		    ctxt->sax->startDocument(ctxt->userData);
4624
4625		cur = in->cur[0];
4626		next = in->cur[1];
4627		if ((cur == '<') && (next == '!') &&
4628		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4629		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4630		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4631		    (UPP(8) == 'E')) {
4632		    if ((!terminate) &&
4633		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4634			goto done;
4635#ifdef DEBUG_PUSH
4636		    xmlGenericError(xmlGenericErrorContext,
4637			    "HPP: Parsing internal subset\n");
4638#endif
4639		    htmlParseDocTypeDecl(ctxt);
4640		    ctxt->instate = XML_PARSER_PROLOG;
4641#ifdef DEBUG_PUSH
4642		    xmlGenericError(xmlGenericErrorContext,
4643			    "HPP: entering PROLOG\n");
4644#endif
4645                } else {
4646		    ctxt->instate = XML_PARSER_MISC;
4647#ifdef DEBUG_PUSH
4648		    xmlGenericError(xmlGenericErrorContext,
4649			    "HPP: entering MISC\n");
4650#endif
4651		}
4652		break;
4653            case XML_PARSER_MISC:
4654		SKIP_BLANKS;
4655		if (in->buf == NULL)
4656		    avail = in->length - (in->cur - in->base);
4657		else
4658		    avail = in->buf->buffer->use - (in->cur - in->base);
4659		if (avail < 2)
4660		    goto done;
4661		cur = in->cur[0];
4662		next = in->cur[1];
4663	        if ((cur == '<') && (next == '!') &&
4664		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4665		    if ((!terminate) &&
4666		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4667			goto done;
4668#ifdef DEBUG_PUSH
4669		    xmlGenericError(xmlGenericErrorContext,
4670			    "HPP: Parsing Comment\n");
4671#endif
4672		    htmlParseComment(ctxt);
4673		    ctxt->instate = XML_PARSER_MISC;
4674	        } else if ((cur == '<') && (next == '?')) {
4675		    if ((!terminate) &&
4676		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4677			goto done;
4678#ifdef DEBUG_PUSH
4679		    xmlGenericError(xmlGenericErrorContext,
4680			    "HPP: Parsing PI\n");
4681#endif
4682		    htmlParsePI(ctxt);
4683		    ctxt->instate = XML_PARSER_MISC;
4684		} else if ((cur == '<') && (next == '!') &&
4685		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4686		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4687		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4688		    (UPP(8) == 'E')) {
4689		    if ((!terminate) &&
4690		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4691			goto done;
4692#ifdef DEBUG_PUSH
4693		    xmlGenericError(xmlGenericErrorContext,
4694			    "HPP: Parsing internal subset\n");
4695#endif
4696		    htmlParseDocTypeDecl(ctxt);
4697		    ctxt->instate = XML_PARSER_PROLOG;
4698#ifdef DEBUG_PUSH
4699		    xmlGenericError(xmlGenericErrorContext,
4700			    "HPP: entering PROLOG\n");
4701#endif
4702		} else if ((cur == '<') && (next == '!') &&
4703		           (avail < 9)) {
4704		    goto done;
4705		} else {
4706		    ctxt->instate = XML_PARSER_START_TAG;
4707#ifdef DEBUG_PUSH
4708		    xmlGenericError(xmlGenericErrorContext,
4709			    "HPP: entering START_TAG\n");
4710#endif
4711		}
4712		break;
4713            case XML_PARSER_PROLOG:
4714		SKIP_BLANKS;
4715		if (in->buf == NULL)
4716		    avail = in->length - (in->cur - in->base);
4717		else
4718		    avail = in->buf->buffer->use - (in->cur - in->base);
4719		if (avail < 2)
4720		    goto done;
4721		cur = in->cur[0];
4722		next = in->cur[1];
4723		if ((cur == '<') && (next == '!') &&
4724		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4725		    if ((!terminate) &&
4726		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4727			goto done;
4728#ifdef DEBUG_PUSH
4729		    xmlGenericError(xmlGenericErrorContext,
4730			    "HPP: Parsing Comment\n");
4731#endif
4732		    htmlParseComment(ctxt);
4733		    ctxt->instate = XML_PARSER_PROLOG;
4734	        } else if ((cur == '<') && (next == '?')) {
4735		    if ((!terminate) &&
4736		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4737			goto done;
4738#ifdef DEBUG_PUSH
4739		    xmlGenericError(xmlGenericErrorContext,
4740			    "HPP: Parsing PI\n");
4741#endif
4742		    htmlParsePI(ctxt);
4743		    ctxt->instate = XML_PARSER_PROLOG;
4744		} else if ((cur == '<') && (next == '!') &&
4745		           (avail < 4)) {
4746		    goto done;
4747		} else {
4748		    ctxt->instate = XML_PARSER_START_TAG;
4749#ifdef DEBUG_PUSH
4750		    xmlGenericError(xmlGenericErrorContext,
4751			    "HPP: entering START_TAG\n");
4752#endif
4753		}
4754		break;
4755            case XML_PARSER_EPILOG:
4756		if (in->buf == NULL)
4757		    avail = in->length - (in->cur - in->base);
4758		else
4759		    avail = in->buf->buffer->use - (in->cur - in->base);
4760		if (avail < 1)
4761		    goto done;
4762		cur = in->cur[0];
4763		if (IS_BLANK_CH(cur)) {
4764		    htmlParseCharData(ctxt);
4765		    goto done;
4766		}
4767		if (avail < 2)
4768		    goto done;
4769		next = in->cur[1];
4770	        if ((cur == '<') && (next == '!') &&
4771		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4772		    if ((!terminate) &&
4773		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4774			goto done;
4775#ifdef DEBUG_PUSH
4776		    xmlGenericError(xmlGenericErrorContext,
4777			    "HPP: Parsing Comment\n");
4778#endif
4779		    htmlParseComment(ctxt);
4780		    ctxt->instate = XML_PARSER_EPILOG;
4781	        } else if ((cur == '<') && (next == '?')) {
4782		    if ((!terminate) &&
4783		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4784			goto done;
4785#ifdef DEBUG_PUSH
4786		    xmlGenericError(xmlGenericErrorContext,
4787			    "HPP: Parsing PI\n");
4788#endif
4789		    htmlParsePI(ctxt);
4790		    ctxt->instate = XML_PARSER_EPILOG;
4791		} else if ((cur == '<') && (next == '!') &&
4792		           (avail < 4)) {
4793		    goto done;
4794		} else {
4795		    ctxt->errNo = XML_ERR_DOCUMENT_END;
4796		    ctxt->wellFormed = 0;
4797		    ctxt->instate = XML_PARSER_EOF;
4798#ifdef DEBUG_PUSH
4799		    xmlGenericError(xmlGenericErrorContext,
4800			    "HPP: entering EOF\n");
4801#endif
4802		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4803			ctxt->sax->endDocument(ctxt->userData);
4804		    goto done;
4805		}
4806		break;
4807            case XML_PARSER_START_TAG: {
4808	        const xmlChar *name;
4809		int failed;
4810		const htmlElemDesc * info;
4811
4812		if (avail < 2)
4813		    goto done;
4814		cur = in->cur[0];
4815	        if (cur != '<') {
4816		    ctxt->instate = XML_PARSER_CONTENT;
4817#ifdef DEBUG_PUSH
4818		    xmlGenericError(xmlGenericErrorContext,
4819			    "HPP: entering CONTENT\n");
4820#endif
4821		    break;
4822		}
4823		if (in->cur[1] == '/') {
4824		    ctxt->instate = XML_PARSER_END_TAG;
4825		    ctxt->checkIndex = 0;
4826#ifdef DEBUG_PUSH
4827		    xmlGenericError(xmlGenericErrorContext,
4828			    "HPP: entering END_TAG\n");
4829#endif
4830		    break;
4831		}
4832		if ((!terminate) &&
4833		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4834		    goto done;
4835
4836		failed = htmlParseStartTag(ctxt);
4837		name = ctxt->name;
4838		if (failed ||
4839		    (name == NULL)) {
4840		    if (CUR == '>')
4841			NEXT;
4842		    break;
4843		}
4844
4845		/*
4846		 * Lookup the info for that element.
4847		 */
4848		info = htmlTagLookup(name);
4849		if (info == NULL) {
4850		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4851		                 "Tag %s invalid\n", name, NULL);
4852		}
4853
4854		/*
4855		 * Check for an Empty Element labeled the XML/SGML way
4856		 */
4857		if ((CUR == '/') && (NXT(1) == '>')) {
4858		    SKIP(2);
4859		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4860			ctxt->sax->endElement(ctxt->userData, name);
4861		    htmlnamePop(ctxt);
4862		    ctxt->instate = XML_PARSER_CONTENT;
4863#ifdef DEBUG_PUSH
4864		    xmlGenericError(xmlGenericErrorContext,
4865			    "HPP: entering CONTENT\n");
4866#endif
4867		    break;
4868		}
4869
4870		if (CUR == '>') {
4871		    NEXT;
4872		} else {
4873		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4874		                 "Couldn't find end of Start Tag %s\n",
4875				 name, NULL);
4876
4877		    /*
4878		     * end of parsing of this node.
4879		     */
4880		    if (xmlStrEqual(name, ctxt->name)) {
4881			nodePop(ctxt);
4882			htmlnamePop(ctxt);
4883		    }
4884
4885		    ctxt->instate = XML_PARSER_CONTENT;
4886#ifdef DEBUG_PUSH
4887		    xmlGenericError(xmlGenericErrorContext,
4888			    "HPP: entering CONTENT\n");
4889#endif
4890		    break;
4891		}
4892
4893		/*
4894		 * Check for an Empty Element from DTD definition
4895		 */
4896		if ((info != NULL) && (info->empty)) {
4897		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4898			ctxt->sax->endElement(ctxt->userData, name);
4899		    htmlnamePop(ctxt);
4900		}
4901		ctxt->instate = XML_PARSER_CONTENT;
4902#ifdef DEBUG_PUSH
4903		xmlGenericError(xmlGenericErrorContext,
4904			"HPP: entering CONTENT\n");
4905#endif
4906                break;
4907	    }
4908            case XML_PARSER_CONTENT: {
4909		long cons;
4910                /*
4911		 * Handle preparsed entities and charRef
4912		 */
4913		if (ctxt->token != 0) {
4914		    xmlChar chr[2] = { 0 , 0 } ;
4915
4916		    chr[0] = (xmlChar) ctxt->token;
4917		    htmlCheckParagraph(ctxt);
4918		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4919			ctxt->sax->characters(ctxt->userData, chr, 1);
4920		    ctxt->token = 0;
4921		    ctxt->checkIndex = 0;
4922		}
4923		if ((avail == 1) && (terminate)) {
4924		    cur = in->cur[0];
4925		    if ((cur != '<') && (cur != '&')) {
4926			if (ctxt->sax != NULL) {
4927			    if (IS_BLANK_CH(cur)) {
4928				if (ctxt->sax->ignorableWhitespace != NULL)
4929				    ctxt->sax->ignorableWhitespace(
4930					    ctxt->userData, &cur, 1);
4931			    } else {
4932				htmlCheckParagraph(ctxt);
4933				if (ctxt->sax->characters != NULL)
4934				    ctxt->sax->characters(
4935					    ctxt->userData, &cur, 1);
4936			    }
4937			}
4938			ctxt->token = 0;
4939			ctxt->checkIndex = 0;
4940			in->cur++;
4941			break;
4942		    }
4943		}
4944		if (avail < 2)
4945		    goto done;
4946		cur = in->cur[0];
4947		next = in->cur[1];
4948		cons = ctxt->nbChars;
4949		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4950		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4951		    /*
4952		     * Handle SCRIPT/STYLE separately
4953		     */
4954		    if (!terminate) {
4955		        int idx;
4956			xmlChar val;
4957
4958			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
4959			if (idx < 0)
4960			    goto done;
4961		        val = in->cur[idx + 2];
4962			if (val == 0) /* bad cut of input */
4963			    goto done;
4964		    }
4965		    htmlParseScript(ctxt);
4966		    if ((cur == '<') && (next == '/')) {
4967			ctxt->instate = XML_PARSER_END_TAG;
4968			ctxt->checkIndex = 0;
4969#ifdef DEBUG_PUSH
4970			xmlGenericError(xmlGenericErrorContext,
4971				"HPP: entering END_TAG\n");
4972#endif
4973			break;
4974		    }
4975		} else {
4976		    /*
4977		     * Sometimes DOCTYPE arrives in the middle of the document
4978		     */
4979		    if ((cur == '<') && (next == '!') &&
4980			(UPP(2) == 'D') && (UPP(3) == 'O') &&
4981			(UPP(4) == 'C') && (UPP(5) == 'T') &&
4982			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4983			(UPP(8) == 'E')) {
4984			if ((!terminate) &&
4985			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4986			    goto done;
4987			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4988			             "Misplaced DOCTYPE declaration\n",
4989				     BAD_CAST "DOCTYPE" , NULL);
4990			htmlParseDocTypeDecl(ctxt);
4991		    } else if ((cur == '<') && (next == '!') &&
4992			(in->cur[2] == '-') && (in->cur[3] == '-')) {
4993			if ((!terminate) &&
4994			    (htmlParseLookupSequence(
4995			    		ctxt, '-', '-', '>', 1) < 0))
4996			    goto done;
4997#ifdef DEBUG_PUSH
4998			xmlGenericError(xmlGenericErrorContext,
4999				"HPP: Parsing Comment\n");
5000#endif
5001			htmlParseComment(ctxt);
5002			ctxt->instate = XML_PARSER_CONTENT;
5003		    } else if ((cur == '<') && (next == '?')) {
5004			if ((!terminate) &&
5005			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5006			    goto done;
5007#ifdef DEBUG_PUSH
5008			xmlGenericError(xmlGenericErrorContext,
5009				"HPP: Parsing PI\n");
5010#endif
5011			htmlParsePI(ctxt);
5012			ctxt->instate = XML_PARSER_CONTENT;
5013		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5014			goto done;
5015		    } else if ((cur == '<') && (next == '/')) {
5016			ctxt->instate = XML_PARSER_END_TAG;
5017			ctxt->checkIndex = 0;
5018#ifdef DEBUG_PUSH
5019			xmlGenericError(xmlGenericErrorContext,
5020				"HPP: entering END_TAG\n");
5021#endif
5022			break;
5023		    } else if (cur == '<') {
5024			ctxt->instate = XML_PARSER_START_TAG;
5025			ctxt->checkIndex = 0;
5026#ifdef DEBUG_PUSH
5027			xmlGenericError(xmlGenericErrorContext,
5028				"HPP: entering START_TAG\n");
5029#endif
5030			break;
5031		    } else if (cur == '&') {
5032			if ((!terminate) &&
5033			    (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
5034			    goto done;
5035#ifdef DEBUG_PUSH
5036			xmlGenericError(xmlGenericErrorContext,
5037				"HPP: Parsing Reference\n");
5038#endif
5039			/* TODO: check generation of subtrees if noent !!! */
5040			htmlParseReference(ctxt);
5041		    } else {
5042		        /*
5043			 * check that the text sequence is complete
5044			 * before handing out the data to the parser
5045			 * to avoid problems with erroneous end of
5046			 * data detection.
5047			 */
5048			if ((!terminate) &&
5049			    (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5050			    goto done;
5051			ctxt->checkIndex = 0;
5052#ifdef DEBUG_PUSH
5053			xmlGenericError(xmlGenericErrorContext,
5054				"HPP: Parsing char data\n");
5055#endif
5056			htmlParseCharData(ctxt);
5057		    }
5058		}
5059		if (cons == ctxt->nbChars) {
5060		    if (ctxt->node != NULL) {
5061			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5062			             "detected an error in element content\n",
5063				     NULL, NULL);
5064		    }
5065		    NEXT;
5066		    break;
5067		}
5068
5069		break;
5070	    }
5071            case XML_PARSER_END_TAG:
5072		if (avail < 2)
5073		    goto done;
5074		if ((!terminate) &&
5075		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5076		    goto done;
5077		htmlParseEndTag(ctxt);
5078		if (ctxt->nameNr == 0) {
5079		    ctxt->instate = XML_PARSER_EPILOG;
5080		} else {
5081		    ctxt->instate = XML_PARSER_CONTENT;
5082		}
5083		ctxt->checkIndex = 0;
5084#ifdef DEBUG_PUSH
5085		xmlGenericError(xmlGenericErrorContext,
5086			"HPP: entering CONTENT\n");
5087#endif
5088	        break;
5089            case XML_PARSER_CDATA_SECTION:
5090		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5091			"HPP: internal error, state == CDATA\n",
5092			     NULL, NULL);
5093		ctxt->instate = XML_PARSER_CONTENT;
5094		ctxt->checkIndex = 0;
5095#ifdef DEBUG_PUSH
5096		xmlGenericError(xmlGenericErrorContext,
5097			"HPP: entering CONTENT\n");
5098#endif
5099		break;
5100            case XML_PARSER_DTD:
5101		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5102			"HPP: internal error, state == DTD\n",
5103			     NULL, NULL);
5104		ctxt->instate = XML_PARSER_CONTENT;
5105		ctxt->checkIndex = 0;
5106#ifdef DEBUG_PUSH
5107		xmlGenericError(xmlGenericErrorContext,
5108			"HPP: entering CONTENT\n");
5109#endif
5110		break;
5111            case XML_PARSER_COMMENT:
5112		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5113			"HPP: internal error, state == COMMENT\n",
5114			     NULL, NULL);
5115		ctxt->instate = XML_PARSER_CONTENT;
5116		ctxt->checkIndex = 0;
5117#ifdef DEBUG_PUSH
5118		xmlGenericError(xmlGenericErrorContext,
5119			"HPP: entering CONTENT\n");
5120#endif
5121		break;
5122            case XML_PARSER_PI:
5123		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5124			"HPP: internal error, state == PI\n",
5125			     NULL, NULL);
5126		ctxt->instate = XML_PARSER_CONTENT;
5127		ctxt->checkIndex = 0;
5128#ifdef DEBUG_PUSH
5129		xmlGenericError(xmlGenericErrorContext,
5130			"HPP: entering CONTENT\n");
5131#endif
5132		break;
5133            case XML_PARSER_ENTITY_DECL:
5134		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5135			"HPP: internal error, state == ENTITY_DECL\n",
5136			     NULL, NULL);
5137		ctxt->instate = XML_PARSER_CONTENT;
5138		ctxt->checkIndex = 0;
5139#ifdef DEBUG_PUSH
5140		xmlGenericError(xmlGenericErrorContext,
5141			"HPP: entering CONTENT\n");
5142#endif
5143		break;
5144            case XML_PARSER_ENTITY_VALUE:
5145		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5146			"HPP: internal error, state == ENTITY_VALUE\n",
5147			     NULL, NULL);
5148		ctxt->instate = XML_PARSER_CONTENT;
5149		ctxt->checkIndex = 0;
5150#ifdef DEBUG_PUSH
5151		xmlGenericError(xmlGenericErrorContext,
5152			"HPP: entering DTD\n");
5153#endif
5154		break;
5155            case XML_PARSER_ATTRIBUTE_VALUE:
5156		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5157			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5158			     NULL, NULL);
5159		ctxt->instate = XML_PARSER_START_TAG;
5160		ctxt->checkIndex = 0;
5161#ifdef DEBUG_PUSH
5162		xmlGenericError(xmlGenericErrorContext,
5163			"HPP: entering START_TAG\n");
5164#endif
5165		break;
5166	    case XML_PARSER_SYSTEM_LITERAL:
5167		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5168		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5169			     NULL, NULL);
5170		ctxt->instate = XML_PARSER_CONTENT;
5171		ctxt->checkIndex = 0;
5172#ifdef DEBUG_PUSH
5173		xmlGenericError(xmlGenericErrorContext,
5174			"HPP: entering CONTENT\n");
5175#endif
5176		break;
5177	    case XML_PARSER_IGNORE:
5178		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5179			"HPP: internal error, state == XML_PARSER_IGNORE\n",
5180			     NULL, NULL);
5181		ctxt->instate = XML_PARSER_CONTENT;
5182		ctxt->checkIndex = 0;
5183#ifdef DEBUG_PUSH
5184		xmlGenericError(xmlGenericErrorContext,
5185			"HPP: entering CONTENT\n");
5186#endif
5187		break;
5188	    case XML_PARSER_PUBLIC_LITERAL:
5189		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5190			"HPP: internal error, state == XML_PARSER_LITERAL\n",
5191			     NULL, NULL);
5192		ctxt->instate = XML_PARSER_CONTENT;
5193		ctxt->checkIndex = 0;
5194#ifdef DEBUG_PUSH
5195		xmlGenericError(xmlGenericErrorContext,
5196			"HPP: entering CONTENT\n");
5197#endif
5198		break;
5199
5200	}
5201    }
5202done:
5203    if ((avail == 0) && (terminate)) {
5204	htmlAutoCloseOnEnd(ctxt);
5205	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5206	    /*
5207	     * SAX: end of the document processing.
5208	     */
5209	    ctxt->instate = XML_PARSER_EOF;
5210	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5211		ctxt->sax->endDocument(ctxt->userData);
5212	}
5213    }
5214    if ((ctxt->myDoc != NULL) &&
5215	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5216	 (ctxt->instate == XML_PARSER_EPILOG))) {
5217	xmlDtdPtr dtd;
5218	dtd = xmlGetIntSubset(ctxt->myDoc);
5219	if (dtd == NULL)
5220	    ctxt->myDoc->intSubset =
5221		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5222		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5223		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5224    }
5225#ifdef DEBUG_PUSH
5226    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5227#endif
5228    return(ret);
5229}
5230
5231/**
5232 * htmlParseChunk:
5233 * @ctxt:  an HTML parser context
5234 * @chunk:  an char array
5235 * @size:  the size in byte of the chunk
5236 * @terminate:  last chunk indicator
5237 *
5238 * Parse a Chunk of memory
5239 *
5240 * Returns zero if no error, the xmlParserErrors otherwise.
5241 */
5242int
5243htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5244              int terminate) {
5245    if ((ctxt == NULL) || (ctxt->input == NULL)) {
5246	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5247		     "htmlParseChunk: context error\n", NULL, NULL);
5248	return(XML_ERR_INTERNAL_ERROR);
5249    }
5250    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5251        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5252	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5253	int cur = ctxt->input->cur - ctxt->input->base;
5254	int res;
5255
5256	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5257	if (res < 0) {
5258	    ctxt->errNo = XML_PARSER_EOF;
5259	    ctxt->disableSAX = 1;
5260	    return (XML_PARSER_EOF);
5261	}
5262	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5263	ctxt->input->cur = ctxt->input->base + cur;
5264	ctxt->input->end =
5265	  &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5266#ifdef DEBUG_PUSH
5267	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5268#endif
5269
5270#if 0
5271	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5272	    htmlParseTryOrFinish(ctxt, terminate);
5273#endif
5274    } else if (ctxt->instate != XML_PARSER_EOF) {
5275	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5276	    xmlParserInputBufferPtr in = ctxt->input->buf;
5277	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
5278		    (in->raw != NULL)) {
5279		int nbchars;
5280
5281		nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5282		if (nbchars < 0) {
5283		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5284			         "encoder error\n", NULL, NULL);
5285		    return(XML_ERR_INVALID_ENCODING);
5286		}
5287	    }
5288	}
5289    }
5290    htmlParseTryOrFinish(ctxt, terminate);
5291    if (terminate) {
5292	if ((ctxt->instate != XML_PARSER_EOF) &&
5293	    (ctxt->instate != XML_PARSER_EPILOG) &&
5294	    (ctxt->instate != XML_PARSER_MISC)) {
5295	    ctxt->errNo = XML_ERR_DOCUMENT_END;
5296	    ctxt->wellFormed = 0;
5297	}
5298	if (ctxt->instate != XML_PARSER_EOF) {
5299	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5300		ctxt->sax->endDocument(ctxt->userData);
5301	}
5302	ctxt->instate = XML_PARSER_EOF;
5303    }
5304    return((xmlParserErrors) ctxt->errNo);
5305}
5306
5307/************************************************************************
5308 *									*
5309 *			User entry points				*
5310 *									*
5311 ************************************************************************/
5312
5313/**
5314 * htmlCreatePushParserCtxt:
5315 * @sax:  a SAX handler
5316 * @user_data:  The user data returned on SAX callbacks
5317 * @chunk:  a pointer to an array of chars
5318 * @size:  number of chars in the array
5319 * @filename:  an optional file name or URI
5320 * @enc:  an optional encoding
5321 *
5322 * Create a parser context for using the HTML parser in push mode
5323 * The value of @filename is used for fetching external entities
5324 * and error/warning reports.
5325 *
5326 * Returns the new parser context or NULL
5327 */
5328htmlParserCtxtPtr
5329htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5330                         const char *chunk, int size, const char *filename,
5331			 xmlCharEncoding enc) {
5332    htmlParserCtxtPtr ctxt;
5333    htmlParserInputPtr inputStream;
5334    xmlParserInputBufferPtr buf;
5335
5336    xmlInitParser();
5337
5338    buf = xmlAllocParserInputBuffer(enc);
5339    if (buf == NULL) return(NULL);
5340
5341    ctxt = htmlNewParserCtxt();
5342    if (ctxt == NULL) {
5343	xmlFreeParserInputBuffer(buf);
5344	return(NULL);
5345    }
5346    if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5347	ctxt->charset=XML_CHAR_ENCODING_UTF8;
5348    if (sax != NULL) {
5349	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5350	    xmlFree(ctxt->sax);
5351	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5352	if (ctxt->sax == NULL) {
5353	    xmlFree(buf);
5354	    xmlFree(ctxt);
5355	    return(NULL);
5356	}
5357	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5358	if (user_data != NULL)
5359	    ctxt->userData = user_data;
5360    }
5361    if (filename == NULL) {
5362	ctxt->directory = NULL;
5363    } else {
5364        ctxt->directory = xmlParserGetDirectory(filename);
5365    }
5366
5367    inputStream = htmlNewInputStream(ctxt);
5368    if (inputStream == NULL) {
5369	xmlFreeParserCtxt(ctxt);
5370	xmlFree(buf);
5371	return(NULL);
5372    }
5373
5374    if (filename == NULL)
5375	inputStream->filename = NULL;
5376    else
5377	inputStream->filename = (char *)
5378	    xmlCanonicPath((const xmlChar *) filename);
5379    inputStream->buf = buf;
5380    inputStream->base = inputStream->buf->buffer->content;
5381    inputStream->cur = inputStream->buf->buffer->content;
5382    inputStream->end =
5383	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5384
5385    inputPush(ctxt, inputStream);
5386
5387    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5388        (ctxt->input->buf != NULL))  {
5389	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5390	int cur = ctxt->input->cur - ctxt->input->base;
5391
5392	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5393
5394	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5395	ctxt->input->cur = ctxt->input->base + cur;
5396	ctxt->input->end =
5397	    &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5398#ifdef DEBUG_PUSH
5399	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5400#endif
5401    }
5402    ctxt->progressive = 1;
5403
5404    return(ctxt);
5405}
5406#endif /* LIBXML_PUSH_ENABLED */
5407
5408/**
5409 * htmlSAXParseDoc:
5410 * @cur:  a pointer to an array of xmlChar
5411 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5412 * @sax:  the SAX handler block
5413 * @userData: if using SAX, this pointer will be provided on callbacks.
5414 *
5415 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5416 * to handle parse events. If sax is NULL, fallback to the default DOM
5417 * behavior and return a tree.
5418 *
5419 * Returns the resulting document tree unless SAX is NULL or the document is
5420 *     not well formed.
5421 */
5422
5423htmlDocPtr
5424htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5425    htmlDocPtr ret;
5426    htmlParserCtxtPtr ctxt;
5427
5428    xmlInitParser();
5429
5430    if (cur == NULL) return(NULL);
5431
5432
5433    ctxt = htmlCreateDocParserCtxt(cur, encoding);
5434    if (ctxt == NULL) return(NULL);
5435    if (sax != NULL) {
5436        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5437        ctxt->sax = sax;
5438        ctxt->userData = userData;
5439    }
5440
5441    htmlParseDocument(ctxt);
5442    ret = ctxt->myDoc;
5443    if (sax != NULL) {
5444	ctxt->sax = NULL;
5445	ctxt->userData = NULL;
5446    }
5447    htmlFreeParserCtxt(ctxt);
5448
5449    return(ret);
5450}
5451
5452/**
5453 * htmlParseDoc:
5454 * @cur:  a pointer to an array of xmlChar
5455 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5456 *
5457 * parse an HTML in-memory document and build a tree.
5458 *
5459 * Returns the resulting document tree
5460 */
5461
5462htmlDocPtr
5463htmlParseDoc(xmlChar *cur, const char *encoding) {
5464    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5465}
5466
5467
5468/**
5469 * htmlCreateFileParserCtxt:
5470 * @filename:  the filename
5471 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5472 *
5473 * Create a parser context for a file content.
5474 * Automatic support for ZLIB/Compress compressed document is provided
5475 * by default if found at compile-time.
5476 *
5477 * Returns the new parser context or NULL
5478 */
5479htmlParserCtxtPtr
5480htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5481{
5482    htmlParserCtxtPtr ctxt;
5483    htmlParserInputPtr inputStream;
5484    char *canonicFilename;
5485    /* htmlCharEncoding enc; */
5486    xmlChar *content, *content_line = (xmlChar *) "charset=";
5487
5488    if (filename == NULL)
5489        return(NULL);
5490
5491    ctxt = htmlNewParserCtxt();
5492    if (ctxt == NULL) {
5493	return(NULL);
5494    }
5495    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5496    if (canonicFilename == NULL) {
5497#ifdef LIBXML_SAX1_ENABLED
5498	if (xmlDefaultSAXHandler.error != NULL) {
5499	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5500	}
5501#endif
5502	xmlFreeParserCtxt(ctxt);
5503	return(NULL);
5504    }
5505
5506    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5507    xmlFree(canonicFilename);
5508    if (inputStream == NULL) {
5509	xmlFreeParserCtxt(ctxt);
5510	return(NULL);
5511    }
5512
5513    inputPush(ctxt, inputStream);
5514
5515    /* set encoding */
5516    if (encoding) {
5517        content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5518	if (content) {
5519	    strcpy ((char *)content, (char *)content_line);
5520            strcat ((char *)content, (char *)encoding);
5521            htmlCheckEncoding (ctxt, content);
5522	    xmlFree (content);
5523	}
5524    }
5525
5526    return(ctxt);
5527}
5528
5529/**
5530 * htmlSAXParseFile:
5531 * @filename:  the filename
5532 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5533 * @sax:  the SAX handler block
5534 * @userData: if using SAX, this pointer will be provided on callbacks.
5535 *
5536 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5537 * compressed document is provided by default if found at compile-time.
5538 * It use the given SAX function block to handle the parsing callback.
5539 * If sax is NULL, fallback to the default DOM tree building routines.
5540 *
5541 * Returns the resulting document tree unless SAX is NULL or the document is
5542 *     not well formed.
5543 */
5544
5545htmlDocPtr
5546htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5547                 void *userData) {
5548    htmlDocPtr ret;
5549    htmlParserCtxtPtr ctxt;
5550    htmlSAXHandlerPtr oldsax = NULL;
5551
5552    xmlInitParser();
5553
5554    ctxt = htmlCreateFileParserCtxt(filename, encoding);
5555    if (ctxt == NULL) return(NULL);
5556    if (sax != NULL) {
5557	oldsax = ctxt->sax;
5558        ctxt->sax = sax;
5559        ctxt->userData = userData;
5560    }
5561
5562    htmlParseDocument(ctxt);
5563
5564    ret = ctxt->myDoc;
5565    if (sax != NULL) {
5566        ctxt->sax = oldsax;
5567        ctxt->userData = NULL;
5568    }
5569    htmlFreeParserCtxt(ctxt);
5570
5571    return(ret);
5572}
5573
5574/**
5575 * htmlParseFile:
5576 * @filename:  the filename
5577 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5578 *
5579 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5580 * compressed document is provided by default if found at compile-time.
5581 *
5582 * Returns the resulting document tree
5583 */
5584
5585htmlDocPtr
5586htmlParseFile(const char *filename, const char *encoding) {
5587    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5588}
5589
5590/**
5591 * htmlHandleOmittedElem:
5592 * @val:  int 0 or 1
5593 *
5594 * Set and return the previous value for handling HTML omitted tags.
5595 *
5596 * Returns the last value for 0 for no handling, 1 for auto insertion.
5597 */
5598
5599int
5600htmlHandleOmittedElem(int val) {
5601    int old = htmlOmittedDefaultValue;
5602
5603    htmlOmittedDefaultValue = val;
5604    return(old);
5605}
5606
5607/**
5608 * htmlElementAllowedHere:
5609 * @parent: HTML parent element
5610 * @elt: HTML element
5611 *
5612 * Checks whether an HTML element may be a direct child of a parent element.
5613 * Note - doesn't check for deprecated elements
5614 *
5615 * Returns 1 if allowed; 0 otherwise.
5616 */
5617int
5618htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5619  const char** p ;
5620
5621  if ( ! elt || ! parent || ! parent->subelts )
5622	return 0 ;
5623
5624  for ( p = parent->subelts; *p; ++p )
5625    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5626      return 1 ;
5627
5628  return 0 ;
5629}
5630/**
5631 * htmlElementStatusHere:
5632 * @parent: HTML parent element
5633 * @elt: HTML element
5634 *
5635 * Checks whether an HTML element may be a direct child of a parent element.
5636 * and if so whether it is valid or deprecated.
5637 *
5638 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5639 */
5640htmlStatus
5641htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5642  if ( ! parent || ! elt )
5643    return HTML_INVALID ;
5644  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5645    return HTML_INVALID ;
5646
5647  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5648}
5649/**
5650 * htmlAttrAllowed:
5651 * @elt: HTML element
5652 * @attr: HTML attribute
5653 * @legacy: whether to allow deprecated attributes
5654 *
5655 * Checks whether an attribute is valid for an element
5656 * Has full knowledge of Required and Deprecated attributes
5657 *
5658 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5659 */
5660htmlStatus
5661htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5662  const char** p ;
5663
5664  if ( !elt || ! attr )
5665	return HTML_INVALID ;
5666
5667  if ( elt->attrs_req )
5668    for ( p = elt->attrs_req; *p; ++p)
5669      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5670        return HTML_REQUIRED ;
5671
5672  if ( elt->attrs_opt )
5673    for ( p = elt->attrs_opt; *p; ++p)
5674      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5675        return HTML_VALID ;
5676
5677  if ( legacy && elt->attrs_depr )
5678    for ( p = elt->attrs_depr; *p; ++p)
5679      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5680        return HTML_DEPRECATED ;
5681
5682  return HTML_INVALID ;
5683}
5684/**
5685 * htmlNodeStatus:
5686 * @node: an htmlNodePtr in a tree
5687 * @legacy: whether to allow deprecated elements (YES is faster here
5688 *	for Element nodes)
5689 *
5690 * Checks whether the tree node is valid.  Experimental (the author
5691 *     only uses the HTML enhancements in a SAX parser)
5692 *
5693 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5694 *	legacy allowed) or htmlElementStatusHere (otherwise).
5695 *	for Attribute nodes, a return from htmlAttrAllowed
5696 *	for other nodes, HTML_NA (no checks performed)
5697 */
5698htmlStatus
5699htmlNodeStatus(const htmlNodePtr node, int legacy) {
5700  if ( ! node )
5701    return HTML_INVALID ;
5702
5703  switch ( node->type ) {
5704    case XML_ELEMENT_NODE:
5705      return legacy
5706	? ( htmlElementAllowedHere (
5707		htmlTagLookup(node->parent->name) , node->name
5708		) ? HTML_VALID : HTML_INVALID )
5709	: htmlElementStatusHere(
5710		htmlTagLookup(node->parent->name) ,
5711		htmlTagLookup(node->name) )
5712	;
5713    case XML_ATTRIBUTE_NODE:
5714      return htmlAttrAllowed(
5715	htmlTagLookup(node->parent->name) , node->name, legacy) ;
5716    default: return HTML_NA ;
5717  }
5718}
5719/************************************************************************
5720 *									*
5721 *	New set (2.6.0) of simpler and more flexible APIs		*
5722 *									*
5723 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A NULL @str is ignored.
 *
 * The expansion is a single statement without a trailing semicolon, so
 * "DICT_FREE(x);" can be used as the body of an unbraced if/else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5735
5736/**
5737 * htmlCtxtReset:
5738 * @ctxt: an HTML parser context
5739 *
5740 * Reset a parser context
5741 */
5742void
5743htmlCtxtReset(htmlParserCtxtPtr ctxt)
5744{
5745    xmlParserInputPtr input;
5746    xmlDictPtr dict;
5747
5748    if (ctxt == NULL)
5749        return;
5750
5751    xmlInitParser();
5752    dict = ctxt->dict;
5753
5754    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5755        xmlFreeInputStream(input);
5756    }
5757    ctxt->inputNr = 0;
5758    ctxt->input = NULL;
5759
5760    ctxt->spaceNr = 0;
5761    if (ctxt->spaceTab != NULL) {
5762	ctxt->spaceTab[0] = -1;
5763	ctxt->space = &ctxt->spaceTab[0];
5764    } else {
5765	ctxt->space = NULL;
5766    }
5767
5768
5769    ctxt->nodeNr = 0;
5770    ctxt->node = NULL;
5771
5772    ctxt->nameNr = 0;
5773    ctxt->name = NULL;
5774
5775    DICT_FREE(ctxt->version);
5776    ctxt->version = NULL;
5777    DICT_FREE(ctxt->encoding);
5778    ctxt->encoding = NULL;
5779    DICT_FREE(ctxt->directory);
5780    ctxt->directory = NULL;
5781    DICT_FREE(ctxt->extSubURI);
5782    ctxt->extSubURI = NULL;
5783    DICT_FREE(ctxt->extSubSystem);
5784    ctxt->extSubSystem = NULL;
5785    if (ctxt->myDoc != NULL)
5786        xmlFreeDoc(ctxt->myDoc);
5787    ctxt->myDoc = NULL;
5788
5789    ctxt->standalone = -1;
5790    ctxt->hasExternalSubset = 0;
5791    ctxt->hasPErefs = 0;
5792    ctxt->html = 1;
5793    ctxt->external = 0;
5794    ctxt->instate = XML_PARSER_START;
5795    ctxt->token = 0;
5796
5797    ctxt->wellFormed = 1;
5798    ctxt->nsWellFormed = 1;
5799    ctxt->valid = 1;
5800    ctxt->vctxt.userData = ctxt;
5801    ctxt->vctxt.error = xmlParserValidityError;
5802    ctxt->vctxt.warning = xmlParserValidityWarning;
5803    ctxt->record_info = 0;
5804    ctxt->nbChars = 0;
5805    ctxt->checkIndex = 0;
5806    ctxt->inSubset = 0;
5807    ctxt->errNo = XML_ERR_OK;
5808    ctxt->depth = 0;
5809    ctxt->charset = XML_CHAR_ENCODING_UTF8;
5810    ctxt->catalogs = NULL;
5811    xmlInitNodeInfoSeq(&ctxt->node_seq);
5812
5813    if (ctxt->attsDefault != NULL) {
5814        xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5815        ctxt->attsDefault = NULL;
5816    }
5817    if (ctxt->attsSpecial != NULL) {
5818        xmlHashFree(ctxt->attsSpecial, NULL);
5819        ctxt->attsSpecial = NULL;
5820    }
5821}
5822
5823/**
5824 * htmlCtxtUseOptions:
5825 * @ctxt: an HTML parser context
5826 * @options:  a combination of htmlParserOption(s)
5827 *
5828 * Applies the options to the parser context
5829 *
5830 * Returns 0 in case of success, the set of unknown or unimplemented options
5831 *         in case of error.
5832 */
5833int
5834htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5835{
5836    if (ctxt == NULL)
5837        return(-1);
5838
5839    if (options & HTML_PARSE_NOWARNING) {
5840        ctxt->sax->warning = NULL;
5841        ctxt->vctxt.warning = NULL;
5842        options -= XML_PARSE_NOWARNING;
5843	ctxt->options |= XML_PARSE_NOWARNING;
5844    }
5845    if (options & HTML_PARSE_NOERROR) {
5846        ctxt->sax->error = NULL;
5847        ctxt->vctxt.error = NULL;
5848        ctxt->sax->fatalError = NULL;
5849        options -= XML_PARSE_NOERROR;
5850	ctxt->options |= XML_PARSE_NOERROR;
5851    }
5852    if (options & HTML_PARSE_PEDANTIC) {
5853        ctxt->pedantic = 1;
5854        options -= XML_PARSE_PEDANTIC;
5855	ctxt->options |= XML_PARSE_PEDANTIC;
5856    } else
5857        ctxt->pedantic = 0;
5858    if (options & XML_PARSE_NOBLANKS) {
5859        ctxt->keepBlanks = 0;
5860        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5861        options -= XML_PARSE_NOBLANKS;
5862	ctxt->options |= XML_PARSE_NOBLANKS;
5863    } else
5864        ctxt->keepBlanks = 1;
5865    if (options & HTML_PARSE_RECOVER) {
5866        ctxt->recovery = 1;
5867	options -= HTML_PARSE_RECOVER;
5868    } else
5869        ctxt->recovery = 0;
5870    if (options & HTML_PARSE_COMPACT) {
5871	ctxt->options |= HTML_PARSE_COMPACT;
5872        options -= HTML_PARSE_COMPACT;
5873    }
5874    ctxt->dictNames = 0;
5875    return (options);
5876}
5877
5878/**
5879 * htmlDoRead:
5880 * @ctxt:  an HTML parser context
5881 * @URL:  the base URL to use for the document
5882 * @encoding:  the document encoding, or NULL
5883 * @options:  a combination of htmlParserOption(s)
5884 * @reuse:  keep the context for reuse
5885 *
5886 * Common front-end for the htmlRead functions
5887 *
5888 * Returns the resulting document tree or NULL
5889 */
5890static htmlDocPtr
5891htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5892          int options, int reuse)
5893{
5894    htmlDocPtr ret;
5895
5896    htmlCtxtUseOptions(ctxt, options);
5897    ctxt->html = 1;
5898    if (encoding != NULL) {
5899        xmlCharEncodingHandlerPtr hdlr;
5900
5901	hdlr = xmlFindCharEncodingHandler(encoding);
5902	if (hdlr != NULL)
5903	    xmlSwitchToEncoding(ctxt, hdlr);
5904    }
5905    if ((URL != NULL) && (ctxt->input != NULL) &&
5906        (ctxt->input->filename == NULL))
5907        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5908    htmlParseDocument(ctxt);
5909    ret = ctxt->myDoc;
5910    ctxt->myDoc = NULL;
5911    if (!reuse) {
5912        if ((ctxt->dictNames) &&
5913	    (ret != NULL) &&
5914	    (ret->dict == ctxt->dict))
5915	    ctxt->dict = NULL;
5916	xmlFreeParserCtxt(ctxt);
5917    }
5918    return (ret);
5919}
5920
5921/**
5922 * htmlReadDoc:
5923 * @cur:  a pointer to a zero terminated string
5924 * @URL:  the base URL to use for the document
5925 * @encoding:  the document encoding, or NULL
5926 * @options:  a combination of htmlParserOption(s)
5927 *
5928 * parse an XML in-memory document and build a tree.
5929 *
5930 * Returns the resulting document tree
5931 */
5932htmlDocPtr
5933htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5934{
5935    htmlParserCtxtPtr ctxt;
5936
5937    if (cur == NULL)
5938        return (NULL);
5939
5940    xmlInitParser();
5941    ctxt = htmlCreateDocParserCtxt(cur, NULL);
5942    if (ctxt == NULL)
5943        return (NULL);
5944    return (htmlDoRead(ctxt, URL, encoding, options, 0));
5945}
5946
5947/**
5948 * htmlReadFile:
5949 * @filename:  a file or URL
5950 * @encoding:  the document encoding, or NULL
5951 * @options:  a combination of htmlParserOption(s)
5952 *
5953 * parse an XML file from the filesystem or the network.
5954 *
5955 * Returns the resulting document tree
5956 */
5957htmlDocPtr
5958htmlReadFile(const char *filename, const char *encoding, int options)
5959{
5960    htmlParserCtxtPtr ctxt;
5961
5962    xmlInitParser();
5963    ctxt = htmlCreateFileParserCtxt(filename, encoding);
5964    if (ctxt == NULL)
5965        return (NULL);
5966    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5967}
5968
5969/**
5970 * htmlReadMemory:
5971 * @buffer:  a pointer to a char array
5972 * @size:  the size of the array
5973 * @URL:  the base URL to use for the document
5974 * @encoding:  the document encoding, or NULL
5975 * @options:  a combination of htmlParserOption(s)
5976 *
5977 * parse an XML in-memory document and build a tree.
5978 *
5979 * Returns the resulting document tree
5980 */
5981htmlDocPtr
5982htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5983{
5984    htmlParserCtxtPtr ctxt;
5985
5986    xmlInitParser();
5987    ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5988    if (ctxt == NULL)
5989        return (NULL);
5990    htmlDefaultSAXHandlerInit();
5991    if (ctxt->sax != NULL)
5992        memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5993    return (htmlDoRead(ctxt, URL, encoding, options, 0));
5994}
5995
5996/**
5997 * htmlReadFd:
5998 * @fd:  an open file descriptor
5999 * @URL:  the base URL to use for the document
6000 * @encoding:  the document encoding, or NULL
6001 * @options:  a combination of htmlParserOption(s)
6002 *
6003 * parse an XML from a file descriptor and build a tree.
6004 *
6005 * Returns the resulting document tree
6006 */
6007htmlDocPtr
6008htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6009{
6010    htmlParserCtxtPtr ctxt;
6011    xmlParserInputBufferPtr input;
6012    xmlParserInputPtr stream;
6013
6014    if (fd < 0)
6015        return (NULL);
6016
6017    xmlInitParser();
6018    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6019    if (input == NULL)
6020        return (NULL);
6021    ctxt = xmlNewParserCtxt();
6022    if (ctxt == NULL) {
6023        xmlFreeParserInputBuffer(input);
6024        return (NULL);
6025    }
6026    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6027    if (stream == NULL) {
6028        xmlFreeParserInputBuffer(input);
6029	xmlFreeParserCtxt(ctxt);
6030        return (NULL);
6031    }
6032    inputPush(ctxt, stream);
6033    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6034}
6035
6036/**
6037 * htmlReadIO:
6038 * @ioread:  an I/O read function
6039 * @ioclose:  an I/O close function
6040 * @ioctx:  an I/O handler
6041 * @URL:  the base URL to use for the document
6042 * @encoding:  the document encoding, or NULL
6043 * @options:  a combination of htmlParserOption(s)
6044 *
6045 * parse an HTML document from I/O functions and source and build a tree.
6046 *
6047 * Returns the resulting document tree
6048 */
6049htmlDocPtr
6050htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6051          void *ioctx, const char *URL, const char *encoding, int options)
6052{
6053    htmlParserCtxtPtr ctxt;
6054    xmlParserInputBufferPtr input;
6055    xmlParserInputPtr stream;
6056
6057    if (ioread == NULL)
6058        return (NULL);
6059    xmlInitParser();
6060
6061    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6062                                         XML_CHAR_ENCODING_NONE);
6063    if (input == NULL)
6064        return (NULL);
6065    ctxt = htmlNewParserCtxt();
6066    if (ctxt == NULL) {
6067        xmlFreeParserInputBuffer(input);
6068        return (NULL);
6069    }
6070    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6071    if (stream == NULL) {
6072        xmlFreeParserInputBuffer(input);
6073	xmlFreeParserCtxt(ctxt);
6074        return (NULL);
6075    }
6076    inputPush(ctxt, stream);
6077    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6078}
6079
6080/**
6081 * htmlCtxtReadDoc:
6082 * @ctxt:  an HTML parser context
6083 * @cur:  a pointer to a zero terminated string
6084 * @URL:  the base URL to use for the document
6085 * @encoding:  the document encoding, or NULL
6086 * @options:  a combination of htmlParserOption(s)
6087 *
6088 * parse an XML in-memory document and build a tree.
6089 * This reuses the existing @ctxt parser context
6090 *
6091 * Returns the resulting document tree
6092 */
6093htmlDocPtr
6094htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6095               const char *URL, const char *encoding, int options)
6096{
6097    xmlParserInputPtr stream;
6098
6099    if (cur == NULL)
6100        return (NULL);
6101    if (ctxt == NULL)
6102        return (NULL);
6103
6104    htmlCtxtReset(ctxt);
6105
6106    stream = xmlNewStringInputStream(ctxt, cur);
6107    if (stream == NULL) {
6108        return (NULL);
6109    }
6110    inputPush(ctxt, stream);
6111    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6112}
6113
6114/**
6115 * htmlCtxtReadFile:
6116 * @ctxt:  an HTML parser context
6117 * @filename:  a file or URL
6118 * @encoding:  the document encoding, or NULL
6119 * @options:  a combination of htmlParserOption(s)
6120 *
6121 * parse an XML file from the filesystem or the network.
6122 * This reuses the existing @ctxt parser context
6123 *
6124 * Returns the resulting document tree
6125 */
6126htmlDocPtr
6127htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6128                const char *encoding, int options)
6129{
6130    xmlParserInputPtr stream;
6131
6132    if (filename == NULL)
6133        return (NULL);
6134    if (ctxt == NULL)
6135        return (NULL);
6136
6137    htmlCtxtReset(ctxt);
6138
6139    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6140    if (stream == NULL) {
6141        return (NULL);
6142    }
6143    inputPush(ctxt, stream);
6144    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6145}
6146
6147/**
6148 * htmlCtxtReadMemory:
6149 * @ctxt:  an HTML parser context
6150 * @buffer:  a pointer to a char array
6151 * @size:  the size of the array
6152 * @URL:  the base URL to use for the document
6153 * @encoding:  the document encoding, or NULL
6154 * @options:  a combination of htmlParserOption(s)
6155 *
6156 * parse an XML in-memory document and build a tree.
6157 * This reuses the existing @ctxt parser context
6158 *
6159 * Returns the resulting document tree
6160 */
6161htmlDocPtr
6162htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6163                  const char *URL, const char *encoding, int options)
6164{
6165    xmlParserInputBufferPtr input;
6166    xmlParserInputPtr stream;
6167
6168    if (ctxt == NULL)
6169        return (NULL);
6170    if (buffer == NULL)
6171        return (NULL);
6172
6173    htmlCtxtReset(ctxt);
6174
6175    input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6176    if (input == NULL) {
6177	return(NULL);
6178    }
6179
6180    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6181    if (stream == NULL) {
6182	xmlFreeParserInputBuffer(input);
6183	return(NULL);
6184    }
6185
6186    inputPush(ctxt, stream);
6187    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6188}
6189
6190/**
6191 * htmlCtxtReadFd:
6192 * @ctxt:  an HTML parser context
6193 * @fd:  an open file descriptor
6194 * @URL:  the base URL to use for the document
6195 * @encoding:  the document encoding, or NULL
6196 * @options:  a combination of htmlParserOption(s)
6197 *
6198 * parse an XML from a file descriptor and build a tree.
6199 * This reuses the existing @ctxt parser context
6200 *
6201 * Returns the resulting document tree
6202 */
6203htmlDocPtr
6204htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6205              const char *URL, const char *encoding, int options)
6206{
6207    xmlParserInputBufferPtr input;
6208    xmlParserInputPtr stream;
6209
6210    if (fd < 0)
6211        return (NULL);
6212    if (ctxt == NULL)
6213        return (NULL);
6214
6215    htmlCtxtReset(ctxt);
6216
6217
6218    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6219    if (input == NULL)
6220        return (NULL);
6221    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6222    if (stream == NULL) {
6223        xmlFreeParserInputBuffer(input);
6224        return (NULL);
6225    }
6226    inputPush(ctxt, stream);
6227    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6228}
6229
6230/**
6231 * htmlCtxtReadIO:
6232 * @ctxt:  an HTML parser context
6233 * @ioread:  an I/O read function
6234 * @ioclose:  an I/O close function
6235 * @ioctx:  an I/O handler
6236 * @URL:  the base URL to use for the document
6237 * @encoding:  the document encoding, or NULL
6238 * @options:  a combination of htmlParserOption(s)
6239 *
6240 * parse an HTML document from I/O functions and source and build a tree.
6241 * This reuses the existing @ctxt parser context
6242 *
6243 * Returns the resulting document tree
6244 */
6245htmlDocPtr
6246htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6247              xmlInputCloseCallback ioclose, void *ioctx,
6248	      const char *URL,
6249              const char *encoding, int options)
6250{
6251    xmlParserInputBufferPtr input;
6252    xmlParserInputPtr stream;
6253
6254    if (ioread == NULL)
6255        return (NULL);
6256    if (ctxt == NULL)
6257        return (NULL);
6258
6259    htmlCtxtReset(ctxt);
6260
6261    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6262                                         XML_CHAR_ENCODING_NONE);
6263    if (input == NULL)
6264        return (NULL);
6265    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6266    if (stream == NULL) {
6267        xmlFreeParserInputBuffer(input);
6268        return (NULL);
6269    }
6270    inputPush(ctxt, stream);
6271    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6272}
6273
6274#define bottom_HTMLparser
6275#include "elfgcchack.h"
6276#endif /* LIBXML_HTML_ENABLED */
6277