1/* 2 * Summary: interface for an HTML 4.0 non-verifying parser 3 * Description: this module implements an HTML 4.0 non-verifying parser 4 * with API compatible with the XML parser ones. It should 5 * be able to parse "real world" HTML, even if severely 6 * broken from a specification point of view. 7 * 8 * Copy: See Copyright for the status of this software. 9 * 10 * Author: Daniel Veillard 11 */ 12 13#ifndef __HTML_PARSER_H__ 14#define __HTML_PARSER_H__ 15#include <libxml/xmlversion.h> 16#include <libxml/parser.h> 17 18#ifdef LIBXML_HTML_ENABLED 19 20#ifdef __cplusplus 21extern "C" { 22#endif 23 24/* 25 * Most of the back-end structures from XML and HTML are shared. 26 */ 27typedef xmlParserCtxt htmlParserCtxt; 28typedef xmlParserCtxtPtr htmlParserCtxtPtr; 29typedef xmlParserNodeInfo htmlParserNodeInfo; 30typedef xmlSAXHandler htmlSAXHandler; 31typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; 32typedef xmlParserInput htmlParserInput; 33typedef xmlParserInputPtr htmlParserInputPtr; 34typedef xmlDocPtr htmlDocPtr; 35typedef xmlNodePtr htmlNodePtr; 36 37/* 38 * Internal description of an HTML element, representing HTML 4.01 39 * and XHTML 1.0 (which share the same structure). 40 */ 41typedef struct _htmlElemDesc htmlElemDesc; 42typedef htmlElemDesc *htmlElemDescPtr; 43struct _htmlElemDesc { 44 const char *name; /* The tag name */ 45 char startTag; /* Whether the start tag can be implied */ 46 char endTag; /* Whether the end tag can be implied */ 47 char saveEndTag; /* Whether the end tag should be saved */ 48 char empty; /* Is this an empty element ? */ 49 char depr; /* Is this a deprecated element ? */ 50 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */ 51 char isinline; /* is this a block 0 or inline 1 element */ 52 const char *desc; /* the description */ 53 54/* NRK Jan.2003 55 * New fields encapsulating HTML structure 56 * 57 * Bugs: 58 * This is a very limited representation. It fails to tell us when 59 * an element *requires* subelements (we only have whether they're 60 * allowed or not), and it doesn't tell us where CDATA and PCDATA 61 * are allowed. Some element relationships are not fully represented: 62 * these are flagged with the word MODIFIER 63 */ 64 const char** subelts; /* allowed sub-elements of this element */ 65 const char* defaultsubelt; /* subelement for suggested auto-repair 66 if necessary or NULL */ 67 const char** attrs_opt; /* Optional Attributes */ 68 const char** attrs_depr; /* Additional deprecated attributes */ 69 const char** attrs_req; /* Required attributes */ 70}; 71 72/* 73 * Internal description of an HTML entity. 74 */ 75typedef struct _htmlEntityDesc htmlEntityDesc; 76typedef htmlEntityDesc *htmlEntityDescPtr; 77struct _htmlEntityDesc { 78 unsigned int value; /* the UNICODE value for the character */ 79 const char *name; /* The entity name */ 80 const char *desc; /* the description */ 81}; 82 83/* 84 * There is only few public functions. 85 */ 86XMLPUBFUN const htmlElemDesc * XMLCALL 87 htmlTagLookup (const xmlChar *tag); 88XMLPUBFUN const htmlEntityDesc * XMLCALL 89 htmlEntityLookup(const xmlChar *name); 90XMLPUBFUN const htmlEntityDesc * XMLCALL 91 htmlEntityValueLookup(unsigned int value); 92 93XMLPUBFUN int XMLCALL 94 htmlIsAutoClosed(htmlDocPtr doc, 95 htmlNodePtr elem); 96XMLPUBFUN int XMLCALL 97 htmlAutoCloseTag(htmlDocPtr doc, 98 const xmlChar *name, 99 htmlNodePtr elem); 100XMLPUBFUN const htmlEntityDesc * XMLCALL 101 htmlParseEntityRef(htmlParserCtxtPtr ctxt, 102 const xmlChar **str); 103XMLPUBFUN int XMLCALL 104 htmlParseCharRef(htmlParserCtxtPtr ctxt); 105XMLPUBFUN void XMLCALL 106 htmlParseElement(htmlParserCtxtPtr ctxt); 107 108XMLPUBFUN htmlParserCtxtPtr XMLCALL 109 htmlCreateMemoryParserCtxt(const char *buffer, 110 int size); 111 112XMLPUBFUN int XMLCALL 113 htmlParseDocument(htmlParserCtxtPtr ctxt); 114XMLPUBFUN htmlDocPtr XMLCALL 115 htmlSAXParseDoc (xmlChar *cur, 116 const char *encoding, 117 htmlSAXHandlerPtr sax, 118 void *userData); 119XMLPUBFUN htmlDocPtr XMLCALL 120 htmlParseDoc (xmlChar *cur, 121 const char *encoding); 122XMLPUBFUN htmlDocPtr XMLCALL 123 htmlSAXParseFile(const char *filename, 124 const char *encoding, 125 htmlSAXHandlerPtr sax, 126 void *userData); 127XMLPUBFUN htmlDocPtr XMLCALL 128 htmlParseFile (const char *filename, 129 const char *encoding); 130XMLPUBFUN int XMLCALL 131 UTF8ToHtml (unsigned char *out, 132 int *outlen, 133 const unsigned char *in, 134 int *inlen); 135XMLPUBFUN int XMLCALL 136 htmlEncodeEntities(unsigned char *out, 137 int *outlen, 138 const unsigned char *in, 139 int *inlen, int quoteChar); 140XMLPUBFUN int XMLCALL 141 htmlIsScriptAttribute(const xmlChar *name); 142XMLPUBFUN int XMLCALL 143 htmlHandleOmittedElem(int val); 144 145#ifdef LIBXML_PUSH_ENABLED 146/** 147 * Interfaces for the Push mode. 148 */ 149XMLPUBFUN htmlParserCtxtPtr XMLCALL 150 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, 151 void *user_data, 152 const char *chunk, 153 int size, 154 const char *filename, 155 xmlCharEncoding enc); 156XMLPUBFUN int XMLCALL 157 htmlParseChunk (htmlParserCtxtPtr ctxt, 158 const char *chunk, 159 int size, 160 int terminate); 161#endif /* LIBXML_PUSH_ENABLED */ 162 163XMLPUBFUN void XMLCALL 164 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); 165 166/* 167 * New set of simpler/more flexible APIs 168 */ 169/** 170 * xmlParserOption: 171 * 172 * This is the set of XML parser options that can be passed down 173 * to the xmlReadDoc() and similar calls. 174 */ 175typedef enum { 176 HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ 177 HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ 178 HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ 179 HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ 180 HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ 181 HTML_PARSE_NONET = 1<<11,/* Forbid network access */ 182 HTML_PARSE_COMPACT = 1<<16 /* compact small text nodes */ 183} htmlParserOption; 184 185XMLPUBFUN void XMLCALL 186 htmlCtxtReset (htmlParserCtxtPtr ctxt); 187XMLPUBFUN int XMLCALL 188 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, 189 int options); 190XMLPUBFUN htmlDocPtr XMLCALL 191 htmlReadDoc (const xmlChar *cur, 192 const char *URL, 193 const char *encoding, 194 int options); 195XMLPUBFUN htmlDocPtr XMLCALL 196 htmlReadFile (const char *URL, 197 const char *encoding, 198 int options); 199XMLPUBFUN htmlDocPtr XMLCALL 200 htmlReadMemory (const char *buffer, 201 int size, 202 const char *URL, 203 const char *encoding, 204 int options); 205XMLPUBFUN htmlDocPtr XMLCALL 206 htmlReadFd (int fd, 207 const char *URL, 208 const char *encoding, 209 int options); 210XMLPUBFUN htmlDocPtr XMLCALL 211 htmlReadIO (xmlInputReadCallback ioread, 212 xmlInputCloseCallback ioclose, 213 void *ioctx, 214 const char *URL, 215 const char *encoding, 216 int options); 217XMLPUBFUN htmlDocPtr XMLCALL 218 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, 219 const xmlChar *cur, 220 const char *URL, 221 const char *encoding, 222 int options); 223XMLPUBFUN htmlDocPtr XMLCALL 224 htmlCtxtReadFile (xmlParserCtxtPtr ctxt, 225 const char *filename, 226 const char *encoding, 227 int options); 228XMLPUBFUN htmlDocPtr XMLCALL 229 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, 230 const char *buffer, 231 int size, 232 const char *URL, 233 const char *encoding, 234 int options); 235XMLPUBFUN htmlDocPtr XMLCALL 236 htmlCtxtReadFd (xmlParserCtxtPtr ctxt, 237 int fd, 238 const char *URL, 239 const char *encoding, 240 int options); 241XMLPUBFUN htmlDocPtr XMLCALL 242 htmlCtxtReadIO (xmlParserCtxtPtr ctxt, 243 xmlInputReadCallback ioread, 244 xmlInputCloseCallback ioclose, 245 void *ioctx, 246 const char *URL, 247 const char *encoding, 248 int options); 249 250/* NRK/Jan2003: further knowledge of HTML structure 251 */ 252typedef enum { 253 HTML_NA = 0 , /* something we don't check at all */ 254 HTML_INVALID = 0x1 , 255 HTML_DEPRECATED = 0x2 , 256 HTML_VALID = 0x4 , 257 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ 258} htmlStatus ; 259 260/* Using htmlElemDesc rather than name here, to emphasise the fact 261 that otherwise there's a lookup overhead 262*/ 263XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; 264XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; 265XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; 266XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ; 267/** 268 * htmlDefaultSubelement: 269 * @elt: HTML element 270 * 271 * Returns the default subelement for this element 272 */ 273#define htmlDefaultSubelement(elt) elt->defaultsubelt 274/** 275 * htmlElementAllowedHereDesc: 276 * @parent: HTML parent element 277 * @elt: HTML element 278 * 279 * Checks whether an HTML element description may be a 280 * direct child of the specified element. 281 * 282 * Returns 1 if allowed; 0 otherwise. 283 */ 284#define htmlElementAllowedHereDesc(parent,elt) \ 285 htmlElementAllowedHere((parent), (elt)->name) 286/** 287 * htmlRequiredAttrs: 288 * @elt: HTML element 289 * 290 * Returns the attributes required for the specified element. 291 */ 292#define htmlRequiredAttrs(elt) (elt)->attrs_req 293 294 295#ifdef __cplusplus 296} 297#endif 298 299#endif /* LIBXML_HTML_ENABLED */ 300#endif /* __HTML_PARSER_H__ */ 301