1/*
2   Wrapper interface to XML parser
3   Copyright (C) 1999-2007, 2009, Joe Orton <joe@manyfish.co.uk>
4
5   This library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Library General Public
7   License as published by the Free Software Foundation; either
8   version 2 of the License, or (at your option) any later version.
9
10   This library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Library General Public License for more details.
14
15   You should have received a copy of the GNU Library General Public
16   License along with this library; if not, write to the Free
17   Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
18   MA 02111-1307, USA
19
20*/
21
22#include "config.h"
23
24#ifdef HAVE_STDLIB_H
25#include <stdlib.h>
26#endif
27#ifdef HAVE_STRING_H
28#include <string.h>
29#endif
30#ifdef HAVE_STRINGS_H
31#include <strings.h>
32#endif
33
34#include "ne_internal.h"
35
36#include "ne_alloc.h"
37#include "ne_xml.h"
38#include "ne_utils.h"
39#include "ne_string.h"
40
41#if defined(HAVE_EXPAT)
42/* expat support: */
43#ifdef HAVE_XMLPARSE_H
44#include "xmlparse.h"
45#else
46#include <expat.h>
47#endif
48typedef XML_Char ne_xml_char;
49
50#if !defined(XML_MAJOR_VERSION)
51#define NEED_BOM_HANDLING
52#elif XML_MAJOR_VERSION < 2 && XML_MINOR_VERSION == 95 && XML_MICRO_VERSION < 2
53#define NEED_BOM_HANDLING
54#endif
55
56#elif defined(HAVE_LIBXML)
57/* libxml2 support: */
58#include <libxml/xmlversion.h>
59#include <libxml/parser.h>
60typedef xmlChar ne_xml_char;
61
62#if LIBXML_VERSION < 20619
63/* 2.6.19 and earlier have broken BOM handling */
64#define NEED_BOM_HANDLING
65#endif
66#else /* not HAVE_LIBXML */
67#  error need an XML parser
68#endif /* not HAVE_EXPAT */
69
70/* Approx. one screen of text: */
71#define ERR_SIZE (2048)
72
73struct handler {
74    ne_xml_startelm_cb *startelm_cb; /* start-element callback */
75    ne_xml_endelm_cb *endelm_cb; /* end-element callback */
76    ne_xml_cdata_cb *cdata_cb; /* character-data callback. */
77    void *userdata; /* userdata for the above. */
78    struct handler *next; /* next handler in stack. */
79};
80
81#ifdef HAVE_LIBXML
82static void sax_error(void *ctx, const char *msg, ...);
83#endif
84
85struct element {
86    const ne_xml_char *nspace;
87    ne_xml_char *name;
88
89    int state; /* opaque state integer */
90
91    /* Namespaces declared in this element */
92    ne_xml_char *default_ns; /* A default namespace */
93    struct namespace *nspaces; /* List of other namespace scopes */
94
95    struct handler *handler; /* Handler for this element */
96
97    struct element *parent; /* parent element, or NULL */
98};
99
100/* We pass around a ne_xml_parser as the userdata in the parsing
101 * library.  This maintains the current state of the parse and various
102 * other bits and bobs. Within the parse, we store the current branch
103 * of the tree, i.e., the current element and all its parents, up to
104 * the root, but nothing other than that.  */
105struct ne_xml_parser_s {
106    struct element *root; /* the root of the document */
107    struct element *current; /* current element in the branch */
108    struct handler *top_handlers; /* always points at the
109					   * handler on top of the stack. */
110    int failure; /* zero whilst parse should continue */
111    int prune; /* if non-zero, depth within a dead branch */
112
113#ifdef NEED_BOM_HANDLING
114    int bom_pos;
115#endif
116
117#ifdef HAVE_EXPAT
118    XML_Parser parser;
119    char *encoding;
120#else
121    xmlParserCtxtPtr parser;
122#endif
123    char error[ERR_SIZE];
124};
125
126/* The callback handlers */
127static void start_element(void *userdata, const ne_xml_char *name, const ne_xml_char **atts);
128static void end_element(void *userdata, const ne_xml_char *name);
129static void char_data(void *userdata, const ne_xml_char *cdata, int len);
130static const char *resolve_nspace(const struct element *elm,
131                                  const char *prefix, size_t pfxlen);
132
133/* Linked list of namespace scopes */
134struct namespace {
135    ne_xml_char *name;
136    ne_xml_char *uri;
137    struct namespace *next;
138};
139
140#ifdef HAVE_LIBXML
141
142/* Could be const as far as we care, but libxml doesn't want that */
143static xmlSAXHandler sax_handler = {
144    NULL, /* internalSubset */
145    NULL, /* isStandalone */
146    NULL, /* hasInternalSubset */
147    NULL, /* hasExternalSubset */
148    NULL, /* resolveEntity */
149    NULL, /* getEntity */
150    NULL, /* entityDecl */
151    NULL, /* notationDecl */
152    NULL, /* attributeDecl */
153    NULL, /* elementDecl */
154    NULL, /* unparsedEntityDecl */
155    NULL, /* setDocumentLocator */
156    NULL, /* startDocument */
157    NULL, /* endDocument */
158    start_element, /* startElement */
159    end_element, /* endElement */
160    NULL, /* reference */
161    char_data, /* characters */
162    NULL, /* ignorableWhitespace */
163    NULL, /* processingInstruction */
164    NULL, /* comment */
165    NULL, /* xmlParserWarning */
166    sax_error, /* xmlParserError */
167    sax_error, /* fatal error (never called by libxml2?) */
168    NULL, /* getParameterEntity */
169    char_data /* cdataBlock */
170};
171
172/* empty attributes array to mimic expat behaviour */
173static const char *const empty_atts[] = {NULL, NULL};
174
175/* macro for determining the attributes array to pass */
176#define PASS_ATTS(atts) (atts ? (const char **)(atts) : empty_atts)
177
178#else
179
180#define PASS_ATTS(atts) ((const char **)(atts))
181
182/* XML declaration callback for expat. */
183static void decl_handler(void *userdata,
184			 const XML_Char *version, const XML_Char *encoding,
185			 int standalone)
186{
187    ne_xml_parser *p = userdata;
188    if (encoding) p->encoding = ne_strdup(encoding);
189}
190
191#endif /* HAVE_LIBXML */
192
193int ne_xml_currentline(ne_xml_parser *p)
194{
195#ifdef HAVE_EXPAT
196    return XML_GetCurrentLineNumber(p->parser);
197#else
198    return p->parser->input->line;
199#endif
200}
201
202const char *ne_xml_doc_encoding(const ne_xml_parser *p)
203{
204#ifdef HAVE_LIBXML
205    return p->parser->encoding;
206#else
207    return p->encoding;
208#endif
209}
210
211/* The first character of the REC-xml-names "NCName" rule excludes
212 * "Digit | '.' | '-' | '_' | CombiningChar | Extender"; the XML
213 * parser will not enforce this rule in a namespace declaration since
214 * it treats the entire attribute name as a REC-xml "Name" rule.  It's
215 * too hard to check for all of CombiningChar | Digit | Extender here,
216 * but the valid_ncname_ch1 macro catches some of the rest. */
217
218/* Return non-zero if 'ch' is an invalid start character for an NCName: */
219#define invalid_ncname_ch1(ch) ((ch) == '\0' || strchr("-.0123456789", (ch)) != NULL)
220
221/* Subversion repositories have been deployed which use property names
222 * marshalled as NCNames including a colon character; these should
223 * also be rejected but will be allowed for the time being. */
224#define invalid_ncname(xn) (invalid_ncname_ch1((xn)[0]))
225
226/* Extract the namespace prefix declarations from 'atts'. */
227static int declare_nspaces(ne_xml_parser *p, struct element *elm,
228                           const ne_xml_char **atts)
229{
230    int n;
231
232    for (n = 0; atts && atts[n]; n += 2) {
233        if (strcmp(atts[n], "xmlns") == 0) {
234            /* New default namespace */
235            elm->default_ns = ne_strdup(atts[n+1]);
236        } else if (strncmp(atts[n], "xmlns:", 6) == 0) {
237            struct namespace *ns;
238
239            /* Reject some invalid NCNames as namespace prefix, and an
240             * empty URI as the namespace URI */
241            if (invalid_ncname(atts[n] + 6) || atts[n+1][0] == '\0') {
242                ne_snprintf(p->error, ERR_SIZE,
243                            ("XML parse error at line %d: invalid namespace "
244                             "declaration"), ne_xml_currentline(p));
245                return -1;
246            }
247
248            /* New namespace scope */
249            ns = ne_calloc(sizeof(*ns));
250            ns->next = elm->nspaces;
251            elm->nspaces = ns;
252            ns->name = ne_strdup(atts[n]+6); /* skip the xmlns= */
253            ns->uri = ne_strdup(atts[n+1]);
254        }
255    }
256
257    return 0;
258}
259
260/* Expand an XML qualified name, which may include a namespace prefix
261 * as well as the local part. */
262static int expand_qname(ne_xml_parser *p, struct element *elm,
263                        const ne_xml_char *qname)
264{
265    const ne_xml_char *pfx;
266
267    pfx = strchr(qname, ':');
268    if (pfx == NULL) {
269        struct element *e = elm;
270
271        /* Find default namespace; guaranteed to terminate as the root
272         * element always has default_ns="". */
273        while (e->default_ns == NULL)
274            e = e->parent;
275
276        elm->name = ne_strdup(qname);
277        elm->nspace = e->default_ns;
278    } else if (invalid_ncname(pfx + 1) || qname == pfx) {
279        ne_snprintf(p->error, ERR_SIZE,
280                    _("XML parse error at line %d: invalid element name"),
281                    ne_xml_currentline(p));
282        return -1;
283    } else {
284        const char *uri = resolve_nspace(elm, qname, pfx-qname);
285
286	if (uri) {
287	    elm->name = ne_strdup(pfx+1);
288            elm->nspace = uri;
289	} else {
290	    ne_snprintf(p->error, ERR_SIZE,
291                        ("XML parse error at line %d: undeclared namespace prefix"),
292                        ne_xml_currentline(p));
293	    return -1;
294	}
295    }
296    return 0;
297}
298
299/* Called with the start of a new element. */
300static void start_element(void *userdata, const ne_xml_char *name,
301			  const ne_xml_char **atts)
302{
303    ne_xml_parser *p = userdata;
304    struct element *elm;
305    struct handler *hand;
306    int state = NE_XML_DECLINE;
307
308    if (p->failure) return;
309
310    if (p->prune) {
311        p->prune++;
312        return;
313    }
314
315    /* Create a new element */
316    elm = ne_calloc(sizeof *elm);
317    elm->parent = p->current;
318    p->current = elm;
319
320    if (declare_nspaces(p, elm, atts) || expand_qname(p, elm, name)) {
321        p->failure = 1;
322        return;
323    }
324
325    /* Find a handler which will accept this element (or abort the parse) */
326    for (hand = elm->parent->handler; hand && state == NE_XML_DECLINE;
327         hand = hand->next) {
328        elm->handler = hand;
329        state = hand->startelm_cb(hand->userdata, elm->parent->state,
330                                  elm->nspace, elm->name, PASS_ATTS(atts));
331    }
332
333    NE_DEBUG(NE_DBG_XML, "XML: start-element (%d, {%s, %s}) => %d\n",
334             elm->parent->state, elm->nspace, elm->name, state);
335
336    if (state > 0)
337        elm->state = state;
338    else if (state == NE_XML_DECLINE)
339        /* prune this branch. */
340        p->prune++;
341    else /* state < 0 => abort parse  */
342        p->failure = state;
343}
344
345/* Destroys an element structure. */
346static void destroy_element(struct element *elm)
347{
348    struct namespace *this_ns, *next_ns;
349    ne_free(elm->name);
350    /* Free the namespaces */
351    this_ns = elm->nspaces;
352    while (this_ns != NULL) {
353	next_ns = this_ns->next;
354	ne_free(this_ns->name);
355	ne_free(this_ns->uri);
356	ne_free(this_ns);
357	this_ns = next_ns;
358    }
359    if (elm->default_ns)
360        ne_free(elm->default_ns);
361    ne_free(elm);
362}
363
364/* cdata SAX callback */
365static void char_data(void *userdata, const ne_xml_char *data, int len)
366{
367    ne_xml_parser *p = userdata;
368    struct element *elm = p->current;
369
370    if (p->failure || p->prune) return;
371
372    if (elm->handler->cdata_cb) {
373        p->failure = elm->handler->cdata_cb(elm->handler->userdata, elm->state, data, len);
374        NE_DEBUG(NE_DBG_XML, "XML: char-data (%d) returns %d\n",
375                 elm->state, p->failure);
376    }
377}
378
379/* Called with the end of an element */
380static void end_element(void *userdata, const ne_xml_char *name)
381{
382    ne_xml_parser *p = userdata;
383    struct element *elm = p->current;
384
385    if (p->failure) return;
386
387    if (p->prune) {
388        if (p->prune-- > 1) return;
389    } else if (elm->handler->endelm_cb) {
390        p->failure = elm->handler->endelm_cb(elm->handler->userdata, elm->state,
391                                             elm->nspace, elm->name);
392        if (p->failure) {
393            NE_DEBUG(NE_DBG_XML, "XML: end-element for %d failed with %d.\n",
394                     elm->state, p->failure);
395        }
396    }
397
398    NE_DEBUG(NE_DBG_XML, "XML: end-element (%d, {%s, %s})\n",
399             elm->state, elm->nspace, elm->name);
400
401    /* move back up the tree */
402    p->current = elm->parent;
403    p->prune = 0;
404
405    destroy_element(elm);
406}
407
408#if defined(HAVE_EXPAT) && XML_MAJOR_VERSION > 1
409/* Stop the parser if an entity declaration is hit. */
410static void entity_declaration(void *userData, const XML_Char *entityName,
411                              int is_parameter_entity, const XML_Char *value,
412                              int value_length, const XML_Char *base,
413                              const XML_Char *systemId, const XML_Char *publicId,
414                              const XML_Char *notationName)
415{
416    ne_xml_parser *parser = userData;
417
418    NE_DEBUG(NE_DBG_XMLPARSE, "XML: entity declaration [%s]. Failing.\n",
419             entityName);
420
421    XML_StopParser(parser->parser, XML_FALSE);
422}
423#elif defined(HAVE_EXPAT)
424/* A noop default_handler. */
425static void default_handler(void *userData, const XML_Char *s, int len)
426{
427}
428#endif
429
430/* Find a namespace definition for 'prefix' in given element, where
431 * length of prefix is 'pfxlen'.  Returns the URI or NULL. */
432static const char *resolve_nspace(const struct element *elm,
433                                  const char *prefix, size_t pfxlen)
434{
435    const struct element *s;
436
437    /* Search up the tree. */
438    for (s = elm; s != NULL; s = s->parent) {
439	const struct namespace *ns;
440	/* Iterate over defined spaces on this node. */
441	for (ns = s->nspaces; ns != NULL; ns = ns->next) {
442	    if (strlen(ns->name) == pfxlen &&
443		memcmp(ns->name, prefix, pfxlen) == 0)
444		return ns->uri;
445	}
446    }
447
448    return NULL;
449}
450
451const char *ne_xml_resolve_nspace(ne_xml_parser *parser,
452                                  const char *prefix, size_t length)
453{
454    if (prefix) {
455        return resolve_nspace(parser->current, prefix, length);
456    }
457    else {
458        struct element *e = parser->current;
459
460        while (e->default_ns == NULL)
461            e = e->parent;
462
463        return e->default_ns;
464    }
465}
466
467ne_xml_parser *ne_xml_create(void)
468{
469    ne_xml_parser *p = ne_calloc(sizeof *p);
470    /* Placeholder for the root element */
471    p->current = p->root = ne_calloc(sizeof *p->root);
472    p->root->default_ns = "";
473    p->root->state = 0;
474    strcpy(p->error, _("Unknown error"));
475#ifdef HAVE_EXPAT
476    p->parser = XML_ParserCreate(NULL);
477    if (p->parser == NULL) {
478	abort();
479    }
480    XML_SetElementHandler(p->parser, start_element, end_element);
481    XML_SetCharacterDataHandler(p->parser, char_data);
482    XML_SetUserData(p->parser, (void *) p);
483    XML_SetXmlDeclHandler(p->parser, decl_handler);
484
485    /* Prevent the "billion laughs" attack against expat by disabling
486     * internal entity expansion.  With 2.x, forcibly stop the parser
487     * if an entity is declared - this is safer and a more obvious
488     * failure mode.  With older versions, installing a noop
489     * DefaultHandler means that internal entities will be expanded as
490     * the empty string, which is also sufficient to prevent the
491     * attack. */
492#if XML_MAJOR_VERSION > 1
493    XML_SetEntityDeclHandler(p->parser, entity_declaration);
494#else
495    XML_SetDefaultHandler(p->parser, default_handler);
496#endif
497
498#else /* HAVE_LIBXML */
499    p->parser = xmlCreatePushParserCtxt(&sax_handler,
500					(void *)p, NULL, 0, NULL);
501    if (p->parser == NULL) {
502	abort();
503    }
504#if LIBXML_VERSION < 20602
505    p->parser->replaceEntities = 1;
506#else
507    /* Enable expansion of entities, and disable network access. */
508    xmlCtxtUseOptions(p->parser, XML_PARSE_NOENT | XML_PARSE_NONET);
509#endif
510
511#endif /* HAVE_LIBXML || HAVE_EXPAT */
512    return p;
513}
514
515void ne_xml_push_handler(ne_xml_parser *p,
516			 ne_xml_startelm_cb *startelm_cb,
517			 ne_xml_cdata_cb *cdata_cb,
518			 ne_xml_endelm_cb *endelm_cb,
519			 void *userdata)
520{
521    struct handler *hand = ne_calloc(sizeof(struct handler));
522
523    hand->startelm_cb = startelm_cb;
524    hand->cdata_cb = cdata_cb;
525    hand->endelm_cb = endelm_cb;
526    hand->userdata = userdata;
527
528    /* If this is the first handler registered, update the
529     * base pointer too. */
530    if (p->top_handlers == NULL) {
531	p->root->handler = hand;
532	p->top_handlers = hand;
533    } else {
534	p->top_handlers->next = hand;
535	p->top_handlers = hand;
536    }
537}
538
539int ne_xml_parse_v(void *userdata, const char *block, size_t len)
540{
541    ne_xml_parser *p = userdata;
542    return ne_xml_parse(p, (const ne_xml_char *)block, len);
543}
544
545#define BOM_UTF8 "\xEF\xBB\xBF" /* UTF-8 BOM */
546
547int ne_xml_parse(ne_xml_parser *p, const char *block, size_t len)
548{
549    int ret, flag;
550    /* duck out if it's broken */
551    if (p->failure) {
552	NE_DEBUG(NE_DBG_XMLPARSE, "XML: Failed; ignoring %" NE_FMT_SIZE_T
553                 " bytes.\n", len);
554	return p->failure;
555    }
556    if (len == 0) {
557	flag = -1;
558	block = "";
559	NE_DEBUG(NE_DBG_XMLPARSE, "XML: End of document.\n");
560    } else {
561	NE_DEBUG(NE_DBG_XMLPARSE, "XML: Parsing %" NE_FMT_SIZE_T " bytes.\n", len);
562	flag = 0;
563    }
564
565#ifdef NEED_BOM_HANDLING
566    if (p->bom_pos < 3) {
567        NE_DEBUG(NE_DBG_XMLPARSE, "Checking for UTF-8 BOM.\n");
568        while (len > 0 && p->bom_pos < 3 &&
569               block[0] == BOM_UTF8[p->bom_pos]) {
570            block++;
571            len--;
572            p->bom_pos++;
573        }
574        if (len == 0)
575            return 0;
576        if (p->bom_pos == 0) {
577            p->bom_pos = 3; /* no BOM */
578        } else if (p->bom_pos > 0 && p->bom_pos < 3) {
579            strcpy(p->error, _("Invalid Byte Order Mark"));
580            return p->failure = 1;
581        }
582    }
583#endif
584
585    /* Note, don't write a parser error if p->failure, since an error
586     * will already have been written in that case. */
587#ifdef HAVE_EXPAT
588    ret = XML_Parse(p->parser, block, len, flag);
589    NE_DEBUG(NE_DBG_XMLPARSE, "XML: XML_Parse returned %d\n", ret);
590    if (ret == 0 && p->failure == 0) {
591	ne_snprintf(p->error, ERR_SIZE,
592		    "XML parse error at line %" NE_FMT_XML_SIZE ": %s",
593		    XML_GetCurrentLineNumber(p->parser),
594		    XML_ErrorString(XML_GetErrorCode(p->parser)));
595	p->failure = 1;
596        NE_DEBUG(NE_DBG_XMLPARSE, "XML: Parse error: %s\n", p->error);
597    }
598#else
599    ret = xmlParseChunk(p->parser, block, len, flag);
600    NE_DEBUG(NE_DBG_XMLPARSE, "XML: xmlParseChunk returned %d\n", ret);
601    /* Parse errors are normally caught by the sax_error() callback,
602     * which clears p->valid. */
603    if (p->parser->errNo && p->failure == 0) {
604	ne_snprintf(p->error, ERR_SIZE, "XML parse error at line %d",
605		    ne_xml_currentline(p));
606	p->failure = 1;
607        NE_DEBUG(NE_DBG_XMLPARSE, "XML: Parse error: %s\n", p->error);
608    }
609#endif
610    return p->failure;
611}
612
613int ne_xml_failed(ne_xml_parser *p)
614{
615    return p->failure;
616}
617
618void ne_xml_destroy(ne_xml_parser *p)
619{
620    struct element *elm, *parent;
621    struct handler *hand, *next;
622
623    /* Free up the handlers on the stack: the root element has the
624     * pointer to the base of the handler stack. */
625    for (hand = p->root->handler; hand!=NULL; hand=next) {
626	next = hand->next;
627	ne_free(hand);
628    }
629
630    /* Clean up remaining elements */
631    for (elm = p->current; elm != p->root; elm = parent) {
632	parent = elm->parent;
633	destroy_element(elm);
634    }
635
636    /* free root element */
637    ne_free(p->root);
638
639#ifdef HAVE_EXPAT
640    XML_ParserFree(p->parser);
641    if (p->encoding) ne_free(p->encoding);
642#else
643    xmlFreeParserCtxt(p->parser);
644#endif
645
646    ne_free(p);
647}
648
649void ne_xml_set_error(ne_xml_parser *p, const char *msg)
650{
651    ne_snprintf(p->error, ERR_SIZE, "%s", msg);
652}
653
654#ifdef HAVE_LIBXML
655static void sax_error(void *ctx, const char *msg, ...)
656{
657    ne_xml_parser *p = ctx;
658    va_list ap;
659    char buf[1024];
660
661    va_start(ap, msg);
662    ne_vsnprintf(buf, 1024, msg, ap);
663    va_end(ap);
664
665    if (p->failure == 0) {
666        ne_snprintf(p->error, ERR_SIZE,
667                    _("XML parse error at line %d: %s"),
668                    p->parser->input->line, buf);
669        p->failure = 1;
670    }
671}
672#endif
673
674const char *ne_xml_get_error(ne_xml_parser *p)
675{
676    return p->error;
677}
678
679const char *
680ne_xml_get_attr(ne_xml_parser *p, const char **attrs,
681		const char *nspace, const char *name)
682{
683    int n;
684
685    for (n = 0; attrs[n] != NULL; n += 2) {
686	char *pnt = strchr(attrs[n], ':');
687
688	if (!nspace && !pnt && strcmp(attrs[n], name) == 0) {
689	    return attrs[n+1];
690	} else if (nspace && pnt) {
691	    /* If a namespace is given, and the local part matches,
692	     * then resolve the namespace and compare that too. */
693	    if (strcmp(pnt + 1, name) == 0) {
694		const char *uri = resolve_nspace(p->current,
695						 attrs[n], pnt - attrs[n]);
696		if (uri && strcmp(uri, nspace) == 0)
697		    return attrs[n+1];
698	    }
699	}
700    }
701
702    return NULL;
703}
704
705int ne_xml_mapid(const struct ne_xml_idmap map[], size_t maplen,
706                 const char *nspace, const char *name)
707{
708    size_t n;
709
710    for (n = 0; n < maplen; n++)
711        if (strcmp(name, map[n].name) == 0 &&
712            strcmp(nspace, map[n].nspace) == 0)
713            return map[n].id;
714
715    return 0;
716}
717