1214501Srpaulo/*
2214501Srpaulo * UPnP XML helper routines
3214501Srpaulo * Copyright (c) 2000-2003 Intel Corporation
4214501Srpaulo * Copyright (c) 2006-2007 Sony Corporation
5214501Srpaulo * Copyright (c) 2008-2009 Atheros Communications
6214501Srpaulo * Copyright (c) 2009, Jouni Malinen <j@w1.fi>
7214501Srpaulo *
8214501Srpaulo * See wps_upnp.c for more details on licensing and code history.
9214501Srpaulo */
10214501Srpaulo
11214501Srpaulo#include "includes.h"
12214501Srpaulo
13214501Srpaulo#include "common.h"
14214501Srpaulo#include "base64.h"
15214501Srpaulo#include "http.h"
16214501Srpaulo#include "upnp_xml.h"
17214501Srpaulo
18214501Srpaulo
19214501Srpaulo/*
20214501Srpaulo * XML parsing and formatting
21214501Srpaulo *
22214501Srpaulo * XML is a markup language based on unicode; usually (and in our case,
23214501Srpaulo * always!) based on utf-8. utf-8 uses a variable number of bytes per
24214501Srpaulo * character. utf-8 has the advantage that all non-ASCII unicode characters are
25214501Srpaulo * represented by sequences of non-ascii (high bit set) bytes, whereas ASCII
26214501Srpaulo * characters are single ascii bytes, thus we can use typical text processing.
27214501Srpaulo *
 * (One other interesting thing about utf-8 is that it is possible to look at
 * any random byte and determine whether it is the first byte of a character
 * or a continuation byte).
31214501Srpaulo *
 * The base syntax of XML uses a few ASCII punctuation characters; any such
 * characters that would appear in the payload data are rewritten using escape
 * sequences, e.g., &amp; for ampersand (&) and &lt; for left angle bracket (<).
 * Five such escapes total (more can be defined but that does not apply to our
 * case). Thus we can safely parse for angle brackets etc.
37214501Srpaulo *
38214501Srpaulo * XML describes tree structures of tagged data, with each element beginning
39214501Srpaulo * with an opening tag <label> and ending with a closing tag </label> with
40214501Srpaulo * matching label. (There is also a self-closing tag <label/> which is supposed
41214501Srpaulo * to be equivalent to <label></label>, i.e., no payload, but we are unlikely
42214501Srpaulo * to see it for our purpose).
43214501Srpaulo *
44214501Srpaulo * Actually the opening tags are a little more complicated because they can
45214501Srpaulo * contain "attributes" after the label (delimited by ascii space or tab chars)
46214501Srpaulo * of the form attribute_label="value" or attribute_label='value'; as it turns
47214501Srpaulo * out we do not have to read any of these attributes, just ignore them.
48214501Srpaulo *
49214501Srpaulo * Labels are any sequence of chars other than space, tab, right angle bracket
50214501Srpaulo * (and ?), but may have an inner structure of <namespace><colon><plain_label>.
51214501Srpaulo * As it turns out, we can ignore the namespaces, in fact we can ignore the
52214501Srpaulo * entire tree hierarchy, because the plain labels we are looking for will be
53214501Srpaulo * unique (not in general, but for this application). We do however have to be
54214501Srpaulo * careful to skip over the namespaces.
55214501Srpaulo *
56214501Srpaulo * In generating XML we have to be more careful, but that is easy because
57214501Srpaulo * everything we do is pretty canned. The only real care to take is to escape
58214501Srpaulo * any special chars in our payload.
59214501Srpaulo */
60214501Srpaulo
/**
 * xml_next_tag - Advance to next tag
 * @in: Input (NUL-terminated text to scan)
 * @out: OUT: start of tag just after '<'
 * @out_tagname: OUT: start of name of tag, skipping namespace
 * @end: OUT: one after tag (just past the closing '>')
 * Returns: 0 on success, 1 on failure
 *
 * A tag has form:
 *     <left angle bracket><...><right angle bracket>
 * Within the angle brackets, there is an optional leading forward slash (which
 * makes the tag an ending tag), then an optional leading label (followed by
 * colon) and then the tag name itself.
 *
 * Note that angle brackets present in the original data must have been encoded
 * as &lt; and &gt; so they will not trouble us.
 *
 * Failure means that either no '<' was found before the end of the input, or
 * that a '<' was found but no matching '>' followed it. In the first case no
 * output is written; in the second case *out and *out_tagname have already
 * been written but *end has not, so callers must not use any of the outputs
 * unless 0 was returned.
 */
int xml_next_tag(const char *in, const char **out,
		 const char **out_tagname, const char **end)
{
	while (*in && *in != '<')
		in++;
	if (*in != '<')
		return 1;
	*out = ++in;
	if (*in == '/')
		in++;
	/* Provisional: replaced below if a namespace prefix is present */
	*out_tagname = in;
	/*
	 * The <ctype.h> classifiers require an argument representable as
	 * unsigned char (or EOF). Plain char may be signed, in which case the
	 * high-bit bytes of utf-8 input would be passed as negative values,
	 * which is undefined behavior; convert explicitly.
	 */
	while (isalnum((unsigned char) *in) || *in == '-')
		in++;
	if (*in == ':')
		*out_tagname = ++in;
	/* Skip the rest of the tag, including any attributes */
	while (*in && *in != '>')
		in++;
	if (*in != '>')
		return 1;
	*end = ++in;
	return 0;
}
100214501Srpaulo
101214501Srpaulo
102214501Srpaulo/* xml_data_encode -- format data for xml file, escaping special characters.
103214501Srpaulo *
104214501Srpaulo * Note that we assume we are using utf8 both as input and as output!
105214501Srpaulo * In utf8, characters may be classed as follows:
106214501Srpaulo *     0xxxxxxx(2) -- 1 byte ascii char
107214501Srpaulo *     11xxxxxx(2) -- 1st byte of multi-byte char w/ unicode value >= 0x80
108214501Srpaulo *         110xxxxx(2) -- 1st byte of 2 byte sequence (5 payload bits here)
109214501Srpaulo *         1110xxxx(2) -- 1st byte of 3 byte sequence (4 payload bits here)
110214501Srpaulo *         11110xxx(2) -- 1st byte of 4 byte sequence (3 payload bits here)
111214501Srpaulo *      10xxxxxx(2) -- extension byte (6 payload bits per byte)
112214501Srpaulo *      Some values implied by the above are however illegal because they
113214501Srpaulo *      do not represent unicode chars or are not the shortest encoding.
114214501Srpaulo * Actually, we can almost entirely ignore the above and just do
115214501Srpaulo * text processing same as for ascii text.
116214501Srpaulo *
117214501Srpaulo * XML is written with arbitrary unicode characters, except that five
118214501Srpaulo * characters have special meaning and so must be escaped where they
119214501Srpaulo * appear in payload data... which we do here.
120214501Srpaulo */
121214501Srpaulovoid xml_data_encode(struct wpabuf *buf, const char *data, int len)
122214501Srpaulo{
123214501Srpaulo	int i;
124214501Srpaulo	for (i = 0; i < len; i++) {
125214501Srpaulo		u8 c = ((u8 *) data)[i];
126214501Srpaulo		if (c == '<') {
127214501Srpaulo			wpabuf_put_str(buf, "&lt;");
128214501Srpaulo			continue;
129214501Srpaulo		}
130214501Srpaulo		if (c == '>') {
131214501Srpaulo			wpabuf_put_str(buf, "&gt;");
132214501Srpaulo			continue;
133214501Srpaulo		}
134214501Srpaulo		if (c == '&') {
135214501Srpaulo			wpabuf_put_str(buf, "&amp;");
136214501Srpaulo			continue;
137214501Srpaulo		}
138214501Srpaulo		if (c == '\'') {
139214501Srpaulo			wpabuf_put_str(buf, "&apos;");
140214501Srpaulo			continue;
141214501Srpaulo		}
142214501Srpaulo		if (c == '"') {
143214501Srpaulo			wpabuf_put_str(buf, "&quot;");
144214501Srpaulo			continue;
145214501Srpaulo		}
146214501Srpaulo		/*
147214501Srpaulo		 * We could try to represent control characters using the
148214501Srpaulo		 * sequence: &#x; where x is replaced by a hex numeral, but not
149214501Srpaulo		 * clear why we would do this.
150214501Srpaulo		 */
151214501Srpaulo		wpabuf_put_u8(buf, c);
152214501Srpaulo	}
153214501Srpaulo}
154214501Srpaulo
155214501Srpaulo
/* xml_add_tagged_data -- format tagged data as a new xml line.
 *
 * Appends "<tag>", the escaped payload, and "</tag>" followed by a newline
 * to buf.
 *
 * tag is written verbatim, so it must not have any special chars.
 * data is a NUL-terminated string; any special chars in it are escaped by
 * xml_data_encode().
 */
void xml_add_tagged_data(struct wpabuf *buf, const char *tag, const char *data)
{
	int data_len;

	wpabuf_printf(buf, "<%s>", tag);
	data_len = os_strlen(data);
	xml_data_encode(buf, data, data_len);
	wpabuf_printf(buf, "</%s>\n", tag);
}
167214501Srpaulo
168214501Srpaulo
/* A POST body looks something like (per upnp spec):
 * <?xml version="1.0"?>
 * <s:Envelope
 *     xmlns:s="http://schemas.xmlsoap.org/soap/envelope/"
 *     s:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/">
 *   <s:Body>
 *     <u:actionName xmlns:u="urn:schemas-upnp-org:service:serviceType:v">
 *       <argumentName>in arg value</argumentName>
 *       other in args and their values go here, if any
 *     </u:actionName>
 *   </s:Body>
 * </s:Envelope>
 *
 * where :
 *      s: might be some other namespace name followed by colon
 *      u: might be some other namespace name followed by colon
 *      actionName will be replaced according to action requested
 *      schema following actionName will be WFA scheme instead
 *      argumentName will be actual argument name
 *      (in arg value) will be actual argument value
 */

/**
 * xml_get_first_item - Get the text following the first tag of a given name
 * @doc: NUL-terminated XML text to search
 * @item: Tag name to look for (without namespace), compared case-insensitively
 * Returns: Newly allocated NUL-terminated copy of the text between the end of
 * the first matching opening tag and the next '<' (or the end of @doc), or
 * %NULL if no such tag is found or the allocation fails
 *
 * The returned buffer is owned by the caller, which releases it with
 * os_free() (as xml_get_base64_item() does). The text is copied as is; escape
 * sequences in it are not decoded.
 */
char * xml_get_first_item(const char *doc, const char *item)
{
	size_t match_len = os_strlen(item);
	const char *tag, *tagname, *end;
	char *value;

	/*
	 * This is crude: ignore any possible tag name conflicts and go right
	 * to the first tag of this name. This should be ok for the limited
	 * domain of UPnP messages.
	 */
	for (;;) {
		if (xml_next_tag(doc, &tag, &tagname, &end))
			return NULL;
		doc = end;
		/*
		 * Accept only an opening tag whose name is exactly item: the
		 * name must be followed by '>' or by a non-graphic character
		 * (e.g., the whitespace before attributes). isgraph() needs
		 * an argument representable as unsigned char, so convert
		 * explicitly; passing a negative plain char is undefined
		 * behavior.
		 */
		if (!os_strncasecmp(tagname, item, match_len) &&
		    *tag != '/' &&
		    (tagname[match_len] == '>' ||
		     !isgraph((unsigned char) tagname[match_len]))) {
			break;
		}
	}
	/* doc now points just past the matching tag; find end of its text */
	end = doc;
	while (*end && *end != '<')
		end++;
	/*
	 * One extra byte for the terminating NUL; os_zalloc() is relied on to
	 * zero the buffer, so no explicit termination is needed after the
	 * copy.
	 */
	value = os_zalloc(1 + (end - doc));
	if (value == NULL)
		return NULL;
	os_memcpy(value, doc, end - doc);
	return value;
}
222214501Srpaulo
223214501Srpaulo
224214501Srpaulostruct wpabuf * xml_get_base64_item(const char *data, const char *name,
225214501Srpaulo				    enum http_reply_code *ret)
226214501Srpaulo{
227214501Srpaulo	char *msg;
228214501Srpaulo	struct wpabuf *buf;
229214501Srpaulo	unsigned char *decoded;
230214501Srpaulo	size_t len;
231214501Srpaulo
232214501Srpaulo	msg = xml_get_first_item(data, name);
233214501Srpaulo	if (msg == NULL) {
234214501Srpaulo		*ret = UPNP_ARG_VALUE_INVALID;
235214501Srpaulo		return NULL;
236214501Srpaulo	}
237214501Srpaulo
238214501Srpaulo	decoded = base64_decode((unsigned char *) msg, os_strlen(msg), &len);
239214501Srpaulo	os_free(msg);
240214501Srpaulo	if (decoded == NULL) {
241214501Srpaulo		*ret = UPNP_OUT_OF_MEMORY;
242214501Srpaulo		return NULL;
243214501Srpaulo	}
244214501Srpaulo
245214501Srpaulo	buf = wpabuf_alloc_ext_data(decoded, len);
246214501Srpaulo	if (buf == NULL) {
247214501Srpaulo		os_free(decoded);
248214501Srpaulo		*ret = UPNP_OUT_OF_MEMORY;
249214501Srpaulo		return NULL;
250214501Srpaulo	}
251214501Srpaulo	return buf;
252214501Srpaulo}
253