/*
 * Copyright 2004-2008, François Revol, <revol@free.fr>.
 * Distributed under the terms of the MIT License.
 */
5
6#include <malloc.h>
7#include <string.h>
8#include "string_utils.h"
9
10//#define TESTME
11
12#ifdef _KERNEL_MODE
13#define printf dprintf
14#undef TESTME
15#endif
16
17
18
/*
 * URL-encode a string.
 *
 * ASCII letters, digits and the RFC 2396 "mark" characters -_.!~*'() are
 * copied unchanged, a space becomes '+', and every other byte is written
 * as %XX using uppercase hexadecimal digits.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller must
 * free(). Returns NULL if str is NULL, if the output size cannot be
 * represented in a size_t, or if the allocation fails.
 */
char *urlify_string(const char *str)
{
	static const char allowed[] = "abcdefghijklmnopqrstuvwxyz"
								  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
								  "0123456789"
								  "-_.!~*'()"; /* cf. RFC 2396 */
	static const char hex[] = "0123456789ABCDEF";
	char *dst, *d;
	const char *p;
	size_t len;

	if (!str)
		return NULL;
	len = strlen(str);
	/* worst case: every byte becomes "%XX", plus one byte for the NUL */
	if (len > ((size_t)-1 - 1) / 3)
		return NULL;
	dst = malloc(len * 3 + 1);
	if (!dst)
		return NULL;
	for (p = str, d = dst; *p; p++) {
		/* work on the unsigned value so bytes >= 0x80 index hex[] safely */
		unsigned char c = (unsigned char)*p;

		if (strchr(allowed, *p)) {
			*d++ = *p;
		} else if (*p == ' ') {
			*d++ = '+';
		} else {
			/* use hex value */
			*d++ = '%';
			*d++ = hex[c >> 4];
			*d++ = hex[c & 0x0F];
		}
	}
	*d = '\0';
	return dst;
}
49
// cf. http://www.htmlhelp.com/reference/html40/entities/

/*
 * Named entities understood by unentitify_string(), as { name, text }.
 * Non-ASCII replacement text is UTF-8, spelled as byte escapes so that it
 * does not depend on the encoding of this source file.
 * A replacement must never be longer than its "&name;" form, because the
 * decoder sizes its output buffer from the input length.
 * The table ends with a { NULL, NULL } sentinel.
 */
static const char *entities_tab[][2] = {
	{ "lt", "<" },
	{ "gt", ">" },
	{ "amp", "&" },
	{ "nbsp", " " },
	{ "quot", "\"" },
	{ "raquo", "\xC2\xBB" },
	{ "laquo", "\xC2\xAB" },
	{ "ccedil", "\xC3\xA7" },
	// grave
	{ "agrave", "\xC3\xA0" },
	{ "egrave", "\xC3\xA8" },
	// acute
	{ "aacute", "\xC3\xA1" },
	{ "eacute", "\xC3\xA9" },
	// circ
	{ "acirc", "\xC3\xA2" },
	{ "ecirc", "\xC3\xAA" },
	{ "icirc", "\xC3\xAE" },
	{ "ocirc", "\xC3\xB4" },
	{ "ucirc", "\xC3\xBB" },
	{ "copy", "\xC2\xA9" },
	{ "trade", "\xE2\x84\xA2" },
	{ NULL, NULL },
};

/* Longest entity name (text between '&' and ';') that is looked up. */
enum { ENTITY_NAME_MAX = 18 };

/* Value of an ASCII decimal digit, or -1 if c is not one. */
static int entity_dec_digit(char c)
{
	return (c >= '0' && c <= '9') ? c - '0' : -1;
}

/* Value of an ASCII hexadecimal digit (either case), or -1 if c is not one. */
static int entity_hex_digit(char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

/*
 * Try to decode a numeric character reference starting at p, which must
 * point at the '&'. Recognised forms are &#nn;, &#nnn; and &#xhh; (or
 * &#Xhh;). On success the byte value is stored in *out and the number of
 * input characters consumed, including '&' and ';', is returned.
 * Returns 0 if p is not such a reference or if the value exceeds 255.
 * Each character is tested before the next one is read, so the scan never
 * runs past the terminating NUL.
 */
static size_t entity_decode_numeric(const char *p, unsigned char *out)
{
	int a, b, c;

	if (p[1] != '#')
		return 0;

	if (p[2] == 'x' || p[2] == 'X') {
		/* &#xhh; */
		a = entity_hex_digit(p[3]);
		if (a < 0)
			return 0;
		b = entity_hex_digit(p[4]);
		if (b < 0 || p[5] != ';')
			return 0;
		*out = (unsigned char)((a << 4) | b);
		return 6;
	}

	a = entity_dec_digit(p[2]);
	if (a < 0)
		return 0;
	b = entity_dec_digit(p[3]);
	if (b < 0)
		return 0;
	if (p[4] == ';') {
		/* &#nn; */
		*out = (unsigned char)(a * 10 + b);
		return 5;
	}
	c = entity_dec_digit(p[4]);
	if (c < 0 || p[5] != ';')
		return 0;
	/* &#nnn; -- only values that fit in one byte can be emitted */
	if (a * 100 + b * 10 + c > 255)
		return 0;
	*out = (unsigned char)(a * 100 + b * 10 + c);
	return 6;
}

/*
 * Look up the entity whose name is the first len characters of name.
 * Returns the replacement text, or NULL if the name is not in the table.
 */
static const char *entity_lookup(const char *name, size_t len)
{
	int i;

	for (i = 0; entities_tab[i][0]; i++) {
		if (strncmp(name, entities_tab[i][0], len) == 0
			&& entities_tab[i][0][len] == '\0')
			return entities_tab[i][1];
	}
	return NULL;
}

/*
 * Decode HTML character references in a string.
 *
 * Handles the named entities listed in entities_tab and the numeric forms
 * &#nn;, &#nnn; and &#xhh;. A numeric reference is emitted as one raw byte
 * with that value; it is not re-encoded, and no check is made that the
 * resulting bytes form valid UTF-8. Anything that is not recognised
 * (unknown name, missing ';', numeric value above 255) is copied through
 * unchanged.
 *
 * Returns a newly malloc()ed, NUL-terminated string that the caller must
 * free(), or NULL if str is NULL or the allocation fails.
 */
char *unentitify_string(const char *str)
{
	char *dst, *d;
	const char *p;

	if (!str)
		return NULL;
	/* decoding never produces more bytes than it consumes */
	dst = malloc(strlen(str) + 1);
	if (!dst)
		return NULL;
	for (p = str, d = dst; *p; p++) {
		unsigned char byte;
		size_t used, len;
		const char *name, *text;

		if (*p != '&') {
			*d++ = *p;
			continue;
		}

		used = entity_decode_numeric(p, &byte);
		if (used) {
			*d++ = (char)byte;
			/* leave p on the ';', the loop increment steps past it */
			p += used - 1;
			continue;
		}

		/* named entity: find the ';' within the first few characters */
		name = p + 1;
		len = 0;
		while (len < ENTITY_NAME_MAX && name[len] != '\0' && name[len] != ';')
			len++;
		text = (name[len] == ';') ? entity_lookup(name, len) : NULL;
		if (!text) {
			/* not an entity we know: keep the '&' and carry on after it */
			*d++ = *p;
			continue;
		}
		len = strlen(text);
		memcpy(d, text, len);
		d += len;
		/* leave p on the ';', the loop increment steps past it */
		p = strchr(name, ';');
	}
	*d = '\0';
	return dst;
}
145
#ifdef TESTME
#include <stdio.h>

/*
 * Stand-alone test driver, built only when TESTME is defined.
 * Decodes the entities in argv[1] and prints the result between single
 * quotes. Returns 1 if no argument was given or decoding failed,
 * 0 otherwise.
 */
int main(int argc, char **argv)
{
	char *p;
	if (argc < 2)
		return 1;
	p = unentitify_string(argv[1]);
	/* unentitify_string() returns NULL on allocation failure */
	if (!p)
		return 1;
	printf("'%s'\n", p);
	free(p);
	/* NOTE(review): presumably exercises the allocator once more so that
	 * heap corruption caused by the decoder shows up here -- confirm */
	free(malloc(10));
	return 0;
}
#endif
159