1/*
 * Copyright 2004-2008, François Revol, <revol@free.fr>.
3 * Distributed under the terms of the MIT License.
4 */
5
6#include <ctype.h>
7#include <malloc.h>
8#include <string.h>
9#include "string_utils.h"
10
/* Uncomment to compile the stand-alone test main() guarded by TESTME at the
 * end of this file. */
//#define TESTME

#ifdef _KERNEL_MODE
/* When _KERNEL_MODE is defined, printf is redirected to dprintf and the
 * test driver is forced off. */
#define printf dprintf
#undef TESTME
#endif
17
18
19
/*
 * Encode a string for use in a URL.
 *
 * Bytes from the RFC 2396 "unreserved" set are copied unchanged, a space
 * becomes '+', and every other byte is written as "%XX" using upper-case
 * hexadecimal digits.
 *
 * str: NUL-terminated input; may be NULL.
 *
 * Returns a newly malloc()ed NUL-terminated string that the caller must
 * free(), or NULL when str is NULL, the required size would overflow
 * size_t, or the allocation fails.
 */
char *urlify_string(const char *str)
{
	static const char allowed[] = "abcdefghijklmnopqrstuvwxyz"
								  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
								  "0123456789"
								  "-_.!~*'()"; /* cf. RFC 2396 */
	static const char hex[] = "0123456789ABCDEF";
	const unsigned char *p;
	char *dst, *d;
	size_t len;

	if (!str)
		return NULL;

	len = strlen(str);
	/* Worst case: every byte expands to three ("%XX"), plus the NUL. */
	if (len > ((size_t)-1 - 1) / 3)
		return NULL;
	dst = malloc(len * 3 + 1);
	if (!dst)
		return NULL;

	/* Walk the input as unsigned bytes so the shifts below never see a
	 * negative value. */
	for (p = (const unsigned char *)str, d = dst; *p; p++) {
		if (strchr(allowed, *p)) {
			*d++ = (char)*p;
		} else if (*p == ' ') {
			*d++ = '+';
		} else {
			/* use hex value */
			*d++ = '%';
			*d++ = hex[(*p >> 4) & 0x0F];
			*d++ = hex[*p & 0x0F];
		}
	}
	*d = '\0';
	return dst;
}
50
// cf. http://www.htmlhelp.com/reference/html40/entities/

/*
 * Named character references known to this file, stored as
 * { name, replacement } pairs and terminated by a { NULL, NULL } sentinel.
 *
 * Non-ASCII replacements are written as explicit UTF-8 byte escapes so the
 * table does not depend on the encoding this source file is saved in.
 * "laquo" and "aacute" are not mapped yet.
 */
static const char *entities_tab[][2] = {
	{ "lt", "<" },
	{ "gt", ">" },
	{ "amp", "&" },
	{ "nbsp", " " },
	{ "quot", "\"" },
	{ "raquo", "\xC2\xBB" },		/* U+00BB */
	{ "ccedil", "\xC3\xA7" },		/* U+00E7 */
	// grave
	{ "agrave", "\xC3\xA0" },		/* U+00E0 */
	{ "egrave", "\xC3\xA8" },		/* U+00E8 */
	// acute
	{ "eacute", "\xC3\xA9" },		/* U+00E9 */
	// circ
	{ "acirc", "\xC3\xA2" },		/* U+00E2 */
	{ "ecirc", "\xC3\xAA" },		/* U+00EA */
	{ "icirc", "\xC3\xAE" },		/* U+00EE */
	{ "ocirc", "\xC3\xB4" },		/* U+00F4 */
	{ "ucirc", "\xC3\xBB" },		/* U+00FB */
	{ "copy", "\xC2\xA9" },			/* U+00A9 */
	{ "trade", "\xE2\x84\xA2" },	/* U+2122 */
	{ NULL, NULL },
};
79
80char *unentitify_string(const char *str)
81{
82	char *dst, *d;
83	const char *p;
84	const char *hex = "0123456789abcdef";
85	int i;
86	if (!str)
87		return NULL;
88	// hacky, but safe
89	dst = malloc(strlen(str)+2);
90	if (!dst)
91		return NULL;
92	for (p = str, d = dst; *p; p++) {
93		if (*p != '&')
94			*d++ = *p;
95		/* those case convert to binary, but won't check for valid multibyte UTF-8 sequences */
96		else if ((p[1] == '#') && p[2] && p[3] && (p[4] == ';') &&
97				isdigit(p[2]) &&
98				isdigit(p[3])) {
99			/* &#nn; */
100			char c = ((p[2]) - '0') * 10 +
101					 ((p[3]) - '0');
102			*d++ = c;
103			p += 4;
104		} else if ((p[1] == '#') && p[2] && p[3] && p[4] && (p[5] == ';') &&
105				isdigit(p[2]) &&
106				isdigit(p[3]) &&
107				isdigit(p[4])) {
108			/* &#nnn; */
109			char c = ((p[2]) - '0') * 100 +
110					 ((p[3]) - '0') * 10 +
111					 ((p[4]) - '0');
112			*d++ = c;
113			p += 5;
114		} else if ((p[1] == '#') && (p[2] == 'x') && p[3] && p[4] && (p[5] == ';') &&
115				strchr(hex, tolower(p[3])) &&
116				strchr(hex, tolower(p[4]))) {
117			/* &#xnn; */
118			char c = (strchr(hex, tolower(p[3])) - hex) << 4 |
119					 (strchr(hex, tolower(p[4])) - hex);
120			*d++ = c;
121			p += 5;
122		} else {
123			char buf[20];
124			strncpy(buf, p+1, 20);
125			buf[19] = '\0';
126			if (!strchr(buf, ';')) {
127				*d++ = *p;
128				continue;
129			}
130			*(strchr(buf, ';')) = '\0';
131			for (i = 0; entities_tab[i][0]; i++) {
132				if (!strcmp(buf, entities_tab[i][0])) {
133					strcpy(d, entities_tab[i][1]);
134					d += strlen(d);
135					p += strlen(entities_tab[i][0]) + 1;
136					break;
137				}
138			}
139			if (!entities_tab[i][0]) /* not found */
140				*d++ = '&';
141		}
142	}
143	*d = '\0';
144	return dst;
145}
146
#ifdef TESTME
/*
 * Stand-alone test driver: decodes the entities in argv[1] with
 * unentitify_string() and prints the result between single quotes.
 *
 * Returns 0 on success, 1 when no argument was given or decoding failed.
 */
int main(int argc, char **argv)
{
	char *decoded;

	if (argc < 2)
		return 1;
	decoded = unentitify_string(argv[1]);
	/* unentitify_string() returns NULL on allocation failure; passing
	 * that to "%s" would be undefined. */
	if (!decoded)
		return 1;
	printf("'%s'\n", decoded);
	free(decoded);
	/* NOTE(review): presumably a heap sanity probe, so a debugging
	 * allocator can notice corruption left by the call above - confirm. */
	free(malloc(10));
	return 0;
}
#endif
160