1/* 2 * Copyright 2004-2008, Fran��ois Revol, <revol@free.fr>. 3 * Distributed under the terms of the MIT License. 4 */ 5 6#include <malloc.h> 7#include <string.h> 8#include "string_utils.h" 9 10//#define TESTME 11 12#ifdef _KERNEL_MODE 13#define printf dprintf 14#undef TESTME 15#endif 16 17 18 19char *urlify_string(const char *str) 20{ 21 char *dst, *d; 22 const char *p; 23 const char *allowed = "abcdefghijklmnopqrstuvwxyz" \ 24 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \ 25 "0123456789" \ 26 "-_.!~*'()"; /* cf. RFC 2396 */ 27 const char *hex = "0123456789ABCDEF"; 28 if (!str) 29 return NULL; 30 // hacky, but safe 31 dst = malloc(strlen(str)*3); 32 if (!dst) 33 return NULL; 34 for (p = str, d = dst; *p; p++) { 35 if (strchr(allowed, *p)) 36 *d++ = *p; 37 else if (*p == ' ') { 38 *d++ = '+'; 39 } else { 40 /* use hex value */ 41 *d++ = '%'; 42 *d++ = hex[(*(unsigned char *)p >> 4) & 0x0F]; 43 *d++ = hex[(*(unsigned char *)p) & 0x0F]; 44 } 45 } 46 *d = '\0'; 47 return dst; 48} 49 50// cf. http://www.htmlhelp.com/reference/html40/entities/ 51 52static const char *entities_tab[][2] = { 53{ "lt", "<" }, 54{ "gt", ">" }, 55{ "amp", "&" }, 56{ "nbsp", " " }, 57{ "quot", "\"" }, 58{ "raquo", "��" }, 59//{ "laquo", "" }, 60{ "ccedil", "��" }, 61// grave 62{ "agrave", "��" }, 63{ "egrave", "��" }, 64// acute 65//{ "aacute", "" }, 66{ "eacute", "��" }, 67// circ 68{ "acirc", "��" }, 69{ "ecirc", "��" }, 70{ "icirc", "��" }, 71{ "ocirc", "��" }, 72{ "ucirc", "��" }, 73{ "copy", "��" }, 74{ "trade", "���" }, 75//{ "", "" }, 76{ NULL, NULL }, 77}; 78 79char *unentitify_string(const char *str) 80{ 81 char *dst, *d; 82 const char *p; 83 const char *hex = "0123456789abcdef"; 84 int i; 85 if (!str) 86 return NULL; 87 // hacky, but safe 88 dst = malloc(strlen(str)+2); 89 if (!dst) 90 return NULL; 91 for (p = str, d = dst; *p; p++) { 92 if (*p != '&') 93 *d++ = *p; 94 /* those case convert to binary, but won't check for valid multibyte UTF-8 sequences */ 95 else if ((p[1] == '#') && p[2] && p[3] && (p[4] == ';') && 96 isdigit(p[2]) && 97 isdigit(p[3])) { 98 /* &#nn; */ 99 char c = ((p[2]) - '0') * 10 + 100 ((p[3]) - '0'); 101 *d++ = c; 102 p += 4; 103 } else if ((p[1] == '#') && p[2] && p[3] && p[4] && (p[5] == ';') && 104 isdigit(p[2]) && 105 isdigit(p[3]) && 106 isdigit(p[4])) { 107 /* &#nnn; */ 108 char c = ((p[2]) - '0') * 100 + 109 ((p[3]) - '0') * 10 + 110 ((p[4]) - '0'); 111 *d++ = c; 112 p += 5; 113 } else if ((p[1] == '#') && (p[2] == 'x') && p[3] && p[4] && (p[5] == ';') && 114 strchr(hex, tolower(p[3])) && 115 strchr(hex, tolower(p[4]))) { 116 /* &#xnn; */ 117 char c = (strchr(hex, tolower(p[3])) - hex) << 4 | 118 (strchr(hex, tolower(p[4])) - hex); 119 *d++ = c; 120 p += 5; 121 } else { 122 char buf[20]; 123 strncpy(buf, p+1, 20); 124 buf[19] = '\0'; 125 if (!strchr(buf, ';')) { 126 *d++ = *p; 127 continue; 128 } 129 *(strchr(buf, ';')) = '\0'; 130 for (i = 0; entities_tab[i][0]; i++) { 131 if (!strcmp(buf, entities_tab[i][0])) { 132 strcpy(d, entities_tab[i][1]); 133 d += strlen(d); 134 p += strlen(entities_tab[i][0]) + 1; 135 break; 136 } 137 } 138 if (!entities_tab[i][0]) /* not found */ 139 *d++ = '&'; 140 } 141 } 142 *d = '\0'; 143 return dst; 144} 145 146#ifdef TESTME 147int main(int argc, char **argv) 148{ 149 char *p; 150 if (argc < 2) 151 return 1; 152 p = unentitify_string(argv[1]); 153 printf("'%s'\n", p); 154 free(p); 155 free(malloc(10)); 156 return 0; 157} 158#endif 159