1/* 2 * Copyright 2004-2008, Fran��ois Revol, <revol@free.fr>. 3 * Distributed under the terms of the MIT License. 4 */ 5 6#include <ctype.h> 7#include <malloc.h> 8#include <string.h> 9#include "string_utils.h" 10 11//#define TESTME 12 13#ifdef _KERNEL_MODE 14#define printf dprintf 15#undef TESTME 16#endif 17 18 19 20char *urlify_string(const char *str) 21{ 22 char *dst, *d; 23 const char *p; 24 const char *allowed = "abcdefghijklmnopqrstuvwxyz" \ 25 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \ 26 "0123456789" \ 27 "-_.!~*'()"; /* cf. RFC 2396 */ 28 const char *hex = "0123456789ABCDEF"; 29 if (!str) 30 return NULL; 31 // hacky, but safe 32 dst = malloc(strlen(str)*3); 33 if (!dst) 34 return NULL; 35 for (p = str, d = dst; *p; p++) { 36 if (strchr(allowed, *p)) 37 *d++ = *p; 38 else if (*p == ' ') { 39 *d++ = '+'; 40 } else { 41 /* use hex value */ 42 *d++ = '%'; 43 *d++ = hex[(*(unsigned char *)p >> 4) & 0x0F]; 44 *d++ = hex[(*(unsigned char *)p) & 0x0F]; 45 } 46 } 47 *d = '\0'; 48 return dst; 49} 50 51// cf. 
// http://www.htmlhelp.com/reference/html40/entities/

/*
 * Named entities understood by unentitify_string(), as
 * { name, UTF-8 replacement } pairs; a { NULL, NULL } entry ends the table.
 * Every replacement must be no longer than "&name;" itself, because
 * unentitify_string() sizes its output from the length of its input.
 */
static const char *entities_tab[][2] = {
{ "lt", "<" },
{ "gt", ">" },
{ "amp", "&" },
{ "nbsp", " " },
{ "quot", "\"" },
{ "raquo", "»" },
{ "laquo", "«" },
{ "ccedil", "ç" },
// grave
{ "agrave", "à" },
{ "egrave", "è" },
// acute
{ "aacute", "á" },
{ "eacute", "é" },
// circ
{ "acirc", "â" },
{ "ecirc", "ê" },
{ "icirc", "î" },
{ "ocirc", "ô" },
{ "ucirc", "û" },
{ "copy", "©" },
{ "trade", "™" },
//{ "", "" },
{ NULL, NULL },
};

/* Returns the value of the hexadecimal digit c (either case), or -1. */
static int hex_digit_value(char c)
{
	static const char digits[] = "0123456789abcdef";
	const char *pos;

	/* strchr() would otherwise match the terminator of digits */
	if (c == '\0')
		return -1;
	pos = strchr(digits, tolower((unsigned char)c));
	return pos ? (int)(pos - digits) : -1;
}

/*
 * Decodes a numeric reference of the form "&#nn;", "&#nnn;" or "&#xhh;"
 * whose '&' is at p[0]. On success stores the decoded byte in *out and
 * returns the offset of the closing ';' relative to p. Returns 0 when p
 * does not start such a reference, or when the value is 0 or does not fit
 * in a single byte.
 *
 * Each test only reads the next character after the previous one has been
 * seen to be non-NUL, so the scan never runs past the end of the string.
 */
static size_t decode_numeric_entity(const char *p, unsigned char *out)
{
	int value = -1;
	size_t semicolon = 0;

	if (p[1] != '#')
		return 0;

	if (isdigit((unsigned char)p[2]) && isdigit((unsigned char)p[3])) {
		if (p[4] == ';') {
			/* &#nn; */
			value = (p[2] - '0') * 10 + (p[3] - '0');
			semicolon = 4;
		} else if (isdigit((unsigned char)p[4]) && p[5] == ';') {
			/* &#nnn; */
			value = (p[2] - '0') * 100 + (p[3] - '0') * 10 + (p[4] - '0');
			semicolon = 5;
		}
	} else if (p[2] == 'x' && hex_digit_value(p[3]) >= 0
		&& hex_digit_value(p[4]) >= 0 && p[5] == ';') {
		/* &#xhh; */
		value = (hex_digit_value(p[3]) << 4) | hex_digit_value(p[4]);
		semicolon = 5;
	}

	/* a NUL would truncate the result; anything above 255 is not a byte */
	if (value < 1 || value > 255)
		return 0;

	*out = (unsigned char)value;
	return semicolon;
}

/*
 * Returns a newly allocated copy of str with HTML entities decoded.
 *
 * Numeric references ("&#nn;", "&#nnn;", "&#xhh;") are converted to the
 * single byte with that value; the result is not checked for being a valid
 * multibyte UTF-8 sequence. Named references listed in entities_tab are
 * replaced by their UTF-8 text. Anything else, including unknown or
 * unterminated references and numeric values outside 1..255, is copied
 * through unchanged.
 *
 * Returns NULL if str is NULL or the allocation fails. The caller releases
 * the result with free().
 */
char *unentitify_string(const char *str)
{
	char *dst, *d;
	const char *p;
	int i;

	if (!str)
		return NULL;
	/* decoding never grows the text, so the input length is an upper bound */
	dst = malloc(strlen(str) + 1);
	if (!dst)
		return NULL;

	for (p = str, d = dst; *p; p++) {
		unsigned char byte;
		size_t skip;

		if (*p != '&') {
			*d++ = *p;
			continue;
		}

		skip = decode_numeric_entity(p, &byte);
		if (skip) {
			*d++ = (char)byte;
			p += skip; /* now on ';', the loop increment steps past it */
			continue;
		}

		/* look for "&name;" with name taken from the table */
		for (i = 0; entities_tab[i][0]; i++) {
			size_t name_len = strlen(entities_tab[i][0]);

			if (!strncmp(p + 1, entities_tab[i][0], name_len)
				&& p[1 + name_len] == ';')
				break;
		}

		if (entities_tab[i][0]) {
			size_t repl_len = strlen(entities_tab[i][1]);

			memcpy(d, entities_tab[i][1], repl_len);
			d += repl_len;
			p += strlen(entities_tab[i][0]) + 1;
		} else {
			/* not found: keep the '&', the rest is copied as plain text */
			*d++ = '&';
		}
	}
	*d = '\0';
	return dst;
}

#ifdef TESTME
/* Manual test driver: decodes argv[1] and prints the result in quotes. */
int main(int argc, char **argv)
{
	char *p;
	if (argc < 2)
		return 1;
	p = unentitify_string(argv[1]);
	if (!p)
		return 1;
	printf("'%s'\n", p);
	free(p);
	/* NOTE(review): presumably exercises the allocator after the call to
	 * surface heap corruption -- confirm intent */
	free(malloc(10));
	return 0;
}
#endif