1/* 2 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write to the Free Software 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 */ 18 19/* 20 * See the corresponding header file for a description of the functions 21 * that this file provides. 22 * 23 * This was first written for Ogg Vorbis but could be of general use. 24 * 25 * The only deliberate assumption about data sizes is that a short has 26 * at least 16 bits, but this code has only been tested on systems with 27 * 8-bit char, 16-bit short and 32-bit int. 28 */ 29 30#if HAVE_CONFIG_H 31# include <config.h> 32#endif 33 34#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */ 35 36#include <stdlib.h> 37 38#include "share/alloc.h" 39#include "charset.h" 40 41#include "charmaps.h" 42 43/* 44 * This is like the standard strcasecmp, but it does not depend 45 * on the locale. Locale-dependent functions can be dangerous: 46 * we once had a bug involving strcasecmp("iso", "ISO") in a 47 * Turkish locale! 48 * 49 * (I'm not really sure what the official standard says 50 * about the sign of strcasecmp("Z", "["), but usually 51 * we're only interested in whether it's zero.) 52 */ 53 54static int ascii_strcasecmp(const char *s1, const char *s2) 55{ 56 char c1, c2; 57 58 for (;; s1++, s2++) { 59 if (!*s1 || !*s1) 60 break; 61 if (*s1 == *s2) 62 continue; 63 c1 = *s1; 64 if ('a' <= c1 && c1 <= 'z') 65 c1 += 'A' - 'a'; 66 c2 = *s2; 67 if ('a' <= c2 && c2 <= 'z') 68 c2 += 'A' - 'a'; 69 if (c1 != c2) 70 break; 71 } 72 return (unsigned char)*s1 - (unsigned char)*s2; 73} 74 75/* 76 * UTF-8 equivalents of the C library's wctomb() and mbtowc(). 77 */ 78 79int utf8_mbtowc(int *pwc, const char *s, size_t n) 80{ 81 unsigned char c; 82 int wc, i, k; 83 84 if (!n || !s) 85 return 0; 86 87 c = *s; 88 if (c < 0x80) { 89 if (pwc) 90 *pwc = c; 91 return c ? 1 : 0; 92 } 93 else if (c < 0xc2) 94 return -1; 95 else if (c < 0xe0) { 96 if (n >= 2 && (s[1] & 0xc0) == 0x80) { 97 if (pwc) 98 *pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f); 99 return 2; 100 } 101 else 102 return -1; 103 } 104 else if (c < 0xf0) 105 k = 3; 106 else if (c < 0xf8) 107 k = 4; 108 else if (c < 0xfc) 109 k = 5; 110 else if (c < 0xfe) 111 k = 6; 112 else 113 return -1; 114 115 if (n < (size_t)k) 116 return -1; 117 wc = *s++ & ((1 << (7 - k)) - 1); 118 for (i = 1; i < k; i++) { 119 if ((*s & 0xc0) != 0x80) 120 return -1; 121 wc = (wc << 6) | (*s++ & 0x3f); 122 } 123 if (wc < (1 << (5 * k - 4))) 124 return -1; 125 if (pwc) 126 *pwc = wc; 127 return k; 128} 129 130int utf8_wctomb(char *s, int wc1) 131{ 132 unsigned int wc = wc1; 133 134 if (!s) 135 return 0; 136 if (wc < (1u << 7)) { 137 *s++ = wc; 138 return 1; 139 } 140 else if (wc < (1u << 11)) { 141 *s++ = 0xc0 | (wc >> 6); 142 *s++ = 0x80 | (wc & 0x3f); 143 return 2; 144 } 145 else if (wc < (1u << 16)) { 146 *s++ = 0xe0 | (wc >> 12); 147 *s++ = 0x80 | ((wc >> 6) & 0x3f); 148 *s++ = 0x80 | (wc & 0x3f); 149 return 3; 150 } 151 else if (wc < (1u << 21)) { 152 *s++ = 0xf0 | (wc >> 18); 153 *s++ = 0x80 | ((wc >> 12) & 0x3f); 154 *s++ = 0x80 | ((wc >> 6) & 0x3f); 155 *s++ = 0x80 | (wc & 0x3f); 156 return 4; 157 } 158 else if (wc < (1u << 26)) { 159 *s++ = 0xf8 | (wc >> 24); 160 *s++ = 0x80 | ((wc >> 18) & 0x3f); 161 *s++ = 0x80 | ((wc >> 12) & 0x3f); 162 *s++ = 0x80 | ((wc >> 6) & 0x3f); 163 *s++ = 0x80 | (wc & 0x3f); 164 return 5; 165 } 166 else if (wc < (1u << 31)) { 167 *s++ = 0xfc | (wc >> 30); 168 *s++ = 0x80 | ((wc >> 24) & 0x3f); 169 *s++ = 0x80 | ((wc >> 18) & 0x3f); 170 *s++ = 0x80 | ((wc >> 12) & 0x3f); 171 *s++ = 0x80 | ((wc >> 6) & 0x3f); 172 *s++ = 0x80 | (wc & 0x3f); 173 return 6; 174 } 175 else 176 return -1; 177} 178 179/* 180 * The charset "object" and methods. 181 */ 182 183struct charset { 184 int max; 185 int (*mbtowc)(void *table, int *pwc, const char *s, size_t n); 186 int (*wctomb)(void *table, char *s, int wc); 187 void *map; 188}; 189 190int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n) 191{ 192 return (*charset->mbtowc)(charset->map, pwc, s, n); 193} 194 195int charset_wctomb(struct charset *charset, char *s, int wc) 196{ 197 return (*charset->wctomb)(charset->map, s, wc); 198} 199 200int charset_max(struct charset *charset) 201{ 202 return charset->max; 203} 204 205/* 206 * Implementation of UTF-8. 207 */ 208 209static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n) 210{ 211 (void)map; 212 return utf8_mbtowc(pwc, s, n); 213} 214 215static int wctomb_utf8(void *map, char *s, int wc) 216{ 217 (void)map; 218 return utf8_wctomb(s, wc); 219} 220 221/* 222 * Implementation of US-ASCII. 223 * Probably on most architectures this compiles to less than 256 bytes 224 * of code, so we can save space by not having a table for this one. 225 */ 226 227static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n) 228{ 229 int wc; 230 231 (void)map; 232 if (!n || !s) 233 return 0; 234 wc = (unsigned char)*s; 235 if (wc & ~0x7f) 236 return -1; 237 if (pwc) 238 *pwc = wc; 239 return wc ? 1 : 0; 240} 241 242static int wctomb_ascii(void *map, char *s, int wc) 243{ 244 (void)map; 245 if (!s) 246 return 0; 247 if (wc & ~0x7f) 248 return -1; 249 *s = wc; 250 return 1; 251} 252 253/* 254 * Implementation of ISO-8859-1. 255 * Probably on most architectures this compiles to less than 256 bytes 256 * of code, so we can save space by not having a table for this one. 257 */ 258 259static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n) 260{ 261 int wc; 262 263 (void)map; 264 if (!n || !s) 265 return 0; 266 wc = (unsigned char)*s; 267 if (wc & ~0xff) 268 return -1; 269 if (pwc) 270 *pwc = wc; 271 return wc ? 1 : 0; 272} 273 274static int wctomb_iso1(void *map, char *s, int wc) 275{ 276 (void)map; 277 if (!s) 278 return 0; 279 if (wc & ~0xff) 280 return -1; 281 *s = wc; 282 return 1; 283} 284 285/* 286 * Implementation of any 8-bit charset. 287 */ 288 289struct map { 290 const unsigned short *from; 291 struct inverse_map *to; 292}; 293 294static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n) 295{ 296 struct map *map = map1; 297 unsigned short wc; 298 299 if (!n || !s) 300 return 0; 301 wc = map->from[(unsigned char)*s]; 302 if (wc == 0xffff) 303 return -1; 304 if (pwc) 305 *pwc = (int)wc; 306 return wc ? 1 : 0; 307} 308 309/* 310 * For the inverse map we use a hash table, which has the advantages 311 * of small constant memory requirement and simple memory allocation, 312 * but the disadvantage of slow conversion in the worst case. 313 * If you need real-time performance while letting a potentially 314 * malicious user define their own map, then the method used in 315 * linux/drivers/char/consolemap.c would be more appropriate. 316 */ 317 318struct inverse_map { 319 unsigned char first[256]; 320 unsigned char next[256]; 321}; 322 323/* 324 * The simple hash is good enough for this application. 325 * Use the alternative trivial hashes for testing. 326 */ 327#define HASH(i) ((i) & 0xff) 328/* #define HASH(i) 0 */ 329/* #define HASH(i) 99 */ 330 331static struct inverse_map *make_inverse_map(const unsigned short *from) 332{ 333 struct inverse_map *to; 334 char used[256]; 335 int i, j, k; 336 337 to = (struct inverse_map *)malloc(sizeof(struct inverse_map)); 338 if (!to) 339 return 0; 340 for (i = 0; i < 256; i++) 341 to->first[i] = to->next[i] = used[i] = 0; 342 for (i = 255; i >= 0; i--) 343 if (from[i] != 0xffff) { 344 k = HASH(from[i]); 345 to->next[i] = to->first[k]; 346 to->first[k] = i; 347 used[k] = 1; 348 } 349 350 /* Point the empty buckets at an empty list. */ 351 for (i = 0; i < 256; i++) 352 if (!to->next[i]) 353 break; 354 if (i < 256) 355 for (j = 0; j < 256; j++) 356 if (!used[j]) 357 to->first[j] = i; 358 359 return to; 360} 361 362int wctomb_8bit(void *map1, char *s, int wc1) 363{ 364 struct map *map = map1; 365 unsigned short wc = wc1; 366 int i; 367 368 if (!s) 369 return 0; 370 371 if (wc1 & ~0xffff) 372 return -1; 373 374 if (1) /* Change 1 to 0 to test the case where malloc fails. */ 375 if (!map->to) 376 map->to = make_inverse_map(map->from); 377 378 if (map->to) { 379 /* Use the inverse map. */ 380 i = map->to->first[HASH(wc)]; 381 for (;;) { 382 if (map->from[i] == wc) { 383 *s = i; 384 return 1; 385 } 386 if (!(i = map->to->next[i])) 387 break; 388 } 389 } 390 else { 391 /* We don't have an inverse map, so do a linear search. */ 392 for (i = 0; i < 256; i++) 393 if (map->from[i] == wc) { 394 *s = i; 395 return 1; 396 } 397 } 398 399 return -1; 400} 401 402/* 403 * The "constructor" charset_find(). 404 */ 405 406struct charset charset_utf8 = { 407 6, 408 &mbtowc_utf8, 409 &wctomb_utf8, 410 0 411}; 412 413struct charset charset_iso1 = { 414 1, 415 &mbtowc_iso1, 416 &wctomb_iso1, 417 0 418}; 419 420struct charset charset_ascii = { 421 1, 422 &mbtowc_ascii, 423 &wctomb_ascii, 424 0 425}; 426 427struct charset *charset_find(const char *code) 428{ 429 int i; 430 431 /* Find good (MIME) name. */ 432 for (i = 0; names[i].bad; i++) 433 if (!ascii_strcasecmp(code, names[i].bad)) { 434 code = names[i].good; 435 break; 436 } 437 438 /* Recognise some charsets for which we avoid using a table. */ 439 if (!ascii_strcasecmp(code, "UTF-8")) 440 return &charset_utf8; 441 if (!ascii_strcasecmp(code, "US-ASCII")) 442 return &charset_ascii; 443 if (!ascii_strcasecmp(code, "ISO-8859-1")) 444 return &charset_iso1; 445 446 /* Look for a mapping for a simple 8-bit encoding. */ 447 for (i = 0; maps[i].name; i++) 448 if (!ascii_strcasecmp(code, maps[i].name)) { 449 if (!maps[i].charset) { 450 maps[i].charset = (struct charset *)malloc(sizeof(struct charset)); 451 if (maps[i].charset) { 452 struct map *map = (struct map *)malloc(sizeof(struct map)); 453 if (!map) { 454 free(maps[i].charset); 455 maps[i].charset = 0; 456 } 457 else { 458 maps[i].charset->max = 1; 459 maps[i].charset->mbtowc = &mbtowc_8bit; 460 maps[i].charset->wctomb = &wctomb_8bit; 461 maps[i].charset->map = map; 462 map->from = maps[i].map; 463 map->to = 0; /* inverse mapping is created when required */ 464 } 465 } 466 } 467 return maps[i].charset; 468 } 469 470 return 0; 471} 472 473/* 474 * Function to convert a buffer from one encoding to another. 475 * Invalid bytes are replaced by '#', and characters that are 476 * not available in the target encoding are replaced by '?'. 477 * Each of TO and TOLEN may be zero, if the result is not needed. 478 * The output buffer is null-terminated, so it is all right to 479 * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0). 480 */ 481 482int charset_convert(const char *fromcode, const char *tocode, 483 const char *from, size_t fromlen, 484 char **to, size_t *tolen) 485{ 486 int ret = 0; 487 struct charset *charset1, *charset2; 488 char *tobuf, *p, *newbuf; 489 int i, j, wc; 490 491 charset1 = charset_find(fromcode); 492 charset2 = charset_find(tocode); 493 if (!charset1 || !charset2 ) 494 return -1; 495 496 tobuf = (char *)safe_malloc_mul2add_(fromlen, /*times*/charset2->max, /*+*/1); 497 if (!tobuf) 498 return -2; 499 500 for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) { 501 i = charset_mbtowc(charset1, &wc, from, fromlen); 502 if (!i) 503 i = 1; 504 else if (i == -1) { 505 i = 1; 506 wc = '#'; 507 ret = 2; 508 } 509 j = charset_wctomb(charset2, p, wc); 510 if (j == -1) { 511 if (!ret) 512 ret = 1; 513 j = charset_wctomb(charset2, p, '?'); 514 if (j == -1) 515 j = 0; 516 } 517 } 518 519 if (tolen) 520 *tolen = p - tobuf; 521 *p++ = '\0'; 522 if (to) { 523 newbuf = realloc(tobuf, p - tobuf); 524 *to = newbuf ? newbuf : tobuf; 525 } 526 else 527 free(tobuf); 528 529 return ret; 530} 531 532#endif /* USE_CHARSET_ICONV */ 533