1/*
2 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19/*
20 * See the corresponding header file for a description of the functions
21 * that this file provides.
22 *
23 * This was first written for Ogg Vorbis but could be of general use.
24 *
25 * The only deliberate assumption about data sizes is that a short has
26 * at least 16 bits, but this code has only been tested on systems with
27 * 8-bit char, 16-bit short and 32-bit int.
28 */
29
30#if HAVE_CONFIG_H
31#  include <config.h>
32#endif
33
34#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
35
36#include <stdlib.h>
37
38#include "share/alloc.h"
39#include "charset.h"
40
41#include "charmaps.h"
42
43/*
44 * This is like the standard strcasecmp, but it does not depend
45 * on the locale. Locale-dependent functions can be dangerous:
46 * we once had a bug involving strcasecmp("iso", "ISO") in a
47 * Turkish locale!
48 *
49 * (I'm not really sure what the official standard says
50 * about the sign of strcasecmp("Z", "["), but usually
51 * we're only interested in whether it's zero.)
52 */
53
/*
 * Compare two NUL-terminated strings, ignoring the case of the ASCII
 * letters 'a'..'z' only.  No other bytes are folded and the locale is
 * never consulted.
 *
 * Returns zero when the strings are equal under that folding and a
 * non-zero value otherwise.  The non-zero value is the difference of
 * the raw (unfolded) bytes at the first mismatch, so only test the
 * result against zero; do not rely on its sign for ordering.
 */
static int ascii_strcasecmp(const char *s1, const char *s2)
{
  char c1, c2;

  for (;; s1++, s2++) {
    /* Stop as soon as either string ends. */
    if (!*s1 || !*s2)
      break;
    /* Identical bytes need no folding. */
    if (*s1 == *s2)
      continue;
    c1 = *s1;
    if ('a' <= c1 && c1 <= 'z')
      c1 += 'A' - 'a';
    c2 = *s2;
    if ('a' <= c2 && c2 <= 'z')
      c2 += 'A' - 'a';
    if (c1 != c2)
      break;
  }
  return (unsigned char)*s1 - (unsigned char)*s2;
}
74
75/*
76 * UTF-8 equivalents of the C library's wctomb() and mbtowc().
77 */
78
/*
 * Decode one UTF-8 sequence from the first N bytes of S.
 *
 * Sequences of up to six bytes are accepted.  Overlong encodings, stray
 * continuation bytes and the bytes 0xfe/0xff are rejected.  When PWC is
 * non-null the decoded value is stored there.
 *
 * Returns the number of bytes consumed, 0 if S is null, N is zero or
 * the character is NUL, and -1 for an invalid or truncated sequence.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n)
{
  const unsigned char *bytes = (const unsigned char *)s;
  unsigned char lead;
  int len, value, pos;

  if (!s || !n)
    return 0;

  lead = bytes[0];

  /* One-byte character; NUL is stored but reported as length 0. */
  if (lead < 0x80) {
    if (pwc)
      *pwc = lead;
    return lead != 0;
  }

  /*
   * 0x80..0xbf are continuation bytes, 0xc0/0xc1 could only start an
   * overlong two-byte form, and 0xfe/0xff are never lead bytes.
   */
  if (lead < 0xc2 || lead >= 0xfe)
    return -1;

  /* The lead byte alone determines the sequence length. */
  if (lead < 0xe0)
    len = 2;
  else if (lead < 0xf0)
    len = 3;
  else if (lead < 0xf8)
    len = 4;
  else if (lead < 0xfc)
    len = 5;
  else
    len = 6;

  if (n < (size_t)len)
    return -1;

  /* The lead byte of a LEN-byte sequence carries 7 - LEN payload bits. */
  value = lead & ((1 << (7 - len)) - 1);
  for (pos = 1; pos < len; pos++) {
    if ((bytes[pos] & 0xc0) != 0x80)
      return -1;
    value = (value << 6) | (bytes[pos] & 0x3f);
  }

  /*
   * Reject values that would have fitted in a shorter sequence.  The
   * two-byte form needs no check here: its overlong lead bytes were
   * already refused above.
   */
  if (len > 2 && value < (1 << (5 * len - 4)))
    return -1;

  if (pwc)
    *pwc = value;
  return len;
}
129
/*
 * Encode the character WC1 as UTF-8 into the buffer S.
 *
 * Values below 2^31 produce one to six bytes; no terminating NUL is
 * written.  Returns the number of bytes stored, 0 if S is null, and -1
 * if WC1 is out of range (this includes every negative value).
 */
int utf8_wctomb(char *s, int wc1)
{
  /* Marker bits of the lead byte, indexed by sequence length. */
  static const unsigned char lead_bits[7] = {
    0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  };
  unsigned int code = wc1;
  int len, shift;

  if (!s)
    return 0;

  if (code < (1u << 7))
    len = 1;
  else if (code < (1u << 11))
    len = 2;
  else if (code < (1u << 16))
    len = 3;
  else if (code < (1u << 21))
    len = 4;
  else if (code < (1u << 26))
    len = 5;
  else if (code < (1u << 31))
    len = 6;
  else
    return -1;

  /* Lead byte: marker bits plus the topmost payload bits. */
  shift = 6 * (len - 1);
  *s++ = lead_bits[len] | (code >> shift);

  /* Continuation bytes: six payload bits each, most significant first. */
  while (shift > 0) {
    shift -= 6;
    *s++ = 0x80 | ((code >> shift) & 0x3f);
  }
  return len;
}
178
179/*
180 * The charset "object" and methods.
181 */
182
/*
 * A character set descriptor.  MBTOWC and WCTOMB are the conversion
 * routines of the set; each is handed MAP as its first argument.  MAX is
 * the value reported by charset_max(); charset_convert() multiplies the
 * input length by it when sizing its output buffer.
 */
struct charset {
  int max;
  int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
  int (*wctomb)(void *table, char *s, int wc);
  void *map;
};

/* Run CHARSET's mbtowc routine on the first N bytes of S; returns its result. */
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
{
  void *table = charset->map;

  return charset->mbtowc(table, pwc, s, n);
}

/* Run CHARSET's wctomb routine to store WC at S; returns its result. */
int charset_wctomb(struct charset *charset, char *s, int wc)
{
  void *table = charset->map;

  return charset->wctomb(table, s, wc);
}

/* Return the MAX field of CHARSET. */
int charset_max(struct charset *charset)
{
  int limit = charset->max;

  return limit;
}
204
205/*
206 * Implementation of UTF-8.
207 */
208
/* Adapter giving utf8_mbtowc() the charset mbtowc signature; MAP is unused. */
static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
{
  int consumed = utf8_mbtowc(pwc, s, n);

  (void)map;
  return consumed;
}
214
/* Adapter giving utf8_wctomb() the charset wctomb signature; MAP is unused. */
static int wctomb_utf8(void *map, char *s, int wc)
{
  int written = utf8_wctomb(s, wc);

  (void)map;
  return written;
}
220
221/*
222 * Implementation of US-ASCII.
223 * Probably on most architectures this compiles to less than 256 bytes
224 * of code, so we can save space by not having a table for this one.
225 */
226
/*
 * Decode one US-ASCII byte from S; MAP is unused.
 *
 * Returns 1 for a non-NUL ASCII byte, 0 if S is null, N is zero or the
 * byte is NUL, and -1 if the byte is above 0x7f.  The value is stored in
 * *PWC when PWC is non-null and the byte is valid.
 */
static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
{
  int value;

  (void)map;
  if (!s || !n)
    return 0;

  value = (unsigned char)*s;
  if (value > 0x7f)
    return -1;

  if (pwc)
    *pwc = value;
  return value != 0;
}
241
/*
 * Store WC at S as a US-ASCII byte; MAP is unused.
 *
 * Returns 1 on success, 0 if S is null, and -1 if WC lies outside
 * 0..0x7f.
 */
static int wctomb_ascii(void *map, char *s, int wc)
{
  (void)map;
  if (!s)
    return 0;

  if (wc < 0 || wc > 0x7f)
    return -1;

  *s = (char)wc;
  return 1;
}
252
253/*
254 * Implementation of ISO-8859-1.
255 * Probably on most architectures this compiles to less than 256 bytes
256 * of code, so we can save space by not having a table for this one.
257 */
258
/*
 * Decode one ISO-8859-1 byte from S; MAP is unused.
 *
 * Returns 1 for a non-NUL byte, 0 if S is null, N is zero or the byte
 * is NUL, and -1 if the byte value exceeds 0xff.  The value is stored in
 * *PWC when PWC is non-null and the byte is valid.
 */
static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
{
  int value;

  (void)map;
  if (!s || !n)
    return 0;

  value = (unsigned char)*s;
  /* Can only trigger where a char is wider than 8 bits. */
  if (value > 0xff)
    return -1;

  if (pwc)
    *pwc = value;
  return value != 0;
}
273
/*
 * Store WC at S as an ISO-8859-1 byte; MAP is unused.
 *
 * Returns 1 on success, 0 if S is null, and -1 if WC lies outside
 * 0..0xff.
 */
static int wctomb_iso1(void *map, char *s, int wc)
{
  (void)map;
  if (!s)
    return 0;

  if (wc < 0 || wc > 0xff)
    return -1;

  *s = (char)wc;
  return 1;
}
284
285/*
286 * Implementation of any 8-bit charset.
287 */
288
/*
 * Conversion tables for one 8-bit charset.  FROM is indexed by byte
 * value and gives the corresponding wide character, with 0xffff marking
 * a byte that has no mapping.  TO is the reverse lookup table; it stays
 * null until wctomb_8bit() builds it.
 */
struct map {
  const unsigned short *from;
  struct inverse_map *to;
};

/*
 * Decode one byte from S through the forward table of the struct map
 * that MAP1 points to.
 *
 * Returns 1 for a mapped non-NUL character, 0 if S is null, N is zero
 * or the byte maps to NUL, and -1 if the byte has no mapping.  The value
 * is stored in *PWC when PWC is non-null and the byte is mapped.
 */
static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
{
  struct map *tables = map1;
  unsigned short value;

  if (!s || !n)
    return 0;

  value = tables->from[(unsigned char)*s];
  if (value == 0xffff)
    return -1;

  if (pwc)
    *pwc = (int)value;
  return value != 0;
}
308
309/*
310 * For the inverse map we use a hash table, which has the advantages
311 * of small constant memory requirement and simple memory allocation,
312 * but the disadvantage of slow conversion in the worst case.
313 * If you need real-time performance while letting a potentially
314 * malicious user define their own map, then the method used in
315 * linux/drivers/char/consolemap.c would be more appropriate.
316 */
317
/*
 * Hash table from wide character to byte value, stored as chains of
 * byte values threaded through two 256-entry arrays.
 */
struct inverse_map {
  unsigned char first[256]; /* per HASH() bucket: first byte value of its chain */
  unsigned char next[256];  /* per byte value: next one in its chain; 0 ends it */
};
322
/*
 * HASH selects a bucket of struct inverse_map for a wide character by
 * keeping its low 8 bits.  The simple hash is good enough for this
 * application.
 * Use the alternative trivial hashes for testing: with either of them
 * every character lands in the same bucket.
 */
#define HASH(i) ((i) & 0xff)
/* #define HASH(i) 0 */
/* #define HASH(i) 99 */
330
331static struct inverse_map *make_inverse_map(const unsigned short *from)
332{
333  struct inverse_map *to;
334  char used[256];
335  int i, j, k;
336
337  to = (struct inverse_map *)malloc(sizeof(struct inverse_map));
338  if (!to)
339    return 0;
340  for (i = 0; i < 256; i++)
341    to->first[i] = to->next[i] = used[i] = 0;
342  for (i = 255; i >= 0; i--)
343    if (from[i] != 0xffff) {
344      k = HASH(from[i]);
345      to->next[i] = to->first[k];
346      to->first[k] = i;
347      used[k] = 1;
348    }
349
350  /* Point the empty buckets at an empty list. */
351  for (i = 0; i < 256; i++)
352    if (!to->next[i])
353      break;
354  if (i < 256)
355    for (j = 0; j < 256; j++)
356      if (!used[j])
357	to->first[j] = i;
358
359  return to;
360}
361
362int wctomb_8bit(void *map1, char *s, int wc1)
363{
364  struct map *map = map1;
365  unsigned short wc = wc1;
366  int i;
367
368  if (!s)
369    return 0;
370
371  if (wc1 & ~0xffff)
372    return -1;
373
374  if (1) /* Change 1 to 0 to test the case where malloc fails. */
375    if (!map->to)
376      map->to = make_inverse_map(map->from);
377
378  if (map->to) {
379    /* Use the inverse map. */
380    i = map->to->first[HASH(wc)];
381    for (;;) {
382      if (map->from[i] == wc) {
383	*s = i;
384	return 1;
385      }
386      if (!(i = map->to->next[i]))
387	break;
388    }
389  }
390  else {
391    /* We don't have an inverse map, so do a linear search. */
392    for (i = 0; i < 256; i++)
393      if (map->from[i] == wc) {
394	*s = i;
395	return 1;
396      }
397  }
398
399  return -1;
400}
401
402/*
403 * The "constructor" charset_find().
404 */
405
406struct charset charset_utf8 = {
407  6,
408  &mbtowc_utf8,
409  &wctomb_utf8,
410  0
411};
412
413struct charset charset_iso1 = {
414  1,
415  &mbtowc_iso1,
416  &wctomb_iso1,
417  0
418};
419
420struct charset charset_ascii = {
421  1,
422  &mbtowc_ascii,
423  &wctomb_ascii,
424  0
425};
426
427struct charset *charset_find(const char *code)
428{
429  int i;
430
431  /* Find good (MIME) name. */
432  for (i = 0; names[i].bad; i++)
433    if (!ascii_strcasecmp(code, names[i].bad)) {
434      code = names[i].good;
435      break;
436    }
437
438  /* Recognise some charsets for which we avoid using a table. */
439  if (!ascii_strcasecmp(code, "UTF-8"))
440    return &charset_utf8;
441  if (!ascii_strcasecmp(code, "US-ASCII"))
442    return &charset_ascii;
443  if (!ascii_strcasecmp(code, "ISO-8859-1"))
444    return &charset_iso1;
445
446  /* Look for a mapping for a simple 8-bit encoding. */
447  for (i = 0; maps[i].name; i++)
448    if (!ascii_strcasecmp(code, maps[i].name)) {
449      if (!maps[i].charset) {
450	maps[i].charset = (struct charset *)malloc(sizeof(struct charset));
451	if (maps[i].charset) {
452	  struct map *map = (struct map *)malloc(sizeof(struct map));
453	  if (!map) {
454	    free(maps[i].charset);
455	    maps[i].charset = 0;
456	  }
457	  else {
458	    maps[i].charset->max = 1;
459	    maps[i].charset->mbtowc = &mbtowc_8bit;
460	    maps[i].charset->wctomb = &wctomb_8bit;
461	    maps[i].charset->map = map;
462	    map->from = maps[i].map;
463	    map->to = 0; /* inverse mapping is created when required */
464	  }
465	}
466      }
467      return maps[i].charset;
468    }
469
470  return 0;
471}
472
473/*
474 * Function to convert a buffer from one encoding to another.
475 * Invalid bytes are replaced by '#', and characters that are
476 * not available in the target encoding are replaced by '?'.
477 * Each of TO and TOLEN may be zero, if the result is not needed.
478 * The output buffer is null-terminated, so it is all right to
479 * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
480 */
481
482int charset_convert(const char *fromcode, const char *tocode,
483		    const char *from, size_t fromlen,
484		    char **to, size_t *tolen)
485{
486  int ret = 0;
487  struct charset *charset1, *charset2;
488  char *tobuf, *p, *newbuf;
489  int i, j, wc;
490
491  charset1 = charset_find(fromcode);
492  charset2 = charset_find(tocode);
493  if (!charset1 || !charset2 )
494    return -1;
495
496  tobuf = (char *)safe_malloc_mul2add_(fromlen, /*times*/charset2->max, /*+*/1);
497  if (!tobuf)
498    return -2;
499
500  for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
501    i = charset_mbtowc(charset1, &wc, from, fromlen);
502    if (!i)
503      i = 1;
504    else if (i == -1) {
505      i  = 1;
506      wc = '#';
507      ret = 2;
508    }
509    j = charset_wctomb(charset2, p, wc);
510    if (j == -1) {
511      if (!ret)
512	ret = 1;
513      j = charset_wctomb(charset2, p, '?');
514      if (j == -1)
515	j = 0;
516    }
517  }
518
519  if (tolen)
520    *tolen = p - tobuf;
521  *p++ = '\0';
522  if (to) {
523    newbuf = realloc(tobuf, p - tobuf);
524    *to = newbuf ? newbuf : tobuf;
525  }
526  else
527    free(tobuf);
528
529  return ret;
530}
531
#endif /* !HAVE_ICONV */
533